diff --git a/.env.example b/.env.example
new file mode 100644
index 00000000..1f164bf1
--- /dev/null
+++ b/.env.example
@@ -0,0 +1,6 @@
+LIVEKIT_URL=
+LIVEKIT_API_KEY=
+LIVEKIT_API_SECRET=
+OPENAI_API_KEY=
+DEEPGRAM_API_KEY=
+CARTESIA_API_KEY=
diff --git a/.env.local b/.env.local
new file mode 100644
index 00000000..71bd2f89
--- /dev/null
+++ b/.env.local
@@ -0,0 +1,6 @@
+CARTESIA_API_KEY=
+DEEPGRAM_API_KEY=
+LIVEKIT_API_KEY=
+LIVEKIT_API_SECRET=
+LIVEKIT_URL=
+OPENAI_API_KEY=
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 00000000..98ffdad0
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+.env.local
+venv/
+.DS_Store
diff --git a/agent.py b/agent.py
new file mode 100644
index 00000000..92a60ac7
--- /dev/null
+++ b/agent.py
@@ -0,0 +1,78 @@
+import logging
+
+from dotenv import load_dotenv
+from livekit.agents import (
+    AutoSubscribe,
+    JobContext,
+    JobProcess,
+    WorkerOptions,
+    cli,
+    llm,
+    metrics,
+)
+from livekit.agents.pipeline import VoicePipelineAgent
+from livekit.plugins import cartesia, openai, deepgram, silero, turn_detector
+
+
+load_dotenv(dotenv_path=".env.local")
+logger = logging.getLogger("voice-agent")
+
+
+def prewarm(proc: JobProcess):
+    proc.userdata["vad"] = silero.VAD.load()
+
+
+async def entrypoint(ctx: JobContext):
+    initial_ctx = llm.ChatContext().append(
+        role="system",
+        text=(
+            "You are a voice assistant created by LiveKit. Your interface with users will be voice. "
+            "You should use short and concise responses, and avoid usage of unpronounceable punctuation. "
+            "You were created as a demo to showcase the capabilities of LiveKit's agents framework."
+        ),
+    )
+
+    logger.info(f"connecting to room {ctx.room.name}")
+    await ctx.connect(auto_subscribe=AutoSubscribe.AUDIO_ONLY)
+
+    # Wait for the first participant to connect
+    participant = await ctx.wait_for_participant()
+    logger.info(f"starting voice assistant for participant {participant.identity}")
+
+    # This project is configured to use Deepgram STT, OpenAI LLM, and Cartesia TTS plugins
+    # Other great providers exist like Cerebras, ElevenLabs, Groq, Play.ht, Rime, and more
+    # Learn more and pick the best one for your app:
+    # https://docs.livekit.io/agents/plugins
+    agent = VoicePipelineAgent(
+        vad=ctx.proc.userdata["vad"],
+        stt=deepgram.STT(),
+        llm=openai.LLM(model="gpt-4o-mini"),
+        tts=cartesia.TTS(),
+        turn_detector=turn_detector.EOUModel(),
+        # minimum delay for endpointing, used when turn detector believes the user is done with their turn
+        min_endpointing_delay=0.5,
+        # maximum delay for endpointing, used when turn detector does not believe the user is done with their turn
+        max_endpointing_delay=5.0,
+        chat_ctx=initial_ctx,
+    )
+
+    usage_collector = metrics.UsageCollector()
+
+    @agent.on("metrics_collected")
+    def on_metrics_collected(agent_metrics: metrics.AgentMetrics):
+        metrics.log_metrics(agent_metrics)
+        usage_collector.collect(agent_metrics)
+
+    agent.start(ctx.room, participant)
+
+    # The agent should be polite and greet the user when it joins :)
+    await agent.say("Hey, how can I help you today?", allow_interruptions=True)
+
+
+if __name__ == "__main__":
+    cli.run_app(
+        WorkerOptions(
+            entrypoint_fnc=entrypoint,
+            prewarm_fnc=prewarm,
+        ),
+    )
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 00000000..285a65d7
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,7 @@
+livekit-agents>=0.12.11
+livekit-plugins-openai>=0.10.17
+livekit-plugins-cartesia>=0.4.7
+livekit-plugins-deepgram>=0.6.17
+livekit-plugins-silero>=0.7.4
+livekit-plugins-turn-detector>=0.4.0
+python-dotenv~=1.0
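
Note: the usage_collector in agent.py accumulates per-turn metrics but is never reported. A minimal sketch for surfacing the aggregate, assuming the livekit-agents 0.12.x API pinned in requirements.txt (UsageCollector.get_summary() and JobContext.add_shutdown_callback(); log_usage is an illustrative name), placed inside entrypoint() after the collector is created:

    async def log_usage():
        # Aggregate the LLM/STT/TTS usage fed in by the metrics_collected handler
        summary = usage_collector.get_summary()
        logger.info(f"usage summary: {summary}")

    # Runs when the job shuts down, so session totals are logged exactly once
    ctx.add_shutdown_callback(log_usage)

To try the worker locally, populate .env.local and use the subcommands the livekit-agents CLI wires up through cli.run_app: python agent.py download-files (pre-fetches model files such as the turn-detector weights), then python agent.py dev.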