# Twelve2five's picture
# Update app.py
# 30a17b1 verified
import os
import time
import requests
import gradio as gr
import numpy as np
from dotenv import load_dotenv
from elevenlabs import ElevenLabs
from fastapi import FastAPI
from fastrtc import (
AdditionalOutputs,
ReplyOnPause,
Stream,
get_stt_model,
get_twilio_turn_credentials,
)
from gradio.utils import get_space
from numpy.typing import NDArray
# Load environment variables from a local .env file so the API keys and
# Twilio configuration read below via os.getenv() are available.
load_dotenv()
# Initialize DeepSeek client
class DeepSeekAPI:
def __init__(self, api_key):
self.api_key = api_key
def chat_completion(self, messages, temperature=0.7, max_tokens=512):
url = "https://api.deepseek.com/v1/chat/completions"
headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {self.api_key}"
}
payload = {
"model": "deepseek-chat",
"messages": messages,
"temperature": temperature,
"max_tokens": max_tokens
}
response = requests.post(url, json=payload, headers=headers)
# Check for error response
if response.status_code != 200:
print(f"DeepSeek API error: {response.status_code} - {response.text}")
return {"choices": [{"message": {"content": "I'm sorry, I encountered an error processing your request."}}]}
return response.json()
# Initialize clients
deepseek_client = DeepSeekAPI(api_key=os.getenv("DEEPSEEK_API_KEY"))  # LLM backend
tts_client = ElevenLabs(api_key=os.getenv("ELEVENLABS_API_KEY"))  # text-to-speech
stt_model = get_stt_model()  # fastrtc-provided speech-to-text model
# Twilio TURN credentials for WebRTC NAT traversal.
# get_twilio_turn_credentials() takes no keyword arguments; it reads the
# Twilio configuration from environment variables directly.
twilio_credentials = get_twilio_turn_credentials()

# Report TURN availability once at startup; a falsy result means the
# environment variables were missing or invalid.
print(
    "Twilio TURN credentials successfully configured"
    if twilio_credentials
    else "No Twilio credentials found or invalid credentials"
)
# Handler function for voice conversation
def response(
    audio: tuple[int, NDArray[np.int16 | np.float32]],
    chatbot: list[dict] | None = None,
):
    """Handle one voice turn: transcribe, query DeepSeek, stream TTS audio.

    Yields AdditionalOutputs(chatbot) updates for the UI and
    (sample_rate, pcm_array) tuples for audio playback.
    """
    chatbot = chatbot or []
    # Plain role/content copies of the prior turns form the LLM prompt.
    messages = [{"role": turn["role"], "content": turn["content"]} for turn in chatbot]

    t0 = time.time()
    text = stt_model.stt(audio)
    print("transcription", time.time() - t0)
    print("prompt", text)

    chatbot.append({"role": "user", "content": text})
    # Surface the transcribed user turn in the UI before the LLM replies.
    yield AdditionalOutputs(chatbot)
    messages.append({"role": "user", "content": text})

    # Replace Groq LLM with DeepSeek
    completion = deepseek_client.chat_completion(
        messages=messages,
        max_tokens=512
    )
    reply = completion["choices"][0]["message"]["content"]
    chatbot.append({"role": "assistant", "content": reply})

    # Stream 16-bit PCM at 24 kHz from ElevenLabs chunk by chunk.
    pcm_stream = tts_client.text_to_speech.convert_as_stream(
        text=reply,
        voice_id="JBFqnCBsd6RMkjVDRZzb",
        model_id="eleven_multilingual_v2",
        output_format="pcm_24000",
    )
    for pcm in pcm_stream:
        yield (24000, np.frombuffer(pcm, dtype=np.int16).reshape(1, -1))

    yield AdditionalOutputs(chatbot)
# Create the chatbot and Stream components
chatbot = gr.Chatbot(type="messages")  # message-dict history shared with the handler
stream = Stream(
    modality="audio",  # audio in, audio out
    mode="send-receive",
    handler=ReplyOnPause(response, input_sample_rate=16000),  # fire on end of speech
    additional_outputs_handler=lambda a, b: b,  # keep only the newest chatbot state
    additional_inputs=[chatbot],
    additional_outputs=[chatbot],
    rtc_configuration=twilio_credentials,  # Always use Twilio credentials
    concurrency_limit=5 if get_space() else None,  # cap only on HF Spaces
    time_limit=90 if get_space() else None,  # per-session limit on HF Spaces
    ui_args={"title": "LLM Voice Chat (Powered by DeepSeek, ElevenLabs, and WebRTC ⚡️)"},
)
# Mount the STREAM UI to the FastAPI app
app = FastAPI()
app = gr.mount_gradio_app(app, stream.ui, path="/")
if __name__ == "__main__":
    # `os` is already imported at the top of the file; the redundant
    # local import has been removed.
    # Disable Gradio server-side rendering for local launches.
    os.environ["GRADIO_SSR_MODE"] = "false"

    # MODE selects how to serve: "PHONE" exposes the fastphone telephone
    # endpoint; any other value (including "UI" or unset) launches the web
    # UI — the original "UI" branch was identical to the fallback.
    if os.getenv("MODE") == "PHONE":
        stream.fastphone(host="0.0.0.0", port=7860)
    else:
        stream.ui.launch(server_port=7860)