# Twelve2five's picture
# Update app.py
# 30a17b1 verified
import os
import time
import requests
import gradio as gr
import numpy as np
from dotenv import load_dotenv
from elevenlabs import ElevenLabs
from fastapi import FastAPI
from fastrtc import (
AdditionalOutputs,
ReplyOnPause,
Stream,
get_stt_model,
get_twilio_turn_credentials,
)
from gradio.utils import get_space
from numpy.typing import NDArray
# Load environment variables from a local .env file so the API keys and
# Twilio configuration read below via os.getenv() are available.
load_dotenv()
# Initialize DeepSeek client
class DeepSeekAPI:
def __init__(self, api_key):
self.api_key = api_key
def chat_completion(self, messages, temperature=0.7, max_tokens=512):
url = "https://api.deepseek.com/v1/chat/completions"
headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {self.api_key}"
}
payload = {
"model": "deepseek-chat",
"messages": messages,
"temperature": temperature,
"max_tokens": max_tokens
}
response = requests.post(url, json=payload, headers=headers)
# Check for error response
if response.status_code != 200:
print(f"DeepSeek API error: {response.status_code} - {response.text}")
return {"choices": [{"message": {"content": "I'm sorry, I encountered an error processing your request."}}]}
return response.json()
# Initialize clients
deepseek_client = DeepSeekAPI(api_key=os.getenv("DEEPSEEK_API_KEY"))  # LLM backend
tts_client = ElevenLabs(api_key=os.getenv("ELEVENLABS_API_KEY"))  # text-to-speech
stt_model = get_stt_model()  # fastrtc-provided speech-to-text model
# Twilio TURN credentials for WebRTC NAT traversal.
# get_twilio_turn_credentials() takes no keyword arguments; it reads the
# Twilio configuration from environment variables directly.
twilio_credentials = get_twilio_turn_credentials()

# Report TURN availability once at startup; a falsy result means the
# environment variables were missing or invalid.
print(
    "Twilio TURN credentials successfully configured"
    if twilio_credentials
    else "No Twilio credentials found or invalid credentials"
)
# Handler function for voice conversation
def response(
    audio: tuple[int, NDArray[np.int16 | np.float32]],
    chatbot: list[dict] | None = None,
):
    """Handle one voice turn: transcribe, query DeepSeek, stream TTS audio.

    Yields AdditionalOutputs(chatbot) updates for the UI and
    (sample_rate, pcm_array) tuples for audio playback.
    """
    chatbot = chatbot or []
    # Plain role/content copies of the prior turns form the LLM prompt.
    messages = [{"role": turn["role"], "content": turn["content"]} for turn in chatbot]

    t0 = time.time()
    text = stt_model.stt(audio)
    print("transcription", time.time() - t0)
    print("prompt", text)

    chatbot.append({"role": "user", "content": text})
    # Surface the transcribed user turn in the UI before the LLM replies.
    yield AdditionalOutputs(chatbot)
    messages.append({"role": "user", "content": text})

    # Replace Groq LLM with DeepSeek
    completion = deepseek_client.chat_completion(
        messages=messages,
        max_tokens=512
    )
    reply = completion["choices"][0]["message"]["content"]
    chatbot.append({"role": "assistant", "content": reply})

    # Stream 16-bit PCM at 24 kHz from ElevenLabs chunk by chunk.
    pcm_stream = tts_client.text_to_speech.convert_as_stream(
        text=reply,
        voice_id="JBFqnCBsd6RMkjVDRZzb",
        model_id="eleven_multilingual_v2",
        output_format="pcm_24000",
    )
    for pcm in pcm_stream:
        yield (24000, np.frombuffer(pcm, dtype=np.int16).reshape(1, -1))

    yield AdditionalOutputs(chatbot)
# Create the chatbot and Stream components
chatbot = gr.Chatbot(type="messages")  # message-dict history shared with the handler
stream = Stream(
    modality="audio",  # audio in, audio out
    mode="send-receive",
    handler=ReplyOnPause(response, input_sample_rate=16000),  # fire on end of speech
    additional_outputs_handler=lambda a, b: b,  # keep only the newest chatbot state
    additional_inputs=[chatbot],
    additional_outputs=[chatbot],
    rtc_configuration=twilio_credentials,  # Always use Twilio credentials
    concurrency_limit=5 if get_space() else None,  # cap only on HF Spaces
    time_limit=90 if get_space() else None,  # per-session limit on HF Spaces
    ui_args={"title": "LLM Voice Chat (Powered by DeepSeek, ElevenLabs, and WebRTC ⚡️)"},
)
# Mount the STREAM UI to the FastAPI app
app = FastAPI()
app = gr.mount_gradio_app(app, stream.ui, path="/")
if __name__ == "__main__":
    # `os` is already imported at the top of the file; the redundant
    # local import has been removed.
    # Disable Gradio server-side rendering for local launches.
    os.environ["GRADIO_SSR_MODE"] = "false"

    # MODE selects how to serve: "PHONE" exposes the fastphone telephone
    # endpoint; any other value (including "UI" or unset) launches the web
    # UI — the original "UI" branch was identical to the fallback.
    if os.getenv("MODE") == "PHONE":
        stream.fastphone(host="0.0.0.0", port=7860)
    else:
        stream.ui.launch(server_port=7860)