import os
import time
import requests
import gradio as gr
import numpy as np
from dotenv import load_dotenv
from elevenlabs import ElevenLabs
from fastapi import FastAPI
from fastrtc import (
    AdditionalOutputs,
    ReplyOnPause,
    Stream,
    get_stt_model,
    get_twilio_turn_credentials,
)
from gradio.utils import get_space
from numpy.typing import NDArray

# Load environment variables
load_dotenv()
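
# Expected variables in .env (a sketch; DEEPSEEK_API_KEY and ELEVENLABS_API_KEY
# are read below, while the Twilio names follow fastrtc's documented environment
# lookup and are an assumption, not taken from this file):
#   DEEPSEEK_API_KEY=...      # required for DeepSeek chat completions
#   ELEVENLABS_API_KEY=...    # required for ElevenLabs TTS
#   TWILIO_ACCOUNT_SID=...    # optional, enables Twilio TURN
#   TWILIO_AUTH_TOKEN=...     # optional, enables Twilio TURN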

# Minimal client for the DeepSeek chat-completions API
class DeepSeekAPI:
    def __init__(self, api_key):
        self.api_key = api_key
        
    def chat_completion(self, messages, temperature=0.7, max_tokens=512):
        """Call the DeepSeek chat-completions endpoint and return the parsed JSON."""
        url = "https://api.deepseek.com/v1/chat/completions"
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.api_key}"
        }
        payload = {
            "model": "deepseek-chat",
            "messages": messages,
            "temperature": temperature,
            "max_tokens": max_tokens
        }
        # A request timeout keeps the voice handler from hanging indefinitely
        response = requests.post(url, json=payload, headers=headers, timeout=30)
        
        # Check for error response
        if response.status_code != 200:
            print(f"DeepSeek API error: {response.status_code} - {response.text}")
            return {"choices": [{"message": {"content": "I'm sorry, I encountered an error processing your request."}}]}
            
        return response.json()
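
# A minimal usage sketch for the client above (illustrative, not executed here):
#   client = DeepSeekAPI(api_key=os.getenv("DEEPSEEK_API_KEY"))
#   reply = client.chat_completion([{"role": "user", "content": "Hello"}])
#   print(reply["choices"][0]["message"]["content"])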

# Initialize clients
deepseek_client = DeepSeekAPI(api_key=os.getenv("DEEPSEEK_API_KEY"))
tts_client = ElevenLabs(api_key=os.getenv("ELEVENLABS_API_KEY"))
stt_model = get_stt_model()
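# Per fastrtc's docs, get_stt_model() returns a local speech-to-text model,
# so no separate STT API key is needed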

# Set up Twilio TURN credentials for WebRTC.
# Called with no arguments, get_twilio_turn_credentials() reads the Twilio
# credentials from environment variables rather than taking them here.
twilio_credentials = get_twilio_turn_credentials()

# Log Twilio status
if twilio_credentials:
    print("Twilio TURN credentials successfully configured")
else:
    print("No Twilio credentials found or invalid credentials")


# Handler function for voice conversation
def response(
    audio: tuple[int, NDArray[np.int16 | np.float32]],
    chatbot: list[dict] | None = None,
):
    chatbot = chatbot or []
    messages = [{"role": d["role"], "content": d["content"]} for d in chatbot]
    start = time.time()
    text = stt_model.stt(audio)
    print("transcription", time.time() - start)
    print("prompt", text)
    chatbot.append({"role": "user", "content": text})
    yield AdditionalOutputs(chatbot)
    messages.append({"role": "user", "content": text})
    
    # Generate the assistant reply with DeepSeek
    response_data = deepseek_client.chat_completion(
        messages=messages,
        max_tokens=512
    )
    response_text = response_data["choices"][0]["message"]["content"]

    chatbot.append({"role": "assistant", "content": response_text})

    for chunk in tts_client.text_to_speech.convert_as_stream(
        text=response_text,
        voice_id="JBFqnCBsd6RMkjVDRZzb",
        model_id="eleven_multilingual_v2",
        output_format="pcm_24000",
    ):
        # ElevenLabs streams raw 16-bit PCM; reshape to (channels, samples) for fastrtc
        audio_array = np.frombuffer(chunk, dtype=np.int16).reshape(1, -1)
        yield (24000, audio_array)
    yield AdditionalOutputs(chatbot)
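
# Note on the handler protocol above: fastrtc consumes audio yields as
# (sample_rate, ndarray) tuples, while AdditionalOutputs values are routed to
# additional_outputs_handler, so the chat log updates while audio streams.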


# Create the chatbot and Stream components
chatbot = gr.Chatbot(type="messages")
stream = Stream(
    modality="audio",
    mode="send-receive",
    handler=ReplyOnPause(response, input_sample_rate=16000),
    additional_outputs_handler=lambda a, b: b,
    additional_inputs=[chatbot],
    additional_outputs=[chatbot],
    rtc_configuration=twilio_credentials,  # May be None if Twilio is not configured
    concurrency_limit=5 if get_space() else None,
    time_limit=90 if get_space() else None,
    ui_args={"title": "LLM Voice Chat (Powered by DeepSeek, ElevenLabs, and WebRTC ⚡️)"},
)

# Mount the Stream's Gradio UI on the FastAPI app
app = FastAPI()
app = gr.mount_gradio_app(app, stream.ui, path="/")


if __name__ == "__main__":
    os.environ["GRADIO_SSR_MODE"] = "false"

    # MODE=PHONE serves the stream over a phone number via fastrtc's fastphone;
    # any other value (including "UI") launches the Gradio UI locally.
    if os.getenv("MODE") == "PHONE":
        stream.fastphone(host="0.0.0.0", port=7860)
    else:
        stream.ui.launch(server_port=7860)
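
# The module also exposes `app`, so the combined FastAPI + Gradio app can be
# served with an ASGI server instead (assuming this file is saved as app.py):
#   uvicorn app:app --host 0.0.0.0 --port 7860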