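# NOTE(review): the original first lines read "Spaces: Sleeping / Sleeping" --
# this appears to be a status banner captured from the hosting page, not part
# of the program.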
import asyncio
import base64
import io
import json
import os
import re
import tempfile
from datetime import datetime, timedelta
from typing import Any, Dict, Optional

import aiohttp
import gradio as gr
import requests
import speech_recognition as sr
from dotenv import load_dotenv
# Load environment variables from a local .env file (if present) before any
# os.getenv() lookups below read them.
load_dotenv()
# Configuration: credentials are read from the environment; a missing variable
# yields None.
ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY")
GOOGLE_CALENDAR_CREDENTIALS = os.getenv("GOOGLE_CALENDAR_CREDENTIALS")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# ElevenLabs configuration.
# The voice can be overridden with the ELEVENLABS_VOICE_ID environment
# variable; an unset or empty value falls back to the default voice.
ELEVENLABS_VOICE_ID = os.getenv("ELEVENLABS_VOICE_ID") or "21m00Tcm4TlvDq8ikWAM"
ELEVENLABS_API_URL = "https://api.elevenlabs.io/v1"
class VoiceAgent:
    """Keyword-driven voice assistant backend.

    Transcribes recorded audio with ``speech_recognition``, routes the
    transcript to a calendar or general-question handler based on keywords,
    and synthesizes replies through the ElevenLabs HTTP API.
    """

    # Calendar trigger words, matched as whole (optionally inflected) words so
    # that e.g. "notebook" or "facebook" is not mistaken for "book".
    _CALENDAR_PATTERN = re.compile(
        r"\b(?:schedule|appointment|meeting|calendar|book|reserve)"
        r"(?:s|d|ed|ing)?\b",
        re.IGNORECASE,
    )

    # Words recognised as the day part of an appointment time.
    _DAY_WORDS = frozenset({
        "tomorrow", "today", "monday", "tuesday", "wednesday", "thursday",
        "friday", "saturday", "sunday",
    })

    # Punctuation stripped from both ends of each word before matching.
    _WORD_PUNCTUATION = ".,!?;:'\"()"

    # Upper bound for one text-to-speech HTTP round trip.
    _TTS_TIMEOUT_SECONDS = 30

    def __init__(self):
        # No microphone is opened here; audio arrives as a file from the UI.
        self.recognizer = sr.Recognizer()

    def _transcribe(self, audio_file) -> str:
        """Blocking helper: read *audio_file* and return its transcript."""
        with sr.AudioFile(audio_file) as source:
            audio = self.recognizer.record(source)
        return self.recognizer.recognize_google(audio)

    async def speech_to_text(self, audio_file) -> str:
        """Convert recorded speech to text.

        Args:
            audio_file: Audio source handed to ``sr.AudioFile``.

        Returns:
            The transcript on success. On failure a human-readable message is
            returned instead of raising; the three message formats below are
            kept stable because callers tell failures apart by prefix.
        """
        try:
            # Recognition is blocking work; run it in the default executor so
            # the event loop stays responsive.
            loop = asyncio.get_running_loop()
            return await loop.run_in_executor(None, self._transcribe, audio_file)
        except sr.UnknownValueError:
            return "Sorry, I couldn't understand the audio. Please try speaking more clearly."
        except sr.RequestError as e:
            return f"Could not request results from speech recognition service; {e}"
        except Exception as e:
            return f"Error in speech recognition: {str(e)}"

    async def text_to_speech(self, text: str) -> bytes:
        """Convert *text* to speech using ElevenLabs.

        Returns:
            The raw response body (requested as ``audio/mpeg``).

        Raises:
            ValueError: If ``ELEVENLABS_API_KEY`` is not configured.
            RuntimeError: If the API answers with a non-200 status.
        """
        if not ELEVENLABS_API_KEY:
            raise ValueError("ElevenLabs API key not found")

        url = f"{ELEVENLABS_API_URL}/text-to-speech/{ELEVENLABS_VOICE_ID}"
        headers = {
            "Accept": "audio/mpeg",
            "Content-Type": "application/json",
            "xi-api-key": ELEVENLABS_API_KEY,
        }
        data = {
            "text": text,
            "model_id": "eleven_monolingual_v1",
            "voice_settings": {
                "stability": 0.5,
                "similarity_boost": 0.5,
            },
        }

        # Bound the request so a stalled connection cannot hang the handler.
        timeout = aiohttp.ClientTimeout(total=self._TTS_TIMEOUT_SECONDS)
        async with aiohttp.ClientSession(timeout=timeout) as session:
            async with session.post(url, json=data, headers=headers) as response:
                if response.status != 200:
                    raise RuntimeError(f"ElevenLabs API error: {response.status}")
                return await response.read()

    async def process_with_mcp(self, user_input: str) -> Dict[str, Any]:
        """Route *user_input* to the calendar or general-question handler.

        Returns:
            The handler's result dict (``type``, ``response``, ``success`` and,
            for successful calendar requests, ``event_data``).
        """
        if self.detect_intent(user_input) == "calendar":
            return await self.handle_calendar_request(user_input)
        return await self.handle_general_question(user_input)

    def detect_intent(self, text: str) -> str:
        """Return ``"calendar"`` if *text* contains a calendar keyword, else ``"general"``."""
        if self._CALENDAR_PATTERN.search(text):
            return "calendar"
        return "general"

    async def handle_calendar_request(self, text: str) -> Dict[str, Any]:
        """Build the (demo) confirmation for an appointment request.

        No real calendar event is created; the reply says so explicitly.
        """
        try:
            appointment_data = self.extract_appointment_details(text)
            event_summary = f"Appointment: {appointment_data.get('title', 'New Meeting')}"
            event_time = appointment_data.get('time', 'TBD')
            response_text = (
                f"I've scheduled your {event_summary} for {event_time}. "
                "Please note: This is a demo - in production, this would "
                "create an actual Google Calendar event."
            )
            return {
                "type": "calendar",
                "response": response_text,
                "success": True,
                "event_data": appointment_data,
            }
        except Exception as e:
            return {
                "type": "calendar",
                "response": f"I encountered an error while scheduling your appointment: {str(e)}",
                "success": False,
            }

    def extract_appointment_details(self, text: str) -> Dict[str, str]:
        """Extract appointment details from *text* with simple keyword rules.

        Returns:
            A dict with ``title``, ``time`` and ``duration``. ``time`` is
            ``"<Day> at <clock>"`` when both a day word and an "at <digits…>"
            phrase are present, just the day or ``"at <clock>"`` when only one
            is, and ``"Next available slot"`` otherwise.
        """
        details = {
            "title": "Meeting",
            "time": "Next available slot",
            "duration": "30 minutes",
        }

        lowered = text.lower()
        if "doctor" in lowered:
            details["title"] = "Doctor Appointment"
        elif "meeting" in lowered:
            details["title"] = "Meeting"
        elif "call" in lowered:
            details["title"] = "Phone Call"

        # Strip surrounding punctuation so "tomorrow." and "2pm?" still match.
        words = [word.strip(self._WORD_PUNCTUATION) for word in lowered.split()]

        day = next((word for word in words if word in self._DAY_WORDS), None)
        # Only the word directly following "at" is a clock-time candidate.
        clock = next(
            (
                following
                for current, following in zip(words, words[1:])
                if current == "at" and any(char.isdigit() for char in following)
            ),
            None,
        )

        if day and clock:
            details["time"] = f"{day.capitalize()} at {clock}"
        elif day:
            details["time"] = day.capitalize()
        elif clock:
            details["time"] = f"at {clock}"
        return details

    async def handle_general_question(self, text: str) -> Dict[str, Any]:
        """Answer a general question from a small table of canned replies.

        The first keyword (in table order) found as a substring of the
        lower-cased *text* selects the reply; otherwise a generic fallback
        is used.
        """
        canned_replies = (
            ("hello", "Hello! I'm your voice assistant. I can help you schedule appointments or answer questions."),
            ("how are you", "I'm doing well, thank you! How can I help you today?"),
            ("weather", "I'm a demo assistant focused on calendar management. For weather, I'd need to integrate with a weather API."),
            ("time", f"The current time is {datetime.now().strftime('%I:%M %p')}"),
        )
        fallback = (
            "I understand you're asking about something. As a demo assistant, "
            "I can help you schedule appointments or provide basic "
            "information. What would you like to do?"
        )

        text_lower = text.lower()
        response_text = next(
            (reply for keyword, reply in canned_replies if keyword in text_lower),
            fallback,
        )
        return {
            "type": "general",
            "response": response_text,
            "success": True,
        }
# Module-level agent instance used by the UI callback functions below.
agent = VoiceAgent()
# Prefixes of the messages VoiceAgent.speech_to_text returns in place of a
# transcript when recognition fails. Such text must be shown to the user as-is
# rather than answered as if the user had said it.
RECOGNITION_FAILURE_PREFIXES = (
    "Error",
    "Sorry, I couldn't understand the audio",
    "Could not request results",
)


async def process_voice_input(audio_file):
    """Process a recorded utterance and produce a spoken and written reply.

    Args:
        audio_file: Path of the recording supplied by the audio component, or
            None when nothing was recorded.

    Returns:
        A ``(audio_path, log_text)`` tuple. ``audio_path`` is the path of a
        temporary ``.mp3`` holding the synthesized reply, or None when no
        audio was produced (no recording, recognition failure, missing
        ``ELEVENLABS_API_KEY``, or a text-to-speech error).
    """
    if audio_file is None:
        return None, "Please record some audio first."

    try:
        text = await agent.speech_to_text(audio_file)
        # speech_to_text reports failures as plain strings; catch all of its
        # failure formats, not just the generic "Error…" one.
        if text.startswith(RECOGNITION_FAILURE_PREFIXES):
            return None, text

        result = await agent.process_with_mcp(text)
        response_text = result["response"]

        if not ELEVENLABS_API_KEY:
            return None, f"You said: '{text}'\n\nResponse: {response_text}\n\n(Note: Set ELEVENLABS_API_KEY for voice output)"

        try:
            audio_bytes = await agent.text_to_speech(response_text)
            # delete=False: the file must outlive this function so the UI can
            # read it from the returned path.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
                tmp_file.write(audio_bytes)
            return tmp_file.name, f"You said: '{text}'\n\nResponse: {response_text}"
        except Exception as e:
            return None, f"Text-to-speech error: {str(e)}\n\nYou said: '{text}'\nResponse: {response_text}"
    except Exception as e:
        return None, f"Error processing audio: {str(e)}"
def process_text_input(text_input):
    """Process typed text and return the assistant's reply as a string.

    Args:
        text_input: The user's message; None, empty or whitespace-only input
            yields a prompt to enter text.

    Returns:
        The reply text, or an ``"Error processing text: …"`` message if
        processing raised.
    """
    # Guard against None as well as blank strings before calling .strip().
    if not text_input or not text_input.strip():
        return "Please enter some text."

    try:
        # asyncio.run() starts a fresh event loop, so this function has to be
        # called from a thread that is not already running one.
        result = asyncio.run(agent.process_with_mcp(text_input))
        return result["response"]
    except Exception as e:
        return f"Error processing text: {str(e)}"
# Create Gradio interface: three tabs (voice, text, about) wired to the
# process_voice_input / process_text_input callbacks.
# NOTE(review): emoji in the labels were mis-encoded in the captured source;
# they are restored from the surviving bytes where possible and chosen from
# context otherwise -- confirm against the deployed app.
with gr.Blocks(title="Voice Agent - Gradio MCP Hackathon", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🎤 Voice Agent with MCP

    **Hackathon Project**: Gradio Agents & MCP Hackathon

    This lightweight voice agent can:
    - 🗣️ Process voice input and respond with voice
    - 📅 Schedule calendar appointments
    - ❓ Answer general questions
    - 🔧 Uses MCP (Model Context Protocol) for processing

    ## Setup Instructions:
    1. Set `ELEVENLABS_API_KEY` environment variable for voice synthesis
    2. Set `GOOGLE_CALENDAR_CREDENTIALS` for calendar integration (optional)
    3. Try voice input or type your questions below!
    """)

    with gr.Tab("🎤 Voice Mode"):
        gr.Markdown("**Record your voice using the microphone button below**")
        with gr.Row():
            with gr.Column():
                audio_input = gr.Audio(
                    sources=["microphone"],
                    type="filepath",
                    label="🎙️ Click to record your voice",
                    format="wav"
                )
                voice_button = gr.Button("🚀 Process Voice Input", variant="primary", size="lg")
            with gr.Column():
                audio_output = gr.Audio(label="🔊 AI Voice Response")
                text_output = gr.Textbox(
                    label="📝 Conversation Log",
                    lines=8,
                    interactive=False,
                    placeholder="Your conversation will appear here..."
                )
        voice_button.click(
            fn=process_voice_input,
            inputs=[audio_input],
            outputs=[audio_output, text_output]
        )

    with gr.Tab("💬 Text Mode"):
        with gr.Row():
            with gr.Column():
                text_input = gr.Textbox(
                    label="Type your message",
                    placeholder="Ask me anything or request to schedule an appointment...",
                    lines=3
                )
                text_button = gr.Button("Send Message", variant="primary")
            with gr.Column():
                text_response = gr.Textbox(
                    label="AI Response",
                    lines=6,
                    interactive=False
                )
        text_button.click(
            fn=process_text_input,
            inputs=[text_input],
            outputs=[text_response]
        )

        # Quick action buttons: each sends a fixed phrase through the same
        # text pipeline and shows the reply in the response box.
        gr.Markdown("### Quick Actions:")
        with gr.Row():
            quick_hello = gr.Button("👋 Say Hello")
            quick_time = gr.Button("🕐 What time is it?")
            quick_appointment = gr.Button("📅 Schedule appointment tomorrow at 2pm")
        quick_hello.click(
            fn=lambda: process_text_input("hello"),
            outputs=[text_response]
        )
        quick_time.click(
            fn=lambda: process_text_input("what time is it"),
            outputs=[text_response]
        )
        quick_appointment.click(
            fn=lambda: process_text_input("schedule an appointment tomorrow at 2pm"),
            outputs=[text_response]
        )

    with gr.Tab("ℹ️ About"):
        gr.Markdown("""
        ## About This Project

        This is a hackathon submission for the **Gradio Agents & MCP Hackathon**.

        ### Features:
        - **Voice Input/Output**: Uses speech recognition and ElevenLabs TTS
        - **MCP Integration**: Implements Model Context Protocol for intelligent processing
        - **Calendar Management**: Can schedule appointments (demo mode)
        - **Lightweight**: Optimized for Hugging Face Spaces

        ### Technologies Used:
        - **Gradio**: For the web interface
        - **ElevenLabs**: For text-to-speech synthesis
        - **MCP**: For intelligent request processing
        - **Speech Recognition**: For voice-to-text conversion

        ### Environment Variables:
        - `ELEVENLABS_API_KEY`: Your ElevenLabs API key
        - `GOOGLE_CALENDAR_CREDENTIALS`: Google Calendar API credentials (optional)

        ### Example Interactions:
        - "Hello, how are you?"
        - "What time is it?"
        - "Schedule a doctor appointment for tomorrow at 3pm"
        - "Book a meeting with John next Monday"
        """)

if __name__ == "__main__":
    demo.launch()