TatTwamAI/agents/tools/voice_tools_old.py
"""
Multilingual Voice Processing Tools
STT and TTS with language support
"""
import whisper
import numpy as np
from gtts import gTTS
import edge_tts
import io
import asyncio
from typing import Tuple, Optional
from crewai.tools import BaseTool
import speech_recognition as sr


class MultilingualVoiceProcessor:
    """Handles multilingual STT and TTS"""

    def __init__(self):
        # Load Whisper model for multilingual STT
        self.whisper_model = whisper.load_model("base")

        # Language voice mappings for Edge TTS
        self.voice_map = {
            "en": "en-US-AriaNeural",
            "es": "es-ES-ElviraNeural",
            "fr": "fr-FR-DeniseNeural",
            "de": "de-DE-KatjaNeural",
            "it": "it-IT-ElsaNeural",
            "pt": "pt-BR-FranciscaNeural",
            "hi": "hi-IN-SwaraNeural",
            "zh": "zh-CN-XiaoxiaoNeural",
            "ja": "ja-JP-NanamiNeural",
            "ko": "ko-KR-SunHiNeural",
            "ar": "ar-SA-ZariyahNeural",
            "ru": "ru-RU-SvetlanaNeural",
        }

    async def transcribe(
        self,
        audio_data: np.ndarray,
        language: Optional[str] = None
    ) -> Tuple[str, str]:
        """Transcribe audio to text with language detection"""
        try:
            # Process audio: accept either a raw array or a (sample_rate, samples) tuple
            if isinstance(audio_data, tuple):
                sample_rate, audio = audio_data
            else:
                audio = audio_data
                sample_rate = 16000

            # Normalize 16-bit PCM to float32 in [-1, 1], which Whisper expects.
            # Note: no resampling is done here; the audio is assumed to be 16 kHz mono.
            if audio.dtype != np.float32:
                audio = audio.astype(np.float32) / 32768.0

            # Transcribe with Whisper
            if language and language != "auto":
                result = self.whisper_model.transcribe(
                    audio,
                    language=language
                )
            else:
                # Auto-detect language
                result = self.whisper_model.transcribe(audio)

            text = result["text"]
            detected_language = result["language"]
            return text, detected_language

        except Exception as e:
            print(f"Transcription error: {e}")
            return "Could not transcribe audio", "en"

    async def synthesize(
        self,
        text: str,
        language: str = "en",
        voice_type: str = "normal"
    ) -> Optional[bytes]:
        """Convert text to speech with voice modulation"""
        try:
            voice = self.voice_map.get(language, "en-US-AriaNeural")

            # Apply voice settings for meditation tone
            if voice_type == "meditation":
                rate = "-15%"    # Slower
                pitch = "-50Hz"  # Lower pitch
            else:
                rate = "+0%"
                pitch = "+0Hz"

            # Generate speech with Edge TTS
            communicate = edge_tts.Communicate(
                text,
                voice,
                rate=rate,
                pitch=pitch
            )

            audio_data = b""
            async for chunk in communicate.stream():
                if chunk["type"] == "audio":
                    audio_data += chunk["data"]
            return audio_data

        except Exception as e:
            print(f"TTS error: {e}")
            # Fallback to gTTS (MP3 bytes; no rate/pitch control)
            try:
                tts = gTTS(text=text, lang=language[:2])
                fp = io.BytesIO()
                tts.write_to_fp(fp)
                return fp.getvalue()
            except Exception:
                return None
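

# Illustrative sketch (not part of the original tool surface): one way to exercise the
# processor end to end. Assumes ~16 kHz mono float32 samples and network access for
# Edge TTS; the helper name and the "meditation" voice choice are examples only.
async def _demo_voice_roundtrip(samples: np.ndarray) -> Optional[bytes]:
    """Transcribe a clip, then speak the transcript back in the detected language."""
    processor = MultilingualVoiceProcessor()
    text, detected_lang = await processor.transcribe(samples, language="auto")
    return await processor.synthesize(text, language=detected_lang, voice_type="meditation")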


class TranscribeTool(BaseTool):
    name: str = "transcribe_audio"
    description: str = "Transcribe audio input to text with language detection"

    def _run(self, audio_data: np.ndarray, language: Optional[str] = None) -> dict:
        processor = MultilingualVoiceProcessor()
        text, detected_lang = asyncio.run(
            processor.transcribe(audio_data, language)
        )
        return {
            "text": text,
            "language": detected_lang
        }
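
# Hypothetical invocation (illustrative only): one second of 16 kHz silence will load the
# Whisper model and return an empty or near-empty transcript; real audio is needed for a
# meaningful result.
# TranscribeTool()._run(np.zeros(16000, dtype=np.float32), language="en")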


class DetectEmotionTool(BaseTool):
    name: str = "detect_emotion"
    description: str = "Detect emotional state from text using Mistral"

    def _run(self, text: str) -> dict:
        # Use Mistral for emotion detection
        from models.mistral_model import MistralModel
        model = MistralModel()

        prompt = f"""
        Analyze the emotional state in this text: "{text}"

        Identify:
        1. Primary emotion (joy, sadness, anger, fear, anxiety, confusion, etc.)
        2. Emotional intensity (low, medium, high)
        3. Underlying feelings
        4. Key concerns

        Format as JSON with keys: primary_emotion, intensity, feelings, concerns
        """
        response = model.generate(prompt)

        # Parse the model's JSON response; fall back to neutral defaults if parsing fails
        import json
        try:
            parsed = json.loads(response[response.index("{"):response.rindex("}") + 1])
            return {
                "primary_emotion": parsed.get("primary_emotion", "neutral"),
                "intensity": parsed.get("intensity", "medium"),
                "feelings": parsed.get("feelings", []),
                "concerns": parsed.get("concerns", []),
            }
        except (ValueError, AttributeError, TypeError):
            return {
                "primary_emotion": "neutral",
                "intensity": "medium",
                "feelings": [],
                "concerns": [],
            }


class GenerateQuestionsTool(BaseTool):
    name: str = "generate_reflective_questions"
    description: str = "Generate empathetic reflective questions"

    def _run(self, context: dict) -> list:
        emotion = context.get("primary_emotion", "neutral")

        questions_map = {
            "anxiety": [
                "What specific thoughts are creating this anxiety?",
                "What would feeling calm look like in this situation?",
                "What has helped you manage anxiety before?"
            ],
            "sadness": [
                "What would comfort mean to you right now?",
                "What are you grieving or missing?",
                "How can you be gentle with yourself today?"
            ],
            "confusion": [
                "What would clarity feel like?",
                "What's the main question you're grappling with?",
                "What does your intuition tell you?"
            ]
        }

        return questions_map.get(emotion, [
            "How are you feeling in this moment?",
            "What would support look like for you?",
            "What's most important to explore right now?"
        ])
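

# Minimal usage sketch, assuming the tools are exercised directly via their _run methods
# (outside a CrewAI agent); the example emotion context below is illustrative only.
if __name__ == "__main__":
    sample_context = {"primary_emotion": "anxiety", "intensity": "medium"}
    for question in GenerateQuestionsTool()._run(sample_context):
        print(question)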