"""
Multilingual Voice Processing Tools

STT and TTS with language support: Whisper for speech-to-text,
edge-tts for text-to-speech with a gTTS fallback.
"""

import asyncio
import io
import json
from typing import Optional, Tuple, Union

import edge_tts
import numpy as np
import whisper
from crewai.tools import BaseTool
from gtts import gTTS


class MultilingualVoiceProcessor:
    """Handles multilingual STT and TTS."""

    def __init__(self):
        # "base" trades some accuracy for speed; use "small" or "medium"
        # when transcription quality matters more than latency.
        self.whisper_model = whisper.load_model("base")

        # Language code -> edge-tts neural voice short name.
        self.voice_map = {
            "en": "en-US-AriaNeural",
            "es": "es-ES-ElviraNeural",
            "fr": "fr-FR-DeniseNeural",
            "de": "de-DE-KatjaNeural",
            "it": "it-IT-ElsaNeural",
            "pt": "pt-BR-FranciscaNeural",
            "hi": "hi-IN-SwaraNeural",
            "zh": "zh-CN-XiaoxiaoNeural",
            "ja": "ja-JP-NanamiNeural",
            "ko": "ko-KR-SunHiNeural",
            "ar": "ar-SA-ZariyahNeural",
            "ru": "ru-RU-SvetlanaNeural",
        }

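    async def available_voices(self, language_prefix: str = "") -> list:
        """List edge-tts voice short names, optionally filtered by prefix.

        Convenience sketch, not part of the original module: it relies on
        edge_tts.list_voices(), which returns voice dicts keyed by
        "ShortName", "Locale", etc.
        """
        voices = await edge_tts.list_voices()
        return [
            v["ShortName"]
            for v in voices
            if v["ShortName"].startswith(language_prefix)
        ]
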
    async def transcribe(
        self,
        audio_data: Union[Tuple[int, np.ndarray], np.ndarray],
        language: Optional[str] = None
    ) -> Tuple[str, str]:
        """Transcribe audio to text, detecting the language if none is given."""
        try:
            # Gradio-style input arrives as a (sample_rate, samples) tuple.
            if isinstance(audio_data, tuple):
                sample_rate, audio = audio_data
            else:
                audio = audio_data
                sample_rate = 16000

            # Whisper expects mono float32 in [-1, 1] at 16 kHz; collapse
            # stereo and rescale int16 PCM. (Inputs at other sample rates
            # should be resampled to 16 kHz before transcription.)
            if audio.ndim > 1:
                audio = audio.mean(axis=1)
            if audio.dtype == np.int16:
                audio = audio.astype(np.float32) / 32768.0
            elif audio.dtype != np.float32:
                audio = audio.astype(np.float32)

            if language and language != "auto":
                result = self.whisper_model.transcribe(
                    audio,
                    language=language
                )
            else:
                # No language hint: let Whisper auto-detect.
                result = self.whisper_model.transcribe(audio)

            return result["text"], result["language"]

        except Exception as e:
            print(f"Transcription error: {e}")
            return "Could not transcribe audio", "en"

    async def synthesize(
        self,
        text: str,
        language: str = "en",
        voice_type: str = "normal"
    ) -> Optional[bytes]:
        """Convert text to MP3 speech with optional voice modulation."""
        try:
            voice = self.voice_map.get(language, "en-US-AriaNeural")

            # edge-tts takes rate/pitch as signed percentage/Hz strings;
            # meditation mode slows and lowers the voice.
            if voice_type == "meditation":
                rate = "-15%"
                pitch = "-50Hz"
            else:
                rate = "+0%"
                pitch = "+0Hz"

            communicate = edge_tts.Communicate(
                text,
                voice,
                rate=rate,
                pitch=pitch
            )

            # Collect the streamed MP3 chunks into a single buffer.
            audio_data = b""
            async for chunk in communicate.stream():
                if chunk["type"] == "audio":
                    audio_data += chunk["data"]

            return audio_data

        except Exception as e:
            print(f"TTS error: {e}")

            # Fall back to gTTS if edge-tts is unavailable.
            try:
                tts = gTTS(text=text, lang=language[:2])
                fp = io.BytesIO()
                tts.write_to_fp(fp)
                return fp.getvalue()
            except Exception:
                return None

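# Illustrative usage (not part of the original module): a minimal
# round-trip, assuming `pcm` is a 16 kHz mono int16 NumPy array:
#
#     processor = MultilingualVoiceProcessor()
#     text, lang = asyncio.run(processor.transcribe((16000, pcm)))
#     mp3_bytes = asyncio.run(processor.synthesize(text, language=lang))
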
class TranscribeTool(BaseTool):
    name: str = "transcribe_audio"
    description: str = "Transcribe audio input to text with language detection"

    def _run(self, audio_data: np.ndarray, language: Optional[str] = None) -> dict:
        # CrewAI tools run synchronously, so drive the async transcriber
        # with asyncio.run.
        processor = MultilingualVoiceProcessor()
        text, detected_lang = asyncio.run(
            processor.transcribe(audio_data, language)
        )
        return {
            "text": text,
            "language": detected_lang
        }

class DetectEmotionTool(BaseTool):
    name: str = "detect_emotion"
    description: str = "Detect emotional state from text using Mistral"

    def _run(self, text: str) -> dict:
        # Imported lazily so the model only loads when the tool is used.
        from models.mistral_model import MistralModel

        model = MistralModel()

        prompt = f"""
        Analyze the emotional state in this text: "{text}"

        Identify:
        1. Primary emotion (joy, sadness, anger, fear, anxiety, confusion, etc.)
        2. Emotional intensity (low, medium, high)
        3. Underlying feelings
        4. Key concerns

        Format as JSON with keys: primary_emotion, intensity, feelings, concerns
        """

        response = model.generate(prompt)

        # Parse the model's JSON reply; fall back to a neutral default
        # if the output is not valid JSON.
        try:
            return json.loads(response)
        except (json.JSONDecodeError, TypeError):
            return {
                "primary_emotion": "neutral",
                "intensity": "medium",
                "feelings": [],
                "concerns": []
            }

class GenerateQuestionsTool(BaseTool):
    name: str = "generate_reflective_questions"
    description: str = "Generate empathetic reflective questions"

    def _run(self, context: dict) -> list:
        emotion = context.get("primary_emotion", "neutral")

        questions_map = {
            "anxiety": [
                "What specific thoughts are creating this anxiety?",
                "What would feeling calm look like in this situation?",
                "What has helped you manage anxiety before?"
            ],
            "sadness": [
                "What would comfort mean to you right now?",
                "What are you grieving or missing?",
                "How can you be gentle with yourself today?"
            ],
            "confusion": [
                "What would clarity feel like?",
                "What's the main question you're grappling with?",
                "What does your intuition tell you?"
            ]
        }

        return questions_map.get(emotion, [
            "How are you feeling in this moment?",
            "What would support look like for you?",
            "What's most important to explore right now?"
        ])
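

if __name__ == "__main__":
    # Smoke test (an illustrative addition, not part of the tool API):
    # synthesize a short greeting and write the MP3 to disk. Requires
    # network access, since edge-tts calls a hosted service.
    async def _demo() -> None:
        processor = MultilingualVoiceProcessor()
        audio = await processor.synthesize("Hola, ¿cómo estás?", language="es")
        if audio:
            with open("demo_es.mp3", "wb") as f:
                f.write(audio)
            print(f"Wrote {len(audio)} bytes to demo_es.mp3")

    asyncio.run(_demo())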