# TatTwamAI/agents/tools/voice_tools.py
import asyncio
import json
import os
import tempfile
from typing import List, Optional

import numpy as np
import soundfile as sf
import torch
from crewai.tools import BaseTool
from pydantic import PrivateAttr
from transformers import pipeline

from models.tinygpt2_model import TinyGPT2Model


class MultilingualVoiceProcessor:
    def __init__(self, model_name="openai/whisper-base", device=None):
        # Honour a local Hugging Face cache if one is configured via TRANSFORMERS_CACHE.
        cache_dir = os.getenv("TRANSFORMERS_CACHE", None)
        if device is None:
            device = 0 if torch.cuda.is_available() else -1
        self.pipe = pipeline(
            "automatic-speech-recognition",
            model=model_name,
            device=device,
            model_kwargs={"cache_dir": cache_dir} if cache_dir else None,
            generate_kwargs={"task": "transcribe", "return_timestamps": False},
        )

    async def transcribe(self, audio_data: np.ndarray, language: str = None):
        # Whisper expects 16 kHz mono audio; write the buffer to a temporary WAV
        # file and let the pipeline handle feature extraction.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as tmp_wav:
            sf.write(tmp_wav.name, audio_data, samplerate=16000)
            # A language hint must go through generate_kwargs; the ASR pipeline
            # does not accept a bare `language=` call argument.
            extra = (
                {"generate_kwargs": {"task": "transcribe", "language": language}}
                if language
                else {}
            )
            result = self.pipe(tmp_wav.name, **extra)
            text = result["text"]
        return text, language or "unknown"

    async def synthesize(self, text, language: str = "en", voice_type: str = "normal"):
        raise NotImplementedError("Use gTTS or edge-tts as before.")


class TranscribeAudioTool(BaseTool):
    name: str = "transcribe_audio"
    description: str = "Transcribe audio to text and detect language."
    model_config = {"arbitrary_types_allowed": True}
    _vp: MultilingualVoiceProcessor = PrivateAttr()

    def __init__(self, config=None):
        super().__init__()
        self._vp = MultilingualVoiceProcessor()

    def _run(self, audio_data: List[float], language: Optional[str] = None):
        # Tool inputs arrive as plain lists; convert to the float32 array the
        # Whisper pipeline expects.
        audio_np = np.array(audio_data, dtype=np.float32)
        text, detected_lang = asyncio.run(self._vp.transcribe(audio_np, language))
        return {"text": text, "language": detected_lang}


class DetectEmotionTool(BaseTool):
    name: str = "detect_emotion"
    description: str = "Detect the emotional state from text."
    model_config = {"arbitrary_types_allowed": True}

    def __init__(self, config=None):
        super().__init__()

    def _run(self, text: str):
        model = TinyGPT2Model()
        prompt = f'Analyse emotions in: "{text}". Format: JSON with primary_emotion, intensity, feelings, concerns.'
        response = model.generate(prompt)
        # A small GPT-2 model rarely emits well-formed JSON, so fall back to a
        # neutral placeholder when the output cannot be parsed.
        fallback = {
            "primary_emotion": "detected_emotion",
            "intensity": "medium",
            "feelings": ["feeling1"],
            "concerns": ["concern1"],
        }
        try:
            parsed = json.loads(response)
            return parsed if isinstance(parsed, dict) else fallback
        except (TypeError, ValueError):
            return fallback


class GenerateReflectiveQuestionsTool(BaseTool):
    name: str = "generate_reflective_questions"
    description: str = "Generate reflective questions."
    model_config = {"arbitrary_types_allowed": True}

    def __init__(self, config=None):
        super().__init__()

    def _run(self, context: dict):
        emotion = context.get("primary_emotion", "neutral")
        questions_map = {
            "anxiety": ["What triggers your anxiety?", "How do you cope?"],
            "sadness": ["What helps when you feel sad?", "Who can you talk to?"],
        }
        return questions_map.get(emotion, [
            "How are you feeling?",
            "What feels important now?",
        ])


class VoiceTools:
    """Convenience container bundling the voice tools for agent registration."""

    def __init__(self, config=None):
        self.transcribe_audio = TranscribeAudioTool(config)
        self.detect_emotion = DetectEmotionTool(config)
        self.generate_reflective_questions = GenerateReflectiveQuestionsTool(config)
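

# --- Hedged usage sketch (not part of the original tool definitions) ----------
# A minimal smoke test for the tools above, assuming crewai, transformers,
# soundfile, and torch are installed and the Whisper weights can be downloaded.
# The one-second 440 Hz tone is a stand-in for real speech, so the transcription
# is expected to be empty or meaningless; the goal is only to exercise the code
# paths end to end. The _run methods are called directly, bypassing the crewai
# agent wrapper.
if __name__ == "__main__":
    tools = VoiceTools()

    # Synthetic 16 kHz mono buffer standing in for microphone input.
    sample_rate = 16000
    t = np.arange(sample_rate) / sample_rate
    fake_audio = (0.1 * np.sin(2 * np.pi * 440.0 * t)).astype(np.float32)

    transcript = tools.transcribe_audio._run(fake_audio.tolist(), language="en")
    print("transcription:", transcript)

    emotion = tools.detect_emotion._run("I feel overwhelmed by work lately.")
    print("emotion:", emotion)

    questions = tools.generate_reflective_questions._run(emotion)
    print("questions:", questions)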