# Gradio app: voice query -> Whisper STT -> RAG retrieval -> sentiment analysis -> LLM -> Gemini TTS.
import gradio as gr
from rag import process_pdfs, retrieve_context
from emotion import get_emotion_and_tone
from llm import get_llm_response
from tts_gemini import tts_gemini
import whisper
import time
import csv
import os
# Load Whisper model once
# Loaded at import time so every request reuses the same model instance.
whisper_model = whisper.load_model("base")
LOG_DIR = "logs"
LOG_FILE = os.path.join(LOG_DIR, "latency_log.csv")
# Ensure log directory and CSV header
os.makedirs(LOG_DIR, exist_ok=True)
if not os.path.exists(LOG_FILE):
    # First run only: write the header row; subsequent runs append data rows.
    with open(LOG_FILE, mode="w", newline="") as f:
        writer = csv.writer(f)
        # Column order must match the row written in process_audio_with_rag.
        writer.writerow([
            "Whisper STT (s)", "Document Retrieval (s)", "Sentiment Analysis (s)", "Response Gen (LLM) (s)", "TTS Synthesis (s)", "Total (s)"
        ])
def process_audio_with_rag(audio):
    """Run the full voice pipeline: STT -> retrieval -> sentiment -> LLM -> TTS.

    Parameters
    ----------
    audio : str
        Filepath of the recorded clip (Gradio ``type="filepath"``).

    Returns
    -------
    tuple
        ``(llm_output, emotion, transcript, retrieved_context, tts_audio_path)``.

    Side effect: appends one per-stage latency row to ``LOG_FILE``.
    """
    def _timed(fn, *args):
        # Run fn(*args) and return (result, elapsed seconds).
        # perf_counter is monotonic, so intervals are immune to wall-clock
        # jumps (NTP adjustments, DST) that time.time() is subject to.
        start = time.perf_counter()
        result = fn(*args)
        return result, time.perf_counter() - start

    total_start = time.perf_counter()

    stt_result, stt_latency = _timed(whisper_model.transcribe, audio)
    text = stt_result["text"]
    context, retrieval_latency = _timed(retrieve_context, text)
    (emotion, tone_instruction), sentiment_latency = _timed(get_emotion_and_tone, text)
    llm_output, llm_latency = _timed(
        get_llm_response, text, context, emotion, tone_instruction
    )
    tts_path, tts_latency = _timed(tts_gemini, llm_output)

    total_latency = time.perf_counter() - total_start

    # Log to CSV (latency only); column order matches the header written at startup.
    with open(LOG_FILE, mode="a", newline="") as f:
        csv.writer(f).writerow([
            f"{stt_latency:.3f}",
            f"{retrieval_latency:.3f}",
            f"{sentiment_latency:.3f}",
            f"{llm_latency:.3f}",
            f"{tts_latency:.3f}",
            f"{total_latency:.3f}",
        ])

    return llm_output, emotion, text, context, tts_path
# Assemble the Gradio UI: PDF ingestion row, microphone input, result panels.
with gr.Blocks() as demo:
    gr.Markdown("""
# Voice to LLM & Sentiment Analyzer with RAG (PDF Upload & Gemini TTS)
""")
    with gr.Row():
        pdf_files = gr.Files(label="Upload PDF(s)", type="filepath")
        pdf_state = gr.Textbox(label="PDF Processing Status")
    # Index uploaded PDFs as soon as they arrive.
    pdf_files.upload(process_pdfs, inputs=pdf_files, outputs=pdf_state)

    mic = gr.Audio(sources=["microphone"], type="filepath", label="Speak your query")
    answer_box = gr.Textbox(label="LLM Output")
    emotion_box = gr.Textbox(label="Sentiment")
    transcript_box = gr.Textbox(label="Transcribed Text")
    context_box = gr.Textbox(label="Retrieved Context from PDFs")
    speech_out = gr.Audio(label="LLM Output (Gemini TTS)")

    # Run the full pipeline whenever a new recording lands.
    mic.change(
        process_audio_with_rag,
        inputs=mic,
        outputs=[answer_box, emotion_box, transcript_box, context_box, speech_out],
    )