import gradio as gr
from rag import process_pdfs, retrieve_context
from emotion import get_emotion_and_tone
from llm import get_llm_response
from tts_gemini import tts_gemini
import whisper
import time
import csv
import os

# Load Whisper model once at import time so every request reuses it.
whisper_model = whisper.load_model("base")

LOG_DIR = "logs"
LOG_FILE = os.path.join(LOG_DIR, "latency_log.csv")

# Ensure log directory exists and the CSV starts with a header row.
os.makedirs(LOG_DIR, exist_ok=True)
if not os.path.exists(LOG_FILE):
    with open(LOG_FILE, mode="w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow([
            "Whisper STT (s)", "Document Retrieval (s)", "Sentiment Analysis (s)",
            "Response Gen (LLM) (s)", "TTS Synthesis (s)", "Total (s)"
        ])


def _timed(fn, *args):
    """Call fn(*args) and return (result, elapsed_seconds).

    Uses time.perf_counter() — a monotonic clock — so measured intervals
    are immune to system wall-clock adjustments (NTP, DST, manual change).
    """
    start = time.perf_counter()
    result = fn(*args)
    return result, time.perf_counter() - start


def process_audio_with_rag(audio):
    """Run the full voice pipeline: STT -> retrieval -> sentiment -> LLM -> TTS.

    Parameters
    ----------
    audio : str | None
        Filepath of the recorded clip (the component uses type="filepath"),
        or None when the audio component is cleared.

    Returns
    -------
    tuple
        (llm_output, emotion, transcript, retrieved_context, tts_audio_path).
        All-empty placeholders when `audio` is None.

    Side effects
    ------------
    Appends one row of per-stage latencies (seconds, 3 decimals) to LOG_FILE.
    """
    # Guard: the `.change` event also fires when the recording is cleared,
    # in which case Gradio passes None and transcription would crash.
    if audio is None:
        return "", "", "", "", None

    total_start = time.perf_counter()

    # Speech-to-text via the preloaded Whisper model.
    result, stt_latency = _timed(whisper_model.transcribe, audio)
    text = result["text"]

    # RAG retrieval over the previously processed PDFs.
    context, retrieval_latency = _timed(retrieve_context, text)

    # Sentiment/emotion detection drives the tone of the LLM response.
    (emotion, tone_instruction), sentiment_latency = _timed(
        get_emotion_and_tone, text
    )

    llm_output, llm_latency = _timed(
        get_llm_response, text, context, emotion, tone_instruction
    )

    # Synthesize the LLM answer to audio; returns a playable file path.
    tts_path, tts_latency = _timed(tts_gemini, llm_output)

    total_latency = time.perf_counter() - total_start

    # Log latencies only (no user content) to the CSV.
    with open(LOG_FILE, mode="a", newline="") as f:
        writer = csv.writer(f)
        writer.writerow([
            f"{stt_latency:.3f}",
            f"{retrieval_latency:.3f}",
            f"{sentiment_latency:.3f}",
            f"{llm_latency:.3f}",
            f"{tts_latency:.3f}",
            f"{total_latency:.3f}",
        ])

    return llm_output, emotion, text, context, tts_path


demo = gr.Blocks()
with demo:
    gr.Markdown("""
    # Voice to LLM & Sentiment Analyzer with RAG (PDF Upload & Gemini TTS)
    """)
    with gr.Row():
        pdf_input = gr.Files(label="Upload PDF(s)", type="filepath")
        pdf_status = gr.Textbox(label="PDF Processing Status")
    pdf_input.upload(process_pdfs, inputs=pdf_input, outputs=pdf_status)

    audio_input = gr.Audio(
        sources=["microphone"], type="filepath", label="Speak your query"
    )
    llm_output = gr.Textbox(label="LLM Output")
    sentiment_output = gr.Textbox(label="Sentiment")
    transcript_output = gr.Textbox(label="Transcribed Text")
    context_output = gr.Textbox(label="Retrieved Context from PDFs")
    tts_output = gr.Audio(label="LLM Output (Gemini TTS)")

    audio_input.change(
        process_audio_with_rag,
        inputs=audio_input,
        outputs=[
            llm_output,
            sentiment_output,
            transcript_output,
            context_output,
            tts_output,
        ],
    )