|
import gradio as gr |
|
from rag import process_pdfs, retrieve_context |
|
from emotion import get_emotion_and_tone |
|
from llm import get_llm_response |
|
from tts_gemini import tts_gemini |
|
import whisper |
|
import time |
|
import csv |
|
import os |
|
|
|
|
|
# Load the Whisper speech-to-text model once at module import; "base" trades
# accuracy for speed. NOTE(review): this blocks app startup until the model
# is downloaded/loaded — confirm that is acceptable for deployment.
whisper_model = whisper.load_model("base")

# Per-request stage latencies are appended as rows to this CSV file.
LOG_DIR = "logs"

LOG_FILE = os.path.join(LOG_DIR, "latency_log.csv")
|
|
|
|
|
# Make sure the log directory exists, and seed the latency CSV with a
# header row the first time the app is started.
os.makedirs(LOG_DIR, exist_ok=True)

if not os.path.exists(LOG_FILE):
    header = [
        "Whisper STT (s)",
        "Document Retrieval (s)",
        "Sentiment Analysis (s)",
        "Response Gen (LLM) (s)",
        "TTS Synthesis (s)",
        "Total (s)",
    ]
    with open(LOG_FILE, mode="w", newline="") as log_fh:
        csv.writer(log_fh).writerow(header)
|
|
|
def _timed(fn, *args):
    """Call ``fn(*args)`` and return ``(result, elapsed_seconds)``."""
    start = time.time()
    result = fn(*args)
    return result, time.time() - start


def process_audio_with_rag(audio):
    """Run the full voice pipeline: STT -> retrieval -> sentiment -> LLM -> TTS.

    Args:
        audio: Filepath of the recorded clip from the Gradio audio input,
            or None when the recording is cleared.

    Returns:
        Tuple of (llm_output, emotion, transcript, context, tts_audio_path).
        Placeholder values ("", "", "", "", None) when ``audio`` is None.

    Side effects:
        Appends one row of per-stage latencies (seconds, 3 decimals) to
        LOG_FILE, matching the header written at startup.
    """
    # Gradio's change event also fires with None when the recording is
    # cleared; skip the pipeline instead of crashing inside Whisper.
    if audio is None:
        return "", "", "", "", None

    t0 = time.time()

    # Each stage is timed individually so the CSV log can attribute
    # end-to-end latency to its components.
    stt_result, stt_latency = _timed(whisper_model.transcribe, audio)
    text = stt_result["text"]

    context, retrieval_latency = _timed(retrieve_context, text)

    (emotion, tone_instruction), sentiment_latency = _timed(
        get_emotion_and_tone, text
    )

    llm_output, llm_latency = _timed(
        get_llm_response, text, context, emotion, tone_instruction
    )

    tts_path, tts_latency = _timed(tts_gemini, llm_output)

    total_latency = time.time() - t0

    with open(LOG_FILE, mode="a", newline="") as f:
        csv.writer(f).writerow([
            f"{stt_latency:.3f}",
            f"{retrieval_latency:.3f}",
            f"{sentiment_latency:.3f}",
            f"{llm_latency:.3f}",
            f"{tts_latency:.3f}",
            f"{total_latency:.3f}",
        ])

    return llm_output, emotion, text, context, tts_path
|
|
|
# --- Gradio UI -------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("""

    # Voice to LLM & Sentiment Analyzer with RAG (PDF Upload & Gemini TTS)

    """)

    # PDF ingestion: uploaded files are indexed for retrieval.
    with gr.Row():
        pdf_input = gr.Files(label="Upload PDF(s)", type="filepath")
        pdf_status = gr.Textbox(label="PDF Processing Status")
    pdf_input.upload(process_pdfs, inputs=pdf_input, outputs=pdf_status)

    # Voice query in; transcript, sentiment, context, answer and audio out.
    audio_input = gr.Audio(sources=["microphone"], type="filepath", label="Speak your query")
    llm_output = gr.Textbox(label="LLM Output")
    sentiment_output = gr.Textbox(label="Sentiment")
    transcript_output = gr.Textbox(label="Transcribed Text")
    context_output = gr.Textbox(label="Retrieved Context from PDFs")
    tts_output = gr.Audio(label="LLM Output (Gemini TTS)")

    # Run the full pipeline whenever a new recording lands.
    audio_input.change(
        process_audio_with_rag,
        inputs=audio_input,
        outputs=[
            llm_output,
            sentiment_output,
            transcript_output,
            context_output,
            tts_output,
        ],
    )