import gradio as gr
from rag import process_pdfs, retrieve_context
from emotion import get_emotion_and_tone
from llm import get_llm_response
from tts_gemini import tts_gemini
import whisper
import time
import csv
import os

# Load Whisper model once
whisper_model = whisper.load_model("base")

LOG_DIR = "logs"
LOG_FILE = os.path.join(LOG_DIR, "latency_log.csv")

# Ensure log directory and CSV header
os.makedirs(LOG_DIR, exist_ok=True)
if not os.path.exists(LOG_FILE):
    with open(LOG_FILE, mode="w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow([
            "Whisper STT (s)", "Document Retrieval (s)", "Sentiment Analysis (s)", "Response Gen (LLM) (s)", "TTS Synthesis (s)", "Total (s)"
        ])

def process_audio_with_rag(audio):
    """Run the voice pipeline (STT -> retrieval -> sentiment -> LLM -> TTS) and log per-stage latency."""
    # Gradio fires `change` with None when the recording is cleared; skip processing in that case.
    if audio is None:
        return "", "", "", "", None
    t0 = time.time()
    # Speech-to-text with Whisper
    stt_start = time.time()
    result = whisper_model.transcribe(audio)
    text = result["text"]
    stt_end = time.time()
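    # Retrieve relevant context from the uploaded PDFs for the transcribed query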
    retrieval_start = time.time()
    context = retrieve_context(text)
    retrieval_end = time.time()
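    # Detect the speaker's emotion and derive a tone instruction for the response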
    sentiment_start = time.time()
    emotion, tone_instruction = get_emotion_and_tone(text)
    sentiment_end = time.time()
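    # Generate the response with the LLM, conditioned on context, emotion, and tone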
    llm_start = time.time()
    llm_output = get_llm_response(text, context, emotion, tone_instruction)
    llm_end = time.time()
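    # Synthesize speech for the response with Gemini TTS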
    tts_start = time.time()
    tts_path = tts_gemini(llm_output)
    tts_end = time.time()
    t1 = time.time()
    stt_latency = stt_end - stt_start
    retrieval_latency = retrieval_end - retrieval_start
    sentiment_latency = sentiment_end - sentiment_start
    llm_latency = llm_end - llm_start
    tts_latency = tts_end - tts_start
    total_latency = t1 - t0
    # Log to CSV (latency only)
    with open(LOG_FILE, mode="a", newline="") as f:
        writer = csv.writer(f)
        writer.writerow([
            f"{stt_latency:.3f}",
            f"{retrieval_latency:.3f}",
            f"{sentiment_latency:.3f}",
            f"{llm_latency:.3f}",
            f"{tts_latency:.3f}",
            f"{total_latency:.3f}"
        ])
    return llm_output, emotion, text, context, tts_path

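# Gradio UI: PDF upload for RAG indexing, microphone input, and text/audio outputs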
demo = gr.Blocks()
with demo:
    gr.Markdown("""
    # Voice to LLM & Sentiment Analyzer with RAG (PDF Upload & Gemini TTS)
    """)
    with gr.Row():
        pdf_input = gr.Files(label="Upload PDF(s)", type="filepath")
        pdf_status = gr.Textbox(label="PDF Processing Status")
    pdf_input.upload(process_pdfs, inputs=pdf_input, outputs=pdf_status)
    audio_input = gr.Audio(sources=["microphone"], type="filepath", label="Speak your query")
    llm_output = gr.Textbox(label="LLM Output")
    sentiment_output = gr.Textbox(label="Sentiment")
    transcript_output = gr.Textbox(label="Transcribed Text")
    context_output = gr.Textbox(label="Retrieved Context from PDFs")
    tts_output = gr.Audio(label="LLM Output (Gemini TTS)")
    audio_input.change(
        process_audio_with_rag,
        inputs=audio_input,
        outputs=[llm_output, sentiment_output, transcript_output, context_output, tts_output]
    )
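
# Launch the app when executed directly (assumed entry point; not needed if served via `gradio app.py`)
if __name__ == "__main__":
    demo.launch()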