|
import gradio as gr |
|
from rag import process_pdfs, retrieve_context |
|
from emotion import get_emotion_and_tone |
|
from llm import get_llm_response |
|
from tts_gemini import tts_gemini |
|
import whisper |
|
import time |
|
import csv |
|
import os |
|
|
|
|
|
# Load the Whisper speech-to-text model once at module import; "base" trades
# accuracy for speed. NOTE(review): this blocks app startup until the model
# is downloaded/loaded — confirm that is acceptable for deployment.
whisper_model = whisper.load_model("base")

# Per-request stage latencies are appended as rows to this CSV file.
LOG_DIR = "logs"

LOG_FILE = os.path.join(LOG_DIR, "latency_log.csv")
|
|
|
|
|
# Make sure the log directory exists, and seed the latency CSV with a
# header row the first time the app is started.
os.makedirs(LOG_DIR, exist_ok=True)

if not os.path.exists(LOG_FILE):
    header = [
        "Whisper STT (s)",
        "Document Retrieval (s)",
        "Sentiment Analysis (s)",
        "Response Gen (LLM) (s)",
        "TTS Synthesis (s)",
        "Total (s)",
    ]
    with open(LOG_FILE, mode="w", newline="") as log_fh:
        csv.writer(log_fh).writerow(header)
|
|
|
def _timed(fn, *args):
    """Call ``fn(*args)`` and return ``(result, elapsed_seconds)``."""
    start = time.time()
    result = fn(*args)
    return result, time.time() - start


def process_audio_with_rag(audio):
    """Run the full voice pipeline: STT -> retrieval -> sentiment -> LLM -> TTS.

    Args:
        audio: Filepath of the recorded clip from the Gradio audio input,
            or None when the recording is cleared.

    Returns:
        Tuple of (llm_output, emotion, transcript, context, tts_audio_path).
        Placeholder values ("", "", "", "", None) when ``audio`` is None.

    Side effects:
        Appends one row of per-stage latencies (seconds, 3 decimals) to
        LOG_FILE, matching the header written at startup.
    """
    # Gradio's change event also fires with None when the recording is
    # cleared; skip the pipeline instead of crashing inside Whisper.
    if audio is None:
        return "", "", "", "", None

    t0 = time.time()

    # Each stage is timed individually so the CSV log can attribute
    # end-to-end latency to its components.
    stt_result, stt_latency = _timed(whisper_model.transcribe, audio)
    text = stt_result["text"]

    context, retrieval_latency = _timed(retrieve_context, text)

    (emotion, tone_instruction), sentiment_latency = _timed(
        get_emotion_and_tone, text
    )

    llm_output, llm_latency = _timed(
        get_llm_response, text, context, emotion, tone_instruction
    )

    tts_path, tts_latency = _timed(tts_gemini, llm_output)

    total_latency = time.time() - t0

    with open(LOG_FILE, mode="a", newline="") as f:
        csv.writer(f).writerow([
            f"{stt_latency:.3f}",
            f"{retrieval_latency:.3f}",
            f"{sentiment_latency:.3f}",
            f"{llm_latency:.3f}",
            f"{tts_latency:.3f}",
            f"{total_latency:.3f}",
        ])

    return llm_output, emotion, text, context, tts_path
|
|
|
# --- Gradio UI -------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("""

    # Voice to LLM & Sentiment Analyzer with RAG (PDF Upload & Gemini TTS)

    """)

    # PDF ingestion: uploaded files are indexed for retrieval.
    with gr.Row():
        pdf_input = gr.Files(label="Upload PDF(s)", type="filepath")
        pdf_status = gr.Textbox(label="PDF Processing Status")
    pdf_input.upload(process_pdfs, inputs=pdf_input, outputs=pdf_status)

    # Voice query in; transcript, sentiment, context, answer and audio out.
    audio_input = gr.Audio(sources=["microphone"], type="filepath", label="Speak your query")
    llm_output = gr.Textbox(label="LLM Output")
    sentiment_output = gr.Textbox(label="Sentiment")
    transcript_output = gr.Textbox(label="Transcribed Text")
    context_output = gr.Textbox(label="Retrieved Context from PDFs")
    tts_output = gr.Audio(label="LLM Output (Gemini TTS)")

    # Run the full pipeline whenever a new recording lands.
    audio_input.change(
        process_audio_with_rag,
        inputs=audio_input,
        outputs=[
            llm_output,
            sentiment_output,
            transcript_output,
            context_output,
            tts_output,
        ],
    )