import gradio as gr
import pandas as pd
import edge_tts
import asyncio
import tempfile
import numpy as np
import soxr
from pydub import AudioSegment
import torch
import sentencepiece as spm
import onnxruntime as ort
from huggingface_hub import hf_hub_download
# Load Menu Data
def load_menu():
menu_file = "menu.xlsx"
try:
return pd.read_excel(menu_file)
except Exception as e:
raise ValueError(f"Error loading menu file: {e}")
# Filter Menu Items
def filter_menu(preference):
menu_data = load_menu()
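    # Dishes are classified by scanning the Ingredients column for meat/seafood keywords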
if preference == "Halal/Non-Veg":
filtered_data = menu_data[menu_data["Ingredients"].str.contains("Chicken|Mutton|Fish|Prawns|Goat", case=False, na=False)]
elif preference == "Vegetarian":
filtered_data = menu_data[~menu_data["Ingredients"].str.contains("Chicken|Mutton|Fish|Prawns|Goat", case=False, na=False)]
elif preference == "Guilt-Free":
filtered_data = menu_data[menu_data["Description"].str.contains(r"Fat: ([0-9]|10)g", case=False, na=False)]
else:
filtered_data = menu_data
menu_html = """" # Prepare dynamic HTML for the menu
for _, item in filtered_data.iterrows():
menu_html += f"""
{item['Dish Name']}
Price: ${item['Price ($)']}
Description: {item['Description']}
"""
return menu_html
# Speech Recognition Model Configuration
model_name = "neongeckocom/stt_en_citrinet_512_gamma_0_25"
sample_rate = 16000
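# The STT checkpoint ships a TorchScript preprocessor, an ONNX-exported encoder,
# and a SentencePiece tokenizer, all stored under the repo's "onnx" subfolder.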
preprocessor = torch.jit.load(hf_hub_download(model_name, "preprocessor.ts", subfolder="onnx"))
encoder = ort.InferenceSession(hf_hub_download(model_name, "model.onnx", subfolder="onnx"))
tokenizer = spm.SentencePieceProcessor(hf_hub_download(model_name, "tokenizer.spm", subfolder="onnx"))
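# Handle one voice interaction: transcribe the request, update the preference,
# build the menu HTML, and synthesize a spoken reply.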
async def respond(audio_path, preference):
    # Ignore change events fired when the recording is cleared
    if audio_path is None:
        return None, filter_menu(preference)
    # Transcribe audio to text
    transcription = transcribe(audio_path)
# Voice-based interaction logic
if "vegetarian" in transcription.lower():
preference = "Vegetarian"
elif "non-veg" in transcription.lower() or "halal" in transcription.lower():
preference = "Halal/Non-Veg"
elif "guilt-free" in transcription.lower():
preference = "Guilt-Free"
elif "menu details" in transcription.lower():
preference = "All"
# Filter menu based on preference
menu_html = filter_menu(preference)
# Text-to-Speech Response
reply = f"Here are some {preference} dishes available." if preference != "All" else "Here are all the menu details available."
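    # Synthesize the reply with the default edge-tts voice (the output stream is MP3-encoded)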
communicate = edge_tts.Communicate(reply)
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
tmp_path = tmp_file.name
await communicate.save(tmp_path)
return tmp_path, menu_html
def transcribe(audio_path):
audio_file = AudioSegment.from_file(audio_path)
sr = audio_file.frame_rate
audio_buffer = np.array(audio_file.get_array_of_samples())
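    # Scale integer PCM samples to float32 in [-1.0, 1.0], then resample to the model's 16 kHz rate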
audio_fp32 = np.divide(audio_buffer, np.iinfo(audio_buffer.dtype).max, dtype=np.float32)
audio_16k = soxr.resample(audio_fp32, sr, sample_rate)
input_signal = torch.tensor(audio_16k).unsqueeze(0)
length = torch.tensor(len(audio_16k)).unsqueeze(0)
processed_signal, _ = preprocessor.forward(input_signal=input_signal, length=length)
logits = encoder.run(None, {'audio_signal': processed_signal.numpy(), 'length': length.numpy()})[0][0]
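    # Greedy decoding over the CTC output: drop blank tokens (the blank id is the last vocabulary index)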
blank_id = tokenizer.vocab_size()
decoded_prediction = [p for p in logits.argmax(axis=1).tolist() if p != blank_id]
text = tokenizer.decode_ids(decoded_prediction)
return text
with gr.Blocks() as demo:
with gr.Row():
gr.Markdown("""
""")
audio_input = gr.Audio(label="Speak your preference", source="microphone", type="filepath")
preference = gr.Textbox(label="Current Preference", value="All")
audio_output = gr.Audio(label="Assistant Response", autoplay=True)
menu_output = gr.HTML(label="Menu Suggestions")
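    # Re-run the assistant whenever a new recording arrives; the textbox value serves as the fallback preference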
audio_input.change(respond, inputs=[audio_input, preference], outputs=[audio_output, menu_output])
if __name__ == "__main__":
demo.queue().launch()