import asyncio
import tempfile

import gradio as gr
import pandas as pd
import numpy as np
import soxr
import torch
import sentencepiece as spm
import onnxruntime as ort
import edge_tts
from pydub import AudioSegment
from huggingface_hub import hf_hub_download


# Load Menu Data
def load_menu():
    menu_file = "menu.xlsx"
    try:
        return pd.read_excel(menu_file)
    except Exception as e:
        raise ValueError(f"Error loading menu file: {e}")


# Filter Menu Items
def filter_menu(preference):
    menu_data = load_menu()

    if preference == "Halal/Non-Veg":
        filtered_data = menu_data[menu_data["Ingredients"].str.contains(
            "Chicken|Mutton|Fish|Prawns|Goat", case=False, na=False)]
    elif preference == "Vegetarian":
        filtered_data = menu_data[~menu_data["Ingredients"].str.contains(
            "Chicken|Mutton|Fish|Prawns|Goat", case=False, na=False)]
    elif preference == "Guilt-Free":
        # Keep dishes whose description lists at most 10g of fat
        filtered_data = menu_data[menu_data["Description"].str.contains(
            r"Fat: ([0-9]|10)g", case=False, na=False)]
    else:
        filtered_data = menu_data

    # Prepare dynamic HTML for the menu
    menu_html = ""
    for _, item in filtered_data.iterrows():
        menu_html += f"""
        <div>
            <h3>{item['Dish Name']}</h3>
            <p><b>Price:</b> ${item['Price ($)']}</p>
            <p><b>Description:</b> {item['Description']}</p>
        </div>
        """
    return menu_html
# Speech Recognition Model Configuration
model_name = "neongeckocom/stt_en_citrinet_512_gamma_0_25"
sample_rate = 16000

preprocessor = torch.jit.load(hf_hub_download(model_name, "preprocessor.ts", subfolder="onnx"))
encoder = ort.InferenceSession(hf_hub_download(model_name, "model.onnx", subfolder="onnx"))
tokenizer = spm.SentencePieceProcessor(hf_hub_download(model_name, "tokenizer.spm", subfolder="onnx"))


async def respond(audio_path, preference):
    # Transcribe audio to text
    transcription = transcribe(audio_path)

    # Voice-based interaction logic
    if "vegetarian" in transcription.lower():
        preference = "Vegetarian"
    elif "non-veg" in transcription.lower() or "halal" in transcription.lower():
        preference = "Halal/Non-Veg"
    elif "guilt-free" in transcription.lower():
        preference = "Guilt-Free"
    elif "menu details" in transcription.lower():
        preference = "All"

    # Filter menu based on preference
    menu_html = filter_menu(preference)

    # Text-to-Speech Response
    reply = (f"Here are some {preference} dishes available."
             if preference != "All"
             else "Here are all the menu details available.")
    communicate = edge_tts.Communicate(reply)
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
        tmp_path = tmp_file.name
        await communicate.save(tmp_path)

    return tmp_path, menu_html


def transcribe(audio_path):
    # Decode the recording and convert samples to float32 in [-1, 1]
    audio_file = AudioSegment.from_file(audio_path)
    sr = audio_file.frame_rate
    audio_buffer = np.array(audio_file.get_array_of_samples())
    audio_fp32 = np.divide(audio_buffer, np.iinfo(audio_buffer.dtype).max, dtype=np.float32)

    # Resample to the 16 kHz rate expected by the model
    audio_16k = soxr.resample(audio_fp32, sr, sample_rate)

    # Feature extraction with the TorchScript preprocessor
    input_signal = torch.tensor(audio_16k).unsqueeze(0)
    length = torch.tensor(len(audio_16k)).unsqueeze(0)
    processed_signal, _ = preprocessor.forward(input_signal=input_signal, length=length)

    # ONNX encoder forward pass, then greedy CTC decoding (drop blank tokens)
    logits = encoder.run(None, {"audio_signal": processed_signal.numpy(), "length": length.numpy()})[0][0]
    blank_id = tokenizer.vocab_size()
    decoded_prediction = [p for p in logits.argmax(axis=1).tolist() if p != blank_id]
    text = tokenizer.decode_ids(decoded_prediction)

    return text


with gr.Blocks() as demo:
    with gr.Row():
        gr.Markdown("""
        Microphone Icon
        """)

    audio_input = gr.Audio(label="Speak your preference", source="microphone", type="filepath")
    preference = gr.Textbox(label="Current Preference", value="All")
    audio_output = gr.Audio(label="Assistant Response", autoplay=True)
    menu_output = gr.HTML(label="Menu Suggestions")

    audio_input.change(respond, inputs=[audio_input, preference], outputs=[audio_output, menu_output])

if __name__ == "__main__":
    demo.queue().launch()