File size: 4,399 Bytes
0778417
16a0699
e1ee033
 
 
 
 
 
 
 
 
 
 
 
16a0699
e1ee033
16a0699
 
 
 
 
e1ee033
16a0699
 
 
 
 
 
 
 
 
 
 
e1ee033
16a0699
e1ee033
 
 
 
 
0778417
 
e1ee033
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16a0699
 
e1ee033
 
 
 
16a0699
e1ee033
16a0699
 
e1ee033
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import gradio as gr
import pandas as pd
import edge_tts
import asyncio
import tempfile
import numpy as np
import soxr
from pydub import AudioSegment
import torch
import sentencepiece as spm
import onnxruntime as ort
from huggingface_hub import hf_hub_download

# Load Menu Data
def load_menu():
    """Read the restaurant menu from ``menu.xlsx`` into a DataFrame.

    The file name is relative to the current working directory.

    Returns:
        pandas.DataFrame: The spreadsheet contents as parsed by
        ``pd.read_excel``.

    Raises:
        ValueError: If the file cannot be read for any reason. The
            underlying exception is attached as ``__cause__``.
    """
    menu_file = "menu.xlsx"
    try:
        return pd.read_excel(menu_file)
    except Exception as e:
        # Chain explicitly so the real I/O or parser error stays attached
        # to the ValueError that callers see.
        raise ValueError(f"Error loading menu file: {e}") from e

# Filter Menu Items
def filter_menu(preference):
    """Render the menu as HTML, restricted to one dietary preference.

    Args:
        preference: ``"Halal/Non-Veg"`` keeps dishes whose ``Ingredients``
            mention chicken, mutton, fish, prawns or goat; ``"Vegetarian"``
            keeps the dishes that mention none of them; ``"Guilt-Free"``
            keeps dishes whose ``Description`` contains ``Fat: Ng`` with N
            from 0 to 10. Any other value returns the whole menu.

    Returns:
        str: One ``<div>`` per dish with its name, price and description,
        or an empty string when no dish matches.

    Raises:
        ValueError: Propagated from ``load_menu`` when the menu file cannot
            be read.
    """
    menu_data = load_menu()
    # Ingredient keywords that mark a dish as non-vegetarian; matched
    # case-insensitively, and rows with a missing value never match.
    meat_pattern = "Chicken|Mutton|Fish|Prawns|Goat"

    if preference == "Halal/Non-Veg":
        has_meat = menu_data["Ingredients"].str.contains(meat_pattern, case=False, na=False)
        filtered_data = menu_data[has_meat]
    elif preference == "Vegetarian":
        has_meat = menu_data["Ingredients"].str.contains(meat_pattern, case=False, na=False)
        filtered_data = menu_data[~has_meat]
    elif preference == "Guilt-Free":
        # Non-capturing group: the match is only used as a boolean mask, and
        # pandas warns when a str.contains pattern has capture groups.
        low_fat = menu_data["Description"].str.contains(r"Fat: (?:[0-9]|10)g", case=False, na=False)
        filtered_data = menu_data[low_fat]
    else:
        filtered_data = menu_data

    # Join the per-dish fragments onto a plain empty string; the template is
    # kept whitespace-for-whitespace so the rendered markup is unchanged.
    return "".join(
        f"""
        <div>
            <h3>{item['Dish Name']}</h3>
            <p>Price: ${item['Price ($)']}</p>
            <p>Description: {item['Description']}</p>
        </div>
        """
        for _, item in filtered_data.iterrows()
    )

# Speech Recognition Model Configuration
# Hub repository that the three model artifacts below are downloaded from.
model_name = "neongeckocom/stt_en_citrinet_512_gamma_0_25"
# Rate (Hz) that recorded audio is resampled to before it reaches the model.
sample_rate = 16000

# Each artifact lives in the repository's "onnx" subfolder and is downloaded
# and loaded at import time.
_preprocessor_file = hf_hub_download(model_name, "preprocessor.ts", subfolder="onnx")
preprocessor = torch.jit.load(_preprocessor_file)

_encoder_file = hf_hub_download(model_name, "model.onnx", subfolder="onnx")
encoder = ort.InferenceSession(_encoder_file)

_tokenizer_file = hf_hub_download(model_name, "tokenizer.spm", subfolder="onnx")
tokenizer = spm.SentencePieceProcessor(_tokenizer_file)

async def respond(audio_path, preference):
    """Handle one voice request: pick a preference, build the menu, speak a reply.

    Args:
        audio_path: Path of the recorded audio file, or ``None`` when there
            is no recording (for example after it was cleared).
        preference: Preference to fall back on when the speech contains no
            recognised keyword.

    Returns:
        tuple: ``(reply_audio_path, menu_html)``. ``reply_audio_path`` is
        ``None`` when there was no audio to respond to.
    """
    # With no recording there is nothing to transcribe; just render the menu
    # for the current preference instead of failing inside transcribe().
    if not audio_path:
        return None, filter_menu(preference)

    # Transcribe audio to text and derive the preference from it
    transcription = transcribe(audio_path)
    preference = _preference_from_speech(transcription, preference)

    # Filter menu based on preference
    menu_html = filter_menu(preference)

    # Text-to-Speech Response
    reply = f"Here are some {preference} dishes available." if preference != "All" else "Here are all the menu details available."
    # Only reserve a path here and close the handle before edge-tts writes:
    # a named temporary file cannot be reopened by name on Windows while it
    # is still open. The suffix is .mp3 because edge-tts emits MP3 data.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name
    await edge_tts.Communicate(reply).save(tmp_path)

    return tmp_path, menu_html


def _preference_from_speech(transcription, current):
    """Map recognised speech to a menu preference, or return ``current``."""
    # Treat hyphens as spaces so "non-veg"/"non veg" and
    # "guilt-free"/"guilt free" are recognised alike.
    spoken = transcription.lower().replace("-", " ")

    # "non veg" has to be tested before "vegetarian": the phrase
    # "non vegetarian" contains both and must not be read as Vegetarian.
    if "non veg" in spoken:
        return "Halal/Non-Veg"
    if "vegetarian" in spoken:
        return "Vegetarian"
    if "halal" in spoken:
        return "Halal/Non-Veg"
    if "guilt free" in spoken:
        return "Guilt-Free"
    if "menu details" in spoken:
        return "All"
    return current

def transcribe(audio_path):
    """Transcribe an audio file to text with the module-level speech model.

    Args:
        audio_path: Path to an audio file that pydub can decode.

    Returns:
        str: The decoded text, or an empty string when the file contains no
        samples.
    """
    # Downmix to mono first: get_array_of_samples() interleaves the channels
    # of multi-channel audio, which would otherwise be fed to the model as a
    # single signal.
    audio_file = AudioSegment.from_file(audio_path).set_channels(1)
    sr = audio_file.frame_rate
    audio_buffer = np.array(audio_file.get_array_of_samples())
    if audio_buffer.size == 0:
        return ""

    # Scale integer PCM by the dtype's maximum to get float32 samples, then
    # resample to the rate configured in `sample_rate`.
    audio_fp32 = np.divide(audio_buffer, np.iinfo(audio_buffer.dtype).max, dtype=np.float32)
    audio_16k = soxr.resample(audio_fp32, sr, sample_rate)

    # Add a leading batch dimension of 1 for the preprocessor and encoder.
    input_signal = torch.tensor(audio_16k).unsqueeze(0)
    length = torch.tensor(len(audio_16k)).unsqueeze(0)
    processed_signal, _ = preprocessor.forward(input_signal=input_signal, length=length)
    logits = encoder.run(None, {'audio_signal': processed_signal.numpy(), 'length': length.numpy()})[0][0]

    # Greedy CTC decoding: take the best label per frame, collapse runs of
    # the same label, then drop blanks. The blank label is the id one past
    # the tokenizer vocabulary. Without the collapse step, a token that
    # spans several consecutive frames is emitted once per frame.
    blank_id = tokenizer.vocab_size()
    frame_ids = logits.argmax(axis=1).tolist()
    decoded_prediction = [
        cur
        for prev, cur in zip([blank_id] + frame_ids, frame_ids)
        if cur != prev and cur != blank_id
    ]
    text = tokenizer.decode_ids(decoded_prediction)

    return text

# Build the Gradio UI: a microphone input, a preference textbox, the spoken
# reply and the rendered menu, all placed inside a single row.
with gr.Blocks() as demo:
    with gr.Row():
        # NOTE(review): the <img> src below is an absolute local filesystem
        # path; presumably it will not resolve once the page is served —
        # confirm and serve the icon as an app asset instead.
        gr.Markdown("""
        <div style="text-align: right; margin-bottom: 10px;">
            <img src="/mnt/data/Screenshot%202024-12-28%20102122.png" alt="Microphone Icon" style="width: 30px; height: 30px; cursor: pointer;">
        </div>
        """)
        # NOTE(review): `source=` looks like the Gradio 3.x keyword; Gradio 4+
        # expects `sources=["microphone"]` — confirm against the pinned version.
        audio_input = gr.Audio(label="Speak your preference", source="microphone", type="filepath")
        # Passed to respond() as the fallback preference. It is not listed in
        # the event's outputs, so respond() never updates this textbox.
        preference = gr.Textbox(label="Current Preference", value="All")
        audio_output = gr.Audio(label="Assistant Response", autoplay=True)
        menu_output = gr.HTML(label="Menu Suggestions")

        # Run respond() whenever the value of the audio input changes.
        audio_input.change(respond, inputs=[audio_input, preference], outputs=[audio_output, menu_output])

# Enable request queuing and start the server when run as a script.
if __name__ == "__main__":
    demo.queue().launch()