import os
import tempfile
import logging
from collections import Counter

import numpy as np
import requests
import gradio as gr
from transformers import pipeline
import edge_tts

# ─── Configuration ──────────────────────────────────────────────────────────────
ENDPOINT_URL = "https://xzup8268xrmmxcma.us-east-1.aws.endpoints.huggingface.cloud/invocations"
HF_TOKEN = os.getenv("HF_TOKEN")

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# ─── Helpers ────────────────────────────────────────────────────────────────────

# 1) Speech→Text
asr = pipeline("automatic-speech-recognition", "facebook/wav2vec2-base-960h")


def speech_to_text(audio):
    if not audio:
        return ""
    # Gradio may supply either a (sample_rate, ndarray) tuple or a filepath.
    if isinstance(audio, tuple):
        sr, arr = audio
        arr = arr.astype(np.float32)
        if arr.ndim > 1:             # down-mix stereo to mono
            arr = arr.mean(axis=1)
        if np.abs(arr).max() > 1.0:  # int16 samples → [-1, 1]
            arr /= 32768.0
        return asr({"sampling_rate": sr, "raw": arr})["text"]
    # Filepath input
    return asr(audio)["text"]


# 2) Prompt formatting
def format_prompt(message, history):
    fixed_prompt = """
You are a smart mood analyzer tasked with determining the user's mood for a music recommendation system. Your goal is to classify the user's mood into one of four categories: Happy, Sad, Instrumental, or Party.

Instructions:
1. Engage in a conversation with the user to understand their mood.
2. Ask relevant questions to guide the conversation towards mood classification.
3. If the user's mood is clear, respond with a single word: "Happy", "Sad", "Instrumental", or "Party".
4. If the mood is unclear, continue the conversation with a follow-up question.
5. Limit the conversation to a maximum of 5 exchanges.
6. Do not classify the mood prematurely if it's not evident from the user's responses.
7. Focus on the user's emotional state rather than specific activities or preferences.
8. If unable to classify after 5 exchanges, respond with "Unclear" to indicate the need for more information.

Remember: Your primary goal is mood classification. Stay on topic and guide the conversation towards understanding the user's emotional state.
"""
    prompt = f"{fixed_prompt}\n"
    for i, (user_prompt, bot_response) in enumerate(history):
        prompt += f"User: {user_prompt}\nAssistant: {bot_response}\n"
        if i == 3:
            prompt += "Note: This is the last exchange. Classify the mood if possible or respond with 'Unclear'.\n"
    prompt += f"User: {message}\nAssistant:"
    return prompt


# 3) Call the HF Invocation Endpoint
def query_model(prompt, max_new_tokens=64, temperature=0.1):
    headers = {
        "Authorization": f"Bearer {HF_TOKEN}",
        "Content-Type": "application/json",
    }
    payload = {
        "inputs": prompt,
        "parameters": {"max_new_tokens": max_new_tokens, "temperature": temperature},
    }
    resp = requests.post(ENDPOINT_URL, headers=headers, json=payload, timeout=30)
    resp.raise_for_status()
    # The endpoint is expected to answer with [{"generated_text": "..."}]
    return resp.json()[0]["generated_text"]


# 4) Aggregate mood from history
def aggregate_mood_from_history(history):
    mood_words = {"happy", "sad", "instrumental", "party"}
    counts = Counter()
    for _, bot_response in history:
        for tok in bot_response.split():
            w = tok.strip('.,?!;"\'').lower()
            if w in mood_words:
                counts[w] += 1
    if not counts:
        return None
    return counts.most_common(1)[0][0]


# 5) Text→Speech
async def text_to_speech(text):
    communicate = edge_tts.Communicate(text)
    # edge-tts emits MP3 by default, so use a matching suffix.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
        tmp_path = tmp.name
    await communicate.save(tmp_path)
    return tmp_path


# ─── Gradio Callbacks ───────────────────────────────────────────────────────────
def user_turn(user_input, history):
    if not user_input:
        return history, history, ""
    # Build the prompt from the conversation so far, then query the model.
    formatted = format_prompt(user_input, history)
    raw = query_model(formatted)
    history = history + [(user_input, raw)]
    # Aggregate mood across all bot responses so far.
    mood = aggregate_mood_from_history(history)
    if mood:
        reply = f"Playing {mood.capitalize()} playlist for you!"
    else:
        reply = raw
    history[-1] = (user_input, reply)
    return history, history, ""


async def bot_audio(history):
    if not history:
        return None
    last = history[-1][1]
    return await text_to_speech(last)


def speech_callback(audio):
    return speech_to_text(audio)


# ─── Build the Interface ────────────────────────────────────────────────────────
with gr.Blocks() as demo:
    gr.Markdown("## 🎵 Mood-Based Music Buddy")
    chat = gr.Chatbot()
    txt = gr.Textbox(placeholder="Type your mood...", label="Text")
    send = gr.Button("Send")
    mic = gr.Audio(sources=["microphone", "upload"], type="filepath",
                   label="🎤 Record your voice or upload audio file")
    out_audio = gr.Audio(label="Response (Audio)", autoplay=True)
    state = gr.State([])

    def init():
        greeting = "Hi! I'm your music buddy—tell me how you’re feeling today."
        return [("", greeting)], [("", greeting)], None

    demo.load(init, outputs=[state, chat, out_audio])

    txt.submit(user_turn, [txt, state], [state, chat, txt]) \
        .then(bot_audio, [state], [out_audio])
    send.click(user_turn, [txt, state], [state, chat, txt]) \
        .then(bot_audio, [state], [out_audio])
    mic.change(speech_callback, [mic], [txt]) \
        .then(user_turn, [txt, state], [state, chat, txt]) \
        .then(bot_audio, [state], [out_audio])

if __name__ == "__main__":
    demo.launch(debug=True)