import gradio as gr
import transformers
import librosa

# Load the Ultravox model pipeline (custom pipeline, so trust_remote_code is required)
pipe = transformers.pipeline(
    model='fixie-ai/ultravox-v0_5-llama-3_1-8b',
    trust_remote_code=True
)


def transcribe(audio):
    if audio is None:
        return "No audio provided."

    # Load the uploaded file and resample to 16 kHz, the rate Ultravox expects
    audio_array, sr = librosa.load(audio, sr=16000)

    # Initial system prompt that sets the assistant's persona
    turns = [
        {
            "role": "system",
            "content": "You are a friendly and helpful character. You love to answer questions for people."
        },
    ]

    # Run inference
    result = pipe(
        {'audio': audio_array, 'turns': turns, 'sampling_rate': sr},
        max_new_tokens=30
    )

    # The pipeline may return a list of message dicts or a plain string
    return result[0]['content'] if isinstance(result, list) else str(result)


# Build the Gradio interface (`sources` takes a list in Gradio 4+)
demo = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(sources=["upload"], type="filepath", label="Upload an Audio File"),
    outputs=gr.Textbox(label="Ultravox Response"),
    title="🎙️ Ultravox AI Voicebot",
    description="Upload an audio file and Ultravox will respond intelligently!"
)

# Launch the app
if __name__ == "__main__":
    demo.launch()