File size: 2,700 Bytes
c336d2f
 
 
 
 
 
 
 
 
3e435ed
 
 
04cf931
3e435ed
c336d2f
3e435ed
 
04cf931
 
c336d2f
 
3e435ed
c336d2f
 
3e435ed
c336d2f
3e435ed
 
 
 
 
c336d2f
3e435ed
 
 
 
 
 
 
 
 
c336d2f
3e435ed
 
 
c336d2f
 
 
 
 
 
 
04cf931
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c336d2f
 
04cf931
c336d2f
04cf931
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import gradio as gr
import torch
import transformers
import librosa
import numpy as np
import os

class UltravoxInterface:
    """Gradio-based speech-to-text app backed by a Whisper ASR pipeline."""

    def __init__(self):
        """Load the Whisper ASR pipeline on CPU.

        Uses whisper-small to keep the model footprint small.
        """
        print("Initializing voice interface...")

        # Use smaller whisper model
        self.model_name = "openai/whisper-small"
        # float32 on CPU: float16 is a GPU inference dtype — many CPU ops
        # in transformers/torch don't support half precision (or are far
        # slower), so fp16 + device="cpu" was a latent correctness bug.
        self.pipe = transformers.pipeline(
            "automatic-speech-recognition",
            model=self.model_name,
            torch_dtype=torch.float32,
            device="cpu",  # Explicitly set to CPU
        )

        print("Model loaded successfully!")

    def process_audio(self, audio_path, custom_prompt=None):
        """Transcribe the audio file at *audio_path*.

        Args:
            audio_path: Filesystem path to the recording (Gradio passes the
                microphone capture as a filepath), or None when no input
                was provided.
            custom_prompt: Unused; retained for backward compatibility with
                existing callers.

        Returns:
            The transcription text, or a human-readable message on missing
            input / failure (errors are surfaced to the UI, not raised).
        """
        # Guard clause outside the try: this check cannot raise, and the
        # "no input" message is not an error condition.
        if audio_path is None:
            return "Please provide an audio input."

        try:
            # Resample to 16 kHz mono — the sample rate Whisper expects.
            audio, sr = librosa.load(audio_path, sr=16000, mono=True)

            # Transcribe long recordings in 30-second chunks (Whisper's
            # native context window) to bound peak memory usage.
            max_length = 30 * sr
            if len(audio) > max_length:
                texts = [
                    self.pipe(audio[i:i + max_length], batch_size=1)["text"]
                    for i in range(0, len(audio), max_length)
                ]
                return " ".join(texts)

            # Short audio is processed in a single pass.
            return self.pipe(audio, batch_size=1)["text"]

        except Exception as e:
            # Return the failure as text so the Gradio UI shows it instead
            # of the app crashing.
            return f"Error processing audio: {str(e)}"

    def create_interface(self):
        """Build and return the configured Gradio Interface."""
        interface = gr.Interface(
            fn=self.process_audio,
            inputs=[
                gr.Audio(
                    label="Speak here",
                    sources=["microphone"],
                    type="filepath",
                )
            ],
            outputs=[
                gr.Textbox(
                    label="Transcription",
                    lines=5,
                    placeholder="Transcription will appear here...",
                )
            ],
            title="Voice Assistant",
            description="Speak into the microphone and get text transcription!",
            theme=gr.themes.Soft(primary_hue="orange"),
            # NOTE(review): dropped examples=[[None]] — a clickable example
            # whose value is None only submits empty input.
        )

        return interface

# Create the interface
# NOTE: app construction and launch happen at import time deliberately —
# Hugging Face Spaces runs this module directly and expects the Gradio
# interface to be built and launched here (no __main__ guard).
app = UltravoxInterface()
interface = app.create_interface()

# Launch the interface - this is crucial for Hugging Face Spaces
interface.launch()