File size: 2,700 Bytes
c336d2f
 
 
 
 
 
 
 
 
3e435ed
 
 
04cf931
3e435ed
c336d2f
3e435ed
 
04cf931
 
c336d2f
 
3e435ed
c336d2f
 
3e435ed
c336d2f
3e435ed
 
 
 
 
c336d2f
3e435ed
 
 
 
 
 
 
 
 
c336d2f
3e435ed
 
 
c336d2f
 
 
 
 
 
 
04cf931
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c336d2f
 
04cf931
c336d2f
04cf931
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import gradio as gr
import torch
import transformers
import librosa
import numpy as np
import os

class UltravoxInterface:
    """Gradio-based speech-to-text app backed by a Whisper ASR pipeline."""

    def __init__(self):
        """Load the Whisper ASR pipeline on CPU.

        Uses whisper-small to keep the model footprint small.
        """
        print("Initializing voice interface...")

        # Use smaller whisper model
        self.model_name = "openai/whisper-small"
        # float32 on CPU: float16 is a GPU inference dtype — many CPU ops
        # in transformers/torch don't support half precision (or are far
        # slower), so fp16 + device="cpu" was a latent correctness bug.
        self.pipe = transformers.pipeline(
            "automatic-speech-recognition",
            model=self.model_name,
            torch_dtype=torch.float32,
            device="cpu",  # Explicitly set to CPU
        )

        print("Model loaded successfully!")

    def process_audio(self, audio_path, custom_prompt=None):
        """Transcribe the audio file at *audio_path*.

        Args:
            audio_path: Filesystem path to the recording (Gradio passes the
                microphone capture as a filepath), or None when no input
                was provided.
            custom_prompt: Unused; retained for backward compatibility with
                existing callers.

        Returns:
            The transcription text, or a human-readable message on missing
            input / failure (errors are surfaced to the UI, not raised).
        """
        # Guard clause outside the try: this check cannot raise, and the
        # "no input" message is not an error condition.
        if audio_path is None:
            return "Please provide an audio input."

        try:
            # Resample to 16 kHz mono — the sample rate Whisper expects.
            audio, sr = librosa.load(audio_path, sr=16000, mono=True)

            # Transcribe long recordings in 30-second chunks (Whisper's
            # native context window) to bound peak memory usage.
            max_length = 30 * sr
            if len(audio) > max_length:
                texts = [
                    self.pipe(audio[i:i + max_length], batch_size=1)["text"]
                    for i in range(0, len(audio), max_length)
                ]
                return " ".join(texts)

            # Short audio is processed in a single pass.
            return self.pipe(audio, batch_size=1)["text"]

        except Exception as e:
            # Return the failure as text so the Gradio UI shows it instead
            # of the app crashing.
            return f"Error processing audio: {str(e)}"

    def create_interface(self):
        """Build and return the configured Gradio Interface."""
        interface = gr.Interface(
            fn=self.process_audio,
            inputs=[
                gr.Audio(
                    label="Speak here",
                    sources=["microphone"],
                    type="filepath",
                )
            ],
            outputs=[
                gr.Textbox(
                    label="Transcription",
                    lines=5,
                    placeholder="Transcription will appear here...",
                )
            ],
            title="Voice Assistant",
            description="Speak into the microphone and get text transcription!",
            theme=gr.themes.Soft(primary_hue="orange"),
            # NOTE(review): dropped examples=[[None]] — a clickable example
            # whose value is None only submits empty input.
        )

        return interface

# Create the interface
# NOTE: app construction and launch happen at import time deliberately —
# Hugging Face Spaces runs this module directly and expects the Gradio
# interface to be built and launched here (no __main__ guard).
app = UltravoxInterface()
interface = app.create_interface()

# Launch the interface - this is crucial for Hugging Face Spaces
interface.launch()