1. Removed global pipeline initialization to prevent warm-up crashes
2. Added lazy loading of the ASR pipeline
3. Fixed input processing with proper input features and an attention mask
4. Improved error handling
5. Added proper audio preprocessing with the correct 16 kHz sampling rate
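
For reference, the lazy-initialization pattern this commit introduces (item 2) can be reduced to a small standalone sketch. This is an illustration of the pattern only, not the exact src/app.py code: it uses a module-level None sentinel where the diff below checks globals(), but both variants defer the expensive model load until the first transcription request, so nothing heavy runs at import time.

from transformers import pipeline

_transcriber = None  # no model load at import time, so Space startup stays cheap

def get_asr_pipeline():
    """Create the ASR pipeline on first use, then reuse the cached instance."""
    global _transcriber
    if _transcriber is None:
        _transcriber = pipeline(
            "automatic-speech-recognition",
            model="openai/whisper-base.en",
            chunk_length_s=30,
            stride_length_s=5,
            device="cpu",
        )
    return _transcriber

See the diff below for the actual implementation.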
src/app.py CHANGED (+32 -28)
@@ -41,33 +41,33 @@ MODEL_OPTIONS = {
     }
 }
 
-# Initialize Whisper components
+# Initialize Whisper components globally (these are lightweight)
 feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-base.en")
 tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-base.en")
-transcriber = pipeline(
-    "automatic-speech-recognition",
-    ...
-)
+processor = WhisperProcessor(feature_extractor, tokenizer)
+
+def get_asr_pipeline():
+    """Lazy load ASR pipeline with proper configuration."""
+    global transcriber
+    if "transcriber" not in globals():
+        transcriber = pipeline(
+            "automatic-speech-recognition",
+            model="openai/whisper-base.en",
+            chunk_length_s=30,
+            stride_length_s=5,
+            device="cpu",
+            torch_dtype=torch.float32
+        )
+    return transcriber
 
 # Audio preprocessing function
-def prepare_audio_features(audio_array, sample_rate):
-    """..."""
-    # Convert stereo to mono
+def process_audio(audio_array, sample_rate):
+    """Pre-process audio for Whisper."""
     if audio_array.ndim > 1:
         audio_array = audio_array.mean(axis=1)
-    audio_array = audio_array.astype(np.float32)
 
     # Normalize audio
+    audio_array = audio_array.astype(np.float32)
     audio_array /= np.max(np.abs(audio_array))
 
     # Resample to 16kHz if needed
@@ -77,10 +77,16 @@ def prepare_audio_features(audio_array, sample_rate):
         audio_tensor = resampler(audio_tensor)
         audio_array = audio_tensor.numpy()
 
-    # ...
+    # Process with correct input format
+    inputs = processor(
+        audio_array,
+        sampling_rate=16000,
+        return_tensors="pt"
+    )
+
     return {
-        "...": ...,
-        "...": ...
+        "input_features": inputs.input_features,
+        "attention_mask": inputs.attention_mask
     }
 
 # Update transcriber configuration
@@ -526,14 +532,12 @@ with gr.Blocks(
 
     try:
         sample_rate, audio_array = audio
+        features = process_audio(audio_array, sample_rate)
 
-        # ...
-        ...
-        ...
-        # Pass to transcriber
-        result = transcriber(input_features)
+        # Get pipeline and transcribe
+        asr = get_asr_pipeline()
+        result = asr(features)
 
-        # Extract text from result
         if isinstance(result, dict):
             return result.get("text", "").strip()
         elif isinstance(result, str):
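
As a quick sanity check, the lazy pipeline can be exercised end to end once the Space code is importable. A minimal sketch, assuming get_asr_pipeline is exposed by src/app.py; the synthetic noise buffer is purely illustrative, and the audio is passed in the HF pipeline's documented raw-audio dict format rather than as pre-computed features:

import numpy as np

# Simulate a Gradio microphone payload: (sample_rate, samples), 2 s of noise.
sample_rate = 16000
audio_array = np.random.uniform(-0.5, 0.5, sample_rate * 2).astype(np.float32)

asr = get_asr_pipeline()  # the Whisper model loads here, on the first call only
result = asr({"raw": audio_array, "sampling_rate": sample_rate})
print(result["text"])  # random noise should transcribe to little or nothing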