gpaasch committed on
Commit
9ae574b
·
1 Parent(s): 1f0d5ee

1. Removed global pipeline initialization to prevent warm-up crashes

Browse files

2. Added lazy loading of ASR pipeline
3. Fixed input processing with proper features and attention mask
4. Improved error handling
5. Proper audio preprocessing with correct sampling rate

Files changed (1) hide show
  1. src/app.py +32 -28
src/app.py CHANGED
@@ -41,33 +41,33 @@ MODEL_OPTIONS = {
41
  }
42
  }
43
 
44
- # Initialize Whisper components
45
  feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-base.en")
46
  tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-base.en")
47
-
48
- # Configure transcription pipeline with only necessary components
49
- transcriber = pipeline(
50
- "automatic-speech-recognition",
51
- model="openai/whisper-base.en",
52
- chunk_length_s=30,
53
- stride_length_s=5,
54
- device="cpu",
55
- torch_dtype=torch.float32,
56
- generate_kwargs={
57
- "use_cache": True,
58
- "return_timestamps": True
59
- }
60
- )
 
61
 
62
  # Audio preprocessing function
63
- def prepare_audio_features(audio_array, sample_rate):
64
- """Prepare audio features with proper format."""
65
- # Convert stereo to mono
66
  if audio_array.ndim > 1:
67
  audio_array = audio_array.mean(axis=1)
68
- audio_array = audio_array.astype(np.float32)
69
 
70
  # Normalize audio
 
71
  audio_array /= np.max(np.abs(audio_array))
72
 
73
  # Resample to 16kHz if needed
@@ -77,10 +77,16 @@ def prepare_audio_features(audio_array, sample_rate):
77
  audio_tensor = resampler(audio_tensor)
78
  audio_array = audio_tensor.numpy()
79
 
80
- # Return proper dictionary format for pipeline
 
 
 
 
 
 
81
  return {
82
- "raw": audio_array,
83
- "sampling_rate": 16000
84
  }
85
 
86
  # Update transcriber configuration
@@ -526,14 +532,12 @@ with gr.Blocks(
526
 
527
  try:
528
  sample_rate, audio_array = audio
 
529
 
530
- # Process audio and get proper format
531
- input_features = prepare_audio_features(audio_array, sample_rate)
532
-
533
- # Pass to transcriber
534
- result = transcriber(input_features)
535
 
536
- # Extract text from result
537
  if isinstance(result, dict):
538
  return result.get("text", "").strip()
539
  elif isinstance(result, str):
 
41
  }
42
  }
43
 
44
+ # Initialize Whisper components globally (these are lightweight)
45
  feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-base.en")
46
  tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-base.en")
47
+ processor = WhisperProcessor(feature_extractor, tokenizer)
48
+
49
+ def get_asr_pipeline():
50
+ """Lazy load ASR pipeline with proper configuration."""
51
+ global transcriber
52
+ if "transcriber" not in globals():
53
+ transcriber = pipeline(
54
+ "automatic-speech-recognition",
55
+ model="openai/whisper-base.en",
56
+ chunk_length_s=30,
57
+ stride_length_s=5,
58
+ device="cpu",
59
+ torch_dtype=torch.float32
60
+ )
61
+ return transcriber
62
 
63
  # Audio preprocessing function
64
+ def process_audio(audio_array, sample_rate):
65
+ """Pre-process audio for Whisper."""
 
66
  if audio_array.ndim > 1:
67
  audio_array = audio_array.mean(axis=1)
 
68
 
69
  # Normalize audio
70
+ audio_array = audio_array.astype(np.float32)
71
  audio_array /= np.max(np.abs(audio_array))
72
 
73
  # Resample to 16kHz if needed
 
77
  audio_tensor = resampler(audio_tensor)
78
  audio_array = audio_tensor.numpy()
79
 
80
+ # Process with correct input format
81
+ inputs = processor(
82
+ audio_array,
83
+ sampling_rate=16000,
84
+ return_tensors="pt"
85
+ )
86
+
87
  return {
88
+ "input_features": inputs.input_features,
89
+ "attention_mask": inputs.attention_mask
90
  }
91
 
92
  # Update transcriber configuration
 
532
 
533
  try:
534
  sample_rate, audio_array = audio
535
+ features = process_audio(audio_array, sample_rate)
536
 
537
+ # Get pipeline and transcribe
538
+ asr = get_asr_pipeline()
539
+ result = asr(features)
 
 
540
 
 
541
  if isinstance(result, dict):
542
  return result.get("text", "").strip()
543
  elif isinstance(result, str):