1. Removed global pipeline initialization to prevent warm-up crashes
2. Added lazy loading of the ASR pipeline
3. Fixed input processing with proper input features and an attention mask
4. Improved error handling
5. Added proper audio preprocessing with the correct 16 kHz sampling rate
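
For reference, the lazy-initialization pattern this commit introduces (item 2) can be reduced to a small standalone sketch. This is an illustration of the pattern only, not the exact src/app.py code: it uses a module-level None sentinel where the diff below checks globals(), but both variants defer the expensive model load until the first transcription request, so nothing heavy runs at import time.

from transformers import pipeline

_transcriber = None  # no model load at import time, so Space startup stays cheap

def get_asr_pipeline():
    """Create the ASR pipeline on first use, then reuse the cached instance."""
    global _transcriber
    if _transcriber is None:
        _transcriber = pipeline(
            "automatic-speech-recognition",
            model="openai/whisper-base.en",
            chunk_length_s=30,
            stride_length_s=5,
            device="cpu",
        )
    return _transcriber

See the diff below for the actual implementation.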
src/app.py CHANGED (+32 -28)
@@ -41,33 +41,33 @@ MODEL_OPTIONS = {
     }
 }
 
-# Initialize Whisper components
+# Initialize Whisper components globally (these are lightweight)
 feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-base.en")
 tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-base.en")
-transcriber = pipeline(
-    "automatic-speech-recognition",
-    ...
-)
+processor = WhisperProcessor(feature_extractor, tokenizer)
+
+def get_asr_pipeline():
+    """Lazy load ASR pipeline with proper configuration."""
+    global transcriber
+    if "transcriber" not in globals():
+        transcriber = pipeline(
+            "automatic-speech-recognition",
+            model="openai/whisper-base.en",
+            chunk_length_s=30,
+            stride_length_s=5,
+            device="cpu",
+            torch_dtype=torch.float32
+        )
+    return transcriber
 
 # Audio preprocessing function
-def prepare_audio_features(audio_array, sample_rate):
-    """..."""
-    # Convert stereo to mono
+def process_audio(audio_array, sample_rate):
+    """Pre-process audio for Whisper."""
     if audio_array.ndim > 1:
         audio_array = audio_array.mean(axis=1)
-    audio_array = audio_array.astype(np.float32)
 
     # Normalize audio
+    audio_array = audio_array.astype(np.float32)
     audio_array /= np.max(np.abs(audio_array))
 
     # Resample to 16kHz if needed
@@ -77,10 +77,16 @@ def prepare_audio_features(audio_array, sample_rate):
         audio_tensor = resampler(audio_tensor)
         audio_array = audio_tensor.numpy()
 
-    # ...
+    # Process with correct input format
+    inputs = processor(
+        audio_array,
+        sampling_rate=16000,
+        return_tensors="pt"
+    )
+
     return {
-        "...": ...,
-        "...": ...
+        "input_features": inputs.input_features,
+        "attention_mask": inputs.attention_mask
     }
 
 # Update transcriber configuration
@@ -526,14 +532,12 @@ with gr.Blocks(
 
     try:
         sample_rate, audio_array = audio
+        features = process_audio(audio_array, sample_rate)
 
-        # ...
-        ...
-        ...
-        # Pass to transcriber
-        result = transcriber(input_features)
+        # Get pipeline and transcribe
+        asr = get_asr_pipeline()
+        result = asr(features)
 
-        # Extract text from result
         if isinstance(result, dict):
             return result.get("text", "").strip()
         elif isinstance(result, str):
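
As a quick sanity check, the lazy pipeline can be exercised end to end once the Space code is importable. A minimal sketch, assuming get_asr_pipeline is exposed by src/app.py; the synthetic noise buffer is purely illustrative, and the audio is passed in the HF pipeline's documented raw-audio dict format rather than as pre-computed features:

import numpy as np

# Simulate a Gradio microphone payload: (sample_rate, samples), 2 s of noise.
sample_rate = 16000
audio_array = np.random.uniform(-0.5, 0.5, sample_rate * 2).astype(np.float32)

asr = get_asr_pipeline()  # the Whisper model loads here, on the first call only
result = asr({"raw": audio_array, "sampling_rate": sample_rate})
print(result["text"])  # random noise should transcribe to little or nothing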