Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -12,7 +12,7 @@ import gradio.themes as gr_themes
 import csv
 
 device = "cuda" if torch.cuda.is_available() else "cpu"
-MODEL_NAME="nvidia/parakeet-tdt-0.6b-v2"
+MODEL_NAME = "nvidia/parakeet-tdt-0.6b-v2"
 
 model = ASRModel.from_pretrained(model_name=MODEL_NAME)
 model.eval()
@@ -39,12 +39,12 @@ def get_audio_segment(audio_path, start_second, end_second):
 
         frame_rate = clipped_audio.frame_rate
         if frame_rate <= 0:
-
-
+            print(f"Warning: Invalid frame rate ({frame_rate}) detected for clipped audio.")
+            frame_rate = audio.frame_rate
 
         if samples.size == 0:
-
-
+            print(f"Warning: Clipped audio resulted in empty samples array ({start_second}s to {end_second}s).")
+            return None
 
         return (frame_rate, samples)
     except FileNotFoundError:
@@ -56,9 +56,25 @@ def get_audio_segment(audio_path, start_second, end_second):
 
 @spaces.GPU
 def get_transcripts_and_raw_times(audio_path):
+    """
+    Transcribe an audio file or microphone input and return transcription segments, timestamps, and a CSV download button.
+
+    Exposed as MCP endpoints:
+      - transcribe_mic: for microphone recordings
+      - transcribe_file: for uploaded audio files
+
+    Parameters:
+        audio_path (str): Path to the audio file or microphone recording to transcribe.
+
+    Returns:
+        tuple: A 4-tuple containing:
+            - vis_data (List[List[str]]): Displayable transcription segments [start_str, end_str, segment_text].
+            - raw_times_data (List[List[float]]): Raw timestamps [[start, end], ...].
+            - current_audio_path (str): The path to the audio used for transcription.
+            - download_button (gr.DownloadButton): A Gradio DownloadButton component for downloading the transcript CSV.
+    """
     if not audio_path:
         gr.Error("No audio file path provided for transcription.", duration=None)
-        # Return an update to hide the button
         return [], [], None, gr.DownloadButton(visible=False)
 
     vis_data = [["N/A", "N/A", "Processing failed"]]
@@ -74,34 +90,29 @@ def get_transcripts_and_raw_times(audio_path):
         audio = AudioSegment.from_file(audio_path)
     except Exception as load_e:
         gr.Error(f"Failed to load audio file {original_path_name}: {load_e}", duration=None)
-        # Return an update to hide the button
         return [["Error", "Error", "Load failed"]], [[0.0, 0.0]], audio_path, gr.DownloadButton(visible=False)
 
     resampled = False
     mono = False
-
     target_sr = 16000
     if audio.frame_rate != target_sr:
         try:
             audio = audio.set_frame_rate(target_sr)
             resampled = True
         except Exception as resample_e:
-
-
-            return [["Error", "Error", "Resample failed"]], [[0.0, 0.0]], audio_path, gr.DownloadButton(visible=False)
+            gr.Error(f"Failed to resample audio: {resample_e}", duration=None)
+            return [["Error", "Error", "Resample failed"]], [[0.0, 0.0]], audio_path, gr.DownloadButton(visible=False)
 
     if audio.channels == 2:
         try:
             audio = audio.set_channels(1)
             mono = True
         except Exception as mono_e:
-
-
-            return [["Error", "Error", "Mono conversion failed"]], [[0.0, 0.0]], audio_path, gr.DownloadButton(visible=False)
+            gr.Error(f"Failed to convert audio to mono: {mono_e}", duration=None)
+            return [["Error", "Error", "Mono conversion failed"]], [[0.0, 0.0]], audio_path, gr.DownloadButton(visible=False)
     elif audio.channels > 2:
-
-
-        return [["Error", "Error", f"{audio.channels}-channel audio not supported"]], [[0.0, 0.0]], audio_path, gr.DownloadButton(visible=False)
+        gr.Error(f"Audio has {audio.channels} channels. Only mono (1) or stereo (2) supported.", duration=None)
+        return [["Error", "Error", f"{audio.channels}-channel audio not supported"]], [[0.0, 0.0]], audio_path, gr.DownloadButton(visible=False)
 
     if resampled or mono:
         try:
@@ -113,9 +124,8 @@ def get_transcripts_and_raw_times(audio_path):
             info_path_name = f"{original_path_name} (processed)"
         except Exception as export_e:
             gr.Error(f"Failed to export processed audio: {export_e}", duration=None)
-            if temp_file and hasattr(temp_file, 'name') and os.path.exists(temp_file.name):
+            if temp_file and hasattr(temp_file, 'name') and os.path.exists(temp_file.name):
                 os.remove(temp_file.name)
-            # Return an update to hide the button
             return [["Error", "Error", "Export failed"]], [[0.0, 0.0]], audio_path, gr.DownloadButton(visible=False)
     else:
         transcribe_path = audio_path
@@ -127,16 +137,14 @@ def get_transcripts_and_raw_times(audio_path):
         output = model.transcribe([transcribe_path], timestamps=True)
 
         if not output or not isinstance(output, list) or not output[0] or not hasattr(output[0], 'timestamp') or not output[0].timestamp or 'segment' not in output[0].timestamp:
-
-
-            return [["Error", "Error", "Transcription Format Issue"]], [[0.0, 0.0]], audio_path, gr.DownloadButton(visible=False)
+            gr.Error("Transcription failed or produced unexpected output format.", duration=None)
+            return [["Error", "Error", "Transcription Format Issue"]], [[0.0, 0.0]], audio_path, gr.DownloadButton(visible=False)
 
         segment_timestamps = output[0].timestamp['segment']
         csv_headers = ["Start (s)", "End (s)", "Segment"]
         vis_data = [[f"{ts['start']:.2f}", f"{ts['end']:.2f}", ts['segment']] for ts in segment_timestamps]
         raw_times_data = [[ts['start'], ts['end']] for ts in segment_timestamps]
 
-        # Default button update (hidden) in case CSV writing fails
         button_update = gr.DownloadButton(visible=False)
         try:
             temp_csv_file = tempfile.NamedTemporaryFile(delete=False, suffix=".csv", mode='w', newline='', encoding='utf-8')
@@ -145,61 +153,65 @@ def get_transcripts_and_raw_times(audio_path):
             writer.writerows(vis_data)
             csv_file_path = temp_csv_file.name
             temp_csv_file.close()
-            print(f"CSV transcript saved to temporary file: {csv_file_path}")
-            # If CSV is saved, create update to show button with path
             button_update = gr.DownloadButton(value=csv_file_path, visible=True)
         except Exception as csv_e:
             gr.Error(f"Failed to create transcript CSV file: {csv_e}", duration=None)
-            print(f"Error writing CSV: {csv_e}")
-            # csv_file_path remains None, button_update remains hidden
 
         gr.Info("Transcription complete.", duration=2)
-        # Return the data and the button update dictionary
         return vis_data, raw_times_data, audio_path, button_update
 
     except torch.cuda.OutOfMemoryError as e:
         error_msg = 'CUDA out of memory. Please try a shorter audio or reduce GPU load.'
-        print(f"CUDA OutOfMemoryError: {e}")
         gr.Error(error_msg, duration=None)
-        # Return an update to hide the button
         return [["OOM", "OOM", error_msg]], [[0.0, 0.0]], audio_path, gr.DownloadButton(visible=False)
 
     except FileNotFoundError:
         error_msg = f"Audio file for transcription not found: {Path(transcribe_path).name}."
-        print(f"Error: Transcribe audio file not found at path: {transcribe_path}")
         gr.Error(error_msg, duration=None)
-        # Return an update to hide the button
         return [["Error", "Error", "File not found for transcription"]], [[0.0, 0.0]], audio_path, gr.DownloadButton(visible=False)
 
     except Exception as e:
         error_msg = f"Transcription failed: {e}"
-        print(f"Error during transcription processing: {e}")
         gr.Error(error_msg, duration=None)
         vis_data = [["Error", "Error", error_msg]]
         raw_times_data = [[0.0, 0.0]]
-        # Return an update to hide the button
         return vis_data, raw_times_data, audio_path, gr.DownloadButton(visible=False)
+
     finally:
         try:
             if 'model' in locals() and hasattr(model, 'cpu'):
-
-
+                if device == 'cuda':
+                    model.cpu()
             gc.collect()
             if device == 'cuda':
                 torch.cuda.empty_cache()
         except Exception as cleanup_e:
-            print(f"Error during model cleanup: {cleanup_e}")
            gr.Warning(f"Issue during model cleanup: {cleanup_e}", duration=5)
 
        finally:
            if processed_audio_path and os.path.exists(processed_audio_path):
                try:
                    os.remove(processed_audio_path)
-                    print(f"Temporary audio file {processed_audio_path} removed.")
                except Exception as e:
                    print(f"Error removing temporary audio file {processed_audio_path}: {e}")
 
+
+@spaces.API(name="play_segment")
 def play_segment(evt: gr.SelectData, raw_ts_list, current_audio_path):
+    """
+    Play a selected audio segment based on the user's selection event.
+
+    Exposed as MCP endpoint:
+      - play_segment
+
+    Parameters:
+        evt (gr.SelectData): The Gradio SelectData event triggered by selecting a row in the DataFrame.
+        raw_ts_list (List[List[float]]): List of timestamp pairs [[start, end], ...] from transcription.
+        current_audio_path (str): Path to the original audio file.
+
+    Returns:
+        gr.Audio: A Gradio Audio component for the clipped segment or an empty Audio component on error.
+    """
     if not isinstance(raw_ts_list, list):
         print(f"Warning: raw_ts_list is not a list ({type(raw_ts_list)}). Cannot play segment.")
         return gr.Audio(value=None, label="Selected Segment")
@@ -211,15 +223,14 @@ def play_segment(evt: gr.SelectData, raw_ts_list, current_audio_path):
     selected_index = evt.index[0]
 
     if selected_index < 0 or selected_index >= len(raw_ts_list):
-
-
+        print(f"Invalid index {selected_index} selected for list of length {len(raw_ts_list)}.")
+        return gr.Audio(value=None, label="Selected Segment")
 
     if not isinstance(raw_ts_list[selected_index], (list, tuple)) or len(raw_ts_list[selected_index]) != 2:
-
-
+        print(f"Warning: Data at index {selected_index} is not in the expected format [start, end].")
+        return gr.Audio(value=None, label="Selected Segment")
 
     start_time_s, end_time_s = raw_ts_list[selected_index]
-
     print(f"Attempting to play segment: {current_audio_path} from {start_time_s:.2f}s to {end_time_s:.2f}s")
 
     segment_data = get_audio_segment(current_audio_path, start_time_s, end_time_s)
@@ -334,4 +345,4 @@ with gr.Blocks(theme=nvidia_theme) as demo:
 if __name__ == "__main__":
     print("Launching Gradio Demo...")
     demo.queue()
-    demo.launch(mcp_server=True)
+    demo.launch(mcp_server=True)
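
The docstrings added in this commit describe the functions as MCP-exposed tools, and the same endpoints remain callable through the regular Gradio client API. Below is a minimal client sketch for exercising the transcription endpoint once the change is deployed; the Space ID and the `/transcribe_file` endpoint name are assumptions taken from MODEL_NAME and the docstring above, not something this diff pins down.

```python
# Hedged sketch (not part of this commit): calling the transcription endpoint
# remotely via gradio_client. The Space ID and api_name are assumptions taken
# from MODEL_NAME and the docstring above; adjust them to match the live Space.
from gradio_client import Client, handle_file

client = Client("nvidia/parakeet-tdt-0.6b-v2")  # assumed Space ID

result = client.predict(
    handle_file("sample.wav"),    # local audio file to upload
    api_name="/transcribe_file",  # endpoint name per the new docstring
)

# Server-side, get_transcripts_and_raw_times returns
# (vis_data, raw_times_data, audio_path, download_button); the exact
# client-side shape depends on the output components wired up in the UI.
print(result)
```

Because `demo.launch(mcp_server=True)` is set, recent Gradio releases should also expose these tools to MCP clients, typically at the app's `/gradio_api/mcp/sse` endpoint.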