| """ |
| Basic Pitch Audio-to-MIDI Converter |
| Hugging Face Space for CPU inference |
| July 2024 version |
| """ |
|
|
| import gradio as gr |
| import numpy as np |
| from basic_pitch.inference import predict |
| from basic_pitch import ICASSP_2022_MODEL_PATH |
| import tempfile |
| import os |
|
|
|
|
def transcribe_audio(audio_input):
    """
    Transcribe audio to MIDI using Basic Pitch model.

    Args:
        audio_input: Tuple of (sample_rate, audio_array) from Gradio Audio component

    Returns:
        midi_file_path: Path to generated MIDI file. The file outlives this
            function so Gradio can serve it for download.
        note_summary: Summary of detected notes
    """
    try:
        if audio_input is None:
            return None, "Please upload an audio file first."

        sample_rate, audio_data = audio_input

        # Scratch directory for the intermediate WAV only; it is deleted
        # as soon as the `with` block exits.
        with tempfile.TemporaryDirectory() as tmpdir:
            audio_path = os.path.join(tmpdir, "input_audio.wav")

            # Local import keeps module import time down and matches the
            # original's placement; soundfile is only needed here.
            import soundfile as sf
            sf.write(audio_path, audio_data, sample_rate)

            # Run Basic Pitch inference (thresholds documented in the UI text).
            model_output, midi_data, note_events = predict(
                audio_path,
                model_or_model_path=ICASSP_2022_MODEL_PATH,
                onset_thresh=0.5,
                frame_thresh=0.3,
                minimum_note_length=127.70254248031496,
                minimum_frequency=10,
                maximum_frequency=2000,
                melodia_trick=True,
                sonify=False
            )

            # BUG FIX: the original wrote output.mid inside the
            # TemporaryDirectory above, so the file was deleted when the
            # `with` block exited and the returned path was dangling by the
            # time Gradio tried to serve it.  Write the MIDI to a temp file
            # that is NOT auto-deleted instead.
            fd, midi_path = tempfile.mkstemp(suffix=".mid")
            os.close(fd)  # only the path is needed; midi_data.write reopens it
            midi_data.write(midi_path)

            note_summary = generate_note_summary(note_events)

        return midi_path, note_summary

    except Exception as e:
        # UI boundary: surface the error as text rather than crashing the app.
        return None, f"Error: {str(e)}"
|
|
|
|
def generate_note_summary(note_events):
    """
    Generate a human-readable summary of detected notes.

    Args:
        note_events: List of tuples (start_time, end_time, pitch_midi, amplitude, pitch_bends)

    Returns:
        Formatted string summary (tabular note listing plus aggregate stats).
    """
    # Covers both None and an empty list; the original's extra
    # `len(note_events) == 0` check was redundant.
    if not note_events:
        return "No notes detected in the audio."

    divider = "-" * 70
    # NOTE(review): the leading "β" looks like a mojibake'd check-mark /
    # emoji from a bad encoding round-trip — confirm the intended glyph.
    lines = [
        "β Transcription Complete",
        f"Total notes detected: {len(note_events)}",
        "",
        "Note Events:",
        divider,
        f"{'Start (s)':<12} {'End (s)':<12} {'MIDI':<8} {'Duration':<12} {'Amplitude':<12}",
        divider,
    ]

    durations = []
    amplitudes = []
    # Single pass over note_events: format each table row and collect the
    # stats inputs as we go (the original iterated the list three times
    # and built the string with quadratic `+=` concatenation).
    for start_time, end_time, midi_pitch, amplitude, _pitch_bends in note_events:
        duration = end_time - start_time
        durations.append(duration)
        amplitudes.append(amplitude)
        lines.append(
            f"{start_time:<12.3f} {end_time:<12.3f} {midi_pitch:<8} "
            f"{duration:<12.3f} {amplitude:<12.3f}"
        )

    lines.append(divider)
    lines.append("")
    lines.append("Statistics:")
    lines.append(f"Average note duration: {np.mean(durations):.3f}s")
    lines.append(f"Average amplitude: {np.mean(amplitudes):.3f}")

    # Trailing newline matches the original's last `+= ...\n`.
    return "\n".join(lines) + "\n"
|
|
|
|
def create_gradio_interface():
    """
    Create the Gradio interface for Basic Pitch transcription.

    Builds a two-column Blocks layout (input on the left, output on the
    right) plus static documentation panels, and wires the transcribe
    button to `transcribe_audio`.

    Returns:
        The assembled (not yet launched) gr.Blocks app.
    """

    with gr.Blocks(title="Basic Pitch - Audio to MIDI") as demo:

        # Page header / app description.
        # NOTE(review): characters like "π΅" throughout these markdown
        # strings look like mojibake'd emoji from an encoding round-trip —
        # confirm the intended glyphs before shipping.
        gr.Markdown("""
        # π΅ Basic Pitch: Automatic Music Transcription

        Convert audio files to MIDI notation using Spotify's **Basic Pitch** model.

        This lightweight neural network performs **automatic music transcription (AMT)**
        and works with any instrument or voice.
        """)

        with gr.Row():
            # Left column: audio input, usage hints, and the action button.
            with gr.Column(scale=1):
                gr.Markdown("### π€ Input")

                # type="numpy" delivers (sample_rate, ndarray) tuples,
                # which is exactly what transcribe_audio expects.
                audio_input = gr.Audio(
                    label="Upload Audio File",
                    type="numpy",
                    sources=["upload", "microphone"]
                )

                gr.Markdown("""
                **Supported formats:**
                - `.wav`, `.mp3`, `.ogg`, `.flac`, `.m4a`

                **Recommended:**
                - Mono audio (single instrument)
                - Clear, high-quality recordings
                - 30 seconds to 5 minutes duration
                """)

                transcribe_btn = gr.Button(
                    "πΌ Transcribe to MIDI",
                    variant="primary",
                    size="lg"
                )

            # Right column: downloadable MIDI file and the text summary.
            with gr.Column(scale=1):
                gr.Markdown("### π₯ Output")

                midi_file = gr.File(
                    label="Download MIDI",
                    type="filepath"
                )

                note_info = gr.Textbox(
                    label="Note Detection Summary",
                    lines=15,
                    interactive=False,
                    max_lines=20
                )

        # Static documentation: model details and the inference parameters
        # (these mirror the hard-coded values passed to predict()).
        gr.Markdown("""
        ---
        ### βοΈ Model Details

        **Model:** ICASSP 2022 (Spotify Basic Pitch)
        - Lightweight: ~20 MB
        - CPU-optimized inference
        - No GPU required

        **Detection Parameters:**
        - Onset threshold: 0.5 (note attack sensitivity)
        - Frame threshold: 0.3 (note sustain sensitivity)
        - Frequency range: 10 Hz - 2000 Hz
        - Melodia post-processing: Enabled

        **Output:**
        - MIDI file with detected notes
        - Note timing and pitch information
        - Amplitude/velocity data
        """)

        # Static documentation: usage tips and known limitations.
        gr.Markdown("""
        ---
        ### π‘ Tips for Best Results

        1. **Single instrument:** Works best with one instrument or voice
        2. **Mono audio:** Use mono recordings when possible
        3. **Clear audio:** Avoid background noise
        4. **Duration:** Works with any length, but 30s-5min is typical
        5. **Polyphonic:** Can detect multiple simultaneous notes

        **Limitations:**
        - Works best with pitched instruments (not drums)
        - May struggle with very fast passages
        - Polyphonic music may need manual correction
        """)

        # Static documentation: attribution and citation.
        gr.Markdown("""
        ---
        ### π About Basic Pitch

        Developed by [Spotify's Audio Intelligence Lab](https://github.com/spotify/basic-pitch)

        **Citation:**
        ```
        Basic Pitch: A Lightweight Yet Effective Pitch Detection Model
        for Automatic Music Transcription
        Spotify, 2022
        ```
        """)

        # Wire the button: audio in, (MIDI file, summary text) out.
        transcribe_btn.click(
            fn=transcribe_audio,
            inputs=[audio_input],
            outputs=[midi_file, note_info]
        )

    return demo
|
|
|
|
if __name__ == "__main__":
    # Build the UI and serve it on all interfaces at the conventional
    # Hugging Face Spaces port (7860), without a public share link.
    app = create_gradio_interface()
    app.launch(server_name="0.0.0.0", server_port=7860, share=False)
|
|