"""
Basic Pitch Audio-to-MIDI Converter
Hugging Face Space for CPU inference
July 2024 version
"""

import logging
import os
import tempfile

import gradio as gr
import numpy as np
from basic_pitch import ICASSP_2022_MODEL_PATH
from basic_pitch.inference import predict

logger = logging.getLogger(__name__)


def transcribe_audio(audio_input):
    """
    Transcribe audio to MIDI using Basic Pitch model.

    Args:
        audio_input: Tuple of (sample_rate, audio_array) from Gradio Audio
            component, or None when nothing has been uploaded.

    Returns:
        midi_file_path: Path to generated MIDI file (None on failure)
        note_summary: Summary of detected notes, or a user-facing message
            describing why no transcription was produced
    """
    if audio_input is None:
        return None, "Please upload an audio file first."

    try:
        # Imported inside the try block so that a missing dependency is
        # reported to the user like any other failure.
        import soundfile as sf

        sample_rate, audio_data = audio_input

        # The intermediate WAV is only needed while inference runs, so it
        # lives in a directory that is removed as soon as predict() returns.
        with tempfile.TemporaryDirectory() as work_dir:
            audio_path = os.path.join(work_dir, "input_audio.wav")
            sf.write(audio_path, audio_data, sample_rate)

            # Run Basic Pitch inference. The raw model output (first
            # element) is not used by this app.
            _, midi_data, note_events = predict(
                audio_path,
                model_or_model_path=ICASSP_2022_MODEL_PATH,
                onset_threshold=0.5,
                frame_threshold=0.3,
                minimum_note_length=127.70254248031496,
                minimum_frequency=10,
                maximum_frequency=2000,
                melodia_trick=True,
            )

        # The MIDI file must still exist after this function returns so the
        # File component can serve it; writing it inside the
        # TemporaryDirectory above would delete it on exit.
        output_dir = tempfile.mkdtemp(prefix="basic_pitch_")
        midi_path = os.path.join(output_dir, "output.mid")
        midi_data.write(midi_path)

        return midi_path, generate_note_summary(note_events)

    except Exception as exc:
        # UI boundary: keep the traceback in the logs and show a short
        # message in the summary box instead of crashing the handler.
        logger.exception("Transcription failed")
        return None, f"Error: {exc}"


def generate_note_summary(note_events):
    """
    Generate a human-readable summary of detected notes.

    Args:
        note_events: List of tuples
            (start_time, end_time, pitch_midi, amplitude, pitch_bends)

    Returns:
        Formatted string summary: a table with one row per note followed by
        the average note duration and average amplitude, or a short message
        when there are no notes.
    """
    # Explicit None/len check (rather than truthiness) so array-like inputs
    # are accepted as well as plain lists.
    if note_events is None or len(note_events) == 0:
        return "No notes detected in the audio."

    rule = "-" * 70
    rows = []
    durations = []
    amplitudes = []

    # Single pass: build the table rows and collect the values needed for
    # the statistics at the same time.
    for start_time, end_time, midi_pitch, amplitude, _pitch_bends in note_events:
        duration = end_time - start_time
        durations.append(duration)
        amplitudes.append(amplitude)
        rows.append(
            f"{start_time:<12.3f} {end_time:<12.3f} {midi_pitch:<8} "
            f"{duration:<12.3f} {amplitude:<12.3f}"
        )

    lines = [
        "✓ Transcription Complete",
        f"Total notes detected: {len(note_events)}",
        "",
        "Note Events:",
        rule,
        f"{'Start (s)':<12} {'End (s)':<12} {'MIDI':<8} {'Duration':<12} {'Amplitude':<12}",
        rule,
        *rows,
        rule,
        "",
        "Statistics:",
        f"Average note duration: {np.mean(durations):.3f}s",
        f"Average amplitude: {np.mean(amplitudes):.3f}",
    ]
    return "\n".join(lines) + "\n"


def create_gradio_interface():
    """
    Create the Gradio interface for Basic Pitch transcription.

    Returns:
        The gr.Blocks app: an audio input and transcribe button on the left,
        the MIDI download and note summary on the right, followed by
        informational Markdown sections.
    """
    with gr.Blocks(title="Basic Pitch - Audio to MIDI") as demo:
        gr.Markdown("""
        # 🎵 Basic Pitch: Automatic Music Transcription

        Convert audio files to MIDI notation using Spotify's **Basic Pitch** model.
        This lightweight neural network performs **automatic music transcription (AMT)**
        and works with any instrument or voice.
        """)

        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### 📤 Input")
                audio_input = gr.Audio(
                    label="Upload Audio File",
                    type="numpy",
                    sources=["upload", "microphone"]
                )

                gr.Markdown("""
                **Supported formats:**
                - `.wav`, `.mp3`, `.ogg`, `.flac`, `.m4a`

                **Recommended:**
                - Mono audio (single instrument)
                - Clear, high-quality recordings
                - 30 seconds to 5 minutes duration
                """)

                transcribe_btn = gr.Button(
                    "🎼 Transcribe to MIDI",
                    variant="primary",
                    size="lg"
                )

            with gr.Column(scale=1):
                gr.Markdown("### 📥 Output")
                midi_file = gr.File(
                    label="Download MIDI",
                    type="filepath"
                )
                note_info = gr.Textbox(
                    label="Note Detection Summary",
                    lines=15,
                    interactive=False,
                    max_lines=20
                )

        gr.Markdown("""
        ---

        ### ⚙️ Model Details

        **Model:** ICASSP 2022 (Spotify Basic Pitch)
        - Lightweight: ~20 MB
        - CPU-optimized inference
        - No GPU required

        **Detection Parameters:**
        - Onset threshold: 0.5 (note attack sensitivity)
        - Frame threshold: 0.3 (note sustain sensitivity)
        - Frequency range: 10 Hz - 2000 Hz
        - Melodia post-processing: Enabled

        **Output:**
        - MIDI file with detected notes
        - Note timing and pitch information
        - Amplitude/velocity data
        """)

        gr.Markdown("""
        ---

        ### 💡 Tips for Best Results

        1. **Single instrument:** Works best with one instrument or voice
        2. **Mono audio:** Use mono recordings when possible
        3. **Clear audio:** Avoid background noise
        4. **Duration:** Works with any length, but 30s-5min is typical
        5. **Polyphonic:** Can detect multiple simultaneous notes

        **Limitations:**
        - Works best with pitched instruments (not drums)
        - May struggle with very fast passages
        - Polyphonic music may need manual correction
        """)

        gr.Markdown("""
        ---

        ### 📚 About Basic Pitch

        Developed by [Spotify's Audio Intelligence Lab](https://github.com/spotify/basic-pitch)

        **Citation:**
        ```
        Basic Pitch: A Lightweight Yet Effective Pitch Detection Model
        for Automatic Music Transcription
        Spotify, 2022
        ```
        """)

        # Connect button to function
        transcribe_btn.click(
            fn=transcribe_audio,
            inputs=[audio_input],
            outputs=[midi_file, note_info]
        )

    return demo


if __name__ == "__main__":
    interface = create_gradio_interface()
    # Bind to all interfaces on port 7860 with no public share link.
    interface.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False
    )