File size: 7,185 Bytes
5e5a371
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
"""
Basic Pitch Audio-to-MIDI Converter
Hugging Face Space for CPU inference
July 2024 version
"""

import gradio as gr
import numpy as np
from basic_pitch.inference import predict
from basic_pitch import ICASSP_2022_MODEL_PATH
import tempfile
import os


def transcribe_audio(audio_input):
    """
    Transcribe audio to MIDI using the Basic Pitch model.

    Args:
        audio_input: Tuple of (sample_rate, audio_array) from the Gradio
            Audio component, or None when nothing was provided.

    Returns:
        Tuple of (midi_file_path, note_summary): path to the generated MIDI
        file (None on failure) and a human-readable status/summary string.
    """
    try:
        if audio_input is None:
            return None, "Please upload an audio file first."

        sample_rate, audio_data = audio_input

        # Scratch directory holds only the intermediate WAV; it is removed
        # automatically when the with-block exits.
        with tempfile.TemporaryDirectory() as tmpdir:
            audio_path = os.path.join(tmpdir, "input_audio.wav")

            # Imported lazily, matching the original code's placement.
            import soundfile as sf
            sf.write(audio_path, audio_data, sample_rate)

            # Run Basic Pitch inference. Threshold/frequency values mirror
            # the parameters advertised in the UI's "Model Details" section;
            # minimum_note_length is in milliseconds (library default value).
            model_output, midi_data, note_events = predict(
                audio_path,
                model_or_model_path=ICASSP_2022_MODEL_PATH,
                onset_thresh=0.5,
                frame_thresh=0.3,
                minimum_note_length=127.70254248031496,
                minimum_frequency=10,
                maximum_frequency=2000,
                melodia_trick=True,
                sonify=False
            )

            # BUG FIX: the MIDI file must outlive the TemporaryDirectory.
            # Previously it was written inside tmpdir, which is deleted as
            # soon as the with-block exits — Gradio then reads the returned
            # path *after* this function returns and finds nothing. Write
            # to a persistent temp file instead.
            fd, midi_path = tempfile.mkstemp(suffix=".mid")
            os.close(fd)  # mkstemp opens the fd; midi_data.write reopens by path
            midi_data.write(midi_path)

            # Build the human-readable note report for the textbox.
            note_summary = generate_note_summary(note_events)

            return midi_path, note_summary

    except Exception as e:
        # Surface any failure as a message in the UI rather than crashing
        # the worker; Gradio shows the string in the summary textbox.
        return None, f"Error: {str(e)}"


def generate_note_summary(note_events):
    """
    Build a human-readable report for a list of detected note events.

    Args:
        note_events: Iterable of tuples
            (start_time, end_time, pitch_midi, amplitude, pitch_bends).

    Returns:
        A formatted multi-line string, or a "no notes" message when the
        input is empty.
    """
    if not note_events:
        return "No notes detected in the audio."

    divider = "-" * 70 + "\n"
    header = (
        f"{'Start (s)':<12} {'End (s)':<12} {'MIDI':<8} "
        f"{'Duration':<12} {'Amplitude':<12}\n"
    )

    # Accumulate the report as a list of fragments, joined once at the end.
    parts = [
        f"โœ“ Transcription Complete\n",
        f"Total notes detected: {len(note_events)}\n\n",
        "Note Events:\n",
        divider,
        header,
        divider,
    ]

    # One fixed-width row per detected note.
    for onset, offset, pitch, amp, _bends in note_events:
        span = offset - onset
        parts.append(
            f"{onset:<12.3f} {offset:<12.3f} {pitch:<8} {span:<12.3f} {amp:<12.3f}\n"
        )

    parts.append(divider)

    # Aggregate statistics over all events.
    durations = [end - start for start, end, _, _, _ in note_events]
    amplitudes = [amp for _, _, _, amp, _ in note_events]
    parts.append(f"\nStatistics:\n")
    parts.append(f"Average note duration: {np.mean(durations):.3f}s\n")
    parts.append(f"Average amplitude: {np.mean(amplitudes):.3f}\n")

    return "".join(parts)


def create_gradio_interface():
    """
    Assemble the Gradio Blocks UI for Basic Pitch transcription.

    Returns:
        The constructed gr.Blocks demo (not yet launched).
    """
    with gr.Blocks(title="Basic Pitch - Audio to MIDI") as demo:

        # Page header / introduction.
        gr.Markdown("""
        # ๐ŸŽต Basic Pitch: Automatic Music Transcription
        
        Convert audio files to MIDI notation using Spotify's **Basic Pitch** model.
        
        This lightweight neural network performs **automatic music transcription (AMT)** 
        and works with any instrument or voice.
        """)

        with gr.Row():
            # Left column: audio source plus the transcription trigger.
            with gr.Column(scale=1):
                gr.Markdown("### ๐Ÿ“ค Input")

                audio_in = gr.Audio(
                    label="Upload Audio File",
                    type="numpy",
                    sources=["upload", "microphone"],
                )

                gr.Markdown("""
                **Supported formats:**
                - `.wav`, `.mp3`, `.ogg`, `.flac`, `.m4a`
                
                **Recommended:**
                - Mono audio (single instrument)
                - Clear, high-quality recordings
                - 30 seconds to 5 minutes duration
                """)

                run_button = gr.Button(
                    "๐ŸŽผ Transcribe to MIDI", variant="primary", size="lg"
                )

            # Right column: downloadable MIDI file and the text report.
            with gr.Column(scale=1):
                gr.Markdown("### ๐Ÿ“ฅ Output")

                midi_out = gr.File(label="Download MIDI", type="filepath")

                summary_box = gr.Textbox(
                    label="Note Detection Summary",
                    lines=15,
                    interactive=False,
                    max_lines=20,
                )

        # Footer: model details.
        gr.Markdown("""
        ---
        ### โš™๏ธ Model Details
        
        **Model:** ICASSP 2022 (Spotify Basic Pitch)
        - Lightweight: ~20 MB
        - CPU-optimized inference
        - No GPU required
        
        **Detection Parameters:**
        - Onset threshold: 0.5 (note attack sensitivity)
        - Frame threshold: 0.3 (note sustain sensitivity)
        - Frequency range: 10 Hz - 2000 Hz
        - Melodia post-processing: Enabled
        
        **Output:**
        - MIDI file with detected notes
        - Note timing and pitch information
        - Amplitude/velocity data
        """)

        # Footer: usage tips and known limitations.
        gr.Markdown("""
        ---
        ### ๐Ÿ’ก Tips for Best Results
        
        1. **Single instrument:** Works best with one instrument or voice
        2. **Mono audio:** Use mono recordings when possible
        3. **Clear audio:** Avoid background noise
        4. **Duration:** Works with any length, but 30s-5min is typical
        5. **Polyphonic:** Can detect multiple simultaneous notes
        
        **Limitations:**
        - Works best with pitched instruments (not drums)
        - May struggle with very fast passages
        - Polyphonic music may need manual correction
        """)

        # Footer: attribution and citation.
        gr.Markdown("""
        ---
        ### ๐Ÿ“š About Basic Pitch
        
        Developed by [Spotify's Audio Intelligence Lab](https://github.com/spotify/basic-pitch)
        
        **Citation:**
        ```
        Basic Pitch: A Lightweight Yet Effective Pitch Detection Model 
        for Automatic Music Transcription
        Spotify, 2022
        ```
        """)

        # Wire the trigger button to the transcription callback.
        run_button.click(
            fn=transcribe_audio,
            inputs=[audio_in],
            outputs=[midi_out, summary_box],
        )

    return demo


if __name__ == "__main__":
    # Build the UI and serve it on all interfaces at the standard
    # Hugging Face Spaces port (7860), without a public share link.
    demo_app = create_gradio_interface()
    demo_app.launch(server_name="0.0.0.0", server_port=7860, share=False)