File size: 7,185 Bytes
5e5a371
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
"""
Basic Pitch Audio-to-MIDI Converter
Hugging Face Space for CPU inference
July 2024 version
"""

import gradio as gr
import numpy as np
from basic_pitch.inference import predict
from basic_pitch import ICASSP_2022_MODEL_PATH
import tempfile
import os


def transcribe_audio(audio_input):
    """
    Transcribe audio to MIDI using the Basic Pitch model.

    Args:
        audio_input: Tuple of (sample_rate, audio_array) from the Gradio
            Audio component, or None when nothing was provided.

    Returns:
        Tuple of (midi_file_path, note_summary): path to the generated MIDI
        file (None on failure) and a human-readable status/summary string.
    """
    try:
        if audio_input is None:
            return None, "Please upload an audio file first."

        sample_rate, audio_data = audio_input

        # Scratch directory holds only the intermediate WAV; it is removed
        # automatically when the with-block exits.
        with tempfile.TemporaryDirectory() as tmpdir:
            audio_path = os.path.join(tmpdir, "input_audio.wav")

            # Imported lazily, matching the original code's placement.
            import soundfile as sf
            sf.write(audio_path, audio_data, sample_rate)

            # Run Basic Pitch inference. Threshold/frequency values mirror
            # the parameters advertised in the UI's "Model Details" section;
            # minimum_note_length is in milliseconds (library default value).
            model_output, midi_data, note_events = predict(
                audio_path,
                model_or_model_path=ICASSP_2022_MODEL_PATH,
                onset_thresh=0.5,
                frame_thresh=0.3,
                minimum_note_length=127.70254248031496,
                minimum_frequency=10,
                maximum_frequency=2000,
                melodia_trick=True,
                sonify=False
            )

            # BUG FIX: the MIDI file must outlive the TemporaryDirectory.
            # Previously it was written inside tmpdir, which is deleted as
            # soon as the with-block exits — Gradio then reads the returned
            # path *after* this function returns and finds nothing. Write
            # to a persistent temp file instead.
            fd, midi_path = tempfile.mkstemp(suffix=".mid")
            os.close(fd)  # mkstemp opens the fd; midi_data.write reopens by path
            midi_data.write(midi_path)

            # Build the human-readable note report for the textbox.
            note_summary = generate_note_summary(note_events)

            return midi_path, note_summary

    except Exception as e:
        # Surface any failure as a message in the UI rather than crashing
        # the worker; Gradio shows the string in the summary textbox.
        return None, f"Error: {str(e)}"


def generate_note_summary(note_events):
    """
    Build a human-readable report for a list of detected note events.

    Args:
        note_events: Iterable of tuples
            (start_time, end_time, pitch_midi, amplitude, pitch_bends).

    Returns:
        A formatted multi-line string, or a "no notes" message when the
        input is empty.
    """
    if not note_events:
        return "No notes detected in the audio."

    divider = "-" * 70 + "\n"
    header = (
        f"{'Start (s)':<12} {'End (s)':<12} {'MIDI':<8} "
        f"{'Duration':<12} {'Amplitude':<12}\n"
    )

    # Accumulate the report as a list of fragments, joined once at the end.
    parts = [
        f"โœ“ Transcription Complete\n",
        f"Total notes detected: {len(note_events)}\n\n",
        "Note Events:\n",
        divider,
        header,
        divider,
    ]

    # One fixed-width row per detected note.
    for onset, offset, pitch, amp, _bends in note_events:
        span = offset - onset
        parts.append(
            f"{onset:<12.3f} {offset:<12.3f} {pitch:<8} {span:<12.3f} {amp:<12.3f}\n"
        )

    parts.append(divider)

    # Aggregate statistics over all events.
    durations = [end - start for start, end, _, _, _ in note_events]
    amplitudes = [amp for _, _, _, amp, _ in note_events]
    parts.append(f"\nStatistics:\n")
    parts.append(f"Average note duration: {np.mean(durations):.3f}s\n")
    parts.append(f"Average amplitude: {np.mean(amplitudes):.3f}\n")

    return "".join(parts)


def create_gradio_interface():
    """
    Assemble the Gradio Blocks UI for Basic Pitch transcription.

    Returns:
        The constructed gr.Blocks demo (not yet launched).
    """
    with gr.Blocks(title="Basic Pitch - Audio to MIDI") as demo:

        # Page header / introduction.
        gr.Markdown("""
        # ๐ŸŽต Basic Pitch: Automatic Music Transcription
        
        Convert audio files to MIDI notation using Spotify's **Basic Pitch** model.
        
        This lightweight neural network performs **automatic music transcription (AMT)** 
        and works with any instrument or voice.
        """)

        with gr.Row():
            # Left column: audio source plus the transcription trigger.
            with gr.Column(scale=1):
                gr.Markdown("### ๐Ÿ“ค Input")

                audio_in = gr.Audio(
                    label="Upload Audio File",
                    type="numpy",
                    sources=["upload", "microphone"],
                )

                gr.Markdown("""
                **Supported formats:**
                - `.wav`, `.mp3`, `.ogg`, `.flac`, `.m4a`
                
                **Recommended:**
                - Mono audio (single instrument)
                - Clear, high-quality recordings
                - 30 seconds to 5 minutes duration
                """)

                run_button = gr.Button(
                    "๐ŸŽผ Transcribe to MIDI", variant="primary", size="lg"
                )

            # Right column: downloadable MIDI file and the text report.
            with gr.Column(scale=1):
                gr.Markdown("### ๐Ÿ“ฅ Output")

                midi_out = gr.File(label="Download MIDI", type="filepath")

                summary_box = gr.Textbox(
                    label="Note Detection Summary",
                    lines=15,
                    interactive=False,
                    max_lines=20,
                )

        # Footer: model details.
        gr.Markdown("""
        ---
        ### โš™๏ธ Model Details
        
        **Model:** ICASSP 2022 (Spotify Basic Pitch)
        - Lightweight: ~20 MB
        - CPU-optimized inference
        - No GPU required
        
        **Detection Parameters:**
        - Onset threshold: 0.5 (note attack sensitivity)
        - Frame threshold: 0.3 (note sustain sensitivity)
        - Frequency range: 10 Hz - 2000 Hz
        - Melodia post-processing: Enabled
        
        **Output:**
        - MIDI file with detected notes
        - Note timing and pitch information
        - Amplitude/velocity data
        """)

        # Footer: usage tips and known limitations.
        gr.Markdown("""
        ---
        ### ๐Ÿ’ก Tips for Best Results
        
        1. **Single instrument:** Works best with one instrument or voice
        2. **Mono audio:** Use mono recordings when possible
        3. **Clear audio:** Avoid background noise
        4. **Duration:** Works with any length, but 30s-5min is typical
        5. **Polyphonic:** Can detect multiple simultaneous notes
        
        **Limitations:**
        - Works best with pitched instruments (not drums)
        - May struggle with very fast passages
        - Polyphonic music may need manual correction
        """)

        # Footer: attribution and citation.
        gr.Markdown("""
        ---
        ### ๐Ÿ“š About Basic Pitch
        
        Developed by [Spotify's Audio Intelligence Lab](https://github.com/spotify/basic-pitch)
        
        **Citation:**
        ```
        Basic Pitch: A Lightweight Yet Effective Pitch Detection Model 
        for Automatic Music Transcription
        Spotify, 2022
        ```
        """)

        # Wire the trigger button to the transcription callback.
        run_button.click(
            fn=transcribe_audio,
            inputs=[audio_in],
            outputs=[midi_out, summary_box],
        )

    return demo


if __name__ == "__main__":
    # Build the UI and serve it on all interfaces at the standard
    # Hugging Face Spaces port (7860), without a public share link.
    demo_app = create_gradio_interface()
    demo_app.launch(server_name="0.0.0.0", server_port=7860, share=False)