"""
Basic Pitch Audio-to-MIDI Converter
Hugging Face Space for CPU inference
July 2024 version
"""
import gradio as gr
import numpy as np
from basic_pitch.inference import predict
from basic_pitch import ICASSP_2022_MODEL_PATH
import tempfile
import os
def transcribe_audio(audio_input):
    """
    Transcribe audio to MIDI using the Basic Pitch model.

    Args:
        audio_input: Tuple of (sample_rate, audio_array) from the Gradio
            Audio component (type="numpy"), or None if nothing was uploaded.

    Returns:
        Tuple of (midi_file_path, note_summary). On failure midi_file_path
        is None and note_summary carries the error text, so the UI shows
        the problem instead of crashing.
    """
    try:
        if audio_input is None:
            return None, "Please upload an audio file first."

        sample_rate, audio_data = audio_input

        # Stage the input WAV in a throwaway directory, but persist the MIDI
        # result elsewhere: the original code wrote output.mid inside the
        # TemporaryDirectory and returned that path — the file was deleted
        # the moment the `with` block exited, handing Gradio a dead path.
        with tempfile.TemporaryDirectory() as tmpdir:
            audio_path = os.path.join(tmpdir, "input_audio.wav")
            import soundfile as sf  # local import, kept from original style
            sf.write(audio_path, audio_data, sample_rate)

            # Run Basic Pitch inference (ICASSP 2022 model, CPU-friendly).
            model_output, midi_data, note_events = predict(
                audio_path,
                model_or_model_path=ICASSP_2022_MODEL_PATH,
                onset_thresh=0.5,   # note-attack sensitivity
                frame_thresh=0.3,   # note-sustain sensitivity
                minimum_note_length=127.70254248031496,  # ms (library default)
                minimum_frequency=10,
                maximum_frequency=2000,
                melodia_trick=True,
                sonify=False,
            )

            # Write the MIDI to a persistent temp file so it outlives this
            # function and Gradio's gr.File can still serve it for download.
            midi_fd, midi_path = tempfile.mkstemp(suffix=".mid")
            os.close(midi_fd)  # midi_data.write() reopens the path itself
            midi_data.write(midi_path)

            note_summary = generate_note_summary(note_events)
            return midi_path, note_summary
    except Exception as e:
        # Surface the failure in the summary box rather than raising.
        return None, f"Error: {str(e)}"
def generate_note_summary(note_events):
    """
    Generate a human-readable summary of detected notes.

    Args:
        note_events: List of tuples
            (start_time, end_time, pitch_midi, amplitude, pitch_bends).

    Returns:
        Formatted string: a fixed-width table of note events followed by
        average duration/amplitude statistics, or a "no notes" message
        when the list is empty or None.
    """
    # Falsy covers both None and the empty list (original also checked
    # len() == 0, which is redundant).
    if not note_events:
        return "No notes detected in the audio."

    divider = "-" * 70 + "\n"
    # Collect parts and join once at the end — the original built the
    # summary with repeated `+=`, which is quadratic in the note count.
    # "✅" restores the mojibake'd check-mark glyph from the original.
    parts = [
        "✅ Transcription Complete\n",
        f"Total notes detected: {len(note_events)}\n\n",
        "Note Events:\n",
        divider,
        f"{'Start (s)':<12} {'End (s)':<12} {'MIDI':<8} {'Duration':<12} {'Amplitude':<12}\n",
        divider,
    ]

    # Single pass: emit each table row and gather stats as we go (the
    # original re-scanned note_events twice more just for the averages).
    durations = []
    amplitudes = []
    for start_time, end_time, midi_pitch, amplitude, _pitch_bends in note_events:
        duration = end_time - start_time
        durations.append(duration)
        amplitudes.append(amplitude)
        parts.append(
            f"{start_time:<12.3f} {end_time:<12.3f} {midi_pitch:<8} "
            f"{duration:<12.3f} {amplitude:<12.3f}\n"
        )
    parts.append(divider)

    avg_duration = float(np.mean(durations))
    avg_amplitude = float(np.mean(amplitudes))
    parts.append("\nStatistics:\n")
    parts.append(f"Average note duration: {avg_duration:.3f}s\n")
    parts.append(f"Average amplitude: {avg_amplitude:.3f}\n")
    return "".join(parts)
def create_gradio_interface():
    """
    Create the Gradio interface for Basic Pitch transcription.

    Returns:
        The assembled gr.Blocks app: audio upload/record plus a transcribe
        button on the left, MIDI download and a text summary on the right,
        with the button wired to ``transcribe_audio``.
    """
    # NOTE(review): several emoji in the Markdown/labels below look
    # mojibake'd (e.g. "๐ต", "โ๏ธ") — likely a lost UTF-8 round-trip;
    # confirm the intended glyphs. Left byte-identical here.
    with gr.Blocks(title="Basic Pitch - Audio to MIDI") as demo:
        # Page header / project blurb.
        gr.Markdown("""
# ๐ต Basic Pitch: Automatic Music Transcription
Convert audio files to MIDI notation using Spotify's **Basic Pitch** model.
This lightweight neural network performs **automatic music transcription (AMT)**
and works with any instrument or voice.
""")
        with gr.Row():
            # Left column: input controls.
            with gr.Column(scale=1):
                gr.Markdown("### ๐ค Input")
                # type="numpy" delivers (sample_rate, ndarray) tuples, which
                # is exactly what transcribe_audio unpacks.
                audio_input = gr.Audio(
                    label="Upload Audio File",
                    type="numpy",
                    sources=["upload", "microphone"]
                )
                gr.Markdown("""
**Supported formats:**
- `.wav`, `.mp3`, `.ogg`, `.flac`, `.m4a`
**Recommended:**
- Mono audio (single instrument)
- Clear, high-quality recordings
- 30 seconds to 5 minutes duration
""")
                transcribe_btn = gr.Button(
                    "๐ผ Transcribe to MIDI",
                    variant="primary",
                    size="lg"
                )
            # Right column: outputs (MIDI file + note summary).
            with gr.Column(scale=1):
                gr.Markdown("### ๐ฅ Output")
                # Receives the path returned by transcribe_audio.
                midi_file = gr.File(
                    label="Download MIDI",
                    type="filepath"
                )
                note_info = gr.Textbox(
                    label="Note Detection Summary",
                    lines=15,
                    interactive=False,
                    max_lines=20
                )
        # Static documentation: model details. The thresholds quoted here
        # mirror the values hard-coded in transcribe_audio's predict() call.
        gr.Markdown("""
---
### โ๏ธ Model Details
**Model:** ICASSP 2022 (Spotify Basic Pitch)
- Lightweight: ~20 MB
- CPU-optimized inference
- No GPU required
**Detection Parameters:**
- Onset threshold: 0.5 (note attack sensitivity)
- Frame threshold: 0.3 (note sustain sensitivity)
- Frequency range: 10 Hz - 2000 Hz
- Melodia post-processing: Enabled
**Output:**
- MIDI file with detected notes
- Note timing and pitch information
- Amplitude/velocity data
""")
        # Static documentation: usage tips and known limitations.
        gr.Markdown("""
---
### ๐ก Tips for Best Results
1. **Single instrument:** Works best with one instrument or voice
2. **Mono audio:** Use mono recordings when possible
3. **Clear audio:** Avoid background noise
4. **Duration:** Works with any length, but 30s-5min is typical
5. **Polyphonic:** Can detect multiple simultaneous notes
**Limitations:**
- Works best with pitched instruments (not drums)
- May struggle with very fast passages
- Polyphonic music may need manual correction
""")
        # Static documentation: attribution and citation.
        gr.Markdown("""
---
### ๐ About Basic Pitch
Developed by [Spotify's Audio Intelligence Lab](https://github.com/spotify/basic-pitch)
**Citation:**
```
Basic Pitch: A Lightweight Yet Effective Pitch Detection Model
for Automatic Music Transcription
Spotify, 2022
```
""")
        # Connect button to function
        transcribe_btn.click(
            fn=transcribe_audio,
            inputs=[audio_input],
            outputs=[midi_file, note_info]
        )
    return demo
if __name__ == "__main__":
    # Build the app and serve it on all interfaces at the Hugging Face
    # Spaces default port; share links are unnecessary inside a Space.
    demo = create_gradio_interface()
    demo.launch(server_name="0.0.0.0", server_port=7860, share=False)