# Basic_pitch_from_Spotify / python.py.txt
# Hugging Face Space file (uploaded by Aliwan, "Upload 3 files", commit 5e5a371 verified)
"""
Basic Pitch Audio-to-MIDI Converter
Hugging Face Space for CPU inference
July 2024 version
"""
import gradio as gr
import numpy as np
from basic_pitch.inference import predict
from basic_pitch import ICASSP_2022_MODEL_PATH
import tempfile
import os
def transcribe_audio(audio_input):
    """
    Transcribe audio to MIDI using the Basic Pitch model.

    Args:
        audio_input: Tuple of (sample_rate, audio_array) from the Gradio
            Audio component (type="numpy"), or None if nothing was uploaded.

    Returns:
        Tuple of (midi_file_path, note_summary). On failure midi_file_path
        is None and note_summary carries the error/help message.
    """
    try:
        if audio_input is None:
            return None, "Please upload an audio file first."
        sample_rate, audio_data = audio_input
        # Temporary directory only for the intermediate WAV; it is cleaned
        # up as soon as inference is done.
        with tempfile.TemporaryDirectory() as tmpdir:
            audio_path = os.path.join(tmpdir, "input_audio.wav")
            # Local import keeps soundfile out of module import time.
            import soundfile as sf
            sf.write(audio_path, audio_data, sample_rate)
            # Run Basic Pitch inference with the bundled ICASSP 2022 model.
            model_output, midi_data, note_events = predict(
                audio_path,
                model_or_model_path=ICASSP_2022_MODEL_PATH,
                onset_thresh=0.5,
                frame_thresh=0.3,
                minimum_note_length=127.70254248031496,  # milliseconds
                minimum_frequency=10,
                maximum_frequency=2000,
                melodia_trick=True,
                sonify=False
            )
        # BUG FIX: the MIDI file must outlive the TemporaryDirectory above.
        # Writing it inside tmpdir returned a path that was already deleted
        # by the time Gradio tried to serve it. Write to a persistent
        # temp file instead (OS temp cleanup reclaims it eventually).
        fd, midi_path = tempfile.mkstemp(suffix=".mid", prefix="basic_pitch_")
        os.close(fd)
        midi_data.write(midi_path)
        # Generate the human-readable note report shown next to the download.
        note_summary = generate_note_summary(note_events)
        return midi_path, note_summary
    except Exception as e:
        # Surface the failure in the UI textbox instead of crashing the worker.
        return None, f"Error: {str(e)}"
def generate_note_summary(note_events):
    """
    Build a human-readable report for a list of detected note events.

    Args:
        note_events: Sequence of (start_time, end_time, pitch_midi,
            amplitude, pitch_bends) tuples.

    Returns:
        Multi-line string with a per-note table followed by aggregate
        statistics, or a short message when nothing was detected.
    """
    if not note_events:
        return "No notes detected in the audio."

    divider = "-" * 70 + "\n"
    header = (
        f"{'Start (s)':<12} {'End (s)':<12} {'MIDI':<8} "
        f"{'Duration':<12} {'Amplitude':<12}\n"
    )
    parts = [
        "βœ“ Transcription Complete\n",
        f"Total notes detected: {len(note_events)}\n\n",
        "Note Events:\n",
        divider,
        header,
        divider,
    ]
    # One table row per detected note.
    for onset, offset, pitch, velocity, _bends in note_events:
        parts.append(
            f"{onset:<12.3f} {offset:<12.3f} {pitch:<8} "
            f"{offset - onset:<12.3f} {velocity:<12.3f}\n"
        )
    parts.append(divider)

    # Aggregate statistics over all notes.
    mean_duration = np.mean([off - on for on, off, _, _, _ in note_events])
    mean_amplitude = np.mean([vel for _, _, _, vel, _ in note_events])
    parts.append("\nStatistics:\n")
    parts.append(f"Average note duration: {mean_duration:.3f}s\n")
    parts.append(f"Average amplitude: {mean_amplitude:.3f}\n")
    return "".join(parts)
def create_gradio_interface():
    """
    Create the Gradio interface for Basic Pitch transcription.

    Builds a two-column Blocks layout — audio upload and transcribe button
    on the left, MIDI download and note summary on the right — followed by
    static documentation panels, and wires the button to transcribe_audio.

    Returns:
        The assembled gr.Blocks app (not yet launched).
    """
    # NOTE(review): indentation of this file was lost in extraction; the
    # widget nesting below is the conventional reconstruction — the footer
    # Markdown panels are placed at Blocks level, after the Row. Confirm
    # against the original Space layout.
    with gr.Blocks(title="Basic Pitch - Audio to MIDI") as demo:
        # Page header / intro text.
        gr.Markdown("""
# 🎡 Basic Pitch: Automatic Music Transcription
Convert audio files to MIDI notation using Spotify's **Basic Pitch** model.
This lightweight neural network performs **automatic music transcription (AMT)**
and works with any instrument or voice.
""")
        with gr.Row():
            # Left column: input controls.
            with gr.Column(scale=1):
                gr.Markdown("### πŸ“€ Input")
                # type="numpy" delivers (sample_rate, ndarray) to the callback,
                # matching what transcribe_audio unpacks.
                audio_input = gr.Audio(
                    label="Upload Audio File",
                    type="numpy",
                    sources=["upload", "microphone"]
                )
                gr.Markdown("""
**Supported formats:**
- `.wav`, `.mp3`, `.ogg`, `.flac`, `.m4a`
**Recommended:**
- Mono audio (single instrument)
- Clear, high-quality recordings
- 30 seconds to 5 minutes duration
""")
                transcribe_btn = gr.Button(
                    "🎼 Transcribe to MIDI",
                    variant="primary",
                    size="lg"
                )
            # Right column: outputs.
            with gr.Column(scale=1):
                gr.Markdown("### πŸ“₯ Output")
                # Receives the MIDI file path returned by transcribe_audio.
                midi_file = gr.File(
                    label="Download MIDI",
                    type="filepath"
                )
                # Receives the text summary returned by transcribe_audio.
                note_info = gr.Textbox(
                    label="Note Detection Summary",
                    lines=15,
                    interactive=False,
                    max_lines=20
                )
        # Static documentation: model details (thresholds here mirror the
        # keyword arguments passed to predict in transcribe_audio).
        gr.Markdown("""
---
### βš™οΈ Model Details
**Model:** ICASSP 2022 (Spotify Basic Pitch)
- Lightweight: ~20 MB
- CPU-optimized inference
- No GPU required
**Detection Parameters:**
- Onset threshold: 0.5 (note attack sensitivity)
- Frame threshold: 0.3 (note sustain sensitivity)
- Frequency range: 10 Hz - 2000 Hz
- Melodia post-processing: Enabled
**Output:**
- MIDI file with detected notes
- Note timing and pitch information
- Amplitude/velocity data
""")
        # Static documentation: usage tips.
        gr.Markdown("""
---
### πŸ’‘ Tips for Best Results
1. **Single instrument:** Works best with one instrument or voice
2. **Mono audio:** Use mono recordings when possible
3. **Clear audio:** Avoid background noise
4. **Duration:** Works with any length, but 30s-5min is typical
5. **Polyphonic:** Can detect multiple simultaneous notes
**Limitations:**
- Works best with pitched instruments (not drums)
- May struggle with very fast passages
- Polyphonic music may need manual correction
""")
        # Static documentation: attribution and citation.
        gr.Markdown("""
---
### πŸ“š About Basic Pitch
Developed by [Spotify's Audio Intelligence Lab](https://github.com/spotify/basic-pitch)
**Citation:**
```
Basic Pitch: A Lightweight Yet Effective Pitch Detection Model
for Automatic Music Transcription
Spotify, 2022
```
""")
        # Connect button to function: one click runs transcription and
        # fills both the file download and the summary textbox.
        transcribe_btn.click(
            fn=transcribe_audio,
            inputs=[audio_input],
            outputs=[midi_file, note_info]
        )
    return demo
if __name__ == "__main__":
    # Build the UI and serve it on all interfaces at the standard
    # Hugging Face Space port, without a public share link.
    app = create_gradio_interface()
    app.launch(server_name="0.0.0.0", server_port=7860, share=False)