| """ |
| Basic Pitch Audio-to-MIDI Converter |
| Hugging Face Space for CPU inference |
| July 2024 version |
| """ |
|
|
| import gradio as gr |
| import numpy as np |
| from basic_pitch.inference import predict |
| from basic_pitch import ICASSP_2022_MODEL_PATH |
| import tempfile |
| import os |
|
|
|
|
def transcribe_audio(audio_input):
    """
    Transcribe audio to MIDI using Basic Pitch model.

    Args:
        audio_input: Tuple of (sample_rate, audio_array) from Gradio Audio component

    Returns:
        midi_file_path: Path to generated MIDI file. The file outlives this
            function so Gradio can serve it for download.
        note_summary: Summary of detected notes
    """
    try:
        if audio_input is None:
            return None, "Please upload an audio file first."

        sample_rate, audio_data = audio_input

        # Scratch directory for the intermediate WAV only; it is deleted
        # as soon as the `with` block exits.
        with tempfile.TemporaryDirectory() as tmpdir:
            audio_path = os.path.join(tmpdir, "input_audio.wav")

            # Local import keeps module import time down and matches the
            # original's placement; soundfile is only needed here.
            import soundfile as sf
            sf.write(audio_path, audio_data, sample_rate)

            # Run Basic Pitch inference (thresholds documented in the UI text).
            model_output, midi_data, note_events = predict(
                audio_path,
                model_or_model_path=ICASSP_2022_MODEL_PATH,
                onset_thresh=0.5,
                frame_thresh=0.3,
                minimum_note_length=127.70254248031496,
                minimum_frequency=10,
                maximum_frequency=2000,
                melodia_trick=True,
                sonify=False
            )

            # BUG FIX: the original wrote output.mid inside the
            # TemporaryDirectory above, so the file was deleted when the
            # `with` block exited and the returned path was dangling by the
            # time Gradio tried to serve it.  Write the MIDI to a temp file
            # that is NOT auto-deleted instead.
            fd, midi_path = tempfile.mkstemp(suffix=".mid")
            os.close(fd)  # only the path is needed; midi_data.write reopens it
            midi_data.write(midi_path)

            note_summary = generate_note_summary(note_events)

        return midi_path, note_summary

    except Exception as e:
        # UI boundary: surface the error as text rather than crashing the app.
        return None, f"Error: {str(e)}"
|
|
|
|
def generate_note_summary(note_events):
    """
    Generate a human-readable summary of detected notes.

    Args:
        note_events: List of tuples (start_time, end_time, pitch_midi, amplitude, pitch_bends)

    Returns:
        Formatted string summary (tabular note listing plus aggregate stats).
    """
    # Covers both None and an empty list; the original's extra
    # `len(note_events) == 0` check was redundant.
    if not note_events:
        return "No notes detected in the audio."

    divider = "-" * 70
    # NOTE(review): the leading "β" looks like a mojibake'd check-mark /
    # emoji from a bad encoding round-trip — confirm the intended glyph.
    lines = [
        "β Transcription Complete",
        f"Total notes detected: {len(note_events)}",
        "",
        "Note Events:",
        divider,
        f"{'Start (s)':<12} {'End (s)':<12} {'MIDI':<8} {'Duration':<12} {'Amplitude':<12}",
        divider,
    ]

    durations = []
    amplitudes = []
    # Single pass over note_events: format each table row and collect the
    # stats inputs as we go (the original iterated the list three times
    # and built the string with quadratic `+=` concatenation).
    for start_time, end_time, midi_pitch, amplitude, _pitch_bends in note_events:
        duration = end_time - start_time
        durations.append(duration)
        amplitudes.append(amplitude)
        lines.append(
            f"{start_time:<12.3f} {end_time:<12.3f} {midi_pitch:<8} "
            f"{duration:<12.3f} {amplitude:<12.3f}"
        )

    lines.append(divider)
    lines.append("")
    lines.append("Statistics:")
    lines.append(f"Average note duration: {np.mean(durations):.3f}s")
    lines.append(f"Average amplitude: {np.mean(amplitudes):.3f}")

    # Trailing newline matches the original's last `+= ...\n`.
    return "\n".join(lines) + "\n"
|
|
|
|
def create_gradio_interface():
    """
    Create the Gradio interface for Basic Pitch transcription.

    Builds a two-column Blocks layout (input on the left, output on the
    right) plus static documentation panels, and wires the transcribe
    button to `transcribe_audio`.

    Returns:
        The assembled (not yet launched) gr.Blocks app.
    """

    with gr.Blocks(title="Basic Pitch - Audio to MIDI") as demo:

        # Page header / app description.
        # NOTE(review): characters like "π΅" throughout these markdown
        # strings look like mojibake'd emoji from an encoding round-trip —
        # confirm the intended glyphs before shipping.
        gr.Markdown("""
        # π΅ Basic Pitch: Automatic Music Transcription

        Convert audio files to MIDI notation using Spotify's **Basic Pitch** model.

        This lightweight neural network performs **automatic music transcription (AMT)**
        and works with any instrument or voice.
        """)

        with gr.Row():
            # Left column: audio input, usage hints, and the action button.
            with gr.Column(scale=1):
                gr.Markdown("### π€ Input")

                # type="numpy" delivers (sample_rate, ndarray) tuples,
                # which is exactly what transcribe_audio expects.
                audio_input = gr.Audio(
                    label="Upload Audio File",
                    type="numpy",
                    sources=["upload", "microphone"]
                )

                gr.Markdown("""
                **Supported formats:**
                - `.wav`, `.mp3`, `.ogg`, `.flac`, `.m4a`

                **Recommended:**
                - Mono audio (single instrument)
                - Clear, high-quality recordings
                - 30 seconds to 5 minutes duration
                """)

                transcribe_btn = gr.Button(
                    "πΌ Transcribe to MIDI",
                    variant="primary",
                    size="lg"
                )

            # Right column: downloadable MIDI file and the text summary.
            with gr.Column(scale=1):
                gr.Markdown("### π₯ Output")

                midi_file = gr.File(
                    label="Download MIDI",
                    type="filepath"
                )

                note_info = gr.Textbox(
                    label="Note Detection Summary",
                    lines=15,
                    interactive=False,
                    max_lines=20
                )

        # Static documentation: model details and the inference parameters
        # (these mirror the hard-coded values passed to predict()).
        gr.Markdown("""
        ---
        ### βοΈ Model Details

        **Model:** ICASSP 2022 (Spotify Basic Pitch)
        - Lightweight: ~20 MB
        - CPU-optimized inference
        - No GPU required

        **Detection Parameters:**
        - Onset threshold: 0.5 (note attack sensitivity)
        - Frame threshold: 0.3 (note sustain sensitivity)
        - Frequency range: 10 Hz - 2000 Hz
        - Melodia post-processing: Enabled

        **Output:**
        - MIDI file with detected notes
        - Note timing and pitch information
        - Amplitude/velocity data
        """)

        # Static documentation: usage tips and known limitations.
        gr.Markdown("""
        ---
        ### π‘ Tips for Best Results

        1. **Single instrument:** Works best with one instrument or voice
        2. **Mono audio:** Use mono recordings when possible
        3. **Clear audio:** Avoid background noise
        4. **Duration:** Works with any length, but 30s-5min is typical
        5. **Polyphonic:** Can detect multiple simultaneous notes

        **Limitations:**
        - Works best with pitched instruments (not drums)
        - May struggle with very fast passages
        - Polyphonic music may need manual correction
        """)

        # Static documentation: attribution and citation.
        gr.Markdown("""
        ---
        ### π About Basic Pitch

        Developed by [Spotify's Audio Intelligence Lab](https://github.com/spotify/basic-pitch)

        **Citation:**
        ```
        Basic Pitch: A Lightweight Yet Effective Pitch Detection Model
        for Automatic Music Transcription
        Spotify, 2022
        ```
        """)

        # Wire the button: audio in, (MIDI file, summary text) out.
        transcribe_btn.click(
            fn=transcribe_audio,
            inputs=[audio_input],
            outputs=[midi_file, note_info]
        )

    return demo
|
|
|
|
if __name__ == "__main__":
    # Build the UI and serve it on all interfaces at the conventional
    # Hugging Face Spaces port (7860), without a public share link.
    app = create_gradio_interface()
    app.launch(server_name="0.0.0.0", server_port=7860, share=False)
|
|