Spaces:

mtg-upf
/

audio-difficulty

Running on Zero

App Files Files Community

PRamoneda committed on May 15

Commit

c66e52a

0 Parent(s):

Initial commit for Hugging Face Space

Browse files

Files changed (22) hide show

.idea/.gitignore +8 -0
.idea/inspectionProfiles/profiles_settings.xml +6 -0
.idea/interface-audio-difficulty.iml +7 -0
.idea/misc.xml +7 -0
.idea/vcs.xml +7 -0
README.md +9 -0
__pycache__/get_difficulty.cpython-310.pyc +0 -0
__pycache__/get_difficulty.cpython-312.pyc +0 -0
__pycache__/model.cpython-310.pyc +0 -0
__pycache__/model.cpython-312.pyc +0 -0
__pycache__/model.cpython-38.pyc +0 -0
__pycache__/utils.cpython-310.pyc +0 -0
__pycache__/utils.cpython-312.pyc +0 -0
__pycache__/utils.cpython-38.pyc +0 -0
app.py +39 -0
app.txt +1 -0
clean.py +15 -0
get_difficulty.py +115 -0
poetry.lock +0 -0
pyproject.toml +23 -0
requirements.txt +106 -0
utils.py +37 -0

.idea/.gitignore ADDED Viewed

	@@ -0,0 +1,8 @@

+# Default ignored files
+/shelf/
+/workspace.xml
+# Editor-based HTTP Client requests
+/httpRequests/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml

.idea/inspectionProfiles/profiles_settings.xml ADDED Viewed

	@@ -0,0 +1,6 @@

+<component name="InspectionProjectProfileManager">
+  <settings>
+    <option name="USE_PROJECT_PROFILE" value="false" />
+    <version value="1.0" />
+  </settings>
+</component>

.idea/interface-audio-difficulty.iml ADDED Viewed

	@@ -0,0 +1,7 @@

+<?xml version="1.0" encoding="UTF-8"?>
+<module version="4">
+  <component name="PyDocumentationSettings">
+    <option name="format" value="PLAIN" />
+    <option name="myDocStringFormat" value="Plain" />
+  </component>
+</module>

.idea/misc.xml ADDED Viewed

	@@ -0,0 +1,7 @@

+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="Black">
+    <option name="sdkName" value="Poetry (interface-audio-difficulty)" />
+  </component>
+  <component name="ProjectRootManager" version="2" project-jdk-name="Poetry (interface-audio-difficulty) (2)" project-jdk-type="Python SDK" />
+</project>

.idea/vcs.xml ADDED Viewed

	@@ -0,0 +1,7 @@

+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="" vcs="Git" />
+    <mapping directory="$PROJECT_DIR$/temp-space" vcs="Git" />
+  </component>
+</project>

README.md ADDED Viewed

	@@ -0,0 +1,9 @@

+# Music Difficulty Estimator 🎹
+Upload an MP3 or MP4 file, or paste a YouTube link. The app extracts the audio, predicts the piano score difficulty, and generates a MIDI file.
+- Supports video/audio inputs
+- Uses Gradio and ffmpeg-python
+- Fully Python-based, no system-level ffmpeg required for conversion
+Built with ❤️ using Poetry + Gradio.

__pycache__/get_difficulty.cpython-310.pyc ADDED Viewed

Binary file (3.75 kB). View file

__pycache__/get_difficulty.cpython-312.pyc ADDED Viewed

Binary file (6.19 kB). View file

__pycache__/model.cpython-310.pyc ADDED Viewed

Binary file (10.8 kB). View file

__pycache__/model.cpython-312.pyc ADDED Viewed

Binary file (20.5 kB). View file

__pycache__/model.cpython-38.pyc ADDED Viewed

Binary file (10.7 kB). View file

__pycache__/utils.cpython-310.pyc ADDED Viewed

Binary file (1.29 kB). View file

__pycache__/utils.cpython-312.pyc ADDED Viewed

Binary file (1.94 kB). View file

__pycache__/utils.cpython-38.pyc ADDED Viewed

Binary file (1.26 kB). View file

app.py ADDED Viewed

	@@ -0,0 +1,39 @@

+from huggingface_hub import hf_hub_download
+import torch
+import os
+# Hugging Face Hub repository that the checkpoint files are downloaded from.
+REPO_ID = "pramoneda/audio"
+# Local base directory; each model gets its own cache sub-directory beneath it.
+CACHE_BASE = "models"
+def download_model_checkpoint(model_name: str, checkpoint_id: int):
+    filename = f"{model_name}/checkpoint_{checkpoint_id}_clean.pth"
+    cache_dir = os.path.join(CACHE_BASE, model_name)
+    print(f"Downloading {filename} from {REPO_ID} to {cache_dir}")
+    path = hf_hub_download(
+        repo_id=REPO_ID,
+        filename=filename,
+        cache_dir=cache_dir
+    )
+    state_dict = torch.load(path, map_location="cpu")
+    return state_dict
+def ensure_local_checkpoints():
+    models = {
+        "audio_midi_cqt5_ps_v5": 0,
+        "audio_midi_pianoroll_ps_5_v4": 0,
+        "audio_midi_multi_ps_v5": 0
+    }
+    for model_name, checkpoint_id in models.items():
+        try:
+            _ = download_model_checkpoint(model_name, checkpoint_id)
+        except Exception as e:
+            print(f"❌ Failed to download {model_name}: {e}")
+if __name__ == "__main__":
+    # Script entry point: attempt to fetch every checkpoint listed in ensure_local_checkpoints().
+    ensure_local_checkpoints()

app.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ ffmpeg

clean.py ADDED Viewed

	@@ -0,0 +1,15 @@

+import os
+def delete_clean_checkpoints(root_dir="models"):
+    deleted = 0
+    for dirpath, _, filenames in os.walk(root_dir):
+        for fname in filenames:
+            if fname.endswith("_clean.pth"):
+                file_path = os.path.join(dirpath, fname)
+                print(f"🗑️ Deleting: {file_path}")
+                os.remove(file_path)
+                deleted += 1
+    print(f"\n✅ Deleted {deleted} clean checkpoint(s) from '{root_dir}'")
+if __name__ == "__main__":
+    # Script entry point: purge clean checkpoints from the "models" directory tree.
+    delete_clean_checkpoints("models")

get_difficulty.py ADDED Viewed

	@@ -0,0 +1,115 @@

+import os
+import pdb
+from statistics import mean
+import torch
+from torch import nn
+import numpy as np
+import librosa
+from piano_transcription_inference import PianoTranscription, sample_rate, load_audio
+import pretty_midi
+from utils import prediction2label
+from model import AudioModel
+from scipy.signal import resample
+def downsample_log_cqt(cqt_matrix, target_fs=5):
+    original_fs = 44100 / 160
+    ratio = original_fs / target_fs
+    downsampled = resample(cqt_matrix, int(cqt_matrix.shape[0] / ratio), axis=0)
+    return downsampled
+def downsample_matrix(mat, original_fs, target_fs):
+    ratio = original_fs / target_fs
+    return resample(mat, int(mat.shape[0] / ratio), axis=0)
+def get_cqt_from_mp3(mp3_path):
+    sample_rate = 44100
+    hop_length = 160
+    y, sr = librosa.load(mp3_path, sr=sample_rate, mono=True)
+    cqt = librosa.cqt(y, sr=sr, hop_length=hop_length, n_bins=88, bins_per_octave=12)
+    log_cqt = librosa.amplitude_to_db(np.abs(cqt))
+    log_cqt = log_cqt.T  # shape (T, 88)
+    log_cqt = downsample_log_cqt(log_cqt, target_fs=5)
+    cqt_tensor = torch.tensor(log_cqt, dtype=torch.float32).unsqueeze(0).unsqueeze(0).cuda()
+    # pdb.set_trace()
+    print(f"cqt shape: {log_cqt.shape}")
+    return cqt_tensor
+def get_pianoroll_from_mp3(mp3_path):
+    """Transcribe an audio file to MIDI and return piano-roll + onset tensors.
+
+    The audio is transcribed on the GPU, the MIDI is written to ``temp.mid``
+    in the current working directory, and a two-channel tensor is built from
+    it: channel 0 is the scaled piano roll, channel 1 a binary onset matrix.
+
+    Args:
+        mp3_path: Path of the audio file to transcribe.
+
+    Returns:
+        A float CUDA tensor of shape (1, 2, T, 88), T being the number of
+        piano-roll frames at 5 frames per second.
+    """
+    audio, _ = load_audio(mp3_path, sr=sample_rate, mono=True)
+    transcriptor = PianoTranscription(device='cuda')
+    # NOTE(review): fixed relative path, overwritten on every call — concurrent
+    # calls would presumably clash; confirm whether other code reads this file.
+    midi_path = "temp.mid"
+    transcriptor.transcribe(audio, midi_path)
+    midi_data = pretty_midi.PrettyMIDI(midi_path)
+    # Create pianoroll and onset matrix
+    fs = 5  # frames per second used for the piano roll and the onset frames
+    # Keep rows 21..108 (88 pitches) and put time on axis 0.
+    piano_roll = midi_data.get_piano_roll(fs=fs)[21:109].T  # shape: (T, 88)
+    # presumably scales MIDI velocities (max 127) towards [0, 1] — TODO confirm
+    piano_roll = piano_roll / 127
+    time_steps = piano_roll.shape[0]
+    onsets = np.zeros_like(piano_roll)
+    for instrument in midi_data.instruments:
+        for note in instrument.notes:
+            pitch = note.pitch - 21
+            onset_frame = int(note.start * fs)
+            # Skip notes outside the 88 kept pitches or starting past the last frame.
+            if 0 <= pitch < 88 and onset_frame < time_steps:
+                onsets[onset_frame, pitch] = 1.0
+    # Each tensor becomes (1, 1, 88, T); concatenating on dim 1 gives (1, 2, 88, T).
+    pr_tensor = torch.tensor(piano_roll.T).unsqueeze(0).unsqueeze(1).cuda().float()
+    on_tensor = torch.tensor(onsets.T).unsqueeze(0).unsqueeze(1).cuda().float()
+    out_tensor = torch.cat([pr_tensor, on_tensor], dim=1)
+    # The printed shape is the one before the final transpose.
+    print(f"piano_roll shape: {out_tensor.shape}")
+    return out_tensor.transpose(2, 3)
+def predict_difficulty(mp3_path, model_name, rep):
+    if "only_cqt" in rep:
+        only_cqt, only_pr = True, False
+        rep_clean = "multimodal5"
+    elif "only_pr" in rep:
+        only_cqt, only_pr = False, True
+        rep_clean = "multimodal5"
+    else:
+        only_cqt = only_pr = False
+        rep_clean = rep
+    model = AudioModel(num_classes=11, rep=rep_clean, modality_dropout=False, only_cqt=only_cqt, only_pr=only_pr)
+    checkpoint = [torch.load(f"models/{model_name}/checkpoint_{i}.pth", map_location="cuda", weights_only=False)
+                  for i in range(5)]
+    if rep == "cqt5":
+        inp_data = get_cqt_from_mp3(mp3_path)
+    elif rep == "pianoroll5":
+        inp_data = get_pianoroll_from_mp3(mp3_path)
+    elif rep_clean == "multimodal5":
+        x1 = get_pianoroll_from_mp3(mp3_path)
+        x2 = get_cqt_from_mp3(mp3_path)
+        inp_data = [x1, x2]
+    else:
+        raise ValueError(f"Representation {rep} not supported")
+    preds = []
+    for cheks in checkpoint:
+        model.load_state_dict(cheks["model_state_dict"])
+        model = model.cuda().eval()
+        with torch.inference_mode():
+            logits = model(inp_data, None)
+            pred = prediction2label(logits).item()
+            preds.append(pred)
+    return mean(preds)
+    # return preds
+if __name__ == "__main__":
+    # Manual smoke test: run the multimodal ensemble on a local audio file.
+    mp3_path = "yt_audio.mp3"
+    model_name = ""  # NOTE(review): never read below; the model name is passed as a literal
+    # Disabled variants that run the CQT-only and piano-roll-only models instead:
+    # pred_cqt = predict_difficulty(mp3_path, model_name="audio_midi_cqt5_ps_v5", rep="cqt5")
+    # print(f"Predicción dificultad CQT: {pred_cqt}")
+    # pred_pr = predict_difficulty(mp3_path, model_name="audio_midi_pianoroll_ps_5_v4", rep="pianoroll5")
+    # print(f"Predicción dificultad PR: {pred_pr}")
+    pred_multi = predict_difficulty(mp3_path, model_name="audio_midi_multi_ps_v5", rep="multimodal5")
+    print(f"Predicción dificultad multimodal: {pred_multi}")

poetry.lock ADDED Viewed

The diff for this file is too large to render. See raw diff

pyproject.toml ADDED Viewed

	@@ -0,0 +1,23 @@

+[tool.poetry]
+name = "interface-audio-difficulty"
+version = "0.1.0"
+description = ""
+authors = ["PRamoneda <PRamoneda@github.com>"]
+readme = "README.md"
+[tool.poetry.dependencies]
+python = "^3.10"
+gradio = "^5.29.0"
+pydub = "^0.25.1"
+yt-dlp = "^2025.4.30"
+librosa = "0.9.2"
+pretty_midi = "^0.2.10"
+ffmpeg-python = "^0.2.0"
+scipy = "^1.13.0"
+torch = "^2.2.0"
+piano-transcription-inference = "^0.0.6"
+seaborn = "^0.13.2"
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"

requirements.txt ADDED Viewed

	@@ -0,0 +1,106 @@

+aiofiles==24.1.0 ; python_version >= "3.10" and python_version < "4.0"
+annotated-types==0.7.0 ; python_version >= "3.10" and python_version < "4.0"
+anyio==4.9.0 ; python_version >= "3.10" and python_version < "4.0"
+audioop-lts==0.2.1 ; python_version >= "3.13" and python_version < "4.0"
+audioread==3.0.1 ; python_version >= "3.10" and python_version < "4.0"
+certifi==2025.4.26 ; python_version >= "3.10" and python_version < "4.0"
+cffi==1.17.1 ; python_version >= "3.10" and python_version < "4.0"
+charset-normalizer==3.4.2 ; python_version >= "3.10" and python_version < "4.0"
+click==8.2.0 ; python_version >= "3.10" and python_version < "4.0" and sys_platform != "emscripten"
+colorama==0.4.6 ; python_version >= "3.10" and python_version < "4.0" and platform_system == "Windows"
+contourpy==1.3.2 ; python_version >= "3.10" and python_version < "4.0"
+cycler==0.12.1 ; python_version >= "3.10" and python_version < "4.0"
+decorator==5.2.1 ; python_version >= "3.10" and python_version < "4.0"
+exceptiongroup==1.3.0 ; python_version >= "3.10" and python_version < "3.11"
+fastapi==0.115.12 ; python_version >= "3.10" and python_version < "4.0"
+ffmpeg-python==0.2.0 ; python_version >= "3.10" and python_version < "4.0"
+ffmpy==0.5.0 ; python_version >= "3.10" and python_version < "4.0"
+filelock==3.18.0 ; python_version >= "3.10" and python_version < "4.0"
+fonttools==4.58.0 ; python_version >= "3.10" and python_version < "4.0"
+fsspec==2025.3.2 ; python_version >= "3.10" and python_version < "4.0"
+future==1.0.0 ; python_version >= "3.10" and python_version < "4.0"
+gradio-client==1.10.1 ; python_version >= "3.10" and python_version < "4.0"
+gradio==5.29.1 ; python_version >= "3.10" and python_version < "4.0"
+groovy==0.1.2 ; python_version >= "3.10" and python_version < "4.0"
+h11==0.16.0 ; python_version >= "3.10" and python_version < "4.0"
+httpcore==1.0.9 ; python_version >= "3.10" and python_version < "4.0"
+httpx==0.28.1 ; python_version >= "3.10" and python_version < "4.0"
+huggingface-hub==0.31.2 ; python_version >= "3.10" and python_version < "4.0"
+idna==3.10 ; python_version >= "3.10" and python_version < "4.0"
+jinja2==3.1.6 ; python_version >= "3.10" and python_version < "4.0"
+joblib==1.5.0 ; python_version >= "3.10" and python_version < "4.0"
+kiwisolver==1.4.8 ; python_version >= "3.10" and python_version < "4.0"
+librosa==0.9.2 ; python_version >= "3.10" and python_version < "4.0"
+llvmlite==0.44.0 ; python_version >= "3.10" and python_version < "4.0"
+markdown-it-py==3.0.0 ; python_version >= "3.10" and python_version < "4.0" and sys_platform != "emscripten"
+markupsafe==3.0.2 ; python_version >= "3.10" and python_version < "4.0"
+matplotlib==3.10.3 ; python_version >= "3.10" and python_version < "4.0"
+mdurl==0.1.2 ; python_version >= "3.10" and python_version < "4.0" and sys_platform != "emscripten"
+mido==1.3.3 ; python_version >= "3.10" and python_version < "4.0"
+mpmath==1.3.0 ; python_version >= "3.10" and python_version < "4.0"
+networkx==3.4.2 ; python_version >= "3.10" and python_version < "4.0"
+numba==0.61.2 ; python_version >= "3.10" and python_version < "4.0"
+numpy==2.2.5 ; python_version >= "3.10" and python_version < "4.0"
+nvidia-cublas-cu12==12.6.4.1 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.10" and python_version < "4.0"
+nvidia-cuda-cupti-cu12==12.6.80 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.10" and python_version < "4.0"
+nvidia-cuda-nvrtc-cu12==12.6.77 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.10" and python_version < "4.0"
+nvidia-cuda-runtime-cu12==12.6.77 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.10" and python_version < "4.0"
+nvidia-cudnn-cu12==9.5.1.17 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.10" and python_version < "4.0"
+nvidia-cufft-cu12==11.3.0.4 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.10" and python_version < "4.0"
+nvidia-cufile-cu12==1.11.1.6 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.10" and python_version < "4.0"
+nvidia-curand-cu12==10.3.7.77 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.10" and python_version < "4.0"
+nvidia-cusolver-cu12==11.7.1.2 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.10" and python_version < "4.0"
+nvidia-cusparse-cu12==12.5.4.2 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.10" and python_version < "4.0"
+nvidia-cusparselt-cu12==0.6.3 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.10" and python_version < "4.0"
+nvidia-nccl-cu12==2.26.2 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.10" and python_version < "4.0"
+nvidia-nvjitlink-cu12==12.6.85 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.10" and python_version < "4.0"
+nvidia-nvtx-cu12==12.6.77 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.10" and python_version < "4.0"
+orjson==3.10.18 ; python_version >= "3.10" and python_version < "4.0"
+packaging==25.0 ; python_version >= "3.10" and python_version < "4.0"
+pandas==2.2.3 ; python_version >= "3.10" and python_version < "4.0"
+piano-transcription-inference==0.0.6 ; python_version >= "3.10" and python_version < "4.0"
+pillow==11.2.1 ; python_version >= "3.10" and python_version < "4.0"
+platformdirs==4.3.8 ; python_version >= "3.10" and python_version < "4.0"
+pooch==1.8.2 ; python_version >= "3.10" and python_version < "4.0"
+pretty-midi==0.2.10 ; python_version >= "3.10" and python_version < "4.0"
+pycparser==2.22 ; python_version >= "3.10" and python_version < "4.0"
+pydantic-core==2.33.2 ; python_version >= "3.10" and python_version < "4.0"
+pydantic==2.11.4 ; python_version >= "3.10" and python_version < "4.0"
+pydub==0.25.1 ; python_version >= "3.10" and python_version < "4.0"
+pygments==2.19.1 ; python_version >= "3.10" and python_version < "4.0" and sys_platform != "emscripten"
+pyparsing==3.2.3 ; python_version >= "3.10" and python_version < "4.0"
+python-dateutil==2.9.0.post0 ; python_version >= "3.10" and python_version < "4.0"
+python-multipart==0.0.20 ; python_version >= "3.10" and python_version < "4.0"
+pytz==2025.2 ; python_version >= "3.10" and python_version < "4.0"
+pyyaml==6.0.2 ; python_version >= "3.10" and python_version < "4.0"
+requests==2.32.3 ; python_version >= "3.10" and python_version < "4.0"
+resampy==0.4.3 ; python_version >= "3.10" and python_version < "4.0"
+rich==14.0.0 ; python_version >= "3.10" and python_version < "4.0" and sys_platform != "emscripten"
+ruff==0.11.10 ; python_version >= "3.10" and python_version < "4.0" and sys_platform != "emscripten"
+safehttpx==0.1.6 ; python_version >= "3.10" and python_version < "4.0"
+scikit-learn==1.6.1 ; python_version >= "3.10" and python_version < "4.0"
+scipy==1.15.3 ; python_version >= "3.10" and python_version < "4.0"
+seaborn==0.13.2 ; python_version >= "3.10" and python_version < "4.0"
+semantic-version==2.10.0 ; python_version >= "3.10" and python_version < "4.0"
+setuptools==80.7.1 ; python_version >= "3.12" and python_version < "4.0" or platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.10" and python_version < "4.0"
+shellingham==1.5.4 ; python_version >= "3.10" and python_version < "4.0" and sys_platform != "emscripten"
+six==1.17.0 ; python_version >= "3.10" and python_version < "4.0"
+sniffio==1.3.1 ; python_version >= "3.10" and python_version < "4.0"
+soundfile==0.13.1 ; python_version >= "3.10" and python_version < "4.0"
+starlette==0.46.2 ; python_version >= "3.10" and python_version < "4.0"
+sympy==1.14.0 ; python_version >= "3.10" and python_version < "4.0"
+threadpoolctl==3.6.0 ; python_version >= "3.10" and python_version < "4.0"
+tomlkit==0.13.2 ; python_version >= "3.10" and python_version < "4.0"
+torch==2.7.0 ; python_version >= "3.10" and python_version < "4.0"
+torchlibrosa==0.1.0 ; python_version >= "3.10" and python_version < "4.0"
+tqdm==4.67.1 ; python_version >= "3.10" and python_version < "4.0"
+triton==3.3.0 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.10" and python_version < "4.0"
+typer==0.15.3 ; python_version >= "3.10" and python_version < "4.0" and sys_platform != "emscripten"
+typing-extensions==4.13.2 ; python_version >= "3.10" and python_version < "4.0"
+typing-inspection==0.4.0 ; python_version >= "3.10" and python_version < "4.0"
+tzdata==2025.2 ; python_version >= "3.10" and python_version < "4.0"
+urllib3==2.4.0 ; python_version >= "3.10" and python_version < "4.0"
+uvicorn==0.34.2 ; python_version >= "3.10" and python_version < "4.0" and sys_platform != "emscripten"
+websockets==15.0.1 ; python_version >= "3.10" and python_version < "4.0"
+yt-dlp==2025.4.30 ; python_version >= "3.10" and python_version < "4.0"
+huggingface_hub

utils.py ADDED Viewed

	@@ -0,0 +1,37 @@

+import json
+import pickle
+def save_json(dictionary, name_file):
+    with open(name_file, 'w') as fp:
+        json.dump(dictionary, fp, sort_keys=True, indent=4)
+def prediction2label(pred):
+    """Convert ordinal predictions to class labels, e.g.
+    [0.9, 0.1, 0.1, 0.1] -> 0
+    [0.9, 0.9, 0.1, 0.1] -> 1
+    [0.9, 0.9, 0.9, 0.1] -> 2
+    etc.
+    """
+    return (pred > 0.5).cumprod(axis=1).sum(axis=1) - 1
+def load_json(name_file):
+    data = None
+    with open(name_file, 'r') as fp:
+        data = json.load(fp)
+    return data
+def save_binary(dictionary, name_file):
+    with open(name_file, 'wb') as fp:
+        pickle.dump(dictionary, fp, protocol=pickle.HIGHEST_PROTOCOL)
+def load_binary(name_file):
+    data = None
+    with open(name_file, 'rb') as fp:
+        data = pickle.load(fp)
+    return data