import librosa
import numpy as np
import torch


# ----------- Initialization -----------
def init_singmos():
    print("[Init] Loading SingMOS...")
    return torch.hub.load(
        "South-Twilight/SingMOS:v0.3.0", "singing_ssl_mos", trust_repo=True
    )


def init_basic_pitch():
    print("[Init] Loading BasicPitch...")
    from basic_pitch.inference import predict

    return predict


def init_per():
    print("[Init] Loading PER...")
    from transformers import pipeline
    import jiwer

    asr_pipeline = pipeline(
        "automatic-speech-recognition", model="openai/whisper-large-v3-turbo"
    )
    return {
        "asr_pipeline": asr_pipeline,
        "jiwer": jiwer,
    }


def init_audiobox_aesthetics():
    print("[Init] Loading AudioboxAesthetics...")
    from audiobox_aesthetics.infer import initialize_predictor

    return initialize_predictor()


# ----------- Evaluation -----------
def eval_singmos(audio_path, predictor):
    # SingMOS operates on 16 kHz audio, so resample after loading at 44.1 kHz.
    audio_array, sr = librosa.load(audio_path, sr=44100)
    wav = librosa.resample(audio_array, orig_sr=sr, target_sr=16000)
    wav_tensor = torch.from_numpy(wav).unsqueeze(0)
    length_tensor = torch.tensor([wav_tensor.shape[1]])
    score = predictor(wav_tensor, length_tensor)
    return {"singmos": float(score)}


def eval_melody_metrics(audio_path, pitch_extractor):
    model_output, midi_data, note_events = pitch_extractor(audio_path)
    metrics = {}
    assert (
        len(midi_data.instruments) == 1
    ), f"Detected {len(midi_data.instruments)} instruments for {audio_path}"
    midi_notes = midi_data.instruments[0].notes
    melody = [note.pitch for note in midi_notes]
    if len(melody) == 0:
        print(f"No notes detected in {audio_path}")
        return {}
    # Absolute MIDI pitch differences between consecutive notes, in semitones.
    intervals = [abs(melody[i + 1] - melody[i]) for i in range(len(melody) - 1)]
    metrics["pitch_range"] = max(melody) - min(melody)
    if len(intervals) > 0:
        metrics["interval_mean"] = np.mean(intervals)
        metrics["interval_std"] = np.std(intervals)
        # Fraction of melodic leaps larger than a perfect fourth (5 semitones).
        metrics["interval_large_jump_ratio"] = np.mean([i > 5 for i in intervals])
        metrics["dissonance_rate"] = compute_dissonance_rate(intervals)
    return metrics


def compute_dissonance_rate(intervals, dissonant_intervals=frozenset({1, 2, 6, 10, 11})):
    # Reduce each interval modulo an octave; minor/major seconds, the tritone,
    # and minor/major sevenths count as dissonant.
    dissonant = [i % 12 in dissonant_intervals for i in intervals]
    return np.mean(dissonant) if intervals else np.nan
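
# Worked example: for intervals [1, 7, 12], only the semitone (1) is dissonant;
# the perfect fifth (7) and the octave (12 % 12 == 0) are not, so
# compute_dissonance_rate([1, 7, 12]) == 1 / 3.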


def pypinyin_g2p_phone_without_prosody(text):
    from pypinyin import Style, pinyin
    from pypinyin.style._utils import get_finals, get_initials

    phones = []
    for phone in pinyin(text, style=Style.NORMAL, strict=False):
        initial = get_initials(phone[0], strict=False)
        final = get_finals(phone[0], strict=False)
        if len(initial) != 0:
            # After j/q/x/y, written "u" is actually "ü", transcribed here as "v".
            if initial in ["x", "y", "j", "q"]:
                if final == "un":
                    final = "vn"
                elif final == "uan":
                    final = "van"
                elif final == "u":
                    final = "v"
            if final == "ue":
                final = "ve"
            phones.append(initial)
            phones.append(final)
        else:
            phones.append(final)
    return phones
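
# Illustrative example: pypinyin_g2p_phone_without_prosody("你好") yields
# ["n", "i", "h", "ao"] -- each syllable split into initial + final, with tone
# and prosody marks dropped.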


def eval_per(audio_path, reference_text, evaluator):
    audio_array, sr = librosa.load(audio_path, sr=16000)
    asr_result = evaluator["asr_pipeline"](
        audio_array, generate_kwargs={"language": "mandarin"}
    )["text"]
    # Phoneme error rate: compare pinyin phone sequences via jiwer's WER,
    # treating each phone as one token.
    hyp_pinyin = pypinyin_g2p_phone_without_prosody(asr_result)
    ref_pinyin = pypinyin_g2p_phone_without_prosody(reference_text)
    per = evaluator["jiwer"].wer(" ".join(ref_pinyin), " ".join(hyp_pinyin))
    return {"per": per}


def eval_aesthetic(audio_path, predictor):
    # audiobox-aesthetics returns one dict of axis scores per input file.
    score = predictor.forward([{"path": str(audio_path)}])
    return score


# ----------- Main Function -----------
def load_evaluators(config):
    loaded = {}
    if "singmos" in config:
        loaded["singmos"] = init_singmos()
    if "melody" in config:
        loaded["melody"] = init_basic_pitch()
    if "per" in config:
        loaded["per"] = init_per()
    if "aesthetic" in config:
        loaded["aesthetic"] = init_audiobox_aesthetics()
    return loaded


def run_evaluation(audio_path, evaluators, **kwargs):
    results = {}
    if "singmos" in evaluators:
        results.update(eval_singmos(audio_path, evaluators["singmos"]))
    if "per" in evaluators:
        # PER needs the reference lyrics, passed as the llm_text keyword.
        results.update(eval_per(audio_path, kwargs["llm_text"], evaluators["per"]))
    if "melody" in evaluators:
        results.update(eval_melody_metrics(audio_path, evaluators["melody"]))
    if "aesthetic" in evaluators:
        results.update(eval_aesthetic(audio_path, evaluators["aesthetic"])[0])
    return results
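
# Illustrative programmatic call (hypothetical path and lyrics). Note that the
# CLI entry point below does not pass llm_text, so the "per" evaluator is only
# usable when calling run_evaluation directly:
#
#   evaluators = load_evaluators(["singmos", "per"])
#   run_evaluation("song.wav", evaluators, llm_text="一闪一闪亮晶晶")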


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--wav_path", type=str, required=True)
    parser.add_argument("--results_csv", type=str, required=True)
    parser.add_argument("--evaluators", type=str, default="singmos,melody,aesthetic")
    args = parser.parse_args()

    evaluators = load_evaluators(args.evaluators.split(","))
    results = run_evaluation(args.wav_path, evaluators)
    print(results)

    # Append one CSV row per file: write the header on first use, and refuse to
    # append if an existing file was written with a different set of metrics.
    with open(args.results_csv, "a") as f:
        header = "file," + ",".join(results.keys()) + "\n"
        if f.tell() == 0:
            f.write(header)
        else:
            with open(args.results_csv, "r") as f2:
                file_header = f2.readline()
                if file_header != header:
                    raise ValueError(f"Header mismatch: {file_header} vs {header}")
        line = (
            ",".join([str(args.wav_path)] + [str(v) for v in results.values()]) + "\n"
        )
        f.write(line)
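
# Example CLI invocation (hypothetical file names; the script name is assumed):
#
#   python evaluate.py \
#       --wav_path out/sample_001.wav \
#       --results_csv results.csv \
#       --evaluators singmos,melody,aesthetic
#
# Each run prints the metric dict and appends one row to results_csv.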