import torch
import torchaudio
import gradio as gr
import torch.nn.functional as F
from transformers import WavLMForXVector, Wav2Vec2FeatureExtractor
# Load the model and feature extractor
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = WavLMForXVector.from_pretrained("microsoft/wavlm-base-sv").to(device)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("microsoft/wavlm-base-sv")
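# Note: microsoft/wavlm-base-sv is a WavLM checkpoint fine-tuned for speaker verification;
# it produces fixed-size x-vector speaker embeddings and expects 16 kHz mono input.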
# Audio preprocessing: load the file, resample to 16 kHz, and return a 1-D mono waveform
def preprocess(audio):
    if audio is None:
        return None
    waveform, sr = torchaudio.load(audio)
    if sr != 16000:
        waveform = torchaudio.functional.resample(waveform, sr, 16000)
    # Downmix multi-channel recordings to mono so the feature extractor gets a single waveform
    if waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    return waveform.squeeze(0)
# Compute an L2-normalized x-vector speaker embedding for a waveform
def get_embedding(waveform):
    inputs = feature_extractor(waveform.numpy(), sampling_rate=16000, return_tensors="pt", padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        embedding = model(**inputs).embeddings
    return F.normalize(embedding, p=2, dim=1)
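# Usage sketch (hypothetical file paths), comparing two clips directly without the UI:
#   emb_a = get_embedding(preprocess("native.wav"))
#   emb_b = get_embedding(preprocess("attempt.wav"))
#   print(F.cosine_similarity(emb_a, emb_b).item())  # cosine similarity in [-1, 1]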
# Main handler: score how similar the user's imitation is to the native speaker's recording
def compare_audio(native_audio, user_audio):
    native_wav = preprocess(native_audio)
    user_wav = preprocess(user_audio)
    if native_wav is None or user_wav is None:
        return "Please upload both recordings."
    emb1 = get_embedding(native_wav)
    emb2 = get_embedding(user_wav)
    similarity = F.cosine_similarity(emb1, emb2).item()
    # Cosine similarity lies in [-1, 1]; clamp at 0 before mapping to a 0-100 score
    score = round(max(similarity, 0.0) * 100, 2)
    # Feedback message
    if score > 90:
        feedback = "Very close! You imitated it well 👏"
    elif score > 75:
        feedback = "Not bad, keep it up 👍"
    elif score > 60:
        feedback = "Somewhat similar, but there is room for improvement 🙂"
    else:
        feedback = "Low similarity, please try again 😅"
    return f"Similarity score: {score}/100\n{feedback}"
# Gradio UI
title = "🎤 Speech Imitation Scorer"
description = "Upload a native speaker's recording and your own imitation; the app scores how similar your pronunciation is."
demo = gr.Interface(
    fn=compare_audio,
    inputs=[
        gr.Audio(type="filepath", label="📢 Native speaker recording"),
        gr.Audio(type="filepath", label="🗣️ Your imitation recording"),
    ],
    outputs="text",
    title=title,
    description=description,
)
if __name__ == "__main__":
    demo.launch()