import os
import warnings
import json

import torch
import numpy as np
import soundfile as sf

from huggingface_hub import login
from transformers.utils import logging as transformers_logging
from transformers import (
    SpeechT5Processor,
    SpeechT5ForTextToSpeech,
    SpeechT5HifiGan,
    pipeline,
)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from jiwer import wer
import gradio as gr

# Authenticate with the Hugging Face Hub using the token stored in the HF_Key
# environment variable (skipped if the variable is not set).
HF_Key = os.environ.get("HF_Key")
if HF_Key:
    login(token=HF_Key)

device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Silence verbose library output.
transformers_logging.set_verbosity_error()
warnings.filterwarnings("ignore", category=UserWarning)


def cosine_sim_wer_single(reference, prediction):
    """Score two strings as 100 minus their character n-gram cosine similarity (in %)."""
    ref = reference.strip() if reference else ""
    pred = prediction.strip() if prediction else ""
    if not ref or not pred:
        return 100.0

    try:
        vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(2, 3))
        vectors = vectorizer.fit_transform([ref, pred])
        similarity = cosine_similarity(vectors[0:1], vectors[1:2])[0][0] * 100
        error_rate = 100.0 - similarity
        return round(error_rate, 2)
    except Exception:
        return 100.0
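
# Note: the "cosine-based WER" above is a rough, spelling-tolerant proxy rather
# than a true word error rate. Identical strings score 0.0 and strings sharing
# almost no character n-grams score close to 100.0, e.g. (illustrative values):
#   cosine_sim_wer_single("habari yako", "habari yako")   # -> 0.0
#   cosine_sim_wer_single("habari yako", "xyz qrs")       # -> close to 100.0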


# Text-to-speech components: a SpeechT5 TTS checkpoint, its processor, the
# HiFi-GAN vocoder, and a speaker embedding loaded from speaker.json.
model_id = 'Jacaranda-Health/Speecht5'
speaker_file_path = 'speaker.json'

with open(speaker_file_path, 'r') as file:
    example = json.load(file)
speaker_embeddings = torch.tensor(example).unsqueeze(0)

l_model = SpeechT5ForTextToSpeech.from_pretrained("eolang/speecht5_v4-2")
l_processor = SpeechT5Processor.from_pretrained(model_id)
l_vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
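
# Optional (assumption, not part of the original script): everything above stays
# on the CPU. If a GPU is available, the models and the speaker embedding could
# be moved over, e.g.:
#
#   l_model = l_model.to(device)
#   l_vocoder = l_vocoder.to(device)
#   speaker_embeddings = speaker_embeddings.to(device)
#
# generate_speech would then also need inputs["input_ids"].to(device), and the
# returned tensor a .cpu() call before .numpy() when writing the wav file.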


def synthesize(input_text):
    """Generate speech for the given text and write it to a 16 kHz wav file."""
    inputs = l_processor(text=input_text, return_tensors="pt")
    speech = l_model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=l_vocoder)
    output_path = 'test_output.wav'
    sf.write(output_path, speech.numpy(), 16000)
    return output_path
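

# Two ASR pipelines are compared below: the fine-tuned Jacaranda-Health/ASR-STT
# model and the base openai/whisper-small checkpoint, both with the same
# decoding constraints so the comparison is like-for-like.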
tuned_pipeline = pipeline(
    "automatic-speech-recognition",
    model="Jacaranda-Health/ASR-STT",
    device=device,
    return_timestamps=True,
    generate_kwargs={
        "no_repeat_ngram_size": 3,
        "repetition_penalty": 1.5,
    }
)

openai_pipeline = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-small",
    device=device,
    return_timestamps=True,
    generate_kwargs={
        "no_repeat_ngram_size": 3,
        "repetition_penalty": 1.5,
    }
)


def tuned_transcribe(filepath):
    transcription = tuned_pipeline(filepath, return_timestamps=True)
    return transcription["text"]


def openai_transcribe(filepath):
    transcription = openai_pipeline(filepath, return_timestamps=True)
    return transcription["text"]


def full_loop(ref_text):
    """Synthesize the reference text, transcribe it with both models, and score each transcript."""
    output_wav = synthesize(ref_text)

    tuned_text = tuned_transcribe(output_wav)
    openai_text = openai_transcribe(output_wav)

    tuned_WER = wer(ref_text, tuned_text)
    base_WER = wer(ref_text, openai_text)

    tuned_cosine = cosine_sim_wer_single(ref_text, tuned_text)
    base_cosine = cosine_sim_wer_single(ref_text, openai_text)

    result = f"""**Tuned Model Transcription:**
{tuned_text}
WER: {round(tuned_WER, 2)}
Cosine-based WER: {tuned_cosine}%

**Base Model Transcription:**
{openai_text}
WER: {round(base_WER, 2)}
Cosine-based WER: {base_cosine}%"""

    return output_wav, result
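

# Gradio UI: one tab transcribes uploaded audio with either model; the other
# runs the full synthesize-and-evaluate round trip defined above.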
def transcribe_tab(audio, model_type):
    """Route an uploaded audio file to the selected model."""
    if model_type == "Tuned":
        return tuned_transcribe(audio)
    return openai_transcribe(audio)


transcribe_ui = gr.Interface(
    fn=transcribe_tab,
    inputs=[
        gr.Audio(label="Upload Audio", type="filepath"),
        gr.Radio(["Tuned", "Base"], label="Choose Model", value="Tuned")
    ],
    outputs=gr.Textbox(label="Transcription"),
    title="Speech to Text"
)

synthesize_ui = gr.Interface(
    fn=full_loop,
    inputs=gr.Textbox(label="Enter Text"),
    outputs=[gr.Audio(label="Synthesized Audio"), gr.Textbox(label="Evaluation")],
    title="Text to Speech + Eval"
)

demo = gr.TabbedInterface(
    interface_list=[transcribe_ui, synthesize_ui],
    tab_names=["Transcribe Audio", "Synthesize & Evaluate"]
)

if __name__ == "__main__":
    demo.launch()
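
# Note (assumption, not in the original): for a temporary public share link,
# launch() can also be called as demo.launch(share=True).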