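"""Gradio demo: SpeechT5 text-to-speech, two ASR pipelines (a fine-tuned model and base Whisper),
and a TTS -> STT round-trip evaluation reporting WER and a cosine-similarity-based score."""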
import os
import warnings
import torch
import json
import soundfile as sf
import numpy as np
from huggingface_hub import login
from transformers.utils import logging as transformers_logging
from transformers import (
    SpeechT5Processor,
    SpeechT5ForTextToSpeech,
    SpeechT5HifiGan,
    pipeline
)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from jiwer import wer
import gradio as gr
# -------------------------------------------------------------------------------------------------------------------
# Environment + Logging Setup
HF_Key = os.environ.get("HF_Key")
if HF_Key:  # only authenticate when a token is available (e.g. set as a Space secret)
    login(token=HF_Key)
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
transformers_logging.set_verbosity_error()
warnings.filterwarnings("ignore", category=UserWarning)
# -------------------------------------------------------------------------------------------------------------------
# Utility: Cosine similarity-based WER
def cosine_sim_wer_single(reference, prediction):
    """Character n-gram cosine similarity expressed as an error rate: 0 = identical, 100 = no overlap."""
    ref = reference.strip() if reference else ""
    pred = prediction.strip() if prediction else ""
    if not ref or not pred:
        return 100.0
    try:
        vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(2, 3))
        vectors = vectorizer.fit_transform([ref, pred])
        similarity = cosine_similarity(vectors[0:1], vectors[1:2])[0][0] * 100
        error_rate = 100.0 - similarity
        return round(error_rate, 2)
    except Exception:
        return 100.0
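# Example: identical reference and prediction strings should score ~0.0; completely dissimilar strings approach 100.0.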
# -------------------------------------------------------------------------------------------------------------------
# Load TTS Models
model_id = 'Jacaranda-Health/Speecht5'
speaker_file_path = 'speaker.json'
with open(speaker_file_path, 'r') as file:
    example = json.load(file)
speaker_embeddings = torch.tensor(example).unsqueeze(0)
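# speaker.json is expected to hold a single speaker embedding (SpeechT5 uses 512-dim x-vectors); unsqueeze(0) adds the batch dimension.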
l_model = SpeechT5ForTextToSpeech.from_pretrained("eolang/speecht5_v4-2")
l_processor = SpeechT5Processor.from_pretrained(model_id)
l_vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
def synthesize(input_text):
    inputs = l_processor(text=input_text, return_tensors="pt")
    speech = l_model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=l_vocoder)
    output_path = 'test_output.wav'
    sf.write(output_path, speech.numpy(), 16000)  # SpeechT5 outputs 16 kHz audio
    return output_path
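# Each call overwrites test_output.wav and returns the path to the freshly written file.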
# -------------------------------------------------------------------------------------------------------------------
# Load STT Pipelines
tuned_pipeline = pipeline(
    "automatic-speech-recognition",
    model="Jacaranda-Health/ASR-STT",
    device=device,
    torch_dtype=torch_dtype,  # float16 on GPU, float32 on CPU
    return_timestamps=True,
    generate_kwargs={
        "no_repeat_ngram_size": 3,
        "repetition_penalty": 1.5,
    }
)
openai_pipeline = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-small",
    device=device,
    torch_dtype=torch_dtype,  # float16 on GPU, float32 on CPU
    return_timestamps=True,
    generate_kwargs={
        "no_repeat_ngram_size": 3,
        "repetition_penalty": 1.5,
    }
)
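# generate_kwargs are forwarded to model.generate(); the n-gram block and repetition penalty help curb Whisper's tendency to loop on repetitive audio.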
def tuned_transcribe(filepath):
    transcription = tuned_pipeline(filepath, return_timestamps=True)
    return transcription["text"]

def openai_transcribe(filepath):
    transcription = openai_pipeline(filepath, return_timestamps=True)
    return transcription["text"]
# -------------------------------------------------------------------------------------------------------------------
# Full Loop: TTS → STT → Eval
def full_loop(ref_text):
    output_wav = synthesize(ref_text)
    tuned_text = tuned_transcribe(output_wav)
    openai_text = openai_transcribe(output_wav)
    tuned_WER = wer(ref_text, tuned_text)
    base_WER = wer(ref_text, openai_text)
    tuned_cosine = cosine_sim_wer_single(ref_text, tuned_text)
    base_cosine = cosine_sim_wer_single(ref_text, openai_text)
    result = f"""🔊 **Tuned Model Transcription:**
{tuned_text}
WER: {round(tuned_WER, 2)}
Cosine-based WER: {tuned_cosine}%

🟢 **Base Model Transcription:**
{openai_text}
WER: {round(base_WER, 2)}
Cosine-based WER: {base_cosine}%"""
    return output_wav, result
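# Note: jiwer's wer() returns a fraction (0.0 = perfect match), while the cosine-based score is reported on a 0-100 scale.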
# -------------------------------------------------------------------------------------------------------------------
# Transcription Tab
def transcribe_tab(audio, model_type):
    if model_type == "Tuned":
        return tuned_transcribe(audio)
    return openai_transcribe(audio)
transcribe_ui = gr.Interface(
    fn=transcribe_tab,
    inputs=[
        gr.Audio(label="Upload Audio", type="filepath"),
        gr.Radio(["Tuned", "Base"], label="Choose Model", value="Tuned")
    ],
    outputs=gr.Textbox(label="Transcription"),
    title="Speech to Text"
)
# -------------------------------------------------------------------------------------------------------------------
# Synthesis Tab
synthesize_ui = gr.Interface(
    fn=full_loop,
    inputs=gr.Textbox(label="Enter Text"),
    outputs=[gr.Audio(label="Synthesized Audio"), gr.Textbox(label="Evaluation")],
    title="Text to Speech + Eval"
)
# -------------------------------------------------------------------------------------------------------------------
# Combine as Tabs
demo = gr.TabbedInterface(
    interface_list=[transcribe_ui, synthesize_ui],
    tab_names=["Transcribe Audio", "Synthesize & Evaluate"]
)
if __name__ == "__main__":
    demo.launch()