import os
import warnings
import torch
import json
import soundfile as sf
import numpy as np
from huggingface_hub import login
from transformers.utils import logging as transformers_logging
from transformers import (
    SpeechT5Processor,
    SpeechT5ForTextToSpeech,
    SpeechT5HifiGan,
    pipeline,
)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from jiwer import wer
import gradio as gr
# -------------------------------------------------------------------------------------------------------------------
# Environment + Logging Setup
HF_Key = os.environ.get("HF_Key")
if HF_Key:  # only log in when a token is configured; avoids an interactive prompt
    login(token=HF_Key)
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
transformers_logging.set_verbosity_error()
warnings.filterwarnings("ignore", category=UserWarning)
# -------------------------------------------------------------------------------------------------------------------
# Utility: cosine-similarity-based WER (character n-gram similarity, reported as an error rate)
def cosine_sim_wer_single(reference, prediction):
    """Return an error rate in [0, 100] from cosine similarity of character 2-3 grams."""
    ref = reference.strip() if reference else ""
    pred = prediction.strip() if prediction else ""
    if not ref or not pred:
        return 100.0
    try:
        vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(2, 3))
        vectors = vectorizer.fit_transform([ref, pred])
        similarity = cosine_similarity(vectors[0:1], vectors[1:2])[0][0] * 100
        error_rate = 100.0 - similarity
        return round(error_rate, 2)
    except Exception:
        return 100.0
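# Quick sanity check (hypothetical inputs, not from the demo): identical strings
# score ~0.0, and an empty hypothesis falls back to 100.0:
#   cosine_sim_wer_single("habari ya asubuhi", "habari ya asubuhi")  # -> ~0.0
#   cosine_sim_wer_single("habari ya asubuhi", "")                   # -> 100.0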
# -------------------------------------------------------------------------------------------------------------------
# Load TTS Models
model_id = 'Jacaranda-Health/Speecht5'
speaker_file_path = 'speaker.json'
with open(speaker_file_path, 'r') as file:
    example = json.load(file)
# SpeechT5 expects an x-vector speaker embedding; unsqueeze adds the batch dimension.
speaker_embeddings = torch.tensor(example).unsqueeze(0)
l_model = SpeechT5ForTextToSpeech.from_pretrained("eolang/speecht5_v4-2")
l_processor = SpeechT5Processor.from_pretrained(model_id)
l_vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
def synthesize(input_text):
    inputs = l_processor(text=input_text, return_tensors="pt")
    speech = l_model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=l_vocoder)
    output_path = 'test_output.wav'
    sf.write(output_path, speech.numpy(), 16000)  # SpeechT5 generates 16 kHz audio
    return output_path
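# Usage sketch (assumes the checkpoints above loaded; the input text is a
# hypothetical example, not from the demo):
#   wav_path = synthesize("Habari ya asubuhi")  # writes and returns test_output.wav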
# -------------------------------------------------------------------------------------------------------------------
# Load STT Pipelines
tuned_pipeline = pipeline(
    "automatic-speech-recognition",
    model="Jacaranda-Health/ASR-STT",
    device=device,
    return_timestamps=True,
    generate_kwargs={
        "no_repeat_ngram_size": 3,
        "repetition_penalty": 1.5,
    }
)
openai_pipeline = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-small",
    device=device,
    return_timestamps=True,
    generate_kwargs={
        "no_repeat_ngram_size": 3,
        "repetition_penalty": 1.5,
    }
)
def tuned_transcribe(filepath):
    transcription = tuned_pipeline(filepath, return_timestamps=True)
    return transcription["text"]
def openai_transcribe(filepath):
    transcription = openai_pipeline(filepath, return_timestamps=True)
    return transcription["text"]
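# Both helpers take a path to an audio file and return plain text, e.g. (hypothetical file):
#   text = tuned_transcribe("sample.wav")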
# -------------------------------------------------------------------------------------------------------------------
# Full Loop: TTS → STT → Eval
def full_loop(ref_text):
    output_wav = synthesize(ref_text)
    tuned_text = tuned_transcribe(output_wav)
    openai_text = openai_transcribe(output_wav)
    tuned_WER = wer(ref_text, tuned_text)
    base_WER = wer(ref_text, openai_text)
    tuned_cosine = cosine_sim_wer_single(ref_text, tuned_text)
    base_cosine = cosine_sim_wer_single(ref_text, openai_text)
    result = f"""**Tuned Model Transcription:**
{tuned_text}
WER: {round(tuned_WER, 2)}
Cosine-based WER: {tuned_cosine}%

**Base Model Transcription:**
{openai_text}
WER: {round(base_WER, 2)}
Cosine-based WER: {base_cosine}%"""
    return output_wav, result
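# full_loop round-trips one reference sentence through TTS and both STT models,
# then reports jiwer WER plus the character-n-gram score for each, e.g. (hypothetical input):
#   wav, report = full_loop("Habari ya asubuhi")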
# -------------------------------------------------------------------------------------------------------------------
# Transcription Tab
def transcribe_tab(audio, model_type):
    if model_type == "Tuned":
        return tuned_transcribe(audio)
    return openai_transcribe(audio)
transcribe_ui = gr.Interface(
    fn=transcribe_tab,
    inputs=[
        gr.Audio(label="Upload Audio", type="filepath"),
        gr.Radio(["Tuned", "Base"], label="Choose Model", value="Tuned")
    ],
    outputs=gr.Textbox(label="Transcription"),
    title="Speech to Text"
)
# -------------------------------------------------------------------------------------------------------------------
# Synthesis Tab
synthesize_ui = gr.Interface(
    fn=full_loop,
    inputs=gr.Textbox(label="Enter Text"),
    outputs=[gr.Audio(label="Synthesized Audio"), gr.Textbox(label="Evaluation")],
    title="Text to Speech + Eval"
)
# -------------------------------------------------------------------------------------------------------------------
# Combine as Tabs
demo = gr.TabbedInterface(
    interface_list=[transcribe_ui, synthesize_ui],
    tab_names=["Transcribe Audio", "Synthesize & Evaluate"]
)
if __name__ == "__main__":
    demo.launch()