import os
import warnings
import json

import torch
import numpy as np
import soundfile as sf

from huggingface_hub import login
from transformers.utils import logging as transformers_logging
from transformers import (
    SpeechT5Processor,
    SpeechT5ForTextToSpeech,
    SpeechT5HifiGan,
    pipeline,
)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from jiwer import wer
import gradio as gr

# Authenticate with the Hugging Face Hub using the token stored in the HF_Key
# environment variable (skipped if the variable is not set).
HF_Key = os.environ.get("HF_Key")
if HF_Key:
    login(token=HF_Key)

device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Silence verbose library output.
transformers_logging.set_verbosity_error()
warnings.filterwarnings("ignore", category=UserWarning)


def cosine_sim_wer_single(reference, prediction):
    """Score two strings as 100 minus their character n-gram cosine similarity (in %)."""
    ref = reference.strip() if reference else ""
    pred = prediction.strip() if prediction else ""
    if not ref or not pred:
        return 100.0

    try:
        vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(2, 3))
        vectors = vectorizer.fit_transform([ref, pred])
        similarity = cosine_similarity(vectors[0:1], vectors[1:2])[0][0] * 100
        error_rate = 100.0 - similarity
        return round(error_rate, 2)
    except Exception:
        return 100.0
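
# Note: the "cosine-based WER" above is a rough, spelling-tolerant proxy rather
# than a true word error rate. Identical strings score 0.0 and strings sharing
# almost no character n-grams score close to 100.0, e.g. (illustrative values):
#   cosine_sim_wer_single("habari yako", "habari yako")   # -> 0.0
#   cosine_sim_wer_single("habari yako", "xyz qrs")       # -> close to 100.0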


# Text-to-speech components: a SpeechT5 TTS checkpoint, its processor, the
# HiFi-GAN vocoder, and a speaker embedding loaded from speaker.json.
model_id = 'Jacaranda-Health/Speecht5'
speaker_file_path = 'speaker.json'

with open(speaker_file_path, 'r') as file:
    example = json.load(file)
speaker_embeddings = torch.tensor(example).unsqueeze(0)

l_model = SpeechT5ForTextToSpeech.from_pretrained("eolang/speecht5_v4-2")
l_processor = SpeechT5Processor.from_pretrained(model_id)
l_vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
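
# Optional (assumption, not part of the original script): everything above stays
# on the CPU. If a GPU is available, the models and the speaker embedding could
# be moved over, e.g.:
#
#   l_model = l_model.to(device)
#   l_vocoder = l_vocoder.to(device)
#   speaker_embeddings = speaker_embeddings.to(device)
#
# generate_speech would then also need inputs["input_ids"].to(device), and the
# returned tensor a .cpu() call before .numpy() when writing the wav file.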


def synthesize(input_text):
    """Generate speech for the given text and write it to a 16 kHz wav file."""
    inputs = l_processor(text=input_text, return_tensors="pt")
    speech = l_model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=l_vocoder)
    output_path = 'test_output.wav'
    sf.write(output_path, speech.numpy(), 16000)
    return output_path
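

# Two ASR pipelines are compared below: the fine-tuned Jacaranda-Health/ASR-STT
# model and the base openai/whisper-small checkpoint, both with the same
# decoding constraints so the comparison is like-for-like.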
tuned_pipeline = pipeline(
    "automatic-speech-recognition",
    model="Jacaranda-Health/ASR-STT",
    device=device,
    return_timestamps=True,
    generate_kwargs={
        "no_repeat_ngram_size": 3,
        "repetition_penalty": 1.5,
    }
)

openai_pipeline = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-small",
    device=device,
    return_timestamps=True,
    generate_kwargs={
        "no_repeat_ngram_size": 3,
        "repetition_penalty": 1.5,
    }
)


def tuned_transcribe(filepath):
    transcription = tuned_pipeline(filepath, return_timestamps=True)
    return transcription["text"]


def openai_transcribe(filepath):
    transcription = openai_pipeline(filepath, return_timestamps=True)
    return transcription["text"]


def full_loop(ref_text):
    """Synthesize the reference text, transcribe it with both models, and score each transcript."""
    output_wav = synthesize(ref_text)

    tuned_text = tuned_transcribe(output_wav)
    openai_text = openai_transcribe(output_wav)

    tuned_WER = wer(ref_text, tuned_text)
    base_WER = wer(ref_text, openai_text)

    tuned_cosine = cosine_sim_wer_single(ref_text, tuned_text)
    base_cosine = cosine_sim_wer_single(ref_text, openai_text)

    result = f"""**Tuned Model Transcription:**
{tuned_text}
WER: {round(tuned_WER, 2)}
Cosine-based WER: {tuned_cosine}%

**Base Model Transcription:**
{openai_text}
WER: {round(base_WER, 2)}
Cosine-based WER: {base_cosine}%"""

    return output_wav, result
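

# Gradio UI: one tab transcribes uploaded audio with either model; the other
# runs the full synthesize-and-evaluate round trip defined above.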
def transcribe_tab(audio, model_type):
    """Route an uploaded audio file to the selected model."""
    if model_type == "Tuned":
        return tuned_transcribe(audio)
    return openai_transcribe(audio)


transcribe_ui = gr.Interface(
    fn=transcribe_tab,
    inputs=[
        gr.Audio(label="Upload Audio", type="filepath"),
        gr.Radio(["Tuned", "Base"], label="Choose Model", value="Tuned")
    ],
    outputs=gr.Textbox(label="Transcription"),
    title="Speech to Text"
)

synthesize_ui = gr.Interface(
    fn=full_loop,
    inputs=gr.Textbox(label="Enter Text"),
    outputs=[gr.Audio(label="Synthesized Audio"), gr.Textbox(label="Evaluation")],
    title="Text to Speech + Eval"
)

demo = gr.TabbedInterface(
    interface_list=[transcribe_ui, synthesize_ui],
    tab_names=["Transcribe Audio", "Synthesize & Evaluate"]
)

if __name__ == "__main__":
    demo.launch()
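
# Note (assumption, not in the original): for a temporary public share link,
# launch() can also be called as demo.launch(share=True).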