import os
import warnings
import torch
import json
import soundfile as sf
import numpy as np

from huggingface_hub import login
from transformers.utils import logging as transformers_logging
from transformers import (
    SpeechT5Processor,
    SpeechT5ForTextToSpeech,
    SpeechT5HifiGan,
    pipeline
)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from jiwer import wer
import gradio as gr

# -------------------------------------------------------------------------------------------------------------------
# Environment + Logging Setup

HF_Key = os.environ.get("HF_Key")
if HF_Key:  # skip login when no token is configured (e.g. local runs); login(token=None) would prompt interactively
    login(token=HF_Key)

device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

transformers_logging.set_verbosity_error()
warnings.filterwarnings("ignore", category=UserWarning)

# -------------------------------------------------------------------------------------------------------------------
# Utility: Cosine similarity-based WER

def cosine_sim_wer_single(reference, prediction):
    """Character n-gram cosine distance, scaled to a 0-100 "error rate".

    Not a true WER: it compares char_wb 2-3-gram count vectors and returns
    100 * (1 - cosine similarity), which is more forgiving of minor spelling
    differences than token-level WER.
    """
    ref = reference.strip() if reference else ""
    pred = prediction.strip() if prediction else ""
    if not ref or not pred:
        return 100.0

    try:
        vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(2, 3))
        vectors = vectorizer.fit_transform([ref, pred])
        similarity = cosine_similarity(vectors[0:1], vectors[1:2])[0][0] * 100
        error_rate = 100.0 - similarity
        return round(error_rate, 2)
    except Exception:
        # Vectorization can fail on degenerate input (e.g. strings with no
        # extractable n-grams); treat that as a total mismatch.
        return 100.0
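
# Quick sanity checks for the metric above. These two values are exact by
# construction (identical strings have cosine similarity 1.0; empty input
# hits the guard clause); anything in between depends on the fitted n-grams.
# The example strings are illustrative only:
#
#   cosine_sim_wer_single("habari ya asubuhi", "habari ya asubuhi")  # -> 0.0
#   cosine_sim_wer_single("habari ya asubuhi", "")                   # -> 100.0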

# -------------------------------------------------------------------------------------------------------------------
# Load TTS Models

model_id = 'Jacaranda-Health/Speecht5'
speaker_file_path = 'speaker.json'

# speaker.json holds a pre-computed speaker embedding as a flat list of floats
# (SpeechT5 typically uses a 512-dim x-vector); unsqueeze adds the batch dim.
with open(speaker_file_path, 'r') as file:
    example = json.load(file)
speaker_embeddings = torch.tensor(example).unsqueeze(0)

l_model = SpeechT5ForTextToSpeech.from_pretrained("eolang/speecht5_v4-2")
l_processor = SpeechT5Processor.from_pretrained(model_id)
l_vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

def synthesize(input_text):
    """Generate speech for input_text and write it to a 16 kHz wav file."""
    inputs = l_processor(text=input_text, return_tensors="pt")
    speech = l_model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=l_vocoder)
    output_path = 'test_output.wav'
    # SpeechT5 outputs 16 kHz audio; .cpu() is a no-op here but keeps the
    # write safe if the model is ever moved to GPU.
    sf.write(output_path, speech.cpu().numpy(), 16000)
    return output_path
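
# Example usage (hypothetical input text):
#   wav_path = synthesize("Habari ya asubuhi")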

# -------------------------------------------------------------------------------------------------------------------
# Load STT Pipelines

# no_repeat_ngram_size / repetition_penalty curb the looping that Whisper-style
# models sometimes fall into on noisy or synthetic audio. torch_dtype reuses
# the fp16-on-GPU / fp32-on-CPU choice made above.
tuned_pipeline = pipeline(
    "automatic-speech-recognition",
    model="Jacaranda-Health/ASR-STT",
    device=device,
    torch_dtype=torch_dtype,
    return_timestamps=True,
    generate_kwargs={
        "no_repeat_ngram_size": 3,
        "repetition_penalty": 1.5,
    },
)

openai_pipeline = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-small",
    device=device,
    torch_dtype=torch_dtype,
    return_timestamps=True,
    generate_kwargs={
        "no_repeat_ngram_size": 3,
        "repetition_penalty": 1.5,
    },
)

def tuned_transcribe(filepath):
    transcription = tuned_pipeline(filepath, return_timestamps=True)
    return transcription["text"]

def openai_transcribe(filepath):
    transcription = openai_pipeline(filepath, return_timestamps=True)
    return transcription["text"]
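
# Both helpers take a filepath to an audio file. return_timestamps=True is
# what lets Whisper-family checkpoints transcribe clips longer than 30 s.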

# -------------------------------------------------------------------------------------------------------------------
# Full Loop: TTS → STT → Eval

def full_loop(ref_text):
    """Synthesize ref_text, transcribe it with both ASR models, and score
    each transcript against the reference (jiwer WER + cosine metric)."""
    output_wav = synthesize(ref_text)

    tuned_text = tuned_transcribe(output_wav)
    openai_text = openai_transcribe(output_wav)

    tuned_WER = wer(ref_text, tuned_text)
    base_WER = wer(ref_text, openai_text)

    tuned_cosine = cosine_sim_wer_single(ref_text, tuned_text)
    base_cosine = cosine_sim_wer_single(ref_text, openai_text)

    result = f"""🔊 **Tuned Model Transcription:**  
{tuned_text}  
WER: {round(tuned_WER, 2)}  
Cosine-based WER: {tuned_cosine}%

🟢 **Base Model Transcription:**  
{openai_text}  
WER: {round(base_WER, 2)}  
Cosine-based WER: {base_cosine}%"""

    return output_wav, result

# -------------------------------------------------------------------------------------------------------------------
# Transcription Tab

def transcribe_tab(audio, model_type):
    if model_type == "Tuned":
        return tuned_transcribe(audio)
    return openai_transcribe(audio)

transcribe_ui = gr.Interface(
    fn=transcribe_tab,
    inputs=[
        gr.Audio(label="Upload Audio", type="filepath"),
        gr.Radio(["Tuned", "Base"], label="Choose Model", value="Tuned")
    ],
    outputs=gr.Textbox(label="Transcription"),
    title="Speech to Text"
)

# -------------------------------------------------------------------------------------------------------------------
# Synthesis Tab

synthesize_ui = gr.Interface(
    fn=full_loop,
    inputs=gr.Textbox(label="Enter Text"),
    outputs=[gr.Audio(label="Synthesized Audio"), gr.Textbox(label="Evaluation")],
    title="Text to Speech + Eval"
)

# -------------------------------------------------------------------------------------------------------------------
# Combine as Tabs

demo = gr.TabbedInterface(
    interface_list=[transcribe_ui, synthesize_ui],
    tab_names=["Transcribe Audio", "Synthesize & Evaluate"]
)

if __name__ == "__main__":
    demo.launch()