Solo448 committed on
Commit
78e2ea0
·
verified ·
1 Parent(s): d3d3b36

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +70 -0
app.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ import os
4
+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
5
+ from datasets import load_dataset, Audio
6
+ import numpy as np
7
+ from speechbrain.inference import EncoderClassifier
8
+
9
# Hub checkpoints used by the app: the base SpeechT5 processor and vocoder,
# the fine-tuned acoustic model, and the SpeechBrain speaker encoder.
_PROCESSOR_ID = "microsoft/speecht5_tts"
_TTS_MODEL_ID = "Solo448/SpeechT5-fine-tune-en"
_VOCODER_ID = "microsoft/speecht5_hifigan"
_SPEAKER_ENCODER_ID = "speechbrain/spkrec-xvect-voxceleb"

# Text front-end, text-to-spectrogram model and vocoder, loaded once at import.
processor = SpeechT5Processor.from_pretrained(_PROCESSOR_ID)
model = SpeechT5ForTextToSpeech.from_pretrained(_TTS_MODEL_ID)
vocoder = SpeechT5HifiGan.from_pretrained(_VOCODER_ID)

# The speaker encoder runs on the GPU when one is visible, otherwise on CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Encoder files are stored under /tmp, mirroring the hub id in the path.
speaker_model = EncoderClassifier.from_hparams(
    source=_SPEAKER_ENCODER_ID,
    run_opts={"device": device},
    savedir=os.path.join("/tmp", _SPEAKER_ENCODER_ID),
)
21
+
22
def create_speaker_embedding(waveform):
    """Encode one utterance into a normalized speaker embedding.

    Uses the module-level ``speaker_model``. Defined before the start-up code
    below so that the name exists when that code calls it.

    Args:
        waveform: Audio samples accepted by ``torch.tensor`` (the caller in
            this file passes the decoded ``array`` of a 16 kHz dataset sample).

    Returns:
        NumPy array with the embedding, L2-normalized along dim 2 of the
        encoder output and with singleton dimensions squeezed out.
    """
    with torch.no_grad():
        speaker_embeddings = speaker_model.encode_batch(torch.tensor(waveform))
        speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
        speaker_embeddings = speaker_embeddings.squeeze().cpu().numpy()
    return speaker_embeddings


# Derive the voice from the first sample of the training dataset.
# NOTE(review): trust_remote_code=True runs code shipped with the dataset
# repository; confirm this is still required and the repository is trusted.
try:
    dataset = load_dataset(
        "Yassmen/TTS_English_Technical_data",
        split="train",
        trust_remote_code=True,
    )
    dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
    sample = dataset[0]
    # The fallback below is a (1, 512) tensor, so give the real embedding the
    # same form: a torch tensor with a leading batch dimension rather than the
    # bare NumPy array returned by create_speaker_embedding.
    speaker_embedding = torch.tensor(
        create_speaker_embedding(sample["audio"]["array"])
    ).unsqueeze(0)
except Exception as e:
    # Deliberate best-effort at start-up: any failure to fetch the sample or
    # to embed it is reported, and the app still launches with a random voice.
    print(f"Error loading dataset: {e}")
    # Use a random speaker embedding as fallback
    speaker_embedding = torch.randn(1, 512)
39
+
40
# Spoken form of each ASCII digit; numbers are read out one digit at a time.
_DIGIT_WORDS = {
    "0": "zero",
    "1": "one",
    "2": "two",
    "3": "three",
    "4": "four",
    "5": "five",
    "6": "six",
    "7": "seven",
    "8": "eight",
    "9": "nine",
}


def _normalize_text(text):
    """Spell out ASCII digits and turn underscores into spaces.

    A space is inserted wherever a spelled-out digit would otherwise touch a
    neighbouring letter or digit, so "12" becomes "one two" instead of
    "onetwo" and "gpt4" becomes "gpt four". All other characters are copied
    through unchanged.

    Args:
        text: Raw input string.

    Returns:
        The rewritten string.
    """
    pieces = []
    prev_is_digit = False  # the last emitted piece was a spelled-out digit
    prev_is_alnum = False  # the last source character was a letter or digit
    for char in text:
        word = _DIGIT_WORDS.get(char)
        if word is not None:
            if prev_is_alnum:
                pieces.append(" ")
            pieces.append(word)
            prev_is_digit = prev_is_alnum = True
        elif char == "_":
            pieces.append(" ")
            prev_is_digit = prev_is_alnum = False
        else:
            is_alnum = char.isalnum()
            if prev_is_digit and is_alnum:
                pieces.append(" ")
            pieces.append(char)
            prev_is_digit = False
            prev_is_alnum = is_alnum
    return "".join(pieces)


def text_to_speech(text):
    """Synthesize speech for ``text`` with the module-level models.

    The text is normalized with ``_normalize_text``, tokenized by
    ``processor``, and rendered by ``model.generate_speech`` using the
    module-level ``speaker_embedding`` and ``vocoder``.

    Args:
        text: Text entered in the UI.

    Returns:
        Tuple ``(16000, samples)`` where ``samples`` is the generated speech
        as a NumPy array; 16000 is the rate the file also uses when
        resampling the reference audio.
    """
    inputs = processor(text=_normalize_text(text), return_tensors="pt")
    speech = model.generate_speech(
        inputs["input_ids"], speaker_embedding, vocoder=vocoder
    )
    return (16000, speech.numpy())
61
+
62
# Settings for the Gradio front-end: text_to_speech is the handler, with the
# "text" input shortcut and the "audio" output shortcut.
interface_options = {
    "fn": text_to_speech,
    "inputs": "text",
    "outputs": "audio",
    "title": "Technical english Text-to-Speech",
    "description": "Enter english text to convert to speech",
}
iface = gr.Interface(**interface_options)

# Start serving the UI as soon as the script runs.
iface.launch()