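# Gradio Space: speech-to-speech translation. Audio is transcribed and
# translated with AI4Bharat's IndicSeamless (SeamlessM4T v2), then re-voiced
# with Indic Parler-TTS.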
import os
import io
import torch
import torchaudio
import numpy as np
import gradio as gr
from transformers import (
    SeamlessM4TFeatureExtractor,
    SeamlessM4TTokenizer,
    SeamlessM4Tv2ForSpeechToText,
    AutoTokenizer,
    AutoFeatureExtractor,
)
from pydub import AudioSegment
import nltk
from parler_tts import ParlerTTSForConditionalGeneration
from lang_list import LANGUAGE_NAME_TO_CODE, ASR_TARGET_LANGUAGE_NAMES, S2TT_TARGET_LANGUAGE_NAMES
# Sentence tokenizer data used by nltk.sent_tokenize below.
nltk.download("punkt_tab")

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DTYPE = torch.bfloat16 if DEVICE != "cpu" else torch.float32
SAMPLE_RATE = 16000  # SeamlessM4T expects 16 kHz audio
# Speech-to-text-translation model (IndicSeamless, SeamlessM4T v2).
stt_model = SeamlessM4Tv2ForSpeechToText.from_pretrained(
    "ai4bharat/indic-seamless",
    torch_dtype=DTYPE,
).to(DEVICE)
feature_extractor = SeamlessM4TFeatureExtractor.from_pretrained(
    "ai4bharat/indic-seamless"
)
stt_tokenizer = SeamlessM4TTokenizer.from_pretrained(
    "ai4bharat/indic-seamless"
)
# Parler-TTS checkpoints: the pretrained base and the Indic fine-tune.
repo_id = "ai4bharat/indic-parler-tts-pretrained"
finetuned_repo_id = "ai4bharat/indic-parler-tts"
tts_model = ParlerTTSForConditionalGeneration.from_pretrained(
    repo_id,
    attn_implementation="eager",
    torch_dtype=DTYPE,
).to(DEVICE)
finetuned_tts = ParlerTTSForConditionalGeneration.from_pretrained(
    finetuned_repo_id,
    attn_implementation="eager",
    torch_dtype=DTYPE,
).to(DEVICE)
tts_tokenizer = AutoTokenizer.from_pretrained(repo_id)
# Parler-TTS uses a separate tokenizer (Flan-T5) for the voice description.
description_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
tts_feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)
VOICES = [
    "Sunita", "Suresh", "Aditi", "Prakash", "Rohit", "Anjali", "Jaya",
]
def numpy_to_mp3(audio_array, sampling_rate):
    # Convert float audio to peak-normalized 16-bit PCM, then encode as MP3.
    if np.issubdtype(audio_array.dtype, np.floating):
        max_val = np.max(np.abs(audio_array))
        if max_val > 0:  # guard against division by zero on silent audio
            audio_array = (audio_array / max_val) * 32767
        audio_array = audio_array.astype(np.int16)
    segment = AudioSegment(
        audio_array.tobytes(),
        frame_rate=sampling_rate,
        sample_width=audio_array.dtype.itemsize,
        channels=1,
    )
    mp3_io = io.BytesIO()
    segment.export(mp3_io, format="mp3", bitrate="320k")
    return mp3_io.getvalue()
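# Standalone sketch (hypothetical values): one second of a 440 Hz tone.
#   tone = np.sin(2 * np.pi * 440 * np.arange(SAMPLE_RATE) / SAMPLE_RATE)
#   mp3_bytes = numpy_to_mp3(tone, SAMPLE_RATE)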
def transcribe_and_translate(audio_path, source_language, target_language):
    # Note: SeamlessM4T infers the spoken language from the audio itself;
    # source_language is only used by the UI.
    wav, orig_sr = torchaudio.load(audio_path)
    wav = torchaudio.functional.resample(wav, orig_freq=orig_sr, new_freq=SAMPLE_RATE)
    if wav.shape[0] > 1:
        wav = wav.mean(dim=0, keepdim=True)  # downmix stereo to mono
    inputs = feature_extractor(wav, sampling_rate=SAMPLE_RATE, return_tensors="pt").to(DEVICE, DTYPE)
    tgt = LANGUAGE_NAME_TO_CODE[target_language]
    gen = stt_model.generate(**inputs, tgt_lang=tgt)[0]
    return stt_tokenizer.decode(gen, skip_special_tokens=True, clean_up_tokenization_spaces=True)
def generate_tts(text, voice, finetuned=False):
    # Build a Parler-TTS voice description and synthesize sentence by
    # sentence to keep each generation short.
    description = f"{voice} speaks in a neutral tone with clear audio."
    sentences = nltk.sent_tokenize(text)
    model = finetuned_tts if finetuned else tts_model
    desc_inputs = description_tokenizer(description, return_tensors="pt").to(DEVICE)
    all_audio = []
    for sent in sentences:
        prompt_inputs = tts_tokenizer(sent, return_tensors="pt").to(DEVICE)
        gen = model.generate(
            input_ids=desc_inputs.input_ids,
            attention_mask=desc_inputs.attention_mask,
            prompt_input_ids=prompt_inputs.input_ids,
            prompt_attention_mask=prompt_inputs.attention_mask,
            do_sample=True,
            return_dict_in_generate=True,
        )
        if hasattr(gen, "sequences") and hasattr(gen, "audios_length"):
            # Trim padding frames using the reported audio length.
            audio = gen.sequences[0, :gen.audios_length[0]]
            all_audio.append(audio.to(torch.float32).cpu().numpy().flatten())
    if not all_audio:
        raise gr.Error("Text-to-speech produced no audio for this input.")
    combined = np.concatenate(all_audio)
    return numpy_to_mp3(combined, tts_feature_extractor.sampling_rate)
def pipeline(audio_path, source_language, target_language, voice, finetuned):
    text = transcribe_and_translate(audio_path, source_language, target_language)
    audio_bytes = generate_tts(text, voice, finetuned)
    return text, audio_bytes
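# Minimal programmatic sketch (hypothetical paths/values, not part of the UI):
#   text, mp3_bytes = pipeline("sample.wav", "English", "Hindi", "Sunita", False)
#   with open("out.mp3", "wb") as f:
#       f.write(mp3_bytes)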
def build_ui():
    with gr.Blocks() as demo:
        gr.Markdown(
            """
            ## 🎙 Audio Translator 🎙

            How to use:
            1. Upload or record an audio clip.
            2. Select the source and target languages.
            3. Choose a voice persona.
            4. (Optional) Toggle the fine-tuned TTS model for more natural speech.
            5. Click "Run" to get the translated text and speech.
            """
        )
        with gr.Row():
            with gr.Column():
                audio_in = gr.Audio(label="Input Audio", type="filepath")
                src = gr.Dropdown(ASR_TARGET_LANGUAGE_NAMES, label="Source Language", value="English")
                tgt = gr.Dropdown(S2TT_TARGET_LANGUAGE_NAMES, label="Target Language", value="English")
                voice = gr.Dropdown(VOICES, label="Voice", value=VOICES[0])
                finetune = gr.Checkbox(label="Use Finetuned TTS", value=False)
                run_btn = gr.Button("Run")
            with gr.Column():
                text_out = gr.Textbox(label="Translated Text")
                audio_out = gr.Audio(label="Synthesized Speech", format="mp3")
        run_btn.click(
            fn=pipeline,
            inputs=[audio_in, src, tgt, voice, finetune],
            outputs=[text_out, audio_out],
        )
    return demo
if __name__ == "__main__":
    ui = build_ui()
    ui.launch(share=True)
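# Assumed local setup (not pinned by this file): torch, torchaudio,
# transformers, gradio, pydub, nltk, and parler_tts
# (pip install git+https://github.com/huggingface/parler-tts.git);
# pydub's MP3 export also needs ffmpeg on PATH.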