import io

import torch
import torchaudio
import numpy as np
import gradio as gr
from transformers import (
    SeamlessM4TFeatureExtractor,
    SeamlessM4TTokenizer,
    SeamlessM4Tv2ForSpeechToText,
    AutoTokenizer,
    AutoFeatureExtractor,
)
from pydub import AudioSegment
import nltk
from parler_tts import ParlerTTSForConditionalGeneration
from lang_list import LANGUAGE_NAME_TO_CODE, ASR_TARGET_LANGUAGE_NAMES, S2TT_TARGET_LANGUAGE_NAMES

# Sentence tokenizer data, used to split the TTS input into sentences.
nltk.download("punkt_tab")

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DTYPE = torch.bfloat16 if DEVICE != "cpu" else torch.float32
SAMPLE_RATE = 16000  # SeamlessM4T expects 16 kHz audio

# Speech-to-text / speech translation model.
stt_model = SeamlessM4Tv2ForSpeechToText.from_pretrained(
    "ai4bharat/indic-seamless", torch_dtype=DTYPE
).to(DEVICE)
feature_extractor = SeamlessM4TFeatureExtractor.from_pretrained("ai4bharat/indic-seamless")
stt_tokenizer = SeamlessM4TTokenizer.from_pretrained("ai4bharat/indic-seamless")

# Text-to-speech models: the pretrained base checkpoint and a fine-tuned variant.
repo_id = "ai4bharat/indic-parler-tts-pretrained"
finetuned_repo_id = "ai4bharat/indic-parler-tts"

tts_model = ParlerTTSForConditionalGeneration.from_pretrained(
    repo_id,
    attn_implementation="eager",
    torch_dtype=DTYPE,
).to(DEVICE)
finetuned_tts = ParlerTTSForConditionalGeneration.from_pretrained(
    finetuned_repo_id,
    attn_implementation="eager",
    torch_dtype=DTYPE,
).to(DEVICE)

tts_tokenizer = AutoTokenizer.from_pretrained(repo_id)
# The voice/style description is tokenized with the Flan-T5 tokenizer used by Parler's text encoder.
description_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
tts_feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)

VOICES = ["Sunita", "Suresh", "Aditi", "Prakash", "Rohit", "Anjali", "Jaya"]


def numpy_to_mp3(audio_array, sampling_rate):
    """Convert a mono numpy waveform (float or int16) to MP3 bytes."""
    if np.issubdtype(audio_array.dtype, np.floating):
        max_val = np.max(np.abs(audio_array))
        if max_val > 0:
            audio_array = audio_array / max_val  # peak-normalize, guarding against silence
        audio_array = (audio_array * 32767).astype(np.int16)

    segment = AudioSegment(
        audio_array.tobytes(),
        frame_rate=sampling_rate,
        sample_width=audio_array.dtype.itemsize,
        channels=1,
    )
    mp3_io = io.BytesIO()
    segment.export(mp3_io, format="mp3", bitrate="320k")
    return mp3_io.getvalue()


def transcribe_and_translate(audio_path, source_language, target_language):
    """Transcribe the audio and translate it into the target language.

    The Seamless model infers the spoken language from the audio itself, so
    `source_language` is only used by the UI.
    """
    wav, orig_sr = torchaudio.load(audio_path)
    wav = torchaudio.functional.resample(wav, orig_freq=orig_sr, new_freq=SAMPLE_RATE)
    inputs = feature_extractor(wav, sampling_rate=SAMPLE_RATE, return_tensors="pt").to(DEVICE, DTYPE)
    tgt = LANGUAGE_NAME_TO_CODE[target_language]
    gen = stt_model.generate(**inputs, tgt_lang=tgt)[0]
    return stt_tokenizer.decode(gen, skip_special_tokens=True, clean_up_tokenization_spaces=True)


def generate_tts(text, voice, finetuned=False):
    """Synthesize speech for `text`, one sentence at a time, in the selected voice."""
    description = f"{voice} speaks in a neutral tone with clear audio."
    sentences = nltk.sent_tokenize(text)
    all_audio = []
    model = finetuned_tts if finetuned else tts_model

    for sent in sentences:
        desc_inputs = description_tokenizer(description, return_tensors="pt").to(DEVICE)
        prompt_inputs = tts_tokenizer(sent, return_tensors="pt").to(DEVICE)
        gen = model.generate(
            input_ids=desc_inputs.input_ids,
            attention_mask=desc_inputs.attention_mask,
            prompt_input_ids=prompt_inputs.input_ids,
            prompt_attention_mask=prompt_inputs.attention_mask,
            do_sample=True,
            return_dict_in_generate=True,
        )
        if hasattr(gen, "sequences") and hasattr(gen, "audios_length"):
            # Trim the generated waveform to its reported length before appending.
            audio = gen.sequences[0, : gen.audios_length[0]]
            all_audio.append(audio.to(torch.float32).cpu().numpy().flatten())

    combined = np.concatenate(all_audio)
    return numpy_to_mp3(combined, tts_feature_extractor.sampling_rate)


def pipeline(audio_path, source_language, target_language, voice, finetuned):
    """Full speech-to-speech pipeline: transcribe/translate, then synthesize."""
    text = transcribe_and_translate(audio_path, source_language, target_language)
    audio_bytes = generate_tts(text, voice, finetuned)
    return text, audio_bytes


def build_ui():
    with gr.Blocks() as demo:
        gr.Markdown("🎙 AUDIO TRANSLATOR 🎙")
        gr.Markdown(
            "How to use:\n"
            "1. Upload or record your audio clip.\n"
            "2. Select the source and target languages.\n"
            "3. Choose a voice persona.\n"
            "4. (Optional) Toggle the fine-tuned TTS model for better speech.\n"
            '5. Click "Run" for the translated text and speech.'
        )

        with gr.Row():
            with gr.Column():
                audio_in = gr.Audio(label="Input Audio", type="filepath")
                src = gr.Dropdown(ASR_TARGET_LANGUAGE_NAMES, label="Source Language", value="English")
                tgt = gr.Dropdown(S2TT_TARGET_LANGUAGE_NAMES, label="Target Language", value="English")
                voice = gr.Dropdown(VOICES, label="Voice", value=VOICES[0])
                finetune = gr.Checkbox(label="Use Finetuned TTS", value=False)
                run_btn = gr.Button("Run")
            with gr.Column():
                text_out = gr.Textbox(label="Translated Text")
                audio_out = gr.Audio(label="Synthesized Speech", format="mp3")

        run_btn.click(
            fn=pipeline,
            inputs=[audio_in, src, tgt, voice, finetune],
            outputs=[text_out, audio_out],
        )

    return demo


if __name__ == "__main__":
    ui = build_ui()
    ui.launch(share=True)
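
# A minimal sketch of running the pipeline without the Gradio UI. The file name
# and language names below are placeholders; use values present in lang_list
# and a voice from VOICES.
#
#   text, mp3_bytes = pipeline("sample_hindi.wav", "Hindi", "English", "Sunita", finetuned=False)
#   with open("translated_speech.mp3", "wb") as f:
#       f.write(mp3_bytes)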