import tempfile

import gradio as gr
from gtts import gTTS
from transformers import pipeline

# Load ASR & translation models.
# device=0 targets the first CUDA GPU; pass device=-1 to run on CPU instead.
asr_model = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-medium",
    device=0,
)
# The en-indic family only translates English -> Indic; the indic-indic
# checkpoint handles Indic <-> Indic pairs such as the Hindi -> Kannada
# default below. IndicTrans2 ships custom modelling code, hence
# trust_remote_code=True. Note: the AI4Bharat model cards drive IndicTrans2
# through IndicTransToolkit rather than a bare pipeline (see the sketch at
# the end of this file); the pipeline call here is a simplification.
translator = pipeline(
    "translation",
    model="ai4bharat/indictrans2-indic-indic-dist-320M",
    trust_remote_code=True,
    device=0,
)

# ISO 639-1 codes used by gTTS for speech synthesis.
lang_code_map = {
    "Hindi": "hi", "Kannada": "kn", "Tamil": "ta", "Telugu": "te",
    "Bengali": "bn", "Marathi": "mr", "Gujarati": "gu", "Punjabi": "pa",
    "Malayalam": "ml", "Assamese": "as", "Odia": "or", "Urdu": "ur",
    "Sanskrit": "sa", "Nepali": "ne", "Manipuri": "mni", "English": "en",
}

# FLORES-style tags (language_Script) expected by IndicTrans2.
indictrans_code_map = {
    "Hindi": "hin_Deva", "Kannada": "kan_Knda", "Tamil": "tam_Taml",
    "Telugu": "tel_Telu", "Bengali": "ben_Beng", "Marathi": "mar_Deva",
    "Gujarati": "guj_Gujr", "Punjabi": "pan_Guru", "Malayalam": "mal_Mlym",
    "Assamese": "asm_Beng", "Odia": "ory_Orya", "Urdu": "urd_Arab",
    "Sanskrit": "san_Deva", "Nepali": "npi_Deva", "Manipuri": "mni_Beng",
    "English": "eng_Latn",
}

languages = list(lang_code_map.keys())


def process_speech(audio, speaker_lang, listener_lang):
    if audio is None:
        return "", "", None

    # Step 1: Speech -> Text
    transcript = asr_model(audio)["text"]

    # Step 2: Translate. IndicTrans2 expects FLORES-style tags, not
    # lowercased language names, so look the codes up in the map.
    try:
        result = translator(
            transcript,
            src_lang=indictrans_code_map[speaker_lang],
            tgt_lang=indictrans_code_map[listener_lang],
        )
        translated_text = result[0]["translation_text"]
    except Exception:
        translated_text = "[Translation failed]"

    # Step 3: Text -> Speech. gTTS only covers a subset of these languages
    # (e.g. no Assamese, Odia, Sanskrit, or Manipuri voice), so return no
    # audio rather than crashing when the voice is unavailable.
    tts_lang = lang_code_map.get(listener_lang, "en")
    try:
        tts = gTTS(translated_text, lang=tts_lang)
        tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
        tmp.close()  # close the handle first so gTTS can write to it on any OS
        tts.save(tmp.name)
        audio_path = tmp.name
    except Exception:
        audio_path = None

    return transcript, translated_text, audio_path


# Gradio UI
with gr.Blocks(title="Indian Language Speech Translator") as demo:
    gr.Markdown("## 🗣️ Real-Time Indian Speech Translator")
    gr.Markdown(
        "Choose two languages and take turns speaking. "
        "The app will translate and speak for you in real time."
    )

    with gr.Row():
        speaker_lang = gr.Dropdown(choices=languages, value="Hindi", label="🎙️ Speaker's Language")
        listener_lang = gr.Dropdown(choices=languages, value="Kannada", label="👂 Listener's Language")

    # Gradio 4.x takes a `sources` list; 3.x releases used source="microphone".
    audio_input = gr.Audio(sources=["microphone"], type="filepath", label="🎤 Speak here")

    with gr.Row():
        transcript_box = gr.Textbox(label="📝 Recognized Speech")
        translation_box = gr.Textbox(label="🌐 Translated Text")

    audio_output = gr.Audio(label="🔊 Translated Speech")

    btn = gr.Button("Translate & Speak")
    btn.click(
        fn=process_speech,
        inputs=[audio_input, speaker_lang, listener_lang],
        outputs=[transcript_box, translation_box, audio_output],
    )

if __name__ == "__main__":
    demo.launch()
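
# ---------------------------------------------------------------------------
# Alternative translation path. The `pipeline("translation", ...)` call above
# is a simplification; the AI4Bharat model cards instead preprocess text with
# IndicTransToolkit's IndicProcessor so the FLORES-style tags are attached
# exactly as the model was trained. A minimal sketch of that path, assuming
# `pip install IndicTransToolkit` and the same indic-indic checkpoint
# (uncomment to try it; left commented so nothing runs after demo.launch()):
#
# import torch
# from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
# from IndicTransToolkit import IndicProcessor
#
# _name = "ai4bharat/indictrans2-indic-indic-dist-320M"
# _tok = AutoTokenizer.from_pretrained(_name, trust_remote_code=True)
# _model = AutoModelForSeq2SeqLM.from_pretrained(_name, trust_remote_code=True)
# _ip = IndicProcessor(inference=True)
#
# def translate(text: str, src: str = "hin_Deva", tgt: str = "kan_Knda") -> str:
#     # Attach IndicTrans2's language tags and normalize the input text.
#     batch = _ip.preprocess_batch([text], src_lang=src, tgt_lang=tgt)
#     inputs = _tok(batch, truncation=True, padding="longest", return_tensors="pt")
#     with torch.no_grad():
#         tokens = _model.generate(**inputs, max_length=256, num_beams=5)
#     decoded = _tok.batch_decode(tokens, skip_special_tokens=True)
#     # Undo the preprocessing (detokenization, script fixes) on the output.
#     return _ip.postprocess_batch(decoded, lang=tgt)[0]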