"""Gradio app that transcribes uploaded or recorded English audio to text."""

import os
import tempfile

import gradio as gr
import speech_recognition as sr
import torch  # noqa: F401  (not referenced in this file; import kept as-is)
from pydub import AudioSegment

# Constants
MAX_AUDIO_DURATION = 600  # in seconds


# --- Helper: Convert audio to wav ---
def convert_audio_to_wav(file_path):
    """Convert the audio file at *file_path* to WAV and return the new path.

    The WAV file is written next to the input, with the input's extension
    replaced by ``.wav`` (or ``.wav`` appended when there is no extension).

    Args:
        file_path: Path to an audio file readable by ``AudioSegment.from_file``.

    Returns:
        Path of the exported WAV file.
    """
    audio = AudioSegment.from_file(file_path)
    # splitext only touches the final extension; a plain str.replace would
    # also rewrite matching text elsewhere in the path (e.g. "/mp3/a.mp3").
    base, _ = os.path.splitext(file_path)
    wav_path = base + ".wav"
    # export() hands back the open output file; close it so the handle
    # does not leak.
    audio.export(wav_path, format="wav").close()
    return wav_path


# --- Helper: Transcribe audio in chunks ---
def transcribe_audio_in_chunks(audio_path, chunk_duration=30):
    """Transcribe a WAV file piece by piece and return the joined text.

    Audio longer than ``MAX_AUDIO_DURATION`` seconds is truncated to that
    length. Each chunk is exported to a temporary WAV file and passed to
    ``Recognizer.recognize_google`` with ``language="en-IN"``.

    Args:
        audio_path: Path to a WAV file.
        chunk_duration: Length of each chunk in seconds; must be positive.

    Returns:
        The chunk transcriptions joined by single spaces. A chunk that cannot
        be understood contributes ``"[Unrecognized Audio]"``; a chunk whose
        request fails contributes ``"[Speech Error: ...]"``.

    Raises:
        ValueError: If *chunk_duration* is not a positive duration.
    """
    chunk_ms = int(chunk_duration * 1000)
    if chunk_ms <= 0:
        raise ValueError("chunk_duration must be a positive number of seconds")

    recognizer = sr.Recognizer()
    audio = AudioSegment.from_wav(audio_path)

    max_ms = MAX_AUDIO_DURATION * 1000
    if len(audio) > max_ms:
        audio = audio[:max_ms]

    full_text = []
    # A private temporary directory per call keeps concurrent requests from
    # overwriting each other's chunk file and guarantees cleanup on exit.
    with tempfile.TemporaryDirectory() as tmp_dir:
        chunk_path = os.path.join(tmp_dir, "chunk.wav")
        for start in range(0, len(audio), chunk_ms):
            chunk = audio[start:start + chunk_ms]
            chunk.export(chunk_path, format="wav").close()

            with sr.AudioFile(chunk_path) as source:
                audio_data = recognizer.record(source)

            try:
                text = recognizer.recognize_google(audio_data, language="en-IN")
            except sr.UnknownValueError:
                full_text.append("[Unrecognized Audio]")
            except sr.RequestError as e:
                full_text.append(f"[Speech Error: {e}]")
            else:
                full_text.append(text)

    return " ".join(full_text)


# --- Main Function ---
def transcribe_audio(audio):
    """Transcribe the audio file at path *audio* and return the text.

    Args:
        audio: Filesystem path to the input audio, or ``None``/empty when no
            audio was supplied.

    Returns:
        The transcription, or ``"[No audio provided]"`` when *audio* is empty.
    """
    if not audio:
        # Without this guard a missing input fails on ``None.endswith``.
        return "[No audio provided]"

    # Compare case-insensitively so ".WAV" files skip the conversion step.
    if not audio.lower().endswith(".wav"):
        audio = convert_audio_to_wav(audio)

    return transcribe_audio_in_chunks(audio)


# --- Gradio UI ---
iface = gr.Interface(
    fn=transcribe_audio,
    inputs=[
        gr.Audio(
            sources=["microphone", "upload"],
            type="filepath",
            label="Input English Audio",
        )
    ],
    outputs=[
        gr.Textbox(label="Transcribed Text")
    ],
    title="English Speech Recognition",
    description="Upload or record English audio → Transcribe to text.",
    allow_flagging="never",
)

# Launch only when run as a script so that importing this module does not
# start a server.
if __name__ == "__main__":
    iface.launch(debug=True, share=True)