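"""Gradio app: English speech-to-text.

Accepts recorded or uploaded audio, converts it to WAV, splits it into
30-second chunks, and transcribes each chunk with the Google Web Speech
API via the speech_recognition library.
"""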
import os

import gradio as gr
import speech_recognition as sr
from pydub import AudioSegment

# Constants
MAX_AUDIO_DURATION = 600  # maximum audio length processed, in seconds

# --- Helper: Convert audio to WAV ---
def convert_audio_to_wav(file_path):
    """Convert any pydub/ffmpeg-supported audio file to a WAV file alongside the original."""
    audio = AudioSegment.from_file(file_path)
    wav_path = os.path.splitext(file_path)[0] + ".wav"
    audio.export(wav_path, format="wav")
    return wav_path

# --- Helper: Transcribe audio in chunks ---
def transcribe_audio_in_chunks(audio_path, chunk_duration=30):
    """Split the WAV file into chunk_duration-second pieces and transcribe each with Google Speech Recognition."""
    recognizer = sr.Recognizer()
    audio = AudioSegment.from_wav(audio_path)

    # Cap the input at MAX_AUDIO_DURATION seconds (pydub slices in milliseconds)
    if len(audio) > MAX_AUDIO_DURATION * 1000:
        audio = audio[:MAX_AUDIO_DURATION * 1000]

    full_text = []
    chunk_path = "temp_chunk.wav"
    for i in range(0, len(audio), chunk_duration * 1000):
        chunk = audio[i: i + chunk_duration * 1000]
        chunk.export(chunk_path, format="wav")

        with sr.AudioFile(chunk_path) as source:
            audio_data = recognizer.record(source)
            try:
                text = recognizer.recognize_google(audio_data, language="en-IN")
                full_text.append(text)
            except sr.UnknownValueError:
                full_text.append("[Unrecognized Audio]")
            except sr.RequestError as e:
                full_text.append(f"[Speech Error: {e}]")

    # Remove the temporary chunk file once all chunks have been processed
    if os.path.exists(chunk_path):
        os.remove(chunk_path)

    return " ".join(full_text)

# --- Main Function ---
def transcribe_audio(audio):
    """Gradio callback: take a recorded/uploaded file path and return its transcription."""
    if audio is None:
        return "No audio provided."

    # Convert non-WAV uploads (e.g. mp3, m4a) to WAV before transcription
    if not audio.lower().endswith(".wav"):
        audio = convert_audio_to_wav(audio)

    return transcribe_audio_in_chunks(audio)

# --- Gradio UI ---
iface = gr.Interface(
    fn=transcribe_audio,
    inputs=[
        gr.Audio(sources=["microphone", "upload"], type="filepath", label="Input English Audio")
    ],
    outputs=[
        gr.Textbox(label="Transcribed Text")
    ],
    title="English Speech Recognition",
    description="Upload or record English audio → Transcribe to text.",
    allow_flagging="never"
)

iface.launch(debug=True, share=True)