# app.py
import os

import gradio as gr
import soundfile as sf
from transformers import pipeline

# Build the speech-recognition pipeline once at import time so that every
# request handled by the UI reuses the same loaded model.
asr = pipeline(
    task="automatic-speech-recognition",
    model="distil-whisper/distil-small.en",
)


def transcribe_speech(audio_filepath):
    """Transcribe the audio file at ``audio_filepath`` and return its text.

    Args:
        audio_filepath: Path to an audio file on disk (e.g. 'my_audio.wav'),
            or ``None`` when the Gradio audio component delivered nothing.

    Returns:
        The string stored under the ``'text'`` key of the ASR pipeline
        result, or an empty string when no audio was supplied.
    """
    if audio_filepath is None:
        gr.Warning('No audio found. Please try again!')
        # gr.Warning only notifies the user; it does not stop execution.
        # Without this early return the code below would call sf.read(None)
        # and fail.
        return ""

    # 1. Load audio from file.
    #    'audio' is a NumPy array of waveform samples and 'sr' is the
    #    sampling rate in Hertz (samples per second).
    audio, sr = sf.read(audio_filepath)

    # soundfile returns a 2-D (frames, channels) array for multi-channel
    # files, while the ASR pipeline only accepts single-channel 1-D input.
    # Average the channels to downmix stereo uploads to mono.
    if audio.ndim > 1:
        audio = audio.mean(axis=1)

    # 2. Pass the raw waveform and its sampling rate to the ASR pipeline.
    #    The pipeline needs both values to interpret the audio correctly.
    result = asr({"array": audio, "sampling_rate": sr})

    # 3. Extract and return the transcribed text.
    return result['text']


# Tab content: record from the microphone and transcribe the recording.
mic_transcribe = gr.Interface(
    fn=transcribe_speech,
    inputs=gr.Audio(
        sources="microphone",
        type="filepath",  # The function receives a temporary file path
        label="🎤 Speak into your microphone",
    ),
    outputs=gr.Textbox(
        label="📝 Transcription Result",
        lines=4,  # Slightly more lines for longer transcriptions
        placeholder="Your transcribed text will appear here...",
    ),
    flagging_mode="never",  # Disable flagging
    description="Record your voice directly using your device's microphone. Get an instant transcription.",
)

# Tab content: upload an existing audio file and transcribe it.
file_transcribe = gr.Interface(
    fn=transcribe_speech,
    inputs=gr.Audio(
        sources="upload",  # Allow input from file upload
        type="filepath",  # The function receives a temporary file path
        label="📁 Upload an Audio File",
    ),
    outputs=gr.Textbox(
        label="📝 Transcription Result",
        lines=4,  # Slightly more lines
        placeholder="Upload an audio file (e.g., .wav, .mp3) to get its transcription.",
    ),
    flagging_mode="never",  # Disable flagging
    description="Upload an audio file for transcription.",
)

# Custom stylesheet handed to gr.Blocks(css=...).
# NOTE(review): Arial is a system font and, as far as I know, is not served by
# Google Fonts, so the @import below probably fails harmlessly - confirm and
# consider removing it.
custom_css = """
/* Import Google Font - Arial (or a very similar sans-serif if Arial isn't universally available on all systems) */
/* Note: Arial is typically a system font, so direct import isn't strictly necessary for it to work,
   but it's good practice for other fonts. */
@import url('https://fonts.googleapis.com/css2?family=Arial:wght@400;700&display=swap');

/* Apply Arial to ALL text elements by default within the Gradio container */
.gradio-container, body, button, input, select, textarea, div, p, span, h1, h2, h3, h4, h5, h6 {
    font-family: 'Arial', sans-serif !important;
}

/* Overall container styling */
.gradio-container {
    max-width: 900px; /* Limit overall width for better readability */
    margin: 30px auto; /* Center the app on the page */
    padding: 30px;
    border-radius: 15px; /* Rounded corners for a softer look */
    box-shadow: 0 8px 25px rgba(0, 0, 0, 0.1); /* Subtle shadow for depth */
    background-color: #ffffff; /* White background for the main content area */
}

/* Titles and Headers */
h1 {
    color: #34495e; /* Darker blue-grey for main title */
    text-align: center;
    font-size: 2.5em; /* Larger main title */
    margin-bottom: 10px;
    font-weight: 700; /* Bold */
}
h3 {
    color: #5d6d7e; /* Slightly lighter blue-grey for subtitle */
    text-align: center;
    font-size: 1.2em;
    margin-top: 0;
    margin-bottom: 25px;
}
p {
    text-align: center;
    color: #7f8c8d; /* Muted grey for descriptions */
    font-size: 0.95em;
    margin-bottom: 20px;
}

/* Tabbed Interface Styling */
.tabs {
    border-radius: 10px;
    overflow: hidden; /* Ensures rounded corners on tabs */
    margin-bottom: 20px;
}
.tab-nav button {
    background-color: #ecf0f1; /* Light grey for inactive tabs */
    color: #34495e; /* Dark text for inactive tabs */
    font-weight: bold;
    padding: 12px 20px;
    border-radius: 8px 8px 0 0;
    margin-right: 5px; /* Small space between tabs */
    transition: all 0.3s ease;
}
.tab-nav button.selected {
    background-color: #4a90e2; /* Vibrant blue for active tab */
    color: white; /* White text for active tab */
    box-shadow: 0 4px 10px rgba(74, 144, 226, 0.3); /* Subtle shadow for active tab */
}

/* Input and Output Component Styling (General) */
.gr-box {
    border-radius: 10px; /* Rounded corners for input/output boxes */
    border: 1px solid #dfe6e9; /* Light border */
    box-shadow: 0 2px 8px rgba(0, 0, 0, 0.05); /* Very subtle shadow */
    padding: 20px;
    background-color: #fcfcfc; /* Slightly off-white background */
}

/* Labels within components (e.g., "Upload Audio File", "Transcription Result") */
.label {
    font-weight: bold;
    color: #2c3e50; /* Dark text for labels */
    font-size: 1.1em;
    margin-bottom: 8px;
}

/* Buttons (Clear, Submit) */
.gr-button {
    background-color: #4a90e2 !important; /* Primary blue for actions */
    color: white !important;
    border: none !important;
    border-radius: 8px !important; /* Rounded buttons */
    padding: 12px 25px !important;
    font-weight: bold !important;
    transition: background-color 0.3s ease, box-shadow 0.3s ease !important;
    margin: 5px; /* Spacing between buttons */
}
.gr-button:hover {
    background-color: #3a7bd2 !important; /* Darker blue on hover */
    box-shadow: 0 4px 15px rgba(74, 144, 226, 0.4) !important;
}

/* Clear button specific */
.gr-button.secondary {
    background-color: #e0e6eb !important; /* Lighter grey for clear */
    color: #34495e !important;
}
.gr-button.secondary:hover {
    background-color: #d1d8df !important;
    box-shadow: none !important;
}

/* Textbox specific */
textarea {
    border-radius: 8px !important;
    border: 1px solid #bdc3c7 !important;
    padding: 10px !important;
    resize: vertical; /* Allow vertical resizing */
}

/* Audio component player */
.gr-audio-player {
    border-radius: 8px;
    background-color: #f0f0f0;
    padding: 10px;
}

/* Footer styling */
hr {
    border: none;
    border-top: 1px solid #e0e0e0;
    margin-top: 30px;
    margin-bottom: 15px;
}
.footer-text {
    font-size: 0.85em;
    color: #a0a0a0;
    text-align: center;
}
"""

# --- 6. Main Gradio App using Blocks for layout and styling ---
# Initialize a Gradio Blocks interface with a theme and custom CSS.
with gr.Blocks(
    theme=gr.themes.Soft(),  # Soft base theme
    css=custom_css,  # Project stylesheet defined earlier in this file
) as demo:
    # Page header: title, author credit and a short description, rendered
    # as Markdown. Fonts are handled by the global CSS, not inline styles.
    gr.Markdown(
        """

🎙️ AI-Powered Speech-to-Text Transcriber 📝

Developed by Muhammad Farhan Aslam.

Convert spoken words into accurate text with ease and precision.

Effortlessly transcribe audio from your microphone or by uploading a file. This application leverages advanced AI to provide clear and reliable transcriptions.

"""
    )

    # Two tabs: file upload first, microphone recording second.
    gr.TabbedInterface(
        [file_transcribe, mic_transcribe],
        ["📁 Transcribe Audio File", "🎤 Transcribe from Microphone"],
    )

    # Footer slot for information or credits (currently just a newline).
    gr.Markdown(
        """
"""
    )

# start_port = int(os.environ.get('PORT1', 7861))
# Start the app; share=True asks Gradio for a public share link as well.
demo.launch(share=True)