# MuhammadFarhanAslam's picture
# Upload folder using huggingface_hub
# 6c5a3b0 verified
# app.py
import gradio as gr
import soundfile as sf
import os
from transformers import pipeline
# Module-level ASR pipeline: the model is downloaded/loaded once at startup
# so every transcription request reuses the same instance.
# distil-whisper/distil-small.en is an English-only distilled Whisper
# checkpoint (small, fast, CPU-friendly).
asr = pipeline(task="automatic-speech-recognition",
model="distil-whisper/distil-small.en")
def transcribe_speech(audio_filepath):
    """Transcribe one audio file to English text with the module-level ASR pipeline.

    Args:
        audio_filepath: Path to an audio file on disk. Gradio passes a
            temporary file path here (``type="filepath"``); it is ``None``
            when the user submits without recording/uploading anything.

    Returns:
        str: The transcribed text, or an empty string when no audio was given.
    """
    if audio_filepath is None:
        # Non-fatal UI warning, then bail out early: without this return,
        # sf.read(None) below would raise a TypeError.
        gr.Warning('No audio found. Please try again!')
        return ""

    # soundfile returns the waveform as a NumPy array plus its sampling
    # rate in Hz; the pipeline needs both to interpret the audio correctly.
    audio, sr = sf.read(audio_filepath)

    # NOTE(review): Whisper-family models expect mono input; sf.read yields
    # a (frames, channels) array for multi-channel files, so average the
    # channels down to one. Confirm this matches the deployed pipeline's
    # expectations.
    if getattr(audio, "ndim", 1) > 1:
        audio = audio.mean(axis=1)

    # The HF ASR pipeline accepts a dict with "array" + "sampling_rate"
    # and returns a dict whose "text" key holds the transcription.
    result = asr({"array": audio, "sampling_rate": sr})
    return result['text']
# Tab 1: live microphone recording -> text.
# Components are built first and named, then wired into the Interface.
_mic_input = gr.Audio(
    sources="microphone",
    type="filepath",  # hand the function a temp file path, not raw samples
    label="🎀 Speak into your microphone",
)
_mic_output = gr.Textbox(
    label="πŸ“ Transcription Result",
    lines=4,  # room for longer transcriptions
    placeholder="Your transcribed text will appear here...",
)
mic_transcribe = gr.Interface(
    fn=transcribe_speech,
    inputs=_mic_input,
    outputs=_mic_output,
    flagging_mode="never",  # no flagging UI
    description="Record your voice directly using your device's microphone. Get an instant transcription.",
)
# Tab 2: uploaded audio file -> text.
# Same function as the microphone tab; only the audio source differs.
_file_input = gr.Audio(
    sources="upload",  # accept file uploads instead of mic capture
    type="filepath",   # function receives a temporary file path
    label="πŸ“ Upload an Audio File",
)
_file_output = gr.Textbox(
    label="πŸ“ Transcription Result",
    lines=4,
    placeholder="Upload an audio file (e.g., .wav, .mp3) to get its transcription.",
)
file_transcribe = gr.Interface(
    fn=transcribe_speech,
    inputs=_file_input,
    outputs=_file_output,
    flagging_mode="never",  # no flagging UI
    description="Upload an audio file for transcription.",
)
# Custom CSS injected into the Gradio page (passed to gr.Blocks(css=...)).
# The string content is CSS, not Python; it styles the container, tabs,
# buttons, text boxes, audio player and footer defined below.
custom_css = """
/* Import Google Font - Arial (or a very similar sans-serif if Arial isn't universally available on all systems) */
/* Note: Arial is typically a system font, so direct import isn't strictly necessary for it to work,
but it's good practice for other fonts. */
@import url('https://fonts.googleapis.com/css2?family=Arial:wght@400;700&display=swap');
/* Apply Arial to ALL text elements by default within the Gradio container */
.gradio-container, body, button, input, select, textarea, div, p, span, h1, h2, h3, h4, h5, h6 {
font-family: 'Arial', sans-serif !important;
}
/* Overall container styling */
.gradio-container {
max-width: 900px; /* Limit overall width for better readability */
margin: 30px auto; /* Center the app on the page */
padding: 30px;
border-radius: 15px; /* Rounded corners for a softer look */
box-shadow: 0 8px 25px rgba(0, 0, 0, 0.1); /* Subtle shadow for depth */
background-color: #ffffff; /* White background for the main content area */
}
/* Titles and Headers */
h1 {
color: #34495e; /* Darker blue-grey for main title */
text-align: center;
font-size: 2.5em; /* Larger main title */
margin-bottom: 10px;
font-weight: 700; /* Bold */
}
h3 {
color: #5d6d7e; /* Slightly lighter blue-grey for subtitle */
text-align: center;
font-size: 1.2em;
margin-top: 0;
margin-bottom: 25px;
}
p {
text-align: center;
color: #7f8c8d; /* Muted grey for descriptions */
font-size: 0.95em;
margin-bottom: 20px;
}
/* Tabbed Interface Styling */
.tabs {
border-radius: 10px;
overflow: hidden; /* Ensures rounded corners on tabs */
margin-bottom: 20px;
}
.tab-nav button {
background-color: #ecf0f1; /* Light grey for inactive tabs */
color: #34495e; /* Dark text for inactive tabs */
font-weight: bold;
padding: 12px 20px;
border-radius: 8px 8px 0 0;
margin-right: 5px; /* Small space between tabs */
transition: all 0.3s ease;
}
.tab-nav button.selected {
background-color: #4a90e2; /* Vibrant blue for active tab */
color: white; /* White text for active tab */
box-shadow: 0 4px 10px rgba(74, 144, 226, 0.3); /* Subtle shadow for active tab */
}
/* Input and Output Component Styling (General) */
.gr-box {
border-radius: 10px; /* Rounded corners for input/output boxes */
border: 1px solid #dfe6e9; /* Light border */
box-shadow: 0 2px 8px rgba(0, 0, 0, 0.05); /* Very subtle shadow */
padding: 20px;
background-color: #fcfcfc; /* Slightly off-white background */
}
/* Labels within components (e.g., "Upload Audio File", "Transcription Result") */
.label {
font-weight: bold;
color: #2c3e50; /* Dark text for labels */
font-size: 1.1em;
margin-bottom: 8px;
}
/* Buttons (Clear, Submit) */
.gr-button {
background-color: #4a90e2 !important; /* Primary blue for actions */
color: white !important;
border: none !important;
border-radius: 8px !important; /* Rounded buttons */
padding: 12px 25px !important;
font-weight: bold !important;
transition: background-color 0.3s ease, box-shadow 0.3s ease !important;
margin: 5px; /* Spacing between buttons */
}
.gr-button:hover {
background-color: #3a7bd2 !important; /* Darker blue on hover */
box-shadow: 0 4px 15px rgba(74, 144, 226, 0.4) !important;
}
/* Clear button specific */
.gr-button.secondary {
background-color: #e0e6eb !important; /* Lighter grey for clear */
color: #34495e !important;
}
.gr-button.secondary:hover {
background-color: #d1d8df !important;
box-shadow: none !important;
}
/* Textbox specific */
textarea {
border-radius: 8px !important;
border: 1px solid #bdc3c7 !important;
padding: 10px !important;
resize: vertical; /* Allow vertical resizing */
}
/* Audio component player */
.gr-audio-player {
border-radius: 8px;
background-color: #f0f0f0;
padding: 10px;
}
/* Footer styling */
hr {
border: none;
border-top: 1px solid #e0e0e0;
margin-top: 30px;
margin-bottom: 15px;
}
.footer-text {
font-size: 0.85em;
color: #a0a0a0;
text-align: center;
}
"""
# --- Main Gradio app: Blocks layout with theme + custom CSS ---
# `with gr.Blocks(...) as demo:` builds and enters the Blocks context in
# one statement; everything created inside becomes part of the page.
with gr.Blocks(theme=gr.themes.Soft(), css=custom_css) as demo:
    # Header: centered title, credit line and tagline (HTML via Markdown).
    gr.Markdown(
        """
<center>
<h1 style="color: #4A90E2;">
πŸŽ™οΈ AI-Powered Speech-to-Text Transcriber πŸ“
</h1>
<h3 style="color: #6C7A89;">
Developed by Muhammad Farhan Aslam.
</h3>
<h3 style="color: #6C7A89;">
Convert spoken words into accurate text with ease and precision.
</h3>
<p style="color: #8C9CA7; font-size: 1.05em;">
Effortlessly transcribe audio from your microphone or by uploading a file.
This application leverages advanced AI to provide clear and reliable transcriptions.
</p>
</center>
"""
    )
    # Two tabs (file upload / microphone) sharing the same transcription fn.
    gr.TabbedInterface(
        [file_transcribe, mic_transcribe],
        ["πŸ“ Transcribe Audio File", "🎀 Transcribe from Microphone"],
    )
    # Footer credit.
    gr.Markdown(
        """
<hr>
<p class="footer-text">
Built with ❀️ and Gradio on Hugging Face Transformers.
</p>
"""
    )

# share=True additionally prints a public *.gradio.live URL.
demo.launch(share=True)