# MuhammadFarhanAslam's picture
# Upload folder using huggingface_hub
# 6c5a3b0 verified
# app.py
import gradio as gr
import soundfile as sf
import os
from transformers import pipeline
# Module-level ASR pipeline: the model is downloaded/loaded once at startup
# so every transcription request reuses the same instance.
# distil-whisper/distil-small.en is an English-only distilled Whisper
# checkpoint (small, fast, CPU-friendly).
asr = pipeline(task="automatic-speech-recognition",
model="distil-whisper/distil-small.en")
def transcribe_speech(audio_filepath):
    """Transcribe one audio file to English text with the module-level ASR pipeline.

    Args:
        audio_filepath: Path to an audio file on disk. Gradio passes a
            temporary file path here (``type="filepath"``); it is ``None``
            when the user submits without recording/uploading anything.

    Returns:
        str: The transcribed text, or an empty string when no audio was given.
    """
    if audio_filepath is None:
        # Non-fatal UI warning, then bail out early: without this return,
        # sf.read(None) below would raise a TypeError.
        gr.Warning('No audio found. Please try again!')
        return ""

    # soundfile returns the waveform as a NumPy array plus its sampling
    # rate in Hz; the pipeline needs both to interpret the audio correctly.
    audio, sr = sf.read(audio_filepath)

    # NOTE(review): Whisper-family models expect mono input; sf.read yields
    # a (frames, channels) array for multi-channel files, so average the
    # channels down to one. Confirm this matches the deployed pipeline's
    # expectations.
    if getattr(audio, "ndim", 1) > 1:
        audio = audio.mean(axis=1)

    # The HF ASR pipeline accepts a dict with "array" + "sampling_rate"
    # and returns a dict whose "text" key holds the transcription.
    result = asr({"array": audio, "sampling_rate": sr})
    return result['text']
# Tab 1: live microphone recording -> text.
# Components are built first and named, then wired into the Interface.
_mic_input = gr.Audio(
    sources="microphone",
    type="filepath",  # hand the function a temp file path, not raw samples
    label="🎀 Speak into your microphone",
)
_mic_output = gr.Textbox(
    label="πŸ“ Transcription Result",
    lines=4,  # room for longer transcriptions
    placeholder="Your transcribed text will appear here...",
)
mic_transcribe = gr.Interface(
    fn=transcribe_speech,
    inputs=_mic_input,
    outputs=_mic_output,
    flagging_mode="never",  # no flagging UI
    description="Record your voice directly using your device's microphone. Get an instant transcription.",
)
# Tab 2: uploaded audio file -> text.
# Same function as the microphone tab; only the audio source differs.
_file_input = gr.Audio(
    sources="upload",  # accept file uploads instead of mic capture
    type="filepath",   # function receives a temporary file path
    label="πŸ“ Upload an Audio File",
)
_file_output = gr.Textbox(
    label="πŸ“ Transcription Result",
    lines=4,
    placeholder="Upload an audio file (e.g., .wav, .mp3) to get its transcription.",
)
file_transcribe = gr.Interface(
    fn=transcribe_speech,
    inputs=_file_input,
    outputs=_file_output,
    flagging_mode="never",  # no flagging UI
    description="Upload an audio file for transcription.",
)
# Custom CSS injected into the Gradio page (passed to gr.Blocks(css=...)).
# The string content is CSS, not Python; it styles the container, tabs,
# buttons, text boxes, audio player and footer defined below.
custom_css = """
/* Import Google Font - Arial (or a very similar sans-serif if Arial isn't universally available on all systems) */
/* Note: Arial is typically a system font, so direct import isn't strictly necessary for it to work,
but it's good practice for other fonts. */
@import url('https://fonts.googleapis.com/css2?family=Arial:wght@400;700&display=swap');
/* Apply Arial to ALL text elements by default within the Gradio container */
.gradio-container, body, button, input, select, textarea, div, p, span, h1, h2, h3, h4, h5, h6 {
font-family: 'Arial', sans-serif !important;
}
/* Overall container styling */
.gradio-container {
max-width: 900px; /* Limit overall width for better readability */
margin: 30px auto; /* Center the app on the page */
padding: 30px;
border-radius: 15px; /* Rounded corners for a softer look */
box-shadow: 0 8px 25px rgba(0, 0, 0, 0.1); /* Subtle shadow for depth */
background-color: #ffffff; /* White background for the main content area */
}
/* Titles and Headers */
h1 {
color: #34495e; /* Darker blue-grey for main title */
text-align: center;
font-size: 2.5em; /* Larger main title */
margin-bottom: 10px;
font-weight: 700; /* Bold */
}
h3 {
color: #5d6d7e; /* Slightly lighter blue-grey for subtitle */
text-align: center;
font-size: 1.2em;
margin-top: 0;
margin-bottom: 25px;
}
p {
text-align: center;
color: #7f8c8d; /* Muted grey for descriptions */
font-size: 0.95em;
margin-bottom: 20px;
}
/* Tabbed Interface Styling */
.tabs {
border-radius: 10px;
overflow: hidden; /* Ensures rounded corners on tabs */
margin-bottom: 20px;
}
.tab-nav button {
background-color: #ecf0f1; /* Light grey for inactive tabs */
color: #34495e; /* Dark text for inactive tabs */
font-weight: bold;
padding: 12px 20px;
border-radius: 8px 8px 0 0;
margin-right: 5px; /* Small space between tabs */
transition: all 0.3s ease;
}
.tab-nav button.selected {
background-color: #4a90e2; /* Vibrant blue for active tab */
color: white; /* White text for active tab */
box-shadow: 0 4px 10px rgba(74, 144, 226, 0.3); /* Subtle shadow for active tab */
}
/* Input and Output Component Styling (General) */
.gr-box {
border-radius: 10px; /* Rounded corners for input/output boxes */
border: 1px solid #dfe6e9; /* Light border */
box-shadow: 0 2px 8px rgba(0, 0, 0, 0.05); /* Very subtle shadow */
padding: 20px;
background-color: #fcfcfc; /* Slightly off-white background */
}
/* Labels within components (e.g., "Upload Audio File", "Transcription Result") */
.label {
font-weight: bold;
color: #2c3e50; /* Dark text for labels */
font-size: 1.1em;
margin-bottom: 8px;
}
/* Buttons (Clear, Submit) */
.gr-button {
background-color: #4a90e2 !important; /* Primary blue for actions */
color: white !important;
border: none !important;
border-radius: 8px !important; /* Rounded buttons */
padding: 12px 25px !important;
font-weight: bold !important;
transition: background-color 0.3s ease, box-shadow 0.3s ease !important;
margin: 5px; /* Spacing between buttons */
}
.gr-button:hover {
background-color: #3a7bd2 !important; /* Darker blue on hover */
box-shadow: 0 4px 15px rgba(74, 144, 226, 0.4) !important;
}
/* Clear button specific */
.gr-button.secondary {
background-color: #e0e6eb !important; /* Lighter grey for clear */
color: #34495e !important;
}
.gr-button.secondary:hover {
background-color: #d1d8df !important;
box-shadow: none !important;
}
/* Textbox specific */
textarea {
border-radius: 8px !important;
border: 1px solid #bdc3c7 !important;
padding: 10px !important;
resize: vertical; /* Allow vertical resizing */
}
/* Audio component player */
.gr-audio-player {
border-radius: 8px;
background-color: #f0f0f0;
padding: 10px;
}
/* Footer styling */
hr {
border: none;
border-top: 1px solid #e0e0e0;
margin-top: 30px;
margin-bottom: 15px;
}
.footer-text {
font-size: 0.85em;
color: #a0a0a0;
text-align: center;
}
"""
# --- Main Gradio app: Blocks layout with theme + custom CSS ---
# `with gr.Blocks(...) as demo:` builds and enters the Blocks context in
# one statement; everything created inside becomes part of the page.
with gr.Blocks(theme=gr.themes.Soft(), css=custom_css) as demo:
    # Header: centered title, credit line and tagline (HTML via Markdown).
    gr.Markdown(
        """
<center>
<h1 style="color: #4A90E2;">
πŸŽ™οΈ AI-Powered Speech-to-Text Transcriber πŸ“
</h1>
<h3 style="color: #6C7A89;">
Developed by Muhammad Farhan Aslam.
</h3>
<h3 style="color: #6C7A89;">
Convert spoken words into accurate text with ease and precision.
</h3>
<p style="color: #8C9CA7; font-size: 1.05em;">
Effortlessly transcribe audio from your microphone or by uploading a file.
This application leverages advanced AI to provide clear and reliable transcriptions.
</p>
</center>
"""
    )
    # Two tabs (file upload / microphone) sharing the same transcription fn.
    gr.TabbedInterface(
        [file_transcribe, mic_transcribe],
        ["πŸ“ Transcribe Audio File", "🎀 Transcribe from Microphone"],
    )
    # Footer credit.
    gr.Markdown(
        """
<hr>
<p class="footer-text">
Built with ❀️ and Gradio on Hugging Face Transformers.
</p>
"""
    )

# share=True additionally prints a public *.gradio.live URL.
demo.launch(share=True)