import os

from pydub import AudioSegment
from transformers import pipeline
import gradio as gr
import ffmpeg
import soundfile as sf
import numpy as np

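# Whisper models transcribe audio in fixed 30-second windows, so input is
# padded with silence (if shorter) or split into 30 s chunks (if longer).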
def process_audio_length(audio_path):
    audio = AudioSegment.from_file(audio_path)
    audio_length_ms = len(audio)
    chunks = []
    target_length_ms = 30000
    if audio_length_ms < target_length_ms:
        padding = AudioSegment.silent(duration=target_length_ms - audio_length_ms)
        padded_audio = audio + padding
        chunks.append(padded_audio)
    else:
        for start in range(0, audio_length_ms, target_length_ms):
            end = min(start + target_length_ms, audio_length_ms)
            chunk = audio[start:end]
            if len(chunk) < target_length_ms:
                padding = AudioSegment.silent(duration=target_length_ms - len(chunk))
                chunk += padding
            chunks.append(chunk)
    return chunks

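# Write each chunk to disk as a WAV file so the pipeline can read it by path.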
def save_chunks(chunks):
    output_folder = "audio_chunks"
    os.makedirs(output_folder, exist_ok=True)
    chunk_files = []
    for i, chunk in enumerate(chunks):
        chunk_name = f"{output_folder}/chunk_{i}.wav"
        chunk.export(chunk_name, format="wav")
        chunk_files.append(chunk_name)
    return chunk_files

# Load the model using the pipeline
pipe = pipeline(task="automatic-speech-recognition", model="alisharifi/whisper-farsi")
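
# Note: the ASR pipeline can also chunk long audio itself (e.g.
# pipeline(..., chunk_length_s=30)); the manual chunking above keeps
# explicit control over padding and chunk boundaries.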
# Extract the audio track from a video file as mono 16 kHz WAV,
# the input format Whisper expects
def extract_audio_from_video(video_file):
    output_audio = "output_audio.wav"
    stream = ffmpeg.input(video_file)
    stream = ffmpeg.output(stream, output_audio, ac=1, ar="16000")
    ffmpeg.run(stream, overwrite_output=True)
    return output_audio

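# The ffmpeg-python calls above are equivalent to the CLI command:
#   ffmpeg -y -i <video_file> -ac 1 -ar 16000 output_audio.wav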

# ASR function: transcribe an audio file chunk by chunk
def process_audio(audio):
    if audio is None:
        return "لطفاً صدایی ضبط کنید."  # "Please record some audio."
    chunks = process_audio_length(audio)
    chunk_files = save_chunks(chunks)
    transcriptions = []
    for chunk_file in chunk_files:
        transcription = pipe(chunk_file)["text"]
        transcriptions.append(transcription)
    return " ".join(transcriptions)

# Video handler
def process_video(video_file):
    audio_path = extract_audio_from_video(video_file)
    return process_audio(audio_path)

# Audio handler
def process_audio_file(audio_file):
    return process_audio(audio_file)

# Microphone handler for raw numpy input (used when gr.Audio has
# type="numpy"; the microphone tab below uses type="filepath", so it
# goes through process_audio_file instead)
def process_microphone(audio_data):
    print(f"Audio data type: {type(audio_data)}")
    print(f"Audio data content: {audio_data}")
    if audio_data is None:
        return "هیچ صدایی ضبط نشد. لطفاً دوباره امتحان کنید."  # "No audio was recorded. Please try again."
    # Check the data format: Gradio delivers numpy audio as (sample_rate, data)
    if not isinstance(audio_data, tuple) or len(audio_data) != 2:
        return f"فرمت داده صوتی نادرست است: {type(audio_data)}"  # "Invalid audio data format"
    sample_rate, audio_array = audio_data
    if not isinstance(audio_array, np.ndarray):
        return f"داده صوتی نادرست است: {type(audio_array)}"  # "Invalid audio data"
    # Make sure the audio array is two-dimensional (frames, channels)
    if audio_array.ndim == 1:
        audio_array = audio_array[:, np.newaxis]
    # Save the recording to a file
    audio_path = "recorded_audio.wav"
    sf.write(audio_path, audio_array, sample_rate)
    return process_audio(audio_path)

# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("## سامانه تبدیل گفتار به متن")  # "Speech-to-Text System"

    # Video input
    with gr.Tab("آپلود ویدئو"):  # "Upload video"
        video_input = gr.Video(label="آپلود فایل ویدئو")  # "Upload video file"
        video_output = gr.Textbox(label="متن استخراج شده")  # "Extracted text"
        video_button = gr.Button("پردازش")  # "Process"
        video_button.click(process_video, inputs=video_input, outputs=video_output)

    # Audio input
    with gr.Tab("آپلود فایل صوتی"):  # "Upload audio file"
        audio_input = gr.Audio(label="آپلود فایل صوتی", type="filepath")
        audio_output = gr.Textbox(label="متن استخراج شده")
        audio_button = gr.Button("پردازش")
        audio_button.click(process_audio_file, inputs=audio_input, outputs=audio_output)

    # Voice recording
    with gr.Tab("ضبط صدا"):  # "Record audio"
        mic_input = gr.Audio(sources=["microphone"], type="filepath", label="ضبط صدا")
        mic_output = gr.Textbox(label="متن استخراج شده")
        mic_button = gr.Button("پردازش")
        mic_button.click(process_audio_file, inputs=mic_input, outputs=mic_output)

# Launch with share=True so the app gets a public, shareable link
demo.launch(share=True)