from pydub import AudioSegment
from transformers import pipeline
import gradio as gr
import ffmpeg
import soundfile as sf
import numpy as np
import os
def process_audio_length(audio_path):
    """Split the audio into 30-second chunks, padding the last (or only) chunk with silence."""
    audio = AudioSegment.from_file(audio_path)
    audio_length_ms = len(audio)
    chunks = []
    target_length_ms = 30000
    if audio_length_ms < target_length_ms:
        padding = AudioSegment.silent(duration=target_length_ms - audio_length_ms)
        padded_audio = audio + padding
        chunks.append(padded_audio)
    else:
        for start in range(0, audio_length_ms, target_length_ms):
            end = min(start + target_length_ms, audio_length_ms)
            chunk = audio[start:end]
            if len(chunk) < target_length_ms:
                padding = AudioSegment.silent(duration=target_length_ms - len(chunk))
                chunk += padding
            chunks.append(chunk)
    return chunks
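# Example (hypothetical file): a 75 s recording yields three 30 s chunks; the last
# holds 15 s of speech plus 15 s of silence padding:
#   chunks = process_audio_length("sample_75s.wav")
#   [len(c) for c in chunks]  # -> [30000, 30000, 30000] (milliseconds)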
def save_chunks(chunks):
    """Export each chunk as a WAV file and return the list of file paths."""
    output_folder = "audio_chunks"
    os.makedirs(output_folder, exist_ok=True)
    chunk_files = []
    for i, chunk in enumerate(chunks):
        chunk_name = f"{output_folder}/chunk_{i}.wav"
        chunk.export(chunk_name, format="wav")
        chunk_files.append(chunk_name)
    return chunk_files
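# Usage sketch (illustrative path): save_chunks(process_audio_length("sample.wav"))
# returns ["audio_chunks/chunk_0.wav", "audio_chunks/chunk_1.wav", ...] for the ASR step below.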
# Load the model using the pipeline
pipe = pipeline(task="automatic-speech-recognition", model="alisharifi/whisper-farsi")
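# The pipeline accepts an audio file path directly, e.g. pipe("audio_chunks/chunk_0.wav")["text"].
# Whisper models consume audio in 30-second windows, which is why the code above
# pads/splits everything to exactly 30 s; as an alternative sketch, recent transformers
# releases can handle long files in a single call via pipe(path, chunk_length_s=30).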
# Extract the audio track from a video file as 16 kHz mono WAV
def extract_audio_from_video(video_file):
    output_audio = "output_audio.wav"
    stream = ffmpeg.input(video_file)
    stream = ffmpeg.output(stream, output_audio, ac=1, ar="16000")
    ffmpeg.run(stream, overwrite_output=True)
    return output_audio
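# Roughly equivalent to the CLI call: ffmpeg -y -i <video_file> -ac 1 -ar 16000 output_audio.wav
# (mono, 16 kHz: the sample rate Whisper models expect)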
# ASR function: chunk the audio, transcribe each chunk, and join the results
def process_audio(audio):
    if audio is None:
        return "لطفاً صدایی ضبط کنید."  # "Please record some audio."
    chunks = process_audio_length(audio)
    chunk_files = save_chunks(chunks)
    transcriptions = []
    for chunk_file in chunk_files:
        transcription = pipe(chunk_file)["text"]
        transcriptions.append(transcription)
    return " ".join(transcriptions)
# Video handler
def process_video(video_file):
    audio_path = extract_audio_from_video(video_file)
    return process_audio(audio_path)

# Audio handler
def process_audio_file(audio_file):
    return process_audio(audio_file)
# Microphone handler for gr.Audio(type="numpy") input; unused in the interface below,
# where the microphone tab passes a file path instead
def process_microphone(audio_data):
    print(f"Audio data type: {type(audio_data)}")
    print(f"Audio data content: {audio_data}")
    if audio_data is None:
        return "هیچ صدایی ضبط نشد. لطفاً دوباره امتحان کنید."  # "No audio was recorded. Please try again."
    # Check the data format
    if not isinstance(audio_data, tuple) or len(audio_data) != 2:
        return f"فرمت داده صوتی نادرست است: {type(audio_data)}"  # "Invalid audio data format"
    sample_rate, audio_array = audio_data  # Gradio passes (sample_rate, data) for type="numpy"
    if not isinstance(audio_array, np.ndarray):
        return f"داده صوتی نادرست است: {type(audio_array)}"  # "Invalid audio data"
    # Make sure the audio data is two-dimensional for soundfile
    if audio_array.ndim == 1:
        audio_array = audio_array[:, np.newaxis]
    # Save the recording to a file
    audio_path = "recorded_audio.wav"
    sf.write(audio_path, audio_array, sample_rate)
    return process_audio(audio_path)
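# Wiring sketch for the numpy handler above (not used in the interface below, which
# passes file paths instead):
#   mic = gr.Audio(sources="microphone", type="numpy")
#   btn.click(process_microphone, inputs=mic, outputs=textbox)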
# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("## سامانه تبدیل گفتار به متن")  # "Speech-to-Text System"

    # Video upload tab
    with gr.Tab("آپلود ویدئو"):  # "Upload video"
        video_input = gr.Video(label="آپلود فایل ویدئو")  # "Upload video file"
        video_output = gr.Textbox(label="متن استخراج شده")  # "Extracted text"
        video_button = gr.Button("پردازش")  # "Process"
        video_button.click(process_video, inputs=video_input, outputs=video_output)

    # Audio upload tab
    with gr.Tab("آپلود فایل صوتی"):  # "Upload audio file"
        audio_input = gr.Audio(label="آپلود فایل صوتی", type="filepath")
        audio_output = gr.Textbox(label="متن استخراج شده")
        audio_button = gr.Button("پردازش")
        audio_button.click(process_audio_file, inputs=audio_input, outputs=audio_output)

    # Microphone recording tab
    with gr.Tab("ضبط صدا"):  # "Record audio"
        mic_input = gr.Audio(sources="microphone", type="filepath", label="ضبط صدا")
        mic_output = gr.Textbox(label="متن استخراج شده")
        mic_button = gr.Button("پردازش")
        # type="filepath" delivers a WAV path, so the generic file handler is reused here
        mic_button.click(process_audio_file, inputs=mic_input, outputs=mic_output)

# share=True publishes a temporary public link alongside the local server
demo.launch(share=True)