File size: 4,548 Bytes
f38370b
6762ee0
 
 
f38370b
2329662
6762ee0
 
f38370b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6762ee0
 
 
f38370b
6762ee0
 
 
 
 
 
 
f38370b
6762ee0
 
 
f38370b
 
 
 
 
 
 
 
6762ee0
f38370b
6762ee0
 
 
 
f38370b
6762ee0
 
 
 
 
f38370b
 
2329662
f38370b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2329662
6762ee0
f38370b
6762ee0
 
 
f38370b
6762ee0
 
 
 
 
 
f38370b
6762ee0
 
 
 
 
f38370b
6762ee0
f38370b
6762ee0
 
f38370b
6762ee0
f38370b
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
from pydub import AudioSegment
from transformers import pipeline
import gradio as gr
import ffmpeg
import soundfile as sf
import numpy as np


def process_audio_length(audio_path, target_length_ms=30000):
    """Split an audio file into fixed-length chunks, padding with silence.

    Args:
        audio_path: Path to any audio file readable by pydub/ffmpeg.
        target_length_ms: Chunk length in milliseconds. Defaults to 30 000 ms
            (30 s), the window the Whisper pipeline below processes at a time.

    Returns:
        list[AudioSegment]: Chunks of exactly ``target_length_ms`` each; the
        final (or only) chunk is right-padded with silence when the source
        does not divide evenly. Empty input yields one fully silent chunk,
        matching the original short-audio behavior.
    """
    audio = AudioSegment.from_file(audio_path)
    audio_length_ms = len(audio)  # pydub reports duration in milliseconds
    chunks = []
    # `or 1` guarantees at least one iteration so zero-length audio still
    # produces a single (all-silence) chunk, as the old `<` branch did.
    for start in range(0, audio_length_ms or 1, target_length_ms):
        chunk = audio[start:start + target_length_ms]
        if len(chunk) < target_length_ms:
            # Pad the tail chunk up to the full window with silence.
            chunk += AudioSegment.silent(duration=target_length_ms - len(chunk))
        chunks.append(chunk)

    return chunks

def save_chunks(chunks, output_folder="audio_chunks"):
    """Export audio chunks as WAV files under *output_folder*.

    Args:
        chunks: Iterable of objects exposing ``export(path, format=...)``
            (pydub ``AudioSegment`` in practice).
        output_folder: Directory to write into; created if missing.

    Returns:
        list[str]: Paths of the written files in chunk order
        (``chunk_0.wav``, ``chunk_1.wav``, ...). Files from a previous
        call with the same indices are overwritten.
    """
    import os
    # exist_ok avoids the check-then-create race of the old exists()/makedirs pair.
    os.makedirs(output_folder, exist_ok=True)

    chunk_files = []
    for i, chunk in enumerate(chunks):
        chunk_name = f"{output_folder}/chunk_{i}.wav"
        chunk.export(chunk_name, format="wav")
        chunk_files.append(chunk_name)

    return chunk_files

# Load the Farsi (Persian) Whisper ASR model once at module import time;
# this single `pipe` instance is reused by every transcription request below.
pipe = pipeline(task="automatic-speech-recognition", model="alisharifi/whisper-farsi")

# Extract the soundtrack of a video file into a WAV suitable for the ASR model
def extract_audio_from_video(video_file):
    """Extract the audio track of *video_file* into ``output_audio.wav``.

    The track is downmixed to mono (``ac=1``) and resampled to 16 kHz
    (``ar="16000"``) before being written. Returns the output file path.
    """
    output_audio = "output_audio.wav"
    (
        ffmpeg
        .input(video_file)
        .output(output_audio, ac=1, ar="16000")
        .run(overwrite_output=True)
    )
    return output_audio

# ASR function
def process_audio(audio):
    """Transcribe an audio file to Farsi text.

    The file is split into 30-second chunks (the window `pipe` handles),
    each chunk is transcribed, and the partial transcripts are joined
    with single spaces.

    Args:
        audio: Path to an audio file, or None when nothing was provided.

    Returns:
        str: The joined transcription, or a Persian prompt when *audio*
        is None.
    """
    import os

    if audio is None:
        return "لطفاً صدایی ضبط کنید."
    chunks = process_audio_length(audio)
    chunk_files = save_chunks(chunks)
    transcriptions = []
    try:
        for chunk_file in chunk_files:
            transcriptions.append(pipe(chunk_file)["text"])
    finally:
        # Remove the temporary chunk WAVs; previously they accumulated in
        # audio_chunks/ across calls and were never cleaned up.
        for chunk_file in chunk_files:
            try:
                os.remove(chunk_file)
            except OSError:
                pass  # best effort — a leftover temp file is not fatal

    return " ".join(transcriptions)

# Video handler: pull out the soundtrack, then feed it to the ASR pipeline
def process_video(video_file):
    """Transcribe the audio track of an uploaded video file."""
    return process_audio(extract_audio_from_video(video_file))

# Audio handler: Gradio hands us a filepath, so delegate straight through
def process_audio_file(audio_file):
    """Transcribe an uploaded (or recorded) audio file given as a path."""
    return process_audio(audio_file)

# Microphone handler for Gradio's type="numpy" audio payloads.
# NOTE(review): not wired into the interface below — the mic tab uses
# type="filepath" with process_audio_file. Kept for the numpy-based flow.
def process_microphone(audio_data):
    """Save raw microphone data to a WAV file and transcribe it.

    Args:
        audio_data: Gradio numpy-mode audio payload — a
            ``(sample_rate, numpy.ndarray)`` tuple — or None when nothing
            was recorded.

    Returns:
        str: The transcription, or a Persian error message for bad input.
    """
    print(f"Audio data type: {type(audio_data)}")
    print(f"Audio data content: {audio_data}")

    if audio_data is None:
        return "هیچ صدایی ضبط نشد. لطفاً دوباره امتحان کنید."

    # Validate the payload shape before unpacking
    if not isinstance(audio_data, tuple) or len(audio_data) != 2:
        return f"فرمت داده صوتی نادرست است: {type(audio_data)}"

    # BUGFIX: Gradio's numpy audio payload is (sample_rate, data); the
    # original unpacked it in the opposite order, which would have passed
    # the integer sample rate to sf.write as the audio samples.
    sample_rate, audio_array = audio_data
    if not isinstance(audio_array, np.ndarray):
        return f"داده صوتی نادرست است: {type(audio_array)}"

    # soundfile accepts (frames,) or (frames, channels); promote 1-D to 2-D
    if audio_array.ndim == 1:
        audio_array = audio_array[:, np.newaxis]

    # Persist to disk so the file-based ASR pipeline can consume it
    audio_path = "recorded_audio.wav"
    sf.write(audio_path, audio_array, sample_rate)
    return process_audio(audio_path)


# Gradio interface: three tabs that all feed into the same ASR back end
with gr.Blocks() as demo:
    gr.Markdown("## سامانه تبدیل گفتار به متن")

    # Tab 1: upload a video — the soundtrack is extracted, then transcribed
    with gr.Tab("آپلود ویدئو"):
        video_input = gr.Video(label="آپلود فایل ویدئو")
        video_output = gr.Textbox(label="متن استخراج شده")
        video_button = gr.Button("پردازش")
        video_button.click(process_video, inputs=video_input, outputs=video_output)

    # Tab 2: upload an audio file (type="filepath" delivers a path string)
    with gr.Tab("آپلود فایل صوتی"):
        audio_input = gr.Audio(label="آپلود فایل صوتی", type="filepath")
        audio_output = gr.Textbox(label="متن استخراج شده")
        audio_button = gr.Button("پردازش")
        audio_button.click(process_audio_file, inputs=audio_input, outputs=audio_output)
    # Tab 3: record from the microphone; type="filepath" saves the recording
    # to a temp file, so process_audio_file handles it here (the numpy-based
    # process_microphone handler defined above is not wired in)
    with gr.Tab("ضبط صدا"):
        mic_input = gr.Audio(sources="microphone", type="filepath", label="ضبط صدا")
        mic_output = gr.Textbox(label="متن استخراج شده")
        mic_button = gr.Button("پردازش")
        mic_button.click(process_audio_file, inputs=mic_input, outputs=mic_output)

# share=True publishes a temporary public Gradio link in addition to localhost
demo.launch(share=True)