import gradio as gr
import torch
import torchaudio
from transformers import pipeline
import numpy as np

# Load the Ichigo-whisper model as a speech-recognition pipeline
model_id = "Menlo/Ichigo-whisper-v0.1"
transcriber = pipeline(
    task="automatic-speech-recognition",
    model=model_id,
)

def transcribe_stream(stream, new_chunk):
    """Append a microphone chunk to the running buffer and transcribe it.

    Args:
        stream: Mono float32 samples accumulated by earlier calls, or
            ``None`` when nothing has been buffered yet.
        new_chunk: ``(sample_rate, samples)`` for the newest chunk, or
            ``None`` when no audio was delivered. A multi-dimensional
            ``samples`` array is averaged over axis 1 to obtain mono.

    Returns:
        A ``(stream, text)`` tuple: the updated buffer to feed back into
        the next call, and the transcription of everything buffered so
        far. ``text`` is ``""`` when there is nothing to transcribe.
    """
    # No chunk delivered: nothing to unpack, keep the buffer untouched.
    if new_chunk is None:
        return stream, ""

    sr, y = new_chunk
    y = np.asarray(y)

    # Down-mix to mono when the chunk has more than one channel
    if y.ndim > 1:
        y = y.mean(axis=1)
    y = y.astype(np.float32)

    # Append to the previously buffered audio (skip no-op concatenations)
    if stream is None:
        stream = y
    elif y.size:
        stream = np.concatenate([stream, y])

    # np.max cannot reduce an empty array, and there is nothing to decode.
    if stream.size == 0:
        return None, ""

    # Peak-normalise the whole buffer in one go. Scaling every chunk by its
    # own peak would amplify quiet/silent chunks to full scale and create
    # gain jumps at chunk boundaries, so the buffer keeps the raw samples
    # and only the copy handed to the model is normalised.
    peak = np.max(np.abs(stream))
    audio = stream / peak if peak > 0 else stream

    # Transcribe everything buffered so far
    result = transcriber({"sampling_rate": sr, "raw": audio})
    return stream, result["text"]

# Texts shown at the top of the Gradio page
title = "Ichigo Whisper Streaming Demo"
description = """
# 🍓 Ichigo Whisper Streaming Recognition
Nhận dạng giọng nói theo thời gian thực với mô hình Menlo/Ichigo-whisper-v0.1.
"""

# Streaming microphone input and the text box that displays the transcript
_microphone_input = gr.Audio(sources=["microphone"], streaming=True)
_transcript_output = gr.Textbox(label="Phiên âm theo thời gian thực")

# Build the streaming interface. The "state" slots line up with the first
# parameter and first return value of transcribe_stream (the audio buffer).
streaming_demo = gr.Interface(
    fn=transcribe_stream,
    inputs=["state", _microphone_input],
    outputs=["state", _transcript_output],
    title=title,
    description=description,
    live=True,
)

# Start the app only when this file is executed as a script
if __name__ == "__main__":
    streaming_demo.launch()