Spaces:

vankienemk
/

Voice-regconizer

Running

Voice-regconizer / app.py

Update app.py

0c2f2f9 verified 4 months ago

1.52 kB

	import gradio as gr
	import torch
	import torchaudio
	from transformers import pipeline
	import numpy as np

	# Load the Ichigo-whisper checkpoint once, at import time, into a shared
	# speech-recognition pipeline.
	model_id = "Menlo/Ichigo-whisper-v0.1"
	transcriber = pipeline(
	    task="automatic-speech-recognition",
	    model=model_id,
	)

	def transcribe_stream(stream, new_chunk):
	    """Append a microphone chunk to the running audio and transcribe it.

	    Args:
	        stream: Audio accumulated by earlier calls (the first value this
	            function returned last time), or ``None`` on the first call.
	        new_chunk: ``(sample_rate, samples)`` pair for the newest chunk,
	            where ``samples`` is a NumPy array, or ``None`` when no audio
	            was delivered for this call.

	    Returns:
	        A ``(stream, text)`` tuple: the updated accumulated audio and the
	        transcription of all of it. ``text`` is ``""`` when there is no
	        audio to transcribe.
	    """
	    # No chunk delivered: keep the accumulated audio and skip the model
	    # instead of failing on the tuple unpack below.
	    if new_chunk is None:
	        return stream, ""

	    # Unpack the sample rate and the raw samples.
	    sr, y = new_chunk

	    # Downmix to mono when the chunk carries a channel axis.
	    if y.ndim > 1:
	        y = y.mean(axis=1)

	    # Peak-normalise this chunk. astype() returns a copy, so the in-place
	    # division never touches the caller's buffer. An empty or all-zero
	    # chunk has no usable peak and is left unscaled (np.max would raise on
	    # a zero-length array).
	    y = y.astype(np.float32)
	    peak = np.max(np.abs(y)) if y.size else 0.0
	    if peak > 0:
	        y /= peak

	    # Append to the audio gathered so far.
	    stream = y if stream is None else np.concatenate([stream, y])

	    # Nothing recorded yet: do not call the model on an empty signal.
	    if stream.size == 0:
	        return stream, ""

	    # NOTE(review): the whole accumulated stream is re-transcribed on every
	    # chunk, so the work per call grows with the recording length.
	    result = transcriber({"sampling_rate": sr, "raw": stream})
	    return stream, result["text"]

	# Title and description handed to the Gradio interface below.
	title = "Ichigo Whisper Streaming Demo"
	description = """
	# 🍓 Ichigo Whisper Streaming Recognition
	Nhận dạng giọng nói theo thời gian thực với mô hình Menlo/Ichigo-whisper-v0.1.
	"""

	# Build the visible components first so the Interface call stays compact.
	_mic_input = gr.Audio(sources=["microphone"], streaming=True)
	_transcript_box = gr.Textbox(label="Phiên âm theo thời gian thực")

	# Streaming interface: the "state" slots line up with the `stream`
	# argument and first return value of transcribe_stream; the microphone
	# feeds `new_chunk` and the textbox shows the returned text.
	streaming_demo = gr.Interface(
	    fn=transcribe_stream,
	    inputs=["state", _mic_input],
	    outputs=["state", _transcript_box],
	    live=True,
	    title=title,
	    description=description,
	)

	# Start the app only when this file is executed as a script.
	if __name__ == "__main__":
	    streaming_demo.launch()