# Source: app/asr.py — commit 18f67bb (verified), "Update app/asr.py", author programindz.
import os
from pathlib import Path
import numpy as np
import sherpa_onnx
import scipy.signal
from opencc import OpenCC
# Cache directory that lives next to this module; it is created at import time
# if it does not exist yet.
# NOTE(review): nothing in view points Hugging Face at this path (e.g. through
# an environment variable) — confirm it is consumed elsewhere.
CACHE_DIR = Path(__file__).parent / "hf_cache"
CACHE_DIR.mkdir(parents=True, exist_ok=True)

# OpenCC converters, built once at import: 't2s' (Traditional -> Simplified
# Chinese) and 's2t' (Simplified -> Traditional Chinese).
to_ZHCN = OpenCC('t2s')
to_ZHTW = OpenCC('s2t')
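# Streaming Zipformer model files: create_recognizer() below hard-codes their
# relative paths (resolved against the working directory, expected to be the repo root).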
# Audio resampling utility
def resample_audio(audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
    """Resample ``audio`` from ``orig_sr`` to ``target_sr`` with polyphase filtering.

    Args:
        audio: Samples to resample; resampling runs along SciPy's default axis
            (the first one).
        orig_sr: Sampling rate of ``audio``.
        target_sr: Desired sampling rate.

    Returns:
        The array produced by ``scipy.signal.resample_poly``.
    """
    # The rate ratio target_sr / orig_sr is expressed as the up / down factors.
    return scipy.signal.resample_poly(audio, up=target_sr, down=orig_sr)
# Create the online (streaming) transducer recognizer from the bundled model files.
def create_recognizer():
    """Build the streaming transducer recognizer from hard-coded model files.

    Takes no arguments: the token table and the encoder/decoder/joiner ONNX
    files are relative paths under ``app/model parts``, so the process must be
    started from a directory where those paths resolve.

    Returns:
        The object returned by ``sherpa_onnx.OnlineRecognizer.from_transducer``,
        configured with ``provider="cpu"``, one thread, ``sample_rate=16000``,
        ``feature_dim=80``, greedy-search decoding and endpoint detection
        enabled.
    """
    model_dir = 'app/model parts'
    # The three ONNX files share the same export suffix.
    onnx_suffix = 'epoch-35-avg-7-chunk-32-left-256.fp16.onnx'

    # Plain greedy search, i.e. no hotword biasing.
    return sherpa_onnx.OnlineRecognizer.from_transducer(
        tokens=f'{model_dir}/tokens.txt',
        encoder=f'{model_dir}/encoder-{onnx_suffix}',
        decoder=f'{model_dir}/decoder-{onnx_suffix}',
        joiner=f'{model_dir}/joiner-{onnx_suffix}',
        provider="cpu",
        num_threads=1,
        sample_rate=16000,
        feature_dim=80,
        decoding_method="greedy_search",
        # endpoint detection parameters
        enable_endpoint_detection=True,
    )
def stream_audio(raw_pcm_bytes, stream, recognizer, orig_sr):
    """Feed one chunk of raw audio to ``stream`` and return the current transcript.

    Args:
        raw_pcm_bytes: Bytes-like buffer, read as native-endian float32 samples
            via ``np.frombuffer``.
        stream: Stream object that receives the samples through
            ``accept_waveform``.
        recognizer: Recognizer providing ``is_ready``, ``decode_streams`` and
            ``get_result`` for ``stream``.
        orig_sr: Sampling rate of the incoming samples.

    Returns:
        ``(text, rms)``: the recognizer's current result for ``stream`` passed
        through the module-level ``to_ZHTW`` converter, and the
        root-mean-square level of the resampled chunk. An empty buffer yields
        ``("", 0.0)`` without touching ``stream`` or ``recognizer``.
    """
    target_sr = 16000

    samples = np.frombuffer(raw_pcm_bytes, dtype=np.float32)
    if samples.size == 0:
        return "", 0.0

    resampled = resample_audio(samples, orig_sr, target_sr)
    rms = float(np.sqrt(np.mean(resampled ** 2)))
    stream.accept_waveform(target_sr, resampled)

    # Keep decoding until the recognizer reports nothing left to process;
    # a single `if` would decode only one step per call and let the
    # transcript fall behind when a chunk holds more than one step of audio.
    while recognizer.is_ready(stream):
        recognizer.decode_streams([stream])

    text = recognizer.get_result(stream)
    return to_ZHTW.convert(text), rms