"""Streaming speech-recognition helpers built on sherpa-onnx.

Provides a factory for a streaming transducer recognizer and a helper that
feeds raw float32 PCM chunks into a recognizer stream, returning the running
transcript converted with OpenCC's ``s2t`` profile.
"""
import os
from pathlib import Path
from typing import Tuple

import numpy as np
import sherpa_onnx
import scipy.signal
from opencc import OpenCC

# Directory next to this file, created at import time so it is available as a
# user-writable cache location.
# NOTE(review): nothing in this module points Hugging Face at this directory
# (e.g. via an environment variable) -- confirm that happens elsewhere.
CACHE_DIR = Path(__file__).parent / "hf_cache"
os.makedirs(CACHE_DIR, exist_ok=True)

# OpenCC converters, built once at import and reused on every call.
to_ZHTW = OpenCC('s2t')
to_ZHCN = OpenCC('t2s')

# Sample rate the recognizer is configured with; all audio is resampled to it.
TARGET_SAMPLE_RATE = 16000


def resample_audio(audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
    """Resample *audio* from ``orig_sr`` to ``target_sr`` with polyphase filtering.

    Args:
        audio: Input samples.
        orig_sr: Sample rate of ``audio`` in Hz.
        target_sr: Desired sample rate in Hz.

    Returns:
        The resampled signal as produced by ``scipy.signal.resample_poly``.
    """
    return scipy.signal.resample_poly(audio, target_sr, orig_sr)


def create_recognizer() -> sherpa_onnx.OnlineRecognizer:
    """Create the online (streaming) transducer recognizer.

    The model files are referenced by paths relative to the current working
    directory (presumably the repository root -- verify against how the app
    is launched).

    Returns:
        A ``sherpa_onnx.OnlineRecognizer`` using greedy search on CPU with
        endpoint detection enabled.
    """
    tokens_path = 'app/model parts/tokens.txt'
    encoder_path = 'app/model parts/encoder-epoch-35-avg-7-chunk-32-left-256.fp16.onnx'
    decoder_path = 'app/model parts/decoder-epoch-35-avg-7-chunk-32-left-256.fp16.onnx'
    joiner_path = 'app/model parts/joiner-epoch-35-avg-7-chunk-32-left-256.fp16.onnx'

    # Plain greedy search: no hotword biasing is configured.
    return sherpa_onnx.OnlineRecognizer.from_transducer(
        tokens=tokens_path,
        encoder=encoder_path,
        decoder=decoder_path,
        joiner=joiner_path,
        provider="cpu",
        num_threads=1,
        sample_rate=TARGET_SAMPLE_RATE,
        feature_dim=80,
        decoding_method="greedy_search",
        enable_endpoint_detection=True,
    )


def stream_audio(
    raw_pcm_bytes: bytes,
    stream: "sherpa_onnx.OnlineStream",
    recognizer: "sherpa_onnx.OnlineRecognizer",
    orig_sr: int,
) -> Tuple[str, float]:
    """Feed one chunk of raw PCM into *stream* and return the current transcript.

    Args:
        raw_pcm_bytes: Audio chunk interpreted as native-endian float32 samples.
        stream: Recognizer stream that accumulates the audio.
        recognizer: Recognizer that owns ``stream``.
        orig_sr: Sample rate of the incoming chunk in Hz.

    Returns:
        ``(text, rms)`` where ``text`` is the recognizer's current result
        converted with the ``s2t`` OpenCC profile and ``rms`` is the
        root-mean-square level of the resampled chunk. An empty chunk yields
        ``("", 0.0)`` without touching the stream.
    """
    audio = np.frombuffer(raw_pcm_bytes, dtype=np.float32)
    if audio.size == 0:
        return "", 0.0

    resampled = resample_audio(audio, orig_sr, TARGET_SAMPLE_RATE)
    rms = float(np.sqrt(np.mean(resampled ** 2)))

    # Older SciPy releases may return float64 from resample_poly; hand the
    # recognizer a contiguous float32 buffer regardless.
    samples = np.ascontiguousarray(resampled, dtype=np.float32)
    stream.accept_waveform(TARGET_SAMPLE_RATE, samples)

    # Drain every decodable frame. A single `if` would run only one decode
    # step per call, so a large chunk leaves a backlog and the transcript
    # lags behind the audio.
    while recognizer.is_ready(stream):
        recognizer.decode_streams([stream])

    result = recognizer.get_result(stream)
    return to_ZHTW.convert(result), rms