|
import os |
|
from pathlib import Path |
|
import numpy as np |
|
import sherpa_onnx |
|
import scipy.signal |
|
from opencc import OpenCC |
|
|
|
|
|
# Cache directory that lives alongside this module file.
CACHE_DIR = Path(__file__).parent.joinpath("hf_cache")

# Created at import time; a directory that already exists is left untouched.
CACHE_DIR.mkdir(parents=True, exist_ok=True)
|
|
|
# OpenCC converters, constructed once at import time and shared by the module.
# 's2t' is OpenCC's Simplified -> Traditional config; 't2s' is the reverse.
# NOTE(review): despite the ZHTW name, 's2t' is the generic Traditional config,
# not the Taiwan-specific 's2tw' -- confirm this is intended.
to_ZHTW, to_ZHCN = OpenCC('s2t'), OpenCC('t2s')
|
|
|
|
|
|
|
|
|
def resample_audio(audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
    """Resample ``audio`` from ``orig_sr`` to ``target_sr`` by polyphase filtering.

    Args:
        audio: Samples to resample.
        orig_sr: Sample rate of ``audio``.
        target_sr: Sample rate the result should have.

    Returns:
        The array produced by ``scipy.signal.resample_poly``.
    """
    # The rate ratio target_sr / orig_sr maps onto resample_poly's up / down.
    converted = scipy.signal.resample_poly(audio, up=target_sr, down=orig_sr)
    return converted
|
|
|
|
|
|
|
|
|
def create_recognizer(model_dir="app/model parts"):
    """Build the streaming transducer recognizer.

    Args:
        model_dir: Directory holding ``tokens.txt`` and the encoder, decoder
            and joiner ONNX files. The default is a relative path, so it is
            resolved against the process's current working directory; pass an
            absolute path to make loading independent of where the process
            was started.

    Returns:
        The recognizer returned by
        ``sherpa_onnx.OnlineRecognizer.from_transducer``.
    """
    model_root = Path(model_dir)
    # The three ONNX parts share one checkpoint suffix.
    checkpoint = "epoch-35-avg-7-chunk-32-left-256.fp16.onnx"

    def part(filename):
        # as_posix() keeps the default paths identical to the former literals
        # ('app/model parts/...') on every platform.
        return (model_root / filename).as_posix()

    return sherpa_onnx.OnlineRecognizer.from_transducer(
        tokens=part("tokens.txt"),
        encoder=part(f"encoder-{checkpoint}"),
        decoder=part(f"decoder-{checkpoint}"),
        joiner=part(f"joiner-{checkpoint}"),
        provider="cpu",
        num_threads=1,
        sample_rate=16000,
        feature_dim=80,
        decoding_method="greedy_search",
        enable_endpoint_detection=True,
    )
|
|
|
def stream_audio(raw_pcm_bytes, stream, recognizer, orig_sr):
    """Feed one chunk of raw audio to the recognizer and return the current text.

    Args:
        raw_pcm_bytes: Buffer of samples, interpreted as native float32.
        stream: Stream object that accumulates the audio (presumably obtained
            from ``recognizer.create_stream()`` -- confirm against the caller).
        recognizer: Recognizer used to decode ``stream``.
        orig_sr: Sample rate of ``raw_pcm_bytes``.

    Returns:
        A ``(text, rms)`` tuple: the recognizer's current result for
        ``stream`` converted with ``to_ZHTW``, and the RMS level of the
        resampled chunk. Empty input returns ``("", 0.0)`` without touching
        the stream.

    Raises:
        ValueError: Raised by ``np.frombuffer`` when the buffer length is not
            a multiple of the float32 item size (4 bytes).
    """
    target_sr = 16000  # rate the audio is resampled to before being fed in

    audio = np.frombuffer(raw_pcm_bytes, dtype=np.float32)
    if audio.size == 0:
        return "", 0.0

    resampled = resample_audio(audio, orig_sr, target_sr)
    rms = float(np.sqrt(np.mean(resampled ** 2)))

    stream.accept_waveform(target_sr, resampled)

    # Decode until the stream has no complete chunk left. A single `if` only
    # consumed one chunk per call, so a large input buffer left frames
    # undecoded and the returned text lagged behind the audio.
    while recognizer.is_ready(stream):
        recognizer.decode_streams([stream])

    # NOTE(review): endpoint detection is enabled on the recognizer, but no
    # endpoint check / reset happens here -- confirm the caller handles it.
    result = recognizer.get_result(stream)
    return to_ZHTW.convert(result), rms
|
|