Spaces:

programindz
/

kashmiri-streaming-asr-zipformer

Sleeping

App Files Files Community

kashmiri-streaming-asr-zipformer / app /asr.py

programindz

Update app/asr.py

18f67bb verified about 1 month ago

raw

history blame contribute delete

2.02 kB

	import os
	from pathlib import Path
	import numpy as np
	import sherpa_onnx
	import scipy.signal
	from opencc import OpenCC

	# Directory named "hf_cache" that sits next to this module; it is created
	# at import time if it does not exist yet.
	# NOTE(review): nothing visible in this file points Hugging Face at this
	# directory - confirm where CACHE_DIR is actually consumed.
	CACHE_DIR = Path(__file__).with_name("hf_cache")
	CACHE_DIR.mkdir(parents=True, exist_ok=True)

	# OpenCC converters built from the 's2t' (Simplified -> Traditional Chinese)
	# and 't2s' (Traditional -> Simplified Chinese) configurations.
	to_ZHTW = OpenCC('s2t')
	to_ZHCN = OpenCC('t2s')

	# Streaming Zipformer model files: there is no registry here - the paths are
	# hard-coded in create_recognizer() and are relative to the repo root.

	def resample_audio(audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
	    """Resample *audio* from ``orig_sr`` to ``target_sr``.

	    Thin wrapper around ``scipy.signal.resample_poly``: the target rate is
	    passed as the upsampling factor and the original rate as the
	    downsampling factor.

	    Args:
	        audio: Samples to resample.
	        orig_sr: Sample rate of ``audio``.
	        target_sr: Desired sample rate of the result.

	    Returns:
	        The array produced by ``scipy.signal.resample_poly``.
	    """
	    up_factor, down_factor = target_sr, orig_sr
	    return scipy.signal.resample_poly(audio, up=up_factor, down=down_factor)

	# Create the online (streaming) transducer recognizer used by this app.
	# Takes no arguments: the model files are fixed to the fp16 exports below.
	def create_recognizer():
	    """Return a ``sherpa_onnx.OnlineRecognizer`` built with ``from_transducer``.

	    The tokens, encoder, decoder and joiner files are hard-coded relative
	    paths under ``app/model parts``.
	    NOTE(review): relative paths presumably require the process to be
	    started from the repository root - confirm against the launcher.

	    The recognizer is configured for the CPU provider with one thread,
	    a 16 kHz sample rate, 80-dimensional features, greedy-search decoding
	    and endpoint detection enabled.
	    """
	    model_files = {
	        "tokens": 'app/model parts/tokens.txt',
	        "encoder": 'app/model parts/encoder-epoch-35-avg-7-chunk-32-left-256.fp16.onnx',
	        "decoder": 'app/model parts/decoder-epoch-35-avg-7-chunk-32-left-256.fp16.onnx',
	        "joiner": 'app/model parts/joiner-epoch-35-avg-7-chunk-32-left-256.fp16.onnx',
	    }
	    runtime_options = {
	        "provider": "cpu",
	        "num_threads": 1,
	        "sample_rate": 16000,
	        "feature_dim": 80,
	        # Plain greedy search; no hotword biasing is configured.
	        "decoding_method": "greedy_search",
	        "enable_endpoint_detection": True,
	    }
	    return sherpa_onnx.OnlineRecognizer.from_transducer(
	        **model_files,
	        **runtime_options,
	    )

	def stream_audio(raw_pcm_bytes, stream, recognizer, orig_sr):
	    """Feed one buffer of raw audio to a streaming recognizer.

	    The buffer is read as float32 samples, resampled from ``orig_sr`` to
	    16 kHz with ``resample_audio``, and handed to ``stream``. All audio the
	    recognizer reports as ready is then decoded before the current result
	    is read and passed through the ``to_ZHTW`` converter.

	    Args:
	        raw_pcm_bytes: Bytes-like buffer of float32 samples in native byte
	            order. ``np.frombuffer`` raises ``ValueError`` when its length
	            is not a multiple of 4.
	        stream: Recognizer stream; must provide ``accept_waveform``.
	        recognizer: Must provide ``is_ready``, ``decode_streams`` and
	            ``get_result`` for ``stream``.
	        orig_sr: Sample rate of the incoming samples.

	    Returns:
	        A ``(text, rms)`` tuple, where ``rms`` is the root-mean-square of
	        the resampled samples as a float. An empty buffer returns
	        ``("", 0.0)`` without touching ``stream`` or ``recognizer``.
	    """
	    audio = np.frombuffer(raw_pcm_bytes, dtype=np.float32)
	    if audio.size == 0:
	        return "", 0.0

	    resampled = resample_audio(audio, orig_sr, 16000)
	    rms = float(np.sqrt(np.mean(resampled ** 2)))

	    stream.accept_waveform(16000, resampled)
	    # Drain everything that is ready. A single `if` decodes at most one
	    # chunk per call, so a buffer holding more than one model chunk would
	    # leave a growing backlog and the returned text would lag the audio.
	    while recognizer.is_ready(stream):
	        recognizer.decode_streams([stream])
	    result = recognizer.get_result(stream)
	    return to_ZHTW.convert(result), rms