# Copyright 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import json

import numpy as np
import torch
from torch.utils.dlpack import to_dlpack

import triton_python_backend_utils as pb_utils

from sparktts.models.audio_tokenizer import BiCodecTokenizer


class TritonPythonModel:
    """Triton Python model for audio tokenization.

    This model takes reference audio input and extracts semantic and global
    tokens using the BiCodec tokenizer.
    """

    def initialize(self, args):
        """Initialize the model.

        Args:
            args: Dictionary containing model configuration
        """
        # Parse model parameters from the Triton model config
        parameters = json.loads(args['model_config'])['parameters']
        model_params = {k: v["string_value"] for k, v in parameters.items()}

        # Initialize tokenizer on the GPU
        self.device = torch.device("cuda")
        self.audio_tokenizer = BiCodecTokenizer(model_params["model_dir"],
                                                device=self.device)
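
    # The "model_dir" value read above comes from the `parameters` section of
    # this model's config.pbtxt. A minimal sketch of that block is shown
    # below; the path is an assumption for illustration, not part of this
    # repository:
    #
    #   parameters {
    #     key: "model_dir"
    #     value: { string_value: "/models/Spark-TTS-0.5B" }
    #   }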

    def get_ref_clip(self, wav: np.ndarray) -> np.ndarray:
        """Extract reference audio clip for speaker embedding.

        Args:
            wav: Input waveform array

        Returns:
            Reference clip of fixed duration
        """
        SAMPLE_RATE = 16000
        REF_SEGMENT_DURATION = 6  # seconds
        LATENT_HOP_LENGTH = 320

        # Round the target length down to a whole number of latent hops
        ref_segment_length = (
            int(SAMPLE_RATE * REF_SEGMENT_DURATION)
            // LATENT_HOP_LENGTH
            * LATENT_HOP_LENGTH
        )
        wav_length = len(wav)

        if ref_segment_length > wav_length:
            # Repeat and truncate if input is too short
            repeat_times = ref_segment_length // wav_length + 1
            wav = np.tile(wav, repeat_times)

        return wav[:ref_segment_length]
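
    # Worked example for the clip logic above: at 16 kHz, a 6 s reference is
    # 96000 samples, which is already a multiple of the 320-sample hop
    # (96000 // 320 * 320 == 96000). For a 2 s input of 32000 samples,
    # repeat_times = 96000 // 32000 + 1 = 4, so the waveform is tiled to
    # 128000 samples and then truncated back to 96000.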

    def execute(self, requests):
        """Execute inference on the batched requests.

        Args:
            requests: List of inference requests

        Returns:
            List of inference responses containing tokenized outputs
        """
        reference_wav_list = []
        reference_wav_ref_clip_list = []

        # Process each request in the batch
        for request in requests:
            # Extract input tensors
            wav_array = pb_utils.get_input_tensor_by_name(
                request, "reference_wav").as_numpy()
            wav_len = pb_utils.get_input_tensor_by_name(
                request, "reference_wav_len").as_numpy().item()

            # Trim padding to the true length and drop the batch dimension
            wav = wav_array[:, :wav_len].squeeze(0)
            reference_wav_list.append(wav)

            wav_ref_clip = self.get_ref_clip(wav)
            reference_wav_ref_clip_list.append(torch.from_numpy(wav_ref_clip))

        # Batch process through the tokenizer
        ref_wav_clip_tensor = torch.stack(reference_wav_ref_clip_list, dim=0)
        wav2vec2_features = self.audio_tokenizer.extract_wav2vec2_features(
            reference_wav_list)
        audio_tokenizer_input = {
            "ref_wav": ref_wav_clip_tensor.to(self.device),
            "feat": wav2vec2_features.to(self.device),
        }
        semantic_tokens, global_tokens = self.audio_tokenizer.model.tokenize(
            audio_tokenizer_input)
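
        # Note on the response loop below: pb_utils.Tensor.from_dlpack wraps
        # each tensor as a Triton output via the DLPack protocol, avoiding an
        # extra device-to-host copy; indexing global_tokens[i] /
        # semantic_tokens[i] selects the slice of the batch belonging to
        # request i.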
        # Prepare one response per request
        responses = []
        for i in range(len(requests)):
            global_tokens_tensor = pb_utils.Tensor.from_dlpack(
                "global_tokens", to_dlpack(global_tokens[i]))
            semantic_tokens_tensor = pb_utils.Tensor.from_dlpack(
                "semantic_tokens", to_dlpack(semantic_tokens[i]))
            inference_response = pb_utils.InferenceResponse(
                output_tensors=[global_tokens_tensor, semantic_tokens_tensor])
            responses.append(inference_response)

        return responses
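

# A minimal client-side sketch of how this model might be queried with
# tritonclient over gRPC. The model name "audio_tokenizer", the server
# address, the tensor dtypes, and the dummy waveform are assumptions for
# illustration; check the deployed config.pbtxt for the actual values:
#
#   import numpy as np
#   import tritonclient.grpc as grpcclient
#
#   client = grpcclient.InferenceServerClient("localhost:8001")
#   wav = np.zeros((1, 16000), dtype=np.float32)       # 1 s of silence
#   wav_len = np.array([[16000]], dtype=np.int32)
#
#   inputs = [
#       grpcclient.InferInput("reference_wav", wav.shape, "FP32"),
#       grpcclient.InferInput("reference_wav_len", wav_len.shape, "INT32"),
#   ]
#   inputs[0].set_data_from_numpy(wav)
#   inputs[1].set_data_from_numpy(wav_len)
#
#   result = client.infer("audio_tokenizer", inputs)
#   global_tokens = result.as_numpy("global_tokens")
#   semantic_tokens = result.as_numpy("semantic_tokens")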