# Copyright 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import json

import numpy as np
import torch
from torch.utils.dlpack import to_dlpack

import triton_python_backend_utils as pb_utils

from sparktts.models.audio_tokenizer import BiCodecTokenizer


class TritonPythonModel:
    """Triton Python model for audio tokenization.

    This model takes reference audio input and extracts semantic and global
    tokens using the BiCodec tokenizer.
    """

    def initialize(self, args):
        """Initialize the model.

        Args:
            args: Dictionary containing model configuration
        """
        # Parse model parameters from the Triton model config
        parameters = json.loads(args['model_config'])['parameters']
        model_params = {k: v["string_value"] for k, v in parameters.items()}

        # Initialize tokenizer on the GPU
        self.device = torch.device("cuda")
        self.audio_tokenizer = BiCodecTokenizer(model_params["model_dir"],
                                                device=self.device)
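
    # The "model_dir" value read above comes from the `parameters` section of
    # this model's config.pbtxt. A minimal sketch of that block is shown
    # below; the path is an assumption for illustration, not part of this
    # repository:
    #
    #   parameters {
    #     key: "model_dir"
    #     value: { string_value: "/models/Spark-TTS-0.5B" }
    #   }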

    def get_ref_clip(self, wav: np.ndarray) -> np.ndarray:
        """Extract reference audio clip for speaker embedding.

        Args:
            wav: Input waveform array

        Returns:
            Reference clip of fixed duration
        """
        SAMPLE_RATE = 16000
        REF_SEGMENT_DURATION = 6  # seconds
        LATENT_HOP_LENGTH = 320

        # Round the target length down to a whole number of latent hops
        ref_segment_length = (
            int(SAMPLE_RATE * REF_SEGMENT_DURATION)
            // LATENT_HOP_LENGTH
            * LATENT_HOP_LENGTH
        )
        wav_length = len(wav)

        if ref_segment_length > wav_length:
            # Repeat and truncate if input is too short
            repeat_times = ref_segment_length // wav_length + 1
            wav = np.tile(wav, repeat_times)

        return wav[:ref_segment_length]
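
    # Worked example for the clip logic above: at 16 kHz, a 6 s reference is
    # 96000 samples, which is already a multiple of the 320-sample hop
    # (96000 // 320 * 320 == 96000). For a 2 s input of 32000 samples,
    # repeat_times = 96000 // 32000 + 1 = 4, so the waveform is tiled to
    # 128000 samples and then truncated back to 96000.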

    def execute(self, requests):
        """Execute inference on the batched requests.

        Args:
            requests: List of inference requests

        Returns:
            List of inference responses containing tokenized outputs
        """
        reference_wav_list = []
        reference_wav_ref_clip_list = []

        # Process each request in the batch
        for request in requests:
            # Extract input tensors
            wav_array = pb_utils.get_input_tensor_by_name(
                request, "reference_wav").as_numpy()
            wav_len = pb_utils.get_input_tensor_by_name(
                request, "reference_wav_len").as_numpy().item()

            # Trim padding to the true length and drop the batch dimension
            wav = wav_array[:, :wav_len].squeeze(0)
            reference_wav_list.append(wav)

            wav_ref_clip = self.get_ref_clip(wav)
            reference_wav_ref_clip_list.append(torch.from_numpy(wav_ref_clip))

        # Batch process through the tokenizer
        ref_wav_clip_tensor = torch.stack(reference_wav_ref_clip_list, dim=0)
        wav2vec2_features = self.audio_tokenizer.extract_wav2vec2_features(
            reference_wav_list)
        audio_tokenizer_input = {
            "ref_wav": ref_wav_clip_tensor.to(self.device),
            "feat": wav2vec2_features.to(self.device),
        }
        semantic_tokens, global_tokens = self.audio_tokenizer.model.tokenize(
            audio_tokenizer_input)
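
        # Note on the response loop below: pb_utils.Tensor.from_dlpack wraps
        # each tensor as a Triton output via the DLPack protocol, avoiding an
        # extra device-to-host copy; indexing global_tokens[i] /
        # semantic_tokens[i] selects the slice of the batch belonging to
        # request i.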
        # Prepare one response per request
        responses = []
        for i in range(len(requests)):
            global_tokens_tensor = pb_utils.Tensor.from_dlpack(
                "global_tokens", to_dlpack(global_tokens[i]))
            semantic_tokens_tensor = pb_utils.Tensor.from_dlpack(
                "semantic_tokens", to_dlpack(semantic_tokens[i]))
            inference_response = pb_utils.InferenceResponse(
                output_tensors=[global_tokens_tensor, semantic_tokens_tensor])
            responses.append(inference_response)

        return responses
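

# A minimal client-side sketch of how this model might be queried with
# tritonclient over gRPC. The model name "audio_tokenizer", the server
# address, the tensor dtypes, and the dummy waveform are assumptions for
# illustration; check the deployed config.pbtxt for the actual values:
#
#   import numpy as np
#   import tritonclient.grpc as grpcclient
#
#   client = grpcclient.InferenceServerClient("localhost:8001")
#   wav = np.zeros((1, 16000), dtype=np.float32)       # 1 s of silence
#   wav_len = np.array([[16000]], dtype=np.int32)
#
#   inputs = [
#       grpcclient.InferInput("reference_wav", wav.shape, "FP32"),
#       grpcclient.InferInput("reference_wav_len", wav_len.shape, "INT32"),
#   ]
#   inputs[0].set_data_from_numpy(wav)
#   inputs[1].set_data_from_numpy(wav_len)
#
#   result = client.infer("audio_tokenizer", inputs)
#   global_tokens = result.as_numpy("global_tokens")
#   semantic_tokens = result.as_numpy("semantic_tokens")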