|
--- |
|
license: mit |
|
|
language: |
|
- en |
|
- es |
|
- ko |
|
- zh |
|
- pl |
|
- pt |
|
- uk |
|
- de |
|
- fr |
|
- el |
|
- ru |
|
pipeline_tag: text-to-speech |
|
library_name: transformers |
|
tags: |
|
- tts |
|
--- |
|
|
|
[Live demo on Hugging Face Spaces](https://neuralaudioai-na-base.hf.space)
|
|
|
## Model Information |
|
**NA_base** is an open-source Text-to-Speech (TTS) model designed for **high-quality, real-time speech synthesis**. It pairs a causal language model with a neural audio codec and is optimized for **speed, efficiency, and multilingual support**, making it a practical choice for developers, businesses, and researchers.
|
|
|
**Key Features**: |
|
- Supports **11 languages** (see the `language` list above)
|
- **Fast real-time inference** |
|
- Natural-sounding, **human-like speech** |
|
- Designed for **deployment in cloud, edge, and offline environments** |
|
|
|
## How It Works |
|
**NA_base** treats speech synthesis as language modeling: a causal language model generates discrete speech tokens from raw text, and the xcodec2 neural codec decodes those tokens into a 16 kHz waveform. The model is **lightweight**, efficient, and trained on high-quality datasets for robust generalization.
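
To make the interface concrete, here is a minimal sketch (inferred from the usage examples below, not an official API) of how codec IDs round-trip through the `<|s_N|>` token strings the model reads and writes:

```python
# Round-trip between codec IDs and <|s_N|> token strings
# (format taken from the helper functions in the examples below).
ids = [17, 4052, 993]
tokens = [f"<|s_{i}|>" for i in ids]        # ['<|s_17|>', '<|s_4052|>', '<|s_993|>']
recovered = [int(t[4:-2]) for t in tokens]  # strip '<|s_' and '|>'
assert recovered == ids
```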
|
|
|
## Usage |
|
Install the required dependencies: |
|
|
|
```bash |
|
pip install transformers torch soundfile xcodec2
|
``` |
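
Both examples below move the language model and the codec to a CUDA device, so a GPU is assumed; adapt the `.to("cuda")` / `.cuda()` calls if you run elsewhere.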
|
### Synthesizing Speech from Text |
|
```python |
|
from transformers import AutoTokenizer, AutoModelForCausalLM
from xcodec2.modeling_xcodec2 import XCodec2Model
import torch
import soundfile as sf

# Define the model
model_name = "NeuralAudioAI/NA_base"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model.eval().to("cuda")

# Load the Codec model
codec_model_path = "NeuralAudioAI/xcodec2"
Codec_model = XCodec2Model.from_pretrained(codec_model_path)
Codec_model.eval().cuda()

# Input text for synthesis
input_text = "Dealing with family secrets is never easy. Yet, sometimes, omission is a form of protection, intending to safeguard some from the harsh truths. One day, I hope you understand the reasons behind my actions. Until then, please, bear with me."

def ids_to_speech_tokens(speech_ids):
    """ Convert speech IDs into token strings """
    return [f"<|s_{speech_id}|>" for speech_id in speech_ids]

def extract_speech_ids(speech_tokens_str):
    """ Extract speech token IDs from the token strings """
    speech_ids = []
    for token_str in speech_tokens_str:
        if token_str.startswith('<|s_') and token_str.endswith('|>'):
            num_str = token_str[4:-2]
            speech_ids.append(int(num_str))
        else:
            print(f"Unexpected token: {token_str}")
    return speech_ids

# TTS Generation
with torch.no_grad():
    formatted_text = f"<|TEXT_UNDERSTANDING_START|>{input_text}<|TEXT_UNDERSTANDING_END|>"

    # Tokenize the input. The assistant turn is pre-seeded with
    # <|SPEECH_GENERATION_START|>; continue_final_message=True makes the
    # model continue that turn instead of starting a new one.
    chat = [
        {"role": "user", "content": "Convert the text to speech:" + formatted_text},
        {"role": "assistant", "content": "<|SPEECH_GENERATION_START|>"}
    ]

    input_ids = tokenizer.apply_chat_template(
        chat,
        tokenize=True,
        return_tensors='pt',
        continue_final_message=True
    ).to("cuda")

    # Generation stops once the model emits <|SPEECH_GENERATION_END|>
    speech_end_id = tokenizer.convert_tokens_to_ids('<|SPEECH_GENERATION_END|>')

    # Generate speech tokens
    outputs = model.generate(
        input_ids,
        max_length=2048,  # Trained with a max length of 2048
        eos_token_id=speech_end_id,
        do_sample=True,
        top_p=1,          # Nucleus sampling threshold; 1 keeps the full distribution
        temperature=0.8,  # Controls randomness in output
    )

    # Extract the generated speech tokens
    # (skip the prompt and drop the trailing <|SPEECH_GENERATION_END|>)
    generated_ids = outputs[0][input_ids.shape[1]:-1]
    speech_tokens = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    speech_tokens = extract_speech_ids(speech_tokens)

    # Convert to tensor for decoding: shape (batch, codebook, frames)
    speech_tokens = torch.tensor(speech_tokens).cuda().unsqueeze(0).unsqueeze(0)

    # Decode to waveform
    gen_wav = Codec_model.decode_code(speech_tokens)

    # Save generated audio (xcodec2 operates at 16 kHz)
    sf.write("gen.wav", gen_wav[0, 0, :].cpu().numpy(), 16000)
|
``` |
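
The output is mono 16 kHz audio, the rate the xcodec2 codec operates at; resample afterwards if your pipeline expects a different rate. Lowering `temperature` or `top_p` makes the prosody more deterministic at the cost of variety.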
|
### Synthesizing Speech with a Voice Prompt
|
```python |
|
from transformers import AutoTokenizer, AutoModelForCausalLM
from xcodec2.modeling_xcodec2 import XCodec2Model
import torch
import soundfile as sf

# Define the model
model_name = "NeuralAudioAI/NA_base"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model.eval().to("cuda")

# Load the Codec model
codec_model_path = "NeuralAudioAI/xcodec2"
Codec_model = XCodec2Model.from_pretrained(codec_model_path)
Codec_model.eval().cuda()

# Only 16 kHz mono speech is supported!
prompt_wav, sr = sf.read("prompt.wav")  # Use an appropriate prompt speech file
assert sr == 16000, "prompt.wav must be sampled at 16 kHz"
prompt_wav = torch.from_numpy(prompt_wav).float().unsqueeze(0)

# Define input text: prompt_text should be the transcript of prompt.wav;
# the model continues the prompt speech, reading target_text in the same voice
prompt_text = "This is a sample prompt speech input."
target_text = "This is the generated speech continuation."
input_text = prompt_text + target_text

def ids_to_speech_tokens(speech_ids):
    """ Convert speech IDs into token strings """
    return [f"<|s_{speech_id}|>" for speech_id in speech_ids]

def extract_speech_ids(speech_tokens_str):
    """ Extract speech token IDs from the token strings """
    speech_ids = []
    for token_str in speech_tokens_str:
        if token_str.startswith('<|s_') and token_str.endswith('|>'):
            num_str = token_str[4:-2]
            speech_ids.append(int(num_str))
        else:
            print(f"Unexpected token: {token_str}")
    return speech_ids

# TTS Generation with Speech Prompt
with torch.no_grad():
    # Encode the prompt wav into codec tokens
    vq_code_prompt = Codec_model.encode_code(input_waveform=prompt_wav)
    print("Prompt VQ Code Shape:", vq_code_prompt.shape)

    # Shape (batch, codebooks, frames); take the single codebook stream
    vq_code_prompt = vq_code_prompt[0, 0, :]

    # Convert int 12345 to token <|s_12345|>
    speech_ids_prefix = ids_to_speech_tokens(vq_code_prompt)

    formatted_text = f"<|TEXT_UNDERSTANDING_START|>{input_text}<|TEXT_UNDERSTANDING_END|>"

    # Tokenize the text, seeding the assistant turn with the prompt's speech tokens
    chat = [
        {"role": "user", "content": "Convert the text to speech:" + formatted_text},
        {"role": "assistant", "content": "<|SPEECH_GENERATION_START|>" + ''.join(speech_ids_prefix)}
    ]

    input_ids = tokenizer.apply_chat_template(
        chat,
        tokenize=True,
        return_tensors='pt',
        continue_final_message=True
    ).to("cuda")

    speech_end_id = tokenizer.convert_tokens_to_ids('<|SPEECH_GENERATION_END|>')

    # Generate the speech autoregressively
    outputs = model.generate(
        input_ids,
        max_length=2048,  # Trained with a max length of 2048
        eos_token_id=speech_end_id,
        do_sample=True,
        top_p=1,          # Nucleus sampling threshold; 1 keeps the full distribution
        temperature=0.8,
    )

    # Extract the speech tokens: back up over the prompt's speech tokens so
    # the decoded audio includes the prompt, and drop the trailing
    # <|SPEECH_GENERATION_END|>
    generated_ids = outputs[0][input_ids.shape[1] - len(speech_ids_prefix):-1]
    speech_tokens = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    speech_tokens = extract_speech_ids(speech_tokens)

    # Convert to tensor for decoding
    speech_tokens = torch.tensor(speech_tokens).cuda().unsqueeze(0).unsqueeze(0)

    # Decode to waveform
    gen_wav = Codec_model.decode_code(speech_tokens)

    # If you only need the generated continuation, trim the prompt audio:
    # gen_wav = gen_wav[:, :, prompt_wav.shape[1]:]

    # Save generated audio
    sf.write("gen.wav", gen_wav[0, 0, :].cpu().numpy(), 16000)
|
``` |
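
Two practical notes: the prompt's speech tokens count toward the `max_length=2048` generation budget, so a short prompt clip leaves more room for the continuation; and `prompt.wav` must be 16 kHz mono. If your recording is not, a preprocessing step along these lines can help (a sketch assuming torchaudio as an extra dependency; `prompt_44k.wav` is a hypothetical file name):

```python
# Optional preprocessing sketch: bring an arbitrary recording down to the
# 16 kHz mono format the codec expects. torchaudio is an assumed extra
# dependency here, and "prompt_44k.wav" is a hypothetical input file.
import torchaudio

wav, sr = torchaudio.load("prompt_44k.wav")           # (channels, frames)
wav = wav.mean(dim=0, keepdim=True)                   # downmix to mono
wav = torchaudio.functional.resample(wav, sr, 16000)  # resample to 16 kHz
torchaudio.save("prompt.wav", wav, 16000)
```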