|
--- |
|
license: mit |
|
|
language: |
|
- en |
|
- es |
|
- ko |
|
- zh |
|
- pl |
|
- pt |
|
- uk |
|
- de |
|
- fr |
|
- el |
|
- ru |
|
pipeline_tag: text-to-speech |
|
library_name: transformers |
|
tags: |
|
- tts |
|
--- |
|
|
|
[Live demo on Hugging Face Spaces](https://neuralaudioai-na-base.hf.space)
|
|
|
## Model Information |
|
**NA_base** is an open-source Text-to-Speech (TTS) model designed for **high-quality, real-time speech synthesis**. It pairs a causal language model with a neural audio codec and is optimized for **speed, efficiency, and multilingual support**, making it a practical choice for developers, businesses, and researchers.
|
|
|
**Key Features**: |
|
- Supports **11 languages** (see the `language` list above)
|
- **Fast real-time inference** |
|
- Natural-sounding, **human-like speech** |
|
- Designed for **deployment in cloud, edge, and offline environments** |
|
|
|
## How It Works |
|
**NA_base** treats speech synthesis as language modeling: a causal language model generates discrete speech tokens from raw text, and the xcodec2 neural codec decodes those tokens into a 16 kHz waveform. The model is **lightweight**, efficient, and trained on high-quality datasets for robust generalization.
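
To make the interface concrete, here is a minimal sketch (inferred from the usage examples below, not an official API) of how codec IDs round-trip through the `<|s_N|>` token strings the model reads and writes:

```python
# Round-trip between codec IDs and <|s_N|> token strings
# (format taken from the helper functions in the examples below).
ids = [17, 4052, 993]
tokens = [f"<|s_{i}|>" for i in ids]        # ['<|s_17|>', '<|s_4052|>', '<|s_993|>']
recovered = [int(t[4:-2]) for t in tokens]  # strip '<|s_' and '|>'
assert recovered == ids
```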
|
|
|
## Usage |
|
Install the required dependencies: |
|
|
|
```bash |
|
pip install transformers torch soundfile xcodec2
|
``` |
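
Both examples below move the language model and the codec to a CUDA device, so a GPU is assumed; adapt the `.to("cuda")` / `.cuda()` calls if you run elsewhere.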
|
### Synthesizing Speech from Text |
|
```python |
|
from transformers import AutoTokenizer, AutoModelForCausalLM
from xcodec2.modeling_xcodec2 import XCodec2Model
import torch
import soundfile as sf

# Define the model
model_name = "NeuralAudioAI/NA_base"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model.eval().to("cuda")

# Load the Codec model
codec_model_path = "NeuralAudioAI/xcodec2"
Codec_model = XCodec2Model.from_pretrained(codec_model_path)
Codec_model.eval().cuda()

# Input text for synthesis
input_text = "Dealing with family secrets is never easy. Yet, sometimes, omission is a form of protection, intending to safeguard some from the harsh truths. One day, I hope you understand the reasons behind my actions. Until then, please, bear with me."

def ids_to_speech_tokens(speech_ids):
    """ Convert speech IDs into token strings """
    return [f"<|s_{speech_id}|>" for speech_id in speech_ids]

def extract_speech_ids(speech_tokens_str):
    """ Extract speech token IDs from the token strings """
    speech_ids = []
    for token_str in speech_tokens_str:
        if token_str.startswith('<|s_') and token_str.endswith('|>'):
            num_str = token_str[4:-2]
            speech_ids.append(int(num_str))
        else:
            print(f"Unexpected token: {token_str}")
    return speech_ids

# TTS Generation
with torch.no_grad():
    formatted_text = f"<|TEXT_UNDERSTANDING_START|>{input_text}<|TEXT_UNDERSTANDING_END|>"

    # Tokenize the input. The assistant turn is pre-seeded with
    # <|SPEECH_GENERATION_START|>; continue_final_message=True makes the
    # model continue that turn instead of starting a new one.
    chat = [
        {"role": "user", "content": "Convert the text to speech:" + formatted_text},
        {"role": "assistant", "content": "<|SPEECH_GENERATION_START|>"}
    ]

    input_ids = tokenizer.apply_chat_template(
        chat,
        tokenize=True,
        return_tensors='pt',
        continue_final_message=True
    ).to("cuda")

    # Generation stops once the model emits <|SPEECH_GENERATION_END|>
    speech_end_id = tokenizer.convert_tokens_to_ids('<|SPEECH_GENERATION_END|>')

    # Generate speech tokens
    outputs = model.generate(
        input_ids,
        max_length=2048,  # Trained with a max length of 2048
        eos_token_id=speech_end_id,
        do_sample=True,
        top_p=1,          # Nucleus sampling threshold; 1 keeps the full distribution
        temperature=0.8,  # Controls randomness in output
    )

    # Extract the generated speech tokens
    # (skip the prompt and drop the trailing <|SPEECH_GENERATION_END|>)
    generated_ids = outputs[0][input_ids.shape[1]:-1]
    speech_tokens = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    speech_tokens = extract_speech_ids(speech_tokens)

    # Convert to tensor for decoding: shape (batch, codebook, frames)
    speech_tokens = torch.tensor(speech_tokens).cuda().unsqueeze(0).unsqueeze(0)

    # Decode to waveform
    gen_wav = Codec_model.decode_code(speech_tokens)

    # Save generated audio (xcodec2 operates at 16 kHz)
    sf.write("gen.wav", gen_wav[0, 0, :].cpu().numpy(), 16000)
|
``` |
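
The output is mono 16 kHz audio, the rate the xcodec2 codec operates at; resample afterwards if your pipeline expects a different rate. Lowering `temperature` or `top_p` makes the prosody more deterministic at the cost of variety.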
|
### Synthesizing Speech with a Voice Prompt
|
```python |
|
from transformers import AutoTokenizer, AutoModelForCausalLM
from xcodec2.modeling_xcodec2 import XCodec2Model
import torch
import soundfile as sf

# Define the model
model_name = "NeuralAudioAI/NA_base"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model.eval().to("cuda")

# Load the Codec model
codec_model_path = "NeuralAudioAI/xcodec2"
Codec_model = XCodec2Model.from_pretrained(codec_model_path)
Codec_model.eval().cuda()

# Only 16 kHz mono speech is supported!
prompt_wav, sr = sf.read("prompt.wav")  # Use an appropriate prompt speech file
assert sr == 16000, "prompt.wav must be sampled at 16 kHz"
prompt_wav = torch.from_numpy(prompt_wav).float().unsqueeze(0)

# Define input text: prompt_text should be the transcript of prompt.wav;
# the model continues the prompt speech, reading target_text in the same voice
prompt_text = "This is a sample prompt speech input."
target_text = "This is the generated speech continuation."
input_text = prompt_text + target_text

def ids_to_speech_tokens(speech_ids):
    """ Convert speech IDs into token strings """
    return [f"<|s_{speech_id}|>" for speech_id in speech_ids]

def extract_speech_ids(speech_tokens_str):
    """ Extract speech token IDs from the token strings """
    speech_ids = []
    for token_str in speech_tokens_str:
        if token_str.startswith('<|s_') and token_str.endswith('|>'):
            num_str = token_str[4:-2]
            speech_ids.append(int(num_str))
        else:
            print(f"Unexpected token: {token_str}")
    return speech_ids

# TTS Generation with Speech Prompt
with torch.no_grad():
    # Encode the prompt wav into codec tokens
    vq_code_prompt = Codec_model.encode_code(input_waveform=prompt_wav)
    print("Prompt VQ Code Shape:", vq_code_prompt.shape)

    # Shape (batch, codebooks, frames); take the single codebook stream
    vq_code_prompt = vq_code_prompt[0, 0, :]

    # Convert int 12345 to token <|s_12345|>
    speech_ids_prefix = ids_to_speech_tokens(vq_code_prompt)

    formatted_text = f"<|TEXT_UNDERSTANDING_START|>{input_text}<|TEXT_UNDERSTANDING_END|>"

    # Tokenize the text, seeding the assistant turn with the prompt's speech tokens
    chat = [
        {"role": "user", "content": "Convert the text to speech:" + formatted_text},
        {"role": "assistant", "content": "<|SPEECH_GENERATION_START|>" + ''.join(speech_ids_prefix)}
    ]

    input_ids = tokenizer.apply_chat_template(
        chat,
        tokenize=True,
        return_tensors='pt',
        continue_final_message=True
    ).to("cuda")

    speech_end_id = tokenizer.convert_tokens_to_ids('<|SPEECH_GENERATION_END|>')

    # Generate the speech autoregressively
    outputs = model.generate(
        input_ids,
        max_length=2048,  # Trained with a max length of 2048
        eos_token_id=speech_end_id,
        do_sample=True,
        top_p=1,          # Nucleus sampling threshold; 1 keeps the full distribution
        temperature=0.8,
    )

    # Extract the speech tokens: back up over the prompt's speech tokens so
    # the decoded audio includes the prompt, and drop the trailing
    # <|SPEECH_GENERATION_END|>
    generated_ids = outputs[0][input_ids.shape[1] - len(speech_ids_prefix):-1]
    speech_tokens = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    speech_tokens = extract_speech_ids(speech_tokens)

    # Convert to tensor for decoding
    speech_tokens = torch.tensor(speech_tokens).cuda().unsqueeze(0).unsqueeze(0)

    # Decode to waveform
    gen_wav = Codec_model.decode_code(speech_tokens)

    # If you only need the generated continuation, trim the prompt audio:
    # gen_wav = gen_wav[:, :, prompt_wav.shape[1]:]

    # Save generated audio
    sf.write("gen.wav", gen_wav[0, 0, :].cpu().numpy(), 16000)
|
``` |
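
Two practical notes: the prompt's speech tokens count toward the `max_length=2048` generation budget, so a short prompt clip leaves more room for the continuation; and `prompt.wav` must be 16 kHz mono. If your recording is not, a preprocessing step along these lines can help (a sketch assuming torchaudio as an extra dependency; `prompt_44k.wav` is a hypothetical file name):

```python
# Optional preprocessing sketch: bring an arbitrary recording down to the
# 16 kHz mono format the codec expects. torchaudio is an assumed extra
# dependency here, and "prompt_44k.wav" is a hypothetical input file.
import torchaudio

wav, sr = torchaudio.load("prompt_44k.wav")           # (channels, frames)
wav = wav.mean(dim=0, keepdim=True)                   # downmix to mono
wav = torchaudio.functional.resample(wav, sr, 16000)  # resample to 16 kHz
torchaudio.save("prompt.wav", wav, 16000)
```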