How do I get it to use the GPU?
I understand that hooking it up to a llama-cpp instance that uses the GPU is the way, but the only way to do that without getting a bit complex is to use pre-existing binaries like those that come with ollama, LM Studio, etc.
Can the creators find a way to hook it up into a version of torch, or is it already set in stone?
here is a gradio demo that uses the gpu
import gradio as gr
import soundfile as sf
import tempfile
import os
from pathlib import Path
from neuttsair.neutts import NeuTTSAir
# Module-level state: the loaded TTS model and which backbone repo it came from.
# `tts_model` is lazily (re)created by initialize_model(); `current_backbone`
# tracks the loaded repo so re-selecting the same backbone skips a reload.
tts_model = None
current_backbone = None
def initialize_model(backbone_choice):
    """Initialize (or reinitialize) the global TTS model for the chosen backbone.

    Args:
        backbone_choice: One of the UI labels "Standard (PyTorch)",
            "Q8 GGUF (Faster)", or "Q4 GGUF (Fastest)".

    Returns:
        A human-readable status string.

    Raises:
        KeyError: If backbone_choice is not a known label.
    """
    global tts_model, current_backbone
    backbone_map = {
        "Standard (PyTorch)": "neuphonic/neutts-air",
        "Q8 GGUF (Faster)": "neuphonic/neutts-air-q8-gguf",
        "Q4 GGUF (Fastest)": "neuphonic/neutts-air-q4-gguf",
    }
    backbone_repo = backbone_map[backbone_choice]
    # Only pay the expensive model load when the backbone actually changed.
    if current_backbone != backbone_repo:
        print(f"Initializing model with {backbone_repo}...")
        tts_model = NeuTTSAir(
            backbone_repo=backbone_repo,
            backbone_device="cuda",  # requires a CUDA-enabled torch build (see thread)
            codec_repo="neuphonic/neucodec",
            codec_device="cuda",
        )
        current_backbone = backbone_repo
        # NOTE(review): the original status strings were mojibake'd in the paste
        # ("β" + line break); "✅" reconstructed — confirm against the original file.
        return f"✅ Model loaded: {backbone_choice}"
    return f"✅ Using: {backbone_choice}"
def generate_speech(input_text, ref_audio, ref_text, backbone_choice, progress=gr.Progress()):
    """Generate speech from text, cloning the voice in the reference audio.

    Args:
        input_text: Text to synthesize in the cloned voice.
        ref_audio: Filepath of the reference audio clip.
        ref_text: Transcript of the reference audio.
        backbone_choice: UI label of the backbone model to use.
        progress: Gradio progress tracker. The instance-as-default pattern is
            gradio's documented idiom for progress injection, not a
            mutable-default bug — do not "fix" it.

    Returns:
        Tuple of (output_wav_path, status_message); the path is None on failure.
    """
    try:
        # Validate all inputs up front so we fail before the expensive model load.
        if not input_text or not input_text.strip():
            return None, "❌ Please enter text to synthesize."
        if ref_audio is None:
            return None, "❌ Please upload a reference audio file."
        if not ref_text or not ref_text.strip():
            return None, "❌ Please enter the reference text."

        progress(0.1, desc="Loading model...")
        initialize_model(backbone_choice)

        progress(0.3, desc="Encoding reference audio...")
        ref_codes = tts_model.encode_reference(ref_audio)

        progress(0.6, desc="Generating speech...")
        wav = tts_model.infer(input_text, ref_codes, ref_text.strip())

        progress(0.9, desc="Saving audio...")
        # delete=False: gradio serves the file after this function returns,
        # so it must outlive the context manager.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
            sf.write(tmp_file.name, wav, 24000)
            output_path = tmp_file.name

        # NOTE(review): success/failure emoji reconstructed from mojibake'd paste.
        return output_path, f"✅ Speech generated successfully! ({len(wav)/24000:.2f}s)"
    except Exception as e:
        # Top-level UI boundary: surface any error to the user instead of crashing.
        return None, f"❌ Error: {str(e)}"
def load_example(example_name):
    """Load a built-in example reference audio and its transcript.

    Args:
        example_name: "Dave" or "Jo"; any other value yields empty outputs.

    Returns:
        Tuple of (audio_path_or_None, transcript, status_message) for the UI.
    """
    example_map = {
        "Dave": "samples/dave",
        "Jo": "samples/jo",
    }
    if example_name not in example_map:
        return None, "", ""
    base = Path(example_map[example_name])
    audio_path = base.with_suffix(".wav")
    text_path = base.with_suffix(".txt")
    # Check both files before reading so a missing sample reports a friendly
    # status instead of raising FileNotFoundError (the original eagerly opened
    # every transcript with bare open() and never closed the handles).
    if audio_path.exists() and text_path.exists():
        return str(audio_path), text_path.read_text().strip(), "✅ Example loaded!"
    return None, "", "❌ Example file not found."
# ---------------------------------------------------------------------------
# Gradio interface.
# NOTE(review): several emoji in the markdown/labels below arrived mojibake'd
# in the paste (e.g. "ποΈ", "π΅"); they are kept verbatim — restore the real
# emoji from the original file before shipping.
# ---------------------------------------------------------------------------
with gr.Blocks(title="NeuTTS Air - Voice Cloning Demo", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
# ποΈ NeuTTS Air - Voice Cloning Demo
Create ultra-realistic voice clones with as little as 3 seconds of audio! This demo uses NeuTTS Air,
a state-of-the-art on-device TTS model with instant voice cloning capabilities.
### π How to use:
1. **Upload reference audio** (3-15 seconds, clean speech, WAV format recommended)
2. **Enter what the reference says** (the transcript of your reference audio)
3. **Type what you want to synthesize** (the text in the cloned voice)
4. **Choose a model** (Q4 GGUF is fastest for CPU)
5. **Click Generate!**
""")

    with gr.Row():
        # Left column: the voice to clone.
        with gr.Column(scale=1):
            gr.Markdown("### π΅ Reference Audio (Voice to Clone)")
            with gr.Row():
                example_selector = gr.Radio(
                    choices=["Dave", "Jo", "Custom"],
                    value="Custom",
                    label="Quick Examples or Custom",
                    info="Try built-in examples or upload your own",
                )
            ref_audio_input = gr.Audio(
                label="Reference Audio (3-15s of clean speech)",
                type="filepath",
            )
            ref_text_input = gr.Textbox(
                label="Reference Text (Transcript of the reference audio)",
                placeholder="What does the reference audio say?",
                lines=3,
            )
            # Hidden status box; only used as an output slot for example loading.
            example_status = gr.Textbox(label="Status", interactive=False, visible=False)

        # Right column: what to say and with which model.
        with gr.Column(scale=1):
            gr.Markdown("### βοΈ Text to Synthesize")
            input_text = gr.Textbox(
                label="Input Text (Text to synthesize in the cloned voice)",
                placeholder="Enter the text you want to synthesize in the cloned voice...",
                lines=5,
            )
            backbone_choice = gr.Radio(
                choices=["Standard (PyTorch)", "Q8 GGUF (Faster)", "Q4 GGUF (Fastest)"],
                value="Q4 GGUF (Fastest)",
                label="Model Selection (Q4 GGUF recommended for CPU)",
            )
            generate_btn = gr.Button("π΅ Generate Speech", variant="primary", size="lg")

    with gr.Row():
        with gr.Column():
            output_audio = gr.Audio(
                label="Generated Speech",
                type="filepath",
            )
            status_output = gr.Textbox(label="Generation Status", interactive=False)

    gr.Markdown("""
### π Tips for Best Results:
- **Reference audio should be**: mono, 16-44kHz, 3-15 seconds, clean with minimal background noise
- **Natural speech works best**: conversational tone with few pauses
- **GGUF models**: Much faster on CPU, slight quality trade-off
- **Longer texts**: May take more time but produce great results
### β οΈ Responsible Use:
- Only clone voices you have permission to use
- Generated audio is watermarked for detection
- Built by [Neuphonic](https://neuphonic.com)
""")

    # --- Event handlers ---
    def handle_example_change(example_name):
        """Route the radio selection to load_example; 'Custom' clears the fields."""
        if example_name in ["Dave", "Jo"]:
            return load_example(example_name)
        return None, "", ""

    example_selector.change(
        fn=handle_example_change,
        inputs=[example_selector],
        outputs=[ref_audio_input, ref_text_input, example_status],
    )
    generate_btn.click(
        fn=generate_speech,
        inputs=[input_text, ref_audio_input, ref_text_input, backbone_choice],
        outputs=[output_audio, status_output],
    )

    # One-click example texts for quick testing.
    gr.Examples(
        examples=[
            ["Hey there, I'm exploring the world of voice AI and it's absolutely fascinating!"],
            ["The quick brown fox jumps over the lazy dog."],
            ["My name is Alex, and I'm 28 years old. I work in technology and love building innovative products."],
        ],
        inputs=[input_text],
        label="Example Texts to Try",
    )
if __name__ == "__main__":
    # Bind to all interfaces so the demo is reachable from other machines on
    # the LAN; keep gradio's public share link disabled.
    demo.launch(
        share=False,
        server_name="0.0.0.0",
        server_port=7868,
    )
It appears you've just changed "cpu" to "cuda" — that doesn't seem to work; it fails with:
"AssertionError: Torch not compiled with CUDA enabled"
@jattoedaltni
It's because in requirements.txt you have "torch" compiled for CPU only. To change it to a CUDA version, follow these steps:

1. Check https://pytorch.org/get-started/locally/ to get the necessary pip command for your version of CUDA. Presuming you have CUDA v12.6, you will get
   `pip3 install torch torchvision --index-url https://download.pytorch.org/whl/cu126`
   You need only the link part.
2. Open requirements.txt. Remove `torch` and add this to the end of the file:
   ```
   --extra-index-url https://download.pytorch.org/whl/cu126
   torch
   torchvision
   ```
3. Execute `pip uninstall torch` to remove the previous version of torch.
4. Execute `pip install -r requirements.txt`. A different build of torch should download (it weighs about 2 GB).
5. Use the gradio example from the previous commenter above, and run the app. It should use the GPU now.