
How do I get it to use the GPU?

#13
by jattoedaltni - opened

I understand that hooking it up to a llama-cpp instance that uses the GPU is the way to go, but the only way to do that without things getting a bit complex is to use pre-existing binaries like those that come with ollama, LM Studio, etc.
Can the creators find a way to hook it up to a GPU-enabled version of torch, or is it already set in stone?

Here is a Gradio demo that uses the GPU:

import gradio as gr
import soundfile as sf
import tempfile
import os
from pathlib import Path
from neuttsair.neutts import NeuTTSAir

# Global state: the TTS model and the currently loaded backbone
tts_model = None
current_backbone = None

def initialize_model(backbone_choice):
    """Initialize or reinitialize the TTS model with the selected backbone."""
    global tts_model, current_backbone
    
    backbone_map = {
        "Standard (PyTorch)": "neuphonic/neutts-air",
        "Q8 GGUF (Faster)": "neuphonic/neutts-air-q8-gguf",
        "Q4 GGUF (Fastest)": "neuphonic/neutts-air-q4-gguf"
    }
    
    backbone_repo = backbone_map[backbone_choice]
    
    # Only reinitialize if the backbone changed
    if current_backbone != backbone_repo:
        print(f"Initializing model with {backbone_repo}...")
        tts_model = NeuTTSAir(
            backbone_repo=backbone_repo,
            backbone_device="cuda",  # Changed from "cpu"
            codec_repo="neuphonic/neucodec",
            codec_device="cuda"      # Changed from "cpu"
        )
        current_backbone = backbone_repo
        return f"✅ Model loaded: {backbone_choice}"
    return f"✅ Using: {backbone_choice}"

def generate_speech(input_text, ref_audio, ref_text, backbone_choice, progress=gr.Progress()):
    """Generate speech from text using reference audio for voice cloning."""
    try:
        if not input_text or not input_text.strip():
            return None, "❌ Please enter text to synthesize."
        
        if ref_audio is None:
            return None, "❌ Please upload a reference audio file."
        
        if not ref_text or not ref_text.strip():
            return None, "❌ Please enter the reference text."
        
        # Initialize model if needed
        progress(0.1, desc="Loading model...")
        status = initialize_model(backbone_choice)
        
        # Encode reference audio
        progress(0.3, desc="Encoding reference audio...")
        ref_codes = tts_model.encode_reference(ref_audio)
        
        # Generate speech
        progress(0.6, desc="Generating speech...")
        wav = tts_model.infer(input_text, ref_codes, ref_text.strip())
        
        # Save to temporary file
        progress(0.9, desc="Saving audio...")
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
            sf.write(tmp_file.name, wav, 24000)
            output_path = tmp_file.name
        
        return output_path, f"✅ Speech generated successfully! ({len(wav)/24000:.2f}s)"
    
    except Exception as e:
        return None, f"❌ Error: {str(e)}"

def load_example(example_name):
    """Load example reference audio and text."""
    example_map = {
        "Dave": ("samples/dave.wav", "samples/dave.txt"),
        "Jo": ("samples/jo.wav", "samples/jo.txt")
    }
    
    if example_name in example_map:
        audio_path, text_path = example_map[example_name]
        if os.path.exists(audio_path) and os.path.exists(text_path):
            # Read the transcript here, after the existence check, instead of eagerly in example_map
            return audio_path, Path(text_path).read_text().strip(), "✅ Example loaded!"
        else:
            return None, "", "❌ Example file not found."
    return None, "", ""

# Create Gradio interface
with gr.Blocks(title="NeuTTS Air - Voice Cloning Demo", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🎙️ NeuTTS Air - Voice Cloning Demo
    
    Create ultra-realistic voice clones with as little as 3 seconds of audio! This demo uses NeuTTS Air, 
    a state-of-the-art on-device TTS model with instant voice cloning capabilities.
    
    ### 📋 How to use:
    1. **Upload reference audio** (3-15 seconds, clean speech, WAV format recommended)
    2. **Enter what the reference says** (the transcript of your reference audio)
    3. **Type what you want to synthesize** (the text in the cloned voice)
    4. **Choose a model** (Q4 GGUF is fastest for CPU)
    5. **Click Generate!**
    """)
    
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### 🎵 Reference Audio (Voice to Clone)")
            
            # Example selector
            with gr.Row():
                example_selector = gr.Radio(
                    choices=["Dave", "Jo", "Custom"],
                    value="Custom",
                    label="Quick Examples or Custom",
                    info="Try built-in examples or upload your own"
                )
            
            ref_audio_input = gr.Audio(
                label="Reference Audio (3-15s of clean speech)",
                type="filepath"
            )
            
            ref_text_input = gr.Textbox(
                label="Reference Text (Transcript of the reference audio)",
                placeholder="What does the reference audio say?",
                lines=3
            )
            
            example_status = gr.Textbox(label="Status", interactive=False, visible=False)
        
        with gr.Column(scale=1):
            gr.Markdown("### ✍️ Text to Synthesize")
            
            input_text = gr.Textbox(
                label="Input Text (Text to synthesize in the cloned voice)",
                placeholder="Enter the text you want to synthesize in the cloned voice...",
                lines=5
            )
            
            backbone_choice = gr.Radio(
                choices=["Standard (PyTorch)", "Q8 GGUF (Faster)", "Q4 GGUF (Fastest)"],
                value="Q4 GGUF (Fastest)",
                label="Model Selection (Q4 GGUF recommended for CPU)"
            )
            
            generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")
    
    with gr.Row():
        with gr.Column():
            output_audio = gr.Audio(
                label="Generated Speech",
                type="filepath"
            )
            status_output = gr.Textbox(label="Generation Status", interactive=False)
    
    gr.Markdown("""
    ### 📌 Tips for Best Results:
    - **Reference audio should be**: mono, 16-44 kHz, 3-15 seconds, clean with minimal background noise
    - **Natural speech works best**: conversational tone with few pauses
    - **GGUF models**: Much faster on CPU, slight quality trade-off
    - **Longer texts**: May take more time but produce great results
    
    ### ⚠️ Responsible Use:
    - Only clone voices you have permission to use
    - Generated audio is watermarked for detection
    - Built by [Neuphonic](https://neuphonic.com)
    """)
    
    # Event handlers
    def handle_example_change(example_name):
        if example_name in ["Dave", "Jo"]:
            return load_example(example_name)
        return None, "", ""
    
    example_selector.change(
        fn=handle_example_change,
        inputs=[example_selector],
        outputs=[ref_audio_input, ref_text_input, example_status]
    )
    
    generate_btn.click(
        fn=generate_speech,
        inputs=[input_text, ref_audio_input, ref_text_input, backbone_choice],
        outputs=[output_audio, status_output]
    )
    
    # Example inputs for quick testing
    gr.Examples(
        examples=[
            ["Hey there, I'm exploring the world of voice AI and it's absolutely fascinating!"],
            ["The quick brown fox jumps over the lazy dog."],
            ["My name is Alex, and I'm 28 years old. I work in technology and love building innovative products."],
        ],
        inputs=[input_text],
        label="Example Texts to Try"
    )

if __name__ == "__main__":
    demo.launch(
        share=False,
        server_name="0.0.0.0",
        server_port=7868
    )
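
One optional tweak to the constructor call in initialize_model (a small sketch, not part of the demo above): pick the device at startup so the app still launches on a machine where torch has no CUDA support, instead of failing on the hard-coded "cuda":

import torch

# Use the GPU when a CUDA-enabled torch build is available, otherwise fall back to CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

tts_model = NeuTTSAir(
    backbone_repo=backbone_repo,
    backbone_device=device,
    codec_repo="neuphonic/neucodec",
    codec_device=device
)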

Appears you've just changed CPU to CUDA -- doesn't seem to work:
"AssertionError: Torch not compiled with CUDA enabled"

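Before reinstalling anything, a quick way to confirm that the error above comes from a CPU-only torch build:

import torch

print(torch.__version__)
print(torch.version.cuda)         # None on a CPU-only build
print(torch.cuda.is_available())  # must be True for backbone_device="cuda" to work
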
@jattoedaltni
it's because the requirements.txt installs a "torch" build compiled for CPU only. To switch to a CUDA-enabled build, follow these steps:

  1. check https://pytorch.org/get-started/locally/ to get the pip command for your version of CUDA. Assuming you have CUDA v12.6, you will get pip3 install torch torchvision --index-url https://download.pytorch.org/whl/cu126. You only need the link part (the index URL).

  2. open requirements.txt. Remove torch and add this to the end of the file:
    --extra-index-url https://download.pytorch.org/whl/cu126
    torch
    torchvision

  3. execute pip uninstall torch to remove the previous version of torch

  4. execute pip install -r requirements.txt. A different build of torch should download (it's about 2 GB)

  5. Use the Gradio example from the previous commenter above, and run the app. It should use the GPU now
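
After step 4, a quick sanity check that the CUDA build is installed and that the model actually ends up on the GPU (a small sketch reusing the constructor arguments from the Gradio demo above, with the standard PyTorch backbone):

import torch
from neuttsair.neutts import NeuTTSAir

# Should be True once the CUDA wheel is installed
assert torch.cuda.is_available(), "torch still has no CUDA support"

tts = NeuTTSAir(
    backbone_repo="neuphonic/neutts-air",
    backbone_device="cuda",
    codec_repo="neuphonic/neucodec",
    codec_device="cuda"
)

# A non-zero value here means the model weights were actually allocated on the GPU
print(f"{torch.cuda.memory_allocated() / 1e6:.0f} MB allocated on the GPU")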
