Voice-Clone-Router

Paused

App Files Files Community

kemuriririn committed on Jun 3

Commit

1edfb59

1 Parent(s): 87f7c84

(wip)modify for voice clone

Browse files

Files changed (6) hide show

README.md +4 -4
app.py +22 -6
templates/arena.html +43 -16
templates/base.html +6 -6
tts.old.py +0 -117
tts.py +112 -63

README.md CHANGED Viewed

@@ -1,16 +1,16 @@
 ---
-title: TTS Arena V2
 emoji: 🏆
 colorFrom: blue
 colorTo: blue
 sdk: gradio
 app_file: app.py
-short_description: Vote on the latest TTS models!
 pinned: true
 hf_oauth: true
 ---
-Please see the [GitHub repo](https://github.com/TTS-AGI/TTS-Arena-V2) for information.
-Join the [Discord server](https://discord.gg/HB8fMR6GTr) for updates and support.

 ---
+title: Voice Clone Arena
 emoji: 🏆
 colorFrom: blue
 colorTo: blue
 sdk: gradio
 app_file: app.py
+short_description: Vote on the latest Voice Clone TTS models!
 pinned: true
 hf_oauth: true
 ---
+[//]: # (Please see the [GitHub repo]&#40;https://github.com/TTS-AGI/TTS-Arena-V2&#41; for information.)
+[//]: # (Join the [Discord server]&#40;https://discord.gg/HB8fMR6GTr&#41; for updates and support.)

app.py CHANGED Viewed

@@ -509,8 +509,19 @@ def generate_tts():
     if app.config["TURNSTILE_ENABLED"] and not session.get("turnstile_verified"):
         return jsonify({"error": "Turnstile verification required"}), 403
-    data = request.json
-    text = data.get("text", "").strip() # Ensure text is stripped
     if not text or len(text) > 1000:
         return jsonify({"error": "Invalid or too long text"}), 400
@@ -584,9 +595,8 @@ def generate_tts():
         # Function to process a single model (generate directly to TEMP_AUDIO_DIR, not cache subdir)
         def process_model_on_the_fly(model):
-             # Generate and save directly to the main temp dir
-             # Assume predict_tts handles saving temporary files
-             temp_audio_path = predict_tts(text, model.id)
              if not temp_audio_path or not os.path.exists(temp_audio_path):
                  raise ValueError(f"predict_tts failed for model {model.id}")
@@ -597,7 +607,6 @@ def generate_tts():
              return {"model_id": model.id, "audio_path": dest_path}
         # Use ThreadPoolExecutor to process models concurrently
         with ThreadPoolExecutor(max_workers=2) as executor:
             results = list(executor.map(process_model_on_the_fly, selected_models))
@@ -620,6 +629,10 @@ def generate_tts():
             "voted": False,
         }
         # Return audio file paths and session
         return jsonify(
             {
@@ -641,6 +654,9 @@ def generate_tts():
                          os.remove(res['audio_path'])
                      except OSError:
                          pass
         return jsonify({"error": "Failed to generate TTS"}), 500
     # --- End Cache Miss ---

     if app.config["TURNSTILE_ENABLED"] and not session.get("turnstile_verified"):
         return jsonify({"error": "Turnstile verification required"}), 403
+    # 新增：支持 multipart/form-data 以接收音频文件
+    if request.content_type and request.content_type.startswith('multipart/form-data'):
+        text = request.form.get("text", "").strip()
+        voice_file = request.files.get("voice_file")
+        reference_audio_path = None
+        if voice_file:
+            temp_voice_path = os.path.join(TEMP_AUDIO_DIR, f"ref_{uuid.uuid4()}.wav")
+            voice_file.save(temp_voice_path)
+            reference_audio_path = temp_voice_path
+    else:
+        data = request.json
+        text = data.get("text", "").strip() # Ensure text is stripped
+        reference_audio_path = None
     if not text or len(text) > 1000:
         return jsonify({"error": "Invalid or too long text"}), 400
         # Function to process a single model (generate directly to TEMP_AUDIO_DIR, not cache subdir)
         def process_model_on_the_fly(model):
+             # 传递 reference_audio_path 给 predict_tts
+             temp_audio_path = predict_tts(text, model.id, reference_audio_path=reference_audio_path)
              if not temp_audio_path or not os.path.exists(temp_audio_path):
                  raise ValueError(f"predict_tts failed for model {model.id}")
              return {"model_id": model.id, "audio_path": dest_path}
         # Use ThreadPoolExecutor to process models concurrently
         with ThreadPoolExecutor(max_workers=2) as executor:
             results = list(executor.map(process_model_on_the_fly, selected_models))
             "voted": False,
         }
+        # 清理临时参考音频文件
+        if reference_audio_path and os.path.exists(reference_audio_path):
+            os.remove(reference_audio_path)
         # Return audio file paths and session
         return jsonify(
             {
                          os.remove(res['audio_path'])
                      except OSError:
                          pass
+        # 清理临时参考音频文件
+        if reference_audio_path and os.path.exists(reference_audio_path):
+            os.remove(reference_audio_path)
         return jsonify({"error": "Failed to generate TTS"}), 500
     # --- End Cache Miss ---

templates/arena.html CHANGED Viewed

@@ -12,6 +12,11 @@
 <div id="tts-tab" class="tab-content active">
     <form class="input-container">
         <div class="input-group">
             <button type="button" class="segmented-btn random-btn" title="Roll random text">
                 <svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-shuffle-icon lucide-shuffle">
@@ -62,7 +67,7 @@
                     </span>
                 </button>
             </div>
             <div class="player">
                 <div class="player-label">Model B <span class="model-name-display"></span></div>
                 <div class="wave-player-container" data-model="b"></div>
@@ -76,7 +81,6 @@
             </div>
         </div>
     </div>
     <div class="vote-results" style="display: none;">
         <h3 class="results-heading">Vote Recorded!</h3>
         <div class="results-content">
@@ -88,11 +92,9 @@
             </div>
         </div>
     </div>
     <div class="next-round-container" style="display: none;">
         <button class="next-round-btn">Next Round</button>
     </div>
     <div id="playback-keyboard-hint" class="keyboard-hint" style="display: none;">
         Press <kbd>Space</kbd> to play/pause, <kbd>A</kbd>/<kbd>B</kbd> to vote, <kbd>R</kbd> for random text, <kbd>N</kbd> for next random round
     </div>
@@ -1017,7 +1019,8 @@
         let modelNames = { a: '', b: '' };
         let wavePlayers = { a: null, b: null };
         let cachedSentences = []; // To store sentences available in cache
         // Initialize WavePlayers with mobile settings
         wavePlayerContainers.forEach(container => {
             const model = container.dataset.model;
@@ -1137,15 +1140,31 @@
             // Reset the flag for both samples played
             bothSamplesPlayed = false;
             // Call the API to generate TTS
-            fetch('/api/tts/generate', {
-                method: 'POST',
-                headers: {
-                    'Content-Type': 'application/json',
-                },
-                body: JSON.stringify({ text: text }),
-            })
             .then(response => {
                 if (!response.ok) {
                     return response.json().then(err => {
@@ -1199,6 +1218,11 @@
         }
         function handleVote(model) {
             // Disable both vote buttons
             voteButtons.forEach(btn => {
                 btn.disabled = true;
@@ -1220,8 +1244,9 @@
             })
             .then(response => {
                 if (!response.ok) {
                     return response.json().then(err => {
-                        throw new Error(err.error || 'Failed to submit vote');
                     });
                 }
                 return response.json();
@@ -1257,9 +1282,10 @@
                 nextRoundContainer.style.display = 'block';
                 // Show success toast
-                openToast("Vote recorded successfully!", "success");
             })
             .catch(error => {
                 // Re-enable vote buttons
                 voteButtons.forEach(btn => {
                     btn.disabled = false;
@@ -1311,6 +1337,7 @@
             // Show initial hint, hide playback hint
             initialKeyboardHint.style.display = 'block';
             playbackKeyboardHint.style.display = 'none';
         }
         function handleRandom() {
@@ -1990,4 +2017,4 @@
         initializePodcastLines();
     });
 </script>
-{% endblock %}

 <div id="tts-tab" class="tab-content active">
     <form class="input-container">
+        <div class="input-group">
+            <label for="voice-file">上传参考音色：</label>
+            <input type="file" id="voice-file" accept="audio/*">
+            <audio id="voice-preview" controls style="display:none;"></audio>
+        </div>
         <div class="input-group">
             <button type="button" class="segmented-btn random-btn" title="Roll random text">
                 <svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-shuffle-icon lucide-shuffle">
                     </span>
                 </button>
             </div>
             <div class="player">
                 <div class="player-label">Model B <span class="model-name-display"></span></div>
                 <div class="wave-player-container" data-model="b"></div>
             </div>
         </div>
     </div>
     <div class="vote-results" style="display: none;">
         <h3 class="results-heading">Vote Recorded!</h3>
         <div class="results-content">
             </div>
         </div>
     </div>
     <div class="next-round-container" style="display: none;">
         <button class="next-round-btn">Next Round</button>
     </div>
     <div id="playback-keyboard-hint" class="keyboard-hint" style="display: none;">
         Press <kbd>Space</kbd> to play/pause, <kbd>A</kbd>/<kbd>B</kbd> to vote, <kbd>R</kbd> for random text, <kbd>N</kbd> for next random round
     </div>
         let modelNames = { a: '', b: '' };
         let wavePlayers = { a: null, b: null };
         let cachedSentences = []; // To store sentences available in cache
+        let hasVoted = false; // 防止重复投票
         // Initialize WavePlayers with mobile settings
         wavePlayerContainers.forEach(container => {
             const model = container.dataset.model;
             // Reset the flag for both samples played
             bothSamplesPlayed = false;
+            // 新增：处理参考音色文件上传
+            const voiceFileInput = document.getElementById('voice-file');
+            const file = voiceFileInput.files[0];
+            let fetchOptions;
+            if (file) {
+                const formData = new FormData();
+                formData.append('text', text);
+                formData.append('voice_file', file);
+                fetchOptions = {
+                    method: 'POST',
+                    body: formData
+                };
+            } else {
+                fetchOptions = {
+                    method: 'POST',
+                    headers: {
+                        'Content-Type': 'application/json',
+                    },
+                    body: JSON.stringify({ text: text }),
+                };
+            }
             // Call the API to generate TTS
+            fetch('/api/tts/generate', fetchOptions)
             .then(response => {
                 if (!response.ok) {
                     return response.json().then(err => {
         }
         function handleVote(model) {
+            if (hasVoted) {
+                openToast("You have already voted. Duplicate voting is not allowed.", "warning");
+                return;
+            }
+            hasVoted = true;
             // Disable both vote buttons
             voteButtons.forEach(btn => {
                 btn.disabled = true;
             })
             .then(response => {
                 if (!response.ok) {
+                    hasVoted = false; // allow retry
                     return response.json().then(err => {
+                        throw new Error(err.error || 'Vote failed, please try again later.');
                     });
                 }
                 return response.json();
                 nextRoundContainer.style.display = 'block';
                 // Show success toast
+                openToast("Vote successful!", "success");
             })
             .catch(error => {
+                hasVoted = false;
                 // Re-enable vote buttons
                 voteButtons.forEach(btn => {
                     btn.disabled = false;
             // Show initial hint, hide playback hint
             initialKeyboardHint.style.display = 'block';
             playbackKeyboardHint.style.display = 'none';
+            hasVoted = false;
         }
         function handleRandom() {
         initializePodcastLines();
     });
 </script>
+{% endblock %}

templates/base.html CHANGED Viewed

@@ -1086,12 +1086,12 @@
         </nav>
         <div class="sidebar-footer">
-            <a href="https://discord.gg/HB8fMR6GTr" target="_blank" rel="noopener noreferrer" class="discord-link">
-                <svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 127.14 96.36" fill="currentColor">
-                    <path d="M107.7,8.07A105.15,105.15,0,0,0,81.47,0a72.06,72.06,0,0,0-3.36,6.83A97.68,97.68,0,0,0,49,6.83,72.37,72.37,0,0,0,45.64,0,105.89,105.89,0,0,0,19.39,8.09C2.79,32.65-1.71,56.6.54,80.21h0A105.73,105.73,0,0,0,32.71,96.36,77.7,77.7,0,0,0,39.6,85.25a68.42,68.42,0,0,1-10.85-5.18c.91-.66,1.8-1.34,2.66-2a75.57,75.57,0,0,0,64.32,0c.87.71,1.76,1.39,2.66,2a68.68,68.68,0,0,1-10.87,5.19,77,77,0,0,0,6.89,11.1A105.25,105.25,0,0,0,126.6,80.22h0C129.24,52.84,122.09,29.11,107.7,8.07ZM42.45,65.69C36.18,65.69,31,60,31,53s5-12.74,11.43-12.74S54,46,53.89,53,48.84,65.69,42.45,65.69Zm42.24,0C78.41,65.69,73.25,60,73.25,53s5-12.74,11.44-12.74S96.23,46,96.12,53,91.08,65.69,84.69,65.69Z"/>
-                </svg>
-                Join our Discord
-            </a>
             {% if current_user.is_authenticated %}
             <div class="user-auth" onclick="toggleUserDropdown(event)">

         </nav>
         <div class="sidebar-footer">
+{#            <a href="https://discord.gg/HB8fMR6GTr" target="_blank" rel="noopener noreferrer" class="discord-link">#}
+{#                <svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 127.14 96.36" fill="currentColor">#}
+{#                    <path d="M107.7,8.07A105.15,105.15,0,0,0,81.47,0a72.06,72.06,0,0,0-3.36,6.83A97.68,97.68,0,0,0,49,6.83,72.37,72.37,0,0,0,45.64,0,105.89,105.89,0,0,0,19.39,8.09C2.79,32.65-1.71,56.6.54,80.21h0A105.73,105.73,0,0,0,32.71,96.36,77.7,77.7,0,0,0,39.6,85.25a68.42,68.42,0,0,1-10.85-5.18c.91-.66,1.8-1.34,2.66-2a75.57,75.57,0,0,0,64.32,0c.87.71,1.76,1.39,2.66,2a68.68,68.68,0,0,1-10.87,5.19,77,77,0,0,0,6.89,11.1A105.25,105.25,0,0,0,126.6,80.22h0C129.24,52.84,122.09,29.11,107.7,8.07ZM42.45,65.69C36.18,65.69,31,60,31,53s5-12.74,11.43-12.74S54,46,53.89,53,48.84,65.69,42.45,65.69Zm42.24,0C78.41,65.69,73.25,60,73.25,53s5-12.74,11.44-12.74S96.23,46,96.12,53,91.08,65.69,84.69,65.69Z"/>#}
+{#                </svg>#}
+{#                Join our Discord#}
+{#            </a>#}
             {% if current_user.is_authenticated %}
             <div class="user-auth" onclick="toggleUserDropdown(event)">

tts.old.py DELETED Viewed

@@ -1,117 +0,0 @@
-# TODO: V2 of TTS Router
-# Currently just use current TTS router.
-from gradio_client import Client
-import os
-from dotenv import load_dotenv
-import fal_client
-import requests
-import time
-import io
-from pyht import Client as PyhtClient
-from pyht.client import TTSOptions
-load_dotenv()
-try:
-    client = Client("TTS-AGI/tts-router", hf_token=os.getenv("HF_TOKEN"))
-except Exception as e:
-    print(f"Error initializing client: {e}")
-    client = None
-model_mapping = {
-    "eleven-multilingual-v2": "eleven",
-    "playht-2.0": "playht",
-    "styletts2": "styletts2",
-    "kokoro-v1": "kokorov1",
-    "cosyvoice-2.0": "cosyvoice",
-    "playht-3.0-mini": "playht3",
-    "papla-p1": "papla",
-    "hume-octave": "hume",
-}
-def predict_csm(script):
-    result = fal_client.subscribe(
-        "fal-ai/csm-1b",
-        arguments={
-            # "scene": [{
-            #     "text": "Hey how are you doing.",
-            #     "speaker_id": 0
-            # }, {
-            #     "text": "Pretty good, pretty good.",
-            #     "speaker_id": 1
-            # }, {
-            #     "text": "I'm great, so happy to be speaking to you.",
-            #     "speaker_id": 0
-            # }]
-            "scene": script
-        },
-        with_logs=True,
-    )
-    return requests.get(result["audio"]["url"]).content
-def predict_playdialog(script):
-    # Initialize the PyHT client
-    pyht_client = PyhtClient(
-        user_id=os.getenv("PLAY_USERID"),
-        api_key=os.getenv("PLAY_SECRETKEY"),
-    )
-    # Define the voices
-    voice_1 = "s3://voice-cloning-zero-shot/baf1ef41-36b6-428c-9bdf-50ba54682bd8/original/manifest.json"
-    voice_2 = "s3://voice-cloning-zero-shot/e040bd1b-f190-4bdb-83f0-75ef85b18f84/original/manifest.json"
-    # Convert script format from CSM to PlayDialog format
-    if isinstance(script, list):
-        # Process script in CSM format (list of dictionaries)
-        text = ""
-        for turn in script:
-            speaker_id = turn.get("speaker_id", 0)
-            prefix = "Host 1:" if speaker_id == 0 else "Host 2:"
-            text += f"{prefix} {turn['text']}\n"
-    else:
-        # If it's already a string, use as is
-        text = script
-    # Set up TTSOptions
-    options = TTSOptions(
-        voice=voice_1, voice_2=voice_2, turn_prefix="Host 1:", turn_prefix_2="Host 2:"
-    )
-    # Generate audio using PlayDialog
-    audio_chunks = []
-    for chunk in pyht_client.tts(text, options, voice_engine="PlayDialog"):
-        audio_chunks.append(chunk)
-    # Combine all chunks into a single audio file
-    return b"".join(audio_chunks)
-def predict_tts(text, model):
-    global client
-    # Exceptions: special models that shouldn't be passed to the router
-    if model == "csm-1b":
-        return predict_csm(text)
-    elif model == "playdialog-1.0":
-        return predict_playdialog(text)
-    if not model in model_mapping:
-        raise ValueError(f"Model {model} not found")
-    result = client.predict(
-        text=text, model=model_mapping[model], api_name="/synthesize"
-    )  # returns path to audio file
-    return result
-if __name__ == "__main__":
-    print("Predicting PlayDialog")
-    print(
-        predict_playdialog(
-            [
-                {"text": "Hey how are you doing.", "speaker_id": 0},
-                {"text": "Pretty good, pretty good.", "speaker_id": 1},
-                {"text": "I'm great, so happy to be speaking to you.", "speaker_id": 0},
-            ]
-        )
-    )

tts.py CHANGED Viewed

@@ -23,65 +23,65 @@ def get_zerogpu_token():
 model_mapping = {
-    "eleven-multilingual-v2": {
-        "provider": "elevenlabs",
-        "model": "eleven_multilingual_v2",
-    },
-    "eleven-turbo-v2.5": {
-        "provider": "elevenlabs",
-        "model": "eleven_turbo_v2_5",
-    },
-    "eleven-flash-v2.5": {
-        "provider": "elevenlabs",
-        "model": "eleven_flash_v2_5",
-    },
-    "cartesia-sonic-2": {
-        "provider": "cartesia",
-        "model": "sonic-2",
-    },
     "spark-tts": {
         "provider": "spark",
         "model": "spark-tts",
     },
-    "playht-2.0": {
-        "provider": "playht",
-        "model": "PlayHT2.0",
-    },
-    "styletts2": {
-        "provider": "styletts",
-        "model": "styletts2",
-    },
-    "kokoro-v1": {
-        "provider": "kokoro",
-        "model": "kokoro_v1",
-    },
-    "cosyvoice-2.0": {
-        "provider": "cosyvoice",
-        "model": "cosyvoice_2_0",
-    },
-    "papla-p1": {
-        "provider": "papla",
-        "model": "papla_p1",
-    },
-    "hume-octave": {
-        "provider": "hume",
-        "model": "octave",
-    },
-    "megatts3": {
-        "provider": "megatts3",
-        "model": "megatts3",
-    },
-    "minimax-02-hd": {
-        "provider": "minimax",
-        "model": "speech-02-hd",
-    },
-    "minimax-02-turbo": {
-        "provider": "minimax",
-        "model": "speech-02-turbo",
-    },
-    "lanternfish-1": {
-        "provider": "lanternfish",
-        "model": "lanternfish-1",
     },
 }
 url = "https://tts-agi-tts-router-v2.hf.space/tts"
@@ -194,7 +194,38 @@ def predict_dia(script):
                     return requests.get(json.loads(audio_data)[0]["url"]).content
-def predict_tts(text, model):
     global client
     print(f"Predicting TTS for {model}")
     # Exceptions: special models that shouldn't be passed to the router
@@ -204,20 +235,38 @@ def predict_tts(text, model):
         return predict_playdialog(text)
     elif model == "dia-1.6b":
         return predict_dia(text)
     if not model in model_mapping:
         raise ValueError(f"Model {model} not found")
     result = requests.post(
         url,
         headers=headers,
-        data=json.dumps(
-            {
-                "text": text,
-                "provider": model_mapping[model]["provider"],
-                "model": model_mapping[model]["model"],
-            }
-        ),
     )
     response_json = result.json()

 model_mapping = {
+    # "eleven-multilingual-v2": {
+    #     "provider": "elevenlabs",
+    #     "model": "eleven_multilingual_v2",
+    # },
+    # "eleven-turbo-v2.5": {
+    #     "provider": "elevenlabs",
+    #     "model": "eleven_turbo_v2_5",
+    # },
+    # "eleven-flash-v2.5": {
+    #     "provider": "elevenlabs",
+    #     "model": "eleven_flash_v2_5",
+    # },
+    # "cartesia-sonic-2": {
+    #     "provider": "cartesia",
+    #     "model": "sonic-2",
+    # },
     "spark-tts": {
         "provider": "spark",
         "model": "spark-tts",
     },
+    # "playht-2.0": {
+    #     "provider": "playht",
+    #     "model": "PlayHT2.0",
+    # },
+    # "styletts2": {
+    #     "provider": "styletts",
+    #     "model": "styletts2",
+    # },
+    # "cosyvoice-2.0": {
+    #     "provider": "cosyvoice",
+    #     "model": "cosyvoice_2_0",
+    # },
+    # "papla-p1": {
+    #     "provider": "papla",
+    #     "model": "papla_p1",
+    # },
+    # "hume-octave": {
+    #     "provider": "hume",
+    #     "model": "octave",
+    # },
+    # "megatts3": {
+    #     "provider": "megatts3",
+    #     "model": "megatts3",
+    # },
+    # "minimax-02-hd": {
+    #     "provider": "minimax",
+    #     "model": "speech-02-hd",
+    # },
+    # "minimax-02-turbo": {
+    #     "provider": "minimax",
+    #     "model": "speech-02-turbo",
+    # },
+    # "lanternfish-1": {
+    #     "provider": "lanternfish",
+    #     "model": "lanternfish-1",
+    # },
+    "index-tts": {
+        "provider": "bilibili",
+        "model": "index-tts",
     },
 }
 url = "https://tts-agi-tts-router-v2.hf.space/tts"
                     return requests.get(json.loads(audio_data)[0]["url"]).content
+def predict_index_tts(text, reference_audio_path=None):
+    """Synthesize ``text`` in a reference voice via the IndexTeam/IndexTTS Space.
+
+    Args:
+        text: Text to synthesize.
+        reference_audio_path: Local path to the reference (voice prompt)
+            audio file. Required by this model.
+
+    Returns:
+        The value returned by the Space's ``/gen_single`` endpoint
+        (presumably a path to the generated audio -- confirm against the
+        Space's API page).
+
+    Raises:
+        ValueError: If ``reference_audio_path`` is missing or empty.
+    """
+    from gradio_client import Client, handle_file
+    # Fail fast: building a gradio Client contacts the remote Space, so
+    # reject an unusable call before paying for that connection.
+    if not reference_audio_path:
+        raise ValueError("index-tts 需要 reference_audio_path")
+    # Deliberately not named `client`, to avoid confusion with the
+    # module-level `client` that predict_tts declares as global.
+    index_client = Client("IndexTeam/IndexTTS")
+    return index_client.predict(
+        prompt=handle_file(reference_audio_path),
+        text=text,
+        api_name="/gen_single",
+    )
+def predict_spark_tts(text, reference_audio_path=None):
+    """Synthesize ``text`` via the amortalize/Spark-TTS-Zero Space.
+
+    Args:
+        text: Text to synthesize.
+        reference_audio_path: Optional local path to a reference audio file.
+            When omitted, ``None`` is sent for both prompt-audio inputs.
+
+    Returns:
+        The value returned by the Space's ``/voice_clone`` endpoint
+        (presumably a path to the generated audio -- confirm against the
+        Space's API page).
+    """
+    from gradio_client import Client, handle_file
+    client = Client("amortalize/Spark-TTS-Zero")
+    prompt_wav = None
+    if reference_audio_path:
+        prompt_wav = handle_file(reference_audio_path)
+    # NOTE(review): `prompt_text` is sent the synthesis text itself. If the
+    # endpoint treats it as the transcript of the reference audio, this is
+    # likely wrong (the reference clip does not say `text`) -- confirm
+    # against the Space's API and consider sending an empty prompt text.
+    # The same file is supplied for both the "upload" and "record" inputs.
+    result = client.predict(
+        text=text,
+        prompt_text=text,
+        prompt_wav_upload=prompt_wav,
+        prompt_wav_record=prompt_wav,
+        api_name="/voice_clone"
+    )
+    return result
+def predict_tts(text, model, reference_audio_path=None):
     global client
     print(f"Predicting TTS for {model}")
     # Exceptions: special models that shouldn't be passed to the router
         return predict_playdialog(text)
     elif model == "dia-1.6b":
         return predict_dia(text)
+    elif model == "index-tts":
+        return predict_index_tts(text, reference_audio_path)
+    elif model == "spark-tts":
+        return predict_spark_tts(text, reference_audio_path)
     if not model in model_mapping:
         raise ValueError(f"Model {model} not found")
+    # 构建请求体
+    payload = {
+        "text": text,
+        "provider": model_mapping[model]["provider"],
+        "model": model_mapping[model]["model"],
+    }
+    # 仅对支持音色克隆的模型传递参考音色
+    supports_reference = model in [
+        "styletts2", "eleven-multilingual-v2", "eleven-turbo-v2.5", "eleven-flash-v2.5"
+    ]
+    if reference_audio_path and supports_reference:
+        with open(reference_audio_path, "rb") as f:
+            audio_bytes = f.read()
+            audio_b64 = base64.b64encode(audio_bytes).decode("utf-8")
+        # 不同模型参考音色字段不同
+        if model == "styletts2":
+            payload["reference_speaker"] = audio_b64
+        else:  # elevenlabs 系列
+            payload["reference_audio"] = audio_b64
     result = requests.post(
         url,
         headers=headers,
+        data=json.dumps(payload),
     )
     response_json = result.json()