Commit 7fcb739 · Parent(s): f55b556

(wip)debug
app.py
CHANGED
@@ -541,6 +541,7 @@ def initialize_tts_cache():
 @limiter.limit("10 per minute")  # Keep limit, cached responses are still requests
 def generate_tts():
     # If verification not setup, handle it first
+    user_token = request.headers['x-ip-token']
     if app.config["TURNSTILE_ENABLED"] and not session.get("turnstile_verified"):
         return jsonify({"error": "Turnstile verification required"}), 403
 
@@ -631,7 +632,7 @@ def generate_tts():
     # Function to process a single model (generate directly to TEMP_AUDIO_DIR, not cache subdir)
     def process_model_on_the_fly(model):
         # Pass reference_audio_path through to predict_tts
-        temp_audio_path = predict_tts(text, model.id, reference_audio_path=reference_audio_path)
+        temp_audio_path = predict_tts(text, model.id, reference_audio_path=reference_audio_path, user_token=user_token)
         if not temp_audio_path or not os.path.exists(temp_audio_path):
             raise ValueError(f"predict_tts failed for model {model.id}")
 
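One detail in this hunk is worth flagging: `request.headers['x-ip-token']` raises a `KeyError` (so the route 500s) whenever the header is absent, which is the normal case outside Hugging Face Spaces, since ZeroGPU only injects `x-ip-token` there. A minimal hardening sketch, assuming the same Flask `request` object; the helper name is hypothetical and not part of this commit:

    from flask import request

    def read_user_token():
        # ZeroGPU injects a per-visitor "x-ip-token" header on Spaces; it is
        # absent in local runs, so prefer .get() over ['x-ip-token'] and let
        # tts.set_client_for_session() fall back to its own token instead.
        return request.headers.get("x-ip-token")  # None when header missing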
tts.py
CHANGED
@@ -1,9 +1,7 @@
 import os
 from dotenv import load_dotenv
 import random
-
-from fal_client import stream
-from gradio_client.exceptions import AppError
+from gradio_client import Client, handle_file, file
 
 load_dotenv()
 
@@ -44,10 +42,17 @@ headers = {
 }
 data = {"text": "string", "provider": "string", "model": "string"}
 
+def set_client_for_session(space: str, user_token=None):
+    if user_token is None:
+        x_ip_token = get_zerogpu_token()
+    else:
+        x_ip_token = user_token
+
+    # Forward the caller's token so the ZeroGPU space bills the visitor's quota
+    return Client(space, headers={"X-IP-Token": x_ip_token})
 
-def predict_index_tts(text, reference_audio_path=None):
-
-    client = Client("kemuriririn/IndexTTS", verbose=True)
+def predict_index_tts(text, user_token=None, reference_audio_path=None):
+    client = set_client_for_session("kemuriririn/IndexTTS", user_token=user_token)
     if reference_audio_path:
         prompt = handle_file(reference_audio_path)
     else:
@@ -63,9 +68,8 @@ def predict_index_tts(text, reference_audio_path=None):
     return result
 
 
-def predict_spark_tts(text, reference_audio_path=None):
-
-    client = Client("kemuriririn/SparkTTS")
+def predict_spark_tts(text, user_token=None, reference_audio_path=None):
+    client = set_client_for_session("kemuriririn/SparkTTS", user_token=user_token)
     prompt_wav = None
     if reference_audio_path:
         prompt_wav = handle_file(reference_audio_path)
@@ -80,9 +84,8 @@ def predict_spark_tts(text, reference_audio_path=None):
     return result
 
 
-def predict_cosyvoice_tts(text, reference_audio_path=None):
-
-    client = Client("kemuriririn/CosyVoice2-0.5B")
+def predict_cosyvoice_tts(text, user_token=None, reference_audio_path=None):
+    client = set_client_for_session("kemuriririn/CosyVoice2-0.5B", user_token=user_token)
     if not reference_audio_path:
         raise ValueError("cosyvoice-2.0 requires reference_audio_path")
     prompt_wav = handle_file(reference_audio_path)
@@ -106,9 +109,8 @@ def predict_cosyvoice_tts(text, reference_audio_path=None):
     return result
 
 
-def predict_maskgct(text, reference_audio_path=None):
-
-    client = Client("amphion/maskgct")
+def predict_maskgct(text, user_token=None, reference_audio_path=None):
+    client = set_client_for_session("amphion/maskgct", user_token=user_token)
     if not reference_audio_path:
         raise ValueError("maskgct requires reference_audio_path")
     prompt_wav = handle_file(reference_audio_path)
@@ -123,9 +125,8 @@ def predict_maskgct(text, reference_audio_path=None):
     return result
 
 
-def predict_gpt_sovits_v2(text, reference_audio_path=None):
-
-    client = Client("kemuriririn/GPT-SoVITS-v2")
+def predict_gpt_sovits_v2(text, user_token=None, reference_audio_path=None):
+    client = set_client_for_session("kemuriririn/GPT-SoVITS-v2", user_token=user_token)
     if not reference_audio_path:
         raise ValueError("GPT-SoVITS-v2 requires reference_audio_path")
     result = client.predict(
@@ -148,20 +149,19 @@ def predict_gpt_sovits_v2(text, reference_audio_path=None):
     return result
 
 
-def predict_tts(text, model, reference_audio_path=None):
-    global client
+def predict_tts(text, model, user_token=None, reference_audio_path=None):
     print(f"Predicting TTS for {model}")
     # Exceptions: special models that shouldn't be passed to the router
     if model == "index-tts":
-        result = predict_index_tts(text, reference_audio_path)
+        result = predict_index_tts(text, user_token, reference_audio_path)
     elif model == "spark-tts":
-        result = predict_spark_tts(text, reference_audio_path)
+        result = predict_spark_tts(text, user_token, reference_audio_path)
     elif model == "cosyvoice-2.0":
-        result = predict_cosyvoice_tts(text, reference_audio_path)
+        result = predict_cosyvoice_tts(text, user_token, reference_audio_path)
     elif model == "maskgct":
-        result = predict_maskgct(text, reference_audio_path)
+        result = predict_maskgct(text, user_token, reference_audio_path)
     elif model == "gpt-sovits-v2":
-        result = predict_gpt_sovits_v2(text, reference_audio_path)
+        result = predict_gpt_sovits_v2(text, user_token, reference_audio_path)
     else:
         raise ValueError(f"Model {model} not found")
     return result
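The tts.py changes replace the per-function `Client(...)` construction (and the `global client` in `predict_tts`) with `set_client_for_session`, which builds a short-lived client per request and forwards the visitor's token in the `X-IP-Token` header; this is the documented way to make an upstream ZeroGPU space charge GPU quota to the end user instead of to the proxying space. A usage sketch under that assumption; the token value is a placeholder:

    from gradio_client import Client

    token = "<value of the incoming x-ip-token header>"  # placeholder
    client = Client("kemuriririn/IndexTTS", headers={"X-IP-Token": token})
    # Predictions made through this client now run under the visitor's
    # ZeroGPU quota rather than this space's shared quota.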
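End to end, the token now flows from the request header through `generate_tts` and `predict_tts` into `set_client_for_session`. A minimal call sketch against the new signatures; the reference audio path is a placeholder, and per the `os.path.exists` check in app.py, `predict_tts` is expected to return a path to the generated audio file:

    from tts import predict_tts

    audio_path = predict_tts(
        "Hello there!",
        model="index-tts",
        user_token=None,                 # None: fall back to get_zerogpu_token()
        reference_audio_path="ref.wav",  # placeholder prompt audio
    )
    print(audio_path)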