# CrenCren - Upload folder using huggingface_hub
# 88aba71 (verified)
# server.py
from flask import Flask, request, send_file, jsonify
from gevent.pywsgi import WSGIServer
from dotenv import load_dotenv
import os
from handle_text import prepare_tts_input_with_context
from tts_handler import generate_speech, get_models, get_voices
from utils import getenv_bool, require_api_key, AUDIO_FORMAT_MIME_TYPES
# Flask application object; every route below is registered on it.
app = Flask(__name__)

# Called before any setting is read so that values supplied through a .env
# file (python-dotenv) are visible in the process environment.
load_dotenv()

# --- Runtime settings: environment value first, literal fallback second ---
API_KEY = os.environ.get('API_KEY', 'your_api_key_here')
PORT = int(os.environ.get('PORT', 5050))
DEFAULT_VOICE = os.environ.get('DEFAULT_VOICE', 'en-US-AvaNeural')
DEFAULT_RESPONSE_FORMAT = os.environ.get('DEFAULT_RESPONSE_FORMAT', 'mp3')
DEFAULT_SPEED = float(os.environ.get('DEFAULT_SPEED', 1.0))

# When REMOVE_FILTER is false, request text is passed through
# prepare_tts_input_with_context() before synthesis.
REMOVE_FILTER = getenv_bool('REMOVE_FILTER', False)

# When EXPAND_API is false, the ElevenLabs and Azure compatible endpoints
# answer with an error instead of synthesizing.
EXPAND_API = getenv_bool('EXPAND_API', True)

# Model selection is currently disabled; kept for reference.
# DEFAULT_MODEL = os.getenv('DEFAULT_MODEL', 'tts-1')
@app.route('/v1/audio/speech', methods=['POST'])
@app.route('/audio/speech', methods=['POST'])  # un-versioned alias of the same endpoint
@require_api_key
def text_to_speech():
    """Synthesize speech from a JSON request and return it as an audio file.

    The JSON body must be an object containing ``input`` (the text to speak).
    ``voice``, ``response_format`` and ``speed`` are optional and fall back to
    ``DEFAULT_VOICE``, ``DEFAULT_RESPONSE_FORMAT`` and ``DEFAULT_SPEED``.

    Returns:
        The generated audio as an attachment named
        ``speech.<response_format>``; or a JSON ``{"error": ...}`` body with
        status 400 (bad request) or 500 (generation failed).
    """
    # silent=True yields None for a missing, non-JSON or malformed body, so
    # all of those cases take the 400 path below instead of raising inside
    # Flask before the check is reached.
    data = request.get_json(silent=True)
    # A JSON list/string/number has no .get(); only an object is acceptable.
    if not isinstance(data, dict) or 'input' not in data:
        return jsonify({"error": "Missing 'input' in request body"}), 400

    # Validate client-supplied numbers up front so bad input is a 400, not an
    # unhandled ValueError/TypeError.
    try:
        speed = float(data.get('speed', DEFAULT_SPEED))
    except (TypeError, ValueError):
        return jsonify({"error": "'speed' must be a number"}), 400

    text = data.get('input')
    if not REMOVE_FILTER:
        text = prepare_tts_input_with_context(text)

    # model = data.get('model', DEFAULT_MODEL)
    voice = data.get('voice', DEFAULT_VOICE)
    response_format = data.get('response_format', DEFAULT_RESPONSE_FORMAT)

    # Formats missing from the table are served as audio/mpeg.
    mime_type = AUDIO_FORMAT_MIME_TYPES.get(response_format, "audio/mpeg")

    # Generate the audio file in the requested format with speed adjustment.
    # Failures are reported as JSON, matching the ElevenLabs/Azure endpoints.
    try:
        output_file_path = generate_speech(text, voice, response_format, speed)
    except Exception as e:
        return jsonify({"error": f"TTS generation failed: {str(e)}"}), 500

    # Return the file with the matching MIME type.
    return send_file(
        output_file_path,
        mimetype=mime_type,
        as_attachment=True,
        download_name=f"speech.{response_format}",
    )
@app.route('/v1/models', methods=['GET', 'POST'])
@app.route('/models', methods=['GET', 'POST'])
@require_api_key
def list_models():
    """Return the result of ``get_models()`` as JSON under the ``data`` key."""
    available_models = get_models()
    response_body = {"data": available_models}
    return jsonify(response_body)
@app.route('/v1/voices', methods=['GET', 'POST'])
@app.route('/voices', methods=['GET', 'POST'])
@require_api_key
def list_voices():
    """Return voices as JSON, optionally filtered by language.

    The filter is read from the query string on GET and from the JSON body on
    POST. ``language`` takes precedence over ``locale`` when both are present.
    With no filter, ``get_voices(None)`` decides what is listed.

    Returns:
        JSON of the form ``{"voices": ...}``.
    """
    if request.method == 'GET':
        params = request.args
    else:
        # silent=True: a POST without a (valid) JSON body yields None rather
        # than an HTTP error, so the request falls back to the unfiltered
        # listing. Non-object JSON (list, string, ...) is ignored as well
        # because it has no .get().
        body = request.get_json(silent=True)
        params = body if isinstance(body, dict) else None

    specific_language = None
    if params:
        if 'language' in params:
            specific_language = params.get('language')
        elif 'locale' in params:
            specific_language = params.get('locale')

    return jsonify({"voices": get_voices(specific_language)})
@app.route('/v1/voices/all', methods=['GET', 'POST'])
@app.route('/voices/all', methods=['GET', 'POST'])
@require_api_key
def list_all_voices():
    """Return ``get_voices('all')`` as JSON under the ``voices`` key."""
    every_voice = get_voices('all')
    return jsonify({"voices": every_voice})
"""
Support for ElevenLabs and Azure AI Speech
(currently in beta)
"""
# http://localhost:5050/elevenlabs/v1/text-to-speech
# http://localhost:5050/elevenlabs/v1/text-to-speech/en-US-AndrewNeural
@app.route('/elevenlabs/v1/text-to-speech/<voice_id>', methods=['POST'])
@require_api_key
def elevenlabs_tts(voice_id):
    """ElevenLabs-style endpoint: synthesize ``text`` with the voice in the URL.

    Args:
        voice_id: Voice name taken from the URL path and passed to
            ``generate_speech`` unchanged.

    Returns:
        The generated MP3 as an attachment named ``speech.mp3``; or a JSON
        ``{"error": ...}`` body with status 400 (bad payload) or 500
        (endpoint disabled or generation failed).
    """
    if not EXPAND_API:
        # NOTE(review): 500 is kept for backward compatibility; 403/404 would
        # describe a disabled endpoint more accurately.
        return jsonify({"error": "Endpoint not allowed"}), 500

    # Parse the incoming JSON payload. Only the parse sits inside the try so
    # that unrelated errors are not reported as "Invalid JSON payload".
    try:
        payload = request.json
    except Exception as e:
        return jsonify({"error": f"Invalid JSON payload: {str(e)}"}), 400

    # Require a JSON object: a bare JSON string such as "text" would pass an
    # `in` check and then fail on payload['text'].
    if not isinstance(payload, dict) or 'text' not in payload:
        return jsonify({"error": "Missing 'text' in request body"}), 400

    text = payload['text']
    if not REMOVE_FILTER:
        text = prepare_tts_input_with_context(text)

    voice = voice_id  # ElevenLabs uses the voice_id in the URL

    # Use default settings for edge-tts
    response_format = 'mp3'
    speed = DEFAULT_SPEED  # Optional customization via payload.get('speed', DEFAULT_SPEED)

    # Generate speech using edge-tts
    try:
        output_file_path = generate_speech(text, voice, response_format, speed)
    except Exception as e:
        return jsonify({"error": f"TTS generation failed: {str(e)}"}), 500

    # Return the generated audio file
    return send_file(
        output_file_path,
        mimetype="audio/mpeg",
        as_attachment=True,
        download_name="speech.mp3",
    )
# tts.speech.microsoft.com/cognitiveservices/v1
# https://{region}.tts.speech.microsoft.com/cognitiveservices/v1
# http://localhost:5050/azure/cognitiveservices/v1
@app.route('/azure/cognitiveservices/v1', methods=['POST'])
@require_api_key
def azure_tts():
    """Azure-style endpoint: synthesize the text of an SSML ``<voice>`` element.

    The request body is an SSML document. The first ``<voice>`` element
    supplies the voice (its ``name`` attribute) and the text to speak (all
    text nested inside it).

    Returns:
        The generated MP3 as an attachment named ``speech.mp3``; or a JSON
        ``{"error": ...}`` body with status 400 (bad SSML) or 500 (endpoint
        disabled or generation failed).
    """
    from xml.etree import ElementTree as ET

    if not EXPAND_API:
        # NOTE(review): 500 is kept for backward compatibility; 403/404 would
        # describe a disabled endpoint more accurately.
        return jsonify({"error": "Endpoint not allowed"}), 500

    # Parse the SSML payload.
    try:
        ssml_data = request.data.decode('utf-8')
        if not ssml_data:
            return jsonify({"error": "Missing SSML payload"}), 400
        # NOTE(security): xml.etree does not defend against entity-expansion
        # attacks on untrusted XML; consider a hardened parser if this
        # endpoint is exposed publicly.
        root = ET.fromstring(ssml_data)
    except (ValueError, ET.ParseError) as e:
        # ValueError also covers UnicodeDecodeError from the decode above.
        return jsonify({"error": f"Invalid SSML payload: {str(e)}"}), 400

    # Locate the <voice> element once. Prefer the SSML namespace, but accept
    # documents that omit the xmlns declaration. Compare against None
    # explicitly: a childless Element is falsy.
    voice_element = root.find('.//{http://www.w3.org/2001/10/synthesis}voice')
    if voice_element is None:
        voice_element = root.find('.//voice')
    if voice_element is None:
        return jsonify({"error": "Invalid SSML payload: missing <voice> element"}), 400

    # Element.text only holds the text before the first child, so SSML that
    # wraps the sentence in <prosody>, <mstts:express-as>, etc. would come out
    # empty. itertext() gathers the text of the element and all descendants.
    text = "".join(voice_element.itertext())
    if not text.strip():
        return jsonify({"error": "Invalid SSML payload: <voice> element contains no text"}), 400
    voice = voice_element.get('name')

    # Use default settings for edge-tts
    response_format = 'mp3'
    speed = DEFAULT_SPEED

    if not REMOVE_FILTER:
        text = prepare_tts_input_with_context(text)

    # Generate speech using edge-tts
    try:
        output_file_path = generate_speech(text, voice, response_format, speed)
    except Exception as e:
        return jsonify({"error": f"TTS generation failed: {str(e)}"}), 500

    # Return the generated audio file
    return send_file(
        output_file_path,
        mimetype="audio/mpeg",
        as_attachment=True,
        download_name="speech.mp3",
    )
# Startup banner. It lives at module level, so it is printed whenever this
# module is loaded, not only when it is run as a script.
print(
    " Edge TTS (Free Azure TTS) Replacement for OpenAI's TTS API",
    " ",
    " * Serving OpenAI Edge TTS",
    f" * Server running on http://localhost:{PORT}",
    f" * TTS Endpoint: http://localhost:{PORT}/v1/audio/speech",
    " ",
    sep="\n",
)

if __name__ == '__main__':
    # Listen on all interfaces at the configured port and serve until stopped.
    WSGIServer(('0.0.0.0', PORT), app).serve_forever()