from flask import Flask, request, send_file, jsonify
from gevent.pywsgi import WSGIServer
from dotenv import load_dotenv
import os

from handle_text import prepare_tts_input_with_context
from tts_handler import generate_speech, get_models, get_voices
from utils import getenv_bool, require_api_key, AUDIO_FORMAT_MIME_TYPES

app = Flask(__name__)
load_dotenv()

API_KEY = os.getenv('API_KEY', 'your_api_key_here')
PORT = int(os.getenv('PORT', 5050))

DEFAULT_VOICE = os.getenv('DEFAULT_VOICE', 'en-US-AvaNeural')
DEFAULT_RESPONSE_FORMAT = os.getenv('DEFAULT_RESPONSE_FORMAT', 'mp3')
DEFAULT_SPEED = float(os.getenv('DEFAULT_SPEED', 1.0))

REMOVE_FILTER = getenv_bool('REMOVE_FILTER', False)
EXPAND_API = getenv_bool('EXPAND_API', True)
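
# Illustrative .env for the settings read above (a sketch only; the values
# shown are just the in-code defaults, not a recommended configuration):
#
#   API_KEY=your_api_key_here
#   PORT=5050
#   DEFAULT_VOICE=en-US-AvaNeural
#   DEFAULT_RESPONSE_FORMAT=mp3
#   DEFAULT_SPEED=1.0
#   REMOVE_FILTER=False
#   EXPAND_API=True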


@app.route('/v1/audio/speech', methods=['POST'])
@app.route('/audio/speech', methods=['POST'])
@require_api_key
def text_to_speech():
    data = request.json
    if not data or 'input' not in data:
        return jsonify({"error": "Missing 'input' in request body"}), 400

    text = data.get('input')

    if not REMOVE_FILTER:
        text = prepare_tts_input_with_context(text)

    voice = data.get('voice', DEFAULT_VOICE)
    response_format = data.get('response_format', DEFAULT_RESPONSE_FORMAT)
    speed = float(data.get('speed', DEFAULT_SPEED))

    mime_type = AUDIO_FORMAT_MIME_TYPES.get(response_format, "audio/mpeg")

    output_file_path = generate_speech(text, voice, response_format, speed)

    return send_file(output_file_path, mimetype=mime_type, as_attachment=True, download_name=f"speech.{response_format}")
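
# Example JSON body for the endpoint above (illustrative; only "input" is
# required, the remaining fields fall back to the DEFAULT_* values, and the
# request must also carry the API key expected by require_api_key):
#
#   {
#     "input": "Hello, world!",
#     "voice": "en-US-AvaNeural",
#     "response_format": "mp3",
#     "speed": 1.0
#   }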


@app.route('/v1/models', methods=['GET', 'POST'])
@app.route('/models', methods=['GET', 'POST'])
@require_api_key
def list_models():
    return jsonify({"data": get_models()})


@app.route('/v1/voices', methods=['GET', 'POST'])
@app.route('/voices', methods=['GET', 'POST'])
@require_api_key
def list_voices():
    specific_language = None

    data = request.args if request.method == 'GET' else request.json
    if data and ('language' in data or 'locale' in data):
        specific_language = data.get('language') if 'language' in data else data.get('locale')

    return jsonify({"voices": get_voices(specific_language)})
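
# Example calls for the endpoint above (illustrative; 'language' and 'locale'
# are treated interchangeably, and en-US is just a sample locale value):
#
#   GET  /v1/voices?language=en-US
#   POST /v1/voices  with JSON body {"locale": "en-US"}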


@app.route('/v1/voices/all', methods=['GET', 'POST'])
@app.route('/voices/all', methods=['GET', 'POST'])
@require_api_key
def list_all_voices():
    return jsonify({"voices": get_voices('all')})


"""
Support for ElevenLabs and Azure AI Speech
(currently in beta)
"""


@app.route('/elevenlabs/v1/text-to-speech/<voice_id>', methods=['POST'])
@require_api_key
def elevenlabs_tts(voice_id):
    if not EXPAND_API:
        return jsonify({"error": "Endpoint not allowed"}), 500

    try:
        payload = request.json
        if not payload or 'text' not in payload:
            return jsonify({"error": "Missing 'text' in request body"}), 400
    except Exception as e:
        return jsonify({"error": f"Invalid JSON payload: {str(e)}"}), 400

    text = payload['text']

    if not REMOVE_FILTER:
        text = prepare_tts_input_with_context(text)

    voice = voice_id
    response_format = 'mp3'
    speed = DEFAULT_SPEED

    try:
        output_file_path = generate_speech(text, voice, response_format, speed)
    except Exception as e:
        return jsonify({"error": f"TTS generation failed: {str(e)}"}), 500

    return send_file(output_file_path, mimetype="audio/mpeg", as_attachment=True, download_name="speech.mp3")
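
# Example ElevenLabs-style request for the endpoint above (illustrative; the
# voice comes from the <voice_id> path segment, e.g. an edge-tts name such as
# en-US-AvaNeural, and the JSON body only needs a "text" field):
#
#   POST /elevenlabs/v1/text-to-speech/en-US-AvaNeural
#   {"text": "Hello, world!"}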


@app.route('/azure/cognitiveservices/v1', methods=['POST'])
@require_api_key
def azure_tts():
    if not EXPAND_API:
        return jsonify({"error": "Endpoint not allowed"}), 500

    try:
        ssml_data = request.data.decode('utf-8')
        if not ssml_data:
            return jsonify({"error": "Missing SSML payload"}), 400

        from xml.etree import ElementTree as ET
        root = ET.fromstring(ssml_data)
        voice_element = root.find('.//{http://www.w3.org/2001/10/synthesis}voice')
        text = voice_element.text
        voice = voice_element.get('name')
    except Exception as e:
        return jsonify({"error": f"Invalid SSML payload: {str(e)}"}), 400

    response_format = 'mp3'
    speed = DEFAULT_SPEED

    if not REMOVE_FILTER:
        text = prepare_tts_input_with_context(text)

    try:
        output_file_path = generate_speech(text, voice, response_format, speed)
    except Exception as e:
        return jsonify({"error": f"TTS generation failed: {str(e)}"}), 500

    return send_file(output_file_path, mimetype="audio/mpeg", as_attachment=True, download_name="speech.mp3")
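
# Example SSML payload for the Azure-style endpoint above (illustrative; the
# parser only reads the first <voice> element in the synthesis namespace,
# taking its name attribute as the voice and its inner text as the input):
#
#   <speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="en-US">
#     <voice name="en-US-AvaNeural">Hello, world!</voice>
#   </speak>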


print(" Edge TTS (Free Azure TTS) Replacement for OpenAI's TTS API")
print(" ")
print(" * Serving OpenAI Edge TTS")
print(f" * Server running on http://localhost:{PORT}")
print(f" * TTS Endpoint: http://localhost:{PORT}/v1/audio/speech")
print(" ")

if __name__ == '__main__':
    http_server = WSGIServer(('0.0.0.0', PORT), app)
    http_server.serve_forever()