File size: 6,114 Bytes
88aba71 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 |
# server.py
from flask import Flask, request, send_file, jsonify
from gevent.pywsgi import WSGIServer
from dotenv import load_dotenv
import os
from handle_text import prepare_tts_input_with_context
from tts_handler import generate_speech, get_models, get_voices
from utils import getenv_bool, require_api_key, AUDIO_FORMAT_MIME_TYPES
# Flask application object; every route in this module registers on it.
app = Flask(__name__)

# load_dotenv() runs before the environment lookups below.
load_dotenv()

# Settings read from the environment, each with a fallback default.
API_KEY = os.environ.get('API_KEY', 'your_api_key_here')
PORT = int(os.environ.get('PORT', '5050'))
DEFAULT_VOICE = os.environ.get('DEFAULT_VOICE', 'en-US-AvaNeural')
DEFAULT_RESPONSE_FORMAT = os.environ.get('DEFAULT_RESPONSE_FORMAT', 'mp3')
DEFAULT_SPEED = float(os.environ.get('DEFAULT_SPEED', '1.0'))

# When true, handlers skip prepare_tts_input_with_context() on the input text.
REMOVE_FILTER = getenv_bool('REMOVE_FILTER', False)
# When false, the ElevenLabs- and Azure-style endpoints reject requests.
EXPAND_API = getenv_bool('EXPAND_API', True)
# DEFAULT_MODEL = os.getenv('DEFAULT_MODEL', 'tts-1')
@app.route('/v1/audio/speech', methods=['POST'])
@app.route('/audio/speech', methods=['POST'])  # Alias without the /v1 prefix
@require_api_key
def text_to_speech():
    """Generate speech from the JSON request body and return it as a file.

    The body must be a JSON object containing ``input``. The optional fields
    ``voice``, ``response_format`` and ``speed`` fall back to DEFAULT_VOICE,
    DEFAULT_RESPONSE_FORMAT and DEFAULT_SPEED.

    Returns:
        The generated audio as an attachment named ``speech.<response_format>``,
        or a JSON ``{"error": ...}`` body with status 400 (invalid request)
        or 500 (speech generation raised).
    """
    data = request.json
    # Require a JSON object: an array or string body would otherwise slip past
    # the membership test and then fail on .get().
    if not isinstance(data, dict) or 'input' not in data:
        return jsonify({"error": "Missing 'input' in request body"}), 400

    # model = data.get('model', DEFAULT_MODEL)
    voice = data.get('voice', DEFAULT_VOICE)
    response_format = data.get('response_format', DEFAULT_RESPONSE_FORMAT)

    # A non-numeric speed is a client error, not a server crash.
    try:
        speed = float(data.get('speed', DEFAULT_SPEED))
    except (TypeError, ValueError):
        return jsonify({"error": "'speed' must be a number"}), 400

    text = data.get('input')
    if not REMOVE_FILTER:
        text = prepare_tts_input_with_context(text)

    # Unknown formats are served with the MP3 MIME type.
    mime_type = AUDIO_FORMAT_MIME_TYPES.get(response_format, "audio/mpeg")

    # Generate the audio file in the requested format and speed; report
    # failures as JSON, matching the other TTS endpoints in this module.
    try:
        output_file_path = generate_speech(text, voice, response_format, speed)
    except Exception as e:
        return jsonify({"error": f"TTS generation failed: {str(e)}"}), 500

    # Return the file with the matching MIME type.
    return send_file(
        output_file_path,
        mimetype=mime_type,
        as_attachment=True,
        download_name=f"speech.{response_format}",
    )
@app.route('/v1/models', methods=['GET', 'POST'])
@app.route('/models', methods=['GET', 'POST'])
@require_api_key
def list_models():
    """Return the result of ``get_models()`` as JSON under the ``data`` key."""
    payload = {"data": get_models()}
    return jsonify(payload)
@app.route('/v1/voices', methods=['GET', 'POST'])
@app.route('/voices', methods=['GET', 'POST'])
@require_api_key
def list_voices():
    """Return voices as JSON, optionally filtered by ``language`` or ``locale``.

    For GET the filter is read from the query string; for other methods it is
    read from the JSON body. ``language`` takes precedence over ``locale``.
    When neither is supplied, ``get_voices`` is called with ``None``.
    """
    if request.method == 'GET':
        params = request.args
    else:
        # silent=True: a POST with no body (or a non-JSON one) is treated as
        # "no filter" rather than aborting the request.
        body = request.get_json(silent=True)
        # Only a JSON object can carry the filter keys.
        params = body if isinstance(body, dict) else {}

    if 'language' in params:
        specific_language = params.get('language')
    else:
        specific_language = params.get('locale')

    return jsonify({"voices": get_voices(specific_language)})
@app.route('/v1/voices/all', methods=['GET', 'POST'])
@app.route('/voices/all', methods=['GET', 'POST'])
@require_api_key
def list_all_voices():
    """Return the result of ``get_voices('all')`` as JSON under ``voices``."""
    every_voice = get_voices('all')
    return jsonify({"voices": every_voice})
"""
Support for ElevenLabs and Azure AI Speech
(currently in beta)
"""
# http://localhost:5050/elevenlabs/v1/text-to-speech
# http://localhost:5050/elevenlabs/v1/text-to-speech/en-US-AndrewNeural
@app.route('/elevenlabs/v1/text-to-speech/<voice_id>', methods=['POST'])
@require_api_key
def elevenlabs_tts(voice_id):
    """Generate MP3 speech for an ElevenLabs-style request.

    Args:
        voice_id: Voice name taken from the URL path and passed to
            ``generate_speech`` unchanged.

    Returns:
        The generated audio as an ``speech.mp3`` attachment, or a JSON
        ``{"error": ...}`` body with status 400 (bad request body) or 500
        (endpoint disabled via EXPAND_API, or speech generation raised).
    """
    if not EXPAND_API:
        return jsonify({"error": "Endpoint not allowed"}), 500

    # Parse the incoming JSON payload; only the parse itself is guarded.
    try:
        payload = request.json
    except Exception as e:
        return jsonify({"error": f"Invalid JSON payload: {str(e)}"}), 400

    # Require a JSON object: a string or array containing "text" would pass a
    # bare membership test and then fail on payload['text'].
    if not isinstance(payload, dict) or 'text' not in payload:
        return jsonify({"error": "Missing 'text' in request body"}), 400

    text = payload['text']
    if not REMOVE_FILTER:
        text = prepare_tts_input_with_context(text)

    voice = voice_id  # ElevenLabs uses the voice_id in the URL

    # Use default settings for edge-tts
    response_format = 'mp3'
    speed = DEFAULT_SPEED  # Optional customization via payload.get('speed', DEFAULT_SPEED)

    # Generate speech using edge-tts
    try:
        output_file_path = generate_speech(text, voice, response_format, speed)
    except Exception as e:
        return jsonify({"error": f"TTS generation failed: {str(e)}"}), 500

    # Return the generated audio file
    return send_file(output_file_path, mimetype="audio/mpeg", as_attachment=True, download_name="speech.mp3")
# tts.speech.microsoft.com/cognitiveservices/v1
# https://{region}.tts.speech.microsoft.com/cognitiveservices/v1
# http://localhost:5050/azure/cognitiveservices/v1
@app.route('/azure/cognitiveservices/v1', methods=['POST'])
@require_api_key
def azure_tts():
    """Generate MP3 speech for an Azure-style SSML request.

    The request body is parsed as SSML. The first ``<voice>`` element in the
    ``http://www.w3.org/2001/10/synthesis`` namespace supplies the voice (its
    ``name`` attribute) and the text to speak (all text inside it, including
    text nested in child elements).

    Returns:
        The generated audio as an ``speech.mp3`` attachment, or a JSON
        ``{"error": ...}`` body with status 400 (missing or invalid SSML) or
        500 (endpoint disabled via EXPAND_API, or speech generation raised).
    """
    from xml.etree import ElementTree as ET

    if not EXPAND_API:
        return jsonify({"error": "Endpoint not allowed"}), 500

    # Decode and parse the SSML payload.
    # NOTE(review): the standard-library XML parser is not hardened against
    # maliciously constructed documents; consider a hardened parser if this
    # endpoint is reachable by untrusted clients.
    try:
        ssml_data = request.data.decode('utf-8')
        if not ssml_data:
            return jsonify({"error": "Missing SSML payload"}), 400
        root = ET.fromstring(ssml_data)
    except (ValueError, ET.ParseError) as e:
        # ValueError also covers UnicodeDecodeError from the decode above.
        return jsonify({"error": f"Invalid SSML payload: {str(e)}"}), 400

    # Extract the text and voice from SSML
    voice_element = root.find('.//{http://www.w3.org/2001/10/synthesis}voice')
    if voice_element is None:
        return jsonify({"error": "Invalid SSML payload: missing <voice> element"}), 400

    # itertext() collects text nested inside child elements too; .text alone
    # stops at the first child and is None when the text is fully wrapped.
    text = ''.join(voice_element.itertext())
    if not text.strip():
        return jsonify({"error": "Invalid SSML payload: <voice> element contains no text"}), 400
    voice = voice_element.get('name')

    # Use default settings for edge-tts
    response_format = 'mp3'
    speed = DEFAULT_SPEED

    if not REMOVE_FILTER:
        text = prepare_tts_input_with_context(text)

    # Generate speech using edge-tts
    try:
        output_file_path = generate_speech(text, voice, response_format, speed)
    except Exception as e:
        return jsonify({"error": f"TTS generation failed: {str(e)}"}), 500

    # Return the generated audio file
    return send_file(output_file_path, mimetype="audio/mpeg", as_attachment=True, download_name="speech.mp3")
# Startup banner; runs at module level, so it prints on import as well.
print(
    " Edge TTS (Free Azure TTS) Replacement for OpenAI's TTS API",
    " ",
    " * Serving OpenAI Edge TTS",
    f" * Server running on http://localhost:{PORT}",
    f" * TTS Endpoint: http://localhost:{PORT}/v1/audio/speech",
    " ",
    sep="\n",
)

if __name__ == '__main__':
    # Serve the Flask app with gevent's WSGIServer, bound to 0.0.0.0:PORT.
    listen_address = ('0.0.0.0', PORT)
    WSGIServer(listen_address, app).serve_forever()
|