Spaces:

CrenCren
/

cren

Build error

App Files Files Community

cren / weclone-audio /src /server未完工 /server.py

CrenCren

Upload folder using huggingface_hub

88aba71 verified 4 months ago

raw

history blame contribute delete

6.11 kB

	# server.py

	from flask import Flask, request, send_file, jsonify
	from gevent.pywsgi import WSGIServer
	from dotenv import load_dotenv
	import os

	from handle_text import prepare_tts_input_with_context
	from tts_handler import generate_speech, get_models, get_voices
	from utils import getenv_bool, require_api_key, AUDIO_FORMAT_MIME_TYPES

	# Flask application object; the route decorators below register on it.
	app = Flask(__name__)

	# Load environment overrides (python-dotenv) before the settings are read.
	load_dotenv()

	# Credentials and listening port.
	API_KEY = os.environ.get('API_KEY', 'your_api_key_here')
	PORT = int(os.environ.get('PORT', '5050'))

	# Per-request fallbacks used when the client omits the matching field.
	DEFAULT_VOICE = os.environ.get('DEFAULT_VOICE', 'en-US-AvaNeural')
	DEFAULT_RESPONSE_FORMAT = os.environ.get('DEFAULT_RESPONSE_FORMAT', 'mp3')
	DEFAULT_SPEED = float(os.environ.get('DEFAULT_SPEED', '1.0'))

	# Feature switches:
	#   REMOVE_FILTER - when true, skip prepare_tts_input_with_context().
	#   EXPAND_API    - when false, the ElevenLabs/Azure endpoints refuse requests.
	REMOVE_FILTER = getenv_bool('REMOVE_FILTER', False)
	EXPAND_API = getenv_bool('EXPAND_API', True)

	# DEFAULT_MODEL = os.getenv('DEFAULT_MODEL', 'tts-1')

	@app.route('/v1/audio/speech', methods=['POST'])
	@app.route('/audio/speech', methods=['POST'])  # un-versioned alias of the route above
	@require_api_key
	def text_to_speech():
	    """Synthesize speech for the ``input`` text of a JSON request body.

	    Reads ``input`` (required) plus the optional ``voice``,
	    ``response_format`` and ``speed`` fields, falling back to the DEFAULT_*
	    module settings for the optional ones. Unless REMOVE_FILTER is set, the
	    text is first passed through ``prepare_tts_input_with_context``.

	    Returns:
	        The generated audio file as an attachment named
	        ``speech.<response_format>``, or a JSON ``{"error": ...}`` body with
	        status 400 (bad request body) or 500 (speech generation failed).
	    """
	    # silent=True returns None for a missing or malformed JSON body instead
	    # of raising, so such requests reach the JSON 400 response below.
	    data = request.get_json(silent=True)
	    if not isinstance(data, dict) or 'input' not in data:
	        return jsonify({"error": "Missing 'input' in request body"}), 400

	    text = data['input']
	    if not REMOVE_FILTER:
	        text = prepare_tts_input_with_context(text)

	    # model = data.get('model', DEFAULT_MODEL)
	    voice = data.get('voice', DEFAULT_VOICE)
	    response_format = data.get('response_format', DEFAULT_RESPONSE_FORMAT)

	    # A non-numeric speed is a client error, not a server failure.
	    try:
	        speed = float(data.get('speed', DEFAULT_SPEED))
	    except (TypeError, ValueError):
	        return jsonify({"error": "'speed' must be a number"}), 400

	    # Unknown formats are served with the MP3 MIME type.
	    mime_type = AUDIO_FORMAT_MIME_TYPES.get(response_format, "audio/mpeg")

	    # Report generation failures as JSON, matching the ElevenLabs/Azure
	    # compatible endpoints in this module.
	    try:
	        output_file_path = generate_speech(text, voice, response_format, speed)
	    except Exception as e:
	        return jsonify({"error": f"TTS generation failed: {str(e)}"}), 500

	    return send_file(
	        output_file_path,
	        mimetype=mime_type,
	        as_attachment=True,
	        download_name=f"speech.{response_format}",
	    )

	@app.route('/v1/models', methods=['GET', 'POST'])
	@app.route('/models', methods=['GET', 'POST'])
	@require_api_key
	def list_models():
	    """Return the result of ``get_models()`` as JSON under the ``data`` key."""
	    models = get_models()
	    body = {"data": models}
	    return jsonify(body)

	@app.route('/v1/voices', methods=['GET', 'POST'])
	@app.route('/voices', methods=['GET', 'POST'])
	@require_api_key
	def list_voices():
	    """Return voices as JSON, optionally filtered by language/locale.

	    The filter is read from the query string on GET and from the JSON body
	    on POST, using the ``language`` key or, when that key is absent,
	    ``locale``. When neither is supplied ``get_voices`` is called with
	    ``None``.

	    Returns:
	        A JSON object of the form ``{"voices": ...}``.
	    """
	    if request.method == 'GET':
	        params = request.args
	    else:
	        # silent=True: a POST without a (valid) JSON body means "no filter"
	        # rather than aborting the request with an error response.
	        params = request.get_json(silent=True)
	        if not isinstance(params, dict):
	            # Covers a missing body and JSON that is not an object.
	            params = {}

	    if 'language' in params:
	        specific_language = params.get('language')
	    else:
	        specific_language = params.get('locale')

	    return jsonify({"voices": get_voices(specific_language)})

	@app.route('/v1/voices/all', methods=['GET', 'POST'])
	@app.route('/voices/all', methods=['GET', 'POST'])
	@require_api_key
	def list_all_voices():
	    """Return the result of ``get_voices('all')`` as JSON under ``voices``."""
	    every_voice = get_voices('all')
	    body = {"voices": every_voice}
	    return jsonify(body)

	"""
	Support for ElevenLabs and Azure AI Speech
	(currently in beta)
	"""

	# ElevenLabs-style endpoint; the voice is taken from the URL path, e.g.
	# POST http://localhost:5050/elevenlabs/v1/text-to-speech/en-US-AndrewNeural
	@app.route('/elevenlabs/v1/text-to-speech/<voice_id>', methods=['POST'])
	@require_api_key
	def elevenlabs_tts(voice_id):
	    """Synthesize the ``text`` field of a JSON body with the voice in the URL.

	    Args:
	        voice_id: Voice name taken from the URL path; passed unchanged to
	            ``generate_speech``.

	    Returns:
	        An MP3 attachment named ``speech.mp3`` on success, otherwise a JSON
	        ``{"error": ...}`` body with status 400 (bad request body) or 500
	        (endpoint disabled via EXPAND_API, or speech generation failed).
	    """
	    if not EXPAND_API:
	        # NOTE(review): 500 is kept for backward compatibility; 403/404 would
	        # describe a disabled endpoint more accurately.
	        return jsonify({"error": "Endpoint not allowed"}), 500

	    # Parse the incoming JSON payload.
	    try:
	        payload = request.json
	    except Exception as e:
	        return jsonify({"error": f"Invalid JSON payload: {str(e)}"}), 400

	    # Require a JSON object: with a bare JSON string, `'text' in payload`
	    # would be a substring test and the lookup below would raise TypeError.
	    if not isinstance(payload, dict) or 'text' not in payload:
	        return jsonify({"error": "Missing 'text' in request body"}), 400

	    text = payload['text']
	    if not REMOVE_FILTER:
	        text = prepare_tts_input_with_context(text)

	    # Fixed output settings for this endpoint; speed is not read from the
	    # payload (could be: payload.get('speed', DEFAULT_SPEED)).
	    response_format = 'mp3'
	    speed = DEFAULT_SPEED

	    try:
	        output_file_path = generate_speech(text, voice_id, response_format, speed)
	    except Exception as e:
	        return jsonify({"error": f"TTS generation failed: {str(e)}"}), 500

	    return send_file(
	        output_file_path,
	        mimetype="audio/mpeg",
	        as_attachment=True,
	        download_name="speech.mp3",
	    )

	# Azure AI Speech-style endpoint. The real service lives at
	# https://{region}.tts.speech.microsoft.com/cognitiveservices/v1
	# and is served here as http://localhost:5050/azure/cognitiveservices/v1
	def _extract_ssml_voice(ssml_data):
	    """Return ``(text, voice_name)`` from the first <voice> element of SSML.

	    Args:
	        ssml_data: The SSML document as a string.

	    Returns:
	        A tuple of the element's full text content (including text nested in
	        child elements) and its ``name`` attribute (``None`` if absent).

	    Raises:
	        ValueError: If the document is not well-formed XML, has no <voice>
	            element, or the element contains no text.
	    """
	    from xml.etree import ElementTree as ET

	    # NOTE(review): xml.etree is not hardened against maliciously constructed
	    # XML (see "XML vulnerabilities" in the Python docs); consider a hardened
	    # parser if this endpoint is reachable by untrusted clients.
	    try:
	        root = ET.fromstring(ssml_data)
	    except ET.ParseError as e:
	        raise ValueError(str(e)) from e

	    voice_element = root.find('.//{http://www.w3.org/2001/10/synthesis}voice')
	    if voice_element is None:
	        # Tolerate SSML sent without the synthesis namespace declaration.
	        voice_element = root.find('.//voice')
	    if voice_element is None:
	        raise ValueError("no <voice> element found")

	    # itertext() also collects text nested inside child tags such as
	    # <prosody>; .text alone only sees text before the first child element.
	    text = ''.join(voice_element.itertext())
	    if not text.strip():
	        raise ValueError("<voice> element contains no text")

	    return text, voice_element.get('name')


	@app.route('/azure/cognitiveservices/v1', methods=['POST'])
	@require_api_key
	def azure_tts():
	    """Synthesize the text of an SSML request body.

	    The text and voice name are taken from the first <voice> element of the
	    posted SSML; DEFAULT_VOICE is used when the element has no ``name``.

	    Returns:
	        An MP3 attachment named ``speech.mp3`` on success, otherwise a JSON
	        ``{"error": ...}`` body with status 400 (missing/invalid SSML) or 500
	        (endpoint disabled via EXPAND_API, or speech generation failed).
	    """
	    if not EXPAND_API:
	        # NOTE(review): 500 is kept for backward compatibility; 403/404 would
	        # describe a disabled endpoint more accurately.
	        return jsonify({"error": "Endpoint not allowed"}), 500

	    # Parse the SSML payload. UnicodeDecodeError is a ValueError subclass, so
	    # undecodable bodies are reported by the same handler as bad SSML.
	    try:
	        ssml_data = request.data.decode('utf-8')
	        if not ssml_data:
	            return jsonify({"error": "Missing SSML payload"}), 400
	        text, voice = _extract_ssml_voice(ssml_data)
	    except ValueError as e:
	        return jsonify({"error": f"Invalid SSML payload: {str(e)}"}), 400

	    # A <voice> element without a name falls back to the configured default.
	    voice = voice or DEFAULT_VOICE

	    # Fixed output settings for this endpoint.
	    response_format = 'mp3'
	    speed = DEFAULT_SPEED

	    if not REMOVE_FILTER:
	        text = prepare_tts_input_with_context(text)

	    try:
	        output_file_path = generate_speech(text, voice, response_format, speed)
	    except Exception as e:
	        return jsonify({"error": f"TTS generation failed: {str(e)}"}), 500

	    return send_file(
	        output_file_path,
	        mimetype="audio/mpeg",
	        as_attachment=True,
	        download_name="speech.mp3",
	    )

	# Startup banner; emitted at module level, so it prints whenever this file
	# is run or imported.
	print(
	    " Edge TTS (Free Azure TTS) Replacement for OpenAI's TTS API",
	    " ",
	    " * Serving OpenAI Edge TTS",
	    f" * Server running on http://localhost:{PORT}",
	    f" * TTS Endpoint: http://localhost:{PORT}/v1/audio/speech",
	    " ",
	    sep="\n",
	)

	if __name__ == '__main__':
	    # Serve the Flask app with gevent's WSGI server on all interfaces;
	    # serve_forever() blocks until the process is stopped.
	    http_server = WSGIServer(('0.0.0.0', PORT), app)
	    http_server.serve_forever()