Spaces:

rahul7star
/

WANGP1

Paused

App Files Files Community

WANGP1 / wan /multitalk /kokoro /__main__.py

rahul7star

Migrated from GitHub

30f8a30 verified 12 days ago

raw

history blame contribute delete

4.09 kB

	"""Kokoro TTS CLI
	Example usage:
	python3 -m kokoro --text "The sky above the port was the color of television, tuned to a dead channel." -o file.wav --debug

	echo "Bom dia mundo, como vão vocês" > text.txt
	python3 -m kokoro -i text.txt -l p --voice pm_alex -o audio.wav

	Common issues:
	pip not installed: `uv pip install pip`
	(Temporary workaround while https://github.com/explosion/spaCy/issues/13747 is not fixed)

	espeak not installed: `apt-get install espeak-ng`
	"""

	import argparse
	import wave
	from pathlib import Path
	from typing import Generator, TYPE_CHECKING

	import numpy as np
	from loguru import logger

	# One-letter language codes offered as the valid choices for ``--language``.
	languages = [
	    "a",  # American English
	    "b",  # British English
	    "h",  # Hindi
	    "e",  # Spanish
	    "f",  # French
	    "i",  # Italian
	    "p",  # Brazilian Portuguese
	    "j",  # Japanese
	    "z",  # Mandarin Chinese
	]

	if TYPE_CHECKING:
	from kokoro import KPipeline


	def generate_audio(
	    text: str,
	    kokoro_language: str,
	    voice: str,
	    speed=1,
	) -> Generator["KPipeline.Result", None, None]:
	    """Yield the results produced by a Kokoro pipeline for ``text``.

	    Args:
	        text: The text to synthesize.
	        kokoro_language: Language code handed to ``KPipeline`` as ``lang_code``.
	        voice: Voice name forwarded to the pipeline. A warning is logged when
	            it does not start with ``kokoro_language``.
	        speed: Speed value forwarded unchanged to the pipeline.

	    Yields:
	        Each result object produced by the pipeline call, in order.
	    """
	    # Imported here rather than at module level, so merely importing this
	    # module does not import ``kokoro`` (the top-level import is
	    # TYPE_CHECKING-only).
	    from kokoro import KPipeline

	    voice_matches_language = voice.startswith(kokoro_language)
	    if not voice_matches_language:
	        logger.warning(f"Voice {voice} is not made for language {kokoro_language}")

	    tts = KPipeline(lang_code=kokoro_language)
	    # split_pattern is passed through as-is; presumably the pipeline splits
	    # the text on runs of newlines -- confirm against KPipeline's docs.
	    yield from tts(text, voice=voice, speed=speed, split_pattern=r"\n+")


	def generate_and_save_audio(
	    output_file: Path, text: str, kokoro_language: str, voice: str, speed=1
	) -> None:
	    """Synthesize ``text`` and write it to ``output_file`` as 16-bit mono WAV.

	    Results are pulled from ``generate_audio`` one at a time and appended to
	    the file as they arrive, so the whole utterance is never held in memory.

	    Args:
	        output_file: Destination path; it is resolved and opened for writing.
	        text: The text to synthesize.
	        kokoro_language: Language code forwarded to ``generate_audio``.
	        voice: Voice name forwarded to ``generate_audio``.
	        speed: Speed value forwarded to ``generate_audio``.
	    """
	    with wave.open(str(output_file.resolve()), "wb") as wav_file:
	        wav_file.setnchannels(1)  # Mono audio
	        wav_file.setsampwidth(2)  # 2 bytes per sample (16-bit audio)
	        # Sample rate written to the header; assumes the pipeline emits
	        # 24 kHz audio -- TODO confirm against the model.
	        wav_file.setframerate(24000)

	        for result in generate_audio(
	            text, kokoro_language=kokoro_language, voice=voice, speed=speed
	        ):
	            logger.debug(result.phonemes)
	            if result.audio is None:
	                # Nothing to write for this result.
	                continue
	            # Clamp to [-1, 1] before scaling: a float sample outside that
	            # range would overflow the int16 cast and come out as a loud
	            # click instead of saturating.
	            samples = np.clip(result.audio.numpy(), -1.0, 1.0)
	            audio_bytes = (samples * 32767).astype(np.int16).tobytes()
	            wav_file.writeframes(audio_bytes)


	def main() -> None:
	    """Command-line entry point: parse arguments, read text, write a WAV file.

	    The text to synthesize is taken from ``--text``, else from
	    ``--input-file``, else from standard input. The language defaults to the
	    first character of the voice name when ``--language`` is not given.

	    Raises:
	        ValueError: If both ``--text`` and ``--input-file`` are supplied.
	    """
	    import sys

	    parser = argparse.ArgumentParser()
	    parser.add_argument(
	        "-m",
	        "--voice",
	        default="af_heart",
	        help="Voice to use",
	    )
	    parser.add_argument(
	        "-l",
	        "--language",
	        help="Language to use (defaults to the one corresponding to the voice)",
	        choices=languages,
	    )
	    parser.add_argument(
	        "-o",
	        "--output-file",
	        "--output_file",
	        type=Path,
	        help="Path to output WAV file",
	        required=True,
	    )
	    parser.add_argument(
	        "-i",
	        "--input-file",
	        "--input_file",
	        type=Path,
	        help="Path to input text file (default: stdin)",
	    )
	    parser.add_argument(
	        "-t",
	        "--text",
	        help="Text to use instead of reading from stdin",
	    )
	    parser.add_argument(
	        "-s",
	        "--speed",
	        type=float,
	        default=1.0,
	        help="Speech speed",
	    )
	    parser.add_argument(
	        "--debug",
	        action="store_true",
	        help="Print DEBUG messages to console",
	    )
	    args = parser.parse_args()

	    # loguru's ``logger.level(name)`` only looks up (or defines) a level; it
	    # does not change what is emitted. Install a stderr handler with an
	    # explicit threshold so that --debug really controls verbosity.
	    logger.remove()
	    logger.add(sys.stderr, level="DEBUG" if args.debug else "INFO")
	    logger.debug(args)

	    # Voice names start with their language code, so the first character is
	    # used as the fallback language.
	    lang = args.language or args.voice[0]

	    if args.text is not None and args.input_file is not None:
	        raise ValueError("You cannot specify both 'text' and 'input_file'")
	    elif args.text:
	        text = args.text
	    elif args.input_file:
	        # Read as UTF-8 explicitly; the platform default encoding would
	        # garble non-ASCII text on some systems.
	        text = args.input_file.read_text(encoding="utf-8")
	    else:
	        print("Press Ctrl+D to stop reading input and start generating", flush=True)
	        # Lines from stdin already end with a newline, so read the stream
	        # verbatim instead of joining lines with an extra "\n".
	        text = sys.stdin.read()

	    logger.debug(f"Input text: {text!r}")

	    out_file: Path = args.output_file
	    if out_file.suffix != ".wav":
	        logger.warning("The output file name should end with .wav")
	    generate_and_save_audio(
	        output_file=out_file,
	        text=text,
	        kokoro_language=lang,
	        voice=args.voice,
	        speed=args.speed,
	    )


	# Run the CLI only when this file is executed as the main module.
	if __name__ == "__main__":
	    main()