WANGP1 / wan /multitalk /kokoro /__main__.py
rahul7star's picture
Migrated from GitHub
30f8a30 verified
"""Kokoro TTS CLI
Example usage:
python3 -m kokoro --text "The sky above the port was the color of television, tuned to a dead channel." -o file.wav --debug
echo "Bom dia mundo, como vão vocês" > text.txt
python3 -m kokoro -i text.txt -l p --voice pm_alex > audio.wav
Common issues:
pip not installed: `uv pip install pip`
(Temporary workaround while https://github.com/explosion/spaCy/issues/13747 is not fixed)
espeak not installed: `apt-get install espeak-ng`
"""
import argparse
import wave
from pathlib import Path
from typing import Generator, TYPE_CHECKING
import numpy as np
from loguru import logger
languages = [
"a", # American English
"b", # British English
"h", # Hindi
"e", # Spanish
"f", # French
"i", # Italian
"p", # Brazilian Portuguese
"j", # Japanese
"z", # Mandarin Chinese
]
if TYPE_CHECKING:
from kokoro import KPipeline
def generate_audio(
text: str, kokoro_language: str, voice: str, speed=1
) -> Generator["KPipeline.Result", None, None]:
from kokoro import KPipeline
if not voice.startswith(kokoro_language):
logger.warning(f"Voice {voice} is not made for language {kokoro_language}")
pipeline = KPipeline(lang_code=kokoro_language)
yield from pipeline(text, voice=voice, speed=speed, split_pattern=r"\n+")
def generate_and_save_audio(
output_file: Path, text: str, kokoro_language: str, voice: str, speed=1
) -> None:
with wave.open(str(output_file.resolve()), "wb") as wav_file:
wav_file.setnchannels(1) # Mono audio
wav_file.setsampwidth(2) # 2 bytes per sample (16-bit audio)
wav_file.setframerate(24000) # Sample rate
for result in generate_audio(
text, kokoro_language=kokoro_language, voice=voice, speed=speed
):
logger.debug(result.phonemes)
if result.audio is None:
continue
audio_bytes = (result.audio.numpy() * 32767).astype(np.int16).tobytes()
wav_file.writeframes(audio_bytes)
def main() -> None:
parser = argparse.ArgumentParser()
parser.add_argument(
"-m",
"--voice",
default="af_heart",
help="Voice to use",
)
parser.add_argument(
"-l",
"--language",
help="Language to use (defaults to the one corresponding to the voice)",
choices=languages,
)
parser.add_argument(
"-o",
"--output-file",
"--output_file",
type=Path,
help="Path to output WAV file",
required=True,
)
parser.add_argument(
"-i",
"--input-file",
"--input_file",
type=Path,
help="Path to input text file (default: stdin)",
)
parser.add_argument(
"-t",
"--text",
help="Text to use instead of reading from stdin",
)
parser.add_argument(
"-s",
"--speed",
type=float,
default=1.0,
help="Speech speed",
)
parser.add_argument(
"--debug",
action="store_true",
help="Print DEBUG messages to console",
)
args = parser.parse_args()
if args.debug:
logger.level("DEBUG")
logger.debug(args)
lang = args.language or args.voice[0]
if args.text is not None and args.input_file is not None:
raise Exception("You cannot specify both 'text' and 'input_file'")
elif args.text:
text = args.text
elif args.input_file:
file: Path = args.input_file
text = file.read_text()
else:
import sys
print("Press Ctrl+D to stop reading input and start generating", flush=True)
text = '\n'.join(sys.stdin)
logger.debug(f"Input text: {text!r}")
out_file: Path = args.output_file
if not out_file.suffix == ".wav":
logger.warning("The output file name should end with .wav")
generate_and_save_audio(
output_file=out_file,
text=text,
kokoro_language=lang,
voice=args.voice,
speed=args.speed,
)
if __name__ == "__main__":
main()