Spaces:
Paused
Paused
"""Kokoro TTS CLI | |

Example usage:
    python3 -m kokoro --text "The sky above the port was the color of television, tuned to a dead channel." -o file.wav --debug

    echo "Bom dia mundo, como vão vocês" > text.txt
    python3 -m kokoro -i text.txt -l p --voice pm_alex -o audio.wav

Common issues:
    pip not installed: `uv pip install pip`
        (Temporary workaround while https://github.com/explosion/spaCy/issues/13747 is not fixed)
    espeak not installed: `apt-get install espeak-ng`
""" | |
import argparse | |
import wave | |
from pathlib import Path | |
from typing import Generator, TYPE_CHECKING | |
import numpy as np | |
from loguru import logger | |
# Single-letter language codes mapped to the language each one selects.
# The codes are offered as the choices of the CLI's --language option and
# are handed to the pipeline as its ``lang_code``.
_LANGUAGE_NAMES = {
    "a": "American English",
    "b": "British English",
    "h": "Hindi",
    "e": "Spanish",
    "f": "French",
    "i": "Italian",
    "p": "Brazilian Portuguese",
    "j": "Japanese",
    "z": "Mandarin Chinese",
}

# Ordered list of the supported codes (dicts preserve insertion order).
languages = list(_LANGUAGE_NAMES)
if TYPE_CHECKING: | |
from kokoro import KPipeline | |
def generate_audio(
    text: str, kokoro_language: str, voice: str, speed=1
) -> Generator["KPipeline.Result", None, None]:
    """Synthesize ``text`` and yield the pipeline's results one at a time.

    This is a generator: nothing below (including the mismatch warning and
    the pipeline construction) runs until the caller starts iterating.

    Args:
        text: Text to speak; the pipeline is told to split it on runs of
            newlines.
        kokoro_language: Language code passed to the pipeline as ``lang_code``.
        voice: Voice name. A warning is logged when it does not start with
            ``kokoro_language``.
        speed: Speed value forwarded unchanged to the pipeline.

    Yields:
        Each result produced by the pipeline call, in order.
    """
    # Function-scope import; presumably keeps module import cheap when no
    # audio is generated -- NOTE(review): confirm the motivation.
    from kokoro import KPipeline

    voice_matches_language = voice.startswith(kokoro_language)
    if not voice_matches_language:
        logger.warning(f"Voice {voice} is not made for language {kokoro_language}")

    tts = KPipeline(lang_code=kokoro_language)
    yield from tts(text, voice=voice, speed=speed, split_pattern=r"\n+")
def generate_and_save_audio(
    output_file: Path, text: str, kokoro_language: str, voice: str, speed=1
) -> None:
    """Synthesize ``text`` and stream it into a mono 16-bit 24 kHz WAV file.

    Results are written to the file as they are produced, so the whole
    utterance is never held in memory at once.

    Args:
        output_file: Destination path; it is resolved and opened for writing.
        text: Text to speak.
        kokoro_language: Language code forwarded to ``generate_audio``.
        voice: Voice name forwarded to ``generate_audio``.
        speed: Speed value forwarded to ``generate_audio``.
    """
    with wave.open(str(output_file.resolve()), "wb") as wav_file:
        wav_file.setnchannels(1)  # Mono audio
        wav_file.setsampwidth(2)  # 2 bytes per sample (16-bit audio)
        wav_file.setframerate(24000)  # Sample rate
        for result in generate_audio(
            text, kokoro_language=kokoro_language, voice=voice, speed=speed
        ):
            logger.debug(result.phonemes)
            if result.audio is None:
                # Nothing was synthesized for this result; skip it.
                continue
            # Assumes result.audio is a float tensor nominally in [-1, 1]
            # -- TODO confirm against the pipeline's output type.
            # Clip before scaling: a sample even slightly outside [-1, 1]
            # would otherwise wrap around when cast to int16 and produce a
            # loud click.
            samples = np.clip(result.audio.numpy(), -1.0, 1.0)
            # WAV PCM data is little-endian; "<i2" makes that explicit instead
            # of relying on the host's native byte order.
            audio_bytes = (samples * 32767).astype("<i2").tobytes()
            wav_file.writeframes(audio_bytes)
def main() -> None:
    """Command-line entry point: parse arguments, read the text, write a WAV.

    The text comes from exactly one of ``--text``, ``--input-file`` or, when
    neither is given, standard input. The language defaults to the first
    character of the voice name.

    Raises:
        SystemExit: Raised by argparse on invalid usage, including when both
            ``--text`` and ``--input-file`` are given or ``--voice`` is empty.
    """
    import sys

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-m",
        "--voice",
        default="af_heart",
        help="Voice to use",
    )
    parser.add_argument(
        "-l",
        "--language",
        help="Language to use (defaults to the one corresponding to the voice)",
        choices=languages,
    )
    parser.add_argument(
        "-o",
        "--output-file",
        "--output_file",
        type=Path,
        help="Path to output WAV file",
        required=True,
    )
    parser.add_argument(
        "-i",
        "--input-file",
        "--input_file",
        type=Path,
        help="Path to input text file (default: stdin)",
    )
    parser.add_argument(
        "-t",
        "--text",
        help="Text to use instead of reading from stdin",
    )
    parser.add_argument(
        "-s",
        "--speed",
        type=float,
        default=1.0,
        help="Speech speed",
    )
    parser.add_argument(
        "--debug",
        action="store_true",
        help="Print DEBUG messages to console",
    )
    args = parser.parse_args()

    # logger.level("DEBUG") only looks up a level definition; it does not
    # change what the sinks emit, so --debug used to have no effect.
    # Replace the sink with one whose threshold follows the flag.
    logger.remove()
    logger.add(sys.stderr, level="DEBUG" if args.debug else "INFO")
    logger.debug(args)

    if not args.voice:
        # An empty voice would make the language lookup below raise IndexError.
        parser.error("--voice must not be empty")
    lang = args.language or args.voice[0]

    if args.text is not None and args.input_file is not None:
        # Report as a usage error (message + exit status 2), not a traceback.
        parser.error("You cannot specify both 'text' and 'input_file'")

    if args.text is not None:
        # "is not None" so that an explicit empty --text is honoured instead
        # of silently falling through to reading stdin.
        text = args.text
    elif args.input_file is not None:
        input_path: Path = args.input_file
        # Decode as UTF-8 rather than the locale-dependent default encoding.
        text = input_path.read_text(encoding="utf-8")
    else:
        if sys.stdin.isatty():
            # Only prompt a human; piped input needs no instructions.
            print("Press Ctrl+D to stop reading input and start generating", flush=True)
        # Lines read from stdin keep their own newline, so joining them with
        # "\n" would double every line break; read the stream verbatim.
        text = sys.stdin.read()
    logger.debug(f"Input text: {text!r}")

    out_file: Path = args.output_file
    if out_file.suffix.lower() != ".wav":
        logger.warning("The output file name should end with .wav")

    generate_and_save_audio(
        output_file=out_file,
        text=text,
        kokoro_language=lang,
        voice=args.voice,
        speed=args.speed,
    )
# Run the CLI only when this file is executed as a script or with ``-m``,
# not when it is imported.
if __name__ == "__main__":
    main()