Spaces:
Paused
Paused
File size: 4,089 Bytes
30f8a30 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 |
"""Kokoro TTS CLI
Example usage:
python3 -m kokoro --text "The sky above the port was the color of television, tuned to a dead channel." -o file.wav --debug
echo "Bom dia mundo, como vão vocês" > text.txt
python3 -m kokoro -i text.txt -l p --voice pm_alex -o audio.wav
Common issues:
pip not installed: `uv pip install pip`
(Temporary workaround while https://github.com/explosion/spaCy/issues/13747 is not fixed)
espeak not installed: `apt-get install espeak-ng`
"""
import argparse
import wave
from pathlib import Path
from typing import Generator, TYPE_CHECKING
import numpy as np
from loguru import logger
# Single-letter language codes mapped to the language each one stands for.
# The mapping is kept only for readability; `languages` is the ordered list
# of codes.
_LANGUAGE_NAMES = {
    "a": "American English",
    "b": "British English",
    "h": "Hindi",
    "e": "Spanish",
    "f": "French",
    "i": "Italian",
    "p": "Brazilian Portuguese",
    "j": "Japanese",
    "z": "Mandarin Chinese",
}
languages = list(_LANGUAGE_NAMES)
if TYPE_CHECKING:
from kokoro import KPipeline
def generate_audio(
    text: str, kokoro_language: str, voice: str, speed=1
) -> Generator["KPipeline.Result", None, None]:
    """Lazily run the Kokoro pipeline over ``text`` and yield its results.

    Because this is a generator, nothing below runs until the first item is
    requested: the ``kokoro`` import, the voice/language check and the
    pipeline construction all happen on first iteration.

    Args:
        text: Text to synthesize.
        kokoro_language: Language code handed to ``KPipeline`` as ``lang_code``.
        voice: Voice name. A warning is logged when it does not start with
            ``kokoro_language``; synthesis proceeds regardless.
        speed: Speed value forwarded unchanged to the pipeline call.

    Yields:
        Every item produced by the pipeline call, in order.
    """
    # Imported here rather than at module level, so merely importing this
    # module does not import kokoro.
    from kokoro import KPipeline

    voice_matches_language = voice.startswith(kokoro_language)
    if not voice_matches_language:
        logger.warning(f"Voice {voice} is not made for language {kokoro_language}")

    tts = KPipeline(lang_code=kokoro_language)
    # The split pattern matches runs of one or more newlines.
    segments = tts(text, voice=voice, speed=speed, split_pattern=r"\n+")
    yield from segments
def generate_and_save_audio(
    output_file: Path, text: str, kokoro_language: str, voice: str, speed=1
) -> None:
    """Synthesize ``text`` and stream the audio into a 16-bit mono WAV file.

    The WAV file is opened first and each result from ``generate_audio`` is
    appended as soon as it is produced, so the whole utterance is never held
    in memory at once.

    Args:
        output_file: Destination path; resolved to an absolute path before
            opening. An existing file is overwritten.
        text: Text to synthesize.
        kokoro_language: Language code forwarded to ``generate_audio``.
        voice: Voice name forwarded to ``generate_audio``.
        speed: Speed value forwarded to ``generate_audio``.
    """
    with wave.open(str(output_file.resolve()), "wb") as wav_file:
        wav_file.setnchannels(1)  # Mono audio
        wav_file.setsampwidth(2)  # 2 bytes per sample (16-bit audio)
        wav_file.setframerate(24000)  # Sample rate

        for result in generate_audio(
            text, kokoro_language=kokoro_language, voice=voice, speed=speed
        ):
            logger.debug(result.phonemes)
            if result.audio is None:
                # Nothing to write for this result.
                continue
            # Clamp to [-1, 1] before scaling: a float sample outside that
            # range would overflow the int16 cast and come out as a wrong
            # value (typically wrapped around, heard as a loud click).
            samples = np.clip(result.audio.numpy(), -1.0, 1.0)
            pcm16 = (samples * 32767).astype(np.int16)
            wav_file.writeframes(pcm16.tobytes())
def main() -> None:
    """Command-line entry point: parse arguments and write speech to a WAV file.

    The text to speak comes from exactly one source, checked in this order:
    ``--text``, then ``--input-file``, then standard input. When
    ``--language`` is omitted, the first character of the voice name is used
    as the language code.

    Raises:
        Exception: If both ``--text`` and ``--input-file`` are given.
    """
    import sys

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-m",
        "--voice",
        default="af_heart",
        help="Voice to use",
    )
    parser.add_argument(
        "-l",
        "--language",
        help="Language to use (defaults to the one corresponding to the voice)",
        choices=languages,
    )
    parser.add_argument(
        "-o",
        "--output-file",
        "--output_file",
        type=Path,
        help="Path to output WAV file",
        required=True,
    )
    parser.add_argument(
        "-i",
        "--input-file",
        "--input_file",
        type=Path,
        help="Path to input text file (default: stdin)",
    )
    parser.add_argument(
        "-t",
        "--text",
        help="Text to use instead of reading from stdin",
    )
    parser.add_argument(
        "-s",
        "--speed",
        type=float,
        default=1.0,
        help="Speech speed",
    )
    parser.add_argument(
        "--debug",
        action="store_true",
        help="Print DEBUG messages to console",
    )
    args = parser.parse_args()

    # `logger.level("DEBUG")` only looks up a level definition; it does not
    # change what gets printed. Reinstall the stderr sink with an explicit
    # threshold so that --debug actually controls DEBUG output.
    logger.remove()
    logger.add(sys.stderr, level="DEBUG" if args.debug else "INFO")
    logger.debug(args)

    lang = args.language or args.voice[0]

    if args.text is not None and args.input_file is not None:
        raise Exception("You cannot specify both 'text' and 'input_file'")
    if args.text is not None:
        # `is not None` (rather than truthiness) so an explicit empty --text
        # is honored instead of silently falling through to stdin.
        text = args.text
    elif args.input_file is not None:
        # Decode as UTF-8 explicitly rather than with the locale's default
        # encoding, which would garble non-ASCII text on some platforms.
        text = args.input_file.read_text(encoding="utf-8")
    else:
        print("Press Ctrl+D to stop reading input and start generating", flush=True)
        # Read stdin verbatim. Joining its lines with "\n" would double
        # every newline, because each line already ends with one.
        text = sys.stdin.read()
    logger.debug(f"Input text: {text!r}")

    out_file: Path = args.output_file
    if out_file.suffix != ".wav":
        logger.warning("The output file name should end with .wav")

    generate_and_save_audio(
        output_file=out_file,
        text=text,
        kokoro_language=lang,
        voice=args.voice,
        speed=args.speed,
    )
# Run the CLI only when this file is executed as a script (for example via
# `python3 -m kokoro`), not when it is imported as a module.
if __name__ == "__main__":
    main()
|