File size: 4,089 Bytes
30f8a30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
"""Kokoro TTS CLI
Example usage:
python3 -m kokoro --text "The sky above the port was the color of television, tuned to a dead channel." -o file.wav --debug

echo "Bom dia mundo, como vão vocês" > text.txt
python3 -m kokoro -i text.txt -l p --voice pm_alex > audio.wav

Common issues:
pip not installed: `uv pip install pip`
(Temporary workaround while https://github.com/explosion/spaCy/issues/13747 is not fixed)

espeak not installed: `apt-get install espeak-ng`
"""

import argparse
import wave
from pathlib import Path
from typing import Generator, TYPE_CHECKING

import numpy as np
from loguru import logger

languages = [
    "a",  # American English
    "b",  # British English
    "h",  # Hindi
    "e",  # Spanish
    "f",  # French
    "i",  # Italian
    "p",  # Brazilian Portuguese
    "j",  # Japanese
    "z",  # Mandarin Chinese
]

if TYPE_CHECKING:
    from kokoro import KPipeline


def generate_audio(
    text: str, kokoro_language: str, voice: str, speed=1
) -> Generator["KPipeline.Result", None, None]:
    from kokoro import KPipeline

    if not voice.startswith(kokoro_language):
        logger.warning(f"Voice {voice} is not made for language {kokoro_language}")
    pipeline = KPipeline(lang_code=kokoro_language)
    yield from pipeline(text, voice=voice, speed=speed, split_pattern=r"\n+")


def generate_and_save_audio(
    output_file: Path, text: str, kokoro_language: str, voice: str, speed=1
) -> None:
    with wave.open(str(output_file.resolve()), "wb") as wav_file:
        wav_file.setnchannels(1)  # Mono audio
        wav_file.setsampwidth(2)  # 2 bytes per sample (16-bit audio)
        wav_file.setframerate(24000)  # Sample rate

        for result in generate_audio(
            text, kokoro_language=kokoro_language, voice=voice, speed=speed
        ):
            logger.debug(result.phonemes)
            if result.audio is None:
                continue
            audio_bytes = (result.audio.numpy() * 32767).astype(np.int16).tobytes()
            wav_file.writeframes(audio_bytes)


def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-m",
        "--voice",
        default="af_heart",
        help="Voice to use",
    )
    parser.add_argument(
        "-l",
        "--language",
        help="Language to use (defaults to the one corresponding to the voice)",
        choices=languages,
    )
    parser.add_argument(
        "-o",
        "--output-file",
        "--output_file",
        type=Path,
        help="Path to output WAV file",
        required=True,
    )
    parser.add_argument(
        "-i",
        "--input-file",
        "--input_file",
        type=Path,
        help="Path to input text file (default: stdin)",
    )
    parser.add_argument(
        "-t",
        "--text",
        help="Text to use instead of reading from stdin",
    )
    parser.add_argument(
        "-s",
        "--speed",
        type=float,
        default=1.0,
        help="Speech speed",
    )
    parser.add_argument(
        "--debug",
        action="store_true",
        help="Print DEBUG messages to console",
    )
    args = parser.parse_args()
    if args.debug:
        logger.level("DEBUG")
    logger.debug(args)

    lang = args.language or args.voice[0]

    if args.text is not None and args.input_file is not None:
        raise Exception("You cannot specify both 'text' and 'input_file'")
    elif args.text:
        text = args.text
    elif args.input_file:
        file: Path = args.input_file
        text = file.read_text()
    else:
        import sys
        print("Press Ctrl+D to stop reading input and start generating", flush=True)
        text = '\n'.join(sys.stdin)

    logger.debug(f"Input text: {text!r}")

    out_file: Path = args.output_file
    if not out_file.suffix == ".wav":
        logger.warning("The output file name should end with .wav")
    generate_and_save_audio(
        output_file=out_file,
        text=text,
        kokoro_language=lang,
        voice=args.voice,
        speed=args.speed,
    )


if __name__ == "__main__":
    main()