"""Modal app exposing a Kokoro text-to-speech pipeline over HTTP endpoints."""

import modal
import io

# Container image: Debian slim + espeak-ng, plus the Python packages the
# service imports at runtime and the small English spaCy model.
image = modal.Image.debian_slim(python_version="3.12").apt_install("espeak-ng")
image = image.pip_install(
    "kokoro>=0.9.4",
    "soundfile",
    "fastapi[standard]",
    "spacy==3.8.0",
)
image = image.run_commands("python -m spacy download en_core_web_sm")

app = modal.App("kokoro-api", image=image)

# These packages are only expected to exist inside the container image.
with image.imports():
    import os
    from kokoro import KPipeline
    # NOTE(review): IPython is not listed in pip_install above and is unused
    # below -- confirm it is installed transitively or drop this import.
    from IPython.display import display, Audio
    import soundfile as sf
    import torch
    from fastapi.responses import StreamingResponse, Response, FileResponse
    import numpy as np
    import uuid

# Sample rate (Hz) passed to soundfile when encoding the generated audio.
SAMPLE_RATE = 24_000


@app.cls(gpu="t4", scaledown_window=60 * 2, enable_memory_snapshot=True)
@modal.concurrent(max_inputs=30)
class kokoro:
    """Modal class that holds one ``KPipeline`` and serves it over HTTP."""

    @modal.enter()
    def load(self):
        """Create the pipeline once when the container starts."""
        self.pipeline = KPipeline(lang_code="a")

    @modal.fastapi_endpoint(docs=True, method="POST")
    def generate(self, text: str, voice: str = "af_heart"):
        """Synthesize ``text`` with ``voice`` and return it as a WAV download.

        Args:
            text: Text to synthesize. Empty or whitespace-only text is
                rejected with HTTP 400.
            voice: Voice identifier forwarded to the pipeline.

        Returns:
            An ``audio/wav`` response carrying the encoded audio as an
            attachment with a random ``<uuid>.wav`` filename, or an HTTP 400
            response when there is nothing to synthesize.
        """
        if not text or not text.strip():
            return Response(content="Text is empty", status_code=400)

        # The pipeline yields 3-tuples; only the third item (audio) is used.
        chunks = [audio for _, _, audio in self.pipeline(text, voice)]
        if not chunks:
            # np.concatenate raises ValueError on an empty list; report a
            # client error instead of failing the request with a 500.
            return Response(
                content="No audio was generated for the given text",
                status_code=400,
            )
        audio_combined = np.concatenate(chunks)

        # Encode in memory rather than writing "<uuid>.wav" to the working
        # directory, where the file was never removed and piled up per request.
        # soundfile needs an explicit format when given a file-like object.
        audio_bytes = io.BytesIO()
        sf.write(audio_bytes, audio_combined, SAMPLE_RATE, format="WAV")

        file_name = f"{uuid.uuid4()}.wav"
        return Response(
            content=audio_bytes.getvalue(),
            media_type="audio/wav",
            headers={
                "Content-Disposition": f'attachment; filename="{file_name}"',
            },
        )

    @modal.fastapi_endpoint(docs=True, method="GET")
    def wake_up(self):
        """Return a fixed 200 response; calling it brings the container up."""
        return Response(content="Kokoro is awake", status_code=200)