"""Synthesize Japanese speech from text with Kokoro and save it as a WAV file."""

import argparse
import os

import numpy as np
import soundfile as sf
from kokoro import KPipeline

# Sample rate (Hz) passed to soundfile when writing the output file.
SAMPLE_RATE = 24000

# Split after sentence-ending punctuation (and newlines) so long passages are
# synthesized in moderate-sized chunks instead of being "rushed".
# Both ASCII (!?) and full-width (!?) marks are listed because Japanese text
# normally uses the full-width forms.
# Reference: https://huggingface.co/hexgrad/Kokoro-82M/blob/main/VOICES.md
SPLIT_PATTERN = r"(?<=[。!?!?\n])\s*"


def _to_float32_array(audio):
    """Convert one audio chunk (torch.Tensor or array-like) to a float32 ndarray."""
    # The chunk may be a torch.Tensor or already a NumPy array depending on
    # the device/backend, so probe for the tensor methods instead of
    # importing torch here.
    if hasattr(audio, "detach"):  # torch.Tensor: drop autograd tracking
        audio = audio.detach()
    if hasattr(audio, "cpu"):  # move to host memory if needed
        audio = audio.cpu()
    if hasattr(audio, "numpy"):  # torch -> numpy
        audio = audio.numpy()
    return np.asarray(audio, dtype=np.float32)


def main() -> None:
    """Parse CLI arguments, synthesize the given text and write a WAV file.

    Raises:
        SystemExit: if the arguments are invalid or no audio was generated.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("text", help="Japanese text")
    ap.add_argument(
        "--voice",
        default="jf_alpha",
        help="e.g. jf_alpha, jf_tebukuro, jm_kumo ...",
    )
    ap.add_argument(
        "--speed",
        type=float,
        default=1.0,
        help="1.0 = normal, >1 faster, <1 slower",
    )
    ap.add_argument("--out", default="out.wav")
    args = ap.parse_args()

    # A zero or negative speed multiplier is meaningless; fail early with a
    # clear usage error instead of a failure deep inside the pipeline.
    if args.speed <= 0:
        ap.error("--speed must be greater than 0")

    # Japanese pipeline (lang_code "j").
    # Reference: https://huggingface.co/hexgrad/Kokoro-82M/blob/938257c07e326d534677886ca13829b39347fff7/README.md
    pipeline = KPipeline(lang_code="j")

    generator = pipeline(
        args.text,
        voice=args.voice,
        speed=args.speed,
        split_pattern=SPLIT_PATTERN,
    )

    audio_parts = []
    for _, _, audio in generator:
        # NOTE(review): presumably the pipeline can yield a chunk without
        # audio. np.asarray(None) would give a 0-d array, which
        # np.concatenate rejects, so skip such chunks defensively.
        if audio is None:
            continue
        audio_parts.append(_to_float32_array(audio))

    if not audio_parts:
        raise SystemExit("No audio generated (empty input?)")

    audio_all = np.concatenate(audio_parts, axis=0)
    sf.write(args.out, audio_all, SAMPLE_RATE)
    print(f"Wrote: {args.out} ({SAMPLE_RATE // 1000}kHz)")


if __name__ == "__main__":
    main()