diff --git a/.gitignore b/.gitignore
index 58afa4f..bd0431a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -25,3 +25,4 @@ tmp*
 *.tmp
 *.swp
 *.wav
+.DS_Store
diff --git a/kokoro_ja.py b/kokoro_ja.py
new file mode 100644
index 0000000..7b32416
--- /dev/null
+++ b/kokoro_ja.py
@@ -0,0 +1,72 @@
+"""Synthesize Japanese text to a WAV file with the Kokoro TTS pipeline."""
+import argparse
+import os
+
+import numpy as np
+import soundfile as sf
+from kokoro import KPipeline
+
+# Sample rate in Hz handed to soundfile when the output file is written.
+SAMPLE_RATE = 24000
+
+# Split after sentence-ending punctuation (full-width or ASCII) or a newline
+# to avoid "rushing" long passages. (Kokoro voices often behave best around
+# moderate chunk sizes; see
+# https://huggingface.co/hexgrad/Kokoro-82M/blob/main/VOICES.md)
+SPLIT_PATTERN = r"(?<=[。！？!?\n])\s*"
+
+
+def _to_float32(audio):
+    """Return one audio chunk as a float32 NumPy array.
+
+    The chunk can be a torch.Tensor or already a NumPy array depending on
+    device/backend; a tensor is detached and moved to the CPU first.
+    """
+    if hasattr(audio, "detach"):  # torch.Tensor
+        audio = audio.detach()
+    if hasattr(audio, "cpu"):  # move to CPU if needed
+        audio = audio.cpu()
+    if hasattr(audio, "numpy"):  # torch -> numpy
+        audio = audio.numpy()
+    return np.asarray(audio, dtype=np.float32)
+
+
+def main():
+    """Parse the command line, synthesize the text and write the WAV file.
+
+    Raises:
+        SystemExit: If the pipeline yields no audio.
+    """
+    ap = argparse.ArgumentParser()
+    ap.add_argument("text", help="Japanese text")
+    ap.add_argument("--voice", default="jf_alpha", help="e.g. jf_alpha, jf_tebukuro, jm_kumo ...")
+    ap.add_argument("--speed", type=float, default=1.0, help="1.0 = normal, >1 faster, <1 slower")
+    ap.add_argument("--out", default="out.wav")
+    args = ap.parse_args()
+
+    # lang_code "j" selects the Japanese pipeline; see
+    # https://huggingface.co/hexgrad/Kokoro-82M/blob/938257c07e326d534677886ca13829b39347fff7/README.md
+    pipeline = KPipeline(lang_code="j")
+
+    generator = pipeline(
+        args.text,
+        voice=args.voice,
+        speed=args.speed,
+        split_pattern=SPLIT_PATTERN,
+    )
+    # Chunks that carry no audio are skipped so they cannot break the
+    # concatenation below.
+    audio_parts = [
+        _to_float32(audio) for _, _, audio in generator if audio is not None
+    ]
+
+    if not audio_parts:
+        raise SystemExit("No audio generated (empty input?)")
+
+    audio_all = np.concatenate(audio_parts, axis=0)
+    sf.write(args.out, audio_all, SAMPLE_RATE)
+    print(f"Wrote: {args.out} ({SAMPLE_RATE // 1000}kHz)")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..5b4ed7f
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,6 @@
+kokoro>=0.9.4
+numpy
+soundfile
+misaki[ja]
+unidic
+fugashi
diff --git a/run.sh b/run.sh
new file mode 100755
index 0000000..0caa8c1
--- /dev/null
+++ b/run.sh
@@ -0,0 +1,24 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+python3.11 -m venv .venv
+source .venv/bin/activate
+
+python -m pip install -U pip setuptools wheel
+python -m pip install -r requirements.txt
+
+# Download UniDic only if mecabrc is missing (fast/no-op if already present)
+python - <<'PY'
+import os, sys, subprocess
+import unidic
+
+mecabrc = os.path.join(unidic.DICDIR, "mecabrc")
+if not os.path.exists(mecabrc):
+    print("UniDic not downloaded yet -> downloading (this can be large)...")
+    subprocess.check_call([sys.executable, "-m", "unidic", "download"])
+else:
+    print("UniDic already present:", mecabrc)
+PY
+
+# Run the TTS; let PyTorch fall back to CPU for ops the MPS backend lacks
+PYTORCH_ENABLE_MPS_FALLBACK=1 python kokoro_ja.py "$@"