Add kokoro_ja.py, requirements.txt, and run.sh; update .gitignore

2026-01-26 07:54:22 +01:00
parent 9b3b1c478a
commit 31a9cdc26d
4 changed files with 72 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -25,3 +25,4 @@ tmp*
 *.tmp
 *.swp
 *.wav
+.DS_Store
--- a/kokoro_ja.py
+++ b/kokoro_ja.py
@@ -0,0 +1,42 @@
+import argparse, os
+import numpy as np
+import soundfile as sf
+from kokoro import KPipeline
+
+def main():  # CLI entry point: parse arguments, synthesize speech with Kokoro, write a single WAV file
+    ap = argparse.ArgumentParser()
+    ap.add_argument("text", help="Japanese text")
+    ap.add_argument("--voice", default="jf_alpha", help="e.g. jf_alpha, jf_tebukuro, jm_kumo ...")
+    ap.add_argument("--speed", type=float, default=1.0, help="1.0 = normal, >1 faster, <1 slower")
+    ap.add_argument("--out", default="out.wav")
+    args = ap.parse_args()
+
+    # Build the Kokoro pipeline for Japanese
+    pipeline = KPipeline(lang_code="j")  # Japanese; see https://huggingface.co/hexgrad/Kokoro-82M/blob/938257c07e326d534677886ca13829b39347fff7/README.md
+
+    # Split after Japanese sentence-ending punctuation (。！？) or a newline to avoid "rushing" long passages.
+    # (Kokoro voices often behave best around moderate chunk sizes; see https://huggingface.co/hexgrad/Kokoro-82M/blob/main/VOICES.md)
+    split_pattern = r"(?<=[。！？\n])\s*"  # lookbehind matches right after the delimiter; only trailing whitespace is consumed
+
+    audio_parts = []  # one float32 array per generated chunk
+    generator = pipeline(args.text, voice=args.voice, speed=args.speed, split_pattern=split_pattern)
+    for _, _, audio in generator:  # each item unpacks to three values; only the third (audio) is used
+        # audio can be a torch.Tensor or already a numpy array depending on device/backend
+        if hasattr(audio, "detach"):  # torch.Tensor
+            audio = audio.detach()
+        if hasattr(audio, "cpu"):     # move to CPU if needed
+            audio = audio.cpu()
+        if hasattr(audio, "numpy"):   # torch -> numpy
+            audio = audio.numpy()
+
+        audio_parts.append(np.asarray(audio, dtype=np.float32))
+
+    if not audio_parts:  # the pipeline yielded nothing, so there is nothing to write
+        raise SystemExit("No audio generated (empty input?)")
+
+    audio_all = np.concatenate(audio_parts, axis=0)  # join the chunks end to end, in generation order
+    sf.write(args.out, audio_all, 24000)  # 24000 = sample rate in Hz, matching the message below
+    print(f"Wrote: {args.out} (24kHz)")
+
+if __name__ == "__main__":  # run the CLI only when executed as a script, not when imported
+    main()
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,5 @@
+kokoro>=0.9.4
+soundfile
+misaki[ja]
+unidic
+fugashi
--- a/run.sh
+++ b/run.sh
@@ -0,0 +1,24 @@
+#!/usr/bin/env bash
+set -euo pipefail  # abort on command failure, unset variables, or a failing pipeline stage
+
+python3.11 -m venv .venv  # create the virtual environment in ./.venv
+source .venv/bin/activate  # from here on, `python` refers to the venv interpreter
+
+python -m pip install -U pip setuptools wheel  # upgrade packaging tools before installing dependencies
+python -m pip install -r requirements.txt
+
+# Download the UniDic dictionary only if its mecabrc file is missing (fast/no-op if already present)
+python - <<'PY'
+import os, sys, subprocess
+import unidic
+
+mecabrc = os.path.join(unidic.DICDIR, "mecabrc")
+if not os.path.exists(mecabrc):
+    print("UniDic not downloaded yet -> downloading (this can be large)...")
+    subprocess.check_call([sys.executable, "-m", "unidic", "download"])
+else:
+    print("UniDic already present:", mecabrc)
+PY
+
+# Run the TTS script, forwarding all command-line arguments, with PyTorch's MPS fallback enabled
+PYTORCH_ENABLE_MPS_FALLBACK=1 python kokoro_ja.py "$@"