Add binary path resolution and availability checks in corpus_builder.py

This commit is contained in:
2026-03-19 23:22:07 +01:00
parent 7bec9cdc67
commit fb37f8cc00

View File

@@ -417,6 +417,15 @@ def read_text_file(path: Path) -> str:
def run_cmd(cmd: List[str], *, cwd: Optional[Path] = None, env: Optional[Dict[str, str]] = None) -> subprocess.CompletedProcess: def run_cmd(cmd: List[str], *, cwd: Optional[Path] = None, env: Optional[Dict[str, str]] = None) -> subprocess.CompletedProcess:
return subprocess.run(cmd, cwd=str(cwd) if cwd else None, env=env, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) return subprocess.run(cmd, cwd=str(cwd) if cwd else None, env=env, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
def resolve_binary_path(explicit: Optional[str], env_name: str, binary_name: str, fallback: str) -> str:
candidate = explicit or os.environ.get(env_name) or shutil.which(binary_name) or fallback
return str(candidate)
def binary_available(binary_path: Optional[str]) -> bool:
if not binary_path:
return False
return Path(binary_path).exists() or shutil.which(binary_path) is not None
def ffprobe_json(ffprobe_bin: str, media_path: Path) -> Optional[Dict]: def ffprobe_json(ffprobe_bin: str, media_path: Path) -> Optional[Dict]:
cmd = [ffprobe_bin, "-v", "error", "-print_format", "json", "-show_format", "-show_streams", str(media_path)] cmd = [ffprobe_bin, "-v", "error", "-print_format", "json", "-show_format", "-show_streams", str(media_path)]
res = run_cmd(cmd) res = run_cmd(cmd)
@@ -1479,7 +1488,22 @@ def process_media(path: Path, args) -> List[Record]:
if whisper_error: if whisper_error:
raise RuntimeError(whisper_error) raise RuntimeError(whisper_error)
probe = ffprobe_json(args.ffprobe, path) ffmpeg_bin = resolve_binary_path(args.ffmpeg, "HEIMGEIST_FFMPEG_PATH", "ffmpeg", "/usr/bin/ffmpeg")
ffprobe_bin = resolve_binary_path(args.ffprobe, "HEIMGEIST_FFPROBE_PATH", "ffprobe", "/usr/bin/ffprobe")
missing_tools = []
if not binary_available(ffmpeg_bin):
missing_tools.append("ffmpeg")
if not binary_available(ffprobe_bin):
missing_tools.append("ffprobe")
if missing_tools:
missing = " and ".join(missing_tools)
raise RuntimeError(
f"Audio/video ingestion requires {missing}. "
"Install the tools system-wide or provide bundled paths via "
"HEIMGEIST_FFMPEG_PATH and HEIMGEIST_FFPROBE_PATH."
)
probe = ffprobe_json(ffprobe_bin, path)
duration_s = None duration_s = None
if probe: if probe:
try: try:
@@ -1491,7 +1515,7 @@ def process_media(path: Path, args) -> List[Record]:
tmpdir = Path(tempfile.mkdtemp(prefix="av_")) tmpdir = Path(tempfile.mkdtemp(prefix="av_"))
wav_path = tmpdir / "audio.wav" wav_path = tmpdir / "audio.wav"
ok = extract_audio_wav(args.ffmpeg, path, wav_path) ok = extract_audio_wav(ffmpeg_bin, path, wav_path)
if not ok or not wav_path.exists(): if not ok or not wav_path.exists():
try: shutil.rmtree(tmpdir) try: shutil.rmtree(tmpdir)
except Exception: pass except Exception: pass
@@ -1500,7 +1524,7 @@ def process_media(path: Path, args) -> List[Record]:
slice_dir = tmpdir / "slices" slice_dir = tmpdir / "slices"
slice_dir.mkdir(parents=True, exist_ok=True) slice_dir.mkdir(parents=True, exist_ok=True)
nslices = max(1, args.num_slices) nslices = max(1, args.num_slices)
slices = slice_audio(wav_path, slice_dir, nslices, args.overlap_sec, args.ffprobe, args.ffmpeg) slices = slice_audio(wav_path, slice_dir, nslices, args.overlap_sec, ffprobe_bin, ffmpeg_bin)
mpw = args.mp_workers or len(slices) mpw = args.mp_workers or len(slices)
device = _resolve_whisper_device(args.whisper_device) device = _resolve_whisper_device(args.whisper_device)
@@ -1625,8 +1649,8 @@ def run_build(root: Path, out: Path, *, on_progress=None, **opts) -> dict:
writer_queue=opts.get("writer_queue", 64), writer_queue=opts.get("writer_queue", 64),
writer_chunk=opts.get("writer_chunk", 256), writer_chunk=opts.get("writer_chunk", 256),
writer_rotate_mb=opts.get("writer_rotate_mb", 0), writer_rotate_mb=opts.get("writer_rotate_mb", 0),
ffmpeg=opts.get("ffmpeg", shutil.which("ffmpeg") or "/usr/bin/ffmpeg"), ffmpeg=opts.get("ffmpeg", resolve_binary_path(None, "HEIMGEIST_FFMPEG_PATH", "ffmpeg", "/usr/bin/ffmpeg")),
ffprobe=opts.get("ffprobe", shutil.which("ffprobe") or "/usr/bin/ffprobe"), ffprobe=opts.get("ffprobe", resolve_binary_path(None, "HEIMGEIST_FFPROBE_PATH", "ffprobe", "/usr/bin/ffprobe")),
tesseract=opts.get("tesseract", shutil.which("tesseract") or "/usr/bin/tesseract"), tesseract=opts.get("tesseract", shutil.which("tesseract") or "/usr/bin/tesseract"),
ebook_convert=opts.get("ebook_convert", shutil.which("ebook-convert")), ebook_convert=opts.get("ebook_convert", shutil.which("ebook-convert")),
pandoc=opts.get("pandoc", shutil.which("pandoc")), pandoc=opts.get("pandoc", shutil.which("pandoc")),