Disable language detection in corpus building process

This commit is contained in:
2026-03-19 22:29:58 +01:00
parent c5c3209d21
commit 9a92344387
2 changed files with 10 additions and 3 deletions

View File

@@ -769,6 +769,7 @@ def _run_prepare_pipeline(slug: str, on_progress=None, **opts):
out=paths["corpus"],
on_progress=build_progress,
emit="per-file",
lang_detect=False,
)
_mark_pipeline_stage(slug, "build", corpus_signature)
states["has_corpus"] = True
@@ -1087,6 +1088,7 @@ async def build_library(slug: str):
root=stage_dir(slug),
out=_collect_library_paths(slug)["corpus"],
emit="per-file",
lang_detect=False,
stage_signature=payload.get("corpus_signature"),
)
return {"job_id": job_id}

View File

@@ -267,6 +267,11 @@ try:
except Exception:
_ld_detect = None
# Gate for calling langid at runtime: true only when `langid` is not None
# and the interpreter is NOT Python 3.13+ on macOS (sys.platform == "darwin").
# detect_language() checks this flag before calling langid.classify().
# NOTE(review): presumably `langid` is set to None when its import fails, and
# the darwin/3.13 exclusion works around a runtime problem with langid on
# that combination -- the reason is not stated here; confirm and record it.
_LANGID_RUNTIME_SAFE = (
langid is not None and
not (sys.platform == "darwin" and sys.version_info >= (3, 13))
)
# Progress
try:
from tqdm import tqdm
@@ -501,11 +506,11 @@ def detect_language(text: str) -> Optional[str]:
else:
sample = text
try:
if langid is not None:
lang, _ = langid.classify(sample)
return lang
if _ld_detect is not None:
return _ld_detect(sample)
if _LANGID_RUNTIME_SAFE:
lang, _ = langid.classify(sample)
return lang
except Exception:
pass
return None