Disable language detection in corpus building process
This commit is contained in:
@@ -769,6 +769,7 @@ def _run_prepare_pipeline(slug: str, on_progress=None, **opts):
|
||||
out=paths["corpus"],
|
||||
on_progress=build_progress,
|
||||
emit="per-file",
|
||||
lang_detect=False,
|
||||
)
|
||||
_mark_pipeline_stage(slug, "build", corpus_signature)
|
||||
states["has_corpus"] = True
|
||||
@@ -1087,6 +1088,7 @@ async def build_library(slug: str):
|
||||
root=stage_dir(slug),
|
||||
out=_collect_library_paths(slug)["corpus"],
|
||||
emit="per-file",
|
||||
lang_detect=False,
|
||||
stage_signature=payload.get("corpus_signature"),
|
||||
)
|
||||
return {"job_id": job_id}
|
||||
|
||||
@@ -267,6 +267,11 @@ try:
|
||||
except Exception:
|
||||
_ld_detect = None
|
||||
|
||||
# Flag consulted before calling `langid.classify`: True only when `langid` is
# not None and the interpreter is not Python 3.13+ running on macOS ("darwin").
# NOTE(review): the reason for excluding macOS + Python 3.13+ is not visible
# here -- presumably a langid runtime incompatibility; confirm.
_LANGID_RUNTIME_SAFE = langid is not None and (
    sys.platform != "darwin" or sys.version_info < (3, 13)
)
|
||||
|
||||
# Progress
|
||||
try:
|
||||
from tqdm import tqdm
|
||||
@@ -501,11 +506,11 @@ def detect_language(text: str) -> Optional[str]:
|
||||
else:
|
||||
sample = text
|
||||
try:
|
||||
if langid is not None:
|
||||
lang, _ = langid.classify(sample)
|
||||
return lang
|
||||
if _ld_detect is not None:
|
||||
return _ld_detect(sample)
|
||||
if _LANGID_RUNTIME_SAFE:
|
||||
lang, _ = langid.classify(sample)
|
||||
return lang
|
||||
except Exception:
|
||||
pass
|
||||
return None
|
||||
|
||||
Reference in New Issue
Block a user