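Adjust text extraction logic in websearch.py: lower the short-text fallback threshold from 300 to 200 characters and add a regex-based tag-stripping fallback when the cleaned text is under 120 characters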
This commit is contained in:
@@ -279,11 +279,18 @@ def _extract_text(html: str, *, max_len: int = 120_000) -> str:
|
|||||||
|
|
||||||
cleaned = _clean_lines(best_text)
|
cleaned = _clean_lines(best_text)
|
||||||
# If too short, fall back to broader body text instead of dropping the page
|
# If too short, fall back to broader body text instead of dropping the page
|
||||||
if len(cleaned) < 300:
|
if len(cleaned) < 200:
|
||||||
broader = _clean_lines(_norm(soup.body or soup))
|
broader = _clean_lines(_norm(soup.body or soup))
|
||||||
if len(broader) > len(cleaned):
|
if len(broader) > len(cleaned):
|
||||||
cleaned = broader
|
cleaned = broader
|
||||||
|
|
||||||
|
# Absolute fallback to regex-based stripping if soup path failed
|
||||||
|
if len(cleaned.strip()) < 120:
|
||||||
|
txt = re.sub(r"<\s*(script|style)[^>]*>.*?<\s*/\s*\1\s*>", " ", html, flags=re.S|re.I)
|
||||||
|
txt = re.sub(r"<[^>]+>", " ", txt)
|
||||||
|
txt = re.sub(r"\s+", " ", txt)
|
||||||
|
cleaned = txt.strip()
|
||||||
|
|
||||||
if not cleaned.strip():
|
if not cleaned.strip():
|
||||||
return ""
|
return ""
|
||||||
if len(cleaned) > max_len:
|
if len(cleaned) > max_len:
|
||||||
|
|||||||
Reference in New Issue
Block a user