From ac97e6f785d9bbc485aa8ad9cca442d8c1f6a798 Mon Sep 17 00:00:00 2001 From: Victor Giers Date: Sun, 30 Nov 2025 15:05:24 +0100 Subject: [PATCH] Adjust text extraction logic in websearch.py --- websearch.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/websearch.py b/websearch.py index f32c5fb..71bbdc5 100644 --- a/websearch.py +++ b/websearch.py @@ -279,11 +279,18 @@ def _extract_text(html: str, *, max_len: int = 120_000) -> str: cleaned = _clean_lines(best_text) # If too short, fall back to broader body text instead of dropping the page - if len(cleaned) < 300: + if len(cleaned) < 200: broader = _clean_lines(_norm(soup.body or soup)) if len(broader) > len(cleaned): cleaned = broader + # Absolute fallback to regex-based stripping if soup path failed + if len(cleaned.strip()) < 120: + txt = re.sub(r"<\s*(script|style)[^>]*>.*?<\s*/\s*\1\s*>", " ", html, flags=re.S|re.I) + txt = re.sub(r"<[^>]+>", " ", txt) + txt = re.sub(r"\s+", " ", txt) + cleaned = txt.strip() + if not cleaned.strip(): return "" if len(cleaned) > max_len: