From ac97e6f785d9bbc485aa8ad9cca442d8c1f6a798 Mon Sep 17 00:00:00 2001
From: Victor Giers <giers@MacBook-Pro.local>
Date: Sun, 30 Nov 2025 15:05:24 +0100
Subject: [PATCH] Lower short-text threshold and add regex fallback in websearch.py

---
 websearch.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/websearch.py b/websearch.py
index f32c5fb..71bbdc5 100644
--- a/websearch.py
+++ b/websearch.py
@@ -279,11 +279,18 @@ def _extract_text(html: str, *, max_len: int = 120_000) -> str:
 
         cleaned = _clean_lines(best_text)
         # If too short, fall back to broader body text instead of dropping the page
-        if len(cleaned) < 300:
+        if len(cleaned) < 200:
             broader = _clean_lines(_norm(soup.body or soup))
             if len(broader) > len(cleaned):
                 cleaned = broader
 
+        # Last resort: if the text is still under 120 chars, strip tags from the raw HTML with regex
+        if len(cleaned.strip()) < 120:
+            txt = re.sub(r"<\s*(script|style)[^>]*>.*?<\s*/\s*\1\s*>", " ", html, flags=re.S|re.I)
+            txt = re.sub(r"<[^>]+>", " ", txt)
+            txt = re.sub(r"\s+", " ", txt)
+            cleaned = txt.strip()
+
         if not cleaned.strip():
             return ""
         if len(cleaned) > max_len: