auto-git:

[change] websearch.py
2025-11-30 15:02:12 +01:00
parent 1d33f48f9c
commit b944cde381
1 changed files with 61 additions and 15 deletions
--- a/websearch.py
+++ b/websearch.py
@@ -208,18 +208,42 @@ def _extract_text(html: str, *, max_len: int = 120_000) -> str:
        for tag in soup.select("script,style,noscript,template"):
            tag.decompose()
-        for tag_name in ("header", "footer", "nav", "aside", "form", "iframe", "svg", "canvas", "input", "select", "option", "button"):
+        # Remove cookie/consent overlays by attributes or short text snippets
        for el in list(soup.find_all(True)):
            if el.name in {"html", "body"}:
                continue
            attr_str = " ".join([
                el.get("id") or "",
                " ".join(el.get("class") or []),
                el.get("role") or "",
                el.get("aria-label") or "",
            ]).lower()
            text_preview = (el.get_text(" ", strip=True)[:220] or "").lower()
            if _COOKIE_HINT_RE.search(attr_str) or _COOKIE_HINT_RE.search(text_preview):
                try:
                    el.decompose()
                except Exception:
                    pass
        # Remove clearly decorative/structural regions
        for tag_name in ("header", "footer", "nav", "aside", "form", "iframe", "svg", "canvas"):
            for t in soup.find_all(tag_name):
-                t.decompose()
+                try:
                    t.decompose()
                except Exception:
                    pass
        # Remove elements whose id/class/role clearly mark them as noise
        for el in list(soup.find_all(True)):
            if el.name in {"html", "body"}:
                continue
            attrs = " ".join([
                el.get("id") or "",
                " ".join(el.get("class") or []),
                el.get("role") or "",
                el.get("aria-label") or "",
            ])
-            if _NOISE_ATTR_RE.search(attrs or ""):
+            if _NOISE_ATTR_RE.search(attrs or "") and not _CONTENT_HINT_RE.search(attrs or ""):
                try:
                    el.decompose()
                except Exception:
@@ -231,24 +255,40 @@ def _extract_text(html: str, *, max_len: int = 120_000) -> str:
            txt = re.sub(r"\n{3,}", "\n\n", txt)
            return txt
        def _score_node(node, raw_text: str, attr_str: str) -> float:
            """Return a heuristic main-content score for *node*; higher is better.

            The score starts from the text length, adds length-proportional
            bonuses for semantic tag names and content-hinting attributes,
            subtracts a length-proportional amount for negative-hinting
            attributes, and is finally scaled down by a link-density penalty.

            Args:
                node: Parsed element exposing ``.name`` and ``.find_all``
                    (presumably a BeautifulSoup ``Tag`` -- TODO confirm).
                raw_text: Text extracted from *node*; only its length is used.
                attr_str: String searched by ``_CONTENT_HINT_RE`` and
                    ``_NEGATIVE_HINT_RE`` (the visible caller passes the
                    lowercased id/class/role/aria-label joined by spaces).

            Returns:
                ``0.0`` when *raw_text* is shorter than 80 characters,
                otherwise a positive float.
            """
            length = len(raw_text)
            # Blocks under 80 characters are never considered candidates.
            if length < 80:
                return 0.0
            link_count = len(node.find_all("a"))
            # Links per 80 characters of text; the divisor is floored at 1.0.
            link_density = link_count / max(1.0, length / 80.0)
            bonus = 0.0
            # Tag-name bonus: at most one of these branches applies.
            if node.name == "article":
                bonus += 0.6 * length
            elif node.name == "main":
                bonus += 0.4 * length
            elif node.name == "section":
                bonus += 0.2 * length
            # Attribute hints are independent checks, so both adjustments can
            # apply to the same node.
            if _CONTENT_HINT_RE.search(attr_str):
                bonus += 0.35 * length
            if _NEGATIVE_HINT_RE.search(attr_str):
                bonus -= 0.25 * length
            # Cap the penalty at 0.9 so a link-heavy block keeps at least 10%
            # of its score instead of dropping to zero.
            penalty = min(0.9, link_density * 0.6)
            return (length + bonus) * (1.0 - penalty)
        # Score candidate blocks to pick main content
        best_text = ""
        best_score = 0.0
        for node in soup.find_all(["article", "main", "section", "div", "body"]):
            raw = _norm(node)
-            if not raw or len(raw) < 80:
+            if not raw:
                continue
-            link_count = len(node.find_all("a"))
+            attr_str = " ".join([
-            link_density = link_count / max(1.0, len(raw) / 80.0)
+                node.get("id") or "",
-            penalty = min(0.9, link_density)
+                " ".join(node.get("class") or []),
-            bonus = 1.0
+                node.get("role") or "",
-            if node.name == "article":
+                node.get("aria-label") or "",
-                bonus += 0.35
+            ]).lower()
-            elif node.name == "main":
+            score = _score_node(node, raw, attr_str)
                bonus += 0.25
            elif node.name == "section":
                bonus += 0.1
            score = len(raw) * bonus * (1.0 - penalty)
            if score > best_score:
                best_score = score
                best_text = raw
@@ -259,6 +299,12 @@ def _extract_text(html: str, *, max_len: int = 120_000) -> str:
            best_text = _norm(target)
        cleaned = _clean_lines(best_text)
        # If too short, fall back to broader body text instead of dropping the page
        if len(cleaned) < 300:
            broader = _clean_lines(_norm(soup.body or soup))
            if len(broader) > len(cleaned):
                cleaned = broader
        if not cleaned.strip():
            return ""
        if len(cleaned) > max_len: