diff --git a/websearch.py b/websearch.py
index eb9c52f..f32c5fb 100644
--- a/websearch.py
+++ b/websearch.py
@@ -208,7 +208,7 @@ def _extract_text(html: str, *, max_len: int = 120_000) -> str:
     for tag in soup.select("script,style,noscript,template"):
         tag.decompose()
 
-    # Remove cookie/consent overlays by attributes or short text snippets
+    # Remove cookie/consent overlays by attributes or short text snippets (conservative)
     for el in list(soup.find_all(True)):
         if el.name in {"html", "body"}:
             continue
@@ -219,35 +219,14 @@ def _extract_text(html: str, *, max_len: int = 120_000) -> str:
             el.get("aria-label") or "",
         ]).lower()
         text_preview = (el.get_text(" ", strip=True)[:220] or "").lower()
-        if _COOKIE_HINT_RE.search(attr_str) or _COOKIE_HINT_RE.search(text_preview):
-            try:
-                el.decompose()
-            except Exception:
-                pass
-
-    # Remove clearly decorative/structural regions
-    for tag_name in ("header", "footer", "nav", "aside", "form", "iframe", "svg", "canvas"):
-        for t in soup.find_all(tag_name):
-            try:
-                t.decompose()
-            except Exception:
-                pass
-
-    # Remove elements whose id/class/role clearly mark them as noise
-    for el in list(soup.find_all(True)):
-        if el.name in {"html", "body"}:
-            continue
-        attrs = " ".join([
-            el.get("id") or "",
-            " ".join(el.get("class") or []),
-            el.get("role") or "",
-            el.get("aria-label") or "",
-        ])
-        if _NOISE_ATTR_RE.search(attrs or "") and not _CONTENT_HINT_RE.search(attrs or ""):
-            try:
-                el.decompose()
-            except Exception:
-                pass
+        if _COOKIE_HINT_RE.search(attr_str) or _COOKIE_HINT_RE.search(text_preview):
+            # Only remove if it looks like a small overlay/dialog, not the main body.
+            full_txt = " ".join(el.get_text(" ", strip=True).split())
+            if len(text_preview) <= 260 or len(full_txt) <= 800:
+                try:
+                    el.decompose()
+                except Exception:
+                    pass
 
     def _norm(node):
         txt = node.get_text("\n", strip=True)
@@ -257,7 +236,7 @@ def _extract_text(html: str, *, max_len: int = 120_000) -> str:
 
     def _score_node(node, raw_text: str, attr_str: str) -> float:
         length = len(raw_text)
-        if length < 80:
+        if length < 40:
             return 0.0
         link_count = len(node.find_all("a"))
         link_density = link_count / max(1.0, length / 80.0)
@@ -271,11 +250,11 @@ def _extract_text(html: str, *, max_len: int = 120_000) -> str:
         if _CONTENT_HINT_RE.search(attr_str):
             bonus += 0.35 * length
         if _NEGATIVE_HINT_RE.search(attr_str):
-            bonus -= 0.25 * length
+            bonus -= 0.2 * length
         penalty = min(0.9, link_density * 0.6)
         return (length + bonus) * (1.0 - penalty)
 
-    # Score candidate blocks to pick main content
+    # Score candidate blocks to pick main content without over-deleting nodes
     best_text = ""
     best_score = 0.0
     for node in soup.find_all(["article", "main", "section", "div", "body"]):
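
Note (not part of the patch): below is a minimal standalone sketch of the conservative overlay removal this change introduces. strip_cookie_overlays and the _COOKIE_HINT_RE pattern are illustrative stand-ins, not the definitions in websearch.py; only the 260-character preview and 800-character full-text thresholds are taken from the diff above.

# Illustrative sketch only: _COOKIE_HINT_RE and strip_cookie_overlays are
# assumed names, not the definitions used in websearch.py.
import re

from bs4 import BeautifulSoup

_COOKIE_HINT_RE = re.compile(r"cookie|consent|gdpr", re.I)  # stand-in hint pattern


def strip_cookie_overlays(html: str) -> str:
    soup = BeautifulSoup(html, "html.parser")
    for el in list(soup.find_all(True)):
        # Skip the document roots and anything already destroyed by an earlier decompose().
        if getattr(el, "_decomposed", False) or el.name in {"html", "body"}:
            continue
        attr_str = " ".join([
            el.get("id") or "",
            " ".join(el.get("class") or []),
            el.get("role") or "",
            el.get("aria-label") or "",
        ]).lower()
        text_preview = (el.get_text(" ", strip=True)[:220] or "").lower()
        if _COOKIE_HINT_RE.search(attr_str) or _COOKIE_HINT_RE.search(text_preview):
            # Drop only overlay/dialog-sized nodes; a long article that merely
            # mentions cookies is left alone.
            full_txt = " ".join(el.get_text(" ", strip=True).split())
            if len(text_preview) <= 260 or len(full_txt) <= 800:
                el.decompose()
    return soup.get_text(" ", strip=True)


if __name__ == "__main__":
    page = (
        '<body><div class="cookie-banner" role="dialog">We use cookies.</div>'
        "<article>" + "Long article text. " * 60 + "</article></body>"
    )
    cleaned = strip_cookie_overlays(page)
    assert "We use cookies" not in cleaned and "Long article text" in cleaned

The key difference from the pre-patch behaviour is the size guard: a hint match alone no longer removes the node.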
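
On the scoring side, the patch lowers the minimum block length from 80 to 40 characters and softens the negative-hint multiplier from 0.25 to 0.2. As a rough worked example under the visible formula (assuming no other bonus terms from the elided lines apply): a 60-character paragraph with one link was previously discarded outright; now link_density = 1 / max(1.0, 60 / 80.0) = 1.0, penalty = min(0.9, 1.0 * 0.6) = 0.6, so it scores (60 + 0) * (1.0 - 0.6) = 24. A 400-character block whose attributes match _NEGATIVE_HINT_RE now loses 0.2 * 400 = 80 from its bonus instead of 100.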