diff --git a/websearch.py b/websearch.py
index 626d1c8..3e65d2a 100644
--- a/websearch.py
+++ b/websearch.py
@@ -208,18 +208,42 @@ def _extract_text(html: str, *, max_len: int = 120_000) -> str:
     for tag in soup.select("script,style,noscript,template"):
         tag.decompose()
 
-    for tag_name in ("header", "footer", "nav", "aside", "form", "iframe", "svg", "canvas", "input", "select", "option", "button"):
+    # Remove cookie/consent overlays by attributes or short text snippets
+    for el in list(soup.find_all(True)):
+        if el.name in {"html", "body"}:
+            continue
+        attr_str = " ".join([
+            el.get("id") or "",
+            " ".join(el.get("class") or []),
+            el.get("role") or "",
+            el.get("aria-label") or "",
+        ]).lower()
+        text_preview = (el.get_text(" ", strip=True)[:220] or "").lower()
+        if _COOKIE_HINT_RE.search(attr_str) or _COOKIE_HINT_RE.search(text_preview):
+            try:
+                el.decompose()
+            except Exception:
+                pass
+
+    # Remove clearly decorative/structural regions
+    for tag_name in ("header", "footer", "nav", "aside", "form", "iframe", "svg", "canvas"):
         for t in soup.find_all(tag_name):
-            t.decompose()
+            try:
+                t.decompose()
+            except Exception:
+                pass
 
     # Remove elements whose id/class/role clearly mark them as noise
     for el in list(soup.find_all(True)):
+        if el.name in {"html", "body"}:
+            continue
         attrs = " ".join([
             el.get("id") or "",
             " ".join(el.get("class") or []),
             el.get("role") or "",
+            el.get("aria-label") or "",
         ])
-        if _NOISE_ATTR_RE.search(attrs or ""):
+        if _NOISE_ATTR_RE.search(attrs or "") and not _CONTENT_HINT_RE.search(attrs or ""):
             try:
                 el.decompose()
             except Exception:
@@ -231,24 +255,40 @@ def _extract_text(html: str, *, max_len: int = 120_000) -> str:
         txt = re.sub(r"\n{3,}", "\n\n", txt)
         return txt
 
+    def _score_node(node, raw_text: str, attr_str: str) -> float:
+        length = len(raw_text)
+        if length < 80:
+            return 0.0
+        link_count = len(node.find_all("a"))
+        link_density = link_count / max(1.0, length / 80.0)
+        bonus = 0.0
+        if node.name == "article":
+            bonus += 0.6 * length
+        elif node.name == "main":
+            bonus += 0.4 * length
+        elif node.name == "section":
+            bonus += 0.2 * length
+        if _CONTENT_HINT_RE.search(attr_str):
+            bonus += 0.35 * length
+        if _NEGATIVE_HINT_RE.search(attr_str):
+            bonus -= 0.25 * length
+        penalty = min(0.9, link_density * 0.6)
+        return (length + bonus) * (1.0 - penalty)
+
     # Score candidate blocks to pick main content
     best_text = ""
     best_score = 0.0
     for node in soup.find_all(["article", "main", "section", "div", "body"]):
         raw = _norm(node)
-        if not raw or len(raw) < 80:
+        if not raw:
             continue
-        link_count = len(node.find_all("a"))
-        link_density = link_count / max(1.0, len(raw) / 80.0)
-        penalty = min(0.9, link_density)
-        bonus = 1.0
-        if node.name == "article":
-            bonus += 0.35
-        elif node.name == "main":
-            bonus += 0.25
-        elif node.name == "section":
-            bonus += 0.1
-        score = len(raw) * bonus * (1.0 - penalty)
+        attr_str = " ".join([
+            node.get("id") or "",
+            " ".join(node.get("class") or []),
+            node.get("role") or "",
+            node.get("aria-label") or "",
+        ]).lower()
+        score = _score_node(node, raw, attr_str)
         if score > best_score:
             best_score = score
             best_text = raw
@@ -259,6 +299,12 @@ def _extract_text(html: str, *, max_len: int = 120_000) -> str:
         best_text = _norm(target)
 
     cleaned = _clean_lines(best_text)
+    # If too short, fall back to broader body text instead of dropping the page
+    if len(cleaned) < 300:
+        broader = _clean_lines(_norm(soup.body or soup))
+        if len(broader) > len(cleaned):
+            cleaned = broader
+
    if not cleaned.strip():
         return ""
     if len(cleaned) > max_len:
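
Note on dependencies: the new branches use four module-level patterns (`_COOKIE_HINT_RE`, `_CONTENT_HINT_RE`, `_NEGATIVE_HINT_RE`, and the pre-existing `_NOISE_ATTR_RE`) that are defined elsewhere in `websearch.py` and are not part of these hunks. A minimal sketch of what they might look like, assuming plain case-insensitive keyword alternations; the actual patterns in the module may differ:

```python
import re

# Illustrative only: the real patterns live elsewhere in websearch.py.
_COOKIE_HINT_RE = re.compile(r"cookie|consent|gdpr|privacy[-_ ]?(banner|notice)", re.I)
_NOISE_ATTR_RE = re.compile(r"sidebar|comment|share|social|related|promo|advert|banner|popup", re.I)
_CONTENT_HINT_RE = re.compile(r"article|content|main|post|entry|story", re.I)
_NEGATIVE_HINT_RE = re.compile(r"nav|menu|footer|header|widget|breadcrumb|toolbar", re.I)
```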
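
For intuition on the reworked scoring, here is the `_score_node` arithmetic evaluated by hand on two invented candidates: a 2,000-character `<article>` with 10 links and a content-hinting class, and an equally long link list carrying a negative hint:

```python
# Hypothetical numbers, walking through the formula in _score_node above.

# Candidate 1: 2,000-char <article>, 10 links, class matches _CONTENT_HINT_RE.
length, links = 2000, 10
link_density = links / max(1.0, length / 80.0)  # 10 / 25 = 0.4
penalty = min(0.9, link_density * 0.6)          # 0.24
bonus = 0.6 * length + 0.35 * length            # article tag + content hint = 1900
print((length + bonus) * (1.0 - penalty))       # 3900 * 0.76 = 2964.0

# Candidate 2: 2,000-char link list, 200 links, class matches _NEGATIVE_HINT_RE.
length, links = 2000, 200
link_density = links / max(1.0, length / 80.0)  # 200 / 25 = 8.0
penalty = min(0.9, link_density * 0.6)          # 4.8, capped at 0.9
bonus = -0.25 * length                          # -500
print((length + bonus) * (1.0 - penalty))       # 1500 * 0.1 = 150.0
```

Under the old formula the same article would have scored 2000 * 1.35 * (1.0 - 0.4) = 1620: halving the link-density weight and scaling the tag bonuses by length is what lets modestly linked prose pull ahead of link-dense containers.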
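
And a quick sanity check of the new short-content fallback, on a made-up page: previously, a page whose best candidate block was tiny could come back empty; now the cleaner retries against the whole `<body>` before giving up.

```python
# Hypothetical input exercising the fallback added in the last hunk.
html = """
<html><body>
  <div id="cookie-consent">We use cookies to improve your experience.</div>
  <p>Tiny page: too short for the block scorer, but worth keeping.</p>
</body></html>
"""
# Expected (assuming hint patterns like the sketch above): the consent div is
# stripped by the _COOKIE_HINT_RE pass, every remaining block scores 0.0 for
# being under 80 chars, and the <p> text still comes back via the
# broader-body fallback instead of an empty string.
print(_extract_text(html))
```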