Refine text extraction logic in websearch.py

Author: Victor Giers
Date: 2025-11-30 15:05:16 +01:00
parent 87318354e0
commit 63ae8d8c6c


@@ -208,7 +208,7 @@ def _extract_text(html: str, *, max_len: int = 120_000) -> str:
     for tag in soup.select("script,style,noscript,template"):
         tag.decompose()
-    # Remove cookie/consent overlays by attributes or short text snippets
+    # Remove cookie/consent overlays by attributes or short text snippets (conservative)
     for el in list(soup.find_all(True)):
         if el.name in {"html", "body"}:
             continue
@@ -219,35 +219,14 @@ def _extract_text(html: str, *, max_len: int = 120_000) -> str:
             el.get("aria-label") or "",
         ]).lower()
         text_preview = (el.get_text(" ", strip=True)[:220] or "").lower()
-        if _COOKIE_HINT_RE.search(attr_str) or _COOKIE_HINT_RE.search(text_preview):
-            try:
-                el.decompose()
-            except Exception:
-                pass
-
-    # Remove clearly decorative/structural regions
-    for tag_name in ("header", "footer", "nav", "aside", "form", "iframe", "svg", "canvas"):
-        for t in soup.find_all(tag_name):
-            try:
-                t.decompose()
-            except Exception:
-                pass
-
-    # Remove elements whose id/class/role clearly mark them as noise
-    for el in list(soup.find_all(True)):
-        if el.name in {"html", "body"}:
-            continue
-        attrs = " ".join([
-            el.get("id") or "",
-            " ".join(el.get("class") or []),
-            el.get("role") or "",
-            el.get("aria-label") or "",
-        ])
-        if _NOISE_ATTR_RE.search(attrs or "") and not _CONTENT_HINT_RE.search(attrs or ""):
-            try:
-                el.decompose()
-            except Exception:
-                pass
+        if (_COOKIE_HINT_RE.search(attr_str) or _COOKIE_HINT_RE.search(text_preview)):
+            # only remove if looks like a small overlay/dialog, not the main body
+            full_txt = " ".join(el.get_text(" ", strip=True).split())
+            if len(text_preview) <= 260 or len(full_txt) <= 800:
+                try:
+                    el.decompose()
+                except Exception:
+                    pass
 
     def _norm(node):
         txt = node.get_text("\n", strip=True)
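
Illustration (not part of the commit): the tightened branch above is meant to drop a cookie/consent element only when it is banner-sized, rather than deleting whatever node the hint regex happens to hit. The sketch below is a standalone demonstration of that idea; the regex, the sample HTML, and the use of the 800-character bound as the sole size guard are assumptions for illustration, not the module's actual _COOKIE_HINT_RE or markup.

import re
from bs4 import BeautifulSoup

# Assumed stand-in for the module's _COOKIE_HINT_RE (the real pattern lives in websearch.py).
COOKIE_HINT_RE = re.compile(r"cookie|consent|gdpr", re.I)

html = (
    '<div id="cookie-banner">We use cookies. Accept all?</div>'
    '<article id="content"><p>' + 'Actual page text. ' * 60 + '</p></article>'
)
soup = BeautifulSoup(html, "html.parser")

for el in list(soup.find_all(True)):
    attr_str = " ".join([el.get("id") or "", " ".join(el.get("class") or [])]).lower()
    text_preview = (el.get_text(" ", strip=True)[:220] or "").lower()
    if COOKIE_HINT_RE.search(attr_str) or COOKIE_HINT_RE.search(text_preview):
        # Overlay-size guard: only drop elements whose full text is banner-sized.
        full_txt = " ".join(el.get_text(" ", strip=True).split())
        if len(full_txt) <= 800:
            el.decompose()

print(soup.get_text(" ", strip=True))  # the consent div is gone, the article text remains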
@@ -257,7 +236,7 @@ def _extract_text(html: str, *, max_len: int = 120_000) -> str:
     def _score_node(node, raw_text: str, attr_str: str) -> float:
         length = len(raw_text)
-        if length < 80:
+        if length < 40:
             return 0.0
         link_count = len(node.find_all("a"))
         link_density = link_count / max(1.0, length / 80.0)
@@ -271,11 +250,11 @@ def _extract_text(html: str, *, max_len: int = 120_000) -> str:
         if _CONTENT_HINT_RE.search(attr_str):
             bonus += 0.35 * length
         if _NEGATIVE_HINT_RE.search(attr_str):
-            bonus -= 0.25 * length
+            bonus -= 0.2 * length
         penalty = min(0.9, link_density * 0.6)
         return (length + bonus) * (1.0 - penalty)
 
-    # Score candidate blocks to pick main content
+    # Score candidate blocks to pick main content without over-deleting nodes
     best_text = ""
     best_score = 0.0
     for node in soup.find_all(["article", "main", "section", "div", "body"]):
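
Illustration (not part of the commit): how the relaxed scoring constants behave on a toy page. The hint regexes and markup below are stand-ins for the module's _CONTENT_HINT_RE and _NEGATIVE_HINT_RE; the arithmetic mirrors the updated _score_node (40-character minimum, +0.35*length content bonus, -0.2*length negative-hint deduction, link-density penalty capped at 0.9).

import re
from bs4 import BeautifulSoup

# Assumed stand-ins for the module's hint regexes (the real patterns are defined in websearch.py).
CONTENT_HINT_RE = re.compile(r"article|content|post|main", re.I)
NEGATIVE_HINT_RE = re.compile(r"sidebar|comment|footer|related|promo", re.I)

def score_node(node, raw_text: str, attr_str: str) -> float:
    # Mirrors the updated constants: 40-char minimum, +0.35*length content bonus,
    # -0.2*length negative hint, link-density penalty capped at 0.9.
    length = len(raw_text)
    if length < 40:
        return 0.0
    link_count = len(node.find_all("a"))
    link_density = link_count / max(1.0, length / 80.0)
    bonus = 0.0
    if CONTENT_HINT_RE.search(attr_str):
        bonus += 0.35 * length
    if NEGATIVE_HINT_RE.search(attr_str):
        bonus -= 0.2 * length
    penalty = min(0.9, link_density * 0.6)
    return (length + bonus) * (1.0 - penalty)

html = (
    '<div class="sidebar">' + '<a href="#">link</a> ' * 20 + '</div>'
    '<div class="post-content">' + 'Readable paragraph text. ' * 20 + '</div>'
)
soup = BeautifulSoup(html, "html.parser")
best = max(
    soup.find_all("div"),
    key=lambda n: score_node(n, n.get_text(" ", strip=True), " ".join(n.get("class") or [])),
)
print(best.get("class"))  # expected: ['post-content']

With these constants the link-heavy sidebar is penalized toward zero while the content block keeps most of its raw length, so the candidate loop selects it.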