auto-git:
[change] websearch.py
This commit is contained in:
76
websearch.py
76
websearch.py
@@ -208,18 +208,42 @@ def _extract_text(html: str, *, max_len: int = 120_000) -> str:
|
|||||||
for tag in soup.select("script,style,noscript,template"):
|
for tag in soup.select("script,style,noscript,template"):
|
||||||
tag.decompose()
|
tag.decompose()
|
||||||
|
|
||||||
for tag_name in ("header", "footer", "nav", "aside", "form", "iframe", "svg", "canvas", "input", "select", "option", "button"):
|
# Remove cookie/consent overlays by attributes or short text snippets
|
||||||
|
for el in list(soup.find_all(True)):
|
||||||
|
if el.name in {"html", "body"}:
|
||||||
|
continue
|
||||||
|
attr_str = " ".join([
|
||||||
|
el.get("id") or "",
|
||||||
|
" ".join(el.get("class") or []),
|
||||||
|
el.get("role") or "",
|
||||||
|
el.get("aria-label") or "",
|
||||||
|
]).lower()
|
||||||
|
text_preview = (el.get_text(" ", strip=True)[:220] or "").lower()
|
||||||
|
if _COOKIE_HINT_RE.search(attr_str) or _COOKIE_HINT_RE.search(text_preview):
|
||||||
|
try:
|
||||||
|
el.decompose()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Remove clearly decorative/structural regions
|
||||||
|
for tag_name in ("header", "footer", "nav", "aside", "form", "iframe", "svg", "canvas"):
|
||||||
for t in soup.find_all(tag_name):
|
for t in soup.find_all(tag_name):
|
||||||
t.decompose()
|
try:
|
||||||
|
t.decompose()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
# Remove elements whose id/class/role clearly mark them as noise
|
# Remove elements whose id/class/role clearly mark them as noise
|
||||||
for el in list(soup.find_all(True)):
|
for el in list(soup.find_all(True)):
|
||||||
|
if el.name in {"html", "body"}:
|
||||||
|
continue
|
||||||
attrs = " ".join([
|
attrs = " ".join([
|
||||||
el.get("id") or "",
|
el.get("id") or "",
|
||||||
" ".join(el.get("class") or []),
|
" ".join(el.get("class") or []),
|
||||||
el.get("role") or "",
|
el.get("role") or "",
|
||||||
|
el.get("aria-label") or "",
|
||||||
])
|
])
|
||||||
if _NOISE_ATTR_RE.search(attrs or ""):
|
if _NOISE_ATTR_RE.search(attrs or "") and not _CONTENT_HINT_RE.search(attrs or ""):
|
||||||
try:
|
try:
|
||||||
el.decompose()
|
el.decompose()
|
||||||
except Exception:
|
except Exception:
|
||||||
@@ -231,24 +255,40 @@ def _extract_text(html: str, *, max_len: int = 120_000) -> str:
|
|||||||
txt = re.sub(r"\n{3,}", "\n\n", txt)
|
txt = re.sub(r"\n{3,}", "\n\n", txt)
|
||||||
return txt
|
return txt
|
||||||
|
|
||||||
|
def _score_node(node, raw_text: str, attr_str: str) -> float:
    """Return a main-content score for one candidate node.

    Args:
        node: Parsed element; only ``node.name`` and ``node.find_all("a")``
            are used here.
        raw_text: Text already extracted from ``node``.
        attr_str: The node's identifying attributes joined into a single
            string, matched against the content/negative hint patterns.

    Returns:
        ``0.0`` when ``raw_text`` is shorter than 80 characters. Otherwise
        the text length plus tag/hint bonuses, scaled down by a penalty
        that grows with the number of links per 80 characters of text.
    """
    size = len(raw_text)
    if size < 80:
        return 0.0

    # Link density: anchors per 80 characters of text (denominator >= 1).
    anchors = len(node.find_all("a"))
    density = anchors / max(1.0, size / 80.0)

    # Semantic container tags earn a bonus proportional to the text length.
    tag_weight = {"article": 0.6, "main": 0.4, "section": 0.2}.get(node.name, 0.0)
    extra = tag_weight * size

    # Attribute hints raise or lower the score; both may apply to one node.
    if _CONTENT_HINT_RE.search(attr_str):
        extra += 0.35 * size
    if _NEGATIVE_HINT_RE.search(attr_str):
        extra -= 0.25 * size

    # Cap the penalty so a link-heavy node keeps at least 10% of its score.
    damping = min(0.9, density * 0.6)
    return (size + extra) * (1.0 - damping)
|
||||||
|
|
||||||
# Score candidate blocks to pick main content
|
# Score candidate blocks to pick main content
|
||||||
best_text = ""
|
best_text = ""
|
||||||
best_score = 0.0
|
best_score = 0.0
|
||||||
for node in soup.find_all(["article", "main", "section", "div", "body"]):
|
for node in soup.find_all(["article", "main", "section", "div", "body"]):
|
||||||
raw = _norm(node)
|
raw = _norm(node)
|
||||||
if not raw or len(raw) < 80:
|
if not raw:
|
||||||
continue
|
continue
|
||||||
link_count = len(node.find_all("a"))
|
attr_str = " ".join([
|
||||||
link_density = link_count / max(1.0, len(raw) / 80.0)
|
node.get("id") or "",
|
||||||
penalty = min(0.9, link_density)
|
" ".join(node.get("class") or []),
|
||||||
bonus = 1.0
|
node.get("role") or "",
|
||||||
if node.name == "article":
|
node.get("aria-label") or "",
|
||||||
bonus += 0.35
|
]).lower()
|
||||||
elif node.name == "main":
|
score = _score_node(node, raw, attr_str)
|
||||||
bonus += 0.25
|
|
||||||
elif node.name == "section":
|
|
||||||
bonus += 0.1
|
|
||||||
score = len(raw) * bonus * (1.0 - penalty)
|
|
||||||
if score > best_score:
|
if score > best_score:
|
||||||
best_score = score
|
best_score = score
|
||||||
best_text = raw
|
best_text = raw
|
||||||
@@ -259,6 +299,12 @@ def _extract_text(html: str, *, max_len: int = 120_000) -> str:
|
|||||||
best_text = _norm(target)
|
best_text = _norm(target)
|
||||||
|
|
||||||
cleaned = _clean_lines(best_text)
|
cleaned = _clean_lines(best_text)
|
||||||
|
# If too short, fall back to broader body text instead of dropping the page
|
||||||
|
if len(cleaned) < 300:
|
||||||
|
broader = _clean_lines(_norm(soup.body or soup))
|
||||||
|
if len(broader) > len(cleaned):
|
||||||
|
cleaned = broader
|
||||||
|
|
||||||
if not cleaned.strip():
|
if not cleaned.strip():
|
||||||
return ""
|
return ""
|
||||||
if len(cleaned) > max_len:
|
if len(cleaned) > max_len:
|
||||||
|
|||||||
Reference in New Issue
Block a user