auto-git:

[change] websearch.py
This commit is contained in:
Victor Giers
2025-11-30 14:57:17 +01:00
parent de4daa8f1e
commit 0f6cb8fb5d

View File

@@ -136,14 +136,56 @@ def _is_probably_html_url(url: str) -> bool:
if "." not in path:
return True
ext = path.rsplit(".", 1)[-1]
return ext not in {
"pdf","jpg","jpeg","png","gif","webp","svg","mp4","mp3","mov","avi",
"zip","gz","7z","tar","rar","woff","woff2","ttf","otf"
}
return ext not in {
"pdf","jpg","jpeg","png","gif","webp","svg","mp4","mp3","mov","avi",
"zip","gz","7z","tar","rar","woff","woff2","ttf","otf"
}
except Exception:
return True
# Keywords that typically belong to navigation, banners, or cookie dialogs.
# Used by _clean_lines: a short line whose alphabetic tokens are ALL in this
# set is treated as a menu/footer crumb and dropped.
_NOISE_WORDS = {
"home","contact","about","copyright","privacy","policy","cookies","cookie","consent",
"login","log","sign","signup","signin","register","account","subscribe","newsletter",
"advert","advertisement","ads","promo","banner","menu","navigation","nav","footer",
"header","share","social","terms","conditions","accessibility","language","shop",
"search","skip","main","content"
}
# Case-insensitive substring match against an element's id/class/role to spot
# boilerplate containers (cookie banners, nav bars, share widgets, ...).
# NOTE(review): these are substring matches, so short terms like "nav" or
# "header" will also hit ids such as "navigate-hint" or "subheader" — presumed
# acceptable for aggressive de-noising; confirm against real pages.
_NOISE_ATTR_RE = re.compile(r"(cookie|consent|gdpr|banner|popup|modal|dialog|newsletter|subscribe|advert|promo|signin|signup|login|toolbar|share|social|nav|menu|footer|header)", re.I)
def _clean_lines(text: str) -> str:
lines = [ln.strip() for ln in text.splitlines()]
out = []
seen: set[str] = set()
for ln in lines:
if not ln:
if out and out[-1]:
out.append("")
continue
lower = ln.lower()
# Drop obvious boilerplate / cookie notices / menu crumbs
if len(ln) <= 140:
if any(k in lower for k in ("cookie", "consent", "newsletter", "advert", "privacy policy", "terms", "skip to main", "enable javascript", "accept all", "manage preferences")):
continue
tokens = re.findall(r"[a-zA-Z]+", lower)
if tokens and all(t in _NOISE_WORDS for t in tokens) and len(tokens) <= 8:
continue
if lower in seen:
continue
seen.add(lower)
out.append(ln)
# Collapse multiple blank lines
compact: list[str] = []
for ln in out:
if ln == "" and (not compact or compact[-1] == ""):
continue
compact.append(ln)
return "\n".join(compact)
def _extract_text(html: str, *, max_len: int = 120_000) -> str:
if not html:
return ""
@@ -152,18 +194,73 @@ def _extract_text(html: str, *, max_len: int = 120_000) -> str:
txt = re.sub(r"<\s*(script|style)[^>]*>.*?<\s*/\s*\1\s*>", " ", html, flags=re.S|re.I)
txt = re.sub(r"<[^>]+>", " ", txt)
txt = re.sub(r"\s+", " ", txt)
return txt.strip()[:max_len]
return _clean_lines(txt.strip())[:max_len]
try:
try:
soup = BeautifulSoup(html, "lxml")
except Exception:
soup = BeautifulSoup(html, "html.parser")
for tag in soup.select("script,style,noscript"):
# Strip obvious boilerplate containers first
for tag in soup.select("script,style,noscript,template"):
tag.decompose()
text = soup.get_text("\n", strip=True)
text = re.sub(r"[ \t]+", " ", text)
text = re.sub(r"\n{3,}", "\n\n", text)
return text[:max_len] if len(text) > max_len else text
for tag_name in ("header", "footer", "nav", "aside", "form", "iframe", "svg", "canvas", "input", "select", "option", "button"):
for t in soup.find_all(tag_name):
t.decompose()
# Remove elements whose id/class/role clearly mark them as noise
for el in list(soup.find_all(True)):
attrs = " ".join([
el.get("id") or "",
" ".join(el.get("class") or []),
el.get("role") or "",
])
if _NOISE_ATTR_RE.search(attrs or ""):
try:
el.decompose()
except Exception:
pass
def _norm(node):
txt = node.get_text("\n", strip=True)
txt = re.sub(r"[ \t]+", " ", txt)
txt = re.sub(r"\n{3,}", "\n\n", txt)
return txt
# Score candidate blocks to pick main content
best_text = ""
best_score = 0.0
for node in soup.find_all(["article", "main", "section", "div", "body"]):
raw = _norm(node)
if not raw or len(raw) < 80:
continue
link_count = len(node.find_all("a"))
link_density = link_count / max(1.0, len(raw) / 80.0)
penalty = min(0.9, link_density)
bonus = 1.0
if node.name == "article":
bonus += 0.35
elif node.name == "main":
bonus += 0.25
elif node.name == "section":
bonus += 0.1
score = len(raw) * bonus * (1.0 - penalty)
if score > best_score:
best_score = score
best_text = raw
if not best_text:
# fallback to whole body/text
target = soup.body or soup
best_text = _norm(target)
cleaned = _clean_lines(best_text)
if not cleaned.strip():
return ""
if len(cleaned) > max_len:
return cleaned[:max_len]
return cleaned
except Exception:
return ""
@@ -425,4 +522,3 @@ def prior_art_search(
out.append({"url": u, "title": "", "score": round(sc, 1), "snippet": snippet})
return {"queries": queries, "lang": lang, "results": out}