Add URL ingestion and friendly URL name generation
This commit is contained in:
@@ -1088,11 +1088,24 @@ class App(TkinterDnD.Tk): # type: ignore
|
||||
if p.is_dir():
|
||||
for q in p.rglob('*'):
|
||||
if q.is_file():
|
||||
final.append(q)
|
||||
final.append(q)
|
||||
elif p.exists():
|
||||
final.append(p)
|
||||
return final
|
||||
|
||||
@staticmethod
|
||||
def _friendly_url_name(url: str) -> str:
|
||||
try:
|
||||
parsed = urlparse(url)
|
||||
host = parsed.netloc or url
|
||||
path = (parsed.path or "").strip("/").split("/")
|
||||
if path and path[0]:
|
||||
first = path[0][:40]
|
||||
return f"{host}/{first}"
|
||||
return host
|
||||
except Exception:
|
||||
return url
|
||||
|
||||
def _add_paths(self, paths: List[Path]):
|
||||
# Expand directories into files
|
||||
expanded: List[Path] = []
|
||||
@@ -1272,6 +1285,12 @@ class App(TkinterDnD.Tk): # type: ignore
|
||||
h.update((str(path) + "|" + str(getattr(st, 'st_mtime', 0.0))).encode("utf-8", "ignore"))
|
||||
return h.hexdigest()
|
||||
|
||||
def _compute_url_hash(self, url: str) -> str:
|
||||
try:
|
||||
return hashlib.sha256(url.strip().encode("utf-8", "ignore")).hexdigest()
|
||||
except Exception:
|
||||
return hashlib.sha256(url.encode("utf-8", "ignore")).hexdigest()
|
||||
|
||||
def _ensure_file_symlink(self, src: Path, file_hash: str) -> Path:
|
||||
# name pattern: {hash}__basename
|
||||
dst = self._files_dir / f"{file_hash}__{src.name}"
|
||||
@@ -1347,6 +1366,50 @@ class App(TkinterDnD.Tk): # type: ignore
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
@staticmethod
def _scrape_title(html_text: str) -> str:
    """Best-effort extraction of the page ``<title>``; '' when absent or broken."""
    try:
        # ``\b[^>]*`` tolerates attributes on the tag (e.g. <title lang="en">)
        # while still matching a bare <title> as before.
        m = re.search(r"<title\b[^>]*>(.*?)</title>", html_text, flags=re.I | re.S)
        if not m:
            return ""
        cleaned = re.sub(r"\s+", " ", m.group(1))
        try:
            cleaned = html.unescape(cleaned)
        except Exception:
            pass
        return cleaned.strip()
    except Exception:
        return ""

def _ingest_single_url(self, url: str, url_hash: str) -> bool:
    """Fetch *url*, extract its text, and append it to the corpus file.

    Args:
        url: The page to ingest.
        url_hash: Precomputed identity hash for the URL; recorded on the
            corpus entry and added to ``self._seen_hashes`` on success.

    Returns:
        True when the page yielded non-empty text and was written to the
        corpus; False on any failure (fetch error, empty text, write error).
    """
    try:
        self._set_status(f"Fetching {url}…")
        try:
            # NOTE(review): assumes _http_get returns decoded text, not
            # bytes — confirm against websearch; bytes would fail the regex
            # below and be swallowed by the outer except.
            html_text, _hdrs = websearch._http_get(url, timeout=25)
        except Exception:
            # Network failures are expected; report "nothing ingested".
            return False
        text = websearch._extract_text(html_text)
        if not text.strip():
            return False
        # Prefer the real page <title>; fall back to a host/path label.
        title = self._scrape_title(html_text) or self._friendly_url_name(url)

        ts = int(time.time())
        obj = {
            "id": url,
            "title": title,
            "text": text,
            "source_path": url,
            "mime": "text/html",
            "file_hash": url_hash,
            "added_at": ts,
        }
        with self._corpus_file.open("a", encoding="utf-8") as fh_out:
            fh_out.write(json.dumps(obj, ensure_ascii=False) + "\n")
        self._seen_hashes.add(url_hash)
        return True
    except Exception:
        # Best effort by design: callers only need a success flag.
        return False
|
||||
|
||||
def _ensure_corpus_for_files(self, paths: List[Path], *, blocking: bool = True):
|
||||
if not paths:
|
||||
return
|
||||
|
||||
Reference in New Issue
Block a user