140 lines
4.6 KiB
Python
140 lines
4.6 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Backfill missing translations for the `news` table.
|
|
|
|
- Finds rows where summary_de and/or summary_jp are NULL/empty
|
|
but summary_en is present.
|
|
- Uses the same Ollama-based translation routine as the collector.
|
|
- Safe to run multiple times.
|
|
|
|
Usage examples:
|
|
python server/tools/backfill_translations.py
|
|
python server/tools/backfill_translations.py --langs de
|
|
python server/tools/backfill_translations.py --limit 25 --verbose
|
|
python server/tools/backfill_translations.py --dry-run
|
|
"""
|
|
|
|
import os
|
|
import sqlite3
|
|
import argparse
|
|
from typing import Iterable, Tuple, Optional
|
|
|
|
# Make parent directory (server/) importable so we can import news_collector.py
|
|
import sys, os
|
|
# Absolute directory containing this script, and its parent directory.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
SERVER_ROOT = os.path.abspath(os.path.join(SCRIPT_DIR, ".."))

# Put the parent directory at the front of the module search path (once),
# so modules that live there can be imported by plain name.
if SERVER_ROOT not in sys.path:
    sys.path.insert(0, SERVER_ROOT)
|
|
|
|
# Reuse the exact same translation logic as the collector
|
|
from news_collector import translate_summary, setup_database, run_db_migrations
|
|
|
|
# Path of the SQLite database this script opens: "news.db" directly under SERVER_ROOT.
DB_FILE = os.path.join(SERVER_ROOT, "news.db")
|
|
|
|
|
|
def is_blank(s: Optional[str]) -> bool:
    """Return True when *s* is None or contains nothing but whitespace.

    Non-string values are converted with ``str()`` before the check.
    """
    if s is None:
        return True
    # An empty string is falsy, so "nothing left after stripping" means blank.
    return not str(s).strip()
|
|
|
|
|
|
def select_rows(conn: sqlite3.Connection, limit: Optional[int]) -> Iterable[Tuple]:
    """Query the `news` rows that still need a DE and/or JP translation.

    A row qualifies when summary_en is non-empty and at least one of
    summary_de / summary_jp is NULL or whitespace-only. Rows are ordered
    newest news_date first, then by ascending id.

    Args:
        conn: Open SQLite connection to the news database.
        limit: Maximum number of rows to return; None, 0 or a negative value
            means no limit.

    Returns:
        The cursor produced by ``conn.execute``; iterating it yields tuples
        (id, country_name, news_date, summary_en, summary_de, summary_jp).
    """
    query = """
        SELECT id, country_name, news_date, summary_en, summary_de, summary_jp
        FROM news
        WHERE
            summary_en IS NOT NULL AND TRIM(summary_en) <> ''
            AND (
                summary_de IS NULL OR TRIM(summary_de) = ''
                OR summary_jp IS NULL OR TRIM(summary_jp) = ''
            )
        ORDER BY news_date DESC, id ASC
    """
    bind_values: Tuple = ()
    if limit and limit > 0:
        # Only a positive limit adds a LIMIT clause (bound as a parameter).
        query += " LIMIT ?"
        bind_values = (limit,)
    return conn.execute(query, bind_values)
|
|
|
|
|
|
def backfill(
    conn: sqlite3.Connection,
    langs: Iterable[str],
    dry_run: bool = False,
    verbose: bool = False,
    limit: Optional[int] = None,
) -> int:
    """Translate missing summaries and write them back to the `news` table.

    For every candidate row returned by ``select_rows``, each requested
    language whose column is blank is translated from summary_en with
    ``translate_summary``. Rows for which at least one translation was
    produced are updated (their created_at is set to CURRENT_TIMESTAMP as
    well) and everything is committed once at the end.

    Args:
        conn: Open SQLite connection to the news database.
        langs: Language codes to backfill; only "de" and "jp" are acted on.
        dry_run: When True, translations are still computed (so the returned
            count is realistic) but nothing is written or committed.
        verbose: When True, print the candidate count and one line per
            successful translation.
        limit: Maximum number of candidate rows to process. When None, the
            value falls back to ``args.limit`` if a module-level ``args``
            exists (this is how ``main()`` historically passed it), and
            otherwise to "no limit".

    Returns:
        Number of rows that received (or, in dry-run mode, would receive)
        at least one new translation.
    """
    if limit is None:
        # Backward compatibility: main() publishes the parsed CLI namespace as
        # the module-global `args`. Use it when present instead of failing
        # with NameError when this function is called from anywhere else.
        limit = getattr(globals().get("args"), "limit", None)

    # Materialise once so a one-shot iterator is not exhausted by the
    # repeated membership tests below.
    requested = {langs} if isinstance(langs, str) else set(langs)

    # Fixed mapping of language code -> column name. The column names are
    # interpolated into the UPDATE statement, so they must never come from
    # user input.
    targets = [
        (code, column)
        for code, column in (("de", "summary_de"), ("jp", "summary_jp"))
        if code in requested
    ]

    rows = list(select_rows(conn, limit=limit))
    if verbose:
        print(f"[backfill] candidates: {len(rows)}")

    updated_count = 0
    for row_id, country, news_date, en, de, jp in rows:
        if is_blank(en):
            # Nothing to translate from.
            continue

        existing = {"summary_de": de, "summary_jp": jp}
        to_update = {}
        for code, column in targets:
            if not is_blank(existing[column]):
                continue
            translated = translate_summary(en, code)
            if translated:
                to_update[column] = translated
                if verbose:
                    print(f" ✓ {country} [{news_date}] -> {code.upper()} translated")

        if not to_update:
            continue

        updated_count += 1
        if not dry_run:
            assignments = ", ".join(f"{column} = ?" for column in to_update)
            params = [*to_update.values(), row_id]
            conn.execute(
                f"UPDATE news SET {assignments}, created_at = CURRENT_TIMESTAMP WHERE id = ?",
                params,
            )

    if not dry_run:
        conn.commit()

    return updated_count
|
|
|
|
|
|
def main():
    """Command-line entry point: parse options, prepare the DB, run the backfill.

    Returns:
        Process exit status: 0 after a completed (or dry-run) backfill,
        1 when the database file does not exist.
    """
    global args

    cli = argparse.ArgumentParser(description="Backfill missing news translations (DE/JP) from summary_en.")
    cli.add_argument(
        "--langs",
        nargs="+",
        default=["de", "jp"],
        choices=["de", "jp"],
        help="Which languages to backfill (default: de jp)",
    )
    cli.add_argument("--limit", type=int, default=0, help="Max rows to process (0 = no limit)")
    cli.add_argument("--dry-run", action="store_true", help="Do not write changes to the database")
    cli.add_argument("--verbose", action="store_true", help="Verbose output")

    # Published as a module-level name because backfill() reads args.limit
    # from the global scope.
    args = cli.parse_args()

    # Bring the database and its schema up to date before touching any rows.
    setup_database()
    try:
        run_db_migrations()
    except Exception as e:
        # A failed migration is reported but does not stop the backfill.
        print(f"[warn] migration step failed: {e}")

    if not os.path.exists(DB_FILE):
        print(f"[error] DB file not found: {DB_FILE}")
        return 1

    conn = sqlite3.connect(DB_FILE)
    try:
        n_rows = backfill(conn, args.langs, dry_run=args.dry_run, verbose=args.verbose)
        summary = (
            f"[dry-run] would update {n_rows} row(s)."
            if args.dry_run
            else f"[done] updated {n_rows} row(s)."
        )
        print(summary)
    finally:
        # Always release the connection, even when the backfill raises.
        conn.close()
    return 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    sys.exit(main())