Files
10AM/server/tools/backfill_translations.py
2025-09-09 17:29:49 +02:00

140 lines
4.6 KiB
Python

#!/usr/bin/env python3
"""
Backfill missing translations for the `news` table.
- Finds rows where summary_de and/or summary_jp are NULL/empty
but summary_en is present.
- Uses the same Ollama-based translation routine as the collector.
- Safe to run multiple times.
Usage examples:
python server/tools/backfill_translations.py
python server/tools/backfill_translations.py --langs de
python server/tools/backfill_translations.py --limit 25 --verbose
python server/tools/backfill_translations.py --dry-run
"""
import os
import sqlite3
import argparse
from typing import Iterable, Tuple, Optional
# Make parent directory (server/) importable so we can import news_collector.py
import sys, os
# Directory holding this script, and its parent (server/). The parent is put
# at the front of sys.path, once, so sibling modules there can be imported.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
SERVER_ROOT = os.path.abspath(os.path.join(SCRIPT_DIR, os.pardir))
if SERVER_ROOT not in sys.path:
    sys.path.insert(0, SERVER_ROOT)
# Reuse the exact same translation logic as the collector
from news_collector import translate_summary, setup_database, run_db_migrations
# Path of the SQLite database file ("news.db") located directly in SERVER_ROOT.
DB_FILE = os.path.join(SERVER_ROOT, "news.db")
def is_blank(s: Optional[str]) -> bool:
    """Return True when *s* is None or contains nothing but whitespace.

    Non-string values are converted with ``str()`` before the check.
    """
    if s is None:
        return True
    return not str(s).strip()
def select_rows(conn: sqlite3.Connection, limit: Optional[int]) -> Iterable[Tuple]:
    """Query the rows that still need a German and/or Japanese summary.

    A row qualifies when summary_en is non-blank and at least one of
    summary_de / summary_jp is NULL or blank. Results are ordered newest
    news_date first, then by ascending id.

    Args:
        conn: Open SQLite connection containing the `news` table.
        limit: Maximum number of rows to return; None, 0 or a negative value
            means no limit.

    Returns:
        The cursor from ``conn.execute``; iterating it yields tuples of
        (id, country_name, news_date, summary_en, summary_de, summary_jp).
    """
    query = """
    SELECT id, country_name, news_date, summary_en, summary_de, summary_jp
    FROM news
    WHERE
    summary_en IS NOT NULL AND TRIM(summary_en) <> ''
    AND (
    summary_de IS NULL OR TRIM(summary_de) = ''
    OR summary_jp IS NULL OR TRIM(summary_jp) = ''
    )
    ORDER BY news_date DESC, id ASC
    """
    bind_values: Tuple = ()
    if limit and limit > 0:
        # Only a positive limit adds a LIMIT clause (bound as a parameter).
        query += " LIMIT ?"
        bind_values = (limit,)
    return conn.execute(query, bind_values)
def backfill(
    conn: sqlite3.Connection,
    langs: Iterable[str],
    dry_run: bool = False,
    verbose: bool = False,
    limit: Optional[int] = None,
) -> int:
    """Translate missing DE/JP summaries from summary_en and store them.

    Args:
        conn: Open SQLite connection containing the `news` table.
        langs: Language codes to backfill; only "de" and "jp" are acted on.
        dry_run: When true, translations are still requested (so the returned
            count reflects what would succeed) but nothing is written.
        verbose: When true, print the candidate count and per-row progress.
        limit: Maximum number of candidate rows to examine; None or a
            non-positive value means no limit.

    Returns:
        Number of rows for which at least one translation was produced
        (and, unless dry_run is set, written to the database).
    """
    # Materialize once: langs may be a one-shot iterable and is tested per row.
    wanted = set(langs)

    # Fetch all candidates up front so the SELECT is finished before any
    # UPDATE runs on the same connection, and so the count can be reported.
    rows = list(select_rows(conn, limit=limit))
    if verbose:
        print(f"[backfill] candidates: {len(rows)}")

    cur = conn.cursor()
    updated_count = 0
    try:
        for row_id, country, news_date, en, de, jp in rows:
            if is_blank(en):
                # Nothing to translate from.
                continue

            to_update = {}
            for lang, current in (("de", de), ("jp", jp)):
                if lang not in wanted or not is_blank(current):
                    continue
                translated = translate_summary(en, lang)
                if not translated:
                    # Translation failed/empty: leave the column for a later run.
                    continue
                to_update[f"summary_{lang}"] = translated
                if verbose:
                    print(f"{country} [{news_date}] -> {lang.upper()} translated")

            if not to_update:
                continue
            updated_count += 1
            if dry_run:
                continue

            # Column names come from the fixed ("de", "jp") set above, never
            # from user input, so interpolating them into the SQL is safe.
            sets = ", ".join(f"{column} = ?" for column in to_update)
            # NOTE(review): created_at is bumped on every backfill, as in the
            # original code - confirm that overwriting it is intended.
            cur.execute(
                f"UPDATE news SET {sets}, created_at = CURRENT_TIMESTAMP WHERE id = ?",
                [*to_update.values(), row_id],
            )
    finally:
        # Commit even if a later translation raised, so the rows already
        # updated in this run are not thrown away.
        if not dry_run:
            conn.commit()
    return updated_count


def main(argv: Optional[Iterable[str]] = None) -> int:
    """Command-line entry point.

    Args:
        argv: Argument list to parse; None means use ``sys.argv[1:]``.

    Returns:
        Process exit status: 0 on success, 1 when the database file is missing.
    """
    parser = argparse.ArgumentParser(description="Backfill missing news translations (DE/JP) from summary_en.")
    parser.add_argument("--langs", nargs="+", default=["de", "jp"], choices=["de", "jp"],
                        help="Which languages to backfill (default: de jp)")
    parser.add_argument("--limit", type=int, default=0, help="Max rows to process (0 = no limit)")
    parser.add_argument("--dry-run", action="store_true", help="Do not write changes to the database")
    parser.add_argument("--verbose", action="store_true", help="Verbose output")
    opts = parser.parse_args(argv)

    # Ensure DB & schema are ready. A failing migration is reported but does
    # not abort the run (deliberate best-effort).
    setup_database()
    try:
        run_db_migrations()
    except Exception as e:
        print(f"[warn] migration step failed: {e}")

    if not os.path.exists(DB_FILE):
        print(f"[error] DB file not found: {DB_FILE}")
        return 1

    conn = sqlite3.connect(DB_FILE)
    try:
        count = backfill(
            conn,
            opts.langs,
            dry_run=opts.dry_run,
            verbose=opts.verbose,
            limit=opts.limit,
        )
        if opts.dry_run:
            print(f"[dry-run] would update {count} row(s).")
        else:
            print(f"[done] updated {count} row(s).")
    finally:
        conn.close()
    return 0
# Run the CLI only when executed as a script; main()'s return value becomes
# the process exit status.
if __name__ == "__main__":
    sys.exit(main())