initial commit
This commit is contained in:
140
server/tools/backfill_translations.py
Normal file
140
server/tools/backfill_translations.py
Normal file
@@ -0,0 +1,140 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Backfill missing translations for the `news` table.

- Finds rows where summary_de and/or summary_jp are NULL/empty
  but summary_en is present.
- Uses the same Ollama-based translation routine as the collector.
- Safe to run multiple times.

Usage examples:
  python server/tools/backfill_translations.py
  python server/tools/backfill_translations.py --langs de
  python server/tools/backfill_translations.py --limit 25 --verbose
  python server/tools/backfill_translations.py --dry-run
|
||||
"""
|
||||
|
||||
import os
|
||||
import sqlite3
|
||||
import argparse
|
||||
from typing import Iterable, Tuple, Optional
|
||||
|
||||
# Put the parent directory (server/) at the front of sys.path so that
# news_collector.py can be imported when this script is executed directly.
import os
import sys

SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
SERVER_ROOT = os.path.abspath(os.path.join(SCRIPT_DIR, os.pardir))
if SERVER_ROOT not in sys.path:
    sys.path.insert(0, SERVER_ROOT)

# Must come after the sys.path tweak above. Importing the collector's own
# helpers means the backfill translates exactly the way the collector does.
from news_collector import translate_summary, setup_database, run_db_migrations

# news.db is expected to sit directly inside server/.
DB_FILE = os.path.join(SERVER_ROOT, "news.db")
|
||||
|
||||
|
||||
def is_blank(s: Optional[str]) -> bool:
    """Return True when *s* is None or contains nothing but whitespace."""
    if s is None:
        return True
    # str() keeps non-string column values (e.g. numbers) from raising here.
    return not str(s).strip()
|
||||
|
||||
|
||||
def select_rows(conn: sqlite3.Connection, limit: Optional[int]) -> Iterable[Tuple]:
    """
    Query the `news` table for rows that still need a translation.

    A row qualifies when summary_en is non-blank and summary_de and/or
    summary_jp is NULL or blank. Results are ordered by news_date descending,
    then id ascending. A `limit` of None, 0 or a negative number means
    "no limit".

    Returns the cursor from `conn.execute`; iterating it yields tuples:
    (id, country_name, news_date, summary_en, summary_de, summary_jp)
    """
    query = """
        SELECT id, country_name, news_date, summary_en, summary_de, summary_jp
        FROM news
        WHERE
            summary_en IS NOT NULL AND TRIM(summary_en) <> ''
            AND (
                summary_de IS NULL OR TRIM(summary_de) = ''
                OR summary_jp IS NULL OR TRIM(summary_jp) = ''
            )
        ORDER BY news_date DESC, id ASC
    """
    bind_values = ()
    # Only a positive limit adds a LIMIT clause (and its bound parameter).
    if limit and limit > 0:
        query += " LIMIT ?"
        bind_values = (limit,)
    return conn.execute(query, bind_values)
|
||||
|
||||
|
||||
def backfill(
    conn: sqlite3.Connection,
    langs: Iterable[str],
    dry_run: bool = False,
    verbose: bool = False,
    limit: Optional[int] = None,
) -> int:
    """
    Translate missing DE/JP summaries from summary_en and store them.

    Args:
        conn: Open SQLite connection to the database holding the `news` table.
        langs: Language codes to backfill; only "de" and "jp" are acted on.
        dry_run: When true, translations are still requested but no UPDATE is
            executed and nothing is committed.
        verbose: When true, print the candidate count and one line per
            successful translation.
        limit: Maximum number of candidate rows to process; 0 or a negative
            value means no limit. When omitted, falls back to the `limit`
            attribute of the module-level `args` namespace published by
            main(), or to "no limit" if that global does not exist.

    Returns:
        The number of rows for which at least one translation was produced
        (in dry-run mode: the number of rows that would have been updated).
    """
    if limit is None:
        # Legacy path: main() stores the parsed CLI namespace in a module
        # global. Look it up defensively so that calling backfill() as a
        # library function no longer raises NameError.
        legacy_args = globals().get("args")
        limit = getattr(legacy_args, "limit", 0)

    # `langs` is tested once per row and per language; materialise it so a
    # one-shot iterator is not exhausted after the first row. A plain string
    # is kept as-is to preserve the previous substring-style membership test.
    wanted = langs if isinstance(langs, str) else frozenset(langs)

    rows = list(select_rows(conn, limit=limit))
    if verbose:
        print(f"[backfill] candidates: {len(rows)}")

    updated_count = 0

    for row_id, country, news_date, en, de, jp in rows:
        if is_blank(en):
            # Nothing to translate from.
            continue

        to_update = {}
        for lang, current in (("de", de), ("jp", jp)):
            if lang not in wanted or not is_blank(current):
                continue
            translated = translate_summary(en, lang)
            # An empty/None result leaves the column untouched so the row is
            # picked up again on the next run.
            if translated:
                to_update[f"summary_{lang}"] = translated
                if verbose:
                    print(f" ✓ {country} [{news_date}] -> {lang.upper()} translated")

        if not to_update:
            continue

        updated_count += 1
        if not dry_run:
            # Column names come from the fixed tuple above, never from user
            # input, so interpolating them into the statement is safe.
            sets = ", ".join(f"{column} = ?" for column in to_update)
            params = [*to_update.values(), row_id]
            conn.execute(
                f"UPDATE news SET {sets}, created_at = CURRENT_TIMESTAMP WHERE id = ?",
                params,
            )

    if not dry_run:
        conn.commit()

    return updated_count
|
||||
|
||||
|
||||
def main():
    """
    Command-line entry point: parse options, prepare the DB, run the backfill.

    The parsed namespace is also stored in the module-level `args` global,
    which backfill() consults for the row limit.

    Returns 1 when the database file does not exist, otherwise 0.
    """
    global args

    cli = argparse.ArgumentParser(
        description="Backfill missing news translations (DE/JP) from summary_en."
    )
    cli.add_argument(
        "--langs",
        nargs="+",
        default=["de", "jp"],
        choices=["de", "jp"],
        help="Which languages to backfill (default: de jp)",
    )
    cli.add_argument(
        "--limit",
        type=int,
        default=0,
        help="Max rows to process (0 = no limit)",
    )
    cli.add_argument(
        "--dry-run",
        action="store_true",
        help="Do not write changes to the database",
    )
    cli.add_argument("--verbose", action="store_true", help="Verbose output")

    # Publish the namespace globally so backfill() can see --limit.
    args = cli.parse_args()

    # Make sure the database and its schema exist before touching it.
    setup_database()
    try:
        run_db_migrations()
    except Exception as exc:
        # Best effort: a failed migration is reported but does not abort.
        print(f"[warn] migration step failed: {exc}")

    if not os.path.exists(DB_FILE):
        print(f"[error] DB file not found: {DB_FILE}")
        return 1

    db = sqlite3.connect(DB_FILE)
    try:
        count = backfill(db, args.langs, dry_run=args.dry_run, verbose=args.verbose)
        print(
            f"[dry-run] would update {count} row(s)."
            if args.dry_run
            else f"[done] updated {count} row(s)."
        )
    finally:
        db.close()

    return 0
|
||||
|
||||
|
||||
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())
|
||||
Reference in New Issue
Block a user