Files
youtube_summarizer/translate_summary.py

125 lines
4.6 KiB
Python

#!/usr/bin/env python3
"""
translate_summary.py
Usage:
python3 translate_summary.py --summary-file <file> --lang <de|jp> [--model <model>] [--output-file <file>]
Arguments:
--summary-file Path to the file containing the English summary text.
--lang Target language ('de' for German, 'jp' for Japanese).
--model (Optional) Ollama model name, defaults to mistral:latest.
--output-file (Optional) Where to write translated summary as plain text.
Example:
python3 translate_summary.py --summary-file summary.txt --lang de --model mistral:latest
"""
import sys
import argparse
import json
import math
import requests
# Language codes accepted by this module, mapped to the language name that is
# spelled out in the prompts sent to the model.
LANG_MAP = {
    "de": "German",
    "jp": "Japanese",
}

# Characters-per-token ratio used to estimate a prompt's token count from its
# character length.
OLLAMA_CHARS_PER_TOKEN = 3.5

# Tokens added on top of the prompt estimate when sizing the context window.
OLLAMA_OUTPUT_TOKEN_BUDGET = 2_048

# Candidate `num_ctx` values, in ascending order; the smallest one that fits
# the estimated need is requested.
OLLAMA_CONTEXT_BUCKETS = (
    4_096,
    8_192,
    16_384,
    32_768,
    65_536,
)
def default_translation_prompt_template(target_language):
    """Return the built-in prompt template for ``target_language``.

    The returned text already names the target language and still contains a
    literal ``{summary}`` placeholder for the caller to fill in.

    Raises:
        ValueError: If ``target_language`` is not a key of ``LANG_MAP``.
    """
    if target_language not in LANG_MAP:
        raise ValueError("Supported languages: de (German), jp (Japanese)")

    language_name = LANG_MAP[target_language]
    pieces = [
        f"Translate the following summary into {language_name}. Only output the translated summary, ",
        "no explanation or intro. If it's already in the target language, do nothing but repeat it.\n\n",
        # Not an f-string: "{summary}" must survive as a placeholder.
        "Summary:\n{summary}\n\nTranslation:",
    ]
    return "".join(pieces)
def render_translation_prompt(summary_text, target_language, prompt_template=None):
    """Render the user prompt that asks the model to translate ``summary_text``.

    Args:
        summary_text: Text to translate.
        target_language: Key of ``LANG_MAP`` (``"de"`` or ``"jp"``).
        prompt_template: Optional custom template. ``{language}`` is replaced
            with the target language name and ``{summary}`` with the summary.
            When it is ``None``, empty, or whitespace-only, the built-in
            template is used instead.

    Returns:
        The rendered prompt. If the template has no ``{summary}`` placeholder,
        the summary is appended after it so the text always reaches the model.

    Raises:
        ValueError: If ``target_language`` is not supported.
    """
    if target_language not in LANG_MAP:
        # Same error as the other helpers in this module, rather than a bare
        # KeyError from the lookup below when a custom template is supplied.
        raise ValueError("Supported languages: de (German), jp (Japanese)")

    # Strip before deciding on the fallback: a template consisting only of
    # whitespace (e.g. a template file holding just a newline) would otherwise
    # produce a prompt with no translation instruction at all.
    template = (prompt_template or "").strip()
    if not template:
        template = default_translation_prompt_template(target_language).strip()

    # {language} is substituted before {summary}, so placeholder-like text
    # inside the summary itself is never expanded.
    prompt = template.replace("{language}", LANG_MAP[target_language])
    prompt = prompt.replace("{summary}", summary_text)

    if "{summary}" not in template:
        prompt = f"{prompt}\n\nSummary:\n{summary_text}\n\nTranslation:"
    return prompt
def choose_ollama_num_ctx(prompt, output_budget=OLLAMA_OUTPUT_TOKEN_BUDGET):
    """Pick the context-window size to request for ``prompt``.

    The prompt's token count is estimated from its character length using
    ``OLLAMA_CHARS_PER_TOKEN`` (rounded up), and ``output_budget`` is added on
    top of that estimate.

    Returns:
        The first entry of ``OLLAMA_CONTEXT_BUCKETS`` that is at least as large
        as the estimated need, or the last entry when none is large enough.
    """
    prompt_tokens = math.ceil(len(prompt) / OLLAMA_CHARS_PER_TOKEN)
    required = prompt_tokens + output_budget
    fitting_sizes = (size for size in OLLAMA_CONTEXT_BUCKETS if required <= size)
    # Fall back to the largest bucket when the estimate exceeds every option.
    return next(fitting_sizes, OLLAMA_CONTEXT_BUCKETS[-1])
def translate_summary_text(
    summary_text,
    target_language,
    model="mistral:latest",
    prompt_template=None,
    *,
    api_url="http://localhost:11434/api/chat",
    timeout=(10, None),
):
    """Translate ``summary_text`` via the Ollama chat API and return the result.

    Args:
        summary_text: Text to translate.
        target_language: Key of ``LANG_MAP`` (``"de"`` or ``"jp"``).
        model: Ollama model name.
        prompt_template: Optional custom prompt template, passed through to
            ``render_translation_prompt``.
        api_url: Chat endpoint to POST to. Defaults to the local Ollama server.
        timeout: ``requests`` timeout, either a number or a
            ``(connect, read)`` tuple. The default bounds only the connect
            phase; the read phase stays unbounded because the response is not
            streamed and arrives only once generation has finished. Pass a
            finite read timeout to cap generation time.

    Returns:
        The translated text with surrounding whitespace removed, or ``""``
        when the response carries no message content.

    Raises:
        ValueError: If ``target_language`` is not supported.
        requests.RequestException: On connection problems, timeouts, or a
            non-success HTTP status.
    """
    if target_language not in LANG_MAP:
        raise ValueError("Supported languages: de (German), jp (Japanese)")

    prompt = render_translation_prompt(summary_text, target_language, prompt_template)
    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": f"You are an expert translator proficient in {LANG_MAP[target_language]} and English."},
            {"role": "user", "content": prompt},
        ],
        "options": {
            # Size the context window from the rendered prompt.
            "num_ctx": choose_ollama_num_ctx(prompt),
        },
        "stream": False,
    }

    resp = requests.post(api_url, json=payload, timeout=timeout)
    resp.raise_for_status()
    data = resp.json()

    # Tolerate a missing or null "message"/"content" instead of crashing.
    message = data.get("message") or {}
    return (message.get("content") or "").strip()
def translate_summary_file(summary_file, target_language, model="mistral:latest", prompt_template=None):
    """Read a UTF-8 summary from ``summary_file`` and translate it.

    The file content is stripped of surrounding whitespace and handed to
    ``translate_summary_text`` together with the remaining arguments.

    Returns:
        Whatever ``translate_summary_text`` returns for the file's text.

    Raises:
        ValueError: If the file is empty or contains only whitespace.
        OSError: If the file cannot be opened or read.
    """
    with open(summary_file, encoding="utf-8") as handle:
        text = handle.read().strip()

    if text:
        return translate_summary_text(text, target_language, model, prompt_template)
    raise ValueError("Empty summary text!")
def main():
    """Command-line entry point: translate a summary file and emit the result.

    Parses the CLI arguments, optionally loads a prompt template from a file
    (which takes precedence over an inline ``--prompt-template``), runs the
    translation, and writes it to ``--output-file`` or prints it to stdout.

    A failure while reading the inputs, translating, or writing the output
    file is reported as a single line on stderr and the process exits with
    status 2.
    """
    parser = argparse.ArgumentParser(description="Translate summary using Ollama")
    parser.add_argument("--summary-file", required=True, help="Path to file with English summary text")
    parser.add_argument("--lang", required=True, choices=["de", "jp"], help="Target language: 'de' or 'jp'")
    parser.add_argument("--model", default="mistral:latest", help="Ollama model to use")
    parser.add_argument("--prompt-template", help="Prompt template for the translation LLM call")
    parser.add_argument("--prompt-template-file", help="Path to a text file containing the translation prompt template")
    parser.add_argument("--output-file", help="Output file for translated summary")
    args = parser.parse_args()

    # Load the optional template and translate. Reading the template file
    # happens inside the handler so that a missing or unreadable file is
    # reported the same way as a missing summary file, not as a raw traceback.
    try:
        prompt_template = args.prompt_template
        if args.prompt_template_file:
            with open(args.prompt_template_file, "r", encoding="utf-8") as f:
                prompt_template = f.read()
        translation = translate_summary_file(args.summary_file, args.lang, args.model, prompt_template)
    except Exception as e:
        # Top-level boundary: turn any failure into a message and exit status.
        print(f"Translation failed: {e}", file=sys.stderr)
        sys.exit(2)

    # Output result
    if args.output_file:
        try:
            with open(args.output_file, "w", encoding="utf-8") as f:
                f.write(translation)
        except OSError as e:
            print(f"Could not write output file: {e}", file=sys.stderr)
            sys.exit(2)
    else:
        print(translation)


if __name__ == "__main__":
    main()