- fcntl.flock verhindert parallele Enricher-Instanzen
- Atomarer Cache-Save über tmp-Datei
- Debug-Logs bei leerer/gefilterter Beschreibung
- Sleep auf 2s erhöht für stabilere Ollama-Antworten
#!/usr/bin/env python3
"""Save.TV Film-Enricher — reichert Archiv-Filme per KI an.

Läuft als Cronjob (z.B. alle 3h) auf CT 116.
Nutzt Ollama auf dem KI-Server (gleicher Endpunkt wie llm.py).
Schreibt in /mnt/savetv/.filminfo_cache.json — dieselbe Datei,
die savetv_web.py und savetv_extra_routes.py verwenden.

Ergebnis: 3-6 Sätze Beschreibung, Hauptdarsteller, Land, Jahr, Genre.
"""
|
|
|
|
import fcntl
|
|
import json
|
|
import logging
|
|
import os
|
|
import re
|
|
import sys
|
|
import time
|
|
import requests
|
|
from pathlib import Path
|
|
|
|
sys.path.insert(0, os.path.dirname(__file__))
|
|
sys.path.insert(0, "/opt")
|
|
|
|
from tools import savetv
|
|
|
|
log = logging.getLogger("savetv_enrich")
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s",
    datefmt="%H:%M:%S",
)

# Ollama endpoint on the AI server (same endpoint llm.py uses).
OLLAMA_BASE = "http://100.84.255.83:11434"
# Primary model for enrichment queries.
MODEL = "qwen2.5:14b"
# Larger model tried once when the primary model hits a read timeout.
FALLBACK_MODEL = "qwen3:30b-a3b"

# Shared cache file — also read by savetv_web.py / savetv_extra_routes.py.
FILMINFO_CACHE = Path("/mnt/savetv/.filminfo_cache.json")
# flock() lockfile that prevents parallel enricher instances.
LOCKFILE = Path("/tmp/savetv_enrich.lock")
# Persist the cache after this many successful enrichments.
BATCH_SIZE = 8
# Pause between Ollama calls (seconds) for more stable responses.
SLEEP_BETWEEN = 2.0
|
|
|
|
|
|
def _load_cache() -> dict:
|
|
if FILMINFO_CACHE.exists():
|
|
try:
|
|
return json.loads(FILMINFO_CACHE.read_text())
|
|
except Exception:
|
|
pass
|
|
return {}
|
|
|
|
|
|
def _save_cache(cache: dict):
|
|
tmp = FILMINFO_CACHE.with_suffix(".tmp")
|
|
tmp.write_text(json.dumps(cache, ensure_ascii=False, indent=1))
|
|
tmp.rename(FILMINFO_CACHE)
|
|
|
|
|
|
def _is_enriched(entry: dict) -> bool:
|
|
return bool(entry.get("description"))
|
|
|
|
|
|
def _call_ollama(prompt: str, model: str = MODEL) -> str:
    """Query the Ollama chat endpoint and return the stripped reply text.

    On a read timeout the call is retried once with FALLBACK_MODEL; any
    other failure is logged and yields an empty string. A surrounding
    markdown code fence in the reply is removed.
    """
    system_msg = (
        "Du bist eine Filmdatenbank. Antworte AUSSCHLIESSLICH mit validem JSON. "
        "Kein Markdown, keine Erklärungen, kein Denken. Nur JSON. Sprache: Deutsch."
    )
    body = {
        "model": model,
        "messages": [
            {"role": "system", "content": system_msg},
            {"role": "user", "content": prompt},
        ],
        "stream": False,
        "think": False,
        "options": {"num_predict": 1024},
    }
    try:
        resp = requests.post(f"{OLLAMA_BASE}/api/chat", json=body, timeout=180)
        resp.raise_for_status()
        reply = resp.json().get("message", {}).get("content", "").strip()
        # Strip a markdown code fence if the model wrapped its JSON in one.
        if reply.startswith("```"):
            reply = re.sub(r"^```\w*\n?", "", reply)
            reply = re.sub(r"\n?```$", "", reply)
        return reply.strip()
    except requests.exceptions.ReadTimeout:
        if model != FALLBACK_MODEL:
            log.warning("Timeout mit %s, Fallback auf %s", model, FALLBACK_MODEL)
            return _call_ollama(prompt, model=FALLBACK_MODEL)
        raise
    except Exception as e:
        log.error("Ollama-Fehler: %s", e)
        return ""
|
|
|
|
|
|
def _normalize_actors(actors_raw) -> list:
|
|
if not actors_raw or not isinstance(actors_raw, list):
|
|
return []
|
|
result = []
|
|
for a in actors_raw[:4]:
|
|
if isinstance(a, str):
|
|
result.append(a)
|
|
elif isinstance(a, dict):
|
|
name = a.get("name") or a.get("actor") or ""
|
|
if name:
|
|
result.append(name)
|
|
return result
|
|
|
|
|
|
def _empty_info() -> dict:
    """Skeleton result used when enrichment produced no usable data."""
    return {"year": "", "countries": [], "genres": [], "actors": [],
            "director": "", "description": ""}


def _parse_reply(raw: str, title: str):
    """Parse Ollama's reply as JSON; return the object or None on failure."""
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        pass
    # The model sometimes wraps its JSON in extra prose — extract the
    # outermost {...} span and retry once.
    match = re.search(r"\{[\s\S]*\}", raw)
    if not match:
        log.warning("Kein JSON gefunden für '%s': %.200s", title, raw)
        return None
    try:
        return json.loads(match.group())
    except json.JSONDecodeError:
        log.warning("JSON-Parse fehlgeschlagen für '%s': %.200s", title, raw)
        return None


def _enrich_film(title: str) -> dict:
    """Ask the LLM for metadata about *title* and normalize the result.

    Returns a dict with keys year/countries/genres/actors/director/
    description. All fields are empty strings/lists when the model
    fails, returns unparseable output, or does not know the film.
    """
    # Drop a dash-separated subtitle ("Film – Untertitel") for the query.
    clean_title = re.sub(r"\s*[-\u2013\u2014]\s*.+$", "", title).strip()

    prompt = f"""Gib mir Informationen zum Film "{clean_title}".
Antworte als JSON mit exakt diesen Feldern (alle Texte auf Deutsch):
{{
"year": "Erscheinungsjahr als String",
"countries": ["Produktionsländer als Strings"],
"genres": ["bis zu 3 Genres als Strings"],
"actors": ["bis zu 4 Hauptdarsteller als Strings"],
"director": "Regisseur als String",
"description": "3-5 Sätze auf Deutsch: Worum geht es, für wen geeignet. Keine Spoiler."
}}
Wichtig: Alle Werte müssen Strings sein. Schreibe die description komplett auf Deutsch.
Falls du den Film nicht kennst, setze description auf leeren String."""

    raw = _call_ollama(prompt)
    if not raw:
        log.warning(" Leere Antwort von Ollama für '%s'", title)
        return _empty_info()

    data = _parse_reply(raw, title)
    if data is None:
        return _empty_info()

    desc = str(data.get("description", ""))[:600]
    # Guard against replies in non-Latin scripts despite the German prompt.
    if desc and not _is_mostly_latin(desc):
        log.info(" Nicht-lateinische Beschreibung gefiltert: %.80s", desc)
        desc = ""
    if not desc:
        log.info(" Beschreibung leer, raw year=%s actors=%s",
                 data.get("year"), str(data.get("actors", []))[:80])

    return {
        "year": str(data.get("year", ""))[:4],
        "countries": [str(c) for c in (data.get("countries") or [])[:3]],
        "genres": [str(g) for g in (data.get("genres") or [])[:3]],
        "actors": _normalize_actors(data.get("actors")),
        "director": str(data.get("director", ""))[:60],
        "description": desc,
    }
|
|
|
|
|
|
def _is_mostly_latin(text: str) -> bool:
|
|
if not text:
|
|
return False
|
|
latin = sum(1 for c in text if c.isascii() or '\u00C0' <= c <= '\u024F')
|
|
return latin / max(len(text), 1) > 0.7
|
|
|
|
|
|
def run():
    """Enrich every archive film title that still lacks an AI description.

    Fetches the Save.TV archive, collects film titles (episodes and
    excluded titles are skipped), queries the LLM for each missing one,
    and persists the shared cache every BATCH_SIZE successes plus once
    at the end.
    """
    log.info("Starte Film-Enrichment...")

    entries = savetv._get_full_archive()
    if not entries:
        log.warning("Keine Archiv-Einträge von Save.TV erhalten")
        return

    # Unique film titles: entries with SFOLGE are series episodes.
    titles = set()
    for entry in entries:
        telecast = entry.get("STRTELECASTENTRY", {})
        if telecast.get("SFOLGE", ""):
            continue
        name = telecast.get("STITLE", "")
        if name and not savetv._is_excluded(name):
            titles.add(name)

    cache = _load_cache()
    missing = [t for t in titles if not _is_enriched(cache.get(t, {}))]

    if not missing:
        log.info("Alle %d Filme bereits angereichert", len(titles))
        return

    log.info("%d Filme im Archiv, %d davon noch ohne KI-Beschreibung",
             len(titles), len(missing))

    enriched = 0
    for idx, name in enumerate(missing, start=1):
        log.info("[%d/%d] Anreichern: %s", idx, len(missing), name)
        try:
            info = _enrich_film(name)
            if info.get("description"):
                cache[name] = info
                enriched += 1
                log.info(" OK: %s (%s)", info.get("year", "?"),
                         ", ".join(info.get("actors", [])[:2]))
                # Periodic save so progress survives a crash/kill.
                if enriched % BATCH_SIZE == 0:
                    _save_cache(cache)
                    log.info(" Cache gespeichert (%d angereichert)", enriched)
            else:
                # No description: keep the entry but merge any base data
                # (year/countries/genres only if not already set).
                existing = cache.get(name, {})
                for field in ("year", "countries", "genres"):
                    if info.get(field) and not existing.get(field):
                        existing[field] = info[field]
                if info.get("actors"):
                    existing["actors"] = info["actors"]
                if info.get("director"):
                    existing["director"] = info["director"]
                cache[name] = existing
                log.info(" Keine Beschreibung erhalten, Basisdaten übernommen")
        except Exception as e:
            log.error(" Fehler bei '%s': %s", name, e)

        # Throttle between Ollama calls.
        time.sleep(SLEEP_BETWEEN)

    _save_cache(cache)
    log.info("Fertig: %d von %d Filmen neu angereichert", enriched, len(missing))
|
|
|
|
|
|
if __name__ == "__main__":
    # Exclusive, non-blocking file lock so cron never starts a second
    # enricher while a previous run is still in progress.
    lock_fd = open(LOCKFILE, "w")
    try:
        fcntl.flock(lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
    except BlockingIOError:
        # Another instance holds the lock — exit quietly with status 0,
        # since an already-running enricher is not an error for cron.
        print("Enricher läuft bereits — Abbruch.")
        sys.exit(0)

    try:
        run()
    finally:
        # Release the lock even when run() raises.
        fcntl.flock(lock_fd, fcntl.LOCK_UN)
        lock_fd.close()
|