feat: KI-Enricher für Save.TV Filmdatenbank

Nutzt Ollama (qwen3:30b), um Filme im Archiv automatisch mit Beschreibung, Hauptdarstellern, Land und Genre anzureichern. Läuft als Cronjob alle 3h, Ergebnis sofort in der Web-UI sichtbar.
2026-03-27 12:25:58 +00:00 · 2026-03-27 12:25:58 +00:00 · 45b2903fb6
commit 45b2903fb6
parent 43f5586ea2
1 changed files with 199 additions and 0 deletions
--- a/homelab-ai-bot/savetv_enrich.py
+++ b/homelab-ai-bot/savetv_enrich.py
@ -0,0 +1,199 @@
+#!/usr/bin/env python3
+"""Save.TV Film-Enricher — reichert Archiv-Filme per KI an.
+
+Läuft als Cronjob (z.B. alle 3h) auf CT 116.
+Nutzt Ollama auf dem KI-Server (gleicher Endpunkt wie llm.py).
+Schreibt in /mnt/savetv/.filminfo_cache.json — dieselbe Datei
+die savetv_web.py und savetv_extra_routes.py verwenden.
+
+Ergebnis: 3-5 Sätze Beschreibung, Hauptdarsteller, Regisseur, Land, Jahr, Genre.
+"""
+
+import json
+import logging
+import os
+import re
+import sys
+import time
+import requests
+from pathlib import Path
+
+sys.path.insert(0, os.path.dirname(__file__))
+sys.path.insert(0, "/opt")
+
+from tools import savetv
+
+# Module-level logger; basicConfig sets the root logger to INFO with a
+# time-of-day-only timestamp.
+log = logging.getLogger("savetv_enrich")
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s %(levelname)s %(message)s",
+    datefmt="%H:%M:%S",
+)
+
+# Base URL of the Ollama server; _call_ollama posts to its
+# /v1/chat/completions endpoint.
+OLLAMA_BASE = "http://100.84.255.83:11434"
+# Default model, and the model _call_ollama retries with after a read timeout.
+MODEL = "qwen3:30b-a3b"
+FALLBACK_MODEL = "qwen2.5:14b"
+
+# JSON file holding the film info, keyed by title; read and written here.
+FILMINFO_CACHE = Path("/mnt/savetv/.filminfo_cache.json")
+# run() writes the cache to disk after every BATCH_SIZE enriched films.
+BATCH_SIZE = 8
+# Pause in seconds that run() inserts between film lookups.
+SLEEP_BETWEEN = 1.5
+
+
+def _load_cache() -> dict:
+    """Load the film-info cache from FILMINFO_CACHE.
+
+    Returns:
+        The cached mapping, or an empty dict when the file does not exist,
+        cannot be read or decoded, is not valid JSON, or does not contain a
+        JSON object at top level. Every case except a missing file is
+        logged, because the caller will later overwrite the file.
+    """
+    try:
+        # Decode explicitly as UTF-8 so the result does not depend on the
+        # locale of the (cron) environment.
+        raw = FILMINFO_CACHE.read_text(encoding="utf-8")
+    except FileNotFoundError:
+        # First run: nothing cached yet.
+        return {}
+    except (OSError, UnicodeDecodeError) as e:
+        log.warning("Cache %s nicht lesbar: %s", FILMINFO_CACHE, e)
+        return {}
+
+    try:
+        data = json.loads(raw)
+    except json.JSONDecodeError as e:
+        log.warning("Cache %s enthält kein gültiges JSON: %s", FILMINFO_CACHE, e)
+        return {}
+
+    if not isinstance(data, dict):
+        # Callers use dict methods on the result; reject lists, strings, etc.
+        log.warning("Cache %s enthält kein JSON-Objekt, wird ignoriert", FILMINFO_CACHE)
+        return {}
+    return data
+
+
+def _save_cache(cache: dict):
+    """Write *cache* to FILMINFO_CACHE as UTF-8 JSON, replacing it atomically.
+
+    The JSON is first written to a sibling temporary file and then moved
+    over the target with os.replace, so a concurrent reader never sees a
+    half-written file and a crash mid-write leaves the old cache intact.
+
+    Args:
+        cache: Mapping to serialise.
+
+    Raises:
+        OSError: If writing the temporary file or the rename fails.
+        TypeError: If *cache* contains values json.dumps cannot serialise.
+    """
+    # Serialise before touching the disk so a serialisation error cannot
+    # leave a truncated file behind.
+    payload = json.dumps(cache, ensure_ascii=False, indent=1)
+    # Same directory as the target, so the rename stays on one filesystem.
+    tmp_path = FILMINFO_CACHE.with_name(FILMINFO_CACHE.name + ".tmp")
+    tmp_path.write_text(payload, encoding="utf-8")
+    # NOTE(review): the replaced file gets the owner/mode of the newly
+    # created temp file — confirm other readers of the cache can still open it.
+    os.replace(tmp_path, FILMINFO_CACHE)
+
+
+def _is_enriched(entry: dict) -> bool:
+    """Return True when the cache entry has a non-empty "description"."""
+    description = entry.get("description")
+    if description:
+        return True
+    return False
+
+
+def _call_ollama(prompt: str, model: str = MODEL) -> str:
+    """Send *prompt* to the Ollama chat-completions endpoint and return the reply.
+
+    A leading/trailing Markdown code fence is removed from the answer.
+
+    Args:
+        prompt: User message; " /no_think" is appended before sending.
+        model: Model name to request.
+
+    Returns:
+        The stripped answer text, or "" when the request or the decoding of
+        the response fails for any reason other than a read timeout.
+
+    Raises:
+        requests.exceptions.ReadTimeout: When the request times out while
+            already using FALLBACK_MODEL. A timeout with any other model
+            triggers one retry with FALLBACK_MODEL instead.
+    """
+    system_message = {"role": "system", "content": (
+        "Du bist eine Filmdatenbank. Antworte NUR mit validem JSON, "
+        "kein Markdown, keine Erklärungen. /no_think"
+    )}
+    user_message = {"role": "user", "content": prompt + " /no_think"}
+    request_body = {
+        "model": model,
+        "messages": [system_message, user_message],
+        "max_tokens": 800,
+        "stream": False,
+    }
+    endpoint = f"{OLLAMA_BASE}/v1/chat/completions"
+
+    try:
+        response = requests.post(endpoint, json=request_body, timeout=120)
+        response.raise_for_status()
+        answer = response.json()["choices"][0]["message"]["content"].strip()
+        if answer.startswith("```"):
+            # Drop the opening fence (with optional language tag), then the
+            # closing fence.
+            without_opening = re.sub(r"^```\w*\n?", "", answer)
+            answer = re.sub(r"\n?```$", "", without_opening)
+        return answer.strip()
+    except requests.exceptions.ReadTimeout:
+        if model == FALLBACK_MODEL:
+            # The fallback timed out as well; let the caller deal with it.
+            raise
+        log.warning("Timeout mit %s, Fallback auf %s", model, FALLBACK_MODEL)
+        return _call_ollama(prompt, model=FALLBACK_MODEL)
+    except Exception as err:
+        log.error("Ollama-Fehler: %s", err)
+        return ""
+
+
+def _empty_film_info() -> dict:
+    """Return a fresh result dict in which every field is empty."""
+    return {"year": "", "countries": [], "genres": [], "actors": [],
+            "director": "", "description": ""}
+
+
+def _clean_text(value, limit: int) -> str:
+    """Turn a scalar model value into a stripped string of at most *limit* chars.
+
+    None (JSON null) yields "" rather than the literal string "None".
+    """
+    if value is None:
+        return ""
+    return str(value).strip()[:limit]
+
+
+def _clean_list(value, limit: int) -> list:
+    """Turn a model value into a list of at most *limit* non-empty strings.
+
+    A bare string is treated as a one-element list (instead of being sliced
+    character-wise); anything that is neither a string nor a list/tuple
+    yields an empty list.
+    """
+    if isinstance(value, str):
+        value = [value]
+    elif not isinstance(value, (list, tuple)):
+        return []
+    items = [str(v).strip() for v in value if isinstance(v, (str, int, float))]
+    return [item for item in items if item][:limit]
+
+
+def _enrich_film(title: str) -> dict:
+    """Ask the model for film data about *title*.
+
+    Args:
+        title: Film title as listed in the archive. A trailing
+            " - subtitle" part (hyphen, en dash or em dash surrounded by
+            whitespace) is removed before the title is put into the prompt.
+
+    Returns:
+        A dict with the keys "year", "countries", "genres", "actors",
+        "director" and "description". All fields are empty when the model
+        gave no answer or the answer could not be parsed as a JSON object.
+
+    Raises:
+        Whatever _call_ollama raises.
+    """
+    # Require whitespace around the dash so hyphenated titles such as
+    # "Spider-Man" are not cut off at the hyphen.
+    clean_title = re.sub(r"\s+[-\u2013\u2014]\s+.+$", "", title).strip()
+    if not clean_title:
+        clean_title = title.strip()
+
+    prompt = f"""Gib mir Informationen zum Film "{clean_title}".
+Antworte als JSON mit exakt diesen Feldern:
+{{
+  "year": "Erscheinungsjahr als String oder leer",
+  "countries": ["Produktionsland/länder"],
+  "genres": ["bis zu 3 Genres"],
+  "actors": ["bis zu 4 Hauptdarsteller"],
+  "director": "Regisseur oder leer",
+  "description": "3-5 Sätze auf Deutsch: Worum geht es, was macht den Film besonders, für wen ist er geeignet. Keine Spoiler."
+}}
+Falls du den Film nicht kennst, setze description auf leer und die anderen Felder soweit bekannt."""
+
+    raw = _call_ollama(prompt)
+    if not raw:
+        return _empty_film_info()
+
+    try:
+        data = json.loads(raw)
+    except json.JSONDecodeError:
+        # The model may wrap the JSON in extra text; try the outermost
+        # brace-delimited span before giving up.
+        match = re.search(r"\{[\s\S]*\}", raw)
+        if not match:
+            return _empty_film_info()
+        try:
+            data = json.loads(match.group())
+        except json.JSONDecodeError:
+            log.warning("JSON-Parse fehlgeschlagen für '%s'", title)
+            return _empty_film_info()
+
+    if not isinstance(data, dict):
+        # Valid JSON, but not an object (e.g. a list or a bare string).
+        log.warning("JSON-Parse fehlgeschlagen für '%s'", title)
+        return _empty_film_info()
+
+    return {
+        "year": _clean_text(data.get("year"), 4),
+        "countries": _clean_list(data.get("countries"), 3),
+        "genres": _clean_list(data.get("genres"), 3),
+        "actors": _clean_list(data.get("actors"), 4),
+        "director": _clean_text(data.get("director"), 60),
+        "description": _clean_text(data.get("description"), 600),
+    }
+
+
+def run():
+    """Load the archive and enrich every film that has no description yet.
+
+    Steps:
+      1. Fetch the archive via savetv._get_full_archive().
+      2. Collect the titles of entries without an "SFOLGE" value that
+         savetv._is_excluded() does not reject.
+      3. For each title whose cache entry has no description, query the
+         model via _enrich_film(). A result with a description replaces the
+         cache entry; otherwise any returned base fields are merged into the
+         existing entry.
+      4. Save the cache after every BATCH_SIZE enriched films and once more
+         when the loop ends, including when it is interrupted.
+
+    Errors for a single film are logged and do not stop the run.
+    """
+    log.info("Starte Film-Enrichment...")
+
+    entries = savetv._get_full_archive()
+    if not entries:
+        log.warning("Keine Archiv-Einträge von Save.TV erhalten")
+        return
+
+    titles = set()
+    for e in entries:
+        # "or {}" also covers an explicit null value for the key.
+        tc = e.get("STRTELECASTENTRY") or {}
+        if tc.get("SFOLGE", ""):
+            continue
+        title = tc.get("STITLE", "")
+        if title and not savetv._is_excluded(title):
+            titles.add(title)
+
+    cache = _load_cache()
+    # Sorted so the processing order (and the log) is stable between runs.
+    missing = sorted(t for t in titles if not _is_enriched(cache.get(t) or {}))
+
+    if not missing:
+        log.info("Alle %d Filme bereits angereichert", len(titles))
+        return
+
+    log.info("%d Filme im Archiv, %d davon noch ohne KI-Beschreibung", len(titles), len(missing))
+
+    enriched = 0
+    total = len(missing)
+    try:
+        for i, title in enumerate(missing, start=1):
+            log.info("[%d/%d] Anreichern: %s", i, total, title)
+            try:
+                info = _enrich_film(title)
+                if info.get("description"):
+                    cache[title] = info
+                    enriched += 1
+                    if enriched % BATCH_SIZE == 0:
+                        _save_cache(cache)
+                        log.info("  Cache gespeichert (%d angereichert)", enriched)
+                else:
+                    old = cache.get(title) or {}
+                    changed = False
+                    # These fields only fill gaps in the existing entry ...
+                    for k in ("year", "countries", "genres"):
+                        if info.get(k) and not old.get(k):
+                            old[k] = info[k]
+                            changed = True
+                    # ... while these overwrite whatever was stored before.
+                    for k in ("actors", "director"):
+                        if info.get(k):
+                            old[k] = info[k]
+                            changed = True
+                    if changed:
+                        cache[title] = old
+                        log.info("  Keine Beschreibung erhalten, Basisdaten übernommen")
+                    else:
+                        # Do not add an empty entry for a failed lookup.
+                        log.info("  Keine Beschreibung und keine Basisdaten erhalten")
+            except Exception as e:
+                log.error("  Fehler bei '%s': %s", title, e)
+
+            # No need to wait after the last film.
+            if i < total:
+                time.sleep(SLEEP_BETWEEN)
+    finally:
+        # Also reached on interruption, so progress since the last batch
+        # save is kept.
+        _save_cache(cache)
+
+    log.info("Fertig: %d von %d Filmen neu angereichert", enriched, len(missing))
+
+
+# Run one enrichment pass when the file is executed as a script.
+if __name__ == "__main__":
+    run()