# homelab-brain/homelab-ai-bot/savetv_enrich.py

#!/usr/bin/env python3
"""Save.TV film enricher — enriches archived films with AI-generated metadata.

Runs as a cron job (e.g. every 3 hours) on CT 116.
Uses Ollama on the AI server (same endpoint as llm.py).
Writes to /mnt/savetv/.filminfo_cache.json — the same file that
savetv_web.py and savetv_extra_routes.py use.

Result: a 3-5 sentence description, lead actors, director, country, year, genre.
"""

import fcntl
import json
import logging
import os
import re
import sys
import time
import requests
from pathlib import Path

sys.path.insert(0, os.path.dirname(__file__))
sys.path.insert(0, "/opt")

from tools import savetv

# Module-wide logger; basicConfig below sets the INFO level and a time-only
# timestamp format.
log = logging.getLogger("savetv_enrich")
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s",
    datefmt="%H:%M:%S",
)

# Base URL of the Ollama server; _call_ollama() posts to <base>/api/chat.
OLLAMA_BASE = "http://100.84.255.83:11434"
# Model requested by default.
MODEL = "qwen2.5:14b"
# Model tried once when a request to another model hits a read timeout.
FALLBACK_MODEL = "qwen3:30b-a3b"

# JSON cache file keyed by film title (see run()). According to the module
# docstring it is shared with savetv_web.py and savetv_extra_routes.py.
FILMINFO_CACHE = Path("/mnt/savetv/.filminfo_cache.json")
# Lock file that the __main__ guard flock()s so only one instance runs.
LOCKFILE = Path("/tmp/savetv_enrich.lock")
# The cache is written to disk after every BATCH_SIZE successful enrichments.
BATCH_SIZE = 8
# Pause in seconds after each processed film.
SLEEP_BETWEEN = 0.5


def _load_cache() -> dict:
    """Load the film-info cache from ``FILMINFO_CACHE``.

    The file is read as UTF-8 regardless of the process locale, because the
    cache is written with ``ensure_ascii=False`` and therefore contains raw
    non-ASCII characters.

    Returns:
        The cached mapping. An empty dict is returned when the file does not
        exist, cannot be read or decoded, is not valid JSON, or does not
        contain a JSON object. Every case except "file missing" is logged,
        since starting from an empty cache means re-enriching everything.
    """
    try:
        raw = FILMINFO_CACHE.read_text(encoding="utf-8")
    except FileNotFoundError:
        # First run: nothing cached yet, not worth a warning.
        return {}
    except (OSError, ValueError) as exc:
        # ValueError covers UnicodeDecodeError.
        log.warning("Cache %s nicht lesbar: %s", FILMINFO_CACHE, exc)
        return {}

    try:
        cache = json.loads(raw)
    except ValueError as exc:
        # json.JSONDecodeError is a ValueError subclass.
        log.warning("Cache %s enthält kein gültiges JSON: %s", FILMINFO_CACHE, exc)
        return {}

    if not isinstance(cache, dict):
        # Callers use dict methods on the result; any other JSON type is unusable.
        log.warning("Cache %s enthält kein JSON-Objekt (%s)",
                    FILMINFO_CACHE, type(cache).__name__)
        return {}
    return cache


def _save_cache(cache: dict) -> None:
    """Write *cache* to ``FILMINFO_CACHE`` as UTF-8 JSON via a temp file.

    The data is first written to a sibling ``.tmp`` file and then moved over
    the real cache file, so readers never see a half-written file.

    Args:
        cache: Mapping to serialise; must be JSON-serialisable.

    Raises:
        OSError: If writing the temp file or replacing the target fails.
        TypeError: If *cache* contains values ``json.dumps`` cannot encode.
    """
    tmp = FILMINFO_CACHE.with_suffix(".tmp")
    # ensure_ascii=False emits raw umlauts etc., so the encoding must be
    # pinned instead of depending on the process locale.
    tmp.write_text(json.dumps(cache, ensure_ascii=False, indent=1),
                   encoding="utf-8")
    # replace() overwrites an existing target unconditionally.
    tmp.replace(FILMINFO_CACHE)


def _is_enriched(entry: dict) -> bool:
    return bool(entry.get("description"))


def _call_ollama(prompt: str, model: str = MODEL) -> str:
    """Send *prompt* to the Ollama chat endpoint and return the reply text.

    A fixed system message instructs the model to answer with JSON only.
    If the reply starts with a Markdown code fence, the opening and closing
    fences are stripped.

    Args:
        prompt: User message to send.
        model: Ollama model name to request.

    Returns:
        The stripped reply text, or ``""`` when the request fails for any
        reason other than a read timeout.

    Raises:
        requests.exceptions.ReadTimeout: If the request times out while
            *model* already is ``FALLBACK_MODEL``. A timeout with any other
            model triggers one retry with ``FALLBACK_MODEL`` instead.
    """
    system_prompt = (
        "Du bist eine Filmdatenbank. Antworte AUSSCHLIESSLICH mit validem JSON. "
        "Kein Markdown, keine Erklärungen, kein Denken. Nur JSON. Sprache: Deutsch."
    )
    request_body = {
        "model": model,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ],
        "stream": False,
        "think": False,
        "options": {"num_predict": 1024},
    }
    chat_url = f"{OLLAMA_BASE}/api/chat"

    try:
        response = requests.post(chat_url, json=request_body, timeout=180)
        response.raise_for_status()
        message = response.json().get("message", {})
        answer = message.get("content", "").strip()
        if answer.startswith("```"):
            # Drop the opening fence (with optional language tag), then the
            # closing fence.
            answer = re.sub(r"^```\w*\n?", "", answer)
            answer = re.sub(r"\n?```$", "", answer)
        return answer.strip()
    except requests.exceptions.ReadTimeout:
        if model == FALLBACK_MODEL:
            # The fallback timed out as well; let the caller deal with it.
            raise
        log.warning("Timeout mit %s, Fallback auf %s", model, FALLBACK_MODEL)
        return _call_ollama(prompt, model=FALLBACK_MODEL)
    except Exception as e:
        log.error("Ollama-Fehler: %s", e)
        return ""


def _normalize_actors(actors_raw) -> list:
    if not actors_raw or not isinstance(actors_raw, list):
        return []
    result = []
    for a in actors_raw[:4]:
        if isinstance(a, str):
            result.append(a)
        elif isinstance(a, dict):
            name = a.get("name") or a.get("actor") or ""
            if name:
                result.append(name)
    return result


def _empty_film_info() -> dict:
    """Return the result used when no usable data could be obtained."""
    return {"year": "", "countries": [], "genres": [], "actors": [],
            "director": "", "description": ""}


def _text_field(value, limit: int) -> str:
    """Return a scalar JSON value as stripped text capped at *limit* chars.

    ``None``, booleans and containers yield ``""`` so that a JSON ``null``
    never turns into the literal text ``"None"``.
    """
    if value is None or isinstance(value, (bool, list, dict)):
        return ""
    return str(value).strip()[:limit]


def _string_list(value, limit: int = 3) -> list:
    """Coerce a JSON value into a list of at most *limit* non-empty strings."""
    if isinstance(value, str):
        # A bare string instead of an array: keep it as one item. Slicing
        # and iterating it directly would split it into single characters.
        value = [value]
    elif not isinstance(value, list):
        return []
    items = []
    for element in value:
        text = _text_field(element, 10_000)
        if text:
            items.append(text)
    return items[:limit]


def _enrich_film(title: str) -> dict:
    """Ask the LLM for metadata about *title* and return it normalised.

    A trailing " - subtitle" part (hyphen, en dash or em dash surrounded by
    whitespace) is removed from the title before querying. Dashes without
    surrounding whitespace are kept, so hyphenated titles stay intact.

    Args:
        title: Film title as stored in the archive.

    Returns:
        A dict with the keys ``year`` (four-digit string or ``""``),
        ``countries`` and ``genres`` (lists of up to three strings),
        ``actors`` (list of names), ``director`` (up to 60 chars) and
        ``description`` (up to 600 chars). All fields are empty when the
        model gave no reply or no JSON object could be parsed. The
        description is emptied when it is not predominantly Latin script.

    Raises:
        requests.exceptions.ReadTimeout: Propagated from ``_call_ollama``.
    """
    # Require whitespace around the dash; fall back to the full title if
    # stripping would leave nothing.
    clean_title = (re.sub(r"\s+[-\u2013\u2014]\s+.+$", "", title).strip()
                   or title.strip())

    prompt = f"""Gib mir Informationen zum Film "{clean_title}".
Antworte als JSON mit exakt diesen Feldern (alle Texte auf Deutsch):
{{
  "year": "Erscheinungsjahr als String",
  "countries": ["Produktionsländer als Strings"],
  "genres": ["bis zu 3 Genres als Strings"],
  "actors": ["bis zu 4 Hauptdarsteller als Strings"],
  "director": "Regisseur als String",
  "description": "3-5 Sätze auf Deutsch: Worum geht es, für wen geeignet. Keine Spoiler."
}}
Wichtig: Alle Werte müssen Strings sein. Schreibe die description komplett auf Deutsch.
Falls du den Film nicht kennst, setze description auf leeren String."""

    raw = _call_ollama(prompt)
    if not raw:
        log.warning("  Leere Antwort von Ollama für '%s'", title)
        return _empty_film_info()

    try:
        data = json.loads(raw)
    except json.JSONDecodeError:
        data = None
    if not isinstance(data, dict):
        # Either not JSON at all or valid JSON of the wrong type (list,
        # string, ...): look for an embedded {...} object instead.
        match = re.search(r"\{[\s\S]*\}", raw)
        if not match:
            log.warning("Kein JSON gefunden für '%s': %.200s", title, raw)
            return _empty_film_info()
        try:
            data = json.loads(match.group())
        except json.JSONDecodeError:
            data = None
        if not isinstance(data, dict):
            log.warning("JSON-Parse fehlgeschlagen für '%s': %.200s", title, raw)
            return _empty_film_info()

    desc = _text_field(data.get("description"), 600)
    if desc and not _is_mostly_latin(desc):
        log.info("  Nicht-lateinische Beschreibung gefiltert: %.80s", desc)
        desc = ""
    if not desc:
        log.info("  Beschreibung leer, raw year=%s actors=%s",
                 data.get("year"), str(data.get("actors", []))[:80])

    # Take the first four-digit run so values like "ca. 1994" or 1994 (int)
    # both give "1994"; anything without a year gives "".
    year_match = re.search(r"\d{4}", _text_field(data.get("year"), 40))

    return {
        "year": year_match.group() if year_match else "",
        "countries": _string_list(data.get("countries")),
        "genres": _string_list(data.get("genres")),
        "actors": _normalize_actors(data.get("actors")),
        "director": _text_field(data.get("director"), 60),
        "description": desc,
    }


def _is_mostly_latin(text: str) -> bool:
    if not text:
        return False
    latin = sum(1 for c in text if c.isascii() or '\u00C0' <= c <= '\u024F')
    return latin / max(len(text), 1) > 0.7


def run():
    """Enrich every archive film that has no description in the cache yet.

    Steps:
      1. Fetch the full archive via ``savetv._get_full_archive()``.
      2. Collect the distinct titles of entries whose ``SFOLGE`` field is
         empty and which ``savetv._is_excluded`` does not reject.
      3. Call ``_enrich_film`` for each title without a cached description.
         A result with a description replaces the cache entry; otherwise
         only the returned base fields are merged into the existing entry.
      4. Save the cache after every ``BATCH_SIZE`` successes and once more
         at the end.

    Errors for a single film are logged and do not stop the run.
    """
    log.info("Starte Film-Enrichment...")

    archive = savetv._get_full_archive()
    if not archive:
        log.warning("Keine Archiv-Einträge von Save.TV erhalten")
        return

    film_titles = set()
    for record in archive:
        telecast = record.get("STRTELECASTENTRY", {})
        # Entries with a non-empty SFOLGE are skipped
        # (presumably series episodes — TODO confirm).
        if not telecast.get("SFOLGE", ""):
            name = telecast.get("STITLE", "")
            if name and not savetv._is_excluded(name):
                film_titles.add(name)

    cache = _load_cache()
    pending = []
    for name in film_titles:
        if not _is_enriched(cache.get(name, {})):
            pending.append(name)

    total = len(pending)
    if total == 0:
        log.info("Alle %d Filme bereits angereichert", len(film_titles))
        return

    log.info("%d Filme im Archiv, %d davon noch ohne KI-Beschreibung",
             len(film_titles), total)

    done = 0
    for position, name in enumerate(pending, start=1):
        log.info("[%d/%d] Anreichern: %s", position, total, name)
        try:
            info = _enrich_film(name)
            if not info.get("description"):
                # No description: keep what is cached and only fill in the
                # base data. year/countries/genres never overwrite existing
                # values; actors/director do.
                previous = cache.get(name, {})
                for key in ("year", "countries", "genres"):
                    if info.get(key) and not previous.get(key):
                        previous[key] = info[key]
                for key in ("actors", "director"):
                    if info.get(key):
                        previous[key] = info[key]
                cache[name] = previous
                log.info("  Keine Beschreibung erhalten, Basisdaten übernommen")
            else:
                cache[name] = info
                done += 1
                lead_actors = ", ".join(info.get("actors", [])[:2])
                log.info("  OK: %s (%s)", info.get("year", "?"), lead_actors)
                if done % BATCH_SIZE == 0:
                    # Periodic checkpoint so a long run keeps its progress.
                    _save_cache(cache)
                    log.info("  Cache gespeichert (%d angereichert)", done)
        except Exception as e:
            log.error("  Fehler bei '%s': %s", name, e)

        time.sleep(SLEEP_BETWEEN)

    _save_cache(cache)
    log.info("Fertig: %d von %d Filmen neu angereichert", done, total)


if __name__ == "__main__":
    # Single-instance guard: take a non-blocking exclusive flock on LOCKFILE.
    # If another process already holds it, report that and exit with status 0.
    with open(LOCKFILE, "w") as lock_fd:
        try:
            fcntl.flock(lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
        except BlockingIOError:
            print("Enricher läuft bereits — Abbruch.")
            sys.exit(0)

        try:
            run()
        finally:
            # Release the lock explicitly; the with-block then closes the file.
            fcntl.flock(lock_fd, fcntl.LOCK_UN)