feat: KI-Enricher für Save.TV Filmdatenbank
Nutzt Ollama (qwen3:30b), um Filme im Archiv automatisch mit Beschreibung, Hauptdarstellern, Land und Genre anzureichern. Läuft als Cronjob alle 3h, Ergebnis sofort in der Web-UI sichtbar.
This commit is contained in:
parent
43f5586ea2
commit
45b2903fb6
1 changed files with 199 additions and 0 deletions
199
homelab-ai-bot/savetv_enrich.py
Normal file
199
homelab-ai-bot/savetv_enrich.py
Normal file
|
|
@ -0,0 +1,199 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Save.TV Film-Enricher — reichert Archiv-Filme per KI an.
|
||||
|
||||
Läuft als Cronjob (z.B. alle 3h) auf CT 116.
|
||||
Nutzt Ollama auf dem KI-Server (gleicher Endpunkt wie llm.py).
|
||||
Schreibt in /mnt/savetv/.filminfo_cache.json — dieselbe Datei
|
||||
die savetv_web.py und savetv_extra_routes.py verwenden.
|
||||
|
||||
Ergebnis: 3-5 Sätze Beschreibung, Hauptdarsteller, Regisseur, Land, Jahr, Genre.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
import requests
|
||||
from pathlib import Path
|
||||
|
||||
sys.path.insert(0, os.path.dirname(__file__))
|
||||
sys.path.insert(0, "/opt")
|
||||
|
||||
from tools import savetv
|
||||
|
||||
# Module-level logger used by every function in this script.
log = logging.getLogger("savetv_enrich")
# Configure root logging once at import time: INFO level, time-only timestamps.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s",
    datefmt="%H:%M:%S",
)

# Base URL of the Ollama server; "/v1/chat/completions" is appended when calling it.
OLLAMA_BASE = "http://100.84.255.83:11434"
# Model used by default for every lookup.
MODEL = "qwen3:30b-a3b"
# Model retried once when the default model hits a read timeout.
FALLBACK_MODEL = "qwen2.5:14b"

# JSON file holding the per-title film info; the module docstring states that
# savetv_web.py and savetv_extra_routes.py use the same file.
FILMINFO_CACHE = Path("/mnt/savetv/.filminfo_cache.json")
# The cache is written to disk after every BATCH_SIZE successfully enriched films.
BATCH_SIZE = 8
# Pause in seconds between two consecutive film lookups.
SLEEP_BETWEEN = 1.5
|
||||
|
||||
|
||||
def _load_cache() -> dict:
    """Load the film-info cache from ``FILMINFO_CACHE``.

    The load is best-effort: any problem yields an empty cache instead of an
    exception, but every problem other than "file does not exist yet" is
    logged, because the caller later writes the cache back and would thereby
    overwrite the damaged file.

    Returns:
        The decoded JSON object, or ``{}`` when the file is missing,
        unreadable, not valid JSON, or does not contain a JSON object.
    """
    try:
        # Explicit UTF-8: the file is written with ensure_ascii=False, so the
        # decoding must not depend on the locale of the cron environment.
        raw = FILMINFO_CACHE.read_text(encoding="utf-8")
    except FileNotFoundError:
        # First run: nothing cached yet.
        return {}
    except (OSError, UnicodeDecodeError) as exc:
        log.warning("Cache %s nicht lesbar, starte mit leerem Cache: %s",
                    FILMINFO_CACHE, exc)
        return {}

    try:
        cache = json.loads(raw)
    except ValueError as exc:  # json.JSONDecodeError is a ValueError
        log.warning("Cache %s enthält kein gültiges JSON, starte mit leerem Cache: %s",
                    FILMINFO_CACHE, exc)
        return {}

    if not isinstance(cache, dict):
        # Callers use .get() on the result; a list/str root would crash them.
        log.warning("Cache %s enthält kein JSON-Objekt, starte mit leerem Cache",
                    FILMINFO_CACHE)
        return {}
    return cache
|
||||
|
||||
|
||||
def _save_cache(cache: dict) -> None:
    """Serialize *cache* as JSON and write it to ``FILMINFO_CACHE``.

    Args:
        cache: Mapping of film title to its info record.

    Raises:
        OSError: If the file cannot be written.
        TypeError: If *cache* contains values that are not JSON-serializable.
    """
    # ensure_ascii=False emits umlauts verbatim, so the file encoding must be
    # fixed to UTF-8 rather than left to the process locale.
    # NOTE(review): the write is not atomic; a concurrent reader could observe
    # a partially written file — confirm whether the web UI tolerates that.
    FILMINFO_CACHE.write_text(
        json.dumps(cache, ensure_ascii=False, indent=1),
        encoding="utf-8",
    )
|
||||
|
||||
|
||||
def _is_enriched(entry: dict) -> bool:
|
||||
"""Prüft ob ein Cache-Eintrag bereits KI-angereichert ist."""
|
||||
return bool(entry.get("description"))
|
||||
|
||||
|
||||
def _call_ollama(prompt: str, model: str = MODEL) -> str:
    """Send *prompt* to the Ollama chat-completions endpoint and return the reply text.

    The reply is cleaned before it is returned: a leading ``<think>...</think>``
    block and a surrounding Markdown code fence are removed, so that the caller
    receives (ideally) bare JSON.

    Args:
        prompt: User message; ``" /no_think"`` is appended to it.
        model: Model name to query. On a read timeout the call is retried
            once with ``FALLBACK_MODEL``.

    Returns:
        The cleaned reply text, or ``""`` on any error other than a read
        timeout of the fallback model.

    Raises:
        requests.exceptions.ReadTimeout: If the fallback model times out too.
    """
    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": (
                "Du bist eine Filmdatenbank. Antworte NUR mit validem JSON, "
                "kein Markdown, keine Erklärungen. /no_think"
            )},
            {"role": "user", "content": prompt + " /no_think"},
        ],
        "max_tokens": 800,
        "stream": False,
    }
    try:
        r = requests.post(
            f"{OLLAMA_BASE}/v1/chat/completions",
            json=payload, timeout=120,
        )
        r.raise_for_status()
        text = r.json()["choices"][0]["message"]["content"].strip()
        # Even with /no_think the model may prepend an (empty) <think> block.
        # Drop it first, otherwise the code-fence check below never matches.
        text = re.sub(r"^<think>[\s\S]*?</think>\s*", "", text)
        if text.startswith("```"):
            text = re.sub(r"^```\w*\n?", "", text)
            text = re.sub(r"\n?```$", "", text)
        return text.strip()
    except requests.exceptions.ReadTimeout:
        if model != FALLBACK_MODEL:
            log.warning("Timeout mit %s, Fallback auf %s", model, FALLBACK_MODEL)
            return _call_ollama(prompt, model=FALLBACK_MODEL)
        # The fallback timed out as well: let the caller decide what to do.
        raise
    except Exception as e:
        # Best-effort: connection errors, HTTP errors and malformed response
        # bodies all result in "no answer".
        log.error("Ollama-Fehler: %s", e)
        return ""
|
||||
|
||||
|
||||
def _empty_film_info() -> dict:
    """Return a fresh film record in which every field is empty."""
    return {"year": "", "countries": [], "genres": [], "actors": [],
            "director": "", "description": ""}


def _as_text(value, limit: int) -> str:
    """Coerce a JSON scalar to a stripped string of at most *limit* characters.

    ``None`` (JSON null), booleans and containers yield ``""`` so that they can
    never end up as truthy garbage such as ``"None"`` in the cache.
    """
    if value is None or isinstance(value, (bool, list, dict)):
        return ""
    return str(value).strip()[:limit]


def _as_str_list(value, limit: int) -> list:
    """Coerce a JSON value to a list of at most *limit* non-empty strings.

    A bare string is treated as a one-element list instead of being sliced
    character by character; nested containers and nulls are skipped.
    """
    if isinstance(value, str):
        value = [value]
    elif not isinstance(value, (list, tuple)):
        return []
    items = [
        str(item).strip()
        for item in value
        if item is not None and not isinstance(item, (bool, list, dict))
    ]
    return [item for item in items if item][:limit]


def _enrich_film(title: str) -> dict:
    """Ask the LLM for film data about *title* and return a normalized record.

    Args:
        title: Film title as delivered by the archive. A trailing
            ``" - subtitle"`` part (dash surrounded by whitespace) is removed
            before the title is put into the prompt.

    Returns:
        A dict with the keys ``year``, ``countries``, ``genres``, ``actors``,
        ``director`` and ``description``. All fields are empty when the model
        gave no answer or the answer could not be parsed as a JSON object.
    """
    # Require whitespace around the dash so that hyphenated titles such as
    # "Spider-Man" or "X-Men" are not truncated at the hyphen.
    clean_title = re.sub(r"\s+[-\u2013\u2014]\s+.+$", "", title).strip() or title.strip()

    prompt = f"""Gib mir Informationen zum Film "{clean_title}".
Antworte als JSON mit exakt diesen Feldern:
{{
"year": "Erscheinungsjahr als String oder leer",
"countries": ["Produktionsland/länder"],
"genres": ["bis zu 3 Genres"],
"actors": ["bis zu 4 Hauptdarsteller"],
"director": "Regisseur oder leer",
"description": "3-5 Sätze auf Deutsch: Worum geht es, was macht den Film besonders, für wen ist er geeignet. Keine Spoiler."
}}
Falls du den Film nicht kennst, setze description auf leer und die anderen Felder soweit bekannt."""

    raw = _call_ollama(prompt)
    if not raw:
        return _empty_film_info()

    try:
        data = json.loads(raw)
    except json.JSONDecodeError:
        # The model sometimes wraps the JSON in extra text; fall back to the
        # outermost {...} span.
        data = None
        match = re.search(r"\{[\s\S]*\}", raw)
        if match:
            try:
                data = json.loads(match.group())
            except json.JSONDecodeError:
                data = None

    if not isinstance(data, dict):
        # Covers unparsable replies as well as valid JSON that is not an object.
        log.warning("JSON-Parse fehlgeschlagen für '%s'", title)
        return _empty_film_info()

    return {
        "year": _as_text(data.get("year"), 4),
        "countries": _as_str_list(data.get("countries"), 3),
        "genres": _as_str_list(data.get("genres"), 3),
        "actors": _as_str_list(data.get("actors"), 4),
        "director": _as_text(data.get("director"), 60),
        "description": _as_text(data.get("description"), 600),
    }
|
||||
|
||||
|
||||
def run():
    """Load the archive and enrich every film that has no description yet.

    Steps:
      1. Fetch the archive via ``savetv._get_full_archive()``.
      2. Collect the titles of all entries that have no ``SFOLGE`` value and
         are not rejected by ``savetv._is_excluded``.
      3. For every title whose cache entry has no description, query the LLM.
         A result with a description is merged into the entry and counted as
         enriched; a result without one only fills in missing basic fields.
      4. Write the cache after every ``BATCH_SIZE`` enriched films and once
         more at the end.

    Errors for a single title are logged and do not abort the run.
    """
    log.info("Starte Film-Enrichment...")

    entries = savetv._get_full_archive()
    if not entries:
        log.warning("Keine Archiv-Einträge von Save.TV erhalten")
        return

    titles = set()
    for e in entries:
        tc = e.get("STRTELECASTENTRY") or {}
        if tc.get("SFOLGE", ""):
            continue
        title = tc.get("STITLE", "")
        if title and not savetv._is_excluded(title):
            titles.add(title)

    cache = _load_cache()
    # Sorted so that processing order and log output are reproducible
    # (set iteration order varies between interpreter runs).
    missing = sorted(t for t in titles if not _is_enriched(cache.get(t) or {}))

    if not missing:
        log.info("Alle %d Filme bereits angereichert", len(titles))
        return

    log.info("%d Filme im Archiv, %d davon noch ohne KI-Beschreibung", len(titles), len(missing))

    enriched = 0
    total = len(missing)
    for i, title in enumerate(missing, start=1):
        log.info("[%d/%d] Anreichern: %s", i, total, title)
        try:
            info = _enrich_film(title)
            old = cache.get(title) or {}
            if info.get("description"):
                # Merge instead of replacing: keep fields already stored in
                # the entry and never let an empty answer clobber an existing
                # value. Keys missing in the old entry are always added, so
                # the entry ends up with every field of `info`.
                merged = dict(old)
                for k, v in info.items():
                    if v or k not in merged:
                        merged[k] = v
                cache[title] = merged
                enriched += 1
                if enriched % BATCH_SIZE == 0:
                    _save_cache(cache)
                    log.info("  Cache gespeichert (%d angereichert)", enriched)
            elif any(info.values()):
                # No description, but some basic data: fill gaps only for
                # year/countries/genres, overwrite actors and director.
                for k in ("year", "countries", "genres"):
                    if info.get(k) and not old.get(k):
                        old[k] = info[k]
                if info.get("actors"):
                    old["actors"] = info["actors"]
                if info.get("director"):
                    old["director"] = info["director"]
                cache[title] = old
                log.info("  Keine Beschreibung erhalten, Basisdaten übernommen")
            else:
                # Nothing usable came back (e.g. request failed): leave the
                # cache untouched instead of inserting an empty entry.
                log.info("  Keine Daten erhalten, Eintrag unverändert")
        except Exception as e:
            # Boundary of the per-film work: log and continue with the next.
            log.error("  Fehler bei '%s': %s", title, e)

        if i < total:
            # Throttle requests; no need to wait after the last film.
            time.sleep(SLEEP_BETWEEN)

    _save_cache(cache)
    log.info("Fertig: %d von %d Filmen neu angereichert", enriched, len(missing))
|
||||
|
||||
|
||||
# Script entry point: perform one enrichment pass when the file is executed
# directly (the module docstring describes it as being run from cron).
if __name__ == "__main__":
    run()
|
||||
Loading…
Add table
Reference in a new issue