savetv_enrich: File-Lock + Debug-Logging + robusterer Cache-Save
- fcntl.flock verhindert parallele Enricher-Instanzen
- Atomarer Cache-Save über tmp-Datei
- Debug-Logs bei leerer/gefilterter Beschreibung
- Sleep auf 2s erhöht für stabilere Ollama-Antworten
This commit is contained in:
parent
908aadd13f
commit
e204edf2ea
1 changed files with 30 additions and 14 deletions
|
|
@ -9,6 +9,7 @@ die savetv_web.py und savetv_extra_routes.py verwenden.
|
||||||
Ergebnis: 3-6 Sätze Beschreibung, Hauptdarsteller, Land, Jahr, Genre.
|
Ergebnis: 3-6 Sätze Beschreibung, Hauptdarsteller, Land, Jahr, Genre.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import fcntl
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
|
|
@ -35,8 +36,9 @@ MODEL = "qwen2.5:14b"
|
||||||
FALLBACK_MODEL = "qwen3:30b-a3b"
|
FALLBACK_MODEL = "qwen3:30b-a3b"
|
||||||
|
|
||||||
FILMINFO_CACHE = Path("/mnt/savetv/.filminfo_cache.json")
|
FILMINFO_CACHE = Path("/mnt/savetv/.filminfo_cache.json")
|
||||||
|
LOCKFILE = Path("/tmp/savetv_enrich.lock")
|
||||||
BATCH_SIZE = 8
|
BATCH_SIZE = 8
|
||||||
SLEEP_BETWEEN = 1.5
|
SLEEP_BETWEEN = 2.0
|
||||||
|
|
||||||
|
|
||||||
def _load_cache() -> dict:
|
def _load_cache() -> dict:
|
||||||
|
|
@ -49,16 +51,16 @@ def _load_cache() -> dict:
|
||||||
|
|
||||||
|
|
||||||
def _save_cache(cache: dict):
    """Persist the film-info cache to FILMINFO_CACHE atomically.

    Serializes *cache* as JSON into a sibling ``.tmp`` file first and then
    replaces the real cache file in one step, so concurrent readers never
    observe a half-written JSON file.
    """
    tmp = FILMINFO_CACHE.with_suffix(".tmp")
    # Explicit UTF-8: write_text without an encoding falls back to the
    # locale encoding, which can corrupt non-ASCII titles/descriptions.
    tmp.write_text(json.dumps(cache, ensure_ascii=False, indent=1),
                   encoding="utf-8")
    # os.replace is atomic on POSIX and, unlike Path.rename, also
    # overwrites an existing target on Windows.
    os.replace(tmp, FILMINFO_CACHE)
|
|
||||||
|
|
||||||
def _is_enriched(entry: dict) -> bool:
|
def _is_enriched(entry: dict) -> bool:
|
||||||
"""Prüft ob ein Cache-Eintrag bereits KI-angereichert ist."""
|
|
||||||
return bool(entry.get("description"))
|
return bool(entry.get("description"))
|
||||||
|
|
||||||
|
|
||||||
def _call_ollama(prompt: str, model: str = MODEL) -> str:
|
def _call_ollama(prompt: str, model: str = MODEL) -> str:
|
||||||
"""Ruft Ollama via native /api/chat auf."""
|
|
||||||
payload = {
|
payload = {
|
||||||
"model": model,
|
"model": model,
|
||||||
"messages": [
|
"messages": [
|
||||||
|
|
@ -95,7 +97,6 @@ def _call_ollama(prompt: str, model: str = MODEL) -> str:
|
||||||
|
|
||||||
|
|
||||||
def _normalize_actors(actors_raw) -> list:
|
def _normalize_actors(actors_raw) -> list:
|
||||||
"""Wandelt actors-Feld in eine einfache String-Liste um."""
|
|
||||||
if not actors_raw or not isinstance(actors_raw, list):
|
if not actors_raw or not isinstance(actors_raw, list):
|
||||||
return []
|
return []
|
||||||
result = []
|
result = []
|
||||||
|
|
@ -110,7 +111,6 @@ def _normalize_actors(actors_raw) -> list:
|
||||||
|
|
||||||
|
|
||||||
def _enrich_film(title: str) -> dict:
|
def _enrich_film(title: str) -> dict:
|
||||||
"""Fragt die KI nach Filmdaten zu einem Titel."""
|
|
||||||
clean_title = re.sub(r"\s*[-\u2013\u2014]\s*.+$", "", title).strip()
|
clean_title = re.sub(r"\s*[-\u2013\u2014]\s*.+$", "", title).strip()
|
||||||
|
|
||||||
prompt = f"""Gib mir Informationen zum Film "{clean_title}".
|
prompt = f"""Gib mir Informationen zum Film "{clean_title}".
|
||||||
|
|
@ -128,6 +128,7 @@ Falls du den Film nicht kennst, setze description auf leeren String."""
|
||||||
|
|
||||||
raw = _call_ollama(prompt)
|
raw = _call_ollama(prompt)
|
||||||
if not raw:
|
if not raw:
|
||||||
|
log.warning(" Leere Antwort von Ollama für '%s'", title)
|
||||||
return {"year": "", "countries": [], "genres": [], "actors": [],
|
return {"year": "", "countries": [], "genres": [], "actors": [],
|
||||||
"director": "", "description": ""}
|
"director": "", "description": ""}
|
||||||
|
|
||||||
|
|
@ -139,17 +140,21 @@ Falls du den Film nicht kennst, setze description auf leeren String."""
|
||||||
try:
|
try:
|
||||||
data = json.loads(match.group())
|
data = json.loads(match.group())
|
||||||
except json.JSONDecodeError:
|
except json.JSONDecodeError:
|
||||||
log.warning("JSON-Parse fehlgeschlagen für '%s': %s", title, raw[:100])
|
log.warning("JSON-Parse fehlgeschlagen für '%s': %.200s", title, raw)
|
||||||
return {"year": "", "countries": [], "genres": [], "actors": [],
|
return {"year": "", "countries": [], "genres": [], "actors": [],
|
||||||
"director": "", "description": ""}
|
"director": "", "description": ""}
|
||||||
else:
|
else:
|
||||||
log.warning("Kein JSON gefunden für '%s': %s", title, raw[:100])
|
log.warning("Kein JSON gefunden für '%s': %.200s", title, raw)
|
||||||
return {"year": "", "countries": [], "genres": [], "actors": [],
|
return {"year": "", "countries": [], "genres": [], "actors": [],
|
||||||
"director": "", "description": ""}
|
"director": "", "description": ""}
|
||||||
|
|
||||||
desc = str(data.get("description", ""))[:600]
|
desc = str(data.get("description", ""))[:600]
|
||||||
if not _is_mostly_latin(desc):
|
if desc and not _is_mostly_latin(desc):
|
||||||
|
log.info(" Nicht-lateinische Beschreibung gefiltert: %.80s", desc)
|
||||||
desc = ""
|
desc = ""
|
||||||
|
if not desc:
|
||||||
|
log.info(" Beschreibung leer, raw year=%s actors=%s",
|
||||||
|
data.get("year"), str(data.get("actors", []))[:80])
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"year": str(data.get("year", ""))[:4],
|
"year": str(data.get("year", ""))[:4],
|
||||||
|
|
@ -162,7 +167,6 @@ Falls du den Film nicht kennst, setze description auf leeren String."""
|
||||||
|
|
||||||
|
|
||||||
def _is_mostly_latin(text: str) -> bool:
|
def _is_mostly_latin(text: str) -> bool:
|
||||||
"""Prüft ob ein Text hauptsächlich lateinische Zeichen enthält."""
|
|
||||||
if not text:
|
if not text:
|
||||||
return False
|
return False
|
||||||
latin = sum(1 for c in text if c.isascii() or '\u00C0' <= c <= '\u024F')
|
latin = sum(1 for c in text if c.isascii() or '\u00C0' <= c <= '\u024F')
|
||||||
|
|
@ -170,7 +174,6 @@ def _is_mostly_latin(text: str) -> bool:
|
||||||
|
|
||||||
|
|
||||||
def run():
|
def run():
|
||||||
"""Hauptfunktion: Archiv laden, fehlende Filme anreichern."""
|
|
||||||
log.info("Starte Film-Enrichment...")
|
log.info("Starte Film-Enrichment...")
|
||||||
|
|
||||||
entries = savetv._get_full_archive()
|
entries = savetv._get_full_archive()
|
||||||
|
|
@ -194,7 +197,8 @@ def run():
|
||||||
log.info("Alle %d Filme bereits angereichert", len(titles))
|
log.info("Alle %d Filme bereits angereichert", len(titles))
|
||||||
return
|
return
|
||||||
|
|
||||||
log.info("%d Filme im Archiv, %d davon noch ohne KI-Beschreibung", len(titles), len(missing))
|
log.info("%d Filme im Archiv, %d davon noch ohne KI-Beschreibung",
|
||||||
|
len(titles), len(missing))
|
||||||
|
|
||||||
enriched = 0
|
enriched = 0
|
||||||
for i, title in enumerate(missing):
|
for i, title in enumerate(missing):
|
||||||
|
|
@ -204,7 +208,8 @@ def run():
|
||||||
if info.get("description"):
|
if info.get("description"):
|
||||||
cache[title] = info
|
cache[title] = info
|
||||||
enriched += 1
|
enriched += 1
|
||||||
log.info(" OK: %s (%s)", info.get("year", "?"), ", ".join(info.get("actors", [])[:2]))
|
log.info(" OK: %s (%s)", info.get("year", "?"),
|
||||||
|
", ".join(info.get("actors", [])[:2]))
|
||||||
if enriched % BATCH_SIZE == 0:
|
if enriched % BATCH_SIZE == 0:
|
||||||
_save_cache(cache)
|
_save_cache(cache)
|
||||||
log.info(" Cache gespeichert (%d angereichert)", enriched)
|
log.info(" Cache gespeichert (%d angereichert)", enriched)
|
||||||
|
|
@ -229,4 +234,15 @@ def run():
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Exclusive, non-blocking file lock prevents two enricher instances
    # from running in parallel (e.g. overlapping cron invocations that
    # would race on the cache file).
    # 'with' guarantees the lock fd is closed on every exit path — the
    # original left it open when flock failed and sys.exit was called.
    with open(LOCKFILE, "w") as lock_fd:
        try:
            fcntl.flock(lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
        except BlockingIOError:
            # Another instance holds the lock; exit quietly with status 0.
            print("Enricher läuft bereits — Abbruch.")
            sys.exit(0)

        try:
            run()
        finally:
            # Closing the fd would also drop the flock, but releasing
            # explicitly keeps the unlock deterministic and documented.
            fcntl.flock(lock_fd, fcntl.LOCK_UN)
|
|
|
||||||
Loading…
Add table
Reference in a new issue