fix: predict.py ohne interne LLM-Analyse, strukturierter Daten-Report

- Tool gibt strukturierten Report zurueck (kein verschachtelter LLM-Aufruf mehr)
- Klare Hervorhebung: KRITISCH (>=100 Fehler), Erhoeht (>=20), Disk-Trend
- Proxmox: korrekte Passwort/Token-Ermittlung, nur echte pve-/pbs-Hosts im Fehler-Log
- daily_forecast: LLM analysiert den Report, bevor er gesendet wird
2026-03-21 13:44:35 +01:00 · 2026-03-21 13:44:35 +01:00 · 47bd67eb97
commit 47bd67eb97
parent a39930e9ae
2 changed files with 117 additions and 98 deletions
--- a/homelab-ai-bot/telegram_bot.py
+++ b/homelab-ai-bot/telegram_bot.py
@ -822,14 +822,27 @@ async def handle_callback(update: Update, ctx: ContextTypes.DEFAULT_TYPE):


 async def _send_daily_forecast(context):
-    """Taeglich 08:00 Uhr: KI-Systemvorhersage senden."""
+    """Taeglich 08:00 Uhr: Systemdaten sammeln, LLM analysiert, Ergebnis senden."""
    if not CHAT_ID:
        return
    bot = getattr(context, "bot", None) or context
    try:
        from tools.predict import handle_get_health_forecast
-        result = await asyncio.get_event_loop().run_in_executor(None, handle_get_health_forecast)
-        await bot.send_message(chat_id=CHAT_ID, text=result, parse_mode="Markdown")
+        from llm import ask_with_tools
+        report = await asyncio.get_event_loop().run_in_executor(None, handle_get_health_forecast)
+        prompt = (
+            "Morgendlicher System-Check. Analysiere diesen Report und gib eine kurze "
+            "Prognose (max 10 Zeilen) ob sich Probleme anbahnen. Nenne nur echte Auffälligkeiten, "
+            "gib klare Handlungsempfehlungen wenn nötig. Wenn alles OK ist, reicht eine kurze Entwarnung.
+
+"
+            + report
+        )
+        analysis = await asyncio.get_event_loop().run_in_executor(None, ask_with_tools, prompt)
+        msg = f"🔭 *Tägliche Systemvorhersage*
+
+{analysis}"
+        await bot.send_message(chat_id=CHAT_ID, text=msg, parse_mode="Markdown")
        log.info("Taegl. Systemvorhersage gesendet")
    except Exception:
        log.exception("Fehler beim Senden der Systemvorhersage")
--- a/homelab-ai-bot/tools/predict.py
+++ b/homelab-ai-bot/tools/predict.py
@ -1,26 +1,23 @@
-"""KI-gestützte Systemvorhersage — analysiert Logs, Metriken und Container-Status."""
+"""KI-gestützte Systemvorhersage — sammelt Metriken, Logs und Container-Status."""

-import json
+import re
 import requests
 from datetime import datetime, timezone, timedelta

 from core import prometheus_client, loki_client, config
 from core import proxmox_client

-OLLAMA_URL = "http://100.84.255.83:11434"
-FORECAST_MODEL = "qwen3:30b-a3b"
-
 TOOLS = [
    {
        "type": "function",
        "function": {
            "name": "get_health_forecast",
            "description": (
-                "KI-gestützte Systemvorhersage für das Homelab. Analysiert Fehler-Logs, "
-                "Disk-Trends, CPU/RAM-Auslastung und Container-Status. Gibt eine Prognose "
-                "aus, ob sich Probleme anbahnen — z.B. voller Speicher, häufige Abstürze, "
-                "steigende Fehlerquoten. Trigger: 'vorhersage', 'was bahnt sich an', "
-                "'prognose', 'health forecast', 'system check', 'systemstatus'."
+                "Sammelt Systemdaten (Disk-Trends, Log-Fehler, Container-Status, CPU/RAM) "
+                "und gibt einen strukturierten Report zurück. Analysiere den Report und gib "
+                "eine Prognose, ob sich Probleme anbahnen. Trigger: 'vorhersage', "
+                "'was bahnt sich an', 'prognose', 'health forecast', 'system check', "
+                "'systemstatus', 'wie geht es dem system'."
            ),
            "parameters": {"type": "object", "properties": {}, "required": []},
        },
@ -95,17 +92,20 @@ def _gather_proxmox() -> dict:
    result = {}
    try:
        cfg = config.parse_config()
-        passwords = {}
+
+        pw_default = cfg.passwords.get("default", "")
+        pw_hetzner = cfg.passwords.get("hetzner", pw_default)
+        passwords = {"default": pw_default, "pve-hetzner": pw_hetzner}
+        for host in proxmox_client.PROXMOX_HOSTS:
+            if host not in passwords:
+                passwords[host] = pw_default
+
        tokens = {}
-        for pve_host in cfg.proxmox_hosts:
-            name = pve_host.get("name", "")
-            pw = pve_host.get("password", "")
-            tok_name = pve_host.get("token_name", "")
-            tok_val = pve_host.get("token_value", "")
-            if pw:
-                passwords[name] = pw
-            if tok_name and tok_val:
-                tokens[name] = {"name": tok_name, "value": tok_val}
+        tn = cfg.raw.get("PVE_TOKEN_HETZNER_NAME", "")
+        tv = cfg.raw.get("PVE_TOKEN_HETZNER_VALUE", "")
+        if tn and tv:
+            tokens["pve-hetzner"] = {"name": tn, "value": tv}
+
        containers = proxmox_client.get_all_containers(passwords=passwords, tokens=tokens)
        stopped = [
            {"id": c.get("vmid"), "name": c.get("name", "?")}
@ -113,112 +113,118 @@ def _gather_proxmox() -> dict:
            if c.get("status") == "stopped" and "error" not in c
        ]
        running = len([c for c in containers if c.get("status") == "running"])
+        errors = [c for c in containers if "error" in c]
        result["total"] = len(containers)
        result["running"] = running
        result["stopped"] = stopped
+        result["host_errors"] = [
+            f"{e.get('_host', '?')}: {e['error'][:80]}" for e in errors
+        ]
    except Exception as e:
        result["proxmox_error"] = str(e)
    return result


-def _call_analysis_llm(data_summary: str) -> str:
-    now_str = datetime.now().strftime("%d.%m.%Y %H:%M")
-    prompt = (
-        f"Du bist ein Homelab-Monitoring-Experte. Heute ist der {now_str}.\n"
-        "Analysiere die folgenden System-Rohdaten und erstelle eine kompakte Prognose.\n\n"
-        "REGELN:\n"
-        "- Nur echte Auffälligkeiten nennen (nicht jede normale Metrik)\n"
-        "- Disk-Delta > 2% in 24h = Warnung\n"
-        "- Disk > 80% = kritisch\n"
-        "- RAM > 85% = Warnung\n"
-        "- Fehler > 50 in 24h für einen Host = Warnung\n"
-        "- Gestoppte Container = prüfen ob OK\n"
-        "- Wenn alles normal: kurze Entwarnung genügt\n"
-        "- Max 12 Zeilen, Emojis erlaubt, auf Deutsch\n"
-        "- Klare Handlungsempfehlung wenn nötig\n\n"
-        f"System-Daten:\n{data_summary}\n\n"
-        "Prognose:"
-    )
-    try:
-        r = requests.post(
-            f"{OLLAMA_URL}/api/chat",
-            json={
-                "model": FORECAST_MODEL,
-                "messages": [{"role": "user", "content": prompt + " /no_think"}],
-                "stream": False,
-                "options": {"num_predict": 700, "temperature": 0.3},
-            },
-            timeout=180,
-        )
-        r.raise_for_status()
-        content = r.json().get("message", {}).get("content", "")
-        # Strip <think> tags if present
-        import re
-        content = re.sub(r"<think>.*?</think>", "", content, flags=re.DOTALL).strip()
-        return content or "LLM-Analyse ergab kein Ergebnis."
-    except Exception as e:
-        return f"⚠️ LLM-Analyse nicht verfügbar: {e}"
-
-
 def handle_get_health_forecast(**kw) -> str:
    prom = _gather_prometheus()
    loki = _gather_loki()
    pve = _gather_proxmox()

-    summary_parts = []
+    now_str = datetime.now().strftime("%d.%m.%Y %H:%M")
+    lines = [f"📊 Systemdaten-Report ({now_str})", ""]

+    # --- Prometheus Warnungen ---
    if prom.get("warnings"):
-        summary_parts.append("AKTIVE WARNUNGEN: " + ", ".join(prom["warnings"]))
+        lines.append("⚠️ AKTIVE SCHWELLWERT-WARNUNGEN:")
+        for w in prom["warnings"]:
+            lines.append(f"  {w}")
    else:
-        summary_parts.append("Prometheus-Warnungen: keine")
+        lines.append("✅ Keine Prometheus-Schwellwert-Warnungen")

-    if prom.get("disk_current"):
-        lines = [f"{d['host']}: {d['used_pct']}%" for d in prom["disk_current"]]
-        summary_parts.append("Disk-Nutzung aktuell:\n  " + "\n  ".join(lines))
+    # --- Disk-Trends ---
+    disk_issues = []
+    for h, t in prom.get("disk_trend_24h", {}).items():
+        if t["delta_24h"] >= 2.0:
+            disk_issues.append(
+                f"  📈 {h}: {t['start_pct']}% → {t['end_pct']}% (+{t['delta_24h']:.1f}% in 24h)"
+            )
+        elif t["delta_24h"] >= 1.0:
+            disk_issues.append(
+                f"  ↗ {h}: {t['start_pct']}% → {t['end_pct']}% (+{t['delta_24h']:.1f}% in 24h)"
+            )

-    if prom.get("disk_trend_24h"):
-        trend_lines = []
-        for h, t in prom["disk_trend_24h"].items():
-            if abs(t["delta_24h"]) > 0.5:
-                trend_lines.append(
-                    f"  {h}: {t['start_pct']}% → {t['end_pct']}% (Δ {t['delta_24h']:+.1f}% in 24h)"
-                )
-        if trend_lines:
-            summary_parts.append("Disk-Trends (letzte 24h):\n" + "\n".join(trend_lines))
+    # Auch hohe absolute Werte hervorheben
+    for d in prom.get("disk_current", []):
+        if d["used_pct"] >= 70:
+            disk_issues.append(f"  💾 {d['host']}: aktuell {d['used_pct']}% belegt")

-    if prom.get("memory"):
-        high_mem = [m for m in prom["memory"] if m["used_pct"] > 70]
-        if high_mem:
-            mem_lines = [f"{m['host']}: {m['used_pct']}%" for m in high_mem]
-            summary_parts.append("RAM > 70%:\n  " + "\n  ".join(mem_lines))
+    if disk_issues:
+        lines.append("")
+        lines.append("🗄️ Disk-Auffälligkeiten:")
+        lines.extend(disk_issues)
+    else:
+        # Kurze Übersicht
+        disk_summary = ", ".join(
+            f"{d['host']}: {d['used_pct']}%"
+            for d in sorted(prom.get("disk_current", []), key=lambda x: -x["used_pct"])[:5]
+        )
+        if disk_summary:
+            lines.append(f"💾 Disk (Top-5): {disk_summary}")

-    if prom.get("load5"):
-        high_load = [l for l in prom["load5"] if l["load5"] > 2.0]
-        if high_load:
-            load_lines = [f"{l['host']}: load5={l['load5']}" for l in high_load]
-            summary_parts.append("Hohe Last:\n  " + "\n  ".join(load_lines))
+    # --- RAM ---
+    high_mem = [m for m in prom.get("memory", []) if m["used_pct"] > 70]
+    if high_mem:
+        lines.append("")
+        lines.append("🧠 RAM > 70%:")
+        for m in sorted(high_mem, key=lambda x: -x["used_pct"]):
+            lines.append(f"  {m['host']}: {m['used_pct']}%")

+    # --- Load ---
+    high_load = [l for l in prom.get("load5", []) if l["load5"] > 2.0]
+    if high_load:
+        lines.append("")
+        lines.append("⚡ Hohe Last (load5 > 2):")
+        for l in sorted(high_load, key=lambda x: -x["load5"]):
+            lines.append(f"  {l['host']}: {l['load5']}")
+
+    # --- Loki Fehler ---
    errors = loki.get("errors_24h", {})
+    lines.append("")
    if errors:
-        err_lines = [f"{h}: {c} Fehler" for h, c in sorted(errors.items(), key=lambda x: -x[1])[:10]]
-        summary_parts.append("Log-Fehler letzte 24h:\n  " + "\n  ".join(err_lines))
+        lines.append("🔴 Log-Fehler letzte 24h:")
+        for h, c in sorted(errors.items(), key=lambda x: -x[1])[:10]:
+            level = "🔴 KRITISCH" if c >= 100 else "🟠 Erhöht" if c >= 20 else "🟡"
+            lines.append(f"  {level} {h}: {c} Fehler")
    else:
-        summary_parts.append("Log-Fehler letzte 24h: keine")
+        lines.append("✅ Keine Log-Fehler in den letzten 24h")

    if loki.get("silent_hosts"):
-        summary_parts.append("Stille Hosts (>60 min kein Log): " + ", ".join(loki["silent_hosts"]))
+        lines.append(f"🔇 Stille Hosts (>60min kein Log): {', '.join(loki['silent_hosts'])}")

+    if loki.get("loki_error"):
+        lines.append(f"⚠️ Loki nicht erreichbar: {loki['loki_error'][:60]}")
+
+    # --- Proxmox ---
+    lines.append("")
    if pve.get("stopped"):
-        stopped_names = [f"CT{c['id']} {c['name']}" for c in pve["stopped"]]
-        summary_parts.append("Gestoppte Container: " + ", ".join(stopped_names))
+        lines.append(f"🛑 Gestoppte Container ({len(pve['stopped'])}):")
+        for c in pve["stopped"]:
+            lines.append(f"  CT{c['id']} {c['name']}")
    elif "proxmox_error" not in pve:
-        summary_parts.append(f"Proxmox: {pve.get('running', 0)}/{pve.get('total', 0)} Container laufen")
+        lines.append(
+            f"🟢 Proxmox: {pve.get('running', 0)}/{pve.get('total', 0)} Container laufen"
+        )

-    data_summary = "\n\n".join(summary_parts)
-    analysis = _call_analysis_llm(data_summary)
+    if pve.get("host_errors"):
+        # Nur echte Proxmox-Hosts (pve-* oder pbs-*)
+        real_errors = [e for e in pve["host_errors"] if e.startswith(("pve-", "pbs-"))]
+        if real_errors:
+            lines.append("⚠️ Proxmox-Host-Fehler: " + "; ".join(real_errors[:3]))

-    header = f"🔭 *Systemvorhersage* ({datetime.now().strftime('%d.%m.%Y %H:%M')})\n\n"
-    return header + analysis
+    if pve.get("proxmox_error"):
+        lines.append(f"⚠️ Proxmox-Fehler: {pve['proxmox_error'][:80]}")
+
+    return "\n".join(lines)


 HANDLERS = {