From c4553b46d7ac3686374310b24f6cf743cfed87c1 Mon Sep 17 00:00:00 2001 From: Homelab Cursor Date: Sat, 21 Mar 2026 13:18:20 +0100 Subject: [PATCH] feat: KI-Systemvorhersage (tools/predict.py) + taegl. 08:00 Job - tools/predict.py: sammelt Disk-Trends (Prometheus), Fehler-Logs (Loki), Container-Status (Proxmox) und laesst lokales LLM eine Prognose erstellen - telegram_bot.py: daily_forecast Job taegl. 08:00 Uhr, sendet Prognose via Telegram - llm.py: Forecast-Trigger (vorhersage, prognose, was bahnt sich an etc.) -> lokal --- homelab-ai-bot/llm.py | 2 + homelab-ai-bot/telegram_bot.py | 21 +++ homelab-ai-bot/tools/predict.py | 226 ++++++++++++++++++++++++++++++++ 3 files changed, 249 insertions(+) create mode 100644 homelab-ai-bot/tools/predict.py diff --git a/homelab-ai-bot/llm.py b/homelab-ai-bot/llm.py index 2cab1352..48bc3845 100644 --- a/homelab-ai-bot/llm.py +++ b/homelab-ai-bot/llm.py @@ -35,6 +35,8 @@ _LOCAL_OVERRIDES = [ "savetv", "save.tv", "filmtipp", "aufnahme", "wordpress", "matomo", "tailscale", "unsere api", "meine api", "die api", + "vorhersage", "prognose", "health forecast", "was bahnt", "systemstatus", + "system check", "system-check", "wie gehts dem system", "wie geht es dem system", ] _WEB_TRIGGERS = [ "recherche", "recherchiere", "suche im internet", "web search", diff --git a/homelab-ai-bot/telegram_bot.py b/homelab-ai-bot/telegram_bot.py index 887ab6d8..1762d56e 100644 --- a/homelab-ai-bot/telegram_bot.py +++ b/homelab-ai-bot/telegram_bot.py @@ -820,6 +820,21 @@ async def handle_callback(update: Update, ctx: ContextTypes.DEFAULT_TYPE): ) + +async def _send_daily_forecast(context): + """Taeglich 08:00 Uhr: KI-Systemvorhersage senden.""" + if not CHAT_ID: + return + bot = getattr(context, "bot", None) or context + try: + from tools.predict import handle_get_health_forecast + result = await asyncio.get_event_loop().run_in_executor(None, handle_get_health_forecast) + await bot.send_message(chat_id=CHAT_ID, text=result, parse_mode="Markdown") + 
log.info("Taegl. Systemvorhersage gesendet") + except Exception: + log.exception("Fehler beim Senden der Systemvorhersage") + + async def _send_daily_filmtipps(context): """Täglicher Cronjob: EPG scannen, Top-Filme auto-aufnehmen, Rest vorschlagen. @@ -933,6 +948,12 @@ def main(): name="daily_filmtipps", ) log.info("Täglicher Filmtipp-Job registriert (14:00 Uhr)") + app.job_queue.run_daily( + _send_daily_forecast, + time=dtime(hour=8, minute=0), + name="daily_forecast", + ) + log.info("Täglicher Forecast-Job registriert (08:00 Uhr)") else: log.warning("JobQueue nicht verfügbar — Filmtipps werden per asyncio-Loop gesendet") diff --git a/homelab-ai-bot/tools/predict.py b/homelab-ai-bot/tools/predict.py new file mode 100644 index 00000000..38ba7ee4 --- /dev/null +++ b/homelab-ai-bot/tools/predict.py @@ -0,0 +1,226 @@ +"""KI-gestützte Systemvorhersage — analysiert Logs, Metriken und Container-Status.""" + +import json +import requests +from datetime import datetime, timezone, timedelta + +from core import prometheus_client, loki_client, config +from core import proxmox_client + +OLLAMA_URL = "http://100.84.255.83:11434" +FORECAST_MODEL = "qwen3:30b-a3b" + +TOOLS = [ + { + "type": "function", + "function": { + "name": "get_health_forecast", + "description": ( + "KI-gestützte Systemvorhersage für das Homelab. Analysiert Fehler-Logs, " + "Disk-Trends, CPU/RAM-Auslastung und Container-Status. Gibt eine Prognose " + "aus, ob sich Probleme anbahnen — z.B. voller Speicher, häufige Abstürze, " + "steigende Fehlerquoten. Trigger: 'vorhersage', 'was bahnt sich an', " + "'prognose', 'health forecast', 'system check', 'systemstatus'." 
+ ), + "parameters": {"type": "object", "properties": {}, "required": []}, + }, + }, +] + + +def _gather_prometheus() -> dict: + result = {} + try: + result["warnings"] = prometheus_client.get_warnings() + + disk = prometheus_client.get_disk() + result["disk_current"] = [ + {"host": r["host"], "used_pct": round(r["value"], 1)} for r in disk + ] + + trend = prometheus_client.range_query( + 'max by (host) ((1 - node_filesystem_avail_bytes{mountpoint="/"} ' + '/ node_filesystem_size_bytes{mountpoint="/"}) * 100)', + hours=24, + step="2h", + ) + trends = {} + if trend.get("status") == "success": + for r in trend.get("data", {}).get("result", []): + h = r.get("metric", {}).get("host", "?") + vals = [float(v[1]) for v in r.get("values", []) if v[1] != "NaN"] + if len(vals) >= 2: + delta = vals[-1] - vals[0] + trends[h] = { + "start_pct": round(vals[0], 1), + "end_pct": round(vals[-1], 1), + "delta_24h": round(delta, 2), + } + result["disk_trend_24h"] = trends + + mem = prometheus_client.get_memory() + result["memory"] = [ + {"host": r["host"], "used_pct": round(r["value"], 1)} for r in mem + ] + + load = prometheus_client.get_load() + result["load5"] = [ + {"host": r["host"], "load5": round(r["value"], 2)} for r in load + ] + except Exception as e: + result["prometheus_error"] = str(e) + return result + + +def _gather_loki() -> dict: + result = {} + try: + hosts = loki_client.get_labels() + error_counts = {} + for host in hosts[:20]: + errors = loki_client.get_errors(container=host, hours=24, limit=300) + count = 0 if (len(errors) == 1 and "error" in errors[0]) else len(errors) + if count > 0: + error_counts[host] = count + result["errors_24h"] = error_counts + + silent = loki_client.check_silence(minutes=60) + result["silent_hosts"] = [s["host"] for s in silent if "host" in s] + except Exception as e: + result["loki_error"] = str(e) + return result + + +def _gather_proxmox() -> dict: + result = {} + try: + cfg = config.parse_config() + passwords = {} + tokens = {} + for 
pve_host in cfg.proxmox_hosts: + name = pve_host.get("name", "") + pw = pve_host.get("password", "") + tok_name = pve_host.get("token_name", "") + tok_val = pve_host.get("token_value", "") + if pw: + passwords[name] = pw + if tok_name and tok_val: + tokens[name] = {"name": tok_name, "value": tok_val} + containers = proxmox_client.get_all_containers(passwords=passwords, tokens=tokens) + stopped = [ + {"id": c.get("vmid"), "name": c.get("name", "?")} + for c in containers + if c.get("status") == "stopped" and "error" not in c + ] + running = len([c for c in containers if c.get("status") == "running"]) + result["total"] = len(containers) + result["running"] = running + result["stopped"] = stopped + except Exception as e: + result["proxmox_error"] = str(e) + return result + + +def _call_analysis_llm(data_summary: str) -> str: + now_str = datetime.now().strftime("%d.%m.%Y %H:%M") + prompt = ( + f"Du bist ein Homelab-Monitoring-Experte. Heute ist der {now_str}.\n" + "Analysiere die folgenden System-Rohdaten und erstelle eine kompakte Prognose.\n\n" + "REGELN:\n" + "- Nur echte Auffälligkeiten nennen (nicht jede normale Metrik)\n" + "- Disk-Delta > 2% in 24h = Warnung\n" + "- Disk > 80% = kritisch\n" + "- RAM > 85% = Warnung\n" + "- Fehler > 50 in 24h für einen Host = Warnung\n" + "- Gestoppte Container = prüfen ob OK\n" + "- Wenn alles normal: kurze Entwarnung genügt\n" + "- Max 12 Zeilen, Emojis erlaubt, auf Deutsch\n" + "- Klare Handlungsempfehlung wenn nötig\n\n" + f"System-Daten:\n{data_summary}\n\n" + "Prognose:" + ) + try: + r = requests.post( + f"{OLLAMA_URL}/api/chat", + json={ + "model": FORECAST_MODEL, + "messages": [{"role": "user", "content": prompt + " /no_think"}], + "stream": False, + "options": {"num_predict": 700, "temperature": 0.3}, + }, + timeout=180, + ) + r.raise_for_status() + content = r.json().get("message", {}).get("content", "") + # Strip <think> tags if present + import re + content = re.sub(r"<think>.*?</think>", "", content, flags=re.DOTALL).strip() + return 
content or "LLM-Analyse ergab kein Ergebnis." + except Exception as e: + return f"⚠️ LLM-Analyse nicht verfügbar: {e}" + + +def handle_get_health_forecast(**kw) -> str: + prom = _gather_prometheus() + loki = _gather_loki() + pve = _gather_proxmox() + + summary_parts = [] + + if prom.get("warnings"): + summary_parts.append("AKTIVE WARNUNGEN: " + ", ".join(prom["warnings"])) + else: + summary_parts.append("Prometheus-Warnungen: keine") + + if prom.get("disk_current"): + lines = [f"{d['host']}: {d['used_pct']}%" for d in prom["disk_current"]] + summary_parts.append("Disk-Nutzung aktuell:\n " + "\n ".join(lines)) + + if prom.get("disk_trend_24h"): + trend_lines = [] + for h, t in prom["disk_trend_24h"].items(): + if abs(t["delta_24h"]) > 0.5: + trend_lines.append( + f" {h}: {t['start_pct']}% → {t['end_pct']}% (Δ {t['delta_24h']:+.1f}% in 24h)" + ) + if trend_lines: + summary_parts.append("Disk-Trends (letzte 24h):\n" + "\n".join(trend_lines)) + + if prom.get("memory"): + high_mem = [m for m in prom["memory"] if m["used_pct"] > 70] + if high_mem: + mem_lines = [f"{m['host']}: {m['used_pct']}%" for m in high_mem] + summary_parts.append("RAM > 70%:\n " + "\n ".join(mem_lines)) + + if prom.get("load5"): + high_load = [l for l in prom["load5"] if l["load5"] > 2.0] + if high_load: + load_lines = [f"{l['host']}: load5={l['load5']}" for l in high_load] + summary_parts.append("Hohe Last:\n " + "\n ".join(load_lines)) + + errors = loki.get("errors_24h", {}) + if errors: + err_lines = [f"{h}: {c} Fehler" for h, c in sorted(errors.items(), key=lambda x: -x[1])[:10]] + summary_parts.append("Log-Fehler letzte 24h:\n " + "\n ".join(err_lines)) + else: + summary_parts.append("Log-Fehler letzte 24h: keine") + + if loki.get("silent_hosts"): + summary_parts.append("Stille Hosts (>60 min kein Log): " + ", ".join(loki["silent_hosts"])) + + if pve.get("stopped"): + stopped_names = [f"CT{c['id']} {c['name']}" for c in pve["stopped"]] + summary_parts.append("Gestoppte Container: " + ", 
".join(stopped_names)) + elif "proxmox_error" not in pve: + summary_parts.append(f"Proxmox: {pve.get('running', 0)}/{pve.get('total', 0)} Container laufen") + + data_summary = "\n\n".join(summary_parts) + analysis = _call_analysis_llm(data_summary) + + header = f"🔭 *Systemvorhersage* ({datetime.now().strftime('%d.%m.%Y %H:%M')})\n\n" + return header + analysis + + +HANDLERS = { + "get_health_forecast": handle_get_health_forecast, +}