async def _send_daily_forecast(context):
    """Collect the system report, let the LLM analyse it and send the result.

    Scheduled job; the previous docstring states it runs daily at 08:00, the
    scheduling itself is not part of this function.

    Args:
        context: Either an object exposing a ``bot`` attribute or the bot
            itself; whichever is found is used to send the message.

    Returns:
        None. Returns immediately when ``CHAT_ID`` is falsy. Every exception
        is logged and swallowed so a failing forecast never propagates to the
        caller.
    """
    if not CHAT_ID:
        return
    bot = getattr(context, "bot", None) or context
    try:
        from tools.predict import handle_get_health_forecast
        from llm import ask_with_tools

        # Inside a coroutine the running loop is the one to use; fetch it once
        # and push both calls to the default executor, off the event loop.
        loop = asyncio.get_running_loop()
        report = await loop.run_in_executor(None, handle_get_health_forecast)

        # The paragraph break before the report must be the escape sequence
        # "\n\n": a raw line break inside a plain "..." literal is a
        # SyntaxError and keeps the whole module from importing.
        prompt = (
            "Morgendlicher System-Check. Analysiere diesen Report und gib eine kurze "
            "Prognose (max 10 Zeilen) ob sich Probleme anbahnen. Nenne nur echte Auffälligkeiten, "
            "gib klare Handlungsempfehlungen wenn nötig. Wenn alles OK ist, reicht eine kurze Entwarnung.\n\n"
            + report
        )
        analysis = await loop.run_in_executor(None, ask_with_tools, prompt)
        msg = f"🔭 *Tägliche Systemvorhersage*\n\n{analysis}"

        # NOTE(review): the LLM text is sent with parse_mode="Markdown";
        # unbalanced markup in it would presumably make Telegram reject the
        # message - consider a plain-text fallback. TODO confirm.
        await bot.send_message(chat_id=CHAT_ID, text=msg, parse_mode="Markdown")
        log.info("Taegl. Systemvorhersage gesendet")
    except Exception:
        log.exception("Fehler beim Senden der Systemvorhersage")
), "parameters": {"type": "object", "properties": {}, "required": []}, }, @@ -95,17 +92,20 @@ def _gather_proxmox() -> dict: result = {} try: cfg = config.parse_config() - passwords = {} + + pw_default = cfg.passwords.get("default", "") + pw_hetzner = cfg.passwords.get("hetzner", pw_default) + passwords = {"default": pw_default, "pve-hetzner": pw_hetzner} + for host in proxmox_client.PROXMOX_HOSTS: + if host not in passwords: + passwords[host] = pw_default + tokens = {} - for pve_host in cfg.proxmox_hosts: - name = pve_host.get("name", "") - pw = pve_host.get("password", "") - tok_name = pve_host.get("token_name", "") - tok_val = pve_host.get("token_value", "") - if pw: - passwords[name] = pw - if tok_name and tok_val: - tokens[name] = {"name": tok_name, "value": tok_val} + tn = cfg.raw.get("PVE_TOKEN_HETZNER_NAME", "") + tv = cfg.raw.get("PVE_TOKEN_HETZNER_VALUE", "") + if tn and tv: + tokens["pve-hetzner"] = {"name": tn, "value": tv} + containers = proxmox_client.get_all_containers(passwords=passwords, tokens=tokens) stopped = [ {"id": c.get("vmid"), "name": c.get("name", "?")} @@ -113,112 +113,118 @@ def _gather_proxmox() -> dict: if c.get("status") == "stopped" and "error" not in c ] running = len([c for c in containers if c.get("status") == "running"]) + errors = [c for c in containers if "error" in c] result["total"] = len(containers) result["running"] = running result["stopped"] = stopped + result["host_errors"] = [ + f"{e.get('_host', '?')}: {e['error'][:80]}" for e in errors + ] except Exception as e: result["proxmox_error"] = str(e) return result -def _call_analysis_llm(data_summary: str) -> str: - now_str = datetime.now().strftime("%d.%m.%Y %H:%M") - prompt = ( - f"Du bist ein Homelab-Monitoring-Experte. 
Heute ist der {now_str}.\n" - "Analysiere die folgenden System-Rohdaten und erstelle eine kompakte Prognose.\n\n" - "REGELN:\n" - "- Nur echte Auffälligkeiten nennen (nicht jede normale Metrik)\n" - "- Disk-Delta > 2% in 24h = Warnung\n" - "- Disk > 80% = kritisch\n" - "- RAM > 85% = Warnung\n" - "- Fehler > 50 in 24h für einen Host = Warnung\n" - "- Gestoppte Container = prüfen ob OK\n" - "- Wenn alles normal: kurze Entwarnung genügt\n" - "- Max 12 Zeilen, Emojis erlaubt, auf Deutsch\n" - "- Klare Handlungsempfehlung wenn nötig\n\n" - f"System-Daten:\n{data_summary}\n\n" - "Prognose:" - ) - try: - r = requests.post( - f"{OLLAMA_URL}/api/chat", - json={ - "model": FORECAST_MODEL, - "messages": [{"role": "user", "content": prompt + " /no_think"}], - "stream": False, - "options": {"num_predict": 700, "temperature": 0.3}, - }, - timeout=180, - ) - r.raise_for_status() - content = r.json().get("message", {}).get("content", "") - # Strip tags if present - import re - content = re.sub(r".*?", "", content, flags=re.DOTALL).strip() - return content or "LLM-Analyse ergab kein Ergebnis." 
def handle_get_health_forecast(**kw) -> str:
    """Assemble the plain-text system data report.

    Pulls the three data dictionaries from ``_gather_prometheus()``,
    ``_gather_loki()`` and ``_gather_proxmox()`` and renders them into one
    newline-joined report. No LLM is called here; the caller analyses the
    returned text.

    Keys read:
        Prometheus: ``warnings``, ``disk_trend_24h``, ``disk_current``,
            ``memory``, ``load5``.
        Loki: ``errors_24h``, ``silent_hosts``, ``loki_error``.
        Proxmox: ``stopped``, ``running``, ``total``, ``host_errors``,
            ``proxmox_error``.

    Args:
        **kw: Accepted and ignored.

    Returns:
        The report as a single string, lines separated by ``"\\n"``.
    """
    metrics = _gather_prometheus()
    logs = _gather_loki()
    proxmox = _gather_proxmox()

    stamp = datetime.now().strftime("%d.%m.%Y %H:%M")
    report = [f"📊 Systemdaten-Report ({stamp})", ""]

    # Threshold warnings reported by Prometheus.
    active_warnings = metrics.get("warnings")
    if active_warnings:
        report.append("⚠️ AKTIVE SCHWELLWERT-WARNUNGEN:")
        report.extend(f" {warning}" for warning in active_warnings)
    else:
        report.append("✅ Keine Prometheus-Schwellwert-Warnungen")

    # Disk: growth of >= 1 % (weak marker) or >= 2 % (strong marker) within
    # 24h, followed by every disk that is currently at least 70 % full.
    current_disks = metrics.get("disk_current", [])
    disk_findings = []
    for host, trend in metrics.get("disk_trend_24h", {}).items():
        growth = trend["delta_24h"]
        if growth >= 2.0:
            marker = "📈"
        elif growth >= 1.0:
            marker = "↗"
        else:
            continue
        disk_findings.append(
            f" {marker} {host}: {trend['start_pct']}% → {trend['end_pct']}% "
            f"(+{growth:.1f}% in 24h)"
        )
    disk_findings.extend(
        f" 💾 {disk['host']}: aktuell {disk['used_pct']}% belegt"
        for disk in current_disks
        if disk["used_pct"] >= 70
    )

    if disk_findings:
        report += ["", "🗄️ Disk-Auffälligkeiten:", *disk_findings]
    else:
        # Nothing conspicuous: show only the five fullest disks on one line.
        fullest = sorted(current_disks, key=lambda disk: -disk["used_pct"])[:5]
        overview = ", ".join(
            f"{disk['host']}: {disk['used_pct']}%" for disk in fullest
        )
        if overview:
            report.append(f"💾 Disk (Top-5): {overview}")

    # RAM above 70 %, highest usage first.
    memory_hogs = [
        entry for entry in metrics.get("memory", []) if entry["used_pct"] > 70
    ]
    if memory_hogs:
        report += ["", "🧠 RAM > 70%:"]
        memory_hogs.sort(key=lambda entry: -entry["used_pct"])
        report.extend(
            f" {entry['host']}: {entry['used_pct']}%" for entry in memory_hogs
        )

    # 5-minute load above 2, highest first.
    busy_hosts = [
        entry for entry in metrics.get("load5", []) if entry["load5"] > 2.0
    ]
    if busy_hosts:
        report += ["", "⚡ Hohe Last (load5 > 2):"]
        busy_hosts.sort(key=lambda entry: -entry["load5"])
        report.extend(f" {entry['host']}: {entry['load5']}" for entry in busy_hosts)

    # Log errors per host (ten highest counts), each with a severity label.
    error_counts = logs.get("errors_24h", {})
    report.append("")
    if error_counts:
        report.append("🔴 Log-Fehler letzte 24h:")
        worst = sorted(error_counts.items(), key=lambda item: -item[1])[:10]
        for host, count in worst:
            if count >= 100:
                severity = "🔴 KRITISCH"
            elif count >= 20:
                severity = "🟠 Erhöht"
            else:
                severity = "🟡"
            report.append(f" {severity} {host}: {count} Fehler")
    else:
        report.append("✅ Keine Log-Fehler in den letzten 24h")

    silent = logs.get("silent_hosts")
    if silent:
        report.append(f"🔇 Stille Hosts (>60min kein Log): {', '.join(silent)}")

    loki_failure = logs.get("loki_error")
    if loki_failure:
        report.append(f"⚠️ Loki nicht erreichbar: {loki_failure[:60]}")

    # Proxmox: list stopped containers, otherwise the running/total summary
    # (the summary is skipped when the Proxmox query itself failed).
    report.append("")
    stopped = proxmox.get("stopped")
    if stopped:
        report.append(f"🛑 Gestoppte Container ({len(stopped)}):")
        report.extend(f" CT{ct['id']} {ct['name']}" for ct in stopped)
    elif "proxmox_error" not in proxmox:
        running = proxmox.get("running", 0)
        total = proxmox.get("total", 0)
        report.append(f"🟢 Proxmox: {running}/{total} Container laufen")

    host_errors = proxmox.get("host_errors")
    if host_errors:
        # Keep only entries whose text starts with a pve-/pbs- host name.
        genuine = [msg for msg in host_errors if msg.startswith(("pve-", "pbs-"))]
        if genuine:
            report.append("⚠️ Proxmox-Host-Fehler: " + "; ".join(genuine[:3]))

    fatal = proxmox.get("proxmox_error")
    if fatal:
        report.append(f"⚠️ Proxmox-Fehler: {fatal[:80]}")

    return "\n".join(report)