"""KI-gestützte Systemvorhersage — sammelt Metriken, Logs und Container-Status.""" import re import requests from datetime import datetime, timezone, timedelta from core import prometheus_client, loki_client, config from core import proxmox_client TOOLS = [ { "type": "function", "function": { "name": "get_health_forecast", "description": ( "Sammelt Systemdaten (Disk-Trends, Log-Fehler, Container-Status, CPU/RAM) " "und gibt einen strukturierten Report zurück. Analysiere den Report und gib " "eine Prognose, ob sich Probleme anbahnen. Trigger: 'vorhersage', " "'was bahnt sich an', 'prognose', 'health forecast', 'system check', " "'systemstatus', 'wie geht es dem system'." ), "parameters": {"type": "object", "properties": {}, "required": []}, }, }, ] def _gather_prometheus() -> dict: result = {} try: result["warnings"] = prometheus_client.get_warnings() disk = prometheus_client.get_disk() result["disk_current"] = [ {"host": r["host"], "used_pct": round(r["value"], 1)} for r in disk ] trend = prometheus_client.range_query( 'max by (host) ((1 - node_filesystem_avail_bytes{mountpoint="/"} ' '/ node_filesystem_size_bytes{mountpoint="/"}) * 100)', hours=24, step="2h", ) trends = {} if trend.get("status") == "success": for r in trend.get("data", {}).get("result", []): h = r.get("metric", {}).get("host", "?") vals = [float(v[1]) for v in r.get("values", []) if v[1] != "NaN"] if len(vals) >= 2: delta = vals[-1] - vals[0] trends[h] = { "start_pct": round(vals[0], 1), "end_pct": round(vals[-1], 1), "delta_24h": round(delta, 2), } result["disk_trend_24h"] = trends mem = prometheus_client.get_memory() result["memory"] = [ {"host": r["host"], "used_pct": round(r["value"], 1)} for r in mem ] load = prometheus_client.get_load() result["load5"] = [ {"host": r["host"], "load5": round(r["value"], 2)} for r in load ] except Exception as e: result["prometheus_error"] = str(e) return result def _gather_loki() -> dict: result = {} try: hosts = loki_client.get_labels() error_counts = {} for host in hosts[:20]: errors = loki_client.get_errors(container=host, hours=24, limit=300) count = 0 if (len(errors) == 1 and "error" in errors[0]) else len(errors) if count > 0: error_counts[host] = count result["errors_24h"] = error_counts silent = loki_client.check_silence(minutes=60) result["silent_hosts"] = [s["host"] for s in silent if "host" in s] except Exception as e: result["loki_error"] = str(e) return result def _gather_proxmox() -> dict: result = {} try: cfg = config.parse_config() pw_default = cfg.passwords.get("default", "") pw_hetzner = cfg.passwords.get("hetzner", pw_default) passwords = {"default": pw_default, "pve-hetzner": pw_hetzner} for host in proxmox_client.PROXMOX_HOSTS: if host not in passwords: passwords[host] = pw_default tokens = {} tn = cfg.raw.get("PVE_TOKEN_HETZNER_NAME", "") tv = cfg.raw.get("PVE_TOKEN_HETZNER_VALUE", "") if tn and tv: tokens["pve-hetzner"] = {"name": tn, "value": tv} containers = proxmox_client.get_all_containers(passwords=passwords, tokens=tokens) stopped = [ {"id": c.get("vmid"), "name": c.get("name", "?")} for c in containers if c.get("status") == "stopped" and "error" not in c ] running = len([c for c in containers if c.get("status") == "running"]) errors = [c for c in containers if "error" in c] result["total"] = len(containers) result["running"] = running result["stopped"] = stopped result["host_errors"] = [ f"{e.get('_host', '?')}: {e['error'][:80]}" for e in errors ] except Exception as e: result["proxmox_error"] = str(e) return result def handle_get_health_forecast(**kw) -> str: prom = _gather_prometheus() loki = _gather_loki() pve = _gather_proxmox() now_str = datetime.now().strftime("%d.%m.%Y %H:%M") lines = [f"📊 Systemdaten-Report ({now_str})", ""] # --- Prometheus Warnungen --- if prom.get("warnings"): lines.append("⚠️ AKTIVE SCHWELLWERT-WARNUNGEN:") for w in prom["warnings"]: lines.append(f" {w}") else: lines.append("✅ Keine Prometheus-Schwellwert-Warnungen") # --- Disk-Trends --- disk_issues = [] for h, t in prom.get("disk_trend_24h", {}).items(): if t["delta_24h"] >= 2.0: disk_issues.append( f" 📈 {h}: {t['start_pct']}% → {t['end_pct']}% (+{t['delta_24h']:.1f}% in 24h)" ) elif t["delta_24h"] >= 1.0: disk_issues.append( f" ↗ {h}: {t['start_pct']}% → {t['end_pct']}% (+{t['delta_24h']:.1f}% in 24h)" ) # Auch hohe absolute Werte hervorheben for d in prom.get("disk_current", []): if d["used_pct"] >= 70: disk_issues.append(f" 💾 {d['host']}: aktuell {d['used_pct']}% belegt") if disk_issues: lines.append("") lines.append("🗄️ Disk-Auffälligkeiten:") lines.extend(disk_issues) else: # Kurze Übersicht disk_summary = ", ".join( f"{d['host']}: {d['used_pct']}%" for d in sorted(prom.get("disk_current", []), key=lambda x: -x["used_pct"])[:5] ) if disk_summary: lines.append(f"💾 Disk (Top-5): {disk_summary}") # --- RAM --- high_mem = [m for m in prom.get("memory", []) if m["used_pct"] > 70] if high_mem: lines.append("") lines.append("🧠 RAM > 70%:") for m in sorted(high_mem, key=lambda x: -x["used_pct"]): lines.append(f" {m['host']}: {m['used_pct']}%") # --- Load --- high_load = [l for l in prom.get("load5", []) if l["load5"] > 2.0] if high_load: lines.append("") lines.append("⚡ Hohe Last (load5 > 2):") for l in sorted(high_load, key=lambda x: -x["load5"]): lines.append(f" {l['host']}: {l['load5']}") # --- Loki Fehler --- errors = loki.get("errors_24h", {}) lines.append("") if errors: lines.append("🔴 Log-Fehler letzte 24h:") for h, c in sorted(errors.items(), key=lambda x: -x[1])[:10]: level = "🔴 KRITISCH" if c >= 100 else "🟠 Erhöht" if c >= 20 else "🟡" lines.append(f" {level} {h}: {c} Fehler") else: lines.append("✅ Keine Log-Fehler in den letzten 24h") if loki.get("silent_hosts"): lines.append(f"🔇 Stille Hosts (>60min kein Log): {', '.join(loki['silent_hosts'])}") if loki.get("loki_error"): lines.append(f"⚠️ Loki nicht erreichbar: {loki['loki_error'][:60]}") # --- Proxmox --- lines.append("") if pve.get("stopped"): lines.append(f"🛑 Gestoppte Container ({len(pve['stopped'])}):") for c in pve["stopped"]: lines.append(f" CT{c['id']} {c['name']}") elif "proxmox_error" not in pve: lines.append( f"🟢 Proxmox: {pve.get('running', 0)}/{pve.get('total', 0)} Container laufen" ) if pve.get("host_errors"): # Nur echte Proxmox-Hosts (pve-* oder pbs-*) real_errors = [e for e in pve["host_errors"] if e.startswith(("pve-", "pbs-"))] if real_errors: lines.append("⚠️ Proxmox-Host-Fehler: " + "; ".join(real_errors[:3])) if pve.get("proxmox_error"): lines.append(f"⚠️ Proxmox-Fehler: {pve['proxmox_error'][:80]}") return "\n".join(lines) HANDLERS = { "get_health_forecast": handle_get_health_forecast, }