- Tool gibt strukturierten Report zurueck (kein verschachtelter LLM-Aufruf mehr) - Klare Hervorhebung: KRITISCH (>=100 Fehler), Erhoeht (>=20), Disk-Trend - Proxmox: korrekte Passwort/Token-Ermittlung, nur echte pve-Hosts im Fehler-Log - daily_forecast: LLM analysiert den Report bevor er gesendet wird
232 lines
8.1 KiB
Python
232 lines
8.1 KiB
Python
"""KI-gestützte Systemvorhersage — sammelt Metriken, Logs und Container-Status."""
|
|
|
|
import re
|
|
import requests
|
|
from datetime import datetime, timezone, timedelta
|
|
|
|
from core import prometheus_client, loki_client, config
|
|
from core import proxmox_client
|
|
|
|
# Description text of the single tool exported by this module. Kept as its own
# constant so the schema below stays short; the wording (including the trigger
# phrases) is runtime data and must not be altered.
_FORECAST_DESCRIPTION = (
    "Sammelt Systemdaten (Disk-Trends, Log-Fehler, Container-Status, CPU/RAM) "
    "und gibt einen strukturierten Report zurück. Analysiere den Report und gib "
    "eine Prognose, ob sich Probleme anbahnen. Trigger: 'vorhersage', "
    "'was bahnt sich an', 'prognose', 'health forecast', 'system check', "
    "'systemstatus', 'wie geht es dem system'."
)

# Tool definitions exported by this module: one parameterless function,
# "get_health_forecast".
TOOLS = [
    {
        "type": "function",
        "function": {
            "name": "get_health_forecast",
            "description": _FORECAST_DESCRIPTION,
            "parameters": {"type": "object", "properties": {}, "required": []},
        },
    },
]
|
|
|
|
|
|
def _gather_prometheus() -> dict:
    """Collect warnings, disk usage, 24h disk trend, memory and load data.

    All queries go through ``prometheus_client``. The queries run in sequence
    inside one ``try``; the first exception stops collection, and whatever was
    gathered up to that point is returned together with the error text.

    Returns:
        A dict that may contain:
        ``warnings``        - whatever ``prometheus_client.get_warnings()`` returns
        ``disk_current``    - list of ``{"host", "used_pct"}``
        ``disk_trend_24h``  - ``{host: {"start_pct", "end_pct", "delta_24h"}}``
        ``memory``          - list of ``{"host", "used_pct"}``
        ``load5``           - list of ``{"host", "load5"}``
        ``prometheus_error``- ``str(exception)`` if any query raised
    """
    import math  # local import: only needed for the finite-sample filter below

    result = {}
    try:
        result["warnings"] = prometheus_client.get_warnings()

        result["disk_current"] = [
            {"host": r["host"], "used_pct": round(r["value"], 1)}
            for r in prometheus_client.get_disk()
        ]

        # Root filesystem usage in percent per host, sampled every 2h over 24h.
        trend = prometheus_client.range_query(
            'max by (host) ((1 - node_filesystem_avail_bytes{mountpoint="/"} '
            '/ node_filesystem_size_bytes{mountpoint="/"}) * 100)',
            hours=24,
            step="2h",
        )
        trends = {}
        if trend.get("status") == "success":
            for series in trend.get("data", {}).get("result", []):
                host = series.get("metric", {}).get("host", "?")
                # Sample values arrive as strings. Besides "NaN", the division
                # in the query can also produce "+Inf"/"-Inf"; none of them is
                # usable for a trend, so keep finite samples only.
                samples = (float(v[1]) for v in series.get("values", []))
                vals = [s for s in samples if math.isfinite(s)]
                if len(vals) >= 2:
                    # Trend = last usable sample minus first usable sample.
                    trends[host] = {
                        "start_pct": round(vals[0], 1),
                        "end_pct": round(vals[-1], 1),
                        "delta_24h": round(vals[-1] - vals[0], 2),
                    }
        result["disk_trend_24h"] = trends

        result["memory"] = [
            {"host": r["host"], "used_pct": round(r["value"], 1)}
            for r in prometheus_client.get_memory()
        ]

        result["load5"] = [
            {"host": r["host"], "load5": round(r["value"], 2)}
            for r in prometheus_client.get_load()
        ]
    except Exception as e:
        # Best-effort collection: keep partial data and record the failure.
        result["prometheus_error"] = str(e)
    return result
|
|
|
|
|
|
def _gather_loki() -> dict:
    """Collect 24h error counts and the list of silent hosts from Loki.

    Returns:
        A dict that may contain:
        ``errors_24h``   - ``{label: count}`` for labels with at least one entry
        ``silent_hosts`` - ``host`` values reported by ``check_silence``
        ``loki_error``   - ``str(exception)`` if any call raised
    """
    report = {}
    try:
        counts = {}
        # Only the first 20 labels are queried.
        for label in loki_client.get_labels()[:20]:
            entries = loki_client.get_errors(container=label, hours=24, limit=300)
            # A single entry containing "error" is counted as zero.
            # NOTE(review): presumably this is the client's failure marker,
            # confirm against loki_client.get_errors.
            if len(entries) == 1 and "error" in entries[0]:
                continue
            found = len(entries)
            if found > 0:
                counts[label] = found
        report["errors_24h"] = counts

        report["silent_hosts"] = [
            entry["host"]
            for entry in loki_client.check_silence(minutes=60)
            if "host" in entry
        ]
    except Exception as exc:
        report["loki_error"] = str(exc)
    return report
|
|
|
|
|
|
def _gather_proxmox() -> dict:
    """Collect container status from all Proxmox hosts.

    Credentials come from ``config.parse_config()``: every host in
    ``proxmox_client.PROXMOX_HOSTS`` gets the default password unless a
    specific one is set, and ``pve-hetzner`` additionally gets an API token
    when both token name and value are configured.

    Entries returned by ``get_all_containers`` that carry an ``"error"`` key
    are treated as per-host failures and are NOT counted as containers.

    Returns:
        A dict that may contain:
        ``total``         - number of container entries (error entries excluded)
        ``running``       - number of those with status ``"running"``
        ``stopped``       - list of ``{"id", "name"}`` for stopped containers
        ``host_errors``   - ``"<host>: <error text, max 80 chars>"`` strings
        ``proxmox_error`` - ``str(exception)`` if collection itself failed
    """
    result = {}
    try:
        cfg = config.parse_config()

        pw_default = cfg.passwords.get("default", "")
        passwords = {
            "default": pw_default,
            "pve-hetzner": cfg.passwords.get("hetzner", pw_default),
        }
        for host in proxmox_client.PROXMOX_HOSTS:
            passwords.setdefault(host, pw_default)

        tokens = {}
        token_name = cfg.raw.get("PVE_TOKEN_HETZNER_NAME", "")
        token_value = cfg.raw.get("PVE_TOKEN_HETZNER_VALUE", "")
        if token_name and token_value:
            tokens["pve-hetzner"] = {"name": token_name, "value": token_value}

        entries = proxmox_client.get_all_containers(passwords=passwords, tokens=tokens)

        # Split real container entries from per-host error markers so the
        # markers do not inflate the container totals.
        failed = [c for c in entries if "error" in c]
        containers = [c for c in entries if "error" not in c]

        result["total"] = len(containers)
        result["running"] = sum(1 for c in containers if c.get("status") == "running")
        result["stopped"] = [
            {"id": c.get("vmid"), "name": c.get("name", "?")}
            for c in containers
            if c.get("status") == "stopped"
        ]
        # str() guards against non-string error payloads (e.g. exception
        # objects), which would otherwise fail on slicing.
        result["host_errors"] = [
            f"{c.get('_host', '?')}: {str(c['error'])[:80]}" for c in failed
        ]
    except Exception as e:
        result["proxmox_error"] = str(e)
    return result
|
|
|
|
|
|
def handle_get_health_forecast(**kw) -> str:
    """Build the plain-text system report returned by ``get_health_forecast``.

    Gathers Prometheus, Loki and Proxmox data and renders one section per
    source. Collection failures are reported explicitly: an "all clear" line
    is only printed when the corresponding data was actually collected.

    Args:
        **kw: Accepted and ignored (the tool takes no parameters).

    Returns:
        The report as a single newline-joined string.
    """
    prom = _gather_prometheus()
    loki = _gather_loki()
    pve = _gather_proxmox()

    # Naive local time; used for display only.
    now_str = datetime.now().strftime("%d.%m.%Y %H:%M")
    lines = [f"📊 Systemdaten-Report ({now_str})", ""]
    lines.extend(_format_prometheus_section(prom))
    lines.extend(_format_loki_section(loki))
    lines.extend(_format_proxmox_section(pve))
    return "\n".join(lines)


def _format_prometheus_section(prom: dict) -> list:
    """Render warnings, disk, RAM and load lines from the Prometheus data."""
    lines = []
    prom_error = prom.get("prometheus_error")

    # --- Threshold warnings ---
    if prom.get("warnings"):
        lines.append("⚠️ AKTIVE SCHWELLWERT-WARNUNGEN:")
        lines.extend(f" {w}" for w in prom["warnings"])
    elif "warnings" in prom or not prom_error:
        # Only claim "no warnings" when the warnings query actually ran.
        lines.append("✅ Keine Prometheus-Schwellwert-Warnungen")
    if prom_error:
        # Without this line a Prometheus failure would look like a healthy system.
        lines.append(f"⚠️ Prometheus-Fehler: {str(prom_error)[:80]}")

    # --- Disk trends: >= 2 points in 24h is a strong rise, >= 1 a mild one ---
    disk_now = prom.get("disk_current", [])
    disk_issues = []
    for host, t in prom.get("disk_trend_24h", {}).items():
        if t["delta_24h"] >= 2.0:
            marker = "📈"
        elif t["delta_24h"] >= 1.0:
            marker = "↗"
        else:
            continue
        disk_issues.append(
            f" {marker} {host}: {t['start_pct']}% → {t['end_pct']}% (+{t['delta_24h']:.1f}% in 24h)"
        )

    # Also highlight high absolute usage.
    disk_issues.extend(
        f" 💾 {d['host']}: aktuell {d['used_pct']}% belegt"
        for d in disk_now
        if d["used_pct"] >= 70
    )

    if disk_issues:
        lines.append("")
        lines.append("🗄️ Disk-Auffälligkeiten:")
        lines.extend(disk_issues)
    else:
        # Nothing notable: show a short overview of the five fullest disks.
        top5 = sorted(disk_now, key=lambda d: -d["used_pct"])[:5]
        disk_summary = ", ".join(f"{d['host']}: {d['used_pct']}%" for d in top5)
        if disk_summary:
            lines.append(f"💾 Disk (Top-5): {disk_summary}")

    # --- RAM ---
    high_mem = [m for m in prom.get("memory", []) if m["used_pct"] > 70]
    if high_mem:
        lines.append("")
        lines.append("🧠 RAM > 70%:")
        for m in sorted(high_mem, key=lambda m: -m["used_pct"]):
            lines.append(f" {m['host']}: {m['used_pct']}%")

    # --- Load ---
    high_load = [entry for entry in prom.get("load5", []) if entry["load5"] > 2.0]
    if high_load:
        lines.append("")
        lines.append("⚡ Hohe Last (load5 > 2):")
        for entry in sorted(high_load, key=lambda e: -e["load5"]):
            lines.append(f" {entry['host']}: {entry['load5']}")

    return lines


def _format_loki_section(loki: dict) -> list:
    """Render log-error counts, silent hosts and Loki failures."""
    lines = [""]
    loki_error = loki.get("loki_error")
    errors = loki.get("errors_24h", {})

    if errors:
        lines.append("🔴 Log-Fehler letzte 24h:")
        # Ten noisiest sources, highest count first.
        for host, count in sorted(errors.items(), key=lambda item: -item[1])[:10]:
            if count >= 100:
                level = "🔴 KRITISCH"
            elif count >= 20:
                level = "🟠 Erhöht"
            else:
                level = "🟡"
            lines.append(f" {level} {host}: {count} Fehler")
    elif "errors_24h" in loki or not loki_error:
        # Only claim "no errors" when the counts were actually collected.
        lines.append("✅ Keine Log-Fehler in den letzten 24h")

    if loki.get("silent_hosts"):
        lines.append(f"🔇 Stille Hosts (>60min kein Log): {', '.join(loki['silent_hosts'])}")

    if loki_error:
        lines.append(f"⚠️ Loki nicht erreichbar: {loki_error[:60]}")

    return lines


def _format_proxmox_section(pve: dict) -> list:
    """Render stopped containers, the running summary and Proxmox failures."""
    lines = [""]

    if pve.get("stopped"):
        lines.append(f"🛑 Gestoppte Container ({len(pve['stopped'])}):")
        for c in pve["stopped"]:
            lines.append(f" CT{c['id']} {c['name']}")
    elif "proxmox_error" not in pve:
        lines.append(
            f"🟢 Proxmox: {pve.get('running', 0)}/{pve.get('total', 0)} Container laufen"
        )

    if pve.get("host_errors"):
        # Only real Proxmox hosts (pve-* or pbs-*); at most three are shown.
        real_errors = [e for e in pve["host_errors"] if e.startswith(("pve-", "pbs-"))]
        if real_errors:
            lines.append("⚠️ Proxmox-Host-Fehler: " + "; ".join(real_errors[:3]))

    if pve.get("proxmox_error"):
        lines.append(f"⚠️ Proxmox-Fehler: {pve['proxmox_error'][:80]}")

    return lines
|
|
|
|
|
|
# Dispatch table: tool name (as declared in TOOLS) -> handler callable.
HANDLERS = {"get_health_forecast": handle_get_health_forecast}
|