From 16036531f35aff8abe0b75f113a835d9a1bcb856 Mon Sep 17 00:00:00 2001 From: root Date: Mon, 9 Mar 2026 13:35:05 +0700 Subject: [PATCH] =?UTF-8?q?Prometheus-Integration:=207=20Hosts=20=C3=BCber?= =?UTF-8?q?wachen=20+=20Bot-Kontext?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - prometheus_client.py komplett neu: host-basierte Abfragen, Warnungen bei CPU>80%, RAM>85%, Disk>85%, format_overview/detail - context.py: Prometheus-Daten bei System-Fragen laden, host-spezifische Detail-Abfrage, Warnungen im Fallback Made-with: Cursor --- homelab-ai-bot/context.py | 37 +++- homelab-ai-bot/core/prometheus_client.py | 236 ++++++++++++++++------- 2 files changed, 191 insertions(+), 82 deletions(-) diff --git a/homelab-ai-bot/context.py b/homelab-ai-bot/context.py index a6e59c79..cc5489f5 100644 --- a/homelab-ai-bot/context.py +++ b/homelab-ai-bot/context.py @@ -6,7 +6,7 @@ import os import re sys.path.insert(0, os.path.dirname(__file__)) -from core import config, loki_client, proxmox_client, wordpress_client +from core import config, loki_client, proxmox_client, wordpress_client, prometheus_client def _load_config(): @@ -23,12 +23,13 @@ def _get_tokens(cfg): def _get_passwords(cfg): - return { - "pve-hetzner": cfg.passwords.get("hetzner", ""), - "pve1": cfg.passwords.get("default", ""), - "pve3": cfg.passwords.get("default", ""), - "default": cfg.passwords.get("default", ""), - } + pw_default = cfg.passwords.get("default", "") + pw_hetzner = cfg.passwords.get("hetzner", pw_default) + pws = {"default": pw_default, "pve-hetzner": pw_hetzner} + for host in proxmox_client.PROXMOX_HOSTS: + if host not in pws: + pws[host] = pw_default + return pws def gather_status() -> str: @@ -146,10 +147,27 @@ def gather_context_for_question(question: str) -> str: # WordPress-Daten für Blog-Fragen if any(w in q for w in ["wordpress", "blog", "post", "artikel", "kommentar", "plugin"]): - wordpress_client.init(cfg) # WICHTIG: Init vor Format_overview + wordpress_client.init(cfg) wp_overview = wordpress_client.format_overview(cfg) parts.append("=== WordPress ===\n" + wp_overview) + # Prometheus-Metriken für System-Fragen + if any(w in q for w in ["cpu", "ram", "speicher", "memory", "disk", "platte", + "festplatte", "auslastung", "load", "uptime", "server", + "metriken", "prometheus", "performance", "ressource"]): + host_match = None + for name in ["pve-hetzner", "pve-ka-1", "pve-ka-2", "pve-ka-3", + "pve-mu-2", "pve-mu-3", "pve-he"]: + if name.replace("-", "") in q.replace("-", "").replace(" ", ""): + host_match = name + break + if host_match: + parts.append(f"=== Prometheus {host_match} ===\n" + + prometheus_client.format_host_detail(host_match)) + else: + parts.append("=== Prometheus Übersicht ===\n" + + prometheus_client.format_overview()) + ct_match = re.search(r'\bct[- ]?(\d{3})\b', q) if ct_match: parts.append(f"=== CT {ct_match.group(1)} ===\n" + gather_container_status(ct_match.group(1))) @@ -162,5 +180,8 @@ def gather_context_for_question(question: str) -> str: if not parts: parts.append("=== Container Status ===\n" + gather_status()) parts.append("=== Aktuelle Fehler ===\n" + gather_errors(hours=1)) + warnings = prometheus_client.get_warnings() + if warnings: + parts.append("=== Prometheus Warnungen ===\n" + "\n".join(warnings)) return "\n\n".join(parts) diff --git a/homelab-ai-bot/core/prometheus_client.py b/homelab-ai-bot/core/prometheus_client.py index f53bd305..f4954dc1 100644 --- a/homelab-ai-bot/core/prometheus_client.py +++ b/homelab-ai-bot/core/prometheus_client.py @@ -1,15 +1,18 @@ -"""Prometheus API client for querying system metrics.""" +"""Prometheus API client — fragt Host-Metriken ab für den Hausmeister-Bot.""" import requests from datetime import datetime, timezone, timedelta -PROMETHEUS_URL = "http://100.88.230.59:9090" +PROMETHEUS_URL = "http://10.10.10.1:9090" + +WARN_CPU = 80 +WARN_MEM = 85 +WARN_DISK = 85 -def _query(endpoint: str, params: dict, base_url: str = None) -> dict: - url = f"{base_url or PROMETHEUS_URL}{endpoint}" +def _query(endpoint: str, params: dict) -> dict: try: - r = requests.get(url, params=params, timeout=10) + r = requests.get(f"{PROMETHEUS_URL}{endpoint}", params=params, timeout=10) r.raise_for_status() return r.json() except requests.RequestException as e: @@ -17,12 +20,10 @@ def _query(endpoint: str, params: dict, base_url: str = None) -> dict: def instant_query(query: str) -> dict: - """Run an instant PromQL query.""" return _query("/api/v1/query", {"query": query}) def range_query(query: str, hours: float = 1, step: str = "5m") -> dict: - """Run a range PromQL query.""" now = datetime.now(timezone.utc) start = now - timedelta(hours=hours) return _query("/api/v1/query_range", { @@ -33,101 +34,188 @@ def range_query(query: str, hours: float = 1, step: str = "5m") -> dict: }) +def is_available() -> bool: + data = _query("/api/v1/query", {"query": "up"}) + return data.get("status") == "success" + + def get_targets() -> list[dict]: - """Get all Prometheus scrape targets with their status.""" data = _query("/api/v1/targets", {}) if "error" in data: return [{"error": data["error"]}] - targets = [] for t in data.get("data", {}).get("activeTargets", []): + labels = t.get("labels", {}) targets.append({ - "job": t.get("labels", {}).get("job", "unknown"), - "instance": t.get("labels", {}).get("instance", "unknown"), + "host": labels.get("host", labels.get("instance", "?")), + "location": labels.get("location", ""), "health": t.get("health", "unknown"), - "last_scrape": t.get("lastScrape", ""), + "job": labels.get("job", ""), }) return targets -def is_available() -> bool: - """Check if Prometheus is reachable.""" - data = _query("/api/v1/query", {"query": "up"}) - return "error" not in data or data.get("status") == "success" +def _by_host(data: dict, metric: str) -> list[dict]: + if data.get("status") != "success": + return [] + results = [] + for r in data.get("data", {}).get("result", []): + m = r.get("metric", {}) + host = m.get("host", m.get("instance", "?")) + val = float(r.get("value", [0, 0])[1]) + results.append({"host": host, "value": val, "metric": metric}) + return sorted(results, key=lambda x: x["host"]) -def get_cpu(instance: str = None, hours: float = 1) -> dict: - """Get CPU usage. If instance given, filter to that instance.""" - if instance: - q = f'100 - (avg by (instance) (rate(node_cpu_seconds_total{{mode="idle", instance=~"{instance}.*"}}[5m])) * 100)' - else: - q = '100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)' - data = instant_query(q) - return _extract_metrics(data, "cpu_percent") +def get_cpu(host: str = None) -> list[dict]: + filt = f', host="{host}"' if host else "" + q = f'100 - (avg by (host) (rate(node_cpu_seconds_total{{mode="idle"{filt}}}[5m])) * 100)' + return _by_host(instant_query(q), "cpu") -def get_memory(instance: str = None) -> dict: - """Get memory usage percentage.""" - if instance: - q = f'(1 - node_memory_MemAvailable_bytes{{instance=~"{instance}.*"}} / node_memory_MemTotal_bytes{{instance=~"{instance}.*"}}) * 100' - else: - q = '(1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100' - data = instant_query(q) - return _extract_metrics(data, "memory_percent") +def get_memory(host: str = None) -> list[dict]: + filt = f', host="{host}"' if host else "" + q = f'(1 - node_memory_MemAvailable_bytes{{host!=""{filt}}} / node_memory_MemTotal_bytes{{host!=""{filt}}}) * 100' + return _by_host(instant_query(q), "mem") -def get_disk(instance: str = None) -> dict: - """Get root filesystem usage percentage.""" - if instance: - q = f'(1 - node_filesystem_avail_bytes{{mountpoint="/", instance=~"{instance}.*"}} / node_filesystem_size_bytes{{mountpoint="/", instance=~"{instance}.*"}}) * 100' - else: - q = '(1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100' - data = instant_query(q) - return _extract_metrics(data, "disk_percent") +def get_disk(host: str = None) -> list[dict]: + filt = f', host="{host}"' if host else "" + q = f'(1 - node_filesystem_avail_bytes{{mountpoint="/", host!=""{filt}}} / node_filesystem_size_bytes{{mountpoint="/", host!=""{filt}}}) * 100' + return _by_host(instant_query(q), "disk") -def get_overview() -> str: - """Get a formatted overview of all monitored instances.""" +def get_disk_bytes(host: str = None) -> list[dict]: + """Returns available and total bytes per host.""" + filt = f', host="{host}"' if host else "" + avail = _by_host(instant_query( + f'node_filesystem_avail_bytes{{mountpoint="/", host!=""{filt}}}'), "avail_bytes") + total = _by_host(instant_query( + f'node_filesystem_size_bytes{{mountpoint="/", host!=""{filt}}}'), "total_bytes") + total_map = {r["host"]: r["value"] for r in total} + result = [] + for a in avail: + t = total_map.get(a["host"], 0) + result.append({ + "host": a["host"], + "avail_gb": a["value"] / (1024**3), + "total_gb": t / (1024**3), + }) + return result + + +def get_uptime(host: str = None) -> list[dict]: + filt = f', host="{host}"' if host else "" + q = f'node_time_seconds{{host!=""{filt}}} - node_boot_time_seconds{{host!=""{filt}}}' + return _by_host(instant_query(q), "uptime_sec") + + +def get_load(host: str = None) -> list[dict]: + filt = f', host="{host}"' if host else "" + q = f'node_load5{{host!=""{filt}}}' + return _by_host(instant_query(q), "load5") + + +def get_warnings() -> list[str]: + """Return list of warning strings for hosts exceeding thresholds.""" + warnings = [] + for r in get_cpu(): + if r["value"] >= WARN_CPU: + warnings.append(f"🔴 {r['host']}: CPU {r['value']:.0f}%") + for r in get_memory(): + if r["value"] >= WARN_MEM: + warnings.append(f"🔴 {r['host']}: RAM {r['value']:.0f}%") + for r in get_disk(): + if r["value"] >= WARN_DISK: + warnings.append(f"🔴 {r['host']}: Disk {r['value']:.0f}%") + return warnings + + +def _fmt_uptime(seconds: float) -> str: + days = int(seconds // 86400) + hours = int((seconds % 86400) // 3600) + if days > 0: + return f"{days}d {hours}h" + return f"{hours}h" + + +def format_overview() -> str: + """Kompakte Übersicht aller Hosts — für den Bot.""" if not is_available(): - return "Prometheus is not reachable at " + PROMETHEUS_URL + return "⚠️ Prometheus nicht erreichbar." - targets = get_targets() - if not targets: - return "No Prometheus targets found." + cpu = {r["host"]: r["value"] for r in get_cpu()} + mem = {r["host"]: r["value"] for r in get_memory()} + disk = {r["host"]: r["value"] for r in get_disk()} + disk_gb = {r["host"]: r for r in get_disk_bytes()} + uptime = {r["host"]: r["value"] for r in get_uptime()} + load = {r["host"]: r["value"] for r in get_load()} - lines = ["## Prometheus Targets\n"] - for t in targets: - status = "UP" if t["health"] == "up" else "DOWN" - lines.append(f"- [{status}] {t['job']} ({t['instance']})") + hosts = sorted(set(list(cpu.keys()) + list(mem.keys()) + list(disk.keys()))) + if not hosts: + return "Keine Prometheus-Daten verfügbar." - cpu = get_cpu() - if "results" in cpu: - lines.append("\n## CPU Usage") - for r in cpu["results"]: - lines.append(f"- {r['instance']}: {r['value']:.1f}%") + lines = [f"📊 Server-Metriken ({len(hosts)} Hosts)\n"] + for h in hosts: + c = cpu.get(h, -1) + m = mem.get(h, -1) + d = disk.get(h, -1) + dinfo = disk_gb.get(h, {}) + u = uptime.get(h, 0) + l5 = load.get(h, -1) - mem = get_memory() - if "results" in mem: - lines.append("\n## Memory Usage") - for r in mem["results"]: - lines.append(f"- {r['instance']}: {r['value']:.1f}%") + warn = "" + if c >= WARN_CPU or m >= WARN_MEM or d >= WARN_DISK: + warn = " ⚠️" - disk = get_disk() - if "results" in disk: - lines.append("\n## Disk Usage (/)") - for r in disk["results"]: - lines.append(f"- {r['instance']}: {r['value']:.1f}%") + disk_str = f"{d:.0f}%" + if dinfo: + disk_str += f" ({dinfo.get('avail_gb', 0):.0f}/{dinfo.get('total_gb', 0):.0f} GB frei)" + + lines.append( + f"{'🟢' if c >= 0 else '🔴'} {h}{warn}\n" + f" CPU: {c:.0f}% RAM: {m:.0f}% Disk: {disk_str}\n" + f" Load5: {l5:.1f} Uptime: {_fmt_uptime(u)}" + ) + + warnings = get_warnings() + if warnings: + lines.append("\n⚠️ WARNUNGEN:") + lines.extend(warnings) + else: + lines.append("\n✅ Alle Werte im Normalbereich.") return "\n".join(lines) -def _extract_metrics(data: dict, metric_name: str) -> dict: - if "error" in data and data.get("status") != "success": - return {"error": data.get("error", "unknown error")} +def format_host_detail(host: str) -> str: + """Detail-Metriken für einen einzelnen Host.""" + if not is_available(): + return "⚠️ Prometheus nicht erreichbar." - results = [] - for r in data.get("data", {}).get("result", []): - instance = r.get("metric", {}).get("instance", "unknown") - value = float(r.get("value", [0, 0])[1]) - results.append({"instance": instance, "value": value, "metric": metric_name}) - return {"results": results} + cpu = get_cpu(host) + mem = get_memory(host) + disk = get_disk(host) + disk_gb = get_disk_bytes(host) + uptime = get_uptime(host) + load = get_load(host) + + if not cpu and not mem: + return f"Keine Metriken für '{host}' gefunden." + + lines = [f"📊 {host} — Detail\n"] + if cpu: + lines.append(f"CPU: {cpu[0]['value']:.1f}%") + if mem: + lines.append(f"RAM: {mem[0]['value']:.1f}%") + if disk: + d_str = f"Disk: {disk[0]['value']:.1f}%" + if disk_gb: + d_str += f" ({disk_gb[0]['avail_gb']:.1f} / {disk_gb[0]['total_gb']:.1f} GB frei)" + lines.append(d_str) + if load: + lines.append(f"Load (5m): {load[0]['value']:.2f}") + if uptime: + lines.append(f"Uptime: {_fmt_uptime(uptime[0]['value'])}") + + return "\n".join(lines)