From 16036531f35aff8abe0b75f113a835d9a1bcb856 Mon Sep 17 00:00:00 2001
From: root <root@pve1.takeo.lan>
Date: Mon, 9 Mar 2026 13:35:05 +0700
Subject: [PATCH] =?UTF-8?q?Prometheus-Integration:=207=20Hosts=20=C3=BCber?=
 =?UTF-8?q?wachen=20+=20Bot-Kontext?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- prometheus_client.py komplett neu: host-basierte Abfragen,
  Warnungen bei CPU>80%, RAM>85%, Disk>85%, format_overview/detail
- context.py: Prometheus-Daten bei System-Fragen laden,
  host-spezifische Detail-Abfrage, Warnungen im Fallback

Made-with: Cursor
---
 homelab-ai-bot/context.py                |  37 +++-
 homelab-ai-bot/core/prometheus_client.py | 236 ++++++++++++++++-------
 2 files changed, 191 insertions(+), 82 deletions(-)

diff --git a/homelab-ai-bot/context.py b/homelab-ai-bot/context.py
index a6e59c79..cc5489f5 100644
--- a/homelab-ai-bot/context.py
+++ b/homelab-ai-bot/context.py
@@ -6,7 +6,7 @@ import os
 import re
 
 sys.path.insert(0, os.path.dirname(__file__))
-from core import config, loki_client, proxmox_client, wordpress_client
+from core import config, loki_client, proxmox_client, wordpress_client, prometheus_client
 
 
 def _load_config():
@@ -23,12 +23,13 @@ def _get_tokens(cfg):
 
 
 def _get_passwords(cfg):
-    return {
-        "pve-hetzner": cfg.passwords.get("hetzner", ""),
-        "pve1": cfg.passwords.get("default", ""),
-        "pve3": cfg.passwords.get("default", ""),
-        "default": cfg.passwords.get("default", ""),
-    }
+    pw_default = cfg.passwords.get("default", "")
+    pw_hetzner = cfg.passwords.get("hetzner", pw_default)
+    pws = {"default": pw_default, "pve-hetzner": pw_hetzner}
+    for host in proxmox_client.PROXMOX_HOSTS:
+        if host not in pws:
+            pws[host] = pw_default
+    return pws
 
 
 def gather_status() -> str:
@@ -146,10 +147,27 @@ def gather_context_for_question(question: str) -> str:
 
     # WordPress-Daten für Blog-Fragen
     if any(w in q for w in ["wordpress", "blog", "post", "artikel", "kommentar", "plugin"]):
-        wordpress_client.init(cfg)  # WICHTIG: Init vor Format_overview
+        wordpress_client.init(cfg)
         wp_overview = wordpress_client.format_overview(cfg)
         parts.append("=== WordPress ===\n" + wp_overview)
 
+    # Prometheus-Metriken für System-Fragen
+    if any(w in q for w in ["cpu", "ram", "speicher", "memory", "disk", "platte",
+                             "festplatte", "auslastung", "load", "uptime", "server",
+                             "metriken", "prometheus", "performance", "ressource"]):
+        host_match = None
+        for name in ["pve-hetzner", "pve-ka-1", "pve-ka-2", "pve-ka-3",
+                      "pve-mu-2", "pve-mu-3", "pve-he"]:
+            if name.replace("-", "") in q.replace("-", "").replace(" ", ""):
+                host_match = name
+                break
+        if host_match:
+            parts.append(f"=== Prometheus {host_match} ===\n" +
+                         prometheus_client.format_host_detail(host_match))
+        else:
+            parts.append("=== Prometheus Übersicht ===\n" +
+                         prometheus_client.format_overview())
+
     ct_match = re.search(r'\bct[- ]?(\d{3})\b', q)
     if ct_match:
         parts.append(f"=== CT {ct_match.group(1)} ===\n" + gather_container_status(ct_match.group(1)))
@@ -162,5 +180,8 @@ def gather_context_for_question(question: str) -> str:
     if not parts:
         parts.append("=== Container Status ===\n" + gather_status())
         parts.append("=== Aktuelle Fehler ===\n" + gather_errors(hours=1))
+        warnings = prometheus_client.get_warnings()
+        if warnings:
+            parts.append("=== Prometheus Warnungen ===\n" + "\n".join(warnings))
 
     return "\n\n".join(parts)
diff --git a/homelab-ai-bot/core/prometheus_client.py b/homelab-ai-bot/core/prometheus_client.py
index f53bd305..f4954dc1 100644
--- a/homelab-ai-bot/core/prometheus_client.py
+++ b/homelab-ai-bot/core/prometheus_client.py
@@ -1,15 +1,18 @@
-"""Prometheus API client for querying system metrics."""
+"""Prometheus API client — fragt Host-Metriken ab für den Hausmeister-Bot."""
 
 import requests
 from datetime import datetime, timezone, timedelta
 
-PROMETHEUS_URL = "http://100.88.230.59:9090"
+PROMETHEUS_URL = "http://10.10.10.1:9090"
+
+WARN_CPU = 80
+WARN_MEM = 85
+WARN_DISK = 85
 
 
-def _query(endpoint: str, params: dict, base_url: str = None) -> dict:
-    url = f"{base_url or PROMETHEUS_URL}{endpoint}"
+def _query(endpoint: str, params: dict) -> dict:
     try:
-        r = requests.get(url, params=params, timeout=10)
+        r = requests.get(f"{PROMETHEUS_URL}{endpoint}", params=params, timeout=10)
         r.raise_for_status()
         return r.json()
     except requests.RequestException as e:
@@ -17,12 +20,10 @@ def _query(endpoint: str, params: dict, base_url: str = None) -> dict:
 
 
 def instant_query(query: str) -> dict:
-    """Run an instant PromQL query."""
     return _query("/api/v1/query", {"query": query})
 
 
 def range_query(query: str, hours: float = 1, step: str = "5m") -> dict:
-    """Run a range PromQL query."""
     now = datetime.now(timezone.utc)
     start = now - timedelta(hours=hours)
     return _query("/api/v1/query_range", {
@@ -33,101 +34,188 @@ def range_query(query: str, hours: float = 1, step: str = "5m") -> dict:
     })
 
 
+def is_available() -> bool:
+    """Report whether an instant `up` query comes back with status "success"."""
+    return instant_query("up").get("status") == "success"
+
+
 def get_targets() -> list[dict]:
-    """Get all Prometheus scrape targets with their status."""
     data = _query("/api/v1/targets", {})
     if "error" in data:
         return [{"error": data["error"]}]
-
     targets = []
     for t in data.get("data", {}).get("activeTargets", []):
+        labels = t.get("labels", {})
         targets.append({
-            "job": t.get("labels", {}).get("job", "unknown"),
-            "instance": t.get("labels", {}).get("instance", "unknown"),
+            "host": labels.get("host", labels.get("instance", "?")),
+            "location": labels.get("location", ""),
             "health": t.get("health", "unknown"),
-            "last_scrape": t.get("lastScrape", ""),
+            "job": labels.get("job", ""),
         })
     return targets
 
 
-def is_available() -> bool:
-    """Check if Prometheus is reachable."""
-    data = _query("/api/v1/query", {"query": "up"})
-    return "error" not in data or data.get("status") == "success"
+def _by_host(data: dict, metric: str) -> list[dict]:
+    """Turn an instant-query response into host-sorted {host, value, metric} rows."""
+    if data.get("status") != "success":
+        return []
+    rows = []
+    for sample in data.get("data", {}).get("result", []):
+        labels = sample.get("metric", {})
+        name = labels.get("host", labels.get("instance", "?"))  # fall back to instance, then "?"
+        rows.append({"host": name, "value": float(sample.get("value", [0, 0])[1]), "metric": metric})
+    return sorted(rows, key=lambda row: row["host"])
 
 
-def get_cpu(instance: str = None, hours: float = 1) -> dict:
-    """Get CPU usage. If instance given, filter to that instance."""
-    if instance:
-        q = f'100 - (avg by (instance) (rate(node_cpu_seconds_total{{mode="idle", instance=~"{instance}.*"}}[5m])) * 100)'
-    else:
-        q = '100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)'
-    data = instant_query(q)
-    return _extract_metrics(data, "cpu_percent")
+def get_cpu(host: str = None) -> list[dict]:
+    """Return CPU busy percent (100 - 5m idle rate) per host; series without a host label are excluded."""
+    filt = f', host="{host}"' if host else ""
+    return _by_host(instant_query(f'100 - (avg by (host) (rate(node_cpu_seconds_total{{mode="idle", host!=""{filt}}}[5m])) * 100)'), "cpu")
 
 
-def get_memory(instance: str = None) -> dict:
-    """Get memory usage percentage."""
-    if instance:
-        q = f'(1 - node_memory_MemAvailable_bytes{{instance=~"{instance}.*"}} / node_memory_MemTotal_bytes{{instance=~"{instance}.*"}}) * 100'
-    else:
-        q = '(1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100'
-    data = instant_query(q)
-    return _extract_metrics(data, "memory_percent")
+def get_memory(host: str = None) -> list[dict]:
+    """Return used-RAM percent (1 - MemAvailable/MemTotal) per host, or for one host."""
+    sel = 'host!=""' + (f', host="{host}"' if host else "")
+    return _by_host(instant_query(f'(1 - node_memory_MemAvailable_bytes{{{sel}}} / node_memory_MemTotal_bytes{{{sel}}}) * 100'), "mem")
 
 
-def get_disk(instance: str = None) -> dict:
-    """Get root filesystem usage percentage."""
-    if instance:
-        q = f'(1 - node_filesystem_avail_bytes{{mountpoint="/", instance=~"{instance}.*"}} / node_filesystem_size_bytes{{mountpoint="/", instance=~"{instance}.*"}}) * 100'
-    else:
-        q = '(1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100'
-    data = instant_query(q)
-    return _extract_metrics(data, "disk_percent")
+def get_disk(host: str = None) -> list[dict]:
+    """Return used percent of the "/" filesystem per host, or for one host."""
+    sel = 'mountpoint="/", host!=""' + (f', host="{host}"' if host else "")
+    return _by_host(instant_query(f'(1 - node_filesystem_avail_bytes{{{sel}}} / node_filesystem_size_bytes{{{sel}}}) * 100'), "disk")
 
 
-def get_overview() -> str:
-    """Get a formatted overview of all monitored instances."""
+def get_disk_bytes(host: str = None) -> list[dict]:
+    """Return free and total size of "/" per host, each divided by 1024**3.
+
+    A host missing from the size query result is reported with total_gb 0.
+    """
+    sel = 'mountpoint="/", host!=""' + (f', host="{host}"' if host else "")
+    free_rows = _by_host(instant_query(f'node_filesystem_avail_bytes{{{sel}}}'), "avail_bytes")
+    size_rows = _by_host(instant_query(f'node_filesystem_size_bytes{{{sel}}}'), "total_bytes")
+    size_of = {row["host"]: row["value"] for row in size_rows}
+    return [
+        {
+            "host": row["host"],
+            "avail_gb": row["value"] / (1024**3),
+            "total_gb": size_of.get(row["host"], 0) / (1024**3),
+        }
+        for row in free_rows
+    ]
+
+
+def get_uptime(host: str = None) -> list[dict]:
+    """Return node_time_seconds minus node_boot_time_seconds per host, or for one host."""
+    sel = 'host!=""' + (f', host="{host}"' if host else "")
+    return _by_host(instant_query(f'node_time_seconds{{{sel}}} - node_boot_time_seconds{{{sel}}}'), "uptime_sec")
+
+
+def get_load(host: str = None) -> list[dict]:
+    """Return the node_load5 value per host, or for one host."""
+    sel = 'host!=""' + (f', host="{host}"' if host else "")
+    return _by_host(instant_query(f'node_load5{{{sel}}}'), "load5")
+
+
+def get_warnings() -> list[str]:
+    """Return one "🔴 host: LABEL NN%" line per host at or above a WARN_* threshold."""
+    # (label, fetcher, threshold) — evaluated in this order: CPU, RAM, disk.
+    checks = (
+        ("CPU", get_cpu, WARN_CPU),
+        ("RAM", get_memory, WARN_MEM),
+        ("Disk", get_disk, WARN_DISK),
+    )
+    alerts = []
+    for label, fetch, limit in checks:
+        hot = [row for row in fetch() if row["value"] >= limit]
+        alerts += [f"🔴 {row['host']}: {label} {row['value']:.0f}%" for row in hot]
+    return alerts
+
+
+def _fmt_uptime(seconds: float) -> str:
+    """Format a duration in seconds as "<d>d <h>h", or only "<h>h" when no full day has passed."""
+    days, rest = divmod(seconds, 86400)
+    hours = int(rest // 3600)
+    prefix = f"{int(days)}d " if int(days) > 0 else ""
+    return f"{prefix}{hours}h"
+
+
+def format_overview() -> str:
+    """Kompakte Übersicht aller Hosts — für den Bot."""
     if not is_available():
-        return "Prometheus is not reachable at " + PROMETHEUS_URL
+        return "⚠️ Prometheus nicht erreichbar."
 
-    targets = get_targets()
-    if not targets:
-        return "No Prometheus targets found."
+    cpu = {r["host"]: r["value"] for r in get_cpu()}
+    mem = {r["host"]: r["value"] for r in get_memory()}
+    disk = {r["host"]: r["value"] for r in get_disk()}
+    disk_gb = {r["host"]: r for r in get_disk_bytes()}
+    uptime = {r["host"]: r["value"] for r in get_uptime()}
+    load = {r["host"]: r["value"] for r in get_load()}
 
-    lines = ["## Prometheus Targets\n"]
-    for t in targets:
-        status = "UP" if t["health"] == "up" else "DOWN"
-        lines.append(f"- [{status}] {t['job']} ({t['instance']})")
+    hosts = sorted(set(list(cpu.keys()) + list(mem.keys()) + list(disk.keys())))
+    if not hosts:
+        return "Keine Prometheus-Daten verfügbar."
 
-    cpu = get_cpu()
-    if "results" in cpu:
-        lines.append("\n## CPU Usage")
-        for r in cpu["results"]:
-            lines.append(f"- {r['instance']}: {r['value']:.1f}%")
+    lines = [f"📊 Server-Metriken ({len(hosts)} Hosts)\n"]
+    for h in hosts:
+        c = cpu.get(h, -1)
+        m = mem.get(h, -1)
+        d = disk.get(h, -1)
+        dinfo = disk_gb.get(h, {})
+        u = uptime.get(h, 0)
+        l5 = load.get(h, -1)
 
-    mem = get_memory()
-    if "results" in mem:
-        lines.append("\n## Memory Usage")
-        for r in mem["results"]:
-            lines.append(f"- {r['instance']}: {r['value']:.1f}%")
+        warn = ""
+        if c >= WARN_CPU or m >= WARN_MEM or d >= WARN_DISK:
+            warn = " ⚠️"
 
-    disk = get_disk()
-    if "results" in disk:
-        lines.append("\n## Disk Usage (/)")
-        for r in disk["results"]:
-            lines.append(f"- {r['instance']}: {r['value']:.1f}%")
+        disk_str = f"{d:.0f}%"
+        if dinfo:
+            disk_str += f" ({dinfo.get('avail_gb', 0):.0f}/{dinfo.get('total_gb', 0):.0f} GB frei)"
+
+        lines.append(
+            f"{'🟢' if c >= 0 else '🔴'} {h}{warn}\n"
+            f"  CPU: {c:.0f}%  RAM: {m:.0f}%  Disk: {disk_str}\n"
+            f"  Load5: {l5:.1f}  Uptime: {_fmt_uptime(u)}"
+        )
+
+    warnings = get_warnings()
+    if warnings:
+        lines.append("\n⚠️ WARNUNGEN:")
+        lines.extend(warnings)
+    else:
+        lines.append("\n✅ Alle Werte im Normalbereich.")
 
     return "\n".join(lines)
 
 
-def _extract_metrics(data: dict, metric_name: str) -> dict:
-    if "error" in data and data.get("status") != "success":
-        return {"error": data.get("error", "unknown error")}
+def format_host_detail(host: str) -> str:
+    """Detail-Metriken für einen einzelnen Host."""
+    if not is_available():
+        return "⚠️ Prometheus nicht erreichbar."
 
-    results = []
-    for r in data.get("data", {}).get("result", []):
-        instance = r.get("metric", {}).get("instance", "unknown")
-        value = float(r.get("value", [0, 0])[1])
-        results.append({"instance": instance, "value": value, "metric": metric_name})
-    return {"results": results}
+    cpu = get_cpu(host)
+    mem = get_memory(host)
+    disk = get_disk(host)
+    disk_gb = get_disk_bytes(host)
+    uptime = get_uptime(host)
+    load = get_load(host)
+
+    if not cpu and not mem:
+        return f"Keine Metriken für '{host}' gefunden."
+
+    lines = [f"📊 {host} — Detail\n"]
+    if cpu:
+        lines.append(f"CPU: {cpu[0]['value']:.1f}%")
+    if mem:
+        lines.append(f"RAM: {mem[0]['value']:.1f}%")
+    if disk:
+        d_str = f"Disk: {disk[0]['value']:.1f}%"
+        if disk_gb:
+            d_str += f" ({disk_gb[0]['avail_gb']:.1f} / {disk_gb[0]['total_gb']:.1f} GB frei)"
+        lines.append(d_str)
+    if load:
+        lines.append(f"Load (5m): {load[0]['value']:.2f}")
+    if uptime:
+        lines.append(f"Uptime: {_fmt_uptime(uptime[0]['value'])}")
+
+    return "\n".join(lines)