Prometheus-Integration: 7 Hosts überwachen + Bot-Kontext
- prometheus_client.py komplett neu: host-basierte Abfragen, Warnungen bei CPU>80%, RAM>85%, Disk>85%, format_overview/detail
- context.py: Prometheus-Daten bei System-Fragen laden, host-spezifische Detail-Abfrage, Warnungen im Fallback

Made-with: Cursor
This commit is contained in:
parent
a47aaaff2a
commit
16036531f3
2 changed files with 191 additions and 82 deletions
|
|
@ -6,7 +6,7 @@ import os
|
||||||
import re
|
import re
|
||||||
|
|
||||||
sys.path.insert(0, os.path.dirname(__file__))
|
sys.path.insert(0, os.path.dirname(__file__))
|
||||||
from core import config, loki_client, proxmox_client, wordpress_client
|
from core import config, loki_client, proxmox_client, wordpress_client, prometheus_client
|
||||||
|
|
||||||
|
|
||||||
def _load_config():
|
def _load_config():
|
||||||
|
|
@ -23,12 +23,13 @@ def _get_tokens(cfg):
|
||||||
|
|
||||||
|
|
||||||
def _get_passwords(cfg):
|
def _get_passwords(cfg):
|
||||||
return {
|
pw_default = cfg.passwords.get("default", "")
|
||||||
"pve-hetzner": cfg.passwords.get("hetzner", ""),
|
pw_hetzner = cfg.passwords.get("hetzner", pw_default)
|
||||||
"pve1": cfg.passwords.get("default", ""),
|
pws = {"default": pw_default, "pve-hetzner": pw_hetzner}
|
||||||
"pve3": cfg.passwords.get("default", ""),
|
for host in proxmox_client.PROXMOX_HOSTS:
|
||||||
"default": cfg.passwords.get("default", ""),
|
if host not in pws:
|
||||||
}
|
pws[host] = pw_default
|
||||||
|
return pws
|
||||||
|
|
||||||
|
|
||||||
def gather_status() -> str:
|
def gather_status() -> str:
|
||||||
|
|
@ -146,10 +147,27 @@ def gather_context_for_question(question: str) -> str:
|
||||||
|
|
||||||
# WordPress-Daten für Blog-Fragen
|
# WordPress-Daten für Blog-Fragen
|
||||||
if any(w in q for w in ["wordpress", "blog", "post", "artikel", "kommentar", "plugin"]):
|
if any(w in q for w in ["wordpress", "blog", "post", "artikel", "kommentar", "plugin"]):
|
||||||
wordpress_client.init(cfg) # WICHTIG: Init vor Format_overview
|
wordpress_client.init(cfg)
|
||||||
wp_overview = wordpress_client.format_overview(cfg)
|
wp_overview = wordpress_client.format_overview(cfg)
|
||||||
parts.append("=== WordPress ===\n" + wp_overview)
|
parts.append("=== WordPress ===\n" + wp_overview)
|
||||||
|
|
||||||
|
# Prometheus-Metriken für System-Fragen
|
||||||
|
if any(w in q for w in ["cpu", "ram", "speicher", "memory", "disk", "platte",
|
||||||
|
"festplatte", "auslastung", "load", "uptime", "server",
|
||||||
|
"metriken", "prometheus", "performance", "ressource"]):
|
||||||
|
host_match = None
|
||||||
|
for name in ["pve-hetzner", "pve-ka-1", "pve-ka-2", "pve-ka-3",
|
||||||
|
"pve-mu-2", "pve-mu-3", "pve-he"]:
|
||||||
|
if name.replace("-", "") in q.replace("-", "").replace(" ", ""):
|
||||||
|
host_match = name
|
||||||
|
break
|
||||||
|
if host_match:
|
||||||
|
parts.append(f"=== Prometheus {host_match} ===\n" +
|
||||||
|
prometheus_client.format_host_detail(host_match))
|
||||||
|
else:
|
||||||
|
parts.append("=== Prometheus Übersicht ===\n" +
|
||||||
|
prometheus_client.format_overview())
|
||||||
|
|
||||||
ct_match = re.search(r'\bct[- ]?(\d{3})\b', q)
|
ct_match = re.search(r'\bct[- ]?(\d{3})\b', q)
|
||||||
if ct_match:
|
if ct_match:
|
||||||
parts.append(f"=== CT {ct_match.group(1)} ===\n" + gather_container_status(ct_match.group(1)))
|
parts.append(f"=== CT {ct_match.group(1)} ===\n" + gather_container_status(ct_match.group(1)))
|
||||||
|
|
@ -162,5 +180,8 @@ def gather_context_for_question(question: str) -> str:
|
||||||
if not parts:
|
if not parts:
|
||||||
parts.append("=== Container Status ===\n" + gather_status())
|
parts.append("=== Container Status ===\n" + gather_status())
|
||||||
parts.append("=== Aktuelle Fehler ===\n" + gather_errors(hours=1))
|
parts.append("=== Aktuelle Fehler ===\n" + gather_errors(hours=1))
|
||||||
|
warnings = prometheus_client.get_warnings()
|
||||||
|
if warnings:
|
||||||
|
parts.append("=== Prometheus Warnungen ===\n" + "\n".join(warnings))
|
||||||
|
|
||||||
return "\n\n".join(parts)
|
return "\n\n".join(parts)
|
||||||
|
|
|
||||||
|
|
@ -1,15 +1,18 @@
|
||||||
"""Prometheus API client for querying system metrics."""
|
"""Prometheus API client — fragt Host-Metriken ab für den Hausmeister-Bot."""
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
from datetime import datetime, timezone, timedelta
|
from datetime import datetime, timezone, timedelta
|
||||||
|
|
||||||
PROMETHEUS_URL = "http://100.88.230.59:9090"
|
PROMETHEUS_URL = "http://10.10.10.1:9090"
|
||||||
|
|
||||||
|
WARN_CPU = 80
|
||||||
|
WARN_MEM = 85
|
||||||
|
WARN_DISK = 85
|
||||||
|
|
||||||
|
|
||||||
def _query(endpoint: str, params: dict, base_url: str = None) -> dict:
|
def _query(endpoint: str, params: dict) -> dict:
|
||||||
url = f"{base_url or PROMETHEUS_URL}{endpoint}"
|
|
||||||
try:
|
try:
|
||||||
r = requests.get(url, params=params, timeout=10)
|
r = requests.get(f"{PROMETHEUS_URL}{endpoint}", params=params, timeout=10)
|
||||||
r.raise_for_status()
|
r.raise_for_status()
|
||||||
return r.json()
|
return r.json()
|
||||||
except requests.RequestException as e:
|
except requests.RequestException as e:
|
||||||
|
|
@ -17,12 +20,10 @@ def _query(endpoint: str, params: dict, base_url: str = None) -> dict:
|
||||||
|
|
||||||
|
|
||||||
def instant_query(query: str) -> dict:
|
def instant_query(query: str) -> dict:
|
||||||
"""Run an instant PromQL query."""
|
|
||||||
return _query("/api/v1/query", {"query": query})
|
return _query("/api/v1/query", {"query": query})
|
||||||
|
|
||||||
|
|
||||||
def range_query(query: str, hours: float = 1, step: str = "5m") -> dict:
|
def range_query(query: str, hours: float = 1, step: str = "5m") -> dict:
|
||||||
"""Run a range PromQL query."""
|
|
||||||
now = datetime.now(timezone.utc)
|
now = datetime.now(timezone.utc)
|
||||||
start = now - timedelta(hours=hours)
|
start = now - timedelta(hours=hours)
|
||||||
return _query("/api/v1/query_range", {
|
return _query("/api/v1/query_range", {
|
||||||
|
|
@ -33,101 +34,188 @@ def range_query(query: str, hours: float = 1, step: str = "5m") -> dict:
|
||||||
})
|
})
|
||||||
|
|
||||||
|
|
||||||
|
def is_available() -> bool:
|
||||||
|
data = _query("/api/v1/query", {"query": "up"})
|
||||||
|
return data.get("status") == "success"
|
||||||
|
|
||||||
|
|
||||||
def get_targets() -> list[dict]:
|
def get_targets() -> list[dict]:
|
||||||
"""Get all Prometheus scrape targets with their status."""
|
|
||||||
data = _query("/api/v1/targets", {})
|
data = _query("/api/v1/targets", {})
|
||||||
if "error" in data:
|
if "error" in data:
|
||||||
return [{"error": data["error"]}]
|
return [{"error": data["error"]}]
|
||||||
|
|
||||||
targets = []
|
targets = []
|
||||||
for t in data.get("data", {}).get("activeTargets", []):
|
for t in data.get("data", {}).get("activeTargets", []):
|
||||||
|
labels = t.get("labels", {})
|
||||||
targets.append({
|
targets.append({
|
||||||
"job": t.get("labels", {}).get("job", "unknown"),
|
"host": labels.get("host", labels.get("instance", "?")),
|
||||||
"instance": t.get("labels", {}).get("instance", "unknown"),
|
"location": labels.get("location", ""),
|
||||||
"health": t.get("health", "unknown"),
|
"health": t.get("health", "unknown"),
|
||||||
"last_scrape": t.get("lastScrape", ""),
|
"job": labels.get("job", ""),
|
||||||
})
|
})
|
||||||
return targets
|
return targets
|
||||||
|
|
||||||
|
|
||||||
def is_available() -> bool:
|
def _by_host(data: dict, metric: str) -> list[dict]:
|
||||||
"""Check if Prometheus is reachable."""
|
if data.get("status") != "success":
|
||||||
data = _query("/api/v1/query", {"query": "up"})
|
return []
|
||||||
return "error" not in data or data.get("status") == "success"
|
results = []
|
||||||
|
for r in data.get("data", {}).get("result", []):
|
||||||
|
m = r.get("metric", {})
|
||||||
|
host = m.get("host", m.get("instance", "?"))
|
||||||
|
val = float(r.get("value", [0, 0])[1])
|
||||||
|
results.append({"host": host, "value": val, "metric": metric})
|
||||||
|
return sorted(results, key=lambda x: x["host"])
|
||||||
|
|
||||||
|
|
||||||
def get_cpu(instance: str = None, hours: float = 1) -> dict:
|
def get_cpu(host: str = None) -> list[dict]:
|
||||||
"""Get CPU usage. If instance given, filter to that instance."""
|
filt = f', host="{host}"' if host else ""
|
||||||
if instance:
|
q = f'100 - (avg by (host) (rate(node_cpu_seconds_total{{mode="idle"{filt}}}[5m])) * 100)'
|
||||||
q = f'100 - (avg by (instance) (rate(node_cpu_seconds_total{{mode="idle", instance=~"{instance}.*"}}[5m])) * 100)'
|
return _by_host(instant_query(q), "cpu")
|
||||||
else:
|
|
||||||
q = '100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)'
|
|
||||||
data = instant_query(q)
|
|
||||||
return _extract_metrics(data, "cpu_percent")
|
|
||||||
|
|
||||||
|
|
||||||
def get_memory(instance: str = None) -> dict:
|
def get_memory(host: str = None) -> list[dict]:
|
||||||
"""Get memory usage percentage."""
|
filt = f', host="{host}"' if host else ""
|
||||||
if instance:
|
q = f'(1 - node_memory_MemAvailable_bytes{{host!=""{filt}}} / node_memory_MemTotal_bytes{{host!=""{filt}}}) * 100'
|
||||||
q = f'(1 - node_memory_MemAvailable_bytes{{instance=~"{instance}.*"}} / node_memory_MemTotal_bytes{{instance=~"{instance}.*"}}) * 100'
|
return _by_host(instant_query(q), "mem")
|
||||||
else:
|
|
||||||
q = '(1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100'
|
|
||||||
data = instant_query(q)
|
|
||||||
return _extract_metrics(data, "memory_percent")
|
|
||||||
|
|
||||||
|
|
||||||
def get_disk(instance: str = None) -> dict:
|
def get_disk(host: str = None) -> list[dict]:
|
||||||
"""Get root filesystem usage percentage."""
|
filt = f', host="{host}"' if host else ""
|
||||||
if instance:
|
q = f'(1 - node_filesystem_avail_bytes{{mountpoint="/", host!=""{filt}}} / node_filesystem_size_bytes{{mountpoint="/", host!=""{filt}}}) * 100'
|
||||||
q = f'(1 - node_filesystem_avail_bytes{{mountpoint="/", instance=~"{instance}.*"}} / node_filesystem_size_bytes{{mountpoint="/", instance=~"{instance}.*"}}) * 100'
|
return _by_host(instant_query(q), "disk")
|
||||||
else:
|
|
||||||
q = '(1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100'
|
|
||||||
data = instant_query(q)
|
|
||||||
return _extract_metrics(data, "disk_percent")
|
|
||||||
|
|
||||||
|
|
||||||
def get_overview() -> str:
|
def get_disk_bytes(host: str = None) -> list[dict]:
|
||||||
"""Get a formatted overview of all monitored instances."""
|
"""Returns available and total bytes per host."""
|
||||||
|
filt = f', host="{host}"' if host else ""
|
||||||
|
avail = _by_host(instant_query(
|
||||||
|
f'node_filesystem_avail_bytes{{mountpoint="/", host!=""{filt}}}'), "avail_bytes")
|
||||||
|
total = _by_host(instant_query(
|
||||||
|
f'node_filesystem_size_bytes{{mountpoint="/", host!=""{filt}}}'), "total_bytes")
|
||||||
|
total_map = {r["host"]: r["value"] for r in total}
|
||||||
|
result = []
|
||||||
|
for a in avail:
|
||||||
|
t = total_map.get(a["host"], 0)
|
||||||
|
result.append({
|
||||||
|
"host": a["host"],
|
||||||
|
"avail_gb": a["value"] / (1024**3),
|
||||||
|
"total_gb": t / (1024**3),
|
||||||
|
})
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
def get_uptime(host: str = None) -> list[dict]:
|
||||||
|
filt = f', host="{host}"' if host else ""
|
||||||
|
q = f'node_time_seconds{{host!=""{filt}}} - node_boot_time_seconds{{host!=""{filt}}}'
|
||||||
|
return _by_host(instant_query(q), "uptime_sec")
|
||||||
|
|
||||||
|
|
||||||
|
def get_load(host: str = None) -> list[dict]:
|
||||||
|
filt = f', host="{host}"' if host else ""
|
||||||
|
q = f'node_load5{{host!=""{filt}}}'
|
||||||
|
return _by_host(instant_query(q), "load5")
|
||||||
|
|
||||||
|
|
||||||
|
def get_warnings() -> list[str]:
|
||||||
|
"""Return list of warning strings for hosts exceeding thresholds."""
|
||||||
|
warnings = []
|
||||||
|
for r in get_cpu():
|
||||||
|
if r["value"] >= WARN_CPU:
|
||||||
|
warnings.append(f"🔴 {r['host']}: CPU {r['value']:.0f}%")
|
||||||
|
for r in get_memory():
|
||||||
|
if r["value"] >= WARN_MEM:
|
||||||
|
warnings.append(f"🔴 {r['host']}: RAM {r['value']:.0f}%")
|
||||||
|
for r in get_disk():
|
||||||
|
if r["value"] >= WARN_DISK:
|
||||||
|
warnings.append(f"🔴 {r['host']}: Disk {r['value']:.0f}%")
|
||||||
|
return warnings
|
||||||
|
|
||||||
|
|
||||||
|
def _fmt_uptime(seconds: float) -> str:
|
||||||
|
days = int(seconds // 86400)
|
||||||
|
hours = int((seconds % 86400) // 3600)
|
||||||
|
if days > 0:
|
||||||
|
return f"{days}d {hours}h"
|
||||||
|
return f"{hours}h"
|
||||||
|
|
||||||
|
|
||||||
|
def format_overview() -> str:
|
||||||
|
"""Kompakte Übersicht aller Hosts — für den Bot."""
|
||||||
if not is_available():
|
if not is_available():
|
||||||
return "Prometheus is not reachable at " + PROMETHEUS_URL
|
return "⚠️ Prometheus nicht erreichbar."
|
||||||
|
|
||||||
targets = get_targets()
|
cpu = {r["host"]: r["value"] for r in get_cpu()}
|
||||||
if not targets:
|
mem = {r["host"]: r["value"] for r in get_memory()}
|
||||||
return "No Prometheus targets found."
|
disk = {r["host"]: r["value"] for r in get_disk()}
|
||||||
|
disk_gb = {r["host"]: r for r in get_disk_bytes()}
|
||||||
|
uptime = {r["host"]: r["value"] for r in get_uptime()}
|
||||||
|
load = {r["host"]: r["value"] for r in get_load()}
|
||||||
|
|
||||||
lines = ["## Prometheus Targets\n"]
|
hosts = sorted(set(list(cpu.keys()) + list(mem.keys()) + list(disk.keys())))
|
||||||
for t in targets:
|
if not hosts:
|
||||||
status = "UP" if t["health"] == "up" else "DOWN"
|
return "Keine Prometheus-Daten verfügbar."
|
||||||
lines.append(f"- [{status}] {t['job']} ({t['instance']})")
|
|
||||||
|
|
||||||
cpu = get_cpu()
|
lines = [f"📊 Server-Metriken ({len(hosts)} Hosts)\n"]
|
||||||
if "results" in cpu:
|
for h in hosts:
|
||||||
lines.append("\n## CPU Usage")
|
c = cpu.get(h, -1)
|
||||||
for r in cpu["results"]:
|
m = mem.get(h, -1)
|
||||||
lines.append(f"- {r['instance']}: {r['value']:.1f}%")
|
d = disk.get(h, -1)
|
||||||
|
dinfo = disk_gb.get(h, {})
|
||||||
|
u = uptime.get(h, 0)
|
||||||
|
l5 = load.get(h, -1)
|
||||||
|
|
||||||
mem = get_memory()
|
warn = ""
|
||||||
if "results" in mem:
|
if c >= WARN_CPU or m >= WARN_MEM or d >= WARN_DISK:
|
||||||
lines.append("\n## Memory Usage")
|
warn = " ⚠️"
|
||||||
for r in mem["results"]:
|
|
||||||
lines.append(f"- {r['instance']}: {r['value']:.1f}%")
|
|
||||||
|
|
||||||
disk = get_disk()
|
disk_str = f"{d:.0f}%"
|
||||||
if "results" in disk:
|
if dinfo:
|
||||||
lines.append("\n## Disk Usage (/)")
|
disk_str += f" ({dinfo.get('avail_gb', 0):.0f}/{dinfo.get('total_gb', 0):.0f} GB frei)"
|
||||||
for r in disk["results"]:
|
|
||||||
lines.append(f"- {r['instance']}: {r['value']:.1f}%")
|
lines.append(
|
||||||
|
f"{'🟢' if c >= 0 else '🔴'} {h}{warn}\n"
|
||||||
|
f" CPU: {c:.0f}% RAM: {m:.0f}% Disk: {disk_str}\n"
|
||||||
|
f" Load5: {l5:.1f} Uptime: {_fmt_uptime(u)}"
|
||||||
|
)
|
||||||
|
|
||||||
|
warnings = get_warnings()
|
||||||
|
if warnings:
|
||||||
|
lines.append("\n⚠️ WARNUNGEN:")
|
||||||
|
lines.extend(warnings)
|
||||||
|
else:
|
||||||
|
lines.append("\n✅ Alle Werte im Normalbereich.")
|
||||||
|
|
||||||
return "\n".join(lines)
|
return "\n".join(lines)
|
||||||
|
|
||||||
|
|
||||||
def _extract_metrics(data: dict, metric_name: str) -> dict:
|
def format_host_detail(host: str) -> str:
|
||||||
if "error" in data and data.get("status") != "success":
|
"""Detail-Metriken für einen einzelnen Host."""
|
||||||
return {"error": data.get("error", "unknown error")}
|
if not is_available():
|
||||||
|
return "⚠️ Prometheus nicht erreichbar."
|
||||||
|
|
||||||
results = []
|
cpu = get_cpu(host)
|
||||||
for r in data.get("data", {}).get("result", []):
|
mem = get_memory(host)
|
||||||
instance = r.get("metric", {}).get("instance", "unknown")
|
disk = get_disk(host)
|
||||||
value = float(r.get("value", [0, 0])[1])
|
disk_gb = get_disk_bytes(host)
|
||||||
results.append({"instance": instance, "value": value, "metric": metric_name})
|
uptime = get_uptime(host)
|
||||||
return {"results": results}
|
load = get_load(host)
|
||||||
|
|
||||||
|
if not cpu and not mem:
|
||||||
|
return f"Keine Metriken für '{host}' gefunden."
|
||||||
|
|
||||||
|
lines = [f"📊 {host} — Detail\n"]
|
||||||
|
if cpu:
|
||||||
|
lines.append(f"CPU: {cpu[0]['value']:.1f}%")
|
||||||
|
if mem:
|
||||||
|
lines.append(f"RAM: {mem[0]['value']:.1f}%")
|
||||||
|
if disk:
|
||||||
|
d_str = f"Disk: {disk[0]['value']:.1f}%"
|
||||||
|
if disk_gb:
|
||||||
|
d_str += f" ({disk_gb[0]['avail_gb']:.1f} / {disk_gb[0]['total_gb']:.1f} GB frei)"
|
||||||
|
lines.append(d_str)
|
||||||
|
if load:
|
||||||
|
lines.append(f"Load (5m): {load[0]['value']:.2f}")
|
||||||
|
if uptime:
|
||||||
|
lines.append(f"Uptime: {_fmt_uptime(uptime[0]['value'])}")
|
||||||
|
|
||||||
|
return "\n".join(lines)
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue