# Source: homelab-brain/homelab-ai-bot/monitor.py
# Last commit: orbitalo 723196fe64 — "monitor: Grafana-Healthcheck auf /api/health umstellen"
#   (Switch the Grafana health check to /api/health: more stable than the root URL;
#   reduces false positives caused by brief 5xx responses on /.)
# Committed: 2026-04-01 09:24:11 +00:00
#
# 364 lines, 12 KiB, Python

"""Proaktives Monitoring — regelbasiert (Stufe 1) + KI (Stufe 2)."""
import sys
import os
import json
import hashlib
import requests
import time
from datetime import datetime, timezone
sys.path.insert(0, os.path.dirname(__file__))
from core import config, loki_client, proxmox_client, mail_client
# JSON file that keeps alert cooldowns and seen-mail fingerprints between runs.
ALERT_STATE_FILE = "/var/cache/hausmeister-alert-state.json"

# Minimum number of seconds before an identical alert of a given category is
# reported again; categories are the ones returned by _alert_category().
ALERT_COOLDOWN_SECONDS = {
    "container": 30 * 60,
    "ram": 30 * 60,
    "panic": 60 * 60,
    "silence": 60 * 60,
    "http": 30 * 60,
    "restart": 15 * 60,
    "memory_expiry": 12 * 60 * 60,
    "default": 60 * 60,
    "error_rate": 30 * 60,
}
def _get_tokens(cfg):
tokens = {}
tn = cfg.raw.get("PVE_TOKEN_HETZNER_NAME", "")
tv = cfg.raw.get("PVE_TOKEN_HETZNER_VALUE", "")
if tn and tv:
tokens["pve-hetzner"] = {"name": tn, "value": tv}
return tokens
def _get_passwords(cfg):
    """Map every host in ``proxmox_client.PROXMOX_HOSTS`` to the default password.

    The password is ``cfg.passwords["default"]``, or an empty string when that
    key is missing.
    """
    default_password = cfg.passwords.get("default", "")
    return dict.fromkeys(proxmox_client.PROXMOX_HOSTS, default_password)
# VMIDs that raise an alert in check_all() whenever they are not running
# (unless whitelisted by the EXPECTED_STOPPED* sets below).
CRITICAL_CONTAINERS = [101, 109, 111, 112, 113, 115]

# HTTP endpoints probed by check_all(). Optional per-check keys: "retries"
# (default 1), "timeout" in seconds (default 15), "retry_delay" in seconds
# (default 3).
HTTP_HEALTH_CHECKS = [
    {"name": "WordPress (CT 101)", "url": "http://10.10.10.101/robots.txt"},
    {"name": "Matomo (CT 113)", "url": "http://10.10.10.113"},
    {"name": "Grafana (CT 110)", "url": "http://10.10.10.110:3000/api/health"},
    {
        "name": "Flugscanner-Agent (pve-pp-1)",
        "url": "http://100.126.26.46:5010/status",
        "retries": 5,
        "timeout": 25,
        "retry_delay": 6,
    },
]

# (vmid, host) pairs that are allowed to be non-running without an alert.
EXPECTED_STOPPED = {
    (115, "pve-ka-1"),   # flugscanner-asia-old (stopped, cluster pve1) — live CT runs on pve-pp-1
    (115, "pve-ka-3"),   # same CT, second API view (cluster)
    (101, "pp1"),        # yt-desktop-standby — spare, intentionally stopped (pp-cluster)
    (101, "pp2"),        # yt-desktop-standby — spare, intentionally stopped (pp-cluster)
    (504, "pve-ka-2"),   # shop template — stopped
    (8000, "pve-ka-2"),  # Kunde0-Shop — stopped
    (8010, "pve-ka-2"),  # Kunde1-Shop — stopped
}

# VMIDs that are fine on any configured Proxmox host as long as status == "stopped"
# (115 also shows up as pve-hetzner or similar depending on API mapping, not only ka-1/ka-3).
EXPECTED_STOPPED_VMIDS = {115, 504, 8000, 8010}

# Host labels that are never reported (unresolved template placeholder, empty label).
IGNORED_HOSTS = {"${HOSTNAME}", ""}

# Hosts excluded from the "no logs" check in check_all().
SILENCE_IGNORED_HOSTS = {
    "ct-600-webcam",          # no rsyslog, but the stream is running
    "ct-103-Intercity-Taxi",  # intentionally stopped
    "ct-101-freshrss",        # on pve-ka-3, local Loki (not the central one)
}
def _container_alerts(containers) -> list[str]:
    """Return alerts for critical containers that are down and for RAM usage above 90%."""
    alerts = []
    for ct in containers:
        if "error" in ct:
            # Entries carrying an "error" key are skipped here; format_report()
            # counts them as unreachable.
            continue
        vmid = ct.get("vmid", 0)
        name = ct.get("name", "?")
        status = ct.get("status", "unknown")
        host = ct.get("_host", "")

        if vmid in CRITICAL_CONTAINERS and status != "running":
            # Whitelisted either per (vmid, host) pair or, for "stopped" only, per vmid.
            ok_stopped = vmid in EXPECTED_STOPPED_VMIDS and status == "stopped"
            if (vmid, host) not in EXPECTED_STOPPED and not ok_stopped:
                alerts.append(f"🔴 CT {vmid} ({name}) ist {status}!")

        mem = ct.get("mem", 0)
        maxmem = ct.get("maxmem", 1)
        if maxmem > 0 and mem / maxmem > 0.90:
            pct = int(mem / maxmem * 100)
            alerts.append(f"⚠️ CT {vmid} ({name}) RAM bei {pct}%")
    return alerts


def _is_critical_log_line(line: str) -> bool:
    """Return True if *line* mentions panic/fatal/OOM and is not a known false positive."""
    lowered = line.lower()
    # NOTE(review): plain substring match — "oom" also matches words such as "room".
    if not any(word in lowered for word in ("panic", "fatal", "oom", "out of memory")):
        return False
    # Presumably query/metrics log lines that merely echo the search terms — confirm.
    if "query=" in line or "caller=metrics" in line:
        return False
    # Successful or redirected HTTP access-log entries are not failures.
    ok_markers = ('" 200 ', '" 301 ', '" 302 ', '" 304 ')
    if "HTTP/1." in line and any(marker in line for marker in ok_markers):
        return False
    # Any other request line: the keyword is assumed to be part of the URL.
    if any(marker in line for marker in ("GET /", "POST /", "HEAD /")):
        return False
    return True


def _panic_alerts() -> list[str]:
    """Return one alert naming every host with panic/fatal/OOM log lines in the last 30 minutes."""
    errors = loki_client.get_errors(hours=0.5, limit=50)
    hosts = {
        entry.get("host", "?")
        for entry in errors
        if "error" not in entry and _is_critical_log_line(entry.get("line", ""))
    }
    hosts -= IGNORED_HOSTS
    if not hosts:
        return []
    # Sorted so the alert text — and therefore its md5 cooldown key — is the
    # same on every run; iterating a set of strings is not stable across
    # interpreter runs.
    return [f"🔴 Kritische Fehler (panic/fatal/OOM) auf: {', '.join(sorted(hosts))}"]


def _error_rate_alerts() -> list[str]:
    """Return one alert per entry reported by ``loki_client.check_error_rate``."""
    return [
        f"🔴 {er['host']}: {er['count']} Fehler in 30 Min (Schwelle: {er['threshold']})"
        for er in loki_client.check_error_rate(minutes=30)
    ]


def _silence_alerts(containers) -> list[str]:
    """Return an alert listing running containers that appear in the 35-minute silence check."""
    running_names = {
        ct.get("name", "").lower()
        for ct in containers
        if "error" not in ct and ct.get("status") == "running"
    }
    silent = loki_client.check_silence(minutes=35)
    if not silent or "error" in silent[0]:
        return []
    names = []
    for entry in silent:
        # A missing or None host is treated like the empty label, which is in
        # IGNORED_HOSTS, instead of raising.
        host = entry.get("host") or ""
        if host in IGNORED_HOSTS or host in SILENCE_IGNORED_HOSTS:
            continue
        # Only containers that are actually running are expected to log.
        if host.lower() in running_names:
            names.append(host)
    if not names:
        return []
    return [f"⚠️ Keine Logs seit 35+ Min: {', '.join(names)}"]


def _http_alerts() -> list[str]:
    """Probe every entry of HTTP_HEALTH_CHECKS with HEAD and return alerts for failures."""
    headers = {"User-Agent": "Mozilla/5.0 (Hausmeister-Bot/1.0 health-check)"}
    alerts = []
    for check in HTTP_HEALTH_CHECKS:
        timeout = check.get("timeout", 15)
        retries = check.get("retries", 1)
        retry_delay = check.get("retry_delay", 3)
        msg = None
        for attempt in range(retries):
            try:
                response = requests.head(
                    check["url"], timeout=timeout, allow_redirects=True, headers=headers
                )
                if response.status_code < 400:
                    msg = None
                    break
                msg = f"🔴 {check['name']} antwortet mit HTTP {response.status_code}"
            except requests.RequestException as exc:
                msg = f"🔴 {check['name']} nicht erreichbar: {str(exc)[:80]}"
            # Wait between attempts, but not after the last one.
            if attempt < retries - 1:
                time.sleep(retry_delay)
        if msg:
            alerts.append(msg)
    return alerts


def _restart_alerts() -> list[str]:
    """Return one alert per entry reported by ``loki_client.check_service_restarts``."""
    return [
        f"🔄 Service-Neustart: {r['service']} auf {r['host']} ({r['count']}x in 35 Min)"
        for r in loki_client.check_service_restarts(minutes=35)
    ]


def _memory_expiry_alerts() -> list[str]:
    """Return reminders for active memory entries that expire within the next 24 hours."""
    import logging

    alerts = []
    try:
        import memory_client

        now_ts = int(time.time())
        for item in memory_client.get_active_memory():
            exp = item.get("expires_at")
            if exp and 0 < exp - now_ts < 86400:
                exp_str = datetime.fromtimestamp(exp).strftime("%d.%m. %H:%M")
                alerts.append(f"⏰ Memory läuft ab ({exp_str}): {item['content'][:80]}")
    except Exception:
        # Best effort: a failure here must not break the remaining checks.
        logging.getLogger(__name__).debug("memory expiry check skipped", exc_info=True)
    return alerts


def _mail_alerts(cfg) -> list[str]:
    """Return an alert for important mails of the last hour that were not reported before.

    Fingerprints of reported mails are stored under ``"seen_mails"`` in the
    alert state file and dropped after 48 hours.
    """
    import logging

    try:
        mail_client.init(cfg)
        important = mail_client.get_important_mails(hours=1)
        if not important or "error" in important[0]:
            return []
        state = _load_alert_state()
        seen = state.get("seen_mails", {})
        now = datetime.now(timezone.utc).timestamp()
        new_mails = []
        for mail in important:
            fingerprint = hashlib.md5(
                f"{mail.get('date_str','')}{mail.get('from','')}{mail.get('subject','')}".encode(),
                usedforsecurity=False,
            ).hexdigest()
            if fingerprint not in seen:
                new_mails.append(mail)
                seen[fingerprint] = now
        state["seen_mails"] = {k: v for k, v in seen.items() if now - v < 172800}
        _save_alert_state(state)
        if new_mails:
            senders = [mail["from"][:30] for mail in new_mails]
            return [f"📧 {len(new_mails)} neue wichtige Mail(s): {', '.join(senders)}"]
    except Exception:
        # Best effort: a failure here must not break the remaining checks.
        logging.getLogger(__name__).debug("mail check skipped", exc_info=True)
    return []


def check_all() -> list[str]:
    """Run the rule-based checks (stage 1) and return the resulting alert texts.

    Checks run in this order: container status and RAM, critical log lines,
    error rate, log silence, HTTP health, service restarts, expiring memory
    entries, important mails. The memory and mail checks are best effort and
    never raise; the others propagate exceptions from their clients.

    Side effect: the mail check updates the ``"seen_mails"`` section of the
    alert state file.
    """
    cfg = config.parse_config()
    containers = proxmox_client.get_all_containers(
        _get_passwords(cfg), _get_tokens(cfg)
    )
    alerts = []
    alerts += _container_alerts(containers)
    alerts += _panic_alerts()
    alerts += _error_rate_alerts()
    alerts += _silence_alerts(containers)
    alerts += _http_alerts()
    alerts += _restart_alerts()
    alerts += _memory_expiry_alerts()
    alerts += _mail_alerts(cfg)
    return alerts
def format_report() -> str:
    """Build the daily report: overall status of all systems.

    Returns a newline-joined text with container counts, the number of log
    errors in the last 24 hours, silent hosts, a memory summary (best effort)
    and the current alerts from ``check_all()``.

    Note: calling ``check_all()`` also updates the seen-mail section of the
    alert state file.
    """
    import logging

    cfg = config.parse_config()
    lines = ["📋 Tagesbericht Homelab\n"]

    containers = proxmox_client.get_all_containers(
        _get_passwords(cfg), _get_tokens(cfg)
    )
    running = sum(1 for c in containers if c.get("status") == "running")
    stopped = sum(1 for c in containers if c.get("status") == "stopped")
    unreachable = sum(1 for c in containers if "error" in c)
    lines.append(f"Container: {running} running, {stopped} stopped, {unreachable} nicht erreichbar")

    errors = loki_client.get_errors(hours=24, limit=100)
    error_count = sum(1 for e in errors if "error" not in e)
    lines.append(f"Fehler (24h): {error_count}")

    silent = loki_client.check_silence(minutes=35)
    names = []
    if silent and "error" not in silent[0]:
        for entry in silent:
            # A missing or None host is treated like the empty label, which is
            # in IGNORED_HOSTS, instead of raising.
            host = entry.get("host") or ""
            if host not in IGNORED_HOSTS:
                names.append(host)
    lines.append(f"Stille Hosts: {', '.join(names)}" if names else "Stille Hosts: keine")

    try:
        import memory_client

        mem_items = memory_client.get_active_memory()
        perm = [i for i in mem_items if i.get("memory_type") != "temporary"]
        temp = [i for i in mem_items if i.get("memory_type") == "temporary"]
        candidates = memory_client.get_candidates()
        mem_line = f"Memory: {len(perm)} dauerhaft, {len(temp)} temporär"
        now_ts = int(time.time())
        # Same window as the expiry reminder in check_all(): entries whose
        # expiry already passed are not "about to expire".
        soon = [
            i for i in temp
            if i.get("expires_at") and 0 < i["expires_at"] - now_ts < 86400
        ]
        if soon:
            mem_line += f", {len(soon)} laufen in 24h ab"
        if candidates:
            mem_line += f", {len(candidates)} Kandidaten offen"
        lines.append(mem_line)
    except Exception:
        # Best effort: the report is still produced without the memory line.
        logging.getLogger(__name__).debug("memory summary skipped", exc_info=True)

    alerts = check_all()
    if alerts:
        lines.append(f"\n⚠️ {len(alerts)} aktive Alarme:")
        lines.extend(alerts)
    else:
        lines.append("\n✅ Keine Alarme — alles läuft.")
    return "\n".join(lines)
def _load_alert_state() -> dict:
try:
with open(ALERT_STATE_FILE, "r") as f:
return json.load(f)
except (FileNotFoundError, json.JSONDecodeError):
return {}
def _save_alert_state(state: dict):
try:
with open(ALERT_STATE_FILE, "w") as f:
json.dump(state, f)
except Exception:
pass
def _alert_key(alert_text: str) -> str:
return hashlib.md5(alert_text.encode()).hexdigest()
def _alert_category(alert_text: str) -> str:
if "CT " in alert_text and "ist " in alert_text:
return "container"
if "RAM " in alert_text:
return "ram"
if "panic" in alert_text.lower() or "fatal" in alert_text.lower():
return "panic"
if "Fehler in 30 Min" in alert_text:
return "error_rate"
if "Keine Logs" in alert_text:
return "silence"
if "antwortet mit HTTP" in alert_text or "nicht erreichbar" in alert_text:
return "http"
if "Service-Neustart" in alert_text:
return "restart"
if "Memory läuft ab" in alert_text:
return "memory_expiry"
return "default"
def _filter_new_alerts(alerts: list[str]) -> list[str]:
    """Drop infrastructure alerts that are still inside their cooldown window.

    An alert passes when more seconds than its category cooldown have elapsed
    since it was last recorded under the same text hash. Passed alerts are
    recorded with the current time, entries older than 24 hours are pruned,
    and the state is saved. Mails are de-duplicated separately in check_all().
    """
    state = _load_alert_state()
    now = datetime.now(timezone.utc).timestamp()
    sent_log = state.get("alert_cooldowns", {})

    due = []
    for text in alerts:
        fingerprint = _alert_key(text)
        category = _alert_category(text)
        wait = ALERT_COOLDOWN_SECONDS.get(category, 3600)
        previous = sent_log.get(fingerprint, {}).get("ts", 0)
        if now - previous <= wait:
            # Still cooling down — suppress.
            continue
        due.append(text)
        sent_log[fingerprint] = {"ts": now, "text": text[:80], "cat": category}

    # Forget entries that have not fired for a day.
    oldest_kept = now - 86400
    state["alert_cooldowns"] = {
        key: entry for key, entry in sent_log.items() if entry.get("ts", 0) > oldest_kept
    }
    _save_alert_state(state)
    return due
def send_alert(token: str, chat_id: str, message: str):
    """Send *message* to a Telegram chat via the Bot API ``sendMessage`` method.

    Args:
        token: Bot token.
        chat_id: Target chat id.
        message: Plain text to send.

    Telegram rejects texts longer than 4096 characters, so longer messages are
    split — on line boundaries where possible — and sent as several messages.
    Messages within the limit are sent unchanged in a single request.

    Raises:
        requests.RequestException: propagated from ``requests.post``.
    """
    # Leaves headroom below the documented 4096-character limit.
    max_len = 4000

    if len(message) <= max_len:
        chunks = [message]
    else:
        chunks = []
        current = ""
        for line in message.split("\n"):
            # A single over-long line is cut hard; an empty line stays one empty piece.
            pieces = [line[i:i + max_len] for i in range(0, len(line), max_len)] or [""]
            for piece in pieces:
                if current and len(current) + 1 + len(piece) > max_len:
                    chunks.append(current)
                    current = piece
                elif current:
                    current = f"{current}\n{piece}"
                else:
                    current = piece
        if current:
            chunks.append(current)

    for chunk in chunks:
        requests.post(
            f"https://api.telegram.org/bot{token}/sendMessage",
            data={"chat_id": chat_id, "text": chunk},
            timeout=10,
        )
def run_check_and_alert():
    """Cron entry point: run all checks and send the alerts that are due.

    Does nothing when ``TG_HAUSMEISTER_TOKEN`` or ``TG_CHAT_ID`` is missing
    from the configuration, or when no alert passes the cooldown filter.
    """
    cfg = config.parse_config()
    token = cfg.raw.get("TG_HAUSMEISTER_TOKEN", "")
    chat_id = cfg.raw.get("TG_CHAT_ID", "")
    if not (token and chat_id):
        # Without Telegram credentials there is nowhere to report to.
        return

    pending = _filter_new_alerts(check_all())
    if not pending:
        return
    send_alert(token, chat_id, "🔧 Hausmeister-Check\n\n" + "\n".join(pending))
if __name__ == "__main__":
    import sys as _sys

    # "report" as first argument prints the daily report; anything else
    # (or no argument) runs the check-and-alert cycle.
    _mode = _sys.argv[1] if len(_sys.argv) > 1 else ""
    if _mode == "report":
        print(format_report())
    else:
        run_check_and_alert()