"""Proactive monitoring — rule-based (stage 1) + AI (stage 2).

This module contains the rule-based checks: container state and memory
pressure from Proxmox, critical log lines and silent hosts from Loki.
Alerts are delivered through the Telegram Bot API.

Usage as a script:
    python monitor.py          run the check and send alerts if any
    python monitor.py report   print the daily report to stdout
"""

import os
import sys

import requests

# Make the sibling ``core`` package importable when this file is started
# directly. The absolute path keeps the entry valid even when the script is
# invoked by a relative name (where dirname(__file__) would be "").
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from core import config, loki_client, proxmox_client  # noqa: E402


def _get_tokens(cfg):
    """Build the Proxmox API token mapping from the parsed configuration.

    Returns a dict with a single ``"pve-hetzner"`` entry when both the token
    name and the token value are configured, otherwise an empty dict.
    """
    tokens = {}
    token_name = cfg.raw.get("PVE_TOKEN_HETZNER_NAME", "")
    token_value = cfg.raw.get("PVE_TOKEN_HETZNER_VALUE", "")
    if token_name and token_value:
        tokens["pve-hetzner"] = {"name": token_name, "value": token_value}
    return tokens


def _get_passwords(cfg):
    """Map every host in ``proxmox_client.PROXMOX_HOSTS`` to the default password."""
    default_password = cfg.passwords.get("default", "")
    return {host: default_password for host in proxmox_client.PROXMOX_HOSTS}


# Container IDs that must always be in state "running".
CRITICAL_CONTAINERS = [101, 109, 111, 112, 113, 115]

# Host labels that are never reported (unresolved template placeholder, empty).
IGNORED_HOSTS = {"${HOSTNAME}", ""}

# A container using more than this fraction of its memory limit is reported.
RAM_ALERT_RATIO = 0.90

# A host without log lines for this many minutes is considered silent.
SILENCE_MINUTES = 35

# Substrings (matched case-insensitively) that mark a log line as critical.
# NOTE(review): plain substring matching means "oom" also hits words such as
# "room" or "bloom" — confirm whether that is acceptable.
PANIC_KEYWORDS = ("panic", "fatal", "oom", "out of memory")

# Substrings (matched case-sensitively) that exclude a log line from the
# critical check even when it contains one of the keywords above.
PANIC_EXCLUDES = ("query=", "caller=metrics")

# Maximum text length accepted by Telegram's sendMessage method.
TELEGRAM_MAX_MESSAGE_LENGTH = 4096


def _container_alerts(containers) -> list[str]:
    """Return alerts for stopped critical containers and high memory usage."""
    alerts = []
    for ct in containers:
        # Entries carrying an "error" key are skipped here; format_report
        # counts them as "nicht erreichbar".
        if "error" in ct:
            continue
        vmid = ct.get("vmid", 0)
        name = ct.get("name", "?")
        status = ct.get("status", "unknown")
        if vmid in CRITICAL_CONTAINERS and status != "running":
            alerts.append(f"🔴 CT {vmid} ({name}) ist {status}!")

        # A missing or zero memory limit means the ratio is unknown; skip it
        # rather than dividing by a made-up limit and reporting a bogus
        # percentage.
        mem = ct.get("mem") or 0
        maxmem = ct.get("maxmem") or 0
        if maxmem > 0 and mem / maxmem > RAM_ALERT_RATIO:
            pct = int(mem / maxmem * 100)
            alerts.append(f"⚠️ CT {vmid} ({name}) RAM bei {pct}%")
    return alerts


def _log_alerts() -> list[str]:
    """Return at most one alert naming hosts with critical log lines (last 30 min)."""
    entries = loki_client.get_errors(hours=0.5, limit=50)
    hosts = set()
    for entry in entries:
        # Entries carrying an "error" key are not log lines.
        if "error" in entry:
            continue
        line = entry.get("line", "")
        if any(marker in line for marker in PANIC_EXCLUDES):
            continue
        lowered = line.lower()
        if any(keyword in lowered for keyword in PANIC_KEYWORDS):
            hosts.add(entry.get("host", "?"))

    hosts -= IGNORED_HOSTS
    if not hosts:
        return []
    # Sorted so the same set of hosts always produces the same alert text.
    return [
        f"🔴 Kritische Fehler (panic/fatal/OOM) auf: {', '.join(sorted(hosts))}"
    ]


def _silent_host_names(silent) -> list[str]:
    """Extract reportable host names from a ``check_silence`` result.

    Returns an empty list when the result is empty or its first entry carries
    an "error" key. Entries without a host, or with an ignored host, are
    dropped; the remaining names keep the order of the result.
    """
    if not silent or "error" in silent[0]:
        return []
    names = []
    for entry in silent:
        host = entry.get("host")
        if host is not None and host not in IGNORED_HOSTS:
            names.append(host)
    return names


def _silence_alerts(silent) -> list[str]:
    """Return at most one alert listing hosts that stopped sending logs."""
    names = _silent_host_names(silent)
    if not names:
        return []
    return [f"⚠️ Keine Logs seit {SILENCE_MINUTES}+ Min: {', '.join(names)}"]


def check_all() -> list[str]:
    """Rule-based check (stage 1). Return a list of alert messages.

    Queries Proxmox for all containers and Loki for recent critical log lines
    and silent hosts. The list is empty when nothing is wrong.
    """
    cfg = config.parse_config()
    containers = proxmox_client.get_all_containers(
        _get_passwords(cfg), _get_tokens(cfg)
    )
    alerts = _container_alerts(containers)
    alerts.extend(_log_alerts())
    alerts.extend(
        _silence_alerts(loki_client.check_silence(minutes=SILENCE_MINUTES))
    )
    return alerts


def format_report() -> str:
    """Daily report: overall status of all systems as a multi-line string.

    The container list and the silence result are fetched once and shared
    between the summary lines and the alert section, so both parts describe
    the same snapshot.
    """
    cfg = config.parse_config()
    lines = ["📋 Tagesbericht Homelab\n"]

    containers = proxmox_client.get_all_containers(
        _get_passwords(cfg), _get_tokens(cfg)
    )
    running = [c for c in containers if c.get("status") == "running"]
    stopped = [c for c in containers if c.get("status") == "stopped"]
    unreachable = [c for c in containers if "error" in c]
    lines.append(
        f"Container: {len(running)} running, {len(stopped)} stopped, "
        f"{len(unreachable)} nicht erreichbar"
    )

    errors = loki_client.get_errors(hours=24, limit=100)
    error_count = sum(1 for e in errors if "error" not in e)
    lines.append(f"Fehler (24h): {error_count}")

    silent = loki_client.check_silence(minutes=SILENCE_MINUTES)
    names = _silent_host_names(silent)
    if names:
        lines.append(f"Stille Hosts: {', '.join(names)}")
    else:
        lines.append("Stille Hosts: keine")

    alerts = _container_alerts(containers)
    alerts.extend(_log_alerts())
    alerts.extend(_silence_alerts(silent))
    if alerts:
        lines.append(f"\n⚠️ {len(alerts)} aktive Alarme:")
        lines.extend(alerts)
    else:
        lines.append("\n✅ Keine Alarme — alles läuft.")

    return "\n".join(lines)


def _split_message(message: str, limit: int = TELEGRAM_MAX_MESSAGE_LENGTH) -> list[str]:
    """Split *message* into chunks of at most *limit* characters.

    Chunks break on line boundaries where possible; a single line longer than
    *limit* is cut hard. A message that already fits is returned unchanged as
    the only chunk.
    """
    if len(message) <= limit:
        return [message]

    chunks = []
    current = ""
    for line in message.splitlines(keepends=True):
        # A line that cannot fit into any chunk is cut into limit-sized pieces.
        while len(line) > limit:
            if current:
                chunks.append(current)
                current = ""
            chunks.append(line[:limit])
            line = line[limit:]
        if len(current) + len(line) > limit:
            chunks.append(current)
            current = ""
        current += line
    if current:
        chunks.append(current)
    return chunks


def send_alert(token: str, chat_id: str, message: str) -> bool:
    """Send a message via Telegram.

    Messages longer than Telegram's per-message limit are sent as several
    consecutive messages.

    Returns:
        True when every request was answered with a success status, False
        otherwise. Network errors raised by ``requests`` are not caught.
    """
    delivered = True
    for chunk in _split_message(message):
        response = requests.post(
            f"https://api.telegram.org/bot{token}/sendMessage",
            data={"chat_id": chat_id, "text": chunk},
            timeout=10,
        )
        delivered = delivered and response.ok
    return delivered


def run_check_and_alert():
    """Main entry point for cron: run the check and send alerts if needed.

    Does nothing when the Telegram token or chat id is not configured.
    """
    cfg = config.parse_config()
    token = cfg.raw.get("TG_HAUSMEISTER_TOKEN", "")
    chat_id = cfg.raw.get("TG_CHAT_ID", "")
    if not token or not chat_id:
        return

    alerts = check_all()
    if alerts:
        msg = "🔧 Hausmeister-Check\n\n" + "\n".join(alerts)
        send_alert(token, chat_id, msg)


if __name__ == "__main__":
    if len(sys.argv) > 1 and sys.argv[1] == "report":
        print(format_report())
    else:
        run_check_and_alert()