"""Proaktives Monitoring — regelbasiert (Stufe 1) + KI (Stufe 2).""" import sys import os import requests sys.path.insert(0, os.path.dirname(__file__)) from core import config, loki_client, proxmox_client, mail_client def _get_tokens(cfg): tokens = {} tn = cfg.raw.get("PVE_TOKEN_HETZNER_NAME", "") tv = cfg.raw.get("PVE_TOKEN_HETZNER_VALUE", "") if tn and tv: tokens["pve-hetzner"] = {"name": tn, "value": tv} return tokens def _get_passwords(cfg): pw = cfg.passwords.get("default", "") return {host: pw for host in proxmox_client.PROXMOX_HOSTS} CRITICAL_CONTAINERS = [101, 109, 111, 112, 113, 115] IGNORED_HOSTS = {"${HOSTNAME}", ""} def check_all() -> list[str]: """Regelbasierter Check (Stufe 1). Gibt Liste von Alarmen zurück.""" cfg = config.parse_config() alerts = [] containers = proxmox_client.get_all_containers( _get_passwords(cfg), _get_tokens(cfg) ) for ct in containers: if "error" in ct: continue vmid = ct.get("vmid", 0) name = ct.get("name", "?") status = ct.get("status", "unknown") if vmid in CRITICAL_CONTAINERS and status != "running": alerts.append(f"🔴 CT {vmid} ({name}) ist {status}!") mem = ct.get("mem", 0) maxmem = ct.get("maxmem", 1) if maxmem > 0 and mem / maxmem > 0.90: pct = int(mem / maxmem * 100) alerts.append(f"⚠️ CT {vmid} ({name}) RAM bei {pct}%") errors = loki_client.get_errors(hours=0.5, limit=50) error_lines = [e for e in errors if "error" not in e] panic_lines = [] for e in error_lines: line = e.get("line", "") ll = line.lower() if not any(w in ll for w in ["panic", "fatal", "oom", "out of memory"]): continue if "query=" in line or "caller=metrics" in line: continue if "HTTP/1." in line and ('" 200 ' in line or '" 301 ' in line or '" 302 ' in line or '" 304 ' in line): continue if "GET /" in line or "POST /" in line or "HEAD /" in line: continue panic_lines.append(e) if panic_lines: hosts = set(e.get("host", "?") for e in panic_lines) hosts -= IGNORED_HOSTS if hosts: alerts.append(f"🔴 Kritische Fehler (panic/fatal/OOM) auf: {', '.join(hosts)}") silent = loki_client.check_silence(minutes=35) if silent and "error" not in silent[0]: names = [s["host"] for s in silent if s.get("host") not in IGNORED_HOSTS] if names: alerts.append(f"⚠️ Keine Logs seit 35+ Min: {', '.join(names)}") try: mail_client.init(cfg) important = mail_client.get_important_mails(hours=1) if important and "error" not in important[0]: senders = [m["from"][:30] for m in important] alerts.append(f"📧 {len(important)} wichtige Mail(s) (letzte Stunde): {', '.join(senders)}") except Exception: pass return alerts def format_report() -> str: """Tagesbericht: Gesamtstatus aller Systeme.""" cfg = config.parse_config() lines = ["📋 Tagesbericht Homelab\n"] containers = proxmox_client.get_all_containers( _get_passwords(cfg), _get_tokens(cfg) ) running = [c for c in containers if c.get("status") == "running"] stopped = [c for c in containers if c.get("status") == "stopped"] errors_ct = [c for c in containers if "error" in c] lines.append(f"Container: {len(running)} running, {len(stopped)} stopped, {len(errors_ct)} nicht erreichbar") errors = loki_client.get_errors(hours=24, limit=100) error_count = len([e for e in errors if "error" not in e]) lines.append(f"Fehler (24h): {error_count}") silent = loki_client.check_silence(minutes=35) if silent and "error" not in (silent[0] if silent else {}): names = [s["host"] for s in silent if s.get("host") not in IGNORED_HOSTS] if names: lines.append(f"Stille Hosts: {', '.join(names)}") else: lines.append("Stille Hosts: keine") else: lines.append("Stille Hosts: keine") alerts = check_all() if alerts: lines.append(f"\n⚠️ {len(alerts)} aktive Alarme:") lines.extend(alerts) else: lines.append("\n✅ Keine Alarme — alles läuft.") return "\n".join(lines) def send_alert(token: str, chat_id: str, message: str): """Sendet eine Nachricht via Telegram.""" requests.post( f"https://api.telegram.org/bot{token}/sendMessage", data={"chat_id": chat_id, "text": message}, timeout=10, ) def run_check_and_alert(): """Hauptfunktion für Cron: prüft und sendet Alerts falls nötig.""" cfg = config.parse_config() token = cfg.raw.get("TG_HAUSMEISTER_TOKEN", "") chat_id = cfg.raw.get("TG_CHAT_ID", "") if not token or not chat_id: return alerts = check_all() if alerts: msg = "🔧 Hausmeister-Check\n\n" + "\n".join(alerts) send_alert(token, chat_id, msg) if __name__ == "__main__": import sys as _sys if len(_sys.argv) > 1 and _sys.argv[1] == "report": print(format_report()) else: run_check_and_alert()