"""Proaktives Monitoring — regelbasiert (Stufe 1) + KI (Stufe 2).""" import sys import os import json import hashlib import requests import time from datetime import datetime, timezone sys.path.insert(0, os.path.dirname(__file__)) from core import config, loki_client, proxmox_client, mail_client ALERT_STATE_FILE = "/var/cache/hausmeister-alert-state.json" ALERT_COOLDOWN_SECONDS = { "container": 1800, "ram": 1800, "panic": 3600, "silence": 3600, "http": 1800, "restart": 900, "memory_expiry": 43200, "default": 3600, "error_rate": 1800, } def _get_tokens(cfg): tokens = {} tn = cfg.raw.get("PVE_TOKEN_HETZNER_NAME", "") tv = cfg.raw.get("PVE_TOKEN_HETZNER_VALUE", "") if tn and tv: tokens["pve-hetzner"] = {"name": tn, "value": tv} return tokens def _get_passwords(cfg): pw = cfg.passwords.get("default", "") return {host: pw for host in proxmox_client.PROXMOX_HOSTS} CRITICAL_CONTAINERS = [101, 109, 111, 112, 113, 115] HTTP_HEALTH_CHECKS = [ {"name": "WordPress (CT 101)", "url": "http://10.10.10.101/robots.txt"}, {"name": "Matomo (CT 113)", "url": "http://10.10.10.113"}, {"name": "Grafana (CT 110)", "url": "http://10.10.10.110:3000/api/health"}, {"name": "Flugscanner-Agent (pve-pp-1)", "url": "http://100.126.26.46:5010/status", "retries": 5, "timeout": 25, "retry_delay": 6}, ] EXPECTED_STOPPED = { (115, "pve-ka-1"), # flugscanner-asia-old (gestoppt, Cluster pve1) — Live CT auf pve-pp-1 (115, "pve-ka-3"), # dieselbe CT, zweite API-Sicht (Cluster) (101, "pp1"), # yt-desktop-standby — Reserve, absichtlich gestoppt (pp-cluster) (101, "pp2"), # yt-desktop-standby — Reserve, absichtlich gestoppt (pp-cluster) (504, "pve-ka-2"), # Shop-Template — stopped (8000, "pve-ka-2"), # Kunde0-Shop — stopped (8010, "pve-ka-2"), # Kunde1-Shop — stopped } # VMIDs, die auf jedem Proxmox-Host in CONFIG ok sind, solange status == stopped # (115 erscheint je nach API-Zuordnung auch als pve-hetzner o.ä., nicht nur ka-1/ka-3) EXPECTED_STOPPED_VMIDS = {115, 504, 8000, 8010} IGNORED_HOSTS = {"${HOSTNAME}", ""} SILENCE_IGNORED_HOSTS = { "ct-600-webcam", # kein rsyslog, Stream läuft aber "ct-103-Intercity-Taxi", # absichtlich gestoppt "ct-101-freshrss", # auf pve-ka-3, lokale Loki (nicht zentral) } def check_all() -> list[str]: """Regelbasierter Check (Stufe 1). Gibt Liste von Alarmen zurück.""" cfg = config.parse_config() alerts = [] containers = proxmox_client.get_all_containers( _get_passwords(cfg), _get_tokens(cfg) ) for ct in containers: if "error" in ct: continue vmid = ct.get("vmid", 0) name = ct.get("name", "?") status = ct.get("status", "unknown") host = ct.get("_host", "") if vmid in CRITICAL_CONTAINERS and status != "running": ok_stopped = vmid in EXPECTED_STOPPED_VMIDS and status == "stopped" if (vmid, host) not in EXPECTED_STOPPED and not ok_stopped: alerts.append(f"🔴 CT {vmid} ({name}) ist {status}!") mem = ct.get("mem", 0) maxmem = ct.get("maxmem", 1) if maxmem > 0 and mem / maxmem > 0.90: pct = int(mem / maxmem * 100) alerts.append(f"⚠️ CT {vmid} ({name}) RAM bei {pct}%") errors = loki_client.get_errors(hours=0.5, limit=50) error_lines = [e for e in errors if "error" not in e] panic_lines = [] for e in error_lines: line = e.get("line", "") ll = line.lower() if not any(w in ll for w in ["panic", "fatal", "oom", "out of memory"]): continue if "query=" in line or "caller=metrics" in line: continue if "HTTP/1." in line and ('" 200 ' in line or '" 301 ' in line or '" 302 ' in line or '" 304 ' in line): continue if "GET /" in line or "POST /" in line or "HEAD /" in line: continue panic_lines.append(e) if panic_lines: hosts = set(e.get("host", "?") for e in panic_lines) hosts -= IGNORED_HOSTS if hosts: alerts.append(f"🔴 Kritische Fehler (panic/fatal/OOM) auf: {', '.join(hosts)}") error_rates = loki_client.check_error_rate(minutes=30) for er in error_rates: alerts.append( f"🔴 {er['host']}: {er['count']} Fehler in 30 Min (Schwelle: {er['threshold']})" ) running_names = { ct.get("name", "").lower() for ct in containers if "error" not in ct and ct.get("status") == "running" } silent = loki_client.check_silence(minutes=35) if silent and "error" not in silent[0]: names = [ s["host"] for s in silent if s.get("host") not in IGNORED_HOSTS and s.get("host") not in SILENCE_IGNORED_HOSTS and s["host"].lower() in running_names ] if names: alerts.append(f"⚠️ Keine Logs seit 35+ Min: {', '.join(names)}") _headers = {"User-Agent": "Mozilla/5.0 (Hausmeister-Bot/1.0 health-check)"} for check in HTTP_HEALTH_CHECKS: timeout = check.get("timeout", 15) retries = check.get("retries", 1) retry_delay = check.get("retry_delay", 3) msg = None for attempt in range(retries): try: r = requests.head( check["url"], timeout=timeout, allow_redirects=True, headers=_headers ) if r.status_code < 400: msg = None break msg = f"🔴 {check['name']} antwortet mit HTTP {r.status_code}" except requests.RequestException as e: msg = f"🔴 {check['name']} nicht erreichbar: {str(e)[:80]}" if attempt < retries - 1: time.sleep(retry_delay) if msg: alerts.append(msg) restarts = loki_client.check_service_restarts(minutes=35) for r in restarts: alerts.append(f"🔄 Service-Neustart: {r['service']} auf {r['host']} ({r['count']}x in 35 Min)") try: import memory_client import time as _time now_ts = int(_time.time()) mem_items = memory_client.get_active_memory() for item in mem_items: exp = item.get("expires_at") if exp and 0 < exp - now_ts < 86400: from datetime import datetime as _dt exp_str = _dt.fromtimestamp(exp).strftime("%d.%m. %H:%M") alerts.append(f"⏰ Memory läuft ab ({exp_str}): {item['content'][:80]}") except Exception: pass try: mail_client.init(cfg) important = mail_client.get_important_mails(hours=1) if important and "error" not in important[0]: state = _load_alert_state() seen = state.get("seen_mails", {}) now = datetime.now(timezone.utc).timestamp() new_mails = [] for m in important: fp = hashlib.md5( f"{m.get('date_str','')}{m.get('from','')}{m.get('subject','')}".encode() ).hexdigest() if fp not in seen: new_mails.append(m) seen[fp] = now seen = {k: v for k, v in seen.items() if now - v < 172800} state["seen_mails"] = seen _save_alert_state(state) if new_mails: senders = [m["from"][:30] for m in new_mails] alerts.append(f"📧 {len(new_mails)} neue wichtige Mail(s): {', '.join(senders)}") except Exception: pass return alerts def format_report() -> str: """Tagesbericht: Gesamtstatus aller Systeme.""" cfg = config.parse_config() lines = ["📋 Tagesbericht Homelab\n"] containers = proxmox_client.get_all_containers( _get_passwords(cfg), _get_tokens(cfg) ) running = [c for c in containers if c.get("status") == "running"] stopped = [c for c in containers if c.get("status") == "stopped"] errors_ct = [c for c in containers if "error" in c] lines.append(f"Container: {len(running)} running, {len(stopped)} stopped, {len(errors_ct)} nicht erreichbar") errors = loki_client.get_errors(hours=24, limit=100) error_count = len([e for e in errors if "error" not in e]) lines.append(f"Fehler (24h): {error_count}") silent = loki_client.check_silence(minutes=35) if silent and "error" not in (silent[0] if silent else {}): names = [s["host"] for s in silent if s.get("host") not in IGNORED_HOSTS] if names: lines.append(f"Stille Hosts: {', '.join(names)}") else: lines.append("Stille Hosts: keine") else: lines.append("Stille Hosts: keine") try: import memory_client mem_items = memory_client.get_active_memory() perm = [i for i in mem_items if i.get("memory_type") != "temporary"] temp = [i for i in mem_items if i.get("memory_type") == "temporary"] candidates = memory_client.get_candidates() mem_line = f"Memory: {len(perm)} dauerhaft, {len(temp)} temporär" import time as _time now_ts = int(_time.time()) soon = [i for i in temp if i.get("expires_at") and i["expires_at"] - now_ts < 86400] if soon: mem_line += f", {len(soon)} laufen in 24h ab" if candidates: mem_line += f", {len(candidates)} Kandidaten offen" lines.append(mem_line) except Exception: pass alerts = check_all() if alerts: lines.append(f"\n⚠️ {len(alerts)} aktive Alarme:") lines.extend(alerts) else: lines.append("\n✅ Keine Alarme — alles läuft.") return "\n".join(lines) def _load_alert_state() -> dict: try: with open(ALERT_STATE_FILE, "r") as f: return json.load(f) except (FileNotFoundError, json.JSONDecodeError): return {} def _save_alert_state(state: dict): try: with open(ALERT_STATE_FILE, "w") as f: json.dump(state, f) except Exception: pass def _alert_key(alert_text: str) -> str: return hashlib.md5(alert_text.encode()).hexdigest() def _alert_category(alert_text: str) -> str: if "CT " in alert_text and "ist " in alert_text: return "container" if "RAM " in alert_text: return "ram" if "panic" in alert_text.lower() or "fatal" in alert_text.lower(): return "panic" if "Fehler in 30 Min" in alert_text: return "error_rate" if "Keine Logs" in alert_text: return "silence" if "antwortet mit HTTP" in alert_text or "nicht erreichbar" in alert_text: return "http" if "Service-Neustart" in alert_text: return "restart" if "Memory läuft ab" in alert_text: return "memory_expiry" return "default" def _filter_new_alerts(alerts: list[str]) -> list[str]: """Filtert bereits gemeldete Infra-Alerts per Cooldown. Mails werden separat in check_all() dedupliziert.""" state = _load_alert_state() now = datetime.now(timezone.utc).timestamp() cooldowns = state.get("alert_cooldowns", {}) new_alerts = [] for alert in alerts: key = _alert_key(alert) cat = _alert_category(alert) cooldown = ALERT_COOLDOWN_SECONDS.get(cat, 3600) last_sent = cooldowns.get(key, {}).get("ts", 0) if now - last_sent > cooldown: new_alerts.append(alert) cooldowns[key] = {"ts": now, "text": alert[:80], "cat": cat} cutoff = now - 86400 cooldowns = {k: v for k, v in cooldowns.items() if v.get("ts", 0) > cutoff} state["alert_cooldowns"] = cooldowns _save_alert_state(state) return new_alerts def send_alert(token: str, chat_id: str, message: str): """Sendet eine Nachricht via Telegram.""" requests.post( f"https://api.telegram.org/bot{token}/sendMessage", data={"chat_id": chat_id, "text": message}, timeout=10, ) def run_check_and_alert(): """Hauptfunktion für Cron: prüft und sendet Alerts falls nötig.""" cfg = config.parse_config() token = cfg.raw.get("TG_HAUSMEISTER_TOKEN", "") chat_id = cfg.raw.get("TG_CHAT_ID", "") if not token or not chat_id: return alerts = check_all() new_alerts = _filter_new_alerts(alerts) if new_alerts: msg = "🔧 Hausmeister-Check\n\n" + "\n".join(new_alerts) send_alert(token, chat_id, msg) if __name__ == "__main__": import sys as _sys if len(_sys.argv) > 1 and _sys.argv[1] == "report": print(format_report()) else: run_check_and_alert()