"""Proaktives Monitoring — regelbasiert (Stufe 1) + KI (Stufe 2)."""
|
|
|
|
import sys
|
|
import os
|
|
import json
|
|
import hashlib
|
|
import requests
|
|
import time
|
|
from datetime import datetime, timezone
|
|
|
|
sys.path.insert(0, os.path.dirname(__file__))
|
|
from core import config, loki_client, proxmox_client, mail_client
|
|
|
|
# Persisted JSON state used for alert deduplication (seen mails, cooldown timestamps).
ALERT_STATE_FILE = "/var/cache/hausmeister-alert-state.json"

# Minimum seconds between repeated alerts of the same category
# (category is derived from the alert text by _alert_category()).
ALERT_COOLDOWN_SECONDS = {
    "container": 1800,       # critical container not running
    "ram": 1800,             # container RAM above threshold
    "panic": 3600,           # panic/fatal/OOM log lines
    "silence": 3600,         # host stopped sending logs
    "http": 1800,            # HTTP health-check failure
    "restart": 900,          # repeated service restarts
    "memory_expiry": 43200,  # memory items expiring soon (12 h)
    "default": 3600,         # fallback for uncategorized alerts
    "error_rate": 1800,      # elevated per-host error rate
}
|
|
|
|
|
|
def _get_tokens(cfg):
|
|
tokens = {}
|
|
tn = cfg.raw.get("PVE_TOKEN_HETZNER_NAME", "")
|
|
tv = cfg.raw.get("PVE_TOKEN_HETZNER_VALUE", "")
|
|
if tn and tv:
|
|
tokens["pve-hetzner"] = {"name": tn, "value": tv}
|
|
return tokens
|
|
|
|
|
|
def _get_passwords(cfg):
    """Map every known Proxmox host to the shared default password."""
    default_pw = cfg.passwords.get("default", "")
    passwords = {}
    for host in proxmox_client.PROXMOX_HOSTS:
        passwords[host] = default_pw
    return passwords
|
|
|
|
|
|
# Containers that must always be running (unless whitelisted below).
CRITICAL_CONTAINERS = [101, 109, 111, 112, 113, 115]

# HTTP endpoints probed on every check; entries may override the default
# retries/timeout/retry_delay (used for the slower Tailscale-reachable agent).
HTTP_HEALTH_CHECKS = [
    {"name": "WordPress (CT 101)", "url": "http://10.10.10.101/robots.txt"},
    {"name": "Matomo (CT 113)", "url": "http://10.10.10.113"},
    {"name": "Grafana (CT 110)", "url": "http://10.10.10.110:3000/api/health"},
    {"name": "Flugscanner-Agent (pve-pp-1)", "url": "http://100.126.26.46:5010/status",
     "retries": 5, "timeout": 25, "retry_delay": 6},
]

# (vmid, host) pairs that are intentionally stopped — never alert on these.
EXPECTED_STOPPED = {
    (115, "pve-ka-1"),   # flugscanner-asia-old (stopped, cluster pve1) — live CT runs on pve-pp-1
    (115, "pve-ka-3"),   # same CT, second API view (cluster)
    (101, "pp1"),        # yt-desktop-standby — spare, intentionally stopped (pp cluster)
    (101, "pp2"),        # yt-desktop-standby — spare, intentionally stopped (pp cluster)
    (504, "pve-ka-2"),   # shop template — stopped
    (8000, "pve-ka-2"),  # Kunde0 shop — stopped
    (8010, "pve-ka-2"),  # Kunde1 shop — stopped
}

# VMIDs that are fine on ANY Proxmox host as long as status == stopped
# (115 may also appear as pve-hetzner etc. depending on API mapping, not just ka-1/ka-3).
EXPECTED_STOPPED_VMIDS = {115, 504, 8000, 8010}

# Hosts to skip entirely: unexpanded shell placeholder and empty labels.
IGNORED_HOSTS = {"${HOSTNAME}", ""}

# Hosts excluded only from the "no logs" (silence) check.
SILENCE_IGNORED_HOSTS = {
    "ct-600-webcam",          # no rsyslog, but its stream is running
    "ct-103-Intercity-Taxi",  # intentionally stopped
    "ct-101-freshrss",        # on pve-ka-3, local Loki (not the central one)
}
|
|
|
|
|
|
def _check_containers(containers):
    """Alert on critical containers that are down and any container above 90% RAM."""
    alerts = []
    for ct in containers:
        if "error" in ct:  # host unreachable — skip its placeholder entry
            continue
        vmid = ct.get("vmid", 0)
        name = ct.get("name", "?")
        status = ct.get("status", "unknown")
        host = ct.get("_host", "")

        if vmid in CRITICAL_CONTAINERS and status != "running":
            # Tolerate intentionally-stopped containers (per (vmid, host) pair or per vmid).
            ok_stopped = vmid in EXPECTED_STOPPED_VMIDS and status == "stopped"
            if (vmid, host) not in EXPECTED_STOPPED and not ok_stopped:
                alerts.append(f"🔴 CT {vmid} ({name}) ist {status}!")

        mem = ct.get("mem", 0)
        maxmem = ct.get("maxmem", 1)
        if maxmem > 0 and mem / maxmem > 0.90:
            pct = int(mem / maxmem * 100)
            alerts.append(f"⚠️ CT {vmid} ({name}) RAM bei {pct}%")
    return alerts


def _check_panic_logs():
    """Scan recent Loki errors for panic/fatal/OOM lines, filtering query/HTTP noise."""
    errors = loki_client.get_errors(hours=0.5, limit=50)
    panic_hosts = set()
    for entry in errors:
        if "error" in entry:  # skip Loki error placeholders
            continue
        line = entry.get("line", "")
        ll = line.lower()
        if not any(w in ll for w in ["panic", "fatal", "oom", "out of memory"]):
            continue
        # Loki's own query/metrics logs mention "fatal" etc. without being incidents.
        if "query=" in line or "caller=metrics" in line:
            continue
        # Successful/redirect access-log lines that merely contain a keyword.
        if "HTTP/1." in line and ('" 200 ' in line or '" 301 ' in line or '" 302 ' in line or '" 304 ' in line):
            continue
        if "GET /" in line or "POST /" in line or "HEAD /" in line:
            continue
        panic_hosts.add(entry.get("host", "?"))
    panic_hosts -= IGNORED_HOSTS
    if panic_hosts:
        # sorted() keeps the alert text deterministic so the md5 cooldown key is stable
        return [f"🔴 Kritische Fehler (panic/fatal/OOM) auf: {', '.join(sorted(panic_hosts))}"]
    return []


def _check_silence(containers):
    """Alert on running containers that have not logged anything for 35+ minutes."""
    running_names = {
        ct.get("name", "").lower()
        for ct in containers
        if "error" not in ct and ct.get("status") == "running"
    }
    silent = loki_client.check_silence(minutes=35)
    if not silent or "error" in silent[0]:
        return []
    names = [
        s["host"] for s in silent
        if s.get("host") not in IGNORED_HOSTS
        and s.get("host") not in SILENCE_IGNORED_HOSTS
        and s["host"].lower() in running_names  # only running CTs are expected to log
    ]
    if names:
        return [f"⚠️ Keine Logs seit 35+ Min: {', '.join(names)}"]
    return []


def _check_http_health():
    """Probe each HTTP health endpoint, with per-check retry/timeout overrides."""
    alerts = []
    headers = {"User-Agent": "Mozilla/5.0 (Hausmeister-Bot/1.0 health-check)"}
    for check in HTTP_HEALTH_CHECKS:
        timeout = check.get("timeout", 15)
        retries = check.get("retries", 1)
        retry_delay = check.get("retry_delay", 3)
        msg = None
        for attempt in range(retries):
            try:
                r = requests.get(
                    check["url"], timeout=timeout, allow_redirects=True, headers=headers
                )
                if r.status_code < 400:
                    msg = None  # healthy — discard any failure from earlier attempts
                    break
                msg = f"🔴 {check['name']} antwortet mit HTTP {r.status_code}"
            except requests.RequestException as e:
                msg = f"🔴 {check['name']} nicht erreichbar: {str(e)[:80]}"
            if attempt < retries - 1:
                time.sleep(retry_delay)
        if msg:
            alerts.append(msg)
    return alerts


def _check_memory_expiry():
    """Warn about temporary memory items expiring within the next 24 h (best-effort)."""
    alerts = []
    try:
        import memory_client  # optional subsystem; local import keeps absence non-fatal
        now_ts = int(time.time())
        for item in memory_client.get_active_memory():
            exp = item.get("expires_at")
            if exp and 0 < exp - now_ts < 86400:
                exp_str = datetime.fromtimestamp(exp).strftime("%d.%m. %H:%M")
                alerts.append(f"⏰ Memory läuft ab ({exp_str}): {item['content'][:80]}")
    except Exception:
        pass  # memory subsystem unavailable — skip silently
    return alerts


def _check_new_mails(cfg):
    """Report important mails not seen before; fingerprints persist in the state file."""
    alerts = []
    try:
        mail_client.init(cfg)
        important = mail_client.get_important_mails(hours=1)
        if important and "error" not in important[0]:
            state = _load_alert_state()
            seen = state.get("seen_mails", {})
            now = datetime.now(timezone.utc).timestamp()

            new_mails = []
            for m in important:
                # Fingerprint on date+sender+subject so re-fetched mails dedupe.
                fp = hashlib.md5(
                    f"{m.get('date_str','')}{m.get('from','')}{m.get('subject','')}".encode()
                ).hexdigest()
                if fp not in seen:
                    new_mails.append(m)
                    seen[fp] = now

            # Forget fingerprints older than 48 h to bound the state file.
            seen = {k: v for k, v in seen.items() if now - v < 172800}
            state["seen_mails"] = seen
            _save_alert_state(state)

            if new_mails:
                senders = [m["from"][:30] for m in new_mails]
                alerts.append(f"📧 {len(new_mails)} neue wichtige Mail(s): {', '.join(senders)}")
    except Exception:
        pass  # mail check is best-effort; never break the main check run
    return alerts


def check_all() -> list[str]:
    """Rule-based check (stage 1). Returns the list of alert messages."""
    cfg = config.parse_config()
    containers = proxmox_client.get_all_containers(
        _get_passwords(cfg), _get_tokens(cfg)
    )

    alerts = []
    alerts += _check_containers(containers)
    alerts += _check_panic_logs()

    # Per-host error-rate thresholds (last 30 minutes).
    for er in loki_client.check_error_rate(minutes=30):
        alerts.append(
            f"🔴 {er['host']}: {er['count']} Fehler in 30 Min (Schwelle: {er['threshold']})"
        )

    alerts += _check_silence(containers)
    alerts += _check_http_health()

    # Services restarting repeatedly within the window.
    for r in loki_client.check_service_restarts(minutes=35):
        alerts.append(f"🔄 Service-Neustart: {r['service']} auf {r['host']} ({r['count']}x in 35 Min)")

    alerts += _check_memory_expiry()
    alerts += _check_new_mails(cfg)
    return alerts
|
|
|
|
|
|
def format_report() -> str:
    """Daily report: overall status of all systems, ending with active alerts."""
    cfg = config.parse_config()
    lines = ["📋 Tagesbericht Homelab\n"]

    # Container summary across all Proxmox hosts.
    containers = proxmox_client.get_all_containers(
        _get_passwords(cfg), _get_tokens(cfg)
    )
    running = [c for c in containers if c.get("status") == "running"]
    stopped = [c for c in containers if c.get("status") == "stopped"]
    errors_ct = [c for c in containers if "error" in c]  # unreachable hosts
    lines.append(f"Container: {len(running)} running, {len(stopped)} stopped, {len(errors_ct)} nicht erreichbar")

    # Error volume over the last 24 h (excluding Loki error placeholders).
    errors = loki_client.get_errors(hours=24, limit=100)
    error_count = len([e for e in errors if "error" not in e])
    lines.append(f"Fehler (24h): {error_count}")

    # Hosts without recent logs. Unlike check_all(), the report intentionally
    # lists all silent hosts (only IGNORED_HOSTS are filtered).
    silent = loki_client.check_silence(minutes=35)
    if silent and "error" not in silent[0]:
        names = [s["host"] for s in silent if s.get("host") not in IGNORED_HOSTS]
        if names:
            lines.append(f"Stille Hosts: {', '.join(names)}")
        else:
            lines.append("Stille Hosts: keine")
    else:
        lines.append("Stille Hosts: keine")

    # Memory subsystem summary (optional — skipped if unavailable).
    try:
        import memory_client
        mem_items = memory_client.get_active_memory()
        perm = [i for i in mem_items if i.get("memory_type") != "temporary"]
        temp = [i for i in mem_items if i.get("memory_type") == "temporary"]
        candidates = memory_client.get_candidates()
        mem_line = f"Memory: {len(perm)} dauerhaft, {len(temp)} temporär"
        now_ts = int(time.time())
        soon = [i for i in temp if i.get("expires_at") and i["expires_at"] - now_ts < 86400]
        if soon:
            mem_line += f", {len(soon)} laufen in 24h ab"
        if candidates:
            mem_line += f", {len(candidates)} Kandidaten offen"
        lines.append(mem_line)
    except Exception:
        pass

    # Append the live alert list (unfiltered by cooldowns).
    alerts = check_all()
    if alerts:
        lines.append(f"\n⚠️ {len(alerts)} aktive Alarme:")
        lines.extend(alerts)
    else:
        lines.append("\n✅ Keine Alarme — alles läuft.")

    return "\n".join(lines)
|
|
|
|
|
|
def _load_alert_state() -> dict:
    """Return the persisted alert state, or {} when the file is missing/corrupt."""
    state = {}
    try:
        with open(ALERT_STATE_FILE, "r") as handle:
            state = json.load(handle)
    except (FileNotFoundError, json.JSONDecodeError):
        pass  # first run or corrupt cache — start fresh
    return state
|
|
|
|
|
|
def _save_alert_state(state: dict):
    """Best-effort persist of the alert state; any failure is swallowed."""
    try:
        serialized = json.dumps(state)
        with open(ALERT_STATE_FILE, "w") as handle:
            handle.write(serialized)
    except Exception:
        pass  # cache write is non-critical — never break the check run
|
|
|
|
|
|
def _alert_key(alert_text: str) -> str:
|
|
return hashlib.md5(alert_text.encode()).hexdigest()
|
|
|
|
|
|
def _alert_category(alert_text: str) -> str:
|
|
if "CT " in alert_text and "ist " in alert_text:
|
|
return "container"
|
|
if "RAM " in alert_text:
|
|
return "ram"
|
|
if "panic" in alert_text.lower() or "fatal" in alert_text.lower():
|
|
return "panic"
|
|
if "Fehler in 30 Min" in alert_text:
|
|
return "error_rate"
|
|
if "Keine Logs" in alert_text:
|
|
return "silence"
|
|
if "antwortet mit HTTP" in alert_text or "nicht erreichbar" in alert_text:
|
|
return "http"
|
|
if "Service-Neustart" in alert_text:
|
|
return "restart"
|
|
if "Memory läuft ab" in alert_text:
|
|
return "memory_expiry"
|
|
return "default"
|
|
|
|
|
|
def _filter_new_alerts(alerts: list[str]) -> list[str]:
    """Drop alerts still inside their per-category cooldown window.

    Mail alerts are deduplicated separately in check_all().
    """
    state = _load_alert_state()
    cooldowns = state.get("alert_cooldowns", {})
    now = datetime.now(timezone.utc).timestamp()

    fresh = []
    for text in alerts:
        key = _alert_key(text)
        category = _alert_category(text)
        window = ALERT_COOLDOWN_SECONDS.get(category, 3600)
        last_sent = cooldowns.get(key, {}).get("ts", 0)
        if now - last_sent > window:
            fresh.append(text)
            cooldowns[key] = {"ts": now, "text": text[:80], "cat": category}

    # Prune cooldown entries older than 24 h to keep the state file small.
    cutoff = now - 86400
    state["alert_cooldowns"] = {
        k: v for k, v in cooldowns.items() if v.get("ts", 0) > cutoff
    }
    _save_alert_state(state)
    return fresh
|
|
|
|
|
|
def send_alert(token: str, chat_id: str, message: str):
    """Deliver a message to the configured chat via the Telegram Bot API."""
    api_url = f"https://api.telegram.org/bot{token}/sendMessage"
    payload = {"chat_id": chat_id, "text": message}
    requests.post(api_url, data=payload, timeout=10)
|
|
|
|
|
|
def run_check_and_alert():
    """Cron entry point: run all checks and push new (non-cooldown) alerts to Telegram."""
    cfg = config.parse_config()
    token = cfg.raw.get("TG_HAUSMEISTER_TOKEN", "")
    chat_id = cfg.raw.get("TG_CHAT_ID", "")
    if not (token and chat_id):
        return  # Telegram not configured — nothing to deliver

    new_alerts = _filter_new_alerts(check_all())
    if new_alerts:
        body = "🔧 Hausmeister-Check\n\n" + "\n".join(new_alerts)
        send_alert(token, chat_id, body)
|
|
|
|
|
|
if __name__ == "__main__":
    # CLI: `report` prints the daily summary; default (cron) runs check-and-alert.
    # Uses the module-level `sys` import; the former local `import sys as _sys`
    # was redundant.
    if len(sys.argv) > 1 and sys.argv[1] == "report":
        print(format_report())
    else:
        run_check_and_alert()
|