"""Proaktives Monitoring — regelbasiert (Stufe 1) + KI (Stufe 2)."""
|
|
|
|
import sys
|
|
import os
|
|
import json
|
|
import hashlib
|
|
import requests
|
|
import time
|
|
from datetime import datetime, timezone
|
|
|
|
sys.path.insert(0, os.path.dirname(__file__))
|
|
from core import config, loki_client, proxmox_client, mail_client
|
|
|
|
# Persisted JSON state used for alert deduplication (seen mails, cooldown timestamps).
ALERT_STATE_FILE = "/var/cache/hausmeister-alert-state.json"

# Minimum seconds between repeated alerts of the same category
# (category is derived from the alert text by _alert_category()).
ALERT_COOLDOWN_SECONDS = {
    "container": 1800,       # critical container not running
    "ram": 1800,             # container RAM above threshold
    "panic": 3600,           # panic/fatal/OOM log lines
    "silence": 3600,         # host stopped sending logs
    "http": 1800,            # HTTP health-check failure
    "restart": 900,          # repeated service restarts
    "memory_expiry": 43200,  # memory items expiring soon (12 h)
    "default": 3600,         # fallback for uncategorized alerts
    "error_rate": 1800,      # elevated per-host error rate
}
|
|
|
|
|
|
def _get_tokens(cfg):
|
|
tokens = {}
|
|
tn = cfg.raw.get("PVE_TOKEN_HETZNER_NAME", "")
|
|
tv = cfg.raw.get("PVE_TOKEN_HETZNER_VALUE", "")
|
|
if tn and tv:
|
|
tokens["pve-hetzner"] = {"name": tn, "value": tv}
|
|
return tokens
|
|
|
|
|
|
def _get_passwords(cfg):
    """Map every known Proxmox host to the shared default password."""
    default_pw = cfg.passwords.get("default", "")
    passwords = {}
    for host in proxmox_client.PROXMOX_HOSTS:
        passwords[host] = default_pw
    return passwords
|
|
|
|
|
|
# Containers that must always be running (unless whitelisted below).
CRITICAL_CONTAINERS = [101, 109, 111, 112, 113, 115]

# HTTP endpoints probed on every check; entries may override the default
# retries/timeout/retry_delay (used for the slower Tailscale-reachable agent).
HTTP_HEALTH_CHECKS = [
    {"name": "WordPress (CT 101)", "url": "http://10.10.10.101/robots.txt"},
    {"name": "Matomo (CT 113)", "url": "http://10.10.10.113"},
    {"name": "Grafana (CT 110)", "url": "http://10.10.10.110:3000/api/health"},
    {"name": "Flugscanner-Agent (pve-pp-1)", "url": "http://100.126.26.46:5010/status",
     "retries": 5, "timeout": 25, "retry_delay": 6},
]

# (vmid, host) pairs that are intentionally stopped — never alert on these.
EXPECTED_STOPPED = {
    (115, "pve-ka-1"),   # flugscanner-asia-old (stopped, cluster pve1) — live CT runs on pve-pp-1
    (115, "pve-ka-3"),   # same CT, second API view (cluster)
    (101, "pp1"),        # yt-desktop-standby — spare, intentionally stopped (pp cluster)
    (101, "pp2"),        # yt-desktop-standby — spare, intentionally stopped (pp cluster)
    (504, "pve-ka-2"),   # shop template — stopped
    (8000, "pve-ka-2"),  # Kunde0 shop — stopped
    (8010, "pve-ka-2"),  # Kunde1 shop — stopped
}

# VMIDs that are fine on ANY Proxmox host as long as status == stopped
# (115 may also appear as pve-hetzner etc. depending on API mapping, not just ka-1/ka-3).
EXPECTED_STOPPED_VMIDS = {115, 504, 8000, 8010}

# Hosts to skip entirely: unexpanded shell placeholder and empty labels.
IGNORED_HOSTS = {"${HOSTNAME}", ""}

# Hosts excluded only from the "no logs" (silence) check.
SILENCE_IGNORED_HOSTS = {
    "ct-600-webcam",          # no rsyslog, but its stream is running
    "ct-103-Intercity-Taxi",  # intentionally stopped
    "ct-101-freshrss",        # on pve-ka-3, local Loki (not the central one)
}
|
|
|
|
|
|
def _check_containers(containers):
    """Alert on critical containers that are down and any container above 90% RAM."""
    alerts = []
    for ct in containers:
        if "error" in ct:  # host unreachable — skip its placeholder entry
            continue
        vmid = ct.get("vmid", 0)
        name = ct.get("name", "?")
        status = ct.get("status", "unknown")
        host = ct.get("_host", "")

        if vmid in CRITICAL_CONTAINERS and status != "running":
            # Tolerate intentionally-stopped containers (per (vmid, host) pair or per vmid).
            ok_stopped = vmid in EXPECTED_STOPPED_VMIDS and status == "stopped"
            if (vmid, host) not in EXPECTED_STOPPED and not ok_stopped:
                alerts.append(f"🔴 CT {vmid} ({name}) ist {status}!")

        mem = ct.get("mem", 0)
        maxmem = ct.get("maxmem", 1)
        if maxmem > 0 and mem / maxmem > 0.90:
            pct = int(mem / maxmem * 100)
            alerts.append(f"⚠️ CT {vmid} ({name}) RAM bei {pct}%")
    return alerts


def _check_panic_logs():
    """Scan recent Loki errors for panic/fatal/OOM lines, filtering query/HTTP noise."""
    errors = loki_client.get_errors(hours=0.5, limit=50)
    panic_hosts = set()
    for entry in errors:
        if "error" in entry:  # skip Loki error placeholders
            continue
        line = entry.get("line", "")
        ll = line.lower()
        if not any(w in ll for w in ["panic", "fatal", "oom", "out of memory"]):
            continue
        # Loki's own query/metrics logs mention "fatal" etc. without being incidents.
        if "query=" in line or "caller=metrics" in line:
            continue
        # Successful/redirect access-log lines that merely contain a keyword.
        if "HTTP/1." in line and ('" 200 ' in line or '" 301 ' in line or '" 302 ' in line or '" 304 ' in line):
            continue
        if "GET /" in line or "POST /" in line or "HEAD /" in line:
            continue
        panic_hosts.add(entry.get("host", "?"))
    panic_hosts -= IGNORED_HOSTS
    if panic_hosts:
        # sorted() keeps the alert text deterministic so the md5 cooldown key is stable
        return [f"🔴 Kritische Fehler (panic/fatal/OOM) auf: {', '.join(sorted(panic_hosts))}"]
    return []


def _check_silence(containers):
    """Alert on running containers that have not logged anything for 35+ minutes."""
    running_names = {
        ct.get("name", "").lower()
        for ct in containers
        if "error" not in ct and ct.get("status") == "running"
    }
    silent = loki_client.check_silence(minutes=35)
    if not silent or "error" in silent[0]:
        return []
    names = [
        s["host"] for s in silent
        if s.get("host") not in IGNORED_HOSTS
        and s.get("host") not in SILENCE_IGNORED_HOSTS
        and s["host"].lower() in running_names  # only running CTs are expected to log
    ]
    if names:
        return [f"⚠️ Keine Logs seit 35+ Min: {', '.join(names)}"]
    return []


def _check_http_health():
    """Probe each HTTP health endpoint, with per-check retry/timeout overrides."""
    alerts = []
    headers = {"User-Agent": "Mozilla/5.0 (Hausmeister-Bot/1.0 health-check)"}
    for check in HTTP_HEALTH_CHECKS:
        timeout = check.get("timeout", 15)
        retries = check.get("retries", 1)
        retry_delay = check.get("retry_delay", 3)
        msg = None
        for attempt in range(retries):
            try:
                r = requests.get(
                    check["url"], timeout=timeout, allow_redirects=True, headers=headers
                )
                if r.status_code < 400:
                    msg = None  # healthy — discard any failure from earlier attempts
                    break
                msg = f"🔴 {check['name']} antwortet mit HTTP {r.status_code}"
            except requests.RequestException as e:
                msg = f"🔴 {check['name']} nicht erreichbar: {str(e)[:80]}"
            if attempt < retries - 1:
                time.sleep(retry_delay)
        if msg:
            alerts.append(msg)
    return alerts


def _check_memory_expiry():
    """Warn about temporary memory items expiring within the next 24 h (best-effort)."""
    alerts = []
    try:
        import memory_client  # optional subsystem; local import keeps absence non-fatal
        now_ts = int(time.time())
        for item in memory_client.get_active_memory():
            exp = item.get("expires_at")
            if exp and 0 < exp - now_ts < 86400:
                exp_str = datetime.fromtimestamp(exp).strftime("%d.%m. %H:%M")
                alerts.append(f"⏰ Memory läuft ab ({exp_str}): {item['content'][:80]}")
    except Exception:
        pass  # memory subsystem unavailable — skip silently
    return alerts


def _check_new_mails(cfg):
    """Report important mails not seen before; fingerprints persist in the state file."""
    alerts = []
    try:
        mail_client.init(cfg)
        important = mail_client.get_important_mails(hours=1)
        if important and "error" not in important[0]:
            state = _load_alert_state()
            seen = state.get("seen_mails", {})
            now = datetime.now(timezone.utc).timestamp()

            new_mails = []
            for m in important:
                # Fingerprint on date+sender+subject so re-fetched mails dedupe.
                fp = hashlib.md5(
                    f"{m.get('date_str','')}{m.get('from','')}{m.get('subject','')}".encode()
                ).hexdigest()
                if fp not in seen:
                    new_mails.append(m)
                    seen[fp] = now

            # Forget fingerprints older than 48 h to bound the state file.
            seen = {k: v for k, v in seen.items() if now - v < 172800}
            state["seen_mails"] = seen
            _save_alert_state(state)

            if new_mails:
                senders = [m["from"][:30] for m in new_mails]
                alerts.append(f"📧 {len(new_mails)} neue wichtige Mail(s): {', '.join(senders)}")
    except Exception:
        pass  # mail check is best-effort; never break the main check run
    return alerts


def check_all() -> list[str]:
    """Rule-based check (stage 1). Returns the list of alert messages."""
    cfg = config.parse_config()
    containers = proxmox_client.get_all_containers(
        _get_passwords(cfg), _get_tokens(cfg)
    )

    alerts = []
    alerts += _check_containers(containers)
    alerts += _check_panic_logs()

    # Per-host error-rate thresholds (last 30 minutes).
    for er in loki_client.check_error_rate(minutes=30):
        alerts.append(
            f"🔴 {er['host']}: {er['count']} Fehler in 30 Min (Schwelle: {er['threshold']})"
        )

    alerts += _check_silence(containers)
    alerts += _check_http_health()

    # Services restarting repeatedly within the window.
    for r in loki_client.check_service_restarts(minutes=35):
        alerts.append(f"🔄 Service-Neustart: {r['service']} auf {r['host']} ({r['count']}x in 35 Min)")

    alerts += _check_memory_expiry()
    alerts += _check_new_mails(cfg)
    return alerts
|
|
|
|
|
|
def format_report() -> str:
    """Daily report: overall status of all systems, ending with active alerts."""
    cfg = config.parse_config()
    lines = ["📋 Tagesbericht Homelab\n"]

    # Container summary across all Proxmox hosts.
    containers = proxmox_client.get_all_containers(
        _get_passwords(cfg), _get_tokens(cfg)
    )
    running = [c for c in containers if c.get("status") == "running"]
    stopped = [c for c in containers if c.get("status") == "stopped"]
    errors_ct = [c for c in containers if "error" in c]  # unreachable hosts
    lines.append(f"Container: {len(running)} running, {len(stopped)} stopped, {len(errors_ct)} nicht erreichbar")

    # Error volume over the last 24 h (excluding Loki error placeholders).
    errors = loki_client.get_errors(hours=24, limit=100)
    error_count = len([e for e in errors if "error" not in e])
    lines.append(f"Fehler (24h): {error_count}")

    # Hosts without recent logs. Unlike check_all(), the report intentionally
    # lists all silent hosts (only IGNORED_HOSTS are filtered).
    silent = loki_client.check_silence(minutes=35)
    if silent and "error" not in silent[0]:
        names = [s["host"] for s in silent if s.get("host") not in IGNORED_HOSTS]
        if names:
            lines.append(f"Stille Hosts: {', '.join(names)}")
        else:
            lines.append("Stille Hosts: keine")
    else:
        lines.append("Stille Hosts: keine")

    # Memory subsystem summary (optional — skipped if unavailable).
    try:
        import memory_client
        mem_items = memory_client.get_active_memory()
        perm = [i for i in mem_items if i.get("memory_type") != "temporary"]
        temp = [i for i in mem_items if i.get("memory_type") == "temporary"]
        candidates = memory_client.get_candidates()
        mem_line = f"Memory: {len(perm)} dauerhaft, {len(temp)} temporär"
        now_ts = int(time.time())
        soon = [i for i in temp if i.get("expires_at") and i["expires_at"] - now_ts < 86400]
        if soon:
            mem_line += f", {len(soon)} laufen in 24h ab"
        if candidates:
            mem_line += f", {len(candidates)} Kandidaten offen"
        lines.append(mem_line)
    except Exception:
        pass

    # Append the live alert list (unfiltered by cooldowns).
    alerts = check_all()
    if alerts:
        lines.append(f"\n⚠️ {len(alerts)} aktive Alarme:")
        lines.extend(alerts)
    else:
        lines.append("\n✅ Keine Alarme — alles läuft.")

    return "\n".join(lines)
|
|
|
|
|
|
def _load_alert_state() -> dict:
    """Return the persisted alert state, or {} when the file is missing/corrupt."""
    state = {}
    try:
        with open(ALERT_STATE_FILE, "r") as handle:
            state = json.load(handle)
    except (FileNotFoundError, json.JSONDecodeError):
        pass  # first run or corrupt cache — start fresh
    return state
|
|
|
|
|
|
def _save_alert_state(state: dict):
    """Best-effort persist of the alert state; any failure is swallowed."""
    try:
        serialized = json.dumps(state)
        with open(ALERT_STATE_FILE, "w") as handle:
            handle.write(serialized)
    except Exception:
        pass  # cache write is non-critical — never break the check run
|
|
|
|
|
|
def _alert_key(alert_text: str) -> str:
|
|
return hashlib.md5(alert_text.encode()).hexdigest()
|
|
|
|
|
|
def _alert_category(alert_text: str) -> str:
|
|
if "CT " in alert_text and "ist " in alert_text:
|
|
return "container"
|
|
if "RAM " in alert_text:
|
|
return "ram"
|
|
if "panic" in alert_text.lower() or "fatal" in alert_text.lower():
|
|
return "panic"
|
|
if "Fehler in 30 Min" in alert_text:
|
|
return "error_rate"
|
|
if "Keine Logs" in alert_text:
|
|
return "silence"
|
|
if "antwortet mit HTTP" in alert_text or "nicht erreichbar" in alert_text:
|
|
return "http"
|
|
if "Service-Neustart" in alert_text:
|
|
return "restart"
|
|
if "Memory läuft ab" in alert_text:
|
|
return "memory_expiry"
|
|
return "default"
|
|
|
|
|
|
def _filter_new_alerts(alerts: list[str]) -> list[str]:
    """Drop alerts still inside their per-category cooldown window.

    Mail alerts are deduplicated separately in check_all().
    """
    state = _load_alert_state()
    cooldowns = state.get("alert_cooldowns", {})
    now = datetime.now(timezone.utc).timestamp()

    fresh = []
    for text in alerts:
        key = _alert_key(text)
        category = _alert_category(text)
        window = ALERT_COOLDOWN_SECONDS.get(category, 3600)
        last_sent = cooldowns.get(key, {}).get("ts", 0)
        if now - last_sent > window:
            fresh.append(text)
            cooldowns[key] = {"ts": now, "text": text[:80], "cat": category}

    # Prune cooldown entries older than 24 h to keep the state file small.
    cutoff = now - 86400
    state["alert_cooldowns"] = {
        k: v for k, v in cooldowns.items() if v.get("ts", 0) > cutoff
    }
    _save_alert_state(state)
    return fresh
|
|
|
|
|
|
def send_alert(token: str, chat_id: str, message: str):
    """Deliver a message to the configured chat via the Telegram Bot API."""
    api_url = f"https://api.telegram.org/bot{token}/sendMessage"
    payload = {"chat_id": chat_id, "text": message}
    requests.post(api_url, data=payload, timeout=10)
|
|
|
|
|
|
def run_check_and_alert():
    """Cron entry point: run all checks and push new (non-cooldown) alerts to Telegram."""
    cfg = config.parse_config()
    token = cfg.raw.get("TG_HAUSMEISTER_TOKEN", "")
    chat_id = cfg.raw.get("TG_CHAT_ID", "")
    if not (token and chat_id):
        return  # Telegram not configured — nothing to deliver

    new_alerts = _filter_new_alerts(check_all())
    if new_alerts:
        body = "🔧 Hausmeister-Check\n\n" + "\n".join(new_alerts)
        send_alert(token, chat_id, body)
|
|
|
|
|
|
if __name__ == "__main__":
    # CLI: `report` prints the daily summary; default (cron) runs check-and-alert.
    # Uses the module-level `sys` import; the former local `import sys as _sys`
    # was redundant.
    if len(sys.argv) > 1 and sys.argv[1] == "report":
        print(format_report())
    else:
        run_check_and_alert()
|