fix: monitor.py wiederhergestellt (war leer nach git reset)
- format_report() und check_all() wieder verfügbar
- _get_passwords() nutzt dynamische PROXMOX_HOSTS statt hardcodiert
- IGNORED_HOSTS-Set für die ${HOSTNAME}-Filterung
Made-with: Cursor
This commit is contained in:
parent
70012112e4
commit
a3e7337d9a
1 changed files with 140 additions and 0 deletions
|
|
@@ -0,0 +1,140 @@
|
|||
"""Proaktives Monitoring — regelbasiert (Stufe 1) + KI (Stufe 2)."""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import requests
|
||||
|
||||
sys.path.insert(0, os.path.dirname(__file__))
|
||||
from core import config, loki_client, proxmox_client
|
||||
|
||||
|
||||
def _get_tokens(cfg):
    """Collect the Proxmox API tokens found in the configuration.

    Reads ``PVE_TOKEN_HETZNER_NAME`` and ``PVE_TOKEN_HETZNER_VALUE`` from
    ``cfg.raw``. Only when both are non-empty is an entry returned, keyed
    ``"pve-hetzner"``; otherwise the result is an empty dict.

    Args:
        cfg: Parsed configuration object exposing a ``raw`` mapping.

    Returns:
        ``{"pve-hetzner": {"name": ..., "value": ...}}`` or ``{}``.
    """
    token_name = cfg.raw.get("PVE_TOKEN_HETZNER_NAME", "")
    token_value = cfg.raw.get("PVE_TOKEN_HETZNER_VALUE", "")

    # A token is only usable when both halves are configured.
    if not (token_name and token_value):
        return {}
    return {"pve-hetzner": {"name": token_name, "value": token_value}}
|
||||
|
||||
|
||||
def _get_passwords(cfg):
    """Map every entry of ``proxmox_client.PROXMOX_HOSTS`` to the default password.

    The password is taken from ``cfg.passwords["default"]`` and falls back
    to an empty string when that key is absent.

    Args:
        cfg: Parsed configuration object exposing a ``passwords`` mapping.

    Returns:
        Dict with one key per item yielded by iterating ``PROXMOX_HOSTS``,
        each mapped to the same password string.
    """
    default_password = cfg.passwords.get("default", "")
    return dict.fromkeys(proxmox_client.PROXMOX_HOSTS, default_password)
|
||||
|
||||
|
||||
# VMIDs of the containers that check_all() raises an alarm for as soon as
# their status is anything other than "running".
CRITICAL_CONTAINERS = [
    101,
    109,
    111,
    112,
    113,
    115,
]

# Host labels excluded from log-based alerts: the empty label and the literal
# "${HOSTNAME}" string (presumably an unexpanded template placeholder in the
# log shipper configuration — TODO confirm).
IGNORED_HOSTS = {
    "",
    "${HOSTNAME}",
}
|
||||
|
||||
|
||||
def check_all() -> list[str]:
    """Run the rule-based check (stage 1) and return the list of alarms.

    Three sources are inspected:

    * Proxmox containers: an alarm for every container listed in
      ``CRITICAL_CONTAINERS`` whose status is not ``"running"``, and a
      warning for every container using more than 90 % of its memory limit.
    * Loki error entries of the last 30 minutes: one alarm naming the hosts
      that logged a panic/fatal/OOM line.
    * Loki silence check: one warning naming the hosts reported as silent
      for 35+ minutes.

    Container and log entries carrying an ``"error"`` key are skipped; a
    silence result whose first entry carries an ``"error"`` key is ignored
    entirely.

    Returns:
        The alarm messages, in the order the checks ran; empty when no rule
        fired.
    """
    cfg = config.parse_config()
    alerts = []

    # --- Containers -------------------------------------------------------
    containers = proxmox_client.get_all_containers(
        _get_passwords(cfg), _get_tokens(cfg)
    )
    for ct in containers:
        if "error" in ct:
            continue
        vmid = ct.get("vmid", 0)
        name = ct.get("name", "?")
        status = ct.get("status", "unknown")
        if vmid in CRITICAL_CONTAINERS and status != "running":
            alerts.append(f"🔴 CT {vmid} ({name}) ist {status}!")

        # Missing or null memory figures count as 0 so the ratio check is
        # skipped instead of dividing by a made-up limit or crashing on None.
        mem = ct.get("mem") or 0
        maxmem = ct.get("maxmem") or 0
        if maxmem > 0 and mem / maxmem > 0.90:
            pct = int(mem / maxmem * 100)
            alerts.append(f"⚠️ CT {vmid} ({name}) RAM bei {pct}%")

    # --- Critical log lines -----------------------------------------------
    errors = loki_client.get_errors(hours=0.5, limit=50)
    keywords = ("panic", "fatal", "oom", "out of memory")
    panic_hosts = set()
    for entry in errors:
        if "error" in entry:
            continue
        line = entry.get("line") or ""
        # Lines containing "query=" or "caller=metrics" are excluded
        # (presumably Loki's own query/metrics logs, which would echo the
        # search terms — TODO confirm).
        if "query=" in line or "caller=metrics" in line:
            continue
        lowered = line.lower()
        if any(word in lowered for word in keywords):
            panic_hosts.add(entry.get("host", "?"))
    panic_hosts -= IGNORED_HOSTS
    panic_hosts.discard(None)
    if panic_hosts:
        # Sorted so the same set of hosts always produces the same message.
        alerts.append(
            f"🔴 Kritische Fehler (panic/fatal/OOM) auf: {', '.join(sorted(panic_hosts))}"
        )

    # --- Silent hosts -----------------------------------------------------
    silent = loki_client.check_silence(minutes=35)
    if silent and "error" not in silent[0]:
        # Entries without a host label are dropped rather than raising
        # KeyError on s["host"].
        names = [
            host
            for host in (s.get("host") for s in silent)
            if host is not None and host not in IGNORED_HOSTS
        ]
        if names:
            alerts.append(f"⚠️ Keine Logs seit 35+ Min: {', '.join(names)}")

    return alerts
|
||||
|
||||
|
||||
def format_report() -> str:
    """Build the daily report: overall status of all systems.

    The report contains, in order: a header, the container counts
    (running / stopped / unreachable, the latter being entries with an
    ``"error"`` key), the number of Loki error entries of the last 24 hours,
    the hosts reported as silent for 35 minutes, and the alarms currently
    produced by ``check_all()``.

    Returns:
        The report as a single newline-joined string.
    """
    cfg = config.parse_config()
    lines = ["📋 Tagesbericht Homelab\n"]

    containers = proxmox_client.get_all_containers(
        _get_passwords(cfg), _get_tokens(cfg)
    )
    running = sum(1 for c in containers if c.get("status") == "running")
    stopped = sum(1 for c in containers if c.get("status") == "stopped")
    unreachable = sum(1 for c in containers if "error" in c)
    lines.append(
        f"Container: {running} running, {stopped} stopped, {unreachable} nicht erreichbar"
    )

    errors = loki_client.get_errors(hours=24, limit=100)
    # Entries with an "error" key are not counted as log errors.
    error_count = sum(1 for e in errors if "error" not in e)
    lines.append(f"Fehler (24h): {error_count}")

    silent = loki_client.check_silence(minutes=35)
    names = []
    if silent and "error" not in silent[0]:
        # Entries without a host label are dropped rather than raising
        # KeyError on s["host"].
        names = [
            host
            for host in (s.get("host") for s in silent)
            if host is not None and host not in IGNORED_HOSTS
        ]
    if names:
        lines.append(f"Stille Hosts: {', '.join(names)}")
    else:
        lines.append("Stille Hosts: keine")

    # check_all() runs its own queries; its alarms are appended verbatim.
    alerts = check_all()
    if alerts:
        lines.append(f"\n⚠️ {len(alerts)} aktive Alarme:")
        lines.extend(alerts)
    else:
        lines.append("\n✅ Keine Alarme — alles läuft.")

    return "\n".join(lines)
|
||||
|
||||
|
||||
def send_alert(token: str, chat_id: str, message: str):
    """Send a message via the Telegram Bot API.

    Issues one POST to the bot's ``sendMessage`` endpoint with a 10 second
    timeout. The HTTP response is not inspected.

    Args:
        token: Bot token, embedded in the request URL.
        chat_id: Target chat, sent as the ``chat_id`` form field.
        message: Message body, sent as the ``text`` form field.
    """
    endpoint = f"https://api.telegram.org/bot{token}/sendMessage"
    payload = {"chat_id": chat_id, "text": message}
    requests.post(endpoint, data=payload, timeout=10)
|
||||
|
||||
|
||||
def run_check_and_alert():
    """Main entry point for cron: run the checks and send alerts if needed.

    Reads ``TG_HAUSMEISTER_TOKEN`` and ``TG_CHAT_ID`` from the parsed
    configuration and returns without doing anything when either is empty.
    Otherwise runs ``check_all()`` and, when it yields alarms, sends them as
    one Telegram message.
    """
    cfg = config.parse_config()
    bot_token = cfg.raw.get("TG_HAUSMEISTER_TOKEN", "")
    target_chat = cfg.raw.get("TG_CHAT_ID", "")

    # Without both credentials there is no way to deliver an alert.
    if not (bot_token and target_chat):
        return

    found = check_all()
    if not found:
        return

    body = "🔧 Hausmeister-Check\n\n" + "\n".join(found)
    send_alert(bot_token, target_chat, body)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    import sys as _sys

    # "report" as the first CLI argument prints the daily report; any other
    # invocation runs the check and sends alerts.
    wants_report = _sys.argv[1:2] == ["report"]
    if not wants_report:
        run_check_and_alert()
    else:
        print(format_report())
|
||||
Loading…
Add table
Reference in a new issue