From 15296da783f9ef6639fd2916fbefe8ca2cd2c657 Mon Sep 17 00:00:00 2001
From: root
Date: Wed, 11 Mar 2026 21:14:06 +0700
Subject: [PATCH] monitoring: limit 200, HTTP-Health-Check, Service-Restart-Erkennung

---
 homelab-ai-bot/core/loki_client.py | 29 ++++++++++++++++++++++++++++-
 homelab-ai-bot/monitor.py          | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 60 insertions(+), 1 deletion(-)

diff --git a/homelab-ai-bot/core/loki_client.py b/homelab-ai-bot/core/loki_client.py
index ca885897..dafaa48c 100644
--- a/homelab-ai-bot/core/loki_client.py
+++ b/homelab-ai-bot/core/loki_client.py
@@ -46,7 +46,7 @@ def query_logs(query: str, hours: float = 1, limit: int = 100) -> list[dict]:
     return entries
 
 
-def get_errors(container: str = None, hours: float = 1, limit: int = 50) -> list[dict]:
+def get_errors(container: str = None, hours: float = 1, limit: int = 200) -> list[dict]:
     """Get error-level logs, optionally filtered by container hostname."""
     if container:
         q = f'{{host="{container}"}} |~ "(?i)(error|fatal|panic|traceback|exception)" !~ "caller=metrics|query_hash=|executing query|scheduler_processor|Aborted connection|systemd-networkd-wait-online|context canceled|AH01630: client denied"'
@@ -109,6 +109,33 @@ def get_health(container: str, hours: float = 24) -> dict:
     }
 
 
+# (Loki host label, service name) pairs checked for start/restart log messages.
+WATCHED_SERVICES = [
+    ("rss-manager", "rss-manager"),
+    ("wordpress-v2", "wordpress"),
+    ("fuenfvoracht", "fuenfvoracht"),
+    ("homelab-ai-bot", "hausmeister"),
+]
+
+
+def check_service_restarts(minutes: int = 35) -> list[dict]:
+    """Find watched services that (re)started within the time window (systemd journal via Loki).
+
+    Runs one Loki query per WATCHED_SERVICES entry, looking back ``minutes`` minutes.
+    Returns one dict per service with matches: {"host", "service", "count"}.
+    Only 5 entries are requested per service, so count presumably never exceeds 5.
+    """
+    restarts = []
+    for host, service_name in WATCHED_SERVICES:
+        q = f'{{host="{host}"}} |~ "(?i)(Started|Restarting|restarted).*{service_name}"'
+        entries = query_logs(q, hours=minutes / 60, limit=5)
+        # Entries carrying an "error" key are not counted (presumably query_logs' failure marker; verify).
+        real = [e for e in entries if "error" not in e]
+        if real:
+            restarts.append({"host": host, "service": service_name, "count": len(real)})
+    return restarts
+
+
 def format_logs(entries: list[dict], max_lines: int = 30) -> str:
     """Format log entries for human/LLM consumption."""
     if not entries:
diff --git a/homelab-ai-bot/monitor.py b/homelab-ai-bot/monitor.py
index 954f9963..f10c9208 100644
--- a/homelab-ai-bot/monitor.py
+++ b/homelab-ai-bot/monitor.py
@@ -16,6 +16,8 @@ ALERT_COOLDOWN_SECONDS = {
     "ram": 1800,
     "panic": 3600,
     "silence": 3600,
+    "http": 1800,
+    "restart": 900,  # NOTE(review): shorter than the 35-min restart look-back; one restart may alert again on later runs (confirm)
     "default": 3600,
 }
 
@@ -36,6 +38,16 @@ def _get_passwords(cfg):
 
 CRITICAL_CONTAINERS = [101, 109, 111, 112, 113, 115]
 
+# URLs probed by check_all(); HTTP status >= 400 or a request error produces an alert.
+HTTP_HEALTH_CHECKS = [
+    {"name": "arakavanews.com", "url": "https://arakavanews.com"},
+    {"name": "matomo.orbitalo.net", "url": "https://matomo.orbitalo.net"},
+    {"name": "grafana.orbitalo.net", "url": "https://grafana.orbitalo.net"},
+]
+
+# Look-back window for restart detection; reused in the alert text so both stay in sync.
+RESTART_WINDOW_MINUTES = 35
+
 EXPECTED_STOPPED = {
     (101, "pp1"),  # yt-desktop-standby — Reserve, absichtlich gestoppt (pp-cluster)
     (101, "pp2"),  # yt-desktop-standby — Reserve, absichtlich gestoppt (pp-cluster)
@@ -99,6 +111,22 @@ def check_all() -> list[str]:
     if names:
         alerts.append(f"⚠️ Keine Logs seit 35+ Min: {', '.join(names)}")
 
+    # HTTP health checks: alert on HTTP >= 400 or on any request failure.
+    for check in HTTP_HEALTH_CHECKS:
+        try:
+            resp = requests.get(check["url"], timeout=10, allow_redirects=True)
+            if resp.status_code >= 400:
+                alerts.append(f"🔴 {check['name']} antwortet mit HTTP {resp.status_code}")
+        except requests.RequestException as e:
+            alerts.append(f"🔴 {check['name']} nicht erreichbar: {str(e)[:80]}")
+
+    # Service restarts reported by Loki within the look-back window.
+    for restart in loki_client.check_service_restarts(minutes=RESTART_WINDOW_MINUTES):
+        alerts.append(
+            f"🔄 Service-Neustart: {restart['service']} auf {restart['host']} "
+            f"({restart['count']}x in {RESTART_WINDOW_MINUTES} Min)"
+        )
+
     try:
         mail_client.init(cfg)
         important = mail_client.get_important_mails(hours=1)
@@ -195,6 +223,10 @@ def _alert_category(alert_text: str) -> str:
         return "panic"
     if "Keine Logs" in alert_text:
         return "silence"
+    if "antwortet mit HTTP" in alert_text or "nicht erreichbar" in alert_text:
+        return "http"
+    if "Service-Neustart" in alert_text:
+        return "restart"
     return "default"
 
 