diff --git a/homelab-ai-bot/monitor.py b/homelab-ai-bot/monitor.py index 3552a1ba..8d2850fe 100644 --- a/homelab-ai-bot/monitor.py +++ b/homelab-ai-bot/monitor.py @@ -5,6 +5,7 @@ import os import json import hashlib import requests +import time from datetime import datetime, timezone sys.path.insert(0, os.path.dirname(__file__)) @@ -45,7 +46,8 @@ HTTP_HEALTH_CHECKS = [ {"name": "WordPress (CT 101)", "url": "http://10.10.10.101/robots.txt"}, {"name": "Matomo (CT 113)", "url": "http://10.10.10.113"}, {"name": "Grafana (CT 110)", "url": "http://10.10.10.110:3000"}, - {"name": "Flugscanner-Agent (pve-pp-1)", "url": "http://100.126.26.46:5010/status"}, + {"name": "Flugscanner-Agent (pve-pp-1)", "url": "http://100.126.26.46:5010/status", + "retries": 4, "timeout": 25, "retry_delay": 5}, ] EXPECTED_STOPPED = { @@ -138,12 +140,25 @@ def check_all() -> list[str]: _headers = {"User-Agent": "Mozilla/5.0 (Hausmeister-Bot/1.0 health-check)"} for check in HTTP_HEALTH_CHECKS: - try: - r = requests.head(check["url"], timeout=15, allow_redirects=True, headers=_headers) - if r.status_code >= 400: - alerts.append(f"🔴 {check['name']} antwortet mit HTTP {r.status_code}") - except requests.RequestException as e: - alerts.append(f"🔴 {check['name']} nicht erreichbar: {str(e)[:80]}") + timeout = check.get("timeout", 15) + retries = check.get("retries", 1) + retry_delay = check.get("retry_delay", 3) + msg = None + for attempt in range(retries): + try: + r = requests.head( + check["url"], timeout=timeout, allow_redirects=True, headers=_headers + ) + if r.status_code < 400: + msg = None + break + msg = f"🔴 {check['name']} antwortet mit HTTP {r.status_code}" + except requests.RequestException as e: + msg = f"🔴 {check['name']} nicht erreichbar: {str(e)[:80]}" + if attempt < retries - 1: + time.sleep(retry_delay) + if msg: + alerts.append(msg) restarts = loki_client.check_service_restarts(minutes=35) for r in restarts: