diff --git a/homelab-ai-bot/__pycache__/monitor.cpython-311.pyc b/homelab-ai-bot/__pycache__/monitor.cpython-311.pyc index a167b0ca..aa7192d3 100644 Binary files a/homelab-ai-bot/__pycache__/monitor.cpython-311.pyc and b/homelab-ai-bot/__pycache__/monitor.cpython-311.pyc differ diff --git a/homelab-ai-bot/core/__pycache__/loki_client.cpython-311.pyc b/homelab-ai-bot/core/__pycache__/loki_client.cpython-311.pyc index f6bd94e7..678f028c 100644 Binary files a/homelab-ai-bot/core/__pycache__/loki_client.cpython-311.pyc and b/homelab-ai-bot/core/__pycache__/loki_client.cpython-311.pyc differ diff --git a/homelab-ai-bot/core/loki_client.py b/homelab-ai-bot/core/loki_client.py index 7ce9d416..83e867ab 100644 --- a/homelab-ai-bot/core/loki_client.py +++ b/homelab-ai-bot/core/loki_client.py @@ -156,6 +156,33 @@ def check_service_restarts(minutes: int = 35) -> list[dict]: return restarts + +ERROR_RATE_THRESHOLDS = { + "rss-manager": 15, + "wordpress-v2": 10, +} +ERROR_RATE_DEFAULT = 25 + + +def check_error_rate(minutes: int = 30) -> list[dict]: + """Check if any host exceeds its error-rate threshold within the window.""" + all_hosts = get_labels() + alerts = [] + now = datetime.now(timezone.utc) + for host in all_hosts: + q = f'count_over_time({{host="{host}"}} |~ "(?i)error" [{minutes}m])' + data = _query("/loki/api/v1/query", {"query": q, "time": _ns(now)}) + count = sum( + int(float(r.get("value", [None, "0"])[1])) + for r in data.get("data", {}).get("result", []) + if len(r.get("value", [])) > 1 + ) + threshold = ERROR_RATE_THRESHOLDS.get(host, ERROR_RATE_DEFAULT) + if count > threshold: + alerts.append({"host": host, "count": count, "threshold": threshold}) + return alerts + + def format_logs(entries: list[dict], max_lines: int = 30) -> str: """Format log entries for human/LLM consumption.""" if not entries: diff --git a/homelab-ai-bot/monitor.py b/homelab-ai-bot/monitor.py index 4d1c2e0d..6c2fe53c 100644 --- a/homelab-ai-bot/monitor.py +++ b/homelab-ai-bot/monitor.py @@ -20,6 +20,7 @@ ALERT_COOLDOWN_SECONDS = { "restart": 900, "memory_expiry": 43200, "default": 3600, + "error_rate": 1800, } @@ -110,11 +111,26 @@ def check_all() -> list[str]: if hosts: alerts.append(f"🔴 Kritische Fehler (panic/fatal/OOM) auf: {', '.join(hosts)}") + error_rates = loki_client.check_error_rate(minutes=30) + for er in error_rates: + alerts.append( + f"🔴 {er['host']}: {er['count']} Fehler in 30 Min (Schwelle: {er['threshold']})" + ) + + running_names = { + ct.get("name", "").lower() + for ct in containers + if "error" not in ct and ct.get("status") == "running" + } + silent = loki_client.check_silence(minutes=35) if silent and "error" not in silent[0]: - names = [s["host"] for s in silent - if s.get("host") not in IGNORED_HOSTS - and s.get("host") not in SILENCE_IGNORED_HOSTS] + names = [ + s["host"] for s in silent + if s.get("host") not in IGNORED_HOSTS + and s.get("host") not in SILENCE_IGNORED_HOSTS + and s["host"].lower() in running_names + ] if names: alerts.append(f"⚠️ Keine Logs seit 35+ Min: {', '.join(names)}") @@ -257,6 +273,8 @@ def _alert_category(alert_text: str) -> str: return "ram" if "panic" in alert_text.lower() or "fatal" in alert_text.lower(): return "panic" + if "Fehler in 30 Min" in alert_text: + return "error_rate" if "Keine Logs" in alert_text: return "silence" if "antwortet mit HTTP" in alert_text or "nicht erreichbar" in alert_text: diff --git a/homelab-ai-bot/telegram_bot.py b/homelab-ai-bot/telegram_bot.py index 1a1c2efd..ff7605ac 100644 --- a/homelab-ai-bot/telegram_bot.py +++ b/homelab-ai-bot/telegram_bot.py @@ -998,10 +998,22 @@ def main(): except Exception: log.exception("Fehler im Forecast-Loop") + async def _monitor_loop(application): + """Periodischer Monitoring-Check alle 10 Minuten.""" + await asyncio.sleep(60) + while True: + try: + monitor.run_check_and_alert() + except Exception: + log.exception("Fehler im Monitor-Loop") + await asyncio.sleep(600) + async def post_init(application): await application.bot.set_my_commands(BOT_COMMANDS) log.info("Kommandomenü registriert") asyncio.create_task(_watchdog_loop()) + asyncio.create_task(_monitor_loop(application)) + log.info("Monitor-Loop aktiv (alle 10 Min)") if application.job_queue is None: asyncio.create_task(_filmtipp_loop(application)) asyncio.create_task(_forecast_loop(application))