monitoring: error-rate alerts, silence proxmox filter, periodic loop

- loki_client.py: check_error_rate() mit host-spezifischen Schwellen (rss-manager:15, wordpress-v2:10, default:25)
- monitor.py: Error-Rate-Check in check_all(), Silence-Check filtert gestoppte Container via Proxmox-Status
- telegram_bot.py: periodischer _monitor_loop alle 10 Min
- Schliesst #30 und #31
This commit is contained in:
Homelab Cursor 2026-03-24 13:30:58 +01:00
parent 776554806b
commit 43ee006f15
5 changed files with 60 additions and 3 deletions

View file

@ -156,6 +156,33 @@ def check_service_restarts(minutes: int = 35) -> list[dict]:
return restarts
# Per-host error-count thresholds used by check_error_rate(): a host is
# reported only when its count within the query window is strictly greater
# than the value listed here. Keys are compared against the host names
# returned by get_labels().
ERROR_RATE_THRESHOLDS: dict[str, int] = {
"rss-manager": 15,
"wordpress-v2": 10,
}
# Threshold applied to every host that has no entry in ERROR_RATE_THRESHOLDS.
ERROR_RATE_DEFAULT: int = 25
def check_error_rate(minutes: int = 30) -> list[dict]:
    """Report hosts whose error-line count exceeds their threshold.

    For every host returned by ``get_labels()`` one Loki instant query is
    issued that counts the log lines matching the case-insensitive pattern
    ``error`` over the last ``minutes`` minutes. The counts of all series in
    the response are summed and compared against the host's entry in
    ``ERROR_RATE_THRESHOLDS`` (``ERROR_RATE_DEFAULT`` when the host has none).

    Args:
        minutes: Length of the look-back window in minutes.

    Returns:
        A list of dicts with the keys ``host``, ``count`` and ``threshold``,
        one per host whose count is strictly greater than its threshold.
        Empty when no host exceeds its threshold.
    """
    hosts = get_labels()
    exceeded = []
    # A single reference time is shared by all per-host queries.
    evaluated_at = datetime.now(timezone.utc)

    for host in hosts:
        logql = f'count_over_time({{host="{host}"}} |~ "(?i)error" [{minutes}m])'
        response = _query(
            "/loki/api/v1/query",
            {"query": logql, "time": _ns(evaluated_at)},
        )

        total = 0
        for series in response.get("data", {}).get("result", []):
            sample = series.get("value", [])
            # Only entries that carry a second element are counted; that
            # element is parsed via float() first, then truncated to int.
            if len(sample) > 1:
                total += int(float(sample[1]))

        limit = ERROR_RATE_THRESHOLDS.get(host, ERROR_RATE_DEFAULT)
        if total > limit:
            exceeded.append({"host": host, "count": total, "threshold": limit})

    return exceeded
def format_logs(entries: list[dict], max_lines: int = 30) -> str:
"""Format log entries for human/LLM consumption."""
if not entries:

View file

@ -20,6 +20,7 @@ ALERT_COOLDOWN_SECONDS = {
"restart": 900,
"memory_expiry": 43200,
"default": 3600,
"error_rate": 1800,
}
@ -110,11 +111,26 @@ def check_all() -> list[str]:
if hosts:
alerts.append(f"🔴 Kritische Fehler (panic/fatal/OOM) auf: {', '.join(hosts)}")
error_rates = loki_client.check_error_rate(minutes=30)
for er in error_rates:
alerts.append(
f"🔴 {er['host']}: {er['count']} Fehler in 30 Min (Schwelle: {er['threshold']})"
)
running_names = {
ct.get("name", "").lower()
for ct in containers
if "error" not in ct and ct.get("status") == "running"
}
silent = loki_client.check_silence(minutes=35)
if silent and "error" not in silent[0]:
names = [s["host"] for s in silent
if s.get("host") not in IGNORED_HOSTS
and s.get("host") not in SILENCE_IGNORED_HOSTS]
names = [
s["host"] for s in silent
if s.get("host") not in IGNORED_HOSTS
and s.get("host") not in SILENCE_IGNORED_HOSTS
and s["host"].lower() in running_names
]
if names:
alerts.append(f"⚠️ Keine Logs seit 35+ Min: {', '.join(names)}")
@ -257,6 +273,8 @@ def _alert_category(alert_text: str) -> str:
return "ram"
if "panic" in alert_text.lower() or "fatal" in alert_text.lower():
return "panic"
if "Fehler in 30 Min" in alert_text:
return "error_rate"
if "Keine Logs" in alert_text:
return "silence"
if "antwortet mit HTTP" in alert_text or "nicht erreichbar" in alert_text:

View file

@ -998,10 +998,22 @@ def main():
except Exception:
log.exception("Fehler im Forecast-Loop")
async def _monitor_loop(application):
    """Run the monitoring check periodically, every 10 minutes.

    Waits 60 seconds before the first run, then repeatedly executes
    ``monitor.run_check_and_alert()`` with a 600-second pause after each
    run. Any ``Exception`` raised by a run is logged and the loop continues.

    Args:
        application: The bot application handed in by the caller; it is not
            used inside this loop.
    """
    await asyncio.sleep(60)
    loop = asyncio.get_running_loop()
    while True:
        try:
            # run_check_and_alert() is invoked as a plain synchronous call.
            # Running it directly inside this coroutine would stall the
            # event loop (and with it the bot's update handling) for the
            # whole duration of the check, so it is offloaded to the default
            # thread-pool executor and awaited instead.
            # NOTE(review): assumes run_check_and_alert() is safe to call
            # from a worker thread - confirm.
            await loop.run_in_executor(None, monitor.run_check_and_alert)
        except Exception:
            log.exception("Fehler im Monitor-Loop")
        await asyncio.sleep(600)
async def post_init(application):
await application.bot.set_my_commands(BOT_COMMANDS)
log.info("Kommandomenü registriert")
asyncio.create_task(_watchdog_loop())
asyncio.create_task(_monitor_loop(application))
log.info("Monitor-Loop aktiv (alle 10 Min)")
if application.job_queue is None:
asyncio.create_task(_filmtipp_loop(application))
asyncio.create_task(_forecast_loop(application))