monitoring: error-rate alerts, silence proxmox filter, periodic loop
- loki_client.py: check_error_rate() mit host-spezifischen Schwellen (rss-manager:15, wordpress:10, default:25) - monitor.py: Error-Rate-Check in check_all(), Silence-Check filtert gestoppte Container via Proxmox-Status - telegram_bot.py: periodischer _monitor_loop alle 10 Min - Schliesst #30 und #31
This commit is contained in:
parent
776554806b
commit
43ee006f15
5 changed files with 60 additions and 3 deletions
Binary file not shown.
Binary file not shown.
|
|
@ -156,6 +156,33 @@ def check_service_restarts(minutes: int = 35) -> list[dict]:
|
||||||
return restarts
|
return restarts
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# Per-host error-count thresholds for the error-rate check.
# A host whose "(?i)error" log count within the window exceeds its
# threshold triggers an alert; hosts not listed here fall back to
# ERROR_RATE_DEFAULT.
ERROR_RATE_THRESHOLDS = {
    "rss-manager": 15,
    "wordpress-v2": 10,
}

# Fallback threshold for any host without an explicit entry above.
ERROR_RATE_DEFAULT = 25
|
||||||
|
|
||||||
|
|
||||||
|
def check_error_rate(minutes: int = 30) -> list[dict]:
    """Check if any host exceeds its error-rate threshold within the window.

    Queries Loki once per known host for the number of log lines matching
    "(?i)error" in the last *minutes* minutes and compares that count
    against the host's threshold (ERROR_RATE_THRESHOLDS, falling back to
    ERROR_RATE_DEFAULT).

    Returns a list of dicts with keys "host", "count" and "threshold",
    one per host over its limit; empty list when everything is fine.
    """
    reference = datetime.now(timezone.utc)
    breaches = []
    for host in get_labels():
        q = f'count_over_time({{host="{host}"}} |~ "(?i)error" [{minutes}m])'
        response = _query("/loki/api/v1/query", {"query": q, "time": _ns(reference)})
        # Sum the counts over all result streams; each result's "value"
        # is a [timestamp, count-string] pair, so guard on its length.
        total = 0
        for row in response.get("data", {}).get("result", []):
            value = row.get("value", [])
            if len(value) > 1:
                total += int(float(value[1]))
        limit = ERROR_RATE_THRESHOLDS.get(host, ERROR_RATE_DEFAULT)
        if total > limit:
            breaches.append({"host": host, "count": total, "threshold": limit})
    return breaches
|
||||||
|
|
||||||
|
|
||||||
def format_logs(entries: list[dict], max_lines: int = 30) -> str:
|
def format_logs(entries: list[dict], max_lines: int = 30) -> str:
|
||||||
"""Format log entries for human/LLM consumption."""
|
"""Format log entries for human/LLM consumption."""
|
||||||
if not entries:
|
if not entries:
|
||||||
|
|
|
||||||
|
|
@ -20,6 +20,7 @@ ALERT_COOLDOWN_SECONDS = {
|
||||||
"restart": 900,
|
"restart": 900,
|
||||||
"memory_expiry": 43200,
|
"memory_expiry": 43200,
|
||||||
"default": 3600,
|
"default": 3600,
|
||||||
|
"error_rate": 1800,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -110,11 +111,26 @@ def check_all() -> list[str]:
|
||||||
if hosts:
|
if hosts:
|
||||||
alerts.append(f"🔴 Kritische Fehler (panic/fatal/OOM) auf: {', '.join(hosts)}")
|
alerts.append(f"🔴 Kritische Fehler (panic/fatal/OOM) auf: {', '.join(hosts)}")
|
||||||
|
|
||||||
|
error_rates = loki_client.check_error_rate(minutes=30)
|
||||||
|
for er in error_rates:
|
||||||
|
alerts.append(
|
||||||
|
f"🔴 {er['host']}: {er['count']} Fehler in 30 Min (Schwelle: {er['threshold']})"
|
||||||
|
)
|
||||||
|
|
||||||
|
running_names = {
|
||||||
|
ct.get("name", "").lower()
|
||||||
|
for ct in containers
|
||||||
|
if "error" not in ct and ct.get("status") == "running"
|
||||||
|
}
|
||||||
|
|
||||||
silent = loki_client.check_silence(minutes=35)
|
silent = loki_client.check_silence(minutes=35)
|
||||||
if silent and "error" not in silent[0]:
|
if silent and "error" not in silent[0]:
|
||||||
names = [s["host"] for s in silent
|
names = [
|
||||||
if s.get("host") not in IGNORED_HOSTS
|
s["host"] for s in silent
|
||||||
and s.get("host") not in SILENCE_IGNORED_HOSTS]
|
if s.get("host") not in IGNORED_HOSTS
|
||||||
|
and s.get("host") not in SILENCE_IGNORED_HOSTS
|
||||||
|
and s["host"].lower() in running_names
|
||||||
|
]
|
||||||
if names:
|
if names:
|
||||||
alerts.append(f"⚠️ Keine Logs seit 35+ Min: {', '.join(names)}")
|
alerts.append(f"⚠️ Keine Logs seit 35+ Min: {', '.join(names)}")
|
||||||
|
|
||||||
|
|
@ -257,6 +273,8 @@ def _alert_category(alert_text: str) -> str:
|
||||||
return "ram"
|
return "ram"
|
||||||
if "panic" in alert_text.lower() or "fatal" in alert_text.lower():
|
if "panic" in alert_text.lower() or "fatal" in alert_text.lower():
|
||||||
return "panic"
|
return "panic"
|
||||||
|
if "Fehler in 30 Min" in alert_text:
|
||||||
|
return "error_rate"
|
||||||
if "Keine Logs" in alert_text:
|
if "Keine Logs" in alert_text:
|
||||||
return "silence"
|
return "silence"
|
||||||
if "antwortet mit HTTP" in alert_text or "nicht erreichbar" in alert_text:
|
if "antwortet mit HTTP" in alert_text or "nicht erreichbar" in alert_text:
|
||||||
|
|
|
||||||
|
|
@ -998,10 +998,22 @@ def main():
|
||||||
except Exception:
|
except Exception:
|
||||||
log.exception("Fehler im Forecast-Loop")
|
log.exception("Fehler im Forecast-Loop")
|
||||||
|
|
||||||
|
async def _monitor_loop(application):
    """Run the monitoring check periodically (every 10 minutes).

    Waits one minute after startup so the bot finishes initialising,
    then loops forever; any failure is logged and never kills the loop.
    """
    startup_delay, interval = 60, 600
    await asyncio.sleep(startup_delay)
    while True:
        try:
            monitor.run_check_and_alert()
        except Exception:
            # Keep the loop alive regardless of what the check raises.
            log.exception("Fehler im Monitor-Loop")
        await asyncio.sleep(interval)
|
||||||
|
|
||||||
async def post_init(application):
|
async def post_init(application):
|
||||||
await application.bot.set_my_commands(BOT_COMMANDS)
|
await application.bot.set_my_commands(BOT_COMMANDS)
|
||||||
log.info("Kommandomenü registriert")
|
log.info("Kommandomenü registriert")
|
||||||
asyncio.create_task(_watchdog_loop())
|
asyncio.create_task(_watchdog_loop())
|
||||||
|
asyncio.create_task(_monitor_loop(application))
|
||||||
|
log.info("Monitor-Loop aktiv (alle 10 Min)")
|
||||||
if application.job_queue is None:
|
if application.job_queue is None:
|
||||||
asyncio.create_task(_filmtipp_loop(application))
|
asyncio.create_task(_filmtipp_loop(application))
|
||||||
asyncio.create_task(_forecast_loop(application))
|
asyncio.create_task(_forecast_loop(application))
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue