monitoring: limit 200, HTTP-Health-Check, Service-Restart-Erkennung
This commit is contained in:
parent
afdf94af70
commit
15296da783
2 changed files with 45 additions and 1 deletions
|
|
@ -46,7 +46,7 @@ def query_logs(query: str, hours: float = 1, limit: int = 100) -> list[dict]:
|
|||
return entries
|
||||
|
||||
|
||||
def get_errors(container: str = None, hours: float = 1, limit: int = 50) -> list[dict]:
|
||||
def get_errors(container: str = None, hours: float = 1, limit: int = 200) -> list[dict]:
|
||||
"""Get error-level logs, optionally filtered by container hostname."""
|
||||
if container:
|
||||
q = f'{{host="{container}"}} |~ "(?i)(error|fatal|panic|traceback|exception)" !~ "caller=metrics|query_hash=|executing query|scheduler_processor|Aborted connection|systemd-networkd-wait-online|context canceled|AH01630: client denied"'
|
||||
|
|
@ -109,6 +109,26 @@ def get_health(container: str, hours: float = 24) -> dict:
|
|||
}
|
||||
|
||||
|
||||
# Services watched for restarts, as (host, service_name) pairs.
# check_service_restarts() uses the first element as the Loki `host` label
# selector and the second as the name to look for in the matching log line.
WATCHED_SERVICES = [
    ("rss-manager", "rss-manager"),
    ("wordpress-v2", "wordpress"),
    ("fuenfvoracht", "fuenfvoracht"),
    ("homelab-ai-bot", "hausmeister"),
]
|
||||
|
||||
|
||||
def check_service_restarts(minutes: int = 35) -> list[dict]:
    """Find watched services that (re)started within the time window.

    For each ``(host, service_name)`` pair in ``WATCHED_SERVICES`` a LogQL
    query is sent through ``query_logs`` (systemd journal via Loki). It
    selects lines from that host that match, case-insensitively, one of
    "Started", "Restarting" or "restarted" followed by the service name.

    Args:
        minutes: Size of the look-back window; converted to hours for
            ``query_logs``.

    Returns:
        One dict per service with at least one matching entry, with the
        keys ``"host"``, ``"service"`` and ``"count"``. Every query is
        issued with ``limit=5``, so ``count`` is presumably capped at 5
        (depends on ``query_logs`` honouring the limit).
    """
    window_hours = minutes / 60
    found = []
    for host, service_name in WATCHED_SERVICES:
        logql = f'{{host="{host}"}} |~ "(?i)(Started|Restarting|restarted).*{service_name}"'
        # Entries that carry an "error" key are not counted; presumably
        # these are failure placeholders from query_logs, not log lines.
        hits = sum(
            1
            for entry in query_logs(logql, hours=window_hours, limit=5)
            if "error" not in entry
        )
        if hits:
            found.append({"host": host, "service": service_name, "count": hits})
    return found
|
||||
|
||||
|
||||
def format_logs(entries: list[dict], max_lines: int = 30) -> str:
|
||||
"""Format log entries for human/LLM consumption."""
|
||||
if not entries:
|
||||
|
|
|
|||
|
|
@ -16,6 +16,8 @@ ALERT_COOLDOWN_SECONDS = {
|
|||
"ram": 1800,
|
||||
"panic": 3600,
|
||||
"silence": 3600,
|
||||
"http": 1800,
|
||||
"restart": 900,
|
||||
"default": 3600,
|
||||
}
|
||||
|
||||
|
|
@ -36,6 +38,12 @@ def _get_passwords(cfg):
|
|||
|
||||
# NOTE(review): presumably numeric container IDs that get stricter monitoring;
# where and how the list is consumed is not visible in this excerpt, so confirm.
CRITICAL_CONTAINERS = [101, 109, 111, 112, 113, 115]
|
||||
|
||||
# Endpoints probed by check_all() with an HTTP GET (10 s timeout, redirects
# followed). A status code >= 400 or a requests.RequestException produces an
# alert that is labelled with the entry's "name".
HTTP_HEALTH_CHECKS = [
    {"name": "arakavanews.com", "url": "https://arakavanews.com"},
    {"name": "matomo.orbitalo.net", "url": "https://matomo.orbitalo.net"},
    {"name": "grafana.orbitalo.net", "url": "https://grafana.orbitalo.net"},
]
|
||||
|
||||
EXPECTED_STOPPED = {
|
||||
(101, "pp1"), # yt-desktop-standby — Reserve, absichtlich gestoppt (pp-cluster)
|
||||
(101, "pp2"), # yt-desktop-standby — Reserve, absichtlich gestoppt (pp-cluster)
|
||||
|
|
@ -99,6 +107,18 @@ def check_all() -> list[str]:
|
|||
if names:
|
||||
alerts.append(f"⚠️ Keine Logs seit 35+ Min: {', '.join(names)}")
|
||||
|
||||
for check in HTTP_HEALTH_CHECKS:
|
||||
try:
|
||||
r = requests.get(check["url"], timeout=10, allow_redirects=True)
|
||||
if r.status_code >= 400:
|
||||
alerts.append(f"🔴 {check['name']} antwortet mit HTTP {r.status_code}")
|
||||
except requests.RequestException as e:
|
||||
alerts.append(f"🔴 {check['name']} nicht erreichbar: {str(e)[:80]}")
|
||||
|
||||
restarts = loki_client.check_service_restarts(minutes=35)
|
||||
for r in restarts:
|
||||
alerts.append(f"🔄 Service-Neustart: {r['service']} auf {r['host']} ({r['count']}x in 35 Min)")
|
||||
|
||||
try:
|
||||
mail_client.init(cfg)
|
||||
important = mail_client.get_important_mails(hours=1)
|
||||
|
|
@ -195,6 +215,10 @@ def _alert_category(alert_text: str) -> str:
|
|||
return "panic"
|
||||
if "Keine Logs" in alert_text:
|
||||
return "silence"
|
||||
if "antwortet mit HTTP" in alert_text or "nicht erreichbar" in alert_text:
|
||||
return "http"
|
||||
if "Service-Neustart" in alert_text:
|
||||
return "restart"
|
||||
return "default"
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue