# homelab-brain/homelab-ai-bot/core/prometheus_client.py
#
# 133 lines
# 4.6 KiB
# Python

"""Prometheus API client for querying system metrics."""
import requests
from datetime import datetime, timezone, timedelta
# Default base URL of the Prometheus server; _query() uses it whenever the
# caller does not supply an explicit base_url.
# NOTE(review): the address is hard-coded — consider loading it from
# configuration or the environment.
PROMETHEUS_URL = "http://100.88.230.59:9090"
def _query(endpoint: str, params: dict, base_url: str = None) -> dict:
    """Send a GET request to a Prometheus HTTP API endpoint and decode the reply.

    Args:
        endpoint: API path appended to the base URL, e.g. ``/api/v1/query``.
        params: Query-string parameters sent with the request.
        base_url: Optional server URL; ``PROMETHEUS_URL`` is used when this
            is ``None`` or empty.

    Returns:
        The decoded JSON object on success. On a transport error, a non-2xx
        status, an undecodable body, or a body that is not a JSON object, a
        dict of the form ``{"error": <message>, "status": "unavailable"}``.
        Request and decoding failures are reported this way instead of
        being raised.
    """
    url = f"{base_url or PROMETHEUS_URL}{endpoint}"
    try:
        response = requests.get(url, params=params, timeout=10)
        response.raise_for_status()
        payload = response.json()
    except (requests.RequestException, ValueError) as exc:
        # ValueError covers undecodable JSON bodies on requests releases whose
        # JSON decode error is not a RequestException subclass.
        return {"error": str(exc), "status": "unavailable"}

    # Callers use ``in`` / ``.get`` on the result, so anything other than a
    # JSON object is reported as a failure rather than passed through.
    if not isinstance(payload, dict):
        return {
            "error": f"unexpected response type: {type(payload).__name__}",
            "status": "unavailable",
        }
    return payload
def instant_query(query: str) -> dict:
    """Evaluate a PromQL expression at a single point in time.

    Args:
        query: The PromQL expression to evaluate.

    Returns:
        Whatever ``_query`` returns for the ``/api/v1/query`` endpoint: the
        decoded response, or its error dict when the request fails.
    """
    request_params = {"query": query}
    return _query("/api/v1/query", request_params)
def range_query(query: str, hours: float = 1, step: str = "5m") -> dict:
    """Evaluate a PromQL expression over the trailing ``hours`` hours.

    Args:
        query: The PromQL expression to evaluate.
        hours: Length of the look-back window, ending at the current UTC time.
        step: Query resolution step passed through to Prometheus.

    Returns:
        Whatever ``_query`` returns for the ``/api/v1/query_range`` endpoint.
    """
    window_end = datetime.now(timezone.utc)
    window_start = window_end - timedelta(hours=hours)

    # Timestamps are sent as timezone-aware ISO-8601 strings.
    request_params = {
        "query": query,
        "start": window_start.isoformat(),
        "end": window_end.isoformat(),
        "step": step,
    }
    return _query("/api/v1/query_range", request_params)
def get_targets() -> list[dict]:
    """List the active Prometheus scrape targets with their health.

    Returns:
        One dict per active target with the keys ``job``, ``instance``,
        ``health`` and ``last_scrape``; fields missing from the response
        default to ``"unknown"`` (or ``""`` for ``last_scrape``). If the
        request fails, a single-element list ``[{"error": <message>}]`` is
        returned instead.
    """
    response = _query("/api/v1/targets", {})
    if "error" in response:
        return [{"error": response["error"]}]

    def _summarize(target: dict) -> dict:
        # Reduce one raw target entry to the fields this module reports.
        labels = target.get("labels", {})
        return {
            "job": labels.get("job", "unknown"),
            "instance": labels.get("instance", "unknown"),
            "health": target.get("health", "unknown"),
            "last_scrape": target.get("lastScrape", ""),
        }

    active_targets = response.get("data", {}).get("activeTargets", [])
    return [_summarize(target) for target in active_targets]
def is_available() -> bool:
    """Check if Prometheus is reachable.

    Runs the instant query ``up`` and inspects the reply.

    Returns:
        ``True`` only when the reply is a JSON object whose ``status`` field
        is ``"success"``. The error dict produced by ``_query`` (status
        ``"unavailable"``) and any reply that is not a success envelope
        yield ``False``.
    """
    data = _query("/api/v1/query", {"query": "up"})
    # Require an explicit success status: merely lacking an "error" key does
    # not prove the responder is a working Prometheus server.
    return isinstance(data, dict) and data.get("status") == "success"
def get_cpu(instance: str = None, hours: float = 1) -> dict:
    """Get the current CPU usage percentage per instance.

    Usage is computed as 100 minus the average idle rate over a 5-minute
    window, grouped by instance.

    Args:
        instance: Optional instance prefix; when truthy, only instances
            matching the regex ``<instance>.*`` are queried.
        hours: Accepted but not used by the query built here.

    Returns:
        The dict produced by ``_extract_metrics`` with metric name
        ``cpu_percent``.
    """
    # NOTE(review): ``instance`` is interpolated into the PromQL regex
    # unescaped — confirm callers only pass trusted values.
    matchers = 'mode="idle"'
    if instance:
        matchers += f', instance=~"{instance}.*"'
    promql = (
        "100 - (avg by (instance) "
        f"(rate(node_cpu_seconds_total{{{matchers}}}[5m])) * 100)"
    )
    return _extract_metrics(instant_query(promql), "cpu_percent")
def get_memory(instance: str = None) -> dict:
    """Get the current memory usage percentage per instance.

    Usage is computed as ``(1 - MemAvailable / MemTotal) * 100``.

    Args:
        instance: Optional instance prefix; when truthy, only instances
            matching the regex ``<instance>.*`` are queried.

    Returns:
        The dict produced by ``_extract_metrics`` with metric name
        ``memory_percent``.
    """
    # NOTE(review): ``instance`` is interpolated into the PromQL regex
    # unescaped — confirm callers only pass trusted values.
    selector = f'{{instance=~"{instance}.*"}}' if instance else ""
    promql = (
        f"(1 - node_memory_MemAvailable_bytes{selector}"
        f" / node_memory_MemTotal_bytes{selector}) * 100"
    )
    return _extract_metrics(instant_query(promql), "memory_percent")
def get_disk(instance: str = None) -> dict:
    """Get the current usage percentage of the root filesystem per instance.

    Usage is computed as ``(1 - avail / size) * 100`` for mountpoint ``/``.

    Args:
        instance: Optional instance prefix; when truthy, only instances
            matching the regex ``<instance>.*`` are queried.

    Returns:
        The dict produced by ``_extract_metrics`` with metric name
        ``disk_percent``.
    """
    # NOTE(review): ``instance`` is interpolated into the PromQL regex
    # unescaped — confirm callers only pass trusted values.
    matchers = 'mountpoint="/"'
    if instance:
        matchers += f', instance=~"{instance}.*"'
    promql = (
        f"(1 - node_filesystem_avail_bytes{{{matchers}}}"
        f" / node_filesystem_size_bytes{{{matchers}}}) * 100"
    )
    return _extract_metrics(instant_query(promql), "disk_percent")
def get_overview() -> str:
    """Get a formatted overview of all monitored instances.

    Returns:
        A Markdown-style report listing every scrape target with an UP/DOWN
        marker, followed by CPU, memory and root-disk usage sections. A
        usage section is omitted when its query does not yield results. If
        Prometheus is unreachable or reports no targets, a one-line message
        is returned instead.
    """
    if not is_available():
        return "Prometheus is not reachable at " + PROMETHEUS_URL

    targets = get_targets()
    if not targets:
        return "No Prometheus targets found."

    lines = ["## Prometheus Targets\n"]
    for target in targets:
        # get_targets() reports a failed request as [{"error": ...}]; such an
        # entry has no health/job/instance keys and must not be indexed.
        if "error" in target:
            lines.append(f"- [ERROR] could not list targets: {target['error']}")
            continue
        status = "UP" if target.get("health") == "up" else "DOWN"
        job = target.get("job", "unknown")
        instance = target.get("instance", "unknown")
        lines.append(f"- [{status}] {job} ({instance})")

    # Each usage section has the same layout, so render them from a table.
    sections = (
        ("\n## CPU Usage", get_cpu),
        ("\n## Memory Usage", get_memory),
        ("\n## Disk Usage (/)", get_disk),
    )
    for heading, fetch in sections:
        metrics = fetch()
        if "results" not in metrics:
            continue
        lines.append(heading)
        lines.extend(
            f"- {sample['instance']}: {sample['value']:.1f}%"
            for sample in metrics["results"]
        )

    return "\n".join(lines)
def _extract_metrics(data: dict, metric_name: str) -> dict:
if "error" in data and data.get("status") != "success":
return {"error": data.get("error", "unknown error")}
results = []
for r in data.get("data", {}).get("result", []):
instance = r.get("metric", {}).get("instance", "unknown")
value = float(r.get("value", [0, 0])[1])
results.append({"instance": instance, "value": value, "metric": metric_name})
return {"results": results}