homelab-brain/scripts/sync_state.py
2026-03-08 09:32:40 +01:00

471 lines
15 KiB
Python
Executable file

#!/usr/bin/env python3
"""sync_state.py — replaces sync-state.sh, built on the shared core modules.

Runs every 15 minutes on pve-hetzner via cron.
- Generates the STATE.md files from homelab.conf plus live data
- Service watchdog with Telegram alerts
- Git commit & push to Forgejo

Uses the same core modules as the MCP server and the Telegram bot.
"""
import os
import sys
import time
import subprocess
import json
from datetime import datetime
from pathlib import Path

# setdefault() only assigns PATH when the variable is missing entirely.
# NOTE(review): cron normally provides a (minimal) PATH already, in which case
# this line has no effect — confirm that `pct` is resolvable under cron.
os.environ.setdefault("PATH", "/usr/sbin:/usr/local/sbin:/usr/local/bin:/usr/bin:/sbin:/bin")

# The parent directory of the core package must be on sys.path *before* the
# `from core import config` line below, so the order of these lines matters.
CORE_PATH = Path("/root/homelab-mcp/core")
sys.path.insert(0, str(CORE_PATH.parent))
from core import config

# Local checkout that receives the generated STATE.md files.
REPO = Path("/opt/homelab-brain")

# Directory holding one "<key>.lock" file per alert; created at import time.
DEBOUNCE_DIR = Path("/tmp/homelab_watchdog")
DEBOUNCE_DIR.mkdir(exist_ok=True)

# Timestamp captured once at import; DATE is embedded in the generated files
# and in the commit message.
NOW = datetime.now()
DATE = NOW.strftime("%Y-%m-%d %H:%M")

# Set to True by main() once a STATE.md file has been written.
CHANGED = False
def log(msg: str):
    """Print *msg* to stdout prefixed with the current wall-clock time.

    The line has the form ``[HH:MM:SS] msg`` and is flushed immediately.
    """
    stamp = datetime.now().strftime("%H:%M:%S")
    line = "[" + stamp + "] " + f"{msg}"
    print(line, flush=True)
# ── Telegram ──────────────────────────────────────────
def tg_alert(cfg: config.HomelabConfig, key: str, msg: str):
    """Send a debounced watchdog alert to Telegram.

    Args:
        cfg: Parsed homelab configuration; the bot token is read from
            ``cfg.telegram["tg_mutter_token"]`` and the chat id from
            ``cfg.raw["TG_CHAT_ID"]``.
        key: Debounce key; the lock file is ``DEBOUNCE_DIR/<key>.lock``.
        msg: Message body. It is passed to curl with ``-d`` as-is, so callers
            supply it already form-encoded (e.g. ``%0A`` for line breaks).

    Nothing is sent when the token or chat id is missing, or when the lock
    file records an alert for the same key less than 600 seconds ago.

    A failed delivery (curl missing, timeout, non-zero exit) is logged and
    does NOT write the lock file, so the alert is retried on the next run
    instead of being debounced away. Delivery errors never propagate: a
    Telegram outage must not abort the rest of the sync.
    """
    token = cfg.telegram.get("tg_mutter_token", "")
    chat_id = cfg.raw.get("TG_CHAT_ID", "")
    if not token or not chat_id:
        return

    lockfile = DEBOUNCE_DIR / f"{key}.lock"
    now = int(time.time())
    if lockfile.exists():
        try:
            last = int(lockfile.read_text().strip())
        except ValueError:
            # Unreadable lock content: treat as "never alerted" and send again.
            pass
        else:
            if now - last < 600:
                return

    try:
        sent = subprocess.run([
            "curl", "-s", "-X", "POST",
            f"https://api.telegram.org/bot{token}/sendMessage",
            "-d", f"chat_id={chat_id}",
            "-d", f"text=Homelab Watchdog%0A%0A{msg}",
            "-d", "parse_mode=Markdown",
        ], capture_output=True, timeout=10)
    except (subprocess.TimeoutExpired, OSError) as exc:
        # Log only the exception type: str(exc) contains the full command
        # line, which includes the bot token.
        log(f"Alert NICHT gesendet ({key}): {type(exc).__name__}")
        return
    if sent.returncode != 0:
        log(f"Alert NICHT gesendet ({key}): curl exit {sent.returncode}")
        return

    lockfile.write_text(str(now))
    log(f"Alert gesendet: {key}")
def tg_recovery(cfg: config.HomelabConfig, key: str, name: str):
    """Send a "back online" message if an alert lock exists for *key*.

    Args:
        cfg: Parsed homelab configuration (same token / chat id lookup as
            the alert path).
        key: Debounce key; the lock file is ``DEBOUNCE_DIR/<key>.lock``.
        name: Human-readable service name used in the message and the log.

    Does nothing unless the lock file exists and both token and chat id are
    configured. The lock is removed only after curl reported success, so a
    recovery notice that could not be delivered is retried on the next run.
    Delivery errors are logged and never propagate to the caller.
    """
    token = cfg.telegram.get("tg_mutter_token", "")
    chat_id = cfg.raw.get("TG_CHAT_ID", "")
    lockfile = DEBOUNCE_DIR / f"{key}.lock"
    if not (lockfile.exists() and token and chat_id):
        return

    try:
        sent = subprocess.run([
            "curl", "-s", "-X", "POST",
            f"https://api.telegram.org/bot{token}/sendMessage",
            "-d", f"chat_id={chat_id}",
            "-d", f"text=*{name}* wieder online",
            "-d", "parse_mode=Markdown",
        ], capture_output=True, timeout=10)
    except (subprocess.TimeoutExpired, OSError) as exc:
        # Log only the exception type: str(exc) contains the full command
        # line, which includes the bot token.
        log(f"Recovery NICHT gesendet ({name}): {type(exc).__name__}")
        return
    if sent.returncode != 0:
        log(f"Recovery NICHT gesendet ({name}): curl exit {sent.returncode}")
        return

    lockfile.unlink(missing_ok=True)
    log(f"Recovery: {name} wieder online")
# ── Service Checks (pct exec) ────────────────────────
def pct_exec(ct: int, cmd: str, timeout: int = 15) -> str:
    """Run *cmd* through ``bash -c`` inside container *ct* via ``pct exec``.

    Args:
        ct: Container id handed to ``pct exec``.
        cmd: Shell command line executed by bash inside the container.
        timeout: Seconds before the call is abandoned.

    Returns:
        The command's stdout with surrounding whitespace stripped, or an
        empty string if anything goes wrong (timeout, ``pct`` not found, ...).
        The exit status of the command is not inspected.
    """
    try:
        argv = ["pct", "exec", str(ct), "--", "bash", "-c", cmd]
        proc = subprocess.run(argv, capture_output=True, text=True, timeout=timeout)
        output = proc.stdout.strip()
    except Exception:
        # Best-effort probe: every failure is reported as "no output".
        # (subprocess.TimeoutExpired is an Exception subclass.)
        return ""
    return output
def check_service(cfg: config.HomelabConfig, ct: int, service: str, name: str) -> str:
    """Probe a systemd unit inside a container and alert / recover accordingly.

    ``systemctl is-active <service>`` is tried up to three times with a
    two-second pause between attempts. If no attempt reports ``active`` a
    Telegram alert is raised under the key ``service_<service>``; otherwise a
    pending alert for that key is cleared via the recovery path.

    Returns:
        ``"active"`` when the unit is up, ``"DOWN"`` otherwise.
    """
    probe = f"systemctl is-active {service}"
    alert_key = f"service_{service}"
    final_try = 2

    state = "unknown"
    for n in range(final_try + 1):
        answer = pct_exec(ct, probe)
        if answer == "active":
            state = answer
            break
        if n == final_try:
            # Out of retries: remember what the last probe said.
            state = answer or "unknown"
        else:
            time.sleep(2)

    if state == "active":
        tg_recovery(cfg, alert_key, name)
        return "active"

    tg_alert(cfg, alert_key,
             f"*{name}* ist DOWN%0AService: {service}%0ACT: {ct}%0AStatus: {state}")
    return "DOWN"
def check_docker(cfg: config.HomelabConfig, ct: int, container: str, name: str) -> str:
    """Probe a Docker container inside an LXC container and alert / recover.

    ``docker inspect`` is asked for ``.State.Status`` up to three times with
    a two-second pause between attempts. The alert key is the container name
    with ``-`` replaced by ``_``.

    Returns:
        ``"running"`` when the container is up; otherwise the status text of
        the last probe, or ``"unknown"`` if that probe produced no output.
    """
    inspect_cmd = "docker inspect --format='{{.State.Status}}' " + container
    alert_key = container.replace("-", "_")
    final_try = 2

    state = "unknown"
    for n in range(final_try + 1):
        answer = pct_exec(ct, inspect_cmd)
        if answer == "running":
            state = answer
            break
        if n == final_try:
            # Out of retries: remember what the last probe said.
            state = answer or "unknown"
        else:
            time.sleep(2)

    if state == "running":
        tg_recovery(cfg, alert_key, name)
    else:
        tg_alert(cfg, alert_key,
                 f"*{name}* ist DOWN%0AStatus: {state}%0ACT: {ct}")
    return state
# ── STATE.md Generatoren ─────────────────────────────
def generate_arakava_state(cfg: config.HomelabConfig) -> str:
    """Build the Markdown content of ``arakava-news/STATE.md``.

    Live data is collected with ``pct_exec`` from CT 109 (rss-manager service
    state, feed tables from its SQLite DB, OpenRouter balance, error log) and
    CT 101 (WordPress Docker state). URLs, container rows and credentials
    come from *cfg*.

    Every probe is best-effort: when a command yields no output, a
    placeholder such as ``unknown`` or ``(nicht abrufbar)`` is used instead.

    NOTE(review): the generated document embeds passwords from
    ``cfg.passwords`` and the OpenRouter key is placed on a command line.
    Both are kept as in the original; confirm this is intended.

    Returns:
        The complete Markdown document as one string.
    """
    log("Sammle Arakava News Status...")
    rss_status = pct_exec(109, "systemctl is-active rss-manager") or "unknown"
    wp_status = pct_exec(101, "docker inspect --format='{{.State.Status}}' wordpress-app") or "unknown"

    feed_cmd = (
        "python3 -c \""
        "import sqlite3;"
        " db = sqlite3.connect('/opt/rss-manager/rss_manager.db');"
        " rows = db.execute('SELECT name, last_run FROM feeds WHERE enabled=1 ORDER BY last_run DESC LIMIT 5').fetchall();"
        # chr(110)+chr(105)+chr(101) spells "nie" without needing more quotes.
        " [print(f' {r[0]}: {r[1] or chr(110)+chr(105)+chr(101)}') for r in rows]"
        "\""
    )
    feed_activity = pct_exec(109, feed_cmd) or " (nicht abrufbar)"

    or_key = cfg.api_keys.get("openrouter_key", "")
    or_cmd = (
        "python3 -c \""
        "import requests\n"
        "try:\n"
        " r = requests.get('https://openrouter.ai/api/v1/auth/key',"
        " headers={'Authorization': 'Bearer " + or_key + "'}, timeout=5)\n"
        " d = r.json().get('data', {})\n"
        " remaining = float(d.get('limit', 20)) - float(d.get('usage', 0))\n"
        # The script sits inside a bash double-quoted string, where an
        # unescaped "${remaining:.2f}" is expanded by bash before Python
        # ever sees it. "\$" keeps the dollar sign literal.
        " print(f'\\${remaining:.2f} verbleibend')\n"
        "except Exception as e:\n"
        " print(f'(nicht abrufbar: {e})')\n"
        "\""
    )
    or_balance = pct_exec(109, or_cmd) or "(nicht abrufbar)"

    # `grep -c` prints "0" AND exits 1 when nothing matches, so the
    # `|| echo 0` fallback adds a second "0" line; keep only the first line.
    errors_raw = pct_exec(109, "grep -c 'ERROR' /opt/rss-manager/logs/service.log 2>/dev/null || echo 0") or "0"
    errors = errors_raw.splitlines()[0]
    # NOTE(review): the heading below says "letzte 24h" but the grep counts
    # every ERROR line in the file — confirm the log is rotated daily.
    last_error = pct_exec(109, "grep 'ERROR' /opt/rss-manager/logs/service.log 2>/dev/null | tail -1 || echo 'keine'") or "keine"

    ct_101 = config.get_container(cfg, vmid=101)
    ct_109 = config.get_container(cfg, vmid=109)
    ct_600 = config.get_container(cfg, vmid=600)
    ct_601 = config.get_container(cfg, vmid=601)

    rss_url = f"http://{ct_109.tailscale_ip}:8080" if ct_109 and ct_109.tailscale_ip else ""
    matomo_url = f"https://{cfg.domains.get('matomo', '')}"
    blog_url = f"https://{cfg.domains.get('primary', '')}"
    admin_url = f"{blog_url}/wp-admin"
    pw_admin = cfg.passwords.get("wp_admin", "?")
    pw_default = cfg.passwords.get("default", "?")

    feed_table_cmd = (
        "python3 -c \""
        "import sqlite3;"
        " db = sqlite3.connect('/opt/rss-manager/rss_manager.db');"
        " rows = db.execute('SELECT id, name, schedule FROM feeds WHERE enabled=1 ORDER BY id').fetchall();"
        " [print(f'| {r[0]} | {r[1]} | {r[2]} |') for r in rows]"
        "\""
    )
    feed_table = pct_exec(109, feed_table_cmd) or "| — | (nicht abrufbar) | — |"

    def ct_row(ct, extra=""):
        """Render one row of the three-column container tables."""
        if not ct:
            # Three cells, matching the "| CT | Dienst | Tailscale |" header.
            return "| ? | ? | ? |"
        s = extra or ct.services
        return f"| {ct.vmid} | {s} | {ct.tailscale_ip or ''} |"

    return f"""# Arakava News — Live State
> Auto-generiert: {DATE}
## Service Status
| Service | CT | Status |
|---|---|---|
| rss-manager | 109 | {rss_status} |
| WordPress Docker | 101 | {wp_status} |
## Letzte Feed-Aktivität (Top 5)
{feed_activity}
## Fehler (letzte 24h)
- Fehler gesamt: {errors}
- Letzter Fehler: {last_error}
## OpenRouter Guthaben
{or_balance}
## URLs
- Blog: {blog_url}
- Admin: {admin_url} (admin / {pw_admin})
- RSS Manager: {rss_url} (admin / {pw_default})
- Matomo: {matomo_url} (admin / {pw_default})
## Container (Primary — pve-hetzner)
| CT | Dienst | Tailscale |
|---|---|---|
{ct_row(ct_101)}
{ct_row(ct_109)}
## Container (Mirror — pve3 Muldenstein)
| CT | Dienst | Tailscale |
|---|---|---|
{ct_row(ct_600)}
{ct_row(ct_601)}
## Aktive Feeds
| ID | Name | Schedule |
|---|---|---|
{feed_table}
## Code (CT 109: /opt/rss-manager/)
poster.py, scheduler.py, app.py, db.py
## Änderungshistorie
- 08.03.2026: Domain arakavanews.com live, Mirror CT 600/601 auf pve3
- 08.03.2026: homelab.conf als zentrale Quelle der Wahrheit
- 24.02.2026: Scheduler Lock gegen Doppelstarts
- 24.02.2026: Telegram auf HTML-Modus (Sonderzeichen-Fix)
- 24.02.2026: Werbeartikel-Blacklist (Anzeige:, Sponsored, etc.)
- 23.02.2026: Matomo von CT 113 → CT 109 migriert
"""
def generate_infra_state(cfg: config.HomelabConfig) -> str:
    """Build the Markdown content of ``infrastructure/STATE.md``.

    Combines local ``df -h`` output for ``/`` and ``/var/lib/vz`` with the
    container, tunnel and password entries found in *cfg*.

    Returns:
        The complete Markdown document as one string.
    """
    log("Sammle Infrastruktur Status...")

    def df_columns(stdout):
        """Return fields 4-5 of the last df line, or None without a data line."""
        lines = stdout.strip().split("\n")
        if len(lines) > 1:
            return " ".join(lines[-1].split()[3:5])
        return None

    # Root filesystem: the exit status is deliberately not checked here.
    root_df = subprocess.run(["df", "-h", "/"], capture_output=True, text=True)
    root_cols = df_columns(root_df.stdout)
    disk_root_info = "n/a" if root_cols is None else root_cols

    # VM/CT storage: only trusted when df exited successfully.
    vz_df = subprocess.run(["df", "-h", "/var/lib/vz"], capture_output=True, text=True)
    vz_cols = df_columns(vz_df.stdout) if vz_df.returncode == 0 else None
    disk_data = "n/a" if vz_cols is None else vz_cols

    def on_host(host):
        """Select the configured containers that live on *host*."""
        return [c for c in cfg.containers if c.host == host]

    hetzner_cts = on_host("pve-hetzner")
    pve1_cts = on_host("pve1")
    pve3_cts = on_host("pve3")

    def ct_table(cts, cols=("CT", "Name", "Tailscale IP", "Dienste")):
        """Render *cts* as a Markdown table sorted by vmid, or "(keine)"."""
        rows = [
            f"| {c.vmid} | {c.name} | {c.tailscale_ip or ''} | {c.services} |"
            for c in sorted(cts, key=lambda x: x.vmid)
        ]
        if not rows:
            return "(keine)"
        header = "| " + " | ".join(cols) + " |"
        sep = "|" + "|".join(["---"] * len(cols)) + "|"
        return f"{header}\n{sep}\n" + "\n".join(rows)

    tunnel_lines = [
        "- CT {}: {}{} ({})".format(
            t.ct_id, t.domain, t.target,
            "Standby" if t.status == "standby" else "aktiv",
        )
        for t in cfg.tunnels
    ]
    tunnel_text = "\n".join(tunnel_lines) if tunnel_lines else "- keine"

    pw_hetzner = cfg.passwords.get("hetzner", "?")
    pw_default = cfg.passwords.get("default", "?")

    return f"""# Infrastruktur — Live State
> Auto-generiert: {DATE}
## pve-hetzner Disk
| Mount | Belegt |
|---|---|
| / (root) | {disk_root_info} |
| /var/lib/vz (VMs/CTs) | {disk_data} |
## Aktive Container auf pve-hetzner
{ct_table(hetzner_cts)}
## Container auf pve1 (Kambodscha)
{ct_table(pve1_cts)}
## Container auf pve3 (Muldenstein)
{ct_table(pve3_cts)}
## Routing (Cloudflare Tunnels)
{tunnel_text}
## Zugangsdaten
- pve-hetzner: root / {pw_hetzner}
- pve1: root / {pw_default}
- Alle CTs: root / {pw_default}
## Telegram Bots
| Bot | Zweck |
|---|---|
| @MutterbotAI_bot | Watchdog-Alerts |
| @Orbitalo_Hausmeister_bot | Homelab AI-Bot |
"""
def generate_smarthome_state(cfg: config.HomelabConfig) -> str:
    """Build the Markdown content of ``smart-home/STATE.md``.

    Reports the newest ``*.tar.gz`` archive (size in whole MB and mtime) and
    the archive count found in ``/home/backup-muldenstein/backups``, plus the
    Grafana URL and Telegram chat id taken from *cfg*.

    Returns:
        The complete Markdown document as one string.
    """
    log("Sammle Smart Home Status...")

    backup_dir = Path("/home/backup-muldenstein/backups")
    # Defaults cover the "directory missing" case.
    last_backup = "Verzeichnis nicht vorhanden"
    backup_count = "0"

    if backup_dir.exists():
        archives = list(backup_dir.glob("*.tar.gz"))
        backup_count = str(len(archives))
        if not archives:
            last_backup = "keine Backups gefunden"
        else:
            # Newest archive by modification time.
            newest = max(archives, key=lambda p: p.stat().st_mtime)
            info = newest.stat()
            size_mb = info.st_size // (1024 ** 2)
            mtime = datetime.fromtimestamp(info.st_mtime).strftime("%Y-%m-%d %H:%M")
            last_backup = f"{size_mb}MB, {mtime}"

    grafana_host = cfg.domains.get("grafana", "grafana.orbitalo.net")
    grafana_url = f"https://{grafana_host}"

    return f"""# Smart Home Muldenstein — Live State
> Auto-generiert: {DATE}
## Backup-Status
- Letztes Backup: {last_backup}
- Backups gesamt: {backup_count}
- Ziel: /home/backup-muldenstein/backups/ (CT 144)
## Services (CT 143)
| Dienst | URL |
|---|---|
| Grafana | {grafana_url} |
| ioBroker | http://192.168.178.36:8081 |
| InfluxDB | http://192.168.178.36:8086 |
## Grafana Alerts → Telegram {cfg.raw.get('TG_CHAT_ID', '?')}
- Promtail DOWN (> 5 Min keine Daten)
- CPU > 70%
- Memory > 80%
- Disk > 90%
## Backup-Zeitplan
- täglich 04:00 → /root/backup-to-hetzner.sh (auf pve3)
- Retention: 30d tägl, 90d wöchl, unbegrenzt monatl
"""
# ── Git Operations ────────────────────────────────────
def git_sync(cfg: config.HomelabConfig):
    """Bring the local checkout in line with ``main`` on Forgejo.

    Builds the authenticated remote URL from ``forgejo_sync_token`` and the
    Tailscale IP of CT 111, fetches ``main`` and hard-resets the working tree
    to ``FETCH_HEAD``.

    The hard-coded fallback IP is used when CT 111 is not configured or has
    no Tailscale IP. If the fetch fails or times out, the problem is logged
    and the reset is skipped: resetting onto a stale ``FETCH_HEAD`` would
    rewind the checkout to whatever was fetched last time. A fetch problem
    never raises, so the watchdog still runs when Forgejo is unreachable.

    Returns:
        The remote URL (including the token) for later pushes.
    """
    forgejo_token = cfg.api_keys.get("forgejo_sync_token", "")
    ct_111 = config.get_container(cfg, vmid=111)
    # Fall back when the container is unknown or its IP field is empty.
    forgejo_ip = (ct_111.tailscale_ip if ct_111 else "") or "100.89.246.60"
    forgejo_url = f"http://orbitalo:{forgejo_token}@{forgejo_ip}:3000/orbitalo/homelab-brain.git"

    try:
        fetch = subprocess.run(
            ["git", "-C", str(REPO), "fetch", forgejo_url, "main", "--quiet"],
            capture_output=True, timeout=30,
        )
        fetched = fetch.returncode == 0
    except subprocess.TimeoutExpired:
        fetched = False

    if not fetched:
        # Do not log the exception or command line: both contain the token.
        log("Git fetch fehlgeschlagen — lokaler Stand wird beibehalten")
        return forgejo_url

    try:
        subprocess.run(
            ["git", "-C", str(REPO), "reset", "--hard", "FETCH_HEAD"],
            capture_output=True, timeout=15,
        )
    except subprocess.TimeoutExpired:
        log("Git reset Timeout")
    return forgejo_url
def git_commit_and_push(cfg: config.HomelabConfig, forgejo_url: str):
    """Commit everything in the checkout and push ``main`` to Forgejo.

    Args:
        cfg: Parsed homelab configuration, forwarded to ``tg_alert``.
        forgejo_url: Authenticated remote URL as returned by ``git_sync``.

    The push is attempted three times. After each failure the remote is
    fetched again and the local commit is rebased onto ``FETCH_HEAD``. On
    success the ``git_push`` alert lock is cleared; when all attempts fail a
    Telegram alert carrying the first stderr line of the last attempt is sent.

    A git command that times out counts as a failed step instead of raising,
    so a hung push still ends in the alert. A rebase that fails is aborted so
    the repository is not left mid-rebase for the next cron run.
    """
    base = ["git", "-C", str(REPO)]

    def run_git(args, timeout):
        """Run one git command; return the CompletedProcess, or None on timeout."""
        try:
            return subprocess.run(base + args, capture_output=True, text=True, timeout=timeout)
        except subprocess.TimeoutExpired:
            return None

    run_git(["add", "-A"], 15)
    run_git(
        ["-c", "user.email=sync@homelab", "-c", "user.name=Auto-Sync",
         "commit", "-m", f"Auto-Sync: {DATE}", "--quiet"],
        15,
    )

    err = "unbekannt"
    for _push_attempt in range(3):
        push = run_git(["push", forgejo_url, "main", "--quiet"], 30)
        if push is not None and push.returncode == 0:
            log("Push erfolgreich")
            (DEBOUNCE_DIR / "git_push.lock").unlink(missing_ok=True)
            return

        if push is None:
            err = "Timeout"
        else:
            err = push.stderr.split("\n")[0] if push.stderr else "unbekannt"

        log(f"Push Retry {_push_attempt + 1}/3 — re-sync...")
        run_git(["fetch", forgejo_url, "main", "--quiet"], 30)
        rebase = run_git(["rebase", "FETCH_HEAD"], 15)
        if rebase is None or rebase.returncode != 0:
            # Leave no half-finished rebase behind; harmless if none is active.
            run_git(["rebase", "--abort"], 15)

    log(f"Push FEHLER nach 3 Versuchen: {err}")
    tg_alert(cfg, "git_push",
             f"*Homelab Git-Sync fehlgeschlagen*%0A%0AFehler: {err}%0AZeit: {DATE}")
# ── Main ──────────────────────────────────────────────
def main():
    """Run one sync cycle: git sync, watchdog, STATE.md generation, push.

    Steps, in order:
      1. Parse ``homelab.conf`` and sync the checkout with Forgejo.
      2. Parse ``homelab.conf`` again, since the sync may have replaced it.
      3. Run the watchdog checks (rss-manager on CT 109, wordpress-app on
         CT 101).
      4. Generate the three STATE.md documents and write those whose content
         differs from what is already on disk.
      5. Commit and push if at least one file was written.

    Files are read and written as UTF-8 explicitly because the documents
    contain non-ASCII characters and must not depend on the cron locale.
    The module-level ``CHANGED`` flag is set when a file was written.
    """
    global CHANGED
    log("Sync startet...")
    cfg = config.parse_config(REPO / "homelab.conf")
    forgejo_url = git_sync(cfg)
    cfg = config.parse_config(REPO / "homelab.conf")

    # Watchdog
    log("Watchdog läuft...")
    check_service(cfg, 109, "rss-manager", "RSS Manager")
    check_docker(cfg, 101, "wordpress-app", "WordPress Docker")

    # Generate the STATE.md files
    states = {
        "arakava-news/STATE.md": generate_arakava_state(cfg),
        "infrastructure/STATE.md": generate_infra_state(cfg),
        "smart-home/STATE.md": generate_smarthome_state(cfg),
    }
    for path, content in states.items():
        full_path = REPO / path
        full_path.parent.mkdir(parents=True, exist_ok=True)
        try:
            previous = full_path.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError):
            # Missing or unreadable file: always (re)write it.
            previous = None
        if previous == content:
            continue
        full_path.write_text(content, encoding="utf-8")
        CHANGED = True
        log(f"{path} aktualisiert")

    if CHANGED:
        git_commit_and_push(cfg, forgejo_url)
    else:
        log("Keine Änderungen")
    log("Sync abgeschlossen")
# Script entry point: run one sync cycle only when executed directly,
# not when the module is imported.
if __name__ == "__main__":
    main()