From d2a6391f52d9c81891b96383c6000ca7eea83ae4 Mon Sep 17 00:00:00 2001
From: Homelab Cursor
Date: Wed, 25 Mar 2026 20:59:30 +0100
Subject: [PATCH] feat(llm): Ollama warmup bei Start - Modelle permanent im
 VRAM

- warmup_ollama() laedt qwen3:30b-a3b + nomic-embed-text mit keep_alive=-1
- Wird beim Bot-Start in post_init() aufgerufen (via asyncio.to_thread)
- keep_alive=-1 nur ueber native Ollama API (/api/generate) moeglich
- GPU haelt 22.6/24 GB permanent: Text + Embeddings ohne Swap
---
 homelab-ai-bot/llm.py          | 15 ++++++++++++++-
 homelab-ai-bot/telegram_bot.py |  1 +
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/homelab-ai-bot/llm.py b/homelab-ai-bot/llm.py
index 77a03775..76d98222 100644
--- a/homelab-ai-bot/llm.py
+++ b/homelab-ai-bot/llm.py
@@ -25,6 +25,20 @@ FALLBACK_MODEL = "qwen2.5:14b"
 MAX_TOOL_ROUNDS = 3
 OLLAMA_MODELS = {MODEL_LOCAL, FALLBACK_MODEL}
 
+
+def warmup_ollama():
+    """Laedt Hauptmodell + Embedding permanent in VRAM (keep_alive=-1)."""
+    for model in [MODEL_LOCAL, "nomic-embed-text"]:
+        try:
+            requests.post(
+                f"{OLLAMA_BASE}/api/generate",
+                json={"model": model, "prompt": "", "keep_alive": -1},
+                timeout=120,
+            )
+            log.info("Ollama warmup: %s permanent geladen", model)
+        except Exception as e:
+            log.warning("Ollama warmup fehlgeschlagen fuer %s: %s", model, e)
+
 PASSTHROUGH_TOOLS = {"get_temperaturen", "get_energie", "get_heizung"}
 
 _LOCAL_OVERRIDES = [
@@ -269,7 +283,6 @@ def _call_api(messages: list, api_key: str, use_tools: bool = True,
         headers = {"Content-Type": "application/json"}
         timeout = _ollama_timeout_for(chosen)
         _add_no_think(payload.get("messages", []))
-        payload["keep_alive"] = -1
     else:
         url = f"{OPENROUTER_BASE}/chat/completions"
         headers = {"Authorization": f"Bearer {api_key}"}
diff --git a/homelab-ai-bot/telegram_bot.py b/homelab-ai-bot/telegram_bot.py
index baa16e3c..e21c03ad 100644
--- a/homelab-ai-bot/telegram_bot.py
+++ b/homelab-ai-bot/telegram_bot.py
@@ -1025,6 +1025,7 @@ def main():
         await application.bot.set_my_commands(BOT_COMMANDS)
         log.info("Kommandomenü registriert")
         asyncio.create_task(_watchdog_loop())
+        asyncio.create_task(asyncio.to_thread(llm.warmup_ollama))
         asyncio.create_task(_monitor_loop(application))
         log.info("Monitor-Loop aktiv (alle 10 Min)")
         if application.job_queue is None: