From a0724ba6f116afa38fe777d6690b7138a04afeb7 Mon Sep 17 00:00:00 2001
From: Homelab Cursor
Date: Wed, 25 Mar 2026 20:56:55 +0100
Subject: [PATCH] refactor(llm): GPU architecture - text local, vision cloud

- MODEL_VISION switched from qwen3-vl:32b (Ollama) to openai/gpt-4o-mini (OpenRouter)
- vision model removed from OLLAMA_MODELS (no more GPU swapping)
- keep_alive=-1 for Ollama: the text model stays permanently in VRAM
- the RTX 3090 becomes a dedicated text+embedding server
- new services can share Ollama without competing for the GPU
---
 homelab-ai-bot/llm.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/homelab-ai-bot/llm.py b/homelab-ai-bot/llm.py
index b6355740..77a03775 100644
--- a/homelab-ai-bot/llm.py
+++ b/homelab-ai-bot/llm.py
@@ -19,11 +19,11 @@
 OLLAMA_BASE = "http://100.84.255.83:11434"
 OPENROUTER_BASE = "https://openrouter.ai/api/v1"
 MODEL_LOCAL = "qwen3:30b-a3b"
-MODEL_VISION = "qwen3-vl:32b"
+MODEL_VISION = "openai/gpt-4o-mini"
 MODEL_ONLINE = "perplexity/sonar"
 FALLBACK_MODEL = "qwen2.5:14b"
 MAX_TOOL_ROUNDS = 3
-OLLAMA_MODELS = {MODEL_LOCAL, MODEL_VISION, FALLBACK_MODEL}
+OLLAMA_MODELS = {MODEL_LOCAL, FALLBACK_MODEL}
 PASSTHROUGH_TOOLS = {"get_temperaturen", "get_energie", "get_heizung"}
 
 
@@ -269,6 +269,7 @@ def _call_api(messages: list, api_key: str, use_tools: bool = True,
         headers = {"Content-Type": "application/json"}
         timeout = _ollama_timeout_for(chosen)
         _add_no_think(payload.get("messages", []))
+        payload["keep_alive"] = -1
     else:
         url = f"{OPENROUTER_BASE}/chat/completions"
         headers = {"Authorization": f"Bearer {api_key}"}
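
Note (appended for review, not part of the patch): a minimal sketch of the request the keep_alive change produces, assuming llm.py posts to Ollama's native /api/chat endpoint. The OLLAMA_BASE and MODEL_LOCAL constants are copied from the hunk above; the prompt and surrounding code are illustrative only, not the actual _call_api() implementation.

# Illustrative sketch of the patched Ollama payload (hypothetical standalone script).
import requests

OLLAMA_BASE = "http://100.84.255.83:11434"   # from llm.py
MODEL_LOCAL = "qwen3:30b-a3b"                # from llm.py

payload = {
    "model": MODEL_LOCAL,
    "messages": [{"role": "user", "content": "ping"}],
    "stream": False,
    # keep_alive=-1 asks Ollama to keep the model loaded indefinitely
    # instead of unloading it after the default 5-minute idle timeout,
    # so the RTX 3090 never swaps the text model out of VRAM.
    "keep_alive": -1,
}
resp = requests.post(f"{OLLAMA_BASE}/api/chat", json=payload, timeout=120)
resp.raise_for_status()
print(resp.json()["message"]["content"])

With the vision model now served by OpenRouter, this text model is the only large weight set Ollama ever loads, which is what makes pinning it in VRAM safe.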