refactor(llm): GPU-Architektur - Text lokal, Vision Cloud
- MODEL_VISION von qwen3-vl:32b (Ollama) auf openai/gpt-4o-mini (OpenRouter) - Vision-Modell aus OLLAMA_MODELS entfernt (kein GPU-Swap mehr) - keep_alive=-1 für Ollama: Textmodell bleibt permanent im VRAM - RTX 3090 wird dedizierter Text+Embedding-Server - Neue Dienste können Ollama mitnutzen ohne Konkurrenz
This commit is contained in:
parent
5b57bfd27f
commit
a0724ba6f1
1 changed files with 3 additions and 2 deletions
|
|
@@ -19,11 +19,11 @@ OLLAMA_BASE = "http://100.84.255.83:11434"
|
||||||
OPENROUTER_BASE = "https://openrouter.ai/api/v1"
|
OPENROUTER_BASE = "https://openrouter.ai/api/v1"
|
||||||
|
|
||||||
MODEL_LOCAL = "qwen3:30b-a3b"
|
MODEL_LOCAL = "qwen3:30b-a3b"
|
||||||
MODEL_VISION = "qwen3-vl:32b"
|
MODEL_VISION = "openai/gpt-4o-mini"
|
||||||
MODEL_ONLINE = "perplexity/sonar"
|
MODEL_ONLINE = "perplexity/sonar"
|
||||||
FALLBACK_MODEL = "qwen2.5:14b"
|
FALLBACK_MODEL = "qwen2.5:14b"
|
||||||
MAX_TOOL_ROUNDS = 3
|
MAX_TOOL_ROUNDS = 3
|
||||||
OLLAMA_MODELS = {MODEL_LOCAL, MODEL_VISION, FALLBACK_MODEL}
|
OLLAMA_MODELS = {MODEL_LOCAL, FALLBACK_MODEL}
|
||||||
|
|
||||||
PASSTHROUGH_TOOLS = {"get_temperaturen", "get_energie", "get_heizung"}
|
PASSTHROUGH_TOOLS = {"get_temperaturen", "get_energie", "get_heizung"}
|
||||||
|
|
||||||
|
|
@@ -269,6 +269,7 @@ def _call_api(messages: list, api_key: str, use_tools: bool = True,
|
||||||
headers = {"Content-Type": "application/json"}
|
headers = {"Content-Type": "application/json"}
|
||||||
timeout = _ollama_timeout_for(chosen)
|
timeout = _ollama_timeout_for(chosen)
|
||||||
_add_no_think(payload.get("messages", []))
|
_add_no_think(payload.get("messages", []))
|
||||||
|
payload["keep_alive"] = -1
|
||||||
else:
|
else:
|
||||||
url = f"{OPENROUTER_BASE}/chat/completions"
|
url = f"{OPENROUTER_BASE}/chat/completions"
|
||||||
headers = {"Authorization": f"Bearer {api_key}"}
|
headers = {"Authorization": f"Bearer {api_key}"}
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue