fix(llm): Text-Routing auf qwen3:30b-a3b, Timeout-Fallback auf qwen2.5:14b
Vorher lief normaler Text ueber das grosse Vision-Modell qwen3-vl:32b, was bei Tool-Calls (z.B. get_temperaturen) regelmaessig in Read-Timeouts nach 120s resultierte.

Aenderungen:
- Text-Modell: qwen3:30b-a3b (MoE, schneller fuer reinen Text)
- Vision-Modell: bleibt qwen3-vl:32b (nur fuer Fotos/Dokumente)
- Fallback bei Timeout: qwen2.5:14b (einmaliger Retry)
- Ollama-Timeout modellabhaengig (180s Text, 240s Vision, 90s Fallback)
- Alle lokalen Modelle werden korrekt als Ollama erkannt
This commit is contained in:
parent
4c2177baba
commit
db61aaedae
1 changed file with 53 additions and 19 deletions
|
|
@ -18,9 +18,11 @@ log = logging.getLogger('llm')
|
|||
OLLAMA_BASE = "http://100.84.255.83:11434"
|
||||
OPENROUTER_BASE = "https://openrouter.ai/api/v1"
|
||||
|
||||
MODEL = "qwen3-vl:32b"
|
||||
VISION_MODEL = "qwen3-vl:32b"
|
||||
MODEL = os.environ.get("HAUSMEISTER_TEXT_MODEL", "qwen3:30b-a3b")
|
||||
VISION_MODEL = os.environ.get("HAUSMEISTER_VISION_MODEL", "qwen3-vl:32b")
|
||||
FALLBACK_MODEL = os.environ.get("HAUSMEISTER_TEXT_FALLBACK_MODEL", "qwen2.5:14b")
|
||||
MAX_TOOL_ROUNDS = 3
|
||||
OLLAMA_MODELS = {MODEL, VISION_MODEL, FALLBACK_MODEL}
|
||||
|
||||
import datetime as _dt
|
||||
_TODAY = _dt.date.today()
|
||||
|
|
@ -186,10 +188,34 @@ def _get_api_key() -> str:
|
|||
return cfg.api_keys.get("openrouter_key", "")
|
||||
|
||||
|
||||
def _ollama_timeout_for(model: str) -> int:
    """Return the HTTP request timeout (seconds) for a local Ollama model.

    The large vision model gets the biggest budget, the lightweight
    fallback model the smallest; any other model (i.e. the default text
    model) uses the middle value.  Checked in order, so the vision
    model wins if it ever coincides with the fallback model.
    """
    # (model, timeout) pairs, most specific first.
    for candidate, seconds in ((VISION_MODEL, 240), (FALLBACK_MODEL, 90)):
        if model == candidate:
            return seconds
    return 180
|
||||
|
||||
|
||||
def _add_no_think(messages: list) -> None:
|
||||
for msg in reversed(messages):
|
||||
if msg.get("role") != "user":
|
||||
continue
|
||||
content = msg.get("content", "")
|
||||
if isinstance(content, str) and "/no_think" not in content:
|
||||
msg["content"] = content + " /no_think"
|
||||
elif isinstance(content, list):
|
||||
for item in content:
|
||||
if item.get("type") == "text" and "/no_think" not in item.get("text", ""):
|
||||
item["text"] = item["text"] + " /no_think"
|
||||
break
|
||||
break
|
||||
|
||||
|
||||
def _call_openrouter(messages: list, api_key: str, use_tools: bool = True,
|
||||
model: str = None, max_tokens: int = 4000) -> dict:
|
||||
model: str = None, max_tokens: int = 4000,
|
||||
allow_fallback: bool = True) -> dict:
|
||||
chosen = model or MODEL
|
||||
use_ollama = (chosen == MODEL)
|
||||
use_ollama = chosen in OLLAMA_MODELS
|
||||
log.info("LLM-Call: model=%s ollama=%s max_tokens=%d", chosen, use_ollama, max_tokens)
|
||||
|
||||
payload = {
|
||||
|
|
@ -204,26 +230,34 @@ def _call_openrouter(messages: list, api_key: str, use_tools: bool = True,
|
|||
if use_ollama:
|
||||
url = f"{OLLAMA_BASE}/v1/chat/completions"
|
||||
headers = {"Content-Type": "application/json"}
|
||||
timeout = 120
|
||||
for msg in reversed(payload.get("messages", [])):
|
||||
if msg.get("role") == "user":
|
||||
content = msg.get("content", "")
|
||||
if isinstance(content, str) and "/no_think" not in content:
|
||||
msg["content"] = content + " /no_think"
|
||||
elif isinstance(content, list):
|
||||
for item in content:
|
||||
if item.get("type") == "text" and "/no_think" not in item.get("text", ""):
|
||||
item["text"] = item["text"] + " /no_think"
|
||||
break
|
||||
break
|
||||
timeout = _ollama_timeout_for(chosen)
|
||||
_add_no_think(payload.get("messages", []))
|
||||
else:
|
||||
url = f"{OPENROUTER_BASE}/chat/completions"
|
||||
headers = {"Authorization": f"Bearer {api_key}"}
|
||||
timeout = 90
|
||||
|
||||
try:
|
||||
r = requests.post(url, headers=headers, json=payload, timeout=timeout)
|
||||
r.raise_for_status()
|
||||
return r.json()
|
||||
except requests.exceptions.ReadTimeout:
|
||||
if use_ollama and allow_fallback and chosen == MODEL and FALLBACK_MODEL and FALLBACK_MODEL != chosen:
|
||||
log.warning(
|
||||
"Ollama timeout for %s after %ss, retrying with fallback model %s",
|
||||
chosen,
|
||||
timeout,
|
||||
FALLBACK_MODEL,
|
||||
)
|
||||
return _call_openrouter(
|
||||
messages,
|
||||
api_key,
|
||||
use_tools=use_tools,
|
||||
model=FALLBACK_MODEL,
|
||||
max_tokens=max_tokens,
|
||||
allow_fallback=False,
|
||||
)
|
||||
raise
|
||||
|
||||
|
||||
def ask(question: str, context: str) -> str:
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue