Fotoerkennung: handle_photo + ask_with_image fuer Hausmeister-Bot

2026-03-16 09:06:00 +07:00 · 2026-03-16 09:06:00 +07:00 · 89f2c03fa0
commit 89f2c03fa0
parent fe94f200e8
2 changed files with 118 additions and 0 deletions
--- a/homelab-ai-bot/llm.py
+++ b/homelab-ai-bot/llm.py
@ -484,3 +484,82 @@ def ask_with_tools(question: str, tool_handlers: dict, session_id: str = None) -

    except Exception as e:
        return f"LLM-Fehler: {e}"
+
+
+def ask_with_image(image_base64: str, caption: str, tool_handlers: dict, session_id: str = None) -> str:
+    """Analyse an image with the vision-capable model, with optional caption and tool calling.
+
+    Args:
+        image_base64: Base64-encoded image; sent to the model as a ``data:image/jpeg`` URL.
+        caption: Optional user text; a generic "describe this image" prompt is used if empty.
+        tool_handlers: Maps tool names to callables, invoked with the model's JSON arguments.
+        session_id: If given, stored user/assistant messages (``limit=6``) precede the image.
+
+    Returns:
+        The model's answer, or a German fallback/error message; never ``None`` or empty.
+    """
+    api_key = _get_api_key()
+    if not api_key:
+        return "OpenRouter API Key fehlt in homelab.conf"
+
+    # Memory lookup is best-effort: on any failure the prompt simply gets no memory block.
+    try:
+        import memory_client
+        memory_items = memory_client.get_relevant_memory(caption or "Bild-Analyse", top_k=10)
+        memory_block = memory_client.format_memory_for_prompt(memory_items)
+    except Exception:
+        memory_block = ""
+
+    prompt_text = caption or "Was siehst du auf diesem Bild? Beschreibe was du erkennst."
+    user_content = [
+        {"type": "text", "text": prompt_text},
+        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"}},
+    ]
+    messages = [{"role": "system", "content": SYSTEM_PROMPT + memory_block}]
+
+    if session_id:
+        try:
+            import memory_client
+            for past in memory_client.get_session_messages(session_id, limit=6):
+                role, content = past.get("role", ""), past.get("content", "")
+                if content and role in ("user", "assistant"):
+                    messages.append({"role": role, "content": content})
+        except Exception:
+            pass  # session history is optional context; continue without it
+
+    messages.append({"role": "user", "content": user_content})
+    no_answer = "Keine Antwort vom LLM."
+    try:
+        for _round in range(MAX_TOOL_ROUNDS):
+            data = _call_openrouter(messages, api_key, use_tools=True)
+            msg = data["choices"][0]["message"]
+            tool_calls = msg.get("tool_calls")
+            if not tool_calls:
+                # "content" can be present but null/empty, so .get()'s default is not enough.
+                return msg.get("content") or no_answer
+
+            messages.append(msg)
+            for tc in tool_calls:
+                fn_name = tc["function"]["name"]
+                try:
+                    fn_args = json.loads(tc["function"]["arguments"])
+                except (json.JSONDecodeError, KeyError, TypeError):
+                    fn_args = {}  # missing, null or malformed arguments: call without any
+
+                handler = tool_handlers.get(fn_name)
+                if handler:
+                    try:
+                        result = handler(**fn_args)
+                    except Exception as e:
+                        result = f"Fehler bei {fn_name}: {e}"
+                else:
+                    result = f"Unbekanntes Tool: {fn_name}"
+
+                tool_reply = str(result)[:3000]  # cap tool output before it goes back to the model
+                messages.append({"role": "tool", "tool_call_id": tc["id"], "content": tool_reply})
+
+        # Tool rounds exhausted: request a final answer with tool calling disabled.
+        data = _call_openrouter(messages, api_key, use_tools=False)
+        return data["choices"][0]["message"].get("content") or no_answer
+    except Exception as e:
+        return f"LLM-Fehler: {e}"
--- a/homelab-ai-bot/telegram_bot.py
+++ b/homelab-ai-bot/telegram_bot.py
@ -123,6 +123,7 @@ async def cmd_start(update: Update, ctx: ContextTypes.DEFAULT_TYPE):
        "/check — Monitoring-Check\n"
        "/feeds — Feed-Status & Artikel\n"
        "/memory — Gedaechtnis anzeigen\n\n"
+        "📷 Foto senden = Bilderkennung\n\n"
        "Oder einfach eine Frage stellen!",
        reply_markup=KEYBOARD,
    )
@ -397,6 +398,43 @@ async def handle_voice(update: Update, ctx: ContextTypes.DEFAULT_TYPE):
        await update.message.reply_text(f"Fehler: {e}")


+async def handle_photo(update: Update, ctx: ContextTypes.DEFAULT_TYPE):
+    """Analyse a photo message with the vision LLM and reply with its answer.
+
+    Unauthorized updates and messages without photos are ignored. Errors are logged
+    and reported to the chat instead of being raised.
+    """
+    if not _authorized(update) or not update.message.photo:
+        return
+    photo = update.message.photo[-1]  # last size entry; presumably the largest — confirm
+    caption = update.message.caption or ""
+
+    await update.message.reply_text("🔍 Analysiere Bild...")
+    try:
+        import base64
+        tg_file = await ctx.bot.get_file(photo.file_id)
+        image_data = await tg_file.download_as_bytearray()
+        image_base64 = base64.b64encode(image_data).decode("utf-8")
+
+        channel_key = str(update.effective_chat.id)
+        session_id = memory_client.get_or_create_session(channel_key, source="telegram")
+        context.last_suggest_result = {"type": None}
+        context.set_source_type("telegram_photo")
+        handlers = context.get_tool_handlers(session_id=session_id)
+        answer = llm.ask_with_image(image_base64, caption, handlers, session_id=session_id)
+        answer = answer or "Keine Antwort vom LLM."  # None/"" would break slicing and the reply
+
+        if session_id:
+            user_msg = f"[Foto] {caption}" if caption else "[Foto gesendet]"
+            memory_client.log_message(session_id, "user", user_msg)
+            memory_client.log_message(session_id, "assistant", answer)
+
+        await update.message.reply_text(answer[:4000], reply_markup=KEYBOARD)
+    except Exception as e:
+        log.exception("Fehler bei Foto-Analyse")
+        await update.message.reply_text(f"Fehler bei Bildanalyse: {e}")
+
+
 async def handle_message(update: Update, ctx: ContextTypes.DEFAULT_TYPE):
    """Button-Presses und Freitext-Fragen verarbeiten."""
    if not _authorized(update):
@ -466,6 +504,7 @@ def main():
    app.add_handler(CommandHandler("feeds", cmd_feeds))
    app.add_handler(CommandHandler("memory", cmd_memory))
    app.add_handler(MessageHandler(filters.VOICE, handle_voice))
+    app.add_handler(MessageHandler(filters.PHOTO, handle_photo))
    app.add_handler(MessageHandler(filters.TEXT & ~filters.COMMAND, handle_message))

    async def post_init(application):