Fotoerkennung: handle_photo + ask_with_image fuer Hausmeister-Bot

2026-03-16 09:06:00 +07:00 · 2026-03-16 09:06:00 +07:00 · 89f2c03fa0
commit 89f2c03fa0
parent fe94f200e8
2 changed files with 118 additions and 0 deletions
--- a/homelab-ai-bot/llm.py
+++ b/homelab-ai-bot/llm.py
@ -484,3 +484,82 @@ def ask_with_tools(question: str, tool_handlers: dict, session_id: str = None) -
    except Exception as e:
        return f"LLM-Fehler: {e}"
 def ask_with_image(image_base64: str, caption: str, tool_handlers: dict, session_id: str = None) -> str:
    """Analyse an image (optionally with a caption) via the tool-calling LLM loop.

    Args:
        image_base64: Base64-encoded image bytes. They are embedded in a
            ``data:image/jpeg;base64,...`` URL, so JPEG data is assumed.
        caption: Optional user text. When empty, a generic "describe this
            image" prompt is sent instead.
        tool_handlers: Mapping of tool name to a callable; each callable is
            invoked with the keyword arguments the model supplied.
        session_id: Optional session whose recent user/assistant messages are
            replayed as history before the image message.

    Returns:
        The model's answer text, or a human-readable fallback/error string.
        The result is always a non-empty ``str``: a missing, null or empty
        ``content`` field is replaced by a fallback message.
    """
    fallback_answer = "Keine Antwort vom LLM."

    api_key = _get_api_key()
    if not api_key:
        return "OpenRouter API Key fehlt in homelab.conf"

    # Memory lookup is best-effort: if the memory module is missing or fails,
    # the request continues without a memory block.
    try:
        import memory_client
        query = caption or "Bild-Analyse"
        memory_items = memory_client.get_relevant_memory(query, top_k=10)
        memory_block = memory_client.format_memory_for_prompt(memory_items)
    except Exception:
        memory_block = ""

    prompt_text = caption or "Was siehst du auf diesem Bild? Beschreibe was du erkennst."
    user_content = [
        {"type": "text", "text": prompt_text},
        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"}},
    ]
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT + memory_block},
    ]

    if session_id:
        # Session history is optional context as well; a failure here must not
        # prevent the image from being analysed.
        try:
            import memory_client
            history = memory_client.get_session_messages(session_id, limit=6)
            for past in history:
                role = past.get("role", "")
                content = past.get("content", "")
                if content and role in ("user", "assistant"):
                    messages.append({"role": role, "content": content})
        except Exception:
            pass

    messages.append({"role": "user", "content": user_content})

    try:
        for _round in range(MAX_TOOL_ROUNDS):
            data = _call_openrouter(messages, api_key, use_tools=True)
            msg = data["choices"][0]["message"]
            tool_calls = msg.get("tool_calls")
            if not tool_calls:
                # "content" can be present but null; dict.get(key, default)
                # would then return None, so fall back explicitly.
                return msg.get("content") or fallback_answer

            # Echo the assistant turn back so the tool results can refer to it.
            messages.append(msg)
            for tc in tool_calls:
                fn_name = tc["function"]["name"]
                try:
                    fn_args = json.loads(tc["function"]["arguments"])
                except (json.JSONDecodeError, KeyError, TypeError):
                    # Missing, null or malformed arguments: call without args.
                    fn_args = {}
                handler = tool_handlers.get(fn_name)
                if handler:
                    try:
                        result = handler(**fn_args)
                    except Exception as e:
                        result = f"Fehler bei {fn_name}: {e}"
                else:
                    result = f"Unbekanntes Tool: {fn_name}"
                messages.append({
                    "role": "tool",
                    "tool_call_id": tc["id"],
                    # Cap the tool output that is fed back to the model.
                    "content": str(result)[:3000],
                })

        # Tool rounds exhausted: ask once more with tools disabled so the
        # model has to produce a final text answer.
        data = _call_openrouter(messages, api_key, use_tools=False)
        return data["choices"][0]["message"].get("content") or fallback_answer
    except Exception as e:
        return f"LLM-Fehler: {e}"
--- a/homelab-ai-bot/telegram_bot.py
+++ b/homelab-ai-bot/telegram_bot.py
@ -123,6 +123,7 @@ async def cmd_start(update: Update, ctx: ContextTypes.DEFAULT_TYPE):
        "/check — Monitoring-Check\n"
        "/feeds — Feed-Status & Artikel\n"
        "/memory — Gedaechtnis anzeigen\n\n"
        "📷 Foto senden = Bilderkennung\n\n"
        "Oder einfach eine Frage stellen!",
        reply_markup=KEYBOARD,
    )
@ -397,6 +398,43 @@ async def handle_voice(update: Update, ctx: ContextTypes.DEFAULT_TYPE):
        await update.message.reply_text(f"Fehler: {e}")
 async def handle_photo(update: Update, ctx: ContextTypes.DEFAULT_TYPE):
    """Handle a photo message: send the image to the vision LLM and reply.

    Downloads the photo, base64-encodes it, asks the LLM (with the caption as
    optional prompt), logs the exchange to the session memory and replies with
    the answer. Any failure is logged and reported back to the chat.

    Args:
        update: Incoming Telegram update carrying the photo message.
        ctx: Handler context; its bot is used to fetch the file.
    """
    if not _authorized(update):
        return
    photos = update.message.photo
    if not photos:
        return
    # The last entry is used; presumably the highest-resolution variant
    # (verify against the Telegram PhotoSize ordering).
    photo = photos[-1]
    caption = update.message.caption or ""
    await update.message.reply_text("🔍 Analysiere Bild...")
    try:
        import base64
        tg_file = await ctx.bot.get_file(photo.file_id)
        image_data = await tg_file.download_as_bytearray()
        image_base64 = base64.b64encode(bytes(image_data)).decode("utf-8")

        channel_key = str(update.effective_chat.id)
        session_id = memory_client.get_or_create_session(channel_key, source="telegram")
        context.last_suggest_result = {"type": None}
        context.set_source_type("telegram_photo")
        handlers = context.get_tool_handlers(session_id=session_id)

        answer = llm.ask_with_image(image_base64, caption, handlers, session_id=session_id)
        # A None/empty answer cannot be sliced or sent as a Telegram message;
        # substitute a readable fallback before logging and replying.
        answer = answer or "Keine Antwort vom LLM."

        if session_id:
            user_msg = f"[Foto] {caption}" if caption else "[Foto gesendet]"
            memory_client.log_message(session_id, "user", user_msg)
            memory_client.log_message(session_id, "assistant", answer)
        # Truncate the reply to 4000 characters before sending.
        await update.message.reply_text(answer[:4000], reply_markup=KEYBOARD)
    except Exception as e:
        log.exception("Fehler bei Foto-Analyse")
        await update.message.reply_text(f"Fehler bei Bildanalyse: {e}")
 async def handle_message(update: Update, ctx: ContextTypes.DEFAULT_TYPE):
    """Button-Presses und Freitext-Fragen verarbeiten."""
    if not _authorized(update):
@ -466,6 +504,7 @@ def main():
    app.add_handler(CommandHandler("feeds", cmd_feeds))
    app.add_handler(CommandHandler("memory", cmd_memory))
    app.add_handler(MessageHandler(filters.VOICE, handle_voice))
    app.add_handler(MessageHandler(filters.PHOTO, handle_photo))
    app.add_handler(MessageHandler(filters.TEXT & ~filters.COMMAND, handle_message))
    async def post_init(application):