diff --git a/homelab-ai-bot/tools/rag.py b/homelab-ai-bot/tools/rag.py index 5e7c7ab7..5a3d2d67 100644 --- a/homelab-ai-bot/tools/rag.py +++ b/homelab-ai-bot/tools/rag.py @@ -1,14 +1,24 @@ -"""RAG Dokumentensuche - durchsucht die Hausmeister-Wissensbasis (RAGFlow).""" +"""RAG Dokumentensuche — Elasticsearch direkt (Hybrid: kNN + deutscher Text). +RAGFlow bleibt Ingestion; Suche geht direkt an ES (Issue #51). +""" + +import base64 import json -import urllib.request import logging +import re +import urllib.error +import urllib.request log = logging.getLogger("tools.rag") -RAGFLOW_URL = "http://100.109.101.12:8080/api/v1" -RAGFLOW_TOKEN = "ragflow-test-token-2026" -DATASET_ID = "dc24edda27a311f19fe7fb811de6f016" +ES_BASE = "http://100.109.101.12:1200" +ES_USER = "elastic" +ES_PASS = "infini_rag_flow" +ES_INDEX = "ragflow_61f51c8c279011f1a174bd19863ba33e" +KB_ID = "dc24edda27a311f19fe7fb811de6f016" +OLLAMA_EMBED_URL = "http://100.84.255.83:11434/api/embeddings" +EMBED_MODEL = "nomic-embed-text" TOOLS = [ { @@ -27,15 +37,18 @@ TOOLS = [ "properties": { "query": { "type": "string", - "description": "Suchanfrage: Dokumentname, Thema oder Inhalt. Kurz und praezise, z.B. 'Familienbuch Opa Oma' oder 'Grundsteuer Erklaerung 2024'" + "description": ( + "Suchanfrage: Dokumentname, Thema oder Inhalt. Kurz und praezise, " + "z.B. 'Familienbuch Opa Oma' oder 'Grundsteuer Erklaerung 2024'" + ), }, "top_k": { "type": "integer", "description": "Anzahl Ergebnisse (1-10)", - "default": 5 - } + "default": 5, + }, }, - "required": ["query"] + "required": ["query"], }, }, }, @@ -47,24 +60,97 @@ Nutze rag_search wenn der User nach Dokumenten, Vertraegen, persoenlichen Unterl Die Suchanfrage sollte kurze Keywords sein, KEINE ganzen Saetze. Beispiele: - "Familienbuch Opa Oma" - "Grundsteuer Erklaerung" -- "Nürnberger Versicherung" +- "Nuernberger Versicherung" - "Allianz Beitraege" Bei schlechten Ergebnissen: andere Keywords versuchen oder Dokumentnamen direkt suchen.""" -def _api_call(path, method="GET", body=None): - url = f"{RAGFLOW_URL}{path}" - data = json.dumps(body).encode("utf-8") if body else None +def _basic_auth_header() -> str: + token = base64.b64encode(f"{ES_USER}:{ES_PASS}".encode()).decode() + return f"Basic {token}" + + +def _ollama_embed(text: str) -> list | None: + body = json.dumps({"model": EMBED_MODEL, "prompt": text}).encode() req = urllib.request.Request( - url, data=data, - headers={ - "Authorization": f"Bearer {RAGFLOW_TOKEN}", - "Content-Type": "application/json", - }, - method=method, + OLLAMA_EMBED_URL, + data=body, + method="POST", + headers={"Content-Type": "application/json"}, ) - with urllib.request.urlopen(req, timeout=30) as resp: - return json.loads(resp.read()) + try: + with urllib.request.urlopen(req, timeout=120) as resp: + data = json.load(resp) + emb = data.get("embedding") + if not emb: + return None + if len(emb) != 768: + log.warning("Unexpected embedding dimension %s", len(emb)) + return emb + except Exception as e: + log.error("Ollama embed error: %s", e) + return None + + +def _ocr_note(text: str) -> str: + if not text or len(text) < 40: + return "" + non_alnum = sum(1 for c in text if not c.isalnum() and not c.isspace()) + ratio = non_alnum / max(len(text), 1) + words = re.findall(r"\w+", text, re.UNICODE) + avg_len = (sum(len(w) for w in words) / len(words)) if words else 0.0 + if ratio > 0.15 or avg_len < 2.0: + return " [OCR vermutlich schlecht]" + return "" + + +def _es_hybrid_search(query: str, es_size: int) -> dict: + qvec = _ollama_embed(query) + if not qvec: + return {"_error": "Embedding fehlgeschlagen (Ollama nicht erreichbar?)."} + + kb_filter = {"term": {"kb_id": KB_ID}} + body = { + "size": es_size, + "knn": { + "field": "q_768_vec", + "query_vector": qvec, + "k": es_size, + "num_candidates": min(500, max(es_size * 5, 120)), + "filter": [kb_filter], + }, + "query": { + "bool": { + "filter": [kb_filter], + "should": [ + {"match": {"content_de": {"query": query, "boost": 2.0}}}, + {"match": {"content_ltks": {"query": query.lower(), "boost": 0.4}}}, + {"match": {"docnm_kwd": {"query": query, "boost": 1.5}}}, + ], + "minimum_should_match": 0, + } + }, + } + url = f"{ES_BASE}/{ES_INDEX}/_search" + req = urllib.request.Request( + url, + data=json.dumps(body).encode(), + method="POST", + headers={ + "Content-Type": "application/json", + "Authorization": _basic_auth_header(), + }, + ) + try: + with urllib.request.urlopen(req, timeout=120) as resp: + return json.load(resp) + except urllib.error.HTTPError as e: + err = e.read().decode(errors="replace")[:800] + log.error("ES HTTP %s: %s", e.code, err) + return {"_error": f"ES HTTP {e.code}: {err}"} + except Exception as e: + log.error("ES search error: %s", e) + return {"_error": str(e)} def handle_rag_search(query: str, top_k: int = 5, **kw): @@ -72,46 +158,45 @@ def handle_rag_search(query: str, top_k: int = 5, **kw): return "rag_search: query fehlt." top_k = max(1, min(int(top_k or 5), 10)) + es_size = min(100, max(top_k * 12, 35)) - try: - result = _api_call("/retrieval", method="POST", body={ - "question": query, - "dataset_ids": [DATASET_ID], - "top_k": 2048, - "similarity_threshold": 0.1, - "vector_similarity_weight": 0.3, - }) + data = _es_hybrid_search(query.strip(), es_size) + if "_error" in data: + return f"Fehler bei der Dokumentensuche: {data['_error']}" - chunks = result.get("data", {}).get("chunks", []) - if not chunks: - return f"Keine Ergebnisse fuer '{query}' in der Wissensbasis gefunden." + hits = (data.get("hits") or {}).get("hits") or [] + if not hits: + return f"Keine Ergebnisse fuer '{query}' in der Wissensbasis gefunden." - seen_docs = set() - lines = [f"**{len(chunks)} Treffer fuer '{query}'** (Top {top_k}):\n"] + seen_docs: set[str] = set() + lines: list[str] = [] + lines.append(f"**{len(hits)} Roh-Treffer fuer '{query}'** (Top {top_k} Dokumente):\n") - count = 0 - for c in chunks: - if count >= top_k: - break - doc_name = c.get("document_keyword", c.get("document_name", "?")) - sim = c.get("similarity", 0) - content = c.get("content", "")[:400].strip() + count = 0 + for h in hits: + if count >= top_k: + break + src = h.get("_source") or {} + doc_name = src.get("docnm_kwd") or "?" + doc_key = str(doc_name) + if doc_key in seen_docs: + continue + seen_docs.add(doc_key) - doc_key = doc_name - if doc_key in seen_docs: - continue - seen_docs.add(doc_key) + score = h.get("_score") or 0.0 + raw = src.get("content_with_weight") or src.get("content_de") or "" + content = raw[:400].strip() + ocr = _ocr_note(raw) - lines.append(f"---\n**{count+1}. {doc_name}** (Relevanz: {sim:.0%})") - if content: - lines.append(f"```\n{content}\n```") - count += 1 + lines.append(f"---\n**{count + 1}. {doc_name}** (Score: {score:.3f}){ocr}") + if content: + lines.append(f"```\n{content}\n```") + count += 1 - return "\n".join(lines) + if count == 0: + return f"Keine eindeutigen Dokumente fuer '{query}' (nach Deduplizierung)." - except Exception as e: - log.error("RAG search error: %s", e) - return f"Fehler bei der Dokumentensuche: {e}" + return "\n".join(lines) HANDLERS = {