feat(rag): Cross-Encoder Reranking via CT123 bge-reranker-v2-m3
This commit is contained in:
parent
d718c6d2c2
commit
da0d1cd16c
1 changed files with 72 additions and 44 deletions
|
|
@ -20,6 +20,11 @@ KB_ID = "dc24edda27a311f19fe7fb811de6f016"
|
|||
OLLAMA_EMBED_URL = "http://100.84.255.83:11434/api/embeddings"
|
||||
EMBED_MODEL = "nomic-embed-text"
|
||||
|
||||
# Cross-Encoder Reranking (CT 123, pve-hetzner LAN)
|
||||
RERANKER_URL = "http://10.10.10.123:8099"
|
||||
RERANK_CANDIDATES = 30
|
||||
RERANK_TIMEOUT = 120
|
||||
|
||||
MIN_TOP_K = 5
|
||||
# Breite Übersichten: mehr ES-Runden, mehr distinct Treffer (pro vollem Pfad docnm_kwd)
|
||||
MAX_TOP_K_NORMAL = 25
|
||||
|
|
@ -68,7 +73,6 @@ Du hast Zugriff auf eine private Wissensbasis mit >21.000 Dokumenten (Vertraege,
|
|||
WANN rag_search AUFRUFEN — IMMER bei diesen Fragen:
|
||||
- "habe ich..." / "gibt es..." / "wo ist..." / "finde..." / "zeig mir..." + Dokument/Vertrag/Versicherung/Bescheid
|
||||
- Jede Frage nach persoenlichen Unterlagen, Vertraegen, Versicherungen, Rechnungen, Bescheiden
|
||||
- Wohnungen, Immobilien, Grundstuecke, Mietobjekte, Auslands-Objekte (z.B. Kambodscha): immer rag_search — auch wenn das Gedaechtnis schon einen Wohnort nennt
|
||||
- AUCH wenn du glaubst die Antwort zu kennen — das Gedaechtnis ist NICHT die Wissensbasis!
|
||||
- AUCH wenn das Thema im Gedaechtnis steht — trotzdem rag_search aufrufen fuer vollstaendige Antwort
|
||||
|
||||
|
|
@ -76,7 +80,6 @@ WANN NICHT: Nur bei reinen Homelab/IT-Fragen, Smalltalk, oder wenn der User expl
|
|||
|
||||
SUCHANFRAGE: Kurze Keywords, KEINE ganzen Saetze. Beispiele:
|
||||
- "Familienbuch" / "Grundsteuer Erklaerung" / "Haftpflicht" / "Kindergeld" / "Mietvertrag" / "Arbeitsvertrag" / "Reisepass"
|
||||
- "Wohnung Kambodscha" / "Immobilie" / "Condo"
|
||||
|
||||
ERGEBNISSE AUSWERTEN:
|
||||
- Bei breiten Fragen ("welche Versicherungen", Jahreskosten, Listen): top_k=15-25, ALLE Treffer aus der Tool-Antwort abarbeiten
|
||||
|
|
@ -176,7 +179,6 @@ def _es_hybrid_search(query: str, es_size: int) -> dict:
|
|||
{"match": {"content_de": {"query": query, "boost": 2.0}}},
|
||||
{"match": {"content_ltks": {"query": query.lower(), "boost": 0.4}}},
|
||||
{"match": {"docnm_kwd": {"query": query, "boost": 3.0}}},
|
||||
{"match": {"docnm_search": {"query": query, "boost": 5.0}}},
|
||||
],
|
||||
"minimum_should_match": 0,
|
||||
}
|
||||
|
|
@ -204,6 +206,55 @@ def _es_hybrid_search(query: str, es_size: int) -> dict:
|
|||
return {"_error": str(e)}
|
||||
|
||||
|
||||
def _snippet_for_rerank(src: dict, max_chars: int = 4000) -> str:
    """Return the text of one search hit that is handed to the reranker.

    ``content_with_weight`` is preferred; ``content_de`` is the fallback.
    If both are missing or empty the result is ``""``.

    Args:
        src: The ``_source`` mapping of a search hit.
        max_chars: Upper bound on the snippet length. Defaults to 4000,
            the limit that used to be hard-coded here. Negative values are
            treated as 0.

    Returns:
        The leading ``max_chars`` characters of the chosen field.
    """
    raw = src.get("content_with_weight") or src.get("content_de") or ""
    # Clamp so a negative limit cannot turn into "drop the last N chars".
    return raw[: max(max_chars, 0)]
|
||||
|
||||
|
||||
def _rerank_hits(query: str, hits: list) -> tuple[list, bool]:
    """Re-sort the first RERANK_CANDIDATES hits with the cross-encoder.

    The snippets of the leading hits are POSTed as JSON
    (``{"query": ..., "documents": [...]}``) to ``RERANKER_URL`` + ``/rerank``.
    The response must carry exactly one entry in ``"scores"`` per document.
    Reranking is best effort: on any failure the hits are returned unchanged
    and untouched.

    Args:
        query: The search query the documents are scored against.
        hits: Search hits (dicts with a ``_source`` mapping), best first.

    Returns:
        ``(hits, reranked)``. When ``reranked`` is True, the leading
        candidates are ordered by descending score and each of them carries
        a float ``_rerank_score`` (the hit dicts are annotated in place);
        hits beyond the candidate window follow in their original order.
        When ``reranked`` is False, the input list is returned as is.
    """
    if not hits or not RERANKER_URL:
        return hits, False

    to_score = hits[:RERANK_CANDIDATES]
    docs = [_snippet_for_rerank(h.get("_source") or {}) for h in to_score]
    if not any((d or "").strip() for d in docs):
        # Every candidate is blank, so there is nothing to score.
        return hits, False

    req = urllib.request.Request(
        f"{RERANKER_URL.rstrip('/')}/rerank",
        data=json.dumps({"query": query, "documents": docs}).encode(),
        method="POST",
        headers={"Content-Type": "application/json"},
    )
    try:
        with urllib.request.urlopen(req, timeout=RERANK_TIMEOUT) as resp:
            data = json.load(resp)
        raw_scores = data.get("scores") or []
        if len(raw_scores) != len(to_score):
            log.warning(
                "rerank score count mismatch: %s vs %s",
                len(raw_scores),
                len(to_score),
            )
            return hits, False
        # Convert everything up front: sorting the raw JSON values would
        # order numeric strings lexicographically, and a conversion error
        # half-way through the annotation loop would leave some hits
        # tagged with _rerank_score although reranking is reported failed.
        scores = [float(sc) for sc in raw_scores]
    except Exception as e:
        # Deliberately broad: network, HTTP, JSON and type errors all mean
        # "fall back to the original order".
        log.warning("rerank failed: %s", e)
        return hits, False

    if any(sc != sc for sc in scores):
        # NaN compares false with everything, so the sort order would be
        # arbitrary; keep the original ranking instead.
        log.warning("rerank failed: %s", "NaN score in response")
        return hits, False

    # sorted() is stable, so equal scores keep their original relative order.
    order = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)
    reordered: list = []
    for idx in order:
        hit = to_score[idx]
        hit["_rerank_score"] = scores[idx]
        reordered.append(hit)
    return reordered + hits[RERANK_CANDIDATES:], True
|
||||
|
||||
|
||||
def _is_wide_recall_query(q: str) -> bool:
|
||||
"""Übersichts-/Listen-/Kostenfragen: mehrfach suchen und mergen."""
|
||||
ql = (q or "").lower()
|
||||
|
|
@ -261,18 +312,7 @@ def _is_wide_recall_query(q: str) -> bool:
|
|||
"jaehrlich",
|
||||
)
|
||||
)
|
||||
if costish and broad:
|
||||
return True
|
||||
|
||||
# Immobilien / Wohnungen / Kambodscha
|
||||
if any(x in ql for x in ("wohnung", "immobilie", "condo", "apartment", "grundstück", "grundstueck")):
|
||||
if any(x in ql for x in ("welche", "alle", "liste", "habe ich", "übersicht", "uebersicht", "wie viele")):
|
||||
return True
|
||||
if any(x in ql for x in ("kambodscha", "cambodia", "takeo", "phnom", "sihanouk")):
|
||||
if any(x in ql for x in ("welche", "alle", "wohnung", "immobilie", "haus", "condo", "apartment", "mietvertrag")):
|
||||
return True
|
||||
|
||||
return False
|
||||
return costish and broad
|
||||
|
||||
|
||||
# Zusatzanfragen decken Sparten + Gesellschaften ab (Recall)
|
||||
|
|
@ -296,23 +336,6 @@ _WIDE_SUBQUERIES = [
|
|||
"Kfz Versicherungsschein Beitrag jährlich",
|
||||
]
|
||||
|
||||
_WIDE_SUBQUERIES_IMMOBILIEN = [
|
||||
"Arakawa Wohnung Mietvertrag",
|
||||
"Arakawa Wohnung D1603",
|
||||
"Arakawa Wohnung G2010",
|
||||
"Arakawa Wohnung-2",
|
||||
"Kambodscha Arakawa Kaufvertrag",
|
||||
"Kambodscha Arakawa Vollmacht",
|
||||
"Kambodscha Arakawa Überweisung",
|
||||
"Wohnung Mietvertrag Kambodscha",
|
||||
"Condo Apartment Cambodia",
|
||||
"Hard Title Wohnung",
|
||||
"Wohnungen Kurtzübersicht",
|
||||
"Mietvertrag Ramirez Antonio",
|
||||
"Mietvertrag Cheng Qiu",
|
||||
"Kambodscha Rechnungen Strom Miete",
|
||||
]
|
||||
|
||||
|
||||
def _merge_hits_from_queries(
|
||||
queries: list[str],
|
||||
|
|
@ -364,16 +387,9 @@ def handle_rag_search(query: str, top_k: int = 8, **kw):
|
|||
es_size = min(ES_SIZE_CAP, max(top_k * 10, 70))
|
||||
|
||||
if wide:
|
||||
ql = qstrip.lower()
|
||||
_immo_wide = any(
|
||||
x in ql
|
||||
for x in ("wohnung", "immobilie", "condo", "apartment", "grundstück", "grundstueck",
|
||||
"kambodscha", "cambodia", "arakawa")
|
||||
)
|
||||
_sq_pool = _WIDE_SUBQUERIES_IMMOBILIEN if _immo_wide else _WIDE_SUBQUERIES
|
||||
subqs = [qstrip]
|
||||
for sq in _sq_pool:
|
||||
if sq.lower() not in ql:
|
||||
for sq in _WIDE_SUBQUERIES:
|
||||
if sq.lower() not in qstrip.lower():
|
||||
subqs.append(sq)
|
||||
pool_cap = max(top_k * 5, 120)
|
||||
hits, err = _merge_hits_from_queries(
|
||||
|
|
@ -400,6 +416,8 @@ def handle_rag_search(query: str, top_k: int = 8, **kw):
|
|||
if not hits:
|
||||
return f"Keine Ergebnisse fuer '{qstrip}' in der Wissensbasis gefunden."
|
||||
|
||||
hits, reranked = _rerank_hits(qstrip, hits)
|
||||
|
||||
seen_docs: set[str] = set()
|
||||
lines: list[str] = []
|
||||
count = 0
|
||||
|
|
@ -417,7 +435,12 @@ def handle_rag_search(query: str, top_k: int = 8, **kw):
|
|||
continue
|
||||
seen_docs.add(dk)
|
||||
|
||||
score = h.get("_score") or 0.0
|
||||
if "_rerank_score" in h:
|
||||
score = float(h["_rerank_score"])
|
||||
score_label = "Rerank"
|
||||
else:
|
||||
score = float(h.get("_score") or 0.0)
|
||||
score_label = "ES"
|
||||
raw = src.get("content_with_weight") or src.get("content_de") or ""
|
||||
content = raw[:snip_len].strip()
|
||||
ocr = _ocr_note(raw)
|
||||
|
|
@ -425,7 +448,9 @@ def handle_rag_search(query: str, top_k: int = 8, **kw):
|
|||
filename = doc_name.rsplit("__", 1)[-1] if "__" in doc_name else doc_name
|
||||
folder_line = f" Ordner: {folder}" if folder else ""
|
||||
|
||||
lines.append(f"---\n**{count + 1}. {filename}** (Score: {score:.1f}){ocr}")
|
||||
lines.append(
|
||||
f"---\n**{count + 1}. {filename}** ({score_label}: {score:.3f}){ocr}"
|
||||
)
|
||||
if folder_line:
|
||||
lines.append(folder_line)
|
||||
if content:
|
||||
|
|
@ -435,7 +460,10 @@ def handle_rag_search(query: str, top_k: int = 8, **kw):
|
|||
if count == 0:
|
||||
return f"Keine Dokumente fuer '{qstrip}' gefunden."
|
||||
|
||||
lines.insert(0, header)
|
||||
hdr = header.rstrip() + (
|
||||
" _(Cross-Encoder reranked)_" if reranked else ""
|
||||
) + "\n"
|
||||
lines.insert(0, hdr)
|
||||
tail = (
|
||||
"\n---\n(Ende der Ergebnisse. Nur diese Dokumente in dieser Runde. "
|
||||
+ (
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue