fix: KAYAK/Momondo Sidebar-Preise herausfiltern
Der Scraper hat bisher Preise aus der Airline-Filter-Sidebar (z. B. Air China 714 EUR) als Flugergebnisse gespeichert. Fix: Header-Preis als Anker holen und Preise unter 80 % des Ankerwerts als Sidebar-Artefakte verwerfen.
This commit is contained in:
parent
aed3067fbc
commit
e0578c0ec2
1 changed files with 54 additions and 0 deletions
|
|
@ -329,6 +329,54 @@ def _preise_aus_body(body, scanner, abflug):
|
|||
return results[:10]
|
||||
|
||||
|
||||
|
||||
def _kayak_header_preis(sb) -> float | None:
|
||||
"""Liest den 'Günstigste Option' Preis aus dem KAYAK-Summary-Header.
|
||||
Dieser Wert ist der zuverlässigste Anker — kommt direkt aus den Suchergebnissen."""
|
||||
try:
|
||||
# JavaScript: suche die summary-bar Elemente
|
||||
price = sb.driver.execute_script("""
|
||||
// KAYAK zeigt "Günstigste Option" + Preis in einem summary-container
|
||||
var containers = document.querySelectorAll('[class*="rec-col"], [class*="recommended"], [class*="summary"], [class*="option-header"]');
|
||||
for (var c of containers) {
|
||||
var txt = c.innerText || '';
|
||||
var m = txt.match(/(\d[\d.]{1,6})\s?€|€\s?(\d[\d.]{1,6})/);
|
||||
if (m) {
|
||||
var raw = (m[1] || m[2]).replace('.','').replace(',','.');
|
||||
var v = parseFloat(raw);
|
||||
if (v > 300 && v < 5000) return v;
|
||||
}
|
||||
}
|
||||
// Fallback: suche im Seitentitel / h1
|
||||
var h = document.querySelector('h1, [class*="title"]');
|
||||
if (h) {
|
||||
var m2 = (h.innerText||'').match(/(\d[\d.]{2,6})\s?€/);
|
||||
if (m2) return parseFloat(m2[1].replace('.',''));
|
||||
}
|
||||
return null;
|
||||
""")
|
||||
if price:
|
||||
print(f"[KY] Header-Preis: {price} EUR")
|
||||
return float(price)
|
||||
except Exception as e:
|
||||
print(f"[KY] Header-Preis Fehler: {e}")
|
||||
return None
|
||||
|
||||
|
||||
def _filter_sidebar_preise(results: list, anker: float | None, scanner: str) -> list:
|
||||
"""Filtert Sidebar-Preise (Airline-Filter, Preisslider) heraus.
|
||||
Behalte nur Preise die >= 80% des Anker-Preises sind (Sidebar-Preise sind viel günstiger)."""
|
||||
if not anker or not results:
|
||||
return results
|
||||
min_valid = anker * 0.80
|
||||
filtered = [r for r in results if r["preis"] >= min_valid]
|
||||
removed = len(results) - len(filtered)
|
||||
if removed:
|
||||
print(f"[{scanner}] {removed} Sidebar-Preise entfernt (unter {min_valid:.0f} EUR)")
|
||||
return filtered if filtered else results # Fallback: alle behalten wenn alle rausgefiltert
|
||||
|
||||
|
||||
|
||||
def _consent_google(sb):
|
||||
"""Google Consent-Seite (DSGVO) behandeln."""
|
||||
if "consent" in sb.get_current_url() or "Bevor Sie" in sb.get_title():
|
||||
|
|
@ -678,6 +726,9 @@ def scrape_kayak(von, nach, tage=30, aufenthalt_tage=60,
|
|||
if not pe_confirmed:
|
||||
print(f"[KY{airline_label}] WARNUNG: Premium Economy nicht auf Seite bestätigt!")
|
||||
|
||||
# Sidebar-Preise herausfiltern: Header-Preis als Ankerwert holen
|
||||
anker = _kayak_header_preis(sb)
|
||||
results = _filter_sidebar_preise(results, anker, f"kayak{airline_label}")
|
||||
results = _validate_results(results, f"kayak{airline_label}", kabine)
|
||||
print(f"[KY{airline_label}] Ergebnis: {[r['preis'] for r in results[:5]]}")
|
||||
_dismiss_cookie_banner(sb)
|
||||
|
|
@ -925,6 +976,9 @@ def scrape_momondo(von, nach, tage=30, aufenthalt_tage=60,
|
|||
if not pe_confirmed:
|
||||
print(f"[MO{airline_label}] WARNUNG: Premium Economy nicht auf Seite bestätigt!")
|
||||
|
||||
# Sidebar-Preise herausfiltern
|
||||
anker_mo = _kayak_header_preis(sb) # Momondo hat gleiches Layout wie Kayak
|
||||
results = _filter_sidebar_preise(results, anker_mo, f"momondo{airline_label}")
|
||||
results = _validate_results(results, f"momondo{airline_label}", kabine)
|
||||
print(f"[MO{airline_label}] Ergebnis: {[r['preis'] for r in results[:5]]}")
|
||||
_dismiss_cookie_banner(sb)
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue