fix: scrape_trip gepaeck param, Google Flights Homepage-Detection, Kayak Bags-Filter Fallback
Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
parent
838e8f4af8
commit
b70dbbcd13
1 changed file with 33 additions and 9 deletions
|
|
@ -19,10 +19,11 @@ def scrape(scanner, von, nach, tage=30, aufenthalt_tage=60,
|
||||||
|
|
||||||
|
|
||||||
def _booking_url_google(von, nach, abflug, rueck, kc):
|
def _booking_url_google(von, nach, abflug, rueck, kc):
|
||||||
|
# Hinweis: Hash-Fragment (#flt=…) wird von headless Chrome ignoriert. TODO: auf tfs-Parameter umstellen (die URLs unten nutzen weiterhin #flt=)
|
||||||
if rueck:
|
if rueck:
|
||||||
return (f"https://www.google.com/travel/flights/search?hl=de&curr=EUR"
|
return (f"https://www.google.com/travel/flights?hl=de&curr=EUR"
|
||||||
f"#flt={von}.{nach}.{abflug}*{nach}.{von}.{rueck};c:EUR;e:1;sd:1;t:r;sc:{kc}")
|
f"#flt={von}.{nach}.{abflug}*{nach}.{von}.{rueck};c:EUR;e:1;sd:1;t:r;sc:{kc}")
|
||||||
return (f"https://www.google.com/travel/flights/search?hl=de&curr=EUR"
|
return (f"https://www.google.com/travel/flights?hl=de&curr=EUR"
|
||||||
f"#flt={von}.{nach}.{abflug};c:EUR;e:1;sd:1;t:f;sc:{kc}")
|
f"#flt={von}.{nach}.{abflug};c:EUR;e:1;sd:1;t:f;sc:{kc}")
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -115,15 +116,23 @@ def scrape_google_flights(von, nach, tage=30, aufenthalt_tage=60,
|
||||||
|
|
||||||
with SB(uc=True, headless=True, chromium_arg="--no-sandbox --disable-dev-shm-usage") as sb:
|
with SB(uc=True, headless=True, chromium_arg="--no-sandbox --disable-dev-shm-usage") as sb:
|
||||||
sb.open(booking_url)
|
sb.open(booking_url)
|
||||||
sb.sleep(7)
|
sb.sleep(8)
|
||||||
|
|
||||||
if _consent_google(sb):
|
if _consent_google(sb):
|
||||||
sb.open(booking_url)
|
sb.open(booking_url)
|
||||||
sb.sleep(8)
|
sb.sleep(10)
|
||||||
|
|
||||||
title = sb.get_title()
|
title = sb.get_title()
|
||||||
print(f"[GF] Title: {title[:80]}")
|
print(f"[GF] Title: {title[:80]}")
|
||||||
|
|
||||||
|
# Wenn Homepage geladen statt Suchergebnisse: JS-Navigation erzwingen
|
||||||
|
if "weltweit" in title or title.strip() == "" or "Google" == title.strip():
|
||||||
|
print("[GF] Homepage erkannt — erzwinge Navigation via JS")
|
||||||
|
sb.execute_script(f"window.location.href = '{booking_url}';")
|
||||||
|
sb.sleep(10)
|
||||||
|
title = sb.get_title()
|
||||||
|
print(f"[GF] Title nach JS-Nav: {title[:80]}")
|
||||||
|
|
||||||
body = sb.get_text("body")
|
body = sb.get_text("body")
|
||||||
print(f"[GF] Body-Länge: {len(body)}")
|
print(f"[GF] Body-Länge: {len(body)}")
|
||||||
|
|
||||||
|
|
@ -156,9 +165,11 @@ def scrape_kayak(von, nach, tage=30, aufenthalt_tage=60,
|
||||||
abflug = (datetime.now() + timedelta(days=tage)).strftime("%Y-%m-%d")
|
abflug = (datetime.now() + timedelta(days=tage)).strftime("%Y-%m-%d")
|
||||||
rueck = (datetime.now() + timedelta(days=tage + aufenthalt_tage)).strftime("%Y-%m-%d") if trip_type == "roundtrip" else ""
|
rueck = (datetime.now() + timedelta(days=tage + aufenthalt_tage)).strftime("%Y-%m-%d") if trip_type == "roundtrip" else ""
|
||||||
kc = KABINE_KAYAK.get(kabine, "w")
|
kc = KABINE_KAYAK.get(kabine, "w")
|
||||||
# Gepäck-Filter: 1 = mindestens 1 Aufgabekoffer inklusive
|
# Bags-Filter nur wenn explizit Koffer verlangt
|
||||||
|
# Hinweis: fs=bfc%3D1 filtert Kayak auf "1 Freigepäck inkl."
|
||||||
bags = 1 if "koffer" in gepaeck else 0
|
bags = 1 if "koffer" in gepaeck else 0
|
||||||
booking_url = _booking_url_kayak(von, nach, abflug, rueck, kc, bags)
|
booking_url = _booking_url_kayak(von, nach, abflug, rueck, kc, bags)
|
||||||
|
booking_url_raw = _booking_url_kayak(von, nach, abflug, rueck, kc, 0) # ohne Filter für Fallback
|
||||||
|
|
||||||
print(f"[KY] URL: {booking_url}")
|
print(f"[KY] URL: {booking_url}")
|
||||||
results = []
|
results = []
|
||||||
|
|
@ -194,12 +205,24 @@ def scrape_kayak(von, nach, tage=30, aufenthalt_tage=60,
|
||||||
r["booking_url"] = booking_url
|
r["booking_url"] = booking_url
|
||||||
results.append(r)
|
results.append(r)
|
||||||
|
|
||||||
|
# Kein Ergebnis mit Bags-Filter → Fallback ohne Filter (zeigt was verfügbar ist)
|
||||||
|
if not results and bags > 0:
|
||||||
|
print(f"[KY] Kein Ergebnis mit Bags-Filter — Fallback ohne Filter")
|
||||||
|
sb.open(booking_url_raw)
|
||||||
|
sb.sleep(12)
|
||||||
|
body2 = sb.get_text("body")
|
||||||
|
for r in _preise_aus_body(body2, "kayak", abflug):
|
||||||
|
r["ankunft"] = rueck
|
||||||
|
r["booking_url"] = booking_url_raw
|
||||||
|
results.append(r)
|
||||||
|
|
||||||
print(f"[KY] Ergebnis: {[r['preis'] for r in results[:5]]}")
|
print(f"[KY] Ergebnis: {[r['preis'] for r in results[:5]]}")
|
||||||
return results[:10]
|
return results[:10]
|
||||||
|
|
||||||
|
|
||||||
def scrape_trip(von, nach, tage=30, aufenthalt_tage=60,
|
def scrape_trip(von, nach, tage=30, aufenthalt_tage=60,
|
||||||
trip_type="roundtrip", kabine="premium_economy"):
|
trip_type="roundtrip", kabine="premium_economy",
|
||||||
|
gepaeck="1koffer+handgepaeck"):
|
||||||
abflug_fmt = (datetime.now() + timedelta(days=tage)).strftime("%Y%m%d")
|
abflug_fmt = (datetime.now() + timedelta(days=tage)).strftime("%Y%m%d")
|
||||||
rueck_fmt = (datetime.now() + timedelta(days=tage + aufenthalt_tage)).strftime("%Y%m%d") if trip_type == "roundtrip" else ""
|
rueck_fmt = (datetime.now() + timedelta(days=tage + aufenthalt_tage)).strftime("%Y%m%d") if trip_type == "roundtrip" else ""
|
||||||
abflug_iso = (datetime.now() + timedelta(days=tage)).strftime("%Y-%m-%d")
|
abflug_iso = (datetime.now() + timedelta(days=tage)).strftime("%Y-%m-%d")
|
||||||
|
|
@ -259,8 +282,9 @@ def scrape_trip(von, nach, tage=30, aufenthalt_tage=60,
|
||||||
return results[:10]
|
return results[:10]
|
||||||
|
|
||||||
|
|
||||||
def scrape_skyscanner(von, nach, tage=30, aufenthalt_tage=14,
|
def scrape_skyscanner(von, nach, tage=30, aufenthalt_tage=60,
|
||||||
trip_type="roundtrip", kabine="premium_economy"):
|
trip_type="roundtrip", kabine="premium_economy",
|
||||||
|
gepaeck="1koffer+handgepaeck"):
|
||||||
"""Skyscanner hat starken Bot-Schutz — übersprungen."""
|
"""Skyscanner hat starken Bot-Schutz — übersprungen."""
|
||||||
print("[SS] Skyscanner übersprungen (Bot-Detection)")
|
print("[SS] Skyscanner übersprungen (Bot-Detection)")
|
||||||
return []
|
return []
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue