fix: Google Flights scraper — find_elements ohne timeout, robustere Formular-Logik
Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
parent
3e6c1011dd
commit
c8dcce6c17
1 changed file with 109 additions and 79 deletions
|
|
@@ -158,110 +158,116 @@ def _consent_google(sb):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def _gf_fill_field(sb, selectors, text, field_name):
    """Fill a Google Flights text field and click the first autocomplete hit.

    The CSS selectors are tried in order. The first one whose element can be
    located, cleared and typed into wins; afterwards the first
    ``[role="option"]`` element on the page is clicked if one shows up.

    Args:
        sb: Browser driver object. Only ``find_element(selector, timeout=...)``
            and ``sleep(seconds)`` are used here; the returned element must
            offer ``clear()``, ``send_keys()`` and ``click()``.
        selectors: Iterable of CSS selectors for the input, tried in order.
        text: Text to type into the field. An empty value is rejected.
        field_name: Human-readable field label, used only in log output.

    Returns:
        True once the text was typed into a field (even when no autocomplete
        suggestion could be clicked; that case is logged), False when
        ``text`` is empty or no selector yielded a usable field.
    """
    # Typing nothing and then clicking the first suggestion would pick an
    # arbitrary entry, so refuse empty input up front.
    if not text:
        print(f"[GF] {field_name} fehlgeschlagen — kein Text angegeben")
        return False

    for sel in selectors:
        try:
            field = sb.find_element(sel, timeout=3)
            field.clear()
            sb.sleep(0.3)
            field.send_keys(text)
            # Give the autocomplete dropdown time to render.
            sb.sleep(2)
        except Exception as exc:
            # Best effort: a missing or non-interactable field is expected,
            # but say which selector failed instead of hiding it.
            print(f"[GF] {field_name}: Selektor {sel!r} übersprungen "
                  f"({type(exc).__name__})")
            continue

        # Click the first autocomplete suggestion; the typed text stays in
        # the field even if no suggestion appears.
        try:
            sb.find_element('[role="option"]', timeout=3).click()
            sb.sleep(0.8)
        except Exception as exc:
            print(f"[GF] {field_name}: kein Autocomplete-Vorschlag angeklickt "
                  f"({type(exc).__name__})")

        print(f"[GF] {field_name} gesetzt: {text}")
        return True

    print(f"[GF] {field_name} fehlgeschlagen — kein Feld gefunden")
    return False
|
||||||
|
|
||||||
|
|
||||||
def scrape_google_flights(von, nach, tage=30, aufenthalt_tage=60,
|
def scrape_google_flights(von, nach, tage=30, aufenthalt_tage=60,
|
||||||
trip_type="roundtrip", kabine="premium_economy",
|
trip_type="roundtrip", kabine="premium_economy",
|
||||||
gepaeck="1koffer+handgepaeck", airline_filter="",
|
gepaeck="1koffer+handgepaeck", airline_filter="",
|
||||||
layover_min=120, layover_max=300,
|
layover_min=120, layover_max=300,
|
||||||
max_flugzeit_h=22, max_stops=2):
|
max_flugzeit_h=22, max_stops=2):
|
||||||
abflug = (datetime.now() + timedelta(days=tage)).strftime("%Y-%m-%d")
|
abflug = (datetime.now() + timedelta(days=tage)).strftime("%Y-%m-%d")
|
||||||
abflug_de = (datetime.now() + timedelta(days=tage)).strftime("%d.%m.%Y")
|
abflug_de = (datetime.now() + timedelta(days=tage)).strftime("%d.%m.%Y")
|
||||||
rueck = (datetime.now() + timedelta(days=tage + aufenthalt_tage)).strftime("%Y-%m-%d") if trip_type == "roundtrip" else ""
|
rueck = (datetime.now() + timedelta(days=tage + aufenthalt_tage)).strftime("%Y-%m-%d") \
|
||||||
rueck_de = (datetime.now() + timedelta(days=tage + aufenthalt_tage)).strftime("%d.%m.%Y") if trip_type == "roundtrip" else ""
|
if trip_type == "roundtrip" else ""
|
||||||
kc = KABINE_GOOGLE.get(kabine, "w")
|
kc = KABINE_GOOGLE.get(kabine, "w")
|
||||||
booking_url = _booking_url_google(von, nach, abflug, rueck, kc)
|
booking_url = _booking_url_google(von, nach, abflug, rueck, kc)
|
||||||
|
|
||||||
# Stadtname-Mapping für die Suchfelder
|
|
||||||
stadtname = {"FRA": "Frankfurt", "HAN": "Hanoi", "KTI": "Phnom Penh",
|
stadtname = {"FRA": "Frankfurt", "HAN": "Hanoi", "KTI": "Phnom Penh",
|
||||||
"PNH": "Phnom Penh", "BKK": "Bangkok", "SGN": "Ho Chi Minh"}
|
"PNH": "Phnom Penh", "BKK": "Bangkok", "SGN": "Ho Chi Minh City"}
|
||||||
von_name = stadtname.get(von, von)
|
von_name = stadtname.get(von, von)
|
||||||
nach_name = stadtname.get(nach, nach)
|
nach_name = stadtname.get(nach, nach)
|
||||||
|
results = []
|
||||||
|
screenshot_b64 = ""
|
||||||
|
|
||||||
print(f"[GF] Formular-Suche: {von_name}→{nach_name} {abflug_de}")
|
print(f"[GF] Suche: {von_name}→{nach_name} {abflug_de}")
|
||||||
results = []
|
|
||||||
|
|
||||||
with SB(uc=True, headless=True, chromium_arg="--no-sandbox --disable-dev-shm-usage") as sb:
|
with SB(uc=True, headless=True, chromium_arg="--no-sandbox --disable-dev-shm-usage") as sb:
|
||||||
# 1. Startseite laden
|
|
||||||
sb.open("https://www.google.com/travel/flights?hl=de&curr=EUR")
|
sb.open("https://www.google.com/travel/flights?hl=de&curr=EUR")
|
||||||
sb.sleep(6)
|
sb.sleep(5)
|
||||||
|
_consent_google(sb)
|
||||||
|
sb.sleep(2)
|
||||||
|
|
||||||
if _consent_google(sb):
|
# ── 1. Kabine auf "Premium Economy" setzen ──────────────────────────
|
||||||
sb.sleep(4)
|
|
||||||
|
|
||||||
# 2. Kabinen-Typ setzen (Roundtrip / Economy dropdown)
|
|
||||||
try:
|
try:
|
||||||
# Kabinen-Auswahl: nach "Premium Economy" suchen
|
# VfPpkd-Buttons: [0]=Hin+Rück [1]=Economy(Klasse)
|
||||||
kab_label = {"e": "Economy", "w": "Premium Economy", "b": "Business", "f": "First"}
|
btns = sb.find_elements('button[class*="VfPpkd"]')
|
||||||
print(f"[GF] Versuche Kabine zu setzen: {kab_label.get(kc, 'Premium Economy')}")
|
if len(btns) >= 2:
|
||||||
# Kabinen-Dropdown öffnen (2. Dropdown in der Suchleiste)
|
btns[1].click()
|
||||||
kab_btns = sb.find_elements('[class*="cabin"] button, [aria-label*="abine"], [data-value*="cabin"]', timeout=3)
|
|
||||||
if not kab_btns:
|
|
||||||
kab_btns = sb.find_elements('button[class*="VfPpkd"]', timeout=2)
|
|
||||||
if kab_btns:
|
|
||||||
kab_btns[1].click()
|
|
||||||
sb.sleep(1)
|
sb.sleep(1)
|
||||||
|
# Option "Premium Economy" im Dropdown auswählen
|
||||||
|
for opt_sel in ['[data-value="2"]',
|
||||||
|
'li[class*="premium"]',
|
||||||
|
'[role="option"]:nth-child(3)']:
|
||||||
|
try:
|
||||||
|
sb.find_element(opt_sel, timeout=2).click()
|
||||||
|
sb.sleep(0.5)
|
||||||
|
print(f"[GF] Kabine gesetzt via {opt_sel}")
|
||||||
|
break
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"[GF] Kabinen-Klick fehlgeschlagen: {e}")
|
print(f"[GF] Kabine: {e}")
|
||||||
|
|
||||||
# 3. Abflugort eingeben
|
# ── 2. Von-Feld befüllen ────────────────────────────────────────────
|
||||||
try:
|
_gf_fill_field(sb, [
|
||||||
von_field = None
|
'input[aria-label*="Von"]',
|
||||||
for sel in ['input[placeholder*="Von"], input[aria-label*="Von"], input[aria-label*="Abflug"], '
|
'input[aria-label*="Abflugort"]',
|
||||||
'input[placeholder*="Where from"]']:
|
'input[placeholder*="Von"]',
|
||||||
elems = sb.find_elements(sel, timeout=2)
|
'input[aria-label*="Where from"]',
|
||||||
if elems:
|
], von_name, "Von")
|
||||||
von_field = elems[0]
|
|
||||||
break
|
|
||||||
if not von_field:
|
|
||||||
# Fallback: erster Input im Suchformular
|
|
||||||
von_field = sb.find_element('div[class*="target"] input', timeout=3)
|
|
||||||
|
|
||||||
von_field.clear()
|
# ── 3. Nach-Feld befüllen ───────────────────────────────────────────
|
||||||
von_field.send_keys(von_name)
|
_gf_fill_field(sb, [
|
||||||
sb.sleep(2)
|
'input[aria-label*="Wohin"]',
|
||||||
# Ersten Vorschlag auswählen
|
'input[aria-label*="Zielort"]',
|
||||||
sb.find_element('[role="option"]', timeout=3).click()
|
'input[placeholder*="Wohin"]',
|
||||||
sb.sleep(1)
|
'input[aria-label*="Where to"]',
|
||||||
print(f"[GF] Abflugort gesetzt: {von_name}")
|
], nach_name, "Nach")
|
||||||
except Exception as e:
|
|
||||||
print(f"[GF] Abflugort-Eingabe fehlgeschlagen: {e}")
|
|
||||||
|
|
||||||
# 4. Zielort eingeben
|
# ── 4. Suchen-Button klicken ────────────────────────────────────────
|
||||||
try:
|
gesucht = False
|
||||||
nach_field = sb.find_element(
|
for sel in ['button[aria-label*="Suchen"]', 'button[aria-label*="Search"]',
|
||||||
'input[placeholder*="Wohin"], input[aria-label*="Ziel"], input[aria-label*="Where to"]',
|
'button[jsname="vLv7Lb"]', 'button[type="submit"]']:
|
||||||
timeout=3)
|
try:
|
||||||
nach_field.clear()
|
sb.find_element(sel, timeout=2).click()
|
||||||
nach_field.send_keys(nach_name)
|
print(f"[GF] Suche gestartet via {sel}")
|
||||||
sb.sleep(2)
|
gesucht = True
|
||||||
sb.find_element('[role="option"]', timeout=3).click()
|
break
|
||||||
sb.sleep(1)
|
except Exception:
|
||||||
print(f"[GF] Zielort gesetzt: {nach_name}")
|
continue
|
||||||
except Exception as e:
|
|
||||||
print(f"[GF] Zielort-Eingabe fehlgeschlagen: {e}")
|
|
||||||
|
|
||||||
# 5. Suchen-Button klicken
|
sb.sleep(14)
|
||||||
try:
|
|
||||||
for sel in ['button[aria-label*="Suchen"], button[aria-label*="Search"]',
|
|
||||||
'button[jsname="vLv7Lb"]', 'button[class*="search"]']:
|
|
||||||
btns = sb.find_elements(sel, timeout=2)
|
|
||||||
if btns:
|
|
||||||
btns[0].click()
|
|
||||||
print(f"[GF] Suche gestartet via {sel}")
|
|
||||||
break
|
|
||||||
except Exception as e:
|
|
||||||
print(f"[GF] Such-Button fehlgeschlagen: {e}")
|
|
||||||
|
|
||||||
sb.sleep(12)
|
|
||||||
title = sb.get_title()
|
title = sb.get_title()
|
||||||
body = sb.get_text("body")
|
body = sb.get_text("body")
|
||||||
print(f"[GF] Title: {title[:80]} | Body: {len(body)} chars")
|
print(f"[GF] Title: {title[:80]} | Body: {len(body)} chars | Suche-OK: {gesucht}")
|
||||||
|
|
||||||
# 6. Preise extrahieren
|
# ── 5. Preise extrahieren ───────────────────────────────────────────
|
||||||
|
# a) aria-label Elemente
|
||||||
try:
|
try:
|
||||||
elems = sb.find_elements('[aria-label*="€"], [aria-label*="EUR"]', timeout=3)
|
for elem in sb.find_elements('[aria-label*="€"], [aria-label*="EUR"]')[:30]:
|
||||||
for elem in elems[:20]:
|
lbl = elem.get_attribute("aria-label") or elem.text
|
||||||
label = elem.get_attribute('aria-label') or elem.text
|
p = _parse_preis(lbl)
|
||||||
p = _parse_preis(label)
|
|
||||||
if p and p > 400:
|
if p and p > 400:
|
||||||
results.append({"scanner": "google_flights", "preis": p,
|
results.append({"scanner": "google_flights", "preis": p,
|
||||||
"waehrung": "EUR", "airline": "",
|
"waehrung": "EUR", "airline": "",
|
||||||
|
|
@@ -270,15 +276,39 @@ def scrape_google_flights(von, nach, tage=30, aufenthalt_tage=60,
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
# Fallback: Regex über Body (nur plausible Preise > 400 EUR)
|
# b) sichtbare Preistexte in Ergebnisliste
|
||||||
|
if not results:
|
||||||
|
for sel in ['.YMlIz', '.FpEdX', '[class*="price"]', 'span[class*="preis"]']:
|
||||||
|
try:
|
||||||
|
for elem in sb.find_elements(sel)[:20]:
|
||||||
|
p = _parse_preis(elem.text)
|
||||||
|
if p and p > 400:
|
||||||
|
results.append({"scanner": "google_flights", "preis": p,
|
||||||
|
"waehrung": "EUR", "airline": "",
|
||||||
|
"abflug": abflug, "ankunft": rueck,
|
||||||
|
"booking_url": booking_url})
|
||||||
|
if results:
|
||||||
|
break
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# c) Body-Regex Fallback
|
||||||
if not results:
|
if not results:
|
||||||
for r in _preise_aus_body(body, "google_flights", abflug):
|
for r in _preise_aus_body(body, "google_flights", abflug):
|
||||||
if r["preis"] > 400:
|
if r["preis"] > 400:
|
||||||
r["ankunft"] = rueck
|
r["ankunft"] = rueck
|
||||||
r["booking_url"] = booking_url
|
r["booking_url"] = booking_url
|
||||||
results.append(r)
|
results.append(r)
|
||||||
|
|
||||||
results = [r for r in results if r["preis"] > 400]
|
results = [r for r in results if r["preis"] > 400]
|
||||||
|
seen = set()
|
||||||
|
dedup = []
|
||||||
|
for r in results:
|
||||||
|
if r["preis"] not in seen:
|
||||||
|
seen.add(r["preis"])
|
||||||
|
dedup.append(r)
|
||||||
|
results = dedup
|
||||||
|
|
||||||
print(f"[GF] Ergebnis: {[r['preis'] for r in results[:5]]}")
|
print(f"[GF] Ergebnis: {[r['preis'] for r in results[:5]]}")
|
||||||
screenshot_b64 = _take_screenshot(sb)
|
screenshot_b64 = _take_screenshot(sb)
|
||||||
return results[:10], screenshot_b64
|
return results[:10], screenshot_b64
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue