def _validate_results(results, scanner_name, kabine="economy"):
    """Quality control: drop results whose price is implausible for the cabin.

    Args:
        results: List of result dicts; the price is read from key ``"preis"``.
        scanner_name: Label used only in the log line.
        kabine: ``"economy"`` or ``"premium_economy"`` select the matching
            module-level MIN/MAX roundtrip thresholds. Any other value
            disables filtering.

    Returns:
        A new list with the plausible entries, or ``results`` itself
        (unchanged) when no price band exists for ``kabine``.
    """
    bounds_by_cabin = {
        "economy": (MIN_PREIS_ECONOMY_ROUNDTRIP, MAX_PREIS_ECONOMY_ROUNDTRIP),
        "premium_economy": (MIN_PREIS_PE_ROUNDTRIP, MAX_PREIS_PE_ROUNDTRIP),
    }
    bounds = bounds_by_cabin.get(kabine)
    if bounds is None:
        return results
    lower, upper = bounds

    def _plausible(entry):
        # An entry without a numeric price cannot be verified, so it is
        # treated as implausible instead of raising KeyError/TypeError.
        preis = entry.get("preis")
        return isinstance(preis, (int, float)) and lower <= preis <= upper

    kept = [entry for entry in results if _plausible(entry)]
    dropped = len(results) - len(kept)
    if dropped:
        print(f"[QC/{scanner_name}] {dropped} Preise außerhalb "
              f"{lower}-{upper}€ entfernt")
    return kept
def _filter_roundtrip_only(results, min_aufenthalt=50, max_aufenthalt=95):
    """Keep only roundtrip results whose stay length lies in the allowed range.

    A result is kept when both ``"abflug"`` and ``"ankunft"`` are present,
    parse as ``%Y-%m-%d`` dates, and the number of days between them is
    within ``[min_aufenthalt, max_aufenthalt]``. One-way results (empty
    ``"ankunft"``) and entries with unparsable dates are dropped.

    Args:
        results: List of result dicts.
        min_aufenthalt: Minimum stay in days (default 50).
        max_aufenthalt: Maximum stay in days (default 95).

    Returns:
        A new list with the matching entries, in their original order.
    """
    kept = []
    for entry in results:
        abflug = entry.get("abflug", "")
        ankunft = entry.get("ankunft", "")
        if not abflug or not ankunft:
            continue
        # Compare parsed dates, not raw strings: a lexical comparison
        # misorders non-zero-padded values such as "2025-9-01" vs "2025-11-01".
        try:
            stay_days = (datetime.strptime(ankunft, "%Y-%m-%d")
                         - datetime.strptime(abflug, "%Y-%m-%d")).days
        except (ValueError, TypeError):
            continue
        if min_aufenthalt <= stay_days <= max_aufenthalt:
            kept.append(entry)
    dropped = len(results) - len(kept)
    if dropped:
        print(f"[QC] {dropped} Daten aussortiert (Aufenthalt außerhalb {min_aufenthalt}-{max_aufenthalt} Tage)")
    return kept
def _dismiss_cookie_banner(sb):
    """Click away a cookie/consent banner so screenshots are not obscured.

    The candidate selectors are tried in order, each with a 2-second click
    timeout. After the first successful click the selector is logged, the
    function waits 3 seconds and returns ``True``. A selector whose click
    (or the follow-up log/sleep) raises is skipped.

    Returns:
        ``True`` if one selector could be clicked, ``False`` otherwise.
    """
    candidates = (
        # Kayak/Momondo: "Alle akzeptieren" button (most common variant)
        '//button[contains(., "Alle akzeptieren")]',
        '//button[contains(., "Accept all")]',
        '.kayak-consent-button',
        '#cookie-accept',
        '[data-testid="cookie-banner"]',
        '#onetrust-accept-btn-handler',
        'button[class*="accept"]',
        'button[title*="akzeptieren"]',
        '.evidon-banner-acceptbutton',
        '.RxNS-button-content',
        'button[id*="accept"]',
        'button[aria-label*="Accept"]',
        '[aria-label*="Akzeptieren"]',
    )

    def _accepted(selector):
        # Best effort: any failure just means "try the next selector".
        try:
            sb.click(selector, timeout=2)
            print(f"[Cookie] Geklickt: {selector[:50]}")
            sb.sleep(3)
        except Exception:
            return False
        return True

    # any() stops at the first selector that worked.
    return any(_accepted(selector) for selector in candidates)
def _take_screenshot(sb, quality=55, width=1280, max_height=3000, scale=0.75):
    """Capture a page screenshot via CDP and return it as a base64 string.

    The primary path issues ``Page.captureScreenshot`` (JPEG, capturing
    beyond the viewport, clipped to ``width`` x ``max_height`` at ``scale``).
    If that raises or returns no data, the driver's
    ``get_screenshot_as_base64()`` is used as a fallback.
    NOTE(review): the fallback presumably yields a PNG of the viewport only,
    not a JPEG — confirm if callers rely on the image format.

    Args:
        sb: SeleniumBase instance exposing ``driver``.
        quality: JPEG quality passed to CDP (default 55).
        width: Clip width in CSS pixels (default 1280).
        max_height: Clip height in CSS pixels (default 3000).
        scale: Clip scale factor (default 0.75).

    Returns:
        The base64-encoded image, or ``""`` if both paths fail.
    """
    try:
        result = sb.driver.execute_cdp_cmd("Page.captureScreenshot", {
            "format": "jpeg",
            "quality": quality,
            "captureBeyondViewport": True,
            "clip": {"x": 0, "y": 0, "width": width,
                     "height": max_height, "scale": scale},
        })
        data = result.get("data", "")
        if data:
            print(f"[Screenshot] OK — {len(data)//1024} KB base64")
            return data
        # Empty payload: make the switch to the fallback visible in the log.
        print("[Screenshot] CDP lieferte keine Daten — Fallback")
    except Exception as e:
        print(f"[Screenshot] CDP-Fehler: {e}")
    try:
        return sb.driver.get_screenshot_as_base64() or ""
    except Exception as e:
        # Still best effort, but no longer silent.
        print(f"[Screenshot] Fallback-Fehler: {e}")
        return ""
def _booking_url_momondo(von, nach, abflug, rueck, kc, bags=1,
                         layover_min=120, layover_max=300, airline="",
                         max_flugzeit_h=22, max_stops=2):
    """Build a Momondo (momondo.de) flight-search URL.

    The ``fs`` filter list is a ``;``-separated set of ``key=value`` pairs
    that is percent-encoded as a whole:

    * ``bfc=<bags>``       — added when ``bags`` is truthy
    * ``ctr=<min>,<max>``  — layover range; added only when both are truthy
    * ``duration=-<min>``  — ``max_flugzeit_h`` converted to whole minutes
    * ``s=<max_stops>``    — added when ``max_stops`` is not None and < 10
    * ``airlines=<code>``  — added when ``airline`` is non-empty

    Args:
        von, nach: Origin / destination codes used in the URL path.
        abflug: Departure date string for the path.
        rueck: Return date string; empty for a one-way URL.
        kc: Cabin code for the ``cabin`` query parameter.

    Returns:
        The search URL, sorted by price, with ``currency=EUR``.
    """
    from urllib.parse import quote

    filters = []
    if bags:
        filters.append(f"bfc={bags}")
    if layover_min and layover_max:
        filters.append(f"ctr={layover_min},{layover_max}")
    if max_flugzeit_h:
        # int(): a float hour value must not produce "-1350.0" in the URL.
        filters.append(f"duration=-{int(max_flugzeit_h * 60)}")
    if max_stops is not None and max_stops < 10:
        filters.append(f"s={max_stops}")
    if airline:
        filters.append(f"airlines={airline}")

    # Encode the whole filter string at once ("=" -> %3D, ";" -> %3B, ...).
    fs = "&fs=" + quote(";".join(filters), safe="") if filters else ""

    path = f"https://www.momondo.de/flight-search/{von}-{nach}/{abflug}"
    if rueck:
        path = f"{path}/{rueck}"
    # "&currency=EUR": the source had the garbled "¤cy=EUR" here.
    return f"{path}?sort=price_a&cabin={kc}&currency=EUR{fs}"
# A number with optional thousands groups ("1.234", "12,345", or groups
# separated by a (narrow) no-break space) plus an optional 1–2 digit decimal
# part ("1.234,56").
_PREIS_ZAHL = (r'(\d{1,3}(?:[.,\u00a0\u202f]\d{3})+(?!\d)|\d+)'
               r'(?:[.,](\d{1,2})(?!\d))?')
# Either "<number> €/EUR" or "€/EUR <number>". The look-behind keeps the
# first form from matching the tail of a longer number.
_PREIS_RE = re.compile(
    r'(?<![\d.,])' + _PREIS_ZAHL + r'\s?(?:€|EUR)'
    r'|(?:€|EUR)\s?' + _PREIS_ZAHL
)


def _parse_preis(text):
    """Extract the first plausible EUR price from ``text``.

    Recognises amounts followed or preceded by ``€`` / ``EUR``, with ``.``,
    ``,`` or a (narrow) no-break space as thousands separator and an
    optional two-digit decimal part. Candidates are examined left to right;
    the first one with ``200 < value < 15000`` wins.

    Args:
        text: Text to scan; falsy input yields ``None``.

    Returns:
        The price as ``float`` rounded to two decimals, or ``None`` if no
        candidate lies in the plausible range.
    """
    if not text:
        return None
    for match in _PREIS_RE.finditer(text):
        # Groups 1/2 belong to the "number first" form, 3/4 to "symbol first".
        whole = match.group(1) or match.group(3)
        cents = match.group(2) or match.group(4) or "0"
        whole = re.sub(r'[.,\u00a0\u202f]', '', whole)
        value = float(f"{whole}.{cents}")
        if 200 < value < 15000:
            return round(value, 2)
    return None
+ """ + from selenium.webdriver.common.keys import Keys + for sel in selectors: + try: + field = sb.find_element(sel, timeout=3) + # Feld leeren via JS (robuster als .clear() bei React-Inputs) + sb.execute_script("arguments[0].value = '';", field) + field.click() + sb.sleep(0.3) + field.send_keys(text) + sb.sleep(2) + # Ersten Vorschlag per Tastatur auswählen (zuverlässiger als Klick) + field.send_keys(Keys.ARROW_DOWN) + sb.sleep(0.5) + field.send_keys(Keys.RETURN) + sb.sleep(1) + # Escape falls Dropdown noch offen + try: + field.send_keys(Keys.ESCAPE) + except Exception: + pass + print(f"[GF] {field_name} gesetzt: {text}") + return True + except Exception: + continue + print(f"[GF] {field_name} fehlgeschlagen — kein Feld gefunden") + return False + + +def scrape_google_flights(von, nach, tage=30, aufenthalt_tage=60, + trip_type="roundtrip", kabine="premium_economy", + gepaeck="1koffer+handgepaeck", airline_filter="", + layover_min=120, layover_max=300, + max_flugzeit_h=22, max_stops=2): + abflug = (datetime.now() + timedelta(days=tage)).strftime("%Y-%m-%d") + abflug_de = (datetime.now() + timedelta(days=tage)).strftime("%d.%m.%Y") + rueck = (datetime.now() + timedelta(days=tage + aufenthalt_tage)).strftime("%Y-%m-%d") \ + if trip_type == "roundtrip" else "" + kc = KABINE_GOOGLE.get(kabine, "w") + booking_url = _booking_url_google(von, nach, abflug, rueck, kc) + + stadtname = {"FRA": "Frankfurt", "HAN": "Hanoi", "KTI": "Phnom Penh", + "PNH": "Phnom Penh", "BKK": "Bangkok", "SGN": "Ho Chi Minh City"} + von_name = stadtname.get(von, von) + nach_name = stadtname.get(nach, nach) + results = [] + screenshot_b64 = "" + + print(f"[GF] Suche: {von_name}→{nach_name} {abflug_de}") + + with SB(uc=True, headless=True, chromium_arg="--no-sandbox --disable-dev-shm-usage") as sb: + # ── Strategie 1: Direkte URL mit Datums-Parametern ───────────────── + # Google Flights verarbeitet den Hash-Fragment erst nach JS-Ausführung + direct_url = ( + 
f"https://www.google.com/travel/flights?hl=de&curr=EUR" + f"#flt={von}.{nach}.{abflug}*{nach}.{von}.{rueck}" + f";c:EUR;e:1;sd:1;t:r;sc:w" + ) if rueck else ( + f"https://www.google.com/travel/flights?hl=de&curr=EUR" + f"#flt={von}.{nach}.{abflug};c:EUR;e:1;sd:1;t:f;sc:w" + ) + sb.open(direct_url) + sb.sleep(8) + _consent_google(sb) + sb.sleep(3) + title_direct = sb.get_title() + print(f"[GF] URL-Ansatz: {title_direct[:60]}") + + # Wenn direkte URL Ergebnisse liefert (Titel enthält Städtenamen) + url_erfolgreich = any(kw in title_direct for kw in + [von, nach, "FRA", "KTI", "Frankfurt", "Phnom", "Flüge"]) + if not url_erfolgreich: + # ── Strategie 2: Startseite + Formular befüllen ───────────────── + print("[GF] Direktlink kein Ergebnis — wechsle zu Formular-Ansatz") + sb.open("https://www.google.com/travel/flights?hl=de&curr=EUR") + sb.sleep(5) + _consent_google(sb) + sb.sleep(2) + + # ── 1. Kabine auf "Premium Economy" setzen ────────────────────────── + try: + # VfPpkd-Buttons: [0]=Hin+Rück [1]=Economy(Klasse) + btns = sb.find_elements('button[class*="VfPpkd"]') + if len(btns) >= 2: + btns[1].click() + sb.sleep(1) + # Option "Premium Economy" im Dropdown auswählen + for opt_sel in ['[data-value="2"]', + 'li[class*="premium"]', + '[role="option"]:nth-child(3)']: + try: + sb.find_element(opt_sel, timeout=2).click() + sb.sleep(0.5) + print(f"[GF] Kabine gesetzt via {opt_sel}") + break + except Exception: + pass + except Exception as e: + print(f"[GF] Kabine: {e}") + + # ── 2. Von-Feld befüllen ──────────────────────────────────────────── + _gf_fill_field(sb, [ + 'input[aria-label*="Von"]', + 'input[aria-label*="Abflugort"]', + 'input[placeholder*="Von"]', + 'input[aria-label*="Where from"]', + ], von_name, "Von") + sb.sleep(1.5) # Warten bis Von-Auswahl abgeschlossen + + # ── 3. 
Nach-Feld befüllen ─────────────────────────────────────────── + from selenium.webdriver.common.keys import Keys as _Keys + nach_gesetzt = False + + # Versuch 1: Explizite aria-label / role Selektoren + for nach_sel in [ + 'input[role="combobox"]', # Google nutzt combobox für Autocomplete + 'input[aria-label*="Wohin"]', + 'input[aria-label*="Zielort"]', + 'input[aria-label*="Ziel"]', + 'input[placeholder*="Wohin"]', + 'input[aria-label*="Where to"]', + 'input[aria-label*="Destination"]', + ]: + try: + # Wenn mehrere combobox-Inputs: zweiten nehmen (1. = Von, 2. = Nach) + elems = sb.find_elements(nach_sel) + field = elems[1] if len(elems) >= 2 else (elems[0] if elems else None) + if field and field != sb.driver.switch_to.active_element: + sb.execute_script("arguments[0].value = '';", field) + field.click() + sb.sleep(0.3) + field.send_keys(nach_name) + sb.sleep(2) + field.send_keys(_Keys.ARROW_DOWN) + sb.sleep(0.5) + field.send_keys(_Keys.RETURN) + sb.sleep(1) + print(f"[GF] Nach via {nach_sel}: {nach_name}") + nach_gesetzt = True + break + except Exception: + continue + + # Versuch 2: JS — zweites Input-Element finden und befüllen + if not nach_gesetzt: + try: + nach_field = sb.execute_script(""" + var inputs = document.querySelectorAll('input[role="combobox"], input[aria-label]'); + for (var i = 0; i < inputs.length; i++) { + var lbl = inputs[i].getAttribute('aria-label') || ''; + if (lbl.match(/Wohin|Ziel|Destination|Where to/i)) return inputs[i]; + } + // Fallback: zweites sichtbares Input + var all = Array.from(document.querySelectorAll('input')).filter( + e => e.offsetWidth > 0 && e.offsetHeight > 0); + return all[1] || null; + """) + if nach_field: + sb.execute_script("arguments[0].value = '';", nach_field) + nach_field.click() + sb.sleep(0.3) + nach_field.send_keys(nach_name) + sb.sleep(2) + nach_field.send_keys(_Keys.ARROW_DOWN) + sb.sleep(0.5) + nach_field.send_keys(_Keys.RETURN) + sb.sleep(1) + print(f"[GF] Nach via JS-Input: {nach_name}") + nach_gesetzt 
= True + except Exception as e: + print(f"[GF] Nach JS-Fehler: {e}") + + # ── 4. Suchen-Button klicken ──────────────────────────────────────── + from selenium.webdriver.common.keys import Keys + gesucht = False + # Variante A: bekannte Selektoren + for sel in ['button[aria-label*="Suchen"]', 'button[aria-label*="Search"]', + 'button[jsname="vLv7Lb"]', 'button[type="submit"]', + 'button[class*="search"]']: + try: + sb.find_element(sel, timeout=2).click() + print(f"[GF] Suche via Selector: {sel}") + gesucht = True + break + except Exception: + continue + + # Variante B: JS — Button mit Text "Suchen" / "Search" finden + if not gesucht: + try: + clicked = sb.execute_script(""" + var btns = document.querySelectorAll('button'); + for (var b of btns) { + var t = (b.textContent || b.innerText || '').trim(); + if (t === 'Suchen' || t === 'Search') { b.click(); return true; } + } + return false; + """) + if clicked: + print("[GF] Suche via JS-Text-Klick") + gesucht = True + except Exception: + pass + + # Variante C: Enter-Taste auf body (löst Formular-Submit aus) + if not gesucht: + try: + sb.driver.find_element("css selector", "body").send_keys(Keys.RETURN) + print("[GF] Suche via Enter-Taste") + gesucht = True + except Exception: + pass + + sb.sleep(14) + title = sb.get_title() + body = sb.get_text("body") + print(f"[GF] Title: {title[:80]} | Body: {len(body)} chars | Suche-OK: {gesucht}") + + # ── 5. 
def scrape_kayak(von, nach, tage=30, aufenthalt_tage=60,
                 trip_type="roundtrip", kabine="premium_economy",
                 gepaeck="1koffer+handgepaeck", airline_filter="",
                 layover_min=120, layover_max=300,
                 max_flugzeit_h=22, max_stops=2):
    """Scrape prices from Kayak (kayak.de) for one departure date.

    Opens the filtered Kayak result URL in a headless browser, reads prices
    from known price elements (falling back to a regex over the body text),
    runs the cabin check and price validation, and takes a screenshot.

    Args:
        von, nach: Origin / destination codes.
        tage: Days from now until departure.
        aufenthalt_tage: Length of stay; used only for roundtrips.
        trip_type: ``"roundtrip"`` adds a return date, anything else is one-way.
        kabine: Cabin key, mapped through ``KABINE_KAYAK`` (fallback ``"w"``).
        gepaeck: A value containing ``"koffer"`` requests one checked bag.
        airline_filter: Optional airline code for the URL filter and results.
        layover_min, layover_max, max_flugzeit_h, max_stops: Passed through
            to the URL filter builder.

    Returns:
        ``(results, screenshot_b64)`` — at most 10 result dicts and the
        base64 screenshot string.
    """
    # Read the clock once so both dates derive from the same instant.
    now = datetime.now()
    abflug = (now + timedelta(days=tage)).strftime("%Y-%m-%d")
    rueck = ((now + timedelta(days=tage + aufenthalt_tage)).strftime("%Y-%m-%d")
             if trip_type == "roundtrip" else "")
    kc = KABINE_KAYAK.get(kabine, "w")
    bags = 1 if "koffer" in gepaeck else 0
    booking_url = _booking_url_kayak(von, nach, abflug, rueck, kc, bags,
                                     layover_min, layover_max, airline_filter,
                                     max_flugzeit_h, max_stops)
    airline_label = f" [{airline_filter}]" if airline_filter else ""
    print(f"[KY{airline_label}] URL: {booking_url}")

    results = []
    screenshot_b64 = ""

    with SB(uc=True, headless=True, chromium_arg="--no-sandbox --disable-dev-shm-usage") as sb:
        sb.open(booking_url)
        sb.sleep(15)
        _dismiss_cookie_banner(sb)
        sb.sleep(4)

        title = sb.get_title()
        body = sb.get_text("body")
        print(f"[KY] Title: {title[:80]}")

        # Try the price selectors in order; stop at the first that yields prices.
        for sel in ['.price-text', '.f8F1-price-text', 'div[class*="price"] span',
                    'span[class*="price"]', '.Iqt3', 'div.nrc6-price', '.price']:
            try:
                # NOTE(review): called without timeout=, as scrape_momondo in
                # this file does. SeleniumBase's find_elements does not appear
                # to accept a timeout keyword, so the former call presumably
                # raised a TypeError that was swallowed below — confirm
                # against the installed SeleniumBase version.
                elems = sb.find_elements(sel)
            except Exception as exc:
                print(f"[KY] Selector {sel} fehlgeschlagen: {exc}")
                continue
            for e in elems[:15]:
                try:
                    p = _parse_preis(e.text)
                except Exception:
                    # Element went stale between lookup and read; skip it.
                    continue
                if p:
                    results.append({"scanner": "kayak", "preis": p,
                                    "waehrung": "EUR",
                                    "airline": airline_filter or "",
                                    "abflug": abflug, "ankunft": rueck,
                                    "booking_url": booking_url})
            if results:
                break

        # Fallback: regex over the whole body text.
        if not results:
            for r in _preise_aus_body(body, "kayak", abflug):
                r["ankunft"] = rueck
                r["booking_url"] = booking_url
                r["airline"] = airline_filter or ""
                results.append(r)

        # Cabin verification against the cabin that was actually requested
        # (previously hard-coded to "premium_economy").
        if not _check_cabin_on_page(body, title, kabine):
            print(f"[KY{airline_label}] WARNUNG: Kabine '{kabine}' nicht auf Seite bestätigt!")

        results = _validate_results(results, f"kayak{airline_label}", kabine)
        print(f"[KY{airline_label}] Ergebnis: {[r['preis'] for r in results[:5]]}")
        _dismiss_cookie_banner(sb)
        sb.sleep(3)
        _dismiss_comparison_popup(sb)
        screenshot_b64 = _take_screenshot(sb)
    return results[:10], screenshot_b64
max_flugzeit_h=22, max_stops=2): + abflug_fmt = (datetime.now() + timedelta(days=tage)).strftime("%Y%m%d") + rueck_fmt = (datetime.now() + timedelta(days=tage + aufenthalt_tage)).strftime("%Y%m%d") if trip_type == "roundtrip" else "" + abflug_iso = (datetime.now() + timedelta(days=tage)).strftime("%Y-%m-%d") + rueck_iso = (datetime.now() + timedelta(days=tage + aufenthalt_tage)).strftime("%Y-%m-%d") if trip_type == "roundtrip" else "" + kc = KABINE_TRIP.get(kabine, "W") + + stadtname = {"FRA": "frankfurt", "HAN": "hanoi", "KTI": "phnom-penh", + "PNH": "phnom-penh", "BKK": "bangkok", "SGN": "ho-chi-minh-city"} + von_name = stadtname.get(von, von.lower()) + nach_name = stadtname.get(nach, nach.lower()) + + booking_url = _booking_url_trip(von, nach, abflug_fmt, rueck_fmt, kc, von_name, nach_name, + airline_filter) + print(f"[TR] URL: {booking_url}") + results = [] + + with SB(uc=True, headless=True, chromium_arg="--no-sandbox --disable-dev-shm-usage") as sb: + sb.open(booking_url) + sb.sleep(12) + + title = sb.get_title() + body = sb.get_text("body") + print(f"[TR] Title: {title[:80]}") + + for sel in ['button[id*="accept"]', 'button[class*="accept"]', + 'button[aria-label*="Accept"]', '#onetrust-accept-btn-handler']: + try: + sb.click(sel, timeout=2) + sb.sleep(2) + break + except Exception: + pass + + for sel in ['.price-box .price', '.flight-price', 'span[class*="price"]', + 'div[class*="price-num"]', 'em[class*="price"]', '.c-price']: + try: + elems = sb.find_elements(sel, timeout=2) + if elems: + for e in elems[:10]: + p = _parse_preis(e.text) + if p: + results.append({"scanner": "trip", "preis": p, + "waehrung": "EUR", "airline": "", + "abflug": abflug_iso, "ankunft": rueck_iso, + "booking_url": booking_url}) + if results: + break + except Exception: + pass + + if not results: + for r in _preise_aus_body(body, "trip", abflug_iso): + r["ankunft"] = rueck_iso + r["booking_url"] = booking_url + results.append(r) + + pe_confirmed = _check_cabin_on_page(body, title, 
def _booking_url_kayak_multicity(von, nach, via, abflug, via_datum, rueck, kc, bags=1, airline=""):
    """Build a Kayak (kayak.de) multi-city search URL.

    Path layout: ``/flights/<von>-<via>/<abflug>/<via>-<nach>/<via_datum>/
    <nach>-<von>/<rueck>``, i.e. outbound to the stopover, onward leg to the
    destination, and the return leg to the origin.

    Args:
        von, nach, via: Origin, destination and stopover codes.
        abflug, via_datum, rueck: Date strings for the three legs.
        kc: Cabin code for the ``cabin`` query parameter.
        bags: Truthy adds the ``bfc=<bags>`` filter.
        airline: Non-empty adds the ``airlines=<code>`` filter.

    Returns:
        The search URL, sorted by price, with ``currency=EUR``.
    """
    from urllib.parse import quote

    filters = []
    if bags:
        filters.append(f"bfc={bags}")
    if airline:
        filters.append(f"airlines={airline}")
    # Percent-encode the ";"-joined filter list as a whole.
    fs = "&fs=" + quote(";".join(filters), safe="") if filters else ""

    legs = (f"/{von}-{via}/{abflug}"
            f"/{via}-{nach}/{via_datum}"
            f"/{nach}-{von}/{rueck}")
    # "&currency=EUR": the source had the garbled "¤cy=EUR" here.
    return f"https://www.kayak.de/flights{legs}?sort=price_a&cabin={kc}&currency=EUR{fs}"
def scrape_momondo(von, nach, tage=30, aufenthalt_tage=60,
                   trip_type="roundtrip", kabine="premium_economy",
                   gepaeck="1koffer+handgepaeck", airline_filter="",
                   layover_min=120, layover_max=300,
                   max_flugzeit_h=22, max_stops=2):
    """Scrape prices from Momondo (momondo.de) for one departure date.

    Opens the filtered Momondo result URL in a headless browser, clicks a
    consent button if one is found, reads prices from known price elements
    (falling back to a regex over the body text), runs the cabin check and
    price validation, and takes a screenshot.

    Args:
        von, nach: Origin / destination codes.
        tage: Days from now until departure.
        aufenthalt_tage: Length of stay; used only for roundtrips.
        trip_type: ``"roundtrip"`` adds a return date, anything else is one-way.
        kabine: Cabin key, mapped through ``KABINE_KAYAK`` (fallback ``"w"``).
        gepaeck: A value containing ``"koffer"`` requests one checked bag.
        airline_filter: Optional airline code for the URL filter and results.
        layover_min, layover_max, max_flugzeit_h, max_stops: Passed through
            to the URL filter builder.

    Returns:
        ``(results, screenshot_b64)`` — at most 10 result dicts and the
        base64 screenshot string.
    """
    # Read the clock once so both dates derive from the same instant.
    now = datetime.now()
    abflug = (now + timedelta(days=tage)).strftime("%Y-%m-%d")
    rueck = ((now + timedelta(days=tage + aufenthalt_tage)).strftime("%Y-%m-%d")
             if trip_type == "roundtrip" else "")
    kc = KABINE_KAYAK.get(kabine, "w")
    bags = 1 if "koffer" in gepaeck else 0
    booking_url = _booking_url_momondo(von, nach, abflug, rueck, kc, bags,
                                       layover_min, layover_max, airline_filter,
                                       max_flugzeit_h, max_stops)
    airline_label = f" [{airline_filter}]" if airline_filter else ""
    print(f"[MO{airline_label}] URL: {booking_url}")

    results = []
    screenshot_b64 = ""

    with SB(uc=True, headless=True, chromium_arg="--no-sandbox --disable-dev-shm-usage") as sb:
        sb.open(booking_url)
        sb.sleep(8)

        # Click away the Momondo cookie consent (best effort).
        for sel in ['button[class*="accept"]', '.RxNS-button-content',
                    '#onetrust-accept-btn-handler', 'button[title*="akzeptieren"]',
                    'button[title*="Alle akzeptieren"]', '.evidon-banner-acceptbutton']:
            try:
                sb.find_element(sel, timeout=2).click()
                print(f"[MO] Consent geklickt: {sel}")
                sb.sleep(3)
                break
            except Exception:
                pass

        # After consent the page reloads; wait for results to appear.
        sb.sleep(12)
        title = sb.get_title()
        body = sb.get_text("body")
        print(f"[MO] Title: {title[:80]} | Body: {len(body)} chars")

        # Try the price selectors in order; stop at the first that yields prices.
        for sel in ['.price-text', '.f8F1-price-text', 'div[class*="price"] span',
                    'span[class*="price"]', '.Iqt3', 'div.nrc6-price', '.price',
                    '[class*="resultPrice"]', '.lowest-price']:
            try:
                elems = sb.find_elements(sel)
                if elems:
                    for e in elems[:15]:
                        p = _parse_preis(e.text)
                        if p:
                            results.append({"scanner": "momondo", "preis": p,
                                            "waehrung": "EUR",
                                            "airline": airline_filter or "",
                                            "abflug": abflug, "ankunft": rueck,
                                            "booking_url": booking_url})
                    if results:
                        break
            except Exception:
                pass

        # Fallback: regex over the whole body text.
        if not results:
            for r in _preise_aus_body(body, "momondo", abflug):
                r["ankunft"] = rueck
                r["booking_url"] = booking_url
                r["airline"] = airline_filter or ""
                results.append(r)

        # Verify the cabin that was actually requested (previously the check
        # was hard-coded to "premium_economy" regardless of `kabine`).
        if not _check_cabin_on_page(body, title, kabine):
            print(f"[MO{airline_label}] WARNUNG: Kabine '{kabine}' nicht auf Seite bestätigt!")

        results = _validate_results(results, f"momondo{airline_label}", kabine)
        print(f"[MO{airline_label}] Ergebnis: {[r['preis'] for r in results[:5]]}")
        _dismiss_cookie_banner(sb)
        sb.sleep(2)
        _dismiss_comparison_popup(sb)
        screenshot_b64 = _take_screenshot(sb)
    return results[:10], screenshot_b64
f"?cabin_class={kc}&adults_count=1&sort=price¤cy_code=EUR") + + print(f"[WG] URL: {booking_url}") + results = [] + screenshot_b64 = "" + + with SB(uc=True, headless=True, chromium_arg="--no-sandbox --disable-dev-shm-usage") as sb: + sb.open(booking_url) + sb.sleep(18) + + title = sb.get_title() + body = sb.get_text("body") + print(f"[WG] Title: {title[:80]} | Body: {len(body)} chars") + + for sel in ['[class*="price"]', '[data-testid*="price"]', + '.flight-price', 'span[class*="Price"]', + '.fare-price', '[class*="FarePrice"]']: + try: + elems = sb.find_elements(sel) + if elems: + for e in elems[:15]: + p = _parse_preis(e.text) + if p: + results.append({"scanner": "wego", "preis": p, + "waehrung": "EUR", "airline": "", + "abflug": abflug, "ankunft": rueck, + "booking_url": booking_url}) + if results: + break + except Exception: + pass + + if not results: + for r in _preise_aus_body(body, "wego", abflug): + r["ankunft"] = rueck + r["booking_url"] = booking_url + results.append(r) + + print(f"[WG] Ergebnis: {[r['preis'] for r in results[:5]]}") + _dismiss_comparison_popup(sb) + screenshot_b64 = _take_screenshot(sb) + return results[:10], screenshot_b64 + + +def _parse_preis_usd(text): + """Parst USD-Preise aus Text wie 'USD 1,388.60' und wandelt grob in EUR um.""" + if not text: + return None + # USD-Format: 1,388.60 (Komma als Tausender, Punkt als Dezimal) + for p in [r'USD\s?([\d,]+\.?\d*)', r'\$\s?([\d,]+\.?\d*)']: + m = re.search(p, text) + if m: + try: + v = float(m.group(1).replace(',', '')) + eur = round(v * 0.92, 2) # grobe USD→EUR Umrechnung + if 200 < eur < 15000: + return eur + except ValueError: + pass + return None + + +def scrape_traveloka(von, nach, tage=30, aufenthalt_tage=60, + trip_type="roundtrip", kabine="premium_economy", + gepaeck="1koffer+handgepaeck", airline_filter="", + layover_min=120, layover_max=300, + max_flugzeit_h=22, max_stops=2): + """Traveloka — größte Reiseplattform Südostasiens. 
Preise in USD, werden in EUR umgerechnet.""" + abflug = (datetime.now() + timedelta(days=tage)).strftime("%d-%m-%Y") + rueck = (datetime.now() + timedelta(days=tage + aufenthalt_tage)).strftime("%d-%m-%Y") \ + if trip_type == "roundtrip" else "" + abflug_iso = (datetime.now() + timedelta(days=tage)).strftime("%Y-%m-%d") + rueck_iso = (datetime.now() + timedelta(days=tage + aufenthalt_tage)).strftime("%Y-%m-%d") \ + if trip_type == "roundtrip" else "" + + KABINE_TV = {"economy": "ECONOMY", "premium_economy": "PREMIUM_ECONOMY", + "business": "BUSINESS", "first": "FIRST_CLASS"} + kc = KABINE_TV.get(kabine, "PREMIUM_ECONOMY") + + if rueck: + booking_url = (f"https://www.traveloka.com/en-en/flight/fullsearch?" + f"ap={von}.{nach}&dt={abflug}.{rueck}" + f"&ps=1.0.0&sc={kc}") + else: + booking_url = (f"https://www.traveloka.com/en-en/flight/fullsearch?" + f"ap={von}.{nach}&dt={abflug}" + f"&ps=1.0.0&sc={kc}") + + print(f"[TV] URL: {booking_url}") + results = [] + screenshot_b64 = "" + + with SB(uc=True, headless=True, chromium_arg="--no-sandbox --disable-dev-shm-usage") as sb: + sb.open(booking_url) + sb.sleep(18) + + title = sb.get_title() + body = sb.get_text("body") + print(f"[TV] Title: {title[:80]} | Body: {len(body)} chars") + + # Preise aus dem Body-Text extrahieren (USD → EUR) + seen = set() + for m in re.finditer(r'USD\s?([\d,]+\.?\d*)', body): + try: + usd = float(m.group(1).replace(',', '')) + eur = round(usd * 0.92) + if 400 < eur < 12000 and eur not in seen: + seen.add(eur) + results.append({"scanner": "traveloka", "preis": eur, + "waehrung": "EUR", "airline": "", + "abflug": abflug_iso, "ankunft": rueck_iso, + "booking_url": booking_url}) + except ValueError: + pass + + results.sort(key=lambda x: x["preis"]) + results = _validate_results(results, "traveloka", "premium_economy") + print(f"[TV] Ergebnis: {[r['preis'] for r in results[:5]]}") + _dismiss_comparison_popup(sb) + screenshot_b64 = _take_screenshot(sb) + return results[:10], screenshot_b64 + + +def 
scrape_skyscanner(von, nach, tage=30, aufenthalt_tage=60, + trip_type="roundtrip", kabine="premium_economy", + gepaeck="1koffer+handgepaeck", airline_filter="", + layover_min=120, layover_max=300, + max_flugzeit_h=22, max_stops=2): + """Skyscanner hat starken Bot-Schutz — übersprungen.""" + print("[SS] Skyscanner übersprungen (Bot-Detection)") + return [], ""