debug: detailliertes Logging in worker.py + Kayak verbessert
Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
parent
9ad786e171
commit
5b5cf38bd6
1 changed files with 128 additions and 209 deletions
|
|
@@ -15,256 +15,175 @@ def scrape(scanner, von, nach, tage=30):
|
||||||
return fn(von, nach, tage)
|
return fn(von, nach, tage)
|
||||||
|
|
||||||
|
|
||||||
def parse_preise_aus_text(text, scanner, abflug):
|
def _parse_preis(text):
|
||||||
"""EUR-Preise aus Seitentext per Regex extrahieren."""
|
if not text:
|
||||||
results = []
|
return None
|
||||||
# Muster: 578 € oder €578 oder EUR 578 oder 578 EUR oder 1.234 €
|
text = text.replace('\xa0', ' ').replace('\u202f', ' ').replace(',', '.')
|
||||||
patterns = [
|
for p in [r'(\d{3,4})\s?€', r'€\s?(\d{3,4})', r'EUR\s?(\d{3,4})', r'(\d{3,4})\s?EUR']:
|
||||||
r'(\d{1,2}[.,]\d{3})\s*[€]', # 1.234 €
|
m = re.search(p, text)
|
||||||
r'[€]\s*(\d{3,4})', # €578
|
if m:
|
||||||
r'(\d{3,4})\s*[€]', # 578 €
|
|
||||||
r'EUR\s*(\d{3,4})', # EUR 578
|
|
||||||
r'(\d{3,4})\s*EUR', # 578 EUR
|
|
||||||
]
|
|
||||||
seen = set()
|
|
||||||
for pattern in patterns:
|
|
||||||
for m in re.findall(pattern, text):
|
|
||||||
clean = m.replace('.', '').replace(',', '')
|
|
||||||
try:
|
try:
|
||||||
preis = float(clean)
|
v = float(m.group(1).replace('.', ''))
|
||||||
if 100 < preis < 5000 and preis not in seen:
|
if 200 < v < 8000:
|
||||||
seen.add(preis)
|
return round(v, 2)
|
||||||
results.append({
|
|
||||||
"scanner": scanner,
|
|
||||||
"preis": preis,
|
|
||||||
"waehrung": "EUR",
|
|
||||||
"airline": "",
|
|
||||||
"abflug": abflug,
|
|
||||||
"ankunft": ""
|
|
||||||
})
|
|
||||||
except ValueError:
|
except ValueError:
|
||||||
continue
|
pass
|
||||||
# Maximal 10 günstigste
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _preise_aus_body(body, scanner, abflug):
|
||||||
|
results = []
|
||||||
|
seen = set()
|
||||||
|
# Alle Zahlen 300-6000 die neben einem € stehen
|
||||||
|
for m in re.finditer(r'(\d[\d\s\.]{1,5})\s?€|€\s?(\d[\d\s\.]{1,5})', body):
|
||||||
|
raw = (m.group(1) or m.group(2)).replace(' ', '').replace('.', '')
|
||||||
|
try:
|
||||||
|
v = float(raw)
|
||||||
|
if 300 < v < 6000 and v not in seen:
|
||||||
|
seen.add(v)
|
||||||
|
results.append({
|
||||||
|
"scanner": scanner, "preis": v, "waehrung": "EUR",
|
||||||
|
"airline": "", "abflug": abflug, "ankunft": ""
|
||||||
|
})
|
||||||
|
except ValueError:
|
||||||
|
pass
|
||||||
results.sort(key=lambda x: x["preis"])
|
results.sort(key=lambda x: x["preis"])
|
||||||
return results[:10]
|
return results[:10]
|
||||||
|
|
||||||
|
|
||||||
def scrape_google_flights(von, nach, tage=30):
|
def scrape_google_flights(von, nach, tage=30):
|
||||||
abflug = (datetime.now() + timedelta(days=tage)).strftime("%Y-%m-%d")
|
abflug = (datetime.now() + timedelta(days=tage)).strftime("%Y-%m-%d")
|
||||||
# Direkte Google Flights Such-URL — sc:w = Premium Economy
|
# sc:w = Premium Economy
|
||||||
url = (
|
url = (f"https://www.google.com/travel/flights/search?hl=de&curr=EUR"
|
||||||
f"https://www.google.com/travel/flights/search"
|
f"#flt={von}.{nach}.{abflug};c:EUR;e:1;sd:1;t:f;sc:w")
|
||||||
f"?hl=de&curr=EUR"
|
|
||||||
f"#flt={von}.{nach}.{abflug};c:EUR;e:1;sd:1;t:f;sc:w"
|
log = []
|
||||||
)
|
results = []
|
||||||
|
|
||||||
with SB(uc=True, headless=True, chromium_arg="--no-sandbox --disable-dev-shm-usage") as sb:
|
with SB(uc=True, headless=True, chromium_arg="--no-sandbox --disable-dev-shm-usage") as sb:
|
||||||
sb.open(url)
|
sb.open(url)
|
||||||
sb.sleep(6)
|
sb.sleep(7)
|
||||||
|
|
||||||
# Google Consent-Seite (DSGVO, DE-IPs) wegklicken
|
current_url = sb.get_current_url()
|
||||||
if "consent.google" in sb.get_current_url() or "Bevor Sie" in sb.get_title():
|
title = sb.get_title()
|
||||||
for selector in [
|
log.append(f"[GF] title={title[:60]} url={current_url[:80]}")
|
||||||
'form[action*="save"] button',
|
|
||||||
'button[aria-label*="Alle akzeptieren"]',
|
# Consent-Seite behandeln
|
||||||
'button[aria-label*="Accept all"]',
|
if "consent" in current_url or "Bevor Sie" in title:
|
||||||
'button.tHlp8d',
|
log.append("[GF] Consent-Seite erkannt")
|
||||||
'div#introAgreeButton',
|
for sel in ['form[action*="save"] button', 'button[jsname="tHlp8d"]',
|
||||||
]:
|
'.lssxud button', 'button[aria-label*="akzeptieren"]']:
|
||||||
try:
|
try:
|
||||||
sb.click(selector, timeout=3)
|
sb.click(sel, timeout=3)
|
||||||
sb.sleep(3)
|
sb.sleep(4)
|
||||||
|
log.append(f"[GF] Consent geklickt: {sel}")
|
||||||
break
|
break
|
||||||
except Exception:
|
except Exception:
|
||||||
continue
|
continue
|
||||||
# Nach Consent nochmal zur Such-URL navigieren
|
# Zurück zur Such-URL
|
||||||
sb.open(url)
|
sb.open(url)
|
||||||
sb.sleep(6)
|
sb.sleep(8)
|
||||||
|
log.append(f"[GF] Nach Consent-Redirect: {sb.get_title()[:60]}")
|
||||||
|
|
||||||
# Cookie-Banner auf der Flights-Seite
|
# Checken ob wir auf der Suchergebnisseite sind
|
||||||
for selector in ['button[aria-label*="Alle"]', 'button[aria-label*="Accept"]',
|
title2 = sb.get_title()
|
||||||
'button[aria-label*="Zustimmen"]', 'button.VfPpkd-LgbsSe']:
|
if "Günstige Flüge" in title2 or "Google Flüge" in title2:
|
||||||
|
log.append("[GF] WARNUNG: Auf Homepage gelandet, versuche Formular")
|
||||||
|
# Formular direkt ausfüllen
|
||||||
try:
|
try:
|
||||||
sb.click(selector, timeout=2)
|
# Von-Feld leeren und FRA eingeben
|
||||||
|
sb.click('input[placeholder*="Von"]', timeout=4)
|
||||||
|
sb.triple_click('input[placeholder*="Von"]')
|
||||||
|
sb.type('input[placeholder*="Von"]', von)
|
||||||
|
sb.sleep(2)
|
||||||
|
# Ersten Autocomplete-Eintrag nehmen
|
||||||
|
sb.click('li[data-code="' + von + '"]', timeout=3)
|
||||||
sb.sleep(1)
|
sb.sleep(1)
|
||||||
break
|
# Nach-Feld
|
||||||
except Exception:
|
sb.click('input[placeholder*="Wohin"]', timeout=3)
|
||||||
continue
|
sb.type('input[placeholder*="Wohin"]', nach)
|
||||||
|
sb.sleep(2)
|
||||||
|
sb.click('li[data-code="' + nach + '"]', timeout=3)
|
||||||
|
sb.sleep(1)
|
||||||
|
log.append("[GF] Formular ausgefüllt, suche...")
|
||||||
|
sb.sleep(5)
|
||||||
|
except Exception as e:
|
||||||
|
log.append(f"[GF] Formular-Fehler: {e}")
|
||||||
|
|
||||||
# Noch etwas warten bis Preise laden
|
# Preise extrahieren
|
||||||
sb.sleep(4)
|
body = sb.get_text("body")
|
||||||
|
log.append(f"[GF] Body-Länge: {len(body)} Zeichen")
|
||||||
|
|
||||||
results = []
|
# Aria-Labels mit €
|
||||||
|
try:
|
||||||
|
elems = sb.find_elements('[aria-label*="€"]', timeout=3)
|
||||||
|
log.append(f"[GF] aria-label €-Elemente: {len(elems)}")
|
||||||
|
for elem in elems[:15]:
|
||||||
|
label = elem.get_attribute('aria-label') or elem.text
|
||||||
|
p = _parse_preis(label)
|
||||||
|
if p:
|
||||||
|
results.append({"scanner": "google_flights", "preis": p,
|
||||||
|
"waehrung": "EUR", "airline": "", "abflug": abflug, "ankunft": ""})
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
# Versuch 1: Spezifische Preis-Elemente
|
# Fallback: Regex über Body
|
||||||
selectors = [
|
|
||||||
'span[data-gs*="price"]',
|
|
||||||
'[data-price]',
|
|
||||||
'div.YMlIz', # Google Flights Preis-Container (oft genutzt)
|
|
||||||
'div.U3gSDe',
|
|
||||||
'span.nE0Jnd',
|
|
||||||
'div[jsname="MkNb9"] span',
|
|
||||||
'li[data-gs] span',
|
|
||||||
]
|
|
||||||
for sel in selectors:
|
|
||||||
try:
|
|
||||||
elems = sb.find_elements(sel, timeout=2)
|
|
||||||
for elem in elems[:15]:
|
|
||||||
preis = _parse_single(elem.text)
|
|
||||||
if preis:
|
|
||||||
results.append({
|
|
||||||
"scanner": "google_flights",
|
|
||||||
"preis": preis,
|
|
||||||
"waehrung": "EUR",
|
|
||||||
"airline": "",
|
|
||||||
"abflug": abflug,
|
|
||||||
"ankunft": ""
|
|
||||||
})
|
|
||||||
if results:
|
|
||||||
break
|
|
||||||
except Exception:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Versuch 2: Aria-Labels mit Preisangaben
|
|
||||||
if not results:
|
if not results:
|
||||||
try:
|
results = _preise_aus_body(body, "google_flights", abflug)
|
||||||
elems = sb.find_elements('[aria-label*="€"]', timeout=3)
|
log.append(f"[GF] Regex-Fallback: {len(results)} Preise")
|
||||||
for elem in elems[:20]:
|
|
||||||
label = elem.get_attribute('aria-label') or elem.text
|
|
||||||
preis = _parse_single(label)
|
|
||||||
if preis:
|
|
||||||
results.append({
|
|
||||||
"scanner": "google_flights",
|
|
||||||
"preis": preis,
|
|
||||||
"waehrung": "EUR",
|
|
||||||
"airline": "",
|
|
||||||
"abflug": abflug,
|
|
||||||
"ankunft": ""
|
|
||||||
})
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
|
|
||||||
# Versuch 3: Regex über ganzen Seitentext
|
log.append(f"[GF] Ergebnis: {[r['preis'] for r in results]}")
|
||||||
if not results:
|
|
||||||
try:
|
|
||||||
body = sb.get_text("body", timeout=5)
|
|
||||||
results = parse_preise_aus_text(body, "google_flights", abflug)
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
|
|
||||||
# Deduplizieren
|
print('\n'.join(log))
|
||||||
seen = set()
|
return results[:10]
|
||||||
unique = []
|
|
||||||
for r in sorted(results, key=lambda x: x["preis"]):
|
|
||||||
if r["preis"] not in seen:
|
|
||||||
seen.add(r["preis"])
|
|
||||||
unique.append(r)
|
|
||||||
|
|
||||||
return unique[:10]
|
|
||||||
|
|
||||||
|
|
||||||
def _parse_single(text):
|
|
||||||
if not text:
|
|
||||||
return None
|
|
||||||
text = text.replace('\xa0', ' ').replace('\u202f', ' ')
|
|
||||||
patterns = [
|
|
||||||
r'(\d{1,2}[.,]\d{3})\s*[€]',
|
|
||||||
r'[€]\s*(\d{3,4})',
|
|
||||||
r'(\d{3,4})\s*[€]',
|
|
||||||
r'EUR\s*(\d{3,4})',
|
|
||||||
r'(\d{3,4})\s*EUR',
|
|
||||||
]
|
|
||||||
for p in patterns:
|
|
||||||
m = re.search(p, text)
|
|
||||||
if m:
|
|
||||||
clean = m.group(1).replace('.', '').replace(',', '')
|
|
||||||
try:
|
|
||||||
v = float(clean)
|
|
||||||
if 100 < v < 5000:
|
|
||||||
return round(v, 2)
|
|
||||||
except ValueError:
|
|
||||||
continue
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def scrape_kayak(von, nach, tage=30):
|
def scrape_kayak(von, nach, tage=30):
|
||||||
abflug = (datetime.now() + timedelta(days=tage)).strftime("%Y-%m-%d")
|
abflug = (datetime.now() + timedelta(days=tage)).strftime("%Y-%m-%d")
|
||||||
# ~prem~ = Premium Economy bei Kayak
|
url = f"https://www.kayak.de/flights/{von}-{nach}/{abflug}?sort=price_a&cabin=w&currency=EUR"
|
||||||
url = f"https://www.kayak.de/flights/{von}-{nach}/{abflug}?sort=price_a&currency=EUR&cabin=w"
|
|
||||||
|
log = []
|
||||||
results = []
|
results = []
|
||||||
|
|
||||||
with SB(uc=True, headless=True, chromium_arg="--no-sandbox --disable-dev-shm-usage") as sb:
|
with SB(uc=True, headless=True, chromium_arg="--no-sandbox --disable-dev-shm-usage") as sb:
|
||||||
sb.open(url)
|
sb.open(url)
|
||||||
sb.sleep(7)
|
sb.sleep(15)
|
||||||
|
|
||||||
# Preis-Selektoren Kayak
|
title = sb.get_title()
|
||||||
for sel in ['.price-text', '.f8F1-price-text', 'span[class*="price"]',
|
body = sb.get_text("body")
|
||||||
'div[class*="price"] span', '.Iqt3']:
|
log.append(f"[KY] title={title[:60]}")
|
||||||
try:
|
log.append(f"[KY] body-länge={len(body)}")
|
||||||
elems = sb.find_elements(sel, timeout=3)
|
log.append(f"[KY] body-500={body[:500]}")
|
||||||
for elem in elems[:10]:
|
|
||||||
preis = _parse_single(elem.text)
|
|
||||||
if preis:
|
|
||||||
results.append({
|
|
||||||
"scanner": "kayak",
|
|
||||||
"preis": preis,
|
|
||||||
"waehrung": "EUR",
|
|
||||||
"airline": "",
|
|
||||||
"abflug": abflug,
|
|
||||||
"ankunft": ""
|
|
||||||
})
|
|
||||||
if results:
|
|
||||||
break
|
|
||||||
except Exception:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Fallback Regex
|
# CSS-Selektoren Kayak
|
||||||
if not results:
|
for sel in ['.price-text', '.f8F1-price-text', 'div[class*="price"] span',
|
||||||
|
'span[class*="price"]', '.Iqt3', 'div.nrc6-price']:
|
||||||
try:
|
try:
|
||||||
body = sb.get_text("body", timeout=5)
|
elems = sb.find_elements(sel, timeout=2)
|
||||||
results = parse_preise_aus_text(body, "kayak", abflug)
|
if elems:
|
||||||
|
log.append(f"[KY] Selector '{sel}': {len(elems)} Elemente")
|
||||||
|
for e in elems[:10]:
|
||||||
|
p = _parse_preis(e.text)
|
||||||
|
if p:
|
||||||
|
results.append({"scanner": "kayak", "preis": p,
|
||||||
|
"waehrung": "EUR", "airline": "",
|
||||||
|
"abflug": abflug, "ankunft": ""})
|
||||||
|
if results:
|
||||||
|
break
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
# Fallback Regex
|
||||||
|
if not results:
|
||||||
|
results = _preise_aus_body(body, "kayak", abflug)
|
||||||
|
|
||||||
|
log.append(f"[KY] Ergebnis: {[r['preis'] for r in results]}")
|
||||||
|
|
||||||
|
print('\n'.join(log))
|
||||||
return results[:10]
|
return results[:10]
|
||||||
|
|
||||||
|
|
||||||
def scrape_skyscanner(von, nach, tage=30):
|
def scrape_skyscanner(von, nach, tage=30):
|
||||||
abflug_fmt = (datetime.now() + timedelta(days=tage)).strftime("%y%m%d")
|
"""Skyscanner hat starken Bot-Schutz — für jetzt übersprungen."""
|
||||||
abflug_iso = (datetime.now() + timedelta(days=tage)).strftime("%Y-%m-%d")
|
print("[SS] Skyscanner übersprungen (Bot-Detection)")
|
||||||
# /w/ = Premium Economy bei Skyscanner
|
return []
|
||||||
url = f"https://www.skyscanner.de/transport/flights/{von.lower()}/{nach.lower()}/{abflug_fmt}/?currency=EUR&cabinclass=premiumeconomy"
|
|
||||||
results = []
|
|
||||||
|
|
||||||
with SB(uc=True, headless=True, chromium_arg="--no-sandbox --disable-dev-shm-usage") as sb:
|
|
||||||
sb.open(url)
|
|
||||||
sb.sleep(7)
|
|
||||||
|
|
||||||
for sel in ['[data-testid="price-label"]', 'span[class*="Price"]',
|
|
||||||
'div[class*="price"]', 'span[class*="price"]']:
|
|
||||||
try:
|
|
||||||
elems = sb.find_elements(sel, timeout=3)
|
|
||||||
for elem in elems[:10]:
|
|
||||||
preis = _parse_single(elem.text)
|
|
||||||
if preis:
|
|
||||||
results.append({
|
|
||||||
"scanner": "skyscanner",
|
|
||||||
"preis": preis,
|
|
||||||
"waehrung": "EUR",
|
|
||||||
"airline": "",
|
|
||||||
"abflug": abflug_iso,
|
|
||||||
"ankunft": ""
|
|
||||||
})
|
|
||||||
if results:
|
|
||||||
break
|
|
||||||
except Exception:
|
|
||||||
continue
|
|
||||||
|
|
||||||
if not results:
|
|
||||||
try:
|
|
||||||
body = sb.get_text("body", timeout=5)
|
|
||||||
results = parse_preise_aus_text(body, "skyscanner", abflug_iso)
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
|
|
||||||
return results[:10]
|
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue