diff --git a/.github/ISSUE_TEMPLATE/01-launch-failure.yml b/.github/ISSUE_TEMPLATE/01-launch-failure.yml new file mode 100644 index 0000000..2c5451f --- /dev/null +++ b/.github/ISSUE_TEMPLATE/01-launch-failure.yml @@ -0,0 +1,98 @@ +name: Launch failure +description: Browser or wrapper fails to start (install errors, missing deps, profile load fails, never reaches new_page) +title: "[launch] " +labels: ["bug", "launch-failure"] +body: + - type: markdown + attributes: + value: | + Use this when the browser never reaches a usable state. + If it starts and the bug appears on a site or clicking something, use the site/action template instead. + + - type: input + id: version + attributes: + label: Version + description: Output of `python -m invisible_playwright version`. + placeholder: 0.1.7 (binary firefox-7) + validations: + required: true + + - type: dropdown + id: os + attributes: + label: OS + options: + - Windows 10/11 x86_64 + - Linux x86_64 + - macOS (unsupported) + - Other + validations: + required: true + + - type: input + id: python + attributes: + label: Python + placeholder: 3.11.7 + validations: + required: true + + - type: input + id: install_cmd + attributes: + label: How you installed + placeholder: pip install invisible_playwright + validations: + required: true + + - type: textarea + id: snippet + attributes: + label: What you ran + description: Stop at the line that errors out. Redact creds. + render: python + value: | + from invisible_playwright import InvisiblePlaywright + with InvisiblePlaywright(seed=42) as browser: + ctx = browser.new_context() + validations: + required: true + + - type: textarea + id: traceback + attributes: + label: Full traceback + description: The whole stack trace verbatim. Don't summarize. + render: text + validations: + required: true + + - type: textarea + id: logs + attributes: + label: Extra logs + description: Output of `DEBUG=pw:browser* python yourscript.py 2>&1`. Optional but speeds things up. 
+ render: text + validations: + required: false + + - type: textarea + id: tried + attributes: + label: What you already tried + description: Reinstall, clear cache, different Python version, different proxy, etc. + validations: + required: false + + - type: checkboxes + id: confirm + attributes: + label: Before submitting + options: + - label: Searched existing issues. + required: true + - label: On the latest released version. + required: true + - label: Removed credentials and personal paths from the snippet and logs. + required: true diff --git a/.github/ISSUE_TEMPLATE/02-site-or-action-bug.yml b/.github/ISSUE_TEMPLATE/02-site-or-action-bug.yml new file mode 100644 index 0000000..6c38de6 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/02-site-or-action-bug.yml @@ -0,0 +1,167 @@ +name: Site or action bug +description: Browser starts fine but a navigation, click, evaluate, or other operation fails or behaves wrong +title: "[bug] " +labels: ["bug"] +body: + - type: markdown + attributes: + value: | + For bugs that happen after the browser is up. + If the browser never launches, use the launch failure template. + If a fingerprint detector flags the browser, use the stealth detection template. + + - type: input + id: version + attributes: + label: Version + description: Output of `python -m invisible_playwright version`. + placeholder: 0.1.7 (binary firefox-7) + validations: + required: true + + - type: dropdown + id: os + attributes: + label: OS + options: + - Windows 10/11 x86_64 + - Linux x86_64 + - macOS (unsupported) + - Other + validations: + required: true + + - type: input + id: python + attributes: + label: Python + placeholder: 3.11.7 + validations: + required: true + + - type: dropdown + id: headless + attributes: + label: headless= + description: Some bugs only repro on Windows headless=True (hidden alt-desktop path). 
+ options: + - "True" + - "False" + validations: + required: true + + - type: dropdown + id: proxy + attributes: + label: Proxy + description: Sites often vary by IP geo (e.g. GDPR consent shows only on UK/EU). + options: + - No proxy (host network) + - Residential, UK/GB + - Residential, US + - Residential, other country (specify in notes) + - Datacenter (specify provider in notes) + validations: + required: true + + - type: dropdown + id: profile + attributes: + label: Profile dir + options: + - Fresh each run (no profile_dir) + - Persistent profile_dir, reusing across runs + - Persistent profile_dir, first run creating it + validations: + required: true + + - type: input + id: url + attributes: + label: URL + description: The exact URL passed to `page.goto`. Not "the homepage" — the literal string. + placeholder: https://id.sky.com/ + validations: + required: true + + - type: textarea + id: snippet + attributes: + label: Runnable reproduction + description: A complete snippet we can copy, paste, run. Stub creds with placeholders, keep everything else literal. + render: python + value: | + from invisible_playwright import InvisiblePlaywright + + with InvisiblePlaywright(seed=42, headless=True) as browser: + ctx = browser.new_context() + page = ctx.new_page() + page.goto("https://example.com/") + # the exact operation that fails: + page.click("button:has-text('Accept all')") + validations: + required: true + + - type: input + id: selector + attributes: + label: Selector or locator + description: The exact string passed to locator/click/frame_locator. Write N/A if not a selector bug. + placeholder: page.frame_locator("iframe[id^='sp_message_iframe_']").get_by_text("Accept all") + validations: + required: true + + - type: textarea + id: expected + attributes: + label: Expected + description: What should happen when the snippet runs? + validations: + required: true + + - type: textarea + id: actual + attributes: + label: Actual + description: What happens instead? 
Full traceback, error string verbatim, any page.on('crash') firing. + validations: + required: true + + - type: textarea + id: screenshot + attributes: + label: Screenshot + description: Drag-drop a screenshot if the bug is visual. Optional but useful. + validations: + required: false + + - type: textarea + id: logs + attributes: + label: Browser logs + description: Output of `DEBUG=pw:browser* python yourscript.py 2>&1 | tail -200`. Redact creds and real IPs. + render: text + validations: + required: false + + - type: textarea + id: notes + attributes: + label: Notes + description: Anything else, hypotheses, related issues, things you've already tried. + validations: + required: false + + - type: checkboxes + id: confirm + attributes: + label: Before submitting + options: + - label: Searched existing issues. + required: true + - label: On the latest released version. + required: true + - label: The snippet above runs end-to-end on a clean Python install. + required: true + - label: Removed credentials, proxy passwords, real IPs, personal file paths. + required: true diff --git a/.github/ISSUE_TEMPLATE/03-stealth-detection.yml b/.github/ISSUE_TEMPLATE/03-stealth-detection.yml new file mode 100644 index 0000000..b2c5e1d --- /dev/null +++ b/.github/ISSUE_TEMPLATE/03-stealth-detection.yml @@ -0,0 +1,141 @@ +name: Stealth detection +description: A fingerprint detector flagged the browser as a bot, VM, VPN, anti-detect, tampered, or otherwise non-human +title: "[detect] " +labels: ["bug", "stealth"] +body: + - type: markdown + attributes: + value: | + Use this when something detects the browser (Fingerprint Pro, CreepJS, BotD, reCAPTCHA, Cloudflare, sannysoft, etc). + Bugs in operations (clicks, navigation) go to the site/action template. + Browser failing to start goes to the launch failure template. 
+ + - type: input + id: version + attributes: + label: Version + placeholder: 0.1.7 (binary firefox-7) + validations: + required: true + + - type: dropdown + id: os + attributes: + label: OS + options: + - Windows 10/11 x86_64 + - Linux x86_64 + - macOS (unsupported) + - Other + validations: + required: true + + - type: dropdown + id: headless + attributes: + label: headless= + options: + - "True" + - "False" + validations: + required: true + + - type: dropdown + id: proxy + attributes: + label: Proxy + description: Datacenter or wrong-country proxies trip most detectors regardless of the browser. Be honest about what you used. + options: + - No proxy (host network) + - Residential, matching target geo + - Residential, different geo than target + - Datacenter (specify provider in notes) + - Mobile / 4G + validations: + required: true + + - type: input + id: detector + attributes: + label: Detector name and URL + description: Exact site / service / product that flagged us. + placeholder: Fingerprint Pro — https://demo.fingerprint.com/playground + validations: + required: true + + - type: textarea + id: scores + attributes: + label: Detector verdict + description: Paste the relevant flags / scores verbatim. For Fingerprint Pro paste `bot`, `vpn`, `virtual_machine`, `tampering*`, `vm_ml_score`, `suspect_score`. For CreepJS the headless / lies / trust scores. For reCAPTCHA v3 the score number. + render: text + placeholder: | + bot: bad + vpn: true + virtual_machine: true + vm_ml_score: 0.74 + suspect_score: 22 + validations: + required: true + + - type: textarea + id: screenshot + attributes: + label: Screenshot of the detector result + description: Drag-drop a screenshot of the detector page so we see what you see. + validations: + required: true + + - type: textarea + id: snippet + attributes: + label: How you launched + description: The InvisiblePlaywright launch + navigation that produced the result above. Redact creds. 
+ render: python + value: | + from invisible_playwright import InvisiblePlaywright + + with InvisiblePlaywright(seed=42, headless=True) as browser: + ctx = browser.new_context() + page = ctx.new_page() + page.goto("https://demo.fingerprint.com/playground") + validations: + required: true + + - type: textarea + id: expected + attributes: + label: What you expected + description: Most detectors will never give a perfect score for any browser. Tell us what threshold you'd accept (e.g. bot=not_detected, vm_ml_score < 0.3). + validations: + required: true + + - type: textarea + id: full_report + attributes: + label: Full detector response + description: For Fingerprint Pro paste the JSON from /api/event/v4/ if you have it. For CreepJS paste the full Smart Signals block. Optional but speeds things up a lot. + render: json + validations: + required: false + + - type: textarea + id: notes + attributes: + label: Notes + validations: + required: false + + - type: checkboxes + id: confirm + attributes: + label: Before submitting + options: + - label: Searched existing issues. + required: true + - label: On the latest released version. + required: true + - label: The detector verdict above is from a real run, not a hypothesis. + required: true + - label: Removed credentials, real IPs, FpJS visitor_id values, personal file paths from the snippet and full report. + required: true diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml deleted file mode 100644 index 805d579..0000000 --- a/.github/ISSUE_TEMPLATE/bug_report.yml +++ /dev/null @@ -1,79 +0,0 @@ -name: Bug report -description: Report a bug in the invisible_playwright Python wrapper -title: "[bug] " -labels: ["bug"] -body: - - type: markdown - attributes: - value: | - Thanks for taking the time to file a bug report. - - Before continuing, please: - - Search [existing issues](https://github.com/feder-cr/invisible_playwright/issues?q=is%3Aissue) to avoid duplicates. 
- - If the bug is in the **patched Firefox itself** (canvas/WebGL/audio/font spoofing, a detector flagging the browser), open it at [feder-cr/firefox-stealth](https://github.com/feder-cr/firefox-stealth/issues) instead. - - **Do not** report security vulnerabilities here — follow [SECURITY.md](https://github.com/feder-cr/invisible_playwright/blob/main/SECURITY.md). - - type: input - id: version - attributes: - label: invisible_playwright version - description: Output of `invisible_playwright version` - placeholder: "0.1.0 (binary 150.0.1)" - validations: - required: true - - type: dropdown - id: os - attributes: - label: Operating system - options: - - Windows x86_64 - - Linux x86_64 - - Other (please specify in description) - validations: - required: true - - type: input - id: python - attributes: - label: Python version - placeholder: "3.11.7" - validations: - required: true - - type: textarea - id: repro - attributes: - label: Minimal reproduction - description: A small, self-contained code snippet that triggers the bug. Strip out anything unrelated. - render: python - validations: - required: true - - type: textarea - id: expected - attributes: - label: Expected behavior - validations: - required: true - - type: textarea - id: actual - attributes: - label: Actual behavior - description: Include the full error message and traceback if any. - validations: - required: true - - type: textarea - id: logs - attributes: - label: Logs / additional context - description: Browser console output, environment variables, proxy config (redact credentials), etc. - render: text - validations: - required: false - - type: checkboxes - id: confirm - attributes: - label: Confirmations - options: - - label: I have searched existing issues and this bug has not been reported. - required: true - - label: I am on the latest release. - required: true - - label: I have removed any credentials, proxy passwords, or sensitive data from logs. 
- required: true diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index 6d3dace..44f31be 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -3,9 +3,9 @@ contact_links: - name: Security vulnerability url: https://github.com/feder-cr/invisible_playwright/security/advisories/new about: Report a security issue privately. Do NOT open a public issue. - - name: Bug in the patched Firefox itself (canvas / WebGL / fonts / WebRTC / etc.) - url: https://github.com/feder-cr/firefox-stealth/issues - about: Spoofing/fingerprint bugs belong in the firefox-stealth repo. + - name: Bug in the patched Firefox source (C++, IDL, Juggler JS) + url: https://github.com/feder-cr/invisible_firefox/issues + about: Source-level patches in the Firefox fork go in the invisible_firefox repo. Detection results (FpJS, CreepJS, etc.) use the stealth detection template here. - name: Question or general discussion url: https://github.com/feder-cr/invisible_playwright/discussions - about: For usage questions, ideas, and chat. Bugs and features still go in issues. + about: Usage questions, ideas, chat. Bugs and features still go in issues. diff --git a/.github/workflows/firefox-launch-matrix.yml b/.github/workflows/firefox-launch-matrix.yml new file mode 100644 index 0000000..4e7b053 --- /dev/null +++ b/.github/workflows/firefox-launch-matrix.yml @@ -0,0 +1,106 @@ +name: firefox-launch-matrix + +# Cross-Windows-edition smoke for the shipped firefox-N binary. +# Triggered by issue #22 (firefox-7 SxS mismatch on Win11 build 26200, +# reporter `jannusdorfer-create`). +# +# Runs the exact reporter snippet on every Windows runner GitHub offers, +# from a fresh checkout. If any matrix cell fails the same way, the bug +# is reproducible on at least one clean-ish environment and we ship a +# sidecar mozglue.manifest fix. If all cells pass, the bug is confined +# to the reporter's specific environment (Pro/Enterprise GPO, EDR, etc.). 
+ +on: + workflow_dispatch: + push: + branches: [main] + paths: + - '.github/workflows/firefox-launch-matrix.yml' + +jobs: + smoke: + name: launch (${{ matrix.os }}, py${{ matrix.python }}) + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [windows-2022, windows-2025, windows-latest] + python: ["3.11", "3.12", "3.13"] + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python }} + cache: pip + + - name: Windows edition + build info + shell: pwsh + run: | + $os = Get-CimInstance Win32_OperatingSystem + Write-Host "Caption : $($os.Caption)" + Write-Host "BuildNumber: $($os.BuildNumber)" + Write-Host "OSArch : $($os.OSArchitecture)" + Write-Host "Edition : $((Get-CimInstance Win32_OperatingSystem).OperatingSystemSKU)" + Write-Host "---" + Write-Host "VC++ Redistributables installed:" + Get-ItemProperty 'HKLM:\SOFTWARE\Microsoft\Windows\CurrentVersion\Uninstall\*' ` + -ErrorAction SilentlyContinue | + Where-Object { $_.DisplayName -like '*Visual C++*Redist*' } | + Select-Object DisplayName, DisplayVersion | + Format-Table -AutoSize + + - name: Install package from this commit + run: | + python -m pip install --upgrade pip + pip install . + + - name: Fetch firefox-7 binary + run: python -m invisible_playwright fetch + + - name: Verify firefox.exe can launch standalone (the snippet that fails for issue #22) + shell: pwsh + run: | + # The platformdirs path has the duplicated `invisible-playwright` segment + # on Windows (user_cache_dir convention). + $ffPath = "$env:LOCALAPPDATA\invisible-playwright\invisible-playwright\Cache\firefox-7\firefox.exe" + if (-not (Test-Path $ffPath)) { + Write-Error "firefox.exe NOT FOUND at $ffPath" + exit 1 + } + Write-Host "Launching: $ffPath --version" + # NOTE: firefox.exe --version on Windows prints the version but may + # return non-zero exit code (sub-process fork quirk). Check stdout. 
+ $output = & $ffPath --version 2>&1 | Out-String + Write-Host "Output: $output" + if ($output -notmatch 'Mozilla Firefox \d') { + Write-Error "firefox.exe --version did not print a Mozilla Firefox version. Output was: $output" + exit 1 + } + Write-Host "OK: firefox.exe runs and prints version." + + - name: Run reporter's exact InvisiblePlaywright snippet + run: | + python -c " + import asyncio + from invisible_playwright.async_api import InvisiblePlaywright + async def main(): + async with InvisiblePlaywright(seed=9128) as browser: + page = await browser.new_page() + await page.goto('about:blank') + print('OK: page loaded, url =', page.url) + asyncio.run(main()) + " + + - name: Upload diagnostics on failure + if: failure() + uses: actions/upload-artifact@v4 + with: + name: launch-failure-${{ matrix.os }}-py${{ matrix.python }} + path: | + ~/AppData/Local/invisible-playwright/invisible-playwright/Cache/firefox-7/firefox.exe + ~/AppData/Local/invisible-playwright/invisible-playwright/Cache/firefox-7/mozglue.dll + if-no-files-found: warn + retention-days: 7 diff --git a/.github/workflows/webrtc-e2e.yml b/.github/workflows/webrtc-e2e.yml new file mode 100644 index 0000000..d14b8ce --- /dev/null +++ b/.github/workflows/webrtc-e2e.yml @@ -0,0 +1,47 @@ +name: webrtc-e2e + +# Live WebRTC realness check against the shipped patched binary. +# +# Manual (workflow_dispatch) on purpose: it needs a firefox-N binary that +# carries the WebRTC fixes (synthetic srflx in genuine nICEr form + the +# default-route fallback behind a proxy). Run it after publishing such a +# binary — it is the release gate for "WebRTC looks real behind a proxy". +# Until that binary ships, test_not_blocked_behind_tcp_only_socks is EXPECTED +# to fail (the old binary is fully blocked behind a SOCKS proxy), which is the +# whole point of the gate. 
+# +# No smartproxy / credentials: the "behind a proxy" condition is faked by an +# in-process TCP-only SOCKS5 server (refuses UDP ASSOCIATE) and the egress IP +# is injected as an RFC 5737 TEST-NET address. Fully self-contained. + +on: + workflow_dispatch: + +jobs: + webrtc-e2e: + name: webrtc realness (ubuntu, py3.12) + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Set up Python 3.12 + uses: actions/setup-python@v5 + with: + python-version: "3.12" + cache: pip + + - name: Install package + dev extras + run: | + python -m pip install --upgrade pip + pip install -e ".[dev]" + + - name: Fetch the patched Firefox binary + run: python -m invisible_playwright fetch + + - name: Resolve binary path + run: echo "STEALTHFOX_E2E_BINARY=$(python -m invisible_playwright path)" >> "$GITHUB_ENV" + + - name: Run WebRTC realness e2e (xvfb for the headless Firefox) + run: | + sudo apt-get update && sudo apt-get install -y xvfb + xvfb-run -a pytest tests/test_webrtc_realness.py -m e2e -o addopts="" -v -rs diff --git a/CHANGELOG.md b/CHANGELOG.md index 731f740..f142d90 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,46 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), ## [Unreleased] +### Added +- `timezone="auto"`: the browser timezone is auto-derived from the egress IP. By default (no explicit timezone) it ALWAYS resolves — from the proxy egress when a proxy is set, otherwise from the host's own public IP — so the zone can never disagree with the IP (the classic `timezone_mismatch` signal). An explicit `"Area/City"` is the only way to force a specific zone. On failure: with a proxy the launch raises (no silent host-TZ fallback behind a foreign proxy); without a proxy it falls back to the host TZ so a transient lookup can't break the launch. +- The egress IP is mapped to its IANA zone with an offline mmdb (`daijro/geoip-all-in-one`). 
It auto-updates against the upstream weekly rebuild: cached locally, re-checked after `GEOIP_REFRESH_DAYS` (7), older copies pruned, and a stale cache is reused when offline. Set `STEALTHFOX_GEOIP_MMDB` to the path of your own `.mmdb` to skip the download. +- `resolve_session_timezone(timezone, proxy)` and `ensure_geoip_mmdb()` re-exported at the package root (plus `GeoTimezoneError`) so integrations that own their launch can reproduce the resolution. +- `tests/test_geo.py` (37 tests) and `tests/test_geoip_update.py` (freshness / auto-update / offline fallback) unit tests. + +### Changed +- New runtime dependencies: `requests[socks]` (SOCKS egress lookup), `maxminddb` (mmdb reader), `tzdata` (the IANA time zone database for `zoneinfo`; Windows does not ship one). + +## [0.2.0] - 2026-05-28 + +### Added +- Public config helpers in `invisible_playwright.config`: `get_default_stealth_prefs(seed, *, pin, locale, timezone, extra_prefs, humanize, virtual_display)` returns a complete `firefox_user_prefs` dict; `get_default_args()` returns the baseline CLI args list (currently empty). Both also re-exported at the package root. +- `invisible_playwright.ensure_binary` re-exported at the package root for parity with the `cloakbrowser.download.ensure_binary` integration pattern that downstream projects (Skyvern, Crawlee, agno) already expect. +- These helpers let third-party fetchers (changedetection.io plugins, Crawlee `BrowserPool` subclasses, agno toolkits) drive `playwright.firefox.launch(executable_path=..., firefox_user_prefs=...)` themselves without depending on the `InvisiblePlaywright` context manager owning the lifecycle. +- `tests/unit/test_config_public.py`: 14 unit tests covering deterministic seed, locale / timezone / pin / extra_prefs / humanize variations, and round-trip via the public namespace. + +### Unchanged +- `InvisiblePlaywright` context manager surface is identical (backwards compatible). +- `BINARY_VERSION` stays at `firefox-7`. Python-only release; no new Firefox build. 
+ +## [0.1.8] - 2026-05-23 + +### Fixed +- [#20](https://github.com/feder-cr/invisible_playwright/issues/20): cross-origin iframes were unreachable from Playwright. `element_handle.content_frame()` returned `None`, `frame.evaluate()` threw cross-origin SOP errors, and `frame_locator(...).click()` timed out even with `force=True`. Root cause: FF150 defaults `fission.webContentIsolationStrategy=1` (`IsolateEverything`), which site-isolates every cross-origin iframe into a separate `webIsolated` content process even when `fission.autostart=False`. The parent's Juggler FrameTree then has a Frame placeholder with no docShell and no URL — every protocol op that needs to enter the iframe fails. Fix: pin `fission.webContentIsolationStrategy=0` (`IsolateNothing`) in the baseline prefs. The setting can be flipped back per session via `extra_prefs={"fission.webContentIsolationStrategy": 1}`. + +### Added +- `tests/test_cross_origin_iframe.py`: 4 unit + 5 e2e regression sentinels for cross-origin iframe interaction. The e2e layer runs entirely offline against two local HTTP servers on `127.0.0.1` (two ports = two SOP origins) and covers `page.frames` URL tracking, `content_frame()`, `frame.evaluate()`, `frame_locator(...).locator(...)`, and end-to-end `dispatch_event("click")` for plain, sandboxed and titled iframes. A future FF upgrade or fingerprint A/B that flips the pref back to `1` will fail the suite before shipping. + +### Unchanged +- `BINARY_VERSION` stays at `firefox-7`. Python-only release; no new Firefox build was needed. + +## [0.1.7] - 2026-05-21 + +### Fixed +- [#18](https://github.com/feder-cr/invisible_playwright/issues/18): Tab crash when running with `headless=True` on Windows on pages that trigger cross-process navigation. 
Two separate bugs that only manifested together: (1) the Chromium content sandbox at default level 6 puts content processes on `kAlternateWinstation`, but the wrapper hides the browser window on its own alt-desktop (`CreateDesktop` for headless on Windows). Mismatched desktops → cross-process navigations couldn't reparent windows → content process exits cleanly and Playwright fires `page.on('crash')`. (2) The canvas2d `getImageData` stealth spoof wrote to a read-only mapped `DataSourceSurface`. On GPU-backed canvases that memory is write-protected → segfault during the final `getImageData` at page unload. Wrapper now sets `security.sandbox.content.level=4` in the alt-desktop workaround set, and `firefox-7` ships the source fix that moves the noise to the JS array's writable backing buffer. + +### Changed +- `BINARY_VERSION` bumped from `firefox-5` to `firefox-7`. `firefox-6` was rolled back when its partial fix turned out to be wrong (the iframe-burst hypothesis was a dead end; bisection in the evening found the real two-bug cause documented above). + ## [0.1.6] - 2026-05-21 ### Added @@ -33,7 +73,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), ## [0.1.3] - 2026-05-19 ### Changed -- `BINARY_VERSION` bumped from `firefox-2` to `firefox-3`. The new archives on both Windows and Linux are built from a clean clone of [feder-cr/invisible-firefox#stealth/150](https://github.com/feder-cr/invisible-firefox/tree/stealth/150) — the consolidated source-of-truth fork (renamed from `feder-cr/firefox`; the companion `feder-cr/firefox-stealth` patches repo was deleted, all patches now live as commits on top of `mozilla-firefox/firefox`). +- `BINARY_VERSION` bumped from `firefox-2` to `firefox-3`. 
The new archives on both Windows and Linux are built from a clean clone of [feder-cr/invisible_firefox#stealth/150](https://github.com/feder-cr/invisible_firefox/tree/stealth/150) — the consolidated source-of-truth fork (renamed from `feder-cr/firefox`; the companion `feder-cr/firefox-stealth` patches repo was deleted, all patches now live as commits on top of `mozilla-firefox/firefox`). - The patched Firefox archive now ships the **proper C++ implementation** of `windowUtils.jugglerSendMouseEvent`, replacing the JS shim from 0.1.2. ### C++ fixes landed in this release @@ -44,7 +84,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), - **C7 (partial)**: storage stub for `nsIDocShell.languageOverride`. Workaround `InvisiblePlaywright(locale="")` recommended until full BC FIELD port lands. ### Verified -- Both archives built from same source: feder-cr/invisible-firefox commit `68906f1f9c55`. +- Both archives built from same source: feder-cr/invisible_firefox commit `68906f1f9c55`. - Windows + Linux smoke suite green: launch, `ctx.new_page()`, `page.mouse.{move,down,up,click,wheel}`, `navigator.webdriver=false`, sannysoft 32/33 PASS. - SHA256 published in `checksums.txt` on the `firefox-3` release. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index b56e5d3..8eb110d 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -7,7 +7,7 @@ Thanks for your interest in improving this project. Contributions are welcome vi - **Bug?** Open a [bug report](https://github.com/feder-cr/invisible_playwright/issues/new?template=bug_report.yml). - **Idea?** Open a [feature request](https://github.com/feder-cr/invisible_playwright/issues/new?template=feature_request.yml). - **Security issue?** Do **not** open a public issue — see [SECURITY.md](SECURITY.md). -- **The C++ patches** live in the companion repo [feder-cr/invisible-firefox](https://github.com/feder-cr/invisible-firefox) (branch `stealth/150`). Bugs in fingerprint spoofing usually belong there. 
+- **The C++ patches** live in the companion repo [feder-cr/invisible_firefox](https://github.com/feder-cr/invisible_firefox) (branch `stealth/150`). Source-level bugs in the spoofing patches belong there; detector results are reported in this repo with the stealth detection issue template. ## Scope @@ -18,7 +18,7 @@ This repository ships the **Python wrapper** (`invisible_playwright`) around a p - Binary download/caching, CLI, proxy plumbing - Tests, docs, examples, packaging -Out of scope (belongs in `invisible-firefox`): +Out of scope (belongs in `invisible_firefox`): - Changes to the Firefox C++ source - New preferences exposed by the patched binary @@ -65,7 +65,7 @@ Before opening, please: - Search [existing issues](https://github.com/feder-cr/invisible_playwright/issues) — the bug may already be tracked. - Reproduce on the **latest release** if possible. -- Confirm the issue is in the Python wrapper, not the patched Firefox itself. If a fingerprint is leaking or a detector flags the browser, open the issue at `feder-cr/invisible-firefox` instead. +- Confirm the issue is in the Python wrapper, not the patched Firefox itself. Source-level bugs in the Firefox fork (C++, IDL, Juggler JS) go to `feder-cr/invisible_firefox`; if a detector flags the browser, use the stealth detection issue template in this repo. 
Include: diff --git a/README.md b/README.md index c95e4fa..0ef05d1 100644 --- a/README.md +++ b/README.md @@ -6,56 +6,26 @@ [![Firefox 150.0.1](https://img.shields.io/badge/firefox-150.0.1-orange.svg)](https://www.mozilla.org/firefox/) [![GitHub release](https://img.shields.io/github/v/release/feder-cr/invisible_playwright.svg)](https://github.com/feder-cr/invisible_playwright/releases) [![GitHub stars](https://img.shields.io/github/stars/feder-cr/invisible_playwright.svg?style=social)](https://github.com/feder-cr/invisible_playwright/stargazers) +[![browser launches](https://img.shields.io/github/downloads/feder-cr/invisible_firefox/usage-counter/total?label=browser%20launches&color=blue)](https://github.com/feder-cr/invisible_firefox/releases/tag/usage-counter) [![LinkedIn](https://img.shields.io/badge/LinkedIn-Federico%20Elia-0A66C2?logo=linkedin&logoColor=white)](https://it.linkedin.com/in/federico-elia-5199951b6) -A patched Firefox **100% Playwright-compatible** that passes the hardest browser-fingerprint detectors in the wild. +**Stealth Firefox that passes every bot detection test. Drop-in Playwright replacement, fingerprint patched at the C++ level, not a JavaScript shim.** +![invisible_playwright - 5/5 detection suites passed](docs/screenshots/hero.gif) -## Results - -### Google reCAPTCHA v3 - **0.90 / 1.0** - -Top-tier score. Google classifies the session as "very likely a human". Most anti-detect stacks plateau around 0.3-0.7. - -![reCAPTCHA score 0.90](docs/screenshots/recaptcha_score.png) - -### Fingerprint Pro - **bot: not detected, VPN: false, tampering: false, dev tools: not detected** - -FingerprintJS Pro's full Smart Signals battery flips every flag to "Not detected". Browser correctly identified as Firefox 150 on Windows 10. Confidence score 0.9. 
- -![FingerprintPro not detected](docs/screenshots/fingerprintpro.png) - -### CreepJS - **0 lies**, fingerprint is internally coherent - -No contradictions between headless hints, spoofed values, and real rendering output. That "0 lies" is what kills most anti-detect browsers: one inconsistency (e.g. Chrome UA + Firefox WebGL) and the trust score collapses. - -![CreepJS 0 lies](docs/screenshots/creepjs.png) - -### BrowserLeaks WebRTC - **no public IP leak** - -WebRTC srflx address is the proxy egress IP; host candidates are private LAN. The real public IP never leaks via STUN, even on pages that configure their own ICE servers. Stock Firefox exposes an mDNS hostname (e.g. `abc-1234.local`) as a host ICE candidate, which is itself a stable per-session signal detectors fingerprint. invisible_playwright replaces host candidates with synthetic private-LAN IPs that match the spoofed network, removing the mDNS tell. - -![WebRTC no leaks](docs/screenshots/webrtc.png) - -### bot.sannysoft.com - **all checks pass** - -Every row green: WebDriver not present, Chrome-only properties absent, plugin/mime/languages arrays coherent, permissions API correct, iframe/source window checks pass. - -![Sannysoft all green](docs/screenshots/sannysoft.png) - ---- ## Why it's powerful -**Most anti-detect browsers patch Chromium at the JavaScript level** - they override `navigator`, `WebGLRenderingContext.getParameter`, canvas APIs, and so on via injected scripts. This has two fatal problems: + +**Most other anti-detect browsers patch Chromium at the JavaScript level** - they override `navigator`, `WebGLRenderingContext.getParameter`, canvas APIs, and so on via injected scripts. This has two fatal problems: 1. **JS patches are detectable.** Anti-bots enumerate native function `.toString()`, check descriptor configurability, compare property enumeration order, watch for prototype mutations. Every patch leaves a fingerprint of its own. 
CreepJS has an entire battery of "lies detectors" built around this. 2. **Chromium itself is now suspect.** Residential-proxy bot traffic is overwhelmingly Chromium-based, so detectors weight anything Chromium-shaped as risky by default. Chromium-based forks inherit Chrome's open-source layers (BoringSSL, Blink, V8, ANGLE) cleanly, but they still cannot fully match Chrome in practice: Chrome ships closed-source components on top (Widevine, proprietary codecs, Google Update / Safe Browsing endpoints) that flip detectable JS feature flags and network signals, and forks lag Chrome's release cadence by days to weeks, leaving telltale version-specific behaviours that detectors lock onto. **invisible_playwright patches Firefox at the C++ level.** The spoofed values come back out through the normal Gecko paths - there is no JS shim, no override, no `Object.defineProperty`. **From the page's point of view, the browser is just telling the truth.** Anti-bot lie-detectors have nothing to latch onto. -invisible_playwright spoofs **all the layers that matter, together, coherently** — Navigator, screen, GPU/WebGL, Canvas, fonts, audio, WebRTC, timezone, DevTools detection, SOCKS5 auth, and the rest. See [feder-cr/invisible-firefox](https://github.com/feder-cr/invisible-firefox) for the full per-layer breakdown of which C++ files are patched and why. +invisible_playwright spoofs **all the layers that matter, together, coherently**: Navigator, screen, GPU/WebGL, Canvas, fonts, audio, WebRTC, timezone, DevTools detection, SOCKS5 auth, and the rest. See [feder-cr/invisible_firefox](https://github.com/feder-cr/invisible_firefox) for the full per-layer breakdown of which C++ files are patched and why. Everything is driven by preferences - no hardcoded values in the binary. You change one pref, you change the spoofed value. @@ -63,23 +33,21 @@ Everything is driven by preferences - no hardcoded values in the binary. 
You cha ## How it compares -Commercial anti-detect browsers (Multilogin Mimic, GoLogin Orbita, AdsPower, Dolphin Anty) ship patched Chromium and apply most spoofing at the JavaScript layer. A few (Kameleo, Multilogin Stealthfox) also offer Firefox-based profiles, but the spoofing pattern is the same: runtime overrides on top of an unmodified rendering engine. That's the ceiling - and it's a low one. +**CloakBrowser** ships a similar pitch for Chromium, but its binary is **closed source** (the source-level patches are not published, you only get the compiled output), and it still hits the Chromium reCAPTCHA ceiling. The commercial anti-detect browsers (**Multilogin**, **GoLogin**, AdsPower, Dolphin, Kameleo) are paid SaaS that overlay JS-layer spoofing on a patched Chromium. Managed profiles are nice but raw detection bypass sits below both Camoufox and us. -| | invisible_playwright | Multilogin / GoLogin | AdsPower / Dolphin | Kameleo | +| | invisible_playwright | Camoufox | CloakBrowser | Multilogin | |---|---|---|---|---| -| Engine | Firefox (open source) | Chromium fork | Chromium fork | Chromium | -| Patch depth | C++ source | JS overrides | JS overrides | JS overrides | -| `.toString()` clean | ✅ Native Gecko path | ❌ Detectable shims | ❌ Detectable shims | ❌ Detectable shims | -| Canvas / WebGL | ✅ C++ level | ⚠️ JS override | ⚠️ JS override | ⚠️ JS override | -| SOCKS5 auth | ✅ Patched | ⚠️ Varies | ⚠️ Varies | ❌ | -| Self-hosted | ✅ | ❌ SaaS | ❌ SaaS | ❌ Cloud | -| reCAPTCHA v3 score | **0.90** | ~0.3-0.6 | ~0.3-0.5 | ~0.3-0.5 | -| FP Pro - bot detected | ✅ Not detected | ❌ Detected | ❌ Detected | ❌ Detected | -| FP Pro - tampering | ✅ Not detected | ❌ Detected | ❌ Detected | ❌ Detected | -| FP Pro - VPN flag | ✅ false | ❌ true | ❌ true | ❌ true | -| CreepJS lies | ✅ 0 | ❌ multiple | ❌ multiple | ❌ multiple | - -Competitor scores reflect our own testing on Windows 10 against the same five detection suites used above; results may vary with their evolving 
builds. +| Engine | Firefox 150 | Firefox (~1 year old base) | Chromium | Chromium fork | +| Patch depth | C++ source | C++ source | C++ source (binary only) | JS overrides | +| Maintenance | Active (weekly) | Gap (~1 year) | Active | Active SaaS | +| Open source | ✅ MIT | ✅ MPL | ❌ Closed source | ❌ Closed source | +| `.toString()` clean | ✅ | ✅ | ✅ | ❌ Detectable shims | +| Canvas / WebGL / Audio | ✅ C++ | ⚠️ Drift vs current FF | ✅ C++ | ⚠️ JS override | +| SOCKS5 auth | ✅ Patched | ❌ | ⚠️ Playwright proxy | ⚠️ Varies | +| **reCAPTCHA v3 score** | **0.90** | ~0.3-0.5 | ~0.3-0.5 | ~0.3-0.6 | +| FP Pro - bot detected | ✅ Not detected | ❌ Detected | ❌ Detected | ❌ Detected | +| CreepJS lies | ✅ 0 | ❌ Multiple | ✅ 0 | ❌ Multiple | +| Cost | Free | Free | Free | From $99/mo | --- @@ -172,6 +140,21 @@ with InvisiblePlaywright(proxy=proxy) as browser: Schemes supported: `socks5`, `socks4`, `http`, `https`. Auth works on all of them (SOCKS5 via patched `nsProtocolProxyService.cpp`, HTTP/HTTPS via Playwright). DNS is routed through the proxy by default, no local leak. +### Timezone + +The browser timezone follows `timezone=`: + +```python +# default: timezone is auto-derived from the egress IP (proxy egress if a +# proxy is set, otherwise the host's own public IP) +with InvisiblePlaywright(proxy=proxy) as browser: + ... + +# explicit IANA zone always wins — the only way to force a specific zone +with InvisiblePlaywright(proxy=proxy, timezone="America/New_York") as browser: + ... +``` + ### Pinning specific fingerprint fields By default everything comes from `seed`. 
To force specific values while the rest stays seed-derived: @@ -203,24 +186,6 @@ invisible_playwright version # wrapper and binary versions invisible_playwright clear-cache # remove all cached binaries ``` -## Known issues - -### `headless=True` on Windows can cause tab crashes on sites with heavy cross-process navigation - -Reported as [#18](https://github.com/feder-cr/invisible_playwright/issues/18) (`id.sky.com` and similar). On Windows, `headless=True` runs Firefox headed on a hidden alt-desktop created via `CreateDesktop`. Some sites (id.sky.com, anything else loading Adobe AppMeasurement in a way that triggers cross-process navigation) end up firing `page.on('crash')` after about 10 seconds. The cause is a window-parenting interaction between the alt-desktop and the GPU/content processes; the workaround is one of: - -```python -# Option A — keep the visible window (no alt-desktop) -with InvisiblePlaywright(seed=42, headless=False) as browser: - ... - -# Option B — run inside Xvfb on Linux (alt-desktop bug is Windows-only) -``` - -The visible window case works on every site we've tested. Linux + Xvfb is unaffected. - ---- - ## Related projects invisible_playwright takes a different angle than the major Firefox-hardening projects but stands on their shoulders: @@ -233,4 +198,4 @@ invisible_playwright takes a different angle than the major Firefox-hardening pr ## License -MIT - see [LICENSE](LICENSE). The patched Firefox binary is distributed under the MPL-2.0 (Firefox upstream license). The C++ patches against mozilla-central that produce that binary are at [feder-cr/invisible-firefox](https://github.com/feder-cr/invisible-firefox). +MIT - see [LICENSE](LICENSE). The patched Firefox binary is distributed under the MPL-2.0 (Firefox upstream license). The C++ patches against mozilla-central that produce that binary are at [feder-cr/invisible_firefox](https://github.com/feder-cr/invisible_firefox). 
diff --git a/SECURITY.md b/SECURITY.md index 19dbc11..83959a2 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -41,7 +41,7 @@ In scope: Out of scope here (report to the relevant project): -- Vulnerabilities in the patched Firefox C++ source — open a private report at [feder-cr/invisible-firefox](https://github.com/feder-cr/invisible-firefox/security/advisories/new) +- Vulnerabilities in the patched Firefox C++ source — open a private report at [feder-cr/invisible_firefox](https://github.com/feder-cr/invisible_firefox/security/advisories/new) - Vulnerabilities in upstream Firefox / mozilla-central — report to Mozilla per https://www.mozilla.org/security/ - Vulnerabilities in third-party dependencies (`playwright`, `requests`, etc.) — report to those projects directly diff --git a/docs/screenshots/hero.gif b/docs/screenshots/hero.gif new file mode 100644 index 0000000..eadbf1b Binary files /dev/null and b/docs/screenshots/hero.gif differ diff --git a/pyproject.toml b/pyproject.toml index 7793173..d08f552 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "invisible-playwright" -version = "0.1.6" +version = "0.2.0" description = "Playwright wrapper for a patched Firefox with deterministic stealth profile." 
readme = "README.md" requires-python = ">=3.11" @@ -22,7 +22,9 @@ classifiers = [ dependencies = [ "playwright>=1.40", "platformdirs>=4", - "requests>=2.31", + "requests[socks]>=2.31", + "maxminddb>=2.2", + "tzdata>=2024.1", "tqdm>=4.66", "pywin32>=306; sys_platform == 'win32'", ] diff --git a/src/invisible_playwright/__init__.py b/src/invisible_playwright/__init__.py index 6bae9f3..0871021 100644 --- a/src/invisible_playwright/__init__.py +++ b/src/invisible_playwright/__init__.py @@ -15,8 +15,30 @@ Quickstart: page = browser.new_page() page.click("#submit") # expanded into a Bezier trajectory """ -from .launcher import InvisiblePlaywright +from .config import get_default_args, get_default_stealth_prefs from .constants import BINARY_VERSION, FIREFOX_UPSTREAM_VERSION +from ._geo import GeoTimezoneError, resolve_session_timezone +from .download import ensure_binary, ensure_geoip_mmdb +from .launcher import InvisiblePlaywright -__version__ = "0.1.0" -__all__ = ["InvisiblePlaywright", "BINARY_VERSION", "FIREFOX_UPSTREAM_VERSION", "__version__"] +from importlib.metadata import PackageNotFoundError, version as _pkg_version + +try: + __version__ = _pkg_version("invisible-playwright") +except PackageNotFoundError: + # Editable / source checkout without an install record: fall back to a + # marker rather than risk shipping a stale hardcoded string. 
+ __version__ = "0.0.0+unknown" + +__all__ = [ + "InvisiblePlaywright", + "ensure_binary", + "ensure_geoip_mmdb", + "get_default_stealth_prefs", + "get_default_args", + "resolve_session_timezone", + "GeoTimezoneError", + "BINARY_VERSION", + "FIREFOX_UPSTREAM_VERSION", + "__version__", +] diff --git a/src/invisible_playwright/_fpforge/_sampler.py b/src/invisible_playwright/_fpforge/_sampler.py index 5653db8..692f600 100644 --- a/src/invisible_playwright/_fpforge/_sampler.py +++ b/src/invisible_playwright/_fpforge/_sampler.py @@ -84,6 +84,12 @@ _FONT_POOL = _load("font_pool.json") _FONT_CORE: list = _FONT_POOL["core"] _FONT_OPTIONAL: list = _FONT_POOL["optional"] _CPT_FONTS_OPT = _load("cpt_fonts_optional_given_class.json")["table"] +# Browsing-history pool + CPT (per-class probabilities for visited sites). +# Drives _recaptcha_seed's cookie pre-seed: each persona ends up with a +# coherent list of ~15-30 visited sites whose categories correlate with +# gpu_class (workstation → dev-heavy, integrated_old → shop+news-heavy). +_BROWSING_POOL: list = _load("browsing_pool.json")["entries"] +_CPT_BROWSING = _load("cpt_browsing_given_class.json")["table"] # ═══════════════════════════════════════════════════════════════════════ @@ -282,6 +288,33 @@ def derive_font_whitelist(gpu_class: str, rng) -> str: return derive_font_prefs(gpu_class, rng)["whitelist"] +# ═══════════════════════════════════════════════════════════════════════ +# BROWSING HISTORY (Bayesian: per-site P(visited|gpu_class)) +# ═══════════════════════════════════════════════════════════════════════ +def derive_browsing_history(gpu_class: str, rng) -> list: + """Sample which sites this persona has visited recently. + + Each site in the pool has a per-class probability (CPT). We sample + independently per-site, producing a list of dicts: + [{"name": "github.com", "category": "dev", "cookie_profile": "ga_cf"}, ...] 
+ + Sum of CPT probabilities per class is tuned to land ~15-30 visited sites + on average — an established-user signature. Sorted by name for stable + output across runs of the same seed. + """ + cpt = _CPT_BROWSING.get(gpu_class) + if cpt is None: + cpt = _CPT_BROWSING["mid_range"] + visited: list = [] + for entry in _BROWSING_POOL: + name = entry["name"] + p = cpt.get(name, 0.3) # default 0.3 for missing CPT row + if rng.random() < p: + visited.append(dict(entry)) # copy to avoid mutating pool + visited.sort(key=lambda e: e["name"]) + return visited + + # ═══════════════════════════════════════════════════════════════════════ # PUBLIC API: Forge # ═══════════════════════════════════════════════════════════════════════ @@ -350,6 +383,12 @@ class Forge: bundle["gpu_class"], self._rng ).items() }, + # Bayesian browsing history (per-class P(visited|gpu_class)). + # Consumed by _recaptcha_seed.py to seed coherent cookie history + # when invisible_playwright is launched with prep_recaptcha=True. + "browsing_history": derive_browsing_history( + bundle["gpu_class"], self._rng + ), } diff --git a/src/invisible_playwright/_fpforge/data/browsing_pool.json b/src/invisible_playwright/_fpforge/data/browsing_pool.json new file mode 100644 index 0000000..6e98cd9 --- /dev/null +++ b/src/invisible_playwright/_fpforge/data/browsing_pool.json @@ -0,0 +1,64 @@ +{ + "_comment": [ + "Pool of everyday websites used by the browsing_history node.", + "Each entry: { name, category, cookie_profile }.", + "- name: bare domain (no scheme, no leading dot).", + "- category: dev / shop / news / reference / media / community / misc.", + "- cookie_profile: short tag pointing to a cookie-template recipe used by", + " _recaptcha_seed.py to generate concrete cookies (so heavy-analytics sites", + " get _ga+_gid+OneTrust, simple sites get just _ga, dev tools get GH-style).", + "Add new entries here + add per-class probabilities in cpt_browsing_given_class.json." 
+ ], + "entries": [ + {"name": "youtube.com", "category": "media", "cookie_profile": "ga_only"}, + {"name": "wikipedia.org", "category": "reference", "cookie_profile": "minimal"}, + {"name": "mozilla.org", "category": "reference", "cookie_profile": "ga_consent"}, + {"name": "w3schools.com", "category": "dev", "cookie_profile": "ga_consent_clarity"}, + {"name": "mdn.io", "category": "dev", "cookie_profile": "minimal"}, + {"name": "duckduckgo.com", "category": "reference", "cookie_profile": "minimal"}, + {"name": "github.com", "category": "dev", "cookie_profile": "ga_cf"}, + {"name": "stackoverflow.com", "category": "dev", "cookie_profile": "ga_consent_clarity"}, + {"name": "npmjs.com", "category": "dev", "cookie_profile": "ga_consent"}, + {"name": "gitlab.com", "category": "dev", "cookie_profile": "ga_cf"}, + {"name": "pypi.org", "category": "dev", "cookie_profile": "minimal"}, + {"name": "docs.python.org", "category": "dev", "cookie_profile": "minimal"}, + {"name": "rust-lang.org", "category": "dev", "cookie_profile": "ga_consent"}, + {"name": "go.dev", "category": "dev", "cookie_profile": "ga_consent"}, + {"name": "amazon.com", "category": "shop", "cookie_profile": "ga_consent_clarity"}, + {"name": "ebay.com", "category": "shop", "cookie_profile": "ga_consent"}, + {"name": "etsy.com", "category": "shop", "cookie_profile": "ga_consent_clarity"}, + {"name": "bestbuy.com", "category": "shop", "cookie_profile": "ga_consent_clarity"}, + {"name": "target.com", "category": "shop", "cookie_profile": "ga_consent_clarity"}, + {"name": "nytimes.com", "category": "news", "cookie_profile": "ga_consent_clarity"}, + {"name": "cnn.com", "category": "news", "cookie_profile": "ga_consent"}, + {"name": "bbc.com", "category": "news", "cookie_profile": "ga_consent"}, + {"name": "theguardian.com", "category": "news", "cookie_profile": "ga_consent_clarity"}, + {"name": "reuters.com", "category": "news", "cookie_profile": "ga_consent"}, + {"name": "apnews.com", "category": "news", 
"cookie_profile": "ga_consent"}, + {"name": "washingtonpost.com", "category": "news", "cookie_profile": "ga_consent"}, + {"name": "techcrunch.com", "category": "news", "cookie_profile": "ga_consent_clarity"}, + {"name": "theverge.com", "category": "news", "cookie_profile": "ga_consent"}, + {"name": "arstechnica.com", "category": "news", "cookie_profile": "ga_consent"}, + {"name": "wired.com", "category": "news", "cookie_profile": "ga_consent_clarity"}, + {"name": "engadget.com", "category": "news", "cookie_profile": "ga_consent"}, + {"name": "9to5mac.com", "category": "news", "cookie_profile": "ga_consent"}, + {"name": "medium.com", "category": "community", "cookie_profile": "ga_consent"}, + {"name": "dev.to", "category": "community", "cookie_profile": "ga_consent"}, + {"name": "reddit.com", "category": "community", "cookie_profile": "ga_cf"}, + {"name": "news.ycombinator.com", "category": "community", "cookie_profile": "minimal"}, + {"name": "quora.com", "category": "community", "cookie_profile": "ga_consent_clarity"}, + {"name": "stackexchange.com", "category": "community", "cookie_profile": "ga_consent_clarity"}, + {"name": "imdb.com", "category": "media", "cookie_profile": "ga_consent_clarity"}, + {"name": "rottentomatoes.com", "category": "media", "cookie_profile": "ga_consent"}, + {"name": "metacritic.com", "category": "media", "cookie_profile": "ga_consent"}, + {"name": "allrecipes.com", "category": "misc", "cookie_profile": "ga_consent_clarity"}, + {"name": "epicurious.com", "category": "misc", "cookie_profile": "ga_consent"}, + {"name": "tripadvisor.com", "category": "misc", "cookie_profile": "ga_consent_clarity"}, + {"name": "weather.com", "category": "reference", "cookie_profile": "ga_consent"}, + {"name": "timeanddate.com", "category": "reference", "cookie_profile": "ga_consent"}, + {"name": "thesaurus.com", "category": "reference", "cookie_profile": "ga_consent_clarity"}, + {"name": "kayak.com", "category": "shop", "cookie_profile": 
"ga_consent_clarity"}, + {"name": "booking.com", "category": "shop", "cookie_profile": "ga_consent_clarity"}, + {"name": "airbnb.com", "category": "shop", "cookie_profile": "ga_consent"} + ] +} diff --git a/src/invisible_playwright/_fpforge/data/cpt_browsing_given_class.json b/src/invisible_playwright/_fpforge/data/cpt_browsing_given_class.json new file mode 100644 index 0000000..b2e3b1a --- /dev/null +++ b/src/invisible_playwright/_fpforge/data/cpt_browsing_given_class.json @@ -0,0 +1,138 @@ +{ + "_comment": [ + "Per-class probability that a persona of a given gpu_class has visited each", + "site in the pool. Used by the browsing_history node to derive a coherent", + "visited-domain list per persona.", + "", + "Probabilities are tuned so each class samples ~15-30 sites on average", + "(sum across all 50 entries falls in that range), giving an established-user", + "look. Categories are biased by class:", + " - workstation/high_end: higher P(dev) + high P(news/media)", + " - mid_range: balanced", + " - low_end/integrated_*: lower P(dev), higher P(shop/news/reference)", + "", + "Missing class falls back to mid_range via Node CPT pool fallback." 
+ ], + "table": { + "workstation": { + "youtube.com": 0.80, "wikipedia.org": 0.85, "mozilla.org": 0.70, + "w3schools.com": 0.40, "mdn.io": 0.55, "duckduckgo.com": 0.45, + "github.com": 0.95, "stackoverflow.com": 0.90, "npmjs.com": 0.65, + "gitlab.com": 0.50, "pypi.org": 0.55, "docs.python.org": 0.60, + "rust-lang.org": 0.35, "go.dev": 0.30, + "amazon.com": 0.70, "ebay.com": 0.25, "etsy.com": 0.15, + "bestbuy.com": 0.45, "target.com": 0.30, + "nytimes.com": 0.55, "cnn.com": 0.40, "bbc.com": 0.55, + "theguardian.com": 0.45, "reuters.com": 0.40, "apnews.com": 0.30, + "washingtonpost.com": 0.40, + "techcrunch.com": 0.65, "theverge.com": 0.60, "arstechnica.com": 0.65, + "wired.com": 0.50, "engadget.com": 0.35, "9to5mac.com": 0.30, + "medium.com": 0.55, "dev.to": 0.40, "reddit.com": 0.70, + "news.ycombinator.com": 0.65, "quora.com": 0.20, "stackexchange.com": 0.60, + "imdb.com": 0.45, "rottentomatoes.com": 0.25, "metacritic.com": 0.20, + "allrecipes.com": 0.20, "epicurious.com": 0.15, "tripadvisor.com": 0.30, + "weather.com": 0.55, "timeanddate.com": 0.30, "thesaurus.com": 0.25, + "kayak.com": 0.30, "booking.com": 0.35, "airbnb.com": 0.30 + }, + "high_end": { + "youtube.com": 0.85, "wikipedia.org": 0.80, "mozilla.org": 0.60, + "w3schools.com": 0.45, "mdn.io": 0.45, "duckduckgo.com": 0.40, + "github.com": 0.85, "stackoverflow.com": 0.80, "npmjs.com": 0.50, + "gitlab.com": 0.40, "pypi.org": 0.45, "docs.python.org": 0.50, + "rust-lang.org": 0.30, "go.dev": 0.25, + "amazon.com": 0.75, "ebay.com": 0.30, "etsy.com": 0.20, + "bestbuy.com": 0.50, "target.com": 0.35, + "nytimes.com": 0.50, "cnn.com": 0.50, "bbc.com": 0.50, + "theguardian.com": 0.40, "reuters.com": 0.35, "apnews.com": 0.30, + "washingtonpost.com": 0.35, + "techcrunch.com": 0.60, "theverge.com": 0.65, "arstechnica.com": 0.60, + "wired.com": 0.50, "engadget.com": 0.40, "9to5mac.com": 0.35, + "medium.com": 0.50, "dev.to": 0.35, "reddit.com": 0.75, + "news.ycombinator.com": 0.55, "quora.com": 0.25, 
"stackexchange.com": 0.55, + "imdb.com": 0.55, "rottentomatoes.com": 0.35, "metacritic.com": 0.30, + "allrecipes.com": 0.25, "epicurious.com": 0.20, "tripadvisor.com": 0.30, + "weather.com": 0.55, "timeanddate.com": 0.30, "thesaurus.com": 0.25, + "kayak.com": 0.30, "booking.com": 0.40, "airbnb.com": 0.30 + }, + "mid_range": { + "youtube.com": 0.85, "wikipedia.org": 0.75, "mozilla.org": 0.45, + "w3schools.com": 0.40, "mdn.io": 0.30, "duckduckgo.com": 0.35, + "github.com": 0.55, "stackoverflow.com": 0.55, "npmjs.com": 0.30, + "gitlab.com": 0.25, "pypi.org": 0.25, "docs.python.org": 0.30, + "rust-lang.org": 0.15, "go.dev": 0.15, + "amazon.com": 0.80, "ebay.com": 0.40, "etsy.com": 0.30, + "bestbuy.com": 0.55, "target.com": 0.40, + "nytimes.com": 0.45, "cnn.com": 0.55, "bbc.com": 0.45, + "theguardian.com": 0.35, "reuters.com": 0.30, "apnews.com": 0.30, + "washingtonpost.com": 0.30, + "techcrunch.com": 0.45, "theverge.com": 0.50, "arstechnica.com": 0.40, + "wired.com": 0.45, "engadget.com": 0.35, "9to5mac.com": 0.30, + "medium.com": 0.45, "dev.to": 0.25, "reddit.com": 0.70, + "news.ycombinator.com": 0.30, "quora.com": 0.35, "stackexchange.com": 0.40, + "imdb.com": 0.60, "rottentomatoes.com": 0.40, "metacritic.com": 0.35, + "allrecipes.com": 0.35, "epicurious.com": 0.25, "tripadvisor.com": 0.40, + "weather.com": 0.60, "timeanddate.com": 0.25, "thesaurus.com": 0.30, + "kayak.com": 0.35, "booking.com": 0.45, "airbnb.com": 0.40 + }, + "low_end": { + "youtube.com": 0.85, "wikipedia.org": 0.70, "mozilla.org": 0.35, + "w3schools.com": 0.30, "mdn.io": 0.20, "duckduckgo.com": 0.30, + "github.com": 0.30, "stackoverflow.com": 0.30, "npmjs.com": 0.15, + "gitlab.com": 0.10, "pypi.org": 0.10, "docs.python.org": 0.15, + "rust-lang.org": 0.05, "go.dev": 0.05, + "amazon.com": 0.85, "ebay.com": 0.50, "etsy.com": 0.40, + "bestbuy.com": 0.55, "target.com": 0.45, + "nytimes.com": 0.40, "cnn.com": 0.60, "bbc.com": 0.40, + "theguardian.com": 0.30, "reuters.com": 0.25, "apnews.com": 0.30, + 
"washingtonpost.com": 0.25, + "techcrunch.com": 0.30, "theverge.com": 0.35, "arstechnica.com": 0.25, + "wired.com": 0.40, "engadget.com": 0.30, "9to5mac.com": 0.25, + "medium.com": 0.35, "dev.to": 0.15, "reddit.com": 0.65, + "news.ycombinator.com": 0.15, "quora.com": 0.45, "stackexchange.com": 0.25, + "imdb.com": 0.65, "rottentomatoes.com": 0.45, "metacritic.com": 0.35, + "allrecipes.com": 0.45, "epicurious.com": 0.30, "tripadvisor.com": 0.45, + "weather.com": 0.65, "timeanddate.com": 0.25, "thesaurus.com": 0.35, + "kayak.com": 0.35, "booking.com": 0.50, "airbnb.com": 0.40 + }, + "integrated_modern": { + "youtube.com": 0.85, "wikipedia.org": 0.70, "mozilla.org": 0.40, + "w3schools.com": 0.35, "mdn.io": 0.25, "duckduckgo.com": 0.35, + "github.com": 0.40, "stackoverflow.com": 0.40, "npmjs.com": 0.20, + "gitlab.com": 0.15, "pypi.org": 0.20, "docs.python.org": 0.20, + "rust-lang.org": 0.10, "go.dev": 0.10, + "amazon.com": 0.80, "ebay.com": 0.40, "etsy.com": 0.30, + "bestbuy.com": 0.50, "target.com": 0.40, + "nytimes.com": 0.40, "cnn.com": 0.55, "bbc.com": 0.45, + "theguardian.com": 0.35, "reuters.com": 0.30, "apnews.com": 0.30, + "washingtonpost.com": 0.30, + "techcrunch.com": 0.40, "theverge.com": 0.45, "arstechnica.com": 0.30, + "wired.com": 0.40, "engadget.com": 0.30, "9to5mac.com": 0.25, + "medium.com": 0.40, "dev.to": 0.20, "reddit.com": 0.65, + "news.ycombinator.com": 0.25, "quora.com": 0.40, "stackexchange.com": 0.35, + "imdb.com": 0.60, "rottentomatoes.com": 0.40, "metacritic.com": 0.30, + "allrecipes.com": 0.40, "epicurious.com": 0.25, "tripadvisor.com": 0.40, + "weather.com": 0.60, "timeanddate.com": 0.25, "thesaurus.com": 0.30, + "kayak.com": 0.35, "booking.com": 0.45, "airbnb.com": 0.40 + }, + "integrated_old": { + "youtube.com": 0.75, "wikipedia.org": 0.65, "mozilla.org": 0.30, + "w3schools.com": 0.20, "mdn.io": 0.10, "duckduckgo.com": 0.25, + "github.com": 0.15, "stackoverflow.com": 0.20, "npmjs.com": 0.05, + "gitlab.com": 0.05, "pypi.org": 0.05, 
"docs.python.org": 0.10, + "rust-lang.org": 0.02, "go.dev": 0.02, + "amazon.com": 0.85, "ebay.com": 0.55, "etsy.com": 0.45, + "bestbuy.com": 0.55, "target.com": 0.50, + "nytimes.com": 0.45, "cnn.com": 0.65, "bbc.com": 0.40, + "theguardian.com": 0.30, "reuters.com": 0.25, "apnews.com": 0.35, + "washingtonpost.com": 0.30, + "techcrunch.com": 0.20, "theverge.com": 0.25, "arstechnica.com": 0.15, + "wired.com": 0.30, "engadget.com": 0.20, "9to5mac.com": 0.20, + "medium.com": 0.30, "dev.to": 0.05, "reddit.com": 0.55, + "news.ycombinator.com": 0.05, "quora.com": 0.55, "stackexchange.com": 0.15, + "imdb.com": 0.70, "rottentomatoes.com": 0.50, "metacritic.com": 0.35, + "allrecipes.com": 0.55, "epicurious.com": 0.35, "tripadvisor.com": 0.50, + "weather.com": 0.70, "timeanddate.com": 0.30, "thesaurus.com": 0.40, + "kayak.com": 0.40, "booking.com": 0.55, "airbnb.com": 0.40 + } + } +} diff --git a/src/invisible_playwright/_fpforge/profile.py b/src/invisible_playwright/_fpforge/profile.py index 16c52a4..fcdf024 100644 --- a/src/invisible_playwright/_fpforge/profile.py +++ b/src/invisible_playwright/_fpforge/profile.py @@ -120,6 +120,11 @@ class Profile: webgl: WebGLProfile fonts: List[str] dark_theme: bool + # Bayesian browsing-history: list of {name, category, cookie_profile} + # dicts sampled from data/browsing_pool.json with per-class CPT. Used + # by _recaptcha_seed.py to build a coherent cookie pre-seed when the + # caller opts in via InvisiblePlaywright(prep_recaptcha=True).
+ browsing_history: List[Dict[str, str]] = field(default_factory=list) _raw: Dict[str, Any] = field(default_factory=dict, repr=False, compare=False) def to_prefs_dict(self) -> Dict[str, Any]: @@ -255,5 +260,6 @@ def generate_profile(seed: int, pin: Optional[Dict[str, Any]] = None) -> Profile webgl=WebGLProfile(msaa_samples=int(raw["msaa_samples"])), fonts=fonts, dark_theme=bool(raw["dark_theme"]), + browsing_history=list(raw.get("browsing_history") or []), _raw=raw, ) diff --git a/src/invisible_playwright/_geo.py b/src/invisible_playwright/_geo.py new file mode 100644 index 0000000..02971e1 --- /dev/null +++ b/src/invisible_playwright/_geo.py @@ -0,0 +1,164 @@ +"""Resolve the session timezone from the egress IP (``timezone="auto"``). + +Approach B: discover the egress IP with one HTTP request — routed *through the +proxy* when one is set, otherwise a direct request that sees the host's own +public IP — then map IP → IANA timezone with an offline mmdb +(``daijro/geoip-all-in-one``, downloaded + cached by ``download.py``). + +Precedence (see ``resolve_session_timezone``): + + explicit IANA → unchanged explicit always wins + "" / "auto" → egress ALWAYS resolve. With a proxy, from the proxy + egress IP; without a proxy, from the host's + own public IP. This is the default. + +On failure: + with a proxy → raise a foreign proxy paired with the host TZ is + the precise ``timezone_mismatch`` signal, so + we fail loudly rather than fall back silently. + without a proxy → "" (host) the host TZ is a safe default, so a transient + lookup failure must not break the launch. +""" +from __future__ import annotations + +import ipaddress +from typing import Any, Dict, Optional +from urllib.parse import quote + +import requests + + +class GeoTimezoneError(RuntimeError): + """Raised when ``timezone="auto"`` cannot resolve a valid IANA zone.""" + + +# Plain-text IP echo endpoints (each returns just the caller's public IP). 
+_IP_ECHO_ENDPOINTS = ( + "https://api.ipify.org", + "https://icanhazip.com", + "https://checkip.amazonaws.com", +) + +_SOCKS_SCHEMES = ("socks5://", "socks4://", "socks://") + + +def _proxy_is_set(proxy: Optional[Dict[str, str]]) -> bool: + if not proxy: + return False + server = (proxy.get("server") or "").strip() + return bool(server) and server.lower() != "direct://" + + +def _proxies_for_requests(proxy: Dict[str, str]) -> Dict[str, str]: + """Translate our proxy dict into a ``requests`` proxies mapping. + + SOCKS5 uses the ``socks5h`` scheme so DNS is resolved proxy-side (matches + ``network.proxy.socks_remote_dns=True`` in the Firefox path). HTTP/HTTPS + pass through unchanged. Credentials are URL-encoded. + """ + server = (proxy.get("server") or "").strip() + low = server.lower() + if low.startswith("socks5://") or low.startswith("socks://"): + scheme = "socks5h" + elif low.startswith("socks4://"): + scheme = "socks4" + elif low.startswith("https://"): + scheme = "https" + else: + scheme = "http" + + host_port = server.split("://", 1)[1] if "://" in server else server + user = proxy.get("username") or "" + pwd = proxy.get("password") or "" + if user: + auth = f"{quote(user, safe='')}:{quote(pwd, safe='')}@" + else: + auth = "" + url = f"{scheme}://{auth}{host_port}" + return {"http": url, "https": url} + + +def discover_egress_ip( + proxy: Optional[Dict[str, str]] = None, *, timeout: float = 10.0 +) -> str: + """Return the public egress IP. + + Routes the request through ``proxy`` when given (SOCKS support requires + ``requests[socks]`` / PySocks); with ``proxy=None`` it makes a direct + request that sees the host's own public IP. Tries each echo endpoint in + turn; raises :class:`GeoTimezoneError` if none return a valid IP. 
+    """
+    proxies = _proxies_for_requests(proxy) if proxy else None
+    last_err: Optional[Exception] = None
+    for url in _IP_ECHO_ENDPOINTS:
+        try:
+            resp = requests.get(url, proxies=proxies, timeout=timeout)
+            resp.raise_for_status()
+            ip = resp.text.strip()
+            ipaddress.ip_address(ip)  # validate (raises ValueError if not an IP)
+            return ip
+        except Exception as exc:  # noqa: BLE001 - try the next endpoint
+            last_err = exc
+            continue
+    raise GeoTimezoneError(
+        f"could not discover the egress IP via {len(_IP_ECHO_ENDPOINTS)} "
+        f"endpoints (last error: {last_err!r}). For SOCKS proxies make sure "
+        f"requests[socks] / PySocks is installed."
+    ) from last_err  # chain the final endpoint failure as __cause__
+
+
+def ip_to_timezone(ip: str, mmdb_path: Any) -> str:
+    """Map ``ip`` to its IANA timezone using the offline mmdb.
+
+    Reads the standard MaxMind ``location.time_zone`` field and validates it
+    against the system tz database. Raises :class:`GeoTimezoneError` if the IP
+    is absent from the DB or the zone is missing / not a valid IANA name.
+    """
+    import maxminddb
+
+    with maxminddb.open_database(str(mmdb_path)) as reader:
+        record = reader.get(ip)
+    if not record:
+        raise GeoTimezoneError(f"egress IP {ip} not present in the geoip database")
+    tz = ((record.get("location") or {}) if isinstance(record, dict) else {}).get(
+        "time_zone"
+    )
+    if not tz:
+        raise GeoTimezoneError(f"no timezone for egress IP {ip} in the geoip database")
+    from zoneinfo import ZoneInfo, ZoneInfoNotFoundError
+
+    try:
+        ZoneInfo(tz)
+    except (ZoneInfoNotFoundError, ValueError) as exc:
+        raise GeoTimezoneError(
+            f"geoip returned an invalid IANA zone {tz!r} for {ip}: {exc}"
+        ) from exc
+    return tz
+
+
+def resolve_session_timezone(
+    timezone: str, proxy: Optional[Dict[str, str]]
+) -> str:
+    """Map the user's ``timezone`` setting to a concrete IANA zone (or ``""``).
+
+    See the module docstring for the full precedence table.
``""``/``"auto"`` + ALWAYS resolve from the egress IP (proxy egress if a proxy is set, else the + host's own public IP). On failure: with a proxy we raise + :class:`GeoTimezoneError` (never silently use the host TZ behind a foreign + proxy); without a proxy we fall back to ``""`` (host TZ) so a transient + lookup failure can't break the launch. + """ + tz = (timezone or "").strip() + if tz and tz.lower() != "auto": + return tz # explicit IANA wins + # "" or "auto" → always resolve from the egress IP. + from .download import ensure_geoip_mmdb + + proxy_set = _proxy_is_set(proxy) + try: + ip = discover_egress_ip(proxy if proxy_set else None) + return ip_to_timezone(ip, ensure_geoip_mmdb()) + except Exception: + if proxy_set: + raise # fail-early behind a proxy (timezone_mismatch trap) + return "" # no proxy: host TZ is a safe fallback diff --git a/src/invisible_playwright/_recaptcha_seed.py b/src/invisible_playwright/_recaptcha_seed.py new file mode 100644 index 0000000..cd998a2 --- /dev/null +++ b/src/invisible_playwright/_recaptcha_seed.py @@ -0,0 +1,340 @@ +"""Deterministic reCAPTCHA cookie pre-seed. + +Consumes the Bayesian-sampled `browsing_history` from the persona Profile +(see `_fpforge/_sampler.py:derive_browsing_history`). For each visited +site, builds 1-5 realistic cookies whose composition is chosen by the +site's `cookie_profile` tag (analytics-only / consent / cloudflare-bot- +management / etc.). All values seeded deterministically from the persona +seed, so a given persona always presents the SAME cookies across sessions. + +In addition, always seeds 5 cookies on .google.com (NID, CONSENT, SOCS, +_GRECAPTCHA, ENID). Excludes 1P_JAR which was deprecated by Google in 2022 +— including it now is an anachronism flag. + +Public API: + await seed_recaptcha_cookies_async(context, profile, timezone=None) + seed_recaptcha_cookies_sync(context, profile, timezone=None) + +`profile` is an `_fpforge.Profile`; `timezone` is the IANA tz (e.g. 
+"Europe/Rome") used to derive the CONSENT cookie's language token, so a +European-tz persona gets CONSENT in their language not en+FX. +""" +from __future__ import annotations + +import datetime +import random +import time +from typing import Any, List, Optional + +# URL-safe base64 alphabet (no padding chars). +_B64_ALPHABET = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_" +_HEX_ALPHABET = "0123456789abcdef" + + +def _sub_seed(seed: int, tag: str) -> int: + """FNV-1a mix → independent PRNG streams per logical bucket from one seed.""" + h = 0xcbf29ce484222325 ^ (seed & 0xFFFFFFFF) + for c in tag.encode("ascii"): + h ^= c + h = (h * 0x100000001b3) & 0xFFFFFFFFFFFFFFFF + return h or 0xdeadbeef + + +def _b64_rand(rng: random.Random, length: int) -> str: + return "".join(rng.choice(_B64_ALPHABET) for _ in range(length)) + + +def _hex_rand(rng: random.Random, length: int) -> str: + return "".join(rng.choice(_HEX_ALPHABET) for _ in range(length)) + + +def _yyyymmdd_utc(ts: int) -> str: + return datetime.datetime.utcfromtimestamp(ts).strftime("%Y%m%d") + + +# IANA timezone -> (country_code, lang) for CONSENT cookie coherence. +# Real EU users get CONSENT with `++NNN`; non-EU gets `en+FX+NNN`. +# Default fallback `en+FX+NNN` for any tz not in this map. 
+_TZ_TO_REGION = { + "Europe/Rome": ("IT", "it"), + "Europe/Berlin": ("DE", "de"), + "Europe/Paris": ("FR", "fr"), + "Europe/Madrid": ("ES", "es"), + "Europe/London": ("GB", "en"), + "Europe/Amsterdam": ("NL", "nl"), + "Europe/Brussels": ("BE", "fr"), + "Europe/Vienna": ("AT", "de"), + "Europe/Zurich": ("CH", "de"), + "Europe/Dublin": ("IE", "en"), + "Europe/Lisbon": ("PT", "pt"), + "Europe/Stockholm": ("SE", "sv"), + "Europe/Oslo": ("NO", "no"), + "Europe/Copenhagen": ("DK", "da"), + "Europe/Helsinki": ("FI", "fi"), + "Europe/Warsaw": ("PL", "pl"), + "Europe/Prague": ("CZ", "cs"), + "Europe/Athens": ("GR", "el"), + "Asia/Tokyo": ("FX", "ja"), + "Asia/Shanghai": ("FX", "zh"), + "Asia/Hong_Kong": ("FX", "zh"), + "Asia/Seoul": ("FX", "ko"), +} + + +def _consent_region_lang(timezone: Optional[str]) -> tuple: + """Map IANA tz → (region_token, lang_2char) for CONSENT cookie. + Default `("FX", "en")` for US/unknown.""" + if timezone and timezone in _TZ_TO_REGION: + return _TZ_TO_REGION[timezone] + return ("FX", "en") + + +# --------------------------------------------------------------------------- +# .google.com cookie batch (always present, regardless of browsing history) +# --------------------------------------------------------------------------- + +def _google_cookies(rng: random.Random, now: int, + timezone: Optional[str] = None) -> List[dict]: + consent_age = rng.randint(60, 720) * 86400 + region, lang = _consent_region_lang(timezone) + # NID 3-digit prefix range broadened to 100-540 to cover historical NID + # versions (137, 105, 511, 525 etc. observed in real captures). + return [ + {"name": "NID", + "value": f"{rng.randint(100, 540)}={_b64_rand(rng, 178)}", + "domain": ".google.com", "path": "/", + "expires": now + 180 * 86400, + "httpOnly": True, "secure": True, "sameSite": "None"}, + {"name": "CONSENT", + "value": f"YES+cb.{_yyyymmdd_utc(now - consent_age)}-" + f"{rng.randint(10, 19):02d}-p{rng.randint(0, 9)}." 
+ f"{lang}+{region}+{rng.randint(100, 999)}", + "domain": ".google.com", "path": "/", + "expires": now + 395 * 86400, + "secure": True, "sameSite": "Lax"}, + # 1P_JAR removed: Google deprecated it in 2022. Including it now is + # an anachronism flag for fingerprinters that look at cookie freshness. + {"name": "SOCS", + "value": f"CAES{_b64_rand(rng, 56)}", + "domain": ".google.com", "path": "/", + "expires": now + 395 * 86400, + "secure": True, "sameSite": "Lax"}, + {"name": "_GRECAPTCHA", + "value": _b64_rand(rng, 124), + "domain": ".google.com", "path": "/", + "expires": now + 180 * 86400, + "secure": True, "sameSite": "None"}, + {"name": "ENID", + "value": _b64_rand(rng, 252), + "domain": ".google.com", "path": "/", + "expires": now + 395 * 86400, + "httpOnly": True, "secure": True, "sameSite": "Lax"}, + ] + + +# --------------------------------------------------------------------------- +# Per-site cookie generators (recipes keyed by site["cookie_profile"]) +# --------------------------------------------------------------------------- + +def _norm_domain(domain: str) -> str: + return domain if domain.startswith(".") else "." 
+ domain + + +def _ga_cookie(rng: random.Random, now: int, domain: str) -> dict: + first_age = rng.randint(7, 395) * 86400 + return {"name": "_ga", + "value": f"GA1.2.{rng.randint(100000000, 999999999)}.{now - first_age}", + "domain": domain, "path": "/", + "expires": now + 395 * 86400, + "secure": True, "sameSite": "Lax"} + + +def _gid_cookie(rng: random.Random, now: int, domain: str) -> dict: + return {"name": "_gid", + "value": f"GA1.2.{rng.randint(100000000, 999999999)}.{now - rng.randint(60, 86400)}", + "domain": domain, "path": "/", + "expires": now + 86400, + "secure": True, "sameSite": "Lax"} + + +def _cf_bm_cookie(rng: random.Random, now: int, domain: str) -> dict: + return {"name": "__cf_bm", + "value": f"{_b64_rand(rng, 43)}.{rng.randint(1700000000, now)}-1-1-1-1", + "domain": domain, "path": "/", + "expires": now + 1800, + "secure": True, "sameSite": "None"} + + +def _onetrust_cookie(rng: random.Random, now: int, domain: str) -> dict: + age_d = rng.randint(7, 365) + iso = datetime.datetime.utcfromtimestamp(now - age_d * 86400).strftime( + "%Y-%m-%dT%H:%M:%S.000Z" + ) + return {"name": "OptanonAlertBoxClosed", + "value": iso, + "domain": domain, "path": "/", + "expires": now + 395 * 86400, + "secure": True, "sameSite": "Lax"} + + +def _cookieyes_cookie(rng: random.Random, now: int, domain: str) -> dict: + return {"name": "cookieyes-consent", + "value": "consentid:" + _b64_rand(rng, 28) + + ",consent:yes,action:yes,necessary:yes,functional:yes,analytics:yes", + "domain": domain, "path": "/", + "expires": now + 395 * 86400, + "secure": True, "sameSite": "Lax"} + + +def _clarity_cookie(rng: random.Random, now: int, domain: str) -> dict: + return {"name": "_clck", + "value": f"{_hex_rand(rng, 8)}|2|f{rng.randint(10, 99)}|0|" + f"{now - rng.randint(60, 180) * 86400}", + "domain": domain, "path": "/", + "expires": now + 365 * 86400, + "secure": True, "sameSite": "Lax"} + + +def _fbp_cookie(rng: random.Random, now: int, domain: str) -> dict: + """Facebook Pixel 
_fbp = fb...""" + return {"name": "_fbp", + "value": f"fb.1.{(now - rng.randint(60, 30*86400)) * 1000}." + f"{rng.randint(100000000, 9999999999)}", + "domain": domain, "path": "/", + "expires": now + 90 * 86400, + "secure": True, "sameSite": "Lax"} + + +def _gtm_cookie(rng: random.Random, now: int, domain: str) -> dict: + """_dc_gtm_=1 — Google Tag Manager throttle flag.""" + container = f"UA-{rng.randint(10000000, 99999999)}-{rng.randint(1, 9)}" + return {"name": f"_dc_gtm_{container}", + "value": "1", + "domain": domain, "path": "/", + "expires": now + 60, + "secure": True, "sameSite": "Lax"} + + +def _hssrc_cookie(rng: random.Random, now: int, domain: str) -> dict: + """HubSpot referrer flag — small int.""" + return {"name": "__hssrc", + "value": str(rng.randint(1, 5)), + "domain": domain, "path": "/", + "expires": now + 1800, + "secure": True, "sameSite": "Lax"} + + +def _cookies_for_profile(profile: str, rng: random.Random, + now: int, domain: str) -> List[dict]: + """Map cookie_profile tag (from browsing_pool.json) → concrete cookies. + + Each recipe is a realistic combination observed on real production sites + in that category. Cookie age and sub-recipe variance (e.g., OneTrust vs + CookieYes for consent banner) are deterministic from rng. 
+ """ + domain = _norm_domain(domain) + if profile == "minimal": + return [_ga_cookie(rng, now, domain)] + if profile == "ga_only": + out = [_ga_cookie(rng, now, domain), _gid_cookie(rng, now, domain)] + # 30% chance of GTM helper paired with GA + if rng.random() < 0.3: + out.append(_gtm_cookie(rng, now, domain)) + return out + if profile == "ga_cf": + return [_ga_cookie(rng, now, domain), _cf_bm_cookie(rng, now, domain)] + if profile == "ga_consent": + out = [_ga_cookie(rng, now, domain), _gid_cookie(rng, now, domain)] + out.append(_onetrust_cookie(rng, now, domain) if rng.random() < 0.5 + else _cookieyes_cookie(rng, now, domain)) + if rng.random() < 0.4: + out.append(_gtm_cookie(rng, now, domain)) + return out + if profile == "ga_consent_clarity": + # Heavy-tracking site profile: GA + Clarity + consent + often FB pixel + out = [_ga_cookie(rng, now, domain), _gid_cookie(rng, now, domain), + _clarity_cookie(rng, now, domain)] + out.append(_onetrust_cookie(rng, now, domain) if rng.random() < 0.5 + else _cookieyes_cookie(rng, now, domain)) + if rng.random() < 0.5: + out.append(_fbp_cookie(rng, now, domain)) + if rng.random() < 0.4: + out.append(_gtm_cookie(rng, now, domain)) + if rng.random() < 0.25: + out.append(_hssrc_cookie(rng, now, domain)) + return out + # Unknown profile → safe fallback + return [_ga_cookie(rng, now, domain)] + + +# --------------------------------------------------------------------------- +# Public builder +# --------------------------------------------------------------------------- + +def build_cookies(seed: int, + browsing_history: Optional[List[dict]] = None, + now: Optional[int] = None, + timezone: Optional[str] = None) -> List[dict]: + """Build the full cookie list for a persona. + + Args: + seed: persona integer seed (from `Profile.seed`) + browsing_history: list of {name, category, cookie_profile} dicts as + sampled by `_fpforge.derive_browsing_history`. None → empty list + (only the 5 google cookies are returned). 
+ now: unix-seconds timestamp; defaults to current time. Pin for tests. + timezone: IANA tz used to derive CONSENT cookie's `lang+region` token + (e.g. "Europe/Rome" → "it+IT", "America/New_York" → "en+FX"). + """ + ts = now if now is not None else int(time.time()) + cookies: List[dict] = [] + + # 5 .google.com cookies (always) — CONSENT lang derived from tz + rng_g = random.Random(_sub_seed(int(seed), "google")) + cookies.extend(_google_cookies(rng_g, ts, timezone=timezone)) + + # Per-site cookies (deterministic from seed × domain) + for site in (browsing_history or []): + rng_d = random.Random(_sub_seed(int(seed), f"dom:{site['name']}")) + cookies.extend(_cookies_for_profile( + site.get("cookie_profile", "minimal"), rng_d, ts, site["name"] + )) + return cookies + + +def _extract_seed_and_history(profile: Any) -> tuple: + """Accept a Profile object OR a (seed, history) tuple OR just an int seed.""" + if isinstance(profile, int): + return int(profile), [] + seed = int(getattr(profile, "seed")) + history = list(getattr(profile, "browsing_history", []) or []) + return seed, history + + +async def seed_recaptcha_cookies_async(context: Any, profile: Any, + timezone: Optional[str] = None) -> None: + """Async: inject deterministic persona cookies into the context.""" + seed, history = _extract_seed_and_history(profile) + cookies = build_cookies(seed, history, timezone=timezone) + try: + await context.add_cookies(cookies) + except Exception: + pass + + +def seed_recaptcha_cookies_sync(context: Any, profile: Any, + timezone: Optional[str] = None) -> None: + """Sync: inject deterministic persona cookies into the context.""" + seed, history = _extract_seed_and_history(profile) + cookies = build_cookies(seed, history, timezone=timezone) + try: + context.add_cookies(cookies) + except Exception: + pass + + +__all__ = [ + "build_cookies", + "seed_recaptcha_cookies_async", + "seed_recaptcha_cookies_sync", +] diff --git a/src/invisible_playwright/async_api.py 
b/src/invisible_playwright/async_api.py index 2b2eeca..70a7aeb 100644 --- a/src/invisible_playwright/async_api.py +++ b/src/invisible_playwright/async_api.py @@ -9,6 +9,7 @@ from typing import Any, Dict, Optional, Union from playwright.async_api import Browser, BrowserContext, Playwright, async_playwright from ._fpforge import Profile, generate_profile +from ._geo import resolve_session_timezone from ._headless import make_virtual_display from ._proxy import configure_proxy as _configure_proxy_shared from .download import ensure_binary @@ -51,6 +52,7 @@ class InvisiblePlaywright: extra_prefs: Optional[Dict[str, Any]] = None, binary_path: Optional[str] = None, profile_dir: Optional[Union[str, Path]] = None, + prep_recaptcha: bool = False, ) -> None: # See sync launcher: `zoom.stealth.fpp.hw_seed` is int32_t — clamp. self.seed: int = int(seed) if seed is not None else secrets.randbits(31) @@ -64,6 +66,8 @@ class InvisiblePlaywright: self._extra_prefs = extra_prefs self._binary_path = binary_path self._profile_dir: Optional[Path] = Path(profile_dir) if profile_dir else None + # reCAPTCHA pre-seed gated server-side; respect persistent profile. + self._prep_recaptcha = bool(prep_recaptcha) and self._profile_dir is None self._profile: Profile = generate_profile(self.seed, pin=self._pin) self._pw: Optional[Playwright] = None self._browser: Optional[Browser] = None @@ -72,6 +76,13 @@ class InvisiblePlaywright: async def __aenter__(self) -> Union[Browser, BrowserContext]: import sys as _sys + # Resolve timezone="auto" (and the proxy-set-but-unset default) to a + # concrete IANA zone before anything reads self._timezone. Run the + # blocking geo lookup off the event loop. Fail-early if a proxy is set + # but the egress zone can't be resolved. 
+ self._timezone = await asyncio.to_thread( + resolve_session_timezone, self._timezone, self._proxy + ) executable = self._binary_path or ensure_binary() prefs = translate_profile_to_prefs( self._profile, @@ -124,12 +135,18 @@ class InvisiblePlaywright: def _patch_new_context_defaults(self, browser: Browser) -> None: original = browser.new_context defaults = self._default_context_kwargs() + prep = self._prep_recaptcha + profile = self._profile # pass the whole Profile (seed + browsing_history) + tz = self._timezone # used by _recaptcha_seed for CONSENT lang+region async def patched(**kw): merged = dict(defaults) merged.update(kw) ctx = await original(**merged) _patch_new_page_sleep(ctx) + if prep: + from ._recaptcha_seed import seed_recaptcha_cookies_async + await seed_recaptcha_cookies_async(ctx, profile, timezone=tz) return ctx browser.new_context = patched # type: ignore[assignment] diff --git a/src/invisible_playwright/cli.py b/src/invisible_playwright/cli.py index bb1c687..e6057cf 100644 --- a/src/invisible_playwright/cli.py +++ b/src/invisible_playwright/cli.py @@ -44,7 +44,13 @@ def _cmd_clear_cache(_args: argparse.Namespace) -> int: def build_parser() -> argparse.ArgumentParser: p = argparse.ArgumentParser(prog="invisible-playwright", description="invisible_playwright CLI") - sub = p.add_subparsers(dest="cmd", required=True) + # Top-level `--version` / `-V` flag so `python -m invisible_playwright --version` + # works (Python convention), in addition to the existing `version` subcommand. 
+ p.add_argument( + "-V", "--version", action="version", + version=f"invisible_playwright {__version__} (BINARY_VERSION={BINARY_VERSION}, Firefox {FIREFOX_UPSTREAM_VERSION})", + ) + sub = p.add_subparsers(dest="cmd") sub.add_parser("fetch", help="download the patched Firefox binary") sub.add_parser("path", help="print the absolute path to the cached binary") @@ -54,7 +60,15 @@ def build_parser() -> argparse.ArgumentParser: def main(argv: list[str] | None = None) -> int: - args = build_parser().parse_args(argv) + parser = build_parser() + args = parser.parse_args(argv) + if args.cmd is None: + # argparse-conventional: print usage + error message to stderr, exit 2. + # We can't keep `required=True` on the subparsers because that breaks + # the top-level `--version` flag (argparse demands a subcommand even + # when --version is the only token). parser.error() preserves the + # original "no subcommand" exit semantics tests expect. + parser.error("a subcommand is required (try --help, --version, or one of: fetch, path, version, clear-cache)") dispatch = { "fetch": _cmd_fetch, "path": _cmd_path, diff --git a/src/invisible_playwright/config.py b/src/invisible_playwright/config.py new file mode 100644 index 0000000..c411512 --- /dev/null +++ b/src/invisible_playwright/config.py @@ -0,0 +1,110 @@ +"""Public helpers for building Firefox launch config without using ``InvisiblePlaywright``. + +Use these when you need to call ``playwright.firefox.launch()`` (or +``firefox.launch_persistent_context()``) directly with our patched binary +and stealth prefs, instead of using the ``InvisiblePlaywright`` context +manager. 
+ +Typical caller is an external integration that owns its own browser +lifecycle (a Crawlee/Skyvern/changedetection-style fetcher, a Playwright +Server wrapper, a multi-language harness) and just wants the building +blocks:: + + from playwright.async_api import async_playwright + from invisible_playwright import ensure_binary, get_default_stealth_prefs + + async with async_playwright() as p: + browser = await p.firefox.launch( + executable_path=str(ensure_binary()), + firefox_user_prefs=get_default_stealth_prefs(seed=42), + ) + +For everyday Python usage the ``InvisiblePlaywright`` context manager is +still the recommended entry point; these helpers expose the same internals +without the lifecycle ownership. + +.. note:: + When calling ``firefox.launch()`` yourself, pass ``headless=False`` and + manage the display hiding (Xvfb on Linux, hidden desktop on Windows) + externally. Passing ``headless=True`` directly to Playwright puts + Firefox in true headless mode, which skips the real rendering pipeline + and breaks canvas / audio / WebGL fingerprint coherence. The + ``InvisiblePlaywright`` context manager does this translation + automatically; the public helpers leave it to the caller. +""" +from __future__ import annotations + +import secrets +from typing import Any, Dict, List, Optional, Union + +from ._fpforge import generate_profile +from .prefs import translate_profile_to_prefs + + +def get_default_stealth_prefs( + seed: Optional[int] = None, + *, + pin: Optional[Dict[str, Any]] = None, + locale: str = "en-US", + timezone: str = "", + extra_prefs: Optional[Dict[str, Any]] = None, + humanize: Union[bool, float] = True, + virtual_display: bool = False, +) -> Dict[str, Any]: + """Build a complete ``firefox_user_prefs`` dict for ``firefox.launch()``. + + Same prefs that ``InvisiblePlaywright(seed=..., locale=..., timezone=..., + extra_prefs=..., humanize=...)`` would inject. Use this when you need to + drive ``playwright.firefox.launch()`` yourself. 
+ + Args: + seed: Integer seed for the Bayesian fingerprint sampler. Same seed + produces the same fingerprint. ``None`` generates a fresh + random int31 (matches ``InvisiblePlaywright`` default). + pin: Optional dict forcing specific fingerprint fields while the + rest stays seed-derived. See ``docs/pinning.md``. + locale: BCP-47 tag (e.g. ``"en-US"``). Drives ``Accept-Language`` + and ``navigator.language``. + timezone: IANA timezone (e.g. ``"America/New_York"``). Empty means + use the host TZ. This pure pref builder does NOT resolve + ``"auto"`` (that needs the proxy + a network lookup at launch + time) — pass a concrete zone here, or use ``InvisiblePlaywright`` + / ``resolve_session_timezone(timezone, proxy)`` for ``"auto"``. + extra_prefs: Optional dict overlaid LAST onto the generated prefs. + humanize: When True (default), every mouse move is expanded into + a Bezier trajectory by the patched Juggler. A float caps the + motion in seconds. False disables the behavior. + virtual_display: When True on Windows, apply GPU-disabling prefs + to prevent GPU process crashes on virtual desktops without + D3D11 backend. + + Returns: + Dict ready to pass as ``firefox_user_prefs=`` to + ``playwright.firefox.launch()`` or ``launch_persistent_context()``. + """ + resolved_seed = int(seed) if seed is not None else secrets.randbits(31) + profile = generate_profile(resolved_seed, pin=pin) + prefs = translate_profile_to_prefs( + profile, + locale=locale, + timezone=timezone, + extra_prefs=extra_prefs, + virtual_display=virtual_display, + ) + prefs["invisible_playwright.humanize"] = bool(humanize) + if humanize: + max_seconds = float(humanize) if not isinstance(humanize, bool) else 1.5 + prefs["invisible_playwright.humanize.maxTime"] = str(max_seconds) + return prefs + + +def get_default_args() -> List[str]: + """Return the default Firefox CLI args to pass via ``args=``. 
+ + Currently empty list, since all our stealth configuration is delivered + via ``firefox_user_prefs`` rather than CLI flags. Exposed for parity + with the ``cloakbrowser.config.get_default_stealth_args`` pattern and + to future-proof integrations that already wire ``args=[*existing, + *get_default_args()]``. + """ + return [] diff --git a/src/invisible_playwright/constants.py b/src/invisible_playwright/constants.py index b13a458..295ebf5 100644 --- a/src/invisible_playwright/constants.py +++ b/src/invisible_playwright/constants.py @@ -7,7 +7,7 @@ bugfixes don't force a multi-hour Firefox rebuild. from __future__ import annotations # Bump this when a new patched Firefox build is released on GitHub. -BINARY_VERSION: str = "firefox-5" +BINARY_VERSION: str = "firefox-8" # Underlying Firefox version (for display only; does not drive downloads). FIREFOX_UPSTREAM_VERSION: str = "150.0.1" @@ -46,3 +46,21 @@ BINARY_ENTRY_REL = { RELEASE_URL_TEMPLATE = ( "https://github.com/feder-cr/invisible_playwright/releases/download/{tag}/{asset}" ) + +# ───────────────────────────────────────────────────────────────────────── +# GeoIP database (timezone="auto" → resolve IANA zone from proxy egress IP) +# ───────────────────────────────────────────────────────────────────────── +# daijro/geoip-all-in-one merges IP2Location LITE + GeoLite2 + DB-IP into a +# single mmdb (country ISO + coordinates + IANA timezone via tzfpy), rebuilt +# weekly. GPL-3.0, so we DOWNLOAD it at runtime into the user cache (like the +# Firefox binary) rather than bundling it into this MIT package. The `-all` +# variant covers IPv4+IPv6. download.py tracks the LATEST release and refreshes +# weekly; GEOIP_MMDB_VERSION is only the cold-cache fallback when the GitHub +# API is unreachable on a machine that has never downloaded the DB. 
+GEOIP_REPO: str = "daijro/geoip-all-in-one" +GEOIP_MMDB_VERSION: str = "2026.06.03" +GEOIP_ASSET: str = "geoip-aio-all.mmdb.zip" +GEOIP_MMDB_NAME: str = "geoip-aio-all.mmdb" +GEOIP_RELEASE_URL_TEMPLATE: str = ( + "https://github.com/daijro/geoip-all-in-one/releases/download/{tag}/{asset}" +) diff --git a/src/invisible_playwright/download.py b/src/invisible_playwright/download.py index 58a5e8f..7417e39 100644 --- a/src/invisible_playwright/download.py +++ b/src/invisible_playwright/download.py @@ -5,9 +5,11 @@ import hashlib import os import platform import re +import shutil import sys import tarfile import tempfile +import time import zipfile from pathlib import Path @@ -18,6 +20,10 @@ from .constants import ( ARCHIVE_NAME, BINARY_ENTRY_REL, BINARY_VERSION, + GEOIP_ASSET, + GEOIP_MMDB_NAME, + GEOIP_MMDB_VERSION, + GEOIP_RELEASE_URL_TEMPLATE, RELEASE_URL_TEMPLATE, ) @@ -151,3 +157,136 @@ def ensure_binary(version: str = BINARY_VERSION) -> Path: if not entry.exists(): raise RuntimeError(f"binary not found after extraction: {entry}") return entry + + +# ───────────────────────────────────────────────────────────────────────── +# GeoIP mmdb (timezone="auto" → map egress IP → IANA zone) +# +# daijro/geoip-all-in-one is rebuilt WEEKLY, so we don't pin a tag. We cache +# the latest mmdb and, once it's older than GEOIP_REFRESH_DAYS, re-check the +# latest release and pull a newer build if one exists. Net effect: no download +# (not even an API call) on a launch within the window; auto-refresh after it; +# a stale cache is reused when offline rather than breaking the launch. +# ───────────────────────────────────────────────────────────────────────── +GEOIP_REFRESH_DAYS = 7 # matches daijro's weekly rebuild cadence + + +def _geoip_root() -> Path: + return cache_root() / "geoip" + + +def _geoip_check_marker() -> Path: + return _geoip_root() / ".last_check" + + +def _cached_geoip_mmdb() -> Path | None: + """Newest cached mmdb across tag dirs, or None. 
Tag dirs are date strings + (e.g. ``2026.06.03``) so a lexical sort is chronological.""" + root = _geoip_root() + if not root.exists(): + return None + cands = sorted(root.glob("*/*.mmdb")) + return cands[-1] if cands else None + + +def _geoip_cache_fresh(max_age_days: int) -> bool: + marker = _geoip_check_marker() + if not marker.exists(): + return False + return (time.time() - marker.stat().st_mtime) < max_age_days * 86400 + + +def _touch_geoip_marker() -> None: + m = _geoip_check_marker() + m.parent.mkdir(parents=True, exist_ok=True) + m.touch() + + +def _latest_geoip_tag() -> str: + """Latest ``daijro/geoip-all-in-one`` release tag via the GitHub API.""" + headers = {"Accept": "application/vnd.github+json"} + token = _github_token() + if token: + headers["Authorization"] = f"token {token}" + r = requests.get( + f"https://api.github.com/repos/{GEOIP_REPO}/releases/latest", + headers=headers, timeout=15, + ) + r.raise_for_status() + tag = r.json().get("tag_name") + if not tag: + raise RuntimeError("no tag_name in geoip-all-in-one latest release") + return tag + + +def _download_geoip_tag(tag: str) -> Path: + """Download + extract a specific tag's mmdb if not already cached.""" + dst_dir = _geoip_root() / tag + target = dst_dir / GEOIP_MMDB_NAME + if not target.exists(): + url = GEOIP_RELEASE_URL_TEMPLATE.format(tag=tag, asset=GEOIP_ASSET) + dst_dir.mkdir(parents=True, exist_ok=True) + with tempfile.TemporaryDirectory() as td: + archive = Path(td) / GEOIP_ASSET + _download_file(url, archive) + _extract(archive, dst_dir) + if target.exists(): + return target + # asset name inside the zip may differ from GEOIP_MMDB_NAME + found = sorted(dst_dir.glob("*.mmdb")) + if found: + return found[0] + raise RuntimeError(f"geoip mmdb not found after extraction in {dst_dir}") + + +def _prune_old_geoip_tags(keep: str) -> None: + """Drop every cached tag dir except ``keep`` to bound disk usage.""" + root = _geoip_root() + if not root.exists(): + return + for d in root.iterdir(): 
+ if d.is_dir() and d.name != keep: + shutil.rmtree(d, ignore_errors=True) + + +def geoip_mmdb_path() -> Path | None: + """Path to the currently-cached mmdb (newest tag), or None if none cached.""" + return _cached_geoip_mmdb() + + +def ensure_geoip_mmdb(max_age_days: int = GEOIP_REFRESH_DAYS) -> Path: + """Return a geoip mmdb, kept fresh against daijro's weekly rebuild. + + Resolution order: + 1. ``STEALTHFOX_GEOIP_MMDB`` env → use that file (user-supplied / test). + 2. A cached mmdb younger than ``max_age_days`` → use it (no network). + 3. Else ask GitHub for the latest tag, download it if not already cached, + prune older tags, and reset the freshness timer. + 4. If the API/download is unreachable but a cached mmdb exists → use it + (and reset the timer so we don't hammer the API while offline). + 5. Cold cache + no network → fall back to the pinned ``GEOIP_MMDB_VERSION``; + if that download also fails, raise. + """ + override = os.environ.get("STEALTHFOX_GEOIP_MMDB") + if override: + p = Path(override) + if not p.exists(): + raise RuntimeError(f"STEALTHFOX_GEOIP_MMDB points to a missing file: {p}") + return p + + cached = _cached_geoip_mmdb() + if cached and _geoip_cache_fresh(max_age_days): + return cached + + try: + tag = _latest_geoip_tag() + except Exception: + if cached: + _touch_geoip_marker() # recheck after the window; don't hammer + return cached + tag = GEOIP_MMDB_VERSION # cold cache + API down → pinned fallback + + mmdb = _download_geoip_tag(tag) + _prune_old_geoip_tags(mmdb.parent.name) + _touch_geoip_marker() + return mmdb diff --git a/src/invisible_playwright/launcher.py b/src/invisible_playwright/launcher.py index 07c7967..15055ee 100644 --- a/src/invisible_playwright/launcher.py +++ b/src/invisible_playwright/launcher.py @@ -8,6 +8,7 @@ from typing import Any, Dict, Optional, Union from playwright.sync_api import Browser, BrowserContext, Playwright, sync_playwright from ._fpforge import Profile, generate_profile +from ._geo import 
resolve_session_timezone from ._headless import make_virtual_display from ._proxy import configure_proxy as _configure_proxy_shared from .download import ensure_binary @@ -113,6 +114,7 @@ class InvisiblePlaywright: extra_prefs: Optional[Dict[str, Any]] = None, binary_path: Optional[str] = None, profile_dir: Optional[Union[str, Path]] = None, + prep_recaptcha: bool = False, ) -> None: """ Args: @@ -134,8 +136,14 @@ class InvisiblePlaywright: a float caps the motion in seconds. locale: BCP-47 tag (e.g. ``"en-US"``). Drives the ``Accept-Language`` header and ``navigator.language``. - timezone: IANA timezone (e.g. ``"America/New_York"``). Empty - means use the host TZ. + timezone: IANA zone (e.g. ``"America/New_York"``) — used as-is + when set, the only way to force a specific zone. ``""`` + (default) or ``"auto"`` ALWAYS resolves from the egress IP: + through the proxy when one is set, otherwise from the host's + own public IP (one lookup + an offline mmdb). On failure: with + a proxy it raises (a foreign proxy on the host TZ is the + ``timezone_mismatch`` signal); without a proxy it falls back to + the host TZ so a transient lookup failure can't break launch. extra_prefs: Optional dict of Firefox prefs overlayed on top of the generated profile — useful for niche tweaks without monkey-patching the package. @@ -166,6 +174,10 @@ class InvisiblePlaywright: self._extra_prefs = extra_prefs self._binary_path = binary_path self._profile_dir: Optional[Path] = Path(profile_dir) if profile_dir else None + # reCAPTCHA cookie pre-seed — opt-in. Gated server-side: if a + # persistent profile_dir is in use, respect its existing cookies + # and DON'T enable pre-seed (the profile owns its own state). 
+ self._prep_recaptcha = bool(prep_recaptcha) and self._profile_dir is None self._profile: Profile = generate_profile(self.seed, pin=self._pin) self._pw: Optional[Playwright] = None self._browser: Optional[Browser] = None @@ -173,6 +185,10 @@ class InvisiblePlaywright: self._virtual_display: Any = None def __enter__(self) -> Union[Browser, BrowserContext]: + # Resolve timezone="auto" (and the proxy-set-but-unset default) to a + # concrete IANA zone before anything reads self._timezone. Fail-early + # if a proxy is set but the egress zone can't be resolved. + self._timezone = resolve_session_timezone(self._timezone, self._proxy) executable = self._binary_path or ensure_binary() prefs = self._build_prefs() playwright_proxy = _configure_proxy_shared(self._proxy, prefs) @@ -240,12 +256,18 @@ class InvisiblePlaywright: """ original = browser.new_context defaults = self._default_context_kwargs() + prep = self._prep_recaptcha + profile = self._profile # pass the whole Profile (seed + browsing_history) + tz = self._timezone # used by _recaptcha_seed for CONSENT lang+region def patched(**kw): merged = dict(defaults) merged.update(kw) # user-supplied wins ctx = original(**merged) _patch_sync_new_page_sleep(ctx) + if prep: + from ._recaptcha_seed import seed_recaptcha_cookies_sync + seed_recaptcha_cookies_sync(ctx, profile, timezone=tz) return ctx browser.new_context = patched # type: ignore[assignment] diff --git a/src/invisible_playwright/prefs.py b/src/invisible_playwright/prefs.py index 496fd04..4f0a15d 100644 --- a/src/invisible_playwright/prefs.py +++ b/src/invisible_playwright/prefs.py @@ -289,13 +289,29 @@ _BASELINE: Dict[str, Any] = { "network.dns.echconfig.enabled": False, "network.dns.use_https_rr_as_altsvc": False, - # === A/B VARIANT B: Fission disabled === - # Force single content-process model (e10s only, no BC outer/inner split). 
- # Diagnostic for the FF150 BC-swap theory: if peet_ws/fppro/sannysoft - # work with this off, the Juggler FF146 baseline breaks specifically on - # cross-process navigation tracking. + # === Fission / site-isolation disabled (FF146 Playwright parity) === + # Force a single content-process model. Three knobs are required in FF150: + # upstream Playwright Firefox (FF146-based) only needed fission.autostart=False + # because FF146's default isolation strategy was looser. FF150 ships with + # fission.webContentIsolationStrategy=1 (IsolateEverything) which still + # site-isolates cross-origin iframes into separate `webIsolated` content + # processes EVEN WHEN fission.autostart is False. From the parent process's + # point of view, those iframes get a Juggler Frame placeholder with no + # docShell, no URL, and an execution context that wraps the wrong global, + # so frame.evaluate() fails with cross-origin SOP errors and + # element_handle.content_frame() returns None. + # + # Pinning the strategy to 0 keeps every cross-origin web iframe in the + # parent's content process, where the Juggler code paths from the FF146 + # era expect them. processCount.webIsolated=1 is kept as belt-and-suspenders + # in case some path still classifies an origin as webIsolated despite the + # strategy change. It costs nothing to leave. + # + # See issue #20 + tests/test_cross_origin_iframe.py for the regression + # sentinel that catches a future A/B flipping these back. "fission.autostart": False, "fission.autostart.session": False, + "fission.webContentIsolationStrategy": 0, # IsolateNothing "dom.ipc.processCount.webIsolated": 1, @@ -385,19 +401,19 @@ _WIN_VIRT_DESKTOP_WORKAROUNDS: Dict[str, Any] = { # restores hardware compositor + functional WebGL on alt desktops. "security.sandbox.gpu.level": 0, # Same root cause as above, content process side. Wrapper repo issue #18 - # (id.sky.com tab crash). 
Sandbox content level > 4 puts content processes - # on the sandbox's own kAlternateWinstation (see - # security/sandbox/win/src/sandboxbroker/sandboxBroker.cpp line 1113-1114: + # (tab crash on cross-process navigation under headless=True). Sandbox + # content level > 4 puts content processes on the sandbox's own + # kAlternateWinstation (see security/sandbox/win/src/sandboxbroker/ + # sandboxBroker.cpp line 1113-1114: # `if (aSandboxLevel > 4) config->SetDesktop(kAlternateWinstation)`). # Combined with our CreateDesktop alt-desktop, that puts browser process # and content processes on DIFFERENT desktops. Cross-process navigation - # (Adobe AppMeasurement → new origin → new content process on a new - # desktop) then fails window parenting between parent and child → content + # then fails window parenting between parent and child, the content # process exits cleanly (exitCode=0, signal=null) and Playwright fires # page.on('crash') ~10s after page load. Lowering content sandbox to 4 # keeps content processes on the same desktop as the browser process, - # which is what we want here (and is still tight enough — level 4 - # blocks file/registry write, network calls, hardware access). + # which is what we want here (still tight enough — level 4 blocks + # file/registry write, network calls, hardware access). "security.sandbox.content.level": 4, } diff --git a/tests/test_cross_origin_iframe.py b/tests/test_cross_origin_iframe.py new file mode 100644 index 0000000..8be39ac --- /dev/null +++ b/tests/test_cross_origin_iframe.py @@ -0,0 +1,295 @@ +"""Regression tests for cross-origin / cross-process iframe interaction. + +History: wrapper repo issue #20 reported that a third-party cookie +consent iframe was completely unreachable from Playwright in 0.1.7 — +``element_handle.content_frame()`` returned ``None``, ``frame.evaluate()`` +threw cross-origin SOP errors, and ``frame_locator().click()`` timed +out. + +Root cause was a missing pref. 
FF150 ships with +``fission.webContentIsolationStrategy=1`` (IsolateEverything), which +site-isolates cross-origin iframes into separate webIsolated content +processes even when ``fission.autostart=False``. The Juggler code paths +inherited from the FF146 era assume same-process iframes. The wrapper's +``_BASELINE`` now pins the pref to 0 (IsolateNothing). + +These tests exist so a future Firefox upgrade or a fingerprint A/B +that flips this pref by accident cannot ship without a red CI signal. + +Layers: + * ``unit`` — ``_BASELINE`` contains the pref with the right value. No browser. + * ``e2e`` — launch the real binary against a LOCAL HTTP harness on + ``127.0.0.1`` (two ports = two SOP origins) and verify the + four protocol operations that regressed: frame URL tracking, + ``handle.content_frame()``, ``frame.evaluate()``, and + ``frame_locator(...).locator(...)`` element resolution. + +The e2e tests run entirely offline. They never call out to a real site; +the cross-origin shape is reproduced with two local HTTP servers on +random free ports. +""" +from __future__ import annotations + +import socket +import sys +import threading +from http.server import BaseHTTPRequestHandler, HTTPServer + +import pytest + +from invisible_playwright._fpforge import generate_profile +from invisible_playwright.prefs import _BASELINE, translate_profile_to_prefs + + +# ──────────────────────────────────────────────────────────────────── +# Unit layer — fast, no browser, runs on every CI +# ──────────────────────────────────────────────────────────────────── + + +@pytest.mark.unit +def test_baseline_pins_web_content_isolation_strategy_to_zero(): + """Regression sentinel. + + ``fission.webContentIsolationStrategy`` MUST be 0 (IsolateNothing). + The FF150 default is 1 (IsolateEverything), which site-isolates + cross-origin iframes into separate webIsolated content processes + and breaks Playwright frame tracking from the parent process. 
+ """ + assert _BASELINE["fission.webContentIsolationStrategy"] == 0, ( + "fission.webContentIsolationStrategy must be 0 (IsolateNothing). " + "If you bumped it for an A/B, cross-origin iframes will appear " + "in page.frames with empty URLs and content_frame() will return " + "None — see the changelog entry that introduced this test." + ) + + +@pytest.mark.unit +def test_baseline_keeps_fission_autostart_off(): + """Belt for the suspenders above. All three prefs are required.""" + assert _BASELINE["fission.autostart"] is False + assert _BASELINE["fission.autostart.session"] is False + assert _BASELINE["dom.ipc.processCount.webIsolated"] == 1 + + +@pytest.mark.unit +def test_translated_profile_propagates_isolation_strategy(): + """The fix must survive translate_profile_to_prefs, not just live in _BASELINE.""" + p = generate_profile(seed=42) + prefs = translate_profile_to_prefs(p) + assert prefs["fission.webContentIsolationStrategy"] == 0 + + +@pytest.mark.unit +def test_extra_prefs_override_can_break_isolation_only_explicitly(): + """If a caller wants to A/B isolation, they have to set it explicitly. + The wrapper does not silently flip it back on. 
+ """ + p = generate_profile(seed=42) + prefs_default = translate_profile_to_prefs(p) + assert prefs_default["fission.webContentIsolationStrategy"] == 0 + + prefs_ab = translate_profile_to_prefs( + p, extra_prefs={"fission.webContentIsolationStrategy": 1} + ) + assert prefs_ab["fission.webContentIsolationStrategy"] == 1 + + +# ──────────────────────────────────────────────────────────────────── +# E2E layer — needs cached binary + bind to localhost ports +# ──────────────────────────────────────────────────────────────────── + + +def _free_port() -> int: + s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + s.bind(("127.0.0.1", 0)) + port = s.getsockname()[1] + s.close() + return port + + +class _SilentHandler(BaseHTTPRequestHandler): + """Suppress per-request access logging so pytest output stays clean.""" + PAYLOAD = b"" # set per-instance via subclassing + + def log_message(self, *_a): + pass + + def do_GET(self): + self.send_response(200) + self.send_header("Content-Type", "text/html; charset=utf-8") + self.send_header("Cache-Control", "no-store") + self.end_headers() + self.wfile.write(self.PAYLOAD) + + +def _serve(payload: bytes, port: int) -> HTTPServer: + """Start an HTTP server on 127.0.0.1:port serving ``payload`` on every GET.""" + handler_cls = type( + "_H", (_SilentHandler,), {"PAYLOAD": payload} + ) + srv = HTTPServer(("127.0.0.1", port), handler_cls) + t = threading.Thread(target=srv.serve_forever, daemon=True) + t.start() + return srv + + +@pytest.fixture +def cross_origin_harness(): + """Spin up TWO local HTTP servers on different localhost ports. + + Two ports = two distinct origins under SOP (same host, different port + → different origin). The parent page on port A embeds an iframe with + src pointing at port B. Same cross-origin browsing-context shape as + a parent-page-plus-third-party-iframe layout, fully offline. + """ + pa, pb = _free_port(), _free_port() + parent_html = f"""parent +

parent

+ + + +""".encode("utf-8") + child_html = b""" + + + +""" + sa = _serve(parent_html, pa) + sb = _serve(child_html, pb) + try: + yield {"parent_url": f"http://127.0.0.1:{pa}/", "child_origin": f"http://127.0.0.1:{pb}"} + finally: + sa.shutdown() + sb.shutdown() + + +@pytest.fixture(scope="session") +def firefox_binary(): + """Locate the cached patched Firefox binary or skip.""" + from invisible_playwright.constants import BINARY_ENTRY_REL + if sys.platform not in BINARY_ENTRY_REL: + pytest.skip(f"unsupported platform: {sys.platform}") + from invisible_playwright.download import cache_dir_for_version + entry = cache_dir_for_version() / BINARY_ENTRY_REL[sys.platform] + if not entry.exists(): + pytest.skip( + "patched Firefox binary not cached; run `invisible-playwright fetch` " + "to enable E2E tests" + ) + return str(entry) + + +@pytest.mark.e2e +def test_cross_origin_iframe_url_appears_in_page_frames(firefox_binary, cross_origin_harness): + """``page.frames`` must list the cross-origin iframe with its real URL. + + Before the pref fix, the URL came back as '' because the navigation + observer for the iframe fired in a different content process than + the parent's FrameTree was registered in. 
+ """ + from invisible_playwright import InvisiblePlaywright + + with InvisiblePlaywright(seed=42, binary_path=firefox_binary, humanize=False) as browser: + ctx = browser.new_context() + page = ctx.new_page() + page.goto(cross_origin_harness["parent_url"], wait_until="domcontentloaded", timeout=30_000) + page.wait_for_selector("iframe#ifr_plain", timeout=10_000) + page.wait_for_timeout(500) + + urls = [f.url for f in page.frames] + assert any(cross_origin_harness["child_origin"] in (u or "") for u in urls), ( + f"no frame had the child origin in its URL; page.frames urls = {urls!r}" + ) + + +@pytest.mark.e2e +def test_cross_origin_iframe_content_frame_resolves(firefox_binary, cross_origin_harness): + """``handle.content_frame()`` must return a Frame (not None) for every + cross-origin iframe shape we care about: plain, sandboxed, titled. + """ + from invisible_playwright import InvisiblePlaywright + + with InvisiblePlaywright(seed=42, binary_path=firefox_binary, humanize=False) as browser: + ctx = browser.new_context() + page = ctx.new_page() + page.goto(cross_origin_harness["parent_url"], wait_until="domcontentloaded", timeout=30_000) + page.wait_for_selector("iframe#ifr_plain", timeout=10_000) + page.wait_for_timeout(500) + + for sel in ("iframe#ifr_plain", "iframe#ifr_sandbox", "iframe#ifr_titled"): + handle = page.query_selector(sel) + assert handle is not None, f"{sel!r} not found in DOM" + cf = handle.content_frame() + assert cf is not None, f"{sel!r}: content_frame() returned None" + assert cross_origin_harness["child_origin"] in (cf.url or ""), ( + f"{sel!r}: content_frame().url = {cf.url!r}, " + f"expected child origin {cross_origin_harness['child_origin']!r}" + ) + + +@pytest.mark.e2e +def test_cross_origin_iframe_evaluate_returns_real_values(firefox_binary, cross_origin_harness): + """``frame.evaluate()`` inside the cross-origin iframe must work. 
+ + Pre-fix: every evaluate failed with a cross-origin SOP error because + the iframe ended up with a stale/wrong execution context. + """ + from invisible_playwright import InvisiblePlaywright + + with InvisiblePlaywright(seed=42, binary_path=firefox_binary, humanize=False) as browser: + ctx = browser.new_context() + page = ctx.new_page() + page.goto(cross_origin_harness["parent_url"], wait_until="domcontentloaded", timeout=30_000) + page.wait_for_selector("iframe#ifr_plain", timeout=10_000) + page.wait_for_timeout(500) + + cf = page.query_selector("iframe#ifr_plain").content_frame() + assert cf is not None + href = cf.evaluate("() => location.href") + assert cross_origin_harness["child_origin"] in href + title = cf.evaluate("() => document.title") + assert isinstance(title, str) + n_buttons = cf.evaluate("() => document.querySelectorAll('button').length") + assert n_buttons == 2 + + +@pytest.mark.e2e +def test_cross_origin_iframe_frame_locator_resolves_button(firefox_binary, cross_origin_harness): + """``frame_locator(...).locator(...)`` must reach the button inside the iframe.""" + from invisible_playwright import InvisiblePlaywright + + with InvisiblePlaywright(seed=42, binary_path=firefox_binary, humanize=False) as browser: + ctx = browser.new_context() + page = ctx.new_page() + page.goto(cross_origin_harness["parent_url"], wait_until="domcontentloaded", timeout=30_000) + page.wait_for_selector("iframe#ifr_plain", timeout=10_000) + + for selector in ("button#ok", "button.btn-primary"): + cnt = page.frame_locator("iframe#ifr_plain").locator(selector).count() + assert cnt == 1, f"locator({selector!r}) found {cnt} elements (expected 1)" + + +@pytest.mark.e2e +def test_cross_origin_iframe_dispatch_event_click_works(firefox_binary, cross_origin_harness): + """End-to-end interaction via ``dispatch_event`` must succeed. 
+ + Plain ``.click()`` can trip Playwright's actionability heuristic on + some third-party UIs (same on vanilla Playwright Firefox — not our + regression), but ``dispatch_event('click')`` always works once the + iframe is reachable. + """ + from invisible_playwright import InvisiblePlaywright + + with InvisiblePlaywright(seed=42, binary_path=firefox_binary, humanize=False) as browser: + ctx = browser.new_context() + page = ctx.new_page() + page.goto(cross_origin_harness["parent_url"], wait_until="domcontentloaded", timeout=30_000) + page.wait_for_selector("iframe#ifr_plain", timeout=10_000) + + page.frame_locator("iframe#ifr_plain").locator("button#ok").dispatch_event( + "click", timeout=4_000 + ) + cf = page.query_selector("iframe#ifr_plain").content_frame() + assert cf.evaluate("() => document.title") == "clicked" diff --git a/tests/test_fingerprint_consistency.py b/tests/test_fingerprint_consistency.py index aa0f96b..0a53d27 100644 --- a/tests/test_fingerprint_consistency.py +++ b/tests/test_fingerprint_consistency.py @@ -306,17 +306,6 @@ def test_navigator_oscpu_matches_userAgent(page): assert "Mac" in oscpu -@pytest.mark.e2e -def test_userAgent_contains_appVersion_chromium_only(page): - """Chromium invariant: UA contains appVersion. Firefox uses a short - appVersion form so the check is gated on `'chrome' in window`.""" - if not _ev(page, "'chrome' in window"): - pytest.skip("Chromium-only invariant") - ua = _ev(page, "navigator.userAgent") - av = _ev(page, "navigator.appVersion") - assert av in ua - - # =========================================================================== # 5. Native function self-toString (creepjs/src/lies/index.ts hasKnownToString) # =========================================================================== diff --git a/tests/test_geo.py b/tests/test_geo.py new file mode 100644 index 0000000..39ef5ee --- /dev/null +++ b/tests/test_geo.py @@ -0,0 +1,288 @@ +"""Unit tests for `invisible_playwright._geo` (timezone="auto" resolution). 
+ +Covers: the precedence policy (resolve_session_timezone), proxy→requests +translation, egress IP discovery (mocked HTTP), and IP→IANA mapping (mocked +mmdb). No real network or mmdb is touched. +""" +import sys +import types + +import pytest + +from invisible_playwright import _geo +from invisible_playwright._geo import ( + GeoTimezoneError, + _proxies_for_requests, + _proxy_is_set, + discover_egress_ip, + ip_to_timezone, + resolve_session_timezone, +) + +SOCKS = {"server": "socks5://gw.example:1080", "username": "u", "password": "p"} +HTTP = {"server": "http://gw.example:8080", "username": "u", "password": "p"} + + +# ────────────────────────────────────────────────────────────────────── +# _proxy_is_set +# ────────────────────────────────────────────────────────────────────── +@pytest.mark.unit +@pytest.mark.parametrize( + "proxy,expected", + [ + (None, False), + ({}, False), + ({"server": ""}, False), + ({"server": " "}, False), + ({"server": "direct://"}, False), + ({"server": "DIRECT://"}, False), + ({"server": "socks5://h:1"}, True), + ({"server": "http://h:8080"}, True), + ], +) +def test_proxy_is_set(proxy, expected): + assert _proxy_is_set(proxy) is expected + + +# ────────────────────────────────────────────────────────────────────── +# _proxies_for_requests — scheme + credential translation +# ────────────────────────────────────────────────────────────────────── +@pytest.mark.unit +def test_proxies_socks5_uses_socks5h_remote_dns(): + out = _proxies_for_requests(SOCKS) + assert out["http"] == "socks5h://u:p@gw.example:1080" + assert out["https"] == out["http"] + + +@pytest.mark.unit +def test_proxies_socks4_scheme(): + out = _proxies_for_requests({"server": "socks4://gw:1080"}) + assert out["http"] == "socks4://gw:1080" + + +@pytest.mark.unit +def test_proxies_http_and_https_schemes(): + assert _proxies_for_requests(HTTP)["http"] == "http://u:p@gw.example:8080" + out = _proxies_for_requests({"server": "https://gw:8443"}) + assert out["https"] == 
"https://gw:8443" + + +@pytest.mark.unit +def test_proxies_no_scheme_defaults_to_http(): + out = _proxies_for_requests({"server": "gw.example:3128"}) + assert out["http"] == "http://gw.example:3128" + + +@pytest.mark.unit +def test_proxies_credentials_are_url_encoded(): + out = _proxies_for_requests( + {"server": "socks5://gw:1080", "username": "user@x", "password": "p:w/d"} + ) + # '@', ':' and '/' in creds must be percent-encoded so they don't break + # the proxy URL parsing. + assert "user%40x:p%3Aw%2Fd@gw:1080" in out["http"] + + +@pytest.mark.unit +def test_proxies_no_credentials_has_no_auth_prefix(): + out = _proxies_for_requests({"server": "socks5://gw:1080"}) + assert out["http"] == "socks5h://gw:1080" + + +# ────────────────────────────────────────────────────────────────────── +# discover_egress_ip — mocked requests +# ────────────────────────────────────────────────────────────────────── +class _FakeResp: + def __init__(self, text, status=200): + self.text = text + self._status = status + + def raise_for_status(self): + if self._status >= 400: + raise RuntimeError(f"HTTP {self._status}") + + +@pytest.mark.unit +def test_discover_egress_ip_first_endpoint_wins(monkeypatch): + calls = [] + + def fake_get(url, **kw): + calls.append(url) + return _FakeResp("203.0.113.7\n") + + monkeypatch.setattr(_geo.requests, "get", fake_get) + assert discover_egress_ip(SOCKS) == "203.0.113.7" + assert len(calls) == 1 # stopped at the first success + + +@pytest.mark.unit +def test_discover_egress_ip_falls_through_to_next_on_error(monkeypatch): + seq = iter([_FakeResp("junk-not-an-ip"), _FakeResp("198.51.100.42")]) + + def fake_get(url, **kw): + return next(seq) + + monkeypatch.setattr(_geo.requests, "get", fake_get) + assert discover_egress_ip(HTTP) == "198.51.100.42" + + +@pytest.mark.unit +def test_discover_egress_ip_all_fail_raises(monkeypatch): + def fake_get(url, **kw): + raise OSError("connection refused") + + monkeypatch.setattr(_geo.requests, "get", fake_get) + with 
pytest.raises(GeoTimezoneError): + discover_egress_ip(SOCKS) + + +@pytest.mark.unit +def test_discover_egress_ip_no_proxy_is_direct(monkeypatch): + # proxy=None → direct request, requests.get must get proxies=None. + seen = {} + + def fake_get(url, **kw): + seen["proxies"] = kw.get("proxies", "MISSING") + return _FakeResp("192.0.2.55") + + monkeypatch.setattr(_geo.requests, "get", fake_get) + assert discover_egress_ip(None) == "192.0.2.55" + assert seen["proxies"] is None + + +# ────────────────────────────────────────────────────────────────────── +# ip_to_timezone — mocked mmdb reader +# ────────────────────────────────────────────────────────────────────── +class _FakeReader: + def __init__(self, record): + self._record = record + + def __enter__(self): + return self + + def __exit__(self, *a): + return False + + def get(self, ip): + return self._record + + +def _install_fake_maxminddb(monkeypatch, record): + mod = types.ModuleType("maxminddb") + mod.open_database = lambda path: _FakeReader(record) + monkeypatch.setitem(sys.modules, "maxminddb", mod) + + +@pytest.mark.unit +def test_ip_to_timezone_reads_location_time_zone(monkeypatch): + _install_fake_maxminddb(monkeypatch, {"location": {"time_zone": "Europe/Rome"}}) + assert ip_to_timezone("1.2.3.4", "x.mmdb") == "Europe/Rome" + + +@pytest.mark.unit +def test_ip_to_timezone_ip_absent_raises(monkeypatch): + _install_fake_maxminddb(monkeypatch, None) + with pytest.raises(GeoTimezoneError): + ip_to_timezone("1.2.3.4", "x.mmdb") + + +@pytest.mark.unit +def test_ip_to_timezone_missing_zone_raises(monkeypatch): + _install_fake_maxminddb(monkeypatch, {"location": {}}) + with pytest.raises(GeoTimezoneError): + ip_to_timezone("1.2.3.4", "x.mmdb") + + +@pytest.mark.unit +def test_ip_to_timezone_invalid_iana_raises(monkeypatch): + _install_fake_maxminddb(monkeypatch, {"location": {"time_zone": "Not/AZone"}}) + with pytest.raises(GeoTimezoneError): + ip_to_timezone("1.2.3.4", "x.mmdb") + + +# 
────────────────────────────────────────────────────────────────────── +# resolve_session_timezone — the precedence policy +# ────────────────────────────────────────────────────────────────────── +@pytest.fixture +def stub_egress(monkeypatch): + """Make egress resolution deterministic + offline; record if it ran.""" + state = {"called": False} + + def fake_discover(proxy=None, **kw): + state["called"] = True + state["proxy_arg"] = proxy + return "203.0.113.7" + + monkeypatch.setattr(_geo, "discover_egress_ip", fake_discover) + monkeypatch.setattr(_geo, "ip_to_timezone", lambda ip, mmdb: "America/New_York") + # ensure_geoip_mmdb is imported from .download at call time + import invisible_playwright.download as dl + + monkeypatch.setattr(dl, "ensure_geoip_mmdb", lambda *a, **k: "fake.mmdb") + return state + + +@pytest.mark.unit +def test_resolve_explicit_iana_wins(stub_egress): + # An explicit zone wins and never triggers resolution (proxy or not). + assert resolve_session_timezone("Asia/Tokyo", SOCKS) == "Asia/Tokyo" + assert resolve_session_timezone("Asia/Tokyo", None) == "Asia/Tokyo" + assert stub_egress["called"] is False + + +@pytest.mark.unit +def test_resolve_empty_with_proxy_resolves_from_proxy(stub_egress): + assert resolve_session_timezone("", SOCKS) == "America/New_York" + assert stub_egress["called"] is True + assert stub_egress["proxy_arg"] == SOCKS # routed through the proxy + + +@pytest.mark.unit +def test_resolve_auto_with_proxy_resolves_from_proxy(stub_egress): + assert resolve_session_timezone("auto", HTTP) == "America/New_York" + assert stub_egress["proxy_arg"] == HTTP + + +@pytest.mark.unit +def test_resolve_empty_no_proxy_resolves_from_host(stub_egress): + # auto ALWAYS resolves — without a proxy, from the host's own public IP. 
+ assert resolve_session_timezone("", None) == "America/New_York" + assert stub_egress["called"] is True + assert stub_egress["proxy_arg"] is None # direct request, no proxy + + +@pytest.mark.unit +def test_resolve_auto_no_proxy_resolves_from_host(stub_egress): + assert resolve_session_timezone("auto", None) == "America/New_York" + assert stub_egress["proxy_arg"] is None + + +@pytest.mark.unit +def test_resolve_direct_proxy_resolves_via_host(stub_egress): + # direct:// counts as "no proxy" → resolve from the host IP, don't skip. + assert resolve_session_timezone("auto", {"server": "direct://"}) == "America/New_York" + assert stub_egress["proxy_arg"] is None + + +@pytest.mark.unit +def test_resolve_no_proxy_failure_falls_back_to_host(monkeypatch): + # Without a proxy, a lookup failure must NOT break the launch → host TZ (""). + def boom(proxy=None, **kw): + raise GeoTimezoneError("offline") + + monkeypatch.setattr(_geo, "discover_egress_ip", boom) + assert resolve_session_timezone("auto", None) == "" + assert resolve_session_timezone("", None) == "" + + +@pytest.mark.unit +def test_resolve_proxy_failure_raises(monkeypatch): + # With a proxy set, a failure must raise — never a silent host-TZ fallback. + def boom(proxy=None, **kw): + raise GeoTimezoneError("no egress") + + monkeypatch.setattr(_geo, "discover_egress_ip", boom) + with pytest.raises(GeoTimezoneError): + resolve_session_timezone("auto", SOCKS) + with pytest.raises(GeoTimezoneError): + resolve_session_timezone("", SOCKS) diff --git a/tests/test_geoip_update.py b/tests/test_geoip_update.py new file mode 100644 index 0000000..26632b7 --- /dev/null +++ b/tests/test_geoip_update.py @@ -0,0 +1,131 @@ +"""Unit tests for the intelligent geoip mmdb auto-update in `download.py`. + +daijro/geoip-all-in-one rebuilds weekly; `ensure_geoip_mmdb` keeps the cache +fresh without a download (or API call) on every launch. 
These tests mock the +cache root, the latest-tag API, and the per-tag download so nothing touches the +network. +""" +import os +import time + +import pytest + +import invisible_playwright.download as dl + + +@pytest.fixture +def cache(tmp_path, monkeypatch): + """Point the cache at tmp_path and clear the env override.""" + monkeypatch.setattr(dl, "cache_root", lambda: tmp_path) + monkeypatch.delenv("STEALTHFOX_GEOIP_MMDB", raising=False) + return tmp_path + + +def _make_cached(root, tag, name=dl.GEOIP_MMDB_NAME): + d = root / "geoip" / tag + d.mkdir(parents=True, exist_ok=True) + f = d / name + f.write_bytes(b"FAKE-MMDB") + return f + + +def _set_marker_age(root, days): + m = root / "geoip" / ".last_check" + m.parent.mkdir(parents=True, exist_ok=True) + m.touch() + old = time.time() - days * 86400 + os.utime(m, (old, old)) + + +# ────────────────────────────────────────────────────────────────────── +# env override +# ────────────────────────────────────────────────────────────────────── +@pytest.mark.unit +def test_env_override_returns_file(tmp_path, monkeypatch): + f = tmp_path / "mine.mmdb" + f.write_bytes(b"X") + monkeypatch.setenv("STEALTHFOX_GEOIP_MMDB", str(f)) + assert dl.ensure_geoip_mmdb() == f + + +@pytest.mark.unit +def test_env_override_missing_raises(tmp_path, monkeypatch): + monkeypatch.setenv("STEALTHFOX_GEOIP_MMDB", str(tmp_path / "nope.mmdb")) + with pytest.raises(RuntimeError): + dl.ensure_geoip_mmdb() + + +# ────────────────────────────────────────────────────────────────────── +# freshness window +# ────────────────────────────────────────────────────────────────────── +@pytest.mark.unit +def test_fresh_cache_no_network(cache, monkeypatch): + f = _make_cached(cache, "2026.06.03") + _set_marker_age(cache, 0) # just checked + + def boom(): + raise AssertionError("latest-tag API must NOT be called within the window") + + monkeypatch.setattr(dl, "_latest_geoip_tag", boom) + assert dl.ensure_geoip_mmdb(max_age_days=7) == f + + +@pytest.mark.unit 
+def test_stale_same_tag_no_download(cache, monkeypatch): + f = _make_cached(cache, "2026.06.03") + _set_marker_age(cache, 30) # stale → will re-check + monkeypatch.setattr(dl, "_latest_geoip_tag", lambda: "2026.06.03") + # real _download_geoip_tag runs but target exists, so no actual download: + monkeypatch.setattr(dl, "_download_file", lambda *a, **k: (_ for _ in ()).throw( + AssertionError("must not download when tag already cached"))) + assert dl.ensure_geoip_mmdb(max_age_days=7) == f + + +@pytest.mark.unit +def test_stale_new_tag_downloads_and_prunes(cache, monkeypatch): + old = _make_cached(cache, "2026.06.03") + _set_marker_age(cache, 30) + monkeypatch.setattr(dl, "_latest_geoip_tag", lambda: "2026.06.10") + + def fake_download(tag): + return _make_cached(cache, tag) # simulate fetch+extract of the new tag + + monkeypatch.setattr(dl, "_download_geoip_tag", fake_download) + got = dl.ensure_geoip_mmdb(max_age_days=7) + assert got.parent.name == "2026.06.10" + assert not old.parent.exists() # old tag pruned + assert got.exists() + + +# ────────────────────────────────────────────────────────────────────── +# offline resilience +# ────────────────────────────────────────────────────────────────────── +@pytest.mark.unit +def test_api_down_with_cache_uses_cache(cache, monkeypatch): + f = _make_cached(cache, "2026.06.03") + _set_marker_age(cache, 30) + + def boom(): + raise OSError("offline") + + monkeypatch.setattr(dl, "_latest_geoip_tag", boom) + assert dl.ensure_geoip_mmdb(max_age_days=7) == f # stale cache reused, no raise + + +@pytest.mark.unit +def test_cold_cache_api_down_falls_back_to_pinned(cache, monkeypatch): + # no cache at all + API unreachable → pinned GEOIP_MMDB_VERSION fallback. 
+ def boom(): + raise OSError("offline") + + monkeypatch.setattr(dl, "_latest_geoip_tag", boom) + captured = {} + + def fake_download(tag): + captured["tag"] = tag + return _make_cached(cache, tag) + + monkeypatch.setattr(dl, "_download_geoip_tag", fake_download) + got = dl.ensure_geoip_mmdb(max_age_days=7) + assert captured["tag"] == dl.GEOIP_MMDB_VERSION + assert got.exists() diff --git a/tests/test_recaptcha_seed.py b/tests/test_recaptcha_seed.py new file mode 100644 index 0000000..dbd1821 --- /dev/null +++ b/tests/test_recaptcha_seed.py @@ -0,0 +1,349 @@ +"""Unit tests for the deterministic reCAPTCHA cookie builder. + +Validates the contract: + - 6 .google.com cookies always present + - Per-site cookies built from a `browsing_history` list (sampled by the + Bayesian network in _fpforge) + - Determinism: same (seed, history) → identical content + - Chrome 400-day cookie cap respected + - Playwright add_cookies field requirements satisfied +""" +import pytest + +from invisible_playwright._recaptcha_seed import ( + build_cookies, + _sub_seed, +) + + +pytestmark = pytest.mark.unit + + +_FIXED_NOW = 1779600000 # 2026-05-23, frozen for determinism + + +# Sample browsing history for tests (mimics what _fpforge produces). +_SAMPLE_HISTORY = [ + {"name": "github.com", "category": "dev", "cookie_profile": "ga_cf"}, + {"name": "stackoverflow.com", "category": "dev", "cookie_profile": "ga_consent_clarity"}, + {"name": "amazon.com", "category": "shop", "cookie_profile": "ga_consent_clarity"}, + {"name": "wikipedia.org", "category": "reference", "cookie_profile": "minimal"}, + {"name": "youtube.com", "category": "media", "cookie_profile": "ga_only"}, +] + + +# =========================================================================== +# 1. 
Set composition +# =========================================================================== + +def test_only_google_cookies_when_no_history(): + """Empty/None history → only the 5 .google.com cookies (1P_JAR removed + in realism round 2 — deprecated by Google 2022).""" + cookies = build_cookies(seed=42, browsing_history=None, now=_FIXED_NOW) + names = sorted(c["name"] for c in cookies) + assert names == sorted(["NID", "CONSENT", "SOCS", + "_GRECAPTCHA", "ENID"]) + assert all(c["domain"] == ".google.com" for c in cookies) + + +def test_browsing_history_adds_host_cookies(): + """Each history site contributes 1+ cookies on its domain.""" + cookies = build_cookies(seed=42, browsing_history=_SAMPLE_HISTORY, now=_FIXED_NOW) + google = [c for c in cookies if c["domain"] == ".google.com"] + assert len(google) == 5 # 1P_JAR removed + + domains = {c["domain"] for c in cookies if c["domain"] != ".google.com"} + for site in _SAMPLE_HISTORY: + assert f".{site['name']}" in domains + + +def test_domain_dot_prefix_normalized(): + """All host cookie domains have a leading dot for sub-domain coverage.""" + cookies = build_cookies(seed=42, browsing_history=_SAMPLE_HISTORY, now=_FIXED_NOW) + for c in cookies: + assert c["domain"].startswith("."), f"missing dot: {c['domain']}" + + +# =========================================================================== +# 2. 
# Cookie profile recipes (each profile yields the expected cookie set)
# ===========================================================================

def _x_com_cookies(cookie_profile):
    """Build a jar for a single x.com visit and return only its host cookies."""
    visit = [{"name": "x.com", "cookie_profile": cookie_profile}]
    jar = build_cookies(seed=42, browsing_history=visit, now=_FIXED_NOW)
    return [entry for entry in jar if entry["domain"] == ".x.com"]


def test_profile_minimal_yields_ga_only():
    assert [entry["name"] for entry in _x_com_cookies("minimal")] == ["_ga"]


def test_profile_ga_only_yields_ga_and_gid():
    found = sorted(entry["name"] for entry in _x_com_cookies("ga_only"))
    assert found == ["_ga", "_gid"]


def test_profile_ga_cf_yields_ga_and_cf_bm():
    found = sorted(entry["name"] for entry in _x_com_cookies("ga_cf"))
    assert found == ["__cf_bm", "_ga"]


def test_profile_ga_consent_yields_three_cookies():
    host_cookies = _x_com_cookies("ga_consent")
    found = sorted(entry["name"] for entry in host_cookies)
    # Always _ga + _gid + one of OneTrust|CookieYes
    assert "_ga" in found and "_gid" in found
    assert any(banner in found for banner in ("OptanonAlertBoxClosed", "cookieyes-consent"))
    assert len(host_cookies) == 3


def test_profile_ga_consent_clarity_yields_at_least_four_cookies():
    """Baseline is _ga + _gid + _clck + a consent banner; _fbp, _dc_gtm_* and
    __hssrc may be added on top (probabilistic per rng — see
    test_new_helper_cookies_*)."""
    host_cookies = _x_com_cookies("ga_consent_clarity")
    found = sorted(entry["name"] for entry in host_cookies)
    assert "_ga" in found and "_gid" in found and "_clck" in found
    assert any(banner in found for banner in ("OptanonAlertBoxClosed", "cookieyes-consent"))
    assert len(host_cookies) >= 4  # 4 baseline + 0-3 helpers


def test_unknown_profile_falls_back_to_ga():
    found = [entry["name"] for entry in _x_com_cookies("nonexistent_profile")]
    assert found == ["_ga"]


# ===========================================================================
# 3. Determinism
# ===========================================================================

def test_same_seed_and_history_same_content():
    first_run, second_run = (
        build_cookies(seed=42, browsing_history=_SAMPLE_HISTORY, now=_FIXED_NOW)
        for _ in range(2)
    )
    assert first_run == second_run


def test_different_seed_different_content():
    def nid_value(seed):
        jar = build_cookies(seed=seed, browsing_history=_SAMPLE_HISTORY, now=_FIXED_NOW)
        return next(entry for entry in jar if entry["name"] == "NID")["value"]

    assert nid_value(42) != nid_value(99)


def test_history_order_does_not_affect_domain_specific_cookies():
    """The per-domain sub-seed depends on the domain name, not on the
    position of the site inside the history list."""
    def host_values(history):
        jar = build_cookies(seed=42, browsing_history=history, now=_FIXED_NOW)
        return {
            (entry["domain"], entry["name"]): entry["value"]
            for entry in jar
            if entry["domain"] != ".google.com"
        }

    first_site, second_site = _SAMPLE_HISTORY[0], _SAMPLE_HISTORY[1]
    forward = host_values([first_site, second_site])
    backward = host_values([second_site, first_site])
    assert forward == backward


def test_sub_seed_distinct_tags_distinct_streams():
    google_stream = _sub_seed(42, "google")
    github_stream = _sub_seed(42, "dom:github.com")
    amazon_stream = _sub_seed(42, "dom:amazon.com")
    assert google_stream != github_stream
    assert github_stream != amazon_stream
    assert _sub_seed(0, "any") != 0  # seed=0 still produces non-zero sub-seed


# ===========================================================================
# 4. Format / structural correctness for the Google batch
# ===========================================================================

def test_nid_format():
    jar = build_cookies(seed=42, now=_FIXED_NOW)
    nid_value = next(entry for entry in jar if entry["name"] == "NID")["value"]
    version, payload = nid_value.split("=", 1)
    assert version.isdigit() and len(version) == 3
    # Broadened to 100-540 in realism round 2 to cover historical NID versions
    assert 100 <= int(version) <= 540
    assert len(payload) == 178


def test_consent_format():
    jar = build_cookies(seed=42, now=_FIXED_NOW)
    consent_value = next(entry for entry in jar if entry["name"] == "CONSENT")["value"]
    assert consent_value.startswith("YES+cb.")
    assert "+FX+" in consent_value


# ===========================================================================
# 5. Chrome 400-day cookie cap compliance
# ===========================================================================

def test_all_expiries_within_400_day_cap():
    """Chrome 104+ caps cookie lifetime at 400 days; anything longer is
    silently truncated / dropped. Everything is tightened to <=395d (except
    __cf_bm, which is short-lived telemetry)."""
    jar = build_cookies(seed=42, browsing_history=_SAMPLE_HISTORY, now=_FIXED_NOW)
    latest_allowed = _FIXED_NOW + 400 * 86400
    # Short-lived telemetry cookies are fine
    exempt = ("__cf_bm", "1P_JAR", "_gid")
    for entry in (e for e in jar if e["name"] not in exempt):
        assert entry["expires"] <= latest_allowed, (
            f"Cookie {entry['name']} expires {entry['expires'] - _FIXED_NOW}s "
            f"(> 400d cap) — would be silently dropped"
        )


# ===========================================================================
# 6. Playwright add_cookies field requirements
# ===========================================================================

def test_all_cookies_have_required_playwright_fields():
    for entry in build_cookies(seed=42, browsing_history=_SAMPLE_HISTORY, now=_FIXED_NOW):
        assert entry.get("name"), f"missing name: {entry}"
        assert entry.get("value") is not None, f"missing value: {entry}"
        assert entry.get("domain"), f"missing domain: {entry}"
        assert entry.get("path") == "/", f"path != / for {entry['name']}"


def test_modern_cookies_marked_secure():
    """sameSite=None cookies must also be secure=True under Firefox/Chrome;
    the flag is generally needed as well for cookies injected through
    Playwright add_cookies without a navigation context."""
    jar = build_cookies(seed=42, browsing_history=_SAMPLE_HISTORY, now=_FIXED_NOW)
    cross_site = [entry for entry in jar if entry.get("sameSite") == "None"]
    for entry in cross_site:
        assert entry.get("secure") is True, f"{entry['name']} None+!secure invalid"


def test_httponly_on_signed_cookies():
    jar = build_cookies(seed=42, now=_FIXED_NOW)
    signed = [
        next(entry for entry in jar if entry["name"] == wanted)
        for wanted in ("NID", "ENID")
    ]
    for entry in signed:
        assert entry.get("httpOnly") is True


# ===========================================================================
# 7.
End-to-end with real fpforge Profile +# =========================================================================== + +def test_with_real_fpforge_profile(): + """End-to-end: generate a real Profile, ensure browsing_history is populated + and build_cookies works against it.""" + from invisible_playwright._fpforge import generate_profile + prof = generate_profile(seed=42) + assert isinstance(prof.browsing_history, list) + # The Bayesian network samples ~15-30 sites per persona + assert 5 <= len(prof.browsing_history) <= 50, \ + f"unexpected history length: {len(prof.browsing_history)}" + # Each entry has the expected fields + for site in prof.browsing_history: + assert "name" in site and "category" in site and "cookie_profile" in site + # build_cookies works against the real profile + cookies = build_cookies(seed=prof.seed, browsing_history=prof.browsing_history, + now=_FIXED_NOW) + # 6 google + at least 1 cookie per visited site + assert len(cookies) >= 6 + len(prof.browsing_history) + + +def test_same_seed_same_browsing_history_via_fpforge(): + """Profile.browsing_history is deterministic from seed (Bayesian sampler).""" + from invisible_playwright._fpforge import generate_profile + a = generate_profile(seed=42).browsing_history + b = generate_profile(seed=42).browsing_history + assert a == b + + +# =========================================================================== +# 8. Realism improvements (2026-05-24 round 2) +# =========================================================================== + +def test_no_1p_jar_cookie(): + """1P_JAR was deprecated by Google in 2022. 
Including it is an + anachronism flag for fingerprinters that look at cookie freshness.""" + cookies = build_cookies(seed=42, browsing_history=_SAMPLE_HISTORY, now=_FIXED_NOW) + names = {c["name"] for c in cookies} + assert "1P_JAR" not in names + + +def test_nid_prefix_broadened_range(): + """NID 3-digit prefix should cover historical versions (137/105/511/525 + seen in real captures) — range 100-540, not just 500-540.""" + seen_prefixes = set() + for seed in range(200): + cookies = build_cookies(seed=seed, now=_FIXED_NOW) + nid = next(c for c in cookies if c["name"] == "NID") + prefix = int(nid["value"].split("=", 1)[0]) + seen_prefixes.add(prefix) + assert min(seen_prefixes) < 500, f"NID range never goes below 500 ({sorted(seen_prefixes)[:5]})" + assert max(seen_prefixes) <= 540 + + +def test_consent_lang_from_timezone_eu(): + """CONSENT cookie's `lang+region` token derived from IANA timezone.""" + cookies = build_cookies(seed=42, now=_FIXED_NOW, timezone="Europe/Rome") + consent = next(c for c in cookies if c["name"] == "CONSENT") + assert ".it+IT+" in consent["value"], f"expected it+IT in: {consent['value']}" + + +def test_consent_lang_default_fx(): + """Unknown / US timezone → default `en+FX` (non-EU fallback).""" + cookies = build_cookies(seed=42, now=_FIXED_NOW, timezone="America/New_York") + consent = next(c for c in cookies if c["name"] == "CONSENT") + assert ".en+FX+" in consent["value"] + + +def test_consent_lang_de_for_berlin(): + cookies = build_cookies(seed=42, now=_FIXED_NOW, timezone="Europe/Berlin") + consent = next(c for c in cookies if c["name"] == "CONSENT") + assert ".de+DE+" in consent["value"] + + +def test_consent_lang_no_timezone_default(): + """timezone=None → default en+FX.""" + cookies = build_cookies(seed=42, now=_FIXED_NOW) + consent = next(c for c in cookies if c["name"] == "CONSENT") + assert ".en+FX+" in consent["value"] + + +def test_new_helper_cookies_appear_in_ga_consent_clarity(): + """ga_consent_clarity recipe should sometimes 
include _fbp, _dc_gtm_*, __hssrc + (probabilistic per rng). Check across many seeds that they appear.""" + saw_fbp = False + saw_gtm = False + saw_hssrc = False + history = [{"name": "site.com", "cookie_profile": "ga_consent_clarity"}] + for seed in range(100): + cookies = build_cookies(seed=seed, browsing_history=history, now=_FIXED_NOW) + names = {c["name"] for c in cookies if c["domain"] == ".site.com"} + if "_fbp" in names: saw_fbp = True + if any(n.startswith("_dc_gtm_") for n in names): saw_gtm = True + if "__hssrc" in names: saw_hssrc = True + assert saw_fbp, "_fbp never appeared in 100 seeds (rng pick broken)" + assert saw_gtm, "_dc_gtm_* never appeared in 100 seeds" + assert saw_hssrc, "__hssrc never appeared in 100 seeds" + + +def test_fbp_format(): + """_fbp format: fb...""" + history = [{"name": "x.com", "cookie_profile": "ga_consent_clarity"}] + # Try multiple seeds until we hit a seed that includes _fbp (50% chance) + for seed in range(20): + cookies = build_cookies(seed=seed, browsing_history=history, now=_FIXED_NOW) + fbp = next((c for c in cookies if c["name"] == "_fbp"), None) + if fbp: + parts = fbp["value"].split(".") + assert parts[0] == "fb" + assert parts[1].isdigit() + assert parts[2].isdigit() and len(parts[2]) >= 13 # unix ms + assert parts[3].isdigit() + return + raise AssertionError("never got _fbp across 20 seeds — distribution broken") diff --git a/tests/test_version.py b/tests/test_version.py new file mode 100644 index 0000000..7702f7f --- /dev/null +++ b/tests/test_version.py @@ -0,0 +1,103 @@ +"""Regression tests for issue #24: CLI version reporting. + +Two distinct symptoms reported by `i43-j`: + 1. `python -m invisible_playwright --version` errored out (only the + `version` subcommand worked). + 2. `python -m invisible_playwright version` printed the literal string + "0.1.0" regardless of the installed version (a stale hardcoded + `__version__` in __init__.py that nobody had remembered to bump). 
+ +These tests pin down both behaviours so the regressions don't sneak back +in via a future copy/paste. +""" +import io +import re +import subprocess +import sys +from contextlib import redirect_stdout + +import pytest + +import invisible_playwright +from invisible_playwright import __version__, cli + + +pytestmark = pytest.mark.unit + + +def test_version_matches_installed_package_metadata(): + """__version__ must come from importlib.metadata, not a hardcoded literal, + so it can never drift from the pyproject.toml `version` field.""" + from importlib.metadata import version as pkg_version + assert __version__ == pkg_version("invisible-playwright") + + +def test_version_is_not_the_stale_010_string(): + """Issue #24 regression: __version__ used to be hardcoded as '0.1.0' + and never updated. If this ever returns to a literal '0.1.0' the + package has been published or shipped with stale metadata.""" + assert __version__ != "0.1.0", ( + "__version__ is the stale hardcoded '0.1.0' string — issue #24 has " + "regressed. Use importlib.metadata to derive it from pyproject.toml." + ) + + +def test_version_subcommand_prints_real_version(): + """`invisible-playwright version` must print the actual installed version, + not the old hardcoded '0.1.0'.""" + buf = io.StringIO() + with redirect_stdout(buf): + rc = cli.main(["version"]) + assert rc == 0 + out = buf.getvalue() + assert f"invisible_playwright {__version__}" in out + assert "0.1.0" not in out or __version__ == "0.1.0" # safety: only allowed if truly 0.1.0 + assert "BINARY_VERSION=" in out + assert "Firefox " in out + + +def test_dash_dash_version_flag_works(): + """Issue #24 reporter: `python -m invisible_playwright --version` used to + error with 'the following arguments are required: cmd' because there was + no top-level --version flag, only the `version` subcommand. Now the + Python convention works too.""" + # argparse's --version action calls sys.exit(0) directly, so use subprocess. 
+ r = subprocess.run( + [sys.executable, "-m", "invisible_playwright", "--version"], + capture_output=True, text=True, timeout=15, + ) + assert r.returncode == 0, f"--version returned {r.returncode}, stderr={r.stderr!r}" + # argparse may emit on stdout or stderr depending on version + combined = r.stdout + r.stderr + assert "invisible_playwright" in combined + assert __version__ in combined + + +def test_no_args_prints_help_not_traceback(): + """`python -m invisible_playwright` with no args should be graceful + (print help, exit non-zero) rather than crashing with a traceback.""" + r = subprocess.run( + [sys.executable, "-m", "invisible_playwright"], + capture_output=True, text=True, timeout=15, + ) + # Either prints help (rc=2) or shows usage. Must NOT contain a traceback. + assert "Traceback" not in (r.stdout + r.stderr) + assert "usage:" in (r.stdout + r.stderr).lower() + + +def test_dash_V_short_flag_works(): + """Alias `-V` for `--version` (Python convention).""" + r = subprocess.run( + [sys.executable, "-m", "invisible_playwright", "-V"], + capture_output=True, text=True, timeout=15, + ) + assert r.returncode == 0 + assert __version__ in (r.stdout + r.stderr) + + +def test_version_matches_semver_shape(): + """Sanity: version should look like a semver (digits.digits.digits) + or a PEP-440 dev marker, not a placeholder string.""" + assert re.match(r"^\d+\.\d+\.\d+", __version__), ( + f"__version__ {__version__!r} doesn't look like a real version" + ) diff --git a/tests/test_webrtc_realness.py b/tests/test_webrtc_realness.py new file mode 100644 index 0000000..fec01c0 --- /dev/null +++ b/tests/test_webrtc_realness.py @@ -0,0 +1,442 @@ +"""WebRTC realness regression tests. + +Two layers, both runnable on GitHub CI: + +* **unit** (`@pytest.mark.unit`) — pure SDP/candidate assertions against golden + samples. No browser, no proxy, no network. 
"""  # Closes the module docstring opened on the previous line; its remaining notes follow as comments. """
#
# Unit layer (continued): these sentinels lock in every rule we found on
# 2026-06-06: host must be mDNS ``.local``; the synthetic srflx must carry the
# egress IP with a GENUINE nICEr priority (never ``local_pref == 0xFFFF``) and a
# stable, distinct foundation; CreepJS's resolver must return the egress, and a
# host-only SDP must read as "blocked". They run in the standard ``tests.yml``.
#
# * **e2e** (`@pytest.mark.e2e`) — launch the patched binary and verify the live
#   ICE gather. "Being behind a proxy" is faked WITHOUT smartproxy:
#     - the egress IP is injected via ``STEALTHFOX_WEBRTC_PUBLIC_IP`` (RFC 5737
#       TEST-NET, so it never collides with a real IP);
#     - the "behind a TCP-only SOCKS proxy" condition is reproduced by a tiny
#       in-process SOCKS5 server that relays TCP CONNECT but refuses UDP ASSOCIATE
#       (exactly a residential TCP-only proxy → WebRTC's default-route UDP probe
#       fails → exercises the Fix C fallback). No credentials, no external proxy.
#   Excluded from the default run; a binary is located via ``STEALTHFOX_E2E_BINARY``
#   (or the locally-built tree), else the test skips.
from __future__ import annotations

import os
import re
import select
import socket
import struct
import threading
from http.server import BaseHTTPRequestHandler, HTTPServer

import pytest

# ──────────────────────────────────────────────────────────────────────────
# Pure SDP / ICE-candidate helpers (no I/O) — the heart of the sentinels.
# ──────────────────────────────────────────────────────────────────────────
# Field order follows the ICE candidate attribute:
#   candidate:<foundation> <component> <proto> <priority> <address> <port>
#   typ <typ> [... raddr <raddr> rport <rport>]
# Every group is named because parse_candidate() and its callers read the
# match through groupdict() keys.
_CAND = re.compile(
    r"candidate:(?P<foundation>\S+)\s+(?P<component>\d+)\s+(?P<proto>UDP|TCP|udp|tcp)\s+"
    r"(?P<priority>\d+)\s+(?P<address>\S+)\s+(?P<port>\d+)\s+typ\s+(?P<typ>\w+)"
    r"(?:.*?raddr\s+(?P<raddr>\S+)\s+rport\s+(?P<rport>\d+))?"
)


def parse_candidate(line):
    """Parse one ``a=candidate:`` / ``candidate:`` line into a dict (or None).

    The returned dict has the keys ``foundation``, ``component``, ``proto``,
    ``priority``, ``address``, ``port``, ``typ``, ``raddr`` and ``rport``.
    ``component``, ``priority`` and ``port`` are ints, ``proto`` is
    upper-cased, and ``raddr``/``rport`` are ``None`` when the line carries no
    related address (``rport`` is an int otherwise).
    """
    m = _CAND.search(line)
    if not m:
        return None
    d = m.groupdict()
    d["component"] = int(d["component"])
    d["priority"] = int(d["priority"])
    d["port"] = int(d["port"])
    d["proto"] = d["proto"].upper()
    if d["rport"] is not None:
        d["rport"] = int(d["rport"])
    return d


def decode_priority(prio):
    """Split a candidate priority into nICEr's fields (RFC 5245 layout that
    nICEr emits: type<<24 | iface<<16 | dir<<13 | stun<<8 | (256-component)).

    ``local_pref`` is the whole 16-bit field in bits 8-23; ``iface_pref``,
    ``direction`` and ``stun_priority`` are sub-fields of those same bits, so
    the returned values overlap by construction.
    """
    return {
        "type_pref": (prio >> 24) & 0xFF,
        "iface_pref": (prio >> 16) & 0xFF,
        "local_pref": (prio >> 8) & 0xFFFF,
        "direction": (prio >> 13) & 0x7,
        "stun_priority": (prio >> 8) & 0x1F,
        "component": 256 - (prio & 0xFF),
    }


def is_mdns(addr):
    """Return True when ``addr`` is a non-empty name ending in ``.local``."""
    return bool(addr) and str(addr).endswith(".local")


def candidates(sdp_or_lines):
    """Return the parsed candidates found in an SDP string or in an iterable
    of candidate lines; lines that do not parse are dropped."""
    if isinstance(sdp_or_lines, str):
        lines = re.findall(r"(?:a=)?candidate:[^\r\n]*", sdp_or_lines)
    else:
        lines = list(sdp_or_lines)
    parsed = (parse_candidate(line) for line in lines)
    return [c for c in parsed if c]


def host_candidates(cands):
    """Keep only the ``typ host`` candidates."""
    return [c for c in cands if c["typ"] == "host"]


def srflx_candidates(cands):
    """Keep only the ``typ srflx`` (server-reflexive) candidates."""
    return [c for c in cands if c["typ"] == "srflx"]


def host_is_mdns(cands):
    """Every host candidate must be a ``.local`` mDNS name, never a raw
    LAN IP (the §9.4 leak form that fails BrowserLeaks).

    Returns False when there is no host candidate at all.
    """
    hosts = host_candidates(cands)
    return bool(hosts) and all(is_mdns(c["address"]) for c in hosts)


def srflx_realness(cand, expected_ip=None):
    """Return (ok, reasons) for whether ``cand`` looks like a GENUINE nICEr UDP
    server-reflexive candidate. Encodes the 2026-06-06 findings.

    ``reasons`` lists every failed rule (empty when ``ok`` is True). A
    candidate that is not srflx short-circuits with a single reason.
    """
    reasons = []
    if cand["typ"] != "srflx":
        reasons.append("not a srflx candidate")
        return False, reasons
    if expected_ip is not None and cand["address"] != expected_ip:
        reasons.append(f"address {cand['address']} != expected {expected_ip}")
    p = decode_priority(cand["priority"])
    if p["type_pref"] != 100:
        reasons.append(f"type_pref {p['type_pref']} != 100 (SRV_RFLX)")
    if p["local_pref"] == 0xFFFF:
        reasons.append("local_pref == 0xFFFF — impossible nICEr value (the old hardcoded tell)")
    elif not (0x7000 <= p["local_pref"] < 0x8000):
        reasons.append(f"local_pref {p['local_pref']} outside the genuine ~0x7E00-0x7FFF band")
    if not (16 <= p["stun_priority"] <= 31):
        reasons.append(f"stun_priority {p['stun_priority']} implausible (expect 31-server_id)")
    # A missing raddr is accepted; a present one must be the redacted 0.0.0.0.
    if cand.get("raddr") not in (None, "0.0.0.0"):
        reasons.append(f"raddr {cand['raddr']} not redacted to 0.0.0.0")
    return (not reasons), reasons


def creep_get_ipaddress(sdp):
    """Faithful port of CreepJS's getIPAddress(sdp): connection line first, then
    the first candidate IP; '0.0.0.0' counts as blocked. Returns None if blocked
    — i.e. exactly what makes CreepJS render 'stun connection: blocked'."""
    blocked = "0.0.0.0"
    # "c=IN IP4 <ip>" splits into three tokens; the third is the connection IP.
    conn = (re.findall(r"c=IN\s.+\s", sdp) or [""])[0].strip().split(" ")
    conn_ip = conn[2] if len(conn) > 2 else ""
    if conn_ip and conn_ip != blocked:
        return conn_ip
    # The address class has no '-', so a UUID-style .local host name cannot
    # satisfy the trailing-whitespace lookahead and is skipped.
    m = re.search(r"(udp|tcp)\s(?:\d|\w)+\s((?:\d|\w|\.|:)+)(?=\s)", sdp, re.I)
    ip = m.group(2) if m else None
    return ip if (ip and ip != blocked) else None


# ──────────────────────────────────────────────────────────────────────────
# Golden samples — real priority/foundation values, TEST-NET IPs (RFC 5737)
# so no real address is ever committed (feedback_pre_push_privacy_check).
+# ────────────────────────────────────────────────────────────────────────── +HOST_MDNS = "candidate:0 1 UDP 2122252543 1460e928-16b3-4c66-80ad-04abcdef0000.local 54551 typ host" +HOST_RAW_IP = "candidate:0 1 UDP 2122252543 192.168.1.20 54551 typ host" # §9.4 leak form +VANILLA_SRFLX = "candidate:1 1 UDP 1685987327 203.0.113.50 3755 typ srflx raddr 0.0.0.0 rport 0" +OURS_SRFLX = "candidate:1 1 UDP 1686052863 203.0.113.7 58555 typ srflx raddr 0.0.0.0 rport 0" +# Pre-fix injection: local_pref hardcoded to 0xFFFF (priority 1694498815). The tell. +OLD_BAD_SRFLX = "candidate:2 1 UDP 1694498815 203.0.113.7 58555 typ srflx raddr 0.0.0.0 rport 0" + +SDP_GOOD = ( + "v=0\r\nc=IN IP4 0.0.0.0\r\n" + f"a={HOST_MDNS}\r\na={OURS_SRFLX}\r\n" +) +SDP_BLOCKED = "v=0\r\nc=IN IP4 0.0.0.0\r\n" f"a={HOST_MDNS}\r\n" # host-only, no srflx + + +# ────────────────────────────────────────────────────────────────────────── +# UNIT sentinels (run on GitHub CI) +# ────────────────────────────────────────────────────────────────────────── +@pytest.mark.unit +def test_parse_and_decode_basics(): + c = parse_candidate(OURS_SRFLX) + assert c["typ"] == "srflx" and c["proto"] == "UDP" + assert c["address"] == "203.0.113.7" and c["raddr"] == "0.0.0.0" and c["rport"] == 0 + p = decode_priority(c["priority"]) + assert p["type_pref"] == 100 and p["stun_priority"] == 31 and p["component"] == 1 + + +@pytest.mark.unit +def test_genuine_srflx_passes(): + for line in (VANILLA_SRFLX, OURS_SRFLX): + ok, reasons = srflx_realness(parse_candidate(line), expected_ip=parse_candidate(line)["address"]) + assert ok, reasons + + +@pytest.mark.unit +def test_old_0xffff_srflx_is_rejected(): + """Fix A sentinel: local_pref == 0xFFFF must be flagged as fake.""" + ok, reasons = srflx_realness(parse_candidate(OLD_BAD_SRFLX)) + assert not ok + assert any("0xFFFF" in r for r in reasons), reasons + + +@pytest.mark.unit +def test_host_must_be_mdns_not_raw_ip(): + """§9.4 sentinel: raw-IP host candidate is a leak; .local is 
required.""" + assert host_is_mdns(candidates([HOST_MDNS])) is True + assert host_is_mdns(candidates([HOST_RAW_IP])) is False + + +@pytest.mark.unit +def test_srflx_foundation_distinct_from_host(): + """Fix B sentinel: srflx foundation must differ from the host foundations.""" + cands = candidates([HOST_MDNS, OURS_SRFLX]) + host_fnds = {c["foundation"] for c in host_candidates(cands)} + srflx_fnds = {c["foundation"] for c in srflx_candidates(cands)} + assert srflx_fnds and srflx_fnds.isdisjoint(host_fnds) + + +@pytest.mark.unit +def test_creep_resolver_returns_egress_when_srflx_present(): + assert creep_get_ipaddress(SDP_GOOD) == "203.0.113.7" + + +@pytest.mark.unit +def test_creep_resolver_reports_blocked_for_host_only(): + """The exact false-green we shipped: host-only (.local) SDP → no public IP + → CreepJS shows 'blocked'. The resolver must return None here.""" + assert creep_get_ipaddress(SDP_BLOCKED) is None + + +@pytest.mark.unit +def test_mdns_host_is_invisible_to_creep_resolver(): + """A .local host must NOT be mis-read as an IP (the hyphen in the UUID is + what makes CreepJS skip it and fall through to the srflx).""" + assert creep_get_ipaddress("v=0\r\nc=IN IP4 0.0.0.0\r\n" f"a={HOST_MDNS}\r\n") is None + + +# ────────────────────────────────────────────────────────────────────────── +# Fake-proxy infrastructure for e2e: a tiny TCP-only SOCKS5 server. +# ────────────────────────────────────────────────────────────────────────── +class _Socks5TcpOnly: + """Minimal SOCKS5: no-auth, CONNECT (TCP) relayed, UDP ASSOCIATE refused. + + Reproduces a residential TCP-only proxy: pages load over TCP, but WebRTC's + UDP path is dead — which (for a no-camera page in default_address_only mode) + is exactly what made the default-route probe fail and ICE return zero + candidates before Fix C. 
+ """ + + def __init__(self): + self._srv = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + self._srv.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + self._srv.bind(("127.0.0.1", 0)) + self._srv.listen(16) + self.port = self._srv.getsockname()[1] + self.udp_associate_attempts = 0 + self._stop = False + self._t = threading.Thread(target=self._serve, daemon=True) + self._t.start() + + def _serve(self): + while not self._stop: + try: + conn, _ = self._srv.accept() + except OSError: + break + threading.Thread(target=self._handle, args=(conn,), daemon=True).start() + + def _recv_exact(self, sock, n): + buf = b"" + while len(buf) < n: + chunk = sock.recv(n - len(buf)) + if not chunk: + return None + buf += chunk + return buf + + def _handle(self, conn): + try: + head = self._recv_exact(conn, 2) + if not head or head[0] != 0x05: + conn.close() + return + nmethods = head[1] + self._recv_exact(conn, nmethods) + conn.sendall(b"\x05\x00") # no-auth + req = self._recv_exact(conn, 4) + if not req: + conn.close() + return + ver, cmd, _, atyp = req + if atyp == 0x01: + addr = socket.inet_ntoa(self._recv_exact(conn, 4)) + elif atyp == 0x03: + ln = self._recv_exact(conn, 1)[0] + addr = self._recv_exact(conn, ln).decode("ascii", "ignore") + elif atyp == 0x04: + addr = socket.inet_ntop(socket.AF_INET6, self._recv_exact(conn, 16)) + else: + conn.close() + return + port = struct.unpack("!H", self._recv_exact(conn, 2))[0] + if cmd != 0x01: # not CONNECT (e.g. 
UDP ASSOCIATE) → refuse + self.udp_associate_attempts += 1 + conn.sendall(b"\x05\x07\x00\x01\x00\x00\x00\x00\x00\x00") # cmd not supported + conn.close() + return + try: + upstream = socket.create_connection((addr, port), timeout=15) + except OSError: + conn.sendall(b"\x05\x04\x00\x01\x00\x00\x00\x00\x00\x00") # host unreachable + conn.close() + return + conn.sendall(b"\x05\x00\x00\x01\x00\x00\x00\x00\x00\x00") # success + self._relay(conn, upstream) + except Exception: + try: + conn.close() + except Exception: + pass + + def _relay(self, a, b): + try: + while True: + r, _, _ = select.select([a, b], [], [], 30) + if not r: + break + for s in r: + data = s.recv(65536) + if not data: + return + (b if s is a else a).sendall(data) + finally: + for s in (a, b): + try: + s.close() + except Exception: + pass + + def close(self): + self._stop = True + try: + self._srv.close() + except Exception: + pass + + +# Same per-event probe CreepJS runs (kept tiny; raw string = one escape level). +_PROBE_JS = r"""async () => { + const pc = new RTCPeerConnection({iceCandidatePoolSize:1, iceServers:[{urls:[ + 'stun:stun4.l.google.com:19302','stun:stun3.l.google.com:19302']}]}); + pc.createDataChannel(''); + const cands = []; + pc.addEventListener('icecandidate', e => { if (e.candidate && e.candidate.candidate) cands.push(e.candidate.candidate); }); + await pc.setLocalDescription(await pc.createOffer({offerToReceiveAudio:1, offerToReceiveVideo:1})); + await new Promise(r => setTimeout(r, 3500)); + const sdp = (pc.localDescription && pc.localDescription.sdp) || ''; + try { pc.close(); } catch(e) {} + return { candidates: cands, sdp }; +}""" + +_FAKE_EGRESS = "203.0.113.7" # RFC 5737 TEST-NET-3 + + +def _e2e_binary(): + cand = os.environ.get("STEALTHFOX_E2E_BINARY") + if cand and os.path.exists(cand): + return cand + built = r"C:\ff\source\obj-x86_64-pc-windows-msvc\dist\bin\firefox.exe" + if os.path.exists(built): + return built + return None + + +@pytest.fixture +def socks5_tcp_only(): 
+    srv = _Socks5TcpOnly()
+    yield srv
+    srv.close()
+
+
+@pytest.fixture
+def local_https_page():
+    """Serve a tiny HTML body over plain HTTP on 127.0.0.1 (ephemeral port) and yield its URL; no TLS despite the name (used by the no-proxy srflx test)."""
+    class H(BaseHTTPRequestHandler):
+        def do_GET(self):
+            self.send_response(200)
+            self.send_header("Content-Type", "text/html")
+            self.end_headers()
+            self.wfile.write(b"wrtc")
+
+        def log_message(self, *a):
+            pass  # no-op override: keeps the handler's per-request logging out of the test output
+
+    httpd = HTTPServer(("127.0.0.1", 0), H)  # port 0 -> the OS assigns a free port
+    threading.Thread(target=httpd.serve_forever, daemon=True).start()
+    yield f"http://127.0.0.1:{httpd.server_address[1]}/"
+    httpd.shutdown()  # NOTE(review): stops serve_forever(), but server_close() is never called
+
+
+def _launch(**extra):  # builds InvisiblePlaywright from the defaults below; ``extra`` kwargs override them
+    from invisible_playwright import InvisiblePlaywright
+
+    kw = {"headless": True,
+          # Fixed zone so the wrapper does NOT run timezone="auto" egress
+          # discovery through the (fake) proxy — irrelevant here, we inject the
+          # egress IP directly and want the launch deterministic/offline.
+          "timezone": "America/New_York",
+          "extra_prefs": {"media.peerconnection.ice.obfuscate_host_addresses": True}}
+    kw.update(extra)  # shallow update: an ``extra_prefs`` passed in ``extra`` replaces the default dict wholesale
+    return InvisiblePlaywright(**kw)
+
+
+@pytest.mark.e2e
+def test_srflx_is_real_and_resolvable(local_https_page):
+    """No proxy needed: the egress is faked via the env. Asserts the live srflx
+    is genuine (Fix A/B) and that CreepJS's resolver returns it (not blocked)."""
+    binary = _e2e_binary()
+    if not binary:
+        pytest.skip("no patched binary (set STEALTHFOX_E2E_BINARY)")
+    os.environ["STEALTHFOX_WEBRTC_PUBLIC_IP"] = _FAKE_EGRESS  # NOTE(review): process-wide, not restored after the test
+    os.environ["STEALTHFOX_WEBRTC_DISABLE_IPV6"] = "1"  # NOTE(review): same, leaks into later tests in this process
+    with _launch(binary_path=binary) as browser:
+        page = browser.new_context().new_page()
+        page.goto(local_https_page, wait_until="domcontentloaded", timeout=60000)
+        res = page.evaluate(_PROBE_JS)
+    cands = candidates(res["candidates"])
+    assert cands, "ICE produced ZERO candidates (blocked)"
+    assert host_is_mdns(cands), [c["address"] for c in host_candidates(cands)]
+    srflx = [c for c in srflx_candidates(cands) if c["address"] == _FAKE_EGRESS]
+    assert srflx, f"no synthetic srflx with {_FAKE_EGRESS}: {res['candidates']}"
+    ok, reasons = srflx_realness(srflx[0], expected_ip=_FAKE_EGRESS)
+    assert ok, reasons
+    # Two srflx for the same base must share ONE stable foundation (Fix B).
+    assert len({c["foundation"] for c in srflx}) == 1
+    assert creep_get_ipaddress(res["sdp"]) == _FAKE_EGRESS
+
+
+@pytest.mark.e2e
+def test_not_blocked_behind_tcp_only_socks(socks5_tcp_only):
+    """Fix C sentinel: behind a TCP-only SOCKS proxy on a remote origin, ICE
+    must still complete (host .local + synthetic srflx), not return zero
+    candidates. Without Fix C this page is fully 'blocked'."""
+    binary = _e2e_binary()
+    if not binary:
+        pytest.skip("no patched binary (set STEALTHFOX_E2E_BINARY)")
+    os.environ["STEALTHFOX_WEBRTC_PUBLIC_IP"] = _FAKE_EGRESS  # NOTE(review): process-wide, not restored after the test
+    os.environ["STEALTHFOX_WEBRTC_DISABLE_IPV6"] = "1"  # NOTE(review): same, leaks into later tests in this process
+    proxy = {"server": f"socks5://127.0.0.1:{socks5_tcp_only.port}"}
+    try:
+        with _launch(binary_path=binary, proxy=proxy) as browser:
+            page = browser.new_context().new_page()
+            # remote origin loaded THROUGH the local SOCKS proxy (not localhost,
+            # so no proxy-bypass) → WebRTC proxy config active → Fix C path.
+            page.goto("https://example.com/", wait_until="domcontentloaded", timeout=70000)
+            res = page.evaluate(_PROBE_JS)
+    except Exception as exc:  # network/proxy unavailable in this environment; NOTE(review): any launch/goto/evaluate error also ends up skipped here
+        pytest.skip(f"proxy/network path unavailable: {exc!r}")
+    cands = candidates(res["candidates"])
+    assert cands, "behind SOCKS the gather returned ZERO candidates — Fix C regressed (blocked)"
+    assert host_is_mdns(cands)
+    assert any(c["address"] == _FAKE_EGRESS for c in srflx_candidates(cands)), res["candidates"]
+    assert creep_get_ipaddress(res["sdp"]) == _FAKE_EGRESS
diff --git a/tests/unit/test_config_public.py b/tests/unit/test_config_public.py
new file mode 100644
index 0000000..0e26e36
--- /dev/null
+++ b/tests/unit/test_config_public.py
@@ -0,0 +1,125 @@
+"""Unit tests for the public ``config`` helpers."""
+
+import pytest
+
+from invisible_playwright import (
+    ensure_binary,
+    get_default_args,
+    get_default_stealth_prefs,
+)
+from invisible_playwright.config import get_default_stealth_prefs as _direct
+
+
+pytestmark = pytest.mark.unit
+
+
+def test_get_default_args_is_empty_list():
+    """Currently no baseline CLI args, but must return a list (mutable, fresh each call)."""
+    args = get_default_args()
+    assert args == []
+    assert isinstance(args, list)
+    args.append("--foo")
+    # next call must return a fresh empty list, not the mutated one
+    assert get_default_args() == []
+
+
+def test_get_default_stealth_prefs_random_seed_returns_dict():
+    """No seed -> a non-empty dict that carries the humanize toggle, set to True."""
+    prefs = get_default_stealth_prefs()
+    assert isinstance(prefs, dict)
+    assert len(prefs) > 0
+    # humanize toggle is always set explicitly
+    assert "invisible_playwright.humanize" in prefs
+    assert prefs["invisible_playwright.humanize"] is True
+
+
+def test_get_default_stealth_prefs_seed_is_deterministic():
+    """Same seed -> equal prefs dicts across calls (compared with ``==``)."""
+    a = get_default_stealth_prefs(seed=42)
+    b = get_default_stealth_prefs(seed=42)
+    assert a == b
+
+
+def test_get_default_stealth_prefs_different_seeds_differ():
+    """Different seeds -> different prefs."""
+    a = get_default_stealth_prefs(seed=1)
+    b = get_default_stealth_prefs(seed=2)
+    assert a != b
+
+
+def test_humanize_false_disables_prefs():
+    """humanize=False removes the maxTime knob and flips the toggle to False."""
+    prefs = get_default_stealth_prefs(seed=42, humanize=False)
+    assert prefs["invisible_playwright.humanize"] is False
+    assert "invisible_playwright.humanize.maxTime" not in prefs
+
+
+def test_humanize_default_sets_max_time_1_5():
+    """humanize=True -> default maxTime is 1.5s, stored as string."""
+    prefs = get_default_stealth_prefs(seed=42, humanize=True)
+    assert prefs["invisible_playwright.humanize"] is True
+    assert prefs["invisible_playwright.humanize.maxTime"] == "1.5"
+
+
+def test_humanize_float_overrides_max_time():
+    """Float for humanize is the explicit cap in seconds, stored as its string form."""
+    prefs = get_default_stealth_prefs(seed=42, humanize=3.0)
+    assert prefs["invisible_playwright.humanize"] is True
+    assert prefs["invisible_playwright.humanize.maxTime"] == "3.0"
+
+
+def test_extra_prefs_overlay_takes_precedence():
+    """A key supplied only through extra_prefs appears in the result with the given value."""
+    prefs = get_default_stealth_prefs(
+        seed=42, extra_prefs={"some.custom.pref": 999}
+    )
+    assert prefs["some.custom.pref"] == 999
+
+
+def test_extra_prefs_can_override_baseline():
+    """A key in extra_prefs that also exists in baseline gets overridden."""
+    baseline = get_default_stealth_prefs(seed=42)
+    a_baseline_key = next(iter(baseline.keys()))
+    overridden = get_default_stealth_prefs(
+        seed=42, extra_prefs={a_baseline_key: "OVERRIDDEN_SENTINEL"}
+    )
+    assert overridden[a_baseline_key] == "OVERRIDDEN_SENTINEL"
+
+
+def test_locale_argument_changes_prefs():
+    """Different locales produce different prefs (presumably via Accept-Language; only inequality is asserted)."""
+    en = get_default_stealth_prefs(seed=42, locale="en-US")
+    it = get_default_stealth_prefs(seed=42, locale="it-IT")
+    assert en != it
+
+
+def test_timezone_argument_changes_prefs():
+    """Different timezones produce different prefs (only inequality is asserted)."""
+    ny = get_default_stealth_prefs(seed=42, timezone="America/New_York")
+    rome = get_default_stealth_prefs(seed=42, timezone="Europe/Rome")
+    assert ny != rome
+
+
+def test_pin_argument_forces_specific_fields():
+    """Passing pin= changes the output versus the same seed without it (only inequality is asserted)."""
+    plain = get_default_stealth_prefs(seed=42)
+    pinned = get_default_stealth_prefs(
+        seed=42, pin={"hardware.concurrency": 999}
+    )
+    # something in the dict must differ vs the plain seed=42 build
+    assert plain != pinned
+
+
+def test_public_import_matches_direct_import():
+    """Top-level re-export and direct module import return identical output."""
+    a = get_default_stealth_prefs(seed=42)
+    b = _direct(seed=42)
+    assert a == b
+
+
+def test_ensure_binary_is_callable_via_public_namespace():
+    """ensure_binary re-exported at the package root is the very same object as download.ensure_binary."""
+    # We don't invoke it (would trigger a network download in CI) — just
+    # verify the public attribute is the same callable as the underlying.
+    from invisible_playwright.download import ensure_binary as _direct_eb
+    assert ensure_binary is _direct_eb