feat: deterministic reCAPTCHA cookie pre-seed via Bayesian browsing history

Adds opt-in helper that auto-injects coherent cookie history into every
BrowserContext created via new_context(). Content is fully deterministic
from the persona seed so a given seed always presents the same cookies
across sessions.

Composition (per persona, all derived from seed):
  - 5 cookies on .google.com (NID, CONSENT, SOCS, _GRECAPTCHA, ENID).
    Excludes 1P_JAR which was deprecated by Google in 2022. CONSENT
    `lang+region` token derived from the persona's IANA timezone
    (Europe/Rome -> it+IT, America/* -> en+FX, etc.). NID prefix
    broadened to 100-540 to cover historical versions.
  - Per-site cookies on 13-25 "visited" everyday domains, sampled from a
    Bayesian network conditioned on gpu_class - workstation/high_end
    personas trend toward dev/tech sites, low_end/integrated_old trend
    toward shop/news/reference. Each site contributes 1-7 cookies based
    on a `cookie_profile` tag. Cookie pool includes _ga, _gid, _clck,
    _clsk, __cf_bm, OneTrust/CookieYes consent, _fbp (Facebook Pixel),
    _dc_gtm_<id> (Tag Manager helper), __hssrc (HubSpot helper).

API:
    Stealthfox(seed=42, prep_recaptcha=True)

No per-call configuration: visited-sites + cookie composition all derived
from the persona seed via the Bayesian sampler.

Gated server-side: forced False if profile_dir is set (persistent profile
owns its own state). All expiries capped to 395 days per Chrome/Firefox
400-day RFC 6265bis-15 limit.

Bayesian integration:
  - New `derive_browsing_history(gpu_class, rng)` in _fpforge/_sampler.py
    (parallel to `derive_font_prefs`).
  - New data files: browsing_pool.json (50 site entries) and
    cpt_browsing_given_class.json (per-class probabilities).
  - Profile dataclass exposes `browsing_history` field.
  - _recaptcha_seed.py consumes Profile.browsing_history; receives
    timezone separately to derive CONSENT lang+region.

Also drops a dead Chromium-only e2e test that always skipped on our
Firefox-only wrapper.

Test coverage: 29 unit tests covering composition, profile recipes
(minimal/ga_only/ga_cf/ga_consent/ga_consent_clarity), determinism,
Chrome 400-day cap, Playwright field requirements, CONSENT lang
mapping (IT/DE/US/default), helper-cookie probability distributions,
end-to-end with real fpforge Profile.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
feder-cr 2026-05-24 19:18:58 -07:00
parent 413db06690
commit 3d8ba0b82c
9 changed files with 956 additions and 11 deletions

View file

@ -84,6 +84,12 @@ _FONT_POOL = _load("font_pool.json")
_FONT_CORE: list = _FONT_POOL["core"]
_FONT_OPTIONAL: list = _FONT_POOL["optional"]
_CPT_FONTS_OPT = _load("cpt_fonts_optional_given_class.json")["table"]
# Browsing-history pool + CPT (per-class probabilities for visited sites).
# Drives _recaptcha_seed's cookie pre-seed: each persona ends up with a
# coherent list of ~15-30 visited sites whose categories correlate with
# gpu_class (workstation → dev-heavy, integrated_old → shop+news-heavy).
_BROWSING_POOL: list = _load("browsing_pool.json")["entries"]
_CPT_BROWSING = _load("cpt_browsing_given_class.json")["table"]
# ═══════════════════════════════════════════════════════════════════════
@ -282,6 +288,33 @@ def derive_font_whitelist(gpu_class: str, rng) -> str:
return derive_font_prefs(gpu_class, rng)["whitelist"]
# ═══════════════════════════════════════════════════════════════════════
# BROWSING HISTORY (Bayesian: per-site P(visited|gpu_class))
# ═══════════════════════════════════════════════════════════════════════
def derive_browsing_history(gpu_class: str, rng) -> list:
    """Pick the subset of pool sites this persona has recently visited.

    Every pool entry is an independent trial whose success probability is
    read from the CPT row for ``gpu_class``. An unknown class uses the
    ``mid_range`` row, and a site missing from the row uses 0.3. Exactly
    one ``rng.random()`` draw is consumed per pool entry, in pool order,
    so the outcome is reproducible for a given RNG state.

    The CPT is meant to yield roughly 15-30 sites per persona (an
    established-user signature).

    Returns:
        Copies of the selected pool entries, ordered by ``name``, e.g.
        ``{"name": "github.com", "category": "dev", "cookie_profile": "ga_cf"}``.
    """
    probabilities = _CPT_BROWSING.get(gpu_class)
    if probabilities is None:
        probabilities = _CPT_BROWSING["mid_range"]
    # dict(site) copies each entry so callers cannot mutate the shared pool.
    picked = [
        dict(site)
        for site in _BROWSING_POOL
        if probabilities.get(site["name"], 0.3) > rng.random()
    ]
    # Name ordering keeps the output stable across runs of the same seed.
    return sorted(picked, key=lambda site: site["name"])
# ═══════════════════════════════════════════════════════════════════════
# PUBLIC API: Forge
# ═══════════════════════════════════════════════════════════════════════
@ -350,6 +383,12 @@ class Forge:
bundle["gpu_class"], self._rng
).items()
},
# Bayesian browsing history (per-class P(visited|gpu_class)).
# Consumed by _recaptcha_seed.py to seed coherent cookie history
# when invisible_playwright is launched with prep_recaptcha=True.
"browsing_history": derive_browsing_history(
bundle["gpu_class"], self._rng
),
}

View file

@ -0,0 +1,64 @@
{
"_comment": [
"Pool of everyday websites used by the browsing_history node.",
"Each entry: { name, category, cookie_profile }.",
"- name: bare domain (no scheme, no leading dot).",
"- category: dev / shop / news / reference / media / community / misc.",
"- cookie_profile: short tag pointing to a cookie-template recipe used by",
" _recaptcha_seed.py to generate concrete cookies (so heavy-analytics sites",
" get _ga+_gid+OneTrust, simple sites get just _ga, dev tools get GH-style).",
"Add new entries here + add per-class probabilities in cpt_browsing_given_class.json."
],
"entries": [
{"name": "youtube.com", "category": "media", "cookie_profile": "ga_only"},
{"name": "wikipedia.org", "category": "reference", "cookie_profile": "minimal"},
{"name": "mozilla.org", "category": "reference", "cookie_profile": "ga_consent"},
{"name": "w3schools.com", "category": "dev", "cookie_profile": "ga_consent_clarity"},
{"name": "mdn.io", "category": "dev", "cookie_profile": "minimal"},
{"name": "duckduckgo.com", "category": "reference", "cookie_profile": "minimal"},
{"name": "github.com", "category": "dev", "cookie_profile": "ga_cf"},
{"name": "stackoverflow.com", "category": "dev", "cookie_profile": "ga_consent_clarity"},
{"name": "npmjs.com", "category": "dev", "cookie_profile": "ga_consent"},
{"name": "gitlab.com", "category": "dev", "cookie_profile": "ga_cf"},
{"name": "pypi.org", "category": "dev", "cookie_profile": "minimal"},
{"name": "docs.python.org", "category": "dev", "cookie_profile": "minimal"},
{"name": "rust-lang.org", "category": "dev", "cookie_profile": "ga_consent"},
{"name": "go.dev", "category": "dev", "cookie_profile": "ga_consent"},
{"name": "amazon.com", "category": "shop", "cookie_profile": "ga_consent_clarity"},
{"name": "ebay.com", "category": "shop", "cookie_profile": "ga_consent"},
{"name": "etsy.com", "category": "shop", "cookie_profile": "ga_consent_clarity"},
{"name": "bestbuy.com", "category": "shop", "cookie_profile": "ga_consent_clarity"},
{"name": "target.com", "category": "shop", "cookie_profile": "ga_consent_clarity"},
{"name": "nytimes.com", "category": "news", "cookie_profile": "ga_consent_clarity"},
{"name": "cnn.com", "category": "news", "cookie_profile": "ga_consent"},
{"name": "bbc.com", "category": "news", "cookie_profile": "ga_consent"},
{"name": "theguardian.com", "category": "news", "cookie_profile": "ga_consent_clarity"},
{"name": "reuters.com", "category": "news", "cookie_profile": "ga_consent"},
{"name": "apnews.com", "category": "news", "cookie_profile": "ga_consent"},
{"name": "washingtonpost.com", "category": "news", "cookie_profile": "ga_consent"},
{"name": "techcrunch.com", "category": "news", "cookie_profile": "ga_consent_clarity"},
{"name": "theverge.com", "category": "news", "cookie_profile": "ga_consent"},
{"name": "arstechnica.com", "category": "news", "cookie_profile": "ga_consent"},
{"name": "wired.com", "category": "news", "cookie_profile": "ga_consent_clarity"},
{"name": "engadget.com", "category": "news", "cookie_profile": "ga_consent"},
{"name": "9to5mac.com", "category": "news", "cookie_profile": "ga_consent"},
{"name": "medium.com", "category": "community", "cookie_profile": "ga_consent"},
{"name": "dev.to", "category": "community", "cookie_profile": "ga_consent"},
{"name": "reddit.com", "category": "community", "cookie_profile": "ga_cf"},
{"name": "news.ycombinator.com", "category": "community", "cookie_profile": "minimal"},
{"name": "quora.com", "category": "community", "cookie_profile": "ga_consent_clarity"},
{"name": "stackexchange.com", "category": "community", "cookie_profile": "ga_consent_clarity"},
{"name": "imdb.com", "category": "media", "cookie_profile": "ga_consent_clarity"},
{"name": "rottentomatoes.com", "category": "media", "cookie_profile": "ga_consent"},
{"name": "metacritic.com", "category": "media", "cookie_profile": "ga_consent"},
{"name": "allrecipes.com", "category": "misc", "cookie_profile": "ga_consent_clarity"},
{"name": "epicurious.com", "category": "misc", "cookie_profile": "ga_consent"},
{"name": "tripadvisor.com", "category": "misc", "cookie_profile": "ga_consent_clarity"},
{"name": "weather.com", "category": "reference", "cookie_profile": "ga_consent"},
{"name": "timeanddate.com", "category": "reference", "cookie_profile": "ga_consent"},
{"name": "thesaurus.com", "category": "reference", "cookie_profile": "ga_consent_clarity"},
{"name": "kayak.com", "category": "shop", "cookie_profile": "ga_consent_clarity"},
{"name": "booking.com", "category": "shop", "cookie_profile": "ga_consent_clarity"},
{"name": "airbnb.com", "category": "shop", "cookie_profile": "ga_consent"}
]
}

View file

@ -0,0 +1,138 @@
{
"_comment": [
"Per-class probability that a persona of a given gpu_class has visited each",
"site in the pool. Used by the browsing_history node to derive a coherent",
"visited-domain list per persona.",
"",
"Probabilities are tuned so each class samples ~15-30 sites on average",
"(sum across all 50 entries falls in that range), giving an established-user",
"look. Categories are biased by class:",
" - workstation/high_end: higher P(dev) + high P(news/media)",
" - mid_range: balanced",
" - low_end/integrated_*: lower P(dev), higher P(shop/news/reference)",
"",
"Missing class falls back to mid_range via Node CPT pool fallback."
],
"table": {
"workstation": {
"youtube.com": 0.80, "wikipedia.org": 0.85, "mozilla.org": 0.70,
"w3schools.com": 0.40, "mdn.io": 0.55, "duckduckgo.com": 0.45,
"github.com": 0.95, "stackoverflow.com": 0.90, "npmjs.com": 0.65,
"gitlab.com": 0.50, "pypi.org": 0.55, "docs.python.org": 0.60,
"rust-lang.org": 0.35, "go.dev": 0.30,
"amazon.com": 0.70, "ebay.com": 0.25, "etsy.com": 0.15,
"bestbuy.com": 0.45, "target.com": 0.30,
"nytimes.com": 0.55, "cnn.com": 0.40, "bbc.com": 0.55,
"theguardian.com": 0.45, "reuters.com": 0.40, "apnews.com": 0.30,
"washingtonpost.com": 0.40,
"techcrunch.com": 0.65, "theverge.com": 0.60, "arstechnica.com": 0.65,
"wired.com": 0.50, "engadget.com": 0.35, "9to5mac.com": 0.30,
"medium.com": 0.55, "dev.to": 0.40, "reddit.com": 0.70,
"news.ycombinator.com": 0.65, "quora.com": 0.20, "stackexchange.com": 0.60,
"imdb.com": 0.45, "rottentomatoes.com": 0.25, "metacritic.com": 0.20,
"allrecipes.com": 0.20, "epicurious.com": 0.15, "tripadvisor.com": 0.30,
"weather.com": 0.55, "timeanddate.com": 0.30, "thesaurus.com": 0.25,
"kayak.com": 0.30, "booking.com": 0.35, "airbnb.com": 0.30
},
"high_end": {
"youtube.com": 0.85, "wikipedia.org": 0.80, "mozilla.org": 0.60,
"w3schools.com": 0.45, "mdn.io": 0.45, "duckduckgo.com": 0.40,
"github.com": 0.85, "stackoverflow.com": 0.80, "npmjs.com": 0.50,
"gitlab.com": 0.40, "pypi.org": 0.45, "docs.python.org": 0.50,
"rust-lang.org": 0.30, "go.dev": 0.25,
"amazon.com": 0.75, "ebay.com": 0.30, "etsy.com": 0.20,
"bestbuy.com": 0.50, "target.com": 0.35,
"nytimes.com": 0.50, "cnn.com": 0.50, "bbc.com": 0.50,
"theguardian.com": 0.40, "reuters.com": 0.35, "apnews.com": 0.30,
"washingtonpost.com": 0.35,
"techcrunch.com": 0.60, "theverge.com": 0.65, "arstechnica.com": 0.60,
"wired.com": 0.50, "engadget.com": 0.40, "9to5mac.com": 0.35,
"medium.com": 0.50, "dev.to": 0.35, "reddit.com": 0.75,
"news.ycombinator.com": 0.55, "quora.com": 0.25, "stackexchange.com": 0.55,
"imdb.com": 0.55, "rottentomatoes.com": 0.35, "metacritic.com": 0.30,
"allrecipes.com": 0.25, "epicurious.com": 0.20, "tripadvisor.com": 0.30,
"weather.com": 0.55, "timeanddate.com": 0.30, "thesaurus.com": 0.25,
"kayak.com": 0.30, "booking.com": 0.40, "airbnb.com": 0.30
},
"mid_range": {
"youtube.com": 0.85, "wikipedia.org": 0.75, "mozilla.org": 0.45,
"w3schools.com": 0.40, "mdn.io": 0.30, "duckduckgo.com": 0.35,
"github.com": 0.55, "stackoverflow.com": 0.55, "npmjs.com": 0.30,
"gitlab.com": 0.25, "pypi.org": 0.25, "docs.python.org": 0.30,
"rust-lang.org": 0.15, "go.dev": 0.15,
"amazon.com": 0.80, "ebay.com": 0.40, "etsy.com": 0.30,
"bestbuy.com": 0.55, "target.com": 0.40,
"nytimes.com": 0.45, "cnn.com": 0.55, "bbc.com": 0.45,
"theguardian.com": 0.35, "reuters.com": 0.30, "apnews.com": 0.30,
"washingtonpost.com": 0.30,
"techcrunch.com": 0.45, "theverge.com": 0.50, "arstechnica.com": 0.40,
"wired.com": 0.45, "engadget.com": 0.35, "9to5mac.com": 0.30,
"medium.com": 0.45, "dev.to": 0.25, "reddit.com": 0.70,
"news.ycombinator.com": 0.30, "quora.com": 0.35, "stackexchange.com": 0.40,
"imdb.com": 0.60, "rottentomatoes.com": 0.40, "metacritic.com": 0.35,
"allrecipes.com": 0.35, "epicurious.com": 0.25, "tripadvisor.com": 0.40,
"weather.com": 0.60, "timeanddate.com": 0.25, "thesaurus.com": 0.30,
"kayak.com": 0.35, "booking.com": 0.45, "airbnb.com": 0.40
},
"low_end": {
"youtube.com": 0.85, "wikipedia.org": 0.70, "mozilla.org": 0.35,
"w3schools.com": 0.30, "mdn.io": 0.20, "duckduckgo.com": 0.30,
"github.com": 0.30, "stackoverflow.com": 0.30, "npmjs.com": 0.15,
"gitlab.com": 0.10, "pypi.org": 0.10, "docs.python.org": 0.15,
"rust-lang.org": 0.05, "go.dev": 0.05,
"amazon.com": 0.85, "ebay.com": 0.50, "etsy.com": 0.40,
"bestbuy.com": 0.55, "target.com": 0.45,
"nytimes.com": 0.40, "cnn.com": 0.60, "bbc.com": 0.40,
"theguardian.com": 0.30, "reuters.com": 0.25, "apnews.com": 0.30,
"washingtonpost.com": 0.25,
"techcrunch.com": 0.30, "theverge.com": 0.35, "arstechnica.com": 0.25,
"wired.com": 0.40, "engadget.com": 0.30, "9to5mac.com": 0.25,
"medium.com": 0.35, "dev.to": 0.15, "reddit.com": 0.65,
"news.ycombinator.com": 0.15, "quora.com": 0.45, "stackexchange.com": 0.25,
"imdb.com": 0.65, "rottentomatoes.com": 0.45, "metacritic.com": 0.35,
"allrecipes.com": 0.45, "epicurious.com": 0.30, "tripadvisor.com": 0.45,
"weather.com": 0.65, "timeanddate.com": 0.25, "thesaurus.com": 0.35,
"kayak.com": 0.35, "booking.com": 0.50, "airbnb.com": 0.40
},
"integrated_modern": {
"youtube.com": 0.85, "wikipedia.org": 0.70, "mozilla.org": 0.40,
"w3schools.com": 0.35, "mdn.io": 0.25, "duckduckgo.com": 0.35,
"github.com": 0.40, "stackoverflow.com": 0.40, "npmjs.com": 0.20,
"gitlab.com": 0.15, "pypi.org": 0.20, "docs.python.org": 0.20,
"rust-lang.org": 0.10, "go.dev": 0.10,
"amazon.com": 0.80, "ebay.com": 0.40, "etsy.com": 0.30,
"bestbuy.com": 0.50, "target.com": 0.40,
"nytimes.com": 0.40, "cnn.com": 0.55, "bbc.com": 0.45,
"theguardian.com": 0.35, "reuters.com": 0.30, "apnews.com": 0.30,
"washingtonpost.com": 0.30,
"techcrunch.com": 0.40, "theverge.com": 0.45, "arstechnica.com": 0.30,
"wired.com": 0.40, "engadget.com": 0.30, "9to5mac.com": 0.25,
"medium.com": 0.40, "dev.to": 0.20, "reddit.com": 0.65,
"news.ycombinator.com": 0.25, "quora.com": 0.40, "stackexchange.com": 0.35,
"imdb.com": 0.60, "rottentomatoes.com": 0.40, "metacritic.com": 0.30,
"allrecipes.com": 0.40, "epicurious.com": 0.25, "tripadvisor.com": 0.40,
"weather.com": 0.60, "timeanddate.com": 0.25, "thesaurus.com": 0.30,
"kayak.com": 0.35, "booking.com": 0.45, "airbnb.com": 0.40
},
"integrated_old": {
"youtube.com": 0.75, "wikipedia.org": 0.65, "mozilla.org": 0.30,
"w3schools.com": 0.20, "mdn.io": 0.10, "duckduckgo.com": 0.25,
"github.com": 0.15, "stackoverflow.com": 0.20, "npmjs.com": 0.05,
"gitlab.com": 0.05, "pypi.org": 0.05, "docs.python.org": 0.10,
"rust-lang.org": 0.02, "go.dev": 0.02,
"amazon.com": 0.85, "ebay.com": 0.55, "etsy.com": 0.45,
"bestbuy.com": 0.55, "target.com": 0.50,
"nytimes.com": 0.45, "cnn.com": 0.65, "bbc.com": 0.40,
"theguardian.com": 0.30, "reuters.com": 0.25, "apnews.com": 0.35,
"washingtonpost.com": 0.30,
"techcrunch.com": 0.20, "theverge.com": 0.25, "arstechnica.com": 0.15,
"wired.com": 0.30, "engadget.com": 0.20, "9to5mac.com": 0.20,
"medium.com": 0.30, "dev.to": 0.05, "reddit.com": 0.55,
"news.ycombinator.com": 0.05, "quora.com": 0.55, "stackexchange.com": 0.15,
"imdb.com": 0.70, "rottentomatoes.com": 0.50, "metacritic.com": 0.35,
"allrecipes.com": 0.55, "epicurious.com": 0.35, "tripadvisor.com": 0.50,
"weather.com": 0.70, "timeanddate.com": 0.30, "thesaurus.com": 0.40,
"kayak.com": 0.40, "booking.com": 0.55, "airbnb.com": 0.40
}
}
}

View file

@ -120,6 +120,11 @@ class Profile:
webgl: WebGLProfile
fonts: List[str]
dark_theme: bool
# Bayesian browsing-history: list of {name, category, cookie_profile}
# dicts sampled from data/browsing_pool.json with per-class CPT. Used
# by _recaptcha_seed.py to build a coherent cookie pre-seed when the
# caller opts in via Stealthfox(prep_recaptcha=True).
browsing_history: List[Dict[str, str]] = field(default_factory=list)
_raw: Dict[str, Any] = field(default_factory=dict, repr=False, compare=False)
def to_prefs_dict(self) -> Dict[str, Any]:
@ -255,5 +260,6 @@ def generate_profile(seed: int, pin: Optional[Dict[str, Any]] = None) -> Profile
webgl=WebGLProfile(msaa_samples=int(raw["msaa_samples"])),
fonts=fonts,
dark_theme=bool(raw["dark_theme"]),
browsing_history=list(raw.get("browsing_history") or []),
_raw=raw,
)

View file

@ -0,0 +1,340 @@
"""Deterministic reCAPTCHA cookie pre-seed.
Consumes the Bayesian-sampled `browsing_history` from the persona Profile
(see `_fpforge/_sampler.py:derive_browsing_history`). For each visited
site, builds 1-7 realistic cookies whose composition is chosen by the
site's `cookie_profile` tag (analytics-only / consent / cloudflare-bot-
management / etc.). All values seeded deterministically from the persona
seed, so a given persona always presents the SAME cookies across sessions.
In addition, always seeds 5 cookies on .google.com (NID, CONSENT, SOCS,
_GRECAPTCHA, ENID). Excludes 1P_JAR, which was deprecated by Google in 2022 —
including it now is an anachronism flag.
Public API:
await seed_recaptcha_cookies_async(context, profile, timezone=None)
seed_recaptcha_cookies_sync(context, profile, timezone=None)
`profile` is an `_fpforge.Profile`; `timezone` is the IANA tz (e.g.
"Europe/Rome") used to derive the CONSENT cookie's language token, so a
European-tz persona gets CONSENT in their own language rather than en+FX.
"""
from __future__ import annotations
import datetime
import random
import time
from typing import Any, List, Optional
# URL-safe base64 alphabet (no padding chars): A-Z, a-z, 0-9, "-" and "_".
# Sampled character-by-character by _b64_rand to fabricate opaque token values.
_B64_ALPHABET = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"
# Lowercase hexadecimal digits, sampled by _hex_rand.
_HEX_ALPHABET = "0123456789abcdef"
def _sub_seed(seed: int, tag: str) -> int:
"""FNV-1a mix → independent PRNG streams per logical bucket from one seed."""
h = 0xcbf29ce484222325 ^ (seed & 0xFFFFFFFF)
for c in tag.encode("ascii"):
h ^= c
h = (h * 0x100000001b3) & 0xFFFFFFFFFFFFFFFF
return h or 0xdeadbeef
def _b64_rand(rng: random.Random, length: int) -> str:
    """Return ``length`` characters drawn from the URL-safe base64 alphabet.

    One ``rng.choice`` call is made per character, so the result is fully
    determined by the state of ``rng``.
    """
    picks = []
    for _ in range(length):
        picks.append(rng.choice(_B64_ALPHABET))
    return "".join(picks)
def _hex_rand(rng: random.Random, length: int) -> str:
    """Return ``length`` lowercase hexadecimal digits drawn from ``rng``.

    One ``rng.choice`` call is made per digit, so the result is fully
    determined by the state of ``rng``.
    """
    digits = []
    for _ in range(length):
        digits.append(rng.choice(_HEX_ALPHABET))
    return "".join(digits)
def _yyyymmdd_utc(ts: int) -> str:
return datetime.datetime.utcfromtimestamp(ts).strftime("%Y%m%d")
# IANA timezone -> (country_code, lang) for CONSENT cookie coherence.
# Real EU users get CONSENT with `<lang>+<COUNTRY>+NNN`; non-EU gets `en+FX+NNN`.
# Default fallback `en+FX+NNN` for any tz not in this map.
_TZ_TO_REGION = {
"Europe/Rome": ("IT", "it"),
"Europe/Berlin": ("DE", "de"),
"Europe/Paris": ("FR", "fr"),
"Europe/Madrid": ("ES", "es"),
"Europe/London": ("GB", "en"),
"Europe/Amsterdam": ("NL", "nl"),
"Europe/Brussels": ("BE", "fr"),
"Europe/Vienna": ("AT", "de"),
"Europe/Zurich": ("CH", "de"),
"Europe/Dublin": ("IE", "en"),
"Europe/Lisbon": ("PT", "pt"),
"Europe/Stockholm": ("SE", "sv"),
"Europe/Oslo": ("NO", "no"),
"Europe/Copenhagen": ("DK", "da"),
"Europe/Helsinki": ("FI", "fi"),
"Europe/Warsaw": ("PL", "pl"),
"Europe/Prague": ("CZ", "cs"),
"Europe/Athens": ("GR", "el"),
"Asia/Tokyo": ("FX", "ja"),
"Asia/Shanghai": ("FX", "zh"),
"Asia/Hong_Kong": ("FX", "zh"),
"Asia/Seoul": ("FX", "ko"),
}
def _consent_region_lang(timezone: Optional[str]) -> tuple:
"""Map IANA tz → (region_token, lang_2char) for CONSENT cookie.
Default `("FX", "en")` for US/unknown."""
if timezone and timezone in _TZ_TO_REGION:
return _TZ_TO_REGION[timezone]
return ("FX", "en")
# ---------------------------------------------------------------------------
# .google.com cookie batch (always present, regardless of browsing history)
# ---------------------------------------------------------------------------
def _google_cookies(rng: random.Random, now: int,
                    timezone: Optional[str] = None) -> List[dict]:
    """Build the five ``.google.com`` cookies every persona carries.

    Produces NID, CONSENT, SOCS, _GRECAPTCHA and ENID, in that order. All
    random material comes from ``rng``; the draws happen in a fixed order
    (consent age, then each cookie's fields in list order), so the values
    are reproducible for a given RNG state and ``now``.

    Args:
        rng: PRNG dedicated to the google cookie batch.
        now: unix-seconds timestamp that expiries are computed from.
        timezone: IANA tz used to pick the CONSENT ``lang+region`` token.
    """
    day = 86400
    consent_age = rng.randint(60, 720) * day
    region, lang = _consent_region_lang(timezone)

    def make(name, value, ttl_days, same_site, http_only=False):
        # Key insertion order mirrors a hand-written cookie literal:
        # httpOnly (when present) sits between expires and secure.
        jar = {"name": name, "value": value,
               "domain": ".google.com", "path": "/",
               "expires": now + ttl_days * day}
        if http_only:
            jar["httpOnly"] = True
        jar["secure"] = True
        jar["sameSite"] = same_site
        return jar

    # NID 3-digit prefix range broadened to 100-540 to cover historical NID
    # versions (137, 105, 511, 525 etc. observed in real captures).
    nid_value = f"{rng.randint(100, 540)}={_b64_rand(rng, 178)}"
    consent_value = (
        f"YES+cb.{_yyyymmdd_utc(now - consent_age)}-"
        f"{rng.randint(10, 19):02d}-p{rng.randint(0, 9)}."
        f"{lang}+{region}+{rng.randint(100, 999)}"
    )
    # 1P_JAR is intentionally absent: Google deprecated it in 2022, so
    # including it would be an anachronism flag for fingerprinters.
    socs_value = f"CAES{_b64_rand(rng, 56)}"
    grecaptcha_value = _b64_rand(rng, 124)
    enid_value = _b64_rand(rng, 252)

    return [
        make("NID", nid_value, 180, "None", http_only=True),
        make("CONSENT", consent_value, 395, "Lax"),
        make("SOCS", socs_value, 395, "Lax"),
        make("_GRECAPTCHA", grecaptcha_value, 180, "None"),
        make("ENID", enid_value, 395, "Lax", http_only=True),
    ]
# ---------------------------------------------------------------------------
# Per-site cookie generators (recipes keyed by site["cookie_profile"])
# ---------------------------------------------------------------------------
def _norm_domain(domain: str) -> str:
return domain if domain.startswith(".") else "." + domain
def _ga_cookie(rng: random.Random, now: int, domain: str) -> dict:
first_age = rng.randint(7, 395) * 86400
return {"name": "_ga",
"value": f"GA1.2.{rng.randint(100000000, 999999999)}.{now - first_age}",
"domain": domain, "path": "/",
"expires": now + 395 * 86400,
"secure": True, "sameSite": "Lax"}
def _gid_cookie(rng: random.Random, now: int, domain: str) -> dict:
return {"name": "_gid",
"value": f"GA1.2.{rng.randint(100000000, 999999999)}.{now - rng.randint(60, 86400)}",
"domain": domain, "path": "/",
"expires": now + 86400,
"secure": True, "sameSite": "Lax"}
def _cf_bm_cookie(rng: random.Random, now: int, domain: str) -> dict:
    """Build a Cloudflare bot-management ``__cf_bm`` cookie for ``domain``.

    The value is a 43-char base64 token, a dot, a unix timestamp drawn from
    ``[1700000000, now]``, and the literal suffix ``-1-1-1-1``. Draw order:
    token first, then timestamp. Expires 30 minutes after ``now``.

    Raises:
        ValueError: if ``now`` is below 1700000000 (empty randint range).
    """
    token = _b64_rand(rng, 43)
    issued_at = rng.randint(1700000000, now)
    cookie = {
        "name": "__cf_bm",
        "value": f"{token}.{issued_at}-1-1-1-1",
        "domain": domain,
        "path": "/",
        "expires": now + 1800,
        "secure": True,
        "sameSite": "None",
    }
    return cookie
def _onetrust_cookie(rng: random.Random, now: int, domain: str) -> dict:
age_d = rng.randint(7, 365)
iso = datetime.datetime.utcfromtimestamp(now - age_d * 86400).strftime(
"%Y-%m-%dT%H:%M:%S.000Z"
)
return {"name": "OptanonAlertBoxClosed",
"value": iso,
"domain": domain, "path": "/",
"expires": now + 395 * 86400,
"secure": True, "sameSite": "Lax"}
def _cookieyes_cookie(rng: random.Random, now: int, domain: str) -> dict:
    """Build a CookieYes ``cookieyes-consent`` cookie with all categories accepted.

    The value is ``consentid:<28-char base64 token>`` followed by a fixed
    list of accepted categories. Expires 395 days after ``now``.
    """
    consent_id = _b64_rand(rng, 28)
    accepted = ",consent:yes,action:yes,necessary:yes,functional:yes,analytics:yes"
    cookie = {
        "name": "cookieyes-consent",
        "value": "consentid:" + consent_id + accepted,
        "domain": domain,
        "path": "/",
        "expires": now + 395 * 86400,
        "secure": True,
        "sameSite": "Lax",
    }
    return cookie
def _clarity_cookie(rng: random.Random, now: int, domain: str) -> dict:
    """Build a Microsoft Clarity ``_clck`` cookie for ``domain``.

    The value is ``<8 hex>|2|f<2 digits>|0|<unix ts>``, with the timestamp
    placed 60-180 days before ``now``. Draw order: hex id, two-digit field,
    then age. Expires 365 days after ``now``.
    """
    user_id = _hex_rand(rng, 8)
    bucket = rng.randint(10, 99)
    stamped_at = now - rng.randint(60, 180) * 86400
    cookie = {
        "name": "_clck",
        "value": f"{user_id}|2|f{bucket}|0|{stamped_at}",
        "domain": domain,
        "path": "/",
        "expires": now + 365 * 86400,
        "secure": True,
        "sameSite": "Lax",
    }
    return cookie
def _fbp_cookie(rng: random.Random, now: int, domain: str) -> dict:
"""Facebook Pixel _fbp = fb.<subdomain_index>.<unix_ms>.<random_int>"""
return {"name": "_fbp",
"value": f"fb.1.{(now - rng.randint(60, 30*86400)) * 1000}."
f"{rng.randint(100000000, 9999999999)}",
"domain": domain, "path": "/",
"expires": now + 90 * 86400,
"secure": True, "sameSite": "Lax"}
def _gtm_cookie(rng: random.Random, now: int, domain: str) -> dict:
"""_dc_gtm_<container_id>=1 — Google Tag Manager throttle flag."""
container = f"UA-{rng.randint(10000000, 99999999)}-{rng.randint(1, 9)}"
return {"name": f"_dc_gtm_{container}",
"value": "1",
"domain": domain, "path": "/",
"expires": now + 60,
"secure": True, "sameSite": "Lax"}
def _hssrc_cookie(rng: random.Random, now: int, domain: str) -> dict:
"""HubSpot referrer flag — small int."""
return {"name": "__hssrc",
"value": str(rng.randint(1, 5)),
"domain": domain, "path": "/",
"expires": now + 1800,
"secure": True, "sameSite": "Lax"}
def _cookies_for_profile(profile: str, rng: random.Random,
                         now: int, domain: str) -> List[dict]:
    """Expand a ``cookie_profile`` tag into concrete cookies for one site.

    Recipes:
        minimal             _ga
        ga_only             _ga, _gid, 30% GTM flag
        ga_cf               _ga, __cf_bm
        ga_consent          _ga, _gid, consent banner, 40% GTM flag
        ga_consent_clarity  _ga, _gid, _clck, consent banner,
                            50% _fbp, 40% GTM flag, 25% __hssrc
    An unrecognised tag is treated like ``minimal``. The consent banner is
    a 50/50 pick between OneTrust and CookieYes. Every choice is drawn from
    ``rng``, so the result is reproducible for a given RNG state.

    Args:
        profile: recipe tag from browsing_pool.json.
        rng: PRNG dedicated to this site.
        now: unix-seconds timestamp that ages and expiries are relative to.
        domain: site domain; a leading dot is added when missing.
    """
    domain = _norm_domain(domain)

    def consent_banner() -> dict:
        # Coin flip first; only the chosen generator then draws from rng.
        if rng.random() < 0.5:
            return _onetrust_cookie(rng, now, domain)
        return _cookieyes_cookie(rng, now, domain)

    def add_with_chance(jar: list, chance: float, factory) -> None:
        # One rng.random() draw decides whether the optional cookie is added.
        if rng.random() < chance:
            jar.append(factory(rng, now, domain))

    if profile == "ga_only":
        jar = [_ga_cookie(rng, now, domain), _gid_cookie(rng, now, domain)]
        add_with_chance(jar, 0.3, _gtm_cookie)
        return jar

    if profile == "ga_cf":
        return [_ga_cookie(rng, now, domain), _cf_bm_cookie(rng, now, domain)]

    if profile == "ga_consent":
        jar = [_ga_cookie(rng, now, domain), _gid_cookie(rng, now, domain)]
        jar.append(consent_banner())
        add_with_chance(jar, 0.4, _gtm_cookie)
        return jar

    if profile == "ga_consent_clarity":
        # Heavy-tracking site profile: GA + Clarity + consent + often FB pixel.
        jar = [_ga_cookie(rng, now, domain), _gid_cookie(rng, now, domain),
               _clarity_cookie(rng, now, domain)]
        jar.append(consent_banner())
        add_with_chance(jar, 0.5, _fbp_cookie)
        add_with_chance(jar, 0.4, _gtm_cookie)
        add_with_chance(jar, 0.25, _hssrc_cookie)
        return jar

    # "minimal" and any unknown tag share the same safe single-cookie recipe.
    return [_ga_cookie(rng, now, domain)]
# ---------------------------------------------------------------------------
# Public builder
# ---------------------------------------------------------------------------
def build_cookies(seed: int,
                  browsing_history: Optional[List[dict]] = None,
                  now: Optional[int] = None,
                  timezone: Optional[str] = None) -> List[dict]:
    """Build the full cookie list for a persona.

    Args:
        seed: persona integer seed (from `Profile.seed`).
        browsing_history: list of {name, category, cookie_profile} dicts as
            sampled by `_fpforge.derive_browsing_history`. None is treated
            as an empty list (only the 5 google cookies are returned).
        now: unix-seconds timestamp; defaults to current time. Pin for tests.
        timezone: IANA tz used to derive CONSENT cookie's `lang+region` token
            (e.g. "Europe/Rome" -> "it+IT", "America/New_York" -> "en+FX").

    Returns:
        The google cookies first, then each site's cookies in history order.
    """
    stamp = int(time.time()) if now is None else now
    base = int(seed)

    # 5 .google.com cookies (always) — CONSENT lang derived from tz.
    google_rng = random.Random(_sub_seed(base, "google"))
    jar: List[dict] = list(_google_cookies(google_rng, stamp, timezone=timezone))

    # Per-site cookies: each domain gets its own PRNG stream (seed x domain),
    # so adding or removing one site never changes another site's cookies.
    for site in browsing_history or []:
        site_rng = random.Random(_sub_seed(base, f"dom:{site['name']}"))
        recipe = site.get("cookie_profile", "minimal")
        jar.extend(_cookies_for_profile(recipe, site_rng, stamp, site["name"]))
    return jar
def _extract_seed_and_history(profile: Any) -> tuple:
"""Accept a Profile object OR a (seed, history) tuple OR just an int seed."""
if isinstance(profile, int):
return int(profile), []
seed = int(getattr(profile, "seed"))
history = list(getattr(profile, "browsing_history", []) or [])
return seed, history
async def seed_recaptcha_cookies_async(context: Any, profile: Any,
                                       timezone: Optional[str] = None) -> None:
    """Async: inject deterministic persona cookies into the context.

    Args:
        context: object exposing an awaitable ``add_cookies(list)`` method.
        profile: Profile-like object or bare int seed
            (see `_extract_seed_and_history`).
        timezone: IANA tz forwarded to `build_cookies` for the CONSENT token.

    Injection is best-effort: a failure inside ``add_cookies`` is logged as
    a warning and never propagated, so context creation is not aborted.
    """
    import logging  # function-scope: this module has no top-level logging import

    seed, history = _extract_seed_and_history(profile)
    cookies = build_cookies(seed, history, timezone=timezone)
    try:
        await context.add_cookies(cookies)
    except Exception:
        # Deliberately swallowed, but no longer silently: without a log
        # line a failed pre-seed looks exactly like a successful one.
        logging.getLogger(__name__).warning(
            "reCAPTCHA cookie pre-seed failed; continuing without seeded cookies",
            exc_info=True,
        )
def seed_recaptcha_cookies_sync(context: Any, profile: Any,
                                timezone: Optional[str] = None) -> None:
    """Sync: inject deterministic persona cookies into the context.

    Args:
        context: object exposing a blocking ``add_cookies(list)`` method.
        profile: Profile-like object or bare int seed
            (see `_extract_seed_and_history`).
        timezone: IANA tz forwarded to `build_cookies` for the CONSENT token.

    Injection is best-effort: a failure inside ``add_cookies`` is logged as
    a warning and never propagated, so context creation is not aborted.
    """
    import logging  # function-scope: this module has no top-level logging import

    seed, history = _extract_seed_and_history(profile)
    cookies = build_cookies(seed, history, timezone=timezone)
    try:
        context.add_cookies(cookies)
    except Exception:
        # Deliberately swallowed, but no longer silently: without a log
        # line a failed pre-seed looks exactly like a successful one.
        logging.getLogger(__name__).warning(
            "reCAPTCHA cookie pre-seed failed; continuing without seeded cookies",
            exc_info=True,
        )
# Public surface of this module: the pure cookie builder plus the async and
# sync injectors. Underscore-prefixed helpers are implementation details.
__all__ = [
"build_cookies",
"seed_recaptcha_cookies_async",
"seed_recaptcha_cookies_sync",
]

View file

@ -51,6 +51,7 @@ class InvisiblePlaywright:
extra_prefs: Optional[Dict[str, Any]] = None,
binary_path: Optional[str] = None,
profile_dir: Optional[Union[str, Path]] = None,
prep_recaptcha: bool = False,
) -> None:
# See sync launcher: `zoom.stealth.fpp.hw_seed` is int32_t — clamp.
self.seed: int = int(seed) if seed is not None else secrets.randbits(31)
@ -64,6 +65,8 @@ class InvisiblePlaywright:
self._extra_prefs = extra_prefs
self._binary_path = binary_path
self._profile_dir: Optional[Path] = Path(profile_dir) if profile_dir else None
# reCAPTCHA pre-seed gated server-side; respect persistent profile.
self._prep_recaptcha = bool(prep_recaptcha) and self._profile_dir is None
self._profile: Profile = generate_profile(self.seed, pin=self._pin)
self._pw: Optional[Playwright] = None
self._browser: Optional[Browser] = None
@ -124,12 +127,18 @@ class InvisiblePlaywright:
def _patch_new_context_defaults(self, browser: Browser) -> None:
original = browser.new_context
defaults = self._default_context_kwargs()
prep = self._prep_recaptcha
profile = self._profile # pass the whole Profile (seed + browsing_history)
tz = self._timezone # used by _recaptcha_seed for CONSENT lang+region
async def patched(**kw):
merged = dict(defaults)
merged.update(kw)
ctx = await original(**merged)
_patch_new_page_sleep(ctx)
if prep:
from ._recaptcha_seed import seed_recaptcha_cookies_async
await seed_recaptcha_cookies_async(ctx, profile, timezone=tz)
return ctx
browser.new_context = patched # type: ignore[assignment]

View file

@ -113,6 +113,7 @@ class InvisiblePlaywright:
extra_prefs: Optional[Dict[str, Any]] = None,
binary_path: Optional[str] = None,
profile_dir: Optional[Union[str, Path]] = None,
prep_recaptcha: bool = False,
) -> None:
"""
Args:
@ -166,6 +167,10 @@ class InvisiblePlaywright:
self._extra_prefs = extra_prefs
self._binary_path = binary_path
self._profile_dir: Optional[Path] = Path(profile_dir) if profile_dir else None
# reCAPTCHA cookie pre-seed — opt-in. Gated server-side: if a
# persistent profile_dir is in use, respect its existing cookies
# and DON'T enable pre-seed (the profile owns its own state).
self._prep_recaptcha = bool(prep_recaptcha) and self._profile_dir is None
self._profile: Profile = generate_profile(self.seed, pin=self._pin)
self._pw: Optional[Playwright] = None
self._browser: Optional[Browser] = None
@ -240,12 +245,18 @@ class InvisiblePlaywright:
"""
original = browser.new_context
defaults = self._default_context_kwargs()
prep = self._prep_recaptcha
profile = self._profile # pass the whole Profile (seed + browsing_history)
tz = self._timezone # used by _recaptcha_seed for CONSENT lang+region
def patched(**kw):
merged = dict(defaults)
merged.update(kw) # user-supplied wins
ctx = original(**merged)
_patch_sync_new_page_sleep(ctx)
if prep:
from ._recaptcha_seed import seed_recaptcha_cookies_sync
seed_recaptcha_cookies_sync(ctx, profile, timezone=tz)
return ctx
browser.new_context = patched # type: ignore[assignment]

View file

@ -306,17 +306,6 @@ def test_navigator_oscpu_matches_userAgent(page):
assert "Mac" in oscpu
@pytest.mark.e2e
def test_userAgent_contains_appVersion_chromium_only(page):
    """Check that ``navigator.appVersion`` is a substring of the user agent.

    The check is gated on `'chrome' in window`; when that evaluates falsy
    the test is skipped as a Chromium-only invariant.
    """
    has_chrome_global = _ev(page, "'chrome' in window")
    if has_chrome_global:
        user_agent = _ev(page, "navigator.userAgent")
        app_version = _ev(page, "navigator.appVersion")
        assert app_version in user_agent
    else:
        pytest.skip("Chromium-only invariant")
# ===========================================================================
# 5. Native function self-toString (creepjs/src/lies/index.ts hasKnownToString)
# ===========================================================================

View file

@ -0,0 +1,349 @@
"""Unit tests for the deterministic reCAPTCHA cookie builder.
Validates the contract:
- 5 .google.com cookies always present (NID, CONSENT, SOCS, _GRECAPTCHA, ENID)
- Per-site cookies built from a `browsing_history` list (sampled by the
Bayesian network in _fpforge)
- Determinism: same (seed, history) → identical content
- Chrome 400-day cookie cap respected
- Playwright add_cookies field requirements satisfied
"""
import pytest
from invisible_playwright._recaptcha_seed import (
build_cookies,
_sub_seed,
)
pytestmark = pytest.mark.unit
# Frozen clock passed as `now=` to build_cookies so expiries are reproducible.
_FIXED_NOW = 1779600000 # 2026-05-24 05:20:00 UTC, frozen for determinism
# Sample browsing history for tests (mimics what _fpforge produces).
# Entries carry name / category / cookie_profile and cover the ga_cf,
# ga_consent_clarity, minimal and ga_only profiles.
_SAMPLE_HISTORY = [
    {"name": "github.com", "category": "dev", "cookie_profile": "ga_cf"},
    {"name": "stackoverflow.com", "category": "dev", "cookie_profile": "ga_consent_clarity"},
    {"name": "amazon.com", "category": "shop", "cookie_profile": "ga_consent_clarity"},
    {"name": "wikipedia.org", "category": "reference", "cookie_profile": "minimal"},
    {"name": "youtube.com", "category": "media", "cookie_profile": "ga_only"},
]
# ===========================================================================
# 1. Set composition
# ===========================================================================
def test_only_google_cookies_when_no_history():
    """With ``browsing_history=None`` only the five .google.com cookies exist.

    1P_JAR is deliberately absent: it was removed in realism round 2
    because Google deprecated it in 2022.
    """
    expected_names = ["NID", "CONSENT", "SOCS", "_GRECAPTCHA", "ENID"]
    jar = build_cookies(seed=42, browsing_history=None, now=_FIXED_NOW)

    actual_names = [cookie["name"] for cookie in jar]
    actual_names.sort()
    assert actual_names == sorted(expected_names)

    off_google = [cookie for cookie in jar if cookie["domain"] != ".google.com"]
    assert not off_google
def test_browsing_history_adds_host_cookies():
    """Every history entry yields at least one cookie on its own domain."""
    jar = build_cookies(seed=42, browsing_history=_SAMPLE_HISTORY, now=_FIXED_NOW)

    google_count = sum(1 for cookie in jar if cookie["domain"] == ".google.com")
    assert google_count == 5  # 1P_JAR removed

    site_domains = {cookie["domain"] for cookie in jar} - {".google.com"}
    for entry in _SAMPLE_HISTORY:
        expected_domain = "." + entry["name"]
        assert expected_domain in site_domains
def test_domain_dot_prefix_normalized():
    """Every cookie domain starts with a dot (sub-domain coverage)."""
    jar = build_cookies(seed=42, browsing_history=_SAMPLE_HISTORY, now=_FIXED_NOW)
    for domain in (cookie["domain"] for cookie in jar):
        assert domain.startswith("."), f"missing dot: {domain}"
# ===========================================================================
# 2. Cookie profile recipes (each profile yields the expected cookie set)
# ===========================================================================
def test_profile_minimal_yields_ga_only():
    """The ``minimal`` profile produces exactly one host cookie: ``_ga``."""
    visited = [{"name": "x.com", "cookie_profile": "minimal"}]
    jar = build_cookies(seed=42, browsing_history=visited, now=_FIXED_NOW)
    host_names = [cookie["name"] for cookie in jar if cookie["domain"] == ".x.com"]
    assert host_names == ["_ga"]
def test_profile_ga_only_yields_ga_and_gid():
    """The ``ga_only`` profile produces ``_ga`` and ``_gid`` on the host."""
    visited = [{"name": "x.com", "cookie_profile": "ga_only"}]
    jar = build_cookies(seed=42, browsing_history=visited, now=_FIXED_NOW)
    host_names = [cookie["name"] for cookie in jar if cookie["domain"] == ".x.com"]
    host_names.sort()
    assert host_names == ["_ga", "_gid"]
def test_profile_ga_cf_yields_ga_and_cf_bm():
    """The ``ga_cf`` profile produces ``_ga`` and ``__cf_bm`` on the host."""
    visited = [{"name": "x.com", "cookie_profile": "ga_cf"}]
    jar = build_cookies(seed=42, browsing_history=visited, now=_FIXED_NOW)
    host_names = [cookie["name"] for cookie in jar if cookie["domain"] == ".x.com"]
    host_names.sort()
    assert host_names == ["__cf_bm", "_ga"]
def test_profile_ga_consent_yields_three_cookies():
    """``ga_consent`` gives ``_ga``, ``_gid`` and a consent-banner cookie, three in total."""
    banner_names = ("OptanonAlertBoxClosed", "cookieyes-consent")
    visited = [{"name": "x.com", "cookie_profile": "ga_consent"}]
    jar = build_cookies(seed=42, browsing_history=visited, now=_FIXED_NOW)

    host_cookies = [cookie for cookie in jar if cookie["domain"] == ".x.com"]
    host_names = sorted(cookie["name"] for cookie in host_cookies)

    # Always _ga + _gid + one of OneTrust|CookieYes
    assert "_ga" in host_names
    assert "_gid" in host_names
    assert any(banner in host_names for banner in banner_names)
    assert len(host_cookies) == 3
def test_profile_ga_consent_clarity_yields_at_least_four_cookies():
    """``ga_consent_clarity`` always has ``_ga``, ``_gid``, ``_clck`` and a consent banner.

    ``_fbp``, ``_dc_gtm_*`` and ``__hssrc`` may also appear depending on the
    rng; see ``test_new_helper_cookies_*`` for those.
    """
    banner_names = ("OptanonAlertBoxClosed", "cookieyes-consent")
    visited = [{"name": "x.com", "cookie_profile": "ga_consent_clarity"}]
    jar = build_cookies(seed=42, browsing_history=visited, now=_FIXED_NOW)

    host_cookies = [cookie for cookie in jar if cookie["domain"] == ".x.com"]
    host_names = sorted(cookie["name"] for cookie in host_cookies)

    assert "_ga" in host_names
    assert "_gid" in host_names
    assert "_clck" in host_names
    assert any(banner in host_names for banner in banner_names)
    assert len(host_cookies) >= 4  # 4 baseline + 0-3 helpers
def test_unknown_profile_falls_back_to_ga():
    """An unrecognised ``cookie_profile`` yields only ``_ga`` on the host."""
    visited = [{"name": "x.com", "cookie_profile": "nonexistent_profile"}]
    jar = build_cookies(seed=42, browsing_history=visited, now=_FIXED_NOW)
    host_names = [cookie["name"] for cookie in jar if cookie["domain"] == ".x.com"]
    assert host_names == ["_ga"]
# ===========================================================================
# 3. Determinism
# ===========================================================================
def test_same_seed_and_history_same_content():
    """Two builds with identical inputs produce equal cookie lists."""
    first, second = (
        build_cookies(seed=42, browsing_history=_SAMPLE_HISTORY, now=_FIXED_NOW)
        for _ in range(2)
    )
    assert first == second
def test_different_seed_different_content():
    """Seeds 42 and 99 produce different NID values for the same history."""
    def nid_value(jar):
        return next(cookie for cookie in jar if cookie["name"] == "NID")["value"]

    jar_42 = build_cookies(seed=42, browsing_history=_SAMPLE_HISTORY, now=_FIXED_NOW)
    jar_99 = build_cookies(seed=99, browsing_history=_SAMPLE_HISTORY, now=_FIXED_NOW)
    assert nid_value(jar_42) != nid_value(jar_99)
def test_history_order_does_not_affect_domain_specific_cookies():
    """Reordering the history leaves each site's cookie values unchanged.

    The builder is expected to key its sub-seed on the domain name rather
    than on the entry's position in the history list.
    """
    def site_values(history):
        jar = build_cookies(seed=42, browsing_history=history, now=_FIXED_NOW)
        return {
            (cookie["domain"], cookie["name"]): cookie["value"]
            for cookie in jar
            if cookie["domain"] != ".google.com"
        }

    first, second = _SAMPLE_HISTORY[0], _SAMPLE_HISTORY[1]
    forward = site_values([first, second])
    backward = site_values([second, first])
    assert forward == backward
def test_sub_seed_distinct_tags_distinct_streams():
    """Different tags give different sub-seeds, and seed 0 does not map to 0."""
    google_seed = _sub_seed(42, "google")
    github_seed = _sub_seed(42, "dom:github.com")
    assert google_seed != github_seed

    amazon_seed = _sub_seed(42, "dom:amazon.com")
    assert _sub_seed(42, "dom:github.com") != amazon_seed

    # seed=0 still produces non-zero sub-seed
    zero_seed_result = _sub_seed(0, "any")
    assert zero_seed_result != 0
# ===========================================================================
# 4. Format / structural correctness for the Google batch
# ===========================================================================
def test_nid_format():
    """NID is ``<3-digit prefix>=<178-char payload>`` with the prefix in 100-540."""
    jar = build_cookies(seed=42, now=_FIXED_NOW)
    nid_cookie = next(cookie for cookie in jar if cookie["name"] == "NID")

    version, payload = nid_cookie["value"].split("=", 1)
    assert version.isdigit() and len(version) == 3
    # Broadened to 100-540 in realism round 2 to cover historical NID versions
    version_number = int(version)
    assert 100 <= version_number <= 540
    assert len(payload) == 178
def test_consent_format():
    """Without a timezone, CONSENT starts with ``YES+cb.`` and contains ``+FX+``."""
    jar = build_cookies(seed=42, now=_FIXED_NOW)
    consent_value = next(cookie for cookie in jar if cookie["name"] == "CONSENT")["value"]
    assert consent_value.startswith("YES+cb.")
    assert "+FX+" in consent_value
# ===========================================================================
# 5. Chrome 400-day cookie cap compliance
# ===========================================================================
def test_all_expiries_within_400_day_cap():
    """Every long-lived cookie expires within the 400-day browser cap.

    Chrome 104+ caps cookie expiry to 400 days; cookies beyond that are
    silently truncated / dropped. The builder is meant to stay at <=395d,
    so this test checks the 400-day hard limit. The short-lived telemetry
    cookies ``__cf_bm`` and ``_gid`` are exempt from the check.
    """
    cookies = build_cookies(seed=42, browsing_history=_SAMPLE_HISTORY, now=_FIXED_NOW)
    max_allowed = _FIXED_NOW + 400 * 86400
    # 1P_JAR used to be exempt too, but the builder no longer emits it; if
    # it ever reappears it must respect the cap like any other cookie.
    short_lived = ("__cf_bm", "_gid")
    for c in cookies:
        if c["name"] in short_lived:
            continue
        assert c["expires"] <= max_allowed, (
            f"Cookie {c['name']} expires {c['expires'] - _FIXED_NOW}s "
            f"(> 400d cap) — would be silently dropped"
        )
# ===========================================================================
# 6. Playwright add_cookies field requirements
# ===========================================================================
def test_all_cookies_have_required_playwright_fields():
    """Each cookie has a name, a non-None value, a domain and path ``/``."""
    jar = build_cookies(seed=42, browsing_history=_SAMPLE_HISTORY, now=_FIXED_NOW)
    for cookie in jar:
        name = cookie.get("name")
        assert name, f"missing name: {cookie}"
        value = cookie.get("value")
        assert value is not None, f"missing value: {cookie}"
        domain = cookie.get("domain")
        assert domain, f"missing domain: {cookie}"
        path = cookie.get("path")
        assert path == "/", f"path != / for {cookie['name']}"
def test_modern_cookies_marked_secure():
    """Any cookie with ``sameSite == "None"`` must also have ``secure`` set to True.

    Firefox/Chrome require Secure for SameSite=None cookies; it is also
    generally needed for cookies injected through Playwright add_cookies
    without a navigation context.
    """
    jar = build_cookies(seed=42, browsing_history=_SAMPLE_HISTORY, now=_FIXED_NOW)
    cross_site = [cookie for cookie in jar if cookie.get("sameSite") == "None"]
    for cookie in cross_site:
        assert cookie.get("secure") is True, f"{cookie['name']} None+!secure invalid"
def test_httponly_on_signed_cookies():
    """NID and ENID are both flagged ``httpOnly``."""
    jar = build_cookies(seed=42, now=_FIXED_NOW)
    nid_cookie = next(cookie for cookie in jar if cookie["name"] == "NID")
    enid_cookie = next(cookie for cookie in jar if cookie["name"] == "ENID")
    for signed in (nid_cookie, enid_cookie):
        assert signed.get("httpOnly") is True
# ===========================================================================
# 7. End-to-end with real fpforge Profile
# ===========================================================================
def test_with_real_fpforge_profile():
    """End-to-end: build cookies from a real fpforge Profile.

    Checks that ``generate_profile`` populates ``browsing_history`` with
    well-formed entries and that ``build_cookies`` accepts that history.
    """
    from invisible_playwright._fpforge import generate_profile

    prof = generate_profile(seed=42)
    assert isinstance(prof.browsing_history, list)
    # Loose sanity bound only: the design targets 13-25 visited sites drawn
    # from a 50-entry pool, so a length outside 5..50 means a broken sampler.
    assert 5 <= len(prof.browsing_history) <= 50, \
        f"unexpected history length: {len(prof.browsing_history)}"
    # Each entry has the expected fields
    for site in prof.browsing_history:
        assert "name" in site and "category" in site and "cookie_profile" in site
    # build_cookies works against the real profile
    cookies = build_cookies(seed=prof.seed, browsing_history=prof.browsing_history,
                            now=_FIXED_NOW)
    # 5 google cookies (1P_JAR is no longer emitted) + at least 1 cookie per
    # visited site. The previous bound of 6 would fail spuriously when every
    # sampled site contributes a single cookie.
    assert len(cookies) >= 5 + len(prof.browsing_history)
def test_same_seed_same_browsing_history_via_fpforge():
    """Two profiles generated from seed 42 carry equal ``browsing_history``."""
    from invisible_playwright._fpforge import generate_profile

    first_history = generate_profile(seed=42).browsing_history
    second_history = generate_profile(seed=42).browsing_history
    assert first_history == second_history
# ===========================================================================
# 8. Realism improvements (2026-05-24 round 2)
# ===========================================================================
def test_no_1p_jar_cookie():
    """No cookie named 1P_JAR is emitted.

    Google deprecated 1P_JAR in 2022, so including it would be an
    anachronism flag for fingerprinters that look at cookie freshness.
    """
    jar = build_cookies(seed=42, browsing_history=_SAMPLE_HISTORY, now=_FIXED_NOW)
    assert all(cookie["name"] != "1P_JAR" for cookie in jar)
def test_nid_prefix_broadened_range():
    """NID prefixes across seeds 0-199 stay inside the broadened 100-540 range.

    Historical prefixes such as 137/105/511/525 were seen in real captures,
    so the range must reach below 500 rather than cover 500-540 only. Both
    ends of the range are asserted.
    """
    seen_prefixes = set()
    for seed in range(200):
        cookies = build_cookies(seed=seed, now=_FIXED_NOW)
        nid = next(c for c in cookies if c["name"] == "NID")
        seen_prefixes.add(int(nid["value"].split("=", 1)[0]))

    lowest, highest = min(seen_prefixes), max(seen_prefixes)
    assert lowest < 500, f"NID range never goes below 500 ({sorted(seen_prefixes)[:5]})"
    # Lower bound of the documented range; previously left unchecked.
    assert lowest >= 100, f"NID prefix below 100: {lowest}"
    assert highest <= 540
def test_consent_lang_from_timezone_eu():
    """``Europe/Rome`` yields the ``it+IT`` lang+region token in CONSENT."""
    jar = build_cookies(seed=42, now=_FIXED_NOW, timezone="Europe/Rome")
    consent_value = next(cookie for cookie in jar if cookie["name"] == "CONSENT")["value"]
    assert ".it+IT+" in consent_value, f"expected it+IT in: {consent_value}"
def test_consent_lang_default_fx():
    """``America/New_York`` falls back to the default ``en+FX`` token."""
    jar = build_cookies(seed=42, now=_FIXED_NOW, timezone="America/New_York")
    consent_value = next(cookie for cookie in jar if cookie["name"] == "CONSENT")["value"]
    assert ".en+FX+" in consent_value
def test_consent_lang_de_for_berlin():
    """``Europe/Berlin`` yields the ``de+DE`` lang+region token in CONSENT."""
    jar = build_cookies(seed=42, now=_FIXED_NOW, timezone="Europe/Berlin")
    consent_value = next(cookie for cookie in jar if cookie["name"] == "CONSENT")["value"]
    assert ".de+DE+" in consent_value
def test_consent_lang_no_timezone_default():
    """Omitting ``timezone`` yields the default ``en+FX`` token."""
    jar = build_cookies(seed=42, now=_FIXED_NOW)
    consent_value = next(cookie for cookie in jar if cookie["name"] == "CONSENT")["value"]
    assert ".en+FX+" in consent_value
def test_new_helper_cookies_appear_in_ga_consent_clarity():
    """Across seeds 0-99 each optional helper cookie shows up at least once.

    The helpers (``_fbp``, ``_dc_gtm_*``, ``__hssrc``) are chosen by the rng
    for the ``ga_consent_clarity`` recipe, so no single seed is required to
    contain them.
    """
    history = [{"name": "site.com", "cookie_profile": "ga_consent_clarity"}]
    saw_fbp = saw_gtm = saw_hssrc = False
    for seed in range(100):
        jar = build_cookies(seed=seed, browsing_history=history, now=_FIXED_NOW)
        site_names = {cookie["name"] for cookie in jar if cookie["domain"] == ".site.com"}
        saw_fbp = saw_fbp or "_fbp" in site_names
        saw_gtm = saw_gtm or any(name.startswith("_dc_gtm_") for name in site_names)
        saw_hssrc = saw_hssrc or "__hssrc" in site_names
    assert saw_fbp, "_fbp never appeared in 100 seeds (rng pick broken)"
    assert saw_gtm, "_dc_gtm_* never appeared in 100 seeds"
    assert saw_hssrc, "__hssrc never appeared in 100 seeds"
def test_fbp_format():
    """The first ``_fbp`` found has the shape ``fb.<idx>.<unix_ms>.<random_int>``.

    ``_fbp`` is optional per seed, so seeds 0-19 are scanned and the first
    hit is validated; finding none at all is treated as a failure.
    """
    history = [{"name": "x.com", "cookie_profile": "ga_consent_clarity"}]
    for seed in range(20):
        jar = build_cookies(seed=seed, browsing_history=history, now=_FIXED_NOW)
        fbp = next((cookie for cookie in jar if cookie["name"] == "_fbp"), None)
        if not fbp:
            continue
        fields = fbp["value"].split(".")
        assert fields[0] == "fb"
        assert fields[1].isdigit()
        assert fields[2].isdigit() and len(fields[2]) >= 13  # unix ms
        assert fields[3].isdigit()
        break
    else:
        raise AssertionError("never got _fbp across 20 seeds — distribution broken")