diff --git a/src/invisible_playwright/_fpforge/_sampler.py b/src/invisible_playwright/_fpforge/_sampler.py index 5653db8..692f600 100644 --- a/src/invisible_playwright/_fpforge/_sampler.py +++ b/src/invisible_playwright/_fpforge/_sampler.py @@ -84,6 +84,12 @@ _FONT_POOL = _load("font_pool.json") _FONT_CORE: list = _FONT_POOL["core"] _FONT_OPTIONAL: list = _FONT_POOL["optional"] _CPT_FONTS_OPT = _load("cpt_fonts_optional_given_class.json")["table"] +# Browsing-history pool + CPT (per-class probabilities for visited sites). +# Drives _recaptcha_seed's cookie pre-seed: each persona ends up with a +# coherent list of ~15-30 visited sites whose categories correlate with +# gpu_class (workstation → dev-heavy, integrated_old → shop+news-heavy). +_BROWSING_POOL: list = _load("browsing_pool.json")["entries"] +_CPT_BROWSING = _load("cpt_browsing_given_class.json")["table"] # ═══════════════════════════════════════════════════════════════════════ @@ -282,6 +288,33 @@ def derive_font_whitelist(gpu_class: str, rng) -> str: return derive_font_prefs(gpu_class, rng)["whitelist"] +# ═══════════════════════════════════════════════════════════════════════ +# BROWSING HISTORY (Bayesian: per-site P(visited|gpu_class)) +# ═══════════════════════════════════════════════════════════════════════ +def derive_browsing_history(gpu_class: str, rng) -> list: + """Sample which sites this persona has visited recently. + + Each site in the pool has a per-class probability (CPT). We sample + independently per-site, producing a list of dicts: + [{"name": "github.com", "category": "dev", "cookie_profile": "ga_cf"}, ...] + + Sum of CPT probabilities per class is tuned to land ~15-30 visited sites + on average — an established-user signature. Sorted by name for stable + output across runs of the same seed. 
+ """ + cpt = _CPT_BROWSING.get(gpu_class) + if cpt is None: + cpt = _CPT_BROWSING["mid_range"] + visited: list = [] + for entry in _BROWSING_POOL: + name = entry["name"] + p = cpt.get(name, 0.3) # default 0.3 for missing CPT row + if rng.random() < p: + visited.append(dict(entry)) # copy to avoid mutating pool + visited.sort(key=lambda e: e["name"]) + return visited + + # ═══════════════════════════════════════════════════════════════════════ # PUBLIC API: Forge # ═══════════════════════════════════════════════════════════════════════ @@ -350,6 +383,12 @@ class Forge: bundle["gpu_class"], self._rng ).items() }, + # Bayesian browsing history (per-class P(visited|gpu_class)). + # Consumed by _recaptcha_seed.py to seed coherent cookie history + # when invisible_playwright is launched with prep_recaptcha=True. + "browsing_history": derive_browsing_history( + bundle["gpu_class"], self._rng + ), } diff --git a/src/invisible_playwright/_fpforge/data/browsing_pool.json b/src/invisible_playwright/_fpforge/data/browsing_pool.json new file mode 100644 index 0000000..6e98cd9 --- /dev/null +++ b/src/invisible_playwright/_fpforge/data/browsing_pool.json @@ -0,0 +1,64 @@ +{ + "_comment": [ + "Pool of everyday websites used by the browsing_history node.", + "Each entry: { name, category, cookie_profile }.", + "- name: bare domain (no scheme, no leading dot).", + "- category: dev / shop / news / reference / media / community / misc.", + "- cookie_profile: short tag pointing to a cookie-template recipe used by", + " _recaptcha_seed.py to generate concrete cookies (so heavy-analytics sites", + " get _ga+_gid+OneTrust, simple sites get just _ga, dev tools get GH-style).", + "Add new entries here + add per-class probabilities in cpt_browsing_given_class.json." 
+ ], + "entries": [ + {"name": "youtube.com", "category": "media", "cookie_profile": "ga_only"}, + {"name": "wikipedia.org", "category": "reference", "cookie_profile": "minimal"}, + {"name": "mozilla.org", "category": "reference", "cookie_profile": "ga_consent"}, + {"name": "w3schools.com", "category": "dev", "cookie_profile": "ga_consent_clarity"}, + {"name": "mdn.io", "category": "dev", "cookie_profile": "minimal"}, + {"name": "duckduckgo.com", "category": "reference", "cookie_profile": "minimal"}, + {"name": "github.com", "category": "dev", "cookie_profile": "ga_cf"}, + {"name": "stackoverflow.com", "category": "dev", "cookie_profile": "ga_consent_clarity"}, + {"name": "npmjs.com", "category": "dev", "cookie_profile": "ga_consent"}, + {"name": "gitlab.com", "category": "dev", "cookie_profile": "ga_cf"}, + {"name": "pypi.org", "category": "dev", "cookie_profile": "minimal"}, + {"name": "docs.python.org", "category": "dev", "cookie_profile": "minimal"}, + {"name": "rust-lang.org", "category": "dev", "cookie_profile": "ga_consent"}, + {"name": "go.dev", "category": "dev", "cookie_profile": "ga_consent"}, + {"name": "amazon.com", "category": "shop", "cookie_profile": "ga_consent_clarity"}, + {"name": "ebay.com", "category": "shop", "cookie_profile": "ga_consent"}, + {"name": "etsy.com", "category": "shop", "cookie_profile": "ga_consent_clarity"}, + {"name": "bestbuy.com", "category": "shop", "cookie_profile": "ga_consent_clarity"}, + {"name": "target.com", "category": "shop", "cookie_profile": "ga_consent_clarity"}, + {"name": "nytimes.com", "category": "news", "cookie_profile": "ga_consent_clarity"}, + {"name": "cnn.com", "category": "news", "cookie_profile": "ga_consent"}, + {"name": "bbc.com", "category": "news", "cookie_profile": "ga_consent"}, + {"name": "theguardian.com", "category": "news", "cookie_profile": "ga_consent_clarity"}, + {"name": "reuters.com", "category": "news", "cookie_profile": "ga_consent"}, + {"name": "apnews.com", "category": "news", 
"cookie_profile": "ga_consent"}, + {"name": "washingtonpost.com", "category": "news", "cookie_profile": "ga_consent"}, + {"name": "techcrunch.com", "category": "news", "cookie_profile": "ga_consent_clarity"}, + {"name": "theverge.com", "category": "news", "cookie_profile": "ga_consent"}, + {"name": "arstechnica.com", "category": "news", "cookie_profile": "ga_consent"}, + {"name": "wired.com", "category": "news", "cookie_profile": "ga_consent_clarity"}, + {"name": "engadget.com", "category": "news", "cookie_profile": "ga_consent"}, + {"name": "9to5mac.com", "category": "news", "cookie_profile": "ga_consent"}, + {"name": "medium.com", "category": "community", "cookie_profile": "ga_consent"}, + {"name": "dev.to", "category": "community", "cookie_profile": "ga_consent"}, + {"name": "reddit.com", "category": "community", "cookie_profile": "ga_cf"}, + {"name": "news.ycombinator.com", "category": "community", "cookie_profile": "minimal"}, + {"name": "quora.com", "category": "community", "cookie_profile": "ga_consent_clarity"}, + {"name": "stackexchange.com", "category": "community", "cookie_profile": "ga_consent_clarity"}, + {"name": "imdb.com", "category": "media", "cookie_profile": "ga_consent_clarity"}, + {"name": "rottentomatoes.com", "category": "media", "cookie_profile": "ga_consent"}, + {"name": "metacritic.com", "category": "media", "cookie_profile": "ga_consent"}, + {"name": "allrecipes.com", "category": "misc", "cookie_profile": "ga_consent_clarity"}, + {"name": "epicurious.com", "category": "misc", "cookie_profile": "ga_consent"}, + {"name": "tripadvisor.com", "category": "misc", "cookie_profile": "ga_consent_clarity"}, + {"name": "weather.com", "category": "reference", "cookie_profile": "ga_consent"}, + {"name": "timeanddate.com", "category": "reference", "cookie_profile": "ga_consent"}, + {"name": "thesaurus.com", "category": "reference", "cookie_profile": "ga_consent_clarity"}, + {"name": "kayak.com", "category": "shop", "cookie_profile": 
"ga_consent_clarity"}, + {"name": "booking.com", "category": "shop", "cookie_profile": "ga_consent_clarity"}, + {"name": "airbnb.com", "category": "shop", "cookie_profile": "ga_consent"} + ] +} diff --git a/src/invisible_playwright/_fpforge/data/cpt_browsing_given_class.json b/src/invisible_playwright/_fpforge/data/cpt_browsing_given_class.json new file mode 100644 index 0000000..b2e3b1a --- /dev/null +++ b/src/invisible_playwright/_fpforge/data/cpt_browsing_given_class.json @@ -0,0 +1,138 @@ +{ + "_comment": [ + "Per-class probability that a persona of a given gpu_class has visited each", + "site in the pool. Used by the browsing_history node to derive a coherent", + "visited-domain list per persona.", + "", + "Probabilities are tuned so each class samples ~15-30 sites on average", + "(sum across all 50 entries falls in that range), giving an established-user", + "look. Categories are biased by class:", + " - workstation/high_end: higher P(dev) + high P(news/media)", + " - mid_range: balanced", + " - low_end/integrated_*: lower P(dev), higher P(shop/news/reference)", + "", + "Missing class falls back to mid_range via Node CPT pool fallback." 
+ ], + "table": { + "workstation": { + "youtube.com": 0.80, "wikipedia.org": 0.85, "mozilla.org": 0.70, + "w3schools.com": 0.40, "mdn.io": 0.55, "duckduckgo.com": 0.45, + "github.com": 0.95, "stackoverflow.com": 0.90, "npmjs.com": 0.65, + "gitlab.com": 0.50, "pypi.org": 0.55, "docs.python.org": 0.60, + "rust-lang.org": 0.35, "go.dev": 0.30, + "amazon.com": 0.70, "ebay.com": 0.25, "etsy.com": 0.15, + "bestbuy.com": 0.45, "target.com": 0.30, + "nytimes.com": 0.55, "cnn.com": 0.40, "bbc.com": 0.55, + "theguardian.com": 0.45, "reuters.com": 0.40, "apnews.com": 0.30, + "washingtonpost.com": 0.40, + "techcrunch.com": 0.65, "theverge.com": 0.60, "arstechnica.com": 0.65, + "wired.com": 0.50, "engadget.com": 0.35, "9to5mac.com": 0.30, + "medium.com": 0.55, "dev.to": 0.40, "reddit.com": 0.70, + "news.ycombinator.com": 0.65, "quora.com": 0.20, "stackexchange.com": 0.60, + "imdb.com": 0.45, "rottentomatoes.com": 0.25, "metacritic.com": 0.20, + "allrecipes.com": 0.20, "epicurious.com": 0.15, "tripadvisor.com": 0.30, + "weather.com": 0.55, "timeanddate.com": 0.30, "thesaurus.com": 0.25, + "kayak.com": 0.30, "booking.com": 0.35, "airbnb.com": 0.30 + }, + "high_end": { + "youtube.com": 0.85, "wikipedia.org": 0.80, "mozilla.org": 0.60, + "w3schools.com": 0.45, "mdn.io": 0.45, "duckduckgo.com": 0.40, + "github.com": 0.85, "stackoverflow.com": 0.80, "npmjs.com": 0.50, + "gitlab.com": 0.40, "pypi.org": 0.45, "docs.python.org": 0.50, + "rust-lang.org": 0.30, "go.dev": 0.25, + "amazon.com": 0.75, "ebay.com": 0.30, "etsy.com": 0.20, + "bestbuy.com": 0.50, "target.com": 0.35, + "nytimes.com": 0.50, "cnn.com": 0.50, "bbc.com": 0.50, + "theguardian.com": 0.40, "reuters.com": 0.35, "apnews.com": 0.30, + "washingtonpost.com": 0.35, + "techcrunch.com": 0.60, "theverge.com": 0.65, "arstechnica.com": 0.60, + "wired.com": 0.50, "engadget.com": 0.40, "9to5mac.com": 0.35, + "medium.com": 0.50, "dev.to": 0.35, "reddit.com": 0.75, + "news.ycombinator.com": 0.55, "quora.com": 0.25, 
"stackexchange.com": 0.55, + "imdb.com": 0.55, "rottentomatoes.com": 0.35, "metacritic.com": 0.30, + "allrecipes.com": 0.25, "epicurious.com": 0.20, "tripadvisor.com": 0.30, + "weather.com": 0.55, "timeanddate.com": 0.30, "thesaurus.com": 0.25, + "kayak.com": 0.30, "booking.com": 0.40, "airbnb.com": 0.30 + }, + "mid_range": { + "youtube.com": 0.85, "wikipedia.org": 0.75, "mozilla.org": 0.45, + "w3schools.com": 0.40, "mdn.io": 0.30, "duckduckgo.com": 0.35, + "github.com": 0.55, "stackoverflow.com": 0.55, "npmjs.com": 0.30, + "gitlab.com": 0.25, "pypi.org": 0.25, "docs.python.org": 0.30, + "rust-lang.org": 0.15, "go.dev": 0.15, + "amazon.com": 0.80, "ebay.com": 0.40, "etsy.com": 0.30, + "bestbuy.com": 0.55, "target.com": 0.40, + "nytimes.com": 0.45, "cnn.com": 0.55, "bbc.com": 0.45, + "theguardian.com": 0.35, "reuters.com": 0.30, "apnews.com": 0.30, + "washingtonpost.com": 0.30, + "techcrunch.com": 0.45, "theverge.com": 0.50, "arstechnica.com": 0.40, + "wired.com": 0.45, "engadget.com": 0.35, "9to5mac.com": 0.30, + "medium.com": 0.45, "dev.to": 0.25, "reddit.com": 0.70, + "news.ycombinator.com": 0.30, "quora.com": 0.35, "stackexchange.com": 0.40, + "imdb.com": 0.60, "rottentomatoes.com": 0.40, "metacritic.com": 0.35, + "allrecipes.com": 0.35, "epicurious.com": 0.25, "tripadvisor.com": 0.40, + "weather.com": 0.60, "timeanddate.com": 0.25, "thesaurus.com": 0.30, + "kayak.com": 0.35, "booking.com": 0.45, "airbnb.com": 0.40 + }, + "low_end": { + "youtube.com": 0.85, "wikipedia.org": 0.70, "mozilla.org": 0.35, + "w3schools.com": 0.30, "mdn.io": 0.20, "duckduckgo.com": 0.30, + "github.com": 0.30, "stackoverflow.com": 0.30, "npmjs.com": 0.15, + "gitlab.com": 0.10, "pypi.org": 0.10, "docs.python.org": 0.15, + "rust-lang.org": 0.05, "go.dev": 0.05, + "amazon.com": 0.85, "ebay.com": 0.50, "etsy.com": 0.40, + "bestbuy.com": 0.55, "target.com": 0.45, + "nytimes.com": 0.40, "cnn.com": 0.60, "bbc.com": 0.40, + "theguardian.com": 0.30, "reuters.com": 0.25, "apnews.com": 0.30, + 
"washingtonpost.com": 0.25, + "techcrunch.com": 0.30, "theverge.com": 0.35, "arstechnica.com": 0.25, + "wired.com": 0.40, "engadget.com": 0.30, "9to5mac.com": 0.25, + "medium.com": 0.35, "dev.to": 0.15, "reddit.com": 0.65, + "news.ycombinator.com": 0.15, "quora.com": 0.45, "stackexchange.com": 0.25, + "imdb.com": 0.65, "rottentomatoes.com": 0.45, "metacritic.com": 0.35, + "allrecipes.com": 0.45, "epicurious.com": 0.30, "tripadvisor.com": 0.45, + "weather.com": 0.65, "timeanddate.com": 0.25, "thesaurus.com": 0.35, + "kayak.com": 0.35, "booking.com": 0.50, "airbnb.com": 0.40 + }, + "integrated_modern": { + "youtube.com": 0.85, "wikipedia.org": 0.70, "mozilla.org": 0.40, + "w3schools.com": 0.35, "mdn.io": 0.25, "duckduckgo.com": 0.35, + "github.com": 0.40, "stackoverflow.com": 0.40, "npmjs.com": 0.20, + "gitlab.com": 0.15, "pypi.org": 0.20, "docs.python.org": 0.20, + "rust-lang.org": 0.10, "go.dev": 0.10, + "amazon.com": 0.80, "ebay.com": 0.40, "etsy.com": 0.30, + "bestbuy.com": 0.50, "target.com": 0.40, + "nytimes.com": 0.40, "cnn.com": 0.55, "bbc.com": 0.45, + "theguardian.com": 0.35, "reuters.com": 0.30, "apnews.com": 0.30, + "washingtonpost.com": 0.30, + "techcrunch.com": 0.40, "theverge.com": 0.45, "arstechnica.com": 0.30, + "wired.com": 0.40, "engadget.com": 0.30, "9to5mac.com": 0.25, + "medium.com": 0.40, "dev.to": 0.20, "reddit.com": 0.65, + "news.ycombinator.com": 0.25, "quora.com": 0.40, "stackexchange.com": 0.35, + "imdb.com": 0.60, "rottentomatoes.com": 0.40, "metacritic.com": 0.30, + "allrecipes.com": 0.40, "epicurious.com": 0.25, "tripadvisor.com": 0.40, + "weather.com": 0.60, "timeanddate.com": 0.25, "thesaurus.com": 0.30, + "kayak.com": 0.35, "booking.com": 0.45, "airbnb.com": 0.40 + }, + "integrated_old": { + "youtube.com": 0.75, "wikipedia.org": 0.65, "mozilla.org": 0.30, + "w3schools.com": 0.20, "mdn.io": 0.10, "duckduckgo.com": 0.25, + "github.com": 0.15, "stackoverflow.com": 0.20, "npmjs.com": 0.05, + "gitlab.com": 0.05, "pypi.org": 0.05, 
"docs.python.org": 0.10, + "rust-lang.org": 0.02, "go.dev": 0.02, + "amazon.com": 0.85, "ebay.com": 0.55, "etsy.com": 0.45, + "bestbuy.com": 0.55, "target.com": 0.50, + "nytimes.com": 0.45, "cnn.com": 0.65, "bbc.com": 0.40, + "theguardian.com": 0.30, "reuters.com": 0.25, "apnews.com": 0.35, + "washingtonpost.com": 0.30, + "techcrunch.com": 0.20, "theverge.com": 0.25, "arstechnica.com": 0.15, + "wired.com": 0.30, "engadget.com": 0.20, "9to5mac.com": 0.20, + "medium.com": 0.30, "dev.to": 0.05, "reddit.com": 0.55, + "news.ycombinator.com": 0.05, "quora.com": 0.55, "stackexchange.com": 0.15, + "imdb.com": 0.70, "rottentomatoes.com": 0.50, "metacritic.com": 0.35, + "allrecipes.com": 0.55, "epicurious.com": 0.35, "tripadvisor.com": 0.50, + "weather.com": 0.70, "timeanddate.com": 0.30, "thesaurus.com": 0.40, + "kayak.com": 0.40, "booking.com": 0.55, "airbnb.com": 0.40 + } + } +} diff --git a/src/invisible_playwright/_fpforge/profile.py b/src/invisible_playwright/_fpforge/profile.py index 16c52a4..fcdf024 100644 --- a/src/invisible_playwright/_fpforge/profile.py +++ b/src/invisible_playwright/_fpforge/profile.py @@ -120,6 +120,11 @@ class Profile: webgl: WebGLProfile fonts: List[str] dark_theme: bool + # Bayesian browsing-history: list of {name, category, cookie_profile} + # dicts sampled from data/browsing_pool.json with per-class CPT. Used + # by _recaptcha_seed.py to build a coherent cookie pre-seed when the + # caller opts in via Stealthfox(prep_recaptcha=True). 
+ browsing_history: List[Dict[str, str]] = field(default_factory=list) _raw: Dict[str, Any] = field(default_factory=dict, repr=False, compare=False) def to_prefs_dict(self) -> Dict[str, Any]: @@ -255,5 +260,6 @@ def generate_profile(seed: int, pin: Optional[Dict[str, Any]] = None) -> Profile webgl=WebGLProfile(msaa_samples=int(raw["msaa_samples"])), fonts=fonts, dark_theme=bool(raw["dark_theme"]), + browsing_history=list(raw.get("browsing_history") or []), _raw=raw, ) diff --git a/src/invisible_playwright/_recaptcha_seed.py b/src/invisible_playwright/_recaptcha_seed.py new file mode 100644 index 0000000..cd998a2 --- /dev/null +++ b/src/invisible_playwright/_recaptcha_seed.py @@ -0,0 +1,340 @@ +"""Deterministic reCAPTCHA cookie pre-seed. + +Consumes the Bayesian-sampled `browsing_history` from the persona Profile +(see `_fpforge/_sampler.py:derive_browsing_history`). For each visited +site, builds 1-7 realistic cookies whose composition is chosen by the +site's `cookie_profile` tag (analytics-only / consent / cloudflare-bot- +management / etc.). Random parts are seeded deterministically from the persona +seed; timestamps and expiries are offsets from the `now` passed to build_cookies. + +In addition, always seeds 5 cookies on .google.com (NID, CONSENT, SOCS, +_GRECAPTCHA, ENID). Excludes 1P_JAR which was deprecated by Google in 2022 +— including it now is an anachronism flag. + +Public API: + await seed_recaptcha_cookies_async(context, profile, timezone=None) + seed_recaptcha_cookies_sync(context, profile, timezone=None) + +`profile` is an `_fpforge.Profile`; `timezone` is the IANA tz (e.g. +"Europe/Rome") used to derive the CONSENT cookie's language token, so a +European-tz persona gets CONSENT in their language not en+FX. +""" +from __future__ import annotations + +import datetime +import random +import time +from typing import Any, List, Optional + +# URL-safe base64 alphabet (no padding chars). 
+_B64_ALPHABET = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_" +_HEX_ALPHABET = "0123456789abcdef" + + +def _sub_seed(seed: int, tag: str) -> int: + """FNV-1a mix → independent PRNG streams per logical bucket from one seed.""" + h = 0xcbf29ce484222325 ^ (seed & 0xFFFFFFFF) + for c in tag.encode("ascii"): + h ^= c + h = (h * 0x100000001b3) & 0xFFFFFFFFFFFFFFFF + return h or 0xdeadbeef + + +def _b64_rand(rng: random.Random, length: int) -> str: + return "".join(rng.choice(_B64_ALPHABET) for _ in range(length)) + + +def _hex_rand(rng: random.Random, length: int) -> str: + return "".join(rng.choice(_HEX_ALPHABET) for _ in range(length)) + + +def _yyyymmdd_utc(ts: int) -> str: + return datetime.datetime.utcfromtimestamp(ts).strftime("%Y%m%d") + + +# IANA timezone -> (country_code, lang) for CONSENT cookie coherence. +# EU timezones yield `<lang>+<CC>+NNN`; listed non-EU ones yield `<lang>+FX+NNN`. +# Default fallback `en+FX+NNN` for any tz not in this map. +_TZ_TO_REGION = { + "Europe/Rome": ("IT", "it"), + "Europe/Berlin": ("DE", "de"), + "Europe/Paris": ("FR", "fr"), + "Europe/Madrid": ("ES", "es"), + "Europe/London": ("GB", "en"), + "Europe/Amsterdam": ("NL", "nl"), + "Europe/Brussels": ("BE", "fr"), + "Europe/Vienna": ("AT", "de"), + "Europe/Zurich": ("CH", "de"), + "Europe/Dublin": ("IE", "en"), + "Europe/Lisbon": ("PT", "pt"), + "Europe/Stockholm": ("SE", "sv"), + "Europe/Oslo": ("NO", "no"), + "Europe/Copenhagen": ("DK", "da"), + "Europe/Helsinki": ("FI", "fi"), + "Europe/Warsaw": ("PL", "pl"), + "Europe/Prague": ("CZ", "cs"), + "Europe/Athens": ("GR", "el"), + "Asia/Tokyo": ("FX", "ja"), + "Asia/Shanghai": ("FX", "zh"), + "Asia/Hong_Kong": ("FX", "zh"), + "Asia/Seoul": ("FX", "ko"), +} + + +def _consent_region_lang(timezone: Optional[str]) -> tuple: + """Map IANA tz → (region_token, lang_2char) for CONSENT cookie. 
+ Default `("FX", "en")` for US/unknown.""" + if timezone and timezone in _TZ_TO_REGION: + return _TZ_TO_REGION[timezone] + return ("FX", "en") + + +# --------------------------------------------------------------------------- +# .google.com cookie batch (always present, regardless of browsing history) +# --------------------------------------------------------------------------- + +def _google_cookies(rng: random.Random, now: int, + timezone: Optional[str] = None) -> List[dict]: + consent_age = rng.randint(60, 720) * 86400 + region, lang = _consent_region_lang(timezone) + # NID 3-digit prefix range broadened to 100-540 to cover historical NID + # versions (137, 105, 511, 525 etc. observed in real captures). + return [ + {"name": "NID", + "value": f"{rng.randint(100, 540)}={_b64_rand(rng, 178)}", + "domain": ".google.com", "path": "/", + "expires": now + 180 * 86400, + "httpOnly": True, "secure": True, "sameSite": "None"}, + {"name": "CONSENT", + "value": f"YES+cb.{_yyyymmdd_utc(now - consent_age)}-" + f"{rng.randint(10, 19):02d}-p{rng.randint(0, 9)}." + f"{lang}+{region}+{rng.randint(100, 999)}", + "domain": ".google.com", "path": "/", + "expires": now + 395 * 86400, + "secure": True, "sameSite": "Lax"}, + # 1P_JAR removed: Google deprecated it in 2022. Including it now is + # an anachronism flag for fingerprinters that look at cookie freshness. 
+ {"name": "SOCS", + "value": f"CAES{_b64_rand(rng, 56)}", + "domain": ".google.com", "path": "/", + "expires": now + 395 * 86400, + "secure": True, "sameSite": "Lax"}, + {"name": "_GRECAPTCHA", + "value": _b64_rand(rng, 124), + "domain": ".google.com", "path": "/", + "expires": now + 180 * 86400, + "secure": True, "sameSite": "None"}, + {"name": "ENID", + "value": _b64_rand(rng, 252), + "domain": ".google.com", "path": "/", + "expires": now + 395 * 86400, + "httpOnly": True, "secure": True, "sameSite": "Lax"}, + ] + + +# --------------------------------------------------------------------------- +# Per-site cookie generators (recipes keyed by site["cookie_profile"]) +# --------------------------------------------------------------------------- + +def _norm_domain(domain: str) -> str: + return domain if domain.startswith(".") else "." + domain + + +def _ga_cookie(rng: random.Random, now: int, domain: str) -> dict: + first_age = rng.randint(7, 395) * 86400 + return {"name": "_ga", + "value": f"GA1.2.{rng.randint(100000000, 999999999)}.{now - first_age}", + "domain": domain, "path": "/", + "expires": now + 395 * 86400, + "secure": True, "sameSite": "Lax"} + + +def _gid_cookie(rng: random.Random, now: int, domain: str) -> dict: + return {"name": "_gid", + "value": f"GA1.2.{rng.randint(100000000, 999999999)}.{now - rng.randint(60, 86400)}", + "domain": domain, "path": "/", + "expires": now + 86400, + "secure": True, "sameSite": "Lax"} + + +def _cf_bm_cookie(rng: random.Random, now: int, domain: str) -> dict: + return {"name": "__cf_bm", + "value": f"{_b64_rand(rng, 43)}.{rng.randint(1700000000, now)}-1-1-1-1", + "domain": domain, "path": "/", + "expires": now + 1800, + "secure": True, "sameSite": "None"} + + +def _onetrust_cookie(rng: random.Random, now: int, domain: str) -> dict: + age_d = rng.randint(7, 365) + iso = datetime.datetime.utcfromtimestamp(now - age_d * 86400).strftime( + "%Y-%m-%dT%H:%M:%S.000Z" + ) + return {"name": "OptanonAlertBoxClosed", + "value": 
iso, + "domain": domain, "path": "/", + "expires": now + 395 * 86400, + "secure": True, "sameSite": "Lax"} + + +def _cookieyes_cookie(rng: random.Random, now: int, domain: str) -> dict: + return {"name": "cookieyes-consent", + "value": "consentid:" + _b64_rand(rng, 28) + + ",consent:yes,action:yes,necessary:yes,functional:yes,analytics:yes", + "domain": domain, "path": "/", + "expires": now + 395 * 86400, + "secure": True, "sameSite": "Lax"} + + +def _clarity_cookie(rng: random.Random, now: int, domain: str) -> dict: + return {"name": "_clck", + "value": f"{_hex_rand(rng, 8)}|2|f{rng.randint(10, 99)}|0|" + f"{now - rng.randint(60, 180) * 86400}", + "domain": domain, "path": "/", + "expires": now + 365 * 86400, + "secure": True, "sameSite": "Lax"} + + +def _fbp_cookie(rng: random.Random, now: int, domain: str) -> dict: + """Facebook Pixel _fbp = fb.1.<creation_ms>.<random_id>""" + return {"name": "_fbp", + "value": f"fb.1.{(now - rng.randint(60, 30*86400)) * 1000}." + f"{rng.randint(100000000, 9999999999)}", + "domain": domain, "path": "/", + "expires": now + 90 * 86400, + "secure": True, "sameSite": "Lax"} + + +def _gtm_cookie(rng: random.Random, now: int, domain: str) -> dict: + """_dc_gtm_<container_id>=1 — Google Tag Manager throttle flag.""" + container = f"UA-{rng.randint(10000000, 99999999)}-{rng.randint(1, 9)}" + return {"name": f"_dc_gtm_{container}", + "value": "1", + "domain": domain, "path": "/", + "expires": now + 60, + "secure": True, "sameSite": "Lax"} + + +def _hssrc_cookie(rng: random.Random, now: int, domain: str) -> dict: + """HubSpot referrer flag — small int.""" + return {"name": "__hssrc", + "value": str(rng.randint(1, 5)), + "domain": domain, "path": "/", + "expires": now + 1800, + "secure": True, "sameSite": "Lax"} + + +def _cookies_for_profile(profile: str, rng: random.Random, + now: int, domain: str) -> List[dict]: + """Map cookie_profile tag (from browsing_pool.json) → concrete cookies. 
+ + Each recipe is a realistic combination observed on real production sites + in that category. Cookie age and sub-recipe variance (e.g., OneTrust vs + CookieYes for consent banner) are deterministic from rng. + """ + domain = _norm_domain(domain) + if profile == "minimal": + return [_ga_cookie(rng, now, domain)] + if profile == "ga_only": + out = [_ga_cookie(rng, now, domain), _gid_cookie(rng, now, domain)] + # 30% chance of GTM helper paired with GA + if rng.random() < 0.3: + out.append(_gtm_cookie(rng, now, domain)) + return out + if profile == "ga_cf": + return [_ga_cookie(rng, now, domain), _cf_bm_cookie(rng, now, domain)] + if profile == "ga_consent": + out = [_ga_cookie(rng, now, domain), _gid_cookie(rng, now, domain)] + out.append(_onetrust_cookie(rng, now, domain) if rng.random() < 0.5 + else _cookieyes_cookie(rng, now, domain)) + if rng.random() < 0.4: + out.append(_gtm_cookie(rng, now, domain)) + return out + if profile == "ga_consent_clarity": + # Heavy-tracking site profile: GA + Clarity + consent + often FB pixel + out = [_ga_cookie(rng, now, domain), _gid_cookie(rng, now, domain), + _clarity_cookie(rng, now, domain)] + out.append(_onetrust_cookie(rng, now, domain) if rng.random() < 0.5 + else _cookieyes_cookie(rng, now, domain)) + if rng.random() < 0.5: + out.append(_fbp_cookie(rng, now, domain)) + if rng.random() < 0.4: + out.append(_gtm_cookie(rng, now, domain)) + if rng.random() < 0.25: + out.append(_hssrc_cookie(rng, now, domain)) + return out + # Unknown profile → safe fallback + return [_ga_cookie(rng, now, domain)] + + +# --------------------------------------------------------------------------- +# Public builder +# --------------------------------------------------------------------------- + +def build_cookies(seed: int, + browsing_history: Optional[List[dict]] = None, + now: Optional[int] = None, + timezone: Optional[str] = None) -> List[dict]: + """Build the full cookie list for a persona. 
+ + Args: + seed: persona integer seed (from `Profile.seed`) + browsing_history: list of {name, category, cookie_profile} dicts as + sampled by `_fpforge.derive_browsing_history`. None → empty list + (only the 5 google cookies are returned). + now: unix-seconds timestamp; defaults to current time. Pin for tests. + timezone: IANA tz used to derive CONSENT cookie's `lang+region` token + (e.g. "Europe/Rome" → "it+IT", "America/New_York" → "en+FX"). + """ + ts = now if now is not None else int(time.time()) + cookies: List[dict] = [] + + # 5 .google.com cookies (always) — CONSENT lang derived from tz + rng_g = random.Random(_sub_seed(int(seed), "google")) + cookies.extend(_google_cookies(rng_g, ts, timezone=timezone)) + + # Per-site cookies (deterministic from seed × domain) + for site in (browsing_history or []): + rng_d = random.Random(_sub_seed(int(seed), f"dom:{site['name']}")) + cookies.extend(_cookies_for_profile( + site.get("cookie_profile", "minimal"), rng_d, ts, site["name"] + )) + return cookies + + +def _extract_seed_and_history(profile: Any) -> tuple: + """Accept a Profile-like object (`seed`, optional `browsing_history`) or a bare int seed; return (seed, history).""" + if isinstance(profile, int): + return int(profile), [] + seed = int(getattr(profile, "seed")) + history = list(getattr(profile, "browsing_history", []) or []) + return seed, history + + +async def seed_recaptcha_cookies_async(context: Any, profile: Any, + timezone: Optional[str] = None) -> None: + """Async: inject deterministic persona cookies into the context.""" + seed, history = _extract_seed_and_history(profile) + cookies = build_cookies(seed, history, timezone=timezone) + try: + await context.add_cookies(cookies) + except Exception: + pass + + +def seed_recaptcha_cookies_sync(context: Any, profile: Any, + timezone: Optional[str] = None) -> None: + """Sync: inject deterministic persona cookies into the context.""" + seed, history = _extract_seed_and_history(profile) + cookies = 
build_cookies(seed, history, timezone=timezone) + 
try: + context.add_cookies(cookies) + except Exception: + pass + + +__all__ = [ + "build_cookies", + "seed_recaptcha_cookies_async", + "seed_recaptcha_cookies_sync", +] diff --git a/src/invisible_playwright/async_api.py b/src/invisible_playwright/async_api.py index 2b2eeca..6b04ea1 100644 --- a/src/invisible_playwright/async_api.py +++ b/src/invisible_playwright/async_api.py @@ -51,6 +51,7 @@ class InvisiblePlaywright: extra_prefs: Optional[Dict[str, Any]] = None, binary_path: Optional[str] = None, profile_dir: Optional[Union[str, Path]] = None, + prep_recaptcha: bool = False, ) -> None: # See sync launcher: `zoom.stealth.fpp.hw_seed` is int32_t — clamp. self.seed: int = int(seed) if seed is not None else secrets.randbits(31) @@ -64,6 +65,8 @@ class InvisiblePlaywright: self._extra_prefs = extra_prefs self._binary_path = binary_path self._profile_dir: Optional[Path] = Path(profile_dir) if profile_dir else None + # reCAPTCHA pre-seed gated server-side; respect persistent profile. + self._prep_recaptcha = bool(prep_recaptcha) and self._profile_dir is None self._profile: Profile = generate_profile(self.seed, pin=self._pin) self._pw: Optional[Playwright] = None self._browser: Optional[Browser] = None @@ -124,12 +127,18 @@ class InvisiblePlaywright: def _patch_new_context_defaults(self, browser: Browser) -> None: original = browser.new_context defaults = self._default_context_kwargs() + prep = self._prep_recaptcha + profile = self._profile # pass the whole Profile (seed + browsing_history) + tz = self._timezone # used by _recaptcha_seed for CONSENT lang+region async def patched(**kw): merged = dict(defaults) merged.update(kw) ctx = await original(**merged) _patch_new_page_sleep(ctx) + if prep: + from ._recaptcha_seed import seed_recaptcha_cookies_async + await seed_recaptcha_cookies_async(ctx, profile, timezone=tz) return ctx browser.new_context = patched # type: ignore[assignment] diff --git a/src/invisible_playwright/launcher.py b/src/invisible_playwright/launcher.py 
index 07c7967..32ac02f 100644 --- a/src/invisible_playwright/launcher.py +++ b/src/invisible_playwright/launcher.py @@ -113,6 +113,7 @@ class InvisiblePlaywright: extra_prefs: Optional[Dict[str, Any]] = None, binary_path: Optional[str] = None, profile_dir: Optional[Union[str, Path]] = None, + prep_recaptcha: bool = False, ) -> None: """ Args: @@ -166,6 +167,10 @@ class InvisiblePlaywright: self._extra_prefs = extra_prefs self._binary_path = binary_path self._profile_dir: Optional[Path] = Path(profile_dir) if profile_dir else None + # reCAPTCHA cookie pre-seed — opt-in. Gated server-side: if a + # persistent profile_dir is in use, respect its existing cookies + # and DON'T enable pre-seed (the profile owns its own state). + self._prep_recaptcha = bool(prep_recaptcha) and self._profile_dir is None self._profile: Profile = generate_profile(self.seed, pin=self._pin) self._pw: Optional[Playwright] = None self._browser: Optional[Browser] = None @@ -240,12 +245,18 @@ class InvisiblePlaywright: """ original = browser.new_context defaults = self._default_context_kwargs() + prep = self._prep_recaptcha + profile = self._profile # pass the whole Profile (seed + browsing_history) + tz = self._timezone # used by _recaptcha_seed for CONSENT lang+region def patched(**kw): merged = dict(defaults) merged.update(kw) # user-supplied wins ctx = original(**merged) _patch_sync_new_page_sleep(ctx) + if prep: + from ._recaptcha_seed import seed_recaptcha_cookies_sync + seed_recaptcha_cookies_sync(ctx, profile, timezone=tz) return ctx browser.new_context = patched # type: ignore[assignment] diff --git a/tests/test_fingerprint_consistency.py b/tests/test_fingerprint_consistency.py index aa0f96b..0a53d27 100644 --- a/tests/test_fingerprint_consistency.py +++ b/tests/test_fingerprint_consistency.py @@ -306,17 +306,6 @@ def test_navigator_oscpu_matches_userAgent(page): assert "Mac" in oscpu -@pytest.mark.e2e -def test_userAgent_contains_appVersion_chromium_only(page): - """Chromium invariant: 
UA contains appVersion. Firefox uses a short - appVersion form so the check is gated on `'chrome' in window`.""" - if not _ev(page, "'chrome' in window"): - pytest.skip("Chromium-only invariant") - ua = _ev(page, "navigator.userAgent") - av = _ev(page, "navigator.appVersion") - assert av in ua - - # =========================================================================== # 5. Native function self-toString (creepjs/src/lies/index.ts hasKnownToString) # =========================================================================== diff --git a/tests/test_recaptcha_seed.py b/tests/test_recaptcha_seed.py new file mode 100644 index 0000000..dbd1821 --- /dev/null +++ b/tests/test_recaptcha_seed.py @@ -0,0 +1,349 @@ +"""Unit tests for the deterministic reCAPTCHA cookie builder. + +Validates the contract: + - 5 .google.com cookies always present (1P_JAR is no longer emitted) + - Per-site cookies built from a `browsing_history` list (sampled by the + Bayesian network in _fpforge) + - Determinism: same (seed, history) → identical content + - Chrome 400-day cookie cap respected + - Playwright add_cookies field requirements satisfied +""" +import pytest + +from invisible_playwright._recaptcha_seed import ( + build_cookies, + _sub_seed, +) + + +pytestmark = pytest.mark.unit + + +_FIXED_NOW = 1779600000 # 2026-05-24 05:20 UTC, frozen for determinism + + +# Sample browsing history for tests (mimics what _fpforge produces). +_SAMPLE_HISTORY = [ + {"name": "github.com", "category": "dev", "cookie_profile": "ga_cf"}, + {"name": "stackoverflow.com", "category": "dev", "cookie_profile": "ga_consent_clarity"}, + {"name": "amazon.com", "category": "shop", "cookie_profile": "ga_consent_clarity"}, + {"name": "wikipedia.org", "category": "reference", "cookie_profile": "minimal"}, + {"name": "youtube.com", "category": "media", "cookie_profile": "ga_only"}, +] + + +# =========================================================================== +# 1. 
Set composition +# =========================================================================== + +def test_only_google_cookies_when_no_history(): + """Empty/None history → only the 5 .google.com cookies (1P_JAR removed + in realism round 2 — deprecated by Google 2022).""" + cookies = build_cookies(seed=42, browsing_history=None, now=_FIXED_NOW) + names = sorted(c["name"] for c in cookies) + assert names == sorted(["NID", "CONSENT", "SOCS", + "_GRECAPTCHA", "ENID"]) + assert all(c["domain"] == ".google.com" for c in cookies) + + +def test_browsing_history_adds_host_cookies(): + """Each history site contributes 1+ cookies on its domain.""" + cookies = build_cookies(seed=42, browsing_history=_SAMPLE_HISTORY, now=_FIXED_NOW) + google = [c for c in cookies if c["domain"] == ".google.com"] + assert len(google) == 5 # 1P_JAR removed + + domains = {c["domain"] for c in cookies if c["domain"] != ".google.com"} + for site in _SAMPLE_HISTORY: + assert f".{site['name']}" in domains + + +def test_domain_dot_prefix_normalized(): + """All host cookie domains have a leading dot for sub-domain coverage.""" + cookies = build_cookies(seed=42, browsing_history=_SAMPLE_HISTORY, now=_FIXED_NOW) + for c in cookies: + assert c["domain"].startswith("."), f"missing dot: {c['domain']}" + + +# =========================================================================== +# 2. 
Cookie profile recipes (each profile yields the expected cookie set) +# =========================================================================== + +def test_profile_minimal_yields_ga_only(): + history = [{"name": "x.com", "cookie_profile": "minimal"}] + cookies = build_cookies(seed=42, browsing_history=history, now=_FIXED_NOW) + host = [c for c in cookies if c["domain"] == ".x.com"] + names = [c["name"] for c in host] + assert names == ["_ga"] + + +def test_profile_ga_only_yields_ga_and_gid(): + history = [{"name": "x.com", "cookie_profile": "ga_only"}] + cookies = build_cookies(seed=42, browsing_history=history, now=_FIXED_NOW) + host = [c for c in cookies if c["domain"] == ".x.com"] + names = sorted(c["name"] for c in host) + assert names == ["_ga", "_gid"] + + +def test_profile_ga_cf_yields_ga_and_cf_bm(): + history = [{"name": "x.com", "cookie_profile": "ga_cf"}] + cookies = build_cookies(seed=42, browsing_history=history, now=_FIXED_NOW) + host = [c for c in cookies if c["domain"] == ".x.com"] + names = sorted(c["name"] for c in host) + assert names == ["__cf_bm", "_ga"] + + +def test_profile_ga_consent_yields_three_cookies(): + history = [{"name": "x.com", "cookie_profile": "ga_consent"}] + cookies = build_cookies(seed=42, browsing_history=history, now=_FIXED_NOW) + host = [c for c in cookies if c["domain"] == ".x.com"] + names = sorted(c["name"] for c in host) + # Always _ga + _gid + one of OneTrust|CookieYes + assert "_ga" in names and "_gid" in names + assert any(n in names for n in ("OptanonAlertBoxClosed", "cookieyes-consent")) + assert len(host) == 3 + + +def test_profile_ga_consent_clarity_yields_at_least_four_cookies(): + """Always _ga + _gid + _clck + consent banner. 
Optionally _fbp, _dc_gtm_*, + __hssrc (probabilistic per rng — see test_new_helper_cookies_*).""" + history = [{"name": "x.com", "cookie_profile": "ga_consent_clarity"}] + cookies = build_cookies(seed=42, browsing_history=history, now=_FIXED_NOW) + host = [c for c in cookies if c["domain"] == ".x.com"] + names = sorted(c["name"] for c in host) + assert "_ga" in names and "_gid" in names and "_clck" in names + assert any(n in names for n in ("OptanonAlertBoxClosed", "cookieyes-consent")) + assert len(host) >= 4 # 4 baseline + 0-3 helpers + + +def test_unknown_profile_falls_back_to_ga(): + history = [{"name": "x.com", "cookie_profile": "nonexistent_profile"}] + cookies = build_cookies(seed=42, browsing_history=history, now=_FIXED_NOW) + host = [c for c in cookies if c["domain"] == ".x.com"] + assert [c["name"] for c in host] == ["_ga"] + + +# =========================================================================== +# 3. Determinism +# =========================================================================== + +def test_same_seed_and_history_same_content(): + a = build_cookies(seed=42, browsing_history=_SAMPLE_HISTORY, now=_FIXED_NOW) + b = build_cookies(seed=42, browsing_history=_SAMPLE_HISTORY, now=_FIXED_NOW) + assert a == b + + +def test_different_seed_different_content(): + a = build_cookies(seed=42, browsing_history=_SAMPLE_HISTORY, now=_FIXED_NOW) + b = build_cookies(seed=99, browsing_history=_SAMPLE_HISTORY, now=_FIXED_NOW) + a_nid = next(c for c in a if c["name"] == "NID")["value"] + b_nid = next(c for c in b if c["name"] == "NID")["value"] + assert a_nid != b_nid + + +def test_history_order_does_not_affect_domain_specific_cookies(): + """Sub-seed is keyed on domain name, not order in history list.""" + h1 = [_SAMPLE_HISTORY[0], _SAMPLE_HISTORY[1]] + h2 = [_SAMPLE_HISTORY[1], _SAMPLE_HISTORY[0]] + a = {(c["domain"], c["name"]): c["value"] + for c in build_cookies(seed=42, browsing_history=h1, now=_FIXED_NOW) + if c["domain"] != ".google.com"} + b = 
{(c["domain"], c["name"]): c["value"] + for c in build_cookies(seed=42, browsing_history=h2, now=_FIXED_NOW) + if c["domain"] != ".google.com"} + assert a == b + + +def test_sub_seed_distinct_tags_distinct_streams(): + assert _sub_seed(42, "google") != _sub_seed(42, "dom:github.com") + assert _sub_seed(42, "dom:github.com") != _sub_seed(42, "dom:amazon.com") + assert _sub_seed(0, "any") != 0 # seed=0 still produces non-zero sub-seed + + +# =========================================================================== +# 4. Format / structural correctness for the Google batch +# =========================================================================== + +def test_nid_format(): + cookies = build_cookies(seed=42, now=_FIXED_NOW) + nid = next(c for c in cookies if c["name"] == "NID") + prefix, b64 = nid["value"].split("=", 1) + assert prefix.isdigit() and len(prefix) == 3 + # Broadened to 100-540 in realism round 2 to cover historical NID versions + assert 100 <= int(prefix) <= 540 + assert len(b64) == 178 + + +def test_consent_format(): + cookies = build_cookies(seed=42, now=_FIXED_NOW) + consent = next(c for c in cookies if c["name"] == "CONSENT") + assert consent["value"].startswith("YES+cb.") + assert "+FX+" in consent["value"] + + +# =========================================================================== +# 5. Chrome 400-day cookie cap compliance +# =========================================================================== + +def test_all_expiries_within_400_day_cap(): + """Chrome 104+ caps cookie expiry to 400 days. Cookies > 400d silently + truncated / dropped. 
We tighten everything to <=395d (except __cf_bm / _gid, + which are short-lived telemetry and skipped below).""" + cookies = build_cookies(seed=42, browsing_history=_SAMPLE_HISTORY, now=_FIXED_NOW) + max_allowed = _FIXED_NOW + 400 * 86400 + for c in cookies: + # Short-lived telemetry cookies are fine (1P_JAR is never emitted anymore) + if c["name"] in ("__cf_bm", "_gid"): + continue + assert c["expires"] <= max_allowed, ( + f"Cookie {c['name']} expires {c['expires'] - _FIXED_NOW}s " + f"(> 400d cap) — would be silently dropped" + ) + + +# =========================================================================== +# 6. Playwright add_cookies field requirements +# =========================================================================== + +def test_all_cookies_have_required_playwright_fields(): + cookies = build_cookies(seed=42, browsing_history=_SAMPLE_HISTORY, now=_FIXED_NOW) + for c in cookies: + assert c.get("name"), f"missing name: {c}" + assert c.get("value") is not None, f"missing value: {c}" + assert c.get("domain"), f"missing domain: {c}" + assert c.get("path") == "/", f"path != / for {c['name']}" + + +def test_modern_cookies_marked_secure(): + """Cookies with sameSite=None require secure=True under Firefox/Chrome. + Also generally needed for cookies set via Playwright add_cookies without + a navigation context.""" + cookies = build_cookies(seed=42, browsing_history=_SAMPLE_HISTORY, now=_FIXED_NOW) + for c in cookies: + if c.get("sameSite") == "None": + assert c.get("secure") is True, f"{c['name']} None+!secure invalid" + + +def test_httponly_on_signed_cookies(): + cookies = build_cookies(seed=42, now=_FIXED_NOW) + nid = next(c for c in cookies if c["name"] == "NID") + enid = next(c for c in cookies if c["name"] == "ENID") + assert nid.get("httpOnly") is True + assert enid.get("httpOnly") is True + + +# =========================================================================== +# 7. 
End-to-end with real fpforge Profile +# =========================================================================== + +def test_with_real_fpforge_profile(): + """End-to-end: generate a real Profile, ensure browsing_history is populated + and build_cookies works against it.""" + from invisible_playwright._fpforge import generate_profile + prof = generate_profile(seed=42) + assert isinstance(prof.browsing_history, list) + # The Bayesian network samples ~15-30 sites per persona + assert 5 <= len(prof.browsing_history) <= 50, \ + f"unexpected history length: {len(prof.browsing_history)}" + # Each entry has the expected fields + for site in prof.browsing_history: + assert "name" in site and "category" in site and "cookie_profile" in site + # build_cookies works against the real profile + cookies = build_cookies(seed=prof.seed, browsing_history=prof.browsing_history, + now=_FIXED_NOW) + # 5 google (1P_JAR removed) + at least 1 cookie per visited site + assert len(cookies) >= 5 + len(prof.browsing_history) + + +def test_same_seed_same_browsing_history_via_fpforge(): + """Profile.browsing_history is deterministic from seed (Bayesian sampler).""" + from invisible_playwright._fpforge import generate_profile + a = generate_profile(seed=42).browsing_history + b = generate_profile(seed=42).browsing_history + assert a == b + + +# =========================================================================== +# 8. Realism improvements (2026-05-24 round 2) +# =========================================================================== + +def test_no_1p_jar_cookie(): + """1P_JAR was deprecated by Google in 2022. 
Including it is an + anachronism flag for fingerprinters that look at cookie freshness.""" + cookies = build_cookies(seed=42, browsing_history=_SAMPLE_HISTORY, now=_FIXED_NOW) + names = {c["name"] for c in cookies} + assert "1P_JAR" not in names + + +def test_nid_prefix_broadened_range(): + """NID 3-digit prefix should cover historical versions (137/105/511/525 + seen in real captures) — range 100-540, not just 500-540.""" + seen_prefixes = set() + for seed in range(200): + cookies = build_cookies(seed=seed, now=_FIXED_NOW) + nid = next(c for c in cookies if c["name"] == "NID") + prefix = int(nid["value"].split("=", 1)[0]) + seen_prefixes.add(prefix) + assert min(seen_prefixes) < 500, f"NID range never goes below 500 ({sorted(seen_prefixes)[:5]})" + assert max(seen_prefixes) <= 540 + + +def test_consent_lang_from_timezone_eu(): + """CONSENT cookie's `lang+region` token derived from IANA timezone.""" + cookies = build_cookies(seed=42, now=_FIXED_NOW, timezone="Europe/Rome") + consent = next(c for c in cookies if c["name"] == "CONSENT") + assert ".it+IT+" in consent["value"], f"expected it+IT in: {consent['value']}" + + +def test_consent_lang_default_fx(): + """Unknown / US timezone → default `en+FX` (non-EU fallback).""" + cookies = build_cookies(seed=42, now=_FIXED_NOW, timezone="America/New_York") + consent = next(c for c in cookies if c["name"] == "CONSENT") + assert ".en+FX+" in consent["value"] + + +def test_consent_lang_de_for_berlin(): + cookies = build_cookies(seed=42, now=_FIXED_NOW, timezone="Europe/Berlin") + consent = next(c for c in cookies if c["name"] == "CONSENT") + assert ".de+DE+" in consent["value"] + + +def test_consent_lang_no_timezone_default(): + """timezone=None → default en+FX.""" + cookies = build_cookies(seed=42, now=_FIXED_NOW) + consent = next(c for c in cookies if c["name"] == "CONSENT") + assert ".en+FX+" in consent["value"] + + +def test_new_helper_cookies_appear_in_ga_consent_clarity(): + """ga_consent_clarity recipe should sometimes 
include _fbp, _dc_gtm_*, __hssrc + (probabilistic per rng). Check across many seeds that they appear.""" + saw_fbp = False + saw_gtm = False + saw_hssrc = False + history = [{"name": "site.com", "cookie_profile": "ga_consent_clarity"}] + for seed in range(100): + cookies = build_cookies(seed=seed, browsing_history=history, now=_FIXED_NOW) + names = {c["name"] for c in cookies if c["domain"] == ".site.com"} + saw_fbp = saw_fbp or "_fbp" in names + saw_gtm = saw_gtm or any(n.startswith("_dc_gtm_") for n in names) + saw_hssrc = saw_hssrc or "__hssrc" in names + assert saw_fbp, "_fbp never appeared in 100 seeds (rng pick broken)" + assert saw_gtm, "_dc_gtm_* never appeared in 100 seeds" + assert saw_hssrc, "__hssrc never appeared in 100 seeds" + + +def test_fbp_format(): + """_fbp format: fb.<subdomain_index>.<creation_ms>.<random_digits>.""" + history = [{"name": "x.com", "cookie_profile": "ga_consent_clarity"}] + # Try multiple seeds until we hit a seed that includes _fbp (50% chance) + for seed in range(20): + cookies = build_cookies(seed=seed, browsing_history=history, now=_FIXED_NOW) + fbp = next((c for c in cookies if c["name"] == "_fbp"), None) + if fbp: + parts = fbp["value"].split(".") + assert parts[0] == "fb" + assert parts[1].isdigit() + assert parts[2].isdigit() and len(parts[2]) >= 13 # unix ms + assert parts[3].isdigit() + return + raise AssertionError("never got _fbp across 20 seeds — distribution broken")