mirror of
https://github.com/feder-cr/invisible_playwright.git
synced 2026-06-07 08:35:12 +02:00
feat: deterministic reCAPTCHA cookie pre-seed via Bayesian browsing history
Adds opt-in helper that auto-injects coherent cookie history into every
BrowserContext created via new_context(). Content is fully deterministic
from the persona seed so a given seed always presents the same cookies
across sessions.
Composition (per persona, all derived from seed):
- 5 cookies on .google.com (NID, CONSENT, SOCS, _GRECAPTCHA, ENID).
Excludes 1P_JAR which was deprecated by Google in 2022. CONSENT
`lang+region` token derived from the persona's IANA timezone
(Europe/Rome -> it+IT, America/* -> en+FX, etc.). NID prefix
broadened to 100-540 to cover historical versions.
- Per-site cookies on 13-25 "visited" everyday domains, sampled from a
Bayesian network conditioned on gpu_class - workstation/high_end
personas trend toward dev/tech sites, low_end/integrated_old trend
toward shop/news/reference. Each site contributes 1-7 cookies based
on a `cookie_profile` tag. Cookie pool includes _ga, _gid, _clck,
_clsk, __cf_bm, OneTrust/CookieYes consent, _fbp (Facebook Pixel),
_dc_gtm_<id> (Tag Manager helper), __hssrc (HubSpot helper).
API:
Stealthfox(seed=42, prep_recaptcha=True)
No per-call configuration: visited-sites + cookie composition all derived
from the persona seed via the Bayesian sampler.
Gated server-side: forced False if profile_dir is set (persistent profile
owns its own state). All expiries capped to 395 days per Chrome/Firefox
400-day RFC 6265bis-15 limit.
Bayesian integration:
- New `derive_browsing_history(gpu_class, rng)` in _fpforge/_sampler.py
(parallel to `derive_font_prefs`).
- New data files: browsing_pool.json (50 site entries) and
cpt_browsing_given_class.json (per-class probabilities).
- Profile dataclass exposes `browsing_history` field.
- _recaptcha_seed.py consumes Profile.browsing_history; receives
timezone separately to derive CONSENT lang+region.
Also drops a dead Chromium-only e2e test that always skipped on our
Firefox-only wrapper.
Test coverage: 29 unit tests covering composition, profile recipes
(minimal/ga_only/ga_cf/ga_consent/ga_consent_clarity), determinism,
Chrome 400-day cap, Playwright field requirements, CONSENT lang
mapping (IT/DE/US/default), helper-cookie probability distributions,
end-to-end with real fpforge Profile.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
413db06690
commit
3d8ba0b82c
9 changed files with 956 additions and 11 deletions
|
|
@ -84,6 +84,12 @@ _FONT_POOL = _load("font_pool.json")
|
|||
_FONT_CORE: list = _FONT_POOL["core"]
|
||||
_FONT_OPTIONAL: list = _FONT_POOL["optional"]
|
||||
_CPT_FONTS_OPT = _load("cpt_fonts_optional_given_class.json")["table"]
|
||||
# Browsing-history pool + CPT (per-class probabilities for visited sites).
|
||||
# Drives _recaptcha_seed's cookie pre-seed: each persona ends up with a
|
||||
# coherent list of ~15-30 visited sites whose categories correlate with
|
||||
# gpu_class (workstation → dev-heavy, integrated_old → shop+news-heavy).
|
||||
_BROWSING_POOL: list = _load("browsing_pool.json")["entries"]
|
||||
_CPT_BROWSING = _load("cpt_browsing_given_class.json")["table"]
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════
|
||||
|
|
@ -282,6 +288,33 @@ def derive_font_whitelist(gpu_class: str, rng) -> str:
|
|||
return derive_font_prefs(gpu_class, rng)["whitelist"]
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════
|
||||
# BROWSING HISTORY (Bayesian: per-site P(visited|gpu_class))
|
||||
# ═══════════════════════════════════════════════════════════════════════
|
||||
def derive_browsing_history(gpu_class: str, rng) -> list:
    """Pick the sites this persona is treated as having visited.

    Every entry of the browsing pool is an independent draw: the entry is
    kept when ``rng.random()`` falls below the probability listed for its
    name in the CPT row of ``gpu_class``.

    Args:
        gpu_class: key of the CPT row to use. A class with no row falls
            back to the ``"mid_range"`` row.
        rng: random source; only ``rng.random()`` is called, once per pool
            entry and in pool order.

    Returns:
        Shallow copies of the selected pool entries, e.g.
        ``{"name": "github.com", "category": "dev", "cookie_profile": "ga_cf"}``,
        ordered by ``name`` so the output is stable for a given seed.
    """
    table = _CPT_BROWSING.get(gpu_class)
    if table is None:
        table = _CPT_BROWSING["mid_range"]

    # Sites without a probability in the row are kept with p = 0.3.
    # dict(site) copies the entry so callers cannot mutate the shared pool.
    picked = [
        dict(site)
        for site in _BROWSING_POOL
        if rng.random() < table.get(site["name"], 0.3)
    ]
    return sorted(picked, key=lambda site: site["name"])
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════
|
||||
# PUBLIC API: Forge
|
||||
# ═══════════════════════════════════════════════════════════════════════
|
||||
|
|
@ -350,6 +383,12 @@ class Forge:
|
|||
bundle["gpu_class"], self._rng
|
||||
).items()
|
||||
},
|
||||
# Bayesian browsing history (per-class P(visited|gpu_class)).
|
||||
# Consumed by _recaptcha_seed.py to seed coherent cookie history
|
||||
# when invisible_playwright is launched with prep_recaptcha=True.
|
||||
"browsing_history": derive_browsing_history(
|
||||
bundle["gpu_class"], self._rng
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
64
src/invisible_playwright/_fpforge/data/browsing_pool.json
Normal file
64
src/invisible_playwright/_fpforge/data/browsing_pool.json
Normal file
|
|
@ -0,0 +1,64 @@
|
|||
{
|
||||
"_comment": [
|
||||
"Pool of everyday websites used by the browsing_history node.",
|
||||
"Each entry: { name, category, cookie_profile }.",
|
||||
"- name: bare domain (no scheme, no leading dot).",
|
||||
"- category: dev / shop / news / reference / media / community / misc.",
|
||||
"- cookie_profile: short tag pointing to a cookie-template recipe used by",
|
||||
" _recaptcha_seed.py to generate concrete cookies (so heavy-analytics sites",
|
||||
" get _ga+_gid+OneTrust, simple sites get just _ga, ga_cf sites get _ga+__cf_bm).",
|
||||
"Add new entries here + add per-class probabilities in cpt_browsing_given_class.json."
|
||||
],
|
||||
"entries": [
|
||||
{"name": "youtube.com", "category": "media", "cookie_profile": "ga_only"},
|
||||
{"name": "wikipedia.org", "category": "reference", "cookie_profile": "minimal"},
|
||||
{"name": "mozilla.org", "category": "reference", "cookie_profile": "ga_consent"},
|
||||
{"name": "w3schools.com", "category": "dev", "cookie_profile": "ga_consent_clarity"},
|
||||
{"name": "mdn.io", "category": "dev", "cookie_profile": "minimal"},
|
||||
{"name": "duckduckgo.com", "category": "reference", "cookie_profile": "minimal"},
|
||||
{"name": "github.com", "category": "dev", "cookie_profile": "ga_cf"},
|
||||
{"name": "stackoverflow.com", "category": "dev", "cookie_profile": "ga_consent_clarity"},
|
||||
{"name": "npmjs.com", "category": "dev", "cookie_profile": "ga_consent"},
|
||||
{"name": "gitlab.com", "category": "dev", "cookie_profile": "ga_cf"},
|
||||
{"name": "pypi.org", "category": "dev", "cookie_profile": "minimal"},
|
||||
{"name": "docs.python.org", "category": "dev", "cookie_profile": "minimal"},
|
||||
{"name": "rust-lang.org", "category": "dev", "cookie_profile": "ga_consent"},
|
||||
{"name": "go.dev", "category": "dev", "cookie_profile": "ga_consent"},
|
||||
{"name": "amazon.com", "category": "shop", "cookie_profile": "ga_consent_clarity"},
|
||||
{"name": "ebay.com", "category": "shop", "cookie_profile": "ga_consent"},
|
||||
{"name": "etsy.com", "category": "shop", "cookie_profile": "ga_consent_clarity"},
|
||||
{"name": "bestbuy.com", "category": "shop", "cookie_profile": "ga_consent_clarity"},
|
||||
{"name": "target.com", "category": "shop", "cookie_profile": "ga_consent_clarity"},
|
||||
{"name": "nytimes.com", "category": "news", "cookie_profile": "ga_consent_clarity"},
|
||||
{"name": "cnn.com", "category": "news", "cookie_profile": "ga_consent"},
|
||||
{"name": "bbc.com", "category": "news", "cookie_profile": "ga_consent"},
|
||||
{"name": "theguardian.com", "category": "news", "cookie_profile": "ga_consent_clarity"},
|
||||
{"name": "reuters.com", "category": "news", "cookie_profile": "ga_consent"},
|
||||
{"name": "apnews.com", "category": "news", "cookie_profile": "ga_consent"},
|
||||
{"name": "washingtonpost.com", "category": "news", "cookie_profile": "ga_consent"},
|
||||
{"name": "techcrunch.com", "category": "news", "cookie_profile": "ga_consent_clarity"},
|
||||
{"name": "theverge.com", "category": "news", "cookie_profile": "ga_consent"},
|
||||
{"name": "arstechnica.com", "category": "news", "cookie_profile": "ga_consent"},
|
||||
{"name": "wired.com", "category": "news", "cookie_profile": "ga_consent_clarity"},
|
||||
{"name": "engadget.com", "category": "news", "cookie_profile": "ga_consent"},
|
||||
{"name": "9to5mac.com", "category": "news", "cookie_profile": "ga_consent"},
|
||||
{"name": "medium.com", "category": "community", "cookie_profile": "ga_consent"},
|
||||
{"name": "dev.to", "category": "community", "cookie_profile": "ga_consent"},
|
||||
{"name": "reddit.com", "category": "community", "cookie_profile": "ga_cf"},
|
||||
{"name": "news.ycombinator.com", "category": "community", "cookie_profile": "minimal"},
|
||||
{"name": "quora.com", "category": "community", "cookie_profile": "ga_consent_clarity"},
|
||||
{"name": "stackexchange.com", "category": "community", "cookie_profile": "ga_consent_clarity"},
|
||||
{"name": "imdb.com", "category": "media", "cookie_profile": "ga_consent_clarity"},
|
||||
{"name": "rottentomatoes.com", "category": "media", "cookie_profile": "ga_consent"},
|
||||
{"name": "metacritic.com", "category": "media", "cookie_profile": "ga_consent"},
|
||||
{"name": "allrecipes.com", "category": "misc", "cookie_profile": "ga_consent_clarity"},
|
||||
{"name": "epicurious.com", "category": "misc", "cookie_profile": "ga_consent"},
|
||||
{"name": "tripadvisor.com", "category": "misc", "cookie_profile": "ga_consent_clarity"},
|
||||
{"name": "weather.com", "category": "reference", "cookie_profile": "ga_consent"},
|
||||
{"name": "timeanddate.com", "category": "reference", "cookie_profile": "ga_consent"},
|
||||
{"name": "thesaurus.com", "category": "reference", "cookie_profile": "ga_consent_clarity"},
|
||||
{"name": "kayak.com", "category": "shop", "cookie_profile": "ga_consent_clarity"},
|
||||
{"name": "booking.com", "category": "shop", "cookie_profile": "ga_consent_clarity"},
|
||||
{"name": "airbnb.com", "category": "shop", "cookie_profile": "ga_consent"}
|
||||
]
|
||||
}
|
||||
|
|
@ -0,0 +1,138 @@
|
|||
{
|
||||
"_comment": [
|
||||
"Per-class probability that a persona of a given gpu_class has visited each",
|
||||
"site in the pool. Used by the browsing_history node to derive a coherent",
|
||||
"visited-domain list per persona.",
|
||||
"",
|
||||
"Probabilities are tuned so each class samples ~15-30 sites on average",
|
||||
"(sum across all 50 entries falls in that range), giving an established-user",
|
||||
"look. Categories are biased by class:",
|
||||
" - workstation/high_end: higher P(dev) + high P(news/media)",
|
||||
" - mid_range: balanced",
|
||||
" - low_end/integrated_*: lower P(dev), higher P(shop/news/reference)",
|
||||
"",
|
||||
"A gpu_class missing from this table falls back to the mid_range row; a site missing from a row defaults to probability 0.3 (see derive_browsing_history)."
|
||||
],
|
||||
"table": {
|
||||
"workstation": {
|
||||
"youtube.com": 0.80, "wikipedia.org": 0.85, "mozilla.org": 0.70,
|
||||
"w3schools.com": 0.40, "mdn.io": 0.55, "duckduckgo.com": 0.45,
|
||||
"github.com": 0.95, "stackoverflow.com": 0.90, "npmjs.com": 0.65,
|
||||
"gitlab.com": 0.50, "pypi.org": 0.55, "docs.python.org": 0.60,
|
||||
"rust-lang.org": 0.35, "go.dev": 0.30,
|
||||
"amazon.com": 0.70, "ebay.com": 0.25, "etsy.com": 0.15,
|
||||
"bestbuy.com": 0.45, "target.com": 0.30,
|
||||
"nytimes.com": 0.55, "cnn.com": 0.40, "bbc.com": 0.55,
|
||||
"theguardian.com": 0.45, "reuters.com": 0.40, "apnews.com": 0.30,
|
||||
"washingtonpost.com": 0.40,
|
||||
"techcrunch.com": 0.65, "theverge.com": 0.60, "arstechnica.com": 0.65,
|
||||
"wired.com": 0.50, "engadget.com": 0.35, "9to5mac.com": 0.30,
|
||||
"medium.com": 0.55, "dev.to": 0.40, "reddit.com": 0.70,
|
||||
"news.ycombinator.com": 0.65, "quora.com": 0.20, "stackexchange.com": 0.60,
|
||||
"imdb.com": 0.45, "rottentomatoes.com": 0.25, "metacritic.com": 0.20,
|
||||
"allrecipes.com": 0.20, "epicurious.com": 0.15, "tripadvisor.com": 0.30,
|
||||
"weather.com": 0.55, "timeanddate.com": 0.30, "thesaurus.com": 0.25,
|
||||
"kayak.com": 0.30, "booking.com": 0.35, "airbnb.com": 0.30
|
||||
},
|
||||
"high_end": {
|
||||
"youtube.com": 0.85, "wikipedia.org": 0.80, "mozilla.org": 0.60,
|
||||
"w3schools.com": 0.45, "mdn.io": 0.45, "duckduckgo.com": 0.40,
|
||||
"github.com": 0.85, "stackoverflow.com": 0.80, "npmjs.com": 0.50,
|
||||
"gitlab.com": 0.40, "pypi.org": 0.45, "docs.python.org": 0.50,
|
||||
"rust-lang.org": 0.30, "go.dev": 0.25,
|
||||
"amazon.com": 0.75, "ebay.com": 0.30, "etsy.com": 0.20,
|
||||
"bestbuy.com": 0.50, "target.com": 0.35,
|
||||
"nytimes.com": 0.50, "cnn.com": 0.50, "bbc.com": 0.50,
|
||||
"theguardian.com": 0.40, "reuters.com": 0.35, "apnews.com": 0.30,
|
||||
"washingtonpost.com": 0.35,
|
||||
"techcrunch.com": 0.60, "theverge.com": 0.65, "arstechnica.com": 0.60,
|
||||
"wired.com": 0.50, "engadget.com": 0.40, "9to5mac.com": 0.35,
|
||||
"medium.com": 0.50, "dev.to": 0.35, "reddit.com": 0.75,
|
||||
"news.ycombinator.com": 0.55, "quora.com": 0.25, "stackexchange.com": 0.55,
|
||||
"imdb.com": 0.55, "rottentomatoes.com": 0.35, "metacritic.com": 0.30,
|
||||
"allrecipes.com": 0.25, "epicurious.com": 0.20, "tripadvisor.com": 0.30,
|
||||
"weather.com": 0.55, "timeanddate.com": 0.30, "thesaurus.com": 0.25,
|
||||
"kayak.com": 0.30, "booking.com": 0.40, "airbnb.com": 0.30
|
||||
},
|
||||
"mid_range": {
|
||||
"youtube.com": 0.85, "wikipedia.org": 0.75, "mozilla.org": 0.45,
|
||||
"w3schools.com": 0.40, "mdn.io": 0.30, "duckduckgo.com": 0.35,
|
||||
"github.com": 0.55, "stackoverflow.com": 0.55, "npmjs.com": 0.30,
|
||||
"gitlab.com": 0.25, "pypi.org": 0.25, "docs.python.org": 0.30,
|
||||
"rust-lang.org": 0.15, "go.dev": 0.15,
|
||||
"amazon.com": 0.80, "ebay.com": 0.40, "etsy.com": 0.30,
|
||||
"bestbuy.com": 0.55, "target.com": 0.40,
|
||||
"nytimes.com": 0.45, "cnn.com": 0.55, "bbc.com": 0.45,
|
||||
"theguardian.com": 0.35, "reuters.com": 0.30, "apnews.com": 0.30,
|
||||
"washingtonpost.com": 0.30,
|
||||
"techcrunch.com": 0.45, "theverge.com": 0.50, "arstechnica.com": 0.40,
|
||||
"wired.com": 0.45, "engadget.com": 0.35, "9to5mac.com": 0.30,
|
||||
"medium.com": 0.45, "dev.to": 0.25, "reddit.com": 0.70,
|
||||
"news.ycombinator.com": 0.30, "quora.com": 0.35, "stackexchange.com": 0.40,
|
||||
"imdb.com": 0.60, "rottentomatoes.com": 0.40, "metacritic.com": 0.35,
|
||||
"allrecipes.com": 0.35, "epicurious.com": 0.25, "tripadvisor.com": 0.40,
|
||||
"weather.com": 0.60, "timeanddate.com": 0.25, "thesaurus.com": 0.30,
|
||||
"kayak.com": 0.35, "booking.com": 0.45, "airbnb.com": 0.40
|
||||
},
|
||||
"low_end": {
|
||||
"youtube.com": 0.85, "wikipedia.org": 0.70, "mozilla.org": 0.35,
|
||||
"w3schools.com": 0.30, "mdn.io": 0.20, "duckduckgo.com": 0.30,
|
||||
"github.com": 0.30, "stackoverflow.com": 0.30, "npmjs.com": 0.15,
|
||||
"gitlab.com": 0.10, "pypi.org": 0.10, "docs.python.org": 0.15,
|
||||
"rust-lang.org": 0.05, "go.dev": 0.05,
|
||||
"amazon.com": 0.85, "ebay.com": 0.50, "etsy.com": 0.40,
|
||||
"bestbuy.com": 0.55, "target.com": 0.45,
|
||||
"nytimes.com": 0.40, "cnn.com": 0.60, "bbc.com": 0.40,
|
||||
"theguardian.com": 0.30, "reuters.com": 0.25, "apnews.com": 0.30,
|
||||
"washingtonpost.com": 0.25,
|
||||
"techcrunch.com": 0.30, "theverge.com": 0.35, "arstechnica.com": 0.25,
|
||||
"wired.com": 0.40, "engadget.com": 0.30, "9to5mac.com": 0.25,
|
||||
"medium.com": 0.35, "dev.to": 0.15, "reddit.com": 0.65,
|
||||
"news.ycombinator.com": 0.15, "quora.com": 0.45, "stackexchange.com": 0.25,
|
||||
"imdb.com": 0.65, "rottentomatoes.com": 0.45, "metacritic.com": 0.35,
|
||||
"allrecipes.com": 0.45, "epicurious.com": 0.30, "tripadvisor.com": 0.45,
|
||||
"weather.com": 0.65, "timeanddate.com": 0.25, "thesaurus.com": 0.35,
|
||||
"kayak.com": 0.35, "booking.com": 0.50, "airbnb.com": 0.40
|
||||
},
|
||||
"integrated_modern": {
|
||||
"youtube.com": 0.85, "wikipedia.org": 0.70, "mozilla.org": 0.40,
|
||||
"w3schools.com": 0.35, "mdn.io": 0.25, "duckduckgo.com": 0.35,
|
||||
"github.com": 0.40, "stackoverflow.com": 0.40, "npmjs.com": 0.20,
|
||||
"gitlab.com": 0.15, "pypi.org": 0.20, "docs.python.org": 0.20,
|
||||
"rust-lang.org": 0.10, "go.dev": 0.10,
|
||||
"amazon.com": 0.80, "ebay.com": 0.40, "etsy.com": 0.30,
|
||||
"bestbuy.com": 0.50, "target.com": 0.40,
|
||||
"nytimes.com": 0.40, "cnn.com": 0.55, "bbc.com": 0.45,
|
||||
"theguardian.com": 0.35, "reuters.com": 0.30, "apnews.com": 0.30,
|
||||
"washingtonpost.com": 0.30,
|
||||
"techcrunch.com": 0.40, "theverge.com": 0.45, "arstechnica.com": 0.30,
|
||||
"wired.com": 0.40, "engadget.com": 0.30, "9to5mac.com": 0.25,
|
||||
"medium.com": 0.40, "dev.to": 0.20, "reddit.com": 0.65,
|
||||
"news.ycombinator.com": 0.25, "quora.com": 0.40, "stackexchange.com": 0.35,
|
||||
"imdb.com": 0.60, "rottentomatoes.com": 0.40, "metacritic.com": 0.30,
|
||||
"allrecipes.com": 0.40, "epicurious.com": 0.25, "tripadvisor.com": 0.40,
|
||||
"weather.com": 0.60, "timeanddate.com": 0.25, "thesaurus.com": 0.30,
|
||||
"kayak.com": 0.35, "booking.com": 0.45, "airbnb.com": 0.40
|
||||
},
|
||||
"integrated_old": {
|
||||
"youtube.com": 0.75, "wikipedia.org": 0.65, "mozilla.org": 0.30,
|
||||
"w3schools.com": 0.20, "mdn.io": 0.10, "duckduckgo.com": 0.25,
|
||||
"github.com": 0.15, "stackoverflow.com": 0.20, "npmjs.com": 0.05,
|
||||
"gitlab.com": 0.05, "pypi.org": 0.05, "docs.python.org": 0.10,
|
||||
"rust-lang.org": 0.02, "go.dev": 0.02,
|
||||
"amazon.com": 0.85, "ebay.com": 0.55, "etsy.com": 0.45,
|
||||
"bestbuy.com": 0.55, "target.com": 0.50,
|
||||
"nytimes.com": 0.45, "cnn.com": 0.65, "bbc.com": 0.40,
|
||||
"theguardian.com": 0.30, "reuters.com": 0.25, "apnews.com": 0.35,
|
||||
"washingtonpost.com": 0.30,
|
||||
"techcrunch.com": 0.20, "theverge.com": 0.25, "arstechnica.com": 0.15,
|
||||
"wired.com": 0.30, "engadget.com": 0.20, "9to5mac.com": 0.20,
|
||||
"medium.com": 0.30, "dev.to": 0.05, "reddit.com": 0.55,
|
||||
"news.ycombinator.com": 0.05, "quora.com": 0.55, "stackexchange.com": 0.15,
|
||||
"imdb.com": 0.70, "rottentomatoes.com": 0.50, "metacritic.com": 0.35,
|
||||
"allrecipes.com": 0.55, "epicurious.com": 0.35, "tripadvisor.com": 0.50,
|
||||
"weather.com": 0.70, "timeanddate.com": 0.30, "thesaurus.com": 0.40,
|
||||
"kayak.com": 0.40, "booking.com": 0.55, "airbnb.com": 0.40
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -120,6 +120,11 @@ class Profile:
|
|||
webgl: WebGLProfile
|
||||
fonts: List[str]
|
||||
dark_theme: bool
|
||||
# Bayesian browsing-history: list of {name, category, cookie_profile}
|
||||
# dicts sampled from data/browsing_pool.json with per-class CPT. Used
|
||||
# by _recaptcha_seed.py to build a coherent cookie pre-seed when the
|
||||
# caller opts in via Stealthfox(prep_recaptcha=True).
|
||||
browsing_history: List[Dict[str, str]] = field(default_factory=list)
|
||||
_raw: Dict[str, Any] = field(default_factory=dict, repr=False, compare=False)
|
||||
|
||||
def to_prefs_dict(self) -> Dict[str, Any]:
|
||||
|
|
@ -255,5 +260,6 @@ def generate_profile(seed: int, pin: Optional[Dict[str, Any]] = None) -> Profile
|
|||
webgl=WebGLProfile(msaa_samples=int(raw["msaa_samples"])),
|
||||
fonts=fonts,
|
||||
dark_theme=bool(raw["dark_theme"]),
|
||||
browsing_history=list(raw.get("browsing_history") or []),
|
||||
_raw=raw,
|
||||
)
|
||||
|
|
|
|||
340
src/invisible_playwright/_recaptcha_seed.py
Normal file
340
src/invisible_playwright/_recaptcha_seed.py
Normal file
|
|
@ -0,0 +1,340 @@
|
|||
"""Deterministic reCAPTCHA cookie pre-seed.
|
||||
|
||||
Consumes the Bayesian-sampled `browsing_history` from the persona Profile
|
||||
(see `_fpforge/_sampler.py:derive_browsing_history`). For each visited
|
||||
site, builds 1-7 realistic cookies whose composition is chosen by the
|
||||
site's `cookie_profile` tag (analytics-only / consent / cloudflare-bot-
|
||||
management / etc.). All values seeded deterministically from the persona
|
||||
seed, so a given persona always presents the SAME cookies across sessions.
|
||||
|
||||
In addition, always seeds 5 cookies on .google.com (NID, CONSENT, SOCS,
|
||||
_GRECAPTCHA, ENID). Excludes 1P_JAR which was deprecated by Google in 2022
|
||||
— including it now is an anachronism flag.
|
||||
|
||||
Public API:
|
||||
await seed_recaptcha_cookies_async(context, profile, timezone=None)
|
||||
seed_recaptcha_cookies_sync(context, profile, timezone=None)
|
||||
|
||||
`profile` is an `_fpforge.Profile`; `timezone` is the IANA tz (e.g.
|
||||
"Europe/Rome") used to derive the CONSENT cookie's language token, so a
|
||||
European-tz persona gets CONSENT in their language not en+FX.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import datetime
|
||||
import random
|
||||
import time
|
||||
from typing import Any, List, Optional
|
||||
|
||||
# Character pools for the random cookie payloads.
# _B64_ALPHABET is the 64-symbol URL-safe base64 set (no "=" padding);
# _HEX_ALPHABET is lowercase hexadecimal.
_B64_ALPHABET = (
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789"
    "-_"
)
_HEX_ALPHABET = "0123456789" "abcdef"
|
||||
|
||||
|
||||
def _sub_seed(seed: int, tag: str) -> int:
    """Derive an independent 64-bit PRNG seed for one logical bucket.

    Runs an FNV-1a style mix over the bytes of ``tag``, starting from the
    FNV offset basis XOR-ed with the low 32 bits of ``seed``.

    Args:
        seed: persona seed; only its low 32 bits take part in the mix.
        tag: bucket label such as ``"google"`` or ``"dom:<site name>"``.
            Encoded as UTF-8 so a non-ASCII site name no longer raises
            ``UnicodeEncodeError``; ASCII tags hash exactly as before.

    Returns:
        A non-zero integer below 2**64. A zero hash is replaced by
        ``0xdeadbeef``.
    """
    h = 0xcbf29ce484222325 ^ (seed & 0xFFFFFFFF)
    for byte in tag.encode("utf-8"):
        h ^= byte
        h = (h * 0x100000001b3) & 0xFFFFFFFFFFFFFFFF
    return h or 0xdeadbeef
|
||||
|
||||
|
||||
def _b64_rand(rng: random.Random, length: int) -> str:
    """Return ``length`` characters drawn from ``_B64_ALPHABET``.

    Each character is one ``rng.choice`` call, so the output is fully
    determined by the state of ``rng``.
    """
    picks = [rng.choice(_B64_ALPHABET) for _ in range(length)]
    return "".join(picks)
|
||||
|
||||
|
||||
def _hex_rand(rng: random.Random, length: int) -> str:
    """Return ``length`` lowercase hex digits drawn from ``_HEX_ALPHABET``.

    Each digit is one ``rng.choice`` call, so the output is fully
    determined by the state of ``rng``.
    """
    digits = [rng.choice(_HEX_ALPHABET) for _ in range(length)]
    return "".join(digits)
|
||||
|
||||
|
||||
def _yyyymmdd_utc(ts: int) -> str:
    """Format a unix timestamp (seconds) as a ``YYYYMMDD`` date in UTC.

    Uses a timezone-aware conversion; ``datetime.utcfromtimestamp`` is
    deprecated since Python 3.12 and emits a ``DeprecationWarning`` there.
    """
    moment = datetime.datetime.fromtimestamp(ts, tz=datetime.timezone.utc)
    return moment.strftime("%Y%m%d")
|
||||
|
||||
|
||||
# IANA timezone -> (region_token, lang) used to build the CONSENT cookie's
# `<lang>+<REGION>+NNN` suffix. The European entries carry their country
# code; the Asian entries keep the generic `FX` region with a local
# language. Any timezone not listed here resolves to `en+FX+NNN`.
_TZ_TO_REGION = {
    # Europe
    "Europe/Rome": ("IT", "it"),
    "Europe/Berlin": ("DE", "de"),
    "Europe/Paris": ("FR", "fr"),
    "Europe/Madrid": ("ES", "es"),
    "Europe/London": ("GB", "en"),
    "Europe/Amsterdam": ("NL", "nl"),
    "Europe/Brussels": ("BE", "fr"),
    "Europe/Vienna": ("AT", "de"),
    "Europe/Zurich": ("CH", "de"),
    "Europe/Dublin": ("IE", "en"),
    "Europe/Lisbon": ("PT", "pt"),
    "Europe/Stockholm": ("SE", "sv"),
    "Europe/Oslo": ("NO", "no"),
    "Europe/Copenhagen": ("DK", "da"),
    "Europe/Helsinki": ("FI", "fi"),
    "Europe/Warsaw": ("PL", "pl"),
    "Europe/Prague": ("CZ", "cs"),
    "Europe/Athens": ("GR", "el"),
    # Asia
    "Asia/Tokyo": ("FX", "ja"),
    "Asia/Shanghai": ("FX", "zh"),
    "Asia/Hong_Kong": ("FX", "zh"),
    "Asia/Seoul": ("FX", "ko"),
}


def _consent_region_lang(timezone: Optional[str]) -> tuple:
    """Resolve an IANA timezone to ``(region_token, lang)`` for CONSENT.

    Returns the ``_TZ_TO_REGION`` entry when the timezone is listed, and
    ``("FX", "en")`` when it is ``None``, empty, or unknown.
    """
    fallback = ("FX", "en")
    if not timezone:
        return fallback
    return _TZ_TO_REGION.get(timezone, fallback)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# .google.com cookie batch (always present, regardless of browsing history)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _google_cookies(rng: random.Random, now: int,
                    timezone: Optional[str] = None) -> List[dict]:
    """Build the five ``.google.com`` cookies seeded for every persona.

    Returns NID, CONSENT, SOCS, _GRECAPTCHA and ENID, in that order. 1P_JAR
    is intentionally absent (Google deprecated it in 2022, so including it
    would look anachronistic).

    Args:
        rng: random source; draws happen in the order the cookies are listed.
        now: unix timestamp in seconds that all expiries are relative to.
        timezone: IANA timezone used to pick the CONSENT ``lang+region``
            token via ``_consent_region_lang``.
    """
    day = 86400

    def _entry(name, value, ttl_days, same_site, http_only=False):
        # Key order mirrors the literal layout: httpOnly (when present)
        # sits between "expires" and "secure".
        cookie = {"name": name,
                  "value": value,
                  "domain": ".google.com",
                  "path": "/",
                  "expires": now + ttl_days * day}
        if http_only:
            cookie["httpOnly"] = True
        cookie["secure"] = True
        cookie["sameSite"] = same_site
        return cookie

    consent_age = rng.randint(60, 720) * day
    region, lang = _consent_region_lang(timezone)

    # NID: the 3-digit prefix spans 100-540 to cover historical NID versions
    # (137, 105, 511, 525 etc. observed in real captures).
    nid_prefix = rng.randint(100, 540)
    nid_value = f"{nid_prefix}={_b64_rand(rng, 178)}"

    # CONSENT: YES+cb.<date>-<NN>-p<N>.<lang>+<region>+<NNN>
    consent_date = _yyyymmdd_utc(now - consent_age)
    consent_major = rng.randint(10, 19)
    consent_minor = rng.randint(0, 9)
    consent_serial = rng.randint(100, 999)
    consent_value = (f"YES+cb.{consent_date}-{consent_major:02d}-p{consent_minor}."
                     f"{lang}+{region}+{consent_serial}")

    socs_value = f"CAES{_b64_rand(rng, 56)}"
    grecaptcha_value = _b64_rand(rng, 124)
    enid_value = _b64_rand(rng, 252)

    return [
        _entry("NID", nid_value, 180, "None", http_only=True),
        _entry("CONSENT", consent_value, 395, "Lax"),
        _entry("SOCS", socs_value, 395, "Lax"),
        _entry("_GRECAPTCHA", grecaptcha_value, 180, "None"),
        _entry("ENID", enid_value, 395, "Lax", http_only=True),
    ]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Per-site cookie generators (recipes keyed by site["cookie_profile"])
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _norm_domain(domain: str) -> str:
    """Return ``domain`` with a leading dot, adding one only if missing."""
    if domain.startswith("."):
        return domain
    return "." + domain
|
||||
|
||||
|
||||
def _ga_cookie(rng: random.Random, now: int, domain: str) -> dict:
    """Build a ``_ga`` cookie of the form ``GA1.2.<9-digit id>.<unix ts>``.

    The timestamp is ``now`` minus a random 7-395 days; the cookie expires
    395 days after ``now``.
    """
    # Draw order (age, then id) is part of the deterministic output.
    age_seconds = rng.randint(7, 395) * 86400
    client_id = rng.randint(100000000, 999999999)
    return dict(
        name="_ga",
        value=f"GA1.2.{client_id}.{now - age_seconds}",
        domain=domain,
        path="/",
        expires=now + 395 * 86400,
        secure=True,
        sameSite="Lax",
    )
|
||||
|
||||
|
||||
def _gid_cookie(rng: random.Random, now: int, domain: str) -> dict:
    """Build a ``_gid`` cookie of the form ``GA1.2.<9-digit id>.<unix ts>``.

    The timestamp is ``now`` minus a random 60-86400 seconds; the cookie
    expires one day after ``now``.
    """
    # Draw order (id, then age) is part of the deterministic output.
    visitor_id = rng.randint(100000000, 999999999)
    set_at = now - rng.randint(60, 86400)
    return dict(
        name="_gid",
        value=f"GA1.2.{visitor_id}.{set_at}",
        domain=domain,
        path="/",
        expires=now + 86400,
        secure=True,
        sameSite="Lax",
    )
|
||||
|
||||
|
||||
def _cf_bm_cookie(rng: random.Random, now: int, domain: str) -> dict:
    """Build a ``__cf_bm`` cookie: ``<43 b64 chars>.<unix ts>-1-1-1-1``.

    The embedded timestamp is drawn between 1700000000 and ``now``; the
    cookie expires 1800 seconds after ``now``.

    The lower bound is clamped to ``now`` so that a ``now`` earlier than
    1700000000 (e.g. a pinned test clock) no longer makes ``randint`` raise
    ``ValueError`` on an empty range. For any later ``now`` the output is
    unchanged.
    """
    token = _b64_rand(rng, 43)
    earliest = min(1700000000, now)
    issued_at = rng.randint(earliest, now)
    return {"name": "__cf_bm",
            "value": f"{token}.{issued_at}-1-1-1-1",
            "domain": domain, "path": "/",
            "expires": now + 1800,
            "secure": True, "sameSite": "None"}
|
||||
|
||||
|
||||
def _onetrust_cookie(rng: random.Random, now: int, domain: str) -> dict:
    """Build an ``OptanonAlertBoxClosed`` (OneTrust) consent cookie.

    The value is the UTC time ``now`` minus a random 7-365 days, formatted
    as ``YYYY-MM-DDTHH:MM:SS.000Z``; the cookie expires 395 days after
    ``now``.

    Uses a timezone-aware conversion; ``datetime.utcfromtimestamp`` is
    deprecated since Python 3.12 and emits a ``DeprecationWarning`` there.
    """
    age_d = rng.randint(7, 365)
    closed_at = datetime.datetime.fromtimestamp(
        now - age_d * 86400, tz=datetime.timezone.utc
    )
    iso = closed_at.strftime("%Y-%m-%dT%H:%M:%S.000Z")
    return {"name": "OptanonAlertBoxClosed",
            "value": iso,
            "domain": domain, "path": "/",
            "expires": now + 395 * 86400,
            "secure": True, "sameSite": "Lax"}
|
||||
|
||||
|
||||
def _cookieyes_cookie(rng: random.Random, now: int, domain: str) -> dict:
    """Build a ``cookieyes-consent`` cookie with every category accepted.

    The value starts with ``consentid:`` plus 28 random base64 characters,
    followed by a fixed list of ``yes`` flags; the cookie expires 395 days
    after ``now``.
    """
    consent_id = _b64_rand(rng, 28)
    accepted = ",consent:yes,action:yes,necessary:yes,functional:yes,analytics:yes"
    return dict(
        name="cookieyes-consent",
        value="consentid:" + consent_id + accepted,
        domain=domain,
        path="/",
        expires=now + 395 * 86400,
        secure=True,
        sameSite="Lax",
    )
|
||||
|
||||
|
||||
def _clarity_cookie(rng: random.Random, now: int, domain: str) -> dict:
    """Build a ``_clck`` cookie: ``<8 hex>|2|f<NN>|0|<unix ts>``.

    The timestamp is ``now`` minus a random 60-180 days; the cookie expires
    365 days after ``now``.
    """
    # Draw order (hex id, two-digit field, age) is part of the output.
    user_hex = _hex_rand(rng, 8)
    two_digit = rng.randint(10, 99)
    stamped = now - rng.randint(60, 180) * 86400
    return dict(
        name="_clck",
        value=f"{user_hex}|2|f{two_digit}|0|{stamped}",
        domain=domain,
        path="/",
        expires=now + 365 * 86400,
        secure=True,
        sameSite="Lax",
    )
|
||||
|
||||
|
||||
def _fbp_cookie(rng: random.Random, now: int, domain: str) -> dict:
    """Build a Facebook Pixel ``_fbp`` cookie: ``fb.1.<unix ms>.<random int>``.

    The millisecond timestamp is ``now`` minus a random 60 seconds to
    30 days; the cookie expires 90 days after ``now``.
    """
    # Draw order (age, then random id) is part of the deterministic output.
    created_ms = (now - rng.randint(60, 30 * 86400)) * 1000
    pixel_id = rng.randint(100000000, 9999999999)
    return dict(
        name="_fbp",
        value=f"fb.1.{created_ms}.{pixel_id}",
        domain=domain,
        path="/",
        expires=now + 90 * 86400,
        secure=True,
        sameSite="Lax",
    )
|
||||
|
||||
|
||||
def _gtm_cookie(rng: random.Random, now: int, domain: str) -> dict:
    """Build a ``_dc_gtm_<container_id>`` Tag Manager throttle cookie.

    The container id is ``UA-<8 digits>-<1 digit>``, the value is always
    ``"1"`` and the cookie expires 60 seconds after ``now``.
    """
    account = rng.randint(10000000, 99999999)
    prop = rng.randint(1, 9)
    return dict(
        name=f"_dc_gtm_UA-{account}-{prop}",
        value="1",
        domain=domain,
        path="/",
        expires=now + 60,
        secure=True,
        sameSite="Lax",
    )
|
||||
|
||||
|
||||
def _hssrc_cookie(rng: random.Random, now: int, domain: str) -> dict:
    """Build a HubSpot ``__hssrc`` cookie holding a small integer (1-5).

    The cookie expires 1800 seconds after ``now``.
    """
    flag = rng.randint(1, 5)
    return dict(
        name="__hssrc",
        value=str(flag),
        domain=domain,
        path="/",
        expires=now + 1800,
        secure=True,
        sameSite="Lax",
    )
|
||||
|
||||
|
||||
def _cookies_for_profile(profile: str, rng: random.Random,
                         now: int, domain: str) -> List[dict]:
    """Turn a ``cookie_profile`` tag into the concrete cookies for one site.

    Recipes (every one starts with ``_ga``):
      - ``minimal`` / unknown tag: ``_ga`` only.
      - ``ga_only``: ``_gid``, plus a GTM cookie 30% of the time.
      - ``ga_cf``: ``__cf_bm``.
      - ``ga_consent``: ``_gid``, one consent cookie (OneTrust or
        CookieYes, 50/50), plus a GTM cookie 40% of the time.
      - ``ga_consent_clarity``: ``_gid``, ``_clck``, one consent cookie,
        then ``_fbp`` (50%), GTM (40%) and ``__hssrc`` (25%).

    ``domain`` is normalised to carry a leading dot. All randomness comes
    from ``rng``, so the result is deterministic for a given rng state.
    """
    domain = _norm_domain(domain)
    jar = [_ga_cookie(rng, now, domain)]

    def _consent_banner() -> dict:
        # The coin flip happens before the chosen generator draws from rng.
        maker = _onetrust_cookie if rng.random() < 0.5 else _cookieyes_cookie
        return maker(rng, now, domain)

    def _maybe(chance: float, maker) -> None:
        # Optional helper cookie: one rng.random() draw decides inclusion.
        if rng.random() < chance:
            jar.append(maker(rng, now, domain))

    if profile == "ga_only":
        jar.append(_gid_cookie(rng, now, domain))
        _maybe(0.3, _gtm_cookie)
    elif profile == "ga_cf":
        jar.append(_cf_bm_cookie(rng, now, domain))
    elif profile == "ga_consent":
        jar.append(_gid_cookie(rng, now, domain))
        jar.append(_consent_banner())
        _maybe(0.4, _gtm_cookie)
    elif profile == "ga_consent_clarity":
        # Heavy-tracking sites: GA + Clarity + consent + optional helpers.
        jar.append(_gid_cookie(rng, now, domain))
        jar.append(_clarity_cookie(rng, now, domain))
        jar.append(_consent_banner())
        _maybe(0.5, _fbp_cookie)
        _maybe(0.4, _gtm_cookie)
        _maybe(0.25, _hssrc_cookie)
    # "minimal" and any unrecognised tag keep just the _ga cookie.
    return jar
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Public builder
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def build_cookies(seed: int,
                  browsing_history: Optional[List[dict]] = None,
                  now: Optional[int] = None,
                  timezone: Optional[str] = None) -> List[dict]:
    """Assemble every pre-seed cookie for one persona.

    Args:
        seed: persona integer seed (from `Profile.seed`).
        browsing_history: list of {name, category, cookie_profile} dicts as
            sampled by `_fpforge.derive_browsing_history`. None or empty
            means only the 5 google cookies are returned.
        now: unix-seconds timestamp; the current time is used when omitted.
            Pin it in tests.
        timezone: IANA tz used to derive the CONSENT cookie's `lang+region`
            token (e.g. "Europe/Rome" → "it+IT",
            "America/New_York" → "en+FX").

    Returns:
        The google cookies followed by each site's cookies, in history order.
    """
    stamp = int(time.time()) if now is None else now
    base_seed = int(seed)

    # The .google.com batch comes first and draws from its own rng stream;
    # the CONSENT language token is derived from the timezone.
    google_rng = random.Random(_sub_seed(base_seed, "google"))
    jar: List[dict] = list(_google_cookies(google_rng, stamp, timezone=timezone))

    # Each site gets an rng keyed on (seed, domain), so its cookies do not
    # depend on where the site sits in the history list.
    for entry in browsing_history or ():
        host = entry["name"]
        site_rng = random.Random(_sub_seed(base_seed, f"dom:{host}"))
        recipe = entry.get("cookie_profile", "minimal")
        jar += _cookies_for_profile(recipe, site_rng, stamp, host)
    return jar
|
||||
|
||||
|
||||
def _extract_seed_and_history(profile: Any) -> tuple:
    """Normalize the accepted persona inputs into a ``(seed, history)`` pair.

    Accepted inputs:
      - a bare int seed (history is then empty);
      - a plain ``(seed, history)`` 2-tuple;
      - any object exposing ``seed`` and, optionally, ``browsing_history``
        (e.g. a Profile).

    Returns:
        ``(int seed, list history)``. The history is always a fresh list; a
        missing or falsy history becomes ``[]``.

    Raises:
        AttributeError: if ``profile`` is none of the above (no ``seed``).
    """
    if isinstance(profile, int):
        return int(profile), []
    # A plain 2-tuple is the documented (seed, history) form. Tuples that
    # carry a `seed` attribute (e.g. a namedtuple) take the attribute path.
    if (isinstance(profile, tuple) and len(profile) == 2
            and not hasattr(profile, "seed")):
        raw_seed, raw_history = profile
        return int(raw_seed), list(raw_history or [])
    seed = int(profile.seed)
    history = list(getattr(profile, "browsing_history", []) or [])
    return seed, history
|
||||
|
||||
|
||||
async def seed_recaptcha_cookies_async(context: Any, profile: Any,
                                       timezone: Optional[str] = None) -> None:
    """Async: inject deterministic persona cookies into the context.

    Args:
        context: object exposing an awaitable ``add_cookies(list)`` method.
        profile: any input accepted by ``_extract_seed_and_history``.
        timezone: IANA tz forwarded to ``build_cookies``.

    Injection is best-effort: a failure of ``add_cookies`` is logged at
    debug level and swallowed, so the caller is never interrupted.
    """
    import logging  # local import keeps this block self-contained

    seed, history = _extract_seed_and_history(profile)
    cookies = build_cookies(seed, history, timezone=timezone)
    try:
        await context.add_cookies(cookies)
    except Exception:
        # Deliberately non-fatal, but leave a trace so a rejected cookie
        # batch can be diagnosed instead of vanishing silently.
        logging.getLogger(__name__).debug(
            "reCAPTCHA cookie pre-seed failed; continuing without it",
            exc_info=True,
        )
|
||||
|
||||
|
||||
def seed_recaptcha_cookies_sync(context: Any, profile: Any,
                                timezone: Optional[str] = None) -> None:
    """Sync: inject deterministic persona cookies into the context.

    Args:
        context: object exposing a blocking ``add_cookies(list)`` method.
        profile: any input accepted by ``_extract_seed_and_history``.
        timezone: IANA tz forwarded to ``build_cookies``.

    Injection is best-effort: a failure of ``add_cookies`` is logged at
    debug level and swallowed, so the caller is never interrupted.
    """
    import logging  # local import keeps this block self-contained

    seed, history = _extract_seed_and_history(profile)
    cookies = build_cookies(seed, history, timezone=timezone)
    try:
        context.add_cookies(cookies)
    except Exception:
        # Deliberately non-fatal, but leave a trace so a rejected cookie
        # batch can be diagnosed instead of vanishing silently.
        logging.getLogger(__name__).debug(
            "reCAPTCHA cookie pre-seed failed; continuing without it",
            exc_info=True,
        )
|
||||
|
||||
|
||||
__all__ = [
|
||||
"build_cookies",
|
||||
"seed_recaptcha_cookies_async",
|
||||
"seed_recaptcha_cookies_sync",
|
||||
]
|
||||
|
|
@ -51,6 +51,7 @@ class InvisiblePlaywright:
|
|||
extra_prefs: Optional[Dict[str, Any]] = None,
|
||||
binary_path: Optional[str] = None,
|
||||
profile_dir: Optional[Union[str, Path]] = None,
|
||||
prep_recaptcha: bool = False,
|
||||
) -> None:
|
||||
# See sync launcher: `zoom.stealth.fpp.hw_seed` is int32_t — clamp.
|
||||
self.seed: int = int(seed) if seed is not None else secrets.randbits(31)
|
||||
|
|
@ -64,6 +65,8 @@ class InvisiblePlaywright:
|
|||
self._extra_prefs = extra_prefs
|
||||
self._binary_path = binary_path
|
||||
self._profile_dir: Optional[Path] = Path(profile_dir) if profile_dir else None
|
||||
# reCAPTCHA pre-seed gated server-side; respect persistent profile.
|
||||
self._prep_recaptcha = bool(prep_recaptcha) and self._profile_dir is None
|
||||
self._profile: Profile = generate_profile(self.seed, pin=self._pin)
|
||||
self._pw: Optional[Playwright] = None
|
||||
self._browser: Optional[Browser] = None
|
||||
|
|
@ -124,12 +127,18 @@ class InvisiblePlaywright:
|
|||
def _patch_new_context_defaults(self, browser: Browser) -> None:
    """Swap ``browser.new_context`` for an async wrapper.

    The wrapper layers the caller's keyword arguments over the launcher
    defaults, applies ``_patch_new_page_sleep`` to the new context and, when
    reCAPTCHA pre-seeding is enabled, injects the persona cookies before
    handing the context back.
    """
    create_context = browser.new_context
    base_kwargs = self._default_context_kwargs()
    seed_cookies = self._prep_recaptcha
    persona = self._profile  # whole Profile: seed + browsing_history
    zone = self._timezone  # drives the CONSENT lang+region token

    async def patched(**kw):
        # Caller-supplied kwargs override the defaults.
        ctx = await create_context(**dict(base_kwargs, **kw))
        _patch_new_page_sleep(ctx)
        if not seed_cookies:
            return ctx
        from ._recaptcha_seed import seed_recaptcha_cookies_async
        await seed_recaptcha_cookies_async(ctx, persona, timezone=zone)
        return ctx

    browser.new_context = patched  # type: ignore[assignment]
|
||||
|
|
|
|||
|
|
@ -113,6 +113,7 @@ class InvisiblePlaywright:
|
|||
extra_prefs: Optional[Dict[str, Any]] = None,
|
||||
binary_path: Optional[str] = None,
|
||||
profile_dir: Optional[Union[str, Path]] = None,
|
||||
prep_recaptcha: bool = False,
|
||||
) -> None:
|
||||
"""
|
||||
Args:
|
||||
|
|
@ -166,6 +167,10 @@ class InvisiblePlaywright:
|
|||
self._extra_prefs = extra_prefs
|
||||
self._binary_path = binary_path
|
||||
self._profile_dir: Optional[Path] = Path(profile_dir) if profile_dir else None
|
||||
# reCAPTCHA cookie pre-seed — opt-in. Gated server-side: if a
|
||||
# persistent profile_dir is in use, respect its existing cookies
|
||||
# and DON'T enable pre-seed (the profile owns its own state).
|
||||
self._prep_recaptcha = bool(prep_recaptcha) and self._profile_dir is None
|
||||
self._profile: Profile = generate_profile(self.seed, pin=self._pin)
|
||||
self._pw: Optional[Playwright] = None
|
||||
self._browser: Optional[Browser] = None
|
||||
|
|
@ -240,12 +245,18 @@ class InvisiblePlaywright:
|
|||
"""
|
||||
original = browser.new_context
|
||||
defaults = self._default_context_kwargs()
|
||||
prep = self._prep_recaptcha
|
||||
profile = self._profile # pass the whole Profile (seed + browsing_history)
|
||||
tz = self._timezone # used by _recaptcha_seed for CONSENT lang+region
|
||||
|
||||
def patched(**kw):
|
||||
merged = dict(defaults)
|
||||
merged.update(kw) # user-supplied wins
|
||||
ctx = original(**merged)
|
||||
_patch_sync_new_page_sleep(ctx)
|
||||
if prep:
|
||||
from ._recaptcha_seed import seed_recaptcha_cookies_sync
|
||||
seed_recaptcha_cookies_sync(ctx, profile, timezone=tz)
|
||||
return ctx
|
||||
|
||||
browser.new_context = patched # type: ignore[assignment]
|
||||
|
|
|
|||
|
|
@ -306,17 +306,6 @@ def test_navigator_oscpu_matches_userAgent(page):
|
|||
assert "Mac" in oscpu
|
||||
|
||||
|
||||
@pytest.mark.e2e
|
||||
def test_userAgent_contains_appVersion_chromium_only(page):
|
||||
"""Chromium invariant: UA contains appVersion. Firefox uses a short
|
||||
appVersion form so the check is gated on `'chrome' in window`."""
|
||||
if not _ev(page, "'chrome' in window"):
|
||||
pytest.skip("Chromium-only invariant")
|
||||
ua = _ev(page, "navigator.userAgent")
|
||||
av = _ev(page, "navigator.appVersion")
|
||||
assert av in ua
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# 5. Native function self-toString (creepjs/src/lies/index.ts hasKnownToString)
|
||||
# ===========================================================================
|
||||
|
|
|
|||
349
tests/test_recaptcha_seed.py
Normal file
349
tests/test_recaptcha_seed.py
Normal file
|
|
@ -0,0 +1,349 @@
|
|||
"""Unit tests for the deterministic reCAPTCHA cookie builder.
|
||||
|
||||
Validates the contract:
|
||||
- 5 .google.com cookies always present (NID, CONSENT, SOCS, _GRECAPTCHA, ENID)
|
||||
- Per-site cookies built from a `browsing_history` list (sampled by the
|
||||
Bayesian network in _fpforge)
|
||||
- Determinism: same (seed, history) → identical content
|
||||
- Chrome 400-day cookie cap respected
|
||||
- Playwright add_cookies field requirements satisfied
|
||||
"""
|
||||
import pytest
|
||||
|
||||
from invisible_playwright._recaptcha_seed import (
|
||||
build_cookies,
|
||||
_sub_seed,
|
||||
)
|
||||
|
||||
|
||||
pytestmark = pytest.mark.unit
|
||||
|
||||
|
||||
_FIXED_NOW = 1779600000 # 2026-05-23, frozen for determinism
|
||||
|
||||
|
||||
# Sample browsing history for tests (mimics what _fpforge produces).
|
||||
_SAMPLE_HISTORY = [
|
||||
{"name": "github.com", "category": "dev", "cookie_profile": "ga_cf"},
|
||||
{"name": "stackoverflow.com", "category": "dev", "cookie_profile": "ga_consent_clarity"},
|
||||
{"name": "amazon.com", "category": "shop", "cookie_profile": "ga_consent_clarity"},
|
||||
{"name": "wikipedia.org", "category": "reference", "cookie_profile": "minimal"},
|
||||
{"name": "youtube.com", "category": "media", "cookie_profile": "ga_only"},
|
||||
]
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# 1. Set composition
|
||||
# ===========================================================================
|
||||
|
||||
def test_only_google_cookies_when_no_history():
    """With no browsing history the jar holds exactly the five .google.com
    cookies (1P_JAR is intentionally absent: deprecated by Google in 2022)."""
    jar = build_cookies(seed=42, browsing_history=None, now=_FIXED_NOW)
    expected = sorted(["NID", "CONSENT", "SOCS", "_GRECAPTCHA", "ENID"])
    assert sorted(cookie["name"] for cookie in jar) == expected
    for cookie in jar:
        assert cookie["domain"] == ".google.com"
|
||||
|
||||
|
||||
def test_browsing_history_adds_host_cookies():
    """Every site in the history shows up with at least one cookie on its
    own dot-prefixed domain, alongside the five google cookies."""
    jar = build_cookies(seed=42, browsing_history=_SAMPLE_HISTORY, now=_FIXED_NOW)
    google_count = sum(1 for cookie in jar if cookie["domain"] == ".google.com")
    assert google_count == 5  # 1P_JAR removed

    site_domains = {cookie["domain"] for cookie in jar} - {".google.com"}
    for entry in _SAMPLE_HISTORY:
        assert "." + entry["name"] in site_domains
|
||||
|
||||
|
||||
def test_domain_dot_prefix_normalized():
    """Every cookie domain carries a leading dot (sub-domain coverage)."""
    for cookie in build_cookies(seed=42, browsing_history=_SAMPLE_HISTORY,
                                now=_FIXED_NOW):
        domain = cookie["domain"]
        assert domain.startswith("."), f"missing dot: {domain}"
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# 2. Cookie profile recipes (each profile yields the expected cookie set)
|
||||
# ===========================================================================
|
||||
|
||||
def test_profile_minimal_yields_ga_only():
    """The `minimal` recipe puts a single `_ga` cookie on the site domain."""
    jar = build_cookies(
        seed=42,
        browsing_history=[{"name": "x.com", "cookie_profile": "minimal"}],
        now=_FIXED_NOW,
    )
    site_names = [cookie["name"] for cookie in jar if cookie["domain"] == ".x.com"]
    assert site_names == ["_ga"]
|
||||
|
||||
|
||||
def test_profile_ga_only_yields_ga_and_gid():
    """`ga_only` always yields `_ga` + `_gid`.

    The recipe may also add one `_dc_gtm_*` Tag Manager helper depending on
    the rng draw, so that helper is tolerated here. Asserting exact equality
    would make the test pass or fail purely on the seed chosen.
    """
    history = [{"name": "x.com", "cookie_profile": "ga_only"}]
    cookies = build_cookies(seed=42, browsing_history=history, now=_FIXED_NOW)
    names = [c["name"] for c in cookies if c["domain"] == ".x.com"]
    core = [n for n in names if not n.startswith("_dc_gtm_")]
    assert sorted(core) == ["_ga", "_gid"]
    # At most one optional GTM helper rides along.
    assert len(names) - len(core) <= 1
|
||||
|
||||
|
||||
def test_profile_ga_cf_yields_ga_and_cf_bm():
    """The `ga_cf` recipe yields exactly `_ga` and `__cf_bm`."""
    jar = build_cookies(
        seed=42,
        browsing_history=[{"name": "x.com", "cookie_profile": "ga_cf"}],
        now=_FIXED_NOW,
    )
    site_names = [cookie["name"] for cookie in jar if cookie["domain"] == ".x.com"]
    site_names.sort()
    assert site_names == ["__cf_bm", "_ga"]
|
||||
|
||||
|
||||
def test_profile_ga_consent_yields_three_cookies():
    """`ga_consent` always yields `_ga` + `_gid` + one consent-banner cookie.

    The recipe may also add one `_dc_gtm_*` Tag Manager helper depending on
    the rng draw. That helper is excluded from the count of three so the
    assertion holds for any seed, not just one where the helper is absent.
    """
    history = [{"name": "x.com", "cookie_profile": "ga_consent"}]
    cookies = build_cookies(seed=42, browsing_history=history, now=_FIXED_NOW)
    names = [c["name"] for c in cookies if c["domain"] == ".x.com"]
    # Always _ga + _gid + one of OneTrust|CookieYes
    assert "_ga" in names and "_gid" in names
    assert any(n in names for n in ("OptanonAlertBoxClosed", "cookieyes-consent"))
    gtm_helpers = [n for n in names if n.startswith("_dc_gtm_")]
    assert len(gtm_helpers) <= 1
    assert len(names) - len(gtm_helpers) == 3
|
||||
|
||||
|
||||
def test_profile_ga_consent_clarity_yields_at_least_four_cookies():
    """Baseline is _ga + _gid + _clck + one consent banner cookie; _fbp,
    _dc_gtm_* and __hssrc are rng-dependent extras (covered by
    test_new_helper_cookies_*)."""
    jar = build_cookies(
        seed=42,
        browsing_history=[{"name": "x.com", "cookie_profile": "ga_consent_clarity"}],
        now=_FIXED_NOW,
    )
    site_names = [cookie["name"] for cookie in jar if cookie["domain"] == ".x.com"]
    for required in ("_ga", "_gid", "_clck"):
        assert required in site_names
    banners = ("OptanonAlertBoxClosed", "cookieyes-consent")
    assert any(banner in site_names for banner in banners)
    assert len(site_names) >= 4  # 4 baseline + 0-3 helpers
|
||||
|
||||
|
||||
def test_unknown_profile_falls_back_to_ga():
    """An unrecognized cookie_profile tag degrades to a lone `_ga` cookie."""
    jar = build_cookies(
        seed=42,
        browsing_history=[{"name": "x.com", "cookie_profile": "nonexistent_profile"}],
        now=_FIXED_NOW,
    )
    site_names = [cookie["name"] for cookie in jar if cookie["domain"] == ".x.com"]
    assert site_names == ["_ga"]
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# 3. Determinism
|
||||
# ===========================================================================
|
||||
|
||||
def test_same_seed_and_history_same_content():
    """Two builds with identical inputs produce identical cookie lists."""
    first, second = [
        build_cookies(seed=42, browsing_history=_SAMPLE_HISTORY, now=_FIXED_NOW)
        for _ in range(2)
    ]
    assert first == second
|
||||
|
||||
|
||||
def test_different_seed_different_content():
    """Changing the seed changes the NID cookie value."""
    def nid_value(seed):
        jar = build_cookies(seed=seed, browsing_history=_SAMPLE_HISTORY,
                            now=_FIXED_NOW)
        return next(cookie for cookie in jar if cookie["name"] == "NID")["value"]

    assert nid_value(42) != nid_value(99)
|
||||
|
||||
|
||||
def test_history_order_does_not_affect_domain_specific_cookies():
    """Per-site values depend on the domain name, not the list position."""
    def site_values(history):
        # Map (domain, name) -> value for every non-google cookie.
        jar = build_cookies(seed=42, browsing_history=history, now=_FIXED_NOW)
        return {
            (cookie["domain"], cookie["name"]): cookie["value"]
            for cookie in jar
            if cookie["domain"] != ".google.com"
        }

    github, stackoverflow = _SAMPLE_HISTORY[0], _SAMPLE_HISTORY[1]
    forward = site_values([github, stackoverflow])
    backward = site_values([stackoverflow, github])
    assert forward == backward
|
||||
|
||||
|
||||
def test_sub_seed_distinct_tags_distinct_streams():
    """Different tags give different sub-seeds, and seed 0 does not collapse
    to a zero sub-seed."""
    google = _sub_seed(42, "google")
    github = _sub_seed(42, "dom:github.com")
    amazon = _sub_seed(42, "dom:amazon.com")
    assert google != github
    assert github != amazon
    assert _sub_seed(0, "any") != 0  # seed=0 still produces non-zero sub-seed
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# 4. Format / structural correctness for the Google batch
|
||||
# ===========================================================================
|
||||
|
||||
def test_nid_format():
    """NID is `<3-digit prefix>=<178-char payload>`, prefix within 100-540."""
    jar = build_cookies(seed=42, now=_FIXED_NOW)
    value = next(cookie for cookie in jar if cookie["name"] == "NID")["value"]
    pieces = value.split("=", 1)
    prefix, payload = pieces
    assert prefix.isdigit() and len(prefix) == 3
    # Range was broadened to 100-540 to cover historical NID versions.
    assert 100 <= int(prefix) <= 540
    assert len(payload) == 178
|
||||
|
||||
|
||||
def test_consent_format():
    """Without a timezone, CONSENT starts with `YES+cb.` and carries the
    `+FX+` region token."""
    jar = build_cookies(seed=42, now=_FIXED_NOW)
    value = next(cookie for cookie in jar if cookie["name"] == "CONSENT")["value"]
    assert value.startswith("YES+cb.")
    assert "+FX+" in value
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# 5. Chrome 400-day cookie cap compliance
|
||||
# ===========================================================================
|
||||
|
||||
def test_all_expiries_within_400_day_cap():
    """No long-lived cookie may expire more than 400 days after `now`.

    Chrome 104+ caps cookie expiry at 400 days; anything beyond that is
    silently truncated or dropped. Short-lived telemetry cookies are exempt
    from the check.
    """
    jar = build_cookies(seed=42, browsing_history=_SAMPLE_HISTORY, now=_FIXED_NOW)
    limit = _FIXED_NOW + 400 * 86400
    exempt = ("__cf_bm", "1P_JAR", "_gid")  # short-lived telemetry
    long_lived = [cookie for cookie in jar if cookie["name"] not in exempt]
    for cookie in long_lived:
        assert cookie["expires"] <= limit, (
            f"Cookie {cookie['name']} expires {cookie['expires'] - _FIXED_NOW}s "
            f"(> 400d cap) — would be silently dropped"
        )
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# 6. Playwright add_cookies field requirements
|
||||
# ===========================================================================
|
||||
|
||||
def test_all_cookies_have_required_playwright_fields():
    """Each cookie has a name, a non-None value, a domain and path "/"."""
    jar = build_cookies(seed=42, browsing_history=_SAMPLE_HISTORY, now=_FIXED_NOW)
    for cookie in jar:
        name = cookie.get("name")
        assert name, f"missing name: {cookie}"
        assert cookie.get("value") is not None, f"missing value: {cookie}"
        assert cookie.get("domain"), f"missing domain: {cookie}"
        assert cookie.get("path") == "/", f"path != / for {cookie['name']}"
|
||||
|
||||
|
||||
def test_modern_cookies_marked_secure():
    """Any cookie with sameSite=None must also be secure=True, as
    Firefox/Chrome require; this also matters for cookies injected through
    Playwright add_cookies without a navigation context."""
    jar = build_cookies(seed=42, browsing_history=_SAMPLE_HISTORY, now=_FIXED_NOW)
    for cookie in jar:
        if cookie.get("sameSite") != "None":
            continue
        assert cookie.get("secure") is True, f"{cookie['name']} None+!secure invalid"
|
||||
|
||||
|
||||
def test_httponly_on_signed_cookies():
    """NID and ENID are both flagged httpOnly."""
    jar = build_cookies(seed=42, now=_FIXED_NOW)
    for signed_name in ("NID", "ENID"):
        cookie = next(c for c in jar if c["name"] == signed_name)
        assert cookie.get("httpOnly") is True
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# 7. End-to-end with real fpforge Profile
|
||||
# ===========================================================================
|
||||
|
||||
def test_with_real_fpforge_profile():
    """End-to-end: generate a real Profile, ensure browsing_history is
    populated and build_cookies works against it.

    The final lower bound counts 5 google cookies (1P_JAR is no longer
    generated) plus at least one cookie per visited site.
    """
    from invisible_playwright._fpforge import generate_profile
    prof = generate_profile(seed=42)
    assert isinstance(prof.browsing_history, list)
    # Loose sanity bounds around the sampler's per-persona site count.
    assert 5 <= len(prof.browsing_history) <= 50, \
        f"unexpected history length: {len(prof.browsing_history)}"
    # Each entry has the expected fields
    for site in prof.browsing_history:
        assert "name" in site and "category" in site and "cookie_profile" in site
    # build_cookies works against the real profile
    cookies = build_cookies(seed=prof.seed, browsing_history=prof.browsing_history,
                            now=_FIXED_NOW)
    # 5 google + at least 1 cookie per visited site. Using 6 here would fail
    # spuriously whenever every site contributes exactly one cookie.
    assert len(cookies) >= 5 + len(prof.browsing_history)
|
||||
|
||||
|
||||
def test_same_seed_same_browsing_history_via_fpforge():
    """Generating the profile twice from one seed gives the same history."""
    from invisible_playwright._fpforge import generate_profile
    first, second = [generate_profile(seed=42).browsing_history for _ in range(2)]
    assert first == second
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# 8. Realism improvements (2026-05-24 round 2)
|
||||
# ===========================================================================
|
||||
|
||||
def test_no_1p_jar_cookie():
    """1P_JAR must never be emitted: Google deprecated it in 2022, so its
    presence would be an anachronism for fingerprinters that check cookie
    freshness."""
    jar = build_cookies(seed=42, browsing_history=_SAMPLE_HISTORY, now=_FIXED_NOW)
    assert all(cookie["name"] != "1P_JAR" for cookie in jar)
|
||||
|
||||
|
||||
def test_nid_prefix_broadened_range():
    """Across 200 seeds the NID 3-digit prefix dips below 500 at least once
    and never exceeds 540 (historical versions such as 137/105/511/525 are
    covered by the 100-540 range)."""
    def nid_prefix(seed):
        jar = build_cookies(seed=seed, now=_FIXED_NOW)
        value = next(cookie for cookie in jar if cookie["name"] == "NID")["value"]
        return int(value.split("=", 1)[0])

    seen_prefixes = {nid_prefix(seed) for seed in range(200)}
    assert min(seen_prefixes) < 500, f"NID range never goes below 500 ({sorted(seen_prefixes)[:5]})"
    assert max(seen_prefixes) <= 540
|
||||
|
||||
|
||||
def test_consent_lang_from_timezone_eu():
    """Europe/Rome maps the CONSENT `lang+region` token to it+IT."""
    jar = build_cookies(seed=42, now=_FIXED_NOW, timezone="Europe/Rome")
    value = next(cookie for cookie in jar if cookie["name"] == "CONSENT")["value"]
    assert ".it+IT+" in value, f"expected it+IT in: {value}"
|
||||
|
||||
|
||||
def test_consent_lang_default_fx():
    """A US timezone falls back to the non-EU default token `en+FX`."""
    jar = build_cookies(seed=42, now=_FIXED_NOW, timezone="America/New_York")
    value = next(cookie for cookie in jar if cookie["name"] == "CONSENT")["value"]
    assert ".en+FX+" in value
|
||||
|
||||
|
||||
def test_consent_lang_de_for_berlin():
    """Europe/Berlin maps the CONSENT token to de+DE."""
    jar = build_cookies(seed=42, now=_FIXED_NOW, timezone="Europe/Berlin")
    value = next(cookie for cookie in jar if cookie["name"] == "CONSENT")["value"]
    assert ".de+DE+" in value
|
||||
|
||||
|
||||
def test_consent_lang_no_timezone_default():
    """Omitting the timezone yields the default en+FX token."""
    jar = build_cookies(seed=42, now=_FIXED_NOW)
    value = next(cookie for cookie in jar if cookie["name"] == "CONSENT")["value"]
    assert ".en+FX+" in value
|
||||
|
||||
|
||||
def test_new_helper_cookies_appear_in_ga_consent_clarity():
    """Over 100 seeds the ga_consent_clarity recipe emits each optional
    helper (_fbp, _dc_gtm_*, __hssrc) at least once."""
    history = [{"name": "site.com", "cookie_profile": "ga_consent_clarity"}]
    observed = set()
    for seed in range(100):
        jar = build_cookies(seed=seed, browsing_history=history, now=_FIXED_NOW)
        observed.update(
            cookie["name"] for cookie in jar if cookie["domain"] == ".site.com"
        )
    assert "_fbp" in observed, "_fbp never appeared in 100 seeds (rng pick broken)"
    assert any(name.startswith("_dc_gtm_") for name in observed), \
        "_dc_gtm_* never appeared in 100 seeds"
    assert "__hssrc" in observed, "__hssrc never appeared in 100 seeds"
|
||||
|
||||
|
||||
def test_fbp_format():
    """_fbp format: fb.<idx>.<unix_ms>.<random_int>"""
    history = [{"name": "x.com", "cookie_profile": "ga_consent_clarity"}]
    # _fbp is optional per seed, so scan seeds until one includes it.
    for seed in range(20):
        jar = build_cookies(seed=seed, browsing_history=history, now=_FIXED_NOW)
        hit = next((cookie for cookie in jar if cookie["name"] == "_fbp"), None)
        if not hit:
            continue
        fields = hit["value"].split(".")
        assert fields[0] == "fb"
        assert fields[1].isdigit()
        assert fields[2].isdigit() and len(fields[2]) >= 13  # unix ms
        assert fields[3].isdigit()
        return
    raise AssertionError("never got _fbp across 20 seeds — distribution broken")
|
||||
Loading…
Add table
Add a link
Reference in a new issue