mirror of
https://github.com/feder-cr/invisible_playwright.git
synced 2026-06-16 09:05:13 +02:00
359 lines
16 KiB
Python
359 lines
16 KiB
Python
|
|
# -*- coding: utf-8 -*-
|
|||
|
|
"""stealth_forge — Bayesian fingerprint generator for Firefox 150 Windows.
|
|||
|
|
|
|||
|
|
Everything the Firefox build exposes to JS (screen, hardwareConcurrency,
|
|||
|
|
WebGL, audio, MSAA, theme, media codecs) is sampled from a Bayesian network
|
|||
|
|
with coherent cross-field dependencies. Identity (userAgent, platform,
|
|||
|
|
oscpu, webdriver=false, maxTouchPoints=0) is locked by the compiled build.
|
|||
|
|
|
|||
|
|
Graph:
|
|||
|
|
|
|||
|
|
gpu (root, 444 real Windows ANGLE renderers)
|
|||
|
|
│
|
|||
|
|
└─> gpu_class (deterministic classifier, 6 classes)
|
|||
|
|
├─> hw_concurrency (CPT per class)
|
|||
|
|
├─> screen (w/h/dpr/av) (CPT per class)
|
|||
|
|
└─> msaa_samples (CPT per class)
|
|||
|
|
|
|||
|
|
audio (root, joint rate+latency+channels — marginal)
|
|||
|
|
dark_theme (marginal)
|
|||
|
|
av1_enabled (marginal)
|
|||
|
|
webm_encoder_enabled (marginal)
|
|||
|
|
|
|||
|
|
font_exclude ← deterministic hash of stealth_seed (seed-derived)
|
|||
|
|
|
|||
|
|
CPTs live in `data/*.json` (easy to tune without code changes).
|
|||
|
|
Sampling is deterministic per stealth_seed via a private random.Random.
|
|||
|
|
"""
|
|||
|
|
import json
|
|||
|
|
import os
|
|||
|
|
import re
|
|||
|
|
from typing import Any, Dict
|
|||
|
|
|
|||
|
|
from ._network import Network, Node
|
|||
|
|
|
|||
|
|
_HERE = os.path.dirname(os.path.abspath(__file__))
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _load(filename: str) -> Any:
|
|||
|
|
with open(os.path.join(_HERE, "data", filename), "r", encoding="utf-8") as f:
|
|||
|
|
return json.load(f)
|
|||
|
|
|
|||
|
|
|
|||
|
|
# ═══════════════════════════════════════════════════════════════════════
|
|||
|
|
# LOCKED IDENTITY (compiled into our Firefox 150 build — never varies)
|
|||
|
|
# ═══════════════════════════════════════════════════════════════════════
|
|||
|
|
_LOCKED: Dict[str, Any] = {
|
|||
|
|
"user_agent": (
|
|||
|
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:150.0) "
|
|||
|
|
"Gecko/20100101 Firefox/150.0.1"
|
|||
|
|
),
|
|||
|
|
"platform": "Win32",
|
|||
|
|
"oscpu": "Windows NT 10.0; Win64; x64",
|
|||
|
|
"app_code_name": "Mozilla",
|
|||
|
|
"app_version": "5.0 (Windows)",
|
|||
|
|
"product_sub": "20100101",
|
|||
|
|
"webdriver": False,
|
|||
|
|
"max_touch_points": 0,
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
|
|||
|
|
# ═══════════════════════════════════════════════════════════════════════
|
|||
|
|
# DATA
|
|||
|
|
# ═══════════════════════════════════════════════════════════════════════
|
|||
|
|
_GPU_POOL = _load("webgl_renderer_pool.json")["entries"]
|
|||
|
|
# hwc/screen/storage now keyed on (gpu_class, intra_tier) for triangulation
|
|||
|
|
_CPT_HWC = _load("cpt_hwc_given_class_tier.json")["table"]
|
|||
|
|
_CPT_SCREEN = _load("cpt_screen_given_class_tier.json")["table"]
|
|||
|
|
_CPT_STORAGE = _load("cpt_storage_given_class_tier.json")["table"]
|
|||
|
|
# Hidden tier variable that makes hwc/screen/storage jointly coherent
|
|||
|
|
_CPT_INTRA_TIER = _load("cpt_intra_tier_given_class.json")["table"]
|
|||
|
|
# MSAA depends on (gpu_class, screen_tier) — 4K gaming → MSAA=0, 1080p+GPU → MSAA=4
|
|||
|
|
_CPT_MSAA = _load("cpt_msaa_given_class_screen.json")["table"]
|
|||
|
|
# Codec unchanged
|
|||
|
|
_CPT_CODEC = _load("cpt_codec_given_class.json")["table"]
|
|||
|
|
# Audio now conditional on gpu_class (workstation → pro audio, old → 44.1kHz onboard)
|
|||
|
|
_CPT_AUDIO = _load("cpt_audio_given_class.json")["table"]
|
|||
|
|
_INDEP = _load("priors_independent.json")
|
|||
|
|
_FONT_POOL = _load("font_pool.json")
|
|||
|
|
# Each entry is a dict {"name": "<lowercase family>", "factor": float}.
|
|||
|
|
# - name: the font family advertised to the page.
|
|||
|
|
# - factor: per-family width scale used by the consumer to make the family
|
|||
|
|
# detectable by width-diff probes.
|
|||
|
|
# Core = always-included; Optional = sampled with P(font | gpu_class).
|
|||
|
|
_FONT_CORE: list = _FONT_POOL["core"]
|
|||
|
|
_FONT_OPTIONAL: list = _FONT_POOL["optional"]
|
|||
|
|
_CPT_FONTS_OPT = _load("cpt_fonts_optional_given_class.json")["table"]
|
|||
|
|
|
|||
|
|
|
|||
|
|
# ═══════════════════════════════════════════════════════════════════════
|
|||
|
|
# GPU CLASSIFIER (deterministic function of gpu → gpu_class)
|
|||
|
|
# ═══════════════════════════════════════════════════════════════════════
|
|||
|
|
_GPU_CLASSES = (
|
|||
|
|
"integrated_old", "integrated_modern", "low_end",
|
|||
|
|
"mid_range", "high_end", "workstation",
|
|||
|
|
)
|
|||
|
|
|
|||
|
|
|
|||
|
|
def classify_gpu(gpu_value: Dict[str, str]) -> str:
|
|||
|
|
"""Deterministic: maps (renderer, vendor) dict to one of 6 classes.
|
|||
|
|
|
|||
|
|
See data/cpt_*.json — each CPT table has an entry for every class.
|
|||
|
|
"""
|
|||
|
|
r = gpu_value.get("renderer", "")
|
|||
|
|
|
|||
|
|
if re.search(r"Intel.*HD Graphics (3000|4000|2500)", r):
|
|||
|
|
return "integrated_old"
|
|||
|
|
if re.search(
|
|||
|
|
r"Intel.*(HD Graphics (4[56]|5\d\d|6\d\d)|UHD Graphics|Graphics Family|Iris|Arc)",
|
|||
|
|
r,
|
|||
|
|
):
|
|||
|
|
return "integrated_modern"
|
|||
|
|
if re.search(
|
|||
|
|
r"AMD.*(Radeon(\(TM\))? (Graphics|6\d\dM|7\d\dM|8\d\dM)|Vega [0-9]|"
|
|||
|
|
r"Renoir|Rembrandt|TM Graphics)",
|
|||
|
|
r, re.IGNORECASE,
|
|||
|
|
):
|
|||
|
|
return "integrated_modern"
|
|||
|
|
|
|||
|
|
# NVIDIA: Firefox SanitizeRenderer.cpp collapses every GeForce into one of
|
|||
|
|
# 3 vintage buckets (8800 GTX / GTX 480 / GTX 980). The renderer string
|
|||
|
|
# exposed to JS is therefore vintage; pairing it with modern cores/screen
|
|||
|
|
# creates an internal mismatch that FP Pro's tampering_ml flags. We pick
|
|||
|
|
# `low_end` for all 3 buckets so cores stay 4-12 and screen 1080-1440p,
|
|||
|
|
# consistent with what a real user with each of those (vintage) cards
|
|||
|
|
# would have. Workstation overrides keep their high-tier classification.
|
|||
|
|
if re.search(
|
|||
|
|
r"(GeForce (8\d\d\d?|9\d\d\d?|GTX 980|GTX 480|GT 1030|GT 710|GT 730|"
|
|||
|
|
r"GT 220|GT 240|210|310)|Quadro K\d|Radeon HD [1234]\d\d\d)", r,
|
|||
|
|
):
|
|||
|
|
return "low_end"
|
|||
|
|
|
|||
|
|
# NVIDIA discrete (any other GeForce — should be rare after the pool was
|
|||
|
|
# collapsed to the 3 sanitize buckets, but kept as a safety net).
|
|||
|
|
m = re.search(r"GeForce\s+(?:GTX\s+|RTX\s+)?(\d{3,4})", r)
|
|||
|
|
if m:
|
|||
|
|
if "Quadro" in r or "Workstation" in r:
|
|||
|
|
return "workstation"
|
|||
|
|
# Anything that survives the sanitize collapse stays low_end to avoid
|
|||
|
|
# the modern-cores/vintage-renderer pairing.
|
|||
|
|
return "low_end"
|
|||
|
|
|
|||
|
|
# AMD discrete
|
|||
|
|
m = re.search(r"Radeon[^0-9]*(\d{3,4})", r)
|
|||
|
|
if m:
|
|||
|
|
n = int(m.group(1))
|
|||
|
|
if "FirePro" in r or "Radeon Pro" in r:
|
|||
|
|
return "workstation"
|
|||
|
|
if n >= 5700:
|
|||
|
|
return "high_end"
|
|||
|
|
if 5500 <= n <= 5600 or 580 <= n <= 590:
|
|||
|
|
return "mid_range"
|
|||
|
|
return "low_end"
|
|||
|
|
|
|||
|
|
# Fallback
|
|||
|
|
return "mid_range"
|
|||
|
|
|
|||
|
|
|
|||
|
|
# ═══════════════════════════════════════════════════════════════════════
|
|||
|
|
# NETWORK CONSTRUCTION
|
|||
|
|
# ═══════════════════════════════════════════════════════════════════════
|
|||
|
|
# Build once at import — the network is stateless, only the RNG varies.
|
|||
|
|
|
|||
|
|
def _gpu_marginal():
|
|||
|
|
"""Build marginal distribution over GPU pool (uniform for now)."""
|
|||
|
|
n = len(_GPU_POOL)
|
|||
|
|
p = 1.0 / n
|
|||
|
|
return [{"value": g, "prob": p} for g in _GPU_POOL]
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _cpt_from_table(table: Dict[str, Any]) -> Dict[str, list]:
|
|||
|
|
"""CPT for conditional nodes: `{class_name: [{value, prob}, ...]}`."""
|
|||
|
|
return dict(table)
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _screen_tier(ctx):
|
|||
|
|
"""Classify screen width into tier for (gpu_class, screen_tier) CPTs."""
|
|||
|
|
s = ctx.get("screen", {}) or {}
|
|||
|
|
w = int(s.get("w", 1920))
|
|||
|
|
h = int(s.get("h", 1080))
|
|||
|
|
# Ultrawide: aspect ratio > 2.1 (e.g. 3440x1440, 5120x1440)
|
|||
|
|
if h > 0 and (w / h) > 2.1:
|
|||
|
|
return "ultrawide"
|
|||
|
|
if w <= 1920:
|
|||
|
|
return "1080p"
|
|||
|
|
if w <= 2560:
|
|||
|
|
return "1440p"
|
|||
|
|
if w <= 3840:
|
|||
|
|
return "2160p"
|
|||
|
|
return "ultrawide"
|
|||
|
|
|
|||
|
|
|
|||
|
|
_NETWORK = Network([
|
|||
|
|
Node("gpu", parents=[], cpt=_gpu_marginal()),
|
|||
|
|
Node("gpu_class", parents=["gpu"], classifier=lambda ctx: classify_gpu(ctx["gpu"])),
|
|||
|
|
# Hidden variable: within a gpu_class, user's OTHER components (RAM, SSD,
|
|||
|
|
# cores, screen) correlate — a 'premium' mid_range user has more cores,
|
|||
|
|
# larger SSD, higher-res screen than a 'budget' mid_range user. Without
|
|||
|
|
# this, hwc/screen/storage would be independent given gpu_class (noisy).
|
|||
|
|
Node("intra_tier", parents=["gpu_class"], cpt=_cpt_from_table(_CPT_INTRA_TIER)),
|
|||
|
|
# hwc/screen/storage now jointly coherent via (gpu_class, intra_tier).
|
|||
|
|
Node("hw_concurrency", parents=["gpu_class", "intra_tier"],
|
|||
|
|
cpt=_cpt_from_table(_CPT_HWC)),
|
|||
|
|
Node("screen", parents=["gpu_class", "intra_tier"],
|
|||
|
|
cpt=_cpt_from_table(_CPT_SCREEN)),
|
|||
|
|
# Derive screen_tier from screen for msaa parent lookup.
|
|||
|
|
Node("screen_tier", parents=["screen"], classifier=_screen_tier),
|
|||
|
|
# MSAA: realistic combo (4K + high_end GPU → MSAA=0 due to perf cost;
|
|||
|
|
# 1080p + high_end → MSAA=4 common; 1080p + integrated → MSAA=0).
|
|||
|
|
Node("msaa_samples", parents=["gpu_class", "screen_tier"],
|
|||
|
|
cpt=_cpt_from_table(_CPT_MSAA)),
|
|||
|
|
# Joint codec distribution (gpu_class only).
|
|||
|
|
Node("codec", parents=["gpu_class"], cpt=_cpt_from_table(_CPT_CODEC)),
|
|||
|
|
# Storage quota: coherent within gpu_class × intra_tier (premium workstation
|
|||
|
|
# user → 2-3TB SSD; budget workstation user → 512GB; budget integrated_old
|
|||
|
|
# → 128GB).
|
|||
|
|
Node("storage_quota_mb", parents=["gpu_class", "intra_tier"],
|
|||
|
|
cpt=_cpt_from_table(_CPT_STORAGE)),
|
|||
|
|
# Audio: pro users (workstation) → 48/96kHz 6-8ch; old onboard → 44.1kHz
|
|||
|
|
# 2ch high latency. Workstation GPU + 44.1kHz mono was previously
|
|||
|
|
# implausible; now blocked by the CPT.
|
|||
|
|
Node("audio", parents=["gpu_class"], cpt=_cpt_from_table(_CPT_AUDIO)),
|
|||
|
|
Node("dark_theme", parents=[], cpt=_INDEP["dark_theme"]["table"]),
|
|||
|
|
])
|
|||
|
|
|
|||
|
|
|
|||
|
|
# ═══════════════════════════════════════════════════════════════════════
|
|||
|
|
# FONT WHITELIST (Bayesian: core ∪ sampled_optional | gpu_class)
|
|||
|
|
# ═══════════════════════════════════════════════════════════════════════
|
|||
|
|
# Semantic flip: previously exclude-list (block N probed fonts per seed).
|
|||
|
|
# Now whitelist (browser sees ONLY these fonts, everything else hidden).
|
|||
|
|
# Core (~112): always included — fresh Win11 + Office 2021 English.
|
|||
|
|
# Optional (~40): sampled per-session with P(present | gpu_class). Gives
|
|||
|
|
# small realistic variance (~3-8 optional fonts differ per session) while
|
|||
|
|
# keeping the profile strongly centered on 'typical Windows user'.
|
|||
|
|
|
|||
|
|
|
|||
|
|
def derive_font_prefs(gpu_class: str, rng) -> Dict[str, str]:
|
|||
|
|
"""Build COHERENT whitelist + metrics strings for the session.
|
|||
|
|
|
|||
|
|
Sampling:
|
|||
|
|
- Core fonts always included.
|
|||
|
|
- Optional fonts sampled with P(font | gpu_class) from the CPT table.
|
|||
|
|
|
|||
|
|
Returns:
|
|||
|
|
{
|
|||
|
|
"whitelist": "arial,calibri,marlett,...",
|
|||
|
|
"metrics": "arial|0.978,calibri|0.934,marlett|0.855,..."
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
The whitelist is the list of font families to advertise. The metrics
|
|||
|
|
string encodes per-family width scale factors that the consumer can
|
|||
|
|
use to make each family detectable by width-diff font probes.
|
|||
|
|
|
|||
|
|
Each entry in font_pool.json carries its own {name, factor} pair so the
|
|||
|
|
two pref strings are GUARANTEED coherent — no chance of a fabricated
|
|||
|
|
font with factor 1.0 (undetectable) or a metrics entry for a font not
|
|||
|
|
in the whitelist (useless).
|
|||
|
|
|
|||
|
|
Markers & add-new-font: simply add an entry to font_pool.json:core (with
|
|||
|
|
a factor at least 4% away from 1.0) — no special-case code needed.
|
|||
|
|
"""
|
|||
|
|
cpt = _CPT_FONTS_OPT.get(gpu_class)
|
|||
|
|
if cpt is None:
|
|||
|
|
cpt = _CPT_FONTS_OPT["integrated_modern"]
|
|||
|
|
included: list = list(_FONT_CORE) # always present
|
|||
|
|
for entry in _FONT_OPTIONAL:
|
|||
|
|
name = entry["name"]
|
|||
|
|
p = cpt.get(name, 0.7) # default 0.7 if CPT has no row for this font
|
|||
|
|
if rng.random() < p:
|
|||
|
|
included.append(entry)
|
|||
|
|
# Deterministic ordering: sort by name
|
|||
|
|
included.sort(key=lambda e: e["name"])
|
|||
|
|
whitelist = ",".join(e["name"] for e in included)
|
|||
|
|
metrics = ",".join(
|
|||
|
|
f'{e["name"]}|{e["factor"]:.3f}' for e in included
|
|||
|
|
)
|
|||
|
|
return {"whitelist": whitelist, "metrics": metrics}
|
|||
|
|
|
|||
|
|
|
|||
|
|
# Back-compat shim: legacy callers still import derive_font_whitelist.
|
|||
|
|
def derive_font_whitelist(gpu_class: str, rng) -> str:
|
|||
|
|
return derive_font_prefs(gpu_class, rng)["whitelist"]
|
|||
|
|
|
|||
|
|
|
|||
|
|
# ═══════════════════════════════════════════════════════════════════════
|
|||
|
|
# PUBLIC API: Forge
|
|||
|
|
# ═══════════════════════════════════════════════════════════════════════
|
|||
|
|
import random
|
|||
|
|
|
|||
|
|
|
|||
|
|
class Forge:
|
|||
|
|
"""Fingerprint forge — single seed → coherent bundle."""
|
|||
|
|
|
|||
|
|
def __init__(self, seed: int):
|
|||
|
|
self.seed = int(seed)
|
|||
|
|
self._rng = random.Random(self.seed)
|
|||
|
|
|
|||
|
|
def sample(self) -> Dict[str, Any]:
|
|||
|
|
bundle = _NETWORK.sample(self._rng)
|
|||
|
|
gpu = bundle["gpu"]
|
|||
|
|
screen = bundle["screen"]
|
|||
|
|
audio = bundle["audio"]
|
|||
|
|
codec = bundle["codec"]
|
|||
|
|
return {
|
|||
|
|
# Seed tracking
|
|||
|
|
"stealth_seed": self.seed,
|
|||
|
|
# Locked identity
|
|||
|
|
**_LOCKED,
|
|||
|
|
# GPU (coherent pair from 444 pool)
|
|||
|
|
"webgl_renderer": gpu["renderer"],
|
|||
|
|
"webgl_vendor": gpu["vendor"],
|
|||
|
|
"gpu_class": bundle["gpu_class"],
|
|||
|
|
# Hidden-variable debug metadata (not a Firefox pref, just for
|
|||
|
|
# analysis / test result correlation tracking)
|
|||
|
|
"intra_tier": bundle["intra_tier"],
|
|||
|
|
"screen_tier": bundle["screen_tier"],
|
|||
|
|
# Screen (coherent with GPU class)
|
|||
|
|
"screen_w": int(screen["w"]),
|
|||
|
|
"screen_h": int(screen["h"]),
|
|||
|
|
"screen_avail_w": int(screen.get("aw", screen["w"])),
|
|||
|
|
"screen_avail_h": int(screen.get("ah", screen["h"] - 40)),
|
|||
|
|
"dpr": float(screen["dpr"]),
|
|||
|
|
# Hardware (coherent with GPU class)
|
|||
|
|
"hw_concurrency": int(bundle["hw_concurrency"]),
|
|||
|
|
# WebGL MSAA (coherent with GPU class)
|
|||
|
|
"msaa_samples": int(bundle["msaa_samples"]),
|
|||
|
|
# Audio (independent joint)
|
|||
|
|
"audio_sample_rate": int(audio["rate"]),
|
|||
|
|
"audio_output_latency_ms": int(audio["latency"]),
|
|||
|
|
"audio_max_channel_count": int(audio["channels"]),
|
|||
|
|
# Codec prefs (joint, coherent with GPU class). All 5 are
|
|||
|
|
# JS-visible: av1/webm_encoder via canPlayType/MediaRecorder,
|
|||
|
|
# mediasource_* via MediaSource.isTypeSupported, webspeech_synth
|
|||
|
|
# via 'speechSynthesis' in window (CreepJS voices probe).
|
|||
|
|
"av1_enabled": bool(codec["av1_enabled"]),
|
|||
|
|
"webm_encoder_enabled": bool(codec["webm_encoder_enabled"]),
|
|||
|
|
"mediasource_webm": bool(codec["mediasource_webm"]),
|
|||
|
|
"mediasource_mp4": bool(codec["mediasource_mp4"]),
|
|||
|
|
"webspeech_synth": bool(codec["webspeech_synth"]),
|
|||
|
|
# Storage quota MB (coherent with GPU class — workstation larger SSDs).
|
|||
|
|
"storage_quota_mb": int(bundle["storage_quota_mb"]),
|
|||
|
|
# Independent marginals
|
|||
|
|
"dark_theme": int(bundle["dark_theme"]),
|
|||
|
|
# Bayesian font prefs (coherent pair: whitelist + per-family
|
|||
|
|
# width scale metrics, both sampled from the same font_pool.json
|
|||
|
|
# and conditioned on gpu_class).
|
|||
|
|
**{
|
|||
|
|
f"font_{k}": v
|
|||
|
|
for k, v in derive_font_prefs(
|
|||
|
|
bundle["gpu_class"], self._rng
|
|||
|
|
).items()
|
|||
|
|
},
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
|
|||
|
|
def sample(seed: int) -> Dict[str, Any]:
|
|||
|
|
"""Convenience: `Forge(seed).sample()`."""
|
|||
|
|
return Forge(seed).sample()
|