mirror of
https://github.com/feder-cr/invisible_playwright.git
synced 2026-06-07 08:35:12 +02:00
Stealthfox — a patched Firefox 150.0.1 for browser-fingerprint stealth,
shipped as a Playwright-compatible Python wrapper.
* Sync + async Stealthfox launcher (firefox_user_prefs, virtual desktop
on Windows, SOCKS5 auth via patched nsProtocolProxyService)
* fpforge: Bayesian fingerprint sampler over GPU / audio / fonts /
screen / TCP options / ~400 other navigator fields
* WebRTC stealth: srflx address swap, synthetic srflx fallback,
private-LAN host candidates — no real public IP leak via STUN
* GPU sandbox fix for FF150 alt-desktop regression
* Bezier-curve mouse motion baked into Juggler
Targets Windows x86_64 + Linux x86_64. Binary fetched on first run from
GitHub Release "firefox-1".
358 lines
16 KiB
Python
358 lines
16 KiB
Python
# -*- coding: utf-8 -*-
|
||
"""stealth_forge — Bayesian fingerprint generator for Firefox 150 Windows.
|
||
|
||
Everything the Firefox build exposes to JS (screen, hardwareConcurrency,
|
||
WebGL, audio, MSAA, theme, media codecs) is sampled from a Bayesian network
|
||
with coherent cross-field dependencies. Identity (userAgent, platform,
|
||
oscpu, webdriver=false, maxTouchPoints=0) is locked by the compiled build.
|
||
|
||
Graph:

  gpu (root, 444 real Windows ANGLE renderers)
   │
   └─> gpu_class (deterministic classifier, 6 classes)
         ├─> intra_tier                  (hidden tier, CPT per class)
         │     ├─> hw_concurrency        (CPT per class × intra_tier)
         │     ├─> screen (w/h/dpr/av)   (CPT per class × intra_tier)
         │     │     └─> screen_tier     (deterministic, from screen w/h)
         │     │           └─> msaa_samples (CPT per class × screen_tier)
         │     └─> storage_quota_mb      (CPT per class × intra_tier)
         ├─> codec (joint av1 / webm_encoder / mediasource / webspeech,
         │          CPT per class)
         └─> audio (joint rate+latency+channels, CPT per class)

  dark_theme (marginal)

  font whitelist + metrics ← core fonts ∪ optional fonts sampled with
                             P(font | gpu_class) from the session RNG
|
||
|
||
CPTs live in `data/*.json` (easy to tune without code changes).
|
||
Sampling is deterministic per stealth_seed via a private random.Random.
|
||
"""
|
||
import json
|
||
import os
|
||
import re
|
||
from typing import Any, Dict
|
||
|
||
from ._network import Network, Node
|
||
|
||
_HERE = os.path.dirname(os.path.abspath(__file__))
|
||
|
||
|
||
def _load(filename: str) -> Any:
    """Parse ``data/<filename>`` (located next to this module) as UTF-8 JSON.

    Returns whatever ``json.load`` produces for the file's top-level value.
    """
    data_path = os.path.join(_HERE, "data", filename)
    with open(data_path, "r", encoding="utf-8") as handle:
        return json.load(handle)
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════════════════
|
||
# LOCKED IDENTITY (compiled into our Firefox 150 build — never varies)
|
||
# ═══════════════════════════════════════════════════════════════════════
|
||
# Identity fields that are identical in every bundle. Forge.sample() splices
# this mapping into its result right after "stealth_seed", so the key order
# here is the order in which these fields appear in the output.
_LOCKED: Dict[str, Any] = {
    "user_agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:150.0) Gecko/20100101 Firefox/150.0.1",
    "platform": "Win32",
    "oscpu": "Windows NT 10.0; Win64; x64",
    "app_code_name": "Mozilla",
    "app_version": "5.0 (Windows)",
    "product_sub": "20100101",
    "webdriver": False,
    "max_touch_points": 0,
}
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════════════════
|
||
# DATA
|
||
# ═══════════════════════════════════════════════════════════════════════
|
||
# All tables below are read once at import time from the JSON files in data/.

# Pool of GPU entries used as the values of the root `gpu` node. Forge.sample()
# reads "renderer" and "vendor" from the sampled entry.
_GPU_POOL = _load("webgl_renderer_pool.json")["entries"]
# hw_concurrency / screen / storage CPTs, used by the nodes whose parents are
# (gpu_class, intra_tier).
_CPT_HWC = _load("cpt_hwc_given_class_tier.json")["table"]
_CPT_SCREEN = _load("cpt_screen_given_class_tier.json")["table"]
_CPT_STORAGE = _load("cpt_storage_given_class_tier.json")["table"]
# Hidden tier variable that makes hwc/screen/storage jointly coherent
# (CPT of intra_tier given gpu_class).
_CPT_INTRA_TIER = _load("cpt_intra_tier_given_class.json")["table"]
# MSAA depends on (gpu_class, screen_tier) — 4K gaming → MSAA=0, 1080p+GPU → MSAA=4
_CPT_MSAA = _load("cpt_msaa_given_class_screen.json")["table"]
# Joint codec distribution, conditioned on gpu_class only.
_CPT_CODEC = _load("cpt_codec_given_class.json")["table"]
# Audio is conditional on gpu_class (workstation → pro audio, old → 44.1kHz onboard)
_CPT_AUDIO = _load("cpt_audio_given_class.json")["table"]
# Parent-less priors; only the "dark_theme" table is used by the network below.
_INDEP = _load("priors_independent.json")
_FONT_POOL = _load("font_pool.json")
# Each entry is a dict {"name": "<lowercase family>", "factor": float}.
#   - name:   the font family advertised to the page.
#   - factor: per-family width scale used by the consumer to make the family
#             detectable by width-diff probes.
# Core = always-included; Optional = sampled with P(font | gpu_class).
_FONT_CORE: list = _FONT_POOL["core"]
_FONT_OPTIONAL: list = _FONT_POOL["optional"]
# Per-class inclusion probabilities for the optional fonts:
# {gpu_class: {font_name: probability}} as consumed by derive_font_prefs().
_CPT_FONTS_OPT = _load("cpt_fonts_optional_given_class.json")["table"]
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════════════════
|
||
# GPU CLASSIFIER (deterministic function of gpu → gpu_class)
|
||
# ═══════════════════════════════════════════════════════════════════════
|
||
# The six class labels that classify_gpu() can return.
_GPU_CLASSES = (
    "integrated_old",
    "integrated_modern",
    "low_end",
    "mid_range",
    "high_end",
    "workstation",
)
|
||
|
||
|
||
def classify_gpu(gpu_value: Dict[str, str]) -> str:
    """Deterministically map a GPU pool entry to one of the six GPU classes.

    Only the ``"renderer"`` string of ``gpu_value`` is inspected. Rules are
    tried in order and the first match wins:

    1. Intel HD Graphics 2500/3000/4000            -> ``integrated_old``
    2. other Intel iGPUs, AMD APU graphics         -> ``integrated_modern``
    3. vintage GeForce / Quadro K / Radeon HD 1-4k -> ``low_end``
    4. any other numbered GeForce                  -> ``low_end``
       (``workstation`` if the string also mentions Quadro/Workstation)
    5. numbered Radeon                             -> ``workstation`` /
       ``high_end`` / ``mid_range`` / ``low_end`` by model number
    6. anything else                               -> ``mid_range``

    Args:
        gpu_value: pool entry; a missing or ``None`` renderer is treated as
            an empty string and therefore lands on the fallback class.

    Returns:
        One of the labels listed in ``_GPU_CLASSES``.

    See data/cpt_*.json — each CPT table has an entry for every class.
    """
    # `or ""` (rather than a .get default) also covers an explicit None value,
    # which would otherwise raise TypeError inside re.search.
    r = gpu_value.get("renderer") or ""

    if re.search(r"Intel.*HD Graphics (3000|4000|2500)", r):
        return "integrated_old"

    if re.search(
        r"Intel.*(HD Graphics (4[56]|5\d\d|6\d\d)|UHD Graphics|Graphics Family|Iris|Arc)",
        r,
    ):
        return "integrated_modern"

    if re.search(
        r"AMD.*(Radeon(\(TM\))? (Graphics|6\d\dM|7\d\dM|8\d\dM)|Vega [0-9]|"
        r"Renoir|Rembrandt|TM Graphics)",
        r, re.IGNORECASE,
    ):
        return "integrated_modern"

    # NVIDIA: Firefox SanitizeRenderer.cpp collapses every GeForce into one of
    # 3 vintage buckets (8800 GTX / GTX 480 / GTX 980). The renderer string
    # exposed to JS is therefore vintage; pairing it with modern cores/screen
    # creates an internal mismatch that FP Pro's tampering_ml flags. We pick
    # `low_end` for all 3 buckets so cores stay 4-12 and screen 1080-1440p,
    # consistent with what a real user with each of those (vintage) cards
    # would have.
    if re.search(
        r"(GeForce (8\d\d\d?|9\d\d\d?|GTX 980|GTX 480|GT 1030|GT 710|GT 730|"
        r"GT 220|GT 240|210|310)|Quadro K\d|Radeon HD [1234]\d\d\d)", r,
    ):
        return "low_end"

    # NVIDIA discrete (any other GeForce — should be rare after the pool was
    # collapsed to the 3 sanitize buckets, but kept as a safety net). Only the
    # presence of a match matters here; the model number itself is not used.
    if re.search(r"GeForce\s+(?:GTX\s+|RTX\s+)?(\d{3,4})", r):
        if "Quadro" in r or "Workstation" in r:
            return "workstation"
        # Anything that survives the sanitize collapse stays low_end to avoid
        # the modern-cores/vintage-renderer pairing.
        return "low_end"

    # AMD discrete: classify by the first 3-4 digit number after "Radeon".
    amd = re.search(r"Radeon[^0-9]*(\d{3,4})", r)
    if amd:
        if "FirePro" in r or "Radeon Pro" in r:
            return "workstation"
        n = int(amd.group(1))
        if n >= 5700:
            return "high_end"
        if 5500 <= n <= 5600 or 580 <= n <= 590:
            return "mid_range"
        return "low_end"

    # Fallback for renderers that match none of the rules above.
    return "mid_range"
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════════════════
|
||
# NETWORK CONSTRUCTION
|
||
# ═══════════════════════════════════════════════════════════════════════
|
||
# Build once at import — the network is stateless, only the RNG varies.
|
||
|
||
def _gpu_marginal():
    """Return the root distribution over ``_GPU_POOL`` as ``{value, prob}`` rows.

    Every pool entry currently receives the same weight, ``1 / len(pool)``.
    """
    weight = 1.0 / len(_GPU_POOL)
    return [{"value": entry, "prob": weight} for entry in _GPU_POOL]
|
||
|
||
|
||
def _cpt_from_table(table: Dict[str, Any]) -> Dict[str, list]:
    """Return a shallow ``dict`` copy of a loaded CPT table.

    The table is expected to look like ``{class_name: [{value, prob}, ...]}``;
    only the outer mapping is copied, the row lists are shared.
    """
    shallow_copy = dict(table)
    return shallow_copy
|
||
|
||
|
||
def _screen_tier(ctx):
    """Bucket the sampled screen into the tier label used by the MSAA CPT.

    Reads ``ctx["screen"]`` (``w`` / ``h``, defaulting to 1920x1080 when the
    screen or a dimension is absent) and returns one of ``"1080p"``,
    ``"1440p"``, ``"2160p"`` or ``"ultrawide"``.
    """
    dims = ctx.get("screen", {}) or {}
    width = int(dims.get("w", 1920))
    height = int(dims.get("h", 1080))

    # Aspect ratio is checked first: anything wider than 2.1:1
    # (e.g. 3440x1440, 5120x1440) is ultrawide regardless of pixel width.
    if height > 0 and (width / height) > 2.1:
        return "ultrawide"

    # Otherwise bucket by width, smallest ceiling first.
    for ceiling, label in ((1920, "1080p"), (2560, "1440p"), (3840, "2160p")):
        if width <= ceiling:
            return label

    # Wider than 3840 without an ultrawide aspect ratio still maps here.
    return "ultrawide"
|
||
|
||
|
||
# The Bayesian network sampled by Forge.sample(). Every node lists its parents
# before it appears; NOTE(review): Network presumably samples nodes in list
# order, so reordering entries may change which values a given seed produces —
# confirm against _network.Network before touching the order.
_NETWORK = Network([
    # Root: one entry of the GPU pool, uniform weights.
    Node("gpu", parents=[], cpt=_gpu_marginal()),
    # Deterministic: class label computed from the sampled GPU entry.
    Node("gpu_class", parents=["gpu"], classifier=lambda ctx: classify_gpu(ctx["gpu"])),
    # Hidden variable: within a gpu_class, user's OTHER components (RAM, SSD,
    # cores, screen) correlate — a 'premium' mid_range user has more cores,
    # larger SSD, higher-res screen than a 'budget' mid_range user. Without
    # this, hwc/screen/storage would be independent given gpu_class (noisy).
    Node("intra_tier", parents=["gpu_class"], cpt=_cpt_from_table(_CPT_INTRA_TIER)),
    # hwc/screen/storage now jointly coherent via (gpu_class, intra_tier).
    Node("hw_concurrency", parents=["gpu_class", "intra_tier"],
         cpt=_cpt_from_table(_CPT_HWC)),
    Node("screen", parents=["gpu_class", "intra_tier"],
         cpt=_cpt_from_table(_CPT_SCREEN)),
    # Derive screen_tier from screen for msaa parent lookup.
    Node("screen_tier", parents=["screen"], classifier=_screen_tier),
    # MSAA: realistic combo (4K + high_end GPU → MSAA=0 due to perf cost;
    # 1080p + high_end → MSAA=4 common; 1080p + integrated → MSAA=0).
    Node("msaa_samples", parents=["gpu_class", "screen_tier"],
         cpt=_cpt_from_table(_CPT_MSAA)),
    # Joint codec distribution (gpu_class only).
    Node("codec", parents=["gpu_class"], cpt=_cpt_from_table(_CPT_CODEC)),
    # Storage quota: coherent within gpu_class × intra_tier (premium workstation
    # user → 2-3TB SSD; budget workstation user → 512GB; budget integrated_old
    # → 128GB).
    Node("storage_quota_mb", parents=["gpu_class", "intra_tier"],
         cpt=_cpt_from_table(_CPT_STORAGE)),
    # Audio: pro users (workstation) → 48/96kHz 6-8ch; old onboard → 44.1kHz
    # 2ch high latency. Workstation GPU + 44.1kHz mono was previously
    # implausible; now blocked by the CPT.
    Node("audio", parents=["gpu_class"], cpt=_cpt_from_table(_CPT_AUDIO)),
    # Parent-less marginal taken straight from priors_independent.json.
    Node("dark_theme", parents=[], cpt=_INDEP["dark_theme"]["table"]),
])
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════════════════
|
||
# FONT WHITELIST (Bayesian: core ∪ sampled_optional | gpu_class)
|
||
# ═══════════════════════════════════════════════════════════════════════
|
||
# Semantic flip: previously exclude-list (block N probed fonts per seed).
|
||
# Now whitelist (browser sees ONLY these fonts, everything else hidden).
|
||
# Core (~112): always included — fresh Win11 + Office 2021 English.
|
||
# Optional (~40): sampled per-session with P(present | gpu_class). Gives
|
||
# small realistic variance (~3-8 optional fonts differ per session) while
|
||
# keeping the profile strongly centered on 'typical Windows user'.
|
||
|
||
|
||
def derive_font_prefs(gpu_class: str, rng) -> Dict[str, str]:
    """Sample the session's font set and render it as two coherent pref strings.

    Selection:
      - every core font from font_pool.json is always included;
      - each optional font is included when one ``rng.random()`` draw falls
        below its P(font | gpu_class) from the optional-font CPT. A font with
        no row in the CPT uses 0.7, and a gpu_class with no table falls back
        to the ``integrated_modern`` table.

    Exactly one draw is consumed per optional font, in pool order, so the
    result is reproducible for a given RNG state.

    Returns:
        {
          "whitelist": "arial,calibri,marlett,...",
          "metrics":   "arial|0.978,calibri|0.934,marlett|0.855,..."
        }

    Both strings are built from the same name-sorted list of ``{name, factor}``
    entries, so every whitelisted family has a metrics entry and vice versa.
    ``whitelist`` is the set of families to advertise; ``metrics`` carries the
    per-family width scale factor (three decimals) that the consumer can use
    to make each family detectable by width-diff font probes.

    Adding a font only requires a new ``{name, factor}`` entry in
    font_pool.json — no special-case code is needed here.
    """
    probabilities = _CPT_FONTS_OPT.get(gpu_class)
    if probabilities is None:
        probabilities = _CPT_FONTS_OPT["integrated_modern"]

    # Probability lookup happens before the draw, one draw per optional font.
    sampled_optional = [
        font
        for font in _FONT_OPTIONAL
        if probabilities.get(font["name"], 0.7) > rng.random()
    ]

    # Name order keeps the output independent of pool ordering.
    chosen = sorted(list(_FONT_CORE) + sampled_optional, key=lambda font: font["name"])

    names = [font["name"] for font in chosen]
    scaled = [f'{font["name"]}|{font["factor"]:.3f}' for font in chosen]
    return {"whitelist": ",".join(names), "metrics": ",".join(scaled)}
|
||
|
||
|
||
# Back-compat shim: legacy callers still import derive_font_whitelist.
|
||
def derive_font_whitelist(gpu_class: str, rng) -> str:
    """Return only the ``"whitelist"`` string produced by ``derive_font_prefs``."""
    prefs = derive_font_prefs(gpu_class, rng)
    return prefs["whitelist"]
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════════════════
|
||
# PUBLIC API: Forge
|
||
# ═══════════════════════════════════════════════════════════════════════
|
||
import random
|
||
|
||
|
||
class Forge:
    """Seeded fingerprint generator: one integer seed → one coherent bundle.

    The instance owns a private ``random.Random`` created in ``__init__``.
    ``sample()`` draws from that generator, so repeated calls on the same
    instance continue the same random stream rather than restarting it.
    """

    def __init__(self, seed: int):
        self.seed = int(seed)
        self._rng = random.Random(self.seed)

    def sample(self) -> Dict[str, Any]:
        """Draw one bundle from the network and flatten it into a pref dict.

        The network is sampled first; the font prefs are sampled afterwards
        from the same RNG, conditioned on the drawn ``gpu_class``.
        """
        drawn = _NETWORK.sample(self._rng)
        gpu, screen, audio, codec = (
            drawn[node] for node in ("gpu", "screen", "audio", "codec")
        )

        # Seed tracking, then the locked identity fields.
        profile: Dict[str, Any] = {"stealth_seed": self.seed}
        profile.update(_LOCKED)

        # GPU (renderer/vendor pair from the pool) and its class.
        profile.update(
            webgl_renderer=gpu["renderer"],
            webgl_vendor=gpu["vendor"],
            gpu_class=drawn["gpu_class"],
        )

        # Hidden-variable debug metadata (not a Firefox pref, just for
        # analysis / test result correlation tracking).
        profile.update(
            intra_tier=drawn["intra_tier"],
            screen_tier=drawn["screen_tier"],
        )

        # Screen. Available size falls back to the full width and to
        # height minus 40 when the CPT row carries no "aw"/"ah".
        profile.update(
            screen_w=int(screen["w"]),
            screen_h=int(screen["h"]),
            screen_avail_w=int(screen.get("aw", screen["w"])),
            screen_avail_h=int(screen.get("ah", screen["h"] - 40)),
            dpr=float(screen["dpr"]),
        )

        # Hardware concurrency and WebGL MSAA.
        profile.update(
            hw_concurrency=int(drawn["hw_concurrency"]),
            msaa_samples=int(drawn["msaa_samples"]),
        )

        # Audio (joint rate/latency/channels, conditioned on gpu_class).
        profile.update(
            audio_sample_rate=int(audio["rate"]),
            audio_output_latency_ms=int(audio["latency"]),
            audio_max_channel_count=int(audio["channels"]),
        )

        # Codec prefs (joint, coherent with GPU class). All 5 are
        # JS-visible: av1/webm_encoder via canPlayType/MediaRecorder,
        # mediasource_* via MediaSource.isTypeSupported, webspeech_synth
        # via 'speechSynthesis' in window (CreepJS voices probe).
        profile.update(
            av1_enabled=bool(codec["av1_enabled"]),
            webm_encoder_enabled=bool(codec["webm_encoder_enabled"]),
            mediasource_webm=bool(codec["mediasource_webm"]),
            mediasource_mp4=bool(codec["mediasource_mp4"]),
            webspeech_synth=bool(codec["webspeech_synth"]),
        )

        # Storage quota in MB, then the parent-less dark_theme marginal.
        profile.update(
            storage_quota_mb=int(drawn["storage_quota_mb"]),
            dark_theme=int(drawn["dark_theme"]),
        )

        # Font prefs (coherent pair: whitelist + per-family width scale
        # metrics), exposed as font_<key> entries.
        font_prefs = derive_font_prefs(drawn["gpu_class"], self._rng)
        for pref_key, pref_value in font_prefs.items():
            profile[f"font_{pref_key}"] = pref_value

        return profile
|
||
|
||
|
||
def sample(seed: int) -> Dict[str, Any]:
    """Build a fresh ``Forge`` for ``seed`` and return its first sampled bundle."""
    forge = Forge(seed)
    return forge.sample()
|