invisible_playwright/src/stealthfox/_fpforge/_sampler.py
feder-cr 60e55491ea feat: initial public release
Stealthfox — a patched Firefox 150.0.1 for browser-fingerprint stealth,
shipped as a Playwright-compatible Python wrapper.

  * Sync + async Stealthfox launcher (firefox_user_prefs, virtual desktop
    on Windows, SOCKS5 auth via patched nsProtocolProxyService)
  * fpforge: Bayesian fingerprint sampler over GPU / audio / fonts /
    screen / TCP options / ~400 other navigator fields
  * WebRTC stealth: srflx address swap, synthetic srflx fallback,
    private-LAN host candidates — no real public IP leak via STUN
  * GPU sandbox fix for FF150 alt-desktop regression
  * Bezier-curve mouse motion baked into Juggler

Targets Windows x86_64 + Linux x86_64. Binary fetched on first run from
GitHub Release "firefox-1".
2026-05-12 21:57:18 -07:00

358 lines
16 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# -*- coding: utf-8 -*-
"""stealth_forge — Bayesian fingerprint generator for Firefox 150 Windows.
Everything the Firefox build exposes to JS (screen, hardwareConcurrency,
WebGL, audio, MSAA, theme, media codecs) is sampled from a Bayesian network
with coherent cross-field dependencies. Identity (userAgent, platform,
oscpu, webdriver=false, maxTouchPoints=0) is locked by the compiled build.
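Graph (child <- parents):
    gpu                   (root, 444 real Windows ANGLE renderers)
    gpu_class             <- gpu (deterministic classifier, 6 classes)
    intra_tier            <- gpu_class (hidden variable, CPT per class)
    hw_concurrency        <- gpu_class, intra_tier
    screen (w/h/dpr/av)   <- gpu_class, intra_tier
    storage_quota_mb      <- gpu_class, intra_tier
    screen_tier           <- screen (deterministic classifier)
    msaa_samples          <- gpu_class, screen_tier
    codec                 <- gpu_class (joint: av1_enabled, webm_encoder_enabled,
                             mediasource_webm, mediasource_mp4, webspeech_synth)
    audio                 <- gpu_class (joint rate+latency+channels)
    dark_theme            (marginal)
    font whitelist/metrics <- gpu_class (core fonts always, optional fonts
                             sampled from the same seeded RNG)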
CPTs live in `data/*.json` (easy to tune without code changes).
Sampling is deterministic per stealth_seed via a private random.Random.
"""
import json
import os
import re
from typing import Any, Dict
from ._network import Network, Node
# Directory that contains this module; the JSON tables sit in its "data" folder.
_HERE = os.path.dirname(os.path.abspath(__file__))


def _load(filename: str) -> Any:
    """Read ``data/<filename>`` next to this module and decode it as JSON.

    Args:
        filename: Name of a file inside the ``data`` directory.

    Returns:
        The decoded JSON document.
    """
    json_path = os.path.join(_HERE, "data", filename)
    with open(json_path, "r", encoding="utf-8") as handle:
        return json.load(handle)
# ═══════════════════════════════════════════════════════════════════════
# LOCKED IDENTITY (compiled into our Firefox 150 build — never varies)
# ═══════════════════════════════════════════════════════════════════════
# Identity fields that are merged unchanged into every sampled bundle.
# Key order matters: it is the order in which they appear in the bundle.
_LOCKED: Dict[str, Any] = {
    "user_agent": (
        "Mozilla/5.0 "
        "(Windows NT 10.0; Win64; x64; rv:150.0) "
        "Gecko/20100101 "
        "Firefox/150.0.1"
    ),
    "platform": "Win32",
    "oscpu": "Windows NT 10.0; Win64; x64",
    "app_code_name": "Mozilla",
    "app_version": "5.0 (Windows)",
    "product_sub": "20100101",
    "webdriver": False,
    "max_touch_points": 0,
}
# ═══════════════════════════════════════════════════════════════════════
# DATA
# ═══════════════════════════════════════════════════════════════════════
# All tables below are read once, at import time, from the JSON files in data/.
# Pool of GPU entries for the `gpu` root node; the sampler reads the
# "renderer" and "vendor" keys of the drawn entry.
_GPU_POOL = _load("webgl_renderer_pool.json")["entries"]
# hw_concurrency / screen / storage tables. The matching network nodes take
# (gpu_class, intra_tier) as parents.
_CPT_HWC = _load("cpt_hwc_given_class_tier.json")["table"]
_CPT_SCREEN = _load("cpt_screen_given_class_tier.json")["table"]
_CPT_STORAGE = _load("cpt_storage_given_class_tier.json")["table"]
# Hidden tier variable that makes hwc/screen/storage jointly coherent
_CPT_INTRA_TIER = _load("cpt_intra_tier_given_class.json")["table"]
# MSAA depends on (gpu_class, screen_tier) — 4K gaming → MSAA=0, 1080p+GPU → MSAA=4
_CPT_MSAA = _load("cpt_msaa_given_class_screen.json")["table"]
# Joint codec flags; the `codec` node has gpu_class as its only parent.
_CPT_CODEC = _load("cpt_codec_given_class.json")["table"]
# Audio conditional on gpu_class (workstation → pro audio, old → 44.1kHz onboard)
_CPT_AUDIO = _load("cpt_audio_given_class.json")["table"]
# Parent-less priors; the visible code reads only its "dark_theme" table.
_INDEP = _load("priors_independent.json")
# Font pool with a "core" list and an "optional" list.
_FONT_POOL = _load("font_pool.json")
# Each entry is a dict {"name": "<lowercase family>", "factor": float}.
# - name: the font family advertised to the page.
# - factor: per-family width scale used by the consumer to make the family
# detectable by width-diff probes.
# Core = always-included; Optional = sampled with P(font | gpu_class).
_FONT_CORE: list = _FONT_POOL["core"]
_FONT_OPTIONAL: list = _FONT_POOL["optional"]
# Per-class mapping of optional font name -> inclusion probability.
_CPT_FONTS_OPT = _load("cpt_fonts_optional_given_class.json")["table"]
# ═══════════════════════════════════════════════════════════════════════
# GPU CLASSIFIER (deterministic function of gpu → gpu_class)
# ═══════════════════════════════════════════════════════════════════════
# The six class labels that classify_gpu can return.
_GPU_CLASSES = (
    "integrated_old",
    "integrated_modern",
    "low_end",
    "mid_range",
    "high_end",
    "workstation",
)


def classify_gpu(gpu_value: Dict[str, str]) -> str:
    """Map a GPU entry to one of the six class labels, deterministically.

    Only the ``"renderer"`` string of ``gpu_value`` is inspected (a missing
    key is treated as an empty string). Rules are tried in a fixed order and
    the first one that matches decides the class; a renderer that matches no
    rule is classed ``"mid_range"``.

    Args:
        gpu_value: Dict carrying the renderer string under ``"renderer"``.

    Returns:
        One of the labels listed in ``_GPU_CLASSES``.
    """
    renderer = gpu_value.get("renderer", "")

    # Ordered (pattern, flags, class) rules; the first hit wins.
    #
    # About the low_end rule: Firefox's SanitizeRenderer.cpp collapses every
    # GeForce into one of three vintage buckets (8800 GTX / GTX 480 / GTX 980),
    # so the renderer string seen by JS is vintage. Pairing it with modern
    # core counts or screens is an internal mismatch that FP Pro's
    # tampering_ml flags, hence those buckets are all classed low_end.
    first_match_rules = (
        (
            r"Intel.*HD Graphics (3000|4000|2500)",
            0,
            "integrated_old",
        ),
        (
            r"Intel.*(HD Graphics (4[56]|5\d\d|6\d\d)|UHD Graphics|Graphics Family|Iris|Arc)",
            0,
            "integrated_modern",
        ),
        (
            r"AMD.*(Radeon(\(TM\))? (Graphics|6\d\dM|7\d\dM|8\d\dM)|Vega [0-9]|"
            r"Renoir|Rembrandt|TM Graphics)",
            re.IGNORECASE,
            "integrated_modern",
        ),
        (
            r"(GeForce (8\d\d\d?|9\d\d\d?|GTX 980|GTX 480|GT 1030|GT 710|GT 730|"
            r"GT 220|GT 240|210|310)|Quadro K\d|Radeon HD [1234]\d\d\d)",
            0,
            "low_end",
        ),
    )
    for pattern, flags, label in first_match_rules:
        if re.search(pattern, renderer, flags):
            return label

    # Safety net for any other GeForce string: stays low_end (to avoid the
    # modern-cores/vintage-renderer pairing) unless the renderer also names
    # a workstation product.
    if re.search(r"GeForce\s+(?:GTX\s+|RTX\s+)?(\d{3,4})", renderer) is not None:
        names_workstation = "Quadro" in renderer or "Workstation" in renderer
        return "workstation" if names_workstation else "low_end"

    # AMD discrete: bucket by the first 3-4 digit model number after "Radeon".
    radeon = re.search(r"Radeon[^0-9]*(\d{3,4})", renderer)
    if radeon is None:
        # Nothing recognised at all.
        return "mid_range"
    if "FirePro" in renderer or "Radeon Pro" in renderer:
        return "workstation"
    model = int(radeon.group(1))
    if model >= 5700:
        return "high_end"
    if 5500 <= model <= 5600 or 580 <= model <= 590:
        return "mid_range"
    return "low_end"
# ═══════════════════════════════════════════════════════════════════════
# NETWORK CONSTRUCTION
# ═══════════════════════════════════════════════════════════════════════
# Build once at import — the network is stateless, only the RNG varies.
def _gpu_marginal():
    """Return the uniform marginal over ``_GPU_POOL``.

    Returns:
        A list with one ``{"value": <pool entry>, "prob": 1/len(pool)}`` row
        per pool entry, in pool order.
    """
    weight = 1.0 / len(_GPU_POOL)
    rows = []
    for entry in _GPU_POOL:
        rows.append({"value": entry, "prob": weight})
    return rows
def _cpt_from_table(table: Dict[str, Any]) -> Dict[str, list]:
    """Return a shallow copy of a CPT table for a conditional node.

    The table has the form ``{class_name: [{value, prob}, ...]}``. Only the
    outer mapping is copied; the row lists are shared with ``table``.
    """
    copied: Dict[str, list] = {}
    copied.update(table)
    return copied
def _screen_tier(ctx):
    """Derive the screen tier label used by the (gpu_class, screen_tier) CPTs.

    Reads ``ctx["screen"]``; a missing or empty screen falls back to
    1920x1080. An aspect ratio above 2.1 (e.g. 3440x1440, 5120x1440) is
    "ultrawide" regardless of width. Otherwise the width picks the tier, and
    anything wider than 3840 is also labelled "ultrawide".
    """
    screen = ctx.get("screen", {}) or {}
    width = int(screen.get("w", 1920))
    height = int(screen.get("h", 1080))

    # The height guard avoids dividing by zero.
    if height > 0 and width / height > 2.1:
        return "ultrawide"

    for max_width, tier in ((1920, "1080p"), (2560, "1440p"), (3840, "2160p")):
        if width <= max_width:
            return tier
    return "ultrawide"
# The sampling graph. Nodes are listed parents-first: each node appears after
# every node named in its `parents` list.
# NOTE(review): presumably Network.sample relies on this order — confirm in
# _network.py before reordering.
_NETWORK = Network([
    # Root: one pool entry, uniform marginal.
    Node("gpu", parents=[], cpt=_gpu_marginal()),
    # Deterministic child: class label computed from the sampled gpu entry.
    Node("gpu_class", parents=["gpu"], classifier=lambda ctx: classify_gpu(ctx["gpu"])),
    # Hidden variable: within a gpu_class, user's OTHER components (RAM, SSD,
    # cores, screen) correlate — a 'premium' mid_range user has more cores,
    # larger SSD, higher-res screen than a 'budget' mid_range user. Without
    # this, hwc/screen/storage would be independent given gpu_class (noisy).
    Node("intra_tier", parents=["gpu_class"], cpt=_cpt_from_table(_CPT_INTRA_TIER)),
    # hwc/screen/storage now jointly coherent via (gpu_class, intra_tier).
    Node("hw_concurrency", parents=["gpu_class", "intra_tier"],
         cpt=_cpt_from_table(_CPT_HWC)),
    Node("screen", parents=["gpu_class", "intra_tier"],
         cpt=_cpt_from_table(_CPT_SCREEN)),
    # Derive screen_tier from screen for msaa parent lookup.
    Node("screen_tier", parents=["screen"], classifier=_screen_tier),
    # MSAA: realistic combo (4K + high_end GPU → MSAA=0 due to perf cost;
    # 1080p + high_end → MSAA=4 common; 1080p + integrated → MSAA=0).
    Node("msaa_samples", parents=["gpu_class", "screen_tier"],
         cpt=_cpt_from_table(_CPT_MSAA)),
    # Joint codec distribution (gpu_class only).
    Node("codec", parents=["gpu_class"], cpt=_cpt_from_table(_CPT_CODEC)),
    # Storage quota: coherent within gpu_class × intra_tier (premium workstation
    # user → 2-3TB SSD; budget workstation user → 512GB; budget integrated_old
    # → 128GB).
    Node("storage_quota_mb", parents=["gpu_class", "intra_tier"],
         cpt=_cpt_from_table(_CPT_STORAGE)),
    # Audio: pro users (workstation) → 48/96kHz 6-8ch; old onboard → 44.1kHz
    # 2ch high latency. Workstation GPU + 44.1kHz mono was previously
    # implausible; now blocked by the CPT.
    Node("audio", parents=["gpu_class"], cpt=_cpt_from_table(_CPT_AUDIO)),
    # Parent-less marginal taken straight from the independent priors file.
    Node("dark_theme", parents=[], cpt=_INDEP["dark_theme"]["table"]),
])
# ═══════════════════════════════════════════════════════════════════════
# FONT WHITELIST (Bayesian: core + sampled_optional | gpu_class)
# ═══════════════════════════════════════════════════════════════════════
# Semantic flip: previously exclude-list (block N probed fonts per seed).
# Now whitelist (browser sees ONLY these fonts, everything else hidden).
# Core (~112): always included — fresh Win11 + Office 2021 English.
# Optional (~40): sampled per-session with P(present | gpu_class). Gives
# small realistic variance (~3-8 optional fonts differ per session) while
# keeping the profile strongly centered on 'typical Windows user'.
def derive_font_prefs(gpu_class: str, rng) -> Dict[str, str]:
    """Sample the session's font whitelist and its matching metrics string.

    Every core font is included. Each optional font is included when a fresh
    ``rng.random()`` draw falls below its probability for ``gpu_class``
    (0.7 when the table has no row for that font). A ``gpu_class`` with no
    table of its own uses the "integrated_modern" table. Exactly one draw is
    consumed per optional font, in pool order.

    Both strings are built from the same sorted list of ``{name, factor}``
    entries, so they always describe the same set of families: no whitelisted
    font lacks a factor, and no metrics entry names a font outside the
    whitelist.

    To add a font, add an entry to font_pool.json (core, with a factor at
    least 4% away from 1.0) — no special-case code is needed.

    Args:
        gpu_class: Class label used to select the optional-font table.
        rng: Object exposing ``random()``, e.g. a ``random.Random``.

    Returns:
        {
            "whitelist": "arial,calibri,marlett,...",
            "metrics": "arial|0.978,calibri|0.934,marlett|0.855,..."
        }
    """
    probabilities = _CPT_FONTS_OPT.get(gpu_class)
    if probabilities is None:
        probabilities = _CPT_FONTS_OPT["integrated_modern"]

    # Look up the probability first, then draw, once per optional font.
    sampled_optional = [
        font for font in _FONT_OPTIONAL
        if probabilities.get(font["name"], 0.7) > rng.random()
    ]

    # Name order keeps the output deterministic for a given set of fonts.
    chosen = sorted(
        list(_FONT_CORE) + sampled_optional,
        key=lambda font: font["name"],
    )

    names = [font["name"] for font in chosen]
    scaled = ["{}|{:.3f}".format(font["name"], font["factor"]) for font in chosen]
    return {"whitelist": ",".join(names), "metrics": ",".join(scaled)}
# Back-compat shim: legacy callers still import derive_font_whitelist.
def derive_font_whitelist(gpu_class: str, rng) -> str:
    """Return only the ``"whitelist"`` string of ``derive_font_prefs``."""
    prefs = derive_font_prefs(gpu_class, rng)
    return prefs["whitelist"]
# ═══════════════════════════════════════════════════════════════════════
# PUBLIC API: Forge
# ═══════════════════════════════════════════════════════════════════════
import random
class Forge:
    """Fingerprint forge: turns one integer seed into a coherent bundle.

    The instance owns a private ``random.Random`` seeded in ``__init__``.
    Every ``sample()`` call draws from that same generator, so only the first
    call on a fresh instance reproduces ``Forge(seed).sample()``.
    """

    def __init__(self, seed: int):
        """Store ``int(seed)`` and seed the private RNG with it."""
        self.seed = int(seed)
        self._rng = random.Random(self.seed)

    def sample(self) -> Dict[str, Any]:
        """Draw one bundle from the network and flatten it into a dict.

        Returns:
            A flat dict holding the seed, the locked identity fields, the
            sampled GPU / screen / hardware / audio / codec / storage / theme
            values, and the ``font_whitelist`` / ``font_metrics`` strings.
        """
        drawn = _NETWORK.sample(self._rng)
        gpu, screen = drawn["gpu"], drawn["screen"]
        audio, codec = drawn["audio"], drawn["codec"]

        # Seed first, then the locked identity fields.
        result: Dict[str, Any] = {"stealth_seed": self.seed}
        result.update(_LOCKED)

        # GPU pair plus derived labels. intra_tier and screen_tier are
        # analysis metadata, not Firefox prefs.
        result.update({
            "webgl_renderer": gpu["renderer"],
            "webgl_vendor": gpu["vendor"],
            "gpu_class": drawn["gpu_class"],
            "intra_tier": drawn["intra_tier"],
            "screen_tier": drawn["screen_tier"],
        })

        # Screen geometry. The available size defaults to the full width and
        # to the height minus 40 when the entry has no "aw" / "ah".
        result.update({
            "screen_w": int(screen["w"]),
            "screen_h": int(screen["h"]),
            "screen_avail_w": int(screen.get("aw", screen["w"])),
            "screen_avail_h": int(screen.get("ah", screen["h"] - 40)),
            "dpr": float(screen["dpr"]),
        })

        # Core count and WebGL MSAA.
        result.update({
            "hw_concurrency": int(drawn["hw_concurrency"]),
            "msaa_samples": int(drawn["msaa_samples"]),
        })

        # Audio: joint rate / latency / channel count.
        result.update({
            "audio_sample_rate": int(audio["rate"]),
            "audio_output_latency_ms": int(audio["latency"]),
            "audio_max_channel_count": int(audio["channels"]),
        })

        # Codec prefs, all five JS-visible: av1/webm_encoder through
        # canPlayType/MediaRecorder, mediasource_* through
        # MediaSource.isTypeSupported, webspeech_synth through
        # 'speechSynthesis' in window (CreepJS voices probe).
        result.update({
            "av1_enabled": bool(codec["av1_enabled"]),
            "webm_encoder_enabled": bool(codec["webm_encoder_enabled"]),
            "mediasource_webm": bool(codec["mediasource_webm"]),
            "mediasource_mp4": bool(codec["mediasource_mp4"]),
            "webspeech_synth": bool(codec["webspeech_synth"]),
        })

        # Storage quota in MB and the theme flag.
        result.update({
            "storage_quota_mb": int(drawn["storage_quota_mb"]),
            "dark_theme": int(drawn["dark_theme"]),
        })

        # Font prefs are drawn last, from the same RNG, and stored under
        # "font_"-prefixed keys.
        font_prefs = derive_font_prefs(drawn["gpu_class"], self._rng)
        for pref_name, pref_value in font_prefs.items():
            result[f"font_{pref_name}"] = pref_value

        return result
def sample(seed: int) -> Dict[str, Any]:
    """Shortcut for building a fresh ``Forge(seed)`` and sampling it once."""
    forge = Forge(seed)
    return forge.sample()