diff --git a/CHANGELOG.md b/CHANGELOG.md index 90f8cc1..f142d90 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,9 +7,10 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), ## [Unreleased] ### Added -- `timezone="auto"`: resolve the browser timezone from the proxy egress IP. A session with a proxy and no explicit timezone now defaults to `auto` — a foreign proxy paired with the host TZ is the classic `timezone_mismatch` signal. The egress IP is discovered through the proxy (SOCKS supported) and mapped to its IANA zone with an offline mmdb (`daijro/geoip-all-in-one`, downloaded + cached on first use; `STEALTHFOX_GEOIP_MMDB` points at your own). Precedence: an explicit zone wins; `""`/`"auto"` without a proxy stay on the host TZ; `"host"`/`"local"` force the host TZ even behind a proxy. With a proxy, an unresolvable zone raises rather than silently falling back. +- `timezone="auto"`: the browser timezone is auto-derived from the egress IP. By default (no explicit timezone) it ALWAYS resolves — from the proxy egress when a proxy is set, otherwise from the host's own public IP — so the zone can never disagree with the IP (the classic `timezone_mismatch` signal). An explicit `"Area/City"` is the only way to force a specific zone. On failure: with a proxy the launch raises (no silent host-TZ fallback behind a foreign proxy); without a proxy it falls back to the host TZ so a transient lookup can't break the launch. +- The egress IP is mapped to its IANA zone with an offline mmdb (`daijro/geoip-all-in-one`). It auto-updates against the upstream weekly rebuild: cached locally, re-checked after `GEOIP_REFRESH_DAYS` (7), older copies pruned, and a stale cache is reused when offline. `STEALTHFOX_GEOIP_MMDB` points at your own `.mmdb` to skip the download. - `resolve_session_timezone(timezone, proxy)` and `ensure_geoip_mmdb()` re-exported at the package root (plus `GeoTimezoneError`) so integrations that own their launch can reproduce the resolution. -- `tests/test_geo.py`: 32 unit tests (precedence policy, proxy→requests translation, egress discovery, IP→IANA mapping, fail-early). +- `tests/test_geo.py` (37) + `tests/test_geoip_update.py` (freshness / auto-update / offline fallback) unit tests. ### Changed - New runtime dependencies: `requests[socks]` (SOCKS egress lookup), `maxminddb` (mmdb reader), `tzdata` (IANA database for `zoneinfo`, which Windows lacks). diff --git a/README.md b/README.md index 20507c1..bf5b8e3 100644 --- a/README.md +++ b/README.md @@ -146,27 +146,22 @@ Schemes supported: `socks5`, `socks4`, `http`, `https`. Auth works on all of the The browser timezone follows `timezone=`: ```python -# default: with a proxy, the timezone is auto-derived from the proxy egress IP +# default: timezone is auto-derived from the egress IP (proxy egress if a +# proxy is set, otherwise the host's own public IP) with InvisiblePlaywright(proxy=proxy) as browser: ... -# explicit IANA zone always wins +# explicit IANA zone always wins — the only way to force a specific zone with InvisiblePlaywright(proxy=proxy, timezone="America/New_York") as browser: ... - -# opt out and keep the host timezone even behind a proxy -with InvisiblePlaywright(proxy=proxy, timezone="host") as browser: - ... ``` | `timezone=` | with proxy | without proxy | |---|---|---| -| `""` (default) | auto-derived from egress IP | host timezone | -| `"auto"` | auto-derived from egress IP | host timezone | +| `""` (default) / `"auto"` | auto from proxy egress IP | auto from host public IP | | `"Area/City"` | that zone | that zone | -| `"host"` / `"local"` | host timezone | host timezone | -A proxy in a different country paired with the host timezone is the classic `timezone_mismatch` signal, so a proxy with no explicit timezone now resolves automatically. The egress IP is looked up through the proxy and mapped to its IANA zone with an offline database ([`daijro/geoip-all-in-one`](https://github.com/daijro/geoip-all-in-one)), downloaded and cached on first use. If a proxy is set but the zone can't be resolved, the launch raises rather than silently falling back to the host zone — pass an explicit `timezone=` or `timezone="host"` to override. Point `STEALTHFOX_GEOIP_MMDB` at your own `.mmdb` to skip the download. +The timezone always tracks the actual egress, so it can't disagree with the IP — a proxy in a different country paired with the host timezone is the classic `timezone_mismatch` signal. The egress IP is mapped to its IANA zone with an offline database ([`daijro/geoip-all-in-one`](https://github.com/daijro/geoip-all-in-one)), which auto-updates against its weekly rebuild and is cached locally (point `STEALTHFOX_GEOIP_MMDB` at your own `.mmdb` to skip the download). On failure: with a proxy the launch raises rather than silently using the host zone (pass an explicit `timezone=` to override); without a proxy it falls back to the host timezone so a transient lookup failure can't break the launch. ### Pinning specific fingerprint fields diff --git a/src/invisible_playwright/_geo.py b/src/invisible_playwright/_geo.py index 95ab3c5..02971e1 100644 --- a/src/invisible_playwright/_geo.py +++ b/src/invisible_playwright/_geo.py @@ -1,23 +1,23 @@ -"""Resolve the session timezone from the proxy egress IP (``timezone="auto"``). +"""Resolve the session timezone from the egress IP (``timezone="auto"``). -Approach B: discover the egress IP with one HTTP request routed *through the -configured proxy*, then map IP → IANA timezone with an offline mmdb +Approach B: discover the egress IP with one HTTP request — routed *through the +proxy* when one is set, otherwise a direct request that sees the host's own +public IP — then map IP → IANA timezone with an offline mmdb (``daijro/geoip-all-in-one``, downloaded + cached by ``download.py``). Precedence (see ``resolve_session_timezone``): - "host" / "local" → "" force host TZ (escape hatch) - explicit IANA → unchanged explicit always wins - "" + no proxy → "" host TZ (default, unchanged behaviour) - "" + proxy → egress NEW default: a proxy with no timezone is - exactly the timezone_mismatch trap, so we - auto-resolve it. - "auto" + no proxy → "" nothing to resolve, fall back to host TZ - "auto" + proxy → egress + explicit IANA → unchanged explicit always wins + "" / "auto" → egress ALWAYS resolve. With a proxy, from the proxy + egress IP; without a proxy, from the host's + own public IP. This is the default. -When a proxy IS set we fail loudly rather than silently fall back to the host -TZ — a foreign proxy paired with the host timezone is the precise signal -detectors flag as ``timezone_mismatch``. +On failure: + with a proxy → raise a foreign proxy paired with the host TZ is + the precise ``timezone_mismatch`` signal, so + we fail loudly rather than fall back silently. + without a proxy → "" (host) the host TZ is a safe default, so a transient + lookup failure must not break the launch. """ from __future__ import annotations @@ -79,14 +79,16 @@ def _proxies_for_requests(proxy: Dict[str, str]) -> Dict[str, str]: def discover_egress_ip( - proxy: Dict[str, str], *, timeout: float = 10.0 + proxy: Optional[Dict[str, str]] = None, *, timeout: float = 10.0 ) -> str: - """Return the public IP seen when routing through ``proxy``. + """Return the public egress IP. - Tries each echo endpoint in turn; raises :class:`GeoTimezoneError` if none - return a valid IP (SOCKS support requires ``requests[socks]`` / PySocks). + Routes the request through ``proxy`` when given (SOCKS support requires + ``requests[socks]`` / PySocks); with ``proxy=None`` it makes a direct + request that sees the host's own public IP. Tries each echo endpoint in + turn; raises :class:`GeoTimezoneError` if none return a valid IP. """ - proxies = _proxies_for_requests(proxy) + proxies = _proxies_for_requests(proxy) if proxy else None last_err: Optional[Exception] = None for url in _IP_ECHO_ENDPOINTS: try: @@ -139,22 +141,24 @@ def resolve_session_timezone( ) -> str: """Map the user's ``timezone`` setting to a concrete IANA zone (or ``""``). - See the module docstring for the full precedence table. Raises - :class:`GeoTimezoneError` when a proxy is set but the egress timezone - cannot be resolved (fail-early — never silently use the host TZ behind a - foreign proxy). + See the module docstring for the full precedence table. ``""``/``"auto"`` + ALWAYS resolve from the egress IP (proxy egress if a proxy is set, else the + host's own public IP). On failure: with a proxy we raise + :class:`GeoTimezoneError` (never silently use the host TZ behind a foreign + proxy); without a proxy we fall back to ``""`` (host TZ) so a transient + lookup failure can't break the launch. """ tz = (timezone or "").strip() - if tz.lower() in ("host", "local"): - return "" if tz and tz.lower() != "auto": return tz # explicit IANA wins - if not _proxy_is_set(proxy): - return "" # "" / "auto" without a proxy → host TZ - # proxy set, tz is "" (new default) or "auto" → resolve from egress. - assert proxy is not None + # "" or "auto" → always resolve from the egress IP. from .download import ensure_geoip_mmdb - ip = discover_egress_ip(proxy) - mmdb = ensure_geoip_mmdb() - return ip_to_timezone(ip, mmdb) + proxy_set = _proxy_is_set(proxy) + try: + ip = discover_egress_ip(proxy if proxy_set else None) + return ip_to_timezone(ip, ensure_geoip_mmdb()) + except Exception: + if proxy_set: + raise # fail-early behind a proxy (timezone_mismatch trap) + return "" # no proxy: host TZ is a safe fallback diff --git a/src/invisible_playwright/constants.py b/src/invisible_playwright/constants.py index d2cc16b..187aa2e 100644 --- a/src/invisible_playwright/constants.py +++ b/src/invisible_playwright/constants.py @@ -53,8 +53,10 @@ RELEASE_URL_TEMPLATE = ( # daijro/geoip-all-in-one merges IP2Location LITE + GeoLite2 + DB-IP into a # single mmdb (country ISO + coordinates + IANA timezone via tzfpy), rebuilt # weekly. GPL-3.0, so we DOWNLOAD it at runtime into the user cache (like the -# Firefox binary) rather than bundling it into this MIT package. Pinned to a -# known-good weekly tag; bump to refresh. The `-all` variant covers IPv4+IPv6. +# Firefox binary) rather than bundling it into this MIT package. The `-all` +# variant covers IPv4+IPv6. download.py tracks the LATEST release and refreshes +# weekly; GEOIP_MMDB_VERSION is only the cold-cache fallback when the GitHub +# API is unreachable on a machine that has never downloaded the DB. GEOIP_REPO: str = "daijro/geoip-all-in-one" GEOIP_MMDB_VERSION: str = "2026.06.03" GEOIP_ASSET: str = "geoip-aio-all.mmdb.zip" diff --git a/src/invisible_playwright/download.py b/src/invisible_playwright/download.py index 3dbb8e1..7417e39 100644 --- a/src/invisible_playwright/download.py +++ b/src/invisible_playwright/download.py @@ -5,9 +5,11 @@ import hashlib import os import platform import re +import shutil import sys import tarfile import tempfile +import time import zipfile from pathlib import Path @@ -158,46 +160,133 @@ def ensure_binary(version: str = BINARY_VERSION) -> Path: # ───────────────────────────────────────────────────────────────────────── -# GeoIP mmdb (used by timezone="auto" to map proxy egress IP → IANA zone) +# GeoIP mmdb (timezone="auto" → map egress IP → IANA zone) +# +# daijro/geoip-all-in-one is rebuilt WEEKLY, so we don't pin a tag. We cache +# the latest mmdb and, once it's older than GEOIP_REFRESH_DAYS, re-check the +# latest release and pull a newer build if one exists. Net effect: no download +# (not even an API call) on a launch within the window; auto-refresh after it; +# a stale cache is reused when offline rather than breaking the launch. # ───────────────────────────────────────────────────────────────────────── -def geoip_mmdb_path(version: str = GEOIP_MMDB_VERSION) -> Path: - """Cache location for the extracted geoip mmdb.""" - return cache_root() / "geoip" / version / GEOIP_MMDB_NAME +GEOIP_REFRESH_DAYS = 7 # matches daijro's weekly rebuild cadence -def ensure_geoip_mmdb(version: str = GEOIP_MMDB_VERSION) -> Path: - """Return a path to the geoip mmdb, downloading + caching it if needed. +def _geoip_root() -> Path: + return cache_root() / "geoip" - Set ``STEALTHFOX_GEOIP_MMDB`` to point at a user-supplied mmdb (or a test - fixture) to skip the download entirely. Otherwise the pinned weekly build - of ``daijro/geoip-all-in-one`` is fetched from GitHub Releases (public, no - token) into the user cache and unzipped once. + +def _geoip_check_marker() -> Path: + return _geoip_root() / ".last_check" + + +def _cached_geoip_mmdb() -> Path | None: + """Newest cached mmdb across tag dirs, or None. Tag dirs are date strings + (e.g. ``2026.06.03``) so a lexical sort is chronological.""" + root = _geoip_root() + if not root.exists(): + return None + cands = sorted(root.glob("*/*.mmdb")) + return cands[-1] if cands else None + + +def _geoip_cache_fresh(max_age_days: int) -> bool: + marker = _geoip_check_marker() + if not marker.exists(): + return False + return (time.time() - marker.stat().st_mtime) < max_age_days * 86400 + + +def _touch_geoip_marker() -> None: + m = _geoip_check_marker() + m.parent.mkdir(parents=True, exist_ok=True) + m.touch() + + +def _latest_geoip_tag() -> str: + """Latest ``daijro/geoip-all-in-one`` release tag via the GitHub API.""" + headers = {"Accept": "application/vnd.github+json"} + token = _github_token() + if token: + headers["Authorization"] = f"token {token}" + r = requests.get( + f"https://api.github.com/repos/{GEOIP_REPO}/releases/latest", + headers=headers, timeout=15, + ) + r.raise_for_status() + tag = r.json().get("tag_name") + if not tag: + raise RuntimeError("no tag_name in geoip-all-in-one latest release") + return tag + + +def _download_geoip_tag(tag: str) -> Path: + """Download + extract a specific tag's mmdb if not already cached.""" + dst_dir = _geoip_root() / tag + target = dst_dir / GEOIP_MMDB_NAME + if not target.exists(): + url = GEOIP_RELEASE_URL_TEMPLATE.format(tag=tag, asset=GEOIP_ASSET) + dst_dir.mkdir(parents=True, exist_ok=True) + with tempfile.TemporaryDirectory() as td: + archive = Path(td) / GEOIP_ASSET + _download_file(url, archive) + _extract(archive, dst_dir) + if target.exists(): + return target + # asset name inside the zip may differ from GEOIP_MMDB_NAME + found = sorted(dst_dir.glob("*.mmdb")) + if found: + return found[0] + raise RuntimeError(f"geoip mmdb not found after extraction in {dst_dir}") + + +def _prune_old_geoip_tags(keep: str) -> None: + """Drop every cached tag dir except ``keep`` to bound disk usage.""" + root = _geoip_root() + if not root.exists(): + return + for d in root.iterdir(): + if d.is_dir() and d.name != keep: + shutil.rmtree(d, ignore_errors=True) + + +def geoip_mmdb_path() -> Path | None: + """Path to the currently-cached mmdb (newest tag), or None if none cached.""" + return _cached_geoip_mmdb() + + +def ensure_geoip_mmdb(max_age_days: int = GEOIP_REFRESH_DAYS) -> Path: + """Return a geoip mmdb, kept fresh against daijro's weekly rebuild. + + Resolution order: + 1. ``STEALTHFOX_GEOIP_MMDB`` env → use that file (user-supplied / test). + 2. A cached mmdb younger than ``max_age_days`` → use it (no network). + 3. Else ask GitHub for the latest tag, download it if not already cached, + prune older tags, and reset the freshness timer. + 4. If the API/download is unreachable but a cached mmdb exists → use it + (and reset the timer so we don't hammer the API while offline). + 5. Cold cache + no network → fall back to the pinned ``GEOIP_MMDB_VERSION``; + if that download also fails, raise. """ override = os.environ.get("STEALTHFOX_GEOIP_MMDB") if override: p = Path(override) if not p.exists(): - raise RuntimeError( - f"STEALTHFOX_GEOIP_MMDB points to a missing file: {p}" - ) + raise RuntimeError(f"STEALTHFOX_GEOIP_MMDB points to a missing file: {p}") return p - dst = geoip_mmdb_path(version) - if dst.exists(): - return dst + cached = _cached_geoip_mmdb() + if cached and _geoip_cache_fresh(max_age_days): + return cached - url = GEOIP_RELEASE_URL_TEMPLATE.format(tag=version, asset=GEOIP_ASSET) - dst.parent.mkdir(parents=True, exist_ok=True) - with tempfile.TemporaryDirectory() as td: - archive = Path(td) / GEOIP_ASSET - _download_file(url, archive) - _extract(archive, dst.parent) + try: + tag = _latest_geoip_tag() + except Exception: + if cached: + _touch_geoip_marker() # recheck after the window; don't hammer + return cached + tag = GEOIP_MMDB_VERSION # cold cache + API down → pinned fallback - if dst.exists(): - return dst - # The asset name inside the zip may differ from GEOIP_MMDB_NAME — fall - # back to the first .mmdb the archive produced. - candidates = sorted(dst.parent.glob("*.mmdb")) - if candidates: - return candidates[0] - raise RuntimeError(f"geoip mmdb not found after extraction in {dst.parent}") + mmdb = _download_geoip_tag(tag) + _prune_old_geoip_tags(mmdb.parent.name) + _touch_geoip_marker() + return mmdb diff --git a/src/invisible_playwright/launcher.py b/src/invisible_playwright/launcher.py index 6de4818..15055ee 100644 --- a/src/invisible_playwright/launcher.py +++ b/src/invisible_playwright/launcher.py @@ -137,12 +137,13 @@ class InvisiblePlaywright: locale: BCP-47 tag (e.g. ``"en-US"``). Drives the ``Accept-Language`` header and ``navigator.language``. timezone: IANA zone (e.g. ``"America/New_York"``) — used as-is - when set. ``""`` (default) or ``"auto"`` resolves the zone - from the proxy egress IP when a proxy is set (one lookup - through the proxy + an offline mmdb), otherwise the host TZ. - ``"host"`` / ``"local"`` forces the host TZ even behind a - proxy. With a proxy, an unresolvable zone raises rather than - silently falling back to the host TZ (``timezone_mismatch``). + when set, the only way to force a specific zone. ``""`` + (default) or ``"auto"`` ALWAYS resolves from the egress IP: + through the proxy when one is set, otherwise from the host's + own public IP (one lookup + an offline mmdb). On failure: with + a proxy it raises (a foreign proxy on the host TZ is the + ``timezone_mismatch`` signal); without a proxy it falls back to + the host TZ so a transient lookup failure can't break launch. extra_prefs: Optional dict of Firefox prefs overlayed on top of the generated profile — useful for niche tweaks without monkey-patching the package. diff --git a/tests/test_geo.py b/tests/test_geo.py index 17d448f..39ef5ee 100644 --- a/tests/test_geo.py +++ b/tests/test_geo.py @@ -136,6 +136,20 @@ def test_discover_egress_ip_all_fail_raises(monkeypatch): discover_egress_ip(SOCKS) +@pytest.mark.unit +def test_discover_egress_ip_no_proxy_is_direct(monkeypatch): + # proxy=None → direct request, requests.get must get proxies=None. + seen = {} + + def fake_get(url, **kw): + seen["proxies"] = kw.get("proxies", "MISSING") + return _FakeResp("192.0.2.55") + + monkeypatch.setattr(_geo.requests, "get", fake_get) + assert discover_egress_ip(None) == "192.0.2.55" + assert seen["proxies"] is None + + # ────────────────────────────────────────────────────────────────────── # ip_to_timezone — mocked mmdb reader # ────────────────────────────────────────────────────────────────────── @@ -194,8 +208,9 @@ def stub_egress(monkeypatch): """Make egress resolution deterministic + offline; record if it ran.""" state = {"called": False} - def fake_discover(proxy, **kw): + def fake_discover(proxy=None, **kw): state["called"] = True + state["proxy_arg"] = proxy return "203.0.113.7" monkeypatch.setattr(_geo, "discover_egress_ip", fake_discover) @@ -208,56 +223,66 @@ def stub_egress(monkeypatch): @pytest.mark.unit -@pytest.mark.parametrize("sentinel", ["host", "local", "HOST", "Local"]) -def test_resolve_host_sentinel_forces_host_tz(sentinel, stub_egress): - # Even with a proxy set, "host"/"local" force the host TZ and never resolve. - assert resolve_session_timezone(sentinel, SOCKS) == "" - assert stub_egress["called"] is False - - -@pytest.mark.unit -def test_resolve_explicit_iana_wins_over_proxy(stub_egress): +def test_resolve_explicit_iana_wins(stub_egress): + # An explicit zone wins and never triggers resolution (proxy or not). assert resolve_session_timezone("Asia/Tokyo", SOCKS) == "Asia/Tokyo" - assert stub_egress["called"] is False # no resolution when explicit - - -@pytest.mark.unit -def test_resolve_empty_no_proxy_is_host(stub_egress): - assert resolve_session_timezone("", None) == "" + assert resolve_session_timezone("Asia/Tokyo", None) == "Asia/Tokyo" assert stub_egress["called"] is False @pytest.mark.unit -def test_resolve_auto_no_proxy_is_host(stub_egress): - assert resolve_session_timezone("auto", None) == "" - assert stub_egress["called"] is False - - -@pytest.mark.unit -def test_resolve_empty_with_proxy_defaults_to_auto(stub_egress): - # NEW default: a proxy with no timezone auto-resolves from the egress. +def test_resolve_empty_with_proxy_resolves_from_proxy(stub_egress): assert resolve_session_timezone("", SOCKS) == "America/New_York" assert stub_egress["called"] is True + assert stub_egress["proxy_arg"] == SOCKS # routed through the proxy @pytest.mark.unit -def test_resolve_auto_with_proxy_resolves(stub_egress): +def test_resolve_auto_with_proxy_resolves_from_proxy(stub_egress): assert resolve_session_timezone("auto", HTTP) == "America/New_York" + assert stub_egress["proxy_arg"] == HTTP + + +@pytest.mark.unit +def test_resolve_empty_no_proxy_resolves_from_host(stub_egress): + # auto ALWAYS resolves — without a proxy, from the host's own public IP. + assert resolve_session_timezone("", None) == "America/New_York" assert stub_egress["called"] is True + assert stub_egress["proxy_arg"] is None # direct request, no proxy @pytest.mark.unit -def test_resolve_direct_proxy_treated_as_no_proxy(stub_egress): - assert resolve_session_timezone("auto", {"server": "direct://"}) == "" - assert stub_egress["called"] is False +def test_resolve_auto_no_proxy_resolves_from_host(stub_egress): + assert resolve_session_timezone("auto", None) == "America/New_York" + assert stub_egress["proxy_arg"] is None @pytest.mark.unit -def test_resolve_fail_early_propagates(monkeypatch): - # With a proxy set, a discovery failure must raise — never silent host TZ. - def boom(proxy, **kw): +def test_resolve_direct_proxy_resolves_via_host(stub_egress): + # direct:// counts as "no proxy" → resolve from the host IP, don't skip. + assert resolve_session_timezone("auto", {"server": "direct://"}) == "America/New_York" + assert stub_egress["proxy_arg"] is None + + +@pytest.mark.unit +def test_resolve_no_proxy_failure_falls_back_to_host(monkeypatch): + # Without a proxy, a lookup failure must NOT break the launch → host TZ (""). + def boom(proxy=None, **kw): + raise GeoTimezoneError("offline") + + monkeypatch.setattr(_geo, "discover_egress_ip", boom) + assert resolve_session_timezone("auto", None) == "" + assert resolve_session_timezone("", None) == "" + + +@pytest.mark.unit +def test_resolve_proxy_failure_raises(monkeypatch): + # With a proxy set, a failure must raise — never a silent host-TZ fallback. + def boom(proxy=None, **kw): raise GeoTimezoneError("no egress") monkeypatch.setattr(_geo, "discover_egress_ip", boom) with pytest.raises(GeoTimezoneError): resolve_session_timezone("auto", SOCKS) + with pytest.raises(GeoTimezoneError): + resolve_session_timezone("", SOCKS) diff --git a/tests/test_geoip_update.py b/tests/test_geoip_update.py new file mode 100644 index 0000000..26632b7 --- /dev/null +++ b/tests/test_geoip_update.py @@ -0,0 +1,131 @@ +"""Unit tests for the intelligent geoip mmdb auto-update in `download.py`. + +daijro/geoip-all-in-one rebuilds weekly; `ensure_geoip_mmdb` keeps the cache +fresh without a download (or API call) on every launch. These tests mock the +cache root, the latest-tag API, and the per-tag download so nothing touches the +network. +""" +import os +import time + +import pytest + +import invisible_playwright.download as dl + + +@pytest.fixture +def cache(tmp_path, monkeypatch): + """Point the cache at tmp_path and clear the env override.""" + monkeypatch.setattr(dl, "cache_root", lambda: tmp_path) + monkeypatch.delenv("STEALTHFOX_GEOIP_MMDB", raising=False) + return tmp_path + + +def _make_cached(root, tag, name=dl.GEOIP_MMDB_NAME): + d = root / "geoip" / tag + d.mkdir(parents=True, exist_ok=True) + f = d / name + f.write_bytes(b"FAKE-MMDB") + return f + + +def _set_marker_age(root, days): + m = root / "geoip" / ".last_check" + m.parent.mkdir(parents=True, exist_ok=True) + m.touch() + old = time.time() - days * 86400 + os.utime(m, (old, old)) + + +# ────────────────────────────────────────────────────────────────────── +# env override +# ────────────────────────────────────────────────────────────────────── +@pytest.mark.unit +def test_env_override_returns_file(tmp_path, monkeypatch): + f = tmp_path / "mine.mmdb" + f.write_bytes(b"X") + monkeypatch.setenv("STEALTHFOX_GEOIP_MMDB", str(f)) + assert dl.ensure_geoip_mmdb() == f + + +@pytest.mark.unit +def test_env_override_missing_raises(tmp_path, monkeypatch): + monkeypatch.setenv("STEALTHFOX_GEOIP_MMDB", str(tmp_path / "nope.mmdb")) + with pytest.raises(RuntimeError): + dl.ensure_geoip_mmdb() + + +# ────────────────────────────────────────────────────────────────────── +# freshness window +# ────────────────────────────────────────────────────────────────────── +@pytest.mark.unit +def test_fresh_cache_no_network(cache, monkeypatch): + f = _make_cached(cache, "2026.06.03") + _set_marker_age(cache, 0) # just checked + + def boom(): + raise AssertionError("latest-tag API must NOT be called within the window") + + monkeypatch.setattr(dl, "_latest_geoip_tag", boom) + assert dl.ensure_geoip_mmdb(max_age_days=7) == f + + +@pytest.mark.unit +def test_stale_same_tag_no_download(cache, monkeypatch): + f = _make_cached(cache, "2026.06.03") + _set_marker_age(cache, 30) # stale → will re-check + monkeypatch.setattr(dl, "_latest_geoip_tag", lambda: "2026.06.03") + # real _download_geoip_tag runs but target exists, so no actual download: + monkeypatch.setattr(dl, "_download_file", lambda *a, **k: (_ for _ in ()).throw( + AssertionError("must not download when tag already cached"))) + assert dl.ensure_geoip_mmdb(max_age_days=7) == f + + +@pytest.mark.unit +def test_stale_new_tag_downloads_and_prunes(cache, monkeypatch): + old = _make_cached(cache, "2026.06.03") + _set_marker_age(cache, 30) + monkeypatch.setattr(dl, "_latest_geoip_tag", lambda: "2026.06.10") + + def fake_download(tag): + return _make_cached(cache, tag) # simulate fetch+extract of the new tag + + monkeypatch.setattr(dl, "_download_geoip_tag", fake_download) + got = dl.ensure_geoip_mmdb(max_age_days=7) + assert got.parent.name == "2026.06.10" + assert not old.parent.exists() # old tag pruned + assert got.exists() + + +# ────────────────────────────────────────────────────────────────────── +# offline resilience +# ────────────────────────────────────────────────────────────────────── +@pytest.mark.unit +def test_api_down_with_cache_uses_cache(cache, monkeypatch): + f = _make_cached(cache, "2026.06.03") + _set_marker_age(cache, 30) + + def boom(): + raise OSError("offline") + + monkeypatch.setattr(dl, "_latest_geoip_tag", boom) + assert dl.ensure_geoip_mmdb(max_age_days=7) == f # stale cache reused, no raise + + +@pytest.mark.unit +def test_cold_cache_api_down_falls_back_to_pinned(cache, monkeypatch): + # no cache at all + API unreachable → pinned GEOIP_MMDB_VERSION fallback. + def boom(): + raise OSError("offline") + + monkeypatch.setattr(dl, "_latest_geoip_tag", boom) + captured = {} + + def fake_download(tag): + captured["tag"] = tag + return _make_cached(cache, tag) + + monkeypatch.setattr(dl, "_download_geoip_tag", fake_download) + got = dl.ensure_geoip_mmdb(max_age_days=7) + assert captured["tag"] == dl.GEOIP_MMDB_VERSION + assert got.exists()