SurfSense/surfsense_evals/tests/suites/test_frames_wiki_fetch.py
DESKTOP-RTLN3BA\$punk 3737118050 chore: evals
2026-05-13 14:02:26 -07:00

112 lines
4 KiB
Python

"""Tests for the FRAMES Wikipedia fetcher.
We mock the MediaWiki (MW) API with respx so tests are network-free. Coverage:
* URL → title parsing (percent-encoded, underscores, query strings, non-wiki URLs)
* Filename safety (slashes, special chars)
* Cache hit short-circuits the API call
* Missing pages return ``None`` (not an exception)
* Successful fetches write ``# Title`` markdown to disk
"""
from __future__ import annotations
from pathlib import Path
import httpx
import pytest
import respx
from surfsense_evals.suites.research.frames.wiki_fetch import (
WIKI_API,
WikiFetcher,
cache_filename_for_title,
title_from_url,
)
class TestTitleFromUrl:
    """Exercise ``title_from_url`` on accepted and rejected article URLs."""

    def test_basic(self) -> None:
        """An underscore-separated path segment comes back with spaces."""
        url = "https://en.wikipedia.org/wiki/James_Buchanan"
        assert title_from_url(url) == "James Buchanan"

    def test_percent_encoded(self) -> None:
        """Percent-encoded UTF-8 (``%C3%AB``) is decoded to ``ë`` in the title."""
        url = "https://en.wikipedia.org/wiki/Charlotte_Bront%C3%AB"
        expected = "Charlotte Brontë"
        assert title_from_url(url) == expected

    def test_query_string_dropped(self) -> None:
        """The ``?action=edit`` suffix must not leak into the title."""
        url = "https://en.wikipedia.org/wiki/Foo?action=edit"
        assert title_from_url(url) == "Foo"

    def test_non_wiki_raises(self) -> None:
        """A URL on a non-Wikipedia host (``example.com``) raises ``ValueError``."""
        url = "https://example.com/wiki/Foo"
        with pytest.raises(ValueError):
            title_from_url(url)
class TestCacheFilename:
    """Exercise ``cache_filename_for_title`` on plain and awkward titles."""

    def test_simple(self) -> None:
        """A plain two-word title maps to ``<words joined by _>.md``."""
        filename = cache_filename_for_title("James Buchanan")
        assert filename == "James_Buchanan.md"

    def test_unicode_replaced_with_underscore(self) -> None:
        """A non-ASCII character is swapped for a single underscore."""
        # The diaeresis in "Brontë" is non-ASCII, so the sanitiser is
        # expected to turn it into `_`; the space becomes `_` as well,
        # leaving exactly one trailing underscore for the `ë`. The result
        # is lossy, which is fine as long as the mapping is deterministic.
        filename = cache_filename_for_title("Charlotte Brontë")
        assert filename == "Charlotte_Bront_.md"

    def test_slashes_replaced(self) -> None:
        """A ``/`` inside a title must not survive into the filename."""
        # Wikipedia titles may legitimately contain ``/`` (e.g. "I/O");
        # left as-is it would introduce an extra directory level on disk.
        filename = cache_filename_for_title("I/O")
        assert filename == "I_O.md"
@pytest.mark.asyncio
@respx.mock
async def test_fetch_success_writes_markdown(tmp_path: Path) -> None:
    """A page served by the mocked API ends up on disk as ``# Title`` markdown."""
    page = {
        "pageid": 1,
        "title": "James Buchanan",
        "extract": "James Buchanan was the 15th president of the United States.",
    }
    api_response = httpx.Response(200, json={"query": {"pages": [page]}})
    respx.get(WIKI_API).mock(return_value=api_response)

    # A high requests-per-second value is used to keep the throttle out of
    # the way during the test.
    fetcher = WikiFetcher(cache_dir=tmp_path, rate_limit_rps=100)
    article = await fetcher.fetch("https://en.wikipedia.org/wiki/James_Buchanan")

    assert article is not None
    assert article.title == "James Buchanan"

    markdown = article.markdown_path.read_text(encoding="utf-8")
    assert markdown.startswith("# James Buchanan")
    assert "15th president" in markdown
@pytest.mark.asyncio
@respx.mock
async def test_fetch_missing_page_returns_none(tmp_path: Path) -> None:
    """A page the API flags as ``missing`` yields ``None`` and no cache file."""
    missing_page = {"title": "DoesNotExist", "missing": True}
    respx.get(WIKI_API).mock(
        return_value=httpx.Response(
            200,
            json={"query": {"pages": [missing_page]}},
        )
    )

    fetcher = WikiFetcher(cache_dir=tmp_path, rate_limit_rps=100)
    article = await fetcher.fetch("https://en.wikipedia.org/wiki/DoesNotExist")

    assert article is None
    # Derive the would-be cache path from the same helper the cache-hit test
    # uses, rather than hard-coding "DoesNotExist.md": a hard-coded name would
    # make this check pass vacuously if the filename scheme ever changed.
    would_be_cached = tmp_path / cache_filename_for_title("DoesNotExist")
    assert not would_be_cached.exists()
@pytest.mark.asyncio
@respx.mock
async def test_fetch_cache_hit_skips_api(tmp_path: Path) -> None:
    """An article already present in the cache dir is returned from disk."""
    # Seed the cache directory with a file for the requested title.
    cache_file = tmp_path / cache_filename_for_title("Cached Page")
    cache_file.write_text("# Cached Page\n\nfrom disk\n", encoding="utf-8")

    fetcher = WikiFetcher(cache_dir=tmp_path, rate_limit_rps=100)

    # Deliberately no respx route here: the decorator intercepts all httpx
    # traffic, so any attempt to reach the network would make respx error
    # out instead of silently succeeding.
    article = await fetcher.fetch("https://en.wikipedia.org/wiki/Cached_Page")

    assert article is not None
    assert article.markdown_path == cache_file
    on_disk = article.markdown_path.read_text(encoding="utf-8")
    assert on_disk.endswith("from disk\n")