mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-17 18:35:19 +02:00
112 lines
4 KiB
Python
112 lines
4 KiB
Python
"""Tests for the FRAMES Wikipedia fetcher.

We mock the MW API with respx so tests are network-free. Coverage:

* URL → title parsing (underscores, percent-encoding, query strings,
  non-Wikipedia URLs)
* Filename safety (slashes, non-ASCII chars)
* Cache hit short-circuits the API call
* Missing pages return ``None`` (not an exception)
* Successful fetches write ``# Title`` markdown to disk
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from pathlib import Path
|
|
|
|
import httpx
|
|
import pytest
|
|
import respx
|
|
|
|
from surfsense_evals.suites.research.frames.wiki_fetch import (
|
|
WIKI_API,
|
|
WikiFetcher,
|
|
cache_filename_for_title,
|
|
title_from_url,
|
|
)
|
|
|
|
|
|
class TestTitleFromUrl:
    """Checks for ``title_from_url``: Wikipedia article URL → page title."""

    def test_basic(self) -> None:
        # Underscores in the URL path come back as spaces in the title.
        assert title_from_url("https://en.wikipedia.org/wiki/James_Buchanan") == "James Buchanan"

    def test_percent_encoded(self) -> None:
        # ``%C3%AB`` is the UTF-8 percent-encoding of "ë" and must be decoded.
        assert (
            title_from_url("https://en.wikipedia.org/wiki/Charlotte_Bront%C3%AB")
            == "Charlotte Brontë"
        )

    def test_query_string_dropped(self) -> None:
        # Everything from ``?`` onwards is not part of the title.
        assert title_from_url("https://en.wikipedia.org/wiki/Foo?action=edit") == "Foo"

    def test_non_wiki_raises(self) -> None:
        # A ``/wiki/`` path on a non-Wikipedia host is rejected with ValueError.
        with pytest.raises(ValueError):
            title_from_url("https://example.com/wiki/Foo")
|
|
|
|
|
|
class TestCacheFilename:
    """Checks for ``cache_filename_for_title``: page title → cache file name."""

    def test_simple(self) -> None:
        # Spaces become underscores and a ``.md`` suffix is appended.
        assert cache_filename_for_title("James Buchanan") == "James_Buchanan.md"

    def test_unicode_replaced_with_underscore(self) -> None:
        # Brontë's diaeresis is non-ASCII so the regex replaces it with `_`.
        # The space → `_` happens after the unicode swap, so the final
        # name has exactly one underscore for the diaeresis. Acceptable:
        # the mapping is lossy but deterministic, so the same title always
        # resolves to the same file name.
        assert cache_filename_for_title("Charlotte Brontë") == "Charlotte_Bront_.md"

    def test_slashes_replaced(self) -> None:
        # Wikipedia titles can contain ``/`` (e.g. "I/O"), which would
        # break the filesystem layout if not sanitised.
        assert cache_filename_for_title("I/O") == "I_O.md"
|
|
|
|
|
|
@pytest.mark.asyncio
@respx.mock
async def test_fetch_success_writes_markdown(tmp_path: Path) -> None:
    """A found page is returned and its markdown starts with ``# <title>``."""
    # Canned API reply describing a single existing page.
    api_payload = {"query": {"pages": [{
        "pageid": 1,
        "title": "James Buchanan",
        "extract": "James Buchanan was the 15th president of the United States.",
    }]}}
    respx.get(WIKI_API).mock(return_value=httpx.Response(200, json=api_payload))

    fetcher = WikiFetcher(cache_dir=tmp_path, rate_limit_rps=100)  # disable throttle
    article = await fetcher.fetch("https://en.wikipedia.org/wiki/James_Buchanan")

    assert article is not None
    assert article.title == "James Buchanan"
    # The markdown on disk carries the title heading and the mocked extract.
    body = article.markdown_path.read_text(encoding="utf-8")
    assert body.startswith("# James Buchanan")
    assert "15th president" in body
|
|
|
|
|
|
@pytest.mark.asyncio
@respx.mock
async def test_fetch_missing_page_returns_none(tmp_path: Path) -> None:
    """A page flagged ``missing`` yields ``None`` and leaves no cache file."""
    # Canned API reply: the page entry has no extract, only ``missing``.
    api_payload = {"query": {"pages": [{
        "title": "DoesNotExist",
        "missing": True,
    }]}}
    respx.get(WIKI_API).mock(return_value=httpx.Response(200, json=api_payload))

    fetcher = WikiFetcher(cache_dir=tmp_path, rate_limit_rps=100)
    article = await fetcher.fetch("https://en.wikipedia.org/wiki/DoesNotExist")

    assert article is None
    # Nothing may be written to the cache for a page that does not exist.
    assert not (tmp_path / "DoesNotExist.md").exists()
|
|
|
|
|
|
@pytest.mark.asyncio
@respx.mock
async def test_fetch_cache_hit_skips_api(tmp_path: Path) -> None:
    """A title already cached on disk is served without calling the API."""
    # Pre-populate the cache.
    cached = tmp_path / cache_filename_for_title("Cached Page")
    cached.write_text("# Cached Page\n\nfrom disk\n", encoding="utf-8")
    fetcher = WikiFetcher(cache_dir=tmp_path, rate_limit_rps=100)

    # No respx mock registered; if the fetcher hits the network, respx
    # would error out (it intercepts everything inside the decorator).
    article = await fetcher.fetch("https://en.wikipedia.org/wiki/Cached_Page")

    assert article is not None
    # The returned article points at the pre-existing file, content untouched.
    assert article.markdown_path == cached
    assert article.markdown_path.read_text(encoding="utf-8").endswith("from disk\n")
|