mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-21 18:55:16 +02:00
chore: evals
This commit is contained in:
parent
2402b730fa
commit
3737118050
122 changed files with 22598 additions and 13 deletions
149
surfsense_evals/tests/suites/test_crag_html_extract.py
Normal file
149
surfsense_evals/tests/suites/test_crag_html_extract.py
Normal file
|
|
@ -0,0 +1,149 @@
|
|||
"""Tests for the CRAG HTML extractor.
|
||||
|
||||
We don't network-fetch trafilatura; we just verify the wrapper:
|
||||
|
||||
* Strips obvious boilerplate (nav/footer/scripts) out of the result.
|
||||
* Falls back to the stdlib stripper on degenerate input.
|
||||
* Caps output at the configured ceiling.
|
||||
* Always prepends a metadata header (``# title``) when content is
|
||||
produced.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
from surfsense_evals.suites.research.crag.html_extract import (
|
||||
extract_main_content,
|
||||
)
|
||||
|
||||
|
||||
_RICH_HTML = """\
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head><title>Apple Q3 Earnings</title>
|
||||
<script>const a=1;</script>
|
||||
<style>body{font-family:sans;}</style>
|
||||
</head>
|
||||
<body>
|
||||
<nav><a href="/home">Home</a><a href="/about">About</a></nav>
|
||||
<header><h1>Tech News Site</h1><p>Subscribe to our newsletter</p></header>
|
||||
<main>
|
||||
<article>
|
||||
<h1>Apple posts $90B revenue in Q3 2024</h1>
|
||||
<p>Apple Inc. announced its Q3 2024 financial results today, reporting
|
||||
$90 billion in revenue, beating analyst expectations of $87 billion.</p>
|
||||
<p>The company saw growth across iPhone, services, and wearables.
|
||||
CEO Tim Cook attributed the performance to strong demand in emerging
|
||||
markets, particularly India.</p>
|
||||
<h2>Segment breakdown</h2>
|
||||
<ul>
|
||||
<li>iPhone: $45B</li>
|
||||
<li>Services: $24B</li>
|
||||
<li>Mac: $7B</li>
|
||||
</ul>
|
||||
</article>
|
||||
</main>
|
||||
<footer><p>Copyright 2024 Tech News Site. All rights reserved.</p></footer>
|
||||
</body></html>
|
||||
"""
|
||||
|
||||
|
||||
class TestExtractMainContent:
|
||||
def test_extracts_main_article(self) -> None:
|
||||
result = extract_main_content(
|
||||
_RICH_HTML,
|
||||
url="https://example.com/apple",
|
||||
page_name="Apple Q3 Earnings",
|
||||
)
|
||||
assert result.ok
|
||||
assert "Apple" in result.text
|
||||
assert "Q3 2024" in result.text
|
||||
# Header line is prepended.
|
||||
assert result.text.startswith("# Apple Q3 Earnings")
|
||||
assert "Source: https://example.com/apple" in result.text
|
||||
|
||||
def test_strips_boilerplate(self) -> None:
|
||||
result = extract_main_content(
|
||||
_RICH_HTML,
|
||||
url="https://example.com/apple",
|
||||
page_name="Apple Q3 Earnings",
|
||||
)
|
||||
assert result.ok
|
||||
# Boilerplate strings should NOT make it through.
|
||||
assert "Subscribe to our newsletter" not in result.text
|
||||
assert "Copyright 2024 Tech News Site" not in result.text
|
||||
assert "const a=1" not in result.text # script content
|
||||
|
||||
def test_includes_last_modified_when_provided(self) -> None:
|
||||
result = extract_main_content(
|
||||
_RICH_HTML,
|
||||
url="https://example.com/apple",
|
||||
page_name="Apple Q3 Earnings",
|
||||
last_modified="2024-08-01",
|
||||
)
|
||||
assert "Last modified: 2024-08-01" in result.text
|
||||
|
||||
def test_empty_html_returns_empty_result(self) -> None:
|
||||
result = extract_main_content("", url="https://x.test/")
|
||||
assert not result.ok
|
||||
assert result.method == "empty"
|
||||
assert result.n_chars == 0
|
||||
|
||||
def test_whitespace_only_html_is_empty(self) -> None:
|
||||
result = extract_main_content(" \n ", url="https://x.test/")
|
||||
assert not result.ok
|
||||
|
||||
def test_garbage_html_falls_back(self) -> None:
|
||||
# Trafilatura should reject this, fallback strip should still yield text.
|
||||
result = extract_main_content(
|
||||
"<<weird>>not a tag>>>The brown fox<<jumped<<",
|
||||
url="https://x.test/garbage",
|
||||
page_name="Garbage",
|
||||
)
|
||||
# Either trafilatura recovers something or fallback_strip does.
|
||||
if result.ok:
|
||||
assert "brown fox" in result.text or "jumped" in result.text
|
||||
|
||||
|
||||
class TestFallbackStripper:
|
||||
def test_extract_when_no_clear_main(self) -> None:
|
||||
html = """
|
||||
<html><body>
|
||||
<p>This is content one.</p>
|
||||
<p>This is content two.</p>
|
||||
</body></html>
|
||||
"""
|
||||
result = extract_main_content(
|
||||
html, url="https://x.test/", page_name="Title",
|
||||
)
|
||||
assert result.ok
|
||||
assert "content one" in result.text
|
||||
assert "content two" in result.text
|
||||
|
||||
def test_html_entities_decoded(self) -> None:
|
||||
html = """<html><body>
|
||||
<article>
|
||||
<p>Tom & Jerry — classic cartoon © 1940.</p>
|
||||
<p>It's a story about a cat <Tom> and a mouse <Jerry>.</p>
|
||||
</article>
|
||||
</body></html>"""
|
||||
result = extract_main_content(html, url="https://x.test/")
|
||||
assert result.ok
|
||||
# & should be decoded
|
||||
assert "&" not in result.text
|
||||
assert "Tom" in result.text and "Jerry" in result.text
|
||||
|
||||
|
||||
class TestOutputCapping:
|
||||
def test_long_output_is_truncated(self) -> None:
|
||||
# Generate enough content to exceed 200k cap.
|
||||
body = "<p>" + ("hello world " * 50_000) + "</p>"
|
||||
html = f"<html><body><article>{body}</article></body></html>"
|
||||
result = extract_main_content(html, url="https://x.test/", page_name="long")
|
||||
assert result.ok
|
||||
# The body text itself + the metadata header. Truncation marker
|
||||
# appears either at the body limit or before EOF.
|
||||
if "[...truncated...]" in result.text:
|
||||
# The truncation kicked in.
|
||||
assert len(result.text) <= 250_000 # header + 200k cap + slack
|
||||
Loading…
Add table
Add a link
Reference in a new issue