"""Tests for the CRAG HTML extractor. We don't network-fetch trafilatura; we just verify the wrapper: * Strips obvious boilerplate (nav/footer/scripts) out of the result. * Falls back to the stdlib stripper on degenerate input. * Caps output at the configured ceiling. * Always prepends a metadata header (``# title``) when content is produced. """ from __future__ import annotations import pytest from surfsense_evals.suites.research.crag.html_extract import ( extract_main_content, ) _RICH_HTML = """\ Apple Q3 Earnings

Tech News Site

Subscribe to our newsletter

Apple posts $90B revenue in Q3 2024

Apple Inc. announced its Q3 2024 financial results today, reporting $90 billion in revenue, beating analyst expectations of $87 billion.

The company saw growth across iPhone, services, and wearables. CEO Tim Cook attributed the performance to strong demand in emerging markets, particularly India.

Segment breakdown

iPhone: $45B
Services: $24B
Mac: $7B

""" class TestExtractMainContent: def test_extracts_main_article(self) -> None: result = extract_main_content( _RICH_HTML, url="https://example.com/apple", page_name="Apple Q3 Earnings", ) assert result.ok assert "Apple" in result.text assert "Q3 2024" in result.text # Header line is prepended. assert result.text.startswith("# Apple Q3 Earnings") assert "Source: https://example.com/apple" in result.text def test_strips_boilerplate(self) -> None: result = extract_main_content( _RICH_HTML, url="https://example.com/apple", page_name="Apple Q3 Earnings", ) assert result.ok # Boilerplate strings should NOT make it through. assert "Subscribe to our newsletter" not in result.text assert "Copyright 2024 Tech News Site" not in result.text assert "const a=1" not in result.text # script content def test_includes_last_modified_when_provided(self) -> None: result = extract_main_content( _RICH_HTML, url="https://example.com/apple", page_name="Apple Q3 Earnings", last_modified="2024-08-01", ) assert "Last modified: 2024-08-01" in result.text def test_empty_html_returns_empty_result(self) -> None: result = extract_main_content("", url="https://x.test/") assert not result.ok assert result.method == "empty" assert result.n_chars == 0 def test_whitespace_only_html_is_empty(self) -> None: result = extract_main_content(" \n ", url="https://x.test/") assert not result.ok def test_garbage_html_falls_back(self) -> None: # Trafilatura should reject this, fallback strip should still yield text. result = extract_main_content( "<>not a tag>>>The brown fox< None: html = """

This is content one.

This is content two.

""" result = extract_main_content( html, url="https://x.test/", page_name="Title", ) assert result.ok assert "content one" in result.text assert "content two" in result.text def test_html_entities_decoded(self) -> None: html = """

It's a story about a cat <Tom> and a mouse <Jerry>.

""" result = extract_main_content(html, url="https://x.test/") assert result.ok # & should be decoded assert "&" not in result.text assert "Tom" in result.text and "Jerry" in result.text class TestOutputCapping: def test_long_output_is_truncated(self) -> None: # Generate enough content to exceed 200k cap. body = "

" + ("hello world " * 50_000) + "

" html = f"

{body}

" result = extract_main_content(html, url="https://x.test/", page_name="long") assert result.ok # The body text itself + the metadata header. Truncation marker # appears either at the body limit or before EOF. if "[...truncated...]" in result.text: # The truncation kicked in. assert len(result.text) <= 250_000 # header + 200k cap + slack