mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-06-08 22:25:12 +02:00
chore: bump to 0.3.9, fix formatting from #14
Version bump for layout table, stack overflow, and noise filter fixes contributed by @devnen. Also fixes cargo fmt issues that caused CI lint failure on the merge commit. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
87ecf4241f
commit
3cf9dbaf2a
7 changed files with 137 additions and 39 deletions
|
|
@ -1492,7 +1492,8 @@ mod form_integration_tests {
|
|||
#[test]
|
||||
fn aspnet_form_content_extraction() {
|
||||
let content = "x".repeat(600); // Ensure >500 chars
|
||||
let html = format!(r#"<html><body>
|
||||
let html = format!(
|
||||
r#"<html><body>
|
||||
<form method="post" action="./page.aspx" id="form1">
|
||||
<div class="wrapper">
|
||||
<div class="header"><a href="/">Logo</a></div>
|
||||
|
|
@ -1503,12 +1504,19 @@ mod form_integration_tests {
|
|||
</div>
|
||||
</div>
|
||||
</form>
|
||||
</body></html>"#);
|
||||
</body></html>"#
|
||||
);
|
||||
let doc = Html::parse_document(&html);
|
||||
let opts = ExtractionOptions::default();
|
||||
let result = extract_content(&doc, None, &opts);
|
||||
assert!(result.markdown.contains("Section"), "h2 missing from markdown");
|
||||
assert!(result.markdown.contains("Question"), "h3 missing from markdown");
|
||||
assert!(
|
||||
result.markdown.contains("Section"),
|
||||
"h2 missing from markdown"
|
||||
);
|
||||
assert!(
|
||||
result.markdown.contains("Question"),
|
||||
"h3 missing from markdown"
|
||||
);
|
||||
}
|
||||
|
||||
/// Simulate unclosed header div absorbing the content div.
|
||||
|
|
@ -1520,7 +1528,8 @@ mod form_integration_tests {
|
|||
// The header div is intentionally NOT closed — the HTML parser makes
|
||||
// div.content a child of div.header. The safety valve (>5000 chars)
|
||||
// should prevent div.header from being treated as noise.
|
||||
let html = format!(r#"<html><body>
|
||||
let html = format!(
|
||||
r#"<html><body>
|
||||
<div class="wrapper">
|
||||
<div class="header"><a href="/">Logo</a>
|
||||
<div class="content">
|
||||
|
|
@ -1529,11 +1538,18 @@ mod form_integration_tests {
|
|||
<p>{faq}</p>
|
||||
</div>
|
||||
</div>
|
||||
</body></html>"#);
|
||||
</body></html>"#
|
||||
);
|
||||
let doc = Html::parse_document(&html);
|
||||
let opts = ExtractionOptions::default();
|
||||
let result = extract_content(&doc, None, &opts);
|
||||
assert!(result.markdown.contains("FAQ Section"), "h2 missing: header swallowed content");
|
||||
assert!(result.markdown.contains("First question"), "h3 missing: header swallowed content");
|
||||
assert!(
|
||||
result.markdown.contains("FAQ Section"),
|
||||
"h2 missing: header swallowed content"
|
||||
);
|
||||
assert!(
|
||||
result.markdown.contains("First question"),
|
||||
"h3 missing: header swallowed content"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue