mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-06-14 23:25:12 +02:00
CLI + MCP server for extracting clean, structured content from any URL. 6 Rust crates, 10 MCP tools, TLS fingerprinting, 5 output formats. MIT Licensed | https://webclaw.io
124 lines
4 KiB
Rust
124 lines
4 KiB
Rust
//! Post-processing for LLM responses.
//! Strips chain-of-thought reasoning tags that models like qwen3 emit.
//! Applied to every provider response so callers never see internal reasoning.
|
|
|
|
/// Strip `<think>...</think>` blocks from LLM responses.
///
/// Models like qwen3 wrap internal chain-of-thought reasoning in these tags.
/// Handles multiline content, multiple blocks, and partial/malformed tags:
///
/// - Every complete `<think>...</think>` block is removed.
/// - An unclosed `<think>` discards everything after it.
/// - Orphan `</think>` tags and standalone `/think` fragments are removed.
///   A `/think` that merely starts a longer word (e.g. a URL path such as
///   `/thinking`) is left intact so real content is not corrupted.
///
/// Returns the remaining text with leading and trailing whitespace trimmed;
/// the result is empty when nothing but reasoning was present.
pub fn strip_thinking_tags(text: &str) -> String {
    const OPEN_TAG: &str = "<think>";
    const CLOSE_TAG: &str = "</think>";

    let mut result = String::with_capacity(text.len());
    let mut remaining = text;

    while let Some(start) = remaining.find(OPEN_TAG) {
        // Keep everything before the opening tag.
        result.push_str(&remaining[..start]);

        let after_open = &remaining[start + OPEN_TAG.len()..];
        remaining = match after_open.find(CLOSE_TAG) {
            // Resume right after the matching closing tag.
            Some(end) => &after_open[end + CLOSE_TAG.len()..],
            // Unclosed <think>: the model was still "thinking", so drop the rest.
            None => "",
        };
    }
    result.push_str(remaining);

    // Clean up leftovers from partial responses: orphan closing tags first,
    // then bare `/think` fragments.
    let result = result.replace(CLOSE_TAG, "");
    let result = strip_think_fragments(&result);

    // Stripping tends to leave whitespace behind at either end.
    result.trim().to_string()
}

/// Remove standalone `/think` fragments from `text`.
///
/// A fragment immediately followed by an alphanumeric character, `_` or `-`
/// is treated as the start of a longer word (e.g. `/thinking`, `/think-tank`)
/// and is kept. What precedes the fragment is not inspected, so a path that
/// ends in exactly `/think` is still stripped.
fn strip_think_fragments(text: &str) -> String {
    const FRAGMENT: &str = "/think";

    let mut out = String::with_capacity(text.len());
    let mut rest = text;

    while let Some(pos) = rest.find(FRAGMENT) {
        let after = &rest[pos + FRAGMENT.len()..];
        let continues_word = matches!(
            after.chars().next(),
            Some(c) if c.is_alphanumeric() || c == '_' || c == '-'
        );

        if continues_word {
            // Part of a longer word: keep the fragment itself.
            out.push_str(&rest[..pos + FRAGMENT.len()]);
        } else {
            out.push_str(&rest[..pos]);
        }
        rest = after;
    }

    out.push_str(rest);
    out
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that running `strip_thinking_tags` on `input` yields `expected`.
    #[track_caller]
    fn assert_stripped(input: &str, expected: &str) {
        assert_eq!(strip_thinking_tags(input), expected);
    }

    #[test]
    fn strips_simple_thinking_block() {
        assert_stripped(
            "<think>reasoning here</think>actual response",
            "actual response",
        );
    }

    #[test]
    fn strips_multiline_thinking() {
        assert_stripped(
            "<think>\nlong\nthinking\nprocess\n</think>\nclean output",
            "clean output",
        );
    }

    #[test]
    fn passthrough_no_tags() {
        assert_stripped("no thinking tags here", "no thinking tags here");
    }

    #[test]
    fn strips_partial_think_at_end() {
        assert_stripped("some text /think", "some text");
    }

    #[test]
    fn strips_orphan_closing_tag() {
        assert_stripped("some text</think> more text", "some text more text");
    }

    #[test]
    fn strips_multiple_thinking_blocks() {
        assert_stripped(
            "<think>first</think>hello <think>second</think>world",
            "hello world",
        );
    }

    #[test]
    fn handles_unclosed_think_tag() {
        // The model opened a reasoning block and never closed it, so everything
        // after <think> is expected to be dropped.
        assert_stripped("good content<think>still reasoning...", "good content");
    }

    #[test]
    fn handles_empty_thinking_block() {
        assert_stripped("<think></think>content", "content");
    }

    #[test]
    fn handles_only_thinking() {
        assert_stripped("<think>just thinking, no output</think>", "");
    }

    #[test]
    fn preserves_json_content() {
        assert_stripped(
            "<think>let me analyze...</think>{\"key\": \"value\", \"count\": 42}",
            "{\"key\": \"value\", \"count\": 42}",
        );
    }

    #[test]
    fn real_world_extract_leak() {
        // Regression case for the reported bug: qwen3 leaked "/think" into JSON values.
        assert_stripped(
            "<think>analyzing the page</think>{\"learn_more\": \"Learn more\"}",
            "{\"learn_more\": \"Learn more\"}",
        );
    }

    #[test]
    fn thinking_with_newlines_before_json() {
        assert_stripped(
            "<think>\nstep 1\nstep 2\n</think>\n\n{\"result\": true}",
            "{\"result\": true}",
        );
    }
}
|