webclaw/crates/webclaw-llm/src/clean.rs
Valerio c99ec684fa Initial release: webclaw v0.1.0 — web content extraction for LLMs
CLI + MCP server for extracting clean, structured content from any URL.
6 Rust crates, 10 MCP tools, TLS fingerprinting, 5 output formats.

MIT Licensed | https://webclaw.io
2026-03-23 18:31:11 +01:00

124 lines
4 KiB
Rust

//! Post-processing for LLM responses.
//! Strips chain-of-thought reasoning tags that models like qwen3 emit.
//! Applied to every provider response so callers never see internal reasoning.
/// Strip `<think>...</think>` blocks from LLM responses.
///
/// Models like qwen3 wrap internal chain-of-thought reasoning in these tags.
/// The cleanup runs in three passes:
///
/// 1. Every `<think>...</think>` block is removed, including multiline and
///    repeated blocks. An unclosed `<think>` discards the rest of the text.
/// 2. Orphan `</think>` closing tags are removed.
/// 3. Stray `/think` fragments are removed, but only when they stand alone:
///    a `/think` that runs straight into a letter, digit, or `_` (as in the
///    URL path `/thinking`) is treated as real content and kept.
///
/// The result is trimmed of leading and trailing whitespace; a response that
/// held nothing but reasoning yields an empty string.
pub fn strip_thinking_tags(text: &str) -> String {
    const OPEN_TAG: &str = "<think>";
    const CLOSE_TAG: &str = "</think>";

    let mut result = String::with_capacity(text.len());
    let mut remaining = text;

    while let Some(start) = remaining.find(OPEN_TAG) {
        // Keep everything before the opening tag.
        result.push_str(&remaining[..start]);

        // Skip to just past the first closing tag that follows.
        let after_open = &remaining[start + OPEN_TAG.len()..];
        remaining = match after_open.find(CLOSE_TAG) {
            Some(end) => &after_open[end + CLOSE_TAG.len()..],
            // Unclosed <think>: the model was still "thinking", so nothing
            // after the tag is usable output.
            None => "",
        };
    }
    result.push_str(remaining);

    // Clean up leftovers from partial responses: orphan closing tags first,
    // then bare `/think` fragments.
    let result = result.replace(CLOSE_TAG, "");
    let result = strip_think_fragments(&result);

    // Stripping usually leaves blank lines or spaces at the edges.
    result.trim().to_string()
}

/// Remove standalone `/think` fragments from `text`.
///
/// A fragment directly followed by a letter, digit, or `_` is kept, because
/// it is then part of a longer token (for example the path `/thinking`)
/// rather than a leaked tag.
fn strip_think_fragments(text: &str) -> String {
    const FRAGMENT: &str = "/think";

    let mut cleaned = String::with_capacity(text.len());
    let mut rest = text;

    while let Some(pos) = rest.find(FRAGMENT) {
        let after = &rest[pos + FRAGMENT.len()..];
        let continues_word = matches!(
            after.chars().next(),
            Some(c) if c.is_alphanumeric() || c == '_'
        );

        if continues_word {
            // Part of a longer token: copy it through unchanged.
            cleaned.push_str(&rest[..pos + FRAGMENT.len()]);
        } else {
            // Leaked fragment: keep only the text before it.
            cleaned.push_str(&rest[..pos]);
        }
        rest = after;
    }
    cleaned.push_str(rest);
    cleaned
}
#[cfg(test)]
mod tests {
    use super::*;

    // One block directly in front of the answer.
    #[test]
    fn strips_simple_thinking_block() {
        let cleaned = strip_thinking_tags("<think>reasoning here</think>actual response");
        assert_eq!(cleaned, "actual response");
    }

    // The block spans several lines; the newline after it is trimmed too.
    #[test]
    fn strips_multiline_thinking() {
        let cleaned = strip_thinking_tags("<think>\nlong\nthinking\nprocess\n</think>\nclean output");
        assert_eq!(cleaned, "clean output");
    }

    // Text without any tags comes back unchanged.
    #[test]
    fn passthrough_no_tags() {
        let cleaned = strip_thinking_tags("no thinking tags here");
        assert_eq!(cleaned, "no thinking tags here");
    }

    // A bare `/think` fragment at the end is dropped along with the space before it.
    #[test]
    fn strips_partial_think_at_end() {
        let cleaned = strip_thinking_tags("some text /think");
        assert_eq!(cleaned, "some text");
    }

    // A closing tag with no opening tag is removed; the text around it stays.
    #[test]
    fn strips_orphan_closing_tag() {
        let cleaned = strip_thinking_tags("some text</think> more text");
        assert_eq!(cleaned, "some text more text");
    }

    // Every block is removed, not just the first one.
    #[test]
    fn strips_multiple_thinking_blocks() {
        let cleaned = strip_thinking_tags("<think>first</think>hello <think>second</think>world");
        assert_eq!(cleaned, "hello world");
    }

    // The model opened a block and never closed it: everything from the
    // opening tag onwards is discarded.
    #[test]
    fn handles_unclosed_think_tag() {
        let cleaned = strip_thinking_tags("good content<think>still reasoning...");
        assert_eq!(cleaned, "good content");
    }

    // An empty block is still a block.
    #[test]
    fn handles_empty_thinking_block() {
        let cleaned = strip_thinking_tags("<think></think>content");
        assert_eq!(cleaned, "content");
    }

    // A response that is nothing but reasoning yields an empty string.
    #[test]
    fn handles_only_thinking() {
        let cleaned = strip_thinking_tags("<think>just thinking, no output</think>");
        assert_eq!(cleaned, "");
    }

    // The JSON payload after the block must come through byte-for-byte.
    #[test]
    fn preserves_json_content() {
        let cleaned =
            strip_thinking_tags(r#"<think>let me analyze...</think>{"key": "value", "count": 42}"#);
        assert_eq!(cleaned, r#"{"key": "value", "count": 42}"#);
    }

    // Regression case for a reported bug where qwen3 leaked "/think" into
    // JSON values.
    #[test]
    fn real_world_extract_leak() {
        let cleaned =
            strip_thinking_tags(r#"<think>analyzing the page</think>{"learn_more": "Learn more"}"#);
        assert_eq!(cleaned, r#"{"learn_more": "Learn more"}"#);
    }

    // Blank lines between the block and the JSON payload are trimmed away.
    #[test]
    fn thinking_with_newlines_before_json() {
        let cleaned = strip_thinking_tags("<think>\nstep 1\nstep 2\n</think>\n\n{\"result\": true}");
        assert_eq!(cleaned, "{\"result\": true}");
    }
}