webclaw/crates/webclaw-llm/src/extract.rs

/// Schema-based and prompt-based LLM extraction.
/// Both functions build a system prompt, send content to the LLM, and parse JSON back.
use crate::clean::strip_thinking_tags;
use crate::error::LlmError;
use crate::provider::{CompletionRequest, LlmProvider, Message};

/// Extract structured JSON from content using a JSON schema.
/// The schema tells the LLM exactly what fields to extract and their types.
pub async fn extract_json(
    content: &str,
    schema: &serde_json::Value,
    provider: &dyn LlmProvider,
    model: Option<&str>,
) -> Result<serde_json::Value, LlmError> {
    let system = format!(
        "You are a JSON extraction engine. Extract data from the content according to this schema.\n\
         Return ONLY valid JSON matching the schema. No explanations, no markdown, no commentary.\n\n\
         Schema:\n```json\n{}\n```",
        serde_json::to_string_pretty(schema).unwrap_or_else(|_| schema.to_string())
    );

    let request = CompletionRequest {
        model: model.unwrap_or_default().to_string(),
        messages: vec![
            Message {
                role: "system".into(),
                content: system,
            },
            Message {
                role: "user".into(),
                content: content.to_string(),
            },
        ],
        temperature: Some(0.0),
        max_tokens: None,
        json_mode: true,
    };

    let response = provider.complete(&request).await?;
    parse_json_response(&response)
}

/// Extract information using a natural language prompt.
/// More flexible than schema extraction — the user describes what they want.
pub async fn extract_with_prompt(
    content: &str,
    prompt: &str,
    provider: &dyn LlmProvider,
    model: Option<&str>,
) -> Result<serde_json::Value, LlmError> {
    let system = format!(
        "You are a JSON extraction engine. Extract information from the content based on these instructions.\n\
         Return ONLY valid JSON. No explanations, no markdown, no commentary.\n\n\
         Instructions: {prompt}"
    );

    let request = CompletionRequest {
        model: model.unwrap_or_default().to_string(),
        messages: vec![
            Message {
                role: "system".into(),
                content: system,
            },
            Message {
                role: "user".into(),
                content: content.to_string(),
            },
        ],
        temperature: Some(0.0),
        max_tokens: None,
        json_mode: true,
    };

    let response = provider.complete(&request).await?;
    parse_json_response(&response)
}

/// Parse an LLM response string as JSON. Handles common edge cases:
/// - Thinking tags (`<think>...</think>`)
/// - Markdown code fences (```json ... ```)
/// - Leading/trailing whitespace
fn parse_json_response(response: &str) -> Result<serde_json::Value, LlmError> {
    // Strip thinking tags before any JSON parsing — providers already do this,
    // but defense in depth for any caller that bypasses the provider layer
    let cleaned = strip_thinking_tags(response);
    let trimmed = cleaned.trim();

    // Strip markdown code fences if present
    let json_str = if trimmed.starts_with("```") {
        let without_opener = trimmed
            .strip_prefix("```json")
            .or_else(|| trimmed.strip_prefix("```"))
            .unwrap_or(trimmed);
        without_opener
            .strip_suffix("```")
            .unwrap_or(without_opener)
            .trim()
    } else {
        trimmed
    };

    serde_json::from_str(json_str)
        .map_err(|e| LlmError::InvalidJson(format!("{e} — raw response: {response}")))
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::testing::mock::MockProvider;

    #[test]
    fn parse_clean_json() {
        let result = parse_json_response(r#"{"name": "Rust", "version": 2024}"#).unwrap();
        assert_eq!(result["name"], "Rust");
        assert_eq!(result["version"], 2024);
    }

    #[test]
    fn parse_json_with_code_fence() {
        let response = "```json\n{\"key\": \"value\"}\n```";
        let result = parse_json_response(response).unwrap();
        assert_eq!(result["key"], "value");
    }

    #[test]
    fn parse_json_with_whitespace() {
        let response = "  \n  {\"ok\": true}  \n  ";
        let result = parse_json_response(response).unwrap();
        assert_eq!(result["ok"], true);
    }

    #[test]
    fn parse_invalid_json() {
        let result = parse_json_response("not json at all");
        assert!(matches!(result, Err(LlmError::InvalidJson(_))));
    }

    #[test]
    fn parse_json_with_thinking_tags() {
        let response = "<think>analyzing the content</think>{\"title\": \"Hello\"}";
        let result = parse_json_response(response).unwrap();
        assert_eq!(result["title"], "Hello");
    }

    #[test]
    fn parse_json_with_thinking_and_code_fence() {
        let response = "<think>let me think</think>\n```json\n{\"key\": \"value\"}\n```";
        let result = parse_json_response(response).unwrap();
        assert_eq!(result["key"], "value");
    }

    #[tokio::test]
    async fn extract_json_uses_schema_in_prompt() {
        let mock = MockProvider::ok(r#"{"title": "Test Article", "author": "Jane"}"#);

        let schema = serde_json::json!({
            "type": "object",
            "properties": {
                "title": { "type": "string" },
                "author": { "type": "string" }
            }
        });

        let result = extract_json("Some article content by Jane", &schema, &mock, None)
            .await
            .unwrap();

        assert_eq!(result["title"], "Test Article");
        assert_eq!(result["author"], "Jane");
    }

    #[tokio::test]
    async fn extract_with_prompt_returns_json() {
        let mock = MockProvider::ok(r#"{"emails": ["test@example.com"]}"#);

        let result = extract_with_prompt(
            "Contact us at test@example.com",
            "Find all email addresses",
            &mock,
            None,
        )
        .await
        .unwrap();

        assert_eq!(result["emails"][0], "test@example.com");
    }
}
Initial release: webclaw v0.1.0 — web content extraction for LLMs CLI + MCP server for extracting clean, structured content from any URL. 6 Rust crates, 10 MCP tools, TLS fingerprinting, 5 output formats. MIT Licensed \| https://webclaw.io 2026-03-23 18:31:11 +01:00			`/// Schema-based and prompt-based LLM extraction.`
			`/// Both functions build a system prompt, send content to the LLM, and parse JSON back.`
			`use crate::clean::strip_thinking_tags;`
			`use crate::error::LlmError;`
			`use crate::provider::{CompletionRequest, LlmProvider, Message};`

			`/// Extract structured JSON from content using a JSON schema.`
			`/// The schema tells the LLM exactly what fields to extract and their types.`
			`pub async fn extract_json(`
			`content: &str,`
			`schema: &serde_json::Value,`
			`provider: &dyn LlmProvider,`
			`model: Option<&str>,`
			`) -> Result<serde_json::Value, LlmError> {`
			`let system = format!(`
			`"You are a JSON extraction engine. Extract data from the content according to this schema.\n\`
			`Return ONLY valid JSON matching the schema. No explanations, no markdown, no commentary.\n\n\`
			Schema:\n```json\n{}\n```",
			`serde_json::to_string_pretty(schema).unwrap_or_else(\|_\| schema.to_string())`
			`);`

			`let request = CompletionRequest {`
			`model: model.unwrap_or_default().to_string(),`
			`messages: vec![`
			`Message {`
			`role: "system".into(),`
			`content: system,`
			`},`
			`Message {`
			`role: "user".into(),`
			`content: content.to_string(),`
			`},`
			`],`
			`temperature: Some(0.0),`
			`max_tokens: None,`
			`json_mode: true,`
			`};`

			`let response = provider.complete(&request).await?;`
			`parse_json_response(&response)`
			`}`

			`/// Extract information using a natural language prompt.`
			`/// More flexible than schema extraction — the user describes what they want.`
			`pub async fn extract_with_prompt(`
			`content: &str,`
			`prompt: &str,`
			`provider: &dyn LlmProvider,`
			`model: Option<&str>,`
			`) -> Result<serde_json::Value, LlmError> {`
			`let system = format!(`
			`"You are a JSON extraction engine. Extract information from the content based on these instructions.\n\`
			`Return ONLY valid JSON. No explanations, no markdown, no commentary.\n\n\`
			`Instructions: {prompt}"`
			`);`

			`let request = CompletionRequest {`
			`model: model.unwrap_or_default().to_string(),`
			`messages: vec![`
			`Message {`
			`role: "system".into(),`
			`content: system,`
			`},`
			`Message {`
			`role: "user".into(),`
			`content: content.to_string(),`
			`},`
			`],`
			`temperature: Some(0.0),`
			`max_tokens: None,`
			`json_mode: true,`
			`};`

			`let response = provider.complete(&request).await?;`
			`parse_json_response(&response)`
			`}`

			`/// Parse an LLM response string as JSON. Handles common edge cases:`
			/// - Thinking tags (`<think>...</think>`)
			/// - Markdown code fences (```json ... ```)
			`/// - Leading/trailing whitespace`
			`fn parse_json_response(response: &str) -> Result<serde_json::Value, LlmError> {`
			`// Strip thinking tags before any JSON parsing — providers already do this,`
			`// but defense in depth for any caller that bypasses the provider layer`
			`let cleaned = strip_thinking_tags(response);`
			`let trimmed = cleaned.trim();`

			`// Strip markdown code fences if present`
			let json_str = if trimmed.starts_with("```") {
			`let without_opener = trimmed`
			.strip_prefix("```json")
			.or_else(\|\| trimmed.strip_prefix("```"))
			`.unwrap_or(trimmed);`
			`without_opener`
			.strip_suffix("```")
			`.unwrap_or(without_opener)`
			`.trim()`
			`} else {`
			`trimmed`
			`};`

			`serde_json::from_str(json_str)`
			`.map_err(\|e\| LlmError::InvalidJson(format!("{e} — raw response: {response}")))`
			`}`

			`#[cfg(test)]`
			`mod tests {`
			`use super::*;`
			`use crate::testing::mock::MockProvider;`

			`#[test]`
			`fn parse_clean_json() {`
			`let result = parse_json_response(r#"{"name": "Rust", "version": 2024}"#).unwrap();`
			`assert_eq!(result["name"], "Rust");`
			`assert_eq!(result["version"], 2024);`
			`}`

			`#[test]`
			`fn parse_json_with_code_fence() {`
			let response = "```json\n{\"key\": \"value\"}\n```";
			`let result = parse_json_response(response).unwrap();`
			`assert_eq!(result["key"], "value");`
			`}`

			`#[test]`
			`fn parse_json_with_whitespace() {`
			`let response = " \n {\"ok\": true} \n ";`
			`let result = parse_json_response(response).unwrap();`
			`assert_eq!(result["ok"], true);`
			`}`

			`#[test]`
			`fn parse_invalid_json() {`
			`let result = parse_json_response("not json at all");`
			`assert!(matches!(result, Err(LlmError::InvalidJson(_))));`
			`}`

			`#[test]`
			`fn parse_json_with_thinking_tags() {`
			`let response = "<think>analyzing the content</think>{\"title\": \"Hello\"}";`
			`let result = parse_json_response(response).unwrap();`
			`assert_eq!(result["title"], "Hello");`
			`}`

			`#[test]`
			`fn parse_json_with_thinking_and_code_fence() {`
			let response = "<think>let me think</think>\n```json\n{\"key\": \"value\"}\n```";
			`let result = parse_json_response(response).unwrap();`
			`assert_eq!(result["key"], "value");`
			`}`

			`#[tokio::test]`
			`async fn extract_json_uses_schema_in_prompt() {`
			`let mock = MockProvider::ok(r#"{"title": "Test Article", "author": "Jane"}"#);`

			`let schema = serde_json::json!({`
			`"type": "object",`
			`"properties": {`
			`"title": { "type": "string" },`
			`"author": { "type": "string" }`
			`}`
			`});`

			`let result = extract_json("Some article content by Jane", &schema, &mock, None)`
			`.await`
			`.unwrap();`

			`assert_eq!(result["title"], "Test Article");`
			`assert_eq!(result["author"], "Jane");`
			`}`

			`#[tokio::test]`
			`async fn extract_with_prompt_returns_json() {`
			`let mock = MockProvider::ok(r#"{"emails": ["test@example.com"]}"#);`

			`let result = extract_with_prompt(`
			`"Contact us at test@example.com",`
			`"Find all email addresses",`
			`&mock,`
			`None,`
			`)`
			`.await`
			`.unwrap();`

			`assert_eq!(result["emails"][0], "test@example.com");`
			`}`
			`}`