webclaw/crates/webclaw-core/src/llm/links.rs

/// Link extraction, deduplication, noise filtering, and label formatting
/// for the LLM output's deduplicated links section.
use std::collections::HashSet;

use once_cell::sync::Lazy;
use regex::Regex;

// ---------------------------------------------------------------------------
// Link extraction
// ---------------------------------------------------------------------------

/// Matches an inline markdown link of the form `[text](url)`.
///
/// Capture group 1 is the link text: any run of characters other than `]`,
/// possibly empty. Capture group 2 is the target: one or more characters other
/// than `)`. Because the target stops at the first `)`, a URL that itself
/// contains a closing parenthesis is only matched up to that point.
///
/// Image syntax (`![alt](src)`) is expected to have been stripped before this
/// pattern is applied, so a leading `!` is not handled here.
static LINK_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"\[([^\]]*)\]\(([^)]+)\)").unwrap());

/// Replace every inline `[text](url)` in `input` with its trimmed `text` and
/// collect the links worth keeping.
///
/// Returns `(cleaned_text, links)`, where `links` holds `(label, href)` pairs in
/// order of first appearance, at most one per distinct href.
///
/// A link is dropped from the list (but still replaced by its text) when:
/// - its label or href is empty after trimming,
/// - its href starts with `#` or `javascript:`,
/// - `is_noise_link` flags it.
pub(crate) fn extract_and_strip_links(input: &str) -> (String, Vec<(String, String)>) {
    let mut collected: Vec<(String, String)> = Vec::new();
    let mut known_targets: HashSet<String> = HashSet::new();

    let stripped = LINK_RE.replace_all(input, |caps: &regex::Captures| {
        let label = caps.get(1).map_or("", |m| m.as_str()).trim();
        let target = caps.get(2).map_or("", |m| m.as_str()).trim();

        let keep = !label.is_empty()
            && !target.is_empty()
            && !target.starts_with('#')
            && !target.starts_with("javascript:")
            && !is_noise_link(label, target);

        // Only hrefs that pass the filters are remembered, so a rejected
        // occurrence never blocks a later, acceptable one with the same href.
        if keep && known_targets.insert(target.to_string()) {
            collected.push((label.to_string(), target.to_string()));
        }

        // The link markup is always replaced by its bare label.
        label.to_string()
    });

    (stripped.into_owned(), collected)
}

/// Decide whether a link is noise for LLM consumption.
///
/// `text` is the link label and `href` its target. Returns `true` when any of
/// the following holds:
/// - the lowercased label is a generic action word (`hide`, `reply`, `next`, ...),
/// - the label looks like a relative timestamp (`... ago`, `yesterday`, `just now`),
/// - the label is exactly one character and that character is not alphanumeric,
/// - the label is a bare integer of at most four ASCII digits,
/// - the href contains a comment/discussion fragment,
/// - the href contains an HN-style user/action query path.
fn is_noise_link(text: &str, href: &str) -> bool {
    let t = text.to_lowercase();

    // Generic action links
    if matches!(
        t.as_str(),
        "hide"
            | "flag"
            | "reply"
            | "favorite"
            | "unflag"
            | "vouch"
            | "next"
            | "prev"
            | "previous"
            | "more"
    ) {
        return true;
    }

    // Timestamp text ("1 hour ago", "5 minutes ago", "yesterday")
    if t.ends_with(" ago") || t == "yesterday" || t == "just now" {
        return true;
    }

    // A label made of exactly one character that is not a letter or digit
    // (e.g. "|", "»", "→") carries no meaning; single letters/digits such as
    // "X" are kept. Count characters rather than bytes so that symbols encoded
    // as several UTF-8 bytes are recognised as single-character labels too.
    let mut chars = text.chars();
    if let (Some(only), None) = (chars.next(), chars.next()) {
        if !only.is_alphanumeric() {
            return true;
        }
    }

    // Bare integer labels are usually comment counts, vote counts, or page
    // numbers. The label alone carries no useful link context for an LLM.
    if !text.is_empty() && text.len() <= 4 && text.chars().all(|c| c.is_ascii_digit()) {
        return true;
    }

    // In-page comment/discussion fragments on full URLs; a check for hrefs
    // that start with `#` would not see these.
    if href.contains("#comment-stream") || href.contains("#comments") || href.contains("#disqus") {
        return true;
    }

    // Internal user profile / action URLs (HN-style)
    if href.contains("/user?id=")
        || href.contains("/hide?id=")
        || href.contains("/from?site=")
        || href.contains("/flag?id=")
    {
        return true;
    }

    false
}

// ---------------------------------------------------------------------------
// Link label cleaning
// ---------------------------------------------------------------------------

/// Matches markdown markers to strip from a link label: a run of one to six
/// `#` followed by whitespace, one or two `*`, one or two `_`, or a single
/// backtick. The pattern is unanchored, so these are matched anywhere in the
/// label, including underscores inside a word.
static MD_MARKERS_RE: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"#{1,6}\s+|\*{1,2}|_{1,2}|`").unwrap());

/// Matches accessibility "link chrome" phrases (case-insensitively) that are
/// appended to link labels, together with the whitespace/comma that precedes
/// them and an optional trailing period:
/// - `opens [in] [a] new tab|window` and `opens external link|website`,
///   anywhere in the label;
/// - `, external link` (comma required), anywhere in the label;
/// - `external link` preceded by whitespace, only at the very end of the label,
///   so the phrase is left alone when it appears mid-sentence without a comma.
static A11Y_LABEL_RE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(
        r"(?i)(?:\s*,?\s*(?:opens (?:in )?(?:a )?new (?:tab|window)|opens external (?:link|website))\b\.?|\s*,\s*external link\b\.?|\s+external link\b\.?$)",
    )
    .unwrap()
});

/// Normalise a raw link label for display.
///
/// Steps, in order: remove markdown markers, remove a11y link chrome
/// ("opens new tab", etc.), collapse all whitespace runs to single spaces,
/// collapse a label that is one phrase repeated twice, and finally shorten
/// labels longer than 80 bytes.
///
/// Shortening cuts after the last whitespace found within the first 80 bytes;
/// if there is none, it cuts at the first character boundary at or after byte
/// 80. Trailing whitespace is removed and `...` is appended.
pub(crate) fn clean_link_label(raw: &str) -> String {
    const MAX_BYTES: usize = 80;

    let without_markdown = MD_MARKERS_RE.replace_all(raw, "");
    let without_chrome = A11Y_LABEL_RE.replace_all(&without_markdown, "");
    let normalized = without_chrome
        .split_whitespace()
        .collect::<Vec<_>>()
        .join(" ");
    let label = dedup_label_phrase(&normalized);

    if label.len() <= MAX_BYTES {
        return label;
    }

    // The byte after an ASCII whitespace byte is always a char boundary, so
    // scanning raw bytes is UTF-8 safe here.
    let cut = label.as_bytes()[..MAX_BYTES]
        .iter()
        .rposition(|byte| byte.is_ascii_whitespace())
        .map(|pos| pos + 1)
        .unwrap_or_else(|| {
            // No whitespace in range: fall back to the first char boundary at
            // or after the limit.
            (MAX_BYTES..label.len())
                .find(|&idx| label.is_char_boundary(idx))
                .unwrap_or(label.len())
        });

    format!("{}...", label[..cut].trim_end())
}

/// If a label contains the same phrase twice (e.g., "X Y Z X Y Z"), return just one copy.
fn dedup_label_phrase(label: &str) -> String {
    let len = label.len();
    if len < 8 {
        return label.to_string();
    }
    // Try split at each whitespace boundary
    for (i, _) in label.match_indices(' ') {
        let left = label[..i].trim();
        let right = label[i + 1..].trim();
        if left.len() >= 4 && left.eq_ignore_ascii_case(right) {
            return left.to_string();
        }
    }
    label.to_string()
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

#[cfg(test)]
mod tests {
    use super::*;

    /// Labels over the limit are cut and marked with an ellipsis.
    #[test]
    fn link_label_truncated() {
        let long = "The quick brown fox jumps over the lazy dog and then runs across the field to find more interesting things to do on a sunny afternoon";
        let cleaned = clean_link_label(long);
        let cleaned_len = cleaned.len();
        assert!(cleaned_len <= 84, "got len {}: {cleaned}", cleaned_len);
        assert!(cleaned.ends_with("..."), "got: {cleaned}");
    }

    /// Heading and emphasis markers are removed from labels.
    #[test]
    fn link_label_markdown_stripped() {
        let cleaned = clean_link_label("## Hello **world**");
        assert_eq!(cleaned, "Hello world");
    }

    /// A phrase repeated twice collapses to a single copy.
    #[test]
    fn link_label_duplicate_deduped() {
        let cleaned = clean_link_label("Express Delivery Express Delivery");
        assert_eq!(cleaned, "Express Delivery");
    }

    /// Short, plain labels pass through untouched.
    #[test]
    fn link_label_short_unchanged() {
        let cleaned = clean_link_label("Click here");
        assert_eq!(cleaned, "Click here");
    }

    /// Action words, timestamps and profile URLs are noise; ordinary links are not.
    #[test]
    fn noise_link_detected() {
        let noisy = [
            ("hide", "https://example.com"),
            ("5 minutes ago", "https://example.com"),
            ("user", "https://hn.com/user?id=foo"),
        ];
        for (text, href) in noisy {
            assert!(is_noise_link(text, href));
        }
        assert!(!is_noise_link("Rust docs", "https://rust-lang.org"));
    }

    /// "external link" inside a sentence is ordinary prose and must survive.
    #[test]
    fn link_label_preserves_external_link_prose() {
        let prose = "Research found an external link between incidents";
        assert_eq!(
            clean_link_label(prose),
            "Research found an external link between incidents"
        );
    }

    /// A trailing "external link" suffix is a11y chrome and is removed.
    #[test]
    fn link_label_strips_terminal_external_link_chrome() {
        let cleaned = clean_link_label("Reuters story external link");
        assert_eq!(cleaned, "Reuters story");
    }
}