use trafilatura to extrack page content from the chromium result

This commit is contained in:
CREDO23 2025-12-19 08:36:25 +02:00
parent c6cc7c2a6a
commit 64cd65bc1f
4 changed files with 202 additions and 8 deletions

View file

@ -5,13 +5,17 @@ A module for crawling web pages and extracting content using Firecrawl or AsyncC
Provides a unified interface for web scraping.
"""
import logging
from typing import Any
import trafilatura
import validators
from fake_useragent import UserAgent
from firecrawl import AsyncFirecrawlApp
from langchain_community.document_loaders import AsyncChromiumLoader
logger = logging.getLogger(__name__)
class WebCrawlerConnector:
"""Class for crawling web pages and extracting content."""
@ -122,7 +126,8 @@ class WebCrawlerConnector:
async def _crawl_with_chromium(self, url: str) -> dict[str, Any]:
"""
Crawl URL using AsyncChromiumLoader with realistic User-Agent.
Crawl URL using AsyncChromiumLoader with Trafilatura for content extraction.
Falls back to raw HTML if Trafilatura extraction fails.
Args:
url: URL to crawl
@ -147,17 +152,77 @@ class WebCrawlerConnector:
raise ValueError(f"Failed to load content from {url}")
doc = documents[0]
raw_html = doc.page_content
# Extract basic metadata from the document
metadata = doc.metadata if doc.metadata else {}
base_metadata = doc.metadata if doc.metadata else {}
# Try to extract main content using Trafilatura
extracted_content = None
trafilatura_metadata = None
try:
logger.info(
f"Attempting to extract main content from {url} using Trafilatura"
)
# Extract main content as markdown
extracted_content = trafilatura.extract(
raw_html,
output_format="markdown", # Get clean markdown
include_comments=False, # Exclude comments
include_tables=True, # Keep tables
include_images=True, # Keep image references
include_links=True, # Keep links
)
# Extract metadata using Trafilatura
trafilatura_metadata = trafilatura.extract_metadata(raw_html)
if extracted_content and len(extracted_content.strip()) > 0:
logger.info(
f"Successfully extracted main content from {url} using Trafilatura "
f"({len(extracted_content)} chars vs {len(raw_html)} chars raw HTML)"
)
else:
logger.warning(
f"Trafilatura extraction returned empty content for {url}, "
"falling back to raw HTML"
)
extracted_content = None
except Exception as e:
logger.warning(
f"Trafilatura extraction failed for {url}: {e}. "
"Falling back to raw HTML"
)
extracted_content = None
# Build metadata, preferring Trafilatura metadata when available
metadata = {
"source": url,
"title": (
trafilatura_metadata.title
if trafilatura_metadata and trafilatura_metadata.title
else base_metadata.get("title", url)
),
}
# Add additional metadata from Trafilatura if available
if trafilatura_metadata:
if trafilatura_metadata.description:
metadata["description"] = trafilatura_metadata.description
if trafilatura_metadata.author:
metadata["author"] = trafilatura_metadata.author
if trafilatura_metadata.date:
metadata["date"] = trafilatura_metadata.date
# Add any remaining base metadata
metadata.update(base_metadata)
return {
"content": doc.page_content,
"metadata": {
"source": url,
"title": metadata.get("title", url),
**metadata,
},
"content": extracted_content if extracted_content else raw_html,
"metadata": metadata,
"crawler_type": "chromium",
}

View file

@ -197,6 +197,10 @@ async def index_crawled_urls(
structured_document_for_hash, search_space_id
)
logger.info(
f"structured_document_for_hash {structured_document_for_hash} ========="
)
# Check if document with this unique identifier already exists
existing_document = await check_document_by_unique_identifier(
session, unique_identifier_hash