mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-26 21:39:43 +02:00
use trafilatura to extract page content from the chromium result
This commit is contained in:
parent
c6cc7c2a6a
commit
64cd65bc1f
4 changed files with 202 additions and 8 deletions
|
|
@ -5,13 +5,17 @@ A module for crawling web pages and extracting content using Firecrawl or AsyncC
|
|||
Provides a unified interface for web scraping.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Any
|
||||
|
||||
import trafilatura
|
||||
import validators
|
||||
from fake_useragent import UserAgent
|
||||
from firecrawl import AsyncFirecrawlApp
|
||||
from langchain_community.document_loaders import AsyncChromiumLoader
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class WebCrawlerConnector:
|
||||
"""Class for crawling web pages and extracting content."""
|
||||
|
|
@ -122,7 +126,8 @@ class WebCrawlerConnector:
|
|||
|
||||
async def _crawl_with_chromium(self, url: str) -> dict[str, Any]:
|
||||
"""
|
||||
Crawl URL using AsyncChromiumLoader with realistic User-Agent.
|
||||
Crawl URL using AsyncChromiumLoader with Trafilatura for content extraction.
|
||||
Falls back to raw HTML if Trafilatura extraction fails.
|
||||
|
||||
Args:
|
||||
url: URL to crawl
|
||||
|
|
@ -147,17 +152,77 @@ class WebCrawlerConnector:
|
|||
raise ValueError(f"Failed to load content from {url}")
|
||||
|
||||
doc = documents[0]
|
||||
raw_html = doc.page_content
|
||||
|
||||
# Extract basic metadata from the document
|
||||
metadata = doc.metadata if doc.metadata else {}
|
||||
base_metadata = doc.metadata if doc.metadata else {}
|
||||
|
||||
# Try to extract main content using Trafilatura
|
||||
extracted_content = None
|
||||
trafilatura_metadata = None
|
||||
|
||||
try:
|
||||
logger.info(
|
||||
f"Attempting to extract main content from {url} using Trafilatura"
|
||||
)
|
||||
|
||||
# Extract main content as markdown
|
||||
extracted_content = trafilatura.extract(
|
||||
raw_html,
|
||||
output_format="markdown", # Get clean markdown
|
||||
include_comments=False, # Exclude comments
|
||||
include_tables=True, # Keep tables
|
||||
include_images=True, # Keep image references
|
||||
include_links=True, # Keep links
|
||||
)
|
||||
|
||||
# Extract metadata using Trafilatura
|
||||
trafilatura_metadata = trafilatura.extract_metadata(raw_html)
|
||||
|
||||
if extracted_content and len(extracted_content.strip()) > 0:
|
||||
logger.info(
|
||||
f"Successfully extracted main content from {url} using Trafilatura "
|
||||
f"({len(extracted_content)} chars vs {len(raw_html)} chars raw HTML)"
|
||||
)
|
||||
else:
|
||||
logger.warning(
|
||||
f"Trafilatura extraction returned empty content for {url}, "
|
||||
"falling back to raw HTML"
|
||||
)
|
||||
extracted_content = None
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
f"Trafilatura extraction failed for {url}: {e}. "
|
||||
"Falling back to raw HTML"
|
||||
)
|
||||
extracted_content = None
|
||||
|
||||
# Build metadata, preferring Trafilatura metadata when available
|
||||
metadata = {
|
||||
"source": url,
|
||||
"title": (
|
||||
trafilatura_metadata.title
|
||||
if trafilatura_metadata and trafilatura_metadata.title
|
||||
else base_metadata.get("title", url)
|
||||
),
|
||||
}
|
||||
|
||||
# Add additional metadata from Trafilatura if available
|
||||
if trafilatura_metadata:
|
||||
if trafilatura_metadata.description:
|
||||
metadata["description"] = trafilatura_metadata.description
|
||||
if trafilatura_metadata.author:
|
||||
metadata["author"] = trafilatura_metadata.author
|
||||
if trafilatura_metadata.date:
|
||||
metadata["date"] = trafilatura_metadata.date
|
||||
|
||||
# Add any remaining base metadata
|
||||
metadata.update(base_metadata)
|
||||
|
||||
return {
|
||||
"content": doc.page_content,
|
||||
"metadata": {
|
||||
"source": url,
|
||||
"title": metadata.get("title", url),
|
||||
**metadata,
|
||||
},
|
||||
"content": extracted_content if extracted_content else raw_html,
|
||||
"metadata": metadata,
|
||||
"crawler_type": "chromium",
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -197,6 +197,10 @@ async def index_crawled_urls(
|
|||
structured_document_for_hash, search_space_id
|
||||
)
|
||||
|
||||
logger.info(
|
||||
f"structured_document_for_hash {structured_document_for_hash} ========="
|
||||
)
|
||||
|
||||
# Check if document with this unique identifier already exists
|
||||
existing_document = await check_document_by_unique_identifier(
|
||||
session, unique_identifier_hash
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue