mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-31 19:45:15 +02:00
use trafilatura to extract page content from the chromium result
This commit is contained in:
parent
c6cc7c2a6a
commit
64cd65bc1f
4 changed files with 202 additions and 8 deletions
|
|
@ -197,6 +197,10 @@ async def index_crawled_urls(
|
|||
structured_document_for_hash, search_space_id
|
||||
)
|
||||
|
||||
logger.info(
|
||||
f"structured_document_for_hash {structured_document_for_hash} ========="
|
||||
)
|
||||
|
||||
# Check if document with this unique identifier already exists
|
||||
existing_document = await check_document_by_unique_identifier(
|
||||
session, unique_identifier_hash
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue