Mirror of https://github.com/MODSetter/SurfSense.git, synced 2026-05-02 04:12:47 +02:00
Webcrawler connector draft
parent 419f94e8ee
commit 896e410e2a
26 changed files with 1225 additions and 9 deletions (only one file's hunks are reproduced below)
```diff
@@ -49,6 +49,7 @@ from app.tasks.connector_indexers import (
     index_luma_events,
     index_notion_pages,
     index_slack_messages,
+    index_webcrawler_urls,
 )
 from app.users import current_active_user
 from app.utils.check_ownership import check_ownership
```
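For orientation: the second hunk below calls `index_webcrawler_urls` and unpacks a `(documents_processed, error_or_warning)` pair. A plausible stub of the indexer's interface, inferred purely from that call site — the return annotation and the default for `update_last_indexed` are assumptions, not the actual code in `app.tasks.connector_indexers`:

```python
from sqlalchemy.ext.asyncio import AsyncSession


# Hypothetical stub inferred from the call site in the next hunk;
# the real implementation lives in app.tasks.connector_indexers.
async def index_webcrawler_urls(
    session: AsyncSession,
    connector_id: int,
    search_space_id: int,
    user_id: str,
    start_date: str,
    end_date: str,
    update_last_indexed: bool = True,  # assumed default; the diff passes False explicitly
) -> tuple[int, str | None]:
    """Crawl and index the connector's URLs, returning
    (documents_processed, error_or_warning)."""
    ...
```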
```diff
@@ -1523,3 +1524,63 @@ async def run_elasticsearch_indexing(
             f"Critical error in run_elasticsearch_indexing for connector {connector_id}: {e}",
             exc_info=True,
         )
+
+# Add new helper functions for webcrawler indexing
+async def run_webcrawler_indexing_with_new_session(
+    connector_id: int,
+    search_space_id: int,
+    user_id: str,
+    start_date: str,
+    end_date: str,
+):
+    """
+    Create a new session and run the Webcrawler indexing task.
+    This prevents session leaks by creating a dedicated session for the background task.
+    """
+    async with async_session_maker() as session:
+        await run_webcrawler_indexing(
+            session, connector_id, search_space_id, user_id, start_date, end_date
+        )
+
+
+async def run_webcrawler_indexing(
+    session: AsyncSession,
+    connector_id: int,
+    search_space_id: int,
+    user_id: str,
+    start_date: str,
+    end_date: str,
+):
+    """
+    Background task to run Webcrawler indexing.
+    Args:
+        session: Database session
+        connector_id: ID of the webcrawler connector
+        search_space_id: ID of the search space
+        user_id: ID of the user
+        start_date: Start date for indexing
+        end_date: End date for indexing
+    """
+    try:
+        documents_processed, error_or_warning = await index_webcrawler_urls(
+            session=session,
+            connector_id=connector_id,
+            search_space_id=search_space_id,
+            user_id=user_id,
+            start_date=start_date,
+            end_date=end_date,
+            update_last_indexed=False,  # Don't update timestamp in the indexing function
+        )
+
+        # Only update last_indexed_at if indexing was successful (either new docs or updated docs)
+        if documents_processed > 0:
+            await update_connector_last_indexed(session, connector_id)
+            logger.info(
+                f"Webcrawler indexing completed successfully: {documents_processed} documents processed"
+            )
+        else:
+            logger.error(
+                f"Webcrawler indexing failed or no documents processed: {error_or_warning}"
+            )
+    except Exception as e:
+        logger.error(f"Error in background Webcrawler indexing task: {e!s}")
```
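The `_with_new_session` wrapper exists because a request-scoped database session is closed once the HTTP response goes out, so a task that runs afterwards must open its own. A minimal sketch of how a route handler might schedule the helper, assuming FastAPI's `BackgroundTasks`; the route path, parameters, and hard-coded dates are illustrative, not taken from this commit:

```python
from fastapi import APIRouter, BackgroundTasks, Depends

router = APIRouter()


# Illustrative endpoint; the commit's actual route may differ.
@router.post("/search-source-connectors/{connector_id}/index")
async def trigger_webcrawler_indexing(
    connector_id: int,
    search_space_id: int,
    background_tasks: BackgroundTasks,
    user=Depends(current_active_user),
):
    # Schedule the session-creating wrapper rather than
    # run_webcrawler_indexing itself: the task runs after the response
    # is sent, when the request's own DB session would already be closed.
    background_tasks.add_task(
        run_webcrawler_indexing_with_new_session,
        connector_id=connector_id,
        search_space_id=search_space_id,
        user_id=str(user.id),
        start_date="2026-04-01",  # placeholders; the real route likely takes these as inputs
        end_date="2026-05-02",
    )
    return {"message": "Webcrawler indexing started in the background"}
```

Note the design choice mirrored from the other connectors in this file: the indexer itself is told not to bump the timestamp (`update_last_indexed=False`), and `last_indexed_at` is only advanced by the caller once at least one document was processed, so a failed crawl does not mask unindexed content.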