mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-26 21:39:43 +02:00
Renaming resources
This commit is contained in:
parent
6d19e0fad8
commit
121e2f0c0e
24 changed files with 117 additions and 273 deletions
|
|
@ -44,9 +44,9 @@ def upgrade() -> None:
|
|||
IF NOT EXISTS (
|
||||
SELECT 1 FROM pg_type t
|
||||
JOIN pg_enum e ON t.oid = e.enumtypid
|
||||
WHERE t.typname = 'documenttype' AND e.enumlabel = 'WEBCRAWLER_CONNECTOR'
|
||||
WHERE t.typname = 'documenttype' AND e.enumlabel = 'CRAWLED_URL'
|
||||
) THEN
|
||||
ALTER TYPE documenttype ADD VALUE 'WEBCRAWLER_CONNECTOR';
|
||||
ALTER TYPE documenttype ADD VALUE 'CRAWLED_URL';
|
||||
END IF;
|
||||
END
|
||||
$$;
|
||||
|
|
@ -671,7 +671,7 @@ async def fetch_relevant_documents(
|
|||
(
|
||||
source_object,
|
||||
crawled_urls_chunks,
|
||||
) = await connector_service.search_crawled_urls(
|
||||
) = await connector_service.search_webcrawler(
|
||||
user_query=reformulated_query,
|
||||
user_id=user_id,
|
||||
search_space_id=search_space_id,
|
||||
|
|
|
|||
|
|
@ -34,7 +34,7 @@ You are SurfSense, an advanced AI research assistant that provides detailed, wel
|
|||
- TAVILY_API: "Tavily search API results" (personalized search results)
|
||||
- LINKUP_API: "Linkup search API results" (personalized search results)
|
||||
- LUMA_CONNECTOR: "Luma events"
|
||||
- WEBCRAWLER_CONNECTOR: "Webpages indexed by SurfSense web crawler" (personally selected websites)
|
||||
- WEBCRAWLER_CONNECTOR: "Webpages indexed by SurfSense" (personally selected websites)
|
||||
</knowledge_sources>
|
||||
|
||||
<instructions>
|
||||
|
|
|
|||
|
|
@ -208,9 +208,6 @@ class Config:
|
|||
# LlamaCloud API Key
|
||||
LLAMA_CLOUD_API_KEY = os.getenv("LLAMA_CLOUD_API_KEY")
|
||||
|
||||
# Firecrawl API Key
|
||||
FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY", None)
|
||||
|
||||
# Litellm TTS Configuration
|
||||
TTS_SERVICE = os.getenv("TTS_SERVICE")
|
||||
TTS_SERVICE_API_BASE = os.getenv("TTS_SERVICE_API_BASE")
|
||||
|
|
|
|||
|
|
@ -49,7 +49,7 @@ from app.tasks.connector_indexers import (
|
|||
index_luma_events,
|
||||
index_notion_pages,
|
||||
index_slack_messages,
|
||||
index_webcrawler_urls,
|
||||
index_crawled_urls,
|
||||
)
|
||||
from app.users import current_active_user
|
||||
from app.utils.check_ownership import check_ownership
|
||||
|
|
@ -691,12 +691,12 @@ async def index_connector_content(
|
|||
response_message = "Elasticsearch indexing started in the background."
|
||||
|
||||
elif connector.connector_type == SearchSourceConnectorType.WEBCRAWLER_CONNECTOR:
|
||||
from app.tasks.celery_tasks.connector_tasks import index_webcrawler_urls_task
|
||||
from app.tasks.celery_tasks.connector_tasks import index_crawled_urls_task
|
||||
|
||||
logger.info(
|
||||
f"Triggering web pages indexing for connector {connector_id} into search space {search_space_id} from {indexing_from} to {indexing_to}"
|
||||
)
|
||||
index_webcrawler_urls_task.delay(
|
||||
index_crawled_urls_task.delay(
|
||||
connector_id, search_space_id, str(user.id), indexing_from, indexing_to
|
||||
)
|
||||
response_message = "Web page indexing started in the background."
|
||||
|
|
@ -1537,8 +1537,8 @@ async def run_elasticsearch_indexing(
|
|||
exc_info=True,
|
||||
)
|
||||
|
||||
# Add new helper functions for webcrawler indexing
|
||||
async def run_webcrawler_indexing_with_new_session(
|
||||
# Add new helper functions for crawled web page indexing
|
||||
async def run_web_page_indexing_with_new_session(
|
||||
connector_id: int,
|
||||
search_space_id: int,
|
||||
user_id: str,
|
||||
|
|
@ -1546,16 +1546,16 @@ async def run_webcrawler_indexing_with_new_session(
|
|||
end_date: str,
|
||||
):
|
||||
"""
|
||||
Create a new session and run the Webcrawler indexing task.
|
||||
Create a new session and run the Web page indexing task.
|
||||
This prevents session leaks by creating a dedicated session for the background task.
|
||||
"""
|
||||
async with async_session_maker() as session:
|
||||
await run_webcrawler_indexing(
|
||||
await run_web_page_indexing(
|
||||
session, connector_id, search_space_id, user_id, start_date, end_date
|
||||
)
|
||||
|
||||
|
||||
async def run_webcrawler_indexing(
|
||||
async def run_web_page_indexing(
|
||||
session: AsyncSession,
|
||||
connector_id: int,
|
||||
search_space_id: int,
|
||||
|
|
@ -1564,7 +1564,7 @@ async def run_webcrawler_indexing(
|
|||
end_date: str,
|
||||
):
|
||||
"""
|
||||
Background task to run Webcrawler indexing.
|
||||
Background task to run Web page indexing.
|
||||
Args:
|
||||
session: Database session
|
||||
connector_id: ID of the webcrawler connector
|
||||
|
|
@ -1574,7 +1574,7 @@ async def run_webcrawler_indexing(
|
|||
end_date: End date for indexing
|
||||
"""
|
||||
try:
|
||||
documents_processed, error_or_warning = await index_webcrawler_urls(
|
||||
documents_processed, error_or_warning = await index_crawled_urls(
|
||||
session=session,
|
||||
connector_id=connector_id,
|
||||
search_space_id=search_space_id,
|
||||
|
|
@ -1588,11 +1588,11 @@ async def run_webcrawler_indexing(
|
|||
if documents_processed > 0:
|
||||
await update_connector_last_indexed(session, connector_id)
|
||||
logger.info(
|
||||
f"Webcrawler indexing completed successfully: {documents_processed} documents processed"
|
||||
f"Web page indexing completed successfully: {documents_processed} documents processed"
|
||||
)
|
||||
else:
|
||||
logger.error(
|
||||
f"Webcrawler indexing failed or no documents processed: {error_or_warning}"
|
||||
f"Web page indexing failed or no documents processed: {error_or_warning}"
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"Error in background Webcrawler indexing task: {e!s}")
|
||||
logger.error(f"Error in background Web page indexing task: {e!s}")
|
||||
|
|
@ -2573,4 +2573,4 @@ class ConnectorService:
|
|||
"sources": sources_list,
|
||||
}
|
||||
|
||||
return result_object, elasticsearch_chunks
|
||||
return result_object, elasticsearch_chunks
|
||||
|
|
@ -602,8 +602,8 @@ async def _index_elasticsearch_documents(
|
|||
)
|
||||
|
||||
|
||||
@celery_app.task(name="index_webcrawler_urls", bind=True)
|
||||
def index_webcrawler_urls_task(
|
||||
@celery_app.task(name="index_crawled_urls", bind=True)
|
||||
def index_crawled_urls_task(
|
||||
self,
|
||||
connector_id: int,
|
||||
search_space_id: int,
|
||||
|
|
@ -611,7 +611,7 @@ def index_webcrawler_urls_task(
|
|||
start_date: str,
|
||||
end_date: str,
|
||||
):
|
||||
"""Celery task to index Webcrawler Urls."""
|
||||
"""Celery task to index Web page Urls."""
|
||||
import asyncio
|
||||
|
||||
loop = asyncio.new_event_loop()
|
||||
|
|
@ -619,7 +619,7 @@ def index_webcrawler_urls_task(
|
|||
|
||||
try:
|
||||
loop.run_until_complete(
|
||||
_index_webcrawler_urls(
|
||||
_index_crawled_urls(
|
||||
connector_id, search_space_id, user_id, start_date, end_date
|
||||
)
|
||||
)
|
||||
|
|
@ -627,19 +627,19 @@ def index_webcrawler_urls_task(
|
|||
loop.close()
|
||||
|
||||
|
||||
async def _index_webcrawler_urls(
|
||||
async def _index_crawled_urls(
|
||||
connector_id: int,
|
||||
search_space_id: int,
|
||||
user_id: str,
|
||||
start_date: str,
|
||||
end_date: str,
|
||||
):
|
||||
"""Index Webcrawler Urls with new session."""
|
||||
"""Index Web page Urls with new session."""
|
||||
from app.routes.search_source_connectors_routes import (
|
||||
run_webcrawler_indexing,
|
||||
run_web_page_indexing,
|
||||
)
|
||||
|
||||
async with get_celery_session_maker()() as session:
|
||||
await run_webcrawler_indexing(
|
||||
await run_web_page_indexing(
|
||||
session, connector_id, search_space_id, user_id, start_date, end_date
|
||||
)
|
||||
|
|
|
|||
|
|
@ -77,7 +77,7 @@ async def _check_and_trigger_schedules():
|
|||
index_luma_events_task,
|
||||
index_notion_pages_task,
|
||||
index_slack_messages_task,
|
||||
index_webcrawler_urls_task
|
||||
index_crawled_urls_task
|
||||
)
|
||||
|
||||
# Map connector types to their tasks
|
||||
|
|
@ -95,7 +95,7 @@ async def _check_and_trigger_schedules():
|
|||
SearchSourceConnectorType.DISCORD_CONNECTOR: index_discord_messages_task,
|
||||
SearchSourceConnectorType.LUMA_CONNECTOR: index_luma_events_task,
|
||||
SearchSourceConnectorType.ELASTICSEARCH_CONNECTOR: index_elasticsearch_documents_task,
|
||||
SearchSourceConnectorType.WEBCRAWLER_CONNECTOR: index_webcrawler_urls_task,
|
||||
SearchSourceConnectorType.WEBCRAWLER_CONNECTOR: index_crawled_urls_task,
|
||||
}
|
||||
|
||||
# Trigger indexing for each due connector
|
||||
|
|
|
|||
|
|
@ -42,7 +42,7 @@ from .luma_indexer import index_luma_events
|
|||
# Documentation and knowledge management
|
||||
from .notion_indexer import index_notion_pages
|
||||
from .slack_indexer import index_slack_messages
|
||||
from .webcrawler_indexer import index_webcrawler_urls
|
||||
from .webcrawler_indexer import index_crawled_urls
|
||||
|
||||
__all__ = [ # noqa: RUF022
|
||||
"index_airtable_records",
|
||||
|
|
@ -60,7 +60,7 @@ __all__ = [ # noqa: RUF022
|
|||
"index_linear_issues",
|
||||
# Documentation and knowledge management
|
||||
"index_notion_pages",
|
||||
"index_webcrawler_urls",
|
||||
"index_crawled_urls",
|
||||
# Communication platforms
|
||||
"index_slack_messages",
|
||||
"index_google_gmail_messages",
|
||||
|
|
|
|||
|
|
@ -27,7 +27,7 @@ from .base import (
|
|||
)
|
||||
|
||||
|
||||
async def index_webcrawler_urls(
|
||||
async def index_crawled_urls(
|
||||
session: AsyncSession,
|
||||
connector_id: int,
|
||||
search_space_id: int,
|
||||
|
|
@ -37,7 +37,7 @@ async def index_webcrawler_urls(
|
|||
update_last_indexed: bool = True,
|
||||
) -> tuple[int, str | None]:
|
||||
"""
|
||||
Index webcrawler URLs.
|
||||
Index web page URLs.
|
||||
|
||||
Args:
|
||||
session: Database session
|
||||
|
|
@ -55,9 +55,9 @@ async def index_webcrawler_urls(
|
|||
|
||||
# Log task start
|
||||
log_entry = await task_logger.log_task_start(
|
||||
task_name="webcrawler_url_indexing",
|
||||
task_name="crawled_url_indexing",
|
||||
source="connector_indexing_task",
|
||||
message=f"Starting webcrawler URL indexing for connector {connector_id}",
|
||||
message=f"Starting web page URL indexing for connector {connector_id}",
|
||||
metadata={
|
||||
"connector_id": connector_id,
|
||||
"user_id": str(user_id),
|
||||
|
|
@ -104,7 +104,7 @@ async def index_webcrawler_urls(
|
|||
urls = []
|
||||
|
||||
logger.info(
|
||||
f"Starting webcrawler indexing for connector {connector_id} with {len(urls)} URLs"
|
||||
f"Starting crawled web page indexing for connector {connector_id} with {len(urls)} URLs"
|
||||
)
|
||||
|
||||
# Initialize webcrawler client
|
||||
|
|
@ -367,7 +367,7 @@ async def index_webcrawler_urls(
|
|||
|
||||
await task_logger.log_task_success(
|
||||
log_entry,
|
||||
f"Successfully completed webcrawler indexing for connector {connector_id}",
|
||||
f"Successfully completed crawled web page indexing for connector {connector_id}",
|
||||
{
|
||||
"urls_processed": total_processed,
|
||||
"documents_indexed": documents_indexed,
|
||||
|
|
@ -378,7 +378,7 @@ async def index_webcrawler_urls(
|
|||
)
|
||||
|
||||
logger.info(
|
||||
f"Webcrawler indexing completed: {documents_indexed} new, "
|
||||
f"Web page indexing completed: {documents_indexed} new, "
|
||||
f"{documents_updated} updated, {documents_skipped} skipped, "
|
||||
f"{len(failed_urls)} failed"
|
||||
)
|
||||
|
|
@ -388,7 +388,7 @@ async def index_webcrawler_urls(
|
|||
await session.rollback()
|
||||
await task_logger.log_task_failure(
|
||||
log_entry,
|
||||
f"Database error during webcrawler indexing for connector {connector_id}",
|
||||
f"Database error during web page indexing for connector {connector_id}",
|
||||
str(db_error),
|
||||
{"error_type": "SQLAlchemyError"},
|
||||
)
|
||||
|
|
@ -398,12 +398,12 @@ async def index_webcrawler_urls(
|
|||
await session.rollback()
|
||||
await task_logger.log_task_failure(
|
||||
log_entry,
|
||||
f"Failed to index webcrawler URLs for connector {connector_id}",
|
||||
f"Failed to index web page URLs for connector {connector_id}",
|
||||
str(e),
|
||||
{"error_type": type(e).__name__},
|
||||
)
|
||||
logger.error(f"Failed to index webcrawler URLs: {e!s}", exc_info=True)
|
||||
return 0, f"Failed to index webcrawler URLs: {e!s}"
|
||||
logger.error(f"Failed to index web page URLs: {e!s}", exc_info=True)
|
||||
return 0, f"Failed to index web page URLs: {e!s}"
|
||||
|
||||
|
||||
async def get_crawled_url_documents(
|
||||
|
|
|
|||
|
|
@ -31,7 +31,7 @@ CONNECTOR_TASK_MAP = {
|
|||
SearchSourceConnectorType.DISCORD_CONNECTOR: "index_discord_messages",
|
||||
SearchSourceConnectorType.LUMA_CONNECTOR: "index_luma_events",
|
||||
SearchSourceConnectorType.ELASTICSEARCH_CONNECTOR: "index_elasticsearch_documents",
|
||||
SearchSourceConnectorType.WEBCRAWLER_CONNECTOR: "index_webcrawler_urls",
|
||||
SearchSourceConnectorType.WEBCRAWLER_CONNECTOR: "index_crawled_urls",
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -80,7 +80,7 @@ def create_periodic_schedule(
|
|||
index_luma_events_task,
|
||||
index_notion_pages_task,
|
||||
index_slack_messages_task,
|
||||
index_webcrawler_urls_task,
|
||||
index_crawled_urls_task,
|
||||
)
|
||||
|
||||
# Map connector type to task
|
||||
|
|
@ -98,7 +98,7 @@ def create_periodic_schedule(
|
|||
SearchSourceConnectorType.DISCORD_CONNECTOR: index_discord_messages_task,
|
||||
SearchSourceConnectorType.LUMA_CONNECTOR: index_luma_events_task,
|
||||
SearchSourceConnectorType.ELASTICSEARCH_CONNECTOR: index_elasticsearch_documents_task,
|
||||
SearchSourceConnectorType.WEBCRAWLER_CONNECTOR: index_webcrawler_urls_task,
|
||||
SearchSourceConnectorType.WEBCRAWLER_CONNECTOR: index_crawled_urls_task,
|
||||
}
|
||||
|
||||
# Trigger the first run immediately
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue