mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-27 19:25:15 +02:00
chore(lint): ruff checks
This commit is contained in:
parent
34fbee0c28
commit
8f30cfd69a
8 changed files with 44 additions and 26 deletions
|
|
@ -8,9 +8,10 @@ Create Date: 2025-11-13 23:20:12.912741
|
|||
|
||||
from collections.abc import Sequence
|
||||
|
||||
from alembic import op
|
||||
from sqlalchemy import text
|
||||
|
||||
from alembic import op
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision: str = "36"
|
||||
down_revision: str | None = "35"
|
||||
|
|
@ -49,7 +50,9 @@ def upgrade() -> None:
|
|||
]
|
||||
|
||||
for constraint_name in constraints_to_drop:
|
||||
if constraint_exists(connection, "user_search_space_preferences", constraint_name):
|
||||
if constraint_exists(
|
||||
connection, "user_search_space_preferences", constraint_name
|
||||
):
|
||||
op.drop_constraint(
|
||||
constraint_name,
|
||||
"user_search_space_preferences",
|
||||
|
|
@ -67,13 +70,18 @@ def downgrade() -> None:
|
|||
|
||||
# Re-add the foreign key constraints if they don't exist
|
||||
constraints_to_create = [
|
||||
("user_search_space_preferences_long_context_llm_id_fkey", "long_context_llm_id"),
|
||||
(
|
||||
"user_search_space_preferences_long_context_llm_id_fkey",
|
||||
"long_context_llm_id",
|
||||
),
|
||||
("user_search_space_preferences_fast_llm_id_fkey", "fast_llm_id"),
|
||||
("user_search_space_preferences_strategic_llm_id_fkey", "strategic_llm_id"),
|
||||
]
|
||||
|
||||
for constraint_name, column_name in constraints_to_create:
|
||||
if not constraint_exists(connection, "user_search_space_preferences", constraint_name):
|
||||
if not constraint_exists(
|
||||
connection, "user_search_space_preferences", constraint_name
|
||||
):
|
||||
op.create_foreign_key(
|
||||
constraint_name,
|
||||
"user_search_space_preferences",
|
||||
|
|
|
|||
|
|
@ -105,7 +105,7 @@ class WebCrawlerConnector:
|
|||
|
||||
# Extract content based on format
|
||||
content = scrape_result.markdown or scrape_result.html or ""
|
||||
|
||||
|
||||
# Extract metadata
|
||||
metadata = scrape_result.metadata if scrape_result.metadata else {}
|
||||
|
||||
|
|
|
|||
|
|
@ -39,6 +39,7 @@ from app.tasks.connector_indexers import (
|
|||
index_airtable_records,
|
||||
index_clickup_tasks,
|
||||
index_confluence_pages,
|
||||
index_crawled_urls,
|
||||
index_discord_messages,
|
||||
index_elasticsearch_documents,
|
||||
index_github_repos,
|
||||
|
|
@ -49,7 +50,6 @@ from app.tasks.connector_indexers import (
|
|||
index_luma_events,
|
||||
index_notion_pages,
|
||||
index_slack_messages,
|
||||
index_crawled_urls,
|
||||
)
|
||||
from app.users import current_active_user
|
||||
from app.utils.check_ownership import check_ownership
|
||||
|
|
@ -1537,6 +1537,7 @@ async def run_elasticsearch_indexing(
|
|||
exc_info=True,
|
||||
)
|
||||
|
||||
|
||||
# Add new helper functions for crawled web page indexing
|
||||
async def run_web_page_indexing_with_new_session(
|
||||
connector_id: int,
|
||||
|
|
@ -1595,4 +1596,4 @@ async def run_web_page_indexing(
|
|||
f"Web page indexing failed or no documents processed: {error_or_warning}"
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"Error in background Web page indexing task: {e!s}")
|
||||
logger.error(f"Error in background Web page indexing task: {e!s}")
|
||||
|
|
|
|||
|
|
@ -118,7 +118,9 @@ class ConnectorService:
|
|||
|
||||
# Extract webcrawler-specific metadata
|
||||
url = metadata.get("source", metadata.get("url", ""))
|
||||
title = document.get("title", metadata.get("title", "Untitled Document"))
|
||||
title = document.get(
|
||||
"title", metadata.get("title", "Untitled Document")
|
||||
)
|
||||
description = metadata.get("description", "")
|
||||
language = metadata.get("language", "")
|
||||
last_crawled_at = metadata.get("last_crawled_at", "")
|
||||
|
|
@ -2573,4 +2575,4 @@ class ConnectorService:
|
|||
"sources": sources_list,
|
||||
}
|
||||
|
||||
return result_object, elasticsearch_chunks
|
||||
return result_object, elasticsearch_chunks
|
||||
|
|
|
|||
|
|
@ -67,6 +67,7 @@ async def _check_and_trigger_schedules():
|
|||
index_airtable_records_task,
|
||||
index_clickup_tasks_task,
|
||||
index_confluence_pages_task,
|
||||
index_crawled_urls_task,
|
||||
index_discord_messages_task,
|
||||
index_elasticsearch_documents_task,
|
||||
index_github_repos_task,
|
||||
|
|
@ -77,7 +78,6 @@ async def _check_and_trigger_schedules():
|
|||
index_luma_events_task,
|
||||
index_notion_pages_task,
|
||||
index_slack_messages_task,
|
||||
index_crawled_urls_task
|
||||
)
|
||||
|
||||
# Map connector types to their tasks
|
||||
|
|
|
|||
|
|
@ -93,7 +93,7 @@ async def index_crawled_urls(
|
|||
|
||||
# Get the Firecrawl API key from the connector config (optional)
|
||||
api_key = connector.config.get("FIRECRAWL_API_KEY")
|
||||
|
||||
|
||||
# Get URLs from connector config
|
||||
initial_urls = connector.config.get("INITIAL_URLS", "")
|
||||
if isinstance(initial_urls, str):
|
||||
|
|
@ -177,7 +177,9 @@ async def index_crawled_urls(
|
|||
continue
|
||||
|
||||
# Format content as structured document
|
||||
structured_document = crawler.format_to_structured_document(crawl_result)
|
||||
structured_document = crawler.format_to_structured_document(
|
||||
crawl_result
|
||||
)
|
||||
|
||||
# Generate unique identifier hash for this URL
|
||||
unique_identifier_hash = generate_unique_identifier_hash(
|
||||
|
|
@ -185,7 +187,9 @@ async def index_crawled_urls(
|
|||
)
|
||||
|
||||
# Generate content hash
|
||||
content_hash = generate_content_hash(structured_document, search_space_id)
|
||||
content_hash = generate_content_hash(
|
||||
structured_document, search_space_id
|
||||
)
|
||||
|
||||
# Check if document with this unique identifier already exists
|
||||
existing_document = await check_document_by_unique_identifier(
|
||||
|
|
@ -205,7 +209,9 @@ async def index_crawled_urls(
|
|||
continue
|
||||
else:
|
||||
# Content has changed - update the existing document
|
||||
logger.info(f"Content changed for URL {url}. Updating document.")
|
||||
logger.info(
|
||||
f"Content changed for URL {url}. Updating document."
|
||||
)
|
||||
|
||||
# Generate summary with metadata
|
||||
user_llm = await get_user_long_context_llm(
|
||||
|
|
@ -236,7 +242,7 @@ async def index_crawled_urls(
|
|||
if language:
|
||||
summary_content += f"Language: {language}\n"
|
||||
summary_content += f"Crawler: {crawler_type}\n\n"
|
||||
|
||||
|
||||
# Add content preview
|
||||
content_preview = content[:1000]
|
||||
if len(content) > 1000:
|
||||
|
|
@ -298,7 +304,7 @@ async def index_crawled_urls(
|
|||
if language:
|
||||
summary_content += f"Language: {language}\n"
|
||||
summary_content += f"Crawler: {crawler_type}\n\n"
|
||||
|
||||
|
||||
# Add content preview
|
||||
content_preview = content[:1000]
|
||||
if len(content) > 1000:
|
||||
|
|
@ -347,7 +353,7 @@ async def index_crawled_urls(
|
|||
continue
|
||||
|
||||
total_processed = documents_indexed + documents_updated
|
||||
|
||||
|
||||
if total_processed > 0:
|
||||
await update_connector_last_indexed(session, connector, update_last_indexed)
|
||||
|
||||
|
|
@ -360,10 +366,14 @@ async def index_crawled_urls(
|
|||
# Build result message
|
||||
result_message = None
|
||||
if failed_urls:
|
||||
failed_summary = "; ".join([f"{url}: {error}" for url, error in failed_urls[:5]])
|
||||
failed_summary = "; ".join(
|
||||
[f"{url}: {error}" for url, error in failed_urls[:5]]
|
||||
)
|
||||
if len(failed_urls) > 5:
|
||||
failed_summary += f" (and {len(failed_urls) - 5} more)"
|
||||
result_message = f"Completed with {len(failed_urls)} failures: {failed_summary}"
|
||||
result_message = (
|
||||
f"Completed with {len(failed_urls)} failures: {failed_summary}"
|
||||
)
|
||||
|
||||
await task_logger.log_task_success(
|
||||
log_entry,
|
||||
|
|
@ -436,4 +446,4 @@ async def get_crawled_url_documents(
|
|||
|
||||
result = await session.execute(query)
|
||||
documents = result.scalars().all()
|
||||
return list(documents)
|
||||
return list(documents)
|
||||
|
|
|
|||
|
|
@ -70,6 +70,7 @@ def create_periodic_schedule(
|
|||
index_airtable_records_task,
|
||||
index_clickup_tasks_task,
|
||||
index_confluence_pages_task,
|
||||
index_crawled_urls_task,
|
||||
index_discord_messages_task,
|
||||
index_elasticsearch_documents_task,
|
||||
index_github_repos_task,
|
||||
|
|
@ -80,7 +81,6 @@ def create_periodic_schedule(
|
|||
index_luma_events_task,
|
||||
index_notion_pages_task,
|
||||
index_slack_messages_task,
|
||||
index_crawled_urls_task,
|
||||
)
|
||||
|
||||
# Map connector type to task
|
||||
|
|
|
|||
|
|
@ -468,7 +468,7 @@ def validate_connector_config(
|
|||
value = config.get(key)
|
||||
if not isinstance(value, list) or not value:
|
||||
raise ValueError(f"{field_name} must be a non-empty list of strings")
|
||||
|
||||
|
||||
def validate_firecrawl_api_key_format() -> None:
|
||||
"""Validate Firecrawl API key format if provided."""
|
||||
api_key = config.get("FIRECRAWL_API_KEY", "")
|
||||
|
|
@ -477,16 +477,13 @@ def validate_connector_config(
|
|||
"Firecrawl API key should start with 'fc-'. Please verify your API key."
|
||||
)
|
||||
|
||||
|
||||
def validate_initial_urls() -> None:
|
||||
initial_urls = config.get("INITIAL_URLS", "")
|
||||
if initial_urls and initial_urls.strip():
|
||||
urls = [url.strip() for url in initial_urls.split("\n") if url.strip()]
|
||||
for url in urls:
|
||||
if not validators.url(url):
|
||||
raise ValueError(
|
||||
f"Invalid URL format in INITIAL_URLS: {url}"
|
||||
)
|
||||
raise ValueError(f"Invalid URL format in INITIAL_URLS: {url}")
|
||||
|
||||
# Lookup table for connector validation rules
|
||||
connector_rules = {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue