chore(lint): ruff checks

This commit is contained in:
DESKTOP-RTLN3BA\$punk 2025-11-26 13:22:31 -08:00
parent 34fbee0c28
commit 8f30cfd69a
8 changed files with 44 additions and 26 deletions

View file

@ -8,9 +8,10 @@ Create Date: 2025-11-13 23:20:12.912741
from collections.abc import Sequence
from alembic import op
from sqlalchemy import text
from alembic import op
# revision identifiers, used by Alembic.
revision: str = "36"
down_revision: str | None = "35"
@ -49,7 +50,9 @@ def upgrade() -> None:
]
for constraint_name in constraints_to_drop:
if constraint_exists(connection, "user_search_space_preferences", constraint_name):
if constraint_exists(
connection, "user_search_space_preferences", constraint_name
):
op.drop_constraint(
constraint_name,
"user_search_space_preferences",
@ -67,13 +70,18 @@ def downgrade() -> None:
# Re-add the foreign key constraints if they don't exist
constraints_to_create = [
("user_search_space_preferences_long_context_llm_id_fkey", "long_context_llm_id"),
(
"user_search_space_preferences_long_context_llm_id_fkey",
"long_context_llm_id",
),
("user_search_space_preferences_fast_llm_id_fkey", "fast_llm_id"),
("user_search_space_preferences_strategic_llm_id_fkey", "strategic_llm_id"),
]
for constraint_name, column_name in constraints_to_create:
if not constraint_exists(connection, "user_search_space_preferences", constraint_name):
if not constraint_exists(
connection, "user_search_space_preferences", constraint_name
):
op.create_foreign_key(
constraint_name,
"user_search_space_preferences",

View file

@ -105,7 +105,7 @@ class WebCrawlerConnector:
# Extract content based on format
content = scrape_result.markdown or scrape_result.html or ""
# Extract metadata
metadata = scrape_result.metadata if scrape_result.metadata else {}

View file

@ -39,6 +39,7 @@ from app.tasks.connector_indexers import (
index_airtable_records,
index_clickup_tasks,
index_confluence_pages,
index_crawled_urls,
index_discord_messages,
index_elasticsearch_documents,
index_github_repos,
@ -49,7 +50,6 @@ from app.tasks.connector_indexers import (
index_luma_events,
index_notion_pages,
index_slack_messages,
index_crawled_urls,
)
from app.users import current_active_user
from app.utils.check_ownership import check_ownership
@ -1537,6 +1537,7 @@ async def run_elasticsearch_indexing(
exc_info=True,
)
# Add new helper functions for crawled web page indexing
async def run_web_page_indexing_with_new_session(
connector_id: int,
@ -1595,4 +1596,4 @@ async def run_web_page_indexing(
f"Web page indexing failed or no documents processed: {error_or_warning}"
)
except Exception as e:
logger.error(f"Error in background Web page indexing task: {e!s}")
logger.error(f"Error in background Web page indexing task: {e!s}")

View file

@ -118,7 +118,9 @@ class ConnectorService:
# Extract webcrawler-specific metadata
url = metadata.get("source", metadata.get("url", ""))
title = document.get("title", metadata.get("title", "Untitled Document"))
title = document.get(
"title", metadata.get("title", "Untitled Document")
)
description = metadata.get("description", "")
language = metadata.get("language", "")
last_crawled_at = metadata.get("last_crawled_at", "")
@ -2573,4 +2575,4 @@ class ConnectorService:
"sources": sources_list,
}
return result_object, elasticsearch_chunks
return result_object, elasticsearch_chunks

View file

@ -67,6 +67,7 @@ async def _check_and_trigger_schedules():
index_airtable_records_task,
index_clickup_tasks_task,
index_confluence_pages_task,
index_crawled_urls_task,
index_discord_messages_task,
index_elasticsearch_documents_task,
index_github_repos_task,
@ -77,7 +78,6 @@ async def _check_and_trigger_schedules():
index_luma_events_task,
index_notion_pages_task,
index_slack_messages_task,
index_crawled_urls_task
)
# Map connector types to their tasks

View file

@ -93,7 +93,7 @@ async def index_crawled_urls(
# Get the Firecrawl API key from the connector config (optional)
api_key = connector.config.get("FIRECRAWL_API_KEY")
# Get URLs from connector config
initial_urls = connector.config.get("INITIAL_URLS", "")
if isinstance(initial_urls, str):
@ -177,7 +177,9 @@ async def index_crawled_urls(
continue
# Format content as structured document
structured_document = crawler.format_to_structured_document(crawl_result)
structured_document = crawler.format_to_structured_document(
crawl_result
)
# Generate unique identifier hash for this URL
unique_identifier_hash = generate_unique_identifier_hash(
@ -185,7 +187,9 @@ async def index_crawled_urls(
)
# Generate content hash
content_hash = generate_content_hash(structured_document, search_space_id)
content_hash = generate_content_hash(
structured_document, search_space_id
)
# Check if document with this unique identifier already exists
existing_document = await check_document_by_unique_identifier(
@ -205,7 +209,9 @@ async def index_crawled_urls(
continue
else:
# Content has changed - update the existing document
logger.info(f"Content changed for URL {url}. Updating document.")
logger.info(
f"Content changed for URL {url}. Updating document."
)
# Generate summary with metadata
user_llm = await get_user_long_context_llm(
@ -236,7 +242,7 @@ async def index_crawled_urls(
if language:
summary_content += f"Language: {language}\n"
summary_content += f"Crawler: {crawler_type}\n\n"
# Add content preview
content_preview = content[:1000]
if len(content) > 1000:
@ -298,7 +304,7 @@ async def index_crawled_urls(
if language:
summary_content += f"Language: {language}\n"
summary_content += f"Crawler: {crawler_type}\n\n"
# Add content preview
content_preview = content[:1000]
if len(content) > 1000:
@ -347,7 +353,7 @@ async def index_crawled_urls(
continue
total_processed = documents_indexed + documents_updated
if total_processed > 0:
await update_connector_last_indexed(session, connector, update_last_indexed)
@ -360,10 +366,14 @@ async def index_crawled_urls(
# Build result message
result_message = None
if failed_urls:
failed_summary = "; ".join([f"{url}: {error}" for url, error in failed_urls[:5]])
failed_summary = "; ".join(
[f"{url}: {error}" for url, error in failed_urls[:5]]
)
if len(failed_urls) > 5:
failed_summary += f" (and {len(failed_urls) - 5} more)"
result_message = f"Completed with {len(failed_urls)} failures: {failed_summary}"
result_message = (
f"Completed with {len(failed_urls)} failures: {failed_summary}"
)
await task_logger.log_task_success(
log_entry,
@ -436,4 +446,4 @@ async def get_crawled_url_documents(
result = await session.execute(query)
documents = result.scalars().all()
return list(documents)
return list(documents)

View file

@ -70,6 +70,7 @@ def create_periodic_schedule(
index_airtable_records_task,
index_clickup_tasks_task,
index_confluence_pages_task,
index_crawled_urls_task,
index_discord_messages_task,
index_elasticsearch_documents_task,
index_github_repos_task,
@ -80,7 +81,6 @@ def create_periodic_schedule(
index_luma_events_task,
index_notion_pages_task,
index_slack_messages_task,
index_crawled_urls_task,
)
# Map connector type to task

View file

@ -468,7 +468,7 @@ def validate_connector_config(
value = config.get(key)
if not isinstance(value, list) or not value:
raise ValueError(f"{field_name} must be a non-empty list of strings")
def validate_firecrawl_api_key_format() -> None:
"""Validate Firecrawl API key format if provided."""
api_key = config.get("FIRECRAWL_API_KEY", "")
@ -477,16 +477,13 @@ def validate_connector_config(
"Firecrawl API key should start with 'fc-'. Please verify your API key."
)
def validate_initial_urls() -> None:
    """Validate every URL configured under ``INITIAL_URLS``.

    Reads ``INITIAL_URLS`` from the enclosing ``config`` mapping. The value
    may be a newline-separated string or a list/tuple of URL strings; blank
    entries are ignored. An absent or empty value is treated as "nothing to
    validate".

    Raises:
        ValueError: If the value is neither a string nor a list/tuple, or if
            any non-blank entry is not a string accepted by
            ``validators.url``.
    """
    initial_urls = config.get("INITIAL_URLS", "")
    if not initial_urls:
        # Missing, None, "" or [] -- nothing configured, nothing to check.
        return

    if isinstance(initial_urls, str):
        candidates = initial_urls.split("\n")
    elif isinstance(initial_urls, (list, tuple)):
        # NOTE(review): the indexer checks isinstance(initial_urls, str)
        # before parsing, which suggests non-string values can be stored --
        # confirm the list form is the intended alternative.
        candidates = initial_urls
    else:
        # Previously this fell through to ``.strip()`` and surfaced as an
        # AttributeError instead of a validation error.
        raise ValueError(
            "INITIAL_URLS must be a newline-separated string or a list of URLs"
        )

    for candidate in candidates:
        if not isinstance(candidate, str):
            raise ValueError(f"Invalid URL format in INITIAL_URLS: {candidate}")
        url = candidate.strip()
        if not url:
            continue  # skip blank lines / empty entries
        if not validators.url(url):
            raise ValueError(f"Invalid URL format in INITIAL_URLS: {url}")
# Lookup table for connector validation rules
connector_rules = {