chore(lint): ruff checks

This commit is contained in:
DESKTOP-RTLN3BA\$punk 2025-11-26 13:22:31 -08:00
parent 34fbee0c28
commit 8f30cfd69a
8 changed files with 44 additions and 26 deletions

View file

@ -8,9 +8,10 @@ Create Date: 2025-11-13 23:20:12.912741
from collections.abc import Sequence from collections.abc import Sequence
from alembic import op
from sqlalchemy import text from sqlalchemy import text
from alembic import op
# revision identifiers, used by Alembic. # revision identifiers, used by Alembic.
revision: str = "36" revision: str = "36"
down_revision: str | None = "35" down_revision: str | None = "35"
@ -49,7 +50,9 @@ def upgrade() -> None:
] ]
for constraint_name in constraints_to_drop: for constraint_name in constraints_to_drop:
if constraint_exists(connection, "user_search_space_preferences", constraint_name): if constraint_exists(
connection, "user_search_space_preferences", constraint_name
):
op.drop_constraint( op.drop_constraint(
constraint_name, constraint_name,
"user_search_space_preferences", "user_search_space_preferences",
@ -67,13 +70,18 @@ def downgrade() -> None:
# Re-add the foreign key constraints if they don't exist # Re-add the foreign key constraints if they don't exist
constraints_to_create = [ constraints_to_create = [
("user_search_space_preferences_long_context_llm_id_fkey", "long_context_llm_id"), (
"user_search_space_preferences_long_context_llm_id_fkey",
"long_context_llm_id",
),
("user_search_space_preferences_fast_llm_id_fkey", "fast_llm_id"), ("user_search_space_preferences_fast_llm_id_fkey", "fast_llm_id"),
("user_search_space_preferences_strategic_llm_id_fkey", "strategic_llm_id"), ("user_search_space_preferences_strategic_llm_id_fkey", "strategic_llm_id"),
] ]
for constraint_name, column_name in constraints_to_create: for constraint_name, column_name in constraints_to_create:
if not constraint_exists(connection, "user_search_space_preferences", constraint_name): if not constraint_exists(
connection, "user_search_space_preferences", constraint_name
):
op.create_foreign_key( op.create_foreign_key(
constraint_name, constraint_name,
"user_search_space_preferences", "user_search_space_preferences",

View file

@ -105,7 +105,7 @@ class WebCrawlerConnector:
# Extract content based on format # Extract content based on format
content = scrape_result.markdown or scrape_result.html or "" content = scrape_result.markdown or scrape_result.html or ""
# Extract metadata # Extract metadata
metadata = scrape_result.metadata if scrape_result.metadata else {} metadata = scrape_result.metadata if scrape_result.metadata else {}

View file

@ -39,6 +39,7 @@ from app.tasks.connector_indexers import (
index_airtable_records, index_airtable_records,
index_clickup_tasks, index_clickup_tasks,
index_confluence_pages, index_confluence_pages,
index_crawled_urls,
index_discord_messages, index_discord_messages,
index_elasticsearch_documents, index_elasticsearch_documents,
index_github_repos, index_github_repos,
@ -49,7 +50,6 @@ from app.tasks.connector_indexers import (
index_luma_events, index_luma_events,
index_notion_pages, index_notion_pages,
index_slack_messages, index_slack_messages,
index_crawled_urls,
) )
from app.users import current_active_user from app.users import current_active_user
from app.utils.check_ownership import check_ownership from app.utils.check_ownership import check_ownership
@ -1537,6 +1537,7 @@ async def run_elasticsearch_indexing(
exc_info=True, exc_info=True,
) )
# Add new helper functions for crawled web page indexing # Add new helper functions for crawled web page indexing
async def run_web_page_indexing_with_new_session( async def run_web_page_indexing_with_new_session(
connector_id: int, connector_id: int,
@ -1595,4 +1596,4 @@ async def run_web_page_indexing(
f"Web page indexing failed or no documents processed: {error_or_warning}" f"Web page indexing failed or no documents processed: {error_or_warning}"
) )
except Exception as e: except Exception as e:
logger.error(f"Error in background Web page indexing task: {e!s}") logger.error(f"Error in background Web page indexing task: {e!s}")

View file

@ -118,7 +118,9 @@ class ConnectorService:
# Extract webcrawler-specific metadata # Extract webcrawler-specific metadata
url = metadata.get("source", metadata.get("url", "")) url = metadata.get("source", metadata.get("url", ""))
title = document.get("title", metadata.get("title", "Untitled Document")) title = document.get(
"title", metadata.get("title", "Untitled Document")
)
description = metadata.get("description", "") description = metadata.get("description", "")
language = metadata.get("language", "") language = metadata.get("language", "")
last_crawled_at = metadata.get("last_crawled_at", "") last_crawled_at = metadata.get("last_crawled_at", "")
@ -2573,4 +2575,4 @@ class ConnectorService:
"sources": sources_list, "sources": sources_list,
} }
return result_object, elasticsearch_chunks return result_object, elasticsearch_chunks

View file

@ -67,6 +67,7 @@ async def _check_and_trigger_schedules():
index_airtable_records_task, index_airtable_records_task,
index_clickup_tasks_task, index_clickup_tasks_task,
index_confluence_pages_task, index_confluence_pages_task,
index_crawled_urls_task,
index_discord_messages_task, index_discord_messages_task,
index_elasticsearch_documents_task, index_elasticsearch_documents_task,
index_github_repos_task, index_github_repos_task,
@ -77,7 +78,6 @@ async def _check_and_trigger_schedules():
index_luma_events_task, index_luma_events_task,
index_notion_pages_task, index_notion_pages_task,
index_slack_messages_task, index_slack_messages_task,
index_crawled_urls_task
) )
# Map connector types to their tasks # Map connector types to their tasks

View file

@ -93,7 +93,7 @@ async def index_crawled_urls(
# Get the Firecrawl API key from the connector config (optional) # Get the Firecrawl API key from the connector config (optional)
api_key = connector.config.get("FIRECRAWL_API_KEY") api_key = connector.config.get("FIRECRAWL_API_KEY")
# Get URLs from connector config # Get URLs from connector config
initial_urls = connector.config.get("INITIAL_URLS", "") initial_urls = connector.config.get("INITIAL_URLS", "")
if isinstance(initial_urls, str): if isinstance(initial_urls, str):
@ -177,7 +177,9 @@ async def index_crawled_urls(
continue continue
# Format content as structured document # Format content as structured document
structured_document = crawler.format_to_structured_document(crawl_result) structured_document = crawler.format_to_structured_document(
crawl_result
)
# Generate unique identifier hash for this URL # Generate unique identifier hash for this URL
unique_identifier_hash = generate_unique_identifier_hash( unique_identifier_hash = generate_unique_identifier_hash(
@ -185,7 +187,9 @@ async def index_crawled_urls(
) )
# Generate content hash # Generate content hash
content_hash = generate_content_hash(structured_document, search_space_id) content_hash = generate_content_hash(
structured_document, search_space_id
)
# Check if document with this unique identifier already exists # Check if document with this unique identifier already exists
existing_document = await check_document_by_unique_identifier( existing_document = await check_document_by_unique_identifier(
@ -205,7 +209,9 @@ async def index_crawled_urls(
continue continue
else: else:
# Content has changed - update the existing document # Content has changed - update the existing document
logger.info(f"Content changed for URL {url}. Updating document.") logger.info(
f"Content changed for URL {url}. Updating document."
)
# Generate summary with metadata # Generate summary with metadata
user_llm = await get_user_long_context_llm( user_llm = await get_user_long_context_llm(
@ -236,7 +242,7 @@ async def index_crawled_urls(
if language: if language:
summary_content += f"Language: {language}\n" summary_content += f"Language: {language}\n"
summary_content += f"Crawler: {crawler_type}\n\n" summary_content += f"Crawler: {crawler_type}\n\n"
# Add content preview # Add content preview
content_preview = content[:1000] content_preview = content[:1000]
if len(content) > 1000: if len(content) > 1000:
@ -298,7 +304,7 @@ async def index_crawled_urls(
if language: if language:
summary_content += f"Language: {language}\n" summary_content += f"Language: {language}\n"
summary_content += f"Crawler: {crawler_type}\n\n" summary_content += f"Crawler: {crawler_type}\n\n"
# Add content preview # Add content preview
content_preview = content[:1000] content_preview = content[:1000]
if len(content) > 1000: if len(content) > 1000:
@ -347,7 +353,7 @@ async def index_crawled_urls(
continue continue
total_processed = documents_indexed + documents_updated total_processed = documents_indexed + documents_updated
if total_processed > 0: if total_processed > 0:
await update_connector_last_indexed(session, connector, update_last_indexed) await update_connector_last_indexed(session, connector, update_last_indexed)
@ -360,10 +366,14 @@ async def index_crawled_urls(
# Build result message # Build result message
result_message = None result_message = None
if failed_urls: if failed_urls:
failed_summary = "; ".join([f"{url}: {error}" for url, error in failed_urls[:5]]) failed_summary = "; ".join(
[f"{url}: {error}" for url, error in failed_urls[:5]]
)
if len(failed_urls) > 5: if len(failed_urls) > 5:
failed_summary += f" (and {len(failed_urls) - 5} more)" failed_summary += f" (and {len(failed_urls) - 5} more)"
result_message = f"Completed with {len(failed_urls)} failures: {failed_summary}" result_message = (
f"Completed with {len(failed_urls)} failures: {failed_summary}"
)
await task_logger.log_task_success( await task_logger.log_task_success(
log_entry, log_entry,
@ -436,4 +446,4 @@ async def get_crawled_url_documents(
result = await session.execute(query) result = await session.execute(query)
documents = result.scalars().all() documents = result.scalars().all()
return list(documents) return list(documents)

View file

@ -70,6 +70,7 @@ def create_periodic_schedule(
index_airtable_records_task, index_airtable_records_task,
index_clickup_tasks_task, index_clickup_tasks_task,
index_confluence_pages_task, index_confluence_pages_task,
index_crawled_urls_task,
index_discord_messages_task, index_discord_messages_task,
index_elasticsearch_documents_task, index_elasticsearch_documents_task,
index_github_repos_task, index_github_repos_task,
@ -80,7 +81,6 @@ def create_periodic_schedule(
index_luma_events_task, index_luma_events_task,
index_notion_pages_task, index_notion_pages_task,
index_slack_messages_task, index_slack_messages_task,
index_crawled_urls_task,
) )
# Map connector type to task # Map connector type to task

View file

@ -468,7 +468,7 @@ def validate_connector_config(
value = config.get(key) value = config.get(key)
if not isinstance(value, list) or not value: if not isinstance(value, list) or not value:
raise ValueError(f"{field_name} must be a non-empty list of strings") raise ValueError(f"{field_name} must be a non-empty list of strings")
def validate_firecrawl_api_key_format() -> None: def validate_firecrawl_api_key_format() -> None:
"""Validate Firecrawl API key format if provided.""" """Validate Firecrawl API key format if provided."""
api_key = config.get("FIRECRAWL_API_KEY", "") api_key = config.get("FIRECRAWL_API_KEY", "")
@ -477,16 +477,13 @@ def validate_connector_config(
"Firecrawl API key should start with 'fc-'. Please verify your API key." "Firecrawl API key should start with 'fc-'. Please verify your API key."
) )
def validate_initial_urls() -> None: def validate_initial_urls() -> None:
initial_urls = config.get("INITIAL_URLS", "") initial_urls = config.get("INITIAL_URLS", "")
if initial_urls and initial_urls.strip(): if initial_urls and initial_urls.strip():
urls = [url.strip() for url in initial_urls.split("\n") if url.strip()] urls = [url.strip() for url in initial_urls.split("\n") if url.strip()]
for url in urls: for url in urls:
if not validators.url(url): if not validators.url(url):
raise ValueError( raise ValueError(f"Invalid URL format in INITIAL_URLS: {url}")
f"Invalid URL format in INITIAL_URLS: {url}"
)
# Lookup table for connector validation rules # Lookup table for connector validation rules
connector_rules = { connector_rules = {