diff --git a/surfsense_backend/app/celery_app.py b/surfsense_backend/app/celery_app.py index c44391528..693d62466 100644 --- a/surfsense_backend/app/celery_app.py +++ b/surfsense_backend/app/celery_app.py @@ -152,7 +152,6 @@ celery_app.conf.update( "index_elasticsearch_documents": {"queue": CONNECTORS_QUEUE}, "index_crawled_urls": {"queue": CONNECTORS_QUEUE}, "index_bookstack_pages": {"queue": CONNECTORS_QUEUE}, - "index_obsidian_vault": {"queue": CONNECTORS_QUEUE}, "index_composio_connector": {"queue": CONNECTORS_QUEUE}, # Everything else (document processing, podcasts, reindexing, # schedule checker, cleanup) stays on the default fast queue. diff --git a/surfsense_backend/app/routes/search_source_connectors_routes.py b/surfsense_backend/app/routes/search_source_connectors_routes.py index b87ce28c9..6622dde61 100644 --- a/surfsense_backend/app/routes/search_source_connectors_routes.py +++ b/surfsense_backend/app/routes/search_source_connectors_routes.py @@ -1157,25 +1157,6 @@ async def index_connector_content( ) response_message = "Web page indexing started in the background." - elif connector.connector_type == SearchSourceConnectorType.OBSIDIAN_CONNECTOR: - from app.config import config as app_config - from app.tasks.celery_tasks.connector_tasks import index_obsidian_vault_task - - # Obsidian connector only available in self-hosted mode - if not app_config.is_self_hosted(): - raise HTTPException( - status_code=400, - detail="Obsidian connector is only available in self-hosted mode", - ) - - logger.info( - f"Triggering Obsidian vault indexing for connector {connector_id} into search space {search_space_id} from {indexing_from} to {indexing_to}" - ) - index_obsidian_vault_task.delay( - connector_id, search_space_id, str(user.id), indexing_from, indexing_to - ) - response_message = "Obsidian vault indexing started in the background." - elif ( connector.connector_type == SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR @@ -3048,59 +3029,6 @@ async def run_bookstack_indexing( ) -# Add new helper functions for Obsidian indexing -async def run_obsidian_indexing_with_new_session( - connector_id: int, - search_space_id: int, - user_id: str, - start_date: str, - end_date: str, -): - """Wrapper to run Obsidian indexing with its own database session.""" - logger.info( - f"Background task started: Indexing Obsidian connector {connector_id} into space {search_space_id} from {start_date} to {end_date}" - ) - async with async_session_maker() as session: - await run_obsidian_indexing( - session, connector_id, search_space_id, user_id, start_date, end_date - ) - logger.info(f"Background task finished: Indexing Obsidian connector {connector_id}") - - -async def run_obsidian_indexing( - session: AsyncSession, - connector_id: int, - search_space_id: int, - user_id: str, - start_date: str, - end_date: str, -): - """ - Background task to run Obsidian vault indexing. 
- - Args: - session: Database session - connector_id: ID of the Obsidian connector - search_space_id: ID of the search space - user_id: ID of the user - start_date: Start date for indexing - end_date: End date for indexing - """ - from app.tasks.connector_indexers import index_obsidian_vault - - await _run_indexing_with_notifications( - session=session, - connector_id=connector_id, - search_space_id=search_space_id, - user_id=user_id, - start_date=start_date, - end_date=end_date, - indexing_function=index_obsidian_vault, - update_timestamp_func=_update_connector_timestamp_by_id, - supports_heartbeat_callback=True, - ) - - async def run_composio_indexing_with_new_session( connector_id: int, search_space_id: int, diff --git a/surfsense_backend/app/schemas/obsidian_auth_credentials.py b/surfsense_backend/app/schemas/obsidian_auth_credentials.py deleted file mode 100644 index ab178eac8..000000000 --- a/surfsense_backend/app/schemas/obsidian_auth_credentials.py +++ /dev/null @@ -1,59 +0,0 @@ -""" -Obsidian Connector Credentials Schema. - -Obsidian is a local-first note-taking app that stores notes as markdown files. -This connector supports indexing from local file system (self-hosted only). -""" - -from pydantic import BaseModel, field_validator - - -class ObsidianAuthCredentialsBase(BaseModel): - """ - Credentials/configuration for the Obsidian connector. - - Since Obsidian vaults are local directories, this schema primarily - holds the vault path and configuration options rather than API tokens. - """ - - vault_path: str - vault_name: str | None = None - exclude_folders: list[str] | None = None - include_attachments: bool = False - - @field_validator("vault_path") - @classmethod - def validate_vault_path(cls, v: str) -> str: - """Ensure vault path is provided and stripped of whitespace.""" - if not v or not v.strip(): - raise ValueError("Vault path is required") - return v.strip() - - @field_validator("exclude_folders", mode="before") - @classmethod - def parse_exclude_folders(cls, v): - """Parse exclude_folders from string if needed.""" - if v is None: - return [".trash", ".obsidian", "templates"] - if isinstance(v, str): - return [f.strip() for f in v.split(",") if f.strip()] - return v - - def to_dict(self) -> dict: - """Convert credentials to dictionary for storage.""" - return { - "vault_path": self.vault_path, - "vault_name": self.vault_name, - "exclude_folders": self.exclude_folders, - "include_attachments": self.include_attachments, - } - - @classmethod - def from_dict(cls, data: dict) -> "ObsidianAuthCredentialsBase": - """Create credentials from dictionary.""" - return cls( - vault_path=data.get("vault_path", ""), - vault_name=data.get("vault_name"), - exclude_folders=data.get("exclude_folders"), - include_attachments=data.get("include_attachments", False), - ) diff --git a/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py b/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py index 57475c9fd..9477fa279 100644 --- a/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py +++ b/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py @@ -883,49 +883,6 @@ async def _index_bookstack_pages( ) -@celery_app.task(name="index_obsidian_vault", bind=True) -def index_obsidian_vault_task( - self, - connector_id: int, - search_space_id: int, - user_id: str, - start_date: str, - end_date: str, -): - """Celery task to index Obsidian vault notes.""" - import asyncio - - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) - - try: - loop.run_until_complete( - 
_index_obsidian_vault( - connector_id, search_space_id, user_id, start_date, end_date - ) - ) - finally: - loop.close() - - -async def _index_obsidian_vault( - connector_id: int, - search_space_id: int, - user_id: str, - start_date: str, - end_date: str, -): - """Index Obsidian vault with new session.""" - from app.routes.search_source_connectors_routes import ( - run_obsidian_indexing, - ) - - async with get_celery_session_maker()() as session: - await run_obsidian_indexing( - session, connector_id, search_space_id, user_id, start_date, end_date - ) - - @celery_app.task(name="index_composio_connector", bind=True) def index_composio_connector_task( self, diff --git a/surfsense_backend/app/tasks/connector_indexers/__init__.py b/surfsense_backend/app/tasks/connector_indexers/__init__.py index 1b032d54a..be99e1a2d 100644 --- a/surfsense_backend/app/tasks/connector_indexers/__init__.py +++ b/surfsense_backend/app/tasks/connector_indexers/__init__.py @@ -46,7 +46,6 @@ from .linear_indexer import index_linear_issues # Documentation and knowledge management from .luma_indexer import index_luma_events from .notion_indexer import index_notion_pages -from .obsidian_indexer import index_obsidian_vault from .slack_indexer import index_slack_messages from .webcrawler_indexer import index_crawled_urls @@ -69,7 +68,6 @@ __all__ = [ # noqa: RUF022 "index_linear_issues", # Documentation and knowledge management "index_notion_pages", - "index_obsidian_vault", "index_crawled_urls", # Communication platforms "index_slack_messages", diff --git a/surfsense_backend/app/tasks/connector_indexers/obsidian_indexer.py b/surfsense_backend/app/tasks/connector_indexers/obsidian_indexer.py deleted file mode 100644 index 5356ecfb7..000000000 --- a/surfsense_backend/app/tasks/connector_indexers/obsidian_indexer.py +++ /dev/null @@ -1,676 +0,0 @@ -""" -Obsidian connector indexer. - -Indexes markdown notes from a local Obsidian vault. -This connector is only available in self-hosted mode. - -Implements 2-phase document status updates for real-time UI feedback: -- Phase 1: Create all documents with 'pending' status (visible in UI immediately) -- Phase 2: Process each document: pending → processing → ready/failed -""" - -import os -import re -import time -from collections.abc import Awaitable, Callable -from datetime import UTC, datetime -from pathlib import Path - -import yaml -from sqlalchemy.exc import SQLAlchemyError -from sqlalchemy.ext.asyncio import AsyncSession - -from app.config import config -from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType -from app.services.llm_service import get_user_long_context_llm -from app.services.task_logging_service import TaskLoggingService -from app.utils.document_converters import ( - create_document_chunks, - embed_text, - generate_content_hash, - generate_document_summary, - generate_unique_identifier_hash, -) - -from .base import ( - build_document_metadata_string, - check_document_by_unique_identifier, - check_duplicate_document_by_hash, - get_connector_by_id, - get_current_timestamp, - logger, - safe_set_chunks, - update_connector_last_indexed, -) - -# Type hint for heartbeat callback -HeartbeatCallbackType = Callable[[int], Awaitable[None]] - -# Heartbeat interval in seconds -HEARTBEAT_INTERVAL_SECONDS = 30 - - -def parse_frontmatter(content: str) -> tuple[dict | None, str]: - """ - Parse YAML frontmatter from markdown content. 
-
-    Args:
-        content: The full markdown content
-
-    Returns:
-        Tuple of (frontmatter dict or None, content without frontmatter)
-    """
-    if not content.startswith("---"):
-        return None, content
-
-    # Find the closing ---
-    end_match = re.search(r"\n---\n", content[3:])
-    if not end_match:
-        return None, content
-
-    frontmatter_str = content[3 : end_match.start() + 3]
-    remaining_content = content[end_match.end() + 3 :]
-
-    try:
-        frontmatter = yaml.safe_load(frontmatter_str)
-        return frontmatter, remaining_content.strip()
-    except yaml.YAMLError:
-        return None, content
-
-
-def extract_wiki_links(content: str) -> list[str]:
-    """
-    Extract [[wiki-style links]] from content.
-
-    Args:
-        content: Markdown content
-
-    Returns:
-        List of linked note names
-    """
-    # Match [[link]] or [[link|alias]]
-    pattern = r"\[\[([^\]|]+)(?:\|[^\]]+)?\]\]"
-    matches = re.findall(pattern, content)
-    return list(set(matches))
-
-
-def extract_tags(content: str) -> list[str]:
-    """
-    Extract #tags from content (both inline and frontmatter).
-
-    Args:
-        content: Markdown content
-
-    Returns:
-        List of tags (without # prefix)
-    """
-    # Match #tag but not ## headers
-    pattern = r"(?<!#)#([\w/-]+)"
-    matches = re.findall(pattern, content)
-    return list(set(matches))
-
-
-def scan_vault(vault_path: str, exclude_folders: list[str] | None = None) -> list[dict]:
-    """
-    Scan an Obsidian vault for markdown files.
-
-    Args:
-        vault_path: Path to the Obsidian vault
-        exclude_folders: List of folder names to exclude
-
-    Returns:
-        List of file info dicts with path, name, modified time
-    """
-    if exclude_folders is None:
-        exclude_folders = [".trash", ".obsidian", "templates"]
-
-    vault = Path(vault_path)
-    if not vault.exists():
-        raise ValueError(f"Vault path does not exist: {vault_path}")
-
-    files = []
-    for md_file in vault.rglob("*.md"):
-        # Check if file is in an excluded folder
-        relative_path = md_file.relative_to(vault)
-        parts = relative_path.parts
-
-        if any(excluded in parts for excluded in exclude_folders):
-            continue
-
-        try:
-            stat = md_file.stat()
-            files.append(
-                {
-                    "path": str(md_file),
-                    "relative_path": str(relative_path),
-                    "name": md_file.stem,
-                    "modified_at": datetime.fromtimestamp(stat.st_mtime, tz=UTC),
-                    "created_at": datetime.fromtimestamp(stat.st_ctime, tz=UTC),
-                    "size": stat.st_size,
-                }
-            )
-        except OSError as e:
-            logger.warning(f"Could not stat file {md_file}: {e}")
-
-    return files
-
-
-async def index_obsidian_vault(
-    session: AsyncSession,
-    connector_id: int,
-    search_space_id: int,
-    user_id: str,
-    start_date: str | None = None,
-    end_date: str | None = None,
-    update_last_indexed: bool = True,
-    on_heartbeat_callback: HeartbeatCallbackType | None = None,
-) -> tuple[int, str | None]:
-    """
-    Index notes from a local Obsidian vault.
-
-    This indexer is only available in self-hosted mode as it requires
-    direct file system access to the user's Obsidian vault.
-
-    Args:
-        session: Database session
-        connector_id: ID of the Obsidian connector
-        search_space_id: ID of the search space to store documents in
-        user_id: ID of the user
-        start_date: Start date for filtering (YYYY-MM-DD format) - optional
-        end_date: End date for filtering (YYYY-MM-DD format) - optional
-        update_last_indexed: Whether to update the last_indexed_at timestamp
-        on_heartbeat_callback: Optional callback to update notification during long-running indexing.
- - Returns: - Tuple containing (number of documents indexed, error message or None) - """ - task_logger = TaskLoggingService(session, search_space_id) - - # Check if self-hosted mode - if not config.is_self_hosted(): - return 0, "Obsidian connector is only available in self-hosted mode" - - # Log task start - log_entry = await task_logger.log_task_start( - task_name="obsidian_vault_indexing", - source="connector_indexing_task", - message=f"Starting Obsidian vault indexing for connector {connector_id}", - metadata={ - "connector_id": connector_id, - "user_id": str(user_id), - "start_date": start_date, - "end_date": end_date, - }, - ) - - try: - # Get the connector - await task_logger.log_task_progress( - log_entry, - f"Retrieving Obsidian connector {connector_id} from database", - {"stage": "connector_retrieval"}, - ) - - connector = await get_connector_by_id( - session, connector_id, SearchSourceConnectorType.OBSIDIAN_CONNECTOR - ) - - if not connector: - await task_logger.log_task_failure( - log_entry, - f"Connector with ID {connector_id} not found or is not an Obsidian connector", - "Connector not found", - {"error_type": "ConnectorNotFound"}, - ) - return ( - 0, - f"Connector with ID {connector_id} not found or is not an Obsidian connector", - ) - - # Get vault path from connector config - vault_path = connector.config.get("vault_path") - if not vault_path: - await task_logger.log_task_failure( - log_entry, - "Vault path not configured for this connector", - "Missing vault path", - {"error_type": "MissingVaultPath"}, - ) - return 0, "Vault path not configured for this connector" - - # Validate vault path exists - if not os.path.exists(vault_path): - await task_logger.log_task_failure( - log_entry, - f"Vault path does not exist: {vault_path}", - "Vault path not found", - {"error_type": "VaultNotFound", "vault_path": vault_path}, - ) - return 0, f"Vault path does not exist: {vault_path}" - - # Get configuration options - exclude_folders = connector.config.get( - "exclude_folders", [".trash", ".obsidian", "templates"] - ) - vault_name = connector.config.get("vault_name") or os.path.basename(vault_path) - - await task_logger.log_task_progress( - log_entry, - f"Scanning Obsidian vault: {vault_name}", - {"stage": "vault_scan", "vault_path": vault_path}, - ) - - # Scan vault for markdown files - try: - files = scan_vault(vault_path, exclude_folders) - except Exception as e: - await task_logger.log_task_failure( - log_entry, - f"Failed to scan vault: {e}", - "Vault scan error", - {"error_type": "VaultScanError"}, - ) - return 0, f"Failed to scan vault: {e}" - - logger.info(f"Found {len(files)} markdown files in vault") - - await task_logger.log_task_progress( - log_entry, - f"Found {len(files)} markdown files to process", - {"stage": "files_discovered", "file_count": len(files)}, - ) - - # Filter by date if provided (handle "undefined" string from frontend) - # Also handle inverted dates (start > end) by skipping filtering - start_dt = None - end_dt = None - - if start_date and start_date != "undefined": - start_dt = datetime.strptime(start_date, "%Y-%m-%d").replace(tzinfo=UTC) - - if end_date and end_date != "undefined": - # Make end_date inclusive (end of day) - end_dt = datetime.strptime(end_date, "%Y-%m-%d").replace(tzinfo=UTC) - end_dt = end_dt.replace(hour=23, minute=59, second=59) - - # Only apply date filtering if dates are valid and in correct order - if start_dt and end_dt and start_dt > end_dt: - logger.warning( - f"start_date ({start_date}) is after end_date ({end_date}), skipping 
date filter" - ) - else: - if start_dt: - files = [f for f in files if f["modified_at"] >= start_dt] - logger.info( - f"After start_date filter ({start_date}): {len(files)} files" - ) - if end_dt: - files = [f for f in files if f["modified_at"] <= end_dt] - logger.info(f"After end_date filter ({end_date}): {len(files)} files") - - logger.info(f"Processing {len(files)} files after date filtering") - - indexed_count = 0 - skipped_count = 0 - failed_count = 0 - duplicate_content_count = 0 - - # Heartbeat tracking - update notification periodically to prevent appearing stuck - last_heartbeat_time = time.time() - - # ======================================================================= - # PHASE 1: Analyze all files, create pending documents - # This makes ALL documents visible in the UI immediately with pending status - # ======================================================================= - files_to_process = [] # List of dicts with document and file data - new_documents_created = False - - for file_info in files: - try: - file_path = file_info["path"] - relative_path = file_info["relative_path"] - - # Read file content - try: - with open(file_path, encoding="utf-8") as f: - content = f.read() - except UnicodeDecodeError: - logger.warning(f"Could not decode file {file_path}, skipping") - skipped_count += 1 - continue - - if not content.strip(): - logger.debug(f"Empty file {file_path}, skipping") - skipped_count += 1 - continue - - # Parse frontmatter and extract metadata - frontmatter, body_content = parse_frontmatter(content) - wiki_links = extract_wiki_links(content) - tags = extract_tags(content) - - # Get title from frontmatter or filename - title = file_info["name"] - if frontmatter: - title = frontmatter.get("title", title) - # Also extract tags from frontmatter - fm_tags = frontmatter.get("tags", []) - if isinstance(fm_tags, list): - tags = list({*tags, *fm_tags}) - elif isinstance(fm_tags, str): - tags = list({*tags, fm_tags}) - - # Generate unique identifier using vault name and relative path - unique_identifier = f"{vault_name}:{relative_path}" - unique_identifier_hash = generate_unique_identifier_hash( - DocumentType.OBSIDIAN_CONNECTOR, - unique_identifier, - search_space_id, - ) - - # Generate content hash - content_hash = generate_content_hash(content, search_space_id) - - # Check for existing document - existing_document = await check_document_by_unique_identifier( - session, unique_identifier_hash - ) - - if existing_document: - # Document exists - check if content has changed - if existing_document.content_hash == content_hash: - # Ensure status is ready (might have been stuck in processing/pending) - if not DocumentStatus.is_state( - existing_document.status, DocumentStatus.READY - ): - existing_document.status = DocumentStatus.ready() - logger.debug(f"Note {title} unchanged, skipping") - skipped_count += 1 - continue - - # Queue existing document for update (will be set to processing in Phase 2) - files_to_process.append( - { - "document": existing_document, - "is_new": False, - "file_info": file_info, - "content": content, - "body_content": body_content, - "frontmatter": frontmatter, - "wiki_links": wiki_links, - "tags": tags, - "title": title, - "relative_path": relative_path, - "content_hash": content_hash, - "unique_identifier_hash": unique_identifier_hash, - } - ) - continue - - # Document doesn't exist by unique_identifier_hash - # Check if a document with the same content_hash exists (from another connector) - with session.no_autoflush: - duplicate_by_content = 
await check_duplicate_document_by_hash( - session, content_hash - ) - - if duplicate_by_content: - logger.info( - f"Obsidian note {title} already indexed by another connector " - f"(existing document ID: {duplicate_by_content.id}, " - f"type: {duplicate_by_content.document_type}). Skipping." - ) - duplicate_content_count += 1 - skipped_count += 1 - continue - - # Create new document with PENDING status (visible in UI immediately) - document = Document( - search_space_id=search_space_id, - title=title, - document_type=DocumentType.OBSIDIAN_CONNECTOR, - document_metadata={ - "vault_name": vault_name, - "file_path": relative_path, - "connector_id": connector_id, - }, - content="Pending...", # Placeholder until processed - content_hash=unique_identifier_hash, # Temporary unique value - updated when ready - unique_identifier_hash=unique_identifier_hash, - embedding=None, - chunks=[], # Empty at creation - safe for async - status=DocumentStatus.pending(), # Pending until processing starts - updated_at=get_current_timestamp(), - created_by_id=user_id, - connector_id=connector_id, - ) - session.add(document) - new_documents_created = True - - files_to_process.append( - { - "document": document, - "is_new": True, - "file_info": file_info, - "content": content, - "body_content": body_content, - "frontmatter": frontmatter, - "wiki_links": wiki_links, - "tags": tags, - "title": title, - "relative_path": relative_path, - "content_hash": content_hash, - "unique_identifier_hash": unique_identifier_hash, - } - ) - - except Exception as e: - logger.exception( - f"Error in Phase 1 for file {file_info.get('path', 'unknown')}: {e}" - ) - failed_count += 1 - continue - - # Commit all pending documents - they all appear in UI now - if new_documents_created: - logger.info( - f"Phase 1: Committing {len([f for f in files_to_process if f['is_new']])} pending documents" - ) - await session.commit() - - # ======================================================================= - # PHASE 2: Process each document one by one - # Each document transitions: pending → processing → ready/failed - # ======================================================================= - logger.info(f"Phase 2: Processing {len(files_to_process)} documents") - - # Get LLM for summarization - long_context_llm = await get_user_long_context_llm( - session, user_id, search_space_id - ) - - for item in files_to_process: - # Send heartbeat periodically - if on_heartbeat_callback: - current_time = time.time() - if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS: - await on_heartbeat_callback(indexed_count) - last_heartbeat_time = current_time - - document = item["document"] - try: - # Set to PROCESSING and commit - shows "processing" in UI for THIS document only - document.status = DocumentStatus.processing() - await session.commit() - - # Extract data from item - title = item["title"] - relative_path = item["relative_path"] - content = item["content"] - body_content = item["body_content"] - frontmatter = item["frontmatter"] - wiki_links = item["wiki_links"] - tags = item["tags"] - content_hash = item["content_hash"] - file_info = item["file_info"] - - # Build metadata - document_metadata = { - "vault_name": vault_name, - "file_path": relative_path, - "tags": tags, - "outgoing_links": wiki_links, - "frontmatter": frontmatter, - "modified_at": file_info["modified_at"].isoformat(), - "created_at": file_info["created_at"].isoformat(), - "word_count": len(body_content.split()), - } - - # Build document content with metadata - 
metadata_sections = [ - ( - "METADATA", - [ - f"Title: {title}", - f"Vault: {vault_name}", - f"Path: {relative_path}", - f"Tags: {', '.join(tags) if tags else 'None'}", - f"Links to: {', '.join(wiki_links) if wiki_links else 'None'}", - ], - ), - ("CONTENT", [body_content]), - ] - document_string = build_document_metadata_string(metadata_sections) - - # Generate summary - summary_content = "" - if long_context_llm and connector.enable_summary: - summary_content, _ = await generate_document_summary( - document_string, - long_context_llm, - document_metadata, - ) - - # Generate embedding - embedding = embed_text(document_string) - - # Add URL and summary to metadata - document_metadata["url"] = f"obsidian://{vault_name}/{relative_path}" - document_metadata["summary"] = summary_content - document_metadata["connector_id"] = connector_id - - # Create chunks - chunks = await create_document_chunks(document_string) - - # Update document to READY with actual content - document.title = title - document.content = document_string - document.content_hash = content_hash - document.embedding = embedding - document.document_metadata = document_metadata - await safe_set_chunks(session, document, chunks) - document.updated_at = get_current_timestamp() - document.status = DocumentStatus.ready() - - indexed_count += 1 - - # Batch commit every 10 documents (for ready status updates) - if indexed_count % 10 == 0: - logger.info( - f"Committing batch: {indexed_count} Obsidian notes processed so far" - ) - await session.commit() - - except Exception as e: - logger.exception( - f"Error processing file {item.get('file_info', {}).get('path', 'unknown')}: {e}" - ) - # Mark document as failed with reason (visible in UI) - try: - document.status = DocumentStatus.failed(str(e)) - document.updated_at = get_current_timestamp() - except Exception as status_error: - logger.error( - f"Failed to update document status to failed: {status_error}" - ) - failed_count += 1 - continue - - # CRITICAL: Always update timestamp (even if 0 documents indexed) so Zero syncs - await update_connector_last_indexed(session, connector, update_last_indexed) - - # Final commit for any remaining documents not yet committed in batches - logger.info(f"Final commit: Total {indexed_count} Obsidian notes processed") - try: - await session.commit() - logger.info( - "Successfully committed all Obsidian document changes to database" - ) - except Exception as e: - # Handle any remaining integrity errors gracefully (race conditions, etc.) - if ( - "duplicate key value violates unique constraint" in str(e).lower() - or "uniqueviolationerror" in str(e).lower() - ): - logger.warning( - f"Duplicate content_hash detected during final commit. " - f"This may occur if the same note was indexed by multiple connectors. " - f"Rolling back and continuing. 
Error: {e!s}" - ) - await session.rollback() - # Don't fail the entire task - some documents may have been successfully indexed - else: - raise - - # Build warning message if there were issues - warning_parts = [] - if duplicate_content_count > 0: - warning_parts.append(f"{duplicate_content_count} duplicate") - if failed_count > 0: - warning_parts.append(f"{failed_count} failed") - warning_message = ", ".join(warning_parts) if warning_parts else None - - total_processed = indexed_count - - await task_logger.log_task_success( - log_entry, - f"Successfully completed Obsidian vault indexing for connector {connector_id}", - { - "notes_processed": total_processed, - "documents_indexed": indexed_count, - "documents_skipped": skipped_count, - "documents_failed": failed_count, - "duplicate_content_count": duplicate_content_count, - }, - ) - - logger.info( - f"Obsidian vault indexing completed: {indexed_count} ready, " - f"{skipped_count} skipped, {failed_count} failed " - f"({duplicate_content_count} duplicate content)" - ) - return total_processed, warning_message - - except SQLAlchemyError as e: - logger.exception(f"Database error during Obsidian indexing: {e}") - await session.rollback() - await task_logger.log_task_failure( - log_entry, - f"Database error during Obsidian indexing: {e}", - "Database error", - {"error_type": "SQLAlchemyError"}, - ) - return 0, f"Database error: {e}" - - except Exception as e: - logger.exception(f"Error during Obsidian indexing: {e}") - await task_logger.log_task_failure( - log_entry, - f"Error during Obsidian indexing: {e}", - "Unexpected error", - {"error_type": type(e).__name__}, - ) - return 0, str(e) diff --git a/surfsense_backend/app/utils/periodic_scheduler.py b/surfsense_backend/app/utils/periodic_scheduler.py index 9ea45df63..aa8c07ce4 100644 --- a/surfsense_backend/app/utils/periodic_scheduler.py +++ b/surfsense_backend/app/utils/periodic_scheduler.py @@ -34,7 +34,6 @@ CONNECTOR_TASK_MAP = { SearchSourceConnectorType.ELASTICSEARCH_CONNECTOR: "index_elasticsearch_documents", SearchSourceConnectorType.WEBCRAWLER_CONNECTOR: "index_crawled_urls", SearchSourceConnectorType.BOOKSTACK_CONNECTOR: "index_bookstack_pages", - SearchSourceConnectorType.OBSIDIAN_CONNECTOR: "index_obsidian_vault", } @@ -100,7 +99,6 @@ def create_periodic_schedule( index_linear_issues_task, index_luma_events_task, index_notion_pages_task, - index_obsidian_vault_task, index_slack_messages_task, ) @@ -121,7 +119,6 @@ def create_periodic_schedule( SearchSourceConnectorType.ELASTICSEARCH_CONNECTOR: index_elasticsearch_documents_task, SearchSourceConnectorType.WEBCRAWLER_CONNECTOR: index_crawled_urls_task, SearchSourceConnectorType.BOOKSTACK_CONNECTOR: index_bookstack_pages_task, - SearchSourceConnectorType.OBSIDIAN_CONNECTOR: index_obsidian_vault_task, } # Trigger the first run immediately diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/obsidian-config.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/obsidian-config.tsx index 6c81353ee..cfe6f0574 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/obsidian-config.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/obsidian-config.tsx @@ -1,15 +1,11 @@ "use client"; -import { AlertTriangle, Download, Info } from "lucide-react"; +import { Info } from "lucide-react"; import { type FC, useEffect, useMemo, useState } from "react"; import { Alert, AlertDescription, 
 	AlertTitle,
 } from "@/components/ui/alert";
-import { Button } from "@/components/ui/button";
 import { connectorsApiService, type ObsidianStats } from "@/lib/apis/connectors-api.service";
 import type { ConnectorConfigProps } from "../index";
 
-const PLUGIN_RELEASES_URL =
-	"https://github.com/MODSetter/SurfSense/releases?q=obsidian&expanded=true";
-
 function formatTimestamp(value: unknown): string {
 	if (typeof value !== "string" || !value) return "—";
 	const d = new Date(value);
@@ -26,78 +22,17 @@ function formatTimestamp(value: unknown): string {
  * web UI doesn't expose a Name input or a Save button for Obsidian (the
  * latter is suppressed in `connector-edit-view.tsx`).
  *
- * Renders one of three modes depending on the connector's `config`:
- *
- * 1. **Plugin connector** (`config.source === "plugin"`) — read-only stats
- *    panel showing what the plugin most recently reported.
- * 2. **Legacy server-path connector** (`config.legacy === true`, set by the
- *    Phase 3 alembic) — migration banner, an "Install Plugin" CTA, and a
- *    short "how to migrate" checklist that ends with the user pressing the
- *    standard Disconnect button (which deletes this connector along with
- *    every document it previously indexed).
- * 3. **Unknown** — fallback for rows that escaped the alembic; suggests a
- *    clean re-install.
+ * Renders plugin stats when connector metadata comes from the plugin.
+ * If metadata is missing or malformed, we show a recovery hint.
  */
 export const ObsidianConfig: FC<ConnectorConfigProps> = ({ connector }) => {
 	const config = (connector.config ?? {}) as Record<string, unknown>;
-	const isLegacy = config.legacy === true;
 	const isPlugin = config.source === "plugin";
 
-	if (isLegacy) return <LegacyBanner />;
 	if (isPlugin) return <PluginStats config={config} />;
 	return <UnknownConnectorState />;
 };
 
-const LegacyBanner: FC = () => {
-	return (
-		<div className="space-y-4">
-			<Alert variant="destructive">
-				<AlertTriangle className="h-4 w-4" />
-				<AlertTitle>Sync stopped — install the plugin to migrate</AlertTitle>
-				<AlertDescription>
-					This Obsidian connector used the legacy server-path scanner, which has been removed. The
-					notes already indexed remain searchable, but they no longer reflect changes made in your
-					vault.
-				</AlertDescription>
-			</Alert>
-			<Button asChild>
-				<a href={PLUGIN_RELEASES_URL} target="_blank" rel="noreferrer">
-					<Download className="h-4 w-4" />
-					Install Plugin
-				</a>
-			</Button>
-			<div>
-				<h4>How to migrate</h4>
-				<ol>
-					<li>Install the SurfSense Obsidian plugin using the button above.</li>
-					<li>
-						In Obsidian, open Settings → SurfSense, sign in, pick a search space, and wait for the
-						first sync to finish.
-					</li>
-					<li>
-						Confirm the new "Obsidian — &lt;vault&gt;" connector shows your notes, then return here
-						and use the Disconnect button below to remove this legacy connector.
-					</li>
-				</ol>
-			</div>
-			<Alert>
-				<Info className="h-4 w-4" />
-				<AlertDescription>
-					Heads up: Disconnect also deletes every document this connector previously indexed. Make
-					sure the plugin has finished its first sync before you disconnect, otherwise your Obsidian
-					notes will disappear from search until the plugin re-indexes them.
-				</AlertDescription>
-			</Alert>
-		</div>
-	);
-};
-
 const PluginStats: FC<{ config: Record<string, unknown> }> = ({ config }) => {
 	const vaultId = typeof config.vault_id === "string" ? config.vault_id : null;
 	const [stats, setStats] = useState<ObsidianStats | null>(null);
@@ -179,8 +114,8 @@ const UnknownConnectorState: FC = () => (
 		<AlertTitle>Unrecognized config</AlertTitle>
 		<AlertDescription>
-			This connector has neither plugin metadata nor a legacy marker. It may predate the migration —
-			you can safely delete it and re-install the SurfSense Obsidian plugin to resume syncing.
+			This connector is missing plugin metadata. Delete it, then reconnect your vault from the
+			SurfSense Obsidian plugin so sync can resume.
 		</AlertDescription>
 	</Alert>
 );
diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-connect-view.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-connect-view.tsx
index 8a0ef5ae1..e58542923 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-connect-view.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-connect-view.tsx
@@ -111,7 +111,9 @@ export const ConnectorConnectView: FC = ({
 					: getConnectorTypeDisplay(connectorType)}

-						Enter your connection details
+						{connectorType === "OBSIDIAN_CONNECTOR"
+							? "Follow the plugin setup steps below"
+							: "Enter your connection details"}

diff --git a/surfsense_web/content/docs/connectors/obsidian.mdx b/surfsense_web/content/docs/connectors/obsidian.mdx
index c8475c97f..c4d50cf34 100644
--- a/surfsense_web/content/docs/connectors/obsidian.mdx
+++ b/surfsense_web/content/docs/connectors/obsidian.mdx
@@ -1,143 +1,60 @@
 ---
 title: Obsidian
-description: Connect your Obsidian vault to SurfSense
+description: Sync your Obsidian vault with the SurfSense plugin
 ---
 
-# Obsidian Integration Setup Guide
+# Obsidian Plugin Setup Guide
 
-This guide walks you through connecting your Obsidian vault to SurfSense for note search and AI-powered insights.
-
-<Callout>
-  This connector requires direct file system access and only works with self-hosted SurfSense installations.
-</Callout>
+SurfSense integrates with Obsidian through the SurfSense Obsidian plugin.
+The old server-side vault path scanner is no longer supported.
 
 ## How it works
 
-The Obsidian connector scans your local Obsidian vault directory and indexes all Markdown files. It preserves your note structure and extracts metadata from YAML frontmatter.
+The plugin runs inside your Obsidian app and pushes note updates to SurfSense over HTTPS.
+This works for cloud and self-hosted deployments, including desktop and mobile clients.
 
-- For follow-up indexing runs, the connector uses content hashing to skip unchanged files for faster sync.
-- Indexing should be configured to run periodically, so updates should appear in your search results within minutes.
-
----
-
-## What Gets Indexed
+## What gets indexed
 
 | Content Type | Description |
 |--------------|-------------|
-| Markdown Files | All `.md` files in your vault |
-| Frontmatter | YAML metadata (title, tags, aliases, dates) |
-| Wiki Links | Links between notes (`[[note]]`) |
-| Inline Tags | Tags throughout your notes (`#tag`) |
-| Note Content | Full content with intelligent chunking |
+| Markdown files | Note content (`.md`) |
+| Frontmatter | YAML metadata like title, tags, aliases, dates |
+| Wiki links | Linked notes (`[[note]]`) |
+| Tags | Inline and frontmatter tags |
+| Vault metadata | Vault and path metadata used for deep links and sync state |
 
-<Callout>
-  Binary files and attachments are not indexed by default. Enable "Include Attachments" to index embedded files.
-</Callout>
+## Quick start
 
----
-
-## Quick Start (Local Installation)
-
-1. Navigate to **Connectors** → **Add Connector** → **Obsidian**
-2. Enter your vault path: `/Users/yourname/Documents/MyVault`
-3. Enter a vault name (e.g., `Personal Notes`)
-4. Click **Connect Obsidian**
+1. Open **Connectors** in SurfSense and choose **Obsidian**.
+2. Click **Open plugin releases** and install the latest SurfSense Obsidian plugin.
+3. In Obsidian, open **Settings → SurfSense**.
+4. Paste your SurfSense API token from the connector setup panel.
+5. Paste your SurfSense backend URL in the plugin's **Server URL** setting.
+6. Choose the Search Space in the plugin, then run the first sync.
+7. Confirm the connector appears as **Obsidian — &lt;vault&gt;** in SurfSense.
 
 <Callout>
-  Find your vault path: In Obsidian, right-click any note → "Reveal in Finder" (macOS) or "Show in Explorer" (Windows).
+  You do not create or configure a vault path in the web UI. The connector row is created automatically when the plugin calls `/api/v1/obsidian/connect`.
 </Callout>
 
-<Callout>
-  Enable periodic sync to automatically re-index notes when content changes. Available frequencies: Every 5 minutes, 15 minutes, hourly, every 6 hours, daily, or weekly.
-</Callout>
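To make the callout above concrete, here is a rough sketch of what that first-sync handshake could look like from the plugin side. The endpoint path is the one named above; the Bearer auth scheme and the request fields (`vault_name`, `vault_id`, `search_space_id`) are assumptions for illustration, not the plugin's documented wire format.

```ts
// Hypothetical sketch of the plugin's first-sync handshake.
// The endpoint path comes from the docs above; the header scheme and
// every request field name here are assumptions, not the real wire format.
interface ObsidianConnectRequest {
  vault_name: string; // label shown in the SurfSense connector list
  vault_id: string; // stable ID so later syncs reuse the same connector row
  search_space_id: number; // the Search Space chosen in the plugin settings
}

async function connectVault(
  serverUrl: string,
  apiToken: string,
  request: ObsidianConnectRequest,
): Promise<unknown> {
  const res = await fetch(`${serverUrl}/api/v1/obsidian/connect`, {
    method: "POST",
    headers: {
      Authorization: `Bearer ${apiToken}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify(request),
  });
  if (!res.ok) {
    throw new Error(`Connect failed: HTTP ${res.status}`);
  }
  return res.json();
}
```

Keeping a stable `vault_id` is what would let repeat syncs update the same connector row instead of creating a new connector on every run.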
- +## Self-hosted notes ---- - -## Docker Setup - -For Docker deployments, you need to mount your Obsidian vault as a volume. - -### Step 1: Update docker-compose.yml - -Add your vault as a volume mount to the SurfSense backend service: - -```yaml -services: - surfsense: - # ... other config - volumes: - - /path/to/your/obsidian/vault:/app/obsidian_vaults/my-vault:ro -``` - - - The `:ro` flag mounts the vault as read-only, which is recommended for security. - - -### Step 2: Configure the Connector - -Use the **container path** (not your local path) when setting up the connector: - -| Your Local Path | Container Path (use this) | -|-----------------|---------------------------| -| `/Users/john/Documents/MyVault` | `/app/obsidian_vaults/my-vault` | -| `C:\Users\john\Documents\MyVault` | `/app/obsidian_vaults/my-vault` | - -### Example: Multiple Vaults - -```yaml -volumes: - - /Users/john/Documents/PersonalNotes:/app/obsidian_vaults/personal:ro - - /Users/john/Documents/WorkNotes:/app/obsidian_vaults/work:ro -``` - -Then create separate connectors for each vault using `/app/obsidian_vaults/personal` and `/app/obsidian_vaults/work`. - ---- - -## Connector Configuration - -| Field | Description | Required | -|-------|-------------|----------| -| **Connector Name** | A friendly name to identify this connector | Yes | -| **Vault Path** | Absolute path to your vault (container path for Docker) | Yes | -| **Vault Name** | Display name for your vault in search results | Yes | -| **Exclude Folders** | Comma-separated folder names to skip | No | -| **Include Attachments** | Index embedded files (images, PDFs) | No | - ---- - -## Recommended Exclusions - -Common folders to exclude from indexing: - -| Folder | Reason | -|--------|--------| -| `.obsidian` | Obsidian config files (always exclude) | -| `.trash` | Obsidian's trash folder | -| `templates` | Template files you don't want searchable | -| `daily-notes` | If you want to exclude daily notes | -| `attachments` | If not using "Include Attachments" | - -Default exclusions: `.obsidian,.trash` - ---- +- Use your public or LAN backend URL that your Obsidian device can reach. +- No Docker bind mount for the vault is required. +- If your instance is behind TLS, ensure the URL/certificate is valid for the device running Obsidian. ## Troubleshooting -**Vault not found / Permission denied** -- Verify the path exists and is accessible -- For Docker: ensure the volume is mounted correctly in `docker-compose.yml` -- Check file permissions: SurfSense needs read access to the vault directory +**Plugin connects but no files appear** +- Verify the plugin is pointed to the correct Search Space. +- Trigger a manual sync from the plugin settings. +- Confirm your API token is valid and not expired. -**No notes indexed** -- Ensure your vault contains `.md` files -- Check that notes aren't in excluded folders -- Verify the path points to the vault root (contains `.obsidian` folder) +**Unauthorized / 401 errors** +- Regenerate and paste a fresh API token from SurfSense. +- Ensure the token belongs to the same account and workspace you are syncing into. 
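If 401s persist, it can help to take the plugin out of the loop and test the token by hand. A minimal probe, assuming the Bearer scheme and reusing the `/api/v1/obsidian/connect` endpoint from the setup sketch above (both are assumptions of this example, not a documented API):

```ts
// Distinguishes a rejected token from an unreachable server.
// Assumes Bearer auth; the endpoint is the one referenced in the setup docs.
async function probeToken(serverUrl: string, apiToken: string): Promise<string> {
  try {
    const res = await fetch(`${serverUrl}/api/v1/obsidian/connect`, {
      method: "POST",
      headers: { Authorization: `Bearer ${apiToken.trim()}` },
    });
    if (res.status === 401) {
      return "token rejected: regenerate it in SurfSense";
    }
    return `server answered with HTTP ${res.status}: the token is likely fine`;
  } catch (err) {
    return `server unreachable: ${String(err)}`;
  }
}
```

The `apiToken.trim()` matters more than it looks: trailing whitespace from copy-paste is a common source of otherwise mysterious 401s.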
-**Changes not appearing** -- Wait for the next sync cycle, or manually trigger re-indexing -- For Docker: restart the container if you modified volume mounts - -**Docker: "path not found" error** -- Use the container path (`/app/obsidian_vaults/...`), not your local path -- Verify the volume mount in `docker-compose.yml` matches +**Cannot reach server URL** +- Check that the backend URL is reachable from the Obsidian device. +- For self-hosted setups, verify firewall and reverse proxy rules. +- Avoid using localhost unless SurfSense and Obsidian run on the same machine.
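As a last check, a bare unauthenticated request from the device running Obsidian separates network problems from everything else: any HTTP response at all, even a 404, proves DNS, routing, and TLS are working. Only the example hostname below is made up.

```ts
// Bare reachability check: any HTTP status (even 404) means DNS, routing,
// and TLS work, so remaining failures are auth or plugin configuration.
async function probeServer(serverUrl: string): Promise<void> {
  try {
    const res = await fetch(serverUrl, { method: "GET" });
    console.log(`Reachable: HTTP ${res.status}`);
  } catch (err) {
    console.error(`Unreachable from this device: ${String(err)}`);
  }
}

probeServer("https://surfsense.example.com"); // replace with your backend URL
```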