From 6b1b8d0f2e6683d350e4f54aacb2e9473e5d0791 Mon Sep 17 00:00:00 2001
From: Differ <1467673018@qq.com>
Date: Thu, 4 Dec 2025 14:08:44 +0800
Subject: [PATCH] feat: add BookStack connector for wiki documentation indexing
---
README.md | 3 +-
.../app/connectors/bookstack_connector.py | 343 ++++++++++++++
surfsense_backend/app/db.py | 2 +
.../routes/search_source_connectors_routes.py | 83 ++++
.../app/services/connector_service.py | 95 ++++
.../app/tasks/celery_tasks/connector_tasks.py | 43 ++
.../app/tasks/connector_indexers/__init__.py | 3 +
.../connector_indexers/bookstack_indexer.py | 434 ++++++++++++++++++
.../app/utils/periodic_scheduler.py | 3 +
.../add/bookstack-connector/page.tsx | 305 ++++++++++++
.../components/dashboard-breadcrumb.tsx | 1 +
.../components/editConnector/types.ts | 3 +
.../components/sources/connector-data.tsx | 7 +
surfsense_web/contracts/enums/connector.ts | 1 +
.../contracts/enums/connectorIcons.tsx | 3 +
.../hooks/use-connector-edit-page.ts | 32 ++
surfsense_web/messages/en.json | 1 +
surfsense_web/messages/zh.json | 1 +
18 files changed, 1362 insertions(+), 1 deletion(-)
create mode 100644 surfsense_backend/app/connectors/bookstack_connector.py
create mode 100644 surfsense_backend/app/tasks/connector_indexers/bookstack_indexer.py
create mode 100644 surfsense_web/app/dashboard/[search_space_id]/connectors/add/bookstack-connector/page.tsx
diff --git a/README.md b/README.md
index c0725a44f..cef71a411 100644
--- a/README.md
+++ b/README.md
@@ -15,7 +15,7 @@
# SurfSense
-While tools like NotebookLM and Perplexity are impressive and highly effective for conducting research on any topic/query, SurfSense elevates this capability by integrating with your personal knowledge base. It is a highly customizable AI research agent, connected to external sources such as Search Engines (SearxNG, Tavily, LinkUp), Slack, Linear, Jira, ClickUp, Confluence, Gmail, Notion, YouTube, GitHub, Discord, Airtable, Google Calendar, Luma, Elasticsearch and more to come.
+While tools like NotebookLM and Perplexity are impressive and highly effective for conducting research on any topic/query, SurfSense elevates this capability by integrating with your personal knowledge base. It is a highly customizable AI research agent, connected to external sources such as Search Engines (SearxNG, Tavily, LinkUp), Slack, Linear, Jira, ClickUp, Confluence, BookStack, Gmail, Notion, YouTube, GitHub, Discord, Airtable, Google Calendar, Luma, Elasticsearch and more to come.
@@ -72,6 +72,7 @@ Open source and easy to deploy locally.
- Jira
- ClickUp
- Confluence
+- BookStack
- Notion
- Gmail
- Youtube Videos
diff --git a/surfsense_backend/app/connectors/bookstack_connector.py b/surfsense_backend/app/connectors/bookstack_connector.py
new file mode 100644
index 000000000..547dffa53
--- /dev/null
+++ b/surfsense_backend/app/connectors/bookstack_connector.py
@@ -0,0 +1,343 @@
+"""
+BookStack Connector Module
+
+A module for retrieving data from BookStack wiki systems.
+Allows fetching pages, books, and chapters from BookStack instances.
+
+BookStack API Documentation: https://demo.bookstackapp.com/api/docs
+"""
+
+import logging
+import time
+from typing import Any
+
+import requests
+
+logger = logging.getLogger(__name__)
+
+
+class BookStackConnector:
+ """Class for retrieving data from BookStack."""
+
+ # Rate limiting: 180 requests per minute = 0.33 seconds per request
+ # Using 0.35 seconds to be safe
+ REQUEST_INTERVAL = 0.35
+
+ def __init__(
+ self,
+ base_url: str | None = None,
+ token_id: str | None = None,
+ token_secret: str | None = None,
+ ):
+ """
+ Initialize the BookStackConnector class.
+
+ Args:
+ base_url: BookStack instance base URL (e.g., 'https://docs.example.com')
+ token_id: BookStack API Token ID
+ token_secret: BookStack API Token Secret
+ """
+ self.base_url = base_url.rstrip("/") if base_url else None
+ self.token_id = token_id
+ self.token_secret = token_secret
+ self._last_request_time = 0.0
+
+ def set_credentials(
+ self, base_url: str, token_id: str, token_secret: str
+ ) -> None:
+ """
+ Set the BookStack credentials.
+
+ Args:
+ base_url: BookStack instance base URL
+ token_id: BookStack API Token ID
+ token_secret: BookStack API Token Secret
+ """
+ self.base_url = base_url.rstrip("/")
+ self.token_id = token_id
+ self.token_secret = token_secret
+
+ def get_headers(self) -> dict[str, str]:
+ """
+ Get headers for BookStack API requests using Token Authentication.
+
+ Returns:
+ Dictionary of headers
+
+ Raises:
+ ValueError: If token_id, token_secret, or base_url have not been set
+ """
+ if not all([self.base_url, self.token_id, self.token_secret]):
+ raise ValueError(
+ "BookStack credentials not initialized. Call set_credentials() first."
+ )
+
+ return {
+ "Authorization": f"Token {self.token_id}:{self.token_secret}",
+ "Content-Type": "application/json",
+ "Accept": "application/json",
+ }
+
+ def _rate_limit(self) -> None:
+ """Apply rate limiting between API requests."""
+ current_time = time.time()
+ elapsed = current_time - self._last_request_time
+ if elapsed < self.REQUEST_INTERVAL:
+ time.sleep(self.REQUEST_INTERVAL - elapsed)
+ self._last_request_time = time.time()
+
+ def make_api_request(
+ self,
+ endpoint: str,
+ params: dict[str, Any] | None = None,
+ raw_response: bool = False,
+ ) -> dict[str, Any] | str:
+ """
+ Make a request to the BookStack API.
+
+ Args:
+ endpoint: API endpoint (without base URL, e.g., 'pages' or 'pages/1')
+ params: Query parameters for the request (optional)
+ raw_response: If True, return raw text response instead of JSON
+
+ Returns:
+ Response data from the API (dict for JSON, str for raw)
+
+ Raises:
+ ValueError: If credentials have not been set
+ Exception: If the API request fails
+ """
+ if not all([self.base_url, self.token_id, self.token_secret]):
+ raise ValueError(
+ "BookStack credentials not initialized. Call set_credentials() first."
+ )
+
+ # Apply rate limiting
+ self._rate_limit()
+
+ url = f"{self.base_url}/api/{endpoint}"
+ headers = self.get_headers()
+
+ try:
+ response = requests.get(url, headers=headers, params=params, timeout=30)
+ response.raise_for_status()
+
+ if raw_response:
+ return response.text
+ return response.json()
+
+ except requests.exceptions.HTTPError as e:
+ if e.response.status_code == 429:
+ logger.warning("Rate limit exceeded, waiting 60 seconds...")
+ time.sleep(60)
+ return self.make_api_request(endpoint, params, raw_response)
+ raise Exception(f"BookStack API request failed: {e!s}") from e
+ except requests.exceptions.RequestException as e:
+ raise Exception(f"BookStack API request failed: {e!s}") from e
+
+ def get_all_pages(self, count: int = 500) -> list[dict[str, Any]]:
+ """
+ Fetch all pages from BookStack with pagination.
+
+ Args:
+ count: Number of records per request (max 500)
+
+ Returns:
+ List of page objects
+
+ Raises:
+ ValueError: If credentials have not been set
+ Exception: If the API request fails
+ """
+ all_pages = []
+ offset = 0
+
+ while True:
+ params = {
+ "count": min(count, 500),
+ "offset": offset,
+ }
+
+ result = self.make_api_request("pages", params)
+
+ if not isinstance(result, dict) or "data" not in result:
+ raise Exception("Invalid response from BookStack API")
+
+ pages = result["data"]
+ all_pages.extend(pages)
+
+ logger.info(f"Fetched {len(pages)} pages (offset: {offset})")
+
+ # Check if we've fetched all pages
+ total = result.get("total", 0)
+ if offset + len(pages) >= total:
+ break
+
+ offset += len(pages)
+
+ logger.info(f"Total pages fetched: {len(all_pages)}")
+ return all_pages
+
+ def get_page_detail(self, page_id: int) -> dict[str, Any]:
+ """
+ Get detailed information for a single page.
+
+ The response includes 'html' (rendered) and optionally 'markdown' content.
+
+ Args:
+ page_id: The ID of the page
+
+ Returns:
+ Page detail object with content
+
+ Raises:
+ ValueError: If credentials have not been set
+ Exception: If the API request fails
+ """
+ result = self.make_api_request(f"pages/{page_id}")
+
+ if not isinstance(result, dict):
+ raise Exception(f"Invalid response for page {page_id}")
+
+ return result
+
+ def export_page_markdown(self, page_id: int) -> str:
+ """
+ Export a page as Markdown content.
+
+ Args:
+ page_id: The ID of the page
+
+ Returns:
+ Markdown content as string
+
+ Raises:
+ ValueError: If credentials have not been set
+ Exception: If the API request fails
+ """
+ result = self.make_api_request(
+ f"pages/{page_id}/export/markdown", raw_response=True
+ )
+ return result if isinstance(result, str) else ""
+
+ def get_book_detail(self, book_id: int) -> dict[str, Any]:
+ """
+ Get detailed information for a single book.
+
+ The response includes a 'content' property with the book's structure.
+
+ Args:
+ book_id: The ID of the book
+
+ Returns:
+ Book detail object
+
+ Raises:
+ ValueError: If credentials have not been set
+ Exception: If the API request fails
+ """
+ result = self.make_api_request(f"books/{book_id}")
+
+ if not isinstance(result, dict):
+ raise Exception(f"Invalid response for book {book_id}")
+
+ return result
+
+ def get_pages_by_date_range(
+ self,
+ start_date: str,
+ end_date: str,
+ count: int = 500,
+ ) -> tuple[list[dict[str, Any]], str | None]:
+ """
+ Fetch pages updated within a specific date range.
+
+ Uses the filter[updated_at:gt] parameter for incremental indexing.
+
+ Args:
+ start_date: Start date in YYYY-MM-DD format
+ end_date: End date in YYYY-MM-DD format (currently unused, for future use)
+ count: Number of records per request (max 500)
+
+ Returns:
+ Tuple of (list of page objects, error message or None)
+
+ Raises:
+ ValueError: If credentials have not been set
+ """
+ all_pages = []
+ offset = 0
+
+ try:
+ while True:
+ params = {
+ "count": min(count, 500),
+ "offset": offset,
+ "filter[updated_at:gt]": start_date,
+ "sort": "-updated_at", # Most recently updated first
+ }
+
+ result = self.make_api_request("pages", params)
+
+ if not isinstance(result, dict) or "data" not in result:
+ return [], "Invalid response from BookStack API"
+
+ pages = result["data"]
+ all_pages.extend(pages)
+
+ logger.info(
+ f"Fetched {len(pages)} pages updated after {start_date} (offset: {offset})"
+ )
+
+ # Check if we've fetched all pages
+ total = result.get("total", 0)
+ if offset + len(pages) >= total:
+ break
+
+ offset += len(pages)
+
+ if not all_pages:
+ return [], f"No pages found updated after {start_date}"
+
+ logger.info(
+ f"Total pages fetched for date range {start_date} to {end_date}: {len(all_pages)}"
+ )
+ return all_pages, None
+
+ except Exception as e:
+ logger.error(f"Error fetching pages by date range: {e!s}", exc_info=True)
+ return [], str(e)
+
+ def get_page_with_content(
+ self, page_id: int, use_markdown: bool = True
+ ) -> tuple[dict[str, Any], str]:
+ """
+ Get page details along with its full content.
+
+ Args:
+ page_id: The ID of the page
+ use_markdown: If True, export as Markdown; otherwise use HTML
+
+ Returns:
+ Tuple of (page detail dict, content string)
+
+ Raises:
+ ValueError: If credentials have not been set
+ Exception: If the API request fails
+ """
+ # Get page details first
+ page_detail = self.get_page_detail(page_id)
+
+ # Get content
+ if use_markdown:
+ try:
+ content = self.export_page_markdown(page_id)
+ except Exception as e:
+ logger.warning(
+ f"Failed to export markdown for page {page_id}, falling back to HTML: {e}"
+ )
+ content = page_detail.get("html", "")
+ else:
+ content = page_detail.get("html", "")
+
+ return page_detail, content
diff --git a/surfsense_backend/app/db.py b/surfsense_backend/app/db.py
index 06abb7a39..d9275f9f6 100644
--- a/surfsense_backend/app/db.py
+++ b/surfsense_backend/app/db.py
@@ -52,6 +52,7 @@ class DocumentType(str, Enum):
AIRTABLE_CONNECTOR = "AIRTABLE_CONNECTOR"
LUMA_CONNECTOR = "LUMA_CONNECTOR"
ELASTICSEARCH_CONNECTOR = "ELASTICSEARCH_CONNECTOR"
+ BOOKSTACK_CONNECTOR = "BOOKSTACK_CONNECTOR"
class SearchSourceConnectorType(str, Enum):
@@ -74,6 +75,7 @@ class SearchSourceConnectorType(str, Enum):
LUMA_CONNECTOR = "LUMA_CONNECTOR"
ELASTICSEARCH_CONNECTOR = "ELASTICSEARCH_CONNECTOR"
WEBCRAWLER_CONNECTOR = "WEBCRAWLER_CONNECTOR"
+ BOOKSTACK_CONNECTOR = "BOOKSTACK_CONNECTOR"
class ChatType(str, Enum):
diff --git a/surfsense_backend/app/routes/search_source_connectors_routes.py b/surfsense_backend/app/routes/search_source_connectors_routes.py
index bf397a352..971d95cba 100644
--- a/surfsense_backend/app/routes/search_source_connectors_routes.py
+++ b/surfsense_backend/app/routes/search_source_connectors_routes.py
@@ -597,6 +597,19 @@ async def index_connector_content(
)
response_message = "Confluence indexing started in the background."
+ elif connector.connector_type == SearchSourceConnectorType.BOOKSTACK_CONNECTOR:
+ from app.tasks.celery_tasks.connector_tasks import (
+ index_bookstack_pages_task,
+ )
+
+ logger.info(
+ f"Triggering BookStack indexing for connector {connector_id} into search space {search_space_id} from {indexing_from} to {indexing_to}"
+ )
+ index_bookstack_pages_task.delay(
+ connector_id, search_space_id, str(user.id), indexing_from, indexing_to
+ )
+ response_message = "BookStack indexing started in the background."
+
elif connector.connector_type == SearchSourceConnectorType.CLICKUP_CONNECTOR:
from app.tasks.celery_tasks.connector_tasks import index_clickup_tasks_task
@@ -1597,3 +1610,73 @@ async def run_web_page_indexing(
)
except Exception as e:
logger.error(f"Error in background Web page indexing task: {e!s}")
+
+
+# Add new helper functions for BookStack indexing
async def run_bookstack_indexing_with_new_session(
    connector_id: int,
    search_space_id: int,
    user_id: str,
    start_date: str,
    end_date: str,
):
    """Run BookStack indexing inside its own dedicated database session.

    Intended for fire-and-forget background execution: the session is created
    and disposed of here instead of being borrowed from a request handler.
    """
    logger.info(
        f"Background task started: Indexing BookStack connector {connector_id} "
        f"into space {search_space_id} from {start_date} to {end_date}"
    )
    async with async_session_maker() as db_session:
        await run_bookstack_indexing(
            db_session,
            connector_id,
            search_space_id,
            user_id,
            start_date,
            end_date,
        )
    logger.info(f"Background task finished: Indexing BookStack connector {connector_id}")
+
+
async def run_bookstack_indexing(
    session: AsyncSession,
    connector_id: int,
    search_space_id: int,
    user_id: str,
    start_date: str,
    end_date: str,
):
    """
    Run BookStack indexing as a background task.

    Delegates the actual work to the BookStack indexer and, on success only,
    advances the connector's last-indexed timestamp.

    Args:
        session: Database session
        connector_id: ID of the BookStack connector
        search_space_id: ID of the search space
        user_id: ID of the user
        start_date: Start date for indexing
        end_date: End date for indexing
    """
    from app.tasks.connector_indexers import index_bookstack_pages

    try:
        # The indexer itself must not bump the timestamp; we do it below so
        # the update only happens after a fully successful run.
        indexed_count, error_message = await index_bookstack_pages(
            session,
            connector_id,
            search_space_id,
            user_id,
            start_date,
            end_date,
            update_last_indexed=False,
        )
        if not error_message:
            logger.info(
                f"BookStack indexing successful for connector {connector_id}. Indexed {indexed_count} documents."
            )
            # Update the last indexed timestamp only on success
            await update_connector_last_indexed(session, connector_id)
            await session.commit()  # Commit timestamp update
        else:
            logger.error(
                f"BookStack indexing failed for connector {connector_id}: {error_message}"
            )
    except Exception as e:
        logger.error(
            f"Critical error in run_bookstack_indexing for connector {connector_id}: {e}",
            exc_info=True,
        )
diff --git a/surfsense_backend/app/services/connector_service.py b/surfsense_backend/app/services/connector_service.py
index 3445d69f7..21afa2c8f 100644
--- a/surfsense_backend/app/services/connector_service.py
+++ b/surfsense_backend/app/services/connector_service.py
@@ -2576,3 +2576,98 @@ class ConnectorService:
}
return result_object, elasticsearch_chunks
+
+ async def search_bookstack(
+ self,
+ user_query: str,
+ user_id: str,
+ search_space_id: int,
+ top_k: int = 20,
+ search_mode: SearchMode = SearchMode.CHUNKS,
+ ) -> tuple:
+ """
+ Search for BookStack pages and return both the source information and langchain documents
+
+ Args:
+ user_query: The user's query
+ user_id: The user's ID
+ search_space_id: The search space ID to search in
+ top_k: Maximum number of results to return
+ search_mode: Search mode (CHUNKS or DOCUMENTS)
+
+ Returns:
+ tuple: (sources_info, langchain_documents)
+ """
+ if search_mode == SearchMode.CHUNKS:
+ bookstack_chunks = await self.chunk_retriever.hybrid_search(
+ query_text=user_query,
+ top_k=top_k,
+ user_id=user_id,
+ search_space_id=search_space_id,
+ document_type="BOOKSTACK_CONNECTOR",
+ )
+ elif search_mode == SearchMode.DOCUMENTS:
+ bookstack_chunks = await self.document_retriever.hybrid_search(
+ query_text=user_query,
+ top_k=top_k,
+ user_id=user_id,
+ search_space_id=search_space_id,
+ document_type="BOOKSTACK_CONNECTOR",
+ )
+ # Transform document retriever results to match expected format
+ bookstack_chunks = self._transform_document_results(bookstack_chunks)
+
+ # Early return if no results
+ if not bookstack_chunks:
+ return {
+ "id": 50,
+ "name": "BookStack",
+ "type": "BOOKSTACK_CONNECTOR",
+ "sources": [],
+ }, []
+
+ # Process each chunk and create sources directly without deduplication
+ sources_list = []
+ async with self.counter_lock:
+ for _i, chunk in enumerate(bookstack_chunks):
+ # Extract document metadata
+ document = chunk.get("document", {})
+ metadata = document.get("metadata", {})
+
+ # Extract BookStack-specific metadata
+ page_name = metadata.get("page_name", "Untitled Page")
+ page_slug = metadata.get("page_slug", "")
+ book_slug = metadata.get("book_slug", "")
+ base_url = metadata.get("base_url", "")
+ page_url = metadata.get("page_url", "")
+
+ # Create a more descriptive title for BookStack pages
+ title = f"BookStack: {page_name}"
+
+ # Create description from content
+ description = chunk.get("content", "")
+
+ # Build URL to the BookStack page
+ url = page_url
+ if not url and base_url and book_slug and page_slug:
+ url = f"{base_url}/books/{book_slug}/page/{page_slug}"
+
+ source = {
+ "id": chunk.get("chunk_id", self.source_id_counter),
+ "title": title,
+ "description": description,
+ "url": url,
+ }
+
+ self.source_id_counter += 1
+ sources_list.append(source)
+
+ # Create result object
+ result_object = {
+ "id": 50, # Assign a unique ID for the BookStack connector
+ "name": "BookStack",
+ "type": "BOOKSTACK_CONNECTOR",
+ "sources": sources_list,
+ }
+
+ return result_object, bookstack_chunks
diff --git a/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py b/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py
index b735741fe..6cd557dc4 100644
--- a/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py
+++ b/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py
@@ -643,3 +643,46 @@ async def _index_crawled_urls(
await run_web_page_indexing(
session, connector_id, search_space_id, user_id, start_date, end_date
)
+
+
@celery_app.task(name="index_bookstack_pages", bind=True)
def index_bookstack_pages_task(
    self,
    connector_id: int,
    search_space_id: int,
    user_id: str,
    start_date: str,
    end_date: str,
):
    """Celery entry point: run BookStack page indexing on a fresh event loop."""
    import asyncio

    # Celery workers are synchronous, so drive the async indexer manually.
    event_loop = asyncio.new_event_loop()
    asyncio.set_event_loop(event_loop)

    try:
        coro = _index_bookstack_pages(
            connector_id, search_space_id, user_id, start_date, end_date
        )
        event_loop.run_until_complete(coro)
    finally:
        event_loop.close()
+
+
async def _index_bookstack_pages(
    connector_id: int,
    search_space_id: int,
    user_id: str,
    start_date: str,
    end_date: str,
):
    """Open a Celery-scoped database session and run BookStack indexing in it."""
    # Imported lazily to avoid a circular import with the routes module.
    from app.routes.search_source_connectors_routes import run_bookstack_indexing

    session_maker = get_celery_session_maker()
    async with session_maker() as db_session:
        await run_bookstack_indexing(
            db_session,
            connector_id,
            search_space_id,
            user_id,
            start_date,
            end_date,
        )
diff --git a/surfsense_backend/app/tasks/connector_indexers/__init__.py b/surfsense_backend/app/tasks/connector_indexers/__init__.py
index f62739679..dcfca33c3 100644
--- a/surfsense_backend/app/tasks/connector_indexers/__init__.py
+++ b/surfsense_backend/app/tasks/connector_indexers/__init__.py
@@ -12,6 +12,7 @@ Available indexers:
- Linear: Index issues from Linear workspaces
- Jira: Index issues from Jira projects
- Confluence: Index pages from Confluence spaces
+- BookStack: Index pages from BookStack wiki instances
- Discord: Index messages from Discord servers
- ClickUp: Index tasks from ClickUp workspaces
- Google Gmail: Index messages from Google Gmail
@@ -24,6 +25,7 @@ Available indexers:
# Communication platforms
# Calendar and scheduling
from .airtable_indexer import index_airtable_records
+from .bookstack_indexer import index_bookstack_pages
from .clickup_indexer import index_clickup_tasks
from .confluence_indexer import index_confluence_pages
from .discord_indexer import index_discord_messages
@@ -46,6 +48,7 @@ from .webcrawler_indexer import index_crawled_urls
__all__ = [ # noqa: RUF022
"index_airtable_records",
+ "index_bookstack_pages",
"index_clickup_tasks",
"index_confluence_pages",
"index_discord_messages",
diff --git a/surfsense_backend/app/tasks/connector_indexers/bookstack_indexer.py b/surfsense_backend/app/tasks/connector_indexers/bookstack_indexer.py
new file mode 100644
index 000000000..f3994a45d
--- /dev/null
+++ b/surfsense_backend/app/tasks/connector_indexers/bookstack_indexer.py
@@ -0,0 +1,434 @@
+"""
+BookStack connector indexer.
+"""
+
+from datetime import datetime
+
+from sqlalchemy.exc import SQLAlchemyError
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.config import config
+from app.connectors.bookstack_connector import BookStackConnector
+from app.db import Document, DocumentType, SearchSourceConnectorType
+from app.services.llm_service import get_user_long_context_llm
+from app.services.task_logging_service import TaskLoggingService
+from app.utils.document_converters import (
+ create_document_chunks,
+ generate_content_hash,
+ generate_document_summary,
+ generate_unique_identifier_hash,
+)
+
+from .base import (
+ calculate_date_range,
+ check_document_by_unique_identifier,
+ get_connector_by_id,
+ logger,
+ update_connector_last_indexed,
+)
+
+
async def index_bookstack_pages(
    session: AsyncSession,
    connector_id: int,
    search_space_id: int,
    user_id: str,
    start_date: str | None = None,
    end_date: str | None = None,
    update_last_indexed: bool = True,
) -> tuple[int, str | None]:
    """
    Index BookStack pages into a search space.

    Fetches pages updated within the computed date range, converts each page
    into a document (LLM summary when available, plus chunks) and creates or
    updates the corresponding database rows. Commits in batches of 10.

    Args:
        session: Database session
        connector_id: ID of the BookStack connector
        search_space_id: ID of the search space to store documents in
        user_id: User ID
        start_date: Start date for indexing (YYYY-MM-DD format)
        end_date: End date for indexing (YYYY-MM-DD format)
        update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)

    Returns:
        Tuple containing (number of documents indexed, error message or None)
    """
    task_logger = TaskLoggingService(session, search_space_id)

    # Log task start
    log_entry = await task_logger.log_task_start(
        task_name="bookstack_pages_indexing",
        source="connector_indexing_task",
        message=f"Starting BookStack pages indexing for connector {connector_id}",
        metadata={
            "connector_id": connector_id,
            "user_id": str(user_id),
            "start_date": start_date,
            "end_date": end_date,
        },
    )

    try:
        # Get the connector from the database
        connector = await get_connector_by_id(
            session, connector_id, SearchSourceConnectorType.BOOKSTACK_CONNECTOR
        )

        if not connector:
            await task_logger.log_task_failure(
                log_entry,
                f"Connector with ID {connector_id} not found",
                "Connector not found",
                {"error_type": "ConnectorNotFound"},
            )
            return 0, f"Connector with ID {connector_id} not found"

        # Get the BookStack credentials from the connector config
        bookstack_base_url = connector.config.get("BOOKSTACK_BASE_URL")
        bookstack_token_id = connector.config.get("BOOKSTACK_TOKEN_ID")
        bookstack_token_secret = connector.config.get("BOOKSTACK_TOKEN_SECRET")

        if not bookstack_base_url or not bookstack_token_id or not bookstack_token_secret:
            await task_logger.log_task_failure(
                log_entry,
                f"BookStack credentials not found in connector config for connector {connector_id}",
                "Missing BookStack credentials",
                {"error_type": "MissingCredentials"},
            )
            return 0, "BookStack credentials not found in connector config"

        # Initialize BookStack client
        await task_logger.log_task_progress(
            log_entry,
            f"Initializing BookStack client for connector {connector_id}",
            {"stage": "client_initialization"},
        )

        bookstack_client = BookStackConnector(
            base_url=bookstack_base_url,
            token_id=bookstack_token_id,
            token_secret=bookstack_token_secret,
        )

        # Calculate date range (falls back to the connector's last-indexed
        # state / a 365-day lookback when explicit dates are not given)
        start_date_str, end_date_str = calculate_date_range(
            connector, start_date, end_date, default_days_back=365
        )

        await task_logger.log_task_progress(
            log_entry,
            f"Fetching BookStack pages from {start_date_str} to {end_date_str}",
            {
                "stage": "fetching_pages",
                "start_date": start_date_str,
                "end_date": end_date_str,
            },
        )

        # Get pages within date range
        try:
            pages, error = bookstack_client.get_pages_by_date_range(
                start_date=start_date_str, end_date=end_date_str
            )

            if error:
                logger.error(f"Failed to get BookStack pages: {error}")

                # Don't treat "No pages found" as an error that should stop indexing
                if "No pages found" in error:
                    logger.info(
                        "No pages found is not a critical error, continuing with update"
                    )
                    if update_last_indexed:
                        await update_connector_last_indexed(
                            session, connector, update_last_indexed
                        )
                        await session.commit()
                        logger.info(
                            f"Updated last_indexed_at to {connector.last_indexed_at} despite no pages found"
                        )

                    await task_logger.log_task_success(
                        log_entry,
                        f"No BookStack pages found in date range {start_date_str} to {end_date_str}",
                        {"pages_found": 0},
                    )
                    return 0, None
                else:
                    await task_logger.log_task_failure(
                        log_entry,
                        f"Failed to get BookStack pages: {error}",
                        "API Error",
                        {"error_type": "APIError"},
                    )
                    return 0, f"Failed to get BookStack pages: {error}"

            logger.info(f"Retrieved {len(pages)} pages from BookStack API")

        except Exception as e:
            logger.error(f"Error fetching BookStack pages: {e!s}", exc_info=True)
            # Fix: close out the task log entry. Previously this path returned
            # without marking the task failed, leaving the log entry dangling
            # in an "in progress" state unlike every other failure path here.
            await task_logger.log_task_failure(
                log_entry,
                f"Error fetching BookStack pages: {e!s}",
                str(e),
                {"error_type": type(e).__name__},
            )
            return 0, f"Error fetching BookStack pages: {e!s}"

        # Process and index each page
        documents_indexed = 0
        skipped_pages = []
        documents_skipped = 0

        for page in pages:
            try:
                page_id = page.get("id")
                page_name = page.get("name", "")
                page_slug = page.get("slug", "")
                book_id = page.get("book_id")
                book_slug = page.get("book_slug", "")
                chapter_id = page.get("chapter_id")

                if not page_id or not page_name:
                    logger.warning(
                        f"Skipping page with missing ID or name: {page_id or 'Unknown'}"
                    )
                    skipped_pages.append(f"{page_name or 'Unknown'} (missing data)")
                    documents_skipped += 1
                    continue

                # Fetch full page content (Markdown preferred, HTML fallback
                # handled inside the connector)
                try:
                    page_detail, page_content = bookstack_client.get_page_with_content(
                        page_id, use_markdown=True
                    )
                except Exception as e:
                    logger.warning(
                        f"Failed to fetch content for page {page_name}: {e}"
                    )
                    skipped_pages.append(f"{page_name} (content fetch error)")
                    documents_skipped += 1
                    continue

                # Build full content with title
                full_content = f"# {page_name}\n\n{page_content}"

                if not full_content.strip():
                    logger.warning(f"Skipping page with no content: {page_name}")
                    skipped_pages.append(f"{page_name} (no content)")
                    documents_skipped += 1
                    continue

                # Generate unique identifier hash for this BookStack page
                unique_identifier_hash = generate_unique_identifier_hash(
                    DocumentType.BOOKSTACK_CONNECTOR, page_id, search_space_id
                )

                # Generate content hash (used below to skip unchanged pages)
                content_hash = generate_content_hash(full_content, search_space_id)

                # Check if document with this unique identifier already exists
                existing_document = await check_document_by_unique_identifier(
                    session, unique_identifier_hash
                )

                # Build page URL
                page_url = f"{bookstack_base_url}/books/{book_slug}/page/{page_slug}"

                # Build document metadata
                doc_metadata = {
                    "page_id": page_id,
                    "page_name": page_name,
                    "page_slug": page_slug,
                    "book_id": book_id,
                    "book_slug": book_slug,
                    "chapter_id": chapter_id,
                    "base_url": bookstack_base_url,
                    "page_url": page_url,
                    "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                }

                if existing_document:
                    # Document exists - check if content has changed
                    if existing_document.content_hash == content_hash:
                        logger.info(
                            f"Document for BookStack page {page_name} unchanged. Skipping."
                        )
                        documents_skipped += 1
                        continue
                    else:
                        # Content has changed - update the existing document
                        logger.info(
                            f"Content changed for BookStack page {page_name}. Updating document."
                        )

                        summary_content, summary_embedding = await _summarize_page(
                            session,
                            user_id,
                            search_space_id,
                            full_content,
                            page_content,
                            page_name,
                            page_id,
                            book_id,
                        )

                        # Process chunks
                        chunks = await create_document_chunks(full_content)

                        # Update existing document in place
                        existing_document.title = f"BookStack - {page_name}"
                        existing_document.content = summary_content
                        existing_document.content_hash = content_hash
                        existing_document.embedding = summary_embedding
                        existing_document.document_metadata = doc_metadata
                        existing_document.chunks = chunks

                        documents_indexed += 1
                        logger.info(
                            f"Successfully updated BookStack page {page_name}"
                        )
                        continue

                # Document doesn't exist - create new one
                summary_content, summary_embedding = await _summarize_page(
                    session,
                    user_id,
                    search_space_id,
                    full_content,
                    page_content,
                    page_name,
                    page_id,
                    book_id,
                )

                # Process chunks - using the full page content
                chunks = await create_document_chunks(full_content)

                # Create and store new document
                logger.info(f"Creating new document for page {page_name}")
                document = Document(
                    search_space_id=search_space_id,
                    title=f"BookStack - {page_name}",
                    document_type=DocumentType.BOOKSTACK_CONNECTOR,
                    document_metadata=doc_metadata,
                    content=summary_content,
                    content_hash=content_hash,
                    unique_identifier_hash=unique_identifier_hash,
                    embedding=summary_embedding,
                    chunks=chunks,
                )

                session.add(document)
                documents_indexed += 1
                logger.info(f"Successfully indexed new page {page_name}")

                # Batch commit every 10 documents to bound transaction size
                if documents_indexed % 10 == 0:
                    logger.info(
                        f"Committing batch: {documents_indexed} BookStack pages processed so far"
                    )
                    await session.commit()

            except Exception as e:
                logger.error(
                    f"Error processing page {page.get('name', 'Unknown')}: {e!s}",
                    exc_info=True,
                )
                skipped_pages.append(
                    f"{page.get('name', 'Unknown')} (processing error)"
                )
                documents_skipped += 1
                continue  # Skip this page and continue with others

        # Update the last_indexed_at timestamp for the connector only if requested
        total_processed = documents_indexed
        if update_last_indexed:
            await update_connector_last_indexed(session, connector, update_last_indexed)

        # Final commit for any remaining documents not yet committed in batches
        logger.info(
            f"Final commit: Total {documents_indexed} BookStack pages processed"
        )
        await session.commit()
        logger.info(
            "Successfully committed all BookStack document changes to database"
        )

        # Log success
        await task_logger.log_task_success(
            log_entry,
            f"Successfully completed BookStack indexing for connector {connector_id}",
            {
                "pages_processed": total_processed,
                "documents_indexed": documents_indexed,
                "documents_skipped": documents_skipped,
                "skipped_pages_count": len(skipped_pages),
            },
        )

        logger.info(
            f"BookStack indexing completed: {documents_indexed} new pages, {documents_skipped} skipped"
        )
        return (
            total_processed,
            None,
        )  # Return None as the error message to indicate success

    except SQLAlchemyError as db_error:
        await session.rollback()
        await task_logger.log_task_failure(
            log_entry,
            f"Database error during BookStack indexing for connector {connector_id}",
            str(db_error),
            {"error_type": "SQLAlchemyError"},
        )
        logger.error(f"Database error: {db_error!s}", exc_info=True)
        return 0, f"Database error: {db_error!s}"
    except Exception as e:
        await session.rollback()
        await task_logger.log_task_failure(
            log_entry,
            f"Failed to index BookStack pages for connector {connector_id}",
            str(e),
            {"error_type": type(e).__name__},
        )
        logger.error(f"Failed to index BookStack pages: {e!s}", exc_info=True)
        return 0, f"Failed to index BookStack pages: {e!s}"


async def _summarize_page(
    session: AsyncSession,
    user_id: str,
    search_space_id: int,
    full_content: str,
    page_content: str,
    page_name: str,
    page_id,
    book_id,
):
    """Produce (summary_content, summary_embedding) for one BookStack page.

    Uses the user's long-context LLM when configured; otherwise falls back to
    a plain-text preview summary embedded with the configured embedding model.
    This consolidates the two identical copies of this logic that previously
    lived in the create and update branches of index_bookstack_pages.
    """
    user_llm = await get_user_long_context_llm(session, user_id, search_space_id)

    if user_llm:
        summary_metadata = {
            "page_name": page_name,
            "page_id": page_id,
            "book_id": book_id,
            "document_type": "BookStack Page",
            "connector_type": "BookStack",
        }
        summary_content, summary_embedding = await generate_document_summary(
            full_content, user_llm, summary_metadata
        )
        return summary_content, summary_embedding

    # Fallback to simple summary if no LLM configured
    summary_content = f"BookStack Page: {page_name}\n\nBook ID: {book_id}\n\n"
    if page_content:
        # Take first 1000 characters of content for summary
        content_preview = page_content[:1000]
        if len(page_content) > 1000:
            content_preview += "..."
        summary_content += f"Content Preview: {content_preview}\n\n"
    summary_embedding = config.embedding_model_instance.embed(summary_content)
    return summary_content, summary_embedding
diff --git a/surfsense_backend/app/utils/periodic_scheduler.py b/surfsense_backend/app/utils/periodic_scheduler.py
index e33661d65..c95f407a4 100644
--- a/surfsense_backend/app/utils/periodic_scheduler.py
+++ b/surfsense_backend/app/utils/periodic_scheduler.py
@@ -32,6 +32,7 @@ CONNECTOR_TASK_MAP = {
SearchSourceConnectorType.LUMA_CONNECTOR: "index_luma_events",
SearchSourceConnectorType.ELASTICSEARCH_CONNECTOR: "index_elasticsearch_documents",
SearchSourceConnectorType.WEBCRAWLER_CONNECTOR: "index_crawled_urls",
+ SearchSourceConnectorType.BOOKSTACK_CONNECTOR: "index_bookstack_pages",
}
@@ -68,6 +69,7 @@ def create_periodic_schedule(
# Import all indexing tasks
from app.tasks.celery_tasks.connector_tasks import (
index_airtable_records_task,
+ index_bookstack_pages_task,
index_clickup_tasks_task,
index_confluence_pages_task,
index_crawled_urls_task,
@@ -99,6 +101,7 @@ def create_periodic_schedule(
SearchSourceConnectorType.LUMA_CONNECTOR: index_luma_events_task,
SearchSourceConnectorType.ELASTICSEARCH_CONNECTOR: index_elasticsearch_documents_task,
SearchSourceConnectorType.WEBCRAWLER_CONNECTOR: index_crawled_urls_task,
+ SearchSourceConnectorType.BOOKSTACK_CONNECTOR: index_bookstack_pages_task,
}
# Trigger the first run immediately
diff --git a/surfsense_web/app/dashboard/[search_space_id]/connectors/add/bookstack-connector/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/connectors/add/bookstack-connector/page.tsx
new file mode 100644
index 000000000..56e1c28b5
--- /dev/null
+++ b/surfsense_web/app/dashboard/[search_space_id]/connectors/add/bookstack-connector/page.tsx
@@ -0,0 +1,305 @@
+"use client";
+
+import { zodResolver } from "@hookform/resolvers/zod";
+import { ArrowLeft, Check, Info, Loader2 } from "lucide-react";
+import { motion } from "motion/react";
+import { useParams, useRouter } from "next/navigation";
+import { useState } from "react";
+import { useForm } from "react-hook-form";
+import { toast } from "sonner";
+import * as z from "zod";
+import { Alert, AlertDescription } from "@/components/ui/alert";
+import { Button } from "@/components/ui/button";
+import { Card, CardContent, CardDescription, CardHeader, CardTitle } from "@/components/ui/card";
+import {
+ Form,
+ FormControl,
+ FormDescription,
+ FormField,
+ FormItem,
+ FormLabel,
+ FormMessage,
+} from "@/components/ui/form";
+import { Input } from "@/components/ui/input";
+import { Tabs, TabsContent, TabsList, TabsTrigger } from "@/components/ui/tabs";
+import { EnumConnectorName } from "@/contracts/enums/connector";
+import { getConnectorIcon } from "@/contracts/enums/connectorIcons";
+import { useSearchSourceConnectors } from "@/hooks/use-search-source-connectors";
+
+// Zod validation schema for the BookStack connector creation form.
+// Field-level messages surface directly in the form UI via zodResolver.
+const bookstackConnectorFormSchema = z.object({
+ name: z.string().min(3, { message: "Connector name must be at least 3 characters." }),
+ base_url: z.string().url({
+ message: "Please enter a valid BookStack URL (e.g., https://docs.example.com)",
+ }),
+ token_id: z.string().min(10, { message: "BookStack Token ID is required." }),
+ token_secret: z.string().min(10, { message: "BookStack Token Secret is required." }),
+});
+
+// Static type of the validated form values, derived from the Zod schema above.
+// NOTE: the generic argument was missing (`z.infer;`), which is invalid
+// TypeScript — restored to the standard zod inference pattern.
+type BookStackConnectorFormValues = z.infer<typeof bookstackConnectorFormSchema>;
+
+// Dashboard page for creating a new BookStack connector in the current search space.
+// Collects base URL + API token pair, posts the connector config, then redirects.
+export default function BookStackConnectorPage() {
+ const router = useRouter();
+ const params = useParams();
+ // Route is /dashboard/[search_space_id]/..., so the param is a string here.
+ const searchSpaceId = params.search_space_id as string;
+ const [isSubmitting, setIsSubmitting] = useState(false);
+ const { createConnector } = useSearchSourceConnectors();
+
+ // Initialize the form
+ // NOTE(review): the generic looks stripped by extraction — presumably
+ // useForm<BookStackConnectorFormValues>; confirm against the original file.
+ const form = useForm({
+ resolver: zodResolver(bookstackConnectorFormSchema),
+ defaultValues: {
+ name: "BookStack Connector",
+ base_url: "",
+ token_id: "",
+ token_secret: "",
+ },
+ });
+
+ // Handle form submission
+ const onSubmit = async (values: BookStackConnectorFormValues) => {
+ setIsSubmitting(true);
+ try {
+ // Connector config keys mirror the backend's expected BookStack settings.
+ await createConnector(
+ {
+ name: values.name,
+ connector_type: EnumConnectorName.BOOKSTACK_CONNECTOR,
+ config: {
+ BOOKSTACK_BASE_URL: values.base_url,
+ BOOKSTACK_TOKEN_ID: values.token_id,
+ BOOKSTACK_TOKEN_SECRET: values.token_secret,
+ },
+ is_indexable: true,
+ last_indexed_at: null,
+ // Periodic indexing is off by default; user can enable it later.
+ periodic_indexing_enabled: false,
+ indexing_frequency_minutes: null,
+ next_scheduled_at: null,
+ },
+ // NOTE(review): parseInt without an explicit radix — confirm ids are always decimal.
+ parseInt(searchSpaceId)
+ );
+
+ toast.success("BookStack connector created successfully!");
+
+ // Navigate back to connectors page
+ router.push(`/dashboard/${searchSpaceId}/connectors`);
+ } catch (error) {
+ console.error("Error creating connector:", error);
+ // Surface a readable message; fall back to a generic one for non-Error throws.
+ toast.error(error instanceof Error ? error.message : "Failed to create connector");
+ } finally {
+ // Always re-enable the submit button, success or failure.
+ setIsSubmitting(false);
+ }
+ };
+
+ return (
+
+ Connect your BookStack instance to search wiki pages.
+
+
+
+
+
+
+
+
+ Connect
+ Documentation
+
+
+
+
+
+ Connect to BookStack
+
+ Connect your BookStack instance to index pages from your wiki.
+
+
+
+
+
+
+ You'll need to create an API token from your BookStack instance.
+ Go to Edit Profile → API Tokens → Create Token
+
+
+
+
+
+
+
+
+
+
+
+
+ BookStack Integration Guide
+
+ Learn how to set up and use the BookStack connector.
+
+
+
+
+
What gets indexed?
+
+
All pages from your BookStack instance
+
Page content in Markdown format
+
Page titles and metadata
+
Book and chapter hierarchy information
+
+
+
+
+
Setup Instructions
+
+
Log in to your BookStack instance
+
Click on your profile icon → Edit Profile
+
Navigate to the "API Tokens" tab
+
Click "Create Token" and give it a name
+
Copy both the Token ID and Token Secret
+
Paste them in the form above
+
+
+
+
+
Permissions Required
+
+
Your user account must have "Access System API" permission
+
Read access to books and pages you want to index
+
The connector will only index content your account can view
+
+
+
+
+
+
+ BookStack API has a rate limit of 180 requests per minute. The connector
+ automatically handles rate limiting to ensure reliable indexing.
+
+
+
+
+
+
+
+