feat: add BookStack connector for wiki documentation indexing

This commit is contained in:
Differ 2025-12-04 14:08:44 +08:00
parent e0725741c9
commit 6b1b8d0f2e
18 changed files with 1362 additions and 1 deletions

View file

@ -15,7 +15,7 @@
</div>
# SurfSense
While tools like NotebookLM and Perplexity are impressive and highly effective for conducting research on any topic/query, SurfSense elevates this capability by integrating with your personal knowledge base. It is a highly customizable AI research agent, connected to external sources such as Search Engines (SearxNG, Tavily, LinkUp), Slack, Linear, Jira, ClickUp, Confluence, Gmail, Notion, YouTube, GitHub, Discord, Airtable, Google Calendar, Luma, Elasticsearch and more to come.
While tools like NotebookLM and Perplexity are impressive and highly effective for conducting research on any topic/query, SurfSense elevates this capability by integrating with your personal knowledge base. It is a highly customizable AI research agent, connected to external sources such as Search Engines (SearxNG, Tavily, LinkUp), Slack, Linear, Jira, ClickUp, Confluence, BookStack, Gmail, Notion, YouTube, GitHub, Discord, Airtable, Google Calendar, Luma, Elasticsearch and more to come.
<div align="center">
<a href="https://trendshift.io/repositories/13606" target="_blank"><img src="https://trendshift.io/api/badge/repositories/13606" alt="MODSetter%2FSurfSense | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
@ -72,6 +72,7 @@ Open source and easy to deploy locally.
- Jira
- ClickUp
- Confluence
- BookStack
- Notion
- Gmail
- YouTube Videos

View file

@ -0,0 +1,343 @@
"""
BookStack Connector Module
A module for retrieving data from BookStack wiki systems.
Allows fetching pages, books, and chapters from BookStack instances.
BookStack API Documentation: https://demo.bookstackapp.com/api/docs
"""
import logging
import time
from typing import Any
import requests
logger = logging.getLogger(__name__)
class BookStackConnector:
"""Class for retrieving data from BookStack."""
# Rate limiting: 180 requests per minute = 0.33 seconds per request
# Using 0.35 seconds to be safe
REQUEST_INTERVAL = 0.35
def __init__(
self,
base_url: str | None = None,
token_id: str | None = None,
token_secret: str | None = None,
):
"""
Initialize the BookStackConnector class.
Args:
base_url: BookStack instance base URL (e.g., 'https://docs.example.com')
token_id: BookStack API Token ID
token_secret: BookStack API Token Secret
"""
self.base_url = base_url.rstrip("/") if base_url else None
self.token_id = token_id
self.token_secret = token_secret
self._last_request_time = 0.0
def set_credentials(
self, base_url: str, token_id: str, token_secret: str
) -> None:
"""
Set the BookStack credentials.
Args:
base_url: BookStack instance base URL
token_id: BookStack API Token ID
token_secret: BookStack API Token Secret
"""
self.base_url = base_url.rstrip("/")
self.token_id = token_id
self.token_secret = token_secret
def get_headers(self) -> dict[str, str]:
"""
Get headers for BookStack API requests using Token Authentication.
Returns:
Dictionary of headers
Raises:
ValueError: If token_id, token_secret, or base_url have not been set
"""
if not all([self.base_url, self.token_id, self.token_secret]):
raise ValueError(
"BookStack credentials not initialized. Call set_credentials() first."
)
return {
"Authorization": f"Token {self.token_id}:{self.token_secret}",
"Content-Type": "application/json",
"Accept": "application/json",
}
def _rate_limit(self) -> None:
"""Apply rate limiting between API requests."""
current_time = time.time()
elapsed = current_time - self._last_request_time
if elapsed < self.REQUEST_INTERVAL:
time.sleep(self.REQUEST_INTERVAL - elapsed)
self._last_request_time = time.time()
def make_api_request(
self,
endpoint: str,
params: dict[str, Any] | None = None,
raw_response: bool = False,
) -> dict[str, Any] | str:
"""
Make a request to the BookStack API.
Args:
endpoint: API endpoint (without base URL, e.g., 'pages' or 'pages/1')
params: Query parameters for the request (optional)
raw_response: If True, return raw text response instead of JSON
Returns:
Response data from the API (dict for JSON, str for raw)
Raises:
ValueError: If credentials have not been set
Exception: If the API request fails
"""
if not all([self.base_url, self.token_id, self.token_secret]):
raise ValueError(
"BookStack credentials not initialized. Call set_credentials() first."
)
# Apply rate limiting
self._rate_limit()
url = f"{self.base_url}/api/{endpoint}"
headers = self.get_headers()
try:
response = requests.get(url, headers=headers, params=params, timeout=30)
response.raise_for_status()
if raw_response:
return response.text
return response.json()
except requests.exceptions.HTTPError as e:
if e.response.status_code == 429:
logger.warning("Rate limit exceeded, waiting 60 seconds...")
time.sleep(60)
return self.make_api_request(endpoint, params, raw_response)
raise Exception(f"BookStack API request failed: {e!s}") from e
except requests.exceptions.RequestException as e:
raise Exception(f"BookStack API request failed: {e!s}") from e
def get_all_pages(self, count: int = 500) -> list[dict[str, Any]]:
"""
Fetch all pages from BookStack with pagination.
Args:
count: Number of records per request (max 500)
Returns:
List of page objects
Raises:
ValueError: If credentials have not been set
Exception: If the API request fails
"""
all_pages = []
offset = 0
while True:
params = {
"count": min(count, 500),
"offset": offset,
}
result = self.make_api_request("pages", params)
if not isinstance(result, dict) or "data" not in result:
raise Exception("Invalid response from BookStack API")
pages = result["data"]
all_pages.extend(pages)
logger.info(f"Fetched {len(pages)} pages (offset: {offset})")
# Check if we've fetched all pages
total = result.get("total", 0)
if offset + len(pages) >= total:
break
offset += len(pages)
logger.info(f"Total pages fetched: {len(all_pages)}")
return all_pages
def get_page_detail(self, page_id: int) -> dict[str, Any]:
"""
Get detailed information for a single page.
The response includes 'html' (rendered) and optionally 'markdown' content.
Args:
page_id: The ID of the page
Returns:
Page detail object with content
Raises:
ValueError: If credentials have not been set
Exception: If the API request fails
"""
result = self.make_api_request(f"pages/{page_id}")
if not isinstance(result, dict):
raise Exception(f"Invalid response for page {page_id}")
return result
def export_page_markdown(self, page_id: int) -> str:
"""
Export a page as Markdown content.
Args:
page_id: The ID of the page
Returns:
Markdown content as string
Raises:
ValueError: If credentials have not been set
Exception: If the API request fails
"""
result = self.make_api_request(
f"pages/{page_id}/export/markdown", raw_response=True
)
return result if isinstance(result, str) else ""
def get_book_detail(self, book_id: int) -> dict[str, Any]:
"""
Get detailed information for a single book.
The response includes a 'content' property with the book's structure.
Args:
book_id: The ID of the book
Returns:
Book detail object
Raises:
ValueError: If credentials have not been set
Exception: If the API request fails
"""
result = self.make_api_request(f"books/{book_id}")
if not isinstance(result, dict):
raise Exception(f"Invalid response for book {book_id}")
return result
def get_pages_by_date_range(
self,
start_date: str,
end_date: str,
count: int = 500,
) -> tuple[list[dict[str, Any]], str | None]:
"""
Fetch pages updated within a specific date range.
Uses the filter[updated_at:gt] parameter for incremental indexing.
Args:
start_date: Start date in YYYY-MM-DD format
end_date: End date in YYYY-MM-DD format (currently unused, for future use)
count: Number of records per request (max 500)
Returns:
Tuple of (list of page objects, error message or None)
Raises:
ValueError: If credentials have not been set
"""
all_pages = []
offset = 0
try:
while True:
params = {
"count": min(count, 500),
"offset": offset,
"filter[updated_at:gt]": start_date,
"sort": "-updated_at", # Most recently updated first
}
result = self.make_api_request("pages", params)
if not isinstance(result, dict) or "data" not in result:
return [], "Invalid response from BookStack API"
pages = result["data"]
all_pages.extend(pages)
logger.info(
f"Fetched {len(pages)} pages updated after {start_date} (offset: {offset})"
)
# Check if we've fetched all pages
total = result.get("total", 0)
if offset + len(pages) >= total:
break
offset += len(pages)
if not all_pages:
return [], f"No pages found updated after {start_date}"
logger.info(
f"Total pages fetched for date range {start_date} to {end_date}: {len(all_pages)}"
)
return all_pages, None
except Exception as e:
logger.error(f"Error fetching pages by date range: {e!s}", exc_info=True)
return [], str(e)
def get_page_with_content(
self, page_id: int, use_markdown: bool = True
) -> tuple[dict[str, Any], str]:
"""
Get page details along with its full content.
Args:
page_id: The ID of the page
use_markdown: If True, export as Markdown; otherwise use HTML
Returns:
Tuple of (page detail dict, content string)
Raises:
ValueError: If credentials have not been set
Exception: If the API request fails
"""
# Get page details first
page_detail = self.get_page_detail(page_id)
# Get content
if use_markdown:
try:
content = self.export_page_markdown(page_id)
except Exception as e:
logger.warning(
f"Failed to export markdown for page {page_id}, falling back to HTML: {e}"
)
content = page_detail.get("html", "")
else:
content = page_detail.get("html", "")
return page_detail, content

View file

@ -52,6 +52,7 @@ class DocumentType(str, Enum):
AIRTABLE_CONNECTOR = "AIRTABLE_CONNECTOR"
LUMA_CONNECTOR = "LUMA_CONNECTOR"
ELASTICSEARCH_CONNECTOR = "ELASTICSEARCH_CONNECTOR"
BOOKSTACK_CONNECTOR = "BOOKSTACK_CONNECTOR"
class SearchSourceConnectorType(str, Enum):
@ -74,6 +75,7 @@ class SearchSourceConnectorType(str, Enum):
LUMA_CONNECTOR = "LUMA_CONNECTOR"
ELASTICSEARCH_CONNECTOR = "ELASTICSEARCH_CONNECTOR"
WEBCRAWLER_CONNECTOR = "WEBCRAWLER_CONNECTOR"
BOOKSTACK_CONNECTOR = "BOOKSTACK_CONNECTOR"
class ChatType(str, Enum):

View file

@ -597,6 +597,19 @@ async def index_connector_content(
)
response_message = "Confluence indexing started in the background."
elif connector.connector_type == SearchSourceConnectorType.BOOKSTACK_CONNECTOR:
from app.tasks.celery_tasks.connector_tasks import (
index_bookstack_pages_task,
)
logger.info(
f"Triggering BookStack indexing for connector {connector_id} into search space {search_space_id} from {indexing_from} to {indexing_to}"
)
index_bookstack_pages_task.delay(
connector_id, search_space_id, str(user.id), indexing_from, indexing_to
)
response_message = "BookStack indexing started in the background."
elif connector.connector_type == SearchSourceConnectorType.CLICKUP_CONNECTOR:
from app.tasks.celery_tasks.connector_tasks import index_clickup_tasks_task
@ -1597,3 +1610,73 @@ async def run_web_page_indexing(
)
except Exception as e:
logger.error(f"Error in background Web page indexing task: {e!s}")
# Add new helper functions for BookStack indexing
async def run_bookstack_indexing_with_new_session(
    connector_id: int,
    search_space_id: int,
    user_id: str,
    start_date: str,
    end_date: str,
):
    """Run BookStack indexing inside a dedicated database session.

    Opens its own session via ``async_session_maker`` so the indexing can run
    as a standalone background task, then delegates to
    ``run_bookstack_indexing``.
    """
    logger.info(
        f"Background task started: Indexing BookStack connector {connector_id} into space {search_space_id} from {start_date} to {end_date}"
    )
    async with async_session_maker() as db_session:
        await run_bookstack_indexing(
            db_session,
            connector_id,
            search_space_id,
            user_id,
            start_date,
            end_date,
        )
    logger.info(
        f"Background task finished: Indexing BookStack connector {connector_id}"
    )
async def run_bookstack_indexing(
    session: AsyncSession,
    connector_id: int,
    search_space_id: int,
    user_id: str,
    start_date: str,
    end_date: str,
):
    """
    Background task to run BookStack indexing.

    Runs the indexer with ``update_last_indexed=False`` and performs the
    last-indexed timestamp update itself, only when indexing reported no
    error. Exceptions are logged and swallowed (no re-raise).

    Args:
        session: Database session
        connector_id: ID of the BookStack connector
        search_space_id: ID of the search space
        user_id: ID of the user
        start_date: Start date for indexing
        end_date: End date for indexing
    """
    # Imported lazily here — presumably to avoid a circular import between
    # the routes module and the indexer package; TODO confirm.
    from app.tasks.connector_indexers import index_bookstack_pages

    try:
        indexed_count, error_message = await index_bookstack_pages(
            session,
            connector_id,
            search_space_id,
            user_id,
            start_date,
            end_date,
            # Timestamp is updated below instead, only on success.
            update_last_indexed=False,
        )
        if error_message:
            logger.error(
                f"BookStack indexing failed for connector {connector_id}: {error_message}"
            )
        else:
            logger.info(
                f"BookStack indexing successful for connector {connector_id}. Indexed {indexed_count} documents."
            )
            # Update the last indexed timestamp only on success
            # NOTE(review): called with connector_id here, while the indexer
            # calls a same-named helper with a connector object — verify both
            # signatures resolve to the intended helper.
            await update_connector_last_indexed(session, connector_id)
            await session.commit()  # Commit timestamp update
    except Exception as e:
        # Best-effort task: failures are logged but deliberately not re-raised.
        logger.error(
            f"Critical error in run_bookstack_indexing for connector {connector_id}: {e}",
            exc_info=True,
        )

View file

@ -2576,3 +2576,98 @@ class ConnectorService:
}
return result_object, elasticsearch_chunks
async def search_bookstack(
    self,
    user_query: str,
    user_id: str,
    search_space_id: int,
    top_k: int = 20,
    search_mode: SearchMode = SearchMode.CHUNKS,
) -> tuple:
    """
    Search for BookStack pages and return both the source information and langchain documents

    Args:
        user_query: The user's query
        user_id: The user's ID
        search_space_id: The search space ID to search in
        top_k: Maximum number of results to return
        search_mode: Search mode (CHUNKS or DOCUMENTS)

    Returns:
        tuple: (sources_info, langchain_documents)
    """
    if search_mode == SearchMode.CHUNKS:
        bookstack_chunks = await self.chunk_retriever.hybrid_search(
            query_text=user_query,
            top_k=top_k,
            user_id=user_id,
            search_space_id=search_space_id,
            document_type="BOOKSTACK_CONNECTOR",
        )
    elif search_mode == SearchMode.DOCUMENTS:
        bookstack_chunks = await self.document_retriever.hybrid_search(
            query_text=user_query,
            top_k=top_k,
            user_id=user_id,
            search_space_id=search_space_id,
            document_type="BOOKSTACK_CONNECTOR",
        )
        # Transform document retriever results to match expected format
        bookstack_chunks = self._transform_document_results(bookstack_chunks)
    else:
        # Fix: an unrecognized search_mode previously left bookstack_chunks
        # unbound and raised UnboundLocalError; treat it as "no results".
        bookstack_chunks = []

    # Early return if no results
    if not bookstack_chunks:
        return {
            "id": 50,
            "name": "BookStack",
            "type": "BOOKSTACK_CONNECTOR",
            "sources": [],
        }, []

    # Process each chunk and create sources directly without deduplication
    sources_list = []
    async with self.counter_lock:
        for chunk in bookstack_chunks:
            # Extract document metadata
            document = chunk.get("document", {})
            metadata = document.get("metadata", {})

            # Extract BookStack-specific metadata
            page_name = metadata.get("page_name", "Untitled Page")
            page_slug = metadata.get("page_slug", "")
            book_slug = metadata.get("book_slug", "")
            base_url = metadata.get("base_url", "")
            page_url = metadata.get("page_url", "")

            # Create a more descriptive title for BookStack pages
            title = f"BookStack: {page_name}"

            # Create description from content
            description = chunk.get("content", "")

            # Build URL to the BookStack page; fall back to reconstructing it
            # from slugs when the indexer did not store a full URL.
            url = page_url
            if not url and base_url and book_slug and page_slug:
                url = f"{base_url}/books/{book_slug}/page/{page_slug}"

            source = {
                "id": chunk.get("chunk_id", self.source_id_counter),
                "title": title,
                "description": description,
                "url": url,
            }
            self.source_id_counter += 1
            sources_list.append(source)

    # Create result object
    result_object = {
        "id": 50,  # Assign a unique ID for the BookStack connector
        "name": "BookStack",
        "type": "BOOKSTACK_CONNECTOR",
        "sources": sources_list,
    }
    return result_object, bookstack_chunks

View file

@ -643,3 +643,46 @@ async def _index_crawled_urls(
await run_web_page_indexing(
session, connector_id, search_space_id, user_id, start_date, end_date
)
@celery_app.task(name="index_bookstack_pages", bind=True)
def index_bookstack_pages_task(
    self,
    connector_id: int,
    search_space_id: int,
    user_id: str,
    start_date: str,
    end_date: str,
):
    """Celery task to index BookStack pages.

    Celery workers are synchronous, so the async indexing coroutine is driven
    with asyncio.run(), which creates a fresh event loop, runs the coroutine,
    and guarantees the loop is closed (and async generators finalized) even
    on error — replacing the manual new_event_loop/set_event_loop/close
    boilerplate.
    """
    import asyncio

    asyncio.run(
        _index_bookstack_pages(
            connector_id, search_space_id, user_id, start_date, end_date
        )
    )
async def _index_bookstack_pages(
    connector_id: int,
    search_space_id: int,
    user_id: str,
    start_date: str,
    end_date: str,
):
    """Index BookStack pages with new session."""
    # Imported lazily inside the function body, mirroring the original.
    from app.routes.search_source_connectors_routes import (
        run_bookstack_indexing,
    )

    # get_celery_session_maker() returns a session factory; calling the
    # factory yields the actual session used for this indexing run.
    session_factory = get_celery_session_maker()
    async with session_factory() as session:
        await run_bookstack_indexing(
            session,
            connector_id,
            search_space_id,
            user_id,
            start_date,
            end_date,
        )

View file

@ -12,6 +12,7 @@ Available indexers:
- Linear: Index issues from Linear workspaces
- Jira: Index issues from Jira projects
- Confluence: Index pages from Confluence spaces
- BookStack: Index pages from BookStack wiki instances
- Discord: Index messages from Discord servers
- ClickUp: Index tasks from ClickUp workspaces
- Google Gmail: Index messages from Google Gmail
@ -24,6 +25,7 @@ Available indexers:
# Communication platforms
# Calendar and scheduling
from .airtable_indexer import index_airtable_records
from .bookstack_indexer import index_bookstack_pages
from .clickup_indexer import index_clickup_tasks
from .confluence_indexer import index_confluence_pages
from .discord_indexer import index_discord_messages
@ -46,6 +48,7 @@ from .webcrawler_indexer import index_crawled_urls
__all__ = [ # noqa: RUF022
"index_airtable_records",
"index_bookstack_pages",
"index_clickup_tasks",
"index_confluence_pages",
"index_discord_messages",

View file

@ -0,0 +1,434 @@
"""
BookStack connector indexer.
"""
from datetime import datetime
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession
from app.config import config
from app.connectors.bookstack_connector import BookStackConnector
from app.db import Document, DocumentType, SearchSourceConnectorType
from app.services.llm_service import get_user_long_context_llm
from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import (
create_document_chunks,
generate_content_hash,
generate_document_summary,
generate_unique_identifier_hash,
)
from .base import (
calculate_date_range,
check_document_by_unique_identifier,
get_connector_by_id,
logger,
update_connector_last_indexed,
)
async def index_bookstack_pages(
    session: AsyncSession,
    connector_id: int,
    search_space_id: int,
    user_id: str,
    start_date: str | None = None,
    end_date: str | None = None,
    update_last_indexed: bool = True,
) -> tuple[int, str | None]:
    """
    Index BookStack pages.

    Fetches pages updated in the given date range, creates or updates one
    Document per page (summary + chunks), and commits in batches of 10 new
    documents plus one final commit.

    Args:
        session: Database session
        connector_id: ID of the BookStack connector
        search_space_id: ID of the search space to store documents in
        user_id: User ID
        start_date: Start date for indexing (YYYY-MM-DD format)
        end_date: End date for indexing (YYYY-MM-DD format)
        update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)

    Returns:
        Tuple containing (number of documents indexed, error message or None)
    """
    task_logger = TaskLoggingService(session, search_space_id)

    # Log task start
    log_entry = await task_logger.log_task_start(
        task_name="bookstack_pages_indexing",
        source="connector_indexing_task",
        message=f"Starting BookStack pages indexing for connector {connector_id}",
        metadata={
            "connector_id": connector_id,
            "user_id": str(user_id),
            "start_date": start_date,
            "end_date": end_date,
        },
    )

    try:
        # Get the connector from the database
        connector = await get_connector_by_id(
            session, connector_id, SearchSourceConnectorType.BOOKSTACK_CONNECTOR
        )
        if not connector:
            await task_logger.log_task_failure(
                log_entry,
                f"Connector with ID {connector_id} not found",
                "Connector not found",
                {"error_type": "ConnectorNotFound"},
            )
            return 0, f"Connector with ID {connector_id} not found"

        # Get the BookStack credentials from the connector config
        bookstack_base_url = connector.config.get("BOOKSTACK_BASE_URL")
        bookstack_token_id = connector.config.get("BOOKSTACK_TOKEN_ID")
        bookstack_token_secret = connector.config.get("BOOKSTACK_TOKEN_SECRET")
        if not bookstack_base_url or not bookstack_token_id or not bookstack_token_secret:
            await task_logger.log_task_failure(
                log_entry,
                f"BookStack credentials not found in connector config for connector {connector_id}",
                "Missing BookStack credentials",
                {"error_type": "MissingCredentials"},
            )
            return 0, "BookStack credentials not found in connector config"

        # Initialize BookStack client
        await task_logger.log_task_progress(
            log_entry,
            f"Initializing BookStack client for connector {connector_id}",
            {"stage": "client_initialization"},
        )
        bookstack_client = BookStackConnector(
            base_url=bookstack_base_url,
            token_id=bookstack_token_id,
            token_secret=bookstack_token_secret,
        )

        # Calculate date range (falls back to up to a year of history)
        start_date_str, end_date_str = calculate_date_range(
            connector, start_date, end_date, default_days_back=365
        )
        await task_logger.log_task_progress(
            log_entry,
            f"Fetching BookStack pages from {start_date_str} to {end_date_str}",
            {
                "stage": "fetching_pages",
                "start_date": start_date_str,
                "end_date": end_date_str,
            },
        )

        # Get pages within date range
        try:
            pages, error = bookstack_client.get_pages_by_date_range(
                start_date=start_date_str, end_date=end_date_str
            )
            if error:
                logger.error(f"Failed to get BookStack pages: {error}")
                # Don't treat "No pages found" as an error that should stop indexing
                if "No pages found" in error:
                    logger.info(
                        "No pages found is not a critical error, continuing with update"
                    )
                    if update_last_indexed:
                        # NOTE(review): helper called here with (session,
                        # connector, flag) — the routes module calls a
                        # same-named helper with (session, connector_id);
                        # confirm both signatures are intended.
                        await update_connector_last_indexed(
                            session, connector, update_last_indexed
                        )
                        await session.commit()
                        logger.info(
                            f"Updated last_indexed_at to {connector.last_indexed_at} despite no pages found"
                        )
                    await task_logger.log_task_success(
                        log_entry,
                        f"No BookStack pages found in date range {start_date_str} to {end_date_str}",
                        {"pages_found": 0},
                    )
                    return 0, None
                else:
                    await task_logger.log_task_failure(
                        log_entry,
                        f"Failed to get BookStack pages: {error}",
                        "API Error",
                        {"error_type": "APIError"},
                    )
                    return 0, f"Failed to get BookStack pages: {error}"
            logger.info(f"Retrieved {len(pages)} pages from BookStack API")
        except Exception as e:
            logger.error(f"Error fetching BookStack pages: {e!s}", exc_info=True)
            return 0, f"Error fetching BookStack pages: {e!s}"

        # Process and index each page
        documents_indexed = 0
        skipped_pages = []
        documents_skipped = 0
        for page in pages:
            try:
                page_id = page.get("id")
                page_name = page.get("name", "")
                page_slug = page.get("slug", "")
                book_id = page.get("book_id")
                book_slug = page.get("book_slug", "")
                chapter_id = page.get("chapter_id")
                if not page_id or not page_name:
                    logger.warning(
                        f"Skipping page with missing ID or name: {page_id or 'Unknown'}"
                    )
                    skipped_pages.append(f"{page_name or 'Unknown'} (missing data)")
                    documents_skipped += 1
                    continue

                # Fetch full page content (Markdown preferred)
                try:
                    page_detail, page_content = bookstack_client.get_page_with_content(
                        page_id, use_markdown=True
                    )
                except Exception as e:
                    logger.warning(
                        f"Failed to fetch content for page {page_name}: {e}"
                    )
                    skipped_pages.append(f"{page_name} (content fetch error)")
                    documents_skipped += 1
                    continue

                # Build full content with title
                full_content = f"# {page_name}\n\n{page_content}"
                # NOTE(review): page_name is non-empty here, so this check
                # can never trigger; possibly page_content was intended.
                if not full_content.strip():
                    logger.warning(f"Skipping page with no content: {page_name}")
                    skipped_pages.append(f"{page_name} (no content)")
                    documents_skipped += 1
                    continue

                # Generate unique identifier hash for this BookStack page
                unique_identifier_hash = generate_unique_identifier_hash(
                    DocumentType.BOOKSTACK_CONNECTOR, page_id, search_space_id
                )
                # Generate content hash
                content_hash = generate_content_hash(full_content, search_space_id)

                # Check if document with this unique identifier already exists
                existing_document = await check_document_by_unique_identifier(
                    session, unique_identifier_hash
                )

                # Build page URL
                page_url = f"{bookstack_base_url}/books/{book_slug}/page/{page_slug}"

                # Build document metadata
                doc_metadata = {
                    "page_id": page_id,
                    "page_name": page_name,
                    "page_slug": page_slug,
                    "book_id": book_id,
                    "book_slug": book_slug,
                    "chapter_id": chapter_id,
                    "base_url": bookstack_base_url,
                    "page_url": page_url,
                    "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                }

                if existing_document:
                    # Document exists - check if content has changed
                    if existing_document.content_hash == content_hash:
                        logger.info(
                            f"Document for BookStack page {page_name} unchanged. Skipping."
                        )
                        documents_skipped += 1
                        continue
                    else:
                        # Content has changed - update the existing document
                        logger.info(
                            f"Content changed for BookStack page {page_name}. Updating document."
                        )
                    # Generate summary with metadata
                    user_llm = await get_user_long_context_llm(
                        session, user_id, search_space_id
                    )
                    if user_llm:
                        summary_metadata = {
                            "page_name": page_name,
                            "page_id": page_id,
                            "book_id": book_id,
                            "document_type": "BookStack Page",
                            "connector_type": "BookStack",
                        }
                        (
                            summary_content,
                            summary_embedding,
                        ) = await generate_document_summary(
                            full_content, user_llm, summary_metadata
                        )
                    else:
                        summary_content = f"BookStack Page: {page_name}\n\nBook ID: {book_id}\n\n"
                        if page_content:
                            content_preview = page_content[:1000]
                            if len(page_content) > 1000:
                                content_preview += "..."
                            summary_content += (
                                f"Content Preview: {content_preview}\n\n"
                            )
                        summary_embedding = config.embedding_model_instance.embed(
                            summary_content
                        )

                    # Process chunks
                    chunks = await create_document_chunks(full_content)

                    # Update existing document
                    existing_document.title = f"BookStack - {page_name}"
                    existing_document.content = summary_content
                    existing_document.content_hash = content_hash
                    existing_document.embedding = summary_embedding
                    existing_document.document_metadata = doc_metadata
                    existing_document.chunks = chunks
                    documents_indexed += 1
                    logger.info(
                        f"Successfully updated BookStack page {page_name}"
                    )
                    # NOTE(review): this continue bypasses the batch-commit
                    # below; updates are persisted only by the final commit.
                    continue

                # Document doesn't exist - create new one
                # Generate summary with metadata
                user_llm = await get_user_long_context_llm(
                    session, user_id, search_space_id
                )
                if user_llm:
                    summary_metadata = {
                        "page_name": page_name,
                        "page_id": page_id,
                        "book_id": book_id,
                        "document_type": "BookStack Page",
                        "connector_type": "BookStack",
                    }
                    (
                        summary_content,
                        summary_embedding,
                    ) = await generate_document_summary(
                        full_content, user_llm, summary_metadata
                    )
                else:
                    # Fallback to simple summary if no LLM configured
                    summary_content = (
                        f"BookStack Page: {page_name}\n\nBook ID: {book_id}\n\n"
                    )
                    if page_content:
                        # Take first 1000 characters of content for summary
                        content_preview = page_content[:1000]
                        if len(page_content) > 1000:
                            content_preview += "..."
                        summary_content += f"Content Preview: {content_preview}\n\n"
                    summary_embedding = config.embedding_model_instance.embed(
                        summary_content
                    )

                # Process chunks - using the full page content
                chunks = await create_document_chunks(full_content)

                # Create and store new document
                logger.info(f"Creating new document for page {page_name}")
                document = Document(
                    search_space_id=search_space_id,
                    title=f"BookStack - {page_name}",
                    document_type=DocumentType.BOOKSTACK_CONNECTOR,
                    document_metadata=doc_metadata,
                    content=summary_content,
                    content_hash=content_hash,
                    unique_identifier_hash=unique_identifier_hash,
                    embedding=summary_embedding,
                    chunks=chunks,
                )
                session.add(document)
                documents_indexed += 1
                logger.info(f"Successfully indexed new page {page_name}")

                # Batch commit every 10 documents
                if documents_indexed % 10 == 0:
                    logger.info(
                        f"Committing batch: {documents_indexed} BookStack pages processed so far"
                    )
                    await session.commit()
            except Exception as e:
                logger.error(
                    f"Error processing page {page.get('name', 'Unknown')}: {e!s}",
                    exc_info=True,
                )
                skipped_pages.append(
                    f"{page.get('name', 'Unknown')} (processing error)"
                )
                documents_skipped += 1
                continue  # Skip this page and continue with others

        # Update the last_indexed_at timestamp for the connector only if requested
        total_processed = documents_indexed
        if update_last_indexed:
            await update_connector_last_indexed(session, connector, update_last_indexed)

        # Final commit for any remaining documents not yet committed in batches
        logger.info(
            f"Final commit: Total {documents_indexed} BookStack pages processed"
        )
        await session.commit()
        logger.info(
            "Successfully committed all BookStack document changes to database"
        )

        # Log success
        await task_logger.log_task_success(
            log_entry,
            f"Successfully completed BookStack indexing for connector {connector_id}",
            {
                "pages_processed": total_processed,
                "documents_indexed": documents_indexed,
                "documents_skipped": documents_skipped,
                "skipped_pages_count": len(skipped_pages),
            },
        )
        logger.info(
            f"BookStack indexing completed: {documents_indexed} new pages, {documents_skipped} skipped"
        )
        return (
            total_processed,
            None,
        )  # Return None as the error message to indicate success
    except SQLAlchemyError as db_error:
        await session.rollback()
        await task_logger.log_task_failure(
            log_entry,
            f"Database error during BookStack indexing for connector {connector_id}",
            str(db_error),
            {"error_type": "SQLAlchemyError"},
        )
        logger.error(f"Database error: {db_error!s}", exc_info=True)
        return 0, f"Database error: {db_error!s}"
    except Exception as e:
        await session.rollback()
        await task_logger.log_task_failure(
            log_entry,
            f"Failed to index BookStack pages for connector {connector_id}",
            str(e),
            {"error_type": type(e).__name__},
        )
        logger.error(f"Failed to index BookStack pages: {e!s}", exc_info=True)
        return 0, f"Failed to index BookStack pages: {e!s}"

View file

@ -32,6 +32,7 @@ CONNECTOR_TASK_MAP = {
SearchSourceConnectorType.LUMA_CONNECTOR: "index_luma_events",
SearchSourceConnectorType.ELASTICSEARCH_CONNECTOR: "index_elasticsearch_documents",
SearchSourceConnectorType.WEBCRAWLER_CONNECTOR: "index_crawled_urls",
SearchSourceConnectorType.BOOKSTACK_CONNECTOR: "index_bookstack_pages",
}
@ -68,6 +69,7 @@ def create_periodic_schedule(
# Import all indexing tasks
from app.tasks.celery_tasks.connector_tasks import (
index_airtable_records_task,
index_bookstack_pages_task,
index_clickup_tasks_task,
index_confluence_pages_task,
index_crawled_urls_task,
@ -99,6 +101,7 @@ def create_periodic_schedule(
SearchSourceConnectorType.LUMA_CONNECTOR: index_luma_events_task,
SearchSourceConnectorType.ELASTICSEARCH_CONNECTOR: index_elasticsearch_documents_task,
SearchSourceConnectorType.WEBCRAWLER_CONNECTOR: index_crawled_urls_task,
SearchSourceConnectorType.BOOKSTACK_CONNECTOR: index_bookstack_pages_task,
}
# Trigger the first run immediately

View file

@ -0,0 +1,305 @@
"use client";
import { zodResolver } from "@hookform/resolvers/zod";
import { ArrowLeft, Check, Info, Loader2 } from "lucide-react";
import { motion } from "motion/react";
import { useParams, useRouter } from "next/navigation";
import { useState } from "react";
import { useForm } from "react-hook-form";
import { toast } from "sonner";
import * as z from "zod";
import { Alert, AlertDescription } from "@/components/ui/alert";
import { Button } from "@/components/ui/button";
import { Card, CardContent, CardDescription, CardHeader, CardTitle } from "@/components/ui/card";
import {
Form,
FormControl,
FormDescription,
FormField,
FormItem,
FormLabel,
FormMessage,
} from "@/components/ui/form";
import { Input } from "@/components/ui/input";
import { Tabs, TabsContent, TabsList, TabsTrigger } from "@/components/ui/tabs";
import { EnumConnectorName } from "@/contracts/enums/connector";
import { getConnectorIcon } from "@/contracts/enums/connectorIcons";
import { useSearchSourceConnectors } from "@/hooks/use-search-source-connectors";
// Define the form schema with Zod
// Define the form schema with Zod.
// Fix: the token fields enforce a 10-character minimum, but the previous
// messages said the field "is required", which misled users who had typed a
// short value; the messages now state the actual rule.
const bookstackConnectorFormSchema = z.object({
	name: z.string().min(3, {
		message: "Connector name must be at least 3 characters.",
	}),
	base_url: z
		.string()
		.url({
			message: "Please enter a valid BookStack URL (e.g., https://docs.example.com)",
		}),
	token_id: z.string().min(10, {
		message: "BookStack Token ID must be at least 10 characters.",
	}),
	token_secret: z.string().min(10, {
		message: "BookStack Token Secret must be at least 10 characters.",
	}),
});
// Define the type for the form values
type BookStackConnectorFormValues = z.infer<typeof bookstackConnectorFormSchema>;
export default function BookStackConnectorPage() {
const router = useRouter();
const params = useParams();
const searchSpaceId = params.search_space_id as string;
const [isSubmitting, setIsSubmitting] = useState(false);
const { createConnector } = useSearchSourceConnectors();
// Initialize the form
const form = useForm<BookStackConnectorFormValues>({
resolver: zodResolver(bookstackConnectorFormSchema),
defaultValues: {
name: "BookStack Connector",
base_url: "",
token_id: "",
token_secret: "",
},
});
// Handle form submission
const onSubmit = async (values: BookStackConnectorFormValues) => {
setIsSubmitting(true);
try {
await createConnector(
{
name: values.name,
connector_type: EnumConnectorName.BOOKSTACK_CONNECTOR,
config: {
BOOKSTACK_BASE_URL: values.base_url,
BOOKSTACK_TOKEN_ID: values.token_id,
BOOKSTACK_TOKEN_SECRET: values.token_secret,
},
is_indexable: true,
last_indexed_at: null,
periodic_indexing_enabled: false,
indexing_frequency_minutes: null,
next_scheduled_at: null,
},
parseInt(searchSpaceId)
);
toast.success("BookStack connector created successfully!");
// Navigate back to connectors page
router.push(`/dashboard/${searchSpaceId}/connectors`);
} catch (error) {
console.error("Error creating connector:", error);
toast.error(error instanceof Error ? error.message : "Failed to create connector");
} finally {
setIsSubmitting(false);
}
};
return (
<div className="container mx-auto py-8 max-w-3xl">
<Button
variant="ghost"
className="mb-6"
onClick={() => router.push(`/dashboard/${searchSpaceId}/connectors/add`)}
>
<ArrowLeft className="mr-2 h-4 w-4" />
Back to Connectors
</Button>
{/* Header */}
<div className="mb-8">
<div className="flex items-center gap-4">
<div className="flex h-12 w-12 items-center justify-center rounded-lg">
{getConnectorIcon(EnumConnectorName.BOOKSTACK_CONNECTOR, "h-6 w-6")}
</div>
<div>
<h1 className="text-3xl font-bold tracking-tight">Connect BookStack</h1>
<p className="text-muted-foreground">
Connect your BookStack instance to search wiki pages.
</p>
</div>
</div>
</div>
<motion.div
initial={{ opacity: 0, y: 20 }}
animate={{ opacity: 1, y: 0 }}
transition={{ duration: 0.5 }}
>
<Tabs defaultValue="connect" className="w-full">
<TabsList className="grid w-full grid-cols-2 mb-6">
<TabsTrigger value="connect">Connect</TabsTrigger>
<TabsTrigger value="documentation">Documentation</TabsTrigger>
</TabsList>
<TabsContent value="connect">
<Card>
<CardHeader>
<CardTitle>Connect to BookStack</CardTitle>
<CardDescription>
Connect your BookStack instance to index pages from your wiki.
</CardDescription>
</CardHeader>
<CardContent className="space-y-6">
<Alert>
<Info className="h-4 w-4" />
<AlertDescription>
You'll need to create an API token from your BookStack instance.
Go to <strong>Edit Profile API Tokens Create Token</strong>
</AlertDescription>
</Alert>
<Form {...form}>
<form onSubmit={form.handleSubmit(onSubmit)} className="space-y-6">
<FormField
control={form.control}
name="name"
render={({ field }) => (
<FormItem>
<FormLabel>Connector Name</FormLabel>
<FormControl>
<Input placeholder="My BookStack Connector" {...field} />
</FormControl>
<FormDescription>
A friendly name to identify this connector.
</FormDescription>
<FormMessage />
</FormItem>
)}
/>
<FormField
control={form.control}
name="base_url"
render={({ field }) => (
<FormItem>
<FormLabel>BookStack Instance URL</FormLabel>
<FormControl>
<Input placeholder="https://docs.example.com" {...field} />
</FormControl>
<FormDescription>
Your BookStack instance URL (e.g., https://wiki.yourcompany.com)
</FormDescription>
<FormMessage />
</FormItem>
)}
/>
<FormField
control={form.control}
name="token_id"
render={({ field }) => (
<FormItem>
<FormLabel>Token ID</FormLabel>
<FormControl>
<Input placeholder="Your BookStack Token ID" {...field} />
</FormControl>
<FormDescription>
The Token ID from your BookStack API token.
</FormDescription>
<FormMessage />
</FormItem>
)}
/>
<FormField
control={form.control}
name="token_secret"
render={({ field }) => (
<FormItem>
<FormLabel>Token Secret</FormLabel>
<FormControl>
<Input
type="password"
placeholder="Your BookStack Token Secret"
{...field}
/>
</FormControl>
<FormDescription>
Your Token Secret will be encrypted and stored securely.
</FormDescription>
<FormMessage />
</FormItem>
)}
/>
<div className="flex justify-end">
<Button type="submit" disabled={isSubmitting} className="w-full sm:w-auto">
{isSubmitting ? (
<>
<Loader2 className="mr-2 h-4 w-4 animate-spin" />
Connecting...
</>
) : (
<>
<Check className="mr-2 h-4 w-4" />
Connect BookStack
</>
)}
</Button>
</div>
</form>
</Form>
</CardContent>
</Card>
</TabsContent>
<TabsContent value="documentation">
<Card>
<CardHeader>
<CardTitle>BookStack Integration Guide</CardTitle>
<CardDescription>
Learn how to set up and use the BookStack connector.
</CardDescription>
</CardHeader>
<CardContent className="space-y-6">
<div>
<h3 className="text-lg font-semibold mb-3">What gets indexed?</h3>
<ul className="list-disc list-inside space-y-2 text-sm text-muted-foreground">
<li>All pages from your BookStack instance</li>
<li>Page content in Markdown format</li>
<li>Page titles and metadata</li>
<li>Book and chapter hierarchy information</li>
</ul>
</div>
<div>
<h3 className="text-lg font-semibold mb-3">Setup Instructions</h3>
<ol className="list-decimal list-inside space-y-2 text-sm text-muted-foreground">
<li>Log in to your BookStack instance</li>
<li>Click on your profile icon Edit Profile</li>
<li>Navigate to the "API Tokens" tab</li>
<li>Click "Create Token" and give it a name</li>
<li>Copy both the Token ID and Token Secret</li>
<li>Paste them in the form above</li>
</ol>
</div>
<div>
<h3 className="text-lg font-semibold mb-3">Permissions Required</h3>
<ul className="list-disc list-inside space-y-2 text-sm text-muted-foreground">
<li>Your user account must have "Access System API" permission</li>
<li>Read access to books and pages you want to index</li>
<li>The connector will only index content your account can view</li>
</ul>
</div>
<Alert>
<Info className="h-4 w-4" />
<AlertDescription>
BookStack API has a rate limit of 180 requests per minute. The connector
automatically handles rate limiting to ensure reliable indexing.
</AlertDescription>
</Alert>
</CardContent>
</Card>
</TabsContent>
</Tabs>
</motion.div>
</div>
);
}

View file

@ -128,6 +128,7 @@ export function DashboardBreadcrumb() {
"github-connector": "GitHub",
"jira-connector": "Jira",
"confluence-connector": "Confluence",
"bookstack-connector": "BookStack",
"discord-connector": "Discord",
"linear-connector": "Linear",
"clickup-connector": "ClickUp",

View file

@ -43,6 +43,9 @@ export const editConnectorSchema = z.object({
CONFLUENCE_BASE_URL: z.string().optional(),
CONFLUENCE_EMAIL: z.string().optional(),
CONFLUENCE_API_TOKEN: z.string().optional(),
BOOKSTACK_BASE_URL: z.string().optional(),
BOOKSTACK_TOKEN_ID: z.string().optional(),
BOOKSTACK_TOKEN_SECRET: z.string().optional(),
JIRA_BASE_URL: z.string().optional(),
JIRA_EMAIL: z.string().optional(),
JIRA_API_TOKEN: z.string().optional(),

View file

@ -123,6 +123,13 @@ export const connectorCategories: ConnectorCategory[] = [
icon: getConnectorIcon(EnumConnectorName.CONFLUENCE_CONNECTOR, "h-6 w-6"),
status: "available",
},
{
id: "bookstack-connector",
title: "BookStack",
description: "bookstack_desc",
icon: getConnectorIcon(EnumConnectorName.BOOKSTACK_CONNECTOR, "h-6 w-6"),
status: "available",
},
],
},
{

View file

@ -11,6 +11,7 @@ export enum EnumConnectorName {
JIRA_CONNECTOR = "JIRA_CONNECTOR",
DISCORD_CONNECTOR = "DISCORD_CONNECTOR",
CONFLUENCE_CONNECTOR = "CONFLUENCE_CONNECTOR",
BOOKSTACK_CONNECTOR = "BOOKSTACK_CONNECTOR",
CLICKUP_CONNECTOR = "CLICKUP_CONNECTOR",
GOOGLE_CALENDAR_CONNECTOR = "GOOGLE_CALENDAR_CONNECTOR",
GOOGLE_GMAIL_CONNECTOR = "GOOGLE_GMAIL_CONNECTOR",

View file

@ -1,5 +1,6 @@
import {
IconBook,
IconBooks,
IconBrandDiscord,
IconBrandElastic,
IconBrandGithub,
@ -53,6 +54,8 @@ export const getConnectorIcon = (connectorType: EnumConnectorName | string, clas
return <IconTable {...iconProps} />;
case EnumConnectorName.CONFLUENCE_CONNECTOR:
return <IconBook {...iconProps} />;
case EnumConnectorName.BOOKSTACK_CONNECTOR:
return <IconBooks {...iconProps} />;
case EnumConnectorName.CLICKUP_CONNECTOR:
return <IconChecklist {...iconProps} />;
case EnumConnectorName.LUMA_CONNECTOR:

View file

@ -92,6 +92,9 @@ export function useConnectorEditPage(connectorId: number, searchSpaceId: string)
CONFLUENCE_BASE_URL: "",
CONFLUENCE_EMAIL: "",
CONFLUENCE_API_TOKEN: "",
BOOKSTACK_BASE_URL: "",
BOOKSTACK_TOKEN_ID: "",
BOOKSTACK_TOKEN_SECRET: "",
JIRA_BASE_URL: "",
JIRA_EMAIL: "",
JIRA_API_TOKEN: "",
@ -139,6 +142,9 @@ export function useConnectorEditPage(connectorId: number, searchSpaceId: string)
CONFLUENCE_BASE_URL: config.CONFLUENCE_BASE_URL || "",
CONFLUENCE_EMAIL: config.CONFLUENCE_EMAIL || "",
CONFLUENCE_API_TOKEN: config.CONFLUENCE_API_TOKEN || "",
BOOKSTACK_BASE_URL: config.BOOKSTACK_BASE_URL || "",
BOOKSTACK_TOKEN_ID: config.BOOKSTACK_TOKEN_ID || "",
BOOKSTACK_TOKEN_SECRET: config.BOOKSTACK_TOKEN_SECRET || "",
JIRA_BASE_URL: config.JIRA_BASE_URL || "",
JIRA_EMAIL: config.JIRA_EMAIL || "",
JIRA_API_TOKEN: config.JIRA_API_TOKEN || "",
@ -435,6 +441,28 @@ export function useConnectorEditPage(connectorId: number, searchSpaceId: string)
};
}
break;
case "BOOKSTACK_CONNECTOR":
if (
formData.BOOKSTACK_BASE_URL !== originalConfig.BOOKSTACK_BASE_URL ||
formData.BOOKSTACK_TOKEN_ID !== originalConfig.BOOKSTACK_TOKEN_ID ||
formData.BOOKSTACK_TOKEN_SECRET !== originalConfig.BOOKSTACK_TOKEN_SECRET
) {
if (
!formData.BOOKSTACK_BASE_URL ||
!formData.BOOKSTACK_TOKEN_ID ||
!formData.BOOKSTACK_TOKEN_SECRET
) {
toast.error("All BookStack fields are required.");
setIsSaving(false);
return;
}
newConfig = {
BOOKSTACK_BASE_URL: formData.BOOKSTACK_BASE_URL,
BOOKSTACK_TOKEN_ID: formData.BOOKSTACK_TOKEN_ID,
BOOKSTACK_TOKEN_SECRET: formData.BOOKSTACK_TOKEN_SECRET,
};
}
break;
case "JIRA_CONNECTOR":
if (
formData.JIRA_BASE_URL !== originalConfig.JIRA_BASE_URL ||
@ -584,6 +612,10 @@ export function useConnectorEditPage(connectorId: number, searchSpaceId: string)
editForm.setValue("CONFLUENCE_BASE_URL", newlySavedConfig.CONFLUENCE_BASE_URL || "");
editForm.setValue("CONFLUENCE_EMAIL", newlySavedConfig.CONFLUENCE_EMAIL || "");
editForm.setValue("CONFLUENCE_API_TOKEN", newlySavedConfig.CONFLUENCE_API_TOKEN || "");
} else if (connector.connector_type === "BOOKSTACK_CONNECTOR") {
editForm.setValue("BOOKSTACK_BASE_URL", newlySavedConfig.BOOKSTACK_BASE_URL || "");
editForm.setValue("BOOKSTACK_TOKEN_ID", newlySavedConfig.BOOKSTACK_TOKEN_ID || "");
editForm.setValue("BOOKSTACK_TOKEN_SECRET", newlySavedConfig.BOOKSTACK_TOKEN_SECRET || "");
} else if (connector.connector_type === "JIRA_CONNECTOR") {
editForm.setValue("JIRA_BASE_URL", newlySavedConfig.JIRA_BASE_URL || "");
editForm.setValue("JIRA_EMAIL", newlySavedConfig.JIRA_EMAIL || "");

View file

@ -330,6 +330,7 @@
"notion_desc": "Connect to your Notion workspace to access pages and databases.",
"github_desc": "Connect a GitHub PAT to index code and docs from accessible repositories.",
"confluence_desc": "Connect to Confluence to search pages, comments and documentation.",
"bookstack_desc": "Connect to BookStack to search wiki pages and documentation.",
"airtable_desc": "Connect to Airtable to search records, tables and database content.",
"luma_desc": "Connect to Luma to search events, meetups and gatherings.",
"calendar_desc": "Connect to Google Calendar to search events, meetings and schedules.",

View file

@ -330,6 +330,7 @@
"notion_desc": "连接到您的 Notion 工作区以访问页面和数据库。",
"github_desc": "连接 GitHub PAT 以索引可访问存储库的代码和文档。",
"confluence_desc": "连接到 Confluence 以搜索页面、评论和文档。",
"bookstack_desc": "连接到 BookStack 以搜索 Wiki 页面和文档。",
"airtable_desc": "连接到 Airtable 以搜索记录、表格和数据库内容。",
"luma_desc": "连接到 Luma 以搜索活动、聚会和集会。",
"calendar_desc": "连接到 Google 日历以搜索活动、会议和日程。",