From f200502ffc4ebcf03f9ffddc92ef44a8a8a45d49 Mon Sep 17 00:00:00 2001 From: "DESKTOP-RTLN3BA\\$punk" Date: Tue, 20 Jan 2026 03:11:49 -0800 Subject: [PATCH 1/7] refactor: bulk updating Admin to Editor roles - Consolidated the migration process for search space memberships and invites from Admin to Editor roles using bulk SQL updates. - Removed the Admin role in bulk for system roles. - Updated permissions for Editor and Viewer roles across all search spaces in a more efficient manner. --- .../versions/72_simplify_rbac_roles.py | 89 +++++++------------ 1 file changed, 34 insertions(+), 55 deletions(-) diff --git a/surfsense_backend/alembic/versions/72_simplify_rbac_roles.py b/surfsense_backend/alembic/versions/72_simplify_rbac_roles.py index e7d5ff019..2a3b81990 100644 --- a/surfsense_backend/alembic/versions/72_simplify_rbac_roles.py +++ b/surfsense_backend/alembic/versions/72_simplify_rbac_roles.py @@ -67,63 +67,42 @@ NEW_VIEWER_PERMISSIONS = [ def upgrade(): connection = op.get_bind() - # Step 1: For each search space, get the Editor role ID and Admin role ID - search_spaces = connection.execute( - sa.text("SELECT id FROM searchspaces") - ).fetchall() + # Step 1: Move all memberships from Admin roles to corresponding Editor roles (BULK) + # Uses a subquery to match Admin->Editor within the same search space + connection.execute( + sa.text(""" + UPDATE search_space_memberships m + SET role_id = e.id + FROM search_space_roles a + JOIN search_space_roles e ON a.search_space_id = e.search_space_id + WHERE m.role_id = a.id + AND a.name = 'Admin' + AND e.name = 'Editor' + """) + ) - for (ss_id,) in search_spaces: - # Get Admin and Editor role IDs for this search space - admin_role = connection.execute( - sa.text(""" - SELECT id FROM search_space_roles - WHERE search_space_id = :ss_id AND name = 'Admin' - """), - {"ss_id": ss_id}, - ).fetchone() + # Step 2: Move all invites from Admin roles to corresponding Editor roles (BULK) + connection.execute( + sa.text(""" + UPDATE search_space_invites i + SET role_id = e.id + FROM search_space_roles a + JOIN search_space_roles e ON a.search_space_id = e.search_space_id + WHERE i.role_id = a.id + AND a.name = 'Admin' + AND e.name = 'Editor' + """) + ) - editor_role = connection.execute( - sa.text(""" - SELECT id FROM search_space_roles - WHERE search_space_id = :ss_id AND name = 'Editor' - """), - {"ss_id": ss_id}, - ).fetchone() + # Step 3: Delete all Admin roles (BULK) + connection.execute( + sa.text(""" + DELETE FROM search_space_roles + WHERE name = 'Admin' AND is_system_role = TRUE + """) + ) - if admin_role and editor_role: - admin_role_id = admin_role[0] - editor_role_id = editor_role[0] - - # Step 2: Move all memberships from Admin to Editor - connection.execute( - sa.text(""" - UPDATE search_space_memberships - SET role_id = :editor_role_id - WHERE role_id = :admin_role_id - """), - {"editor_role_id": editor_role_id, "admin_role_id": admin_role_id}, - ) - - # Step 3: Move all invites from Admin to Editor - connection.execute( - sa.text(""" - UPDATE search_space_invites - SET role_id = :editor_role_id - WHERE role_id = :admin_role_id - """), - {"editor_role_id": editor_role_id, "admin_role_id": admin_role_id}, - ) - - # Step 4: Delete the Admin role - connection.execute( - sa.text(""" - DELETE FROM search_space_roles - WHERE id = :admin_role_id - """), - {"admin_role_id": admin_role_id}, - ) - - # Step 5: Update Editor permissions for all search spaces + # Step 4: Update Editor permissions for all search spaces (BULK) editor_perms_literal = ( "ARRAY[" + ",".join(f"'{p}'" for p in NEW_EDITOR_PERMISSIONS) + "]::TEXT[]" ) @@ -136,7 +115,7 @@ def upgrade(): """) ) - # Step 6: Update Viewer permissions for all search spaces + # Step 5: Update Viewer permissions for all search spaces (BULK) viewer_perms_literal = ( "ARRAY[" + ",".join(f"'{p}'" for p in NEW_VIEWER_PERMISSIONS) + "]::TEXT[]" ) From 49b8a46d1045bd71f89aa9b519134a5d21f04832 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Tue, 20 Jan 2026 21:52:32 +0530 Subject: [PATCH 2/7] feat: Integrate gitingest for GitHub repository ingestion - Added gitingest as a dependency to streamline the ingestion of GitHub repositories. - Refactored GitHubConnector to utilize gitingest for efficient repository digest generation, reducing API calls. - Updated GitHub indexer to process entire repository digests, enhancing performance and simplifying the indexing process. - Modified GitHub connect form to indicate that the Personal Access Token is optional for public repositories. --- .../app/connectors/github_connector.py | 507 +++++++++--------- .../connector_indexers/github_indexer.py | 491 ++++++++--------- surfsense_backend/pyproject.toml | 1 + surfsense_backend/uv.lock | 30 ++ .../components/github-connect-form.tsx | 51 +- .../new-chat/source-detail-panel.tsx | 4 +- 6 files changed, 545 insertions(+), 539 deletions(-) diff --git a/surfsense_backend/app/connectors/github_connector.py b/surfsense_backend/app/connectors/github_connector.py index 647856c6f..90fd93fb9 100644 --- a/surfsense_backend/app/connectors/github_connector.py +++ b/surfsense_backend/app/connectors/github_connector.py @@ -1,296 +1,295 @@ -import base64 -import logging -from typing import Any +""" +GitHub connector using gitingest for efficient repository digestion. -from github3 import exceptions as github_exceptions, login as github_login -from github3.exceptions import ForbiddenError, NotFoundError -from github3.repos.contents import Contents +This connector replaces the previous file-by-file approach with a single +digest generation per repository, dramatically reducing LLM API calls. +""" + +import logging +from dataclasses import dataclass + +from gitingest import ingest_async logger = logging.getLogger(__name__) -# List of common code file extensions to target -CODE_EXTENSIONS = { - ".py", - ".js", - ".jsx", - ".ts", - ".tsx", - ".java", - ".c", - ".cpp", - ".h", - ".hpp", - ".cs", - ".go", - ".rb", - ".php", - ".swift", - ".kt", - ".scala", - ".rs", - ".m", - ".sh", - ".bash", - ".ps1", - ".lua", - ".pl", - ".pm", - ".r", - ".dart", - ".sql", -} +# Maximum file size in bytes (5MB) +MAX_FILE_SIZE = 5 * 1024 * 1024 -# List of common documentation/text file extensions -DOC_EXTENSIONS = { - ".md", - ".txt", - ".rst", - ".adoc", - ".html", - ".htm", - ".xml", - ".json", - ".yaml", - ".yml", - ".toml", -} +# Default patterns to exclude (recommended approach for comprehensive analysis) +# Using only exclude_patterns ensures we don't miss any relevant file types +DEFAULT_EXCLUDE_PATTERNS = [ + # Dependencies + "node_modules/*", + "vendor/*", + "bower_components/*", + ".pnpm/*", + # Build artifacts / Caches + "build/*", + "dist/*", + "target/*", + "out/*", + "__pycache__/*", + "*.pyc", + ".cache/*", + ".next/*", + ".nuxt/*", + # Virtual environments + "venv/*", + ".venv/*", + "env/*", + ".env/*", + # IDE/Editor config + ".vscode/*", + ".idea/*", + ".project", + ".settings/*", + "*.swp", + "*.swo", + # Version control + ".git/*", + ".svn/*", + ".hg/*", + # Temporary / Logs + "tmp/*", + "temp/*", + "logs/*", + "*.log", + # Lock files (usually not needed for understanding code) + "package-lock.json", + "pnpm-lock.yaml", + "yarn.lock", + "uv.lock", + "Gemfile.lock", + "poetry.lock", + "Cargo.lock", + "composer.lock", + # Binary/media files + "*.png", + "*.jpg", + "*.jpeg", + "*.gif", + "*.ico", + "*.svg", + "*.webp", + "*.bmp", + "*.tiff", + "*.woff", + "*.woff2", + "*.ttf", + "*.eot", + "*.otf", + "*.mp3", + "*.mp4", + "*.wav", + "*.ogg", + "*.webm", + "*.avi", + "*.mov", + "*.pdf", + "*.doc", + "*.docx", + "*.xls", + "*.xlsx", + "*.ppt", + "*.pptx", + "*.zip", + "*.tar", + "*.tar.gz", + "*.tgz", + "*.rar", + "*.7z", + "*.exe", + "*.dll", + "*.so", + "*.dylib", + "*.bin", + "*.obj", + "*.o", + "*.a", + "*.lib", + # Minified files + "*.min.js", + "*.min.css", + # Source maps + "*.map", + # Database files + "*.db", + "*.sqlite", + "*.sqlite3", + # Coverage reports + "coverage/*", + ".coverage", + "htmlcov/*", + ".nyc_output/*", + # Test snapshots (can be large) + "__snapshots__/*", +] -# Maximum file size in bytes (e.g., 1MB) -MAX_FILE_SIZE = 1 * 1024 * 1024 + +@dataclass +class RepositoryDigest: + """Represents a digested repository from gitingest.""" + + repo_full_name: str + summary: str + tree: str + content: str + branch: str | None = None + + @property + def full_digest(self) -> str: + """Returns the complete digest with tree and content.""" + return f"# Repository: {self.repo_full_name}\n\n## File Structure\n\n{self.tree}\n\n## File Contents\n\n{self.content}" + + @property + def estimated_tokens(self) -> int: + """Rough estimate of tokens (1 token ≈ 4 characters).""" + return len(self.full_digest) // 4 class GitHubConnector: - """Connector for interacting with the GitHub API.""" + """ + Connector for ingesting GitHub repositories using gitingest. - # Directories to skip during file traversal - SKIPPED_DIRS = { - # Version control - ".git", - # Dependencies - "node_modules", - "vendor", - # Build artifacts / Caches - "build", - "dist", - "target", - "__pycache__", - # Virtual environments - "venv", - ".venv", - "env", - # IDE/Editor config - ".vscode", - ".idea", - ".project", - ".settings", - # Temporary / Logs - "tmp", - "logs", - # Add other project-specific irrelevant directories if needed - } + This connector efficiently processes entire repositories into a single + digest, reducing the number of API calls and LLM invocations compared + to file-by-file processing. + """ - def __init__(self, token: str): + def __init__(self, token: str | None = None): """ Initializes the GitHub connector. Args: - token: GitHub Personal Access Token (PAT). + token: Optional GitHub Personal Access Token (PAT). + Only required for private repositories. + Public repositories can be ingested without a token. """ - if not token: - raise ValueError("GitHub token cannot be empty.") - try: - self.gh = github_login(token=token) - # Try a simple authenticated call to check token validity - self.gh.me() - logger.info("Successfully authenticated with GitHub API.") - except (github_exceptions.AuthenticationFailed, ForbiddenError) as e: - logger.error(f"GitHub authentication failed: {e}") - raise ValueError("Invalid GitHub token or insufficient permissions.") from e - except Exception as e: - logger.error(f"Failed to initialize GitHub client: {e}") - raise e + self.token = token if token and token.strip() else None + if self.token: + logger.info("GitHub connector initialized with authentication token.") + else: + logger.info("GitHub connector initialized without token (public repos only).") - def get_user_repositories(self) -> list[dict[str, Any]]: - """Fetches repositories accessible by the authenticated user.""" - repos_data = [] - try: - # type='owner' fetches repos owned by the user - # type='member' fetches repos the user is a collaborator on (including orgs) - # type='all' fetches both - for repo in self.gh.repositories(type="all", sort="updated"): - repos_data.append( - { - "id": repo.id, - "name": repo.name, - "full_name": repo.full_name, - "private": repo.private, - "url": repo.html_url, - "description": repo.description or "", - "last_updated": repo.updated_at if repo.updated_at else None, - } - ) - logger.info(f"Fetched {len(repos_data)} repositories.") - return repos_data - except Exception as e: - logger.error(f"Failed to fetch GitHub repositories: {e}") - return [] # Return empty list on error - - def get_repository_files( - self, repo_full_name: str, path: str = "" - ) -> list[dict[str, Any]]: + async def ingest_repository( + self, + repo_full_name: str, + branch: str | None = None, + include_patterns: list[str] | None = None, + exclude_patterns: list[str] | None = None, + max_file_size: int = MAX_FILE_SIZE, + ) -> RepositoryDigest | None: """ - Recursively fetches details of relevant files (code, docs) within a repository path. + Ingest an entire repository and return a digest. Args: repo_full_name: The full name of the repository (e.g., 'owner/repo'). - path: The starting path within the repository (default is root). + branch: Optional specific branch or tag to ingest. + include_patterns: Optional list of glob patterns for files to include. + If None, includes all files (recommended). + exclude_patterns: Optional list of glob patterns for files to exclude. + If None, uses DEFAULT_EXCLUDE_PATTERNS. + max_file_size: Maximum file size in bytes to include (default 5MB). Returns: - A list of dictionaries, each containing file details (path, sha, url, size). - Returns an empty list if the repository or path is not found or on error. + RepositoryDigest containing the summary, tree structure, and content, + or None if ingestion fails. """ - files_list = [] + repo_url = f"https://github.com/{repo_full_name}" + + # Use only exclude_patterns by default (recommended for comprehensive analysis) + # This ensures we don't miss any relevant file types + exclude_pats = exclude_patterns if exclude_patterns is not None else DEFAULT_EXCLUDE_PATTERNS + + logger.info(f"Starting gitingest for repository: {repo_full_name}") + try: - owner, repo_name = repo_full_name.split("/") - repo = self.gh.repository(owner, repo_name) - if not repo: - logger.warning(f"Repository '{repo_full_name}' not found.") - return [] - contents = repo.directory_contents( - directory_path=path - ) # Use directory_contents for clarity + # Build kwargs dynamically + ingest_kwargs = { + "max_file_size": max_file_size, + "exclude_patterns": exclude_pats, + "include_gitignored": False, + "include_submodules": False, + } - # contents returns a list of tuples (name, content_obj) - for _item_name, content_item in contents: - if not isinstance(content_item, Contents): - continue + # Only add token if provided (required only for private repos) + if self.token: + ingest_kwargs["token"] = self.token - if content_item.type == "dir": - # Check if the directory name is in the skipped list - if content_item.name in self.SKIPPED_DIRS: - logger.debug(f"Skipping directory: {content_item.path}") - continue # Skip recursion for this directory + # Only add branch if specified + if branch: + ingest_kwargs["branch"] = branch - # Recursively fetch contents of subdirectory - files_list.extend( - self.get_repository_files( - repo_full_name, path=content_item.path - ) - ) - elif content_item.type == "file": - # Check if the file extension is relevant and size is within limits - file_extension = ( - "." + content_item.name.split(".")[-1].lower() - if "." in content_item.name - else "" - ) - is_code = file_extension in CODE_EXTENSIONS - is_doc = file_extension in DOC_EXTENSIONS + # Only add include_patterns if explicitly provided + if include_patterns is not None: + ingest_kwargs["include_patterns"] = include_patterns - if (is_code or is_doc) and content_item.size <= MAX_FILE_SIZE: - files_list.append( - { - "path": content_item.path, - "sha": content_item.sha, - "url": content_item.html_url, - "size": content_item.size, - "type": "code" if is_code else "doc", - } - ) - elif content_item.size > MAX_FILE_SIZE: - logger.debug( - f"Skipping large file: {content_item.path} ({content_item.size} bytes)" - ) - else: - logger.debug( - f"Skipping irrelevant file type: {content_item.path}" - ) + summary, tree, content = await ingest_async(repo_url, **ingest_kwargs) - except (NotFoundError, ForbiddenError) as e: - logger.warning(f"Cannot access path '{path}' in '{repo_full_name}': {e}") - except Exception as e: - logger.error( - f"Failed to get files for {repo_full_name} at path '{path}': {e}" + if not content or not content.strip(): + logger.warning( + f"No content retrieved from repository: {repo_full_name}" + ) + return None + + digest = RepositoryDigest( + repo_full_name=repo_full_name, + summary=summary, + tree=tree, + content=content, + branch=branch, ) - # Return what we have collected so far in case of partial failure - return files_list + logger.info( + f"Successfully ingested {repo_full_name}: " + f"~{digest.estimated_tokens} estimated tokens" + ) + return digest - def get_file_content(self, repo_full_name: str, file_path: str) -> str | None: + except Exception as e: + logger.error(f"Failed to ingest repository {repo_full_name}: {e}") + return None + + async def ingest_repositories( + self, + repo_full_names: list[str], + branch: str | None = None, + include_patterns: list[str] | None = None, + exclude_patterns: list[str] | None = None, + max_file_size: int = MAX_FILE_SIZE, + ) -> list[RepositoryDigest]: """ - Fetches the decoded content of a specific file. + Ingest multiple repositories and return their digests. Args: - repo_full_name: The full name of the repository (e.g., 'owner/repo'). - file_path: The path to the file within the repository. + repo_full_names: List of repository full names (e.g., ['owner/repo1', 'owner/repo2']). + branch: Optional specific branch or tag to ingest (applied to all repos). + include_patterns: Optional list of glob patterns for files to include. + exclude_patterns: Optional list of glob patterns for files to exclude. + max_file_size: Maximum file size in bytes to include. Returns: - The decoded file content as a string, or None if fetching fails or file is too large. + List of RepositoryDigest objects for successfully ingested repositories. """ - try: - owner, repo_name = repo_full_name.split("/") - repo = self.gh.repository(owner, repo_name) - if not repo: - logger.warning( - f"Repository '{repo_full_name}' not found when fetching file '{file_path}'." - ) - return None + digests = [] - content_item = repo.file_contents( - path=file_path - ) # Use file_contents for clarity + for repo_full_name in repo_full_names: + if not repo_full_name or not isinstance(repo_full_name, str): + logger.warning(f"Skipping invalid repository entry: {repo_full_name}") + continue - if ( - not content_item - or not isinstance(content_item, Contents) - or content_item.type != "file" - ): - logger.warning( - f"File '{file_path}' not found or is not a file in '{repo_full_name}'." - ) - return None - - if content_item.size > MAX_FILE_SIZE: - logger.warning( - f"File '{file_path}' in '{repo_full_name}' exceeds max size ({content_item.size} > {MAX_FILE_SIZE}). Skipping content fetch." - ) - return None - - # Content is base64 encoded - if content_item.content: - try: - decoded_content = base64.b64decode(content_item.content).decode( - "utf-8" - ) - return decoded_content - except UnicodeDecodeError: - logger.warning( - f"Could not decode file '{file_path}' in '{repo_full_name}' as UTF-8. Trying with 'latin-1'." - ) - try: - # Try a fallback encoding - decoded_content = base64.b64decode(content_item.content).decode( - "latin-1" - ) - return decoded_content - except Exception as decode_err: - logger.error( - f"Failed to decode file '{file_path}' with fallback encoding: {decode_err}" - ) - return None # Give up if fallback fails - else: - logger.warning( - f"No content returned for file '{file_path}' in '{repo_full_name}'. It might be empty." - ) - return "" # Return empty string for empty files - - except (NotFoundError, ForbiddenError) as e: - logger.warning( - f"Cannot access file '{file_path}' in '{repo_full_name}': {e}" + digest = await self.ingest_repository( + repo_full_name=repo_full_name, + branch=branch, + include_patterns=include_patterns, + exclude_patterns=exclude_patterns, + max_file_size=max_file_size, ) - return None - except Exception as e: - logger.error( - f"Failed to get content for file '{file_path}' in '{repo_full_name}': {e}" - ) - return None + + if digest: + digests.append(digest) + + logger.info( + f"Ingested {len(digests)} out of {len(repo_full_names)} repositories." + ) + return digests diff --git a/surfsense_backend/app/tasks/connector_indexers/github_indexer.py b/surfsense_backend/app/tasks/connector_indexers/github_indexer.py index e1844a503..f1ccabdef 100644 --- a/surfsense_backend/app/tasks/connector_indexers/github_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/github_indexer.py @@ -1,5 +1,8 @@ """ -GitHub connector indexer. +GitHub connector indexer using gitingest. + +This indexer processes entire repository digests in one pass, dramatically +reducing LLM API calls compared to the previous file-by-file approach. """ from datetime import UTC, datetime @@ -8,7 +11,7 @@ from sqlalchemy.exc import SQLAlchemyError from sqlalchemy.ext.asyncio import AsyncSession from app.config import config -from app.connectors.github_connector import GitHubConnector +from app.connectors.github_connector import GitHubConnector, RepositoryDigest from app.db import Document, DocumentType, SearchSourceConnectorType from app.services.llm_service import get_user_long_context_llm from app.services.task_logging_service import TaskLoggingService @@ -26,43 +29,55 @@ from .base import ( logger, ) +# Maximum tokens for a single digest before splitting +# Most LLMs can handle 128k+ tokens now, but we'll be conservative +MAX_DIGEST_CHARS = 500_000 # ~125k tokens + async def index_github_repos( session: AsyncSession, connector_id: int, search_space_id: int, user_id: str, - start_date: str | None = None, - end_date: str | None = None, + start_date: str | None = None, # Ignored - GitHub indexes full repo snapshots + end_date: str | None = None, # Ignored - GitHub indexes full repo snapshots update_last_indexed: bool = True, ) -> tuple[int, str | None]: """ - Index code and documentation files from accessible GitHub repositories. + Index GitHub repositories using gitingest for efficient processing. + + This function ingests entire repositories as digests, generates a single + summary per repository, and chunks the content for vector storage. + + Note: The start_date and end_date parameters are accepted for API compatibility + but are IGNORED. GitHub repositories are indexed as complete snapshots since + gitingest captures the current state of the entire codebase. Args: session: Database session connector_id: ID of the GitHub connector search_space_id: ID of the search space to store documents in user_id: ID of the user - start_date: Start date for filtering (YYYY-MM-DD format) - Note: GitHub indexing processes all files regardless of dates - end_date: End date for filtering (YYYY-MM-DD format) - Note: GitHub indexing processes all files regardless of dates + start_date: Ignored - kept for API compatibility + end_date: Ignored - kept for API compatibility update_last_indexed: Whether to update the last_indexed_at timestamp (default: True) Returns: Tuple containing (number of documents indexed, error message or None) """ + # Note: start_date and end_date are intentionally unused + _ = start_date, end_date task_logger = TaskLoggingService(session, search_space_id) # Log task start log_entry = await task_logger.log_task_start( task_name="github_repos_indexing", source="connector_indexing_task", - message=f"Starting GitHub repositories indexing for connector {connector_id}", + message=f"Starting GitHub repositories indexing for connector {connector_id} (using gitingest)", metadata={ "connector_id": connector_id, "user_id": str(user_id), - "start_date": start_date, - "end_date": end_date, + "method": "gitingest", }, ) @@ -93,19 +108,11 @@ async def index_github_repos( f"Connector with ID {connector_id} not found or is not a GitHub connector", ) - # 2. Get the GitHub PAT and selected repositories from the connector config - github_pat = connector.config.get("GITHUB_PAT") + # 2. Get the GitHub PAT (optional) and selected repositories from the connector config + # PAT is only required for private repositories - public repos work without it + github_pat = connector.config.get("GITHUB_PAT") # Can be None or empty repo_full_names_to_index = connector.config.get("repo_full_names") - if not github_pat: - await task_logger.log_task_failure( - log_entry, - f"GitHub Personal Access Token (PAT) not found in connector config for connector {connector_id}", - "Missing GitHub PAT", - {"error_type": "MissingToken"}, - ) - return 0, "GitHub Personal Access Token (PAT) not found in connector config" - if not repo_full_names_to_index or not isinstance( repo_full_names_to_index, list ): @@ -117,10 +124,16 @@ async def index_github_repos( ) return 0, "'repo_full_names' not found or is not a list in connector config" - # 3. Initialize GitHub connector client + # Log whether we're using authentication + if github_pat: + logger.info("Using GitHub PAT for authentication (private repos supported)") + else: + logger.info("No GitHub PAT provided - only public repositories can be indexed") + + # 3. Initialize GitHub connector with gitingest backend await task_logger.log_task_progress( log_entry, - f"Initializing GitHub client for connector {connector_id}", + f"Initializing gitingest-based GitHub client for connector {connector_id}", { "stage": "client_initialization", "repo_count": len(repo_full_names_to_index), @@ -138,258 +151,52 @@ async def index_github_repos( ) return 0, f"Failed to initialize GitHub client: {e!s}" - # 4. Validate selected repositories + # 4. Process each repository with gitingest await task_logger.log_task_progress( log_entry, - f"Starting indexing for {len(repo_full_names_to_index)} selected repositories", + f"Starting gitingest processing for {len(repo_full_names_to_index)} repositories", { "stage": "repo_processing", "repo_count": len(repo_full_names_to_index), - "start_date": start_date, - "end_date": end_date, }, ) logger.info( - f"Starting indexing for {len(repo_full_names_to_index)} selected repositories." + f"Starting gitingest indexing for {len(repo_full_names_to_index)} repositories." ) - if start_date and end_date: - logger.info( - f"Date range requested: {start_date} to {end_date} (Note: GitHub indexing processes all files regardless of dates)" - ) - # 6. Iterate through selected repositories and index files for repo_full_name in repo_full_names_to_index: if not repo_full_name or not isinstance(repo_full_name, str): logger.warning(f"Skipping invalid repository entry: {repo_full_name}") continue - logger.info(f"Processing repository: {repo_full_name}") + logger.info(f"Ingesting repository: {repo_full_name}") + try: - files_to_index = github_client.get_repository_files(repo_full_name) - if not files_to_index: - logger.info( - f"No indexable files found in repository: {repo_full_name}" + # Ingest the entire repository + digest = await github_client.ingest_repository(repo_full_name) + + if not digest: + logger.warning( + f"No digest returned for repository: {repo_full_name}" ) + errors.append(f"No digest for {repo_full_name}") continue - logger.info( - f"Found {len(files_to_index)} files to process in {repo_full_name}" + # Process the digest and create documents + docs_created = await _process_repository_digest( + session=session, + digest=digest, + search_space_id=search_space_id, + user_id=user_id, + task_logger=task_logger, + log_entry=log_entry, ) - for file_info in files_to_index: - file_path = file_info.get("path") - file_url = file_info.get("url") - file_sha = file_info.get("sha") - file_type = file_info.get("type") # 'code' or 'doc' - full_path_key = f"{repo_full_name}/{file_path}" - - if not file_path or not file_url or not file_sha: - logger.warning( - f"Skipping file with missing info in {repo_full_name}: {file_info}" - ) - continue - - # Get file content - file_content = github_client.get_file_content( - repo_full_name, file_path - ) - - if file_content is None: - logger.warning( - f"Could not retrieve content for {full_path_key}. Skipping." - ) - continue # Skip if content fetch failed - - # Generate unique identifier hash for this GitHub file - unique_identifier_hash = generate_unique_identifier_hash( - DocumentType.GITHUB_CONNECTOR, file_sha, search_space_id - ) - - # Generate content hash - content_hash = generate_content_hash(file_content, search_space_id) - - # Check if document with this unique identifier already exists - existing_document = await check_document_by_unique_identifier( - session, unique_identifier_hash - ) - - if existing_document: - # Document exists - check if content has changed - if existing_document.content_hash == content_hash: - logger.info( - f"Document for GitHub file {full_path_key} unchanged. Skipping." - ) - continue - else: - # Content has changed - update the existing document - logger.info( - f"Content changed for GitHub file {full_path_key}. Updating document." - ) - - # Generate summary with metadata - user_llm = await get_user_long_context_llm( - session, user_id, search_space_id - ) - if user_llm: - file_extension = ( - file_path.split(".")[-1] - if "." in file_path - else None - ) - document_metadata = { - "file_path": full_path_key, - "repository": repo_full_name, - "file_type": file_extension or "unknown", - "document_type": "GitHub Repository File", - "connector_type": "GitHub", - } - ( - summary_content, - summary_embedding, - ) = await generate_document_summary( - file_content, user_llm, document_metadata - ) - else: - summary_content = f"GitHub file: {full_path_key}\n\n{file_content[:1000]}..." - summary_embedding = ( - config.embedding_model_instance.embed( - summary_content - ) - ) - - # Chunk the content - try: - if hasattr(config, "code_chunker_instance"): - chunks_data = [ - await create_document_chunks(file_content) - ][0] - else: - chunks_data = await create_document_chunks( - file_content - ) - except Exception as chunk_err: - logger.error( - f"Failed to chunk file {full_path_key}: {chunk_err}" - ) - continue - - # Update existing document - existing_document.title = f"GitHub - {full_path_key}" - existing_document.content = summary_content - existing_document.content_hash = content_hash - existing_document.embedding = summary_embedding - existing_document.document_metadata = { - "file_path": file_path, - "file_sha": file_sha, - "file_url": file_url, - "repository": repo_full_name, - "indexed_at": datetime.now(UTC).strftime( - "%Y-%m-%d %H:%M:%S" - ), - } - existing_document.chunks = chunks_data - existing_document.updated_at = get_current_timestamp() - - logger.info( - f"Successfully updated GitHub file {full_path_key}" - ) - continue - - # Document doesn't exist - create new one - # Generate summary with metadata - user_llm = await get_user_long_context_llm( - session, user_id, search_space_id - ) - if user_llm: - # Extract file extension from file path - file_extension = ( - file_path.split(".")[-1] if "." in file_path else None - ) - document_metadata = { - "file_path": full_path_key, - "repository": repo_full_name, - "file_type": file_extension or "unknown", - "document_type": "GitHub Repository File", - "connector_type": "GitHub", - } - ( - summary_content, - summary_embedding, - ) = await generate_document_summary( - file_content, user_llm, document_metadata - ) - else: - # Fallback to simple summary if no LLM configured - summary_content = ( - f"GitHub file: {full_path_key}\n\n{file_content[:1000]}..." - ) - summary_embedding = config.embedding_model_instance.embed( - summary_content - ) - - # Chunk the content - try: - chunks_data = [await create_document_chunks(file_content)][0] - - # Use code chunker if available, otherwise regular chunker - if hasattr(config, "code_chunker_instance"): - chunks_data = [ - { - "content": chunk.text, - "embedding": config.embedding_model_instance.embed( - chunk.text - ), - } - for chunk in config.code_chunker_instance.chunk( - file_content - ) - ] - else: - chunks_data = await create_document_chunks(file_content) - - except Exception as chunk_err: - logger.error( - f"Failed to chunk file {full_path_key}: {chunk_err}" - ) - errors.append( - f"Chunking failed for {full_path_key}: {chunk_err}" - ) - continue # Skip this file if chunking fails - - doc_metadata = { - "repository_full_name": repo_full_name, - "file_path": file_path, - "full_path": full_path_key, # For easier lookup - "url": file_url, - "sha": file_sha, - "type": file_type, - "indexed_at": datetime.now(UTC).isoformat(), - } - - # Create new document - logger.info(f"Creating new document for file: {full_path_key}") - document = Document( - title=f"GitHub - {file_path}", - document_type=DocumentType.GITHUB_CONNECTOR, - document_metadata=doc_metadata, - content=summary_content, # Store summary - content_hash=content_hash, - unique_identifier_hash=unique_identifier_hash, - embedding=summary_embedding, - search_space_id=search_space_id, - chunks=chunks_data, # Associate chunks directly - updated_at=get_current_timestamp(), - ) - session.add(document) - documents_processed += 1 - - # Batch commit every 10 documents - if documents_processed % 10 == 0: - logger.info( - f"Committing batch: {documents_processed} GitHub files processed so far" - ) - await session.commit() + documents_processed += docs_created + logger.info( + f"Created {docs_created} documents from repository: {repo_full_name}" + ) except Exception as repo_err: logger.error( @@ -397,11 +204,11 @@ async def index_github_repos( ) errors.append(f"Failed processing {repo_full_name}: {repo_err}") - # Final commit for any remaining documents not yet committed in batches - logger.info(f"Final commit: Total {documents_processed} GitHub files processed") + # Final commit await session.commit() logger.info( - f"Finished GitHub indexing for connector {connector_id}. Processed {documents_processed} files." + f"Finished GitHub indexing for connector {connector_id}. " + f"Created {documents_processed} documents." ) # Log success @@ -412,6 +219,7 @@ async def index_github_repos( "documents_processed": documents_processed, "errors_count": len(errors), "repo_count": len(repo_full_names_to_index), + "method": "gitingest", }, ) @@ -428,6 +236,7 @@ async def index_github_repos( ) errors.append(f"Database error: {db_err}") return documents_processed, "; ".join(errors) if errors else str(db_err) + except Exception as e: await session.rollback() await task_logger.log_task_failure( @@ -445,3 +254,173 @@ async def index_github_repos( error_message = "; ".join(errors) if errors else None return documents_processed, error_message + + +async def _process_repository_digest( + session: AsyncSession, + digest: RepositoryDigest, + search_space_id: int, + user_id: str, + task_logger: TaskLoggingService, + log_entry, +) -> int: + """ + Process a repository digest and create documents. + + For each repository, we create: + 1. One main document with the repository summary + 2. Chunks from the full digest content for granular search + + Args: + session: Database session + digest: The repository digest from gitingest + search_space_id: ID of the search space + user_id: ID of the user + task_logger: Task logging service + log_entry: Current log entry + + Returns: + Number of documents created + """ + repo_full_name = digest.repo_full_name + documents_created = 0 + + # Generate unique identifier based on repo name and content hash + # This allows updates when repo content changes + full_content = digest.full_digest + content_hash = generate_content_hash(full_content, search_space_id) + + # Use repo name as the unique identifier (one document per repo) + unique_identifier_hash = generate_unique_identifier_hash( + DocumentType.GITHUB_CONNECTOR, repo_full_name, search_space_id + ) + + # Check if document with this unique identifier already exists + existing_document = await check_document_by_unique_identifier( + session, unique_identifier_hash + ) + + if existing_document: + # Document exists - check if content has changed + if existing_document.content_hash == content_hash: + logger.info( + f"Repository {repo_full_name} unchanged. Skipping." + ) + return 0 + else: + logger.info( + f"Content changed for repository {repo_full_name}. Updating document." + ) + # Delete existing document to replace with new one + await session.delete(existing_document) + await session.flush() + + # Generate summary using LLM (ONE call per repository!) + user_llm = await get_user_long_context_llm(session, user_id, search_space_id) + + document_metadata = { + "repository": repo_full_name, + "document_type": "GitHub Repository", + "connector_type": "GitHub", + "ingestion_method": "gitingest", + "file_tree": digest.tree[:2000] if len(digest.tree) > 2000 else digest.tree, + "estimated_tokens": digest.estimated_tokens, + } + + if user_llm: + # Prepare content for summarization + # Include tree structure and truncated content if too large + summary_content = digest.full_digest + if len(summary_content) > MAX_DIGEST_CHARS: + # Truncate but keep the tree and beginning of content + summary_content = ( + f"# Repository: {repo_full_name}\n\n" + f"## File Structure\n\n{digest.tree}\n\n" + f"## File Contents (truncated)\n\n{digest.content[:MAX_DIGEST_CHARS - len(digest.tree) - 200]}..." + ) + + summary_text, summary_embedding = await generate_document_summary( + summary_content, user_llm, document_metadata + ) + else: + # Fallback to simple summary if no LLM configured + summary_text = ( + f"# GitHub Repository: {repo_full_name}\n\n" + f"## Summary\n{digest.summary}\n\n" + f"## File Structure\n{digest.tree[:3000]}" + ) + summary_embedding = config.embedding_model_instance.embed(summary_text) + + # Chunk the full digest content for granular search + try: + # Use the content (not the summary) for chunking + # This preserves file-level granularity in search + chunks_data = await create_document_chunks(digest.content) + except Exception as chunk_err: + logger.error( + f"Failed to chunk repository {repo_full_name}: {chunk_err}" + ) + # Fall back to a simpler chunking approach + chunks_data = await _simple_chunk_content(digest.content) + + # Create the document + doc_metadata = { + "repository_full_name": repo_full_name, + "url": f"https://github.com/{repo_full_name}", + "branch": digest.branch, + "ingestion_method": "gitingest", + "file_tree": digest.tree, + "gitingest_summary": digest.summary, + "estimated_tokens": digest.estimated_tokens, + "indexed_at": datetime.now(UTC).isoformat(), + } + + document = Document( + title=f"GitHub Repository: {repo_full_name}", + document_type=DocumentType.GITHUB_CONNECTOR, + document_metadata=doc_metadata, + content=summary_text, + content_hash=content_hash, + unique_identifier_hash=unique_identifier_hash, + embedding=summary_embedding, + search_space_id=search_space_id, + chunks=chunks_data, + updated_at=get_current_timestamp(), + ) + + session.add(document) + documents_created += 1 + + logger.info( + f"Created document for repository {repo_full_name} " + f"with {len(chunks_data)} chunks" + ) + + return documents_created + + +async def _simple_chunk_content(content: str, chunk_size: int = 4000) -> list: + """ + Simple fallback chunking when the regular chunker fails. + + Args: + content: The content to chunk + chunk_size: Size of each chunk in characters + + Returns: + List of chunk dictionaries with content and embedding + """ + from app.db import Chunk + + chunks = [] + for i in range(0, len(content), chunk_size): + chunk_text = content[i : i + chunk_size] + if chunk_text.strip(): + chunks.append( + Chunk( + content=chunk_text, + embedding=config.embedding_model_instance.embed(chunk_text), + ) + ) + + return chunks diff --git a/surfsense_backend/pyproject.toml b/surfsense_backend/pyproject.toml index 83a00b4e4..6197dbce7 100644 --- a/surfsense_backend/pyproject.toml +++ b/surfsense_backend/pyproject.toml @@ -60,6 +60,7 @@ dependencies = [ "mcp>=1.25.0", "starlette>=0.40.0,<0.51.0", "sse-starlette>=3.1.1,<3.1.2", + "gitingest>=0.3.1", ] [dependency-groups] diff --git a/surfsense_backend/uv.lock b/surfsense_backend/uv.lock index ef01847f8..44daab0d6 100644 --- a/surfsense_backend/uv.lock +++ b/surfsense_backend/uv.lock @@ -1945,6 +1945,25 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/61/ad/2394d4fb542574678b0ba342daf734d4d811768da3c2ee0c84d509dcb26c/github3.py-4.0.1-py3-none-any.whl", hash = "sha256:a89af7de25650612d1da2f0609622bcdeb07ee8a45a1c06b2d16a05e4234e753", size = 151800, upload-time = "2023-04-26T17:56:25.015Z" }, ] +[[package]] +name = "gitingest" +version = "0.3.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "click" }, + { name = "httpx" }, + { name = "loguru" }, + { name = "pathspec" }, + { name = "pydantic" }, + { name = "python-dotenv" }, + { name = "starlette" }, + { name = "tiktoken" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d6/fe/a915f0c32a3d7920206a677f73c185b3eadf4ec151fb05aedd52e64713f7/gitingest-0.3.1.tar.gz", hash = "sha256:4587cab873d4e08bdb16d612bb153c23e0ce59771a1d57a438239c5e39f05ebf", size = 70681, upload-time = "2025-07-31T13:56:19.845Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/00/15/f200ab2e73287e67d1dce6fbacf421552ae9fbafdc5f0cc8dd0d2fe4fc47/gitingest-0.3.1-py3-none-any.whl", hash = "sha256:8143a5e6a7140ede9f680e13d3931ac07c82ac9bd8bab9ad1fba017c8c1e8666", size = 68343, upload-time = "2025-07-31T13:56:17.729Z" }, +] + [[package]] name = "google-api-core" version = "2.25.1" @@ -4460,6 +4479,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/39/c2/646d2e93e0af70f4e5359d870a63584dacbc324b54d73e6b3267920ff117/pandas-2.3.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:bb3be958022198531eb7ec2008cfc78c5b1eed51af8600c6c5d9160d89d8d249", size = 13231847, upload-time = "2025-06-05T03:27:51.465Z" }, ] +[[package]] +name = "pathspec" +version = "1.0.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/4c/b2/bb8e495d5262bfec41ab5cb18f522f1012933347fb5d9e62452d446baca2/pathspec-1.0.3.tar.gz", hash = "sha256:bac5cf97ae2c2876e2d25ebb15078eb04d76e4b98921ee31c6f85ade8b59444d", size = 130841, upload-time = "2026-01-09T15:46:46.009Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/32/2b/121e912bd60eebd623f873fd090de0e84f322972ab25a7f9044c056804ed/pathspec-1.0.3-py3-none-any.whl", hash = "sha256:e80767021c1cc524aa3fb14bedda9c34406591343cc42797b386ce7b9354fb6c", size = 55021, upload-time = "2026-01-09T15:46:44.652Z" }, +] + [[package]] name = "pdf2image" version = "1.17.0" @@ -6484,6 +6512,7 @@ dependencies = [ { name = "firecrawl-py" }, { name = "flower" }, { name = "github3-py" }, + { name = "gitingest" }, { name = "google-api-python-client" }, { name = "google-auth-oauthlib" }, { name = "kokoro" }, @@ -6549,6 +6578,7 @@ requires-dist = [ { name = "firecrawl-py", specifier = ">=4.9.0" }, { name = "flower", specifier = ">=2.0.1" }, { name = "github3-py", specifier = "==4.0.1" }, + { name = "gitingest", specifier = ">=0.3.1" }, { name = "google-api-python-client", specifier = ">=2.156.0" }, { name = "google-auth-oauthlib", specifier = ">=1.2.1" }, { name = "kokoro", specifier = ">=0.9.4" }, diff --git a/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/github-connect-form.tsx b/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/github-connect-form.tsx index b2b371ed8..6ed36e180 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/github-connect-form.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/github-connect-form.tsx @@ -34,7 +34,6 @@ import { } from "@/components/ui/select"; import { Switch } from "@/components/ui/switch"; import { EnumConnectorName } from "@/contracts/enums/connector"; -import { DateRangeSelector } from "../../components/date-range-selector"; import { getConnectorBenefits } from "../connector-benefits"; import type { ConnectFormProps } from "../index"; @@ -44,12 +43,13 @@ const githubConnectorFormSchema = z.object({ }), github_pat: z .string() - .min(20, { - message: "GitHub Personal Access Token seems too short.", - }) - .refine((pat) => pat.startsWith("ghp_") || pat.startsWith("github_pat_"), { - message: "GitHub PAT should start with 'ghp_' or 'github_pat_'", - }), + .optional() + .refine( + (pat) => !pat || pat.startsWith("ghp_") || pat.startsWith("github_pat_"), + { + message: "GitHub PAT should start with 'ghp_' or 'github_pat_'", + } + ), repo_full_names: z.string().min(1, { message: "At least one repository is required.", }), @@ -59,8 +59,6 @@ type GithubConnectorFormValues = z.infer; export const GithubConnectForm: FC = ({ onSubmit, isSubmitting }) => { const isSubmittingRef = useRef(false); - const [startDate, setStartDate] = useState(undefined); - const [endDate, setEndDate] = useState(undefined); const [periodicEnabled, setPeriodicEnabled] = useState(false); const [frequencyMinutes, setFrequencyMinutes] = useState("1440"); const form = useForm({ @@ -94,7 +92,7 @@ export const GithubConnectForm: FC = ({ onSubmit, isSubmitting name: values.name, connector_type: EnumConnectorName.GITHUB_CONNECTOR, config: { - GITHUB_PAT: values.github_pat, + GITHUB_PAT: values.github_pat || null, // Optional - only for private repos repo_full_names: repoList, }, is_indexable: true, @@ -102,8 +100,9 @@ export const GithubConnectForm: FC = ({ onSubmit, isSubmitting periodic_indexing_enabled: periodicEnabled, indexing_frequency_minutes: periodicEnabled ? parseInt(frequencyMinutes, 10) : null, next_scheduled_at: null, - startDate, - endDate, + // GitHub indexes full repo snapshots - no date range needed + startDate: undefined, + endDate: undefined, periodicEnabled, frequencyMinutes, }); @@ -117,10 +116,10 @@ export const GithubConnectForm: FC = ({ onSubmit, isSubmitting
- Personal Access Token Required + Personal Access Token (Optional) - You'll need a GitHub Personal Access Token to use this connector. You can create one - from{" "} + A GitHub PAT is only required for private repositories. Public repos work without a + token. Create one from{" "} = ({ onSubmit, isSubmitting className="font-medium underline underline-offset-4" > GitHub Settings - + {" "} + if needed.
@@ -167,7 +167,10 @@ export const GithubConnectForm: FC = ({ onSubmit, isSubmitting name="github_pat" render={({ field }) => ( - GitHub Personal Access Token + + GitHub Personal Access Token{" "} + (optional) + = ({ onSubmit, isSubmitting /> - Your GitHub PAT will be encrypted and stored securely. It typically starts with - "ghp_" or "github_pat_". + Only required for private repositories. Leave empty if indexing public repos + only. @@ -225,15 +228,9 @@ export const GithubConnectForm: FC = ({ onSubmit, isSubmitting {/* Indexing Configuration */}
-

Indexing Configuration

+

Sync Configuration

- {/* Date Range Selector */} - + {/* Note: No date range for GitHub - it indexes full repo snapshots */} {/* Periodic Sync Config */}
diff --git a/surfsense_web/components/new-chat/source-detail-panel.tsx b/surfsense_web/components/new-chat/source-detail-panel.tsx index df2809fdb..bf5bd4087 100644 --- a/surfsense_web/components/new-chat/source-detail-panel.tsx +++ b/surfsense_web/components/new-chat/source-detail-panel.tsx @@ -490,8 +490,8 @@ export function SourceDetailPanel({ > {idx + 1} {isCited && ( - - + + )} From 35888144ebba4111e2db43382db99790bcd8adba Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Tue, 20 Jan 2026 23:24:33 +0530 Subject: [PATCH 3/7] refactor: Update GitHub connector to use gitingest CLI - Refactored GitHubConnector to utilize gitingest CLI via subprocess, improving performance and avoiding async issues with Celery. - Updated ingestion method to handle repository digests more efficiently, including error handling for subprocess execution. - Adjusted GitHub indexer to call the new synchronous ingestion method. - Clarified documentation regarding the optional nature of the Personal Access Token for public repositories. --- .../app/connectors/github_connector.py | 325 +++++++----------- .../connector_indexers/github_indexer.py | 9 +- surfsense_backend/app/utils/validators.py | 5 +- .../config/connector-status-config.json | 5 - .../components/github-connect-form.tsx | 98 +++--- .../components/github-config.tsx | 25 +- .../views/connector-edit-view.tsx | 5 +- .../views/indexing-configuration-view.tsx | 5 +- 8 files changed, 221 insertions(+), 256 deletions(-) diff --git a/surfsense_backend/app/connectors/github_connector.py b/surfsense_backend/app/connectors/github_connector.py index 90fd93fb9..6f04ccdba 100644 --- a/surfsense_backend/app/connectors/github_connector.py +++ b/surfsense_backend/app/connectors/github_connector.py @@ -1,130 +1,21 @@ """ -GitHub connector using gitingest for efficient repository digestion. +GitHub connector using gitingest CLI for efficient repository digestion. -This connector replaces the previous file-by-file approach with a single -digest generation per repository, dramatically reducing LLM API calls. +This connector uses subprocess to call gitingest CLI, completely isolating +it from any Python event loop/async complexity that can cause hangs in Celery. """ import logging +import os +import subprocess +import tempfile from dataclasses import dataclass -from gitingest import ingest_async - logger = logging.getLogger(__name__) # Maximum file size in bytes (5MB) MAX_FILE_SIZE = 5 * 1024 * 1024 -# Default patterns to exclude (recommended approach for comprehensive analysis) -# Using only exclude_patterns ensures we don't miss any relevant file types -DEFAULT_EXCLUDE_PATTERNS = [ - # Dependencies - "node_modules/*", - "vendor/*", - "bower_components/*", - ".pnpm/*", - # Build artifacts / Caches - "build/*", - "dist/*", - "target/*", - "out/*", - "__pycache__/*", - "*.pyc", - ".cache/*", - ".next/*", - ".nuxt/*", - # Virtual environments - "venv/*", - ".venv/*", - "env/*", - ".env/*", - # IDE/Editor config - ".vscode/*", - ".idea/*", - ".project", - ".settings/*", - "*.swp", - "*.swo", - # Version control - ".git/*", - ".svn/*", - ".hg/*", - # Temporary / Logs - "tmp/*", - "temp/*", - "logs/*", - "*.log", - # Lock files (usually not needed for understanding code) - "package-lock.json", - "pnpm-lock.yaml", - "yarn.lock", - "uv.lock", - "Gemfile.lock", - "poetry.lock", - "Cargo.lock", - "composer.lock", - # Binary/media files - "*.png", - "*.jpg", - "*.jpeg", - "*.gif", - "*.ico", - "*.svg", - "*.webp", - "*.bmp", - "*.tiff", - "*.woff", - "*.woff2", - "*.ttf", - "*.eot", - "*.otf", - "*.mp3", - "*.mp4", - "*.wav", - "*.ogg", - "*.webm", - "*.avi", - "*.mov", - "*.pdf", - "*.doc", - "*.docx", - "*.xls", - "*.xlsx", - "*.ppt", - "*.pptx", - "*.zip", - "*.tar", - "*.tar.gz", - "*.tgz", - "*.rar", - "*.7z", - "*.exe", - "*.dll", - "*.so", - "*.dylib", - "*.bin", - "*.obj", - "*.o", - "*.a", - "*.lib", - # Minified files - "*.min.js", - "*.min.css", - # Source maps - "*.map", - # Database files - "*.db", - "*.sqlite", - "*.sqlite3", - # Coverage reports - "coverage/*", - ".coverage", - "htmlcov/*", - ".nyc_output/*", - # Test snapshots (can be large) - "__snapshots__/*", -] - @dataclass class RepositoryDigest: @@ -149,21 +40,19 @@ class RepositoryDigest: class GitHubConnector: """ - Connector for ingesting GitHub repositories using gitingest. + Connector for ingesting GitHub repositories using gitingest CLI. - This connector efficiently processes entire repositories into a single - digest, reducing the number of API calls and LLM invocations compared - to file-by-file processing. + Uses subprocess to run gitingest, which avoids all async/event loop + issues that can occur when mixing gitingest with Celery workers. """ def __init__(self, token: str | None = None): """ - Initializes the GitHub connector. + Initialize the GitHub connector. Args: token: Optional GitHub Personal Access Token (PAT). Only required for private repositories. - Public repositories can be ingested without a token. """ self.token = token if token and token.strip() else None if self.token: @@ -171,72 +60,104 @@ class GitHubConnector: else: logger.info("GitHub connector initialized without token (public repos only).") - async def ingest_repository( + def ingest_repository( self, repo_full_name: str, branch: str | None = None, - include_patterns: list[str] | None = None, - exclude_patterns: list[str] | None = None, max_file_size: int = MAX_FILE_SIZE, ) -> RepositoryDigest | None: """ - Ingest an entire repository and return a digest. + Ingest a repository using gitingest CLI via subprocess. + + This approach completely isolates gitingest from Python's event loop, + avoiding any async/Celery conflicts. Args: repo_full_name: The full name of the repository (e.g., 'owner/repo'). branch: Optional specific branch or tag to ingest. - include_patterns: Optional list of glob patterns for files to include. - If None, includes all files (recommended). - exclude_patterns: Optional list of glob patterns for files to exclude. - If None, uses DEFAULT_EXCLUDE_PATTERNS. - max_file_size: Maximum file size in bytes to include (default 5MB). + max_file_size: Maximum file size in bytes to include. Returns: - RepositoryDigest containing the summary, tree structure, and content, - or None if ingestion fails. + RepositoryDigest or None if ingestion fails. """ repo_url = f"https://github.com/{repo_full_name}" - # Use only exclude_patterns by default (recommended for comprehensive analysis) - # This ensures we don't miss any relevant file types - exclude_pats = exclude_patterns if exclude_patterns is not None else DEFAULT_EXCLUDE_PATTERNS - - logger.info(f"Starting gitingest for repository: {repo_full_name}") + logger.info(f"Starting gitingest CLI for repository: {repo_full_name}") try: - # Build kwargs dynamically - ingest_kwargs = { - "max_file_size": max_file_size, - "exclude_patterns": exclude_pats, - "include_gitignored": False, - "include_submodules": False, - } + # Create a temporary file for output + with tempfile.NamedTemporaryFile( + mode="w", suffix=".txt", delete=False + ) as tmp_file: + output_path = tmp_file.name - # Only add token if provided (required only for private repos) - if self.token: - ingest_kwargs["token"] = self.token + # Build the gitingest CLI command + cmd = [ + "gitingest", + repo_url, + "--output", output_path, + "--max-size", str(max_file_size), + # Common exclude patterns + "-e", "node_modules/*", + "-e", "vendor/*", + "-e", ".git/*", + "-e", "__pycache__/*", + "-e", "dist/*", + "-e", "build/*", + "-e", "*.lock", + "-e", "package-lock.json", + ] - # Only add branch if specified + # Add branch if specified if branch: - ingest_kwargs["branch"] = branch + cmd.extend(["--branch", branch]) - # Only add include_patterns if explicitly provided - if include_patterns is not None: - ingest_kwargs["include_patterns"] = include_patterns + # Set up environment with token if provided + env = os.environ.copy() + if self.token: + env["GITHUB_TOKEN"] = self.token - summary, tree, content = await ingest_async(repo_url, **ingest_kwargs) + logger.info(f"Running gitingest CLI: {' '.join(cmd[:5])}...") - if not content or not content.strip(): - logger.warning( - f"No content retrieved from repository: {repo_full_name}" - ) + # Run gitingest as subprocess with timeout + result = subprocess.run( + cmd, + env=env, + capture_output=True, + text=True, + timeout=900, # 5 minute timeout + ) + + if result.returncode != 0: + logger.error(f"gitingest failed: {result.stderr}") + # Clean up temp file + if os.path.exists(output_path): + os.unlink(output_path) return None + # Read the output file + if not os.path.exists(output_path): + logger.error("gitingest did not create output file") + return None + + with open(output_path, encoding="utf-8") as f: + full_content = f.read() + + # Clean up temp file + os.unlink(output_path) + + if not full_content or not full_content.strip(): + logger.warning(f"No content retrieved from repository: {repo_full_name}") + return None + + # Parse the gitingest output + # The output format is: summary + tree + content + # We'll extract what we can digest = RepositoryDigest( repo_full_name=repo_full_name, - summary=summary, - tree=tree, - content=content, + summary=f"Repository: {repo_full_name}", + tree="", # gitingest CLI combines everything into one file + content=full_content, branch=branch, ) @@ -246,50 +167,70 @@ class GitHubConnector: ) return digest + except subprocess.TimeoutExpired: + logger.error(f"gitingest timed out for repository: {repo_full_name}") + return None + except FileNotFoundError: + logger.error( + "gitingest CLI not found. Falling back to Python library." + ) + # Fall back to Python library + return self._ingest_with_python_library(repo_full_name, branch, max_file_size) except Exception as e: logger.error(f"Failed to ingest repository {repo_full_name}: {e}") return None - async def ingest_repositories( + def _ingest_with_python_library( self, - repo_full_names: list[str], + repo_full_name: str, branch: str | None = None, - include_patterns: list[str] | None = None, - exclude_patterns: list[str] | None = None, max_file_size: int = MAX_FILE_SIZE, - ) -> list[RepositoryDigest]: + ) -> RepositoryDigest | None: """ - Ingest multiple repositories and return their digests. - - Args: - repo_full_names: List of repository full names (e.g., ['owner/repo1', 'owner/repo2']). - branch: Optional specific branch or tag to ingest (applied to all repos). - include_patterns: Optional list of glob patterns for files to include. - exclude_patterns: Optional list of glob patterns for files to exclude. - max_file_size: Maximum file size in bytes to include. - - Returns: - List of RepositoryDigest objects for successfully ingested repositories. + Fallback: Ingest using the Python library directly. """ - digests = [] + from gitingest import ingest - for repo_full_name in repo_full_names: - if not repo_full_name or not isinstance(repo_full_name, str): - logger.warning(f"Skipping invalid repository entry: {repo_full_name}") - continue + repo_url = f"https://github.com/{repo_full_name}" - digest = await self.ingest_repository( + logger.info(f"Using Python gitingest library for: {repo_full_name}") + + try: + kwargs = { + "max_file_size": max_file_size, + "exclude_patterns": [ + "node_modules/*", + "vendor/*", + ".git/*", + "__pycache__/*", + "dist/*", + "build/*", + "*.lock", + "package-lock.json", + ], + "include_gitignored": False, + "include_submodules": False, + } + + if self.token: + kwargs["token"] = self.token + if branch: + kwargs["branch"] = branch + + summary, tree, content = ingest(repo_url, **kwargs) + + if not content or not content.strip(): + logger.warning(f"No content from {repo_full_name}") + return None + + return RepositoryDigest( repo_full_name=repo_full_name, + summary=summary, + tree=tree, + content=content, branch=branch, - include_patterns=include_patterns, - exclude_patterns=exclude_patterns, - max_file_size=max_file_size, ) - if digest: - digests.append(digest) - - logger.info( - f"Ingested {len(digests)} out of {len(repo_full_names)} repositories." - ) - return digests + except Exception as e: + logger.error(f"Python library failed for {repo_full_name}: {e}") + return None diff --git a/surfsense_backend/app/tasks/connector_indexers/github_indexer.py b/surfsense_backend/app/tasks/connector_indexers/github_indexer.py index f1ccabdef..f16ee0156 100644 --- a/surfsense_backend/app/tasks/connector_indexers/github_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/github_indexer.py @@ -173,8 +173,13 @@ async def index_github_repos( logger.info(f"Ingesting repository: {repo_full_name}") try: - # Ingest the entire repository - digest = await github_client.ingest_repository(repo_full_name) + # Run gitingest via subprocess (isolated from event loop) + # Using to_thread to not block the async database operations + import asyncio + + digest = await asyncio.to_thread( + github_client.ingest_repository, repo_full_name + ) if not digest: logger.warning( diff --git a/surfsense_backend/app/utils/validators.py b/surfsense_backend/app/utils/validators.py index 54e681518..6a87679ec 100644 --- a/surfsense_backend/app/utils/validators.py +++ b/surfsense_backend/app/utils/validators.py @@ -530,7 +530,10 @@ def validate_connector_config( # "validators": {}, # }, "GITHUB_CONNECTOR": { - "required": ["GITHUB_PAT", "repo_full_names"], + # GITHUB_PAT is optional - only required for private repositories + # Public repositories can be indexed without authentication + "required": ["repo_full_names"], + "optional": ["GITHUB_PAT"], # Optional - only needed for private repos "validators": { "repo_full_names": lambda: validate_list_field( "repo_full_names", "repo_full_names" diff --git a/surfsense_web/components/assistant-ui/connector-popup/config/connector-status-config.json b/surfsense_web/components/assistant-ui/connector-popup/config/connector-status-config.json index 6ed792b8e..b729c3f8b 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/config/connector-status-config.json +++ b/surfsense_web/components/assistant-ui/connector-popup/config/connector-status-config.json @@ -24,11 +24,6 @@ "enabled": true, "status": "warning", "statusMessage": "Some requests may be blocked if not using Firecrawl." - }, - "GITHUB_CONNECTOR": { - "enabled": false, - "status": "maintenance", - "statusMessage": "Rework in progress." } }, "globalSettings": { diff --git a/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/github-connect-form.tsx b/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/github-connect-form.tsx index 6ed36e180..72d5811d3 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/github-connect-form.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/github-connect-form.tsx @@ -96,6 +96,7 @@ export const GithubConnectForm: FC = ({ onSubmit, isSubmitting repo_full_names: repoList, }, is_indexable: true, + is_active: true, last_indexed_at: null, periodic_indexing_enabled: periodicEnabled, indexing_frequency_minutes: periodicEnabled ? parseInt(frequencyMinutes, 10) : null, @@ -119,16 +120,16 @@ export const GithubConnectForm: FC = ({ onSubmit, isSubmitting Personal Access Token (Optional) A GitHub PAT is only required for private repositories. Public repos work without a - token. Create one from{" "} + token. {" "} - GitHub Settings + Get your token {" "} - if needed. + .
@@ -324,20 +325,21 @@ export const GithubConnectForm: FC = ({ onSubmit, isSubmitting

How it works

- The GitHub connector uses a Personal Access Token (PAT) to authenticate with the - GitHub API. You provide a comma-separated list of repository full names (e.g., - "owner/repo1, owner/repo2") that you want to index. The connector indexes relevant - files (code, markdown, text) from the selected repositories. + The GitHub connector ingests entire repositories in one pass using gitingest, + making it highly efficient. Provide a comma-separated list of repository full + names (e.g., "owner/repo1, owner/repo2") to index.

  • - The connector indexes files based on common code and documentation extensions. + Public repos: No authentication required.
  • -
  • Large files (over 1MB) are skipped during indexing.
  • -
  • Only specified repositories are indexed.
  • - Indexing runs periodically (check connector settings for frequency) to keep - content up-to-date. + Private repos: Requires a GitHub Personal Access Token (PAT). +
  • +
  • Indexes code, documentation, and configuration files.
  • +
  • Large files (over 5MB) and binary files are automatically skipped.
  • +
  • + Periodic sync detects changes and only re-indexes when content has changed.
@@ -348,19 +350,23 @@ export const GithubConnectForm: FC = ({ onSubmit, isSubmitting - Personal Access Token Required + Personal Access Token (Optional) - You'll need a GitHub PAT with the appropriate scopes (e.g., 'repo') to fetch - repositories. The PAT will be stored securely to enable indexing. + A GitHub PAT is only needed for private repositories. Public + repos can be indexed without authentication. If you need to access private + repos, create a PAT with the 'repo' scope.

- Step 1: Generate GitHub PAT + For Private Repositories Only: Generate GitHub PAT

+

+ Skip this step if you're only indexing public repositories. +

  1. Go to your GitHub{" "} @@ -375,46 +381,36 @@ export const GithubConnectForm: FC = ({ onSubmit, isSubmitting
  2. Click on Personal access tokens, then choose{" "} - Tokens (classic) or Fine-grained tokens{" "} - (recommended if available). + Tokens (classic) or Fine-grained tokens.
  3. - Click Generate new token (and choose the appropriate type). + Click Generate new token.
  4. Give your token a descriptive name (e.g., "SurfSense Connector").
  5. -
  6. Set an expiration date for the token (recommended for security).
  7. - Under Select scopes (for classic tokens) or{" "} - Repository access (for fine-grained), grant the necessary - permissions. At minimum, the `repo` scope (or equivalent - read access to repositories for fine-grained tokens) is required to read - repository content. + Grant the `repo` scope (for classic tokens) or read access + to the specific repositories you want to index (for fine-grained tokens).
  8. - Click Generate token. -
  9. -
  10. - Important: Copy your new PAT immediately. You won't be able - to see it again after leaving the page. + Click Generate token and copy it immediately.

- Step 2: Specify repositories + Specify Repositories

Enter a comma-separated list of repository full names in the format - "owner/repo1, owner/repo2". The connector will index files from only the - specified repositories. + "owner/repo1, owner/repo2". For example: "facebook/react, vercel/next.js".

- Repository Access + Public vs Private - Make sure your PAT has access to all repositories you want to index. Private - repositories require appropriate permissions. + Public repositories work without a PAT. For private repositories, ensure + your PAT has access to the repos you want to index.
@@ -424,36 +420,38 @@ export const GithubConnectForm: FC = ({ onSubmit, isSubmitting
-

Indexing

+

Quick Start

  1. - Navigate to the Connector Dashboard and select the GitHub{" "} - Connector. + Enter the Repository Names you want to index (e.g., + "facebook/react, vercel/next.js").
  2. - Enter your GitHub Personal Access Token in the form field. + (Optional) Add a GitHub PAT if indexing private repositories.
  3. - Enter a comma-separated list of Repository Names (e.g., - "owner/repo1, owner/repo2"). + Click Connect GitHub to start indexing.
  4. - Click Connect to establish the connection. + Enable Periodic Sync to automatically detect and index + changes.
  5. -
  6. Once connected, your GitHub repositories will be indexed automatically.
What Gets Indexed -

The GitHub connector indexes the following data:

+

The GitHub connector indexes:

    -
  • Code files from selected repositories
  • -
  • README files and Markdown documentation
  • -
  • Common text-based file formats
  • -
  • Repository metadata and structure
  • +
  • All code files (Python, JavaScript, TypeScript, etc.)
  • +
  • Documentation (README, Markdown, text files)
  • +
  • Configuration files (JSON, YAML, TOML, etc.)
  • +
  • Repository structure and file tree
+

+ Binary files, images, and build artifacts are automatically excluded. +

diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/github-config.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/github-config.tsx index 07c7bdfbc..d5169b49d 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/github-config.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/github-config.tsx @@ -1,8 +1,9 @@ "use client"; -import { KeyRound } from "lucide-react"; +import { Info, KeyRound } from "lucide-react"; import type { FC } from "react"; import { useEffect, useState } from "react"; +import { Alert, AlertDescription, AlertTitle } from "@/components/ui/alert"; import { Badge } from "@/components/ui/badge"; import { Input } from "@/components/ui/input"; import { Label } from "@/components/ui/label"; @@ -79,6 +80,26 @@ export const GithubConfig: FC = ({ return (
+ + +
+ Personal Access Token (Optional) + + A GitHub PAT is only required for private repositories. Public repos work without a + token. Create one from{" "} + + GitHub Settings + {" "} + if needed. + +
+
+ {/* Connector Name */}
@@ -105,7 +126,7 @@ export const GithubConfig: FC = ({
= ({ {/* Date range selector and periodic sync - only shown for indexable connectors */} {connector.is_indexable && ( <> - {/* Date range selector - not shown for Google Drive (uses folder selection) or Webcrawler (uses config) */} + {/* Date range selector - not shown for Google Drive, Webcrawler, or GitHub (indexes full repo snapshots) */} {connector.connector_type !== "GOOGLE_DRIVE_CONNECTOR" && - connector.connector_type !== "WEBCRAWLER_CONNECTOR" && ( + connector.connector_type !== "WEBCRAWLER_CONNECTOR" && + connector.connector_type !== "GITHUB_CONNECTOR" && ( = ({ {/* Date range selector and periodic sync - only shown for indexable connectors */} {connector?.is_indexable && ( <> - {/* Date range selector - not shown for Google Drive (uses folder selection) or Webcrawler (uses config) */} + {/* Date range selector - not shown for Google Drive, Webcrawler, or GitHub (indexes full repo snapshots) */} {config.connectorType !== "GOOGLE_DRIVE_CONNECTOR" && - config.connectorType !== "WEBCRAWLER_CONNECTOR" && ( + config.connectorType !== "WEBCRAWLER_CONNECTOR" && + config.connectorType !== "GITHUB_CONNECTOR" && ( Date: Wed, 21 Jan 2026 00:19:17 +0530 Subject: [PATCH 4/7] fix: Clean up GitHub connector UI and documentation - Removed unnecessary period from the GitHub connect form alert description. - Moved helper functions for string and array conversion outside the component to avoid useEffect dependency issues. - Updated the GitHub connector documentation to provide detailed indexing information and troubleshooting tips for users. --- .../components/github-connect-form.tsx | 1 - .../components/github-config.tsx | 67 +++++++--------- .../content/docs/connectors/github.mdx | 79 ++++++++++++++++++- 3 files changed, 107 insertions(+), 40 deletions(-) diff --git a/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/github-connect-form.tsx b/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/github-connect-form.tsx index 72d5811d3..4fb9e93bf 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/github-connect-form.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/github-connect-form.tsx @@ -129,7 +129,6 @@ export const GithubConnectForm: FC = ({ onSubmit, isSubmitting > Get your token {" "} - .
diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/github-config.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/github-config.tsx index d5169b49d..2c28758b8 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/github-config.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/github-config.tsx @@ -1,9 +1,8 @@ "use client"; -import { Info, KeyRound } from "lucide-react"; +import { KeyRound } from "lucide-react"; import type { FC } from "react"; -import { useEffect, useState } from "react"; -import { Alert, AlertDescription, AlertTitle } from "@/components/ui/alert"; +import { useEffect, useRef, useState } from "react"; import { Badge } from "@/components/ui/badge"; import { Input } from "@/components/ui/input"; import { Label } from "@/components/ui/label"; @@ -13,25 +12,29 @@ export interface GithubConfigProps extends ConnectorConfigProps { onNameChange?: (name: string) => void; } +// Helper functions moved outside component to avoid useEffect dependency issues +const stringToArray = (arr: string[] | string | undefined): string[] => { + if (Array.isArray(arr)) return arr; + if (typeof arr === "string") { + return arr + .split(",") + .map((item) => item.trim()) + .filter((item) => item.length > 0); + } + return []; +}; + +const arrayToString = (arr: string[]): string => { + return arr.join(", "); +}; + export const GithubConfig: FC = ({ connector, onConfigChange, onNameChange, }) => { - const stringToArray = (arr: string[] | string | undefined): string[] => { - if (Array.isArray(arr)) return arr; - if (typeof arr === "string") { - return arr - .split(",") - .map((item) => item.trim()) - .filter((item) => item.length > 0); - } - return []; - }; - - const arrayToString = (arr: string[]): string => { - return arr.join(", "); - }; + // Track internal changes to prevent useEffect from overwriting user input + const isInternalChange = useRef(false); const [githubPat, setGithubPat] = useState( (connector.config?.GITHUB_PAT as string) || "" @@ -41,8 +44,13 @@ export const GithubConfig: FC = ({ ); const [name, setName] = useState(connector.name || ""); - // Update values when connector changes + // Update values when connector changes externally (not from our own input) useEffect(() => { + // Skip if this is our own internal change + if (isInternalChange.current) { + isInternalChange.current = false; + return; + } const pat = (connector.config?.GITHUB_PAT as string) || ""; const repos = arrayToString(stringToArray(connector.config?.repo_full_names)); setGithubPat(pat); @@ -51,6 +59,7 @@ export const GithubConfig: FC = ({ }, [connector.config, connector.name]); const handleGithubPatChange = (value: string) => { + isInternalChange.current = true; setGithubPat(value); if (onConfigChange) { onConfigChange({ @@ -61,6 +70,7 @@ export const GithubConfig: FC = ({ }; const handleRepoFullNamesChange = (value: string) => { + isInternalChange.current = true; setRepoFullNames(value); const repoList = stringToArray(value); if (onConfigChange) { @@ -72,6 +82,7 @@ export const GithubConfig: FC = ({ }; const handleNameChange = (value: string) => { + isInternalChange.current = true; setName(value); if (onNameChange) { onNameChange(value); @@ -80,26 +91,6 @@ export const GithubConfig: FC = ({ return (
- - -
- Personal Access Token (Optional) - - A GitHub PAT is only required for private repositories. Public repos work without a - token. Create one from{" "} - - GitHub Settings - {" "} - if needed. - -
-
- {/* Connector Name */}
diff --git a/surfsense_web/content/docs/connectors/github.mdx b/surfsense_web/content/docs/connectors/github.mdx index bb2faca81..6a4574ec4 100644 --- a/surfsense_web/content/docs/connectors/github.mdx +++ b/surfsense_web/content/docs/connectors/github.mdx @@ -3,4 +3,81 @@ title: GitHub description: Connect your GitHub repositories to SurfSense --- -# Documentation in progress \ No newline at end of file +# GitHub Connector + +Connect your GitHub repositories to SurfSense for code search and AI-powered insights. The connector uses [gitingest](https://gitingest.com) to efficiently index entire codebases. + +## What Gets Indexed + +| Content Type | Examples | +|--------------|----------| +| Code Files | Python, JavaScript, TypeScript, Go, Rust, Java, etc. | +| Documentation | README files, Markdown documents, text files | +| Configuration | JSON, YAML, TOML, .env examples, Dockerfiles | + +> ⚠️ Binary files and files larger than 5MB are automatically excluded. + +--- + +## Quick Start (Public Repos) + +1. Navigate to **Connectors** → **Add Connector** → **GitHub** +2. Enter repository names: `owner/repo` (e.g., `facebook/react, vercel/next.js`) +3. Click **Connect GitHub** + +No authentication required for public repositories. + +--- + +## Private Repositories + +For private repos, you need a GitHub Personal Access Token (PAT). + +### Generate a PAT + +1. Go to [GitHub's token creation page](https://github.com/settings/tokens/new?description=surfsense&scopes=repo) (pre-filled with `repo` scope) +2. Set an expiration +3. Click **Generate token** and copy it + +> ⚠️ The token starts with `ghp_`. Store it securely. + +--- + +## Connector Configuration + +| Field | Description | Required | +|-------|-------------|----------| +| **Connector Name** | A friendly name to identify this connector | Yes | +| **GitHub Personal Access Token** | Your PAT (only for private repos) | No | +| **Repository Names** | Comma-separated list: `owner/repo1, owner/repo2` | Yes | + +--- + +## Periodic Sync + +Enable periodic sync to automatically re-index repositories when content changes: + +| Frequency | Use Case | +|-----------|----------| +| Every 5 minutes | Active development | +| Every 15 minutes | Frequent commits | +| Every hour | Regular workflow | +| Every 6 hours | Less active repos | +| Daily | Reference repositories | +| Weekly | Stable codebases | + +--- + +## Troubleshooting + +**Repository not found** +- Verify format is `owner/repo` +- For private repos, ensure PAT has access + +**Authentication failed** +- Check PAT is valid and not expired +- Token should start with `ghp_` or `github_pat_` + +**Rate limit exceeded** +- Use a PAT for higher limits (5,000/hour vs 60 unauthenticated) +- Reduce sync frequency From 8bd1ba025161d8d6b97a43e87dd92fbdbc8b595d Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Wed, 21 Jan 2026 00:21:21 +0530 Subject: [PATCH 5/7] refactor: Simplify GitHub connect form by removing unused components and documentation sections --- .../components/github-connect-form.tsx | 157 ------------------ 1 file changed, 157 deletions(-) diff --git a/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/github-connect-form.tsx b/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/github-connect-form.tsx index 4fb9e93bf..833acf594 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/github-connect-form.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/github-connect-form.tsx @@ -8,9 +8,6 @@ import { useForm } from "react-hook-form"; import * as z from "zod"; import { Accordion, - AccordionContent, - AccordionItem, - AccordionTrigger, } from "@/components/ui/accordion"; import { Alert, AlertDescription, AlertTitle } from "@/components/ui/alert"; import { Badge } from "@/components/ui/badge"; @@ -34,7 +31,6 @@ import { } from "@/components/ui/select"; import { Switch } from "@/components/ui/switch"; import { EnumConnectorName } from "@/contracts/enums/connector"; -import { getConnectorBenefits } from "../connector-benefits"; import type { ConnectFormProps } from "../index"; const githubConnectorFormSchema = z.object({ @@ -298,165 +294,12 @@ export const GithubConnectForm: FC = ({ onSubmit, isSubmitting
- {/* What you get section */} - {getConnectorBenefits(EnumConnectorName.GITHUB_CONNECTOR) && ( -
-

What you get with GitHub integration:

-
    - {getConnectorBenefits(EnumConnectorName.GITHUB_CONNECTOR)?.map((benefit) => ( -
  • {benefit}
  • - ))} -
-
- )} - {/* Documentation Section */} - - - Documentation - - -
-

How it works

-

- The GitHub connector ingests entire repositories in one pass using gitingest, - making it highly efficient. Provide a comma-separated list of repository full - names (e.g., "owner/repo1, owner/repo2") to index. -

-
    -
  • - Public repos: No authentication required. -
  • -
  • - Private repos: Requires a GitHub Personal Access Token (PAT). -
  • -
  • Indexes code, documentation, and configuration files.
  • -
  • Large files (over 5MB) and binary files are automatically skipped.
  • -
  • - Periodic sync detects changes and only re-indexes when content has changed. -
  • -
-
- -
-
-

Authorization

- - - - Personal Access Token (Optional) - - - A GitHub PAT is only needed for private repositories. Public - repos can be indexed without authentication. If you need to access private - repos, create a PAT with the 'repo' scope. - - - -
-
-

- For Private Repositories Only: Generate GitHub PAT -

-

- Skip this step if you're only indexing public repositories. -

-
    -
  1. - Go to your GitHub{" "} - - Developer settings - -
  2. -
  3. - Click on Personal access tokens, then choose{" "} - Tokens (classic) or Fine-grained tokens. -
  4. -
  5. - Click Generate new token. -
  6. -
  7. Give your token a descriptive name (e.g., "SurfSense Connector").
  8. -
  9. - Grant the `repo` scope (for classic tokens) or read access - to the specific repositories you want to index (for fine-grained tokens). -
  10. -
  11. - Click Generate token and copy it immediately. -
  12. -
-
- -
-

- Specify Repositories -

-

- Enter a comma-separated list of repository full names in the format - "owner/repo1, owner/repo2". For example: "facebook/react, vercel/next.js". -

- - - Public vs Private - - Public repositories work without a PAT. For private repositories, ensure - your PAT has access to the repos you want to index. - - -
-
-
-
- -
-
-

Quick Start

-
    -
  1. - Enter the Repository Names you want to index (e.g., - "facebook/react, vercel/next.js"). -
  2. -
  3. - (Optional) Add a GitHub PAT if indexing private repositories. -
  4. -
  5. - Click Connect GitHub to start indexing. -
  6. -
  7. - Enable Periodic Sync to automatically detect and index - changes. -
  8. -
- - - - What Gets Indexed - -

The GitHub connector indexes:

-
    -
  • All code files (Python, JavaScript, TypeScript, etc.)
  • -
  • Documentation (README, Markdown, text files)
  • -
  • Configuration files (JSON, YAML, TOML, etc.)
  • -
  • Repository structure and file tree
  • -
-

- Binary files, images, and build artifacts are automatically excluded. -

-
-
-
-
-
-
); From 5a95a6b543c75035181fd5b89e71f6fb5f605178 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Wed, 21 Jan 2026 01:21:25 +0530 Subject: [PATCH 6/7] feat: Add documentation link to GitHub connect form - Replaced the removed Accordion component with a direct link to the GitHub connector documentation. - Enhanced user experience by providing easy access to relevant documentation. --- .../components/github-connect-form.tsx | 25 +++++++++++-------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/github-connect-form.tsx b/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/github-connect-form.tsx index 833acf594..f83ae0788 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/github-connect-form.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/github-connect-form.tsx @@ -1,14 +1,12 @@ "use client"; import { zodResolver } from "@hookform/resolvers/zod"; -import { Info } from "lucide-react"; +import { ExternalLink, Info } from "lucide-react"; +import Link from "next/link"; import type { FC } from "react"; import { useRef, useState } from "react"; import { useForm } from "react-hook-form"; import * as z from "zod"; -import { - Accordion, -} from "@/components/ui/accordion"; import { Alert, AlertDescription, AlertTitle } from "@/components/ui/alert"; import { Badge } from "@/components/ui/badge"; import { @@ -294,13 +292,18 @@ export const GithubConnectForm: FC = ({ onSubmit, isSubmitting
- {/* Documentation Section */} - - + {/* Documentation Link */} +
+ + View GitHub Connector Documentation + + +
); }; From d35d89f3a934d326ccf247219881e24c9ba4600f Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Wed, 21 Jan 2026 03:28:34 +0530 Subject: [PATCH 7/7] chore: ran linting --- .../app/dashboard/[search_space_id]/team/page.tsx | 10 ++++------ .../components/github-connect-form.tsx | 14 ++++++-------- 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/surfsense_web/app/dashboard/[search_space_id]/team/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/team/page.tsx index f00982555..6701342de 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/team/page.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/team/page.tsx @@ -778,8 +778,7 @@ function RolesTab({ role.name === "Owner" && "text-amber-600", role.name === "Editor" && "text-blue-600", role.name === "Viewer" && "text-gray-600", - !["Owner", "Editor", "Viewer"].includes(role.name) && - "text-primary" + !["Owner", "Editor", "Viewer"].includes(role.name) && "text-primary" )} />
@@ -1488,7 +1487,8 @@ function CreateRoleDialog({

- Use presets to quickly apply Editor (create/read/update) or Viewer (read-only) permissions + Use presets to quickly apply Editor (create/read/update) or Viewer (read-only) + permissions

@@ -1500,9 +1500,7 @@ function CreateRoleDialog({ return (
-