From f200502ffc4ebcf03f9ffddc92ef44a8a8a45d49 Mon Sep 17 00:00:00 2001
From: "DESKTOP-RTLN3BA\\$punk" <vermarohanfinal@gmail.com>
Date: Tue, 20 Jan 2026 03:11:49 -0800
Subject: [PATCH 1/7] refactor: bulk updating Admin to Editor roles

- Consolidated the migration process for search space memberships and invites from Admin to Editor roles using bulk SQL updates.
- Deleted all system Admin roles with a single bulk statement.
- Updated permissions for Editor and Viewer roles across all search spaces in a more efficient manner.
---
 .../versions/72_simplify_rbac_roles.py        | 89 +++++++------------
 1 file changed, 34 insertions(+), 55 deletions(-)

diff --git a/surfsense_backend/alembic/versions/72_simplify_rbac_roles.py b/surfsense_backend/alembic/versions/72_simplify_rbac_roles.py
index e7d5ff019..2a3b81990 100644
--- a/surfsense_backend/alembic/versions/72_simplify_rbac_roles.py
+++ b/surfsense_backend/alembic/versions/72_simplify_rbac_roles.py
@@ -67,63 +67,42 @@ NEW_VIEWER_PERMISSIONS = [
 def upgrade():
     connection = op.get_bind()
 
-    # Step 1: For each search space, get the Editor role ID and Admin role ID
-    search_spaces = connection.execute(
-        sa.text("SELECT id FROM searchspaces")
-    ).fetchall()
+    # Step 1: Move all memberships from Admin roles to corresponding Editor roles (BULK)
+    # Uses UPDATE ... FROM with a self-join on search_space_roles to pair each Admin role with the Editor role of the same search space
+    connection.execute(
+        sa.text("""
+            UPDATE search_space_memberships m
+            SET role_id = e.id
+            FROM search_space_roles a
+            JOIN search_space_roles e ON a.search_space_id = e.search_space_id
+            WHERE m.role_id = a.id
+            AND a.name = 'Admin'
+            AND e.name = 'Editor'
+        """)
+    )
 
-    for (ss_id,) in search_spaces:
-        # Get Admin and Editor role IDs for this search space
-        admin_role = connection.execute(
-            sa.text("""
-                SELECT id FROM search_space_roles 
-                WHERE search_space_id = :ss_id AND name = 'Admin'
-            """),
-            {"ss_id": ss_id},
-        ).fetchone()
+    # Step 2: Move all invites from Admin roles to corresponding Editor roles (BULK)
+    connection.execute(
+        sa.text("""
+            UPDATE search_space_invites i
+            SET role_id = e.id
+            FROM search_space_roles a
+            JOIN search_space_roles e ON a.search_space_id = e.search_space_id
+            WHERE i.role_id = a.id
+            AND a.name = 'Admin'
+            AND e.name = 'Editor'
+        """)
+    )
 
-        editor_role = connection.execute(
-            sa.text("""
-                SELECT id FROM search_space_roles 
-                WHERE search_space_id = :ss_id AND name = 'Editor'
-            """),
-            {"ss_id": ss_id},
-        ).fetchone()
+    # Step 3: Delete all system Admin roles (BULK)
+    connection.execute(
+        sa.text("""
+            DELETE FROM search_space_roles 
+            WHERE name = 'Admin' AND is_system_role = TRUE
+        """)
+    )
 
-        if admin_role and editor_role:
-            admin_role_id = admin_role[0]
-            editor_role_id = editor_role[0]
-
-            # Step 2: Move all memberships from Admin to Editor
-            connection.execute(
-                sa.text("""
-                    UPDATE search_space_memberships 
-                    SET role_id = :editor_role_id 
-                    WHERE role_id = :admin_role_id
-                """),
-                {"editor_role_id": editor_role_id, "admin_role_id": admin_role_id},
-            )
-
-            # Step 3: Move all invites from Admin to Editor
-            connection.execute(
-                sa.text("""
-                    UPDATE search_space_invites 
-                    SET role_id = :editor_role_id 
-                    WHERE role_id = :admin_role_id
-                """),
-                {"editor_role_id": editor_role_id, "admin_role_id": admin_role_id},
-            )
-
-            # Step 4: Delete the Admin role
-            connection.execute(
-                sa.text("""
-                    DELETE FROM search_space_roles 
-                    WHERE id = :admin_role_id
-                """),
-                {"admin_role_id": admin_role_id},
-            )
-
-    # Step 5: Update Editor permissions for all search spaces
+    # Step 4: Update Editor permissions for all search spaces (BULK)
     editor_perms_literal = (
         "ARRAY[" + ",".join(f"'{p}'" for p in NEW_EDITOR_PERMISSIONS) + "]::TEXT[]"
     )
@@ -136,7 +115,7 @@ def upgrade():
         """)
     )
 
-    # Step 6: Update Viewer permissions for all search spaces
+    # Step 5: Update Viewer permissions for all search spaces (BULK)
     viewer_perms_literal = (
         "ARRAY[" + ",".join(f"'{p}'" for p in NEW_VIEWER_PERMISSIONS) + "]::TEXT[]"
     )

From 49b8a46d1045bd71f89aa9b519134a5d21f04832 Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Tue, 20 Jan 2026 21:52:32 +0530
Subject: [PATCH 2/7] feat: Integrate gitingest for GitHub repository ingestion

- Added gitingest as a dependency to streamline the ingestion of GitHub repositories.
- Refactored GitHubConnector to utilize gitingest for efficient repository digest generation, reducing API calls.
- Updated GitHub indexer to process entire repository digests, enhancing performance and simplifying the indexing process.
- Modified GitHub connect form to indicate that the Personal Access Token is optional for public repositories.
---
 .../app/connectors/github_connector.py        | 507 +++++++++---------
 .../connector_indexers/github_indexer.py      | 491 ++++++++---------
 surfsense_backend/pyproject.toml              |   1 +
 surfsense_backend/uv.lock                     |  30 ++
 .../components/github-connect-form.tsx        |  51 +-
 .../new-chat/source-detail-panel.tsx          |   4 +-
 6 files changed, 545 insertions(+), 539 deletions(-)

diff --git a/surfsense_backend/app/connectors/github_connector.py b/surfsense_backend/app/connectors/github_connector.py
index 647856c6f..90fd93fb9 100644
--- a/surfsense_backend/app/connectors/github_connector.py
+++ b/surfsense_backend/app/connectors/github_connector.py
@@ -1,296 +1,295 @@
-import base64
-import logging
-from typing import Any
+"""
+GitHub connector using gitingest for efficient repository digestion.
 
-from github3 import exceptions as github_exceptions, login as github_login
-from github3.exceptions import ForbiddenError, NotFoundError
-from github3.repos.contents import Contents
+This connector replaces the previous file-by-file approach with a single
+digest generation per repository, dramatically reducing LLM API calls.
+"""
+
+import logging
+from dataclasses import dataclass
+
+from gitingest import ingest_async
 
 logger = logging.getLogger(__name__)
 
-# List of common code file extensions to target
-CODE_EXTENSIONS = {
-    ".py",
-    ".js",
-    ".jsx",
-    ".ts",
-    ".tsx",
-    ".java",
-    ".c",
-    ".cpp",
-    ".h",
-    ".hpp",
-    ".cs",
-    ".go",
-    ".rb",
-    ".php",
-    ".swift",
-    ".kt",
-    ".scala",
-    ".rs",
-    ".m",
-    ".sh",
-    ".bash",
-    ".ps1",
-    ".lua",
-    ".pl",
-    ".pm",
-    ".r",
-    ".dart",
-    ".sql",
-}
+# Maximum file size in bytes (5MB)
+MAX_FILE_SIZE = 5 * 1024 * 1024
 
-# List of common documentation/text file extensions
-DOC_EXTENSIONS = {
-    ".md",
-    ".txt",
-    ".rst",
-    ".adoc",
-    ".html",
-    ".htm",
-    ".xml",
-    ".json",
-    ".yaml",
-    ".yml",
-    ".toml",
-}
+# Default patterns to exclude (recommended approach for comprehensive analysis)
+# Using only exclude_patterns ensures we don't miss any relevant file types
+DEFAULT_EXCLUDE_PATTERNS = [
+    # Dependencies
+    "node_modules/*",
+    "vendor/*",
+    "bower_components/*",
+    ".pnpm/*",
+    # Build artifacts / Caches
+    "build/*",
+    "dist/*",
+    "target/*",
+    "out/*",
+    "__pycache__/*",
+    "*.pyc",
+    ".cache/*",
+    ".next/*",
+    ".nuxt/*",
+    # Virtual environments
+    "venv/*",
+    ".venv/*",
+    "env/*",
+    ".env/*",
+    # IDE/Editor config
+    ".vscode/*",
+    ".idea/*",
+    ".project",
+    ".settings/*",
+    "*.swp",
+    "*.swo",
+    # Version control
+    ".git/*",
+    ".svn/*",
+    ".hg/*",
+    # Temporary / Logs
+    "tmp/*",
+    "temp/*",
+    "logs/*",
+    "*.log",
+    # Lock files (usually not needed for understanding code)
+    "package-lock.json",
+    "pnpm-lock.yaml",
+    "yarn.lock",
+    "uv.lock",
+    "Gemfile.lock",
+    "poetry.lock",
+    "Cargo.lock",
+    "composer.lock",
+    # Binary/media files
+    "*.png",
+    "*.jpg",
+    "*.jpeg",
+    "*.gif",
+    "*.ico",
+    "*.svg",
+    "*.webp",
+    "*.bmp",
+    "*.tiff",
+    "*.woff",
+    "*.woff2",
+    "*.ttf",
+    "*.eot",
+    "*.otf",
+    "*.mp3",
+    "*.mp4",
+    "*.wav",
+    "*.ogg",
+    "*.webm",
+    "*.avi",
+    "*.mov",
+    "*.pdf",
+    "*.doc",
+    "*.docx",
+    "*.xls",
+    "*.xlsx",
+    "*.ppt",
+    "*.pptx",
+    "*.zip",
+    "*.tar",
+    "*.tar.gz",
+    "*.tgz",
+    "*.rar",
+    "*.7z",
+    "*.exe",
+    "*.dll",
+    "*.so",
+    "*.dylib",
+    "*.bin",
+    "*.obj",
+    "*.o",
+    "*.a",
+    "*.lib",
+    # Minified files
+    "*.min.js",
+    "*.min.css",
+    # Source maps
+    "*.map",
+    # Database files
+    "*.db",
+    "*.sqlite",
+    "*.sqlite3",
+    # Coverage reports
+    "coverage/*",
+    ".coverage",
+    "htmlcov/*",
+    ".nyc_output/*",
+    # Test snapshots (can be large)
+    "__snapshots__/*",
+]
 
-# Maximum file size in bytes (e.g., 1MB)
-MAX_FILE_SIZE = 1 * 1024 * 1024
+
+@dataclass
+class RepositoryDigest:
+    """Represents a digested repository from gitingest."""
+
+    repo_full_name: str
+    summary: str
+    tree: str
+    content: str
+    branch: str | None = None
+
+    @property
+    def full_digest(self) -> str:
+        """Returns the complete digest with tree and content."""
+        return f"# Repository: {self.repo_full_name}\n\n## File Structure\n\n{self.tree}\n\n## File Contents\n\n{self.content}"
+
+    @property
+    def estimated_tokens(self) -> int:
+        """Rough estimate of tokens (1 token ≈ 4 characters)."""
+        return len(self.full_digest) // 4
 
 
 class GitHubConnector:
-    """Connector for interacting with the GitHub API."""
+    """
+    Connector for ingesting GitHub repositories using gitingest.
 
-    # Directories to skip during file traversal
-    SKIPPED_DIRS = {
-        # Version control
-        ".git",
-        # Dependencies
-        "node_modules",
-        "vendor",
-        # Build artifacts / Caches
-        "build",
-        "dist",
-        "target",
-        "__pycache__",
-        # Virtual environments
-        "venv",
-        ".venv",
-        "env",
-        # IDE/Editor config
-        ".vscode",
-        ".idea",
-        ".project",
-        ".settings",
-        # Temporary / Logs
-        "tmp",
-        "logs",
-        # Add other project-specific irrelevant directories if needed
-    }
+    This connector efficiently processes entire repositories into a single
+    digest, reducing the number of API calls and LLM invocations compared
+    to file-by-file processing.
+    """
 
-    def __init__(self, token: str):
+    def __init__(self, token: str | None = None):
         """
         Initializes the GitHub connector.
 
         Args:
-            token: GitHub Personal Access Token (PAT).
+            token: Optional GitHub Personal Access Token (PAT).
+                   Only required for private repositories.
+                   Public repositories can be ingested without a token.
         """
-        if not token:
-            raise ValueError("GitHub token cannot be empty.")
-        try:
-            self.gh = github_login(token=token)
-            # Try a simple authenticated call to check token validity
-            self.gh.me()
-            logger.info("Successfully authenticated with GitHub API.")
-        except (github_exceptions.AuthenticationFailed, ForbiddenError) as e:
-            logger.error(f"GitHub authentication failed: {e}")
-            raise ValueError("Invalid GitHub token or insufficient permissions.") from e
-        except Exception as e:
-            logger.error(f"Failed to initialize GitHub client: {e}")
-            raise e
+        self.token = token if token and token.strip() else None
+        if self.token:
+            logger.info("GitHub connector initialized with authentication token.")
+        else:
+            logger.info("GitHub connector initialized without token (public repos only).")
 
-    def get_user_repositories(self) -> list[dict[str, Any]]:
-        """Fetches repositories accessible by the authenticated user."""
-        repos_data = []
-        try:
-            # type='owner' fetches repos owned by the user
-            # type='member' fetches repos the user is a collaborator on (including orgs)
-            # type='all' fetches both
-            for repo in self.gh.repositories(type="all", sort="updated"):
-                repos_data.append(
-                    {
-                        "id": repo.id,
-                        "name": repo.name,
-                        "full_name": repo.full_name,
-                        "private": repo.private,
-                        "url": repo.html_url,
-                        "description": repo.description or "",
-                        "last_updated": repo.updated_at if repo.updated_at else None,
-                    }
-                )
-            logger.info(f"Fetched {len(repos_data)} repositories.")
-            return repos_data
-        except Exception as e:
-            logger.error(f"Failed to fetch GitHub repositories: {e}")
-            return []  # Return empty list on error
-
-    def get_repository_files(
-        self, repo_full_name: str, path: str = ""
-    ) -> list[dict[str, Any]]:
+    async def ingest_repository(
+        self,
+        repo_full_name: str,
+        branch: str | None = None,
+        include_patterns: list[str] | None = None,
+        exclude_patterns: list[str] | None = None,
+        max_file_size: int = MAX_FILE_SIZE,
+    ) -> RepositoryDigest | None:
         """
-        Recursively fetches details of relevant files (code, docs) within a repository path.
+        Ingest an entire repository and return a digest.
 
         Args:
             repo_full_name: The full name of the repository (e.g., 'owner/repo').
-            path: The starting path within the repository (default is root).
+            branch: Optional specific branch or tag to ingest.
+            include_patterns: Optional list of glob patterns for files to include.
+                             If None, includes all files (recommended).
+            exclude_patterns: Optional list of glob patterns for files to exclude.
+                             If None, uses DEFAULT_EXCLUDE_PATTERNS.
+            max_file_size: Maximum file size in bytes to include (default 5MB).
 
         Returns:
-            A list of dictionaries, each containing file details (path, sha, url, size).
-            Returns an empty list if the repository or path is not found or on error.
+            RepositoryDigest containing the summary, tree structure, and content,
+            or None if ingestion fails.
         """
-        files_list = []
+        repo_url = f"https://github.com/{repo_full_name}"
+
+        # Use only exclude_patterns by default (recommended for comprehensive analysis)
+        # This ensures we don't miss any relevant file types
+        exclude_pats = exclude_patterns if exclude_patterns is not None else DEFAULT_EXCLUDE_PATTERNS
+
+        logger.info(f"Starting gitingest for repository: {repo_full_name}")
+
         try:
-            owner, repo_name = repo_full_name.split("/")
-            repo = self.gh.repository(owner, repo_name)
-            if not repo:
-                logger.warning(f"Repository '{repo_full_name}' not found.")
-                return []
-            contents = repo.directory_contents(
-                directory_path=path
-            )  # Use directory_contents for clarity
+            # Build kwargs dynamically
+            ingest_kwargs = {
+                "max_file_size": max_file_size,
+                "exclude_patterns": exclude_pats,
+                "include_gitignored": False,
+                "include_submodules": False,
+            }
 
-            # contents returns a list of tuples (name, content_obj)
-            for _item_name, content_item in contents:
-                if not isinstance(content_item, Contents):
-                    continue
+            # Only add token if provided (required only for private repos)
+            if self.token:
+                ingest_kwargs["token"] = self.token
 
-                if content_item.type == "dir":
-                    # Check if the directory name is in the skipped list
-                    if content_item.name in self.SKIPPED_DIRS:
-                        logger.debug(f"Skipping directory: {content_item.path}")
-                        continue  # Skip recursion for this directory
+            # Only add branch if specified
+            if branch:
+                ingest_kwargs["branch"] = branch
 
-                    # Recursively fetch contents of subdirectory
-                    files_list.extend(
-                        self.get_repository_files(
-                            repo_full_name, path=content_item.path
-                        )
-                    )
-                elif content_item.type == "file":
-                    # Check if the file extension is relevant and size is within limits
-                    file_extension = (
-                        "." + content_item.name.split(".")[-1].lower()
-                        if "." in content_item.name
-                        else ""
-                    )
-                    is_code = file_extension in CODE_EXTENSIONS
-                    is_doc = file_extension in DOC_EXTENSIONS
+            # Only add include_patterns if explicitly provided
+            if include_patterns is not None:
+                ingest_kwargs["include_patterns"] = include_patterns
 
-                    if (is_code or is_doc) and content_item.size <= MAX_FILE_SIZE:
-                        files_list.append(
-                            {
-                                "path": content_item.path,
-                                "sha": content_item.sha,
-                                "url": content_item.html_url,
-                                "size": content_item.size,
-                                "type": "code" if is_code else "doc",
-                            }
-                        )
-                    elif content_item.size > MAX_FILE_SIZE:
-                        logger.debug(
-                            f"Skipping large file: {content_item.path} ({content_item.size} bytes)"
-                        )
-                    else:
-                        logger.debug(
-                            f"Skipping irrelevant file type: {content_item.path}"
-                        )
+            summary, tree, content = await ingest_async(repo_url, **ingest_kwargs)
 
-        except (NotFoundError, ForbiddenError) as e:
-            logger.warning(f"Cannot access path '{path}' in '{repo_full_name}': {e}")
-        except Exception as e:
-            logger.error(
-                f"Failed to get files for {repo_full_name} at path '{path}': {e}"
+            if not content or not content.strip():
+                logger.warning(
+                    f"No content retrieved from repository: {repo_full_name}"
+                )
+                return None
+
+            digest = RepositoryDigest(
+                repo_full_name=repo_full_name,
+                summary=summary,
+                tree=tree,
+                content=content,
+                branch=branch,
             )
-            # Return what we have collected so far in case of partial failure
 
-        return files_list
+            logger.info(
+                f"Successfully ingested {repo_full_name}: "
+                f"~{digest.estimated_tokens} estimated tokens"
+            )
+            return digest
 
-    def get_file_content(self, repo_full_name: str, file_path: str) -> str | None:
+        except Exception as e:
+            logger.error(f"Failed to ingest repository {repo_full_name}: {e}")
+            return None
+
+    async def ingest_repositories(
+        self,
+        repo_full_names: list[str],
+        branch: str | None = None,
+        include_patterns: list[str] | None = None,
+        exclude_patterns: list[str] | None = None,
+        max_file_size: int = MAX_FILE_SIZE,
+    ) -> list[RepositoryDigest]:
         """
-        Fetches the decoded content of a specific file.
+        Ingest multiple repositories and return their digests.
 
         Args:
-            repo_full_name: The full name of the repository (e.g., 'owner/repo').
-            file_path: The path to the file within the repository.
+            repo_full_names: List of repository full names (e.g., ['owner/repo1', 'owner/repo2']).
+            branch: Optional specific branch or tag to ingest (applied to all repos).
+            include_patterns: Optional list of glob patterns for files to include.
+            exclude_patterns: Optional list of glob patterns for files to exclude.
+            max_file_size: Maximum file size in bytes to include.
 
         Returns:
-            The decoded file content as a string, or None if fetching fails or file is too large.
+            List of RepositoryDigest objects for successfully ingested repositories.
         """
-        try:
-            owner, repo_name = repo_full_name.split("/")
-            repo = self.gh.repository(owner, repo_name)
-            if not repo:
-                logger.warning(
-                    f"Repository '{repo_full_name}' not found when fetching file '{file_path}'."
-                )
-                return None
+        digests = []
 
-            content_item = repo.file_contents(
-                path=file_path
-            )  # Use file_contents for clarity
+        for repo_full_name in repo_full_names:
+            if not repo_full_name or not isinstance(repo_full_name, str):
+                logger.warning(f"Skipping invalid repository entry: {repo_full_name}")
+                continue
 
-            if (
-                not content_item
-                or not isinstance(content_item, Contents)
-                or content_item.type != "file"
-            ):
-                logger.warning(
-                    f"File '{file_path}' not found or is not a file in '{repo_full_name}'."
-                )
-                return None
-
-            if content_item.size > MAX_FILE_SIZE:
-                logger.warning(
-                    f"File '{file_path}' in '{repo_full_name}' exceeds max size ({content_item.size} > {MAX_FILE_SIZE}). Skipping content fetch."
-                )
-                return None
-
-            # Content is base64 encoded
-            if content_item.content:
-                try:
-                    decoded_content = base64.b64decode(content_item.content).decode(
-                        "utf-8"
-                    )
-                    return decoded_content
-                except UnicodeDecodeError:
-                    logger.warning(
-                        f"Could not decode file '{file_path}' in '{repo_full_name}' as UTF-8. Trying with 'latin-1'."
-                    )
-                    try:
-                        # Try a fallback encoding
-                        decoded_content = base64.b64decode(content_item.content).decode(
-                            "latin-1"
-                        )
-                        return decoded_content
-                    except Exception as decode_err:
-                        logger.error(
-                            f"Failed to decode file '{file_path}' with fallback encoding: {decode_err}"
-                        )
-                        return None  # Give up if fallback fails
-            else:
-                logger.warning(
-                    f"No content returned for file '{file_path}' in '{repo_full_name}'. It might be empty."
-                )
-                return ""  # Return empty string for empty files
-
-        except (NotFoundError, ForbiddenError) as e:
-            logger.warning(
-                f"Cannot access file '{file_path}' in '{repo_full_name}': {e}"
+            digest = await self.ingest_repository(
+                repo_full_name=repo_full_name,
+                branch=branch,
+                include_patterns=include_patterns,
+                exclude_patterns=exclude_patterns,
+                max_file_size=max_file_size,
             )
-            return None
-        except Exception as e:
-            logger.error(
-                f"Failed to get content for file '{file_path}' in '{repo_full_name}': {e}"
-            )
-            return None
+
+            if digest:
+                digests.append(digest)
+
+        logger.info(
+            f"Ingested {len(digests)} out of {len(repo_full_names)} repositories."
+        )
+        return digests
diff --git a/surfsense_backend/app/tasks/connector_indexers/github_indexer.py b/surfsense_backend/app/tasks/connector_indexers/github_indexer.py
index e1844a503..f1ccabdef 100644
--- a/surfsense_backend/app/tasks/connector_indexers/github_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/github_indexer.py
@@ -1,5 +1,8 @@
 """
-GitHub connector indexer.
+GitHub connector indexer using gitingest.
+
+This indexer processes entire repository digests in one pass, dramatically
+reducing LLM API calls compared to the previous file-by-file approach.
 """
 
 from datetime import UTC, datetime
@@ -8,7 +11,7 @@ from sqlalchemy.exc import SQLAlchemyError
 from sqlalchemy.ext.asyncio import AsyncSession
 
 from app.config import config
-from app.connectors.github_connector import GitHubConnector
+from app.connectors.github_connector import GitHubConnector, RepositoryDigest
 from app.db import Document, DocumentType, SearchSourceConnectorType
 from app.services.llm_service import get_user_long_context_llm
 from app.services.task_logging_service import TaskLoggingService
@@ -26,43 +29,55 @@ from .base import (
     logger,
 )
 
+# Maximum characters for a single digest before splitting
+# Most LLMs can handle 128k+ tokens now, but we'll be conservative
+MAX_DIGEST_CHARS = 500_000  # ~125k tokens
+
 
 async def index_github_repos(
     session: AsyncSession,
     connector_id: int,
     search_space_id: int,
     user_id: str,
-    start_date: str | None = None,
-    end_date: str | None = None,
+    start_date: str | None = None,  # Ignored - GitHub indexes full repo snapshots
+    end_date: str | None = None,  # Ignored - GitHub indexes full repo snapshots
     update_last_indexed: bool = True,
 ) -> tuple[int, str | None]:
     """
-    Index code and documentation files from accessible GitHub repositories.
+    Index GitHub repositories using gitingest for efficient processing.
+
+    This function ingests entire repositories as digests, generates a single
+    summary per repository, and chunks the content for vector storage.
+
+    Note: The start_date and end_date parameters are accepted for API compatibility
+    but are IGNORED. GitHub repositories are indexed as complete snapshots since
+    gitingest captures the current state of the entire codebase.
 
     Args:
         session: Database session
         connector_id: ID of the GitHub connector
         search_space_id: ID of the search space to store documents in
         user_id: ID of the user
-        start_date: Start date for filtering (YYYY-MM-DD format) - Note: GitHub indexing processes all files regardless of dates
-        end_date: End date for filtering (YYYY-MM-DD format) - Note: GitHub indexing processes all files regardless of dates
+        start_date: Ignored - kept for API compatibility
+        end_date: Ignored - kept for API compatibility
         update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
 
     Returns:
         Tuple containing (number of documents indexed, error message or None)
     """
+    # Note: start_date and end_date are intentionally unused
+    _ = start_date, end_date
     task_logger = TaskLoggingService(session, search_space_id)
 
     # Log task start
     log_entry = await task_logger.log_task_start(
         task_name="github_repos_indexing",
         source="connector_indexing_task",
-        message=f"Starting GitHub repositories indexing for connector {connector_id}",
+        message=f"Starting GitHub repositories indexing for connector {connector_id} (using gitingest)",
         metadata={
             "connector_id": connector_id,
             "user_id": str(user_id),
-            "start_date": start_date,
-            "end_date": end_date,
+            "method": "gitingest",
         },
     )
 
@@ -93,19 +108,11 @@ async def index_github_repos(
                 f"Connector with ID {connector_id} not found or is not a GitHub connector",
             )
 
-        # 2. Get the GitHub PAT and selected repositories from the connector config
-        github_pat = connector.config.get("GITHUB_PAT")
+        # 2. Get the GitHub PAT (optional) and selected repositories from the connector config
+        # PAT is only required for private repositories - public repos work without it
+        github_pat = connector.config.get("GITHUB_PAT")  # Can be None or empty
         repo_full_names_to_index = connector.config.get("repo_full_names")
 
-        if not github_pat:
-            await task_logger.log_task_failure(
-                log_entry,
-                f"GitHub Personal Access Token (PAT) not found in connector config for connector {connector_id}",
-                "Missing GitHub PAT",
-                {"error_type": "MissingToken"},
-            )
-            return 0, "GitHub Personal Access Token (PAT) not found in connector config"
-
         if not repo_full_names_to_index or not isinstance(
             repo_full_names_to_index, list
         ):
@@ -117,10 +124,16 @@ async def index_github_repos(
             )
             return 0, "'repo_full_names' not found or is not a list in connector config"
 
-        # 3. Initialize GitHub connector client
+        # Log whether we're using authentication
+        if github_pat:
+            logger.info("Using GitHub PAT for authentication (private repos supported)")
+        else:
+            logger.info("No GitHub PAT provided - only public repositories can be indexed")
+
+        # 3. Initialize GitHub connector with gitingest backend
         await task_logger.log_task_progress(
             log_entry,
-            f"Initializing GitHub client for connector {connector_id}",
+            f"Initializing gitingest-based GitHub client for connector {connector_id}",
             {
                 "stage": "client_initialization",
                 "repo_count": len(repo_full_names_to_index),
@@ -138,258 +151,52 @@ async def index_github_repos(
             )
             return 0, f"Failed to initialize GitHub client: {e!s}"
 
-        # 4. Validate selected repositories
+        # 4. Process each repository with gitingest
         await task_logger.log_task_progress(
             log_entry,
-            f"Starting indexing for {len(repo_full_names_to_index)} selected repositories",
+            f"Starting gitingest processing for {len(repo_full_names_to_index)} repositories",
             {
                 "stage": "repo_processing",
                 "repo_count": len(repo_full_names_to_index),
-                "start_date": start_date,
-                "end_date": end_date,
             },
         )
 
         logger.info(
-            f"Starting indexing for {len(repo_full_names_to_index)} selected repositories."
+            f"Starting gitingest indexing for {len(repo_full_names_to_index)} repositories."
         )
-        if start_date and end_date:
-            logger.info(
-                f"Date range requested: {start_date} to {end_date} (Note: GitHub indexing processes all files regardless of dates)"
-            )
 
-        # 6. Iterate through selected repositories and index files
         for repo_full_name in repo_full_names_to_index:
             if not repo_full_name or not isinstance(repo_full_name, str):
                 logger.warning(f"Skipping invalid repository entry: {repo_full_name}")
                 continue
 
-            logger.info(f"Processing repository: {repo_full_name}")
+            logger.info(f"Ingesting repository: {repo_full_name}")
+
             try:
-                files_to_index = github_client.get_repository_files(repo_full_name)
-                if not files_to_index:
-                    logger.info(
-                        f"No indexable files found in repository: {repo_full_name}"
+                # Ingest the entire repository
+                digest = await github_client.ingest_repository(repo_full_name)
+
+                if not digest:
+                    logger.warning(
+                        f"No digest returned for repository: {repo_full_name}"
                     )
+                    errors.append(f"No digest for {repo_full_name}")
                     continue
 
-                logger.info(
-                    f"Found {len(files_to_index)} files to process in {repo_full_name}"
+                # Process the digest and create documents
+                docs_created = await _process_repository_digest(
+                    session=session,
+                    digest=digest,
+                    search_space_id=search_space_id,
+                    user_id=user_id,
+                    task_logger=task_logger,
+                    log_entry=log_entry,
                 )
 
-                for file_info in files_to_index:
-                    file_path = file_info.get("path")
-                    file_url = file_info.get("url")
-                    file_sha = file_info.get("sha")
-                    file_type = file_info.get("type")  # 'code' or 'doc'
-                    full_path_key = f"{repo_full_name}/{file_path}"
-
-                    if not file_path or not file_url or not file_sha:
-                        logger.warning(
-                            f"Skipping file with missing info in {repo_full_name}: {file_info}"
-                        )
-                        continue
-
-                    # Get file content
-                    file_content = github_client.get_file_content(
-                        repo_full_name, file_path
-                    )
-
-                    if file_content is None:
-                        logger.warning(
-                            f"Could not retrieve content for {full_path_key}. Skipping."
-                        )
-                        continue  # Skip if content fetch failed
-
-                    # Generate unique identifier hash for this GitHub file
-                    unique_identifier_hash = generate_unique_identifier_hash(
-                        DocumentType.GITHUB_CONNECTOR, file_sha, search_space_id
-                    )
-
-                    # Generate content hash
-                    content_hash = generate_content_hash(file_content, search_space_id)
-
-                    # Check if document with this unique identifier already exists
-                    existing_document = await check_document_by_unique_identifier(
-                        session, unique_identifier_hash
-                    )
-
-                    if existing_document:
-                        # Document exists - check if content has changed
-                        if existing_document.content_hash == content_hash:
-                            logger.info(
-                                f"Document for GitHub file {full_path_key} unchanged. Skipping."
-                            )
-                            continue
-                        else:
-                            # Content has changed - update the existing document
-                            logger.info(
-                                f"Content changed for GitHub file {full_path_key}. Updating document."
-                            )
-
-                            # Generate summary with metadata
-                            user_llm = await get_user_long_context_llm(
-                                session, user_id, search_space_id
-                            )
-                            if user_llm:
-                                file_extension = (
-                                    file_path.split(".")[-1]
-                                    if "." in file_path
-                                    else None
-                                )
-                                document_metadata = {
-                                    "file_path": full_path_key,
-                                    "repository": repo_full_name,
-                                    "file_type": file_extension or "unknown",
-                                    "document_type": "GitHub Repository File",
-                                    "connector_type": "GitHub",
-                                }
-                                (
-                                    summary_content,
-                                    summary_embedding,
-                                ) = await generate_document_summary(
-                                    file_content, user_llm, document_metadata
-                                )
-                            else:
-                                summary_content = f"GitHub file: {full_path_key}\n\n{file_content[:1000]}..."
-                                summary_embedding = (
-                                    config.embedding_model_instance.embed(
-                                        summary_content
-                                    )
-                                )
-
-                            # Chunk the content
-                            try:
-                                if hasattr(config, "code_chunker_instance"):
-                                    chunks_data = [
-                                        await create_document_chunks(file_content)
-                                    ][0]
-                                else:
-                                    chunks_data = await create_document_chunks(
-                                        file_content
-                                    )
-                            except Exception as chunk_err:
-                                logger.error(
-                                    f"Failed to chunk file {full_path_key}: {chunk_err}"
-                                )
-                                continue
-
-                            # Update existing document
-                            existing_document.title = f"GitHub - {full_path_key}"
-                            existing_document.content = summary_content
-                            existing_document.content_hash = content_hash
-                            existing_document.embedding = summary_embedding
-                            existing_document.document_metadata = {
-                                "file_path": file_path,
-                                "file_sha": file_sha,
-                                "file_url": file_url,
-                                "repository": repo_full_name,
-                                "indexed_at": datetime.now(UTC).strftime(
-                                    "%Y-%m-%d %H:%M:%S"
-                                ),
-                            }
-                            existing_document.chunks = chunks_data
-                            existing_document.updated_at = get_current_timestamp()
-
-                            logger.info(
-                                f"Successfully updated GitHub file {full_path_key}"
-                            )
-                            continue
-
-                    # Document doesn't exist - create new one
-                    # Generate summary with metadata
-                    user_llm = await get_user_long_context_llm(
-                        session, user_id, search_space_id
-                    )
-                    if user_llm:
-                        # Extract file extension from file path
-                        file_extension = (
-                            file_path.split(".")[-1] if "." in file_path else None
-                        )
-                        document_metadata = {
-                            "file_path": full_path_key,
-                            "repository": repo_full_name,
-                            "file_type": file_extension or "unknown",
-                            "document_type": "GitHub Repository File",
-                            "connector_type": "GitHub",
-                        }
-                        (
-                            summary_content,
-                            summary_embedding,
-                        ) = await generate_document_summary(
-                            file_content, user_llm, document_metadata
-                        )
-                    else:
-                        # Fallback to simple summary if no LLM configured
-                        summary_content = (
-                            f"GitHub file: {full_path_key}\n\n{file_content[:1000]}..."
-                        )
-                        summary_embedding = config.embedding_model_instance.embed(
-                            summary_content
-                        )
-
-                    # Chunk the content
-                    try:
-                        chunks_data = [await create_document_chunks(file_content)][0]
-
-                        # Use code chunker if available, otherwise regular chunker
-                        if hasattr(config, "code_chunker_instance"):
-                            chunks_data = [
-                                {
-                                    "content": chunk.text,
-                                    "embedding": config.embedding_model_instance.embed(
-                                        chunk.text
-                                    ),
-                                }
-                                for chunk in config.code_chunker_instance.chunk(
-                                    file_content
-                                )
-                            ]
-                        else:
-                            chunks_data = await create_document_chunks(file_content)
-
-                    except Exception as chunk_err:
-                        logger.error(
-                            f"Failed to chunk file {full_path_key}: {chunk_err}"
-                        )
-                        errors.append(
-                            f"Chunking failed for {full_path_key}: {chunk_err}"
-                        )
-                        continue  # Skip this file if chunking fails
-
-                    doc_metadata = {
-                        "repository_full_name": repo_full_name,
-                        "file_path": file_path,
-                        "full_path": full_path_key,  # For easier lookup
-                        "url": file_url,
-                        "sha": file_sha,
-                        "type": file_type,
-                        "indexed_at": datetime.now(UTC).isoformat(),
-                    }
-
-                    # Create new document
-                    logger.info(f"Creating new document for file: {full_path_key}")
-                    document = Document(
-                        title=f"GitHub - {file_path}",
-                        document_type=DocumentType.GITHUB_CONNECTOR,
-                        document_metadata=doc_metadata,
-                        content=summary_content,  # Store summary
-                        content_hash=content_hash,
-                        unique_identifier_hash=unique_identifier_hash,
-                        embedding=summary_embedding,
-                        search_space_id=search_space_id,
-                        chunks=chunks_data,  # Associate chunks directly
-                        updated_at=get_current_timestamp(),
-                    )
-                    session.add(document)
-                    documents_processed += 1
-
-                    # Batch commit every 10 documents
-                    if documents_processed % 10 == 0:
-                        logger.info(
-                            f"Committing batch: {documents_processed} GitHub files processed so far"
-                        )
-                        await session.commit()
+                documents_processed += docs_created
+                logger.info(
+                    f"Created {docs_created} documents from repository: {repo_full_name}"
+                )
 
             except Exception as repo_err:
                 logger.error(
@@ -397,11 +204,11 @@ async def index_github_repos(
                 )
                 errors.append(f"Failed processing {repo_full_name}: {repo_err}")
 
-        # Final commit for any remaining documents not yet committed in batches
-        logger.info(f"Final commit: Total {documents_processed} GitHub files processed")
+        # Final commit
         await session.commit()
         logger.info(
-            f"Finished GitHub indexing for connector {connector_id}. Processed {documents_processed} files."
+            f"Finished GitHub indexing for connector {connector_id}. "
+            f"Created {documents_processed} documents."
         )
 
         # Log success
@@ -412,6 +219,7 @@ async def index_github_repos(
                 "documents_processed": documents_processed,
                 "errors_count": len(errors),
                 "repo_count": len(repo_full_names_to_index),
+                "method": "gitingest",
             },
         )
 
@@ -428,6 +236,7 @@ async def index_github_repos(
         )
         errors.append(f"Database error: {db_err}")
         return documents_processed, "; ".join(errors) if errors else str(db_err)
+
     except Exception as e:
         await session.rollback()
         await task_logger.log_task_failure(
@@ -445,3 +254,173 @@ async def index_github_repos(
 
     error_message = "; ".join(errors) if errors else None
     return documents_processed, error_message
+
+
+async def _process_repository_digest(
+    session: AsyncSession,
+    digest: RepositoryDigest,
+    search_space_id: int,
+    user_id: str,
+    task_logger: TaskLoggingService,
+    log_entry,
+) -> int:
+    """
+    Store one repository digest as a single document with searchable chunks.
+
+    The document is keyed by repository name within the search space, so an
+    unchanged repository is skipped and a changed one replaces its old document.
+    The summary becomes the document body; chunks come from digest.content.
+
+    Args:
+        session: Database session
+        digest: The repository digest from gitingest
+        search_space_id: ID of the search space
+        user_id: ID of the user
+        task_logger: Task logging service (not used by this function)
+        log_entry: Current log entry (not used by this function)
+
+    Returns:
+        0 if the stored content hash matches the digest, otherwise 1
+    """
+    repo_full_name = digest.repo_full_name
+    documents_created = 0
+
+    # The content hash only detects changes between indexing runs; it is not
+    # part of the document's identity (see the unique identifier below).
+    full_content = digest.full_digest
+    content_hash = generate_content_hash(full_content, search_space_id)
+
+    # Use repo name as the unique identifier (one document per repo)
+    unique_identifier_hash = generate_unique_identifier_hash(
+        DocumentType.GITHUB_CONNECTOR, repo_full_name, search_space_id
+    )
+
+    # Check if document with this unique identifier already exists
+    existing_document = await check_document_by_unique_identifier(
+        session, unique_identifier_hash
+    )
+
+    if existing_document:
+        # Document exists - check if content has changed
+        if existing_document.content_hash == content_hash:
+            logger.info(f"Repository {repo_full_name} unchanged. Skipping.")
+            return 0
+        else:
+            logger.info(
+                f"Content changed for repository {repo_full_name}. Updating document."
+            )
+            # Remove the stale document; flush so the delete precedes the new insert
+            await session.delete(existing_document)
+            await session.flush()
+
+    # Generate summary using LLM (ONE call per repository!)
+    user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
+
+    document_metadata = {
+        "repository": repo_full_name,
+        "document_type": "GitHub Repository",
+        "connector_type": "GitHub",
+        "ingestion_method": "gitingest",
+        "file_tree": digest.tree[:2000],
+        "estimated_tokens": digest.estimated_tokens,
+    }
+
+    if user_llm:
+        # Prepare content for summarization
+        # Include tree structure and truncated content if too large
+        summary_content = digest.full_digest
+        if len(summary_content) > MAX_DIGEST_CHARS:
+            # Clamp at 0: a negative bound would slice from the END of the
+            # content and defeat the truncation when the tree is very large.
+            content_budget = max(0, MAX_DIGEST_CHARS - len(digest.tree) - 200)
+            summary_content = (
+                f"# Repository: {repo_full_name}\n\n"
+                f"## File Structure\n\n{digest.tree}\n\n"
+                f"## File Contents (truncated)\n\n{digest.content[:content_budget]}..."
+            )
+
+        summary_text, summary_embedding = await generate_document_summary(
+            summary_content, user_llm, document_metadata
+        )
+    else:
+        # Fallback to simple summary if no LLM configured
+        summary_text = (
+            f"# GitHub Repository: {repo_full_name}\n\n"
+            f"## Summary\n{digest.summary}\n\n"
+            f"## File Structure\n{digest.tree[:3000]}"
+        )
+        summary_embedding = config.embedding_model_instance.embed(summary_text)
+
+    # Chunk the full digest content for granular search
+    try:
+        # Use the content (not the summary) for chunking
+        # This preserves file-level granularity in search
+        chunks_data = await create_document_chunks(digest.content)
+    except Exception as chunk_err:
+        logger.error(
+            f"Failed to chunk repository {repo_full_name}: {chunk_err}"
+        )
+        # Fall back to a simpler chunking approach
+        chunks_data = await _simple_chunk_content(digest.content)
+
+    # Create the document
+    doc_metadata = {
+        "repository_full_name": repo_full_name,
+        "url": f"https://github.com/{repo_full_name}",
+        "branch": digest.branch,
+        "ingestion_method": "gitingest",
+        "file_tree": digest.tree,
+        "gitingest_summary": digest.summary,
+        "estimated_tokens": digest.estimated_tokens,
+        "indexed_at": datetime.now(UTC).isoformat(),
+    }
+
+    document = Document(
+        title=f"GitHub Repository: {repo_full_name}",
+        document_type=DocumentType.GITHUB_CONNECTOR,
+        document_metadata=doc_metadata,
+        content=summary_text,
+        content_hash=content_hash,
+        unique_identifier_hash=unique_identifier_hash,
+        embedding=summary_embedding,
+        search_space_id=search_space_id,
+        chunks=chunks_data,
+        updated_at=get_current_timestamp(),
+    )
+
+    session.add(document)
+    documents_created += 1
+
+    logger.info(
+        f"Created document for repository {repo_full_name} "
+        f"with {len(chunks_data)} chunks"
+    )
+
+    return documents_created
+
+
+async def _simple_chunk_content(content: str, chunk_size: int = 4000) -> list:
+    """
+    Split content into fixed-size character windows and embed each one.
+
+    Fallback used when the regular chunker raises. Windows containing
+    only whitespace are dropped.
+
+    Args:
+        content: The text to split
+        chunk_size: Window length in characters
+
+    Returns:
+        List of Chunk objects, each carrying its text and embedding
+    """
+    from app.db import Chunk
+
+    windows = (
+        content[start : start + chunk_size]
+        for start in range(0, len(content), chunk_size)
+    )
+    return [
+        Chunk(content=piece, embedding=config.embedding_model_instance.embed(piece))
+        for piece in windows
+        if piece.strip()
+    ]
diff --git a/surfsense_backend/pyproject.toml b/surfsense_backend/pyproject.toml
index 83a00b4e4..6197dbce7 100644
--- a/surfsense_backend/pyproject.toml
+++ b/surfsense_backend/pyproject.toml
@@ -60,6 +60,7 @@ dependencies = [
     "mcp>=1.25.0",
     "starlette>=0.40.0,<0.51.0",
     "sse-starlette>=3.1.1,<3.1.2",
+    "gitingest>=0.3.1",
 ]
 
 [dependency-groups]
diff --git a/surfsense_backend/uv.lock b/surfsense_backend/uv.lock
index ef01847f8..44daab0d6 100644
--- a/surfsense_backend/uv.lock
+++ b/surfsense_backend/uv.lock
@@ -1945,6 +1945,25 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/61/ad/2394d4fb542574678b0ba342daf734d4d811768da3c2ee0c84d509dcb26c/github3.py-4.0.1-py3-none-any.whl", hash = "sha256:a89af7de25650612d1da2f0609622bcdeb07ee8a45a1c06b2d16a05e4234e753", size = 151800, upload-time = "2023-04-26T17:56:25.015Z" },
 ]
 
+[[package]]
+name = "gitingest"
+version = "0.3.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "click" },
+    { name = "httpx" },
+    { name = "loguru" },
+    { name = "pathspec" },
+    { name = "pydantic" },
+    { name = "python-dotenv" },
+    { name = "starlette" },
+    { name = "tiktoken" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/d6/fe/a915f0c32a3d7920206a677f73c185b3eadf4ec151fb05aedd52e64713f7/gitingest-0.3.1.tar.gz", hash = "sha256:4587cab873d4e08bdb16d612bb153c23e0ce59771a1d57a438239c5e39f05ebf", size = 70681, upload-time = "2025-07-31T13:56:19.845Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/00/15/f200ab2e73287e67d1dce6fbacf421552ae9fbafdc5f0cc8dd0d2fe4fc47/gitingest-0.3.1-py3-none-any.whl", hash = "sha256:8143a5e6a7140ede9f680e13d3931ac07c82ac9bd8bab9ad1fba017c8c1e8666", size = 68343, upload-time = "2025-07-31T13:56:17.729Z" },
+]
+
 [[package]]
 name = "google-api-core"
 version = "2.25.1"
@@ -4460,6 +4479,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/39/c2/646d2e93e0af70f4e5359d870a63584dacbc324b54d73e6b3267920ff117/pandas-2.3.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:bb3be958022198531eb7ec2008cfc78c5b1eed51af8600c6c5d9160d89d8d249", size = 13231847, upload-time = "2025-06-05T03:27:51.465Z" },
 ]
 
+[[package]]
+name = "pathspec"
+version = "1.0.3"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/4c/b2/bb8e495d5262bfec41ab5cb18f522f1012933347fb5d9e62452d446baca2/pathspec-1.0.3.tar.gz", hash = "sha256:bac5cf97ae2c2876e2d25ebb15078eb04d76e4b98921ee31c6f85ade8b59444d", size = 130841, upload-time = "2026-01-09T15:46:46.009Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/32/2b/121e912bd60eebd623f873fd090de0e84f322972ab25a7f9044c056804ed/pathspec-1.0.3-py3-none-any.whl", hash = "sha256:e80767021c1cc524aa3fb14bedda9c34406591343cc42797b386ce7b9354fb6c", size = 55021, upload-time = "2026-01-09T15:46:44.652Z" },
+]
+
 [[package]]
 name = "pdf2image"
 version = "1.17.0"
@@ -6484,6 +6512,7 @@ dependencies = [
     { name = "firecrawl-py" },
     { name = "flower" },
     { name = "github3-py" },
+    { name = "gitingest" },
     { name = "google-api-python-client" },
     { name = "google-auth-oauthlib" },
     { name = "kokoro" },
@@ -6549,6 +6578,7 @@ requires-dist = [
     { name = "firecrawl-py", specifier = ">=4.9.0" },
     { name = "flower", specifier = ">=2.0.1" },
     { name = "github3-py", specifier = "==4.0.1" },
+    { name = "gitingest", specifier = ">=0.3.1" },
     { name = "google-api-python-client", specifier = ">=2.156.0" },
     { name = "google-auth-oauthlib", specifier = ">=1.2.1" },
     { name = "kokoro", specifier = ">=0.9.4" },
diff --git a/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/github-connect-form.tsx b/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/github-connect-form.tsx
index b2b371ed8..6ed36e180 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/github-connect-form.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/github-connect-form.tsx
@@ -34,7 +34,6 @@ import {
 } from "@/components/ui/select";
 import { Switch } from "@/components/ui/switch";
 import { EnumConnectorName } from "@/contracts/enums/connector";
-import { DateRangeSelector } from "../../components/date-range-selector";
 import { getConnectorBenefits } from "../connector-benefits";
 import type { ConnectFormProps } from "../index";
 
@@ -44,12 +43,13 @@ const githubConnectorFormSchema = z.object({
 	}),
 	github_pat: z
 		.string()
-		.min(20, {
-			message: "GitHub Personal Access Token seems too short.",
-		})
-		.refine((pat) => pat.startsWith("ghp_") || pat.startsWith("github_pat_"), {
-			message: "GitHub PAT should start with 'ghp_' or 'github_pat_'",
-		}),
+		.optional()
+		.refine(
+			(pat) => !pat || pat.startsWith("ghp_") || pat.startsWith("github_pat_"),
+			{
+				message: "GitHub PAT should start with 'ghp_' or 'github_pat_'",
+			}
+		),
 	repo_full_names: z.string().min(1, {
 		message: "At least one repository is required.",
 	}),
@@ -59,8 +59,6 @@ type GithubConnectorFormValues = z.infer<typeof githubConnectorFormSchema>;
 
 export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting }) => {
 	const isSubmittingRef = useRef(false);
-	const [startDate, setStartDate] = useState<Date | undefined>(undefined);
-	const [endDate, setEndDate] = useState<Date | undefined>(undefined);
 	const [periodicEnabled, setPeriodicEnabled] = useState(false);
 	const [frequencyMinutes, setFrequencyMinutes] = useState("1440");
 	const form = useForm<GithubConnectorFormValues>({
@@ -94,7 +92,7 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
 				name: values.name,
 				connector_type: EnumConnectorName.GITHUB_CONNECTOR,
 				config: {
-					GITHUB_PAT: values.github_pat,
+					GITHUB_PAT: values.github_pat || null, // Optional - only for private repos
 					repo_full_names: repoList,
 				},
 				is_indexable: true,
@@ -102,8 +100,9 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
 				periodic_indexing_enabled: periodicEnabled,
 				indexing_frequency_minutes: periodicEnabled ? parseInt(frequencyMinutes, 10) : null,
 				next_scheduled_at: null,
-				startDate,
-				endDate,
+				// GitHub indexes full repo snapshots - no date range needed
+				startDate: undefined,
+				endDate: undefined,
 				periodicEnabled,
 				frequencyMinutes,
 			});
@@ -117,10 +116,10 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
 			<Alert className="bg-slate-400/5 dark:bg-white/5 border-slate-400/20 p-2 sm:p-3 flex items-center [&>svg]:relative [&>svg]:left-0 [&>svg]:top-0 [&>svg+div]:translate-y-0">
 				<Info className="h-3 w-3 sm:h-4 sm:w-4 shrink-0 ml-1" />
 				<div className="-ml-1">
-					<AlertTitle className="text-xs sm:text-sm">Personal Access Token Required</AlertTitle>
+					<AlertTitle className="text-xs sm:text-sm">Personal Access Token (Optional)</AlertTitle>
 					<AlertDescription className="text-[10px] sm:text-xs !pl-0">
-						You'll need a GitHub Personal Access Token to use this connector. You can create one
-						from{" "}
+						A GitHub PAT is only required for private repositories. Public repos work without a
+						token. Create one from{" "}
 						<a
 							href="https://github.com/settings/tokens"
 							target="_blank"
@@ -128,7 +127,8 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
 							className="font-medium underline underline-offset-4"
 						>
 							GitHub Settings
-						</a>
+						</a>{" "}
+						if needed.
 					</AlertDescription>
 				</div>
 			</Alert>
@@ -167,7 +167,10 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
 							name="github_pat"
 							render={({ field }) => (
 								<FormItem>
-									<FormLabel className="text-xs sm:text-sm">GitHub Personal Access Token</FormLabel>
+									<FormLabel className="text-xs sm:text-sm">
+										GitHub Personal Access Token{" "}
+										<span className="text-muted-foreground font-normal">(optional)</span>
+									</FormLabel>
 									<FormControl>
 										<Input
 											type="password"
@@ -178,8 +181,8 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
 										/>
 									</FormControl>
 									<FormDescription className="text-[10px] sm:text-xs">
-										Your GitHub PAT will be encrypted and stored securely. It typically starts with
-										"ghp_" or "github_pat_".
+										Only required for private repositories. Leave empty if indexing public repos
+										only.
 									</FormDescription>
 									<FormMessage />
 								</FormItem>
@@ -225,15 +228,9 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
 
 						{/* Indexing Configuration */}
 						<div className="space-y-4 pt-4 border-t border-slate-400/20">
-							<h3 className="text-sm sm:text-base font-medium">Indexing Configuration</h3>
+							<h3 className="text-sm sm:text-base font-medium">Sync Configuration</h3>
 
-							{/* Date Range Selector */}
-							<DateRangeSelector
-								startDate={startDate}
-								endDate={endDate}
-								onStartDateChange={setStartDate}
-								onEndDateChange={setEndDate}
-							/>
+							{/* Note: No date range for GitHub - it indexes full repo snapshots */}
 
 							{/* Periodic Sync Config */}
 							<div className="rounded-xl bg-slate-400/5 dark:bg-white/5 p-3 sm:p-6">
diff --git a/surfsense_web/components/new-chat/source-detail-panel.tsx b/surfsense_web/components/new-chat/source-detail-panel.tsx
index df2809fdb..bf5bd4087 100644
--- a/surfsense_web/components/new-chat/source-detail-panel.tsx
+++ b/surfsense_web/components/new-chat/source-detail-panel.tsx
@@ -490,8 +490,8 @@ export function SourceDetailPanel({
 														>
 															{idx + 1}
 															{isCited && (
-																<span className="absolute -top-1 -right-1 w-3 h-3 bg-primary rounded-full border-2 border-background">
-																	<Sparkles className="h-2 w-2 text-primary-foreground absolute top-0.5 left-0.5" />
+																<span className="absolute -top-1.5 -right-1.5 flex items-center justify-center w-4 h-4 bg-primary rounded-full border-2 border-background shadow-sm">
+																	<Sparkles className="h-2.5 w-2.5 text-primary-foreground" />
 																</span>
 															)}
 														</motion.button>

From 35888144ebba4111e2db43382db99790bcd8adba Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Tue, 20 Jan 2026 23:24:33 +0530
Subject: [PATCH 3/7] refactor: Update GitHub connector to use gitingest CLI

- Refactored GitHubConnector to utilize gitingest CLI via subprocess, improving performance and avoiding async issues with Celery.
- Updated ingestion method to handle repository digests more efficiently, including error handling for subprocess execution.
- Adjusted GitHub indexer to call the new synchronous ingestion method.
- Clarified documentation regarding the optional nature of the Personal Access Token for public repositories.
---
 .../app/connectors/github_connector.py        | 325 +++++++-----------
 .../connector_indexers/github_indexer.py      |   9 +-
 surfsense_backend/app/utils/validators.py     |   5 +-
 .../config/connector-status-config.json       |   5 -
 .../components/github-connect-form.tsx        |  98 +++---
 .../components/github-config.tsx              |  25 +-
 .../views/connector-edit-view.tsx             |   5 +-
 .../views/indexing-configuration-view.tsx     |   5 +-
 8 files changed, 221 insertions(+), 256 deletions(-)

diff --git a/surfsense_backend/app/connectors/github_connector.py b/surfsense_backend/app/connectors/github_connector.py
index 90fd93fb9..6f04ccdba 100644
--- a/surfsense_backend/app/connectors/github_connector.py
+++ b/surfsense_backend/app/connectors/github_connector.py
@@ -1,130 +1,21 @@
 """
-GitHub connector using gitingest for efficient repository digestion.
+GitHub connector using gitingest CLI for efficient repository digestion.
 
-This connector replaces the previous file-by-file approach with a single
-digest generation per repository, dramatically reducing LLM API calls.
+This connector uses subprocess to call gitingest CLI, completely isolating
+it from any Python event loop/async complexity that can cause hangs in Celery.
 """
 
 import logging
+import os
+import subprocess
+import tempfile
 from dataclasses import dataclass
 
-from gitingest import ingest_async
-
 logger = logging.getLogger(__name__)
 
 # Maximum file size in bytes (5MB)
 MAX_FILE_SIZE = 5 * 1024 * 1024
 
-# Default patterns to exclude (recommended approach for comprehensive analysis)
-# Using only exclude_patterns ensures we don't miss any relevant file types
-DEFAULT_EXCLUDE_PATTERNS = [
-    # Dependencies
-    "node_modules/*",
-    "vendor/*",
-    "bower_components/*",
-    ".pnpm/*",
-    # Build artifacts / Caches
-    "build/*",
-    "dist/*",
-    "target/*",
-    "out/*",
-    "__pycache__/*",
-    "*.pyc",
-    ".cache/*",
-    ".next/*",
-    ".nuxt/*",
-    # Virtual environments
-    "venv/*",
-    ".venv/*",
-    "env/*",
-    ".env/*",
-    # IDE/Editor config
-    ".vscode/*",
-    ".idea/*",
-    ".project",
-    ".settings/*",
-    "*.swp",
-    "*.swo",
-    # Version control
-    ".git/*",
-    ".svn/*",
-    ".hg/*",
-    # Temporary / Logs
-    "tmp/*",
-    "temp/*",
-    "logs/*",
-    "*.log",
-    # Lock files (usually not needed for understanding code)
-    "package-lock.json",
-    "pnpm-lock.yaml",
-    "yarn.lock",
-    "uv.lock",
-    "Gemfile.lock",
-    "poetry.lock",
-    "Cargo.lock",
-    "composer.lock",
-    # Binary/media files
-    "*.png",
-    "*.jpg",
-    "*.jpeg",
-    "*.gif",
-    "*.ico",
-    "*.svg",
-    "*.webp",
-    "*.bmp",
-    "*.tiff",
-    "*.woff",
-    "*.woff2",
-    "*.ttf",
-    "*.eot",
-    "*.otf",
-    "*.mp3",
-    "*.mp4",
-    "*.wav",
-    "*.ogg",
-    "*.webm",
-    "*.avi",
-    "*.mov",
-    "*.pdf",
-    "*.doc",
-    "*.docx",
-    "*.xls",
-    "*.xlsx",
-    "*.ppt",
-    "*.pptx",
-    "*.zip",
-    "*.tar",
-    "*.tar.gz",
-    "*.tgz",
-    "*.rar",
-    "*.7z",
-    "*.exe",
-    "*.dll",
-    "*.so",
-    "*.dylib",
-    "*.bin",
-    "*.obj",
-    "*.o",
-    "*.a",
-    "*.lib",
-    # Minified files
-    "*.min.js",
-    "*.min.css",
-    # Source maps
-    "*.map",
-    # Database files
-    "*.db",
-    "*.sqlite",
-    "*.sqlite3",
-    # Coverage reports
-    "coverage/*",
-    ".coverage",
-    "htmlcov/*",
-    ".nyc_output/*",
-    # Test snapshots (can be large)
-    "__snapshots__/*",
-]
-
 
 @dataclass
 class RepositoryDigest:
@@ -149,21 +40,19 @@ class RepositoryDigest:
 
 class GitHubConnector:
     """
-    Connector for ingesting GitHub repositories using gitingest.
+    Connector for ingesting GitHub repositories using gitingest CLI.
 
-    This connector efficiently processes entire repositories into a single
-    digest, reducing the number of API calls and LLM invocations compared
-    to file-by-file processing.
+    Uses subprocess to run gitingest, which avoids all async/event loop
+    issues that can occur when mixing gitingest with Celery workers.
     """
 
     def __init__(self, token: str | None = None):
         """
-        Initializes the GitHub connector.
+        Initialize the GitHub connector.
 
         Args:
             token: Optional GitHub Personal Access Token (PAT).
                    Only required for private repositories.
-                   Public repositories can be ingested without a token.
         """
         self.token = token if token and token.strip() else None
         if self.token:
@@ -171,72 +60,104 @@ class GitHubConnector:
         else:
             logger.info("GitHub connector initialized without token (public repos only).")
 
-    async def ingest_repository(
+    def ingest_repository(
         self,
         repo_full_name: str,
         branch: str | None = None,
-        include_patterns: list[str] | None = None,
-        exclude_patterns: list[str] | None = None,
         max_file_size: int = MAX_FILE_SIZE,
     ) -> RepositoryDigest | None:
         """
-        Ingest an entire repository and return a digest.
+        Ingest a repository using gitingest CLI via subprocess.
+
+        This approach completely isolates gitingest from Python's event loop,
+        avoiding any async/Celery conflicts.
 
         Args:
             repo_full_name: The full name of the repository (e.g., 'owner/repo').
             branch: Optional specific branch or tag to ingest.
-            include_patterns: Optional list of glob patterns for files to include.
-                             If None, includes all files (recommended).
-            exclude_patterns: Optional list of glob patterns for files to exclude.
-                             If None, uses DEFAULT_EXCLUDE_PATTERNS.
-            max_file_size: Maximum file size in bytes to include (default 5MB).
+            max_file_size: Maximum file size in bytes to include.
 
         Returns:
-            RepositoryDigest containing the summary, tree structure, and content,
-            or None if ingestion fails.
+            RepositoryDigest or None if ingestion fails.
         """
         repo_url = f"https://github.com/{repo_full_name}"
 
-        # Use only exclude_patterns by default (recommended for comprehensive analysis)
-        # This ensures we don't miss any relevant file types
-        exclude_pats = exclude_patterns if exclude_patterns is not None else DEFAULT_EXCLUDE_PATTERNS
-
-        logger.info(f"Starting gitingest for repository: {repo_full_name}")
+        logger.info(f"Starting gitingest CLI for repository: {repo_full_name}")
 
         try:
-            # Build kwargs dynamically
-            ingest_kwargs = {
-                "max_file_size": max_file_size,
-                "exclude_patterns": exclude_pats,
-                "include_gitignored": False,
-                "include_submodules": False,
-            }
+            # Create a temporary file for output
+            with tempfile.NamedTemporaryFile(
+                mode="w", suffix=".txt", delete=False
+            ) as tmp_file:
+                output_path = tmp_file.name
 
-            # Only add token if provided (required only for private repos)
-            if self.token:
-                ingest_kwargs["token"] = self.token
+            # Build the gitingest CLI command
+            cmd = [
+                "gitingest",
+                repo_url,
+                "--output", output_path,
+                "--max-size", str(max_file_size),
+                # Common exclude patterns
+                "-e", "node_modules/*",
+                "-e", "vendor/*",
+                "-e", ".git/*",
+                "-e", "__pycache__/*",
+                "-e", "dist/*",
+                "-e", "build/*",
+                "-e", "*.lock",
+                "-e", "package-lock.json",
+            ]
 
-            # Only add branch if specified
+            # Add branch if specified
             if branch:
-                ingest_kwargs["branch"] = branch
+                cmd.extend(["--branch", branch])
 
-            # Only add include_patterns if explicitly provided
-            if include_patterns is not None:
-                ingest_kwargs["include_patterns"] = include_patterns
+            # Set up environment with token if provided
+            env = os.environ.copy()
+            if self.token:
+                env["GITHUB_TOKEN"] = self.token
 
-            summary, tree, content = await ingest_async(repo_url, **ingest_kwargs)
+            logger.info(f"Running gitingest CLI: {' '.join(cmd[:5])}...")
 
-            if not content or not content.strip():
-                logger.warning(
-                    f"No content retrieved from repository: {repo_full_name}"
-                )
+            # Run gitingest as subprocess with timeout
+            result = subprocess.run(
+                cmd,
+                env=env,
+                capture_output=True,
+                text=True,
+                timeout=900,  # 15 minute timeout
+            )
+
+            if result.returncode != 0:
+                logger.error(f"gitingest failed: {result.stderr}")
+                # Clean up temp file
+                if os.path.exists(output_path):
+                    os.unlink(output_path)
                 return None
 
+            # Read the output file
+            if not os.path.exists(output_path):
+                logger.error("gitingest did not create output file")
+                return None
+
+            with open(output_path, encoding="utf-8") as f:
+                full_content = f.read()
+
+            # Clean up temp file
+            os.unlink(output_path)
+
+            if not full_content or not full_content.strip():
+                logger.warning(f"No content retrieved from repository: {repo_full_name}")
+                return None
+
+            # Wrap the gitingest output in a digest without parsing it.
+            # The CLI writes summary + tree + content into a single file,
+            # so the whole file is stored as content and tree stays empty.
             digest = RepositoryDigest(
                 repo_full_name=repo_full_name,
-                summary=summary,
-                tree=tree,
-                content=content,
+                summary=f"Repository: {repo_full_name}",
+                tree="",  # gitingest CLI combines everything into one file
+                content=full_content,
                 branch=branch,
             )
 
@@ -246,50 +167,70 @@ class GitHubConnector:
             )
             return digest
 
+        except subprocess.TimeoutExpired:
+            logger.error(f"gitingest timed out for repository: {repo_full_name}")
+            return None
+        except FileNotFoundError:
+            logger.error(
+                "gitingest CLI not found. Falling back to Python library."
+            )
+            # Fall back to Python library
+            return self._ingest_with_python_library(repo_full_name, branch, max_file_size)
         except Exception as e:
             logger.error(f"Failed to ingest repository {repo_full_name}: {e}")
             return None
 
-    async def ingest_repositories(
+    def _ingest_with_python_library(
         self,
-        repo_full_names: list[str],
+        repo_full_name: str,
         branch: str | None = None,
-        include_patterns: list[str] | None = None,
-        exclude_patterns: list[str] | None = None,
         max_file_size: int = MAX_FILE_SIZE,
-    ) -> list[RepositoryDigest]:
+    ) -> RepositoryDigest | None:
         """
-        Ingest multiple repositories and return their digests.
-
-        Args:
-            repo_full_names: List of repository full names (e.g., ['owner/repo1', 'owner/repo2']).
-            branch: Optional specific branch or tag to ingest (applied to all repos).
-            include_patterns: Optional list of glob patterns for files to include.
-            exclude_patterns: Optional list of glob patterns for files to exclude.
-            max_file_size: Maximum file size in bytes to include.
-
-        Returns:
-            List of RepositoryDigest objects for successfully ingested repositories.
+        Fallback: Ingest using the Python library directly.
         """
-        digests = []
+        from gitingest import ingest
 
-        for repo_full_name in repo_full_names:
-            if not repo_full_name or not isinstance(repo_full_name, str):
-                logger.warning(f"Skipping invalid repository entry: {repo_full_name}")
-                continue
+        repo_url = f"https://github.com/{repo_full_name}"
 
-            digest = await self.ingest_repository(
+        logger.info(f"Using Python gitingest library for: {repo_full_name}")
+
+        try:
+            kwargs = {
+                "max_file_size": max_file_size,
+                "exclude_patterns": [
+                    "node_modules/*",
+                    "vendor/*",
+                    ".git/*",
+                    "__pycache__/*",
+                    "dist/*",
+                    "build/*",
+                    "*.lock",
+                    "package-lock.json",
+                ],
+                "include_gitignored": False,
+                "include_submodules": False,
+            }
+
+            if self.token:
+                kwargs["token"] = self.token
+            if branch:
+                kwargs["branch"] = branch
+
+            summary, tree, content = ingest(repo_url, **kwargs)
+
+            if not content or not content.strip():
+                logger.warning(f"No content from {repo_full_name}")
+                return None
+
+            return RepositoryDigest(
                 repo_full_name=repo_full_name,
+                summary=summary,
+                tree=tree,
+                content=content,
                 branch=branch,
-                include_patterns=include_patterns,
-                exclude_patterns=exclude_patterns,
-                max_file_size=max_file_size,
             )
 
-            if digest:
-                digests.append(digest)
-
-        logger.info(
-            f"Ingested {len(digests)} out of {len(repo_full_names)} repositories."
-        )
-        return digests
+        except Exception as e:
+            logger.error(f"Python library failed for {repo_full_name}: {e}")
+            return None
diff --git a/surfsense_backend/app/tasks/connector_indexers/github_indexer.py b/surfsense_backend/app/tasks/connector_indexers/github_indexer.py
index f1ccabdef..f16ee0156 100644
--- a/surfsense_backend/app/tasks/connector_indexers/github_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/github_indexer.py
@@ -173,8 +173,13 @@ async def index_github_repos(
             logger.info(f"Ingesting repository: {repo_full_name}")
 
             try:
-                # Ingest the entire repository
-                digest = await github_client.ingest_repository(repo_full_name)
+                # Run gitingest via subprocess (isolated from event loop)
+                # Using to_thread to not block the async database operations
+                import asyncio
+
+                digest = await asyncio.to_thread(
+                    github_client.ingest_repository, repo_full_name
+                )
 
                 if not digest:
                     logger.warning(
diff --git a/surfsense_backend/app/utils/validators.py b/surfsense_backend/app/utils/validators.py
index 54e681518..6a87679ec 100644
--- a/surfsense_backend/app/utils/validators.py
+++ b/surfsense_backend/app/utils/validators.py
@@ -530,7 +530,10 @@ def validate_connector_config(
         #     "validators": {},
         # },
         "GITHUB_CONNECTOR": {
-            "required": ["GITHUB_PAT", "repo_full_names"],
+            # GITHUB_PAT is optional - only required for private repositories
+            # Public repositories can be indexed without authentication
+            "required": ["repo_full_names"],
+            "optional": ["GITHUB_PAT"],  # Optional - only needed for private repos
             "validators": {
                 "repo_full_names": lambda: validate_list_field(
                     "repo_full_names", "repo_full_names"
diff --git a/surfsense_web/components/assistant-ui/connector-popup/config/connector-status-config.json b/surfsense_web/components/assistant-ui/connector-popup/config/connector-status-config.json
index 6ed792b8e..b729c3f8b 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/config/connector-status-config.json
+++ b/surfsense_web/components/assistant-ui/connector-popup/config/connector-status-config.json
@@ -24,11 +24,6 @@
 			"enabled": true,
 			"status": "warning",
 			"statusMessage": "Some requests may be blocked if not using Firecrawl."
-		},
-		"GITHUB_CONNECTOR": {
-			"enabled": false,
-			"status": "maintenance",
-			"statusMessage": "Rework in progress."
 		}
 	},
 	"globalSettings": {
diff --git a/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/github-connect-form.tsx b/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/github-connect-form.tsx
index 6ed36e180..72d5811d3 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/github-connect-form.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/github-connect-form.tsx
@@ -96,6 +96,7 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
 					repo_full_names: repoList,
 				},
 				is_indexable: true,
+				is_active: true,
 				last_indexed_at: null,
 				periodic_indexing_enabled: periodicEnabled,
 				indexing_frequency_minutes: periodicEnabled ? parseInt(frequencyMinutes, 10) : null,
@@ -119,16 +120,16 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
 					<AlertTitle className="text-xs sm:text-sm">Personal Access Token (Optional)</AlertTitle>
 					<AlertDescription className="text-[10px] sm:text-xs !pl-0">
 						A GitHub PAT is only required for private repositories. Public repos work without a
-						token. Create one from{" "}
+						token. {" "}
 						<a
-							href="https://github.com/settings/tokens"
+							href="https://github.com/settings/tokens/new?description=surfsense&scopes=repo"
 							target="_blank"
 							rel="noopener noreferrer"
 							className="font-medium underline underline-offset-4"
 						>
-							GitHub Settings
+							Get your token
 						</a>{" "}
-						if needed.
+						.
 					</AlertDescription>
 				</div>
 			</Alert>
@@ -324,20 +325,21 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
 						<div>
 							<h3 className="text-sm sm:text-base font-semibold mb-2">How it works</h3>
 							<p className="text-[10px] sm:text-xs text-muted-foreground">
-								The GitHub connector uses a Personal Access Token (PAT) to authenticate with the
-								GitHub API. You provide a comma-separated list of repository full names (e.g.,
-								"owner/repo1, owner/repo2") that you want to index. The connector indexes relevant
-								files (code, markdown, text) from the selected repositories.
+								The GitHub connector ingests entire repositories in one pass using gitingest,
+								making it highly efficient. Provide a comma-separated list of repository full
+								names (e.g., "owner/repo1, owner/repo2") to index.
 							</p>
 							<ul className="mt-2 list-disc pl-5 text-[10px] sm:text-xs text-muted-foreground space-y-1">
 								<li>
-									The connector indexes files based on common code and documentation extensions.
+									<strong>Public repos:</strong> No authentication required.
 								</li>
-								<li>Large files (over 1MB) are skipped during indexing.</li>
-								<li>Only specified repositories are indexed.</li>
 								<li>
-									Indexing runs periodically (check connector settings for frequency) to keep
-									content up-to-date.
+									<strong>Private repos:</strong> Requires a GitHub Personal Access Token (PAT).
+								</li>
+								<li>Indexes code, documentation, and configuration files.</li>
+								<li>Large files (over 5MB) and binary files are automatically skipped.</li>
+								<li>
+									Periodic sync detects changes and only re-indexes when content has changed.
 								</li>
 							</ul>
 						</div>
@@ -348,19 +350,23 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
 								<Alert className="bg-slate-400/5 dark:bg-white/5 border-slate-400/20 mb-4">
 									<Info className="h-3 w-3 sm:h-4 sm:w-4" />
 									<AlertTitle className="text-[10px] sm:text-xs">
-										Personal Access Token Required
+										Personal Access Token (Optional)
 									</AlertTitle>
 									<AlertDescription className="text-[9px] sm:text-[10px]">
-										You'll need a GitHub PAT with the appropriate scopes (e.g., 'repo') to fetch
-										repositories. The PAT will be stored securely to enable indexing.
+										A GitHub PAT is only needed for <strong>private repositories</strong>. Public
+										repos can be indexed without authentication. If you need to access private
+										repos, create a PAT with the 'repo' scope.
 									</AlertDescription>
 								</Alert>
 
 								<div className="space-y-4 sm:space-y-6">
 									<div>
 										<h4 className="text-[10px] sm:text-xs font-medium mb-2">
-											Step 1: Generate GitHub PAT
+											For Private Repositories Only: Generate GitHub PAT
 										</h4>
+										<p className="text-[10px] sm:text-xs text-muted-foreground mb-2">
+											Skip this step if you're only indexing public repositories.
+										</p>
 										<ol className="list-decimal pl-5 space-y-2 text-[10px] sm:text-xs text-muted-foreground">
 											<li>
 												Go to your GitHub{" "}
@@ -375,46 +381,36 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
 											</li>
 											<li>
 												Click on <strong>Personal access tokens</strong>, then choose{" "}
-												<strong>Tokens (classic)</strong> or <strong>Fine-grained tokens</strong>{" "}
-												(recommended if available).
+												<strong>Tokens (classic)</strong> or <strong>Fine-grained tokens</strong>.
 											</li>
 											<li>
-												Click <strong>Generate new token</strong> (and choose the appropriate type).
+												Click <strong>Generate new token</strong>.
 											</li>
 											<li>Give your token a descriptive name (e.g., "SurfSense Connector").</li>
-											<li>Set an expiration date for the token (recommended for security).</li>
 											<li>
-												Under <strong>Select scopes</strong> (for classic tokens) or{" "}
-												<strong>Repository access</strong> (for fine-grained), grant the necessary
-												permissions. At minimum, the <strong>`repo`</strong> scope (or equivalent
-												read access to repositories for fine-grained tokens) is required to read
-												repository content.
+												Grant the <strong>`repo`</strong> scope (for classic tokens) or read access
+												to the specific repositories you want to index (for fine-grained tokens).
 											</li>
 											<li>
-												Click <strong>Generate token</strong>.
-											</li>
-											<li>
-												<strong>Important:</strong> Copy your new PAT immediately. You won't be able
-												to see it again after leaving the page.
+												Click <strong>Generate token</strong> and copy it immediately.
 											</li>
 										</ol>
 									</div>
 
 									<div>
 										<h4 className="text-[10px] sm:text-xs font-medium mb-2">
-											Step 2: Specify repositories
+											Specify Repositories
 										</h4>
 										<p className="text-[10px] sm:text-xs text-muted-foreground mb-3">
 											Enter a comma-separated list of repository full names in the format
-											"owner/repo1, owner/repo2". The connector will index files from only the
-											specified repositories.
+											"owner/repo1, owner/repo2". For example: "facebook/react, vercel/next.js".
 										</p>
 										<Alert className="bg-slate-400/5 dark:bg-white/5 border-slate-400/20">
 											<Info className="h-3 w-3 sm:h-4 sm:w-4" />
-											<AlertTitle className="text-[10px] sm:text-xs">Repository Access</AlertTitle>
+											<AlertTitle className="text-[10px] sm:text-xs">Public vs Private</AlertTitle>
 											<AlertDescription className="text-[9px] sm:text-[10px]">
-												Make sure your PAT has access to all repositories you want to index. Private
-												repositories require appropriate permissions.
+												Public repositories work without a PAT. For private repositories, ensure
+												your PAT has access to the repos you want to index.
 											</AlertDescription>
 										</Alert>
 									</div>
@@ -424,36 +420,38 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
 
 						<div className="space-y-4">
 							<div>
-								<h3 className="text-sm sm:text-base font-semibold mb-2">Indexing</h3>
+								<h3 className="text-sm sm:text-base font-semibold mb-2">Quick Start</h3>
 								<ol className="list-decimal pl-5 space-y-2 text-[10px] sm:text-xs text-muted-foreground mb-4">
 									<li>
-										Navigate to the Connector Dashboard and select the <strong>GitHub</strong>{" "}
-										Connector.
+										Enter the <strong>Repository Names</strong> you want to index (e.g.,
+										"facebook/react, vercel/next.js").
 									</li>
 									<li>
-										Enter your <strong>GitHub Personal Access Token</strong> in the form field.
+										<strong>(Optional)</strong> Add a GitHub PAT if indexing private repositories.
 									</li>
 									<li>
-										Enter a comma-separated list of <strong>Repository Names</strong> (e.g.,
-										"owner/repo1, owner/repo2").
+										Click <strong>Connect GitHub</strong> to start indexing.
 									</li>
 									<li>
-										Click <strong>Connect</strong> to establish the connection.
+										Enable <strong>Periodic Sync</strong> to automatically detect and index
+										changes.
 									</li>
-									<li>Once connected, your GitHub repositories will be indexed automatically.</li>
 								</ol>
 
 								<Alert className="bg-slate-400/5 dark:bg-white/5 border-slate-400/20">
 									<Info className="h-3 w-3 sm:h-4 sm:w-4" />
 									<AlertTitle className="text-[10px] sm:text-xs">What Gets Indexed</AlertTitle>
 									<AlertDescription className="text-[9px] sm:text-[10px]">
-										<p className="mb-2">The GitHub connector indexes the following data:</p>
+										<p className="mb-2">The GitHub connector indexes:</p>
 										<ul className="list-disc pl-5 space-y-1">
-											<li>Code files from selected repositories</li>
-											<li>README files and Markdown documentation</li>
-											<li>Common text-based file formats</li>
-											<li>Repository metadata and structure</li>
+											<li>All code files (Python, JavaScript, TypeScript, etc.)</li>
+											<li>Documentation (README, Markdown, text files)</li>
+											<li>Configuration files (JSON, YAML, TOML, etc.)</li>
+											<li>Repository structure and file tree</li>
 										</ul>
+										<p className="mt-2">
+											Binary files, images, and build artifacts are automatically excluded.
+										</p>
 									</AlertDescription>
 								</Alert>
 							</div>
diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/github-config.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/github-config.tsx
index 07c7bdfbc..d5169b49d 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/github-config.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/github-config.tsx
@@ -1,8 +1,9 @@
 "use client";
 
-import { KeyRound } from "lucide-react";
+import { Info, KeyRound } from "lucide-react";
 import type { FC } from "react";
 import { useEffect, useState } from "react";
+import { Alert, AlertDescription, AlertTitle } from "@/components/ui/alert";
 import { Badge } from "@/components/ui/badge";
 import { Input } from "@/components/ui/input";
 import { Label } from "@/components/ui/label";
@@ -79,6 +80,26 @@ export const GithubConfig: FC<GithubConfigProps> = ({
 
 	return (
 		<div className="space-y-6">
+			<Alert className="bg-slate-400/5 dark:bg-white/5 border-slate-400/20 p-2 sm:p-3 flex items-center [&>svg]:relative [&>svg]:left-0 [&>svg]:top-0 [&>svg+div]:translate-y-0">
+				<Info className="h-3 w-3 sm:h-4 sm:w-4 shrink-0 ml-1" />
+				<div className="-ml-1">
+					<AlertTitle className="text-xs sm:text-sm">Personal Access Token (Optional)</AlertTitle>
+					<AlertDescription className="text-[10px] sm:text-xs !pl-0">
+						A GitHub PAT is only required for private repositories. Public repos work without a
+						token. Create one from{" "}
+						<a
+							href="https://github.com/settings/tokens/new?description=surfsense&scopes=repo"
+							target="_blank"
+							rel="noopener noreferrer"
+							className="font-medium underline underline-offset-4"
+						>
+							GitHub Settings
+						</a>{" "}
+						if needed.
+					</AlertDescription>
+				</div>
+			</Alert>
+
 			{/* Connector Name */}
 			<div className="rounded-xl border border-border bg-slate-400/5 dark:bg-white/5 p-3 sm:p-6 space-y-3 sm:space-y-4">
 				<div className="space-y-2">
@@ -105,7 +126,7 @@ export const GithubConfig: FC<GithubConfigProps> = ({
 					<div className="space-y-2">
 						<Label className="flex items-center gap-2 text-xs sm:text-sm">
 							<KeyRound className="h-4 w-4" />
-							GitHub Personal Access Token
+							GitHub Personal Access Token (optional)
 						</Label>
 						<Input
 							type="password"
diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx
index 5433acbf7..66afd84a5 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx
@@ -206,9 +206,10 @@ export const ConnectorEditView: FC<ConnectorEditViewProps> = ({
 						{/* Date range selector and periodic sync - only shown for indexable connectors */}
 						{connector.is_indexable && (
 							<>
-								{/* Date range selector - not shown for Google Drive (uses folder selection) or Webcrawler (uses config) */}
+								{/* Date range selector - not shown for Google Drive, Webcrawler, or GitHub (indexes full repo snapshots) */}
 								{connector.connector_type !== "GOOGLE_DRIVE_CONNECTOR" &&
-									connector.connector_type !== "WEBCRAWLER_CONNECTOR" && (
+									connector.connector_type !== "WEBCRAWLER_CONNECTOR" &&
+									connector.connector_type !== "GITHUB_CONNECTOR" && (
 										<DateRangeSelector
 											startDate={startDate}
 											endDate={endDate}
diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/indexing-configuration-view.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/indexing-configuration-view.tsx
index 8c05c8d4a..ea489aec8 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/indexing-configuration-view.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/indexing-configuration-view.tsx
@@ -151,9 +151,10 @@ export const IndexingConfigurationView: FC<IndexingConfigurationViewProps> = ({
 						{/* Date range selector and periodic sync - only shown for indexable connectors */}
 						{connector?.is_indexable && (
 							<>
-								{/* Date range selector - not shown for Google Drive (uses folder selection) or Webcrawler (uses config) */}
+								{/* Date range selector - not shown for Google Drive, Webcrawler, or GitHub (indexes full repo snapshots) */}
 								{config.connectorType !== "GOOGLE_DRIVE_CONNECTOR" &&
-									config.connectorType !== "WEBCRAWLER_CONNECTOR" && (
+									config.connectorType !== "WEBCRAWLER_CONNECTOR" &&
+									config.connectorType !== "GITHUB_CONNECTOR" && (
 										<DateRangeSelector
 											startDate={startDate}
 											endDate={endDate}

From 6dd535c85a85944b6324b2c07f3c21fbfc1294ba Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Wed, 21 Jan 2026 00:19:17 +0530
Subject: [PATCH 4/7] fix: Clean up GitHub connector UI and documentation

- Removed unnecessary period from the GitHub connect form alert description.
- Moved helper functions for string and array conversion outside the component to avoid useEffect dependency issues.
- Updated the GitHub connector documentation to provide detailed indexing information and troubleshooting tips for users.
---
 .../components/github-connect-form.tsx        |  1 -
 .../components/github-config.tsx              | 67 +++++++---------
 .../content/docs/connectors/github.mdx        | 79 ++++++++++++++++++-
 3 files changed, 107 insertions(+), 40 deletions(-)

diff --git a/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/github-connect-form.tsx b/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/github-connect-form.tsx
index 72d5811d3..4fb9e93bf 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/github-connect-form.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/github-connect-form.tsx
@@ -129,7 +129,6 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
 						>
 							Get your token
 						</a>{" "}
-						.
 					</AlertDescription>
 				</div>
 			</Alert>
diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/github-config.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/github-config.tsx
index d5169b49d..2c28758b8 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/github-config.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/github-config.tsx
@@ -1,9 +1,8 @@
 "use client";
 
-import { Info, KeyRound } from "lucide-react";
+import { KeyRound } from "lucide-react";
 import type { FC } from "react";
-import { useEffect, useState } from "react";
-import { Alert, AlertDescription, AlertTitle } from "@/components/ui/alert";
+import { useEffect, useRef, useState } from "react";
 import { Badge } from "@/components/ui/badge";
 import { Input } from "@/components/ui/input";
 import { Label } from "@/components/ui/label";
@@ -13,25 +12,29 @@ export interface GithubConfigProps extends ConnectorConfigProps {
 	onNameChange?: (name: string) => void;
 }
 
+// Helper functions moved outside component to avoid useEffect dependency issues
+const stringToArray = (arr: string[] | string | undefined): string[] => {
+	if (Array.isArray(arr)) return arr;
+	if (typeof arr === "string") {
+		return arr
+			.split(",")
+			.map((item) => item.trim())
+			.filter((item) => item.length > 0);
+	}
+	return [];
+};
+
+const arrayToString = (arr: string[]): string => {
+	return arr.join(", ");
+};
+
 export const GithubConfig: FC<GithubConfigProps> = ({
 	connector,
 	onConfigChange,
 	onNameChange,
 }) => {
-	const stringToArray = (arr: string[] | string | undefined): string[] => {
-		if (Array.isArray(arr)) return arr;
-		if (typeof arr === "string") {
-			return arr
-				.split(",")
-				.map((item) => item.trim())
-				.filter((item) => item.length > 0);
-		}
-		return [];
-	};
-
-	const arrayToString = (arr: string[]): string => {
-		return arr.join(", ");
-	};
+	// Track internal changes to prevent useEffect from overwriting user input
+	const isInternalChange = useRef(false);
 
 	const [githubPat, setGithubPat] = useState<string>(
 		(connector.config?.GITHUB_PAT as string) || ""
@@ -41,8 +44,13 @@ export const GithubConfig: FC<GithubConfigProps> = ({
 	);
 	const [name, setName] = useState<string>(connector.name || "");
 
-	// Update values when connector changes
+	// Update values when connector changes externally (not from our own input)
 	useEffect(() => {
+		// Skip if this is our own internal change
+		if (isInternalChange.current) {
+			isInternalChange.current = false;
+			return;
+		}
 		const pat = (connector.config?.GITHUB_PAT as string) || "";
 		const repos = arrayToString(stringToArray(connector.config?.repo_full_names));
 		setGithubPat(pat);
@@ -51,6 +59,7 @@ export const GithubConfig: FC<GithubConfigProps> = ({
 	}, [connector.config, connector.name]);
 
 	const handleGithubPatChange = (value: string) => {
+		isInternalChange.current = true;
 		setGithubPat(value);
 		if (onConfigChange) {
 			onConfigChange({
@@ -61,6 +70,7 @@ export const GithubConfig: FC<GithubConfigProps> = ({
 	};
 
 	const handleRepoFullNamesChange = (value: string) => {
+		isInternalChange.current = true;
 		setRepoFullNames(value);
 		const repoList = stringToArray(value);
 		if (onConfigChange) {
@@ -72,6 +82,7 @@ export const GithubConfig: FC<GithubConfigProps> = ({
 	};
 
 	const handleNameChange = (value: string) => {
+		isInternalChange.current = true;
 		setName(value);
 		if (onNameChange) {
 			onNameChange(value);
@@ -80,26 +91,6 @@ export const GithubConfig: FC<GithubConfigProps> = ({
 
 	return (
 		<div className="space-y-6">
-			<Alert className="bg-slate-400/5 dark:bg-white/5 border-slate-400/20 p-2 sm:p-3 flex items-center [&>svg]:relative [&>svg]:left-0 [&>svg]:top-0 [&>svg+div]:translate-y-0">
-				<Info className="h-3 w-3 sm:h-4 sm:w-4 shrink-0 ml-1" />
-				<div className="-ml-1">
-					<AlertTitle className="text-xs sm:text-sm">Personal Access Token (Optional)</AlertTitle>
-					<AlertDescription className="text-[10px] sm:text-xs !pl-0">
-						A GitHub PAT is only required for private repositories. Public repos work without a
-						token. Create one from{" "}
-						<a
-							href="https://github.com/settings/tokens/new?description=surfsense&scopes=repo"
-							target="_blank"
-							rel="noopener noreferrer"
-							className="font-medium underline underline-offset-4"
-						>
-							GitHub Settings
-						</a>{" "}
-						if needed.
-					</AlertDescription>
-				</div>
-			</Alert>
-
 			{/* Connector Name */}
 			<div className="rounded-xl border border-border bg-slate-400/5 dark:bg-white/5 p-3 sm:p-6 space-y-3 sm:space-y-4">
 				<div className="space-y-2">
diff --git a/surfsense_web/content/docs/connectors/github.mdx b/surfsense_web/content/docs/connectors/github.mdx
index bb2faca81..6a4574ec4 100644
--- a/surfsense_web/content/docs/connectors/github.mdx
+++ b/surfsense_web/content/docs/connectors/github.mdx
@@ -3,4 +3,81 @@ title: GitHub
 description: Connect your GitHub repositories to SurfSense
 ---
 
-# Documentation in progress
\ No newline at end of file
+# GitHub Connector
+
+Connect your GitHub repositories to SurfSense for code search and AI-powered insights. The connector uses [gitingest](https://gitingest.com) to efficiently index entire codebases.
+
+## What Gets Indexed
+
+| Content Type | Examples |
+|--------------|----------|
+| Code Files | Python, JavaScript, TypeScript, Go, Rust, Java, etc. |
+| Documentation | README files, Markdown documents, text files |
+| Configuration | JSON, YAML, TOML, .env examples, Dockerfiles |
+
+> ⚠️ Binary files and files larger than 5MB are automatically excluded.
+
+---
+
+## Quick Start (Public Repos)
+
+1. Navigate to **Connectors** → **Add Connector** → **GitHub**
+2. Enter repository names: `owner/repo` (e.g., `facebook/react, vercel/next.js`)
+3. Click **Connect GitHub**
+
+No authentication required for public repositories.
+
+---
+
+## Private Repositories
+
+For private repos, you need a GitHub Personal Access Token (PAT).
+
+### Generate a PAT
+
+1. Go to [GitHub's token creation page](https://github.com/settings/tokens/new?description=surfsense&scopes=repo) (pre-filled with `repo` scope)
+2. Set an expiration
+3. Click **Generate token** and copy it
+
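+> ⚠️ Classic tokens start with `ghp_` (fine-grained tokens start with `github_pat_`). GitHub shows the token only once, so copy it immediately and store it securely.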
+
+---
+
+## Connector Configuration
+
+| Field | Description | Required |
+|-------|-------------|----------|
+| **Connector Name** | A friendly name to identify this connector | Yes |
+| **GitHub Personal Access Token** | Your PAT (only for private repos) | No |
+| **Repository Names** | Comma-separated list: `owner/repo1, owner/repo2` | Yes |
+
+---
+
+## Periodic Sync
+
+Enable periodic sync to automatically re-index repositories when content changes:
+
+| Frequency | Use Case |
+|-----------|----------|
+| Every 5 minutes | Active development |
+| Every 15 minutes | Frequent commits |
+| Every hour | Regular workflow |
+| Every 6 hours | Less active repos |
+| Daily | Reference repositories |
+| Weekly | Stable codebases |
+
+---
+
+## Troubleshooting
+
+**Repository not found**
+- Verify format is `owner/repo`
+- For private repos, ensure the PAT has access to that repository
+
+**Authentication failed**
+- Check that the PAT is valid and has not expired
+- Token should start with `ghp_` or `github_pat_`
+
+**Rate limit exceeded**
+- Use a PAT for higher limits (5,000/hour vs 60 unauthenticated)
+- Reduce sync frequency

From 8bd1ba025161d8d6b97a43e87dd92fbdbc8b595d Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Wed, 21 Jan 2026 00:21:21 +0530
Subject: [PATCH 5/7] refactor: Simplify GitHub connect form by removing unused
 components and documentation sections

---
 .../components/github-connect-form.tsx        | 157 ------------------
 1 file changed, 157 deletions(-)

diff --git a/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/github-connect-form.tsx b/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/github-connect-form.tsx
index 4fb9e93bf..833acf594 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/github-connect-form.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/github-connect-form.tsx
@@ -8,9 +8,6 @@ import { useForm } from "react-hook-form";
 import * as z from "zod";
 import {
 	Accordion,
-	AccordionContent,
-	AccordionItem,
-	AccordionTrigger,
 } from "@/components/ui/accordion";
 import { Alert, AlertDescription, AlertTitle } from "@/components/ui/alert";
 import { Badge } from "@/components/ui/badge";
@@ -34,7 +31,6 @@ import {
 } from "@/components/ui/select";
 import { Switch } from "@/components/ui/switch";
 import { EnumConnectorName } from "@/contracts/enums/connector";
-import { getConnectorBenefits } from "../connector-benefits";
 import type { ConnectFormProps } from "../index";
 
 const githubConnectorFormSchema = z.object({
@@ -298,165 +294,12 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
 				</Form>
 			</div>
 
-			{/* What you get section */}
-			{getConnectorBenefits(EnumConnectorName.GITHUB_CONNECTOR) && (
-				<div className="rounded-xl border border-border bg-slate-400/5 dark:bg-white/5 px-3 sm:px-6 py-4 space-y-2">
-					<h4 className="text-xs sm:text-sm font-medium">What you get with GitHub integration:</h4>
-					<ul className="list-disc pl-5 text-[10px] sm:text-xs text-muted-foreground space-y-1">
-						{getConnectorBenefits(EnumConnectorName.GITHUB_CONNECTOR)?.map((benefit) => (
-							<li key={benefit}>{benefit}</li>
-						))}
-					</ul>
-				</div>
-			)}
-
 			{/* Documentation Section */}
 			<Accordion
 				type="single"
 				collapsible
 				className="w-full border border-border rounded-xl bg-slate-400/5 dark:bg-white/5"
 			>
-				<AccordionItem value="documentation" className="border-0">
-					<AccordionTrigger className="text-sm sm:text-base font-medium px-3 sm:px-6 no-underline hover:no-underline">
-						Documentation
-					</AccordionTrigger>
-					<AccordionContent className="px-3 sm:px-6 pb-3 sm:pb-6 space-y-6">
-						<div>
-							<h3 className="text-sm sm:text-base font-semibold mb-2">How it works</h3>
-							<p className="text-[10px] sm:text-xs text-muted-foreground">
-								The GitHub connector ingests entire repositories in one pass using gitingest,
-								making it highly efficient. Provide a comma-separated list of repository full
-								names (e.g., "owner/repo1, owner/repo2") to index.
-							</p>
-							<ul className="mt-2 list-disc pl-5 text-[10px] sm:text-xs text-muted-foreground space-y-1">
-								<li>
-									<strong>Public repos:</strong> No authentication required.
-								</li>
-								<li>
-									<strong>Private repos:</strong> Requires a GitHub Personal Access Token (PAT).
-								</li>
-								<li>Indexes code, documentation, and configuration files.</li>
-								<li>Large files (over 5MB) and binary files are automatically skipped.</li>
-								<li>
-									Periodic sync detects changes and only re-indexes when content has changed.
-								</li>
-							</ul>
-						</div>
-
-						<div className="space-y-4">
-							<div>
-								<h3 className="text-sm sm:text-base font-semibold mb-2">Authorization</h3>
-								<Alert className="bg-slate-400/5 dark:bg-white/5 border-slate-400/20 mb-4">
-									<Info className="h-3 w-3 sm:h-4 sm:w-4" />
-									<AlertTitle className="text-[10px] sm:text-xs">
-										Personal Access Token (Optional)
-									</AlertTitle>
-									<AlertDescription className="text-[9px] sm:text-[10px]">
-										A GitHub PAT is only needed for <strong>private repositories</strong>. Public
-										repos can be indexed without authentication. If you need to access private
-										repos, create a PAT with the 'repo' scope.
-									</AlertDescription>
-								</Alert>
-
-								<div className="space-y-4 sm:space-y-6">
-									<div>
-										<h4 className="text-[10px] sm:text-xs font-medium mb-2">
-											For Private Repositories Only: Generate GitHub PAT
-										</h4>
-										<p className="text-[10px] sm:text-xs text-muted-foreground mb-2">
-											Skip this step if you're only indexing public repositories.
-										</p>
-										<ol className="list-decimal pl-5 space-y-2 text-[10px] sm:text-xs text-muted-foreground">
-											<li>
-												Go to your GitHub{" "}
-												<a
-													href="https://github.com/settings/tokens"
-													target="_blank"
-													rel="noopener noreferrer"
-													className="font-medium underline underline-offset-4"
-												>
-													Developer settings
-												</a>
-											</li>
-											<li>
-												Click on <strong>Personal access tokens</strong>, then choose{" "}
-												<strong>Tokens (classic)</strong> or <strong>Fine-grained tokens</strong>.
-											</li>
-											<li>
-												Click <strong>Generate new token</strong>.
-											</li>
-											<li>Give your token a descriptive name (e.g., "SurfSense Connector").</li>
-											<li>
-												Grant the <strong>`repo`</strong> scope (for classic tokens) or read access
-												to the specific repositories you want to index (for fine-grained tokens).
-											</li>
-											<li>
-												Click <strong>Generate token</strong> and copy it immediately.
-											</li>
-										</ol>
-									</div>
-
-									<div>
-										<h4 className="text-[10px] sm:text-xs font-medium mb-2">
-											Specify Repositories
-										</h4>
-										<p className="text-[10px] sm:text-xs text-muted-foreground mb-3">
-											Enter a comma-separated list of repository full names in the format
-											"owner/repo1, owner/repo2". For example: "facebook/react, vercel/next.js".
-										</p>
-										<Alert className="bg-slate-400/5 dark:bg-white/5 border-slate-400/20">
-											<Info className="h-3 w-3 sm:h-4 sm:w-4" />
-											<AlertTitle className="text-[10px] sm:text-xs">Public vs Private</AlertTitle>
-											<AlertDescription className="text-[9px] sm:text-[10px]">
-												Public repositories work without a PAT. For private repositories, ensure
-												your PAT has access to the repos you want to index.
-											</AlertDescription>
-										</Alert>
-									</div>
-								</div>
-							</div>
-						</div>
-
-						<div className="space-y-4">
-							<div>
-								<h3 className="text-sm sm:text-base font-semibold mb-2">Quick Start</h3>
-								<ol className="list-decimal pl-5 space-y-2 text-[10px] sm:text-xs text-muted-foreground mb-4">
-									<li>
-										Enter the <strong>Repository Names</strong> you want to index (e.g.,
-										"facebook/react, vercel/next.js").
-									</li>
-									<li>
-										<strong>(Optional)</strong> Add a GitHub PAT if indexing private repositories.
-									</li>
-									<li>
-										Click <strong>Connect GitHub</strong> to start indexing.
-									</li>
-									<li>
-										Enable <strong>Periodic Sync</strong> to automatically detect and index
-										changes.
-									</li>
-								</ol>
-
-								<Alert className="bg-slate-400/5 dark:bg-white/5 border-slate-400/20">
-									<Info className="h-3 w-3 sm:h-4 sm:w-4" />
-									<AlertTitle className="text-[10px] sm:text-xs">What Gets Indexed</AlertTitle>
-									<AlertDescription className="text-[9px] sm:text-[10px]">
-										<p className="mb-2">The GitHub connector indexes:</p>
-										<ul className="list-disc pl-5 space-y-1">
-											<li>All code files (Python, JavaScript, TypeScript, etc.)</li>
-											<li>Documentation (README, Markdown, text files)</li>
-											<li>Configuration files (JSON, YAML, TOML, etc.)</li>
-											<li>Repository structure and file tree</li>
-										</ul>
-										<p className="mt-2">
-											Binary files, images, and build artifacts are automatically excluded.
-										</p>
-									</AlertDescription>
-								</Alert>
-							</div>
-						</div>
-					</AccordionContent>
-				</AccordionItem>
 			</Accordion>
 		</div>
 	);

From 5a95a6b543c75035181fd5b89e71f6fb5f605178 Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Wed, 21 Jan 2026 01:21:25 +0530
Subject: [PATCH 6/7] feat: Add documentation link to GitHub connect form

- Replaced the removed Accordion component with a direct link to the GitHub connector documentation.
- Enhanced user experience by providing easy access to relevant documentation.
---
 .../components/github-connect-form.tsx        | 25 +++++++++++--------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/github-connect-form.tsx b/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/github-connect-form.tsx
index 833acf594..f83ae0788 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/github-connect-form.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/github-connect-form.tsx
@@ -1,14 +1,12 @@
 "use client";
 
 import { zodResolver } from "@hookform/resolvers/zod";
-import { Info } from "lucide-react";
+import { ExternalLink, Info } from "lucide-react";
+import Link from "next/link";
 import type { FC } from "react";
 import { useRef, useState } from "react";
 import { useForm } from "react-hook-form";
 import * as z from "zod";
-import {
-	Accordion,
-} from "@/components/ui/accordion";
 import { Alert, AlertDescription, AlertTitle } from "@/components/ui/alert";
 import { Badge } from "@/components/ui/badge";
 import {
@@ -294,13 +292,18 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
 				</Form>
 			</div>
 
-			{/* Documentation Section */}
-			<Accordion
-				type="single"
-				collapsible
-				className="w-full border border-border rounded-xl bg-slate-400/5 dark:bg-white/5"
-			>
-			</Accordion>
+			{/* Documentation Link */}
+			<div>
+				<Link
+					href="/docs/connectors/github"
+					target="_blank"
+					rel="noopener noreferrer"
+					className="text-xs sm:text-sm font-medium underline underline-offset-4 hover:text-primary transition-colors inline-flex items-center gap-1.5"
+				>
+					View GitHub Connector Documentation
+					<ExternalLink className="h-3 w-3 sm:h-4 sm:w-4" />
+				</Link>
+			</div>
 		</div>
 	);
 };

From d35d89f3a934d326ccf247219881e24c9ba4600f Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Wed, 21 Jan 2026 03:28:34 +0530
Subject: [PATCH 7/7] chore: ran linting

---
 .../app/dashboard/[search_space_id]/team/page.tsx  | 10 ++++------
 .../components/github-connect-form.tsx             | 14 ++++++--------
 2 files changed, 10 insertions(+), 14 deletions(-)

diff --git a/surfsense_web/app/dashboard/[search_space_id]/team/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/team/page.tsx
index f00982555..6701342de 100644
--- a/surfsense_web/app/dashboard/[search_space_id]/team/page.tsx
+++ b/surfsense_web/app/dashboard/[search_space_id]/team/page.tsx
@@ -778,8 +778,7 @@ function RolesTab({
 												role.name === "Owner" && "text-amber-600",
 												role.name === "Editor" && "text-blue-600",
 												role.name === "Viewer" && "text-gray-600",
-												!["Owner", "Editor", "Viewer"].includes(role.name) &&
-													"text-primary"
+												!["Owner", "Editor", "Viewer"].includes(role.name) && "text-primary"
 											)}
 										/>
 									</div>
@@ -1488,7 +1487,8 @@ function CreateRoleDialog({
 							</div>
 						</div>
 						<p className="text-xs text-muted-foreground">
-							Use presets to quickly apply Editor (create/read/update) or Viewer (read-only) permissions
+							Use presets to quickly apply Editor (create/read/update) or Viewer (read-only)
+							permissions
 						</p>
 						<ScrollArea className="h-64 rounded-lg border p-4">
 							<div className="space-y-4">
@@ -1500,9 +1500,7 @@ function CreateRoleDialog({
 
 									return (
 										<div key={category} className="space-y-2">
-											<label
-												className="flex items-center gap-2 cursor-pointer hover:bg-muted/50 p-1 rounded w-full text-left"
-											>
+											<label className="flex items-center gap-2 cursor-pointer hover:bg-muted/50 p-1 rounded w-full text-left">
 												<Checkbox
 													checked={allSelected}
 													onCheckedChange={() => toggleCategory(category)}
diff --git a/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/github-connect-form.tsx b/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/github-connect-form.tsx
index f83ae0788..9d0ef2c45 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/github-connect-form.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/github-connect-form.tsx
@@ -38,12 +38,9 @@ const githubConnectorFormSchema = z.object({
 	github_pat: z
 		.string()
 		.optional()
-		.refine(
-			(pat) => !pat || pat.startsWith("ghp_") || pat.startsWith("github_pat_"),
-			{
-				message: "GitHub PAT should start with 'ghp_' or 'github_pat_'",
-			}
-		),
+		.refine((pat) => !pat || pat.startsWith("ghp_") || pat.startsWith("github_pat_"), {
+			message: "GitHub PAT should start with 'ghp_' or 'github_pat_'",
+		}),
 	repo_full_names: z.string().min(1, {
 		message: "At least one repository is required.",
 	}),
@@ -114,14 +111,15 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
 					<AlertTitle className="text-xs sm:text-sm">Personal Access Token (Optional)</AlertTitle>
 					<AlertDescription className="text-[10px] sm:text-xs !pl-0">
 						A GitHub PAT is only required for private repositories. Public repos work without a
-						token. {" "}
+						token.{" "}
 						<a
 							href="https://github.com/settings/tokens/new?description=surfsense&scopes=repo"
 							target="_blank"
 							rel="noopener noreferrer"
-							className="font-medium underline underline-offset-4"
+							className="font-medium underline underline-offset-4 inline-flex items-center gap-1.5"
 						>
 							Get your token
+							<ExternalLink className="h-3 w-3 sm:h-4 sm:w-4" />
 						</a>{" "}
 					</AlertDescription>
 				</div>