feat: Integrate gitingest for GitHub repository ingestion

- Added gitingest as a dependency to streamline the ingestion of GitHub repositories.
- Refactored GitHubConnector to utilize gitingest for efficient repository digest generation, reducing API calls.
- Updated GitHub indexer to process entire repository digests, enhancing performance and simplifying the indexing process.
- Modified GitHub connect form to indicate that the Personal Access Token is optional for public repositories.
2026-07-12 22:42:13 +02:00 · 2026-01-20 21:52:32 +05:30 · 2026-01-20 21:52:32 +05:30 · 49b8a46d10
commit 49b8a46d10
parent 6e331c3b85
6 changed files with 545 additions and 539 deletions
--- a/surfsense_backend/app/connectors/github_connector.py
+++ b/surfsense_backend/app/connectors/github_connector.py
@ -1,296 +1,295 @@
-import base64
+"""
-import logging
+GitHub connector using gitingest for efficient repository digestion.
 from typing import Any
-from github3 import exceptions as github_exceptions, login as github_login
+This connector replaces the previous file-by-file approach with a single
-from github3.exceptions import ForbiddenError, NotFoundError
+digest generation per repository, dramatically reducing LLM API calls.
-from github3.repos.contents import Contents
+"""
 import logging
 from dataclasses import dataclass
 from gitingest import ingest_async
 logger = logging.getLogger(__name__)
-# List of common code file extensions to target
+# Maximum file size in bytes (5MB)
-CODE_EXTENSIONS = {
+MAX_FILE_SIZE = 5 * 1024 * 1024
    ".py",
    ".js",
    ".jsx",
    ".ts",
    ".tsx",
    ".java",
    ".c",
    ".cpp",
    ".h",
    ".hpp",
    ".cs",
    ".go",
    ".rb",
    ".php",
    ".swift",
    ".kt",
    ".scala",
    ".rs",
    ".m",
    ".sh",
    ".bash",
    ".ps1",
    ".lua",
    ".pl",
    ".pm",
    ".r",
    ".dart",
    ".sql",
 }
-# List of common documentation/text file extensions
+# Default patterns to exclude (recommended approach for comprehensive analysis)
-DOC_EXTENSIONS = {
+# Using only exclude_patterns ensures we don't miss any relevant file types
-    ".md",
+DEFAULT_EXCLUDE_PATTERNS = [
-    ".txt",
+    # Dependencies
-    ".rst",
+    "node_modules/*",
-    ".adoc",
+    "vendor/*",
-    ".html",
+    "bower_components/*",
-    ".htm",
+    ".pnpm/*",
-    ".xml",
+    # Build artifacts / Caches
-    ".json",
+    "build/*",
-    ".yaml",
+    "dist/*",
-    ".yml",
+    "target/*",
-    ".toml",
+    "out/*",
-}
+    "__pycache__/*",
    "*.pyc",
    ".cache/*",
    ".next/*",
    ".nuxt/*",
    # Virtual environments
    "venv/*",
    ".venv/*",
    "env/*",
    ".env/*",
    # IDE/Editor config
    ".vscode/*",
    ".idea/*",
    ".project",
    ".settings/*",
    "*.swp",
    "*.swo",
    # Version control
    ".git/*",
    ".svn/*",
    ".hg/*",
    # Temporary / Logs
    "tmp/*",
    "temp/*",
    "logs/*",
    "*.log",
    # Lock files (usually not needed for understanding code)
    "package-lock.json",
    "pnpm-lock.yaml",
    "yarn.lock",
    "uv.lock",
    "Gemfile.lock",
    "poetry.lock",
    "Cargo.lock",
    "composer.lock",
    # Binary/media files
    "*.png",
    "*.jpg",
    "*.jpeg",
    "*.gif",
    "*.ico",
    "*.svg",
    "*.webp",
    "*.bmp",
    "*.tiff",
    "*.woff",
    "*.woff2",
    "*.ttf",
    "*.eot",
    "*.otf",
    "*.mp3",
    "*.mp4",
    "*.wav",
    "*.ogg",
    "*.webm",
    "*.avi",
    "*.mov",
    "*.pdf",
    "*.doc",
    "*.docx",
    "*.xls",
    "*.xlsx",
    "*.ppt",
    "*.pptx",
    "*.zip",
    "*.tar",
    "*.tar.gz",
    "*.tgz",
    "*.rar",
    "*.7z",
    "*.exe",
    "*.dll",
    "*.so",
    "*.dylib",
    "*.bin",
    "*.obj",
    "*.o",
    "*.a",
    "*.lib",
    # Minified files
    "*.min.js",
    "*.min.css",
    # Source maps
    "*.map",
    # Database files
    "*.db",
    "*.sqlite",
    "*.sqlite3",
    # Coverage reports
    "coverage/*",
    ".coverage",
    "htmlcov/*",
    ".nyc_output/*",
    # Test snapshots (can be large)
    "__snapshots__/*",
 ]
-# Maximum file size in bytes (e.g., 1MB)
+
-MAX_FILE_SIZE = 1 * 1024 * 1024
+@dataclass
 class RepositoryDigest:
    """Represents a digested repository from gitingest."""
    repo_full_name: str
    summary: str
    tree: str
    content: str
    branch: str | None = None
    @property
    def full_digest(self) -> str:
        """Returns the complete digest with tree and content."""
        return f"# Repository: {self.repo_full_name}\n\n## File Structure\n\n{self.tree}\n\n## File Contents\n\n{self.content}"
    @property
    def estimated_tokens(self) -> int:
        """Rough estimate of tokens (1 token ≈ 4 characters)."""
        return len(self.full_digest) // 4
 class GitHubConnector:
-    """Connector for interacting with the GitHub API."""
+    """
    Connector for ingesting GitHub repositories using gitingest.
-    # Directories to skip during file traversal
+    This connector efficiently processes entire repositories into a single
-    SKIPPED_DIRS = {
+    digest, reducing the number of API calls and LLM invocations compared
-        # Version control
+    to file-by-file processing.
-        ".git",
+    """
        # Dependencies
        "node_modules",
        "vendor",
        # Build artifacts / Caches
        "build",
        "dist",
        "target",
        "__pycache__",
        # Virtual environments
        "venv",
        ".venv",
        "env",
        # IDE/Editor config
        ".vscode",
        ".idea",
        ".project",
        ".settings",
        # Temporary / Logs
        "tmp",
        "logs",
        # Add other project-specific irrelevant directories if needed
    }
-    def __init__(self, token: str):
+    def __init__(self, token: str | None = None):
        """
        Initializes the GitHub connector.
        Args:
-            token: GitHub Personal Access Token (PAT).
+            token: Optional GitHub Personal Access Token (PAT).
                   Only required for private repositories.
                   Public repositories can be ingested without a token.
        """
-        if not token:
+        self.token = token if token and token.strip() else None
-            raise ValueError("GitHub token cannot be empty.")
+        if self.token:
-        try:
+            logger.info("GitHub connector initialized with authentication token.")
-            self.gh = github_login(token=token)
+        else:
-            # Try a simple authenticated call to check token validity
+            logger.info("GitHub connector initialized without token (public repos only).")
            self.gh.me()
            logger.info("Successfully authenticated with GitHub API.")
        except (github_exceptions.AuthenticationFailed, ForbiddenError) as e:
            logger.error(f"GitHub authentication failed: {e}")
            raise ValueError("Invalid GitHub token or insufficient permissions.") from e
        except Exception as e:
            logger.error(f"Failed to initialize GitHub client: {e}")
            raise e
-    def get_user_repositories(self) -> list[dict[str, Any]]:
+    async def ingest_repository(
-        """Fetches repositories accessible by the authenticated user."""
+        self,
-        repos_data = []
+        repo_full_name: str,
-        try:
+        branch: str | None = None,
-            # type='owner' fetches repos owned by the user
+        include_patterns: list[str] | None = None,
-            # type='member' fetches repos the user is a collaborator on (including orgs)
+        exclude_patterns: list[str] | None = None,
-            # type='all' fetches both
+        max_file_size: int = MAX_FILE_SIZE,
-            for repo in self.gh.repositories(type="all", sort="updated"):
+    ) -> RepositoryDigest | None:
                repos_data.append(
                    {
                        "id": repo.id,
                        "name": repo.name,
                        "full_name": repo.full_name,
                        "private": repo.private,
                        "url": repo.html_url,
                        "description": repo.description or "",
                        "last_updated": repo.updated_at if repo.updated_at else None,
                    }
                )
            logger.info(f"Fetched {len(repos_data)} repositories.")
            return repos_data
        except Exception as e:
            logger.error(f"Failed to fetch GitHub repositories: {e}")
            return []  # Return empty list on error
    def get_repository_files(
        self, repo_full_name: str, path: str = ""
    ) -> list[dict[str, Any]]:
        """
-        Recursively fetches details of relevant files (code, docs) within a repository path.
+        Ingest an entire repository and return a digest.
        Args:
            repo_full_name: The full name of the repository (e.g., 'owner/repo').
-            path: The starting path within the repository (default is root).
+            branch: Optional specific branch or tag to ingest.
            include_patterns: Optional list of glob patterns for files to include.
                             If None, includes all files (recommended).
            exclude_patterns: Optional list of glob patterns for files to exclude.
                             If None, uses DEFAULT_EXCLUDE_PATTERNS.
            max_file_size: Maximum file size in bytes to include (default 5MB).
        Returns:
-            A list of dictionaries, each containing file details (path, sha, url, size).
+            RepositoryDigest containing the summary, tree structure, and content,
-            Returns an empty list if the repository or path is not found or on error.
+            or None if ingestion fails.
        """
-        files_list = []
+        repo_url = f"https://github.com/{repo_full_name}"
        # Use only exclude_patterns by default (recommended for comprehensive analysis)
        # This ensures we don't miss any relevant file types
        exclude_pats = exclude_patterns if exclude_patterns is not None else DEFAULT_EXCLUDE_PATTERNS
        logger.info(f"Starting gitingest for repository: {repo_full_name}")
        try:
-            owner, repo_name = repo_full_name.split("/")
+            # Build kwargs dynamically
-            repo = self.gh.repository(owner, repo_name)
+            ingest_kwargs = {
-            if not repo:
+                "max_file_size": max_file_size,
-                logger.warning(f"Repository '{repo_full_name}' not found.")
+                "exclude_patterns": exclude_pats,
-                return []
+                "include_gitignored": False,
-            contents = repo.directory_contents(
+                "include_submodules": False,
-                directory_path=path
+            }
            )  # Use directory_contents for clarity
-            # contents returns a list of tuples (name, content_obj)
+            # Only add token if provided (required only for private repos)
-            for _item_name, content_item in contents:
+            if self.token:
-                if not isinstance(content_item, Contents):
+                ingest_kwargs["token"] = self.token
                    continue
-                if content_item.type == "dir":
+            # Only add branch if specified
-                    # Check if the directory name is in the skipped list
+            if branch:
-                    if content_item.name in self.SKIPPED_DIRS:
+                ingest_kwargs["branch"] = branch
                        logger.debug(f"Skipping directory: {content_item.path}")
                        continue  # Skip recursion for this directory
-                    # Recursively fetch contents of subdirectory
+            # Only add include_patterns if explicitly provided
-                    files_list.extend(
+            if include_patterns is not None:
-                        self.get_repository_files(
+                ingest_kwargs["include_patterns"] = include_patterns
                            repo_full_name, path=content_item.path
                        )
                    )
                elif content_item.type == "file":
                    # Check if the file extension is relevant and size is within limits
                    file_extension = (
                        "." + content_item.name.split(".")[-1].lower()
                        if "." in content_item.name
                        else ""
                    )
                    is_code = file_extension in CODE_EXTENSIONS
                    is_doc = file_extension in DOC_EXTENSIONS
-                    if (is_code or is_doc) and content_item.size <= MAX_FILE_SIZE:
+            summary, tree, content = await ingest_async(repo_url, **ingest_kwargs)
                        files_list.append(
                            {
                                "path": content_item.path,
                                "sha": content_item.sha,
                                "url": content_item.html_url,
                                "size": content_item.size,
                                "type": "code" if is_code else "doc",
                            }
                        )
                    elif content_item.size > MAX_FILE_SIZE:
                        logger.debug(
                            f"Skipping large file: {content_item.path} ({content_item.size} bytes)"
                        )
                    else:
                        logger.debug(
                            f"Skipping irrelevant file type: {content_item.path}"
                        )
-        except (NotFoundError, ForbiddenError) as e:
+            if not content or not content.strip():
-            logger.warning(f"Cannot access path '{path}' in '{repo_full_name}': {e}")
+                logger.warning(
-        except Exception as e:
+                    f"No content retrieved from repository: {repo_full_name}"
-            logger.error(
+                )
-                f"Failed to get files for {repo_full_name} at path '{path}': {e}"
+                return None
            digest = RepositoryDigest(
                repo_full_name=repo_full_name,
                summary=summary,
                tree=tree,
                content=content,
                branch=branch,
            )
            # Return what we have collected so far in case of partial failure
-        return files_list
+            logger.info(
                f"Successfully ingested {repo_full_name}: "
                f"~{digest.estimated_tokens} estimated tokens"
            )
            return digest
-    def get_file_content(self, repo_full_name: str, file_path: str) -> str | None:
+        except Exception as e:
            logger.error(f"Failed to ingest repository {repo_full_name}: {e}")
            return None
    async def ingest_repositories(
        self,
        repo_full_names: list[str],
        branch: str | None = None,
        include_patterns: list[str] | None = None,
        exclude_patterns: list[str] | None = None,
        max_file_size: int = MAX_FILE_SIZE,
    ) -> list[RepositoryDigest]:
        """
-        Fetches the decoded content of a specific file.
+        Ingest multiple repositories and return their digests.
        Args:
-            repo_full_name: The full name of the repository (e.g., 'owner/repo').
+            repo_full_names: List of repository full names (e.g., ['owner/repo1', 'owner/repo2']).
-            file_path: The path to the file within the repository.
+            branch: Optional specific branch or tag to ingest (applied to all repos).
            include_patterns: Optional list of glob patterns for files to include.
            exclude_patterns: Optional list of glob patterns for files to exclude.
            max_file_size: Maximum file size in bytes to include.
        Returns:
-            The decoded file content as a string, or None if fetching fails or file is too large.
+            List of RepositoryDigest objects for successfully ingested repositories.
        """
-        try:
+        digests = []
            owner, repo_name = repo_full_name.split("/")
            repo = self.gh.repository(owner, repo_name)
            if not repo:
                logger.warning(
                    f"Repository '{repo_full_name}' not found when fetching file '{file_path}'."
                )
                return None
-            content_item = repo.file_contents(
+        for repo_full_name in repo_full_names:
-                path=file_path
+            if not repo_full_name or not isinstance(repo_full_name, str):
-            )  # Use file_contents for clarity
+                logger.warning(f"Skipping invalid repository entry: {repo_full_name}")
                continue
-            if (
+            digest = await self.ingest_repository(
-                not content_item
+                repo_full_name=repo_full_name,
-                or not isinstance(content_item, Contents)
+                branch=branch,
-                or content_item.type != "file"
+                include_patterns=include_patterns,
-            ):
+                exclude_patterns=exclude_patterns,
-                logger.warning(
+                max_file_size=max_file_size,
                    f"File '{file_path}' not found or is not a file in '{repo_full_name}'."
                )
                return None
            if content_item.size > MAX_FILE_SIZE:
                logger.warning(
                    f"File '{file_path}' in '{repo_full_name}' exceeds max size ({content_item.size} > {MAX_FILE_SIZE}). Skipping content fetch."
                )
                return None
            # Content is base64 encoded
            if content_item.content:
                try:
                    decoded_content = base64.b64decode(content_item.content).decode(
                        "utf-8"
                    )
                    return decoded_content
                except UnicodeDecodeError:
                    logger.warning(
                        f"Could not decode file '{file_path}' in '{repo_full_name}' as UTF-8. Trying with 'latin-1'."
                    )
                    try:
                        # Try a fallback encoding
                        decoded_content = base64.b64decode(content_item.content).decode(
                            "latin-1"
                        )
                        return decoded_content
                    except Exception as decode_err:
                        logger.error(
                            f"Failed to decode file '{file_path}' with fallback encoding: {decode_err}"
                        )
                        return None  # Give up if fallback fails
            else:
                logger.warning(
                    f"No content returned for file '{file_path}' in '{repo_full_name}'. It might be empty."
                )
                return ""  # Return empty string for empty files
        except (NotFoundError, ForbiddenError) as e:
            logger.warning(
                f"Cannot access file '{file_path}' in '{repo_full_name}': {e}"
            )
-            return None
+
-        except Exception as e:
+            if digest:
-            logger.error(
+                digests.append(digest)
-                f"Failed to get content for file '{file_path}' in '{repo_full_name}': {e}"
+
-            )
+        logger.info(
-            return None
+            f"Ingested {len(digests)} out of {len(repo_full_names)} repositories."
        )
        return digests
--- a/surfsense_backend/app/tasks/connector_indexers/github_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/github_indexer.py
@ -1,5 +1,8 @@
 """
-GitHub connector indexer.
+GitHub connector indexer using gitingest.
 This indexer processes entire repository digests in one pass, dramatically
 reducing LLM API calls compared to the previous file-by-file approach.
 """
 from datetime import UTC, datetime
@ -8,7 +11,7 @@ from sqlalchemy.exc import SQLAlchemyError
 from sqlalchemy.ext.asyncio import AsyncSession
 from app.config import config
-from app.connectors.github_connector import GitHubConnector
+from app.connectors.github_connector import GitHubConnector, RepositoryDigest
 from app.db import Document, DocumentType, SearchSourceConnectorType
 from app.services.llm_service import get_user_long_context_llm
 from app.services.task_logging_service import TaskLoggingService
@ -26,43 +29,55 @@ from .base import (
    logger,
 )
 # Maximum characters for a single digest before splitting
 # Most LLMs can handle 128k+ tokens now, but we'll be conservative
 MAX_DIGEST_CHARS = 500_000  # ~125k tokens
 async def index_github_repos(
    session: AsyncSession,
    connector_id: int,
    search_space_id: int,
    user_id: str,
-    start_date: str | None = None,
+    start_date: str | None = None,  # Ignored - GitHub indexes full repo snapshots
-    end_date: str | None = None,
+    end_date: str | None = None,  # Ignored - GitHub indexes full repo snapshots
    update_last_indexed: bool = True,
 ) -> tuple[int, str | None]:
    """
-    Index code and documentation files from accessible GitHub repositories.
+    Index GitHub repositories using gitingest for efficient processing.
    This function ingests entire repositories as digests, generates a single
    summary per repository, and chunks the content for vector storage.
    Note: The start_date and end_date parameters are accepted for API compatibility
    but are IGNORED. GitHub repositories are indexed as complete snapshots since
    gitingest captures the current state of the entire codebase.
    Args:
        session: Database session
        connector_id: ID of the GitHub connector
        search_space_id: ID of the search space to store documents in
        user_id: ID of the user
-        start_date: Start date for filtering (YYYY-MM-DD format) - Note: GitHub indexing processes all files regardless of dates
+        start_date: Ignored - kept for API compatibility
-        end_date: End date for filtering (YYYY-MM-DD format) - Note: GitHub indexing processes all files regardless of dates
+        end_date: Ignored - kept for API compatibility
        update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
    Returns:
        Tuple containing (number of documents indexed, error message or None)
    """
    # Note: start_date and end_date are intentionally unused
    _ = start_date, end_date
    task_logger = TaskLoggingService(session, search_space_id)
    # Log task start
    log_entry = await task_logger.log_task_start(
        task_name="github_repos_indexing",
        source="connector_indexing_task",
-        message=f"Starting GitHub repositories indexing for connector {connector_id}",
+        message=f"Starting GitHub repositories indexing for connector {connector_id} (using gitingest)",
        metadata={
            "connector_id": connector_id,
            "user_id": str(user_id),
-            "start_date": start_date,
+            "method": "gitingest",
            "end_date": end_date,
        },
    )
@ -93,19 +108,11 @@ async def index_github_repos(
                f"Connector with ID {connector_id} not found or is not a GitHub connector",
            )
-        # 2. Get the GitHub PAT and selected repositories from the connector config
+        # 2. Get the GitHub PAT (optional) and selected repositories from the connector config
-        github_pat = connector.config.get("GITHUB_PAT")
+        # PAT is only required for private repositories - public repos work without it
        github_pat = connector.config.get("GITHUB_PAT")  # Can be None or empty
        repo_full_names_to_index = connector.config.get("repo_full_names")
        if not github_pat:
            await task_logger.log_task_failure(
                log_entry,
                f"GitHub Personal Access Token (PAT) not found in connector config for connector {connector_id}",
                "Missing GitHub PAT",
                {"error_type": "MissingToken"},
            )
            return 0, "GitHub Personal Access Token (PAT) not found in connector config"
        if not repo_full_names_to_index or not isinstance(
            repo_full_names_to_index, list
        ):
@ -117,10 +124,16 @@ async def index_github_repos(
            )
            return 0, "'repo_full_names' not found or is not a list in connector config"
-        # 3. Initialize GitHub connector client
+        # Log whether we're using authentication
        if github_pat:
            logger.info("Using GitHub PAT for authentication (private repos supported)")
        else:
            logger.info("No GitHub PAT provided - only public repositories can be indexed")
        # 3. Initialize GitHub connector with gitingest backend
        await task_logger.log_task_progress(
            log_entry,
-            f"Initializing GitHub client for connector {connector_id}",
+            f"Initializing gitingest-based GitHub client for connector {connector_id}",
            {
                "stage": "client_initialization",
                "repo_count": len(repo_full_names_to_index),
@ -138,258 +151,52 @@ async def index_github_repos(
            )
            return 0, f"Failed to initialize GitHub client: {e!s}"
-        # 4. Validate selected repositories
+        # 4. Process each repository with gitingest
        await task_logger.log_task_progress(
            log_entry,
-            f"Starting indexing for {len(repo_full_names_to_index)} selected repositories",
+            f"Starting gitingest processing for {len(repo_full_names_to_index)} repositories",
            {
                "stage": "repo_processing",
                "repo_count": len(repo_full_names_to_index),
                "start_date": start_date,
                "end_date": end_date,
            },
        )
        logger.info(
-            f"Starting indexing for {len(repo_full_names_to_index)} selected repositories."
+            f"Starting gitingest indexing for {len(repo_full_names_to_index)} repositories."
        )
        if start_date and end_date:
            logger.info(
                f"Date range requested: {start_date} to {end_date} (Note: GitHub indexing processes all files regardless of dates)"
            )
        # 6. Iterate through selected repositories and index files
        for repo_full_name in repo_full_names_to_index:
            if not repo_full_name or not isinstance(repo_full_name, str):
                logger.warning(f"Skipping invalid repository entry: {repo_full_name}")
                continue
-            logger.info(f"Processing repository: {repo_full_name}")
+            logger.info(f"Ingesting repository: {repo_full_name}")
            try:
-                files_to_index = github_client.get_repository_files(repo_full_name)
+                # Ingest the entire repository
-                if not files_to_index:
+                digest = await github_client.ingest_repository(repo_full_name)
-                    logger.info(
+
-                        f"No indexable files found in repository: {repo_full_name}"
+                if not digest:
                    logger.warning(
                        f"No digest returned for repository: {repo_full_name}"
                    )
                    errors.append(f"No digest for {repo_full_name}")
                    continue
-                logger.info(
+                # Process the digest and create documents
-                    f"Found {len(files_to_index)} files to process in {repo_full_name}"
+                docs_created = await _process_repository_digest(
                    session=session,
                    digest=digest,
                    search_space_id=search_space_id,
                    user_id=user_id,
                    task_logger=task_logger,
                    log_entry=log_entry,
                )
-                for file_info in files_to_index:
+                documents_processed += docs_created
-                    file_path = file_info.get("path")
+                logger.info(
-                    file_url = file_info.get("url")
+                    f"Created {docs_created} documents from repository: {repo_full_name}"
-                    file_sha = file_info.get("sha")
+                )
                    file_type = file_info.get("type")  # 'code' or 'doc'
                    full_path_key = f"{repo_full_name}/{file_path}"
                    if not file_path or not file_url or not file_sha:
                        logger.warning(
                            f"Skipping file with missing info in {repo_full_name}: {file_info}"
                        )
                        continue
                    # Get file content
                    file_content = github_client.get_file_content(
                        repo_full_name, file_path
                    )
                    if file_content is None:
                        logger.warning(
                            f"Could not retrieve content for {full_path_key}. Skipping."
                        )
                        continue  # Skip if content fetch failed
                    # Generate unique identifier hash for this GitHub file
                    unique_identifier_hash = generate_unique_identifier_hash(
                        DocumentType.GITHUB_CONNECTOR, file_sha, search_space_id
                    )
                    # Generate content hash
                    content_hash = generate_content_hash(file_content, search_space_id)
                    # Check if document with this unique identifier already exists
                    existing_document = await check_document_by_unique_identifier(
                        session, unique_identifier_hash
                    )
                    if existing_document:
                        # Document exists - check if content has changed
                        if existing_document.content_hash == content_hash:
                            logger.info(
                                f"Document for GitHub file {full_path_key} unchanged. Skipping."
                            )
                            continue
                        else:
                            # Content has changed - update the existing document
                            logger.info(
                                f"Content changed for GitHub file {full_path_key}. Updating document."
                            )
                            # Generate summary with metadata
                            user_llm = await get_user_long_context_llm(
                                session, user_id, search_space_id
                            )
                            if user_llm:
                                file_extension = (
                                    file_path.split(".")[-1]
                                    if "." in file_path
                                    else None
                                )
                                document_metadata = {
                                    "file_path": full_path_key,
                                    "repository": repo_full_name,
                                    "file_type": file_extension or "unknown",
                                    "document_type": "GitHub Repository File",
                                    "connector_type": "GitHub",
                                }
                                (
                                    summary_content,
                                    summary_embedding,
                                ) = await generate_document_summary(
                                    file_content, user_llm, document_metadata
                                )
                            else:
                                summary_content = f"GitHub file: {full_path_key}\n\n{file_content[:1000]}..."
                                summary_embedding = (
                                    config.embedding_model_instance.embed(
                                        summary_content
                                    )
                                )
                            # Chunk the content
                            try:
                                if hasattr(config, "code_chunker_instance"):
                                    chunks_data = [
                                        await create_document_chunks(file_content)
                                    ][0]
                                else:
                                    chunks_data = await create_document_chunks(
                                        file_content
                                    )
                            except Exception as chunk_err:
                                logger.error(
                                    f"Failed to chunk file {full_path_key}: {chunk_err}"
                                )
                                continue
                            # Update existing document
                            existing_document.title = f"GitHub - {full_path_key}"
                            existing_document.content = summary_content
                            existing_document.content_hash = content_hash
                            existing_document.embedding = summary_embedding
                            existing_document.document_metadata = {
                                "file_path": file_path,
                                "file_sha": file_sha,
                                "file_url": file_url,
                                "repository": repo_full_name,
                                "indexed_at": datetime.now(UTC).strftime(
                                    "%Y-%m-%d %H:%M:%S"
                                ),
                            }
                            existing_document.chunks = chunks_data
                            existing_document.updated_at = get_current_timestamp()
                            logger.info(
                                f"Successfully updated GitHub file {full_path_key}"
                            )
                            continue
                    # Document doesn't exist - create new one
                    # Generate summary with metadata
                    user_llm = await get_user_long_context_llm(
                        session, user_id, search_space_id
                    )
                    if user_llm:
                        # Extract file extension from file path
                        file_extension = (
                            file_path.split(".")[-1] if "." in file_path else None
                        )
                        document_metadata = {
                            "file_path": full_path_key,
                            "repository": repo_full_name,
                            "file_type": file_extension or "unknown",
                            "document_type": "GitHub Repository File",
                            "connector_type": "GitHub",
                        }
                        (
                            summary_content,
                            summary_embedding,
                        ) = await generate_document_summary(
                            file_content, user_llm, document_metadata
                        )
                    else:
                        # Fallback to simple summary if no LLM configured
                        summary_content = (
                            f"GitHub file: {full_path_key}\n\n{file_content[:1000]}..."
                        )
                        summary_embedding = config.embedding_model_instance.embed(
                            summary_content
                        )
                    # Chunk the content
                    try:
                        chunks_data = [await create_document_chunks(file_content)][0]
                        # Use code chunker if available, otherwise regular chunker
                        if hasattr(config, "code_chunker_instance"):
                            chunks_data = [
                                {
                                    "content": chunk.text,
                                    "embedding": config.embedding_model_instance.embed(
                                        chunk.text
                                    ),
                                }
                                for chunk in config.code_chunker_instance.chunk(
                                    file_content
                                )
                            ]
                        else:
                            chunks_data = await create_document_chunks(file_content)
                    except Exception as chunk_err:
                        logger.error(
                            f"Failed to chunk file {full_path_key}: {chunk_err}"
                        )
                        errors.append(
                            f"Chunking failed for {full_path_key}: {chunk_err}"
                        )
                        continue  # Skip this file if chunking fails
                    doc_metadata = {
                        "repository_full_name": repo_full_name,
                        "file_path": file_path,
                        "full_path": full_path_key,  # For easier lookup
                        "url": file_url,
                        "sha": file_sha,
                        "type": file_type,
                        "indexed_at": datetime.now(UTC).isoformat(),
                    }
                    # Create new document
                    logger.info(f"Creating new document for file: {full_path_key}")
                    document = Document(
                        title=f"GitHub - {file_path}",
                        document_type=DocumentType.GITHUB_CONNECTOR,
                        document_metadata=doc_metadata,
                        content=summary_content,  # Store summary
                        content_hash=content_hash,
                        unique_identifier_hash=unique_identifier_hash,
                        embedding=summary_embedding,
                        search_space_id=search_space_id,
                        chunks=chunks_data,  # Associate chunks directly
                        updated_at=get_current_timestamp(),
                    )
                    session.add(document)
                    documents_processed += 1
                    # Batch commit every 10 documents
                    if documents_processed % 10 == 0:
                        logger.info(
                            f"Committing batch: {documents_processed} GitHub files processed so far"
                        )
                        await session.commit()
            except Exception as repo_err:
                logger.error(
@ -397,11 +204,11 @@ async def index_github_repos(
                )
                errors.append(f"Failed processing {repo_full_name}: {repo_err}")
-        # Final commit for any remaining documents not yet committed in batches
+        # Final commit
        logger.info(f"Final commit: Total {documents_processed} GitHub files processed")
        await session.commit()
        logger.info(
-            f"Finished GitHub indexing for connector {connector_id}. Processed {documents_processed} files."
+            f"Finished GitHub indexing for connector {connector_id}. "
            f"Created {documents_processed} documents."
        )
        # Log success
@ -412,6 +219,7 @@ async def index_github_repos(
                "documents_processed": documents_processed,
                "errors_count": len(errors),
                "repo_count": len(repo_full_names_to_index),
                "method": "gitingest",
            },
        )
@ -428,6 +236,7 @@ async def index_github_repos(
        )
        errors.append(f"Database error: {db_err}")
        return documents_processed, "; ".join(errors) if errors else str(db_err)
    except Exception as e:
        await session.rollback()
        await task_logger.log_task_failure(
@ -445,3 +254,173 @@ async def index_github_repos(
    error_message = "; ".join(errors) if errors else None
    return documents_processed, error_message
 async def _process_repository_digest(
    session: AsyncSession,
    digest: RepositoryDigest,
    search_space_id: int,
    user_id: str,
    task_logger: TaskLoggingService,
    log_entry,
 ) -> int:
    """
    Store one repository digest as a single searchable document.

    Each repository maps to exactly one document:
    1. The document body is a summary (LLM-generated when a long-context
       LLM is configured for the user, otherwise a simple template).
    2. The document's chunks are built from the raw digest content so that
       search keeps file-level granularity.

    If a document for the repository already exists and its content hash is
    unchanged, nothing is written. If the hash differs, the old document is
    deleted and flushed before the replacement is added. The new document is
    added to the session but not committed here.

    Args:
        session: Database session
        digest: The repository digest from gitingest
        search_space_id: ID of the search space
        user_id: ID of the user
        task_logger: Task logging service (not used inside this function)
        log_entry: Current log entry (not used inside this function)

    Returns:
        Number of documents created: 1 when a document was added, 0 when the
        repository was unchanged and skipped.
    """
    repo_full_name = digest.repo_full_name

    # The content hash detects changes; the unique identifier is derived from
    # the repository name only, so there is at most one document per repo.
    content_hash = generate_content_hash(digest.full_digest, search_space_id)
    unique_identifier_hash = generate_unique_identifier_hash(
        DocumentType.GITHUB_CONNECTOR, repo_full_name, search_space_id
    )

    existing_document = await check_document_by_unique_identifier(
        session, unique_identifier_hash
    )
    if existing_document:
        if existing_document.content_hash == content_hash:
            logger.info(
                f"Repository {repo_full_name} unchanged. Skipping."
            )
            return 0

        logger.info(
            f"Content changed for repository {repo_full_name}. Updating document."
        )
        # Replace rather than mutate: drop the stale document (flushing so the
        # delete is emitted before the new row reuses the same identifier).
        await session.delete(existing_document)
        await session.flush()

    # Generate summary using LLM (ONE call per repository!)
    user_llm = await get_user_long_context_llm(session, user_id, search_space_id)

    if user_llm:
        document_metadata = {
            "repository": repo_full_name,
            "document_type": "GitHub Repository",
            "connector_type": "GitHub",
            "ingestion_method": "gitingest",
            # Slicing already returns the whole string when it is shorter.
            "file_tree": digest.tree[:2000],
            "estimated_tokens": digest.estimated_tokens,
        }

        summary_content = digest.full_digest
        if len(summary_content) > MAX_DIGEST_CHARS:
            # Keep the whole tree and as much content as still fits. The
            # budget is clamped at zero: a negative slice end would count
            # from the END of the string and keep almost all of the content
            # instead of truncating it.
            # NOTE(review): the tree itself is not truncated, so a tree larger
            # than MAX_DIGEST_CHARS still yields an oversized prompt.
            content_budget = max(0, MAX_DIGEST_CHARS - len(digest.tree) - 200)
            summary_content = (
                f"# Repository: {repo_full_name}\n\n"
                f"## File Structure\n\n{digest.tree}\n\n"
                f"## File Contents (truncated)\n\n{digest.content[:content_budget]}..."
            )

        summary_text, summary_embedding = await generate_document_summary(
            summary_content, user_llm, document_metadata
        )
    else:
        # Fallback to simple summary if no LLM configured
        summary_text = (
            f"# GitHub Repository: {repo_full_name}\n\n"
            f"## Summary\n{digest.summary}\n\n"
            f"## File Structure\n{digest.tree[:3000]}"
        )
        summary_embedding = config.embedding_model_instance.embed(summary_text)

    # Chunk the raw content (not the summary) for granular search.
    try:
        chunks_data = await create_document_chunks(digest.content)
    except Exception as chunk_err:
        logger.error(
            f"Failed to chunk repository {repo_full_name}: {chunk_err}"
        )
        # Best effort: fall back to fixed-size chunking instead of failing.
        chunks_data = await _simple_chunk_content(digest.content)

    doc_metadata = {
        "repository_full_name": repo_full_name,
        "url": f"https://github.com/{repo_full_name}",
        "branch": digest.branch,
        "ingestion_method": "gitingest",
        "file_tree": digest.tree,
        "gitingest_summary": digest.summary,
        "estimated_tokens": digest.estimated_tokens,
        "indexed_at": datetime.now(UTC).isoformat(),
    }

    document = Document(
        title=f"GitHub Repository: {repo_full_name}",
        document_type=DocumentType.GITHUB_CONNECTOR,
        document_metadata=doc_metadata,
        content=summary_text,
        content_hash=content_hash,
        unique_identifier_hash=unique_identifier_hash,
        embedding=summary_embedding,
        search_space_id=search_space_id,
        chunks=chunks_data,
        updated_at=get_current_timestamp(),
    )
    session.add(document)

    logger.info(
        f"Created document for repository {repo_full_name} "
        f"with {len(chunks_data)} chunks"
    )
    return 1
 async def _simple_chunk_content(content: str, chunk_size: int = 4000) -> list:
    """
    Split content into fixed-size character windows and embed each one.

    Simple fallback chunking for when the regular chunker fails. Windows that
    contain only whitespace are dropped.

    Args:
        content: The content to chunk
        chunk_size: Size of each chunk in characters

    Returns:
        List of Chunk objects, each carrying its text and embedding
    """
    from app.db import Chunk

    # Consecutive, non-overlapping windows of at most chunk_size characters.
    windows = (
        content[start : start + chunk_size]
        for start in range(0, len(content), chunk_size)
    )
    return [
        Chunk(
            content=window,
            embedding=config.embedding_model_instance.embed(window),
        )
        for window in windows
        if window.strip()
    ]
--- a/surfsense_backend/pyproject.toml
+++ b/surfsense_backend/pyproject.toml
@ -60,6 +60,7 @@ dependencies = [
    "mcp>=1.25.0",
    "starlette>=0.40.0,<0.51.0",
    "sse-starlette>=3.1.1,<3.1.2",
    "gitingest>=0.3.1",
 ]
 [dependency-groups]
--- a/surfsense_backend/uv.lock
+++ b/surfsense_backend/uv.lock
@ -1945,6 +1945,25 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/61/ad/2394d4fb542574678b0ba342daf734d4d811768da3c2ee0c84d509dcb26c/github3.py-4.0.1-py3-none-any.whl", hash = "sha256:a89af7de25650612d1da2f0609622bcdeb07ee8a45a1c06b2d16a05e4234e753", size = 151800, upload-time = "2023-04-26T17:56:25.015Z" },
 ]
 [[package]]
 name = "gitingest"
 version = "0.3.1"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
    { name = "click" },
    { name = "httpx" },
    { name = "loguru" },
    { name = "pathspec" },
    { name = "pydantic" },
    { name = "python-dotenv" },
    { name = "starlette" },
    { name = "tiktoken" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/d6/fe/a915f0c32a3d7920206a677f73c185b3eadf4ec151fb05aedd52e64713f7/gitingest-0.3.1.tar.gz", hash = "sha256:4587cab873d4e08bdb16d612bb153c23e0ce59771a1d57a438239c5e39f05ebf", size = 70681, upload-time = "2025-07-31T13:56:19.845Z" }
 wheels = [
    { url = "https://files.pythonhosted.org/packages/00/15/f200ab2e73287e67d1dce6fbacf421552ae9fbafdc5f0cc8dd0d2fe4fc47/gitingest-0.3.1-py3-none-any.whl", hash = "sha256:8143a5e6a7140ede9f680e13d3931ac07c82ac9bd8bab9ad1fba017c8c1e8666", size = 68343, upload-time = "2025-07-31T13:56:17.729Z" },
 ]
 [[package]]
 name = "google-api-core"
 version = "2.25.1"
@ -4460,6 +4479,15 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/39/c2/646d2e93e0af70f4e5359d870a63584dacbc324b54d73e6b3267920ff117/pandas-2.3.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:bb3be958022198531eb7ec2008cfc78c5b1eed51af8600c6c5d9160d89d8d249", size = 13231847, upload-time = "2025-06-05T03:27:51.465Z" },
 ]
 [[package]]
 name = "pathspec"
 version = "1.0.3"
 source = { registry = "https://pypi.org/simple" }
 sdist = { url = "https://files.pythonhosted.org/packages/4c/b2/bb8e495d5262bfec41ab5cb18f522f1012933347fb5d9e62452d446baca2/pathspec-1.0.3.tar.gz", hash = "sha256:bac5cf97ae2c2876e2d25ebb15078eb04d76e4b98921ee31c6f85ade8b59444d", size = 130841, upload-time = "2026-01-09T15:46:46.009Z" }
 wheels = [
    { url = "https://files.pythonhosted.org/packages/32/2b/121e912bd60eebd623f873fd090de0e84f322972ab25a7f9044c056804ed/pathspec-1.0.3-py3-none-any.whl", hash = "sha256:e80767021c1cc524aa3fb14bedda9c34406591343cc42797b386ce7b9354fb6c", size = 55021, upload-time = "2026-01-09T15:46:44.652Z" },
 ]
 [[package]]
 name = "pdf2image"
 version = "1.17.0"
@ -6484,6 +6512,7 @@ dependencies = [
    { name = "firecrawl-py" },
    { name = "flower" },
    { name = "github3-py" },
    { name = "gitingest" },
    { name = "google-api-python-client" },
    { name = "google-auth-oauthlib" },
    { name = "kokoro" },
@ -6549,6 +6578,7 @@ requires-dist = [
    { name = "firecrawl-py", specifier = ">=4.9.0" },
    { name = "flower", specifier = ">=2.0.1" },
    { name = "github3-py", specifier = "==4.0.1" },
    { name = "gitingest", specifier = ">=0.3.1" },
    { name = "google-api-python-client", specifier = ">=2.156.0" },
    { name = "google-auth-oauthlib", specifier = ">=1.2.1" },
    { name = "kokoro", specifier = ">=0.9.4" },
--- a/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/github-connect-form.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/github-connect-form.tsx
@ -34,7 +34,6 @@ import {
 } from "@/components/ui/select";
 import { Switch } from "@/components/ui/switch";
 import { EnumConnectorName } from "@/contracts/enums/connector";
 import { DateRangeSelector } from "../../components/date-range-selector";
 import { getConnectorBenefits } from "../connector-benefits";
 import type { ConnectFormProps } from "../index";
@ -44,12 +43,13 @@ const githubConnectorFormSchema = z.object({
 	}),
 	github_pat: z
 		.string()
-		.min(20, {
+		.optional()
-			message: "GitHub Personal Access Token seems too short.",
+		.refine(
-		})
+			(pat) => !pat || pat.startsWith("ghp_") || pat.startsWith("github_pat_"),
-		.refine((pat) => pat.startsWith("ghp_") || pat.startsWith("github_pat_"), {
+			{
-			message: "GitHub PAT should start with 'ghp_' or 'github_pat_'",
+				message: "GitHub PAT should start with 'ghp_' or 'github_pat_'",
-		}),
+			}
 		),
 	repo_full_names: z.string().min(1, {
 		message: "At least one repository is required.",
 	}),
@ -59,8 +59,6 @@ type GithubConnectorFormValues = z.infer<typeof githubConnectorFormSchema>;
 export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting }) => {
 	const isSubmittingRef = useRef(false);
 	const [startDate, setStartDate] = useState<Date | undefined>(undefined);
 	const [endDate, setEndDate] = useState<Date | undefined>(undefined);
 	const [periodicEnabled, setPeriodicEnabled] = useState(false);
 	const [frequencyMinutes, setFrequencyMinutes] = useState("1440");
 	const form = useForm<GithubConnectorFormValues>({
@ -94,7 +92,7 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
 				name: values.name,
 				connector_type: EnumConnectorName.GITHUB_CONNECTOR,
 				config: {
-					GITHUB_PAT: values.github_pat,
+					GITHUB_PAT: values.github_pat || null, // Optional - only for private repos
 					repo_full_names: repoList,
 				},
 				is_indexable: true,
@ -102,8 +100,9 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
 				periodic_indexing_enabled: periodicEnabled,
 				indexing_frequency_minutes: periodicEnabled ? parseInt(frequencyMinutes, 10) : null,
 				next_scheduled_at: null,
-				startDate,
+				// GitHub indexes full repo snapshots - no date range needed
-				endDate,
+				startDate: undefined,
 				endDate: undefined,
 				periodicEnabled,
 				frequencyMinutes,
 			});
@ -117,10 +116,10 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
 			<Alert className="bg-slate-400/5 dark:bg-white/5 border-slate-400/20 p-2 sm:p-3 flex items-center [&>svg]:relative [&>svg]:left-0 [&>svg]:top-0 [&>svg+div]:translate-y-0">
 				<Info className="h-3 w-3 sm:h-4 sm:w-4 shrink-0 ml-1" />
 				<div className="-ml-1">
-					<AlertTitle className="text-xs sm:text-sm">Personal Access Token Required</AlertTitle>
+					<AlertTitle className="text-xs sm:text-sm">Personal Access Token (Optional)</AlertTitle>
 					<AlertDescription className="text-[10px] sm:text-xs !pl-0">
-						You'll need a GitHub Personal Access Token to use this connector. You can create one
+						A GitHub PAT is only required for private repositories. Public repos work without a
-						from{" "}
+						token. Create one from{" "}
 						<a
 							href="https://github.com/settings/tokens"
 							target="_blank"
@ -128,7 +127,8 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
 							className="font-medium underline underline-offset-4"
 						>
 							GitHub Settings
-						</a>
+						</a>{" "}
 						if needed.
 					</AlertDescription>
 				</div>
 			</Alert>
@ -167,7 +167,10 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
 							name="github_pat"
 							render={({ field }) => (
 								<FormItem>
-									<FormLabel className="text-xs sm:text-sm">GitHub Personal Access Token</FormLabel>
+									<FormLabel className="text-xs sm:text-sm">
 										GitHub Personal Access Token{" "}
 										<span className="text-muted-foreground font-normal">(optional)</span>
 									</FormLabel>
 									<FormControl>
 										<Input
 											type="password"
@ -178,8 +181,8 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
 										/>
 									</FormControl>
 									<FormDescription className="text-[10px] sm:text-xs">
-										Your GitHub PAT will be encrypted and stored securely. It typically starts with
+										Only required for private repositories. Leave empty if indexing public repos
-										"ghp_" or "github_pat_".
+										only.
 									</FormDescription>
 									<FormMessage />
 								</FormItem>
@ -225,15 +228,9 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
 						{/* Indexing Configuration */}
 						<div className="space-y-4 pt-4 border-t border-slate-400/20">
-							<h3 className="text-sm sm:text-base font-medium">Indexing Configuration</h3>
+							<h3 className="text-sm sm:text-base font-medium">Sync Configuration</h3>
-							{/* Date Range Selector */}
+							{/* Note: No date range for GitHub - it indexes full repo snapshots */}
 							<DateRangeSelector
 								startDate={startDate}
 								endDate={endDate}
 								onStartDateChange={setStartDate}
 								onEndDateChange={setEndDate}
 							/>
 							{/* Periodic Sync Config */}
 							<div className="rounded-xl bg-slate-400/5 dark:bg-white/5 p-3 sm:p-6">
--- a/surfsense_web/components/new-chat/source-detail-panel.tsx
+++ b/surfsense_web/components/new-chat/source-detail-panel.tsx
@ -490,8 +490,8 @@ export function SourceDetailPanel({
 														>
 															{idx + 1}
 															{isCited && (
-																<span className="absolute -top-1 -right-1 w-3 h-3 bg-primary rounded-full border-2 border-background">
+																<span className="absolute -top-1.5 -right-1.5 flex items-center justify-center w-4 h-4 bg-primary rounded-full border-2 border-background shadow-sm">
-																	<Sparkles className="h-2 w-2 text-primary-foreground absolute top-0.5 left-0.5" />
+																	<Sparkles className="h-2.5 w-2.5 text-primary-foreground" />
 																</span>
 															)}
 														</motion.button>