diff --git a/surfsense_backend/app/connectors/github_connector.py b/surfsense_backend/app/connectors/github_connector.py index 90fd93fb9..6f04ccdba 100644 --- a/surfsense_backend/app/connectors/github_connector.py +++ b/surfsense_backend/app/connectors/github_connector.py @@ -1,130 +1,21 @@ """ -GitHub connector using gitingest for efficient repository digestion. +GitHub connector using gitingest CLI for efficient repository digestion. -This connector replaces the previous file-by-file approach with a single -digest generation per repository, dramatically reducing LLM API calls. +This connector uses subprocess to call gitingest CLI, completely isolating +it from any Python event loop/async complexity that can cause hangs in Celery. """ import logging +import os +import subprocess +import tempfile from dataclasses import dataclass -from gitingest import ingest_async - logger = logging.getLogger(__name__) # Maximum file size in bytes (5MB) MAX_FILE_SIZE = 5 * 1024 * 1024 -# Default patterns to exclude (recommended approach for comprehensive analysis) -# Using only exclude_patterns ensures we don't miss any relevant file types -DEFAULT_EXCLUDE_PATTERNS = [ - # Dependencies - "node_modules/*", - "vendor/*", - "bower_components/*", - ".pnpm/*", - # Build artifacts / Caches - "build/*", - "dist/*", - "target/*", - "out/*", - "__pycache__/*", - "*.pyc", - ".cache/*", - ".next/*", - ".nuxt/*", - # Virtual environments - "venv/*", - ".venv/*", - "env/*", - ".env/*", - # IDE/Editor config - ".vscode/*", - ".idea/*", - ".project", - ".settings/*", - "*.swp", - "*.swo", - # Version control - ".git/*", - ".svn/*", - ".hg/*", - # Temporary / Logs - "tmp/*", - "temp/*", - "logs/*", - "*.log", - # Lock files (usually not needed for understanding code) - "package-lock.json", - "pnpm-lock.yaml", - "yarn.lock", - "uv.lock", - "Gemfile.lock", - "poetry.lock", - "Cargo.lock", - "composer.lock", - # Binary/media files - "*.png", - "*.jpg", - "*.jpeg", - "*.gif", - "*.ico", - "*.svg", - 
"*.webp", - "*.bmp", - "*.tiff", - "*.woff", - "*.woff2", - "*.ttf", - "*.eot", - "*.otf", - "*.mp3", - "*.mp4", - "*.wav", - "*.ogg", - "*.webm", - "*.avi", - "*.mov", - "*.pdf", - "*.doc", - "*.docx", - "*.xls", - "*.xlsx", - "*.ppt", - "*.pptx", - "*.zip", - "*.tar", - "*.tar.gz", - "*.tgz", - "*.rar", - "*.7z", - "*.exe", - "*.dll", - "*.so", - "*.dylib", - "*.bin", - "*.obj", - "*.o", - "*.a", - "*.lib", - # Minified files - "*.min.js", - "*.min.css", - # Source maps - "*.map", - # Database files - "*.db", - "*.sqlite", - "*.sqlite3", - # Coverage reports - "coverage/*", - ".coverage", - "htmlcov/*", - ".nyc_output/*", - # Test snapshots (can be large) - "__snapshots__/*", -] - @dataclass class RepositoryDigest: @@ -149,21 +40,19 @@ class RepositoryDigest: class GitHubConnector: """ - Connector for ingesting GitHub repositories using gitingest. + Connector for ingesting GitHub repositories using gitingest CLI. - This connector efficiently processes entire repositories into a single - digest, reducing the number of API calls and LLM invocations compared - to file-by-file processing. + Uses subprocess to run gitingest, which avoids all async/event loop + issues that can occur when mixing gitingest with Celery workers. """ def __init__(self, token: str | None = None): """ - Initializes the GitHub connector. + Initialize the GitHub connector. Args: token: Optional GitHub Personal Access Token (PAT). Only required for private repositories. - Public repositories can be ingested without a token. 
""" self.token = token if token and token.strip() else None if self.token: @@ -171,72 +60,104 @@ class GitHubConnector: else: logger.info("GitHub connector initialized without token (public repos only).") - async def ingest_repository( + def ingest_repository( self, repo_full_name: str, branch: str | None = None, - include_patterns: list[str] | None = None, - exclude_patterns: list[str] | None = None, max_file_size: int = MAX_FILE_SIZE, ) -> RepositoryDigest | None: """ - Ingest an entire repository and return a digest. + Ingest a repository using gitingest CLI via subprocess. + + This approach completely isolates gitingest from Python's event loop, + avoiding any async/Celery conflicts. Args: repo_full_name: The full name of the repository (e.g., 'owner/repo'). branch: Optional specific branch or tag to ingest. - include_patterns: Optional list of glob patterns for files to include. - If None, includes all files (recommended). - exclude_patterns: Optional list of glob patterns for files to exclude. - If None, uses DEFAULT_EXCLUDE_PATTERNS. - max_file_size: Maximum file size in bytes to include (default 5MB). + max_file_size: Maximum file size in bytes to include. Returns: - RepositoryDigest containing the summary, tree structure, and content, - or None if ingestion fails. + RepositoryDigest or None if ingestion fails. 
""" repo_url = f"https://github.com/{repo_full_name}" - # Use only exclude_patterns by default (recommended for comprehensive analysis) - # This ensures we don't miss any relevant file types - exclude_pats = exclude_patterns if exclude_patterns is not None else DEFAULT_EXCLUDE_PATTERNS - - logger.info(f"Starting gitingest for repository: {repo_full_name}") + logger.info(f"Starting gitingest CLI for repository: {repo_full_name}") try: - # Build kwargs dynamically - ingest_kwargs = { - "max_file_size": max_file_size, - "exclude_patterns": exclude_pats, - "include_gitignored": False, - "include_submodules": False, - } + # Create a temporary file for output + with tempfile.NamedTemporaryFile( + mode="w", suffix=".txt", delete=False + ) as tmp_file: + output_path = tmp_file.name - # Only add token if provided (required only for private repos) - if self.token: - ingest_kwargs["token"] = self.token + # Build the gitingest CLI command + cmd = [ + "gitingest", + repo_url, + "--output", output_path, + "--max-size", str(max_file_size), + # Common exclude patterns + "-e", "node_modules/*", + "-e", "vendor/*", + "-e", ".git/*", + "-e", "__pycache__/*", + "-e", "dist/*", + "-e", "build/*", + "-e", "*.lock", + "-e", "package-lock.json", + ] - # Only add branch if specified + # Add branch if specified if branch: - ingest_kwargs["branch"] = branch + cmd.extend(["--branch", branch]) - # Only add include_patterns if explicitly provided - if include_patterns is not None: - ingest_kwargs["include_patterns"] = include_patterns + # Set up environment with token if provided + env = os.environ.copy() + if self.token: + env["GITHUB_TOKEN"] = self.token - summary, tree, content = await ingest_async(repo_url, **ingest_kwargs) + logger.info(f"Running gitingest CLI: {' '.join(cmd[:5])}...") - if not content or not content.strip(): - logger.warning( - f"No content retrieved from repository: {repo_full_name}" - ) + # Run gitingest as subprocess with timeout + result = subprocess.run( + cmd, + 
env=env, + capture_output=True, + text=True, + timeout=900,  # 15 minute timeout + ) + + if result.returncode != 0: + logger.error(f"gitingest failed: {result.stderr}") + # Clean up temp file + if os.path.exists(output_path): + os.unlink(output_path) return None + # Read the output file + if not os.path.exists(output_path): + logger.error("gitingest did not create output file") + return None + + with open(output_path, encoding="utf-8") as f: + full_content = f.read() + + # Clean up temp file + os.unlink(output_path) + + if not full_content or not full_content.strip(): + logger.warning(f"No content retrieved from repository: {repo_full_name}") + return None + + # Parse the gitingest output + # The output format is: summary + tree + content + # We'll extract what we can digest = RepositoryDigest( repo_full_name=repo_full_name, - summary=summary, - tree=tree, - content=content, + summary=f"Repository: {repo_full_name}", + tree="",  # gitingest CLI combines everything into one file + content=full_content, branch=branch, ) @@ -246,50 +167,70 @@ class GitHubConnector: ) return digest + except subprocess.TimeoutExpired: + logger.error(f"gitingest timed out for repository: {repo_full_name}") + return None + except FileNotFoundError: + logger.error( + "gitingest CLI not found. Falling back to Python library." + ) + # Fall back to Python library + return self._ingest_with_python_library(repo_full_name, branch, max_file_size) except Exception as e: logger.error(f"Failed to ingest repository {repo_full_name}: {e}") return None - async def ingest_repositories( + def _ingest_with_python_library( self, - repo_full_names: list[str], + repo_full_name: str, branch: str | None = None, - include_patterns: list[str] | None = None, - exclude_patterns: list[str] | None = None, max_file_size: int = MAX_FILE_SIZE, - ) -> list[RepositoryDigest]: + ) -> RepositoryDigest | None: """ - Ingest multiple repositories and return their digests. 
- - Args: - repo_full_names: List of repository full names (e.g., ['owner/repo1', 'owner/repo2']). - branch: Optional specific branch or tag to ingest (applied to all repos). - include_patterns: Optional list of glob patterns for files to include. - exclude_patterns: Optional list of glob patterns for files to exclude. - max_file_size: Maximum file size in bytes to include. - - Returns: - List of RepositoryDigest objects for successfully ingested repositories. + Fallback: Ingest using the Python library directly. """ - digests = [] + from gitingest import ingest - for repo_full_name in repo_full_names: - if not repo_full_name or not isinstance(repo_full_name, str): - logger.warning(f"Skipping invalid repository entry: {repo_full_name}") - continue + repo_url = f"https://github.com/{repo_full_name}" - digest = await self.ingest_repository( + logger.info(f"Using Python gitingest library for: {repo_full_name}") + + try: + kwargs = { + "max_file_size": max_file_size, + "exclude_patterns": [ + "node_modules/*", + "vendor/*", + ".git/*", + "__pycache__/*", + "dist/*", + "build/*", + "*.lock", + "package-lock.json", + ], + "include_gitignored": False, + "include_submodules": False, + } + + if self.token: + kwargs["token"] = self.token + if branch: + kwargs["branch"] = branch + + summary, tree, content = ingest(repo_url, **kwargs) + + if not content or not content.strip(): + logger.warning(f"No content from {repo_full_name}") + return None + + return RepositoryDigest( repo_full_name=repo_full_name, + summary=summary, + tree=tree, + content=content, branch=branch, - include_patterns=include_patterns, - exclude_patterns=exclude_patterns, - max_file_size=max_file_size, ) - if digest: - digests.append(digest) - - logger.info( - f"Ingested {len(digests)} out of {len(repo_full_names)} repositories." 
- ) - return digests + except Exception as e: + logger.error(f"Python library failed for {repo_full_name}: {e}") + return None diff --git a/surfsense_backend/app/tasks/connector_indexers/github_indexer.py b/surfsense_backend/app/tasks/connector_indexers/github_indexer.py index f1ccabdef..f16ee0156 100644 --- a/surfsense_backend/app/tasks/connector_indexers/github_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/github_indexer.py @@ -173,8 +173,13 @@ async def index_github_repos( logger.info(f"Ingesting repository: {repo_full_name}") try: - # Ingest the entire repository - digest = await github_client.ingest_repository(repo_full_name) + # Run gitingest via subprocess (isolated from event loop) + # Using to_thread to not block the async database operations + import asyncio + + digest = await asyncio.to_thread( + github_client.ingest_repository, repo_full_name + ) if not digest: logger.warning( diff --git a/surfsense_backend/app/utils/validators.py b/surfsense_backend/app/utils/validators.py index 54e681518..6a87679ec 100644 --- a/surfsense_backend/app/utils/validators.py +++ b/surfsense_backend/app/utils/validators.py @@ -530,7 +530,10 @@ def validate_connector_config( # "validators": {}, # }, "GITHUB_CONNECTOR": { - "required": ["GITHUB_PAT", "repo_full_names"], + # GITHUB_PAT is optional - only required for private repositories + # Public repositories can be indexed without authentication + "required": ["repo_full_names"], + "optional": ["GITHUB_PAT"], # Optional - only needed for private repos "validators": { "repo_full_names": lambda: validate_list_field( "repo_full_names", "repo_full_names" diff --git a/surfsense_web/components/assistant-ui/connector-popup/config/connector-status-config.json b/surfsense_web/components/assistant-ui/connector-popup/config/connector-status-config.json index 6ed792b8e..b729c3f8b 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/config/connector-status-config.json +++ 
b/surfsense_web/components/assistant-ui/connector-popup/config/connector-status-config.json @@ -24,11 +24,6 @@ "enabled": true, "status": "warning", "statusMessage": "Some requests may be blocked if not using Firecrawl." - }, - "GITHUB_CONNECTOR": { - "enabled": false, - "status": "maintenance", - "statusMessage": "Rework in progress." } }, "globalSettings": { diff --git a/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/github-connect-form.tsx b/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/github-connect-form.tsx index 6ed36e180..72d5811d3 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/github-connect-form.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/github-connect-form.tsx @@ -96,6 +96,7 @@ export const GithubConnectForm: FC = ({ onSubmit, isSubmitting repo_full_names: repoList, }, is_indexable: true, + is_active: true, last_indexed_at: null, periodic_indexing_enabled: periodicEnabled, indexing_frequency_minutes: periodicEnabled ? parseInt(frequencyMinutes, 10) : null, @@ -119,16 +120,16 @@ export const GithubConnectForm: FC = ({ onSubmit, isSubmitting Personal Access Token (Optional) A GitHub PAT is only required for private repositories. Public repos work without a - token. Create one from{" "} + token. {" "} - GitHub Settings + Get your token {" "} - if needed. + . @@ -324,20 +325,21 @@ export const GithubConnectForm: FC = ({ onSubmit, isSubmitting

How it works

- The GitHub connector uses a Personal Access Token (PAT) to authenticate with the - GitHub API. You provide a comma-separated list of repository full names (e.g., - "owner/repo1, owner/repo2") that you want to index. The connector indexes relevant - files (code, markdown, text) from the selected repositories. + The GitHub connector ingests entire repositories in one pass using gitingest, + making it highly efficient. Provide a comma-separated list of repository full + names (e.g., "owner/repo1, owner/repo2") to index.

  • - The connector indexes files based on common code and documentation extensions. + Public repos: No authentication required.
  • -
  • Large files (over 1MB) are skipped during indexing.
  • -
  • Only specified repositories are indexed.
  • - Indexing runs periodically (check connector settings for frequency) to keep - content up-to-date. + Private repos: Requires a GitHub Personal Access Token (PAT). +
  • +
  • Indexes code, documentation, and configuration files.
  • +
  • Large files (over 5MB) and binary files are automatically skipped.
  • +
  • + Periodic sync detects changes and only re-indexes when content has changed.
@@ -348,19 +350,23 @@ export const GithubConnectForm: FC = ({ onSubmit, isSubmitting - Personal Access Token Required + Personal Access Token (Optional) - You'll need a GitHub PAT with the appropriate scopes (e.g., 'repo') to fetch - repositories. The PAT will be stored securely to enable indexing. + A GitHub PAT is only needed for private repositories. Public + repos can be indexed without authentication. If you need to access private + repos, create a PAT with the 'repo' scope.

- Step 1: Generate GitHub PAT + For Private Repositories Only: Generate GitHub PAT

+

+ Skip this step if you're only indexing public repositories. +

  1. Go to your GitHub{" "} @@ -375,46 +381,36 @@ export const GithubConnectForm: FC = ({ onSubmit, isSubmitting
  2. Click on Personal access tokens, then choose{" "} - Tokens (classic) or Fine-grained tokens{" "} - (recommended if available). + Tokens (classic) or Fine-grained tokens.
  3. - Click Generate new token (and choose the appropriate type). + Click Generate new token.
  4. Give your token a descriptive name (e.g., "SurfSense Connector").
  5. -
  6. Set an expiration date for the token (recommended for security).
  7. - Under Select scopes (for classic tokens) or{" "} - Repository access (for fine-grained), grant the necessary - permissions. At minimum, the `repo` scope (or equivalent - read access to repositories for fine-grained tokens) is required to read - repository content. + Grant the `repo` scope (for classic tokens) or read access + to the specific repositories you want to index (for fine-grained tokens).
  8. - Click Generate token. -
  9. -
  10. - Important: Copy your new PAT immediately. You won't be able - to see it again after leaving the page. + Click Generate token and copy it immediately.

- Step 2: Specify repositories + Specify Repositories

Enter a comma-separated list of repository full names in the format - "owner/repo1, owner/repo2". The connector will index files from only the - specified repositories. + "owner/repo1, owner/repo2". For example: "facebook/react, vercel/next.js".

- Repository Access + Public vs Private - Make sure your PAT has access to all repositories you want to index. Private - repositories require appropriate permissions. + Public repositories work without a PAT. For private repositories, ensure + your PAT has access to the repos you want to index.
@@ -424,36 +420,38 @@ export const GithubConnectForm: FC = ({ onSubmit, isSubmitting
-

Indexing

+

Quick Start

  1. - Navigate to the Connector Dashboard and select the GitHub{" "} - Connector. + Enter the Repository Names you want to index (e.g., + "facebook/react, vercel/next.js").
  2. - Enter your GitHub Personal Access Token in the form field. + (Optional) Add a GitHub PAT if indexing private repositories.
  3. - Enter a comma-separated list of Repository Names (e.g., - "owner/repo1, owner/repo2"). + Click Connect GitHub to start indexing.
  4. - Click Connect to establish the connection. + Enable Periodic Sync to automatically detect and index + changes.
  5. -
  6. Once connected, your GitHub repositories will be indexed automatically.
What Gets Indexed -

The GitHub connector indexes the following data:

+

The GitHub connector indexes:

    -
  • Code files from selected repositories
  • -
  • README files and Markdown documentation
  • -
  • Common text-based file formats
  • -
  • Repository metadata and structure
  • +
  • All code files (Python, JavaScript, TypeScript, etc.)
  • +
  • Documentation (README, Markdown, text files)
  • +
  • Configuration files (JSON, YAML, TOML, etc.)
  • +
  • Repository structure and file tree
+

+ Binary files, images, and build artifacts are automatically excluded. +

diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/github-config.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/github-config.tsx index 07c7bdfbc..d5169b49d 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/github-config.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/github-config.tsx @@ -1,8 +1,9 @@ "use client"; -import { KeyRound } from "lucide-react"; +import { Info, KeyRound } from "lucide-react"; import type { FC } from "react"; import { useEffect, useState } from "react"; +import { Alert, AlertDescription, AlertTitle } from "@/components/ui/alert"; import { Badge } from "@/components/ui/badge"; import { Input } from "@/components/ui/input"; import { Label } from "@/components/ui/label"; @@ -79,6 +80,26 @@ export const GithubConfig: FC = ({ return (
+ + +
+ Personal Access Token (Optional) + + A GitHub PAT is only required for private repositories. Public repos work without a + token. Create one from{" "} + + GitHub Settings + {" "} + if needed. + +
+
+ {/* Connector Name */}
@@ -105,7 +126,7 @@ export const GithubConfig: FC = ({
= ({ {/* Date range selector and periodic sync - only shown for indexable connectors */} {connector.is_indexable && ( <> - {/* Date range selector - not shown for Google Drive (uses folder selection) or Webcrawler (uses config) */} + {/* Date range selector - not shown for Google Drive, Webcrawler, or GitHub (indexes full repo snapshots) */} {connector.connector_type !== "GOOGLE_DRIVE_CONNECTOR" && - connector.connector_type !== "WEBCRAWLER_CONNECTOR" && ( + connector.connector_type !== "WEBCRAWLER_CONNECTOR" && + connector.connector_type !== "GITHUB_CONNECTOR" && ( = ({ {/* Date range selector and periodic sync - only shown for indexable connectors */} {connector?.is_indexable && ( <> - {/* Date range selector - not shown for Google Drive (uses folder selection) or Webcrawler (uses config) */} + {/* Date range selector - not shown for Google Drive, Webcrawler, or GitHub (indexes full repo snapshots) */} {config.connectorType !== "GOOGLE_DRIVE_CONNECTOR" && - config.connectorType !== "WEBCRAWLER_CONNECTOR" && ( + config.connectorType !== "WEBCRAWLER_CONNECTOR" && + config.connectorType !== "GITHUB_CONNECTOR" && (