refactor: Update GitHub connector to use gitingest CLI

- Refactored GitHubConnector to utilize gitingest CLI via subprocess, improving performance and avoiding async issues with Celery.
- Updated ingestion method to handle repository digests more efficiently, including error handling for subprocess execution.
- Adjusted GitHub indexer to call the new synchronous ingestion method.
- Clarified documentation regarding the optional nature of the Personal Access Token for public repositories.
This commit is contained in:
Anish Sarkar 2026-01-20 23:24:33 +05:30
parent 49b8a46d10
commit 35888144eb
8 changed files with 221 additions and 256 deletions

View file

@ -1,130 +1,21 @@
"""
GitHub connector using gitingest for efficient repository digestion.
GitHub connector using gitingest CLI for efficient repository digestion.
This connector replaces the previous file-by-file approach with a single
digest generation per repository, dramatically reducing LLM API calls.
This connector uses subprocess to call gitingest CLI, completely isolating
it from any Python event loop/async complexity that can cause hangs in Celery.
"""
import logging
import os
import subprocess
import tempfile
from dataclasses import dataclass
from gitingest import ingest_async
logger = logging.getLogger(__name__)
# Maximum file size in bytes (5MB)
MAX_FILE_SIZE = 5 * 1024 * 1024
# Default patterns to exclude (recommended approach for comprehensive analysis)
# Using only exclude_patterns ensures we don't miss any relevant file types
DEFAULT_EXCLUDE_PATTERNS = [
# Dependencies
"node_modules/*",
"vendor/*",
"bower_components/*",
".pnpm/*",
# Build artifacts / Caches
"build/*",
"dist/*",
"target/*",
"out/*",
"__pycache__/*",
"*.pyc",
".cache/*",
".next/*",
".nuxt/*",
# Virtual environments
"venv/*",
".venv/*",
"env/*",
".env/*",
# IDE/Editor config
".vscode/*",
".idea/*",
".project",
".settings/*",
"*.swp",
"*.swo",
# Version control
".git/*",
".svn/*",
".hg/*",
# Temporary / Logs
"tmp/*",
"temp/*",
"logs/*",
"*.log",
# Lock files (usually not needed for understanding code)
"package-lock.json",
"pnpm-lock.yaml",
"yarn.lock",
"uv.lock",
"Gemfile.lock",
"poetry.lock",
"Cargo.lock",
"composer.lock",
# Binary/media files
"*.png",
"*.jpg",
"*.jpeg",
"*.gif",
"*.ico",
"*.svg",
"*.webp",
"*.bmp",
"*.tiff",
"*.woff",
"*.woff2",
"*.ttf",
"*.eot",
"*.otf",
"*.mp3",
"*.mp4",
"*.wav",
"*.ogg",
"*.webm",
"*.avi",
"*.mov",
"*.pdf",
"*.doc",
"*.docx",
"*.xls",
"*.xlsx",
"*.ppt",
"*.pptx",
"*.zip",
"*.tar",
"*.tar.gz",
"*.tgz",
"*.rar",
"*.7z",
"*.exe",
"*.dll",
"*.so",
"*.dylib",
"*.bin",
"*.obj",
"*.o",
"*.a",
"*.lib",
# Minified files
"*.min.js",
"*.min.css",
# Source maps
"*.map",
# Database files
"*.db",
"*.sqlite",
"*.sqlite3",
# Coverage reports
"coverage/*",
".coverage",
"htmlcov/*",
".nyc_output/*",
# Test snapshots (can be large)
"__snapshots__/*",
]
@dataclass
class RepositoryDigest:
@ -149,21 +40,19 @@ class RepositoryDigest:
class GitHubConnector:
"""
Connector for ingesting GitHub repositories using gitingest.
Connector for ingesting GitHub repositories using gitingest CLI.
This connector efficiently processes entire repositories into a single
digest, reducing the number of API calls and LLM invocations compared
to file-by-file processing.
Uses subprocess to run gitingest, which avoids all async/event loop
issues that can occur when mixing gitingest with Celery workers.
"""
def __init__(self, token: str | None = None):
"""
Initializes the GitHub connector.
Initialize the GitHub connector.
Args:
token: Optional GitHub Personal Access Token (PAT).
Only required for private repositories.
Public repositories can be ingested without a token.
"""
self.token = token if token and token.strip() else None
if self.token:
@ -171,72 +60,104 @@ class GitHubConnector:
else:
logger.info("GitHub connector initialized without token (public repos only).")
async def ingest_repository(
def ingest_repository(
self,
repo_full_name: str,
branch: str | None = None,
include_patterns: list[str] | None = None,
exclude_patterns: list[str] | None = None,
max_file_size: int = MAX_FILE_SIZE,
) -> RepositoryDigest | None:
"""
Ingest an entire repository and return a digest.
Ingest a repository using gitingest CLI via subprocess.
This approach completely isolates gitingest from Python's event loop,
avoiding any async/Celery conflicts.
Args:
repo_full_name: The full name of the repository (e.g., 'owner/repo').
branch: Optional specific branch or tag to ingest.
include_patterns: Optional list of glob patterns for files to include.
If None, includes all files (recommended).
exclude_patterns: Optional list of glob patterns for files to exclude.
If None, uses DEFAULT_EXCLUDE_PATTERNS.
max_file_size: Maximum file size in bytes to include (default 5MB).
max_file_size: Maximum file size in bytes to include.
Returns:
RepositoryDigest containing the summary, tree structure, and content,
or None if ingestion fails.
RepositoryDigest or None if ingestion fails.
"""
repo_url = f"https://github.com/{repo_full_name}"
# Use only exclude_patterns by default (recommended for comprehensive analysis)
# This ensures we don't miss any relevant file types
exclude_pats = exclude_patterns if exclude_patterns is not None else DEFAULT_EXCLUDE_PATTERNS
logger.info(f"Starting gitingest for repository: {repo_full_name}")
logger.info(f"Starting gitingest CLI for repository: {repo_full_name}")
try:
# Build kwargs dynamically
ingest_kwargs = {
"max_file_size": max_file_size,
"exclude_patterns": exclude_pats,
"include_gitignored": False,
"include_submodules": False,
}
# Create a temporary file for output
with tempfile.NamedTemporaryFile(
mode="w", suffix=".txt", delete=False
) as tmp_file:
output_path = tmp_file.name
# Only add token if provided (required only for private repos)
if self.token:
ingest_kwargs["token"] = self.token
# Build the gitingest CLI command
cmd = [
"gitingest",
repo_url,
"--output", output_path,
"--max-size", str(max_file_size),
# Common exclude patterns
"-e", "node_modules/*",
"-e", "vendor/*",
"-e", ".git/*",
"-e", "__pycache__/*",
"-e", "dist/*",
"-e", "build/*",
"-e", "*.lock",
"-e", "package-lock.json",
]
# Only add branch if specified
# Add branch if specified
if branch:
ingest_kwargs["branch"] = branch
cmd.extend(["--branch", branch])
# Only add include_patterns if explicitly provided
if include_patterns is not None:
ingest_kwargs["include_patterns"] = include_patterns
# Set up environment with token if provided
env = os.environ.copy()
if self.token:
env["GITHUB_TOKEN"] = self.token
summary, tree, content = await ingest_async(repo_url, **ingest_kwargs)
logger.info(f"Running gitingest CLI: {' '.join(cmd[:5])}...")
if not content or not content.strip():
logger.warning(
f"No content retrieved from repository: {repo_full_name}"
)
# Run gitingest as subprocess with timeout
result = subprocess.run(
cmd,
env=env,
capture_output=True,
text=True,
timeout=900, # 15 minute timeout (900 seconds)
)
if result.returncode != 0:
logger.error(f"gitingest failed: {result.stderr}")
# Clean up temp file
if os.path.exists(output_path):
os.unlink(output_path)
return None
# Read the output file
if not os.path.exists(output_path):
logger.error("gitingest did not create output file")
return None
with open(output_path, encoding="utf-8") as f:
full_content = f.read()
# Clean up temp file
os.unlink(output_path)
if not full_content or not full_content.strip():
logger.warning(f"No content retrieved from repository: {repo_full_name}")
return None
# Parse the gitingest output
# The output format is: summary + tree + content
# We'll extract what we can
digest = RepositoryDigest(
repo_full_name=repo_full_name,
summary=summary,
tree=tree,
content=content,
summary=f"Repository: {repo_full_name}",
tree="", # gitingest CLI combines everything into one file
content=full_content,
branch=branch,
)
@ -246,50 +167,70 @@ class GitHubConnector:
)
return digest
except subprocess.TimeoutExpired:
logger.error(f"gitingest timed out for repository: {repo_full_name}")
return None
except FileNotFoundError:
logger.error(
"gitingest CLI not found. Falling back to Python library."
)
# Fall back to Python library
return self._ingest_with_python_library(repo_full_name, branch, max_file_size)
except Exception as e:
logger.error(f"Failed to ingest repository {repo_full_name}: {e}")
return None
async def ingest_repositories(
def _ingest_with_python_library(
self,
repo_full_names: list[str],
repo_full_name: str,
branch: str | None = None,
include_patterns: list[str] | None = None,
exclude_patterns: list[str] | None = None,
max_file_size: int = MAX_FILE_SIZE,
) -> list[RepositoryDigest]:
) -> RepositoryDigest | None:
"""
Ingest multiple repositories and return their digests.
Args:
repo_full_names: List of repository full names (e.g., ['owner/repo1', 'owner/repo2']).
branch: Optional specific branch or tag to ingest (applied to all repos).
include_patterns: Optional list of glob patterns for files to include.
exclude_patterns: Optional list of glob patterns for files to exclude.
max_file_size: Maximum file size in bytes to include.
Returns:
List of RepositoryDigest objects for successfully ingested repositories.
Fallback: Ingest using the Python library directly.
"""
digests = []
from gitingest import ingest
for repo_full_name in repo_full_names:
if not repo_full_name or not isinstance(repo_full_name, str):
logger.warning(f"Skipping invalid repository entry: {repo_full_name}")
continue
repo_url = f"https://github.com/{repo_full_name}"
digest = await self.ingest_repository(
logger.info(f"Using Python gitingest library for: {repo_full_name}")
try:
kwargs = {
"max_file_size": max_file_size,
"exclude_patterns": [
"node_modules/*",
"vendor/*",
".git/*",
"__pycache__/*",
"dist/*",
"build/*",
"*.lock",
"package-lock.json",
],
"include_gitignored": False,
"include_submodules": False,
}
if self.token:
kwargs["token"] = self.token
if branch:
kwargs["branch"] = branch
summary, tree, content = ingest(repo_url, **kwargs)
if not content or not content.strip():
logger.warning(f"No content from {repo_full_name}")
return None
return RepositoryDigest(
repo_full_name=repo_full_name,
summary=summary,
tree=tree,
content=content,
branch=branch,
include_patterns=include_patterns,
exclude_patterns=exclude_patterns,
max_file_size=max_file_size,
)
if digest:
digests.append(digest)
logger.info(
f"Ingested {len(digests)} out of {len(repo_full_names)} repositories."
)
return digests
except Exception as e:
logger.error(f"Python library failed for {repo_full_name}: {e}")
return None

View file

@ -173,8 +173,13 @@ async def index_github_repos(
logger.info(f"Ingesting repository: {repo_full_name}")
try:
# Ingest the entire repository
digest = await github_client.ingest_repository(repo_full_name)
# Run gitingest via subprocess (isolated from event loop)
# Using to_thread to not block the async database operations
import asyncio
digest = await asyncio.to_thread(
github_client.ingest_repository, repo_full_name
)
if not digest:
logger.warning(

View file

@ -530,7 +530,10 @@ def validate_connector_config(
# "validators": {},
# },
"GITHUB_CONNECTOR": {
"required": ["GITHUB_PAT", "repo_full_names"],
# GITHUB_PAT is optional - only required for private repositories
# Public repositories can be indexed without authentication
"required": ["repo_full_names"],
"optional": ["GITHUB_PAT"], # Optional - only needed for private repos
"validators": {
"repo_full_names": lambda: validate_list_field(
"repo_full_names", "repo_full_names"

View file

@ -24,11 +24,6 @@
"enabled": true,
"status": "warning",
"statusMessage": "Some requests may be blocked if not using Firecrawl."
},
"GITHUB_CONNECTOR": {
"enabled": false,
"status": "maintenance",
"statusMessage": "Rework in progress."
}
},
"globalSettings": {

View file

@ -96,6 +96,7 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
repo_full_names: repoList,
},
is_indexable: true,
is_active: true,
last_indexed_at: null,
periodic_indexing_enabled: periodicEnabled,
indexing_frequency_minutes: periodicEnabled ? parseInt(frequencyMinutes, 10) : null,
@ -119,16 +120,16 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
<AlertTitle className="text-xs sm:text-sm">Personal Access Token (Optional)</AlertTitle>
<AlertDescription className="text-[10px] sm:text-xs !pl-0">
A GitHub PAT is only required for private repositories. Public repos work without a
token. Create one from{" "}
token. {" "}
<a
href="https://github.com/settings/tokens"
href="https://github.com/settings/tokens/new?description=surfsense&scopes=repo"
target="_blank"
rel="noopener noreferrer"
className="font-medium underline underline-offset-4"
>
GitHub Settings
Get your token
</a>{" "}
if needed.
.
</AlertDescription>
</div>
</Alert>
@ -324,20 +325,21 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
<div>
<h3 className="text-sm sm:text-base font-semibold mb-2">How it works</h3>
<p className="text-[10px] sm:text-xs text-muted-foreground">
The GitHub connector uses a Personal Access Token (PAT) to authenticate with the
GitHub API. You provide a comma-separated list of repository full names (e.g.,
"owner/repo1, owner/repo2") that you want to index. The connector indexes relevant
files (code, markdown, text) from the selected repositories.
The GitHub connector ingests entire repositories in one pass using gitingest,
making it highly efficient. Provide a comma-separated list of repository full
names (e.g., "owner/repo1, owner/repo2") to index.
</p>
<ul className="mt-2 list-disc pl-5 text-[10px] sm:text-xs text-muted-foreground space-y-1">
<li>
The connector indexes files based on common code and documentation extensions.
<strong>Public repos:</strong> No authentication required.
</li>
<li>Large files (over 1MB) are skipped during indexing.</li>
<li>Only specified repositories are indexed.</li>
<li>
Indexing runs periodically (check connector settings for frequency) to keep
content up-to-date.
<strong>Private repos:</strong> Requires a GitHub Personal Access Token (PAT).
</li>
<li>Indexes code, documentation, and configuration files.</li>
<li>Large files (over 5MB) and binary files are automatically skipped.</li>
<li>
Periodic sync detects changes and only re-indexes when content has changed.
</li>
</ul>
</div>
@ -348,19 +350,23 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
<Alert className="bg-slate-400/5 dark:bg-white/5 border-slate-400/20 mb-4">
<Info className="h-3 w-3 sm:h-4 sm:w-4" />
<AlertTitle className="text-[10px] sm:text-xs">
Personal Access Token Required
Personal Access Token (Optional)
</AlertTitle>
<AlertDescription className="text-[9px] sm:text-[10px]">
You'll need a GitHub PAT with the appropriate scopes (e.g., 'repo') to fetch
repositories. The PAT will be stored securely to enable indexing.
A GitHub PAT is only needed for <strong>private repositories</strong>. Public
repos can be indexed without authentication. If you need to access private
repos, create a PAT with the 'repo' scope.
</AlertDescription>
</Alert>
<div className="space-y-4 sm:space-y-6">
<div>
<h4 className="text-[10px] sm:text-xs font-medium mb-2">
Step 1: Generate GitHub PAT
For Private Repositories Only: Generate GitHub PAT
</h4>
<p className="text-[10px] sm:text-xs text-muted-foreground mb-2">
Skip this step if you're only indexing public repositories.
</p>
<ol className="list-decimal pl-5 space-y-2 text-[10px] sm:text-xs text-muted-foreground">
<li>
Go to your GitHub{" "}
@ -375,46 +381,36 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
</li>
<li>
Click on <strong>Personal access tokens</strong>, then choose{" "}
<strong>Tokens (classic)</strong> or <strong>Fine-grained tokens</strong>{" "}
(recommended if available).
<strong>Tokens (classic)</strong> or <strong>Fine-grained tokens</strong>.
</li>
<li>
Click <strong>Generate new token</strong> (and choose the appropriate type).
Click <strong>Generate new token</strong>.
</li>
<li>Give your token a descriptive name (e.g., "SurfSense Connector").</li>
<li>Set an expiration date for the token (recommended for security).</li>
<li>
Under <strong>Select scopes</strong> (for classic tokens) or{" "}
<strong>Repository access</strong> (for fine-grained), grant the necessary
permissions. At minimum, the <strong>`repo`</strong> scope (or equivalent
read access to repositories for fine-grained tokens) is required to read
repository content.
Grant the <strong>`repo`</strong> scope (for classic tokens) or read access
to the specific repositories you want to index (for fine-grained tokens).
</li>
<li>
Click <strong>Generate token</strong>.
</li>
<li>
<strong>Important:</strong> Copy your new PAT immediately. You won't be able
to see it again after leaving the page.
Click <strong>Generate token</strong> and copy it immediately.
</li>
</ol>
</div>
<div>
<h4 className="text-[10px] sm:text-xs font-medium mb-2">
Step 2: Specify repositories
Specify Repositories
</h4>
<p className="text-[10px] sm:text-xs text-muted-foreground mb-3">
Enter a comma-separated list of repository full names in the format
"owner/repo1, owner/repo2". The connector will index files from only the
specified repositories.
"owner/repo1, owner/repo2". For example: "facebook/react, vercel/next.js".
</p>
<Alert className="bg-slate-400/5 dark:bg-white/5 border-slate-400/20">
<Info className="h-3 w-3 sm:h-4 sm:w-4" />
<AlertTitle className="text-[10px] sm:text-xs">Repository Access</AlertTitle>
<AlertTitle className="text-[10px] sm:text-xs">Public vs Private</AlertTitle>
<AlertDescription className="text-[9px] sm:text-[10px]">
Make sure your PAT has access to all repositories you want to index. Private
repositories require appropriate permissions.
Public repositories work without a PAT. For private repositories, ensure
your PAT has access to the repos you want to index.
</AlertDescription>
</Alert>
</div>
@ -424,36 +420,38 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
<div className="space-y-4">
<div>
<h3 className="text-sm sm:text-base font-semibold mb-2">Indexing</h3>
<h3 className="text-sm sm:text-base font-semibold mb-2">Quick Start</h3>
<ol className="list-decimal pl-5 space-y-2 text-[10px] sm:text-xs text-muted-foreground mb-4">
<li>
Navigate to the Connector Dashboard and select the <strong>GitHub</strong>{" "}
Connector.
Enter the <strong>Repository Names</strong> you want to index (e.g.,
"facebook/react, vercel/next.js").
</li>
<li>
Enter your <strong>GitHub Personal Access Token</strong> in the form field.
<strong>(Optional)</strong> Add a GitHub PAT if indexing private repositories.
</li>
<li>
Enter a comma-separated list of <strong>Repository Names</strong> (e.g.,
"owner/repo1, owner/repo2").
Click <strong>Connect GitHub</strong> to start indexing.
</li>
<li>
Click <strong>Connect</strong> to establish the connection.
Enable <strong>Periodic Sync</strong> to automatically detect and index
changes.
</li>
<li>Once connected, your GitHub repositories will be indexed automatically.</li>
</ol>
<Alert className="bg-slate-400/5 dark:bg-white/5 border-slate-400/20">
<Info className="h-3 w-3 sm:h-4 sm:w-4" />
<AlertTitle className="text-[10px] sm:text-xs">What Gets Indexed</AlertTitle>
<AlertDescription className="text-[9px] sm:text-[10px]">
<p className="mb-2">The GitHub connector indexes the following data:</p>
<p className="mb-2">The GitHub connector indexes:</p>
<ul className="list-disc pl-5 space-y-1">
<li>Code files from selected repositories</li>
<li>README files and Markdown documentation</li>
<li>Common text-based file formats</li>
<li>Repository metadata and structure</li>
<li>All code files (Python, JavaScript, TypeScript, etc.)</li>
<li>Documentation (README, Markdown, text files)</li>
<li>Configuration files (JSON, YAML, TOML, etc.)</li>
<li>Repository structure and file tree</li>
</ul>
<p className="mt-2">
Binary files, images, and build artifacts are automatically excluded.
</p>
</AlertDescription>
</Alert>
</div>

View file

@ -1,8 +1,9 @@
"use client";
import { KeyRound } from "lucide-react";
import { Info, KeyRound } from "lucide-react";
import type { FC } from "react";
import { useEffect, useState } from "react";
import { Alert, AlertDescription, AlertTitle } from "@/components/ui/alert";
import { Badge } from "@/components/ui/badge";
import { Input } from "@/components/ui/input";
import { Label } from "@/components/ui/label";
@ -79,6 +80,26 @@ export const GithubConfig: FC<GithubConfigProps> = ({
return (
<div className="space-y-6">
<Alert className="bg-slate-400/5 dark:bg-white/5 border-slate-400/20 p-2 sm:p-3 flex items-center [&>svg]:relative [&>svg]:left-0 [&>svg]:top-0 [&>svg+div]:translate-y-0">
<Info className="h-3 w-3 sm:h-4 sm:w-4 shrink-0 ml-1" />
<div className="-ml-1">
<AlertTitle className="text-xs sm:text-sm">Personal Access Token (Optional)</AlertTitle>
<AlertDescription className="text-[10px] sm:text-xs !pl-0">
A GitHub PAT is only required for private repositories. Public repos work without a
token. Create one from{" "}
<a
href="https://github.com/settings/tokens/new?description=surfsense&scopes=repo"
target="_blank"
rel="noopener noreferrer"
className="font-medium underline underline-offset-4"
>
GitHub Settings
</a>{" "}
if needed.
</AlertDescription>
</div>
</Alert>
{/* Connector Name */}
<div className="rounded-xl border border-border bg-slate-400/5 dark:bg-white/5 p-3 sm:p-6 space-y-3 sm:space-y-4">
<div className="space-y-2">
@ -105,7 +126,7 @@ export const GithubConfig: FC<GithubConfigProps> = ({
<div className="space-y-2">
<Label className="flex items-center gap-2 text-xs sm:text-sm">
<KeyRound className="h-4 w-4" />
GitHub Personal Access Token
GitHub Personal Access Token (optional)
</Label>
<Input
type="password"

View file

@ -206,9 +206,10 @@ export const ConnectorEditView: FC<ConnectorEditViewProps> = ({
{/* Date range selector and periodic sync - only shown for indexable connectors */}
{connector.is_indexable && (
<>
{/* Date range selector - not shown for Google Drive (uses folder selection) or Webcrawler (uses config) */}
{/* Date range selector - not shown for Google Drive, Webcrawler, or GitHub (indexes full repo snapshots) */}
{connector.connector_type !== "GOOGLE_DRIVE_CONNECTOR" &&
connector.connector_type !== "WEBCRAWLER_CONNECTOR" && (
connector.connector_type !== "WEBCRAWLER_CONNECTOR" &&
connector.connector_type !== "GITHUB_CONNECTOR" && (
<DateRangeSelector
startDate={startDate}
endDate={endDate}

View file

@ -151,9 +151,10 @@ export const IndexingConfigurationView: FC<IndexingConfigurationViewProps> = ({
{/* Date range selector and periodic sync - only shown for indexable connectors */}
{connector?.is_indexable && (
<>
{/* Date range selector - not shown for Google Drive (uses folder selection) or Webcrawler (uses config) */}
{/* Date range selector - not shown for Google Drive, Webcrawler, or GitHub (indexes full repo snapshots) */}
{config.connectorType !== "GOOGLE_DRIVE_CONNECTOR" &&
config.connectorType !== "WEBCRAWLER_CONNECTOR" && (
config.connectorType !== "WEBCRAWLER_CONNECTOR" &&
config.connectorType !== "GITHUB_CONNECTOR" && (
<DateRangeSelector
startDate={startDate}
endDate={endDate}