SurfSense/surfsense_backend/app/connectors/github_connector.py
Anish Sarkar 35888144eb refactor: Update GitHub connector to use gitingest CLI
- Refactored GitHubConnector to utilize gitingest CLI via subprocess, improving performance and avoiding async issues with Celery.
- Updated ingestion method to handle repository digests more efficiently, including error handling for subprocess execution.
- Adjusted GitHub indexer to call the new synchronous ingestion method.
- Clarified documentation regarding the optional nature of the Personal Access Token for public repositories.
2026-01-20 23:24:33 +05:30

236 lines
7.6 KiB
Python

"""
GitHub connector using gitingest CLI for efficient repository digestion.
This connector uses subprocess to call gitingest CLI, completely isolating
it from any Python event loop/async complexity that can cause hangs in Celery.
"""
import logging
import os
import subprocess
import tempfile
from dataclasses import dataclass
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
# Default per-file size cap, in bytes (5 * 1024 * 1024 = 5 MiB). Used as the
# default for the ``max_file_size`` parameter of the ingestion methods below.
MAX_FILE_SIZE = 5 * 1024 * 1024
@dataclass
class RepositoryDigest:
"""Represents a digested repository from gitingest."""
repo_full_name: str
summary: str
tree: str
content: str
branch: str | None = None
@property
def full_digest(self) -> str:
"""Returns the complete digest with tree and content."""
return f"# Repository: {self.repo_full_name}\n\n## File Structure\n\n{self.tree}\n\n## File Contents\n\n{self.content}"
@property
def estimated_tokens(self) -> int:
"""Rough estimate of tokens (1 token ≈ 4 characters)."""
return len(self.full_digest) // 4
class GitHubConnector:
    """
    Connector for ingesting GitHub repositories using gitingest CLI.

    Uses subprocess to run gitingest, which avoids all async/event loop
    issues that can occur when mixing gitingest with Celery workers. If the
    CLI executable is not installed, the gitingest Python library is used as
    a fallback.
    """

    # Patterns excluded from every digest. Kept in one place so the CLI path
    # and the Python-library fallback cannot drift apart.
    _EXCLUDE_PATTERNS = (
        "node_modules/*",
        "vendor/*",
        ".git/*",
        "__pycache__/*",
        "dist/*",
        "build/*",
        "*.lock",
        "package-lock.json",
    )

    # Upper bound on a single gitingest CLI run, in seconds (15 minutes).
    _CLI_TIMEOUT_SECONDS = 900

    def __init__(self, token: str | None = None):
        """
        Initialize the GitHub connector.

        Args:
            token: Optional GitHub Personal Access Token (PAT).
                Only required for private repositories. Empty or
                whitespace-only values are treated as "no token".
        """
        # Store the stripped value: stray whitespace/newlines from config
        # or env files would otherwise be sent as part of the credential.
        cleaned = token.strip() if token else ""
        self.token = cleaned or None
        if self.token:
            logger.info("GitHub connector initialized with authentication token.")
        else:
            logger.info("GitHub connector initialized without token (public repos only).")

    def ingest_repository(
        self,
        repo_full_name: str,
        branch: str | None = None,
        max_file_size: int = MAX_FILE_SIZE,
    ) -> RepositoryDigest | None:
        """
        Ingest a repository using gitingest CLI via subprocess.

        This approach completely isolates gitingest from Python's event loop,
        avoiding any async/Celery conflicts. The CLI writes its digest to a
        temporary file, which is always removed before this method returns.

        Args:
            repo_full_name: The full name of the repository (e.g., 'owner/repo').
            branch: Optional specific branch or tag to ingest.
            max_file_size: Maximum file size in bytes to include.

        Returns:
            RepositoryDigest or None if ingestion fails (non-zero exit code,
            timeout, missing/empty output, or any unexpected error).
        """
        repo_url = f"https://github.com/{repo_full_name}"
        logger.info(f"Starting gitingest CLI for repository: {repo_full_name}")

        output_path: str | None = None
        try:
            # Only the path is needed: the file is closed immediately and
            # gitingest writes to it. delete=False keeps it on disk until the
            # cleanup in the finally clause below.
            with tempfile.NamedTemporaryFile(
                mode="w", suffix=".txt", delete=False
            ) as tmp_file:
                output_path = tmp_file.name

            cmd = self._build_cli_command(
                repo_url, output_path, branch, max_file_size
            )

            # The token travels through the environment rather than argv so
            # it is not visible in process listings or in the log line below.
            env = os.environ.copy()
            if self.token:
                env["GITHUB_TOKEN"] = self.token

            logger.info(f"Running gitingest CLI: {' '.join(cmd[:5])}...")

            try:
                result = subprocess.run(
                    cmd,
                    env=env,
                    capture_output=True,
                    text=True,
                    timeout=self._CLI_TIMEOUT_SECONDS,
                )
            except FileNotFoundError:
                # Raised by subprocess when the executable is missing. The
                # handler is scoped to this call only, so unrelated file
                # errors are not misreported as "CLI not found".
                logger.error(
                    "gitingest CLI not found. Falling back to Python library."
                )
                return self._ingest_with_python_library(
                    repo_full_name, branch, max_file_size
                )

            if result.returncode != 0:
                logger.error(f"gitingest failed: {result.stderr}")
                return None

            if not os.path.exists(output_path):
                logger.error("gitingest did not create output file")
                return None

            with open(output_path, encoding="utf-8") as f:
                full_content = f.read()

            if not full_content or not full_content.strip():
                logger.warning(f"No content retrieved from repository: {repo_full_name}")
                return None

            # The CLI writes summary, tree and file contents into a single
            # file, so everything is stored in `content` and `tree` is empty.
            digest = RepositoryDigest(
                repo_full_name=repo_full_name,
                summary=f"Repository: {repo_full_name}",
                tree="",
                content=full_content,
                branch=branch,
            )
            logger.info(
                f"Successfully ingested {repo_full_name}: "
                f"~{digest.estimated_tokens} estimated tokens"
            )
            return digest
        except subprocess.TimeoutExpired:
            logger.error(f"gitingest timed out for repository: {repo_full_name}")
            return None
        except Exception as e:
            logger.error(f"Failed to ingest repository {repo_full_name}: {e}")
            return None
        finally:
            # Runs on every exit path (success, failure, timeout, fallback)
            # so the temporary output file is never left behind.
            if output_path is not None:
                try:
                    os.unlink(output_path)
                except OSError:
                    # Already removed or not removable; nothing more to do.
                    pass

    def _build_cli_command(
        self,
        repo_url: str,
        output_path: str,
        branch: str | None,
        max_file_size: int,
    ) -> list[str]:
        """Build the gitingest CLI argument list (passed without a shell)."""
        cmd = [
            "gitingest",
            repo_url,
            "--output", output_path,
            "--max-size", str(max_file_size),
        ]
        for pattern in self._EXCLUDE_PATTERNS:
            cmd.extend(["-e", pattern])
        if branch:
            cmd.extend(["--branch", branch])
        return cmd

    def _ingest_with_python_library(
        self,
        repo_full_name: str,
        branch: str | None = None,
        max_file_size: int = MAX_FILE_SIZE,
    ) -> RepositoryDigest | None:
        """
        Fallback: Ingest using the Python library directly.

        Args:
            repo_full_name: The full name of the repository (e.g., 'owner/repo').
            branch: Optional specific branch or tag to ingest.
            max_file_size: Maximum file size in bytes to include.

        Returns:
            RepositoryDigest or None if the library is unavailable, raises,
            or yields no content.
        """
        repo_url = f"https://github.com/{repo_full_name}"
        logger.info(f"Using Python gitingest library for: {repo_full_name}")
        try:
            # Imported lazily, and inside the try, so that a missing package
            # results in a logged failure and None instead of an ImportError
            # escaping to the caller.
            from gitingest import ingest

            kwargs = {
                "max_file_size": max_file_size,
                # NOTE(review): ingest() is understood to document
                # exclude_patterns as `str | set[str]`, so a set is passed
                # rather than a list - confirm against the pinned version.
                "exclude_patterns": set(self._EXCLUDE_PATTERNS),
                "include_gitignored": False,
                "include_submodules": False,
            }
            if self.token:
                kwargs["token"] = self.token
            if branch:
                kwargs["branch"] = branch

            summary, tree, content = ingest(repo_url, **kwargs)

            if not content or not content.strip():
                logger.warning(f"No content from {repo_full_name}")
                return None

            return RepositoryDigest(
                repo_full_name=repo_full_name,
                summary=summary,
                tree=tree,
                content=content,
                branch=branch,
            )
        except Exception as e:
            logger.error(f"Python library failed for {repo_full_name}: {e}")
            return None