""" GitHub connector using gitingest CLI for efficient repository digestion. This connector uses subprocess to call gitingest CLI, completely isolating it from any Python event loop/async complexity that can cause hangs in Celery. """ import logging import os import subprocess import tempfile from dataclasses import dataclass logger = logging.getLogger(__name__) # Maximum file size in bytes (5MB) MAX_FILE_SIZE = 5 * 1024 * 1024 @dataclass class RepositoryDigest: """Represents a digested repository from gitingest.""" repo_full_name: str summary: str tree: str content: str branch: str | None = None @property def full_digest(self) -> str: """Returns the complete digest with tree and content.""" return f"# Repository: {self.repo_full_name}\n\n## File Structure\n\n{self.tree}\n\n## File Contents\n\n{self.content}" @property def estimated_tokens(self) -> int: """Rough estimate of tokens (1 token ≈ 4 characters).""" return len(self.full_digest) // 4 class GitHubConnector: """ Connector for ingesting GitHub repositories using gitingest CLI. Uses subprocess to run gitingest, which avoids all async/event loop issues that can occur when mixing gitingest with Celery workers. """ def __init__(self, token: str | None = None): """ Initialize the GitHub connector. Args: token: Optional GitHub Personal Access Token (PAT). Only required for private repositories. """ self.token = token if token and token.strip() else None if self.token: logger.info("GitHub connector initialized with authentication token.") else: logger.info( "GitHub connector initialized without token (public repos only)." ) def ingest_repository( self, repo_full_name: str, branch: str | None = None, max_file_size: int = MAX_FILE_SIZE, ) -> RepositoryDigest | None: """ Ingest a repository using gitingest CLI via subprocess. This approach completely isolates gitingest from Python's event loop, avoiding any async/Celery conflicts. Args: repo_full_name: The full name of the repository (e.g., 'owner/repo'). branch: Optional specific branch or tag to ingest. max_file_size: Maximum file size in bytes to include. Returns: RepositoryDigest or None if ingestion fails. """ repo_url = f"https://github.com/{repo_full_name}" logger.info(f"Starting gitingest CLI for repository: {repo_full_name}") try: # Create a temporary file for output with tempfile.NamedTemporaryFile( mode="w", suffix=".txt", delete=False ) as tmp_file: output_path = tmp_file.name # Build the gitingest CLI command cmd = [ "gitingest", repo_url, "--output", output_path, "--max-size", str(max_file_size), # Common exclude patterns "-e", "node_modules/*", "-e", "vendor/*", "-e", ".git/*", "-e", "__pycache__/*", "-e", "dist/*", "-e", "build/*", "-e", "*.lock", "-e", "package-lock.json", ] # Add branch if specified if branch: cmd.extend(["--branch", branch]) # Set up environment with token if provided env = os.environ.copy() if self.token: env["GITHUB_TOKEN"] = self.token logger.info(f"Running gitingest CLI: {' '.join(cmd[:5])}...") # Run gitingest as subprocess with timeout result = subprocess.run( cmd, env=env, capture_output=True, text=True, timeout=900, # 5 minute timeout ) if result.returncode != 0: logger.error(f"gitingest failed: {result.stderr}") # Clean up temp file if os.path.exists(output_path): os.unlink(output_path) return None # Read the output file if not os.path.exists(output_path): logger.error("gitingest did not create output file") return None with open(output_path, encoding="utf-8") as f: full_content = f.read() # Clean up temp file os.unlink(output_path) if not full_content or not full_content.strip(): logger.warning( f"No content retrieved from repository: {repo_full_name}" ) return None # Parse the gitingest output # The output format is: summary + tree + content # We'll extract what we can digest = RepositoryDigest( repo_full_name=repo_full_name, summary=f"Repository: {repo_full_name}", tree="", # gitingest CLI combines everything into one file content=full_content, branch=branch, ) logger.info( f"Successfully ingested {repo_full_name}: " f"~{digest.estimated_tokens} estimated tokens" ) return digest except subprocess.TimeoutExpired: logger.error(f"gitingest timed out for repository: {repo_full_name}") return None except FileNotFoundError: logger.error("gitingest CLI not found. Falling back to Python library.") # Fall back to Python library return self._ingest_with_python_library( repo_full_name, branch, max_file_size ) except Exception as e: logger.error(f"Failed to ingest repository {repo_full_name}: {e}") return None def _ingest_with_python_library( self, repo_full_name: str, branch: str | None = None, max_file_size: int = MAX_FILE_SIZE, ) -> RepositoryDigest | None: """ Fallback: Ingest using the Python library directly. """ from gitingest import ingest repo_url = f"https://github.com/{repo_full_name}" logger.info(f"Using Python gitingest library for: {repo_full_name}") try: kwargs = { "max_file_size": max_file_size, "exclude_patterns": [ "node_modules/*", "vendor/*", ".git/*", "__pycache__/*", "dist/*", "build/*", "*.lock", "package-lock.json", ], "include_gitignored": False, "include_submodules": False, } if self.token: kwargs["token"] = self.token if branch: kwargs["branch"] = branch summary, tree, content = ingest(repo_url, **kwargs) if not content or not content.strip(): logger.warning(f"No content from {repo_full_name}") return None return RepositoryDigest( repo_full_name=repo_full_name, summary=summary, tree=tree, content=content, branch=branch, ) except Exception as e: logger.error(f"Python library failed for {repo_full_name}: {e}") return None