feat: Integrate gitingest for GitHub repository ingestion

- Added gitingest as a dependency to streamline ingestion of GitHub repositories.
- Refactored GitHubConnector to use gitingest: each repository is ingested as a single digest instead of file by file, cutting GitHub API calls and LLM invocations.
- Updated the GitHub indexer to process whole-repository digests, creating one summary document (plus content chunks) per repository and simplifying indexing.
- Updated the GitHub connect form: the Personal Access Token is now optional (needed only for private repositories) and the date-range selector was removed, since GitHub indexing captures full repository snapshots.
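
For reference, a minimal usage sketch of the new gitingest-based connector (the import path matches the indexer diff below; the repository name, the asyncio wrapper, and the print calls are illustrative assumptions rather than part of this commit):

import asyncio

from app.connectors.github_connector import GitHubConnector


async def demo() -> None:
    # No token is needed for public repositories; pass a PAT for private ones.
    connector = GitHubConnector()

    # One gitingest call per repository returns a RepositoryDigest
    # (summary, file tree, concatenated file contents) or None on failure.
    digest = await connector.ingest_repository("octocat/Hello-World")  # illustrative repo
    if digest is not None:
        print(digest.summary)
        print(f"~{digest.estimated_tokens} estimated tokens")


asyncio.run(demo())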
Anish Sarkar 2026-01-20 21:52:32 +05:30
parent 6e331c3b85
commit 49b8a46d10
6 changed files with 545 additions and 539 deletions

View file

@ -1,296 +1,295 @@
import base64
import logging
from typing import Any
"""
GitHub connector using gitingest for efficient repository digestion.
from github3 import exceptions as github_exceptions, login as github_login
from github3.exceptions import ForbiddenError, NotFoundError
from github3.repos.contents import Contents
This connector replaces the previous file-by-file approach with a single
digest generation per repository, dramatically reducing LLM API calls.
"""
import logging
from dataclasses import dataclass
from gitingest import ingest_async
logger = logging.getLogger(__name__)
# List of common code file extensions to target
CODE_EXTENSIONS = {
".py",
".js",
".jsx",
".ts",
".tsx",
".java",
".c",
".cpp",
".h",
".hpp",
".cs",
".go",
".rb",
".php",
".swift",
".kt",
".scala",
".rs",
".m",
".sh",
".bash",
".ps1",
".lua",
".pl",
".pm",
".r",
".dart",
".sql",
}
# Maximum file size in bytes (5MB)
MAX_FILE_SIZE = 5 * 1024 * 1024
# List of common documentation/text file extensions
DOC_EXTENSIONS = {
".md",
".txt",
".rst",
".adoc",
".html",
".htm",
".xml",
".json",
".yaml",
".yml",
".toml",
}
# Default patterns to exclude (recommended approach for comprehensive analysis)
# Using only exclude_patterns ensures we don't miss any relevant file types
DEFAULT_EXCLUDE_PATTERNS = [
# Dependencies
"node_modules/*",
"vendor/*",
"bower_components/*",
".pnpm/*",
# Build artifacts / Caches
"build/*",
"dist/*",
"target/*",
"out/*",
"__pycache__/*",
"*.pyc",
".cache/*",
".next/*",
".nuxt/*",
# Virtual environments
"venv/*",
".venv/*",
"env/*",
".env/*",
# IDE/Editor config
".vscode/*",
".idea/*",
".project",
".settings/*",
"*.swp",
"*.swo",
# Version control
".git/*",
".svn/*",
".hg/*",
# Temporary / Logs
"tmp/*",
"temp/*",
"logs/*",
"*.log",
# Lock files (usually not needed for understanding code)
"package-lock.json",
"pnpm-lock.yaml",
"yarn.lock",
"uv.lock",
"Gemfile.lock",
"poetry.lock",
"Cargo.lock",
"composer.lock",
# Binary/media files
"*.png",
"*.jpg",
"*.jpeg",
"*.gif",
"*.ico",
"*.svg",
"*.webp",
"*.bmp",
"*.tiff",
"*.woff",
"*.woff2",
"*.ttf",
"*.eot",
"*.otf",
"*.mp3",
"*.mp4",
"*.wav",
"*.ogg",
"*.webm",
"*.avi",
"*.mov",
"*.pdf",
"*.doc",
"*.docx",
"*.xls",
"*.xlsx",
"*.ppt",
"*.pptx",
"*.zip",
"*.tar",
"*.tar.gz",
"*.tgz",
"*.rar",
"*.7z",
"*.exe",
"*.dll",
"*.so",
"*.dylib",
"*.bin",
"*.obj",
"*.o",
"*.a",
"*.lib",
# Minified files
"*.min.js",
"*.min.css",
# Source maps
"*.map",
# Database files
"*.db",
"*.sqlite",
"*.sqlite3",
# Coverage reports
"coverage/*",
".coverage",
"htmlcov/*",
".nyc_output/*",
# Test snapshots (can be large)
"__snapshots__/*",
]
# Maximum file size in bytes (e.g., 1MB)
MAX_FILE_SIZE = 1 * 1024 * 1024
@dataclass
class RepositoryDigest:
"""Represents a digested repository from gitingest."""
repo_full_name: str
summary: str
tree: str
content: str
branch: str | None = None
@property
def full_digest(self) -> str:
"""Returns the complete digest with tree and content."""
return f"# Repository: {self.repo_full_name}\n\n## File Structure\n\n{self.tree}\n\n## File Contents\n\n{self.content}"
@property
def estimated_tokens(self) -> int:
"""Rough estimate of tokens (1 token ≈ 4 characters)."""
return len(self.full_digest) // 4
class GitHubConnector:
"""Connector for interacting with the GitHub API."""
"""
Connector for ingesting GitHub repositories using gitingest.
# Directories to skip during file traversal
SKIPPED_DIRS = {
# Version control
".git",
# Dependencies
"node_modules",
"vendor",
# Build artifacts / Caches
"build",
"dist",
"target",
"__pycache__",
# Virtual environments
"venv",
".venv",
"env",
# IDE/Editor config
".vscode",
".idea",
".project",
".settings",
# Temporary / Logs
"tmp",
"logs",
# Add other project-specific irrelevant directories if needed
}
This connector efficiently processes entire repositories into a single
digest, reducing the number of API calls and LLM invocations compared
to file-by-file processing.
"""
def __init__(self, token: str):
def __init__(self, token: str | None = None):
"""
Initializes the GitHub connector.
Args:
token: GitHub Personal Access Token (PAT).
token: Optional GitHub Personal Access Token (PAT).
Only required for private repositories.
Public repositories can be ingested without a token.
"""
if not token:
raise ValueError("GitHub token cannot be empty.")
try:
self.gh = github_login(token=token)
# Try a simple authenticated call to check token validity
self.gh.me()
logger.info("Successfully authenticated with GitHub API.")
except (github_exceptions.AuthenticationFailed, ForbiddenError) as e:
logger.error(f"GitHub authentication failed: {e}")
raise ValueError("Invalid GitHub token or insufficient permissions.") from e
except Exception as e:
logger.error(f"Failed to initialize GitHub client: {e}")
raise e
self.token = token if token and token.strip() else None
if self.token:
logger.info("GitHub connector initialized with authentication token.")
else:
logger.info("GitHub connector initialized without token (public repos only).")
def get_user_repositories(self) -> list[dict[str, Any]]:
"""Fetches repositories accessible by the authenticated user."""
repos_data = []
try:
# type='owner' fetches repos owned by the user
# type='member' fetches repos the user is a collaborator on (including orgs)
# type='all' fetches both
for repo in self.gh.repositories(type="all", sort="updated"):
repos_data.append(
{
"id": repo.id,
"name": repo.name,
"full_name": repo.full_name,
"private": repo.private,
"url": repo.html_url,
"description": repo.description or "",
"last_updated": repo.updated_at if repo.updated_at else None,
}
)
logger.info(f"Fetched {len(repos_data)} repositories.")
return repos_data
except Exception as e:
logger.error(f"Failed to fetch GitHub repositories: {e}")
return [] # Return empty list on error
def get_repository_files(
self, repo_full_name: str, path: str = ""
) -> list[dict[str, Any]]:
async def ingest_repository(
self,
repo_full_name: str,
branch: str | None = None,
include_patterns: list[str] | None = None,
exclude_patterns: list[str] | None = None,
max_file_size: int = MAX_FILE_SIZE,
) -> RepositoryDigest | None:
"""
Recursively fetches details of relevant files (code, docs) within a repository path.
Ingest an entire repository and return a digest.
Args:
repo_full_name: The full name of the repository (e.g., 'owner/repo').
path: The starting path within the repository (default is root).
branch: Optional specific branch or tag to ingest.
include_patterns: Optional list of glob patterns for files to include.
If None, includes all files (recommended).
exclude_patterns: Optional list of glob patterns for files to exclude.
If None, uses DEFAULT_EXCLUDE_PATTERNS.
max_file_size: Maximum file size in bytes to include (default 5MB).
Returns:
A list of dictionaries, each containing file details (path, sha, url, size).
Returns an empty list if the repository or path is not found or on error.
RepositoryDigest containing the summary, tree structure, and content,
or None if ingestion fails.
"""
files_list = []
repo_url = f"https://github.com/{repo_full_name}"
# Use only exclude_patterns by default (recommended for comprehensive analysis)
# This ensures we don't miss any relevant file types
exclude_pats = exclude_patterns if exclude_patterns is not None else DEFAULT_EXCLUDE_PATTERNS
logger.info(f"Starting gitingest for repository: {repo_full_name}")
try:
owner, repo_name = repo_full_name.split("/")
repo = self.gh.repository(owner, repo_name)
if not repo:
logger.warning(f"Repository '{repo_full_name}' not found.")
return []
contents = repo.directory_contents(
directory_path=path
) # Use directory_contents for clarity
# Build kwargs dynamically
ingest_kwargs = {
"max_file_size": max_file_size,
"exclude_patterns": exclude_pats,
"include_gitignored": False,
"include_submodules": False,
}
# contents returns a list of tuples (name, content_obj)
for _item_name, content_item in contents:
if not isinstance(content_item, Contents):
continue
# Only add token if provided (required only for private repos)
if self.token:
ingest_kwargs["token"] = self.token
if content_item.type == "dir":
# Check if the directory name is in the skipped list
if content_item.name in self.SKIPPED_DIRS:
logger.debug(f"Skipping directory: {content_item.path}")
continue # Skip recursion for this directory
# Only add branch if specified
if branch:
ingest_kwargs["branch"] = branch
# Recursively fetch contents of subdirectory
files_list.extend(
self.get_repository_files(
repo_full_name, path=content_item.path
)
)
elif content_item.type == "file":
# Check if the file extension is relevant and size is within limits
file_extension = (
"." + content_item.name.split(".")[-1].lower()
if "." in content_item.name
else ""
)
is_code = file_extension in CODE_EXTENSIONS
is_doc = file_extension in DOC_EXTENSIONS
# Only add include_patterns if explicitly provided
if include_patterns is not None:
ingest_kwargs["include_patterns"] = include_patterns
if (is_code or is_doc) and content_item.size <= MAX_FILE_SIZE:
files_list.append(
{
"path": content_item.path,
"sha": content_item.sha,
"url": content_item.html_url,
"size": content_item.size,
"type": "code" if is_code else "doc",
}
)
elif content_item.size > MAX_FILE_SIZE:
logger.debug(
f"Skipping large file: {content_item.path} ({content_item.size} bytes)"
)
else:
logger.debug(
f"Skipping irrelevant file type: {content_item.path}"
)
summary, tree, content = await ingest_async(repo_url, **ingest_kwargs)
except (NotFoundError, ForbiddenError) as e:
logger.warning(f"Cannot access path '{path}' in '{repo_full_name}': {e}")
except Exception as e:
logger.error(
f"Failed to get files for {repo_full_name} at path '{path}': {e}"
if not content or not content.strip():
logger.warning(
f"No content retrieved from repository: {repo_full_name}"
)
return None
digest = RepositoryDigest(
repo_full_name=repo_full_name,
summary=summary,
tree=tree,
content=content,
branch=branch,
)
# Return what we have collected so far in case of partial failure
return files_list
logger.info(
f"Successfully ingested {repo_full_name}: "
f"~{digest.estimated_tokens} estimated tokens"
)
return digest
def get_file_content(self, repo_full_name: str, file_path: str) -> str | None:
except Exception as e:
logger.error(f"Failed to ingest repository {repo_full_name}: {e}")
return None
async def ingest_repositories(
self,
repo_full_names: list[str],
branch: str | None = None,
include_patterns: list[str] | None = None,
exclude_patterns: list[str] | None = None,
max_file_size: int = MAX_FILE_SIZE,
) -> list[RepositoryDigest]:
"""
Fetches the decoded content of a specific file.
Ingest multiple repositories and return their digests.
Args:
repo_full_name: The full name of the repository (e.g., 'owner/repo').
file_path: The path to the file within the repository.
repo_full_names: List of repository full names (e.g., ['owner/repo1', 'owner/repo2']).
branch: Optional specific branch or tag to ingest (applied to all repos).
include_patterns: Optional list of glob patterns for files to include.
exclude_patterns: Optional list of glob patterns for files to exclude.
max_file_size: Maximum file size in bytes to include.
Returns:
The decoded file content as a string, or None if fetching fails or file is too large.
List of RepositoryDigest objects for successfully ingested repositories.
"""
try:
owner, repo_name = repo_full_name.split("/")
repo = self.gh.repository(owner, repo_name)
if not repo:
logger.warning(
f"Repository '{repo_full_name}' not found when fetching file '{file_path}'."
)
return None
digests = []
content_item = repo.file_contents(
path=file_path
) # Use file_contents for clarity
for repo_full_name in repo_full_names:
if not repo_full_name or not isinstance(repo_full_name, str):
logger.warning(f"Skipping invalid repository entry: {repo_full_name}")
continue
if (
not content_item
or not isinstance(content_item, Contents)
or content_item.type != "file"
):
logger.warning(
f"File '{file_path}' not found or is not a file in '{repo_full_name}'."
)
return None
if content_item.size > MAX_FILE_SIZE:
logger.warning(
f"File '{file_path}' in '{repo_full_name}' exceeds max size ({content_item.size} > {MAX_FILE_SIZE}). Skipping content fetch."
)
return None
# Content is base64 encoded
if content_item.content:
try:
decoded_content = base64.b64decode(content_item.content).decode(
"utf-8"
)
return decoded_content
except UnicodeDecodeError:
logger.warning(
f"Could not decode file '{file_path}' in '{repo_full_name}' as UTF-8. Trying with 'latin-1'."
)
try:
# Try a fallback encoding
decoded_content = base64.b64decode(content_item.content).decode(
"latin-1"
)
return decoded_content
except Exception as decode_err:
logger.error(
f"Failed to decode file '{file_path}' with fallback encoding: {decode_err}"
)
return None # Give up if fallback fails
else:
logger.warning(
f"No content returned for file '{file_path}' in '{repo_full_name}'. It might be empty."
)
return "" # Return empty string for empty files
except (NotFoundError, ForbiddenError) as e:
logger.warning(
f"Cannot access file '{file_path}' in '{repo_full_name}': {e}"
digest = await self.ingest_repository(
repo_full_name=repo_full_name,
branch=branch,
include_patterns=include_patterns,
exclude_patterns=exclude_patterns,
max_file_size=max_file_size,
)
return None
except Exception as e:
logger.error(
f"Failed to get content for file '{file_path}' in '{repo_full_name}': {e}"
)
return None
if digest:
digests.append(digest)
logger.info(
f"Ingested {len(digests)} out of {len(repo_full_names)} repositories."
)
return digests

View file

@ -1,5 +1,8 @@
"""
GitHub connector indexer.
GitHub connector indexer using gitingest.
This indexer processes entire repository digests in one pass, dramatically
reducing LLM API calls compared to the previous file-by-file approach.
"""
from datetime import UTC, datetime
@ -8,7 +11,7 @@ from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession
from app.config import config
from app.connectors.github_connector import GitHubConnector
from app.connectors.github_connector import GitHubConnector, RepositoryDigest
from app.db import Document, DocumentType, SearchSourceConnectorType
from app.services.llm_service import get_user_long_context_llm
from app.services.task_logging_service import TaskLoggingService
@ -26,43 +29,55 @@ from .base import (
logger,
)
# Maximum tokens for a single digest before splitting
# Most LLMs can handle 128k+ tokens now, but we'll be conservative
MAX_DIGEST_CHARS = 500_000 # ~125k tokens
async def index_github_repos(
session: AsyncSession,
connector_id: int,
search_space_id: int,
user_id: str,
start_date: str | None = None,
end_date: str | None = None,
start_date: str | None = None, # Ignored - GitHub indexes full repo snapshots
end_date: str | None = None, # Ignored - GitHub indexes full repo snapshots
update_last_indexed: bool = True,
) -> tuple[int, str | None]:
"""
Index code and documentation files from accessible GitHub repositories.
Index GitHub repositories using gitingest for efficient processing.
This function ingests entire repositories as digests, generates a single
summary per repository, and chunks the content for vector storage.
Note: The start_date and end_date parameters are accepted for API compatibility
but are IGNORED. GitHub repositories are indexed as complete snapshots since
gitingest captures the current state of the entire codebase.
Args:
session: Database session
connector_id: ID of the GitHub connector
search_space_id: ID of the search space to store documents in
user_id: ID of the user
start_date: Start date for filtering (YYYY-MM-DD format) - Note: GitHub indexing processes all files regardless of dates
end_date: End date for filtering (YYYY-MM-DD format) - Note: GitHub indexing processes all files regardless of dates
start_date: Ignored - kept for API compatibility
end_date: Ignored - kept for API compatibility
update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
Returns:
Tuple containing (number of documents indexed, error message or None)
"""
# Note: start_date and end_date are intentionally unused
_ = start_date, end_date
task_logger = TaskLoggingService(session, search_space_id)
# Log task start
log_entry = await task_logger.log_task_start(
task_name="github_repos_indexing",
source="connector_indexing_task",
message=f"Starting GitHub repositories indexing for connector {connector_id}",
message=f"Starting GitHub repositories indexing for connector {connector_id} (using gitingest)",
metadata={
"connector_id": connector_id,
"user_id": str(user_id),
"start_date": start_date,
"end_date": end_date,
"method": "gitingest",
},
)
@ -93,19 +108,11 @@ async def index_github_repos(
f"Connector with ID {connector_id} not found or is not a GitHub connector",
)
# 2. Get the GitHub PAT and selected repositories from the connector config
github_pat = connector.config.get("GITHUB_PAT")
# 2. Get the GitHub PAT (optional) and selected repositories from the connector config
# PAT is only required for private repositories - public repos work without it
github_pat = connector.config.get("GITHUB_PAT") # Can be None or empty
repo_full_names_to_index = connector.config.get("repo_full_names")
if not github_pat:
await task_logger.log_task_failure(
log_entry,
f"GitHub Personal Access Token (PAT) not found in connector config for connector {connector_id}",
"Missing GitHub PAT",
{"error_type": "MissingToken"},
)
return 0, "GitHub Personal Access Token (PAT) not found in connector config"
if not repo_full_names_to_index or not isinstance(
repo_full_names_to_index, list
):
@ -117,10 +124,16 @@ async def index_github_repos(
)
return 0, "'repo_full_names' not found or is not a list in connector config"
# 3. Initialize GitHub connector client
# Log whether we're using authentication
if github_pat:
logger.info("Using GitHub PAT for authentication (private repos supported)")
else:
logger.info("No GitHub PAT provided - only public repositories can be indexed")
# 3. Initialize GitHub connector with gitingest backend
await task_logger.log_task_progress(
log_entry,
f"Initializing GitHub client for connector {connector_id}",
f"Initializing gitingest-based GitHub client for connector {connector_id}",
{
"stage": "client_initialization",
"repo_count": len(repo_full_names_to_index),
@ -138,258 +151,52 @@ async def index_github_repos(
)
return 0, f"Failed to initialize GitHub client: {e!s}"
# 4. Validate selected repositories
# 4. Process each repository with gitingest
await task_logger.log_task_progress(
log_entry,
f"Starting indexing for {len(repo_full_names_to_index)} selected repositories",
f"Starting gitingest processing for {len(repo_full_names_to_index)} repositories",
{
"stage": "repo_processing",
"repo_count": len(repo_full_names_to_index),
"start_date": start_date,
"end_date": end_date,
},
)
logger.info(
f"Starting indexing for {len(repo_full_names_to_index)} selected repositories."
f"Starting gitingest indexing for {len(repo_full_names_to_index)} repositories."
)
if start_date and end_date:
logger.info(
f"Date range requested: {start_date} to {end_date} (Note: GitHub indexing processes all files regardless of dates)"
)
# 6. Iterate through selected repositories and index files
for repo_full_name in repo_full_names_to_index:
if not repo_full_name or not isinstance(repo_full_name, str):
logger.warning(f"Skipping invalid repository entry: {repo_full_name}")
continue
logger.info(f"Processing repository: {repo_full_name}")
logger.info(f"Ingesting repository: {repo_full_name}")
try:
files_to_index = github_client.get_repository_files(repo_full_name)
if not files_to_index:
logger.info(
f"No indexable files found in repository: {repo_full_name}"
# Ingest the entire repository
digest = await github_client.ingest_repository(repo_full_name)
if not digest:
logger.warning(
f"No digest returned for repository: {repo_full_name}"
)
errors.append(f"No digest for {repo_full_name}")
continue
logger.info(
f"Found {len(files_to_index)} files to process in {repo_full_name}"
# Process the digest and create documents
docs_created = await _process_repository_digest(
session=session,
digest=digest,
search_space_id=search_space_id,
user_id=user_id,
task_logger=task_logger,
log_entry=log_entry,
)
for file_info in files_to_index:
file_path = file_info.get("path")
file_url = file_info.get("url")
file_sha = file_info.get("sha")
file_type = file_info.get("type") # 'code' or 'doc'
full_path_key = f"{repo_full_name}/{file_path}"
if not file_path or not file_url or not file_sha:
logger.warning(
f"Skipping file with missing info in {repo_full_name}: {file_info}"
)
continue
# Get file content
file_content = github_client.get_file_content(
repo_full_name, file_path
)
if file_content is None:
logger.warning(
f"Could not retrieve content for {full_path_key}. Skipping."
)
continue # Skip if content fetch failed
# Generate unique identifier hash for this GitHub file
unique_identifier_hash = generate_unique_identifier_hash(
DocumentType.GITHUB_CONNECTOR, file_sha, search_space_id
)
# Generate content hash
content_hash = generate_content_hash(file_content, search_space_id)
# Check if document with this unique identifier already exists
existing_document = await check_document_by_unique_identifier(
session, unique_identifier_hash
)
if existing_document:
# Document exists - check if content has changed
if existing_document.content_hash == content_hash:
logger.info(
f"Document for GitHub file {full_path_key} unchanged. Skipping."
)
continue
else:
# Content has changed - update the existing document
logger.info(
f"Content changed for GitHub file {full_path_key}. Updating document."
)
# Generate summary with metadata
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
)
if user_llm:
file_extension = (
file_path.split(".")[-1]
if "." in file_path
else None
)
document_metadata = {
"file_path": full_path_key,
"repository": repo_full_name,
"file_type": file_extension or "unknown",
"document_type": "GitHub Repository File",
"connector_type": "GitHub",
}
(
summary_content,
summary_embedding,
) = await generate_document_summary(
file_content, user_llm, document_metadata
)
else:
summary_content = f"GitHub file: {full_path_key}\n\n{file_content[:1000]}..."
summary_embedding = (
config.embedding_model_instance.embed(
summary_content
)
)
# Chunk the content
try:
if hasattr(config, "code_chunker_instance"):
chunks_data = [
await create_document_chunks(file_content)
][0]
else:
chunks_data = await create_document_chunks(
file_content
)
except Exception as chunk_err:
logger.error(
f"Failed to chunk file {full_path_key}: {chunk_err}"
)
continue
# Update existing document
existing_document.title = f"GitHub - {full_path_key}"
existing_document.content = summary_content
existing_document.content_hash = content_hash
existing_document.embedding = summary_embedding
existing_document.document_metadata = {
"file_path": file_path,
"file_sha": file_sha,
"file_url": file_url,
"repository": repo_full_name,
"indexed_at": datetime.now(UTC).strftime(
"%Y-%m-%d %H:%M:%S"
),
}
existing_document.chunks = chunks_data
existing_document.updated_at = get_current_timestamp()
logger.info(
f"Successfully updated GitHub file {full_path_key}"
)
continue
# Document doesn't exist - create new one
# Generate summary with metadata
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
)
if user_llm:
# Extract file extension from file path
file_extension = (
file_path.split(".")[-1] if "." in file_path else None
)
document_metadata = {
"file_path": full_path_key,
"repository": repo_full_name,
"file_type": file_extension or "unknown",
"document_type": "GitHub Repository File",
"connector_type": "GitHub",
}
(
summary_content,
summary_embedding,
) = await generate_document_summary(
file_content, user_llm, document_metadata
)
else:
# Fallback to simple summary if no LLM configured
summary_content = (
f"GitHub file: {full_path_key}\n\n{file_content[:1000]}..."
)
summary_embedding = config.embedding_model_instance.embed(
summary_content
)
# Chunk the content
try:
chunks_data = [await create_document_chunks(file_content)][0]
# Use code chunker if available, otherwise regular chunker
if hasattr(config, "code_chunker_instance"):
chunks_data = [
{
"content": chunk.text,
"embedding": config.embedding_model_instance.embed(
chunk.text
),
}
for chunk in config.code_chunker_instance.chunk(
file_content
)
]
else:
chunks_data = await create_document_chunks(file_content)
except Exception as chunk_err:
logger.error(
f"Failed to chunk file {full_path_key}: {chunk_err}"
)
errors.append(
f"Chunking failed for {full_path_key}: {chunk_err}"
)
continue # Skip this file if chunking fails
doc_metadata = {
"repository_full_name": repo_full_name,
"file_path": file_path,
"full_path": full_path_key, # For easier lookup
"url": file_url,
"sha": file_sha,
"type": file_type,
"indexed_at": datetime.now(UTC).isoformat(),
}
# Create new document
logger.info(f"Creating new document for file: {full_path_key}")
document = Document(
title=f"GitHub - {file_path}",
document_type=DocumentType.GITHUB_CONNECTOR,
document_metadata=doc_metadata,
content=summary_content, # Store summary
content_hash=content_hash,
unique_identifier_hash=unique_identifier_hash,
embedding=summary_embedding,
search_space_id=search_space_id,
chunks=chunks_data, # Associate chunks directly
updated_at=get_current_timestamp(),
)
session.add(document)
documents_processed += 1
# Batch commit every 10 documents
if documents_processed % 10 == 0:
logger.info(
f"Committing batch: {documents_processed} GitHub files processed so far"
)
await session.commit()
documents_processed += docs_created
logger.info(
f"Created {docs_created} documents from repository: {repo_full_name}"
)
except Exception as repo_err:
logger.error(
@ -397,11 +204,11 @@ async def index_github_repos(
)
errors.append(f"Failed processing {repo_full_name}: {repo_err}")
# Final commit for any remaining documents not yet committed in batches
logger.info(f"Final commit: Total {documents_processed} GitHub files processed")
# Final commit
await session.commit()
logger.info(
f"Finished GitHub indexing for connector {connector_id}. Processed {documents_processed} files."
f"Finished GitHub indexing for connector {connector_id}. "
f"Created {documents_processed} documents."
)
# Log success
@ -412,6 +219,7 @@ async def index_github_repos(
"documents_processed": documents_processed,
"errors_count": len(errors),
"repo_count": len(repo_full_names_to_index),
"method": "gitingest",
},
)
@ -428,6 +236,7 @@ async def index_github_repos(
)
errors.append(f"Database error: {db_err}")
return documents_processed, "; ".join(errors) if errors else str(db_err)
except Exception as e:
await session.rollback()
await task_logger.log_task_failure(
@ -445,3 +254,173 @@ async def index_github_repos(
error_message = "; ".join(errors) if errors else None
return documents_processed, error_message
async def _process_repository_digest(
session: AsyncSession,
digest: RepositoryDigest,
search_space_id: int,
user_id: str,
task_logger: TaskLoggingService,
log_entry,
) -> int:
"""
Process a repository digest and create documents.
For each repository, we create:
1. One main document with the repository summary
2. Chunks from the full digest content for granular search
Args:
session: Database session
digest: The repository digest from gitingest
search_space_id: ID of the search space
user_id: ID of the user
task_logger: Task logging service
log_entry: Current log entry
Returns:
Number of documents created
"""
repo_full_name = digest.repo_full_name
documents_created = 0
# Generate unique identifier based on repo name and content hash
# This allows updates when repo content changes
full_content = digest.full_digest
content_hash = generate_content_hash(full_content, search_space_id)
# Use repo name as the unique identifier (one document per repo)
unique_identifier_hash = generate_unique_identifier_hash(
DocumentType.GITHUB_CONNECTOR, repo_full_name, search_space_id
)
# Check if document with this unique identifier already exists
existing_document = await check_document_by_unique_identifier(
session, unique_identifier_hash
)
if existing_document:
# Document exists - check if content has changed
if existing_document.content_hash == content_hash:
logger.info(
f"Repository {repo_full_name} unchanged. Skipping."
)
return 0
else:
logger.info(
f"Content changed for repository {repo_full_name}. Updating document."
)
# Delete existing document to replace with new one
await session.delete(existing_document)
await session.flush()
# Generate summary using LLM (ONE call per repository!)
user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
document_metadata = {
"repository": repo_full_name,
"document_type": "GitHub Repository",
"connector_type": "GitHub",
"ingestion_method": "gitingest",
"file_tree": digest.tree[:2000] if len(digest.tree) > 2000 else digest.tree,
"estimated_tokens": digest.estimated_tokens,
}
if user_llm:
# Prepare content for summarization
# Include tree structure and truncated content if too large
summary_content = digest.full_digest
if len(summary_content) > MAX_DIGEST_CHARS:
# Truncate but keep the tree and beginning of content
summary_content = (
f"# Repository: {repo_full_name}\n\n"
f"## File Structure\n\n{digest.tree}\n\n"
f"## File Contents (truncated)\n\n{digest.content[:MAX_DIGEST_CHARS - len(digest.tree) - 200]}..."
)
summary_text, summary_embedding = await generate_document_summary(
summary_content, user_llm, document_metadata
)
else:
# Fallback to simple summary if no LLM configured
summary_text = (
f"# GitHub Repository: {repo_full_name}\n\n"
f"## Summary\n{digest.summary}\n\n"
f"## File Structure\n{digest.tree[:3000]}"
)
summary_embedding = config.embedding_model_instance.embed(summary_text)
# Chunk the full digest content for granular search
try:
# Use the content (not the summary) for chunking
# This preserves file-level granularity in search
chunks_data = await create_document_chunks(digest.content)
except Exception as chunk_err:
logger.error(
f"Failed to chunk repository {repo_full_name}: {chunk_err}"
)
# Fall back to a simpler chunking approach
chunks_data = await _simple_chunk_content(digest.content)
# Create the document
doc_metadata = {
"repository_full_name": repo_full_name,
"url": f"https://github.com/{repo_full_name}",
"branch": digest.branch,
"ingestion_method": "gitingest",
"file_tree": digest.tree,
"gitingest_summary": digest.summary,
"estimated_tokens": digest.estimated_tokens,
"indexed_at": datetime.now(UTC).isoformat(),
}
document = Document(
title=f"GitHub Repository: {repo_full_name}",
document_type=DocumentType.GITHUB_CONNECTOR,
document_metadata=doc_metadata,
content=summary_text,
content_hash=content_hash,
unique_identifier_hash=unique_identifier_hash,
embedding=summary_embedding,
search_space_id=search_space_id,
chunks=chunks_data,
updated_at=get_current_timestamp(),
)
session.add(document)
documents_created += 1
logger.info(
f"Created document for repository {repo_full_name} "
f"with {len(chunks_data)} chunks"
)
return documents_created
async def _simple_chunk_content(content: str, chunk_size: int = 4000) -> list:
"""
Simple fallback chunking when the regular chunker fails.
Args:
content: The content to chunk
chunk_size: Size of each chunk in characters
Returns:
List of chunk dictionaries with content and embedding
"""
from app.db import Chunk
chunks = []
for i in range(0, len(content), chunk_size):
chunk_text = content[i : i + chunk_size]
if chunk_text.strip():
chunks.append(
Chunk(
content=chunk_text,
embedding=config.embedding_model_instance.embed(chunk_text),
)
)
return chunks
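
To illustrate how the rewritten indexer is invoked, a hedged sketch follows; the session factory and its import are assumptions (only index_github_repos and its signature appear in this diff), and the IDs are placeholders:

from app.db import async_session_maker  # assumed session factory, not shown in this commit


async def run_github_indexing() -> None:
    # index_github_repos is the coroutine defined in the indexer above.
    async with async_session_maker() as session:
        # start_date / end_date are intentionally omitted: gitingest indexes
        # the full repository snapshot regardless of any date range.
        count, error = await index_github_repos(
            session=session,
            connector_id=42,       # illustrative connector ID
            search_space_id=7,     # illustrative search space ID
            user_id="user-123",    # illustrative user ID
        )
        print(f"Indexed {count} documents", error or "(no errors)")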

View file

@ -60,6 +60,7 @@ dependencies = [
"mcp>=1.25.0",
"starlette>=0.40.0,<0.51.0",
"sse-starlette>=3.1.1,<3.1.2",
"gitingest>=0.3.1",
]
[dependency-groups]

View file

@ -1945,6 +1945,25 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/61/ad/2394d4fb542574678b0ba342daf734d4d811768da3c2ee0c84d509dcb26c/github3.py-4.0.1-py3-none-any.whl", hash = "sha256:a89af7de25650612d1da2f0609622bcdeb07ee8a45a1c06b2d16a05e4234e753", size = 151800, upload-time = "2023-04-26T17:56:25.015Z" },
]
[[package]]
name = "gitingest"
version = "0.3.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "click" },
{ name = "httpx" },
{ name = "loguru" },
{ name = "pathspec" },
{ name = "pydantic" },
{ name = "python-dotenv" },
{ name = "starlette" },
{ name = "tiktoken" },
]
sdist = { url = "https://files.pythonhosted.org/packages/d6/fe/a915f0c32a3d7920206a677f73c185b3eadf4ec151fb05aedd52e64713f7/gitingest-0.3.1.tar.gz", hash = "sha256:4587cab873d4e08bdb16d612bb153c23e0ce59771a1d57a438239c5e39f05ebf", size = 70681, upload-time = "2025-07-31T13:56:19.845Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/00/15/f200ab2e73287e67d1dce6fbacf421552ae9fbafdc5f0cc8dd0d2fe4fc47/gitingest-0.3.1-py3-none-any.whl", hash = "sha256:8143a5e6a7140ede9f680e13d3931ac07c82ac9bd8bab9ad1fba017c8c1e8666", size = 68343, upload-time = "2025-07-31T13:56:17.729Z" },
]
[[package]]
name = "google-api-core"
version = "2.25.1"
@ -4460,6 +4479,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/39/c2/646d2e93e0af70f4e5359d870a63584dacbc324b54d73e6b3267920ff117/pandas-2.3.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:bb3be958022198531eb7ec2008cfc78c5b1eed51af8600c6c5d9160d89d8d249", size = 13231847, upload-time = "2025-06-05T03:27:51.465Z" },
]
[[package]]
name = "pathspec"
version = "1.0.3"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/4c/b2/bb8e495d5262bfec41ab5cb18f522f1012933347fb5d9e62452d446baca2/pathspec-1.0.3.tar.gz", hash = "sha256:bac5cf97ae2c2876e2d25ebb15078eb04d76e4b98921ee31c6f85ade8b59444d", size = 130841, upload-time = "2026-01-09T15:46:46.009Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/32/2b/121e912bd60eebd623f873fd090de0e84f322972ab25a7f9044c056804ed/pathspec-1.0.3-py3-none-any.whl", hash = "sha256:e80767021c1cc524aa3fb14bedda9c34406591343cc42797b386ce7b9354fb6c", size = 55021, upload-time = "2026-01-09T15:46:44.652Z" },
]
[[package]]
name = "pdf2image"
version = "1.17.0"
@ -6484,6 +6512,7 @@ dependencies = [
{ name = "firecrawl-py" },
{ name = "flower" },
{ name = "github3-py" },
{ name = "gitingest" },
{ name = "google-api-python-client" },
{ name = "google-auth-oauthlib" },
{ name = "kokoro" },
@ -6549,6 +6578,7 @@ requires-dist = [
{ name = "firecrawl-py", specifier = ">=4.9.0" },
{ name = "flower", specifier = ">=2.0.1" },
{ name = "github3-py", specifier = "==4.0.1" },
{ name = "gitingest", specifier = ">=0.3.1" },
{ name = "google-api-python-client", specifier = ">=2.156.0" },
{ name = "google-auth-oauthlib", specifier = ">=1.2.1" },
{ name = "kokoro", specifier = ">=0.9.4" },

View file

@ -34,7 +34,6 @@ import {
} from "@/components/ui/select";
import { Switch } from "@/components/ui/switch";
import { EnumConnectorName } from "@/contracts/enums/connector";
import { DateRangeSelector } from "../../components/date-range-selector";
import { getConnectorBenefits } from "../connector-benefits";
import type { ConnectFormProps } from "../index";
@ -44,12 +43,13 @@ const githubConnectorFormSchema = z.object({
}),
github_pat: z
.string()
.min(20, {
message: "GitHub Personal Access Token seems too short.",
})
.refine((pat) => pat.startsWith("ghp_") || pat.startsWith("github_pat_"), {
message: "GitHub PAT should start with 'ghp_' or 'github_pat_'",
}),
.optional()
.refine(
(pat) => !pat || pat.startsWith("ghp_") || pat.startsWith("github_pat_"),
{
message: "GitHub PAT should start with 'ghp_' or 'github_pat_'",
}
),
repo_full_names: z.string().min(1, {
message: "At least one repository is required.",
}),
@ -59,8 +59,6 @@ type GithubConnectorFormValues = z.infer<typeof githubConnectorFormSchema>;
export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting }) => {
const isSubmittingRef = useRef(false);
const [startDate, setStartDate] = useState<Date | undefined>(undefined);
const [endDate, setEndDate] = useState<Date | undefined>(undefined);
const [periodicEnabled, setPeriodicEnabled] = useState(false);
const [frequencyMinutes, setFrequencyMinutes] = useState("1440");
const form = useForm<GithubConnectorFormValues>({
@ -94,7 +92,7 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
name: values.name,
connector_type: EnumConnectorName.GITHUB_CONNECTOR,
config: {
GITHUB_PAT: values.github_pat,
GITHUB_PAT: values.github_pat || null, // Optional - only for private repos
repo_full_names: repoList,
},
is_indexable: true,
@ -102,8 +100,9 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
periodic_indexing_enabled: periodicEnabled,
indexing_frequency_minutes: periodicEnabled ? parseInt(frequencyMinutes, 10) : null,
next_scheduled_at: null,
startDate,
endDate,
// GitHub indexes full repo snapshots - no date range needed
startDate: undefined,
endDate: undefined,
periodicEnabled,
frequencyMinutes,
});
@ -117,10 +116,10 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
<Alert className="bg-slate-400/5 dark:bg-white/5 border-slate-400/20 p-2 sm:p-3 flex items-center [&>svg]:relative [&>svg]:left-0 [&>svg]:top-0 [&>svg+div]:translate-y-0">
<Info className="h-3 w-3 sm:h-4 sm:w-4 shrink-0 ml-1" />
<div className="-ml-1">
<AlertTitle className="text-xs sm:text-sm">Personal Access Token Required</AlertTitle>
<AlertTitle className="text-xs sm:text-sm">Personal Access Token (Optional)</AlertTitle>
<AlertDescription className="text-[10px] sm:text-xs !pl-0">
You'll need a GitHub Personal Access Token to use this connector. You can create one
from{" "}
A GitHub PAT is only required for private repositories. Public repos work without a
token. Create one from{" "}
<a
href="https://github.com/settings/tokens"
target="_blank"
@ -128,7 +127,8 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
className="font-medium underline underline-offset-4"
>
GitHub Settings
</a>
</a>{" "}
if needed.
</AlertDescription>
</div>
</Alert>
@ -167,7 +167,10 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
name="github_pat"
render={({ field }) => (
<FormItem>
<FormLabel className="text-xs sm:text-sm">GitHub Personal Access Token</FormLabel>
<FormLabel className="text-xs sm:text-sm">
GitHub Personal Access Token{" "}
<span className="text-muted-foreground font-normal">(optional)</span>
</FormLabel>
<FormControl>
<Input
type="password"
@ -178,8 +181,8 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
/>
</FormControl>
<FormDescription className="text-[10px] sm:text-xs">
Your GitHub PAT will be encrypted and stored securely. It typically starts with
"ghp_" or "github_pat_".
Only required for private repositories. Leave empty if indexing public repos
only.
</FormDescription>
<FormMessage />
</FormItem>
@ -225,15 +228,9 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
{/* Indexing Configuration */}
<div className="space-y-4 pt-4 border-t border-slate-400/20">
<h3 className="text-sm sm:text-base font-medium">Indexing Configuration</h3>
<h3 className="text-sm sm:text-base font-medium">Sync Configuration</h3>
{/* Date Range Selector */}
<DateRangeSelector
startDate={startDate}
endDate={endDate}
onStartDateChange={setStartDate}
onEndDateChange={setEndDate}
/>
{/* Note: No date range for GitHub - it indexes full repo snapshots */}
{/* Periodic Sync Config */}
<div className="rounded-xl bg-slate-400/5 dark:bg-white/5 p-3 sm:p-6">

View file

@ -490,8 +490,8 @@ export function SourceDetailPanel({
>
{idx + 1}
{isCited && (
<span className="absolute -top-1 -right-1 w-3 h-3 bg-primary rounded-full border-2 border-background">
<Sparkles className="h-2 w-2 text-primary-foreground absolute top-0.5 left-0.5" />
<span className="absolute -top-1.5 -right-1.5 flex items-center justify-center w-4 h-4 bg-primary rounded-full border-2 border-background shadow-sm">
<Sparkles className="h-2.5 w-2.5 text-primary-foreground" />
</span>
)}
</motion.button>