mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-25 19:15:18 +02:00
feat: Integrate gitingest for GitHub repository ingestion
- Added gitingest as a dependency to streamline the ingestion of GitHub repositories.
- Refactored GitHubConnector to utilize gitingest for efficient repository digest generation, reducing API calls.
- Updated GitHub indexer to process entire repository digests, enhancing performance and simplifying the indexing process.
- Modified GitHub connect form to indicate that the Personal Access Token is optional for public repositories.
This commit is contained in:
parent
6e331c3b85
commit
49b8a46d10
6 changed files with 545 additions and 539 deletions
|
|
@ -1,296 +1,295 @@
|
||||||
import base64
|
"""
|
||||||
import logging
|
GitHub connector using gitingest for efficient repository digestion.
|
||||||
from typing import Any
|
|
||||||
|
|
||||||
from github3 import exceptions as github_exceptions, login as github_login
|
This connector replaces the previous file-by-file approach with a single
|
||||||
from github3.exceptions import ForbiddenError, NotFoundError
|
digest generation per repository, dramatically reducing LLM API calls.
|
||||||
from github3.repos.contents import Contents
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
from gitingest import ingest_async
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
# List of common code file extensions to target
|
# Maximum file size in bytes (5MB)
|
||||||
CODE_EXTENSIONS = {
|
MAX_FILE_SIZE = 5 * 1024 * 1024
|
||||||
".py",
|
|
||||||
".js",
|
|
||||||
".jsx",
|
|
||||||
".ts",
|
|
||||||
".tsx",
|
|
||||||
".java",
|
|
||||||
".c",
|
|
||||||
".cpp",
|
|
||||||
".h",
|
|
||||||
".hpp",
|
|
||||||
".cs",
|
|
||||||
".go",
|
|
||||||
".rb",
|
|
||||||
".php",
|
|
||||||
".swift",
|
|
||||||
".kt",
|
|
||||||
".scala",
|
|
||||||
".rs",
|
|
||||||
".m",
|
|
||||||
".sh",
|
|
||||||
".bash",
|
|
||||||
".ps1",
|
|
||||||
".lua",
|
|
||||||
".pl",
|
|
||||||
".pm",
|
|
||||||
".r",
|
|
||||||
".dart",
|
|
||||||
".sql",
|
|
||||||
}
|
|
||||||
|
|
||||||
# List of common documentation/text file extensions
|
# Default patterns to exclude (recommended approach for comprehensive analysis)
|
||||||
DOC_EXTENSIONS = {
|
# Using only exclude_patterns ensures we don't miss any relevant file types
|
||||||
".md",
|
DEFAULT_EXCLUDE_PATTERNS = [
|
||||||
".txt",
|
# Dependencies
|
||||||
".rst",
|
"node_modules/*",
|
||||||
".adoc",
|
"vendor/*",
|
||||||
".html",
|
"bower_components/*",
|
||||||
".htm",
|
".pnpm/*",
|
||||||
".xml",
|
# Build artifacts / Caches
|
||||||
".json",
|
"build/*",
|
||||||
".yaml",
|
"dist/*",
|
||||||
".yml",
|
"target/*",
|
||||||
".toml",
|
"out/*",
|
||||||
}
|
"__pycache__/*",
|
||||||
|
"*.pyc",
|
||||||
|
".cache/*",
|
||||||
|
".next/*",
|
||||||
|
".nuxt/*",
|
||||||
|
# Virtual environments
|
||||||
|
"venv/*",
|
||||||
|
".venv/*",
|
||||||
|
"env/*",
|
||||||
|
".env/*",
|
||||||
|
# IDE/Editor config
|
||||||
|
".vscode/*",
|
||||||
|
".idea/*",
|
||||||
|
".project",
|
||||||
|
".settings/*",
|
||||||
|
"*.swp",
|
||||||
|
"*.swo",
|
||||||
|
# Version control
|
||||||
|
".git/*",
|
||||||
|
".svn/*",
|
||||||
|
".hg/*",
|
||||||
|
# Temporary / Logs
|
||||||
|
"tmp/*",
|
||||||
|
"temp/*",
|
||||||
|
"logs/*",
|
||||||
|
"*.log",
|
||||||
|
# Lock files (usually not needed for understanding code)
|
||||||
|
"package-lock.json",
|
||||||
|
"pnpm-lock.yaml",
|
||||||
|
"yarn.lock",
|
||||||
|
"uv.lock",
|
||||||
|
"Gemfile.lock",
|
||||||
|
"poetry.lock",
|
||||||
|
"Cargo.lock",
|
||||||
|
"composer.lock",
|
||||||
|
# Binary/media files
|
||||||
|
"*.png",
|
||||||
|
"*.jpg",
|
||||||
|
"*.jpeg",
|
||||||
|
"*.gif",
|
||||||
|
"*.ico",
|
||||||
|
"*.svg",
|
||||||
|
"*.webp",
|
||||||
|
"*.bmp",
|
||||||
|
"*.tiff",
|
||||||
|
"*.woff",
|
||||||
|
"*.woff2",
|
||||||
|
"*.ttf",
|
||||||
|
"*.eot",
|
||||||
|
"*.otf",
|
||||||
|
"*.mp3",
|
||||||
|
"*.mp4",
|
||||||
|
"*.wav",
|
||||||
|
"*.ogg",
|
||||||
|
"*.webm",
|
||||||
|
"*.avi",
|
||||||
|
"*.mov",
|
||||||
|
"*.pdf",
|
||||||
|
"*.doc",
|
||||||
|
"*.docx",
|
||||||
|
"*.xls",
|
||||||
|
"*.xlsx",
|
||||||
|
"*.ppt",
|
||||||
|
"*.pptx",
|
||||||
|
"*.zip",
|
||||||
|
"*.tar",
|
||||||
|
"*.tar.gz",
|
||||||
|
"*.tgz",
|
||||||
|
"*.rar",
|
||||||
|
"*.7z",
|
||||||
|
"*.exe",
|
||||||
|
"*.dll",
|
||||||
|
"*.so",
|
||||||
|
"*.dylib",
|
||||||
|
"*.bin",
|
||||||
|
"*.obj",
|
||||||
|
"*.o",
|
||||||
|
"*.a",
|
||||||
|
"*.lib",
|
||||||
|
# Minified files
|
||||||
|
"*.min.js",
|
||||||
|
"*.min.css",
|
||||||
|
# Source maps
|
||||||
|
"*.map",
|
||||||
|
# Database files
|
||||||
|
"*.db",
|
||||||
|
"*.sqlite",
|
||||||
|
"*.sqlite3",
|
||||||
|
# Coverage reports
|
||||||
|
"coverage/*",
|
||||||
|
".coverage",
|
||||||
|
"htmlcov/*",
|
||||||
|
".nyc_output/*",
|
||||||
|
# Test snapshots (can be large)
|
||||||
|
"__snapshots__/*",
|
||||||
|
]
|
||||||
|
|
||||||
# Maximum file size in bytes (e.g., 1MB)
|
|
||||||
MAX_FILE_SIZE = 1 * 1024 * 1024
|
@dataclass
|
||||||
|
class RepositoryDigest:
|
||||||
|
"""Represents a digested repository from gitingest."""
|
||||||
|
|
||||||
|
repo_full_name: str
|
||||||
|
summary: str
|
||||||
|
tree: str
|
||||||
|
content: str
|
||||||
|
branch: str | None = None
|
||||||
|
|
||||||
|
@property
|
||||||
|
def full_digest(self) -> str:
|
||||||
|
"""Returns the complete digest with tree and content."""
|
||||||
|
return f"# Repository: {self.repo_full_name}\n\n## File Structure\n\n{self.tree}\n\n## File Contents\n\n{self.content}"
|
||||||
|
|
||||||
|
@property
|
||||||
|
def estimated_tokens(self) -> int:
|
||||||
|
"""Rough estimate of tokens (1 token ≈ 4 characters)."""
|
||||||
|
return len(self.full_digest) // 4
|
||||||
|
|
||||||
|
|
||||||
class GitHubConnector:
|
class GitHubConnector:
|
||||||
"""Connector for interacting with the GitHub API."""
|
"""
|
||||||
|
Connector for ingesting GitHub repositories using gitingest.
|
||||||
|
|
||||||
# Directories to skip during file traversal
|
This connector efficiently processes entire repositories into a single
|
||||||
SKIPPED_DIRS = {
|
digest, reducing the number of API calls and LLM invocations compared
|
||||||
# Version control
|
to file-by-file processing.
|
||||||
".git",
|
"""
|
||||||
# Dependencies
|
|
||||||
"node_modules",
|
|
||||||
"vendor",
|
|
||||||
# Build artifacts / Caches
|
|
||||||
"build",
|
|
||||||
"dist",
|
|
||||||
"target",
|
|
||||||
"__pycache__",
|
|
||||||
# Virtual environments
|
|
||||||
"venv",
|
|
||||||
".venv",
|
|
||||||
"env",
|
|
||||||
# IDE/Editor config
|
|
||||||
".vscode",
|
|
||||||
".idea",
|
|
||||||
".project",
|
|
||||||
".settings",
|
|
||||||
# Temporary / Logs
|
|
||||||
"tmp",
|
|
||||||
"logs",
|
|
||||||
# Add other project-specific irrelevant directories if needed
|
|
||||||
}
|
|
||||||
|
|
||||||
def __init__(self, token: str):
|
def __init__(self, token: str | None = None):
|
||||||
"""
|
"""
|
||||||
Initializes the GitHub connector.
|
Initializes the GitHub connector.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
token: GitHub Personal Access Token (PAT).
|
token: Optional GitHub Personal Access Token (PAT).
|
||||||
|
Only required for private repositories.
|
||||||
|
Public repositories can be ingested without a token.
|
||||||
"""
|
"""
|
||||||
if not token:
|
self.token = token if token and token.strip() else None
|
||||||
raise ValueError("GitHub token cannot be empty.")
|
if self.token:
|
||||||
try:
|
logger.info("GitHub connector initialized with authentication token.")
|
||||||
self.gh = github_login(token=token)
|
else:
|
||||||
# Try a simple authenticated call to check token validity
|
logger.info("GitHub connector initialized without token (public repos only).")
|
||||||
self.gh.me()
|
|
||||||
logger.info("Successfully authenticated with GitHub API.")
|
|
||||||
except (github_exceptions.AuthenticationFailed, ForbiddenError) as e:
|
|
||||||
logger.error(f"GitHub authentication failed: {e}")
|
|
||||||
raise ValueError("Invalid GitHub token or insufficient permissions.") from e
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Failed to initialize GitHub client: {e}")
|
|
||||||
raise e
|
|
||||||
|
|
||||||
def get_user_repositories(self) -> list[dict[str, Any]]:
|
async def ingest_repository(
|
||||||
"""Fetches repositories accessible by the authenticated user."""
|
self,
|
||||||
repos_data = []
|
repo_full_name: str,
|
||||||
try:
|
branch: str | None = None,
|
||||||
# type='owner' fetches repos owned by the user
|
include_patterns: list[str] | None = None,
|
||||||
# type='member' fetches repos the user is a collaborator on (including orgs)
|
exclude_patterns: list[str] | None = None,
|
||||||
# type='all' fetches both
|
max_file_size: int = MAX_FILE_SIZE,
|
||||||
for repo in self.gh.repositories(type="all", sort="updated"):
|
) -> RepositoryDigest | None:
|
||||||
repos_data.append(
|
|
||||||
{
|
|
||||||
"id": repo.id,
|
|
||||||
"name": repo.name,
|
|
||||||
"full_name": repo.full_name,
|
|
||||||
"private": repo.private,
|
|
||||||
"url": repo.html_url,
|
|
||||||
"description": repo.description or "",
|
|
||||||
"last_updated": repo.updated_at if repo.updated_at else None,
|
|
||||||
}
|
|
||||||
)
|
|
||||||
logger.info(f"Fetched {len(repos_data)} repositories.")
|
|
||||||
return repos_data
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Failed to fetch GitHub repositories: {e}")
|
|
||||||
return [] # Return empty list on error
|
|
||||||
|
|
||||||
def get_repository_files(
|
|
||||||
self, repo_full_name: str, path: str = ""
|
|
||||||
) -> list[dict[str, Any]]:
|
|
||||||
"""
|
"""
|
||||||
Recursively fetches details of relevant files (code, docs) within a repository path.
|
Ingest an entire repository and return a digest.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
repo_full_name: The full name of the repository (e.g., 'owner/repo').
|
repo_full_name: The full name of the repository (e.g., 'owner/repo').
|
||||||
path: The starting path within the repository (default is root).
|
branch: Optional specific branch or tag to ingest.
|
||||||
|
include_patterns: Optional list of glob patterns for files to include.
|
||||||
|
If None, includes all files (recommended).
|
||||||
|
exclude_patterns: Optional list of glob patterns for files to exclude.
|
||||||
|
If None, uses DEFAULT_EXCLUDE_PATTERNS.
|
||||||
|
max_file_size: Maximum file size in bytes to include (default 5MB).
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
A list of dictionaries, each containing file details (path, sha, url, size).
|
RepositoryDigest containing the summary, tree structure, and content,
|
||||||
Returns an empty list if the repository or path is not found or on error.
|
or None if ingestion fails.
|
||||||
"""
|
"""
|
||||||
files_list = []
|
repo_url = f"https://github.com/{repo_full_name}"
|
||||||
|
|
||||||
|
# Use only exclude_patterns by default (recommended for comprehensive analysis)
|
||||||
|
# This ensures we don't miss any relevant file types
|
||||||
|
exclude_pats = exclude_patterns if exclude_patterns is not None else DEFAULT_EXCLUDE_PATTERNS
|
||||||
|
|
||||||
|
logger.info(f"Starting gitingest for repository: {repo_full_name}")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
owner, repo_name = repo_full_name.split("/")
|
# Build kwargs dynamically
|
||||||
repo = self.gh.repository(owner, repo_name)
|
ingest_kwargs = {
|
||||||
if not repo:
|
"max_file_size": max_file_size,
|
||||||
logger.warning(f"Repository '{repo_full_name}' not found.")
|
"exclude_patterns": exclude_pats,
|
||||||
return []
|
"include_gitignored": False,
|
||||||
contents = repo.directory_contents(
|
"include_submodules": False,
|
||||||
directory_path=path
|
}
|
||||||
) # Use directory_contents for clarity
|
|
||||||
|
|
||||||
# contents returns a list of tuples (name, content_obj)
|
# Only add token if provided (required only for private repos)
|
||||||
for _item_name, content_item in contents:
|
if self.token:
|
||||||
if not isinstance(content_item, Contents):
|
ingest_kwargs["token"] = self.token
|
||||||
continue
|
|
||||||
|
|
||||||
if content_item.type == "dir":
|
# Only add branch if specified
|
||||||
# Check if the directory name is in the skipped list
|
if branch:
|
||||||
if content_item.name in self.SKIPPED_DIRS:
|
ingest_kwargs["branch"] = branch
|
||||||
logger.debug(f"Skipping directory: {content_item.path}")
|
|
||||||
continue # Skip recursion for this directory
|
|
||||||
|
|
||||||
# Recursively fetch contents of subdirectory
|
# Only add include_patterns if explicitly provided
|
||||||
files_list.extend(
|
if include_patterns is not None:
|
||||||
self.get_repository_files(
|
ingest_kwargs["include_patterns"] = include_patterns
|
||||||
repo_full_name, path=content_item.path
|
|
||||||
)
|
|
||||||
)
|
|
||||||
elif content_item.type == "file":
|
|
||||||
# Check if the file extension is relevant and size is within limits
|
|
||||||
file_extension = (
|
|
||||||
"." + content_item.name.split(".")[-1].lower()
|
|
||||||
if "." in content_item.name
|
|
||||||
else ""
|
|
||||||
)
|
|
||||||
is_code = file_extension in CODE_EXTENSIONS
|
|
||||||
is_doc = file_extension in DOC_EXTENSIONS
|
|
||||||
|
|
||||||
if (is_code or is_doc) and content_item.size <= MAX_FILE_SIZE:
|
summary, tree, content = await ingest_async(repo_url, **ingest_kwargs)
|
||||||
files_list.append(
|
|
||||||
{
|
|
||||||
"path": content_item.path,
|
|
||||||
"sha": content_item.sha,
|
|
||||||
"url": content_item.html_url,
|
|
||||||
"size": content_item.size,
|
|
||||||
"type": "code" if is_code else "doc",
|
|
||||||
}
|
|
||||||
)
|
|
||||||
elif content_item.size > MAX_FILE_SIZE:
|
|
||||||
logger.debug(
|
|
||||||
f"Skipping large file: {content_item.path} ({content_item.size} bytes)"
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
logger.debug(
|
|
||||||
f"Skipping irrelevant file type: {content_item.path}"
|
|
||||||
)
|
|
||||||
|
|
||||||
except (NotFoundError, ForbiddenError) as e:
|
if not content or not content.strip():
|
||||||
logger.warning(f"Cannot access path '{path}' in '{repo_full_name}': {e}")
|
logger.warning(
|
||||||
except Exception as e:
|
f"No content retrieved from repository: {repo_full_name}"
|
||||||
logger.error(
|
)
|
||||||
f"Failed to get files for {repo_full_name} at path '{path}': {e}"
|
return None
|
||||||
|
|
||||||
|
digest = RepositoryDigest(
|
||||||
|
repo_full_name=repo_full_name,
|
||||||
|
summary=summary,
|
||||||
|
tree=tree,
|
||||||
|
content=content,
|
||||||
|
branch=branch,
|
||||||
)
|
)
|
||||||
# Return what we have collected so far in case of partial failure
|
|
||||||
|
|
||||||
return files_list
|
logger.info(
|
||||||
|
f"Successfully ingested {repo_full_name}: "
|
||||||
|
f"~{digest.estimated_tokens} estimated tokens"
|
||||||
|
)
|
||||||
|
return digest
|
||||||
|
|
||||||
def get_file_content(self, repo_full_name: str, file_path: str) -> str | None:
|
except Exception as e:
|
||||||
|
logger.error(f"Failed to ingest repository {repo_full_name}: {e}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
async def ingest_repositories(
|
||||||
|
self,
|
||||||
|
repo_full_names: list[str],
|
||||||
|
branch: str | None = None,
|
||||||
|
include_patterns: list[str] | None = None,
|
||||||
|
exclude_patterns: list[str] | None = None,
|
||||||
|
max_file_size: int = MAX_FILE_SIZE,
|
||||||
|
) -> list[RepositoryDigest]:
|
||||||
"""
|
"""
|
||||||
Fetches the decoded content of a specific file.
|
Ingest multiple repositories and return their digests.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
repo_full_name: The full name of the repository (e.g., 'owner/repo').
|
repo_full_names: List of repository full names (e.g., ['owner/repo1', 'owner/repo2']).
|
||||||
file_path: The path to the file within the repository.
|
branch: Optional specific branch or tag to ingest (applied to all repos).
|
||||||
|
include_patterns: Optional list of glob patterns for files to include.
|
||||||
|
exclude_patterns: Optional list of glob patterns for files to exclude.
|
||||||
|
max_file_size: Maximum file size in bytes to include.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
The decoded file content as a string, or None if fetching fails or file is too large.
|
List of RepositoryDigest objects for successfully ingested repositories.
|
||||||
"""
|
"""
|
||||||
try:
|
digests = []
|
||||||
owner, repo_name = repo_full_name.split("/")
|
|
||||||
repo = self.gh.repository(owner, repo_name)
|
|
||||||
if not repo:
|
|
||||||
logger.warning(
|
|
||||||
f"Repository '{repo_full_name}' not found when fetching file '{file_path}'."
|
|
||||||
)
|
|
||||||
return None
|
|
||||||
|
|
||||||
content_item = repo.file_contents(
|
for repo_full_name in repo_full_names:
|
||||||
path=file_path
|
if not repo_full_name or not isinstance(repo_full_name, str):
|
||||||
) # Use file_contents for clarity
|
logger.warning(f"Skipping invalid repository entry: {repo_full_name}")
|
||||||
|
continue
|
||||||
|
|
||||||
if (
|
digest = await self.ingest_repository(
|
||||||
not content_item
|
repo_full_name=repo_full_name,
|
||||||
or not isinstance(content_item, Contents)
|
branch=branch,
|
||||||
or content_item.type != "file"
|
include_patterns=include_patterns,
|
||||||
):
|
exclude_patterns=exclude_patterns,
|
||||||
logger.warning(
|
max_file_size=max_file_size,
|
||||||
f"File '{file_path}' not found or is not a file in '{repo_full_name}'."
|
|
||||||
)
|
|
||||||
return None
|
|
||||||
|
|
||||||
if content_item.size > MAX_FILE_SIZE:
|
|
||||||
logger.warning(
|
|
||||||
f"File '{file_path}' in '{repo_full_name}' exceeds max size ({content_item.size} > {MAX_FILE_SIZE}). Skipping content fetch."
|
|
||||||
)
|
|
||||||
return None
|
|
||||||
|
|
||||||
# Content is base64 encoded
|
|
||||||
if content_item.content:
|
|
||||||
try:
|
|
||||||
decoded_content = base64.b64decode(content_item.content).decode(
|
|
||||||
"utf-8"
|
|
||||||
)
|
|
||||||
return decoded_content
|
|
||||||
except UnicodeDecodeError:
|
|
||||||
logger.warning(
|
|
||||||
f"Could not decode file '{file_path}' in '{repo_full_name}' as UTF-8. Trying with 'latin-1'."
|
|
||||||
)
|
|
||||||
try:
|
|
||||||
# Try a fallback encoding
|
|
||||||
decoded_content = base64.b64decode(content_item.content).decode(
|
|
||||||
"latin-1"
|
|
||||||
)
|
|
||||||
return decoded_content
|
|
||||||
except Exception as decode_err:
|
|
||||||
logger.error(
|
|
||||||
f"Failed to decode file '{file_path}' with fallback encoding: {decode_err}"
|
|
||||||
)
|
|
||||||
return None # Give up if fallback fails
|
|
||||||
else:
|
|
||||||
logger.warning(
|
|
||||||
f"No content returned for file '{file_path}' in '{repo_full_name}'. It might be empty."
|
|
||||||
)
|
|
||||||
return "" # Return empty string for empty files
|
|
||||||
|
|
||||||
except (NotFoundError, ForbiddenError) as e:
|
|
||||||
logger.warning(
|
|
||||||
f"Cannot access file '{file_path}' in '{repo_full_name}': {e}"
|
|
||||||
)
|
)
|
||||||
return None
|
|
||||||
except Exception as e:
|
if digest:
|
||||||
logger.error(
|
digests.append(digest)
|
||||||
f"Failed to get content for file '{file_path}' in '{repo_full_name}': {e}"
|
|
||||||
)
|
logger.info(
|
||||||
return None
|
f"Ingested {len(digests)} out of {len(repo_full_names)} repositories."
|
||||||
|
)
|
||||||
|
return digests
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,8 @@
|
||||||
"""
|
"""
|
||||||
GitHub connector indexer.
|
GitHub connector indexer using gitingest.
|
||||||
|
|
||||||
|
This indexer processes entire repository digests in one pass, dramatically
|
||||||
|
reducing LLM API calls compared to the previous file-by-file approach.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from datetime import UTC, datetime
|
from datetime import UTC, datetime
|
||||||
|
|
@ -8,7 +11,7 @@ from sqlalchemy.exc import SQLAlchemyError
|
||||||
from sqlalchemy.ext.asyncio import AsyncSession
|
from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
|
|
||||||
from app.config import config
|
from app.config import config
|
||||||
from app.connectors.github_connector import GitHubConnector
|
from app.connectors.github_connector import GitHubConnector, RepositoryDigest
|
||||||
from app.db import Document, DocumentType, SearchSourceConnectorType
|
from app.db import Document, DocumentType, SearchSourceConnectorType
|
||||||
from app.services.llm_service import get_user_long_context_llm
|
from app.services.llm_service import get_user_long_context_llm
|
||||||
from app.services.task_logging_service import TaskLoggingService
|
from app.services.task_logging_service import TaskLoggingService
|
||||||
|
|
@ -26,43 +29,55 @@ from .base import (
|
||||||
logger,
|
logger,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Maximum characters for a single digest before splitting
|
||||||
|
# Most LLMs can handle 128k+ tokens now, but we'll be conservative
|
||||||
|
MAX_DIGEST_CHARS = 500_000 # ~125k tokens
|
||||||
|
|
||||||
|
|
||||||
async def index_github_repos(
|
async def index_github_repos(
|
||||||
session: AsyncSession,
|
session: AsyncSession,
|
||||||
connector_id: int,
|
connector_id: int,
|
||||||
search_space_id: int,
|
search_space_id: int,
|
||||||
user_id: str,
|
user_id: str,
|
||||||
start_date: str | None = None,
|
start_date: str | None = None, # Ignored - GitHub indexes full repo snapshots
|
||||||
end_date: str | None = None,
|
end_date: str | None = None, # Ignored - GitHub indexes full repo snapshots
|
||||||
update_last_indexed: bool = True,
|
update_last_indexed: bool = True,
|
||||||
) -> tuple[int, str | None]:
|
) -> tuple[int, str | None]:
|
||||||
"""
|
"""
|
||||||
Index code and documentation files from accessible GitHub repositories.
|
Index GitHub repositories using gitingest for efficient processing.
|
||||||
|
|
||||||
|
This function ingests entire repositories as digests, generates a single
|
||||||
|
summary per repository, and chunks the content for vector storage.
|
||||||
|
|
||||||
|
Note: The start_date and end_date parameters are accepted for API compatibility
|
||||||
|
but are IGNORED. GitHub repositories are indexed as complete snapshots since
|
||||||
|
gitingest captures the current state of the entire codebase.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
session: Database session
|
session: Database session
|
||||||
connector_id: ID of the GitHub connector
|
connector_id: ID of the GitHub connector
|
||||||
search_space_id: ID of the search space to store documents in
|
search_space_id: ID of the search space to store documents in
|
||||||
user_id: ID of the user
|
user_id: ID of the user
|
||||||
start_date: Start date for filtering (YYYY-MM-DD format) - Note: GitHub indexing processes all files regardless of dates
|
start_date: Ignored - kept for API compatibility
|
||||||
end_date: End date for filtering (YYYY-MM-DD format) - Note: GitHub indexing processes all files regardless of dates
|
end_date: Ignored - kept for API compatibility
|
||||||
update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
|
update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Tuple containing (number of documents indexed, error message or None)
|
Tuple containing (number of documents indexed, error message or None)
|
||||||
"""
|
"""
|
||||||
|
# Note: start_date and end_date are intentionally unused
|
||||||
|
_ = start_date, end_date
|
||||||
task_logger = TaskLoggingService(session, search_space_id)
|
task_logger = TaskLoggingService(session, search_space_id)
|
||||||
|
|
||||||
# Log task start
|
# Log task start
|
||||||
log_entry = await task_logger.log_task_start(
|
log_entry = await task_logger.log_task_start(
|
||||||
task_name="github_repos_indexing",
|
task_name="github_repos_indexing",
|
||||||
source="connector_indexing_task",
|
source="connector_indexing_task",
|
||||||
message=f"Starting GitHub repositories indexing for connector {connector_id}",
|
message=f"Starting GitHub repositories indexing for connector {connector_id} (using gitingest)",
|
||||||
metadata={
|
metadata={
|
||||||
"connector_id": connector_id,
|
"connector_id": connector_id,
|
||||||
"user_id": str(user_id),
|
"user_id": str(user_id),
|
||||||
"start_date": start_date,
|
"method": "gitingest",
|
||||||
"end_date": end_date,
|
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -93,19 +108,11 @@ async def index_github_repos(
|
||||||
f"Connector with ID {connector_id} not found or is not a GitHub connector",
|
f"Connector with ID {connector_id} not found or is not a GitHub connector",
|
||||||
)
|
)
|
||||||
|
|
||||||
# 2. Get the GitHub PAT and selected repositories from the connector config
|
# 2. Get the GitHub PAT (optional) and selected repositories from the connector config
|
||||||
github_pat = connector.config.get("GITHUB_PAT")
|
# PAT is only required for private repositories - public repos work without it
|
||||||
|
github_pat = connector.config.get("GITHUB_PAT") # Can be None or empty
|
||||||
repo_full_names_to_index = connector.config.get("repo_full_names")
|
repo_full_names_to_index = connector.config.get("repo_full_names")
|
||||||
|
|
||||||
if not github_pat:
|
|
||||||
await task_logger.log_task_failure(
|
|
||||||
log_entry,
|
|
||||||
f"GitHub Personal Access Token (PAT) not found in connector config for connector {connector_id}",
|
|
||||||
"Missing GitHub PAT",
|
|
||||||
{"error_type": "MissingToken"},
|
|
||||||
)
|
|
||||||
return 0, "GitHub Personal Access Token (PAT) not found in connector config"
|
|
||||||
|
|
||||||
if not repo_full_names_to_index or not isinstance(
|
if not repo_full_names_to_index or not isinstance(
|
||||||
repo_full_names_to_index, list
|
repo_full_names_to_index, list
|
||||||
):
|
):
|
||||||
|
|
@ -117,10 +124,16 @@ async def index_github_repos(
|
||||||
)
|
)
|
||||||
return 0, "'repo_full_names' not found or is not a list in connector config"
|
return 0, "'repo_full_names' not found or is not a list in connector config"
|
||||||
|
|
||||||
# 3. Initialize GitHub connector client
|
# Log whether we're using authentication
|
||||||
|
if github_pat:
|
||||||
|
logger.info("Using GitHub PAT for authentication (private repos supported)")
|
||||||
|
else:
|
||||||
|
logger.info("No GitHub PAT provided - only public repositories can be indexed")
|
||||||
|
|
||||||
|
# 3. Initialize GitHub connector with gitingest backend
|
||||||
await task_logger.log_task_progress(
|
await task_logger.log_task_progress(
|
||||||
log_entry,
|
log_entry,
|
||||||
f"Initializing GitHub client for connector {connector_id}",
|
f"Initializing gitingest-based GitHub client for connector {connector_id}",
|
||||||
{
|
{
|
||||||
"stage": "client_initialization",
|
"stage": "client_initialization",
|
||||||
"repo_count": len(repo_full_names_to_index),
|
"repo_count": len(repo_full_names_to_index),
|
||||||
|
|
@ -138,258 +151,52 @@ async def index_github_repos(
|
||||||
)
|
)
|
||||||
return 0, f"Failed to initialize GitHub client: {e!s}"
|
return 0, f"Failed to initialize GitHub client: {e!s}"
|
||||||
|
|
||||||
# 4. Validate selected repositories
|
# 4. Process each repository with gitingest
|
||||||
await task_logger.log_task_progress(
|
await task_logger.log_task_progress(
|
||||||
log_entry,
|
log_entry,
|
||||||
f"Starting indexing for {len(repo_full_names_to_index)} selected repositories",
|
f"Starting gitingest processing for {len(repo_full_names_to_index)} repositories",
|
||||||
{
|
{
|
||||||
"stage": "repo_processing",
|
"stage": "repo_processing",
|
||||||
"repo_count": len(repo_full_names_to_index),
|
"repo_count": len(repo_full_names_to_index),
|
||||||
"start_date": start_date,
|
|
||||||
"end_date": end_date,
|
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Starting indexing for {len(repo_full_names_to_index)} selected repositories."
|
f"Starting gitingest indexing for {len(repo_full_names_to_index)} repositories."
|
||||||
)
|
)
|
||||||
if start_date and end_date:
|
|
||||||
logger.info(
|
|
||||||
f"Date range requested: {start_date} to {end_date} (Note: GitHub indexing processes all files regardless of dates)"
|
|
||||||
)
|
|
||||||
|
|
||||||
# 6. Iterate through selected repositories and index files
|
|
||||||
for repo_full_name in repo_full_names_to_index:
|
for repo_full_name in repo_full_names_to_index:
|
||||||
if not repo_full_name or not isinstance(repo_full_name, str):
|
if not repo_full_name or not isinstance(repo_full_name, str):
|
||||||
logger.warning(f"Skipping invalid repository entry: {repo_full_name}")
|
logger.warning(f"Skipping invalid repository entry: {repo_full_name}")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
logger.info(f"Processing repository: {repo_full_name}")
|
logger.info(f"Ingesting repository: {repo_full_name}")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
files_to_index = github_client.get_repository_files(repo_full_name)
|
# Ingest the entire repository
|
||||||
if not files_to_index:
|
digest = await github_client.ingest_repository(repo_full_name)
|
||||||
logger.info(
|
|
||||||
f"No indexable files found in repository: {repo_full_name}"
|
if not digest:
|
||||||
|
logger.warning(
|
||||||
|
f"No digest returned for repository: {repo_full_name}"
|
||||||
)
|
)
|
||||||
|
errors.append(f"No digest for {repo_full_name}")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
logger.info(
|
# Process the digest and create documents
|
||||||
f"Found {len(files_to_index)} files to process in {repo_full_name}"
|
docs_created = await _process_repository_digest(
|
||||||
|
session=session,
|
||||||
|
digest=digest,
|
||||||
|
search_space_id=search_space_id,
|
||||||
|
user_id=user_id,
|
||||||
|
task_logger=task_logger,
|
||||||
|
log_entry=log_entry,
|
||||||
)
|
)
|
||||||
|
|
||||||
for file_info in files_to_index:
|
documents_processed += docs_created
|
||||||
file_path = file_info.get("path")
|
logger.info(
|
||||||
file_url = file_info.get("url")
|
f"Created {docs_created} documents from repository: {repo_full_name}"
|
||||||
file_sha = file_info.get("sha")
|
)
|
||||||
file_type = file_info.get("type") # 'code' or 'doc'
|
|
||||||
full_path_key = f"{repo_full_name}/{file_path}"
|
|
||||||
|
|
||||||
if not file_path or not file_url or not file_sha:
|
|
||||||
logger.warning(
|
|
||||||
f"Skipping file with missing info in {repo_full_name}: {file_info}"
|
|
||||||
)
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Get file content
|
|
||||||
file_content = github_client.get_file_content(
|
|
||||||
repo_full_name, file_path
|
|
||||||
)
|
|
||||||
|
|
||||||
if file_content is None:
|
|
||||||
logger.warning(
|
|
||||||
f"Could not retrieve content for {full_path_key}. Skipping."
|
|
||||||
)
|
|
||||||
continue # Skip if content fetch failed
|
|
||||||
|
|
||||||
# Generate unique identifier hash for this GitHub file
|
|
||||||
unique_identifier_hash = generate_unique_identifier_hash(
|
|
||||||
DocumentType.GITHUB_CONNECTOR, file_sha, search_space_id
|
|
||||||
)
|
|
||||||
|
|
||||||
# Generate content hash
|
|
||||||
content_hash = generate_content_hash(file_content, search_space_id)
|
|
||||||
|
|
||||||
# Check if document with this unique identifier already exists
|
|
||||||
existing_document = await check_document_by_unique_identifier(
|
|
||||||
session, unique_identifier_hash
|
|
||||||
)
|
|
||||||
|
|
||||||
if existing_document:
|
|
||||||
# Document exists - check if content has changed
|
|
||||||
if existing_document.content_hash == content_hash:
|
|
||||||
logger.info(
|
|
||||||
f"Document for GitHub file {full_path_key} unchanged. Skipping."
|
|
||||||
)
|
|
||||||
continue
|
|
||||||
else:
|
|
||||||
# Content has changed - update the existing document
|
|
||||||
logger.info(
|
|
||||||
f"Content changed for GitHub file {full_path_key}. Updating document."
|
|
||||||
)
|
|
||||||
|
|
||||||
# Generate summary with metadata
|
|
||||||
user_llm = await get_user_long_context_llm(
|
|
||||||
session, user_id, search_space_id
|
|
||||||
)
|
|
||||||
if user_llm:
|
|
||||||
file_extension = (
|
|
||||||
file_path.split(".")[-1]
|
|
||||||
if "." in file_path
|
|
||||||
else None
|
|
||||||
)
|
|
||||||
document_metadata = {
|
|
||||||
"file_path": full_path_key,
|
|
||||||
"repository": repo_full_name,
|
|
||||||
"file_type": file_extension or "unknown",
|
|
||||||
"document_type": "GitHub Repository File",
|
|
||||||
"connector_type": "GitHub",
|
|
||||||
}
|
|
||||||
(
|
|
||||||
summary_content,
|
|
||||||
summary_embedding,
|
|
||||||
) = await generate_document_summary(
|
|
||||||
file_content, user_llm, document_metadata
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
summary_content = f"GitHub file: {full_path_key}\n\n{file_content[:1000]}..."
|
|
||||||
summary_embedding = (
|
|
||||||
config.embedding_model_instance.embed(
|
|
||||||
summary_content
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
# Chunk the content
|
|
||||||
try:
|
|
||||||
if hasattr(config, "code_chunker_instance"):
|
|
||||||
chunks_data = [
|
|
||||||
await create_document_chunks(file_content)
|
|
||||||
][0]
|
|
||||||
else:
|
|
||||||
chunks_data = await create_document_chunks(
|
|
||||||
file_content
|
|
||||||
)
|
|
||||||
except Exception as chunk_err:
|
|
||||||
logger.error(
|
|
||||||
f"Failed to chunk file {full_path_key}: {chunk_err}"
|
|
||||||
)
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Update existing document
|
|
||||||
existing_document.title = f"GitHub - {full_path_key}"
|
|
||||||
existing_document.content = summary_content
|
|
||||||
existing_document.content_hash = content_hash
|
|
||||||
existing_document.embedding = summary_embedding
|
|
||||||
existing_document.document_metadata = {
|
|
||||||
"file_path": file_path,
|
|
||||||
"file_sha": file_sha,
|
|
||||||
"file_url": file_url,
|
|
||||||
"repository": repo_full_name,
|
|
||||||
"indexed_at": datetime.now(UTC).strftime(
|
|
||||||
"%Y-%m-%d %H:%M:%S"
|
|
||||||
),
|
|
||||||
}
|
|
||||||
existing_document.chunks = chunks_data
|
|
||||||
existing_document.updated_at = get_current_timestamp()
|
|
||||||
|
|
||||||
logger.info(
|
|
||||||
f"Successfully updated GitHub file {full_path_key}"
|
|
||||||
)
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Document doesn't exist - create new one
|
|
||||||
# Generate summary with metadata
|
|
||||||
user_llm = await get_user_long_context_llm(
|
|
||||||
session, user_id, search_space_id
|
|
||||||
)
|
|
||||||
if user_llm:
|
|
||||||
# Extract file extension from file path
|
|
||||||
file_extension = (
|
|
||||||
file_path.split(".")[-1] if "." in file_path else None
|
|
||||||
)
|
|
||||||
document_metadata = {
|
|
||||||
"file_path": full_path_key,
|
|
||||||
"repository": repo_full_name,
|
|
||||||
"file_type": file_extension or "unknown",
|
|
||||||
"document_type": "GitHub Repository File",
|
|
||||||
"connector_type": "GitHub",
|
|
||||||
}
|
|
||||||
(
|
|
||||||
summary_content,
|
|
||||||
summary_embedding,
|
|
||||||
) = await generate_document_summary(
|
|
||||||
file_content, user_llm, document_metadata
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
# Fallback to simple summary if no LLM configured
|
|
||||||
summary_content = (
|
|
||||||
f"GitHub file: {full_path_key}\n\n{file_content[:1000]}..."
|
|
||||||
)
|
|
||||||
summary_embedding = config.embedding_model_instance.embed(
|
|
||||||
summary_content
|
|
||||||
)
|
|
||||||
|
|
||||||
# Chunk the content
|
|
||||||
try:
|
|
||||||
chunks_data = [await create_document_chunks(file_content)][0]
|
|
||||||
|
|
||||||
# Use code chunker if available, otherwise regular chunker
|
|
||||||
if hasattr(config, "code_chunker_instance"):
|
|
||||||
chunks_data = [
|
|
||||||
{
|
|
||||||
"content": chunk.text,
|
|
||||||
"embedding": config.embedding_model_instance.embed(
|
|
||||||
chunk.text
|
|
||||||
),
|
|
||||||
}
|
|
||||||
for chunk in config.code_chunker_instance.chunk(
|
|
||||||
file_content
|
|
||||||
)
|
|
||||||
]
|
|
||||||
else:
|
|
||||||
chunks_data = await create_document_chunks(file_content)
|
|
||||||
|
|
||||||
except Exception as chunk_err:
|
|
||||||
logger.error(
|
|
||||||
f"Failed to chunk file {full_path_key}: {chunk_err}"
|
|
||||||
)
|
|
||||||
errors.append(
|
|
||||||
f"Chunking failed for {full_path_key}: {chunk_err}"
|
|
||||||
)
|
|
||||||
continue # Skip this file if chunking fails
|
|
||||||
|
|
||||||
doc_metadata = {
|
|
||||||
"repository_full_name": repo_full_name,
|
|
||||||
"file_path": file_path,
|
|
||||||
"full_path": full_path_key, # For easier lookup
|
|
||||||
"url": file_url,
|
|
||||||
"sha": file_sha,
|
|
||||||
"type": file_type,
|
|
||||||
"indexed_at": datetime.now(UTC).isoformat(),
|
|
||||||
}
|
|
||||||
|
|
||||||
# Create new document
|
|
||||||
logger.info(f"Creating new document for file: {full_path_key}")
|
|
||||||
document = Document(
|
|
||||||
title=f"GitHub - {file_path}",
|
|
||||||
document_type=DocumentType.GITHUB_CONNECTOR,
|
|
||||||
document_metadata=doc_metadata,
|
|
||||||
content=summary_content, # Store summary
|
|
||||||
content_hash=content_hash,
|
|
||||||
unique_identifier_hash=unique_identifier_hash,
|
|
||||||
embedding=summary_embedding,
|
|
||||||
search_space_id=search_space_id,
|
|
||||||
chunks=chunks_data, # Associate chunks directly
|
|
||||||
updated_at=get_current_timestamp(),
|
|
||||||
)
|
|
||||||
session.add(document)
|
|
||||||
documents_processed += 1
|
|
||||||
|
|
||||||
# Batch commit every 10 documents
|
|
||||||
if documents_processed % 10 == 0:
|
|
||||||
logger.info(
|
|
||||||
f"Committing batch: {documents_processed} GitHub files processed so far"
|
|
||||||
)
|
|
||||||
await session.commit()
|
|
||||||
|
|
||||||
except Exception as repo_err:
|
except Exception as repo_err:
|
||||||
logger.error(
|
logger.error(
|
||||||
|
|
@ -397,11 +204,11 @@ async def index_github_repos(
|
||||||
)
|
)
|
||||||
errors.append(f"Failed processing {repo_full_name}: {repo_err}")
|
errors.append(f"Failed processing {repo_full_name}: {repo_err}")
|
||||||
|
|
||||||
# Final commit for any remaining documents not yet committed in batches
|
# Final commit
|
||||||
logger.info(f"Final commit: Total {documents_processed} GitHub files processed")
|
|
||||||
await session.commit()
|
await session.commit()
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Finished GitHub indexing for connector {connector_id}. Processed {documents_processed} files."
|
f"Finished GitHub indexing for connector {connector_id}. "
|
||||||
|
f"Created {documents_processed} documents."
|
||||||
)
|
)
|
||||||
|
|
||||||
# Log success
|
# Log success
|
||||||
|
|
@ -412,6 +219,7 @@ async def index_github_repos(
|
||||||
"documents_processed": documents_processed,
|
"documents_processed": documents_processed,
|
||||||
"errors_count": len(errors),
|
"errors_count": len(errors),
|
||||||
"repo_count": len(repo_full_names_to_index),
|
"repo_count": len(repo_full_names_to_index),
|
||||||
|
"method": "gitingest",
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -428,6 +236,7 @@ async def index_github_repos(
|
||||||
)
|
)
|
||||||
errors.append(f"Database error: {db_err}")
|
errors.append(f"Database error: {db_err}")
|
||||||
return documents_processed, "; ".join(errors) if errors else str(db_err)
|
return documents_processed, "; ".join(errors) if errors else str(db_err)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
await session.rollback()
|
await session.rollback()
|
||||||
await task_logger.log_task_failure(
|
await task_logger.log_task_failure(
|
||||||
|
|
@ -445,3 +254,173 @@ async def index_github_repos(
|
||||||
|
|
||||||
error_message = "; ".join(errors) if errors else None
|
error_message = "; ".join(errors) if errors else None
|
||||||
return documents_processed, error_message
|
return documents_processed, error_message
|
||||||
|
|
||||||
|
|
||||||
|
async def _process_repository_digest(
    session: AsyncSession,
    digest: RepositoryDigest,
    search_space_id: int,
    user_id: str,
    task_logger: TaskLoggingService,
    log_entry,
) -> int:
    """
    Index one repository digest as a single document with searchable chunks.

    The repository is stored as one ``Document`` whose ``content`` is a summary
    (LLM-generated when a long-context LLM is configured for the user, otherwise
    a plain-text fallback) and whose ``chunks`` are built from the digest's file
    contents.

    The document is keyed by repository name, so re-indexing the same repository
    replaces the previous document instead of adding a second one. If the stored
    content hash matches the new digest, nothing is written.

    Args:
        session: Database session; the new document is added but not committed.
        digest: The repository digest from gitingest.
        search_space_id: ID of the search space.
        user_id: ID of the user (used to look up the user's LLM).
        task_logger: Task logging service (accepted for call-site
            compatibility; not used in this function body).
        log_entry: Current log entry (likewise not used in this function body).

    Returns:
        1 if a document was created (or replaced), 0 if the repository was
        unchanged and skipped.
    """
    repo_full_name = digest.repo_full_name

    # The content hash covers the full digest, so any change in the repository
    # snapshot is detected on the next run.
    content_hash = generate_content_hash(digest.full_digest, search_space_id)

    # Use repo name as the unique identifier (one document per repo).
    unique_identifier_hash = generate_unique_identifier_hash(
        DocumentType.GITHUB_CONNECTOR, repo_full_name, search_space_id
    )

    existing_document = await check_document_by_unique_identifier(
        session, unique_identifier_hash
    )
    if existing_document:
        if existing_document.content_hash == content_hash:
            logger.info(f"Repository {repo_full_name} unchanged. Skipping.")
            return 0

        logger.info(
            f"Content changed for repository {repo_full_name}. Updating document."
        )
        # Replace rather than update in place: delete the stale row and flush
        # before the new document with the same unique identifier is added.
        await session.delete(existing_document)
        await session.flush()

    # Generate summary using LLM (ONE call per repository!)
    user_llm = await get_user_long_context_llm(session, user_id, search_space_id)

    document_metadata = {
        "repository": repo_full_name,
        "document_type": "GitHub Repository",
        "connector_type": "GitHub",
        "ingestion_method": "gitingest",
        # Slicing already leaves short trees untouched; no length check needed.
        "file_tree": digest.tree[:2000],
        "estimated_tokens": digest.estimated_tokens,
    }

    if user_llm:
        summary_content = digest.full_digest
        if len(summary_content) > MAX_DIGEST_CHARS:
            # Keep the tree and as much of the file contents as still fits.
            # The budget is clamped at zero: with a very large tree the raw
            # subtraction goes negative, and a negative slice bound would be
            # interpreted relative to the END of the string, keeping almost
            # all of the content instead of truncating it.
            content_budget = max(0, MAX_DIGEST_CHARS - len(digest.tree) - 200)
            summary_content = (
                f"# Repository: {repo_full_name}\n\n"
                f"## File Structure\n\n{digest.tree}\n\n"
                f"## File Contents (truncated)\n\n{digest.content[:content_budget]}..."
            )

        summary_text, summary_embedding = await generate_document_summary(
            summary_content, user_llm, document_metadata
        )
    else:
        # Fallback to simple summary if no LLM configured
        summary_text = (
            f"# GitHub Repository: {repo_full_name}\n\n"
            f"## Summary\n{digest.summary}\n\n"
            f"## File Structure\n{digest.tree[:3000]}"
        )
        summary_embedding = config.embedding_model_instance.embed(summary_text)

    # Chunk the raw file contents (not the summary) so that search can still
    # land on individual parts of the repository.
    try:
        chunks_data = await create_document_chunks(digest.content)
    except Exception as chunk_err:
        logger.error(f"Failed to chunk repository {repo_full_name}: {chunk_err}")
        # Fall back to a simpler chunking approach
        chunks_data = await _simple_chunk_content(digest.content)

    doc_metadata = {
        "repository_full_name": repo_full_name,
        "url": f"https://github.com/{repo_full_name}",
        "branch": digest.branch,
        "ingestion_method": "gitingest",
        "file_tree": digest.tree,
        "gitingest_summary": digest.summary,
        "estimated_tokens": digest.estimated_tokens,
        "indexed_at": datetime.now(UTC).isoformat(),
    }

    document = Document(
        title=f"GitHub Repository: {repo_full_name}",
        document_type=DocumentType.GITHUB_CONNECTOR,
        document_metadata=doc_metadata,
        content=summary_text,
        content_hash=content_hash,
        unique_identifier_hash=unique_identifier_hash,
        embedding=summary_embedding,
        search_space_id=search_space_id,
        chunks=chunks_data,
        updated_at=get_current_timestamp(),
    )
    session.add(document)

    logger.info(
        f"Created document for repository {repo_full_name} "
        f"with {len(chunks_data)} chunks"
    )

    return 1
|
||||||
|
|
||||||
|
|
||||||
|
async def _simple_chunk_content(content: str, chunk_size: int = 4000) -> list:
    """
    Simple fallback chunking when the regular chunker fails.

    Splits ``content`` into consecutive fixed-size character windows and embeds
    each non-blank window.

    Args:
        content: The content to chunk.
        chunk_size: Size of each chunk in characters; must be positive.

    Returns:
        List of ``Chunk`` objects (content plus embedding), in document order.
        Windows that contain only whitespace are dropped.

    Raises:
        ValueError: If ``chunk_size`` is not a positive integer.
    """
    # NOTE(review): imported at call time rather than module level, presumably
    # to avoid an import cycle with app.db - confirm before hoisting.
    from app.db import Chunk

    # Fail loudly: a zero step makes range() raise an opaque error, and a
    # negative step would silently produce no chunks at all.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    embed = config.embedding_model_instance.embed
    windows = (
        content[start : start + chunk_size]
        for start in range(0, len(content), chunk_size)
    )
    return [
        Chunk(content=window, embedding=embed(window))
        for window in windows
        if window.strip()
    ]
|
||||||
|
|
|
||||||
|
|
@ -60,6 +60,7 @@ dependencies = [
|
||||||
"mcp>=1.25.0",
|
"mcp>=1.25.0",
|
||||||
"starlette>=0.40.0,<0.51.0",
|
"starlette>=0.40.0,<0.51.0",
|
||||||
"sse-starlette>=3.1.1,<3.1.2",
|
"sse-starlette>=3.1.1,<3.1.2",
|
||||||
|
"gitingest>=0.3.1",
|
||||||
]
|
]
|
||||||
|
|
||||||
[dependency-groups]
|
[dependency-groups]
|
||||||
|
|
|
||||||
30
surfsense_backend/uv.lock
generated
30
surfsense_backend/uv.lock
generated
|
|
@ -1945,6 +1945,25 @@ wheels = [
|
||||||
{ url = "https://files.pythonhosted.org/packages/61/ad/2394d4fb542574678b0ba342daf734d4d811768da3c2ee0c84d509dcb26c/github3.py-4.0.1-py3-none-any.whl", hash = "sha256:a89af7de25650612d1da2f0609622bcdeb07ee8a45a1c06b2d16a05e4234e753", size = 151800, upload-time = "2023-04-26T17:56:25.015Z" },
|
{ url = "https://files.pythonhosted.org/packages/61/ad/2394d4fb542574678b0ba342daf734d4d811768da3c2ee0c84d509dcb26c/github3.py-4.0.1-py3-none-any.whl", hash = "sha256:a89af7de25650612d1da2f0609622bcdeb07ee8a45a1c06b2d16a05e4234e753", size = 151800, upload-time = "2023-04-26T17:56:25.015Z" },
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "gitingest"
|
||||||
|
version = "0.3.1"
|
||||||
|
source = { registry = "https://pypi.org/simple" }
|
||||||
|
dependencies = [
|
||||||
|
{ name = "click" },
|
||||||
|
{ name = "httpx" },
|
||||||
|
{ name = "loguru" },
|
||||||
|
{ name = "pathspec" },
|
||||||
|
{ name = "pydantic" },
|
||||||
|
{ name = "python-dotenv" },
|
||||||
|
{ name = "starlette" },
|
||||||
|
{ name = "tiktoken" },
|
||||||
|
]
|
||||||
|
sdist = { url = "https://files.pythonhosted.org/packages/d6/fe/a915f0c32a3d7920206a677f73c185b3eadf4ec151fb05aedd52e64713f7/gitingest-0.3.1.tar.gz", hash = "sha256:4587cab873d4e08bdb16d612bb153c23e0ce59771a1d57a438239c5e39f05ebf", size = 70681, upload-time = "2025-07-31T13:56:19.845Z" }
|
||||||
|
wheels = [
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/00/15/f200ab2e73287e67d1dce6fbacf421552ae9fbafdc5f0cc8dd0d2fe4fc47/gitingest-0.3.1-py3-none-any.whl", hash = "sha256:8143a5e6a7140ede9f680e13d3931ac07c82ac9bd8bab9ad1fba017c8c1e8666", size = 68343, upload-time = "2025-07-31T13:56:17.729Z" },
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "google-api-core"
|
name = "google-api-core"
|
||||||
version = "2.25.1"
|
version = "2.25.1"
|
||||||
|
|
@ -4460,6 +4479,15 @@ wheels = [
|
||||||
{ url = "https://files.pythonhosted.org/packages/39/c2/646d2e93e0af70f4e5359d870a63584dacbc324b54d73e6b3267920ff117/pandas-2.3.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:bb3be958022198531eb7ec2008cfc78c5b1eed51af8600c6c5d9160d89d8d249", size = 13231847, upload-time = "2025-06-05T03:27:51.465Z" },
|
{ url = "https://files.pythonhosted.org/packages/39/c2/646d2e93e0af70f4e5359d870a63584dacbc324b54d73e6b3267920ff117/pandas-2.3.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:bb3be958022198531eb7ec2008cfc78c5b1eed51af8600c6c5d9160d89d8d249", size = 13231847, upload-time = "2025-06-05T03:27:51.465Z" },
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "pathspec"
|
||||||
|
version = "1.0.3"
|
||||||
|
source = { registry = "https://pypi.org/simple" }
|
||||||
|
sdist = { url = "https://files.pythonhosted.org/packages/4c/b2/bb8e495d5262bfec41ab5cb18f522f1012933347fb5d9e62452d446baca2/pathspec-1.0.3.tar.gz", hash = "sha256:bac5cf97ae2c2876e2d25ebb15078eb04d76e4b98921ee31c6f85ade8b59444d", size = 130841, upload-time = "2026-01-09T15:46:46.009Z" }
|
||||||
|
wheels = [
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/32/2b/121e912bd60eebd623f873fd090de0e84f322972ab25a7f9044c056804ed/pathspec-1.0.3-py3-none-any.whl", hash = "sha256:e80767021c1cc524aa3fb14bedda9c34406591343cc42797b386ce7b9354fb6c", size = 55021, upload-time = "2026-01-09T15:46:44.652Z" },
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "pdf2image"
|
name = "pdf2image"
|
||||||
version = "1.17.0"
|
version = "1.17.0"
|
||||||
|
|
@ -6484,6 +6512,7 @@ dependencies = [
|
||||||
{ name = "firecrawl-py" },
|
{ name = "firecrawl-py" },
|
||||||
{ name = "flower" },
|
{ name = "flower" },
|
||||||
{ name = "github3-py" },
|
{ name = "github3-py" },
|
||||||
|
{ name = "gitingest" },
|
||||||
{ name = "google-api-python-client" },
|
{ name = "google-api-python-client" },
|
||||||
{ name = "google-auth-oauthlib" },
|
{ name = "google-auth-oauthlib" },
|
||||||
{ name = "kokoro" },
|
{ name = "kokoro" },
|
||||||
|
|
@ -6549,6 +6578,7 @@ requires-dist = [
|
||||||
{ name = "firecrawl-py", specifier = ">=4.9.0" },
|
{ name = "firecrawl-py", specifier = ">=4.9.0" },
|
||||||
{ name = "flower", specifier = ">=2.0.1" },
|
{ name = "flower", specifier = ">=2.0.1" },
|
||||||
{ name = "github3-py", specifier = "==4.0.1" },
|
{ name = "github3-py", specifier = "==4.0.1" },
|
||||||
|
{ name = "gitingest", specifier = ">=0.3.1" },
|
||||||
{ name = "google-api-python-client", specifier = ">=2.156.0" },
|
{ name = "google-api-python-client", specifier = ">=2.156.0" },
|
||||||
{ name = "google-auth-oauthlib", specifier = ">=1.2.1" },
|
{ name = "google-auth-oauthlib", specifier = ">=1.2.1" },
|
||||||
{ name = "kokoro", specifier = ">=0.9.4" },
|
{ name = "kokoro", specifier = ">=0.9.4" },
|
||||||
|
|
|
||||||
|
|
@ -34,7 +34,6 @@ import {
|
||||||
} from "@/components/ui/select";
|
} from "@/components/ui/select";
|
||||||
import { Switch } from "@/components/ui/switch";
|
import { Switch } from "@/components/ui/switch";
|
||||||
import { EnumConnectorName } from "@/contracts/enums/connector";
|
import { EnumConnectorName } from "@/contracts/enums/connector";
|
||||||
import { DateRangeSelector } from "../../components/date-range-selector";
|
|
||||||
import { getConnectorBenefits } from "../connector-benefits";
|
import { getConnectorBenefits } from "../connector-benefits";
|
||||||
import type { ConnectFormProps } from "../index";
|
import type { ConnectFormProps } from "../index";
|
||||||
|
|
||||||
|
|
@ -44,12 +43,13 @@ const githubConnectorFormSchema = z.object({
|
||||||
}),
|
}),
|
||||||
github_pat: z
|
github_pat: z
|
||||||
.string()
|
.string()
|
||||||
.min(20, {
|
.optional()
|
||||||
message: "GitHub Personal Access Token seems too short.",
|
.refine(
|
||||||
})
|
(pat) => !pat || pat.startsWith("ghp_") || pat.startsWith("github_pat_"),
|
||||||
.refine((pat) => pat.startsWith("ghp_") || pat.startsWith("github_pat_"), {
|
{
|
||||||
message: "GitHub PAT should start with 'ghp_' or 'github_pat_'",
|
message: "GitHub PAT should start with 'ghp_' or 'github_pat_'",
|
||||||
}),
|
}
|
||||||
|
),
|
||||||
repo_full_names: z.string().min(1, {
|
repo_full_names: z.string().min(1, {
|
||||||
message: "At least one repository is required.",
|
message: "At least one repository is required.",
|
||||||
}),
|
}),
|
||||||
|
|
@ -59,8 +59,6 @@ type GithubConnectorFormValues = z.infer<typeof githubConnectorFormSchema>;
|
||||||
|
|
||||||
export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting }) => {
|
export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting }) => {
|
||||||
const isSubmittingRef = useRef(false);
|
const isSubmittingRef = useRef(false);
|
||||||
const [startDate, setStartDate] = useState<Date | undefined>(undefined);
|
|
||||||
const [endDate, setEndDate] = useState<Date | undefined>(undefined);
|
|
||||||
const [periodicEnabled, setPeriodicEnabled] = useState(false);
|
const [periodicEnabled, setPeriodicEnabled] = useState(false);
|
||||||
const [frequencyMinutes, setFrequencyMinutes] = useState("1440");
|
const [frequencyMinutes, setFrequencyMinutes] = useState("1440");
|
||||||
const form = useForm<GithubConnectorFormValues>({
|
const form = useForm<GithubConnectorFormValues>({
|
||||||
|
|
@ -94,7 +92,7 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
|
||||||
name: values.name,
|
name: values.name,
|
||||||
connector_type: EnumConnectorName.GITHUB_CONNECTOR,
|
connector_type: EnumConnectorName.GITHUB_CONNECTOR,
|
||||||
config: {
|
config: {
|
||||||
GITHUB_PAT: values.github_pat,
|
GITHUB_PAT: values.github_pat || null, // Optional - only for private repos
|
||||||
repo_full_names: repoList,
|
repo_full_names: repoList,
|
||||||
},
|
},
|
||||||
is_indexable: true,
|
is_indexable: true,
|
||||||
|
|
@ -102,8 +100,9 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
|
||||||
periodic_indexing_enabled: periodicEnabled,
|
periodic_indexing_enabled: periodicEnabled,
|
||||||
indexing_frequency_minutes: periodicEnabled ? parseInt(frequencyMinutes, 10) : null,
|
indexing_frequency_minutes: periodicEnabled ? parseInt(frequencyMinutes, 10) : null,
|
||||||
next_scheduled_at: null,
|
next_scheduled_at: null,
|
||||||
startDate,
|
// GitHub indexes full repo snapshots - no date range needed
|
||||||
endDate,
|
startDate: undefined,
|
||||||
|
endDate: undefined,
|
||||||
periodicEnabled,
|
periodicEnabled,
|
||||||
frequencyMinutes,
|
frequencyMinutes,
|
||||||
});
|
});
|
||||||
|
|
@ -117,10 +116,10 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
|
||||||
<Alert className="bg-slate-400/5 dark:bg-white/5 border-slate-400/20 p-2 sm:p-3 flex items-center [&>svg]:relative [&>svg]:left-0 [&>svg]:top-0 [&>svg+div]:translate-y-0">
|
<Alert className="bg-slate-400/5 dark:bg-white/5 border-slate-400/20 p-2 sm:p-3 flex items-center [&>svg]:relative [&>svg]:left-0 [&>svg]:top-0 [&>svg+div]:translate-y-0">
|
||||||
<Info className="h-3 w-3 sm:h-4 sm:w-4 shrink-0 ml-1" />
|
<Info className="h-3 w-3 sm:h-4 sm:w-4 shrink-0 ml-1" />
|
||||||
<div className="-ml-1">
|
<div className="-ml-1">
|
||||||
<AlertTitle className="text-xs sm:text-sm">Personal Access Token Required</AlertTitle>
|
<AlertTitle className="text-xs sm:text-sm">Personal Access Token (Optional)</AlertTitle>
|
||||||
<AlertDescription className="text-[10px] sm:text-xs !pl-0">
|
<AlertDescription className="text-[10px] sm:text-xs !pl-0">
|
||||||
You'll need a GitHub Personal Access Token to use this connector. You can create one
|
A GitHub PAT is only required for private repositories. Public repos work without a
|
||||||
from{" "}
|
token. Create one from{" "}
|
||||||
<a
|
<a
|
||||||
href="https://github.com/settings/tokens"
|
href="https://github.com/settings/tokens"
|
||||||
target="_blank"
|
target="_blank"
|
||||||
|
|
@ -128,7 +127,8 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
|
||||||
className="font-medium underline underline-offset-4"
|
className="font-medium underline underline-offset-4"
|
||||||
>
|
>
|
||||||
GitHub Settings
|
GitHub Settings
|
||||||
</a>
|
</a>{" "}
|
||||||
|
if needed.
|
||||||
</AlertDescription>
|
</AlertDescription>
|
||||||
</div>
|
</div>
|
||||||
</Alert>
|
</Alert>
|
||||||
|
|
@ -167,7 +167,10 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
|
||||||
name="github_pat"
|
name="github_pat"
|
||||||
render={({ field }) => (
|
render={({ field }) => (
|
||||||
<FormItem>
|
<FormItem>
|
||||||
<FormLabel className="text-xs sm:text-sm">GitHub Personal Access Token</FormLabel>
|
<FormLabel className="text-xs sm:text-sm">
|
||||||
|
GitHub Personal Access Token{" "}
|
||||||
|
<span className="text-muted-foreground font-normal">(optional)</span>
|
||||||
|
</FormLabel>
|
||||||
<FormControl>
|
<FormControl>
|
||||||
<Input
|
<Input
|
||||||
type="password"
|
type="password"
|
||||||
|
|
@ -178,8 +181,8 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
|
||||||
/>
|
/>
|
||||||
</FormControl>
|
</FormControl>
|
||||||
<FormDescription className="text-[10px] sm:text-xs">
|
<FormDescription className="text-[10px] sm:text-xs">
|
||||||
Your GitHub PAT will be encrypted and stored securely. It typically starts with
|
Only required for private repositories. Leave empty if indexing public repos
|
||||||
"ghp_" or "github_pat_".
|
only.
|
||||||
</FormDescription>
|
</FormDescription>
|
||||||
<FormMessage />
|
<FormMessage />
|
||||||
</FormItem>
|
</FormItem>
|
||||||
|
|
@ -225,15 +228,9 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
|
||||||
|
|
||||||
{/* Indexing Configuration */}
|
{/* Indexing Configuration */}
|
||||||
<div className="space-y-4 pt-4 border-t border-slate-400/20">
|
<div className="space-y-4 pt-4 border-t border-slate-400/20">
|
||||||
<h3 className="text-sm sm:text-base font-medium">Indexing Configuration</h3>
|
<h3 className="text-sm sm:text-base font-medium">Sync Configuration</h3>
|
||||||
|
|
||||||
{/* Date Range Selector */}
|
{/* Note: No date range for GitHub - it indexes full repo snapshots */}
|
||||||
<DateRangeSelector
|
|
||||||
startDate={startDate}
|
|
||||||
endDate={endDate}
|
|
||||||
onStartDateChange={setStartDate}
|
|
||||||
onEndDateChange={setEndDate}
|
|
||||||
/>
|
|
||||||
|
|
||||||
{/* Periodic Sync Config */}
|
{/* Periodic Sync Config */}
|
||||||
<div className="rounded-xl bg-slate-400/5 dark:bg-white/5 p-3 sm:p-6">
|
<div className="rounded-xl bg-slate-400/5 dark:bg-white/5 p-3 sm:p-6">
|
||||||
|
|
|
||||||
|
|
@ -490,8 +490,8 @@ export function SourceDetailPanel({
|
||||||
>
|
>
|
||||||
{idx + 1}
|
{idx + 1}
|
||||||
{isCited && (
|
{isCited && (
|
||||||
<span className="absolute -top-1 -right-1 w-3 h-3 bg-primary rounded-full border-2 border-background">
|
<span className="absolute -top-1.5 -right-1.5 flex items-center justify-center w-4 h-4 bg-primary rounded-full border-2 border-background shadow-sm">
|
||||||
<Sparkles className="h-2 w-2 text-primary-foreground absolute top-0.5 left-0.5" />
|
<Sparkles className="h-2.5 w-2.5 text-primary-foreground" />
|
||||||
</span>
|
</span>
|
||||||
)}
|
)}
|
||||||
</motion.button>
|
</motion.button>
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue