Mirror of https://github.com/MODSetter/SurfSense.git (synced 2026-05-11 00:32:38 +02:00)
feat: Integrate gitingest for GitHub repository ingestion
- Added gitingest as a dependency to streamline the ingestion of GitHub repositories.
- Refactored GitHubConnector to utilize gitingest for efficient repository digest generation, reducing API calls.
- Updated GitHub indexer to process entire repository digests, enhancing performance and simplifying the indexing process.
- Modified GitHub connect form to indicate that the Personal Access Token is optional for public repositories.
parent 6e331c3b85
commit 49b8a46d10
6 changed files with 545 additions and 539 deletions
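For orientation: the heart of this change is gitingest's ingest_async call, which the refactored connector below wraps. A minimal standalone sketch of that call (the repository URL and the patterns shown here are illustrative, not taken from this commit):

import asyncio

from gitingest import ingest_async


async def main() -> None:
    # One async call digests the whole repository into a summary, a file tree,
    # and the concatenated file contents -- no per-file GitHub API requests.
    summary, tree, content = await ingest_async(
        "https://github.com/octocat/Hello-World",  # illustrative public repository
        max_file_size=5 * 1024 * 1024,  # skip files larger than 5 MB
        exclude_patterns=["node_modules/*", "*.lock", "*.png"],  # small subset of the defaults below
    )
    print(tree)
    print(f"~{len(content) // 4} estimated tokens")


asyncio.run(main())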
@@ -1,296 +1,295 @@
import base64
|
||||
import logging
|
||||
from typing import Any
|
||||
"""
|
||||
GitHub connector using gitingest for efficient repository digestion.
|
||||
|
||||
from github3 import exceptions as github_exceptions, login as github_login
|
||||
from github3.exceptions import ForbiddenError, NotFoundError
|
||||
from github3.repos.contents import Contents
|
||||
This connector replaces the previous file-by-file approach with a single
|
||||
digest generation per repository, dramatically reducing LLM API calls.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from dataclasses import dataclass
|
||||
|
||||
from gitingest import ingest_async
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# List of common code file extensions to target
|
||||
CODE_EXTENSIONS = {
|
||||
".py",
|
||||
".js",
|
||||
".jsx",
|
||||
".ts",
|
||||
".tsx",
|
||||
".java",
|
||||
".c",
|
||||
".cpp",
|
||||
".h",
|
||||
".hpp",
|
||||
".cs",
|
||||
".go",
|
||||
".rb",
|
||||
".php",
|
||||
".swift",
|
||||
".kt",
|
||||
".scala",
|
||||
".rs",
|
||||
".m",
|
||||
".sh",
|
||||
".bash",
|
||||
".ps1",
|
||||
".lua",
|
||||
".pl",
|
||||
".pm",
|
||||
".r",
|
||||
".dart",
|
||||
".sql",
|
||||
}
|
||||
# Maximum file size in bytes (5MB)
|
||||
MAX_FILE_SIZE = 5 * 1024 * 1024
|
||||
|
||||
# List of common documentation/text file extensions
|
||||
DOC_EXTENSIONS = {
|
||||
".md",
|
||||
".txt",
|
||||
".rst",
|
||||
".adoc",
|
||||
".html",
|
||||
".htm",
|
||||
".xml",
|
||||
".json",
|
||||
".yaml",
|
||||
".yml",
|
||||
".toml",
|
||||
}
|
||||
# Default patterns to exclude (recommended approach for comprehensive analysis)
|
||||
# Using only exclude_patterns ensures we don't miss any relevant file types
|
||||
DEFAULT_EXCLUDE_PATTERNS = [
|
||||
# Dependencies
|
||||
"node_modules/*",
|
||||
"vendor/*",
|
||||
"bower_components/*",
|
||||
".pnpm/*",
|
||||
# Build artifacts / Caches
|
||||
"build/*",
|
||||
"dist/*",
|
||||
"target/*",
|
||||
"out/*",
|
||||
"__pycache__/*",
|
||||
"*.pyc",
|
||||
".cache/*",
|
||||
".next/*",
|
||||
".nuxt/*",
|
||||
# Virtual environments
|
||||
"venv/*",
|
||||
".venv/*",
|
||||
"env/*",
|
||||
".env/*",
|
||||
# IDE/Editor config
|
||||
".vscode/*",
|
||||
".idea/*",
|
||||
".project",
|
||||
".settings/*",
|
||||
"*.swp",
|
||||
"*.swo",
|
||||
# Version control
|
||||
".git/*",
|
||||
".svn/*",
|
||||
".hg/*",
|
||||
# Temporary / Logs
|
||||
"tmp/*",
|
||||
"temp/*",
|
||||
"logs/*",
|
||||
"*.log",
|
||||
# Lock files (usually not needed for understanding code)
|
||||
"package-lock.json",
|
||||
"pnpm-lock.yaml",
|
||||
"yarn.lock",
|
||||
"uv.lock",
|
||||
"Gemfile.lock",
|
||||
"poetry.lock",
|
||||
"Cargo.lock",
|
||||
"composer.lock",
|
||||
# Binary/media files
|
||||
"*.png",
|
||||
"*.jpg",
|
||||
"*.jpeg",
|
||||
"*.gif",
|
||||
"*.ico",
|
||||
"*.svg",
|
||||
"*.webp",
|
||||
"*.bmp",
|
||||
"*.tiff",
|
||||
"*.woff",
|
||||
"*.woff2",
|
||||
"*.ttf",
|
||||
"*.eot",
|
||||
"*.otf",
|
||||
"*.mp3",
|
||||
"*.mp4",
|
||||
"*.wav",
|
||||
"*.ogg",
|
||||
"*.webm",
|
||||
"*.avi",
|
||||
"*.mov",
|
||||
"*.pdf",
|
||||
"*.doc",
|
||||
"*.docx",
|
||||
"*.xls",
|
||||
"*.xlsx",
|
||||
"*.ppt",
|
||||
"*.pptx",
|
||||
"*.zip",
|
||||
"*.tar",
|
||||
"*.tar.gz",
|
||||
"*.tgz",
|
||||
"*.rar",
|
||||
"*.7z",
|
||||
"*.exe",
|
||||
"*.dll",
|
||||
"*.so",
|
||||
"*.dylib",
|
||||
"*.bin",
|
||||
"*.obj",
|
||||
"*.o",
|
||||
"*.a",
|
||||
"*.lib",
|
||||
# Minified files
|
||||
"*.min.js",
|
||||
"*.min.css",
|
||||
# Source maps
|
||||
"*.map",
|
||||
# Database files
|
||||
"*.db",
|
||||
"*.sqlite",
|
||||
"*.sqlite3",
|
||||
# Coverage reports
|
||||
"coverage/*",
|
||||
".coverage",
|
||||
"htmlcov/*",
|
||||
".nyc_output/*",
|
||||
# Test snapshots (can be large)
|
||||
"__snapshots__/*",
|
||||
]
|
||||
|
||||
# Maximum file size in bytes (e.g., 1MB)
|
||||
MAX_FILE_SIZE = 1 * 1024 * 1024
|
||||
|
||||
@dataclass
|
||||
class RepositoryDigest:
|
||||
"""Represents a digested repository from gitingest."""
|
||||
|
||||
repo_full_name: str
|
||||
summary: str
|
||||
tree: str
|
||||
content: str
|
||||
branch: str | None = None
|
||||
|
||||
@property
|
||||
def full_digest(self) -> str:
|
||||
"""Returns the complete digest with tree and content."""
|
||||
return f"# Repository: {self.repo_full_name}\n\n## File Structure\n\n{self.tree}\n\n## File Contents\n\n{self.content}"
|
||||
|
||||
@property
|
||||
def estimated_tokens(self) -> int:
|
||||
"""Rough estimate of tokens (1 token ≈ 4 characters)."""
|
||||
return len(self.full_digest) // 4
|
||||
|
||||
|
||||
class GitHubConnector:
|
||||
"""Connector for interacting with the GitHub API."""
|
||||
"""
|
||||
Connector for ingesting GitHub repositories using gitingest.
|
||||
|
||||
# Directories to skip during file traversal
|
||||
SKIPPED_DIRS = {
|
||||
# Version control
|
||||
".git",
|
||||
# Dependencies
|
||||
"node_modules",
|
||||
"vendor",
|
||||
# Build artifacts / Caches
|
||||
"build",
|
||||
"dist",
|
||||
"target",
|
||||
"__pycache__",
|
||||
# Virtual environments
|
||||
"venv",
|
||||
".venv",
|
||||
"env",
|
||||
# IDE/Editor config
|
||||
".vscode",
|
||||
".idea",
|
||||
".project",
|
||||
".settings",
|
||||
# Temporary / Logs
|
||||
"tmp",
|
||||
"logs",
|
||||
# Add other project-specific irrelevant directories if needed
|
||||
}
|
||||
This connector efficiently processes entire repositories into a single
|
||||
digest, reducing the number of API calls and LLM invocations compared
|
||||
to file-by-file processing.
|
||||
"""
|
||||
|
||||
def __init__(self, token: str):
|
||||
def __init__(self, token: str | None = None):
|
||||
"""
|
||||
Initializes the GitHub connector.
|
||||
|
||||
Args:
|
||||
token: GitHub Personal Access Token (PAT).
|
||||
token: Optional GitHub Personal Access Token (PAT).
|
||||
Only required for private repositories.
|
||||
Public repositories can be ingested without a token.
|
||||
"""
|
||||
if not token:
|
||||
raise ValueError("GitHub token cannot be empty.")
|
||||
try:
|
||||
self.gh = github_login(token=token)
|
||||
# Try a simple authenticated call to check token validity
|
||||
self.gh.me()
|
||||
logger.info("Successfully authenticated with GitHub API.")
|
||||
except (github_exceptions.AuthenticationFailed, ForbiddenError) as e:
|
||||
logger.error(f"GitHub authentication failed: {e}")
|
||||
raise ValueError("Invalid GitHub token or insufficient permissions.") from e
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to initialize GitHub client: {e}")
|
||||
raise e
|
||||
self.token = token if token and token.strip() else None
|
||||
if self.token:
|
||||
logger.info("GitHub connector initialized with authentication token.")
|
||||
else:
|
||||
logger.info("GitHub connector initialized without token (public repos only).")
|
||||
|
||||
def get_user_repositories(self) -> list[dict[str, Any]]:
|
||||
"""Fetches repositories accessible by the authenticated user."""
|
||||
repos_data = []
|
||||
try:
|
||||
# type='owner' fetches repos owned by the user
|
||||
# type='member' fetches repos the user is a collaborator on (including orgs)
|
||||
# type='all' fetches both
|
||||
for repo in self.gh.repositories(type="all", sort="updated"):
|
||||
repos_data.append(
|
||||
{
|
||||
"id": repo.id,
|
||||
"name": repo.name,
|
||||
"full_name": repo.full_name,
|
||||
"private": repo.private,
|
||||
"url": repo.html_url,
|
||||
"description": repo.description or "",
|
||||
"last_updated": repo.updated_at if repo.updated_at else None,
|
||||
}
|
||||
)
|
||||
logger.info(f"Fetched {len(repos_data)} repositories.")
|
||||
return repos_data
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to fetch GitHub repositories: {e}")
|
||||
return [] # Return empty list on error
|
||||
|
||||
def get_repository_files(
|
||||
self, repo_full_name: str, path: str = ""
|
||||
) -> list[dict[str, Any]]:
|
||||
async def ingest_repository(
|
||||
self,
|
||||
repo_full_name: str,
|
||||
branch: str | None = None,
|
||||
include_patterns: list[str] | None = None,
|
||||
exclude_patterns: list[str] | None = None,
|
||||
max_file_size: int = MAX_FILE_SIZE,
|
||||
) -> RepositoryDigest | None:
|
||||
"""
|
||||
Recursively fetches details of relevant files (code, docs) within a repository path.
|
||||
Ingest an entire repository and return a digest.
|
||||
|
||||
Args:
|
||||
repo_full_name: The full name of the repository (e.g., 'owner/repo').
|
||||
path: The starting path within the repository (default is root).
|
||||
branch: Optional specific branch or tag to ingest.
|
||||
include_patterns: Optional list of glob patterns for files to include.
|
||||
If None, includes all files (recommended).
|
||||
exclude_patterns: Optional list of glob patterns for files to exclude.
|
||||
If None, uses DEFAULT_EXCLUDE_PATTERNS.
|
||||
max_file_size: Maximum file size in bytes to include (default 5MB).
|
||||
|
||||
Returns:
|
||||
A list of dictionaries, each containing file details (path, sha, url, size).
|
||||
Returns an empty list if the repository or path is not found or on error.
|
||||
RepositoryDigest containing the summary, tree structure, and content,
|
||||
or None if ingestion fails.
|
||||
"""
|
||||
files_list = []
|
||||
repo_url = f"https://github.com/{repo_full_name}"
|
||||
|
||||
# Use only exclude_patterns by default (recommended for comprehensive analysis)
|
||||
# This ensures we don't miss any relevant file types
|
||||
exclude_pats = exclude_patterns if exclude_patterns is not None else DEFAULT_EXCLUDE_PATTERNS
|
||||
|
||||
logger.info(f"Starting gitingest for repository: {repo_full_name}")
|
||||
|
||||
try:
|
||||
owner, repo_name = repo_full_name.split("/")
|
||||
repo = self.gh.repository(owner, repo_name)
|
||||
if not repo:
|
||||
logger.warning(f"Repository '{repo_full_name}' not found.")
|
||||
return []
|
||||
contents = repo.directory_contents(
|
||||
directory_path=path
|
||||
) # Use directory_contents for clarity
|
||||
# Build kwargs dynamically
|
||||
ingest_kwargs = {
|
||||
"max_file_size": max_file_size,
|
||||
"exclude_patterns": exclude_pats,
|
||||
"include_gitignored": False,
|
||||
"include_submodules": False,
|
||||
}
|
||||
|
||||
# contents returns a list of tuples (name, content_obj)
|
||||
for _item_name, content_item in contents:
|
||||
if not isinstance(content_item, Contents):
|
||||
continue
|
||||
# Only add token if provided (required only for private repos)
|
||||
if self.token:
|
||||
ingest_kwargs["token"] = self.token
|
||||
|
||||
if content_item.type == "dir":
|
||||
# Check if the directory name is in the skipped list
|
||||
if content_item.name in self.SKIPPED_DIRS:
|
||||
logger.debug(f"Skipping directory: {content_item.path}")
|
||||
continue # Skip recursion for this directory
|
||||
# Only add branch if specified
|
||||
if branch:
|
||||
ingest_kwargs["branch"] = branch
|
||||
|
||||
# Recursively fetch contents of subdirectory
|
||||
files_list.extend(
|
||||
self.get_repository_files(
|
||||
repo_full_name, path=content_item.path
|
||||
)
|
||||
)
|
||||
elif content_item.type == "file":
|
||||
# Check if the file extension is relevant and size is within limits
|
||||
file_extension = (
|
||||
"." + content_item.name.split(".")[-1].lower()
|
||||
if "." in content_item.name
|
||||
else ""
|
||||
)
|
||||
is_code = file_extension in CODE_EXTENSIONS
|
||||
is_doc = file_extension in DOC_EXTENSIONS
|
||||
# Only add include_patterns if explicitly provided
|
||||
if include_patterns is not None:
|
||||
ingest_kwargs["include_patterns"] = include_patterns
|
||||
|
||||
if (is_code or is_doc) and content_item.size <= MAX_FILE_SIZE:
|
||||
files_list.append(
|
||||
{
|
||||
"path": content_item.path,
|
||||
"sha": content_item.sha,
|
||||
"url": content_item.html_url,
|
||||
"size": content_item.size,
|
||||
"type": "code" if is_code else "doc",
|
||||
}
|
||||
)
|
||||
elif content_item.size > MAX_FILE_SIZE:
|
||||
logger.debug(
|
||||
f"Skipping large file: {content_item.path} ({content_item.size} bytes)"
|
||||
)
|
||||
else:
|
||||
logger.debug(
|
||||
f"Skipping irrelevant file type: {content_item.path}"
|
||||
)
|
||||
summary, tree, content = await ingest_async(repo_url, **ingest_kwargs)
|
||||
|
||||
except (NotFoundError, ForbiddenError) as e:
|
||||
logger.warning(f"Cannot access path '{path}' in '{repo_full_name}': {e}")
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
f"Failed to get files for {repo_full_name} at path '{path}': {e}"
|
||||
if not content or not content.strip():
|
||||
logger.warning(
|
||||
f"No content retrieved from repository: {repo_full_name}"
|
||||
)
|
||||
return None
|
||||
|
||||
digest = RepositoryDigest(
|
||||
repo_full_name=repo_full_name,
|
||||
summary=summary,
|
||||
tree=tree,
|
||||
content=content,
|
||||
branch=branch,
|
||||
)
|
||||
# Return what we have collected so far in case of partial failure
|
||||
|
||||
return files_list
|
||||
logger.info(
|
||||
f"Successfully ingested {repo_full_name}: "
|
||||
f"~{digest.estimated_tokens} estimated tokens"
|
||||
)
|
||||
return digest
|
||||
|
||||
def get_file_content(self, repo_full_name: str, file_path: str) -> str | None:
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to ingest repository {repo_full_name}: {e}")
|
||||
return None
|
||||
|
||||
async def ingest_repositories(
|
||||
self,
|
||||
repo_full_names: list[str],
|
||||
branch: str | None = None,
|
||||
include_patterns: list[str] | None = None,
|
||||
exclude_patterns: list[str] | None = None,
|
||||
max_file_size: int = MAX_FILE_SIZE,
|
||||
) -> list[RepositoryDigest]:
|
||||
"""
|
||||
Fetches the decoded content of a specific file.
|
||||
Ingest multiple repositories and return their digests.
|
||||
|
||||
Args:
|
||||
repo_full_name: The full name of the repository (e.g., 'owner/repo').
|
||||
file_path: The path to the file within the repository.
|
||||
repo_full_names: List of repository full names (e.g., ['owner/repo1', 'owner/repo2']).
|
||||
branch: Optional specific branch or tag to ingest (applied to all repos).
|
||||
include_patterns: Optional list of glob patterns for files to include.
|
||||
exclude_patterns: Optional list of glob patterns for files to exclude.
|
||||
max_file_size: Maximum file size in bytes to include.
|
||||
|
||||
Returns:
|
||||
The decoded file content as a string, or None if fetching fails or file is too large.
|
||||
List of RepositoryDigest objects for successfully ingested repositories.
|
||||
"""
|
||||
try:
|
||||
owner, repo_name = repo_full_name.split("/")
|
||||
repo = self.gh.repository(owner, repo_name)
|
||||
if not repo:
|
||||
logger.warning(
|
||||
f"Repository '{repo_full_name}' not found when fetching file '{file_path}'."
|
||||
)
|
||||
return None
|
||||
digests = []
|
||||
|
||||
content_item = repo.file_contents(
|
||||
path=file_path
|
||||
) # Use file_contents for clarity
|
||||
for repo_full_name in repo_full_names:
|
||||
if not repo_full_name or not isinstance(repo_full_name, str):
|
||||
logger.warning(f"Skipping invalid repository entry: {repo_full_name}")
|
||||
continue
|
||||
|
||||
if (
|
||||
not content_item
|
||||
or not isinstance(content_item, Contents)
|
||||
or content_item.type != "file"
|
||||
):
|
||||
logger.warning(
|
||||
f"File '{file_path}' not found or is not a file in '{repo_full_name}'."
|
||||
)
|
||||
return None
|
||||
|
||||
if content_item.size > MAX_FILE_SIZE:
|
||||
logger.warning(
|
||||
f"File '{file_path}' in '{repo_full_name}' exceeds max size ({content_item.size} > {MAX_FILE_SIZE}). Skipping content fetch."
|
||||
)
|
||||
return None
|
||||
|
||||
# Content is base64 encoded
|
||||
if content_item.content:
|
||||
try:
|
||||
decoded_content = base64.b64decode(content_item.content).decode(
|
||||
"utf-8"
|
||||
)
|
||||
return decoded_content
|
||||
except UnicodeDecodeError:
|
||||
logger.warning(
|
||||
f"Could not decode file '{file_path}' in '{repo_full_name}' as UTF-8. Trying with 'latin-1'."
|
||||
)
|
||||
try:
|
||||
# Try a fallback encoding
|
||||
decoded_content = base64.b64decode(content_item.content).decode(
|
||||
"latin-1"
|
||||
)
|
||||
return decoded_content
|
||||
except Exception as decode_err:
|
||||
logger.error(
|
||||
f"Failed to decode file '{file_path}' with fallback encoding: {decode_err}"
|
||||
)
|
||||
return None # Give up if fallback fails
|
||||
else:
|
||||
logger.warning(
|
||||
f"No content returned for file '{file_path}' in '{repo_full_name}'. It might be empty."
|
||||
)
|
||||
return "" # Return empty string for empty files
|
||||
|
||||
except (NotFoundError, ForbiddenError) as e:
|
||||
logger.warning(
|
||||
f"Cannot access file '{file_path}' in '{repo_full_name}': {e}"
|
||||
digest = await self.ingest_repository(
|
||||
repo_full_name=repo_full_name,
|
||||
branch=branch,
|
||||
include_patterns=include_patterns,
|
||||
exclude_patterns=exclude_patterns,
|
||||
max_file_size=max_file_size,
|
||||
)
|
||||
return None
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
f"Failed to get content for file '{file_path}' in '{repo_full_name}': {e}"
|
||||
)
|
||||
return None
|
||||
|
||||
if digest:
|
||||
digests.append(digest)
|
||||
|
||||
logger.info(
|
||||
f"Ingested {len(digests)} out of {len(repo_full_names)} repositories."
|
||||
)
|
||||
return digests
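A minimal usage sketch of the refactored connector above (the repository name is a placeholder; a token is only needed for private repositories):

import asyncio

from app.connectors.github_connector import GitHubConnector


async def demo() -> None:
    connector = GitHubConnector()  # no token: public repositories only
    digest = await connector.ingest_repository("owner/repo")  # placeholder repository name
    if digest:
        # One digest per repository: gitingest summary, file tree, and file contents.
        print(digest.summary)
        print(f"~{digest.estimated_tokens} estimated tokens")


asyncio.run(demo())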
@@ -1,5 +1,8 @@
"""
|
||||
GitHub connector indexer.
|
||||
GitHub connector indexer using gitingest.
|
||||
|
||||
This indexer processes entire repository digests in one pass, dramatically
|
||||
reducing LLM API calls compared to the previous file-by-file approach.
|
||||
"""
|
||||
|
||||
from datetime import UTC, datetime
|
||||
|
|
@@ -8,7 +11,7 @@ from sqlalchemy.exc import SQLAlchemyError
|
|||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.config import config
|
||||
from app.connectors.github_connector import GitHubConnector
|
||||
from app.connectors.github_connector import GitHubConnector, RepositoryDigest
|
||||
from app.db import Document, DocumentType, SearchSourceConnectorType
|
||||
from app.services.llm_service import get_user_long_context_llm
|
||||
from app.services.task_logging_service import TaskLoggingService
|
||||
|
|
@@ -26,43 +29,55 @@ from .base import (
|
|||
logger,
|
||||
)
|
||||
|
||||
# Maximum tokens for a single digest before splitting
|
||||
# Most LLMs can handle 128k+ tokens now, but we'll be conservative
|
||||
MAX_DIGEST_CHARS = 500_000 # ~125k tokens
|
||||
|
||||
|
||||
async def index_github_repos(
|
||||
session: AsyncSession,
|
||||
connector_id: int,
|
||||
search_space_id: int,
|
||||
user_id: str,
|
||||
start_date: str | None = None,
|
||||
end_date: str | None = None,
|
||||
start_date: str | None = None, # Ignored - GitHub indexes full repo snapshots
|
||||
end_date: str | None = None, # Ignored - GitHub indexes full repo snapshots
|
||||
update_last_indexed: bool = True,
|
||||
) -> tuple[int, str | None]:
|
||||
"""
|
||||
Index code and documentation files from accessible GitHub repositories.
|
||||
Index GitHub repositories using gitingest for efficient processing.
|
||||
|
||||
This function ingests entire repositories as digests, generates a single
|
||||
summary per repository, and chunks the content for vector storage.
|
||||
|
||||
Note: The start_date and end_date parameters are accepted for API compatibility
|
||||
but are IGNORED. GitHub repositories are indexed as complete snapshots since
|
||||
gitingest captures the current state of the entire codebase.
|
||||
|
||||
Args:
|
||||
session: Database session
|
||||
connector_id: ID of the GitHub connector
|
||||
search_space_id: ID of the search space to store documents in
|
||||
user_id: ID of the user
|
||||
start_date: Start date for filtering (YYYY-MM-DD format) - Note: GitHub indexing processes all files regardless of dates
|
||||
end_date: End date for filtering (YYYY-MM-DD format) - Note: GitHub indexing processes all files regardless of dates
|
||||
start_date: Ignored - kept for API compatibility
|
||||
end_date: Ignored - kept for API compatibility
|
||||
update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
|
||||
|
||||
Returns:
|
||||
Tuple containing (number of documents indexed, error message or None)
|
||||
"""
|
||||
# Note: start_date and end_date are intentionally unused
|
||||
_ = start_date, end_date
|
||||
task_logger = TaskLoggingService(session, search_space_id)
|
||||
|
||||
# Log task start
|
||||
log_entry = await task_logger.log_task_start(
|
||||
task_name="github_repos_indexing",
|
||||
source="connector_indexing_task",
|
||||
message=f"Starting GitHub repositories indexing for connector {connector_id}",
|
||||
message=f"Starting GitHub repositories indexing for connector {connector_id} (using gitingest)",
|
||||
metadata={
|
||||
"connector_id": connector_id,
|
||||
"user_id": str(user_id),
|
||||
"start_date": start_date,
|
||||
"end_date": end_date,
|
||||
"method": "gitingest",
|
||||
},
|
||||
)
|
||||
|
||||
|
|
@@ -93,19 +108,11 @@ async def index_github_repos(
|
|||
f"Connector with ID {connector_id} not found or is not a GitHub connector",
|
||||
)
|
||||
|
||||
# 2. Get the GitHub PAT and selected repositories from the connector config
|
||||
github_pat = connector.config.get("GITHUB_PAT")
|
||||
# 2. Get the GitHub PAT (optional) and selected repositories from the connector config
|
||||
# PAT is only required for private repositories - public repos work without it
|
||||
github_pat = connector.config.get("GITHUB_PAT") # Can be None or empty
|
||||
repo_full_names_to_index = connector.config.get("repo_full_names")
|
||||
|
||||
if not github_pat:
|
||||
await task_logger.log_task_failure(
|
||||
log_entry,
|
||||
f"GitHub Personal Access Token (PAT) not found in connector config for connector {connector_id}",
|
||||
"Missing GitHub PAT",
|
||||
{"error_type": "MissingToken"},
|
||||
)
|
||||
return 0, "GitHub Personal Access Token (PAT) not found in connector config"
|
||||
|
||||
if not repo_full_names_to_index or not isinstance(
|
||||
repo_full_names_to_index, list
|
||||
):
|
||||
|
|
@@ -117,10 +124,16 @@ async def index_github_repos(
|
|||
)
|
||||
return 0, "'repo_full_names' not found or is not a list in connector config"
|
||||
|
||||
# 3. Initialize GitHub connector client
|
||||
# Log whether we're using authentication
|
||||
if github_pat:
|
||||
logger.info("Using GitHub PAT for authentication (private repos supported)")
|
||||
else:
|
||||
logger.info("No GitHub PAT provided - only public repositories can be indexed")
|
||||
|
||||
# 3. Initialize GitHub connector with gitingest backend
|
||||
await task_logger.log_task_progress(
|
||||
log_entry,
|
||||
f"Initializing GitHub client for connector {connector_id}",
|
||||
f"Initializing gitingest-based GitHub client for connector {connector_id}",
|
||||
{
|
||||
"stage": "client_initialization",
|
||||
"repo_count": len(repo_full_names_to_index),
|
||||
|
|
@@ -138,258 +151,52 @@ async def index_github_repos(
|
|||
)
|
||||
return 0, f"Failed to initialize GitHub client: {e!s}"
|
||||
|
||||
# 4. Validate selected repositories
|
||||
# 4. Process each repository with gitingest
|
||||
await task_logger.log_task_progress(
|
||||
log_entry,
|
||||
f"Starting indexing for {len(repo_full_names_to_index)} selected repositories",
|
||||
f"Starting gitingest processing for {len(repo_full_names_to_index)} repositories",
|
||||
{
|
||||
"stage": "repo_processing",
|
||||
"repo_count": len(repo_full_names_to_index),
|
||||
"start_date": start_date,
|
||||
"end_date": end_date,
|
||||
},
|
||||
)
|
||||
|
||||
logger.info(
|
||||
f"Starting indexing for {len(repo_full_names_to_index)} selected repositories."
|
||||
f"Starting gitingest indexing for {len(repo_full_names_to_index)} repositories."
|
||||
)
|
||||
if start_date and end_date:
|
||||
logger.info(
|
||||
f"Date range requested: {start_date} to {end_date} (Note: GitHub indexing processes all files regardless of dates)"
|
||||
)
|
||||
|
||||
# 6. Iterate through selected repositories and index files
|
||||
for repo_full_name in repo_full_names_to_index:
|
||||
if not repo_full_name or not isinstance(repo_full_name, str):
|
||||
logger.warning(f"Skipping invalid repository entry: {repo_full_name}")
|
||||
continue
|
||||
|
||||
logger.info(f"Processing repository: {repo_full_name}")
|
||||
logger.info(f"Ingesting repository: {repo_full_name}")
|
||||
|
||||
try:
|
||||
files_to_index = github_client.get_repository_files(repo_full_name)
|
||||
if not files_to_index:
|
||||
logger.info(
|
||||
f"No indexable files found in repository: {repo_full_name}"
|
||||
# Ingest the entire repository
|
||||
digest = await github_client.ingest_repository(repo_full_name)
|
||||
|
||||
if not digest:
|
||||
logger.warning(
|
||||
f"No digest returned for repository: {repo_full_name}"
|
||||
)
|
||||
errors.append(f"No digest for {repo_full_name}")
|
||||
continue
|
||||
|
||||
logger.info(
|
||||
f"Found {len(files_to_index)} files to process in {repo_full_name}"
|
||||
# Process the digest and create documents
|
||||
docs_created = await _process_repository_digest(
|
||||
session=session,
|
||||
digest=digest,
|
||||
search_space_id=search_space_id,
|
||||
user_id=user_id,
|
||||
task_logger=task_logger,
|
||||
log_entry=log_entry,
|
||||
)
|
||||
|
||||
for file_info in files_to_index:
|
||||
file_path = file_info.get("path")
|
||||
file_url = file_info.get("url")
|
||||
file_sha = file_info.get("sha")
|
||||
file_type = file_info.get("type") # 'code' or 'doc'
|
||||
full_path_key = f"{repo_full_name}/{file_path}"
|
||||
|
||||
if not file_path or not file_url or not file_sha:
|
||||
logger.warning(
|
||||
f"Skipping file with missing info in {repo_full_name}: {file_info}"
|
||||
)
|
||||
continue
|
||||
|
||||
# Get file content
|
||||
file_content = github_client.get_file_content(
|
||||
repo_full_name, file_path
|
||||
)
|
||||
|
||||
if file_content is None:
|
||||
logger.warning(
|
||||
f"Could not retrieve content for {full_path_key}. Skipping."
|
||||
)
|
||||
continue # Skip if content fetch failed
|
||||
|
||||
# Generate unique identifier hash for this GitHub file
|
||||
unique_identifier_hash = generate_unique_identifier_hash(
|
||||
DocumentType.GITHUB_CONNECTOR, file_sha, search_space_id
|
||||
)
|
||||
|
||||
# Generate content hash
|
||||
content_hash = generate_content_hash(file_content, search_space_id)
|
||||
|
||||
# Check if document with this unique identifier already exists
|
||||
existing_document = await check_document_by_unique_identifier(
|
||||
session, unique_identifier_hash
|
||||
)
|
||||
|
||||
if existing_document:
|
||||
# Document exists - check if content has changed
|
||||
if existing_document.content_hash == content_hash:
|
||||
logger.info(
|
||||
f"Document for GitHub file {full_path_key} unchanged. Skipping."
|
||||
)
|
||||
continue
|
||||
else:
|
||||
# Content has changed - update the existing document
|
||||
logger.info(
|
||||
f"Content changed for GitHub file {full_path_key}. Updating document."
|
||||
)
|
||||
|
||||
# Generate summary with metadata
|
||||
user_llm = await get_user_long_context_llm(
|
||||
session, user_id, search_space_id
|
||||
)
|
||||
if user_llm:
|
||||
file_extension = (
|
||||
file_path.split(".")[-1]
|
||||
if "." in file_path
|
||||
else None
|
||||
)
|
||||
document_metadata = {
|
||||
"file_path": full_path_key,
|
||||
"repository": repo_full_name,
|
||||
"file_type": file_extension or "unknown",
|
||||
"document_type": "GitHub Repository File",
|
||||
"connector_type": "GitHub",
|
||||
}
|
||||
(
|
||||
summary_content,
|
||||
summary_embedding,
|
||||
) = await generate_document_summary(
|
||||
file_content, user_llm, document_metadata
|
||||
)
|
||||
else:
|
||||
summary_content = f"GitHub file: {full_path_key}\n\n{file_content[:1000]}..."
|
||||
summary_embedding = (
|
||||
config.embedding_model_instance.embed(
|
||||
summary_content
|
||||
)
|
||||
)
|
||||
|
||||
# Chunk the content
|
||||
try:
|
||||
if hasattr(config, "code_chunker_instance"):
|
||||
chunks_data = [
|
||||
await create_document_chunks(file_content)
|
||||
][0]
|
||||
else:
|
||||
chunks_data = await create_document_chunks(
|
||||
file_content
|
||||
)
|
||||
except Exception as chunk_err:
|
||||
logger.error(
|
||||
f"Failed to chunk file {full_path_key}: {chunk_err}"
|
||||
)
|
||||
continue
|
||||
|
||||
# Update existing document
|
||||
existing_document.title = f"GitHub - {full_path_key}"
|
||||
existing_document.content = summary_content
|
||||
existing_document.content_hash = content_hash
|
||||
existing_document.embedding = summary_embedding
|
||||
existing_document.document_metadata = {
|
||||
"file_path": file_path,
|
||||
"file_sha": file_sha,
|
||||
"file_url": file_url,
|
||||
"repository": repo_full_name,
|
||||
"indexed_at": datetime.now(UTC).strftime(
|
||||
"%Y-%m-%d %H:%M:%S"
|
||||
),
|
||||
}
|
||||
existing_document.chunks = chunks_data
|
||||
existing_document.updated_at = get_current_timestamp()
|
||||
|
||||
logger.info(
|
||||
f"Successfully updated GitHub file {full_path_key}"
|
||||
)
|
||||
continue
|
||||
|
||||
# Document doesn't exist - create new one
|
||||
# Generate summary with metadata
|
||||
user_llm = await get_user_long_context_llm(
|
||||
session, user_id, search_space_id
|
||||
)
|
||||
if user_llm:
|
||||
# Extract file extension from file path
|
||||
file_extension = (
|
||||
file_path.split(".")[-1] if "." in file_path else None
|
||||
)
|
||||
document_metadata = {
|
||||
"file_path": full_path_key,
|
||||
"repository": repo_full_name,
|
||||
"file_type": file_extension or "unknown",
|
||||
"document_type": "GitHub Repository File",
|
||||
"connector_type": "GitHub",
|
||||
}
|
||||
(
|
||||
summary_content,
|
||||
summary_embedding,
|
||||
) = await generate_document_summary(
|
||||
file_content, user_llm, document_metadata
|
||||
)
|
||||
else:
|
||||
# Fallback to simple summary if no LLM configured
|
||||
summary_content = (
|
||||
f"GitHub file: {full_path_key}\n\n{file_content[:1000]}..."
|
||||
)
|
||||
summary_embedding = config.embedding_model_instance.embed(
|
||||
summary_content
|
||||
)
|
||||
|
||||
# Chunk the content
|
||||
try:
|
||||
chunks_data = [await create_document_chunks(file_content)][0]
|
||||
|
||||
# Use code chunker if available, otherwise regular chunker
|
||||
if hasattr(config, "code_chunker_instance"):
|
||||
chunks_data = [
|
||||
{
|
||||
"content": chunk.text,
|
||||
"embedding": config.embedding_model_instance.embed(
|
||||
chunk.text
|
||||
),
|
||||
}
|
||||
for chunk in config.code_chunker_instance.chunk(
|
||||
file_content
|
||||
)
|
||||
]
|
||||
else:
|
||||
chunks_data = await create_document_chunks(file_content)
|
||||
|
||||
except Exception as chunk_err:
|
||||
logger.error(
|
||||
f"Failed to chunk file {full_path_key}: {chunk_err}"
|
||||
)
|
||||
errors.append(
|
||||
f"Chunking failed for {full_path_key}: {chunk_err}"
|
||||
)
|
||||
continue # Skip this file if chunking fails
|
||||
|
||||
doc_metadata = {
|
||||
"repository_full_name": repo_full_name,
|
||||
"file_path": file_path,
|
||||
"full_path": full_path_key, # For easier lookup
|
||||
"url": file_url,
|
||||
"sha": file_sha,
|
||||
"type": file_type,
|
||||
"indexed_at": datetime.now(UTC).isoformat(),
|
||||
}
|
||||
|
||||
# Create new document
|
||||
logger.info(f"Creating new document for file: {full_path_key}")
|
||||
document = Document(
|
||||
title=f"GitHub - {file_path}",
|
||||
document_type=DocumentType.GITHUB_CONNECTOR,
|
||||
document_metadata=doc_metadata,
|
||||
content=summary_content, # Store summary
|
||||
content_hash=content_hash,
|
||||
unique_identifier_hash=unique_identifier_hash,
|
||||
embedding=summary_embedding,
|
||||
search_space_id=search_space_id,
|
||||
chunks=chunks_data, # Associate chunks directly
|
||||
updated_at=get_current_timestamp(),
|
||||
)
|
||||
session.add(document)
|
||||
documents_processed += 1
|
||||
|
||||
# Batch commit every 10 documents
|
||||
if documents_processed % 10 == 0:
|
||||
logger.info(
|
||||
f"Committing batch: {documents_processed} GitHub files processed so far"
|
||||
)
|
||||
await session.commit()
|
||||
documents_processed += docs_created
|
||||
logger.info(
|
||||
f"Created {docs_created} documents from repository: {repo_full_name}"
|
||||
)
|
||||
|
||||
except Exception as repo_err:
|
||||
logger.error(
|
||||
|
|
@@ -397,11 +204,11 @@ async def index_github_repos(
|
|||
)
|
||||
errors.append(f"Failed processing {repo_full_name}: {repo_err}")
|
||||
|
||||
# Final commit for any remaining documents not yet committed in batches
|
||||
logger.info(f"Final commit: Total {documents_processed} GitHub files processed")
|
||||
# Final commit
|
||||
await session.commit()
|
||||
logger.info(
|
||||
f"Finished GitHub indexing for connector {connector_id}. Processed {documents_processed} files."
|
||||
f"Finished GitHub indexing for connector {connector_id}. "
|
||||
f"Created {documents_processed} documents."
|
||||
)
|
||||
|
||||
# Log success
|
||||
|
|
@@ -412,6 +219,7 @@ async def index_github_repos(
|
|||
"documents_processed": documents_processed,
|
||||
"errors_count": len(errors),
|
||||
"repo_count": len(repo_full_names_to_index),
|
||||
"method": "gitingest",
|
||||
},
|
||||
)
|
||||
|
||||
|
|
@@ -428,6 +236,7 @@ async def index_github_repos(
|
|||
)
|
||||
errors.append(f"Database error: {db_err}")
|
||||
return documents_processed, "; ".join(errors) if errors else str(db_err)
|
||||
|
||||
except Exception as e:
|
||||
await session.rollback()
|
||||
await task_logger.log_task_failure(
|
||||
|
|
@@ -445,3 +254,173 @@ async def index_github_repos(
|
|||
|
||||
error_message = "; ".join(errors) if errors else None
|
||||
return documents_processed, error_message
|
||||
|
||||
|
||||
async def _process_repository_digest(
|
||||
session: AsyncSession,
|
||||
digest: RepositoryDigest,
|
||||
search_space_id: int,
|
||||
user_id: str,
|
||||
task_logger: TaskLoggingService,
|
||||
log_entry,
|
||||
) -> int:
|
||||
"""
|
||||
Process a repository digest and create documents.
|
||||
|
||||
For each repository, we create:
|
||||
1. One main document with the repository summary
|
||||
2. Chunks from the full digest content for granular search
|
||||
|
||||
Args:
|
||||
session: Database session
|
||||
digest: The repository digest from gitingest
|
||||
search_space_id: ID of the search space
|
||||
user_id: ID of the user
|
||||
task_logger: Task logging service
|
||||
log_entry: Current log entry
|
||||
|
||||
Returns:
|
||||
Number of documents created
|
||||
"""
|
||||
repo_full_name = digest.repo_full_name
|
||||
documents_created = 0
|
||||
|
||||
# Generate unique identifier based on repo name and content hash
|
||||
# This allows updates when repo content changes
|
||||
full_content = digest.full_digest
|
||||
content_hash = generate_content_hash(full_content, search_space_id)
|
||||
|
||||
# Use repo name as the unique identifier (one document per repo)
|
||||
unique_identifier_hash = generate_unique_identifier_hash(
|
||||
DocumentType.GITHUB_CONNECTOR, repo_full_name, search_space_id
|
||||
)
|
||||
|
||||
# Check if document with this unique identifier already exists
|
||||
existing_document = await check_document_by_unique_identifier(
|
||||
session, unique_identifier_hash
|
||||
)
|
||||
|
||||
if existing_document:
|
||||
# Document exists - check if content has changed
|
||||
if existing_document.content_hash == content_hash:
|
||||
logger.info(
|
||||
f"Repository {repo_full_name} unchanged. Skipping."
|
||||
)
|
||||
return 0
|
||||
else:
|
||||
logger.info(
|
||||
f"Content changed for repository {repo_full_name}. Updating document."
|
||||
)
|
||||
# Delete existing document to replace with new one
|
||||
await session.delete(existing_document)
|
||||
await session.flush()
|
||||
|
||||
# Generate summary using LLM (ONE call per repository!)
|
||||
user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
|
||||
|
||||
document_metadata = {
|
||||
"repository": repo_full_name,
|
||||
"document_type": "GitHub Repository",
|
||||
"connector_type": "GitHub",
|
||||
"ingestion_method": "gitingest",
|
||||
"file_tree": digest.tree[:2000] if len(digest.tree) > 2000 else digest.tree,
|
||||
"estimated_tokens": digest.estimated_tokens,
|
||||
}
|
||||
|
||||
if user_llm:
|
||||
# Prepare content for summarization
|
||||
# Include tree structure and truncated content if too large
|
||||
summary_content = digest.full_digest
|
||||
if len(summary_content) > MAX_DIGEST_CHARS:
|
||||
# Truncate but keep the tree and beginning of content
|
||||
summary_content = (
|
||||
f"# Repository: {repo_full_name}\n\n"
|
||||
f"## File Structure\n\n{digest.tree}\n\n"
|
||||
f"## File Contents (truncated)\n\n{digest.content[:MAX_DIGEST_CHARS - len(digest.tree) - 200]}..."
|
||||
)
|
||||
|
||||
summary_text, summary_embedding = await generate_document_summary(
|
||||
summary_content, user_llm, document_metadata
|
||||
)
|
||||
else:
|
||||
# Fallback to simple summary if no LLM configured
|
||||
summary_text = (
|
||||
f"# GitHub Repository: {repo_full_name}\n\n"
|
||||
f"## Summary\n{digest.summary}\n\n"
|
||||
f"## File Structure\n{digest.tree[:3000]}"
|
||||
)
|
||||
summary_embedding = config.embedding_model_instance.embed(summary_text)
|
||||
|
||||
# Chunk the full digest content for granular search
|
||||
try:
|
||||
# Use the content (not the summary) for chunking
|
||||
# This preserves file-level granularity in search
|
||||
chunks_data = await create_document_chunks(digest.content)
|
||||
except Exception as chunk_err:
|
||||
logger.error(
|
||||
f"Failed to chunk repository {repo_full_name}: {chunk_err}"
|
||||
)
|
||||
# Fall back to a simpler chunking approach
|
||||
chunks_data = await _simple_chunk_content(digest.content)
|
||||
|
||||
# Create the document
|
||||
doc_metadata = {
|
||||
"repository_full_name": repo_full_name,
|
||||
"url": f"https://github.com/{repo_full_name}",
|
||||
"branch": digest.branch,
|
||||
"ingestion_method": "gitingest",
|
||||
"file_tree": digest.tree,
|
||||
"gitingest_summary": digest.summary,
|
||||
"estimated_tokens": digest.estimated_tokens,
|
||||
"indexed_at": datetime.now(UTC).isoformat(),
|
||||
}
|
||||
|
||||
document = Document(
|
||||
title=f"GitHub Repository: {repo_full_name}",
|
||||
document_type=DocumentType.GITHUB_CONNECTOR,
|
||||
document_metadata=doc_metadata,
|
||||
content=summary_text,
|
||||
content_hash=content_hash,
|
||||
unique_identifier_hash=unique_identifier_hash,
|
||||
embedding=summary_embedding,
|
||||
search_space_id=search_space_id,
|
||||
chunks=chunks_data,
|
||||
updated_at=get_current_timestamp(),
|
||||
)
|
||||
|
||||
session.add(document)
|
||||
documents_created += 1
|
||||
|
||||
logger.info(
|
||||
f"Created document for repository {repo_full_name} "
|
||||
f"with {len(chunks_data)} chunks"
|
||||
)
|
||||
|
||||
return documents_created
|
||||
|
||||
|
||||
async def _simple_chunk_content(content: str, chunk_size: int = 4000) -> list:
|
||||
"""
|
||||
Simple fallback chunking when the regular chunker fails.
|
||||
|
||||
Args:
|
||||
content: The content to chunk
|
||||
chunk_size: Size of each chunk in characters
|
||||
|
||||
Returns:
|
||||
List of chunk dictionaries with content and embedding
|
||||
"""
|
||||
from app.db import Chunk
|
||||
|
||||
chunks = []
|
||||
for i in range(0, len(content), chunk_size):
|
||||
chunk_text = content[i : i + chunk_size]
|
||||
if chunk_text.strip():
|
||||
chunks.append(
|
||||
Chunk(
|
||||
content=chunk_text,
|
||||
embedding=config.embedding_model_instance.embed(chunk_text),
|
||||
)
|
||||
)
|
||||
|
||||
return chunks
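A minimal sketch of driving the rewritten indexer above from an async context (the session, logger, and IDs are assumed to be provided by the surrounding task; the values shown are placeholders):

# Assumes an AsyncSession `session` and a `logger` are already in scope.
documents_indexed, error = await index_github_repos(
    session=session,
    connector_id=42,  # placeholder connector ID
    search_space_id=7,  # placeholder search space ID
    user_id="user-uuid",  # placeholder user ID
    update_last_indexed=True,
    # start_date / end_date are accepted but ignored: full repository snapshots are indexed.
)
if error:
    logger.warning(f"GitHub indexing finished with errors: {error}")
else:
    logger.info(f"GitHub indexing created {documents_indexed} documents")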
@@ -60,6 +60,7 @@ dependencies = [
"mcp>=1.25.0",
|
||||
"starlette>=0.40.0,<0.51.0",
|
||||
"sse-starlette>=3.1.1,<3.1.2",
|
||||
"gitingest>=0.3.1",
|
||||
]
|
||||
|
||||
[dependency-groups]
|
||||
|
|
|
|||
surfsense_backend/uv.lock (generated, 30 additions)
|
|
@@ -1945,6 +1945,25 @@ wheels = [
|
|||
{ url = "https://files.pythonhosted.org/packages/61/ad/2394d4fb542574678b0ba342daf734d4d811768da3c2ee0c84d509dcb26c/github3.py-4.0.1-py3-none-any.whl", hash = "sha256:a89af7de25650612d1da2f0609622bcdeb07ee8a45a1c06b2d16a05e4234e753", size = 151800, upload-time = "2023-04-26T17:56:25.015Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "gitingest"
|
||||
version = "0.3.1"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "click" },
|
||||
{ name = "httpx" },
|
||||
{ name = "loguru" },
|
||||
{ name = "pathspec" },
|
||||
{ name = "pydantic" },
|
||||
{ name = "python-dotenv" },
|
||||
{ name = "starlette" },
|
||||
{ name = "tiktoken" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/d6/fe/a915f0c32a3d7920206a677f73c185b3eadf4ec151fb05aedd52e64713f7/gitingest-0.3.1.tar.gz", hash = "sha256:4587cab873d4e08bdb16d612bb153c23e0ce59771a1d57a438239c5e39f05ebf", size = 70681, upload-time = "2025-07-31T13:56:19.845Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/00/15/f200ab2e73287e67d1dce6fbacf421552ae9fbafdc5f0cc8dd0d2fe4fc47/gitingest-0.3.1-py3-none-any.whl", hash = "sha256:8143a5e6a7140ede9f680e13d3931ac07c82ac9bd8bab9ad1fba017c8c1e8666", size = 68343, upload-time = "2025-07-31T13:56:17.729Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "google-api-core"
|
||||
version = "2.25.1"
|
||||
|
|
@@ -4460,6 +4479,15 @@ wheels = [
|
|||
{ url = "https://files.pythonhosted.org/packages/39/c2/646d2e93e0af70f4e5359d870a63584dacbc324b54d73e6b3267920ff117/pandas-2.3.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:bb3be958022198531eb7ec2008cfc78c5b1eed51af8600c6c5d9160d89d8d249", size = 13231847, upload-time = "2025-06-05T03:27:51.465Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pathspec"
|
||||
version = "1.0.3"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/4c/b2/bb8e495d5262bfec41ab5cb18f522f1012933347fb5d9e62452d446baca2/pathspec-1.0.3.tar.gz", hash = "sha256:bac5cf97ae2c2876e2d25ebb15078eb04d76e4b98921ee31c6f85ade8b59444d", size = 130841, upload-time = "2026-01-09T15:46:46.009Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/32/2b/121e912bd60eebd623f873fd090de0e84f322972ab25a7f9044c056804ed/pathspec-1.0.3-py3-none-any.whl", hash = "sha256:e80767021c1cc524aa3fb14bedda9c34406591343cc42797b386ce7b9354fb6c", size = 55021, upload-time = "2026-01-09T15:46:44.652Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pdf2image"
|
||||
version = "1.17.0"
|
||||
|
|
@@ -6484,6 +6512,7 @@ dependencies = [
|
|||
{ name = "firecrawl-py" },
|
||||
{ name = "flower" },
|
||||
{ name = "github3-py" },
|
||||
{ name = "gitingest" },
|
||||
{ name = "google-api-python-client" },
|
||||
{ name = "google-auth-oauthlib" },
|
||||
{ name = "kokoro" },
|
||||
|
|
@@ -6549,6 +6578,7 @@ requires-dist = [
|
|||
{ name = "firecrawl-py", specifier = ">=4.9.0" },
|
||||
{ name = "flower", specifier = ">=2.0.1" },
|
||||
{ name = "github3-py", specifier = "==4.0.1" },
|
||||
{ name = "gitingest", specifier = ">=0.3.1" },
|
||||
{ name = "google-api-python-client", specifier = ">=2.156.0" },
|
||||
{ name = "google-auth-oauthlib", specifier = ">=1.2.1" },
|
||||
{ name = "kokoro", specifier = ">=0.9.4" },
|
||||
|
|
|
|||
|
|
@@ -34,7 +34,6 @@ import {
|
|||
} from "@/components/ui/select";
|
||||
import { Switch } from "@/components/ui/switch";
|
||||
import { EnumConnectorName } from "@/contracts/enums/connector";
|
||||
import { DateRangeSelector } from "../../components/date-range-selector";
|
||||
import { getConnectorBenefits } from "../connector-benefits";
|
||||
import type { ConnectFormProps } from "../index";
|
||||
|
||||
|
|
@@ -44,12 +43,13 @@ const githubConnectorFormSchema = z.object({
|
|||
}),
|
||||
github_pat: z
|
||||
.string()
|
||||
.min(20, {
|
||||
message: "GitHub Personal Access Token seems too short.",
|
||||
})
|
||||
.refine((pat) => pat.startsWith("ghp_") || pat.startsWith("github_pat_"), {
|
||||
message: "GitHub PAT should start with 'ghp_' or 'github_pat_'",
|
||||
}),
|
||||
.optional()
|
||||
.refine(
|
||||
(pat) => !pat || pat.startsWith("ghp_") || pat.startsWith("github_pat_"),
|
||||
{
|
||||
message: "GitHub PAT should start with 'ghp_' or 'github_pat_'",
|
||||
}
|
||||
),
|
||||
repo_full_names: z.string().min(1, {
|
||||
message: "At least one repository is required.",
|
||||
}),
|
||||
|
|
@@ -59,8 +59,6 @@ type GithubConnectorFormValues = z.infer<typeof githubConnectorFormSchema>;
|
|||
|
||||
export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting }) => {
|
||||
const isSubmittingRef = useRef(false);
|
||||
const [startDate, setStartDate] = useState<Date | undefined>(undefined);
|
||||
const [endDate, setEndDate] = useState<Date | undefined>(undefined);
|
||||
const [periodicEnabled, setPeriodicEnabled] = useState(false);
|
||||
const [frequencyMinutes, setFrequencyMinutes] = useState("1440");
|
||||
const form = useForm<GithubConnectorFormValues>({
|
||||
|
|
@@ -94,7 +92,7 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
|
|||
name: values.name,
|
||||
connector_type: EnumConnectorName.GITHUB_CONNECTOR,
|
||||
config: {
|
||||
GITHUB_PAT: values.github_pat,
|
||||
GITHUB_PAT: values.github_pat || null, // Optional - only for private repos
|
||||
repo_full_names: repoList,
|
||||
},
|
||||
is_indexable: true,
|
||||
|
|
@@ -102,8 +100,9 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
|
|||
periodic_indexing_enabled: periodicEnabled,
|
||||
indexing_frequency_minutes: periodicEnabled ? parseInt(frequencyMinutes, 10) : null,
|
||||
next_scheduled_at: null,
|
||||
startDate,
|
||||
endDate,
|
||||
// GitHub indexes full repo snapshots - no date range needed
|
||||
startDate: undefined,
|
||||
endDate: undefined,
|
||||
periodicEnabled,
|
||||
frequencyMinutes,
|
||||
});
|
||||
|
|
@@ -117,10 +116,10 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
|
|||
<Alert className="bg-slate-400/5 dark:bg-white/5 border-slate-400/20 p-2 sm:p-3 flex items-center [&>svg]:relative [&>svg]:left-0 [&>svg]:top-0 [&>svg+div]:translate-y-0">
|
||||
<Info className="h-3 w-3 sm:h-4 sm:w-4 shrink-0 ml-1" />
|
||||
<div className="-ml-1">
|
||||
<AlertTitle className="text-xs sm:text-sm">Personal Access Token Required</AlertTitle>
|
||||
<AlertTitle className="text-xs sm:text-sm">Personal Access Token (Optional)</AlertTitle>
|
||||
<AlertDescription className="text-[10px] sm:text-xs !pl-0">
|
||||
You'll need a GitHub Personal Access Token to use this connector. You can create one
|
||||
from{" "}
|
||||
A GitHub PAT is only required for private repositories. Public repos work without a
|
||||
token. Create one from{" "}
|
||||
<a
|
||||
href="https://github.com/settings/tokens"
|
||||
target="_blank"
|
||||
|
|
@@ -128,7 +127,8 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
|
|||
className="font-medium underline underline-offset-4"
|
||||
>
|
||||
GitHub Settings
|
||||
</a>
|
||||
</a>{" "}
|
||||
if needed.
|
||||
</AlertDescription>
|
||||
</div>
|
||||
</Alert>
|
||||
|
|
@@ -167,7 +167,10 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
|
|||
name="github_pat"
|
||||
render={({ field }) => (
|
||||
<FormItem>
|
||||
<FormLabel className="text-xs sm:text-sm">GitHub Personal Access Token</FormLabel>
|
||||
<FormLabel className="text-xs sm:text-sm">
|
||||
GitHub Personal Access Token{" "}
|
||||
<span className="text-muted-foreground font-normal">(optional)</span>
|
||||
</FormLabel>
|
||||
<FormControl>
|
||||
<Input
|
||||
type="password"
|
||||
|
|
@@ -178,8 +181,8 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
|
|||
/>
|
||||
</FormControl>
|
||||
<FormDescription className="text-[10px] sm:text-xs">
|
||||
Your GitHub PAT will be encrypted and stored securely. It typically starts with
|
||||
"ghp_" or "github_pat_".
|
||||
Only required for private repositories. Leave empty if indexing public repos
|
||||
only.
|
||||
</FormDescription>
|
||||
<FormMessage />
|
||||
</FormItem>
|
||||
|
|
@@ -225,15 +228,9 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
|
|||
|
||||
{/* Indexing Configuration */}
|
||||
<div className="space-y-4 pt-4 border-t border-slate-400/20">
|
||||
<h3 className="text-sm sm:text-base font-medium">Indexing Configuration</h3>
|
||||
<h3 className="text-sm sm:text-base font-medium">Sync Configuration</h3>
|
||||
|
||||
{/* Date Range Selector */}
|
||||
<DateRangeSelector
|
||||
startDate={startDate}
|
||||
endDate={endDate}
|
||||
onStartDateChange={setStartDate}
|
||||
onEndDateChange={setEndDate}
|
||||
/>
|
||||
{/* Note: No date range for GitHub - it indexes full repo snapshots */}
|
||||
|
||||
{/* Periodic Sync Config */}
|
||||
<div className="rounded-xl bg-slate-400/5 dark:bg-white/5 p-3 sm:p-6">
|
||||
|
|
|
|||
|
|
@@ -490,8 +490,8 @@ export function SourceDetailPanel({
|
|||
>
|
||||
{idx + 1}
|
||||
{isCited && (
|
||||
<span className="absolute -top-1 -right-1 w-3 h-3 bg-primary rounded-full border-2 border-background">
|
||||
<Sparkles className="h-2 w-2 text-primary-foreground absolute top-0.5 left-0.5" />
|
||||
<span className="absolute -top-1.5 -right-1.5 flex items-center justify-center w-4 h-4 bg-primary rounded-full border-2 border-background shadow-sm">
|
||||
<Sparkles className="h-2.5 w-2.5 text-primary-foreground" />
|
||||
</span>
|
||||
)}
|
||||
</motion.button>
|