mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-03 12:52:39 +02:00
refactor: Update GitHub connector to use gitingest CLI
- Refactored GitHubConnector to utilize gitingest CLI via subprocess, improving performance and avoiding async issues with Celery. - Updated ingestion method to handle repository digests more efficiently, including error handling for subprocess execution. - Adjusted GitHub indexer to call the new synchronous ingestion method. - Clarified documentation regarding the optional nature of the Personal Access Token for public repositories.
This commit is contained in:
parent
49b8a46d10
commit
35888144eb
8 changed files with 221 additions and 256 deletions
|
|
@ -1,130 +1,21 @@
|
||||||
"""
|
"""
|
||||||
GitHub connector using gitingest for efficient repository digestion.
|
GitHub connector using gitingest CLI for efficient repository digestion.
|
||||||
|
|
||||||
This connector replaces the previous file-by-file approach with a single
|
This connector uses subprocess to call gitingest CLI, completely isolating
|
||||||
digest generation per repository, dramatically reducing LLM API calls.
|
it from any Python event loop/async complexity that can cause hangs in Celery.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
|
import os
|
||||||
|
import subprocess
|
||||||
|
import tempfile
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
|
|
||||||
from gitingest import ingest_async
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
# Maximum file size in bytes (5MB)
|
# Maximum file size in bytes (5MB)
|
||||||
MAX_FILE_SIZE = 5 * 1024 * 1024
|
MAX_FILE_SIZE = 5 * 1024 * 1024
|
||||||
|
|
||||||
# Default patterns to exclude (recommended approach for comprehensive analysis)
|
|
||||||
# Using only exclude_patterns ensures we don't miss any relevant file types
|
|
||||||
DEFAULT_EXCLUDE_PATTERNS = [
|
|
||||||
# Dependencies
|
|
||||||
"node_modules/*",
|
|
||||||
"vendor/*",
|
|
||||||
"bower_components/*",
|
|
||||||
".pnpm/*",
|
|
||||||
# Build artifacts / Caches
|
|
||||||
"build/*",
|
|
||||||
"dist/*",
|
|
||||||
"target/*",
|
|
||||||
"out/*",
|
|
||||||
"__pycache__/*",
|
|
||||||
"*.pyc",
|
|
||||||
".cache/*",
|
|
||||||
".next/*",
|
|
||||||
".nuxt/*",
|
|
||||||
# Virtual environments
|
|
||||||
"venv/*",
|
|
||||||
".venv/*",
|
|
||||||
"env/*",
|
|
||||||
".env/*",
|
|
||||||
# IDE/Editor config
|
|
||||||
".vscode/*",
|
|
||||||
".idea/*",
|
|
||||||
".project",
|
|
||||||
".settings/*",
|
|
||||||
"*.swp",
|
|
||||||
"*.swo",
|
|
||||||
# Version control
|
|
||||||
".git/*",
|
|
||||||
".svn/*",
|
|
||||||
".hg/*",
|
|
||||||
# Temporary / Logs
|
|
||||||
"tmp/*",
|
|
||||||
"temp/*",
|
|
||||||
"logs/*",
|
|
||||||
"*.log",
|
|
||||||
# Lock files (usually not needed for understanding code)
|
|
||||||
"package-lock.json",
|
|
||||||
"pnpm-lock.yaml",
|
|
||||||
"yarn.lock",
|
|
||||||
"uv.lock",
|
|
||||||
"Gemfile.lock",
|
|
||||||
"poetry.lock",
|
|
||||||
"Cargo.lock",
|
|
||||||
"composer.lock",
|
|
||||||
# Binary/media files
|
|
||||||
"*.png",
|
|
||||||
"*.jpg",
|
|
||||||
"*.jpeg",
|
|
||||||
"*.gif",
|
|
||||||
"*.ico",
|
|
||||||
"*.svg",
|
|
||||||
"*.webp",
|
|
||||||
"*.bmp",
|
|
||||||
"*.tiff",
|
|
||||||
"*.woff",
|
|
||||||
"*.woff2",
|
|
||||||
"*.ttf",
|
|
||||||
"*.eot",
|
|
||||||
"*.otf",
|
|
||||||
"*.mp3",
|
|
||||||
"*.mp4",
|
|
||||||
"*.wav",
|
|
||||||
"*.ogg",
|
|
||||||
"*.webm",
|
|
||||||
"*.avi",
|
|
||||||
"*.mov",
|
|
||||||
"*.pdf",
|
|
||||||
"*.doc",
|
|
||||||
"*.docx",
|
|
||||||
"*.xls",
|
|
||||||
"*.xlsx",
|
|
||||||
"*.ppt",
|
|
||||||
"*.pptx",
|
|
||||||
"*.zip",
|
|
||||||
"*.tar",
|
|
||||||
"*.tar.gz",
|
|
||||||
"*.tgz",
|
|
||||||
"*.rar",
|
|
||||||
"*.7z",
|
|
||||||
"*.exe",
|
|
||||||
"*.dll",
|
|
||||||
"*.so",
|
|
||||||
"*.dylib",
|
|
||||||
"*.bin",
|
|
||||||
"*.obj",
|
|
||||||
"*.o",
|
|
||||||
"*.a",
|
|
||||||
"*.lib",
|
|
||||||
# Minified files
|
|
||||||
"*.min.js",
|
|
||||||
"*.min.css",
|
|
||||||
# Source maps
|
|
||||||
"*.map",
|
|
||||||
# Database files
|
|
||||||
"*.db",
|
|
||||||
"*.sqlite",
|
|
||||||
"*.sqlite3",
|
|
||||||
# Coverage reports
|
|
||||||
"coverage/*",
|
|
||||||
".coverage",
|
|
||||||
"htmlcov/*",
|
|
||||||
".nyc_output/*",
|
|
||||||
# Test snapshots (can be large)
|
|
||||||
"__snapshots__/*",
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class RepositoryDigest:
|
class RepositoryDigest:
|
||||||
|
|
@ -149,21 +40,19 @@ class RepositoryDigest:
|
||||||
|
|
||||||
class GitHubConnector:
|
class GitHubConnector:
|
||||||
"""
|
"""
|
||||||
Connector for ingesting GitHub repositories using gitingest.
|
Connector for ingesting GitHub repositories using gitingest CLI.
|
||||||
|
|
||||||
This connector efficiently processes entire repositories into a single
|
Uses subprocess to run gitingest, which avoids all async/event loop
|
||||||
digest, reducing the number of API calls and LLM invocations compared
|
issues that can occur when mixing gitingest with Celery workers.
|
||||||
to file-by-file processing.
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, token: str | None = None):
|
def __init__(self, token: str | None = None):
|
||||||
"""
|
"""
|
||||||
Initializes the GitHub connector.
|
Initialize the GitHub connector.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
token: Optional GitHub Personal Access Token (PAT).
|
token: Optional GitHub Personal Access Token (PAT).
|
||||||
Only required for private repositories.
|
Only required for private repositories.
|
||||||
Public repositories can be ingested without a token.
|
|
||||||
"""
|
"""
|
||||||
self.token = token if token and token.strip() else None
|
self.token = token if token and token.strip() else None
|
||||||
if self.token:
|
if self.token:
|
||||||
|
|
@ -171,72 +60,104 @@ class GitHubConnector:
|
||||||
else:
|
else:
|
||||||
logger.info("GitHub connector initialized without token (public repos only).")
|
logger.info("GitHub connector initialized without token (public repos only).")
|
||||||
|
|
||||||
async def ingest_repository(
|
def ingest_repository(
|
||||||
self,
|
self,
|
||||||
repo_full_name: str,
|
repo_full_name: str,
|
||||||
branch: str | None = None,
|
branch: str | None = None,
|
||||||
include_patterns: list[str] | None = None,
|
|
||||||
exclude_patterns: list[str] | None = None,
|
|
||||||
max_file_size: int = MAX_FILE_SIZE,
|
max_file_size: int = MAX_FILE_SIZE,
|
||||||
) -> RepositoryDigest | None:
|
) -> RepositoryDigest | None:
|
||||||
"""
|
"""
|
||||||
Ingest an entire repository and return a digest.
|
Ingest a repository using gitingest CLI via subprocess.
|
||||||
|
|
||||||
|
This approach completely isolates gitingest from Python's event loop,
|
||||||
|
avoiding any async/Celery conflicts.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
repo_full_name: The full name of the repository (e.g., 'owner/repo').
|
repo_full_name: The full name of the repository (e.g., 'owner/repo').
|
||||||
branch: Optional specific branch or tag to ingest.
|
branch: Optional specific branch or tag to ingest.
|
||||||
include_patterns: Optional list of glob patterns for files to include.
|
max_file_size: Maximum file size in bytes to include.
|
||||||
If None, includes all files (recommended).
|
|
||||||
exclude_patterns: Optional list of glob patterns for files to exclude.
|
|
||||||
If None, uses DEFAULT_EXCLUDE_PATTERNS.
|
|
||||||
max_file_size: Maximum file size in bytes to include (default 5MB).
|
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
RepositoryDigest containing the summary, tree structure, and content,
|
RepositoryDigest or None if ingestion fails.
|
||||||
or None if ingestion fails.
|
|
||||||
"""
|
"""
|
||||||
repo_url = f"https://github.com/{repo_full_name}"
|
repo_url = f"https://github.com/{repo_full_name}"
|
||||||
|
|
||||||
# Use only exclude_patterns by default (recommended for comprehensive analysis)
|
logger.info(f"Starting gitingest CLI for repository: {repo_full_name}")
|
||||||
# This ensures we don't miss any relevant file types
|
|
||||||
exclude_pats = exclude_patterns if exclude_patterns is not None else DEFAULT_EXCLUDE_PATTERNS
|
|
||||||
|
|
||||||
logger.info(f"Starting gitingest for repository: {repo_full_name}")
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Build kwargs dynamically
|
# Create a temporary file for output
|
||||||
ingest_kwargs = {
|
with tempfile.NamedTemporaryFile(
|
||||||
"max_file_size": max_file_size,
|
mode="w", suffix=".txt", delete=False
|
||||||
"exclude_patterns": exclude_pats,
|
) as tmp_file:
|
||||||
"include_gitignored": False,
|
output_path = tmp_file.name
|
||||||
"include_submodules": False,
|
|
||||||
}
|
|
||||||
|
|
||||||
# Only add token if provided (required only for private repos)
|
# Build the gitingest CLI command
|
||||||
if self.token:
|
cmd = [
|
||||||
ingest_kwargs["token"] = self.token
|
"gitingest",
|
||||||
|
repo_url,
|
||||||
|
"--output", output_path,
|
||||||
|
"--max-size", str(max_file_size),
|
||||||
|
# Common exclude patterns
|
||||||
|
"-e", "node_modules/*",
|
||||||
|
"-e", "vendor/*",
|
||||||
|
"-e", ".git/*",
|
||||||
|
"-e", "__pycache__/*",
|
||||||
|
"-e", "dist/*",
|
||||||
|
"-e", "build/*",
|
||||||
|
"-e", "*.lock",
|
||||||
|
"-e", "package-lock.json",
|
||||||
|
]
|
||||||
|
|
||||||
# Only add branch if specified
|
# Add branch if specified
|
||||||
if branch:
|
if branch:
|
||||||
ingest_kwargs["branch"] = branch
|
cmd.extend(["--branch", branch])
|
||||||
|
|
||||||
# Only add include_patterns if explicitly provided
|
# Set up environment with token if provided
|
||||||
if include_patterns is not None:
|
env = os.environ.copy()
|
||||||
ingest_kwargs["include_patterns"] = include_patterns
|
if self.token:
|
||||||
|
env["GITHUB_TOKEN"] = self.token
|
||||||
|
|
||||||
summary, tree, content = await ingest_async(repo_url, **ingest_kwargs)
|
logger.info(f"Running gitingest CLI: {' '.join(cmd[:5])}...")
|
||||||
|
|
||||||
if not content or not content.strip():
|
# Run gitingest as subprocess with timeout
|
||||||
logger.warning(
|
result = subprocess.run(
|
||||||
f"No content retrieved from repository: {repo_full_name}"
|
cmd,
|
||||||
|
env=env,
|
||||||
|
capture_output=True,
|
||||||
|
text=True,
|
||||||
|
timeout=900, # 15 minute timeout (900 seconds)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if result.returncode != 0:
|
||||||
|
logger.error(f"gitingest failed: {result.stderr}")
|
||||||
|
# Clean up temp file
|
||||||
|
if os.path.exists(output_path):
|
||||||
|
os.unlink(output_path)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
# Read the output file
|
||||||
|
if not os.path.exists(output_path):
|
||||||
|
logger.error("gitingest did not create output file")
|
||||||
|
return None
|
||||||
|
|
||||||
|
with open(output_path, encoding="utf-8") as f:
|
||||||
|
full_content = f.read()
|
||||||
|
|
||||||
|
# Clean up temp file
|
||||||
|
os.unlink(output_path)
|
||||||
|
|
||||||
|
if not full_content or not full_content.strip():
|
||||||
|
logger.warning(f"No content retrieved from repository: {repo_full_name}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Parse the gitingest output
|
||||||
|
# The output format is: summary + tree + content
|
||||||
|
# It is not split apart here; the whole file is kept as-is in `content`
|
||||||
digest = RepositoryDigest(
|
digest = RepositoryDigest(
|
||||||
repo_full_name=repo_full_name,
|
repo_full_name=repo_full_name,
|
||||||
summary=summary,
|
summary=f"Repository: {repo_full_name}",
|
||||||
tree=tree,
|
tree="", # gitingest CLI combines everything into one file
|
||||||
content=content,
|
content=full_content,
|
||||||
branch=branch,
|
branch=branch,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -246,50 +167,70 @@ class GitHubConnector:
|
||||||
)
|
)
|
||||||
return digest
|
return digest
|
||||||
|
|
||||||
|
except subprocess.TimeoutExpired:
|
||||||
|
logger.error(f"gitingest timed out for repository: {repo_full_name}")
|
||||||
|
return None
|
||||||
|
except FileNotFoundError:
|
||||||
|
logger.error(
|
||||||
|
"gitingest CLI not found. Falling back to Python library."
|
||||||
|
)
|
||||||
|
# Fall back to Python library
|
||||||
|
return self._ingest_with_python_library(repo_full_name, branch, max_file_size)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Failed to ingest repository {repo_full_name}: {e}")
|
logger.error(f"Failed to ingest repository {repo_full_name}: {e}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
async def ingest_repositories(
|
def _ingest_with_python_library(
|
||||||
self,
|
self,
|
||||||
repo_full_names: list[str],
|
repo_full_name: str,
|
||||||
branch: str | None = None,
|
branch: str | None = None,
|
||||||
include_patterns: list[str] | None = None,
|
|
||||||
exclude_patterns: list[str] | None = None,
|
|
||||||
max_file_size: int = MAX_FILE_SIZE,
|
max_file_size: int = MAX_FILE_SIZE,
|
||||||
) -> list[RepositoryDigest]:
|
) -> RepositoryDigest | None:
|
||||||
"""
|
"""
|
||||||
Ingest multiple repositories and return their digests.
|
Fallback: Ingest using the Python library directly.
|
||||||
|
|
||||||
Args:
|
|
||||||
repo_full_names: List of repository full names (e.g., ['owner/repo1', 'owner/repo2']).
|
|
||||||
branch: Optional specific branch or tag to ingest (applied to all repos).
|
|
||||||
include_patterns: Optional list of glob patterns for files to include.
|
|
||||||
exclude_patterns: Optional list of glob patterns for files to exclude.
|
|
||||||
max_file_size: Maximum file size in bytes to include.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
List of RepositoryDigest objects for successfully ingested repositories.
|
|
||||||
"""
|
"""
|
||||||
digests = []
|
from gitingest import ingest
|
||||||
|
|
||||||
for repo_full_name in repo_full_names:
|
repo_url = f"https://github.com/{repo_full_name}"
|
||||||
if not repo_full_name or not isinstance(repo_full_name, str):
|
|
||||||
logger.warning(f"Skipping invalid repository entry: {repo_full_name}")
|
|
||||||
continue
|
|
||||||
|
|
||||||
digest = await self.ingest_repository(
|
logger.info(f"Using Python gitingest library for: {repo_full_name}")
|
||||||
|
|
||||||
|
try:
|
||||||
|
kwargs = {
|
||||||
|
"max_file_size": max_file_size,
|
||||||
|
"exclude_patterns": [
|
||||||
|
"node_modules/*",
|
||||||
|
"vendor/*",
|
||||||
|
".git/*",
|
||||||
|
"__pycache__/*",
|
||||||
|
"dist/*",
|
||||||
|
"build/*",
|
||||||
|
"*.lock",
|
||||||
|
"package-lock.json",
|
||||||
|
],
|
||||||
|
"include_gitignored": False,
|
||||||
|
"include_submodules": False,
|
||||||
|
}
|
||||||
|
|
||||||
|
if self.token:
|
||||||
|
kwargs["token"] = self.token
|
||||||
|
if branch:
|
||||||
|
kwargs["branch"] = branch
|
||||||
|
|
||||||
|
summary, tree, content = ingest(repo_url, **kwargs)
|
||||||
|
|
||||||
|
if not content or not content.strip():
|
||||||
|
logger.warning(f"No content from {repo_full_name}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
return RepositoryDigest(
|
||||||
repo_full_name=repo_full_name,
|
repo_full_name=repo_full_name,
|
||||||
|
summary=summary,
|
||||||
|
tree=tree,
|
||||||
|
content=content,
|
||||||
branch=branch,
|
branch=branch,
|
||||||
include_patterns=include_patterns,
|
|
||||||
exclude_patterns=exclude_patterns,
|
|
||||||
max_file_size=max_file_size,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
if digest:
|
except Exception as e:
|
||||||
digests.append(digest)
|
logger.error(f"Python library failed for {repo_full_name}: {e}")
|
||||||
|
return None
|
||||||
logger.info(
|
|
||||||
f"Ingested {len(digests)} out of {len(repo_full_names)} repositories."
|
|
||||||
)
|
|
||||||
return digests
|
|
||||||
|
|
|
||||||
|
|
@ -173,8 +173,13 @@ async def index_github_repos(
|
||||||
logger.info(f"Ingesting repository: {repo_full_name}")
|
logger.info(f"Ingesting repository: {repo_full_name}")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Ingest the entire repository
|
# Run gitingest via subprocess (isolated from event loop)
|
||||||
digest = await github_client.ingest_repository(repo_full_name)
|
# Using to_thread to not block the async database operations
|
||||||
|
import asyncio
|
||||||
|
|
||||||
|
digest = await asyncio.to_thread(
|
||||||
|
github_client.ingest_repository, repo_full_name
|
||||||
|
)
|
||||||
|
|
||||||
if not digest:
|
if not digest:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
|
|
|
||||||
|
|
@ -530,7 +530,10 @@ def validate_connector_config(
|
||||||
# "validators": {},
|
# "validators": {},
|
||||||
# },
|
# },
|
||||||
"GITHUB_CONNECTOR": {
|
"GITHUB_CONNECTOR": {
|
||||||
"required": ["GITHUB_PAT", "repo_full_names"],
|
# GITHUB_PAT is optional - only required for private repositories
|
||||||
|
# Public repositories can be indexed without authentication
|
||||||
|
"required": ["repo_full_names"],
|
||||||
|
"optional": ["GITHUB_PAT"], # Optional - only needed for private repos
|
||||||
"validators": {
|
"validators": {
|
||||||
"repo_full_names": lambda: validate_list_field(
|
"repo_full_names": lambda: validate_list_field(
|
||||||
"repo_full_names", "repo_full_names"
|
"repo_full_names", "repo_full_names"
|
||||||
|
|
|
||||||
|
|
@ -24,11 +24,6 @@
|
||||||
"enabled": true,
|
"enabled": true,
|
||||||
"status": "warning",
|
"status": "warning",
|
||||||
"statusMessage": "Some requests may be blocked if not using Firecrawl."
|
"statusMessage": "Some requests may be blocked if not using Firecrawl."
|
||||||
},
|
|
||||||
"GITHUB_CONNECTOR": {
|
|
||||||
"enabled": false,
|
|
||||||
"status": "maintenance",
|
|
||||||
"statusMessage": "Rework in progress."
|
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"globalSettings": {
|
"globalSettings": {
|
||||||
|
|
|
||||||
|
|
@ -96,6 +96,7 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
|
||||||
repo_full_names: repoList,
|
repo_full_names: repoList,
|
||||||
},
|
},
|
||||||
is_indexable: true,
|
is_indexable: true,
|
||||||
|
is_active: true,
|
||||||
last_indexed_at: null,
|
last_indexed_at: null,
|
||||||
periodic_indexing_enabled: periodicEnabled,
|
periodic_indexing_enabled: periodicEnabled,
|
||||||
indexing_frequency_minutes: periodicEnabled ? parseInt(frequencyMinutes, 10) : null,
|
indexing_frequency_minutes: periodicEnabled ? parseInt(frequencyMinutes, 10) : null,
|
||||||
|
|
@ -119,16 +120,16 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
|
||||||
<AlertTitle className="text-xs sm:text-sm">Personal Access Token (Optional)</AlertTitle>
|
<AlertTitle className="text-xs sm:text-sm">Personal Access Token (Optional)</AlertTitle>
|
||||||
<AlertDescription className="text-[10px] sm:text-xs !pl-0">
|
<AlertDescription className="text-[10px] sm:text-xs !pl-0">
|
||||||
A GitHub PAT is only required for private repositories. Public repos work without a
|
A GitHub PAT is only required for private repositories. Public repos work without a
|
||||||
token. Create one from{" "}
|
token. {" "}
|
||||||
<a
|
<a
|
||||||
href="https://github.com/settings/tokens"
|
href="https://github.com/settings/tokens/new?description=surfsense&scopes=repo"
|
||||||
target="_blank"
|
target="_blank"
|
||||||
rel="noopener noreferrer"
|
rel="noopener noreferrer"
|
||||||
className="font-medium underline underline-offset-4"
|
className="font-medium underline underline-offset-4"
|
||||||
>
|
>
|
||||||
GitHub Settings
|
Get your token
|
||||||
</a>{" "}
|
</a>{" "}
|
||||||
if needed.
|
.
|
||||||
</AlertDescription>
|
</AlertDescription>
|
||||||
</div>
|
</div>
|
||||||
</Alert>
|
</Alert>
|
||||||
|
|
@ -324,20 +325,21 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
|
||||||
<div>
|
<div>
|
||||||
<h3 className="text-sm sm:text-base font-semibold mb-2">How it works</h3>
|
<h3 className="text-sm sm:text-base font-semibold mb-2">How it works</h3>
|
||||||
<p className="text-[10px] sm:text-xs text-muted-foreground">
|
<p className="text-[10px] sm:text-xs text-muted-foreground">
|
||||||
The GitHub connector uses a Personal Access Token (PAT) to authenticate with the
|
The GitHub connector ingests entire repositories in one pass using gitingest,
|
||||||
GitHub API. You provide a comma-separated list of repository full names (e.g.,
|
making it highly efficient. Provide a comma-separated list of repository full
|
||||||
"owner/repo1, owner/repo2") that you want to index. The connector indexes relevant
|
names (e.g., "owner/repo1, owner/repo2") to index.
|
||||||
files (code, markdown, text) from the selected repositories.
|
|
||||||
</p>
|
</p>
|
||||||
<ul className="mt-2 list-disc pl-5 text-[10px] sm:text-xs text-muted-foreground space-y-1">
|
<ul className="mt-2 list-disc pl-5 text-[10px] sm:text-xs text-muted-foreground space-y-1">
|
||||||
<li>
|
<li>
|
||||||
The connector indexes files based on common code and documentation extensions.
|
<strong>Public repos:</strong> No authentication required.
|
||||||
</li>
|
</li>
|
||||||
<li>Large files (over 1MB) are skipped during indexing.</li>
|
|
||||||
<li>Only specified repositories are indexed.</li>
|
|
||||||
<li>
|
<li>
|
||||||
Indexing runs periodically (check connector settings for frequency) to keep
|
<strong>Private repos:</strong> Requires a GitHub Personal Access Token (PAT).
|
||||||
content up-to-date.
|
</li>
|
||||||
|
<li>Indexes code, documentation, and configuration files.</li>
|
||||||
|
<li>Large files (over 5MB) and binary files are automatically skipped.</li>
|
||||||
|
<li>
|
||||||
|
Periodic sync detects changes and only re-indexes when content has changed.
|
||||||
</li>
|
</li>
|
||||||
</ul>
|
</ul>
|
||||||
</div>
|
</div>
|
||||||
|
|
@ -348,19 +350,23 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
|
||||||
<Alert className="bg-slate-400/5 dark:bg-white/5 border-slate-400/20 mb-4">
|
<Alert className="bg-slate-400/5 dark:bg-white/5 border-slate-400/20 mb-4">
|
||||||
<Info className="h-3 w-3 sm:h-4 sm:w-4" />
|
<Info className="h-3 w-3 sm:h-4 sm:w-4" />
|
||||||
<AlertTitle className="text-[10px] sm:text-xs">
|
<AlertTitle className="text-[10px] sm:text-xs">
|
||||||
Personal Access Token Required
|
Personal Access Token (Optional)
|
||||||
</AlertTitle>
|
</AlertTitle>
|
||||||
<AlertDescription className="text-[9px] sm:text-[10px]">
|
<AlertDescription className="text-[9px] sm:text-[10px]">
|
||||||
You'll need a GitHub PAT with the appropriate scopes (e.g., 'repo') to fetch
|
A GitHub PAT is only needed for <strong>private repositories</strong>. Public
|
||||||
repositories. The PAT will be stored securely to enable indexing.
|
repos can be indexed without authentication. If you need to access private
|
||||||
|
repos, create a PAT with the 'repo' scope.
|
||||||
</AlertDescription>
|
</AlertDescription>
|
||||||
</Alert>
|
</Alert>
|
||||||
|
|
||||||
<div className="space-y-4 sm:space-y-6">
|
<div className="space-y-4 sm:space-y-6">
|
||||||
<div>
|
<div>
|
||||||
<h4 className="text-[10px] sm:text-xs font-medium mb-2">
|
<h4 className="text-[10px] sm:text-xs font-medium mb-2">
|
||||||
Step 1: Generate GitHub PAT
|
For Private Repositories Only: Generate GitHub PAT
|
||||||
</h4>
|
</h4>
|
||||||
|
<p className="text-[10px] sm:text-xs text-muted-foreground mb-2">
|
||||||
|
Skip this step if you're only indexing public repositories.
|
||||||
|
</p>
|
||||||
<ol className="list-decimal pl-5 space-y-2 text-[10px] sm:text-xs text-muted-foreground">
|
<ol className="list-decimal pl-5 space-y-2 text-[10px] sm:text-xs text-muted-foreground">
|
||||||
<li>
|
<li>
|
||||||
Go to your GitHub{" "}
|
Go to your GitHub{" "}
|
||||||
|
|
@ -375,46 +381,36 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
|
||||||
</li>
|
</li>
|
||||||
<li>
|
<li>
|
||||||
Click on <strong>Personal access tokens</strong>, then choose{" "}
|
Click on <strong>Personal access tokens</strong>, then choose{" "}
|
||||||
<strong>Tokens (classic)</strong> or <strong>Fine-grained tokens</strong>{" "}
|
<strong>Tokens (classic)</strong> or <strong>Fine-grained tokens</strong>.
|
||||||
(recommended if available).
|
|
||||||
</li>
|
</li>
|
||||||
<li>
|
<li>
|
||||||
Click <strong>Generate new token</strong> (and choose the appropriate type).
|
Click <strong>Generate new token</strong>.
|
||||||
</li>
|
</li>
|
||||||
<li>Give your token a descriptive name (e.g., "SurfSense Connector").</li>
|
<li>Give your token a descriptive name (e.g., "SurfSense Connector").</li>
|
||||||
<li>Set an expiration date for the token (recommended for security).</li>
|
|
||||||
<li>
|
<li>
|
||||||
Under <strong>Select scopes</strong> (for classic tokens) or{" "}
|
Grant the <strong>`repo`</strong> scope (for classic tokens) or read access
|
||||||
<strong>Repository access</strong> (for fine-grained), grant the necessary
|
to the specific repositories you want to index (for fine-grained tokens).
|
||||||
permissions. At minimum, the <strong>`repo`</strong> scope (or equivalent
|
|
||||||
read access to repositories for fine-grained tokens) is required to read
|
|
||||||
repository content.
|
|
||||||
</li>
|
</li>
|
||||||
<li>
|
<li>
|
||||||
Click <strong>Generate token</strong>.
|
Click <strong>Generate token</strong> and copy it immediately.
|
||||||
</li>
|
|
||||||
<li>
|
|
||||||
<strong>Important:</strong> Copy your new PAT immediately. You won't be able
|
|
||||||
to see it again after leaving the page.
|
|
||||||
</li>
|
</li>
|
||||||
</ol>
|
</ol>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<div>
|
<div>
|
||||||
<h4 className="text-[10px] sm:text-xs font-medium mb-2">
|
<h4 className="text-[10px] sm:text-xs font-medium mb-2">
|
||||||
Step 2: Specify repositories
|
Specify Repositories
|
||||||
</h4>
|
</h4>
|
||||||
<p className="text-[10px] sm:text-xs text-muted-foreground mb-3">
|
<p className="text-[10px] sm:text-xs text-muted-foreground mb-3">
|
||||||
Enter a comma-separated list of repository full names in the format
|
Enter a comma-separated list of repository full names in the format
|
||||||
"owner/repo1, owner/repo2". The connector will index files from only the
|
"owner/repo1, owner/repo2". For example: "facebook/react, vercel/next.js".
|
||||||
specified repositories.
|
|
||||||
</p>
|
</p>
|
||||||
<Alert className="bg-slate-400/5 dark:bg-white/5 border-slate-400/20">
|
<Alert className="bg-slate-400/5 dark:bg-white/5 border-slate-400/20">
|
||||||
<Info className="h-3 w-3 sm:h-4 sm:w-4" />
|
<Info className="h-3 w-3 sm:h-4 sm:w-4" />
|
||||||
<AlertTitle className="text-[10px] sm:text-xs">Repository Access</AlertTitle>
|
<AlertTitle className="text-[10px] sm:text-xs">Public vs Private</AlertTitle>
|
||||||
<AlertDescription className="text-[9px] sm:text-[10px]">
|
<AlertDescription className="text-[9px] sm:text-[10px]">
|
||||||
Make sure your PAT has access to all repositories you want to index. Private
|
Public repositories work without a PAT. For private repositories, ensure
|
||||||
repositories require appropriate permissions.
|
your PAT has access to the repos you want to index.
|
||||||
</AlertDescription>
|
</AlertDescription>
|
||||||
</Alert>
|
</Alert>
|
||||||
</div>
|
</div>
|
||||||
|
|
@ -424,36 +420,38 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
|
||||||
|
|
||||||
<div className="space-y-4">
|
<div className="space-y-4">
|
||||||
<div>
|
<div>
|
||||||
<h3 className="text-sm sm:text-base font-semibold mb-2">Indexing</h3>
|
<h3 className="text-sm sm:text-base font-semibold mb-2">Quick Start</h3>
|
||||||
<ol className="list-decimal pl-5 space-y-2 text-[10px] sm:text-xs text-muted-foreground mb-4">
|
<ol className="list-decimal pl-5 space-y-2 text-[10px] sm:text-xs text-muted-foreground mb-4">
|
||||||
<li>
|
<li>
|
||||||
Navigate to the Connector Dashboard and select the <strong>GitHub</strong>{" "}
|
Enter the <strong>Repository Names</strong> you want to index (e.g.,
|
||||||
Connector.
|
"facebook/react, vercel/next.js").
|
||||||
</li>
|
</li>
|
||||||
<li>
|
<li>
|
||||||
Enter your <strong>GitHub Personal Access Token</strong> in the form field.
|
<strong>(Optional)</strong> Add a GitHub PAT if indexing private repositories.
|
||||||
</li>
|
</li>
|
||||||
<li>
|
<li>
|
||||||
Enter a comma-separated list of <strong>Repository Names</strong> (e.g.,
|
Click <strong>Connect GitHub</strong> to start indexing.
|
||||||
"owner/repo1, owner/repo2").
|
|
||||||
</li>
|
</li>
|
||||||
<li>
|
<li>
|
||||||
Click <strong>Connect</strong> to establish the connection.
|
Enable <strong>Periodic Sync</strong> to automatically detect and index
|
||||||
|
changes.
|
||||||
</li>
|
</li>
|
||||||
<li>Once connected, your GitHub repositories will be indexed automatically.</li>
|
|
||||||
</ol>
|
</ol>
|
||||||
|
|
||||||
<Alert className="bg-slate-400/5 dark:bg-white/5 border-slate-400/20">
|
<Alert className="bg-slate-400/5 dark:bg-white/5 border-slate-400/20">
|
||||||
<Info className="h-3 w-3 sm:h-4 sm:w-4" />
|
<Info className="h-3 w-3 sm:h-4 sm:w-4" />
|
||||||
<AlertTitle className="text-[10px] sm:text-xs">What Gets Indexed</AlertTitle>
|
<AlertTitle className="text-[10px] sm:text-xs">What Gets Indexed</AlertTitle>
|
||||||
<AlertDescription className="text-[9px] sm:text-[10px]">
|
<AlertDescription className="text-[9px] sm:text-[10px]">
|
||||||
<p className="mb-2">The GitHub connector indexes the following data:</p>
|
<p className="mb-2">The GitHub connector indexes:</p>
|
||||||
<ul className="list-disc pl-5 space-y-1">
|
<ul className="list-disc pl-5 space-y-1">
|
||||||
<li>Code files from selected repositories</li>
|
<li>All code files (Python, JavaScript, TypeScript, etc.)</li>
|
||||||
<li>README files and Markdown documentation</li>
|
<li>Documentation (README, Markdown, text files)</li>
|
||||||
<li>Common text-based file formats</li>
|
<li>Configuration files (JSON, YAML, TOML, etc.)</li>
|
||||||
<li>Repository metadata and structure</li>
|
<li>Repository structure and file tree</li>
|
||||||
</ul>
|
</ul>
|
||||||
|
<p className="mt-2">
|
||||||
|
Binary files, images, and build artifacts are automatically excluded.
|
||||||
|
</p>
|
||||||
</AlertDescription>
|
</AlertDescription>
|
||||||
</Alert>
|
</Alert>
|
||||||
</div>
|
</div>
|
||||||
|
|
|
||||||
|
|
@ -1,8 +1,9 @@
|
||||||
"use client";
|
"use client";
|
||||||
|
|
||||||
import { KeyRound } from "lucide-react";
|
import { Info, KeyRound } from "lucide-react";
|
||||||
import type { FC } from "react";
|
import type { FC } from "react";
|
||||||
import { useEffect, useState } from "react";
|
import { useEffect, useState } from "react";
|
||||||
|
import { Alert, AlertDescription, AlertTitle } from "@/components/ui/alert";
|
||||||
import { Badge } from "@/components/ui/badge";
|
import { Badge } from "@/components/ui/badge";
|
||||||
import { Input } from "@/components/ui/input";
|
import { Input } from "@/components/ui/input";
|
||||||
import { Label } from "@/components/ui/label";
|
import { Label } from "@/components/ui/label";
|
||||||
|
|
@ -79,6 +80,26 @@ export const GithubConfig: FC<GithubConfigProps> = ({
|
||||||
|
|
||||||
return (
|
return (
|
||||||
<div className="space-y-6">
|
<div className="space-y-6">
|
||||||
|
<Alert className="bg-slate-400/5 dark:bg-white/5 border-slate-400/20 p-2 sm:p-3 flex items-center [&>svg]:relative [&>svg]:left-0 [&>svg]:top-0 [&>svg+div]:translate-y-0">
|
||||||
|
<Info className="h-3 w-3 sm:h-4 sm:w-4 shrink-0 ml-1" />
|
||||||
|
<div className="-ml-1">
|
||||||
|
<AlertTitle className="text-xs sm:text-sm">Personal Access Token (Optional)</AlertTitle>
|
||||||
|
<AlertDescription className="text-[10px] sm:text-xs !pl-0">
|
||||||
|
A GitHub PAT is only required for private repositories. Public repos work without a
|
||||||
|
token. Create one from{" "}
|
||||||
|
<a
|
||||||
|
href="https://github.com/settings/tokens/new?description=surfsense&scopes=repo"
|
||||||
|
target="_blank"
|
||||||
|
rel="noopener noreferrer"
|
||||||
|
className="font-medium underline underline-offset-4"
|
||||||
|
>
|
||||||
|
GitHub Settings
|
||||||
|
</a>{" "}
|
||||||
|
if needed.
|
||||||
|
</AlertDescription>
|
||||||
|
</div>
|
||||||
|
</Alert>
|
||||||
|
|
||||||
{/* Connector Name */}
|
{/* Connector Name */}
|
||||||
<div className="rounded-xl border border-border bg-slate-400/5 dark:bg-white/5 p-3 sm:p-6 space-y-3 sm:space-y-4">
|
<div className="rounded-xl border border-border bg-slate-400/5 dark:bg-white/5 p-3 sm:p-6 space-y-3 sm:space-y-4">
|
||||||
<div className="space-y-2">
|
<div className="space-y-2">
|
||||||
|
|
@ -105,7 +126,7 @@ export const GithubConfig: FC<GithubConfigProps> = ({
|
||||||
<div className="space-y-2">
|
<div className="space-y-2">
|
||||||
<Label className="flex items-center gap-2 text-xs sm:text-sm">
|
<Label className="flex items-center gap-2 text-xs sm:text-sm">
|
||||||
<KeyRound className="h-4 w-4" />
|
<KeyRound className="h-4 w-4" />
|
||||||
GitHub Personal Access Token
|
GitHub Personal Access Token (optional)
|
||||||
</Label>
|
</Label>
|
||||||
<Input
|
<Input
|
||||||
type="password"
|
type="password"
|
||||||
|
|
|
||||||
|
|
@ -206,9 +206,10 @@ export const ConnectorEditView: FC<ConnectorEditViewProps> = ({
|
||||||
{/* Date range selector and periodic sync - only shown for indexable connectors */}
|
{/* Date range selector and periodic sync - only shown for indexable connectors */}
|
||||||
{connector.is_indexable && (
|
{connector.is_indexable && (
|
||||||
<>
|
<>
|
||||||
{/* Date range selector - not shown for Google Drive (uses folder selection) or Webcrawler (uses config) */}
|
{/* Date range selector - not shown for Google Drive, Webcrawler, or GitHub (indexes full repo snapshots) */}
|
||||||
{connector.connector_type !== "GOOGLE_DRIVE_CONNECTOR" &&
|
{connector.connector_type !== "GOOGLE_DRIVE_CONNECTOR" &&
|
||||||
connector.connector_type !== "WEBCRAWLER_CONNECTOR" && (
|
connector.connector_type !== "WEBCRAWLER_CONNECTOR" &&
|
||||||
|
connector.connector_type !== "GITHUB_CONNECTOR" && (
|
||||||
<DateRangeSelector
|
<DateRangeSelector
|
||||||
startDate={startDate}
|
startDate={startDate}
|
||||||
endDate={endDate}
|
endDate={endDate}
|
||||||
|
|
|
||||||
|
|
@ -151,9 +151,10 @@ export const IndexingConfigurationView: FC<IndexingConfigurationViewProps> = ({
|
||||||
{/* Date range selector and periodic sync - only shown for indexable connectors */}
|
{/* Date range selector and periodic sync - only shown for indexable connectors */}
|
||||||
{connector?.is_indexable && (
|
{connector?.is_indexable && (
|
||||||
<>
|
<>
|
||||||
{/* Date range selector - not shown for Google Drive (uses folder selection) or Webcrawler (uses config) */}
|
{/* Date range selector - not shown for Google Drive, Webcrawler, or GitHub (indexes full repo snapshots) */}
|
||||||
{config.connectorType !== "GOOGLE_DRIVE_CONNECTOR" &&
|
{config.connectorType !== "GOOGLE_DRIVE_CONNECTOR" &&
|
||||||
config.connectorType !== "WEBCRAWLER_CONNECTOR" && (
|
config.connectorType !== "WEBCRAWLER_CONNECTOR" &&
|
||||||
|
config.connectorType !== "GITHUB_CONNECTOR" && (
|
||||||
<DateRangeSelector
|
<DateRangeSelector
|
||||||
startDate={startDate}
|
startDate={startDate}
|
||||||
endDate={endDate}
|
endDate={endDate}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue