mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-04-25 00:36:31 +02:00
refactor: Update GitHub connector to use gitingest CLI
- Refactored GitHubConnector to utilize gitingest CLI via subprocess, improving performance and avoiding async issues with Celery.
- Updated ingestion method to handle repository digests more efficiently, including error handling for subprocess execution.
- Adjusted GitHub indexer to call the new synchronous ingestion method.
- Clarified documentation regarding the optional nature of the Personal Access Token for public repositories.
This commit is contained in:
parent
49b8a46d10
commit
35888144eb
8 changed files with 221 additions and 256 deletions
|
|
@ -1,130 +1,21 @@
|
|||
"""
|
||||
GitHub connector using gitingest for efficient repository digestion.
|
||||
GitHub connector using gitingest CLI for efficient repository digestion.
|
||||
|
||||
This connector replaces the previous file-by-file approach with a single
|
||||
digest generation per repository, dramatically reducing LLM API calls.
|
||||
This connector uses subprocess to call gitingest CLI, completely isolating
|
||||
it from any Python event loop/async complexity that can cause hangs in Celery.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import subprocess
|
||||
import tempfile
|
||||
from dataclasses import dataclass
|
||||
|
||||
from gitingest import ingest_async
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Maximum file size in bytes (5MB)
|
||||
MAX_FILE_SIZE = 5 * 1024 * 1024
|
||||
|
||||
# Default patterns to exclude (recommended approach for comprehensive analysis)
|
||||
# Using only exclude_patterns ensures we don't miss any relevant file types
|
||||
DEFAULT_EXCLUDE_PATTERNS = [
|
||||
# Dependencies
|
||||
"node_modules/*",
|
||||
"vendor/*",
|
||||
"bower_components/*",
|
||||
".pnpm/*",
|
||||
# Build artifacts / Caches
|
||||
"build/*",
|
||||
"dist/*",
|
||||
"target/*",
|
||||
"out/*",
|
||||
"__pycache__/*",
|
||||
"*.pyc",
|
||||
".cache/*",
|
||||
".next/*",
|
||||
".nuxt/*",
|
||||
# Virtual environments
|
||||
"venv/*",
|
||||
".venv/*",
|
||||
"env/*",
|
||||
".env/*",
|
||||
# IDE/Editor config
|
||||
".vscode/*",
|
||||
".idea/*",
|
||||
".project",
|
||||
".settings/*",
|
||||
"*.swp",
|
||||
"*.swo",
|
||||
# Version control
|
||||
".git/*",
|
||||
".svn/*",
|
||||
".hg/*",
|
||||
# Temporary / Logs
|
||||
"tmp/*",
|
||||
"temp/*",
|
||||
"logs/*",
|
||||
"*.log",
|
||||
# Lock files (usually not needed for understanding code)
|
||||
"package-lock.json",
|
||||
"pnpm-lock.yaml",
|
||||
"yarn.lock",
|
||||
"uv.lock",
|
||||
"Gemfile.lock",
|
||||
"poetry.lock",
|
||||
"Cargo.lock",
|
||||
"composer.lock",
|
||||
# Binary/media files
|
||||
"*.png",
|
||||
"*.jpg",
|
||||
"*.jpeg",
|
||||
"*.gif",
|
||||
"*.ico",
|
||||
"*.svg",
|
||||
"*.webp",
|
||||
"*.bmp",
|
||||
"*.tiff",
|
||||
"*.woff",
|
||||
"*.woff2",
|
||||
"*.ttf",
|
||||
"*.eot",
|
||||
"*.otf",
|
||||
"*.mp3",
|
||||
"*.mp4",
|
||||
"*.wav",
|
||||
"*.ogg",
|
||||
"*.webm",
|
||||
"*.avi",
|
||||
"*.mov",
|
||||
"*.pdf",
|
||||
"*.doc",
|
||||
"*.docx",
|
||||
"*.xls",
|
||||
"*.xlsx",
|
||||
"*.ppt",
|
||||
"*.pptx",
|
||||
"*.zip",
|
||||
"*.tar",
|
||||
"*.tar.gz",
|
||||
"*.tgz",
|
||||
"*.rar",
|
||||
"*.7z",
|
||||
"*.exe",
|
||||
"*.dll",
|
||||
"*.so",
|
||||
"*.dylib",
|
||||
"*.bin",
|
||||
"*.obj",
|
||||
"*.o",
|
||||
"*.a",
|
||||
"*.lib",
|
||||
# Minified files
|
||||
"*.min.js",
|
||||
"*.min.css",
|
||||
# Source maps
|
||||
"*.map",
|
||||
# Database files
|
||||
"*.db",
|
||||
"*.sqlite",
|
||||
"*.sqlite3",
|
||||
# Coverage reports
|
||||
"coverage/*",
|
||||
".coverage",
|
||||
"htmlcov/*",
|
||||
".nyc_output/*",
|
||||
# Test snapshots (can be large)
|
||||
"__snapshots__/*",
|
||||
]
|
||||
|
||||
|
||||
@dataclass
|
||||
class RepositoryDigest:
|
||||
|
|
@ -149,21 +40,19 @@ class RepositoryDigest:
|
|||
|
||||
class GitHubConnector:
|
||||
"""
|
||||
Connector for ingesting GitHub repositories using gitingest.
|
||||
Connector for ingesting GitHub repositories using gitingest CLI.
|
||||
|
||||
This connector efficiently processes entire repositories into a single
|
||||
digest, reducing the number of API calls and LLM invocations compared
|
||||
to file-by-file processing.
|
||||
Uses subprocess to run gitingest, which avoids all async/event loop
|
||||
issues that can occur when mixing gitingest with Celery workers.
|
||||
"""
|
||||
|
||||
def __init__(self, token: str | None = None):
|
||||
"""
|
||||
Initializes the GitHub connector.
|
||||
Initialize the GitHub connector.
|
||||
|
||||
Args:
|
||||
token: Optional GitHub Personal Access Token (PAT).
|
||||
Only required for private repositories.
|
||||
Public repositories can be ingested without a token.
|
||||
"""
|
||||
self.token = token if token and token.strip() else None
|
||||
if self.token:
|
||||
|
|
@ -171,72 +60,104 @@ class GitHubConnector:
|
|||
else:
|
||||
logger.info("GitHub connector initialized without token (public repos only).")
|
||||
|
||||
async def ingest_repository(
|
||||
def ingest_repository(
|
||||
self,
|
||||
repo_full_name: str,
|
||||
branch: str | None = None,
|
||||
include_patterns: list[str] | None = None,
|
||||
exclude_patterns: list[str] | None = None,
|
||||
max_file_size: int = MAX_FILE_SIZE,
|
||||
) -> RepositoryDigest | None:
|
||||
"""
|
||||
Ingest an entire repository and return a digest.
|
||||
Ingest a repository using gitingest CLI via subprocess.
|
||||
|
||||
This approach completely isolates gitingest from Python's event loop,
|
||||
avoiding any async/Celery conflicts.
|
||||
|
||||
Args:
|
||||
repo_full_name: The full name of the repository (e.g., 'owner/repo').
|
||||
branch: Optional specific branch or tag to ingest.
|
||||
include_patterns: Optional list of glob patterns for files to include.
|
||||
If None, includes all files (recommended).
|
||||
exclude_patterns: Optional list of glob patterns for files to exclude.
|
||||
If None, uses DEFAULT_EXCLUDE_PATTERNS.
|
||||
max_file_size: Maximum file size in bytes to include (default 5MB).
|
||||
max_file_size: Maximum file size in bytes to include.
|
||||
|
||||
Returns:
|
||||
RepositoryDigest containing the summary, tree structure, and content,
|
||||
or None if ingestion fails.
|
||||
RepositoryDigest or None if ingestion fails.
|
||||
"""
|
||||
repo_url = f"https://github.com/{repo_full_name}"
|
||||
|
||||
# Use only exclude_patterns by default (recommended for comprehensive analysis)
|
||||
# This ensures we don't miss any relevant file types
|
||||
exclude_pats = exclude_patterns if exclude_patterns is not None else DEFAULT_EXCLUDE_PATTERNS
|
||||
|
||||
logger.info(f"Starting gitingest for repository: {repo_full_name}")
|
||||
logger.info(f"Starting gitingest CLI for repository: {repo_full_name}")
|
||||
|
||||
try:
|
||||
# Build kwargs dynamically
|
||||
ingest_kwargs = {
|
||||
"max_file_size": max_file_size,
|
||||
"exclude_patterns": exclude_pats,
|
||||
"include_gitignored": False,
|
||||
"include_submodules": False,
|
||||
}
|
||||
# Create a temporary file for output
|
||||
with tempfile.NamedTemporaryFile(
|
||||
mode="w", suffix=".txt", delete=False
|
||||
) as tmp_file:
|
||||
output_path = tmp_file.name
|
||||
|
||||
# Only add token if provided (required only for private repos)
|
||||
if self.token:
|
||||
ingest_kwargs["token"] = self.token
|
||||
# Build the gitingest CLI command
|
||||
cmd = [
|
||||
"gitingest",
|
||||
repo_url,
|
||||
"--output", output_path,
|
||||
"--max-size", str(max_file_size),
|
||||
# Common exclude patterns
|
||||
"-e", "node_modules/*",
|
||||
"-e", "vendor/*",
|
||||
"-e", ".git/*",
|
||||
"-e", "__pycache__/*",
|
||||
"-e", "dist/*",
|
||||
"-e", "build/*",
|
||||
"-e", "*.lock",
|
||||
"-e", "package-lock.json",
|
||||
]
|
||||
|
||||
# Only add branch if specified
|
||||
# Add branch if specified
|
||||
if branch:
|
||||
ingest_kwargs["branch"] = branch
|
||||
cmd.extend(["--branch", branch])
|
||||
|
||||
# Only add include_patterns if explicitly provided
|
||||
if include_patterns is not None:
|
||||
ingest_kwargs["include_patterns"] = include_patterns
|
||||
# Set up environment with token if provided
|
||||
env = os.environ.copy()
|
||||
if self.token:
|
||||
env["GITHUB_TOKEN"] = self.token
|
||||
|
||||
summary, tree, content = await ingest_async(repo_url, **ingest_kwargs)
|
||||
logger.info(f"Running gitingest CLI: {' '.join(cmd[:5])}...")
|
||||
|
||||
if not content or not content.strip():
|
||||
logger.warning(
|
||||
f"No content retrieved from repository: {repo_full_name}"
|
||||
)
|
||||
# Run gitingest as subprocess with timeout
|
||||
result = subprocess.run(
|
||||
cmd,
|
||||
env=env,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=900, # 15 minute timeout (value is in seconds)
|
||||
)
|
||||
|
||||
if result.returncode != 0:
|
||||
logger.error(f"gitingest failed: {result.stderr}")
|
||||
# Clean up temp file
|
||||
if os.path.exists(output_path):
|
||||
os.unlink(output_path)
|
||||
return None
|
||||
|
||||
# Read the output file
|
||||
if not os.path.exists(output_path):
|
||||
logger.error("gitingest did not create output file")
|
||||
return None
|
||||
|
||||
with open(output_path, encoding="utf-8") as f:
|
||||
full_content = f.read()
|
||||
|
||||
# Clean up temp file
|
||||
os.unlink(output_path)
|
||||
|
||||
if not full_content or not full_content.strip():
|
||||
logger.warning(f"No content retrieved from repository: {repo_full_name}")
|
||||
return None
|
||||
|
||||
# Parse the gitingest output
|
||||
# The output format is: summary + tree + content
|
||||
# We'll extract what we can
|
||||
digest = RepositoryDigest(
|
||||
repo_full_name=repo_full_name,
|
||||
summary=summary,
|
||||
tree=tree,
|
||||
content=content,
|
||||
summary=f"Repository: {repo_full_name}",
|
||||
tree="", # gitingest CLI combines everything into one file
|
||||
content=full_content,
|
||||
branch=branch,
|
||||
)
|
||||
|
||||
|
|
@ -246,50 +167,70 @@ class GitHubConnector:
|
|||
)
|
||||
return digest
|
||||
|
||||
except subprocess.TimeoutExpired:
|
||||
logger.error(f"gitingest timed out for repository: {repo_full_name}")
|
||||
return None
|
||||
except FileNotFoundError:
|
||||
logger.error(
|
||||
"gitingest CLI not found. Falling back to Python library."
|
||||
)
|
||||
# Fall back to Python library
|
||||
return self._ingest_with_python_library(repo_full_name, branch, max_file_size)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to ingest repository {repo_full_name}: {e}")
|
||||
return None
|
||||
|
||||
async def ingest_repositories(
|
||||
def _ingest_with_python_library(
|
||||
self,
|
||||
repo_full_names: list[str],
|
||||
repo_full_name: str,
|
||||
branch: str | None = None,
|
||||
include_patterns: list[str] | None = None,
|
||||
exclude_patterns: list[str] | None = None,
|
||||
max_file_size: int = MAX_FILE_SIZE,
|
||||
) -> list[RepositoryDigest]:
|
||||
) -> RepositoryDigest | None:
|
||||
"""
|
||||
Ingest multiple repositories and return their digests.
|
||||
|
||||
Args:
|
||||
repo_full_names: List of repository full names (e.g., ['owner/repo1', 'owner/repo2']).
|
||||
branch: Optional specific branch or tag to ingest (applied to all repos).
|
||||
include_patterns: Optional list of glob patterns for files to include.
|
||||
exclude_patterns: Optional list of glob patterns for files to exclude.
|
||||
max_file_size: Maximum file size in bytes to include.
|
||||
|
||||
Returns:
|
||||
List of RepositoryDigest objects for successfully ingested repositories.
|
||||
Fallback: Ingest using the Python library directly.
|
||||
"""
|
||||
digests = []
|
||||
from gitingest import ingest
|
||||
|
||||
for repo_full_name in repo_full_names:
|
||||
if not repo_full_name or not isinstance(repo_full_name, str):
|
||||
logger.warning(f"Skipping invalid repository entry: {repo_full_name}")
|
||||
continue
|
||||
repo_url = f"https://github.com/{repo_full_name}"
|
||||
|
||||
digest = await self.ingest_repository(
|
||||
logger.info(f"Using Python gitingest library for: {repo_full_name}")
|
||||
|
||||
try:
|
||||
kwargs = {
|
||||
"max_file_size": max_file_size,
|
||||
"exclude_patterns": [
|
||||
"node_modules/*",
|
||||
"vendor/*",
|
||||
".git/*",
|
||||
"__pycache__/*",
|
||||
"dist/*",
|
||||
"build/*",
|
||||
"*.lock",
|
||||
"package-lock.json",
|
||||
],
|
||||
"include_gitignored": False,
|
||||
"include_submodules": False,
|
||||
}
|
||||
|
||||
if self.token:
|
||||
kwargs["token"] = self.token
|
||||
if branch:
|
||||
kwargs["branch"] = branch
|
||||
|
||||
summary, tree, content = ingest(repo_url, **kwargs)
|
||||
|
||||
if not content or not content.strip():
|
||||
logger.warning(f"No content from {repo_full_name}")
|
||||
return None
|
||||
|
||||
return RepositoryDigest(
|
||||
repo_full_name=repo_full_name,
|
||||
summary=summary,
|
||||
tree=tree,
|
||||
content=content,
|
||||
branch=branch,
|
||||
include_patterns=include_patterns,
|
||||
exclude_patterns=exclude_patterns,
|
||||
max_file_size=max_file_size,
|
||||
)
|
||||
|
||||
if digest:
|
||||
digests.append(digest)
|
||||
|
||||
logger.info(
|
||||
f"Ingested {len(digests)} out of {len(repo_full_names)} repositories."
|
||||
)
|
||||
return digests
|
||||
except Exception as e:
|
||||
logger.error(f"Python library failed for {repo_full_name}: {e}")
|
||||
return None
|
||||
|
|
|
|||
|
|
@ -173,8 +173,13 @@ async def index_github_repos(
|
|||
logger.info(f"Ingesting repository: {repo_full_name}")
|
||||
|
||||
try:
|
||||
# Ingest the entire repository
|
||||
digest = await github_client.ingest_repository(repo_full_name)
|
||||
# Run gitingest via subprocess (isolated from event loop)
|
||||
# Using to_thread to not block the async database operations
|
||||
import asyncio
|
||||
|
||||
digest = await asyncio.to_thread(
|
||||
github_client.ingest_repository, repo_full_name
|
||||
)
|
||||
|
||||
if not digest:
|
||||
logger.warning(
|
||||
|
|
|
|||
|
|
@ -530,7 +530,10 @@ def validate_connector_config(
|
|||
# "validators": {},
|
||||
# },
|
||||
"GITHUB_CONNECTOR": {
|
||||
"required": ["GITHUB_PAT", "repo_full_names"],
|
||||
# GITHUB_PAT is optional - only required for private repositories
|
||||
# Public repositories can be indexed without authentication
|
||||
"required": ["repo_full_names"],
|
||||
"optional": ["GITHUB_PAT"], # Optional - only needed for private repos
|
||||
"validators": {
|
||||
"repo_full_names": lambda: validate_list_field(
|
||||
"repo_full_names", "repo_full_names"
|
||||
|
|
|
|||
|
|
@ -24,11 +24,6 @@
|
|||
"enabled": true,
|
||||
"status": "warning",
|
||||
"statusMessage": "Some requests may be blocked if not using Firecrawl."
|
||||
},
|
||||
"GITHUB_CONNECTOR": {
|
||||
"enabled": false,
|
||||
"status": "maintenance",
|
||||
"statusMessage": "Rework in progress."
|
||||
}
|
||||
},
|
||||
"globalSettings": {
|
||||
|
|
|
|||
|
|
@ -96,6 +96,7 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
|
|||
repo_full_names: repoList,
|
||||
},
|
||||
is_indexable: true,
|
||||
is_active: true,
|
||||
last_indexed_at: null,
|
||||
periodic_indexing_enabled: periodicEnabled,
|
||||
indexing_frequency_minutes: periodicEnabled ? parseInt(frequencyMinutes, 10) : null,
|
||||
|
|
@ -119,16 +120,16 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
|
|||
<AlertTitle className="text-xs sm:text-sm">Personal Access Token (Optional)</AlertTitle>
|
||||
<AlertDescription className="text-[10px] sm:text-xs !pl-0">
|
||||
A GitHub PAT is only required for private repositories. Public repos work without a
|
||||
token. Create one from{" "}
|
||||
token. {" "}
|
||||
<a
|
||||
href="https://github.com/settings/tokens"
|
||||
href="https://github.com/settings/tokens/new?description=surfsense&scopes=repo"
|
||||
target="_blank"
|
||||
rel="noopener noreferrer"
|
||||
className="font-medium underline underline-offset-4"
|
||||
>
|
||||
GitHub Settings
|
||||
Get your token
|
||||
</a>{" "}
|
||||
if needed.
|
||||
.
|
||||
</AlertDescription>
|
||||
</div>
|
||||
</Alert>
|
||||
|
|
@ -324,20 +325,21 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
|
|||
<div>
|
||||
<h3 className="text-sm sm:text-base font-semibold mb-2">How it works</h3>
|
||||
<p className="text-[10px] sm:text-xs text-muted-foreground">
|
||||
The GitHub connector uses a Personal Access Token (PAT) to authenticate with the
|
||||
GitHub API. You provide a comma-separated list of repository full names (e.g.,
|
||||
"owner/repo1, owner/repo2") that you want to index. The connector indexes relevant
|
||||
files (code, markdown, text) from the selected repositories.
|
||||
The GitHub connector ingests entire repositories in one pass using gitingest,
|
||||
making it highly efficient. Provide a comma-separated list of repository full
|
||||
names (e.g., "owner/repo1, owner/repo2") to index.
|
||||
</p>
|
||||
<ul className="mt-2 list-disc pl-5 text-[10px] sm:text-xs text-muted-foreground space-y-1">
|
||||
<li>
|
||||
The connector indexes files based on common code and documentation extensions.
|
||||
<strong>Public repos:</strong> No authentication required.
|
||||
</li>
|
||||
<li>Large files (over 1MB) are skipped during indexing.</li>
|
||||
<li>Only specified repositories are indexed.</li>
|
||||
<li>
|
||||
Indexing runs periodically (check connector settings for frequency) to keep
|
||||
content up-to-date.
|
||||
<strong>Private repos:</strong> Requires a GitHub Personal Access Token (PAT).
|
||||
</li>
|
||||
<li>Indexes code, documentation, and configuration files.</li>
|
||||
<li>Large files (over 5MB) and binary files are automatically skipped.</li>
|
||||
<li>
|
||||
Periodic sync detects changes and only re-indexes when content has changed.
|
||||
</li>
|
||||
</ul>
|
||||
</div>
|
||||
|
|
@ -348,19 +350,23 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
|
|||
<Alert className="bg-slate-400/5 dark:bg-white/5 border-slate-400/20 mb-4">
|
||||
<Info className="h-3 w-3 sm:h-4 sm:w-4" />
|
||||
<AlertTitle className="text-[10px] sm:text-xs">
|
||||
Personal Access Token Required
|
||||
Personal Access Token (Optional)
|
||||
</AlertTitle>
|
||||
<AlertDescription className="text-[9px] sm:text-[10px]">
|
||||
You'll need a GitHub PAT with the appropriate scopes (e.g., 'repo') to fetch
|
||||
repositories. The PAT will be stored securely to enable indexing.
|
||||
A GitHub PAT is only needed for <strong>private repositories</strong>. Public
|
||||
repos can be indexed without authentication. If you need to access private
|
||||
repos, create a PAT with the 'repo' scope.
|
||||
</AlertDescription>
|
||||
</Alert>
|
||||
|
||||
<div className="space-y-4 sm:space-y-6">
|
||||
<div>
|
||||
<h4 className="text-[10px] sm:text-xs font-medium mb-2">
|
||||
Step 1: Generate GitHub PAT
|
||||
For Private Repositories Only: Generate GitHub PAT
|
||||
</h4>
|
||||
<p className="text-[10px] sm:text-xs text-muted-foreground mb-2">
|
||||
Skip this step if you're only indexing public repositories.
|
||||
</p>
|
||||
<ol className="list-decimal pl-5 space-y-2 text-[10px] sm:text-xs text-muted-foreground">
|
||||
<li>
|
||||
Go to your GitHub{" "}
|
||||
|
|
@ -375,46 +381,36 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
|
|||
</li>
|
||||
<li>
|
||||
Click on <strong>Personal access tokens</strong>, then choose{" "}
|
||||
<strong>Tokens (classic)</strong> or <strong>Fine-grained tokens</strong>{" "}
|
||||
(recommended if available).
|
||||
<strong>Tokens (classic)</strong> or <strong>Fine-grained tokens</strong>.
|
||||
</li>
|
||||
<li>
|
||||
Click <strong>Generate new token</strong> (and choose the appropriate type).
|
||||
Click <strong>Generate new token</strong>.
|
||||
</li>
|
||||
<li>Give your token a descriptive name (e.g., "SurfSense Connector").</li>
|
||||
<li>Set an expiration date for the token (recommended for security).</li>
|
||||
<li>
|
||||
Under <strong>Select scopes</strong> (for classic tokens) or{" "}
|
||||
<strong>Repository access</strong> (for fine-grained), grant the necessary
|
||||
permissions. At minimum, the <strong>`repo`</strong> scope (or equivalent
|
||||
read access to repositories for fine-grained tokens) is required to read
|
||||
repository content.
|
||||
Grant the <strong>`repo`</strong> scope (for classic tokens) or read access
|
||||
to the specific repositories you want to index (for fine-grained tokens).
|
||||
</li>
|
||||
<li>
|
||||
Click <strong>Generate token</strong>.
|
||||
</li>
|
||||
<li>
|
||||
<strong>Important:</strong> Copy your new PAT immediately. You won't be able
|
||||
to see it again after leaving the page.
|
||||
Click <strong>Generate token</strong> and copy it immediately.
|
||||
</li>
|
||||
</ol>
|
||||
</div>
|
||||
|
||||
<div>
|
||||
<h4 className="text-[10px] sm:text-xs font-medium mb-2">
|
||||
Step 2: Specify repositories
|
||||
Specify Repositories
|
||||
</h4>
|
||||
<p className="text-[10px] sm:text-xs text-muted-foreground mb-3">
|
||||
Enter a comma-separated list of repository full names in the format
|
||||
"owner/repo1, owner/repo2". The connector will index files from only the
|
||||
specified repositories.
|
||||
"owner/repo1, owner/repo2". For example: "facebook/react, vercel/next.js".
|
||||
</p>
|
||||
<Alert className="bg-slate-400/5 dark:bg-white/5 border-slate-400/20">
|
||||
<Info className="h-3 w-3 sm:h-4 sm:w-4" />
|
||||
<AlertTitle className="text-[10px] sm:text-xs">Repository Access</AlertTitle>
|
||||
<AlertTitle className="text-[10px] sm:text-xs">Public vs Private</AlertTitle>
|
||||
<AlertDescription className="text-[9px] sm:text-[10px]">
|
||||
Make sure your PAT has access to all repositories you want to index. Private
|
||||
repositories require appropriate permissions.
|
||||
Public repositories work without a PAT. For private repositories, ensure
|
||||
your PAT has access to the repos you want to index.
|
||||
</AlertDescription>
|
||||
</Alert>
|
||||
</div>
|
||||
|
|
@ -424,36 +420,38 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
|
|||
|
||||
<div className="space-y-4">
|
||||
<div>
|
||||
<h3 className="text-sm sm:text-base font-semibold mb-2">Indexing</h3>
|
||||
<h3 className="text-sm sm:text-base font-semibold mb-2">Quick Start</h3>
|
||||
<ol className="list-decimal pl-5 space-y-2 text-[10px] sm:text-xs text-muted-foreground mb-4">
|
||||
<li>
|
||||
Navigate to the Connector Dashboard and select the <strong>GitHub</strong>{" "}
|
||||
Connector.
|
||||
Enter the <strong>Repository Names</strong> you want to index (e.g.,
|
||||
"facebook/react, vercel/next.js").
|
||||
</li>
|
||||
<li>
|
||||
Enter your <strong>GitHub Personal Access Token</strong> in the form field.
|
||||
<strong>(Optional)</strong> Add a GitHub PAT if indexing private repositories.
|
||||
</li>
|
||||
<li>
|
||||
Enter a comma-separated list of <strong>Repository Names</strong> (e.g.,
|
||||
"owner/repo1, owner/repo2").
|
||||
Click <strong>Connect GitHub</strong> to start indexing.
|
||||
</li>
|
||||
<li>
|
||||
Click <strong>Connect</strong> to establish the connection.
|
||||
Enable <strong>Periodic Sync</strong> to automatically detect and index
|
||||
changes.
|
||||
</li>
|
||||
<li>Once connected, your GitHub repositories will be indexed automatically.</li>
|
||||
</ol>
|
||||
|
||||
<Alert className="bg-slate-400/5 dark:bg-white/5 border-slate-400/20">
|
||||
<Info className="h-3 w-3 sm:h-4 sm:w-4" />
|
||||
<AlertTitle className="text-[10px] sm:text-xs">What Gets Indexed</AlertTitle>
|
||||
<AlertDescription className="text-[9px] sm:text-[10px]">
|
||||
<p className="mb-2">The GitHub connector indexes the following data:</p>
|
||||
<p className="mb-2">The GitHub connector indexes:</p>
|
||||
<ul className="list-disc pl-5 space-y-1">
|
||||
<li>Code files from selected repositories</li>
|
||||
<li>README files and Markdown documentation</li>
|
||||
<li>Common text-based file formats</li>
|
||||
<li>Repository metadata and structure</li>
|
||||
<li>All code files (Python, JavaScript, TypeScript, etc.)</li>
|
||||
<li>Documentation (README, Markdown, text files)</li>
|
||||
<li>Configuration files (JSON, YAML, TOML, etc.)</li>
|
||||
<li>Repository structure and file tree</li>
|
||||
</ul>
|
||||
<p className="mt-2">
|
||||
Binary files, images, and build artifacts are automatically excluded.
|
||||
</p>
|
||||
</AlertDescription>
|
||||
</Alert>
|
||||
</div>
|
||||
|
|
|
|||
|
|
@ -1,8 +1,9 @@
|
|||
"use client";
|
||||
|
||||
import { KeyRound } from "lucide-react";
|
||||
import { Info, KeyRound } from "lucide-react";
|
||||
import type { FC } from "react";
|
||||
import { useEffect, useState } from "react";
|
||||
import { Alert, AlertDescription, AlertTitle } from "@/components/ui/alert";
|
||||
import { Badge } from "@/components/ui/badge";
|
||||
import { Input } from "@/components/ui/input";
|
||||
import { Label } from "@/components/ui/label";
|
||||
|
|
@ -79,6 +80,26 @@ export const GithubConfig: FC<GithubConfigProps> = ({
|
|||
|
||||
return (
|
||||
<div className="space-y-6">
|
||||
<Alert className="bg-slate-400/5 dark:bg-white/5 border-slate-400/20 p-2 sm:p-3 flex items-center [&>svg]:relative [&>svg]:left-0 [&>svg]:top-0 [&>svg+div]:translate-y-0">
|
||||
<Info className="h-3 w-3 sm:h-4 sm:w-4 shrink-0 ml-1" />
|
||||
<div className="-ml-1">
|
||||
<AlertTitle className="text-xs sm:text-sm">Personal Access Token (Optional)</AlertTitle>
|
||||
<AlertDescription className="text-[10px] sm:text-xs !pl-0">
|
||||
A GitHub PAT is only required for private repositories. Public repos work without a
|
||||
token. Create one from{" "}
|
||||
<a
|
||||
href="https://github.com/settings/tokens/new?description=surfsense&scopes=repo"
|
||||
target="_blank"
|
||||
rel="noopener noreferrer"
|
||||
className="font-medium underline underline-offset-4"
|
||||
>
|
||||
GitHub Settings
|
||||
</a>{" "}
|
||||
if needed.
|
||||
</AlertDescription>
|
||||
</div>
|
||||
</Alert>
|
||||
|
||||
{/* Connector Name */}
|
||||
<div className="rounded-xl border border-border bg-slate-400/5 dark:bg-white/5 p-3 sm:p-6 space-y-3 sm:space-y-4">
|
||||
<div className="space-y-2">
|
||||
|
|
@ -105,7 +126,7 @@ export const GithubConfig: FC<GithubConfigProps> = ({
|
|||
<div className="space-y-2">
|
||||
<Label className="flex items-center gap-2 text-xs sm:text-sm">
|
||||
<KeyRound className="h-4 w-4" />
|
||||
GitHub Personal Access Token
|
||||
GitHub Personal Access Token (optional)
|
||||
</Label>
|
||||
<Input
|
||||
type="password"
|
||||
|
|
|
|||
|
|
@ -206,9 +206,10 @@ export const ConnectorEditView: FC<ConnectorEditViewProps> = ({
|
|||
{/* Date range selector and periodic sync - only shown for indexable connectors */}
|
||||
{connector.is_indexable && (
|
||||
<>
|
||||
{/* Date range selector - not shown for Google Drive (uses folder selection) or Webcrawler (uses config) */}
|
||||
{/* Date range selector - not shown for Google Drive, Webcrawler, or GitHub (indexes full repo snapshots) */}
|
||||
{connector.connector_type !== "GOOGLE_DRIVE_CONNECTOR" &&
|
||||
connector.connector_type !== "WEBCRAWLER_CONNECTOR" && (
|
||||
connector.connector_type !== "WEBCRAWLER_CONNECTOR" &&
|
||||
connector.connector_type !== "GITHUB_CONNECTOR" && (
|
||||
<DateRangeSelector
|
||||
startDate={startDate}
|
||||
endDate={endDate}
|
||||
|
|
|
|||
|
|
@ -151,9 +151,10 @@ export const IndexingConfigurationView: FC<IndexingConfigurationViewProps> = ({
|
|||
{/* Date range selector and periodic sync - only shown for indexable connectors */}
|
||||
{connector?.is_indexable && (
|
||||
<>
|
||||
{/* Date range selector - not shown for Google Drive (uses folder selection) or Webcrawler (uses config) */}
|
||||
{/* Date range selector - not shown for Google Drive, Webcrawler, or GitHub (indexes full repo snapshots) */}
|
||||
{config.connectorType !== "GOOGLE_DRIVE_CONNECTOR" &&
|
||||
config.connectorType !== "WEBCRAWLER_CONNECTOR" && (
|
||||
config.connectorType !== "WEBCRAWLER_CONNECTOR" &&
|
||||
config.connectorType !== "GITHUB_CONNECTOR" && (
|
||||
<DateRangeSelector
|
||||
startDate={startDate}
|
||||
endDate={endDate}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue