refactor: Update GitHub connector to use gitingest CLI

- Refactored GitHubConnector to utilize gitingest CLI via subprocess, improving performance and avoiding async issues with Celery.
- Updated ingestion method to handle repository digests more efficiently, including error handling for subprocess execution.
- Adjusted GitHub indexer to call the new synchronous ingestion method.
- Clarified documentation regarding the optional nature of the Personal Access Token for public repositories.
This commit is contained in:
Anish Sarkar 2026-01-20 23:24:33 +05:30
parent 49b8a46d10
commit 35888144eb
8 changed files with 221 additions and 256 deletions

View file

@ -1,130 +1,21 @@
""" """
GitHub connector using gitingest for efficient repository digestion. GitHub connector using gitingest CLI for efficient repository digestion.
This connector replaces the previous file-by-file approach with a single This connector uses subprocess to call gitingest CLI, completely isolating
digest generation per repository, dramatically reducing LLM API calls. it from any Python event loop/async complexity that can cause hangs in Celery.
""" """
import logging import logging
import os
import subprocess
import tempfile
from dataclasses import dataclass from dataclasses import dataclass
from gitingest import ingest_async
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# Maximum file size in bytes (5MB) # Maximum file size in bytes (5MB)
MAX_FILE_SIZE = 5 * 1024 * 1024 MAX_FILE_SIZE = 5 * 1024 * 1024
# Default patterns to exclude (recommended approach for comprehensive analysis)
# Using only exclude_patterns ensures we don't miss any relevant file types
DEFAULT_EXCLUDE_PATTERNS = [
# Dependencies
"node_modules/*",
"vendor/*",
"bower_components/*",
".pnpm/*",
# Build artifacts / Caches
"build/*",
"dist/*",
"target/*",
"out/*",
"__pycache__/*",
"*.pyc",
".cache/*",
".next/*",
".nuxt/*",
# Virtual environments
"venv/*",
".venv/*",
"env/*",
".env/*",
# IDE/Editor config
".vscode/*",
".idea/*",
".project",
".settings/*",
"*.swp",
"*.swo",
# Version control
".git/*",
".svn/*",
".hg/*",
# Temporary / Logs
"tmp/*",
"temp/*",
"logs/*",
"*.log",
# Lock files (usually not needed for understanding code)
"package-lock.json",
"pnpm-lock.yaml",
"yarn.lock",
"uv.lock",
"Gemfile.lock",
"poetry.lock",
"Cargo.lock",
"composer.lock",
# Binary/media files
"*.png",
"*.jpg",
"*.jpeg",
"*.gif",
"*.ico",
"*.svg",
"*.webp",
"*.bmp",
"*.tiff",
"*.woff",
"*.woff2",
"*.ttf",
"*.eot",
"*.otf",
"*.mp3",
"*.mp4",
"*.wav",
"*.ogg",
"*.webm",
"*.avi",
"*.mov",
"*.pdf",
"*.doc",
"*.docx",
"*.xls",
"*.xlsx",
"*.ppt",
"*.pptx",
"*.zip",
"*.tar",
"*.tar.gz",
"*.tgz",
"*.rar",
"*.7z",
"*.exe",
"*.dll",
"*.so",
"*.dylib",
"*.bin",
"*.obj",
"*.o",
"*.a",
"*.lib",
# Minified files
"*.min.js",
"*.min.css",
# Source maps
"*.map",
# Database files
"*.db",
"*.sqlite",
"*.sqlite3",
# Coverage reports
"coverage/*",
".coverage",
"htmlcov/*",
".nyc_output/*",
# Test snapshots (can be large)
"__snapshots__/*",
]
@dataclass @dataclass
class RepositoryDigest: class RepositoryDigest:
@ -149,21 +40,19 @@ class RepositoryDigest:
class GitHubConnector: class GitHubConnector:
""" """
Connector for ingesting GitHub repositories using gitingest. Connector for ingesting GitHub repositories using gitingest CLI.
This connector efficiently processes entire repositories into a single Uses subprocess to run gitingest, which avoids all async/event loop
digest, reducing the number of API calls and LLM invocations compared issues that can occur when mixing gitingest with Celery workers.
to file-by-file processing.
""" """
def __init__(self, token: str | None = None): def __init__(self, token: str | None = None):
""" """
Initializes the GitHub connector. Initialize the GitHub connector.
Args: Args:
token: Optional GitHub Personal Access Token (PAT). token: Optional GitHub Personal Access Token (PAT).
Only required for private repositories. Only required for private repositories.
Public repositories can be ingested without a token.
""" """
self.token = token if token and token.strip() else None self.token = token if token and token.strip() else None
if self.token: if self.token:
@ -171,72 +60,104 @@ class GitHubConnector:
else: else:
logger.info("GitHub connector initialized without token (public repos only).") logger.info("GitHub connector initialized without token (public repos only).")
async def ingest_repository( def ingest_repository(
self, self,
repo_full_name: str, repo_full_name: str,
branch: str | None = None, branch: str | None = None,
include_patterns: list[str] | None = None,
exclude_patterns: list[str] | None = None,
max_file_size: int = MAX_FILE_SIZE, max_file_size: int = MAX_FILE_SIZE,
) -> RepositoryDigest | None: ) -> RepositoryDigest | None:
""" """
Ingest an entire repository and return a digest. Ingest a repository using gitingest CLI via subprocess.
This approach completely isolates gitingest from Python's event loop,
avoiding any async/Celery conflicts.
Args: Args:
repo_full_name: The full name of the repository (e.g., 'owner/repo'). repo_full_name: The full name of the repository (e.g., 'owner/repo').
branch: Optional specific branch or tag to ingest. branch: Optional specific branch or tag to ingest.
include_patterns: Optional list of glob patterns for files to include. max_file_size: Maximum file size in bytes to include.
If None, includes all files (recommended).
exclude_patterns: Optional list of glob patterns for files to exclude.
If None, uses DEFAULT_EXCLUDE_PATTERNS.
max_file_size: Maximum file size in bytes to include (default 5MB).
Returns: Returns:
RepositoryDigest containing the summary, tree structure, and content, RepositoryDigest or None if ingestion fails.
or None if ingestion fails.
""" """
repo_url = f"https://github.com/{repo_full_name}" repo_url = f"https://github.com/{repo_full_name}"
# Use only exclude_patterns by default (recommended for comprehensive analysis) logger.info(f"Starting gitingest CLI for repository: {repo_full_name}")
# This ensures we don't miss any relevant file types
exclude_pats = exclude_patterns if exclude_patterns is not None else DEFAULT_EXCLUDE_PATTERNS
logger.info(f"Starting gitingest for repository: {repo_full_name}")
try: try:
# Build kwargs dynamically # Create a temporary file for output
ingest_kwargs = { with tempfile.NamedTemporaryFile(
"max_file_size": max_file_size, mode="w", suffix=".txt", delete=False
"exclude_patterns": exclude_pats, ) as tmp_file:
"include_gitignored": False, output_path = tmp_file.name
"include_submodules": False,
}
# Only add token if provided (required only for private repos) # Build the gitingest CLI command
if self.token: cmd = [
ingest_kwargs["token"] = self.token "gitingest",
repo_url,
"--output", output_path,
"--max-size", str(max_file_size),
# Common exclude patterns
"-e", "node_modules/*",
"-e", "vendor/*",
"-e", ".git/*",
"-e", "__pycache__/*",
"-e", "dist/*",
"-e", "build/*",
"-e", "*.lock",
"-e", "package-lock.json",
]
# Only add branch if specified # Add branch if specified
if branch: if branch:
ingest_kwargs["branch"] = branch cmd.extend(["--branch", branch])
# Only add include_patterns if explicitly provided # Set up environment with token if provided
if include_patterns is not None: env = os.environ.copy()
ingest_kwargs["include_patterns"] = include_patterns if self.token:
env["GITHUB_TOKEN"] = self.token
summary, tree, content = await ingest_async(repo_url, **ingest_kwargs) logger.info(f"Running gitingest CLI: {' '.join(cmd[:5])}...")
if not content or not content.strip(): # Run gitingest as subprocess with timeout
logger.warning( result = subprocess.run(
f"No content retrieved from repository: {repo_full_name}" cmd,
) env=env,
capture_output=True,
text=True,
timeout=900, # 15 minute timeout (900 seconds)
)
if result.returncode != 0:
logger.error(f"gitingest failed: {result.stderr}")
# Clean up temp file
if os.path.exists(output_path):
os.unlink(output_path)
return None return None
# Read the output file
if not os.path.exists(output_path):
logger.error("gitingest did not create output file")
return None
with open(output_path, encoding="utf-8") as f:
full_content = f.read()
# Clean up temp file
os.unlink(output_path)
if not full_content or not full_content.strip():
logger.warning(f"No content retrieved from repository: {repo_full_name}")
return None
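# The gitingest CLI writes summary, tree, and content into a single file.
# The output is not split back into parts here: the whole file is stored
# as `content`, with a placeholder summary and an empty tree.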
digest = RepositoryDigest( digest = RepositoryDigest(
repo_full_name=repo_full_name, repo_full_name=repo_full_name,
summary=summary, summary=f"Repository: {repo_full_name}",
tree=tree, tree="", # gitingest CLI combines everything into one file
content=content, content=full_content,
branch=branch, branch=branch,
) )
@ -246,50 +167,70 @@ class GitHubConnector:
) )
return digest return digest
except subprocess.TimeoutExpired:
logger.error(f"gitingest timed out for repository: {repo_full_name}")
return None
except FileNotFoundError:
logger.error(
"gitingest CLI not found. Falling back to Python library."
)
# Fall back to Python library
return self._ingest_with_python_library(repo_full_name, branch, max_file_size)
except Exception as e: except Exception as e:
logger.error(f"Failed to ingest repository {repo_full_name}: {e}") logger.error(f"Failed to ingest repository {repo_full_name}: {e}")
return None return None
async def ingest_repositories( def _ingest_with_python_library(
self, self,
repo_full_names: list[str], repo_full_name: str,
branch: str | None = None, branch: str | None = None,
include_patterns: list[str] | None = None,
exclude_patterns: list[str] | None = None,
max_file_size: int = MAX_FILE_SIZE, max_file_size: int = MAX_FILE_SIZE,
) -> list[RepositoryDigest]: ) -> RepositoryDigest | None:
""" """
Ingest multiple repositories and return their digests. Fallback: Ingest using the Python library directly.
Args:
repo_full_names: List of repository full names (e.g., ['owner/repo1', 'owner/repo2']).
branch: Optional specific branch or tag to ingest (applied to all repos).
include_patterns: Optional list of glob patterns for files to include.
exclude_patterns: Optional list of glob patterns for files to exclude.
max_file_size: Maximum file size in bytes to include.
Returns:
List of RepositoryDigest objects for successfully ingested repositories.
""" """
digests = [] from gitingest import ingest
for repo_full_name in repo_full_names: repo_url = f"https://github.com/{repo_full_name}"
if not repo_full_name or not isinstance(repo_full_name, str):
logger.warning(f"Skipping invalid repository entry: {repo_full_name}")
continue
digest = await self.ingest_repository( logger.info(f"Using Python gitingest library for: {repo_full_name}")
try:
kwargs = {
"max_file_size": max_file_size,
"exclude_patterns": [
"node_modules/*",
"vendor/*",
".git/*",
"__pycache__/*",
"dist/*",
"build/*",
"*.lock",
"package-lock.json",
],
"include_gitignored": False,
"include_submodules": False,
}
if self.token:
kwargs["token"] = self.token
if branch:
kwargs["branch"] = branch
summary, tree, content = ingest(repo_url, **kwargs)
if not content or not content.strip():
logger.warning(f"No content from {repo_full_name}")
return None
return RepositoryDigest(
repo_full_name=repo_full_name, repo_full_name=repo_full_name,
summary=summary,
tree=tree,
content=content,
branch=branch, branch=branch,
include_patterns=include_patterns,
exclude_patterns=exclude_patterns,
max_file_size=max_file_size,
) )
if digest: except Exception as e:
digests.append(digest) logger.error(f"Python library failed for {repo_full_name}: {e}")
return None
logger.info(
f"Ingested {len(digests)} out of {len(repo_full_names)} repositories."
)
return digests

View file

@ -173,8 +173,13 @@ async def index_github_repos(
logger.info(f"Ingesting repository: {repo_full_name}") logger.info(f"Ingesting repository: {repo_full_name}")
try: try:
# Ingest the entire repository # Run gitingest via subprocess (isolated from event loop)
digest = await github_client.ingest_repository(repo_full_name) # Using to_thread to not block the async database operations
import asyncio
digest = await asyncio.to_thread(
github_client.ingest_repository, repo_full_name
)
if not digest: if not digest:
logger.warning( logger.warning(

View file

@ -530,7 +530,10 @@ def validate_connector_config(
# "validators": {}, # "validators": {},
# }, # },
"GITHUB_CONNECTOR": { "GITHUB_CONNECTOR": {
"required": ["GITHUB_PAT", "repo_full_names"], # GITHUB_PAT is optional - only required for private repositories
# Public repositories can be indexed without authentication
"required": ["repo_full_names"],
"optional": ["GITHUB_PAT"], # Optional - only needed for private repos
"validators": { "validators": {
"repo_full_names": lambda: validate_list_field( "repo_full_names": lambda: validate_list_field(
"repo_full_names", "repo_full_names" "repo_full_names", "repo_full_names"

View file

@ -24,11 +24,6 @@
"enabled": true, "enabled": true,
"status": "warning", "status": "warning",
"statusMessage": "Some requests may be blocked if not using Firecrawl." "statusMessage": "Some requests may be blocked if not using Firecrawl."
},
"GITHUB_CONNECTOR": {
"enabled": false,
"status": "maintenance",
"statusMessage": "Rework in progress."
} }
}, },
"globalSettings": { "globalSettings": {

View file

@ -96,6 +96,7 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
repo_full_names: repoList, repo_full_names: repoList,
}, },
is_indexable: true, is_indexable: true,
is_active: true,
last_indexed_at: null, last_indexed_at: null,
periodic_indexing_enabled: periodicEnabled, periodic_indexing_enabled: periodicEnabled,
indexing_frequency_minutes: periodicEnabled ? parseInt(frequencyMinutes, 10) : null, indexing_frequency_minutes: periodicEnabled ? parseInt(frequencyMinutes, 10) : null,
@ -119,16 +120,16 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
<AlertTitle className="text-xs sm:text-sm">Personal Access Token (Optional)</AlertTitle> <AlertTitle className="text-xs sm:text-sm">Personal Access Token (Optional)</AlertTitle>
<AlertDescription className="text-[10px] sm:text-xs !pl-0"> <AlertDescription className="text-[10px] sm:text-xs !pl-0">
A GitHub PAT is only required for private repositories. Public repos work without a A GitHub PAT is only required for private repositories. Public repos work without a
token. Create one from{" "} token. {" "}
<a <a
href="https://github.com/settings/tokens" href="https://github.com/settings/tokens/new?description=surfsense&scopes=repo"
target="_blank" target="_blank"
rel="noopener noreferrer" rel="noopener noreferrer"
className="font-medium underline underline-offset-4" className="font-medium underline underline-offset-4"
> >
GitHub Settings Get your token
</a>{" "} </a>{" "}
if needed. .
</AlertDescription> </AlertDescription>
</div> </div>
</Alert> </Alert>
@ -324,20 +325,21 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
<div> <div>
<h3 className="text-sm sm:text-base font-semibold mb-2">How it works</h3> <h3 className="text-sm sm:text-base font-semibold mb-2">How it works</h3>
<p className="text-[10px] sm:text-xs text-muted-foreground"> <p className="text-[10px] sm:text-xs text-muted-foreground">
The GitHub connector uses a Personal Access Token (PAT) to authenticate with the The GitHub connector ingests entire repositories in one pass using gitingest,
GitHub API. You provide a comma-separated list of repository full names (e.g., making it highly efficient. Provide a comma-separated list of repository full
"owner/repo1, owner/repo2") that you want to index. The connector indexes relevant names (e.g., "owner/repo1, owner/repo2") to index.
files (code, markdown, text) from the selected repositories.
</p> </p>
<ul className="mt-2 list-disc pl-5 text-[10px] sm:text-xs text-muted-foreground space-y-1"> <ul className="mt-2 list-disc pl-5 text-[10px] sm:text-xs text-muted-foreground space-y-1">
<li> <li>
The connector indexes files based on common code and documentation extensions. <strong>Public repos:</strong> No authentication required.
</li> </li>
<li>Large files (over 1MB) are skipped during indexing.</li>
<li>Only specified repositories are indexed.</li>
<li> <li>
Indexing runs periodically (check connector settings for frequency) to keep <strong>Private repos:</strong> Requires a GitHub Personal Access Token (PAT).
content up-to-date. </li>
<li>Indexes code, documentation, and configuration files.</li>
<li>Large files (over 5MB) and binary files are automatically skipped.</li>
<li>
Periodic sync detects changes and only re-indexes when content has changed.
</li> </li>
</ul> </ul>
</div> </div>
@ -348,19 +350,23 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
<Alert className="bg-slate-400/5 dark:bg-white/5 border-slate-400/20 mb-4"> <Alert className="bg-slate-400/5 dark:bg-white/5 border-slate-400/20 mb-4">
<Info className="h-3 w-3 sm:h-4 sm:w-4" /> <Info className="h-3 w-3 sm:h-4 sm:w-4" />
<AlertTitle className="text-[10px] sm:text-xs"> <AlertTitle className="text-[10px] sm:text-xs">
Personal Access Token Required Personal Access Token (Optional)
</AlertTitle> </AlertTitle>
<AlertDescription className="text-[9px] sm:text-[10px]"> <AlertDescription className="text-[9px] sm:text-[10px]">
You'll need a GitHub PAT with the appropriate scopes (e.g., 'repo') to fetch A GitHub PAT is only needed for <strong>private repositories</strong>. Public
repositories. The PAT will be stored securely to enable indexing. repos can be indexed without authentication. If you need to access private
repos, create a PAT with the 'repo' scope.
</AlertDescription> </AlertDescription>
</Alert> </Alert>
<div className="space-y-4 sm:space-y-6"> <div className="space-y-4 sm:space-y-6">
<div> <div>
<h4 className="text-[10px] sm:text-xs font-medium mb-2"> <h4 className="text-[10px] sm:text-xs font-medium mb-2">
Step 1: Generate GitHub PAT For Private Repositories Only: Generate GitHub PAT
</h4> </h4>
<p className="text-[10px] sm:text-xs text-muted-foreground mb-2">
Skip this step if you're only indexing public repositories.
</p>
<ol className="list-decimal pl-5 space-y-2 text-[10px] sm:text-xs text-muted-foreground"> <ol className="list-decimal pl-5 space-y-2 text-[10px] sm:text-xs text-muted-foreground">
<li> <li>
Go to your GitHub{" "} Go to your GitHub{" "}
@ -375,46 +381,36 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
</li> </li>
<li> <li>
Click on <strong>Personal access tokens</strong>, then choose{" "} Click on <strong>Personal access tokens</strong>, then choose{" "}
<strong>Tokens (classic)</strong> or <strong>Fine-grained tokens</strong>{" "} <strong>Tokens (classic)</strong> or <strong>Fine-grained tokens</strong>.
(recommended if available).
</li> </li>
<li> <li>
Click <strong>Generate new token</strong> (and choose the appropriate type). Click <strong>Generate new token</strong>.
</li> </li>
<li>Give your token a descriptive name (e.g., "SurfSense Connector").</li> <li>Give your token a descriptive name (e.g., "SurfSense Connector").</li>
<li>Set an expiration date for the token (recommended for security).</li>
<li> <li>
Under <strong>Select scopes</strong> (for classic tokens) or{" "} Grant the <strong>`repo`</strong> scope (for classic tokens) or read access
<strong>Repository access</strong> (for fine-grained), grant the necessary to the specific repositories you want to index (for fine-grained tokens).
permissions. At minimum, the <strong>`repo`</strong> scope (or equivalent
read access to repositories for fine-grained tokens) is required to read
repository content.
</li> </li>
<li> <li>
Click <strong>Generate token</strong>. Click <strong>Generate token</strong> and copy it immediately.
</li>
<li>
<strong>Important:</strong> Copy your new PAT immediately. You won't be able
to see it again after leaving the page.
</li> </li>
</ol> </ol>
</div> </div>
<div> <div>
<h4 className="text-[10px] sm:text-xs font-medium mb-2"> <h4 className="text-[10px] sm:text-xs font-medium mb-2">
Step 2: Specify repositories Specify Repositories
</h4> </h4>
<p className="text-[10px] sm:text-xs text-muted-foreground mb-3"> <p className="text-[10px] sm:text-xs text-muted-foreground mb-3">
Enter a comma-separated list of repository full names in the format Enter a comma-separated list of repository full names in the format
"owner/repo1, owner/repo2". The connector will index files from only the "owner/repo1, owner/repo2". For example: "facebook/react, vercel/next.js".
specified repositories.
</p> </p>
<Alert className="bg-slate-400/5 dark:bg-white/5 border-slate-400/20"> <Alert className="bg-slate-400/5 dark:bg-white/5 border-slate-400/20">
<Info className="h-3 w-3 sm:h-4 sm:w-4" /> <Info className="h-3 w-3 sm:h-4 sm:w-4" />
<AlertTitle className="text-[10px] sm:text-xs">Repository Access</AlertTitle> <AlertTitle className="text-[10px] sm:text-xs">Public vs Private</AlertTitle>
<AlertDescription className="text-[9px] sm:text-[10px]"> <AlertDescription className="text-[9px] sm:text-[10px]">
Make sure your PAT has access to all repositories you want to index. Private Public repositories work without a PAT. For private repositories, ensure
repositories require appropriate permissions. your PAT has access to the repos you want to index.
</AlertDescription> </AlertDescription>
</Alert> </Alert>
</div> </div>
@ -424,36 +420,38 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
<div className="space-y-4"> <div className="space-y-4">
<div> <div>
<h3 className="text-sm sm:text-base font-semibold mb-2">Indexing</h3> <h3 className="text-sm sm:text-base font-semibold mb-2">Quick Start</h3>
<ol className="list-decimal pl-5 space-y-2 text-[10px] sm:text-xs text-muted-foreground mb-4"> <ol className="list-decimal pl-5 space-y-2 text-[10px] sm:text-xs text-muted-foreground mb-4">
<li> <li>
Navigate to the Connector Dashboard and select the <strong>GitHub</strong>{" "} Enter the <strong>Repository Names</strong> you want to index (e.g.,
Connector. "facebook/react, vercel/next.js").
</li> </li>
<li> <li>
Enter your <strong>GitHub Personal Access Token</strong> in the form field. <strong>(Optional)</strong> Add a GitHub PAT if indexing private repositories.
</li> </li>
<li> <li>
Enter a comma-separated list of <strong>Repository Names</strong> (e.g., Click <strong>Connect GitHub</strong> to start indexing.
"owner/repo1, owner/repo2").
</li> </li>
<li> <li>
Click <strong>Connect</strong> to establish the connection. Enable <strong>Periodic Sync</strong> to automatically detect and index
changes.
</li> </li>
<li>Once connected, your GitHub repositories will be indexed automatically.</li>
</ol> </ol>
<Alert className="bg-slate-400/5 dark:bg-white/5 border-slate-400/20"> <Alert className="bg-slate-400/5 dark:bg-white/5 border-slate-400/20">
<Info className="h-3 w-3 sm:h-4 sm:w-4" /> <Info className="h-3 w-3 sm:h-4 sm:w-4" />
<AlertTitle className="text-[10px] sm:text-xs">What Gets Indexed</AlertTitle> <AlertTitle className="text-[10px] sm:text-xs">What Gets Indexed</AlertTitle>
<AlertDescription className="text-[9px] sm:text-[10px]"> <AlertDescription className="text-[9px] sm:text-[10px]">
<p className="mb-2">The GitHub connector indexes the following data:</p> <p className="mb-2">The GitHub connector indexes:</p>
<ul className="list-disc pl-5 space-y-1"> <ul className="list-disc pl-5 space-y-1">
<li>Code files from selected repositories</li> <li>All code files (Python, JavaScript, TypeScript, etc.)</li>
<li>README files and Markdown documentation</li> <li>Documentation (README, Markdown, text files)</li>
<li>Common text-based file formats</li> <li>Configuration files (JSON, YAML, TOML, etc.)</li>
<li>Repository metadata and structure</li> <li>Repository structure and file tree</li>
</ul> </ul>
<p className="mt-2">
Binary files, images, and build artifacts are automatically excluded.
</p>
</AlertDescription> </AlertDescription>
</Alert> </Alert>
</div> </div>

View file

@ -1,8 +1,9 @@
"use client"; "use client";
import { KeyRound } from "lucide-react"; import { Info, KeyRound } from "lucide-react";
import type { FC } from "react"; import type { FC } from "react";
import { useEffect, useState } from "react"; import { useEffect, useState } from "react";
import { Alert, AlertDescription, AlertTitle } from "@/components/ui/alert";
import { Badge } from "@/components/ui/badge"; import { Badge } from "@/components/ui/badge";
import { Input } from "@/components/ui/input"; import { Input } from "@/components/ui/input";
import { Label } from "@/components/ui/label"; import { Label } from "@/components/ui/label";
@ -79,6 +80,26 @@ export const GithubConfig: FC<GithubConfigProps> = ({
return ( return (
<div className="space-y-6"> <div className="space-y-6">
<Alert className="bg-slate-400/5 dark:bg-white/5 border-slate-400/20 p-2 sm:p-3 flex items-center [&>svg]:relative [&>svg]:left-0 [&>svg]:top-0 [&>svg+div]:translate-y-0">
<Info className="h-3 w-3 sm:h-4 sm:w-4 shrink-0 ml-1" />
<div className="-ml-1">
<AlertTitle className="text-xs sm:text-sm">Personal Access Token (Optional)</AlertTitle>
<AlertDescription className="text-[10px] sm:text-xs !pl-0">
A GitHub PAT is only required for private repositories. Public repos work without a
token. Create one from{" "}
<a
href="https://github.com/settings/tokens/new?description=surfsense&scopes=repo"
target="_blank"
rel="noopener noreferrer"
className="font-medium underline underline-offset-4"
>
GitHub Settings
</a>{" "}
if needed.
</AlertDescription>
</div>
</Alert>
{/* Connector Name */} {/* Connector Name */}
<div className="rounded-xl border border-border bg-slate-400/5 dark:bg-white/5 p-3 sm:p-6 space-y-3 sm:space-y-4"> <div className="rounded-xl border border-border bg-slate-400/5 dark:bg-white/5 p-3 sm:p-6 space-y-3 sm:space-y-4">
<div className="space-y-2"> <div className="space-y-2">
@ -105,7 +126,7 @@ export const GithubConfig: FC<GithubConfigProps> = ({
<div className="space-y-2"> <div className="space-y-2">
<Label className="flex items-center gap-2 text-xs sm:text-sm"> <Label className="flex items-center gap-2 text-xs sm:text-sm">
<KeyRound className="h-4 w-4" /> <KeyRound className="h-4 w-4" />
GitHub Personal Access Token GitHub Personal Access Token (optional)
</Label> </Label>
<Input <Input
type="password" type="password"

View file

@ -206,9 +206,10 @@ export const ConnectorEditView: FC<ConnectorEditViewProps> = ({
{/* Date range selector and periodic sync - only shown for indexable connectors */} {/* Date range selector and periodic sync - only shown for indexable connectors */}
{connector.is_indexable && ( {connector.is_indexable && (
<> <>
{/* Date range selector - not shown for Google Drive (uses folder selection) or Webcrawler (uses config) */} {/* Date range selector - not shown for Google Drive, Webcrawler, or GitHub (indexes full repo snapshots) */}
{connector.connector_type !== "GOOGLE_DRIVE_CONNECTOR" && {connector.connector_type !== "GOOGLE_DRIVE_CONNECTOR" &&
connector.connector_type !== "WEBCRAWLER_CONNECTOR" && ( connector.connector_type !== "WEBCRAWLER_CONNECTOR" &&
connector.connector_type !== "GITHUB_CONNECTOR" && (
<DateRangeSelector <DateRangeSelector
startDate={startDate} startDate={startDate}
endDate={endDate} endDate={endDate}

View file

@ -151,9 +151,10 @@ export const IndexingConfigurationView: FC<IndexingConfigurationViewProps> = ({
{/* Date range selector and periodic sync - only shown for indexable connectors */} {/* Date range selector and periodic sync - only shown for indexable connectors */}
{connector?.is_indexable && ( {connector?.is_indexable && (
<> <>
{/* Date range selector - not shown for Google Drive (uses folder selection) or Webcrawler (uses config) */} {/* Date range selector - not shown for Google Drive, Webcrawler, or GitHub (indexes full repo snapshots) */}
{config.connectorType !== "GOOGLE_DRIVE_CONNECTOR" && {config.connectorType !== "GOOGLE_DRIVE_CONNECTOR" &&
config.connectorType !== "WEBCRAWLER_CONNECTOR" && ( config.connectorType !== "WEBCRAWLER_CONNECTOR" &&
config.connectorType !== "GITHUB_CONNECTOR" && (
<DateRangeSelector <DateRangeSelector
startDate={startDate} startDate={startDate}
endDate={endDate} endDate={endDate}