mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-04-26 17:26:23 +02:00
refactor: Update GitHub connector to use gitingest CLI
- Refactored GitHubConnector to utilize gitingest CLI via subprocess, improving performance and avoiding async issues with Celery.
- Updated ingestion method to handle repository digests more efficiently, including error handling for subprocess execution.
- Adjusted GitHub indexer to call the new synchronous ingestion method.
- Clarified documentation regarding the optional nature of the Personal Access Token for public repositories.
This commit is contained in:
parent
49b8a46d10
commit
35888144eb
8 changed files with 221 additions and 256 deletions
|
|
@ -24,11 +24,6 @@
|
|||
"enabled": true,
|
||||
"status": "warning",
|
||||
"statusMessage": "Some requests may be blocked if not using Firecrawl."
|
||||
},
|
||||
"GITHUB_CONNECTOR": {
|
||||
"enabled": false,
|
||||
"status": "maintenance",
|
||||
"statusMessage": "Rework in progress."
|
||||
}
|
||||
},
|
||||
"globalSettings": {
|
||||
|
|
|
|||
|
|
@ -96,6 +96,7 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
|
|||
repo_full_names: repoList,
|
||||
},
|
||||
is_indexable: true,
|
||||
is_active: true,
|
||||
last_indexed_at: null,
|
||||
periodic_indexing_enabled: periodicEnabled,
|
||||
indexing_frequency_minutes: periodicEnabled ? parseInt(frequencyMinutes, 10) : null,
|
||||
|
|
@ -119,16 +120,16 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
|
|||
<AlertTitle className="text-xs sm:text-sm">Personal Access Token (Optional)</AlertTitle>
|
||||
<AlertDescription className="text-[10px] sm:text-xs !pl-0">
|
||||
A GitHub PAT is only required for private repositories. Public repos work without a
|
||||
token. Create one from{" "}
|
||||
token. {" "}
|
||||
<a
|
||||
href="https://github.com/settings/tokens"
|
||||
href="https://github.com/settings/tokens/new?description=surfsense&scopes=repo"
|
||||
target="_blank"
|
||||
rel="noopener noreferrer"
|
||||
className="font-medium underline underline-offset-4"
|
||||
>
|
||||
GitHub Settings
|
||||
Get your token
|
||||
</a>{" "}
|
||||
if needed.
|
||||
.
|
||||
</AlertDescription>
|
||||
</div>
|
||||
</Alert>
|
||||
|
|
@ -324,20 +325,21 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
|
|||
<div>
|
||||
<h3 className="text-sm sm:text-base font-semibold mb-2">How it works</h3>
|
||||
<p className="text-[10px] sm:text-xs text-muted-foreground">
|
||||
The GitHub connector uses a Personal Access Token (PAT) to authenticate with the
|
||||
GitHub API. You provide a comma-separated list of repository full names (e.g.,
|
||||
"owner/repo1, owner/repo2") that you want to index. The connector indexes relevant
|
||||
files (code, markdown, text) from the selected repositories.
|
||||
The GitHub connector ingests entire repositories in one pass using gitingest,
|
||||
making it highly efficient. Provide a comma-separated list of repository full
|
||||
names (e.g., "owner/repo1, owner/repo2") to index.
|
||||
</p>
|
||||
<ul className="mt-2 list-disc pl-5 text-[10px] sm:text-xs text-muted-foreground space-y-1">
|
||||
<li>
|
||||
The connector indexes files based on common code and documentation extensions.
|
||||
<strong>Public repos:</strong> No authentication required.
|
||||
</li>
|
||||
<li>Large files (over 1MB) are skipped during indexing.</li>
|
||||
<li>Only specified repositories are indexed.</li>
|
||||
<li>
|
||||
Indexing runs periodically (check connector settings for frequency) to keep
|
||||
content up-to-date.
|
||||
<strong>Private repos:</strong> Requires a GitHub Personal Access Token (PAT).
|
||||
</li>
|
||||
<li>Indexes code, documentation, and configuration files.</li>
|
||||
<li>Large files (over 5MB) and binary files are automatically skipped.</li>
|
||||
<li>
|
||||
Periodic sync detects changes and only re-indexes when content has changed.
|
||||
</li>
|
||||
</ul>
|
||||
</div>
|
||||
|
|
@ -348,19 +350,23 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
|
|||
<Alert className="bg-slate-400/5 dark:bg-white/5 border-slate-400/20 mb-4">
|
||||
<Info className="h-3 w-3 sm:h-4 sm:w-4" />
|
||||
<AlertTitle className="text-[10px] sm:text-xs">
|
||||
Personal Access Token Required
|
||||
Personal Access Token (Optional)
|
||||
</AlertTitle>
|
||||
<AlertDescription className="text-[9px] sm:text-[10px]">
|
||||
You'll need a GitHub PAT with the appropriate scopes (e.g., 'repo') to fetch
|
||||
repositories. The PAT will be stored securely to enable indexing.
|
||||
A GitHub PAT is only needed for <strong>private repositories</strong>. Public
|
||||
repos can be indexed without authentication. If you need to access private
|
||||
repos, create a PAT with the 'repo' scope.
|
||||
</AlertDescription>
|
||||
</Alert>
|
||||
|
||||
<div className="space-y-4 sm:space-y-6">
|
||||
<div>
|
||||
<h4 className="text-[10px] sm:text-xs font-medium mb-2">
|
||||
Step 1: Generate GitHub PAT
|
||||
For Private Repositories Only: Generate GitHub PAT
|
||||
</h4>
|
||||
<p className="text-[10px] sm:text-xs text-muted-foreground mb-2">
|
||||
Skip this step if you're only indexing public repositories.
|
||||
</p>
|
||||
<ol className="list-decimal pl-5 space-y-2 text-[10px] sm:text-xs text-muted-foreground">
|
||||
<li>
|
||||
Go to your GitHub{" "}
|
||||
|
|
@ -375,46 +381,36 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
|
|||
</li>
|
||||
<li>
|
||||
Click on <strong>Personal access tokens</strong>, then choose{" "}
|
||||
<strong>Tokens (classic)</strong> or <strong>Fine-grained tokens</strong>{" "}
|
||||
(recommended if available).
|
||||
<strong>Tokens (classic)</strong> or <strong>Fine-grained tokens</strong>.
|
||||
</li>
|
||||
<li>
|
||||
Click <strong>Generate new token</strong> (and choose the appropriate type).
|
||||
Click <strong>Generate new token</strong>.
|
||||
</li>
|
||||
<li>Give your token a descriptive name (e.g., "SurfSense Connector").</li>
|
||||
<li>Set an expiration date for the token (recommended for security).</li>
|
||||
<li>
|
||||
Under <strong>Select scopes</strong> (for classic tokens) or{" "}
|
||||
<strong>Repository access</strong> (for fine-grained), grant the necessary
|
||||
permissions. At minimum, the <strong>`repo`</strong> scope (or equivalent
|
||||
read access to repositories for fine-grained tokens) is required to read
|
||||
repository content.
|
||||
Grant the <strong>`repo`</strong> scope (for classic tokens) or read access
|
||||
to the specific repositories you want to index (for fine-grained tokens).
|
||||
</li>
|
||||
<li>
|
||||
Click <strong>Generate token</strong>.
|
||||
</li>
|
||||
<li>
|
||||
<strong>Important:</strong> Copy your new PAT immediately. You won't be able
|
||||
to see it again after leaving the page.
|
||||
Click <strong>Generate token</strong> and copy it immediately.
|
||||
</li>
|
||||
</ol>
|
||||
</div>
|
||||
|
||||
<div>
|
||||
<h4 className="text-[10px] sm:text-xs font-medium mb-2">
|
||||
Step 2: Specify repositories
|
||||
Specify Repositories
|
||||
</h4>
|
||||
<p className="text-[10px] sm:text-xs text-muted-foreground mb-3">
|
||||
Enter a comma-separated list of repository full names in the format
|
||||
"owner/repo1, owner/repo2". The connector will index files from only the
|
||||
specified repositories.
|
||||
"owner/repo1, owner/repo2". For example: "facebook/react, vercel/next.js".
|
||||
</p>
|
||||
<Alert className="bg-slate-400/5 dark:bg-white/5 border-slate-400/20">
|
||||
<Info className="h-3 w-3 sm:h-4 sm:w-4" />
|
||||
<AlertTitle className="text-[10px] sm:text-xs">Repository Access</AlertTitle>
|
||||
<AlertTitle className="text-[10px] sm:text-xs">Public vs Private</AlertTitle>
|
||||
<AlertDescription className="text-[9px] sm:text-[10px]">
|
||||
Make sure your PAT has access to all repositories you want to index. Private
|
||||
repositories require appropriate permissions.
|
||||
Public repositories work without a PAT. For private repositories, ensure
|
||||
your PAT has access to the repos you want to index.
|
||||
</AlertDescription>
|
||||
</Alert>
|
||||
</div>
|
||||
|
|
@ -424,36 +420,38 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
|
|||
|
||||
<div className="space-y-4">
|
||||
<div>
|
||||
<h3 className="text-sm sm:text-base font-semibold mb-2">Indexing</h3>
|
||||
<h3 className="text-sm sm:text-base font-semibold mb-2">Quick Start</h3>
|
||||
<ol className="list-decimal pl-5 space-y-2 text-[10px] sm:text-xs text-muted-foreground mb-4">
|
||||
<li>
|
||||
Navigate to the Connector Dashboard and select the <strong>GitHub</strong>{" "}
|
||||
Connector.
|
||||
Enter the <strong>Repository Names</strong> you want to index (e.g.,
|
||||
"facebook/react, vercel/next.js").
|
||||
</li>
|
||||
<li>
|
||||
Enter your <strong>GitHub Personal Access Token</strong> in the form field.
|
||||
<strong>(Optional)</strong> Add a GitHub PAT if indexing private repositories.
|
||||
</li>
|
||||
<li>
|
||||
Enter a comma-separated list of <strong>Repository Names</strong> (e.g.,
|
||||
"owner/repo1, owner/repo2").
|
||||
Click <strong>Connect GitHub</strong> to start indexing.
|
||||
</li>
|
||||
<li>
|
||||
Click <strong>Connect</strong> to establish the connection.
|
||||
Enable <strong>Periodic Sync</strong> to automatically detect and index
|
||||
changes.
|
||||
</li>
|
||||
<li>Once connected, your GitHub repositories will be indexed automatically.</li>
|
||||
</ol>
|
||||
|
||||
<Alert className="bg-slate-400/5 dark:bg-white/5 border-slate-400/20">
|
||||
<Info className="h-3 w-3 sm:h-4 sm:w-4" />
|
||||
<AlertTitle className="text-[10px] sm:text-xs">What Gets Indexed</AlertTitle>
|
||||
<AlertDescription className="text-[9px] sm:text-[10px]">
|
||||
<p className="mb-2">The GitHub connector indexes the following data:</p>
|
||||
<p className="mb-2">The GitHub connector indexes:</p>
|
||||
<ul className="list-disc pl-5 space-y-1">
|
||||
<li>Code files from selected repositories</li>
|
||||
<li>README files and Markdown documentation</li>
|
||||
<li>Common text-based file formats</li>
|
||||
<li>Repository metadata and structure</li>
|
||||
<li>All code files (Python, JavaScript, TypeScript, etc.)</li>
|
||||
<li>Documentation (README, Markdown, text files)</li>
|
||||
<li>Configuration files (JSON, YAML, TOML, etc.)</li>
|
||||
<li>Repository structure and file tree</li>
|
||||
</ul>
|
||||
<p className="mt-2">
|
||||
Binary files, images, and build artifacts are automatically excluded.
|
||||
</p>
|
||||
</AlertDescription>
|
||||
</Alert>
|
||||
</div>
|
||||
|
|
|
|||
|
|
@ -1,8 +1,9 @@
|
|||
"use client";
|
||||
|
||||
import { KeyRound } from "lucide-react";
|
||||
import { Info, KeyRound } from "lucide-react";
|
||||
import type { FC } from "react";
|
||||
import { useEffect, useState } from "react";
|
||||
import { Alert, AlertDescription, AlertTitle } from "@/components/ui/alert";
|
||||
import { Badge } from "@/components/ui/badge";
|
||||
import { Input } from "@/components/ui/input";
|
||||
import { Label } from "@/components/ui/label";
|
||||
|
|
@ -79,6 +80,26 @@ export const GithubConfig: FC<GithubConfigProps> = ({
|
|||
|
||||
return (
|
||||
<div className="space-y-6">
|
||||
<Alert className="bg-slate-400/5 dark:bg-white/5 border-slate-400/20 p-2 sm:p-3 flex items-center [&>svg]:relative [&>svg]:left-0 [&>svg]:top-0 [&>svg+div]:translate-y-0">
|
||||
<Info className="h-3 w-3 sm:h-4 sm:w-4 shrink-0 ml-1" />
|
||||
<div className="-ml-1">
|
||||
<AlertTitle className="text-xs sm:text-sm">Personal Access Token (Optional)</AlertTitle>
|
||||
<AlertDescription className="text-[10px] sm:text-xs !pl-0">
|
||||
A GitHub PAT is only required for private repositories. Public repos work without a
|
||||
token. Create one from{" "}
|
||||
<a
|
||||
href="https://github.com/settings/tokens/new?description=surfsense&scopes=repo"
|
||||
target="_blank"
|
||||
rel="noopener noreferrer"
|
||||
className="font-medium underline underline-offset-4"
|
||||
>
|
||||
GitHub Settings
|
||||
</a>{" "}
|
||||
if needed.
|
||||
</AlertDescription>
|
||||
</div>
|
||||
</Alert>
|
||||
|
||||
{/* Connector Name */}
|
||||
<div className="rounded-xl border border-border bg-slate-400/5 dark:bg-white/5 p-3 sm:p-6 space-y-3 sm:space-y-4">
|
||||
<div className="space-y-2">
|
||||
|
|
@ -105,7 +126,7 @@ export const GithubConfig: FC<GithubConfigProps> = ({
|
|||
<div className="space-y-2">
|
||||
<Label className="flex items-center gap-2 text-xs sm:text-sm">
|
||||
<KeyRound className="h-4 w-4" />
|
||||
GitHub Personal Access Token
|
||||
GitHub Personal Access Token (optional)
|
||||
</Label>
|
||||
<Input
|
||||
type="password"
|
||||
|
|
|
|||
|
|
@ -206,9 +206,10 @@ export const ConnectorEditView: FC<ConnectorEditViewProps> = ({
|
|||
{/* Date range selector and periodic sync - only shown for indexable connectors */}
|
||||
{connector.is_indexable && (
|
||||
<>
|
||||
{/* Date range selector - not shown for Google Drive (uses folder selection) or Webcrawler (uses config) */}
|
||||
{/* Date range selector - not shown for Google Drive, Webcrawler, or GitHub (indexes full repo snapshots) */}
|
||||
{connector.connector_type !== "GOOGLE_DRIVE_CONNECTOR" &&
|
||||
connector.connector_type !== "WEBCRAWLER_CONNECTOR" && (
|
||||
connector.connector_type !== "WEBCRAWLER_CONNECTOR" &&
|
||||
connector.connector_type !== "GITHUB_CONNECTOR" && (
|
||||
<DateRangeSelector
|
||||
startDate={startDate}
|
||||
endDate={endDate}
|
||||
|
|
|
|||
|
|
@ -151,9 +151,10 @@ export const IndexingConfigurationView: FC<IndexingConfigurationViewProps> = ({
|
|||
{/* Date range selector and periodic sync - only shown for indexable connectors */}
|
||||
{connector?.is_indexable && (
|
||||
<>
|
||||
{/* Date range selector - not shown for Google Drive (uses folder selection) or Webcrawler (uses config) */}
|
||||
{/* Date range selector - not shown for Google Drive, Webcrawler, or GitHub (indexes full repo snapshots) */}
|
||||
{config.connectorType !== "GOOGLE_DRIVE_CONNECTOR" &&
|
||||
config.connectorType !== "WEBCRAWLER_CONNECTOR" && (
|
||||
config.connectorType !== "WEBCRAWLER_CONNECTOR" &&
|
||||
config.connectorType !== "GITHUB_CONNECTOR" && (
|
||||
<DateRangeSelector
|
||||
startDate={startDate}
|
||||
endDate={endDate}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue