mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-25 19:15:18 +02:00
Merge remote-tracking branch 'upstream/dev' into feat/inbox
This commit is contained in:
commit
200f8732f6
13 changed files with 657 additions and 796 deletions
|
|
@ -67,63 +67,42 @@ NEW_VIEWER_PERMISSIONS = [
|
|||
def upgrade():
|
||||
connection = op.get_bind()
|
||||
|
||||
# Step 1: For each search space, get the Editor role ID and Admin role ID
|
||||
search_spaces = connection.execute(
|
||||
sa.text("SELECT id FROM searchspaces")
|
||||
).fetchall()
|
||||
# Step 1: Move all memberships from Admin roles to corresponding Editor roles (BULK)
|
||||
# Uses a subquery to match Admin->Editor within the same search space
|
||||
connection.execute(
|
||||
sa.text("""
|
||||
UPDATE search_space_memberships m
|
||||
SET role_id = e.id
|
||||
FROM search_space_roles a
|
||||
JOIN search_space_roles e ON a.search_space_id = e.search_space_id
|
||||
WHERE m.role_id = a.id
|
||||
AND a.name = 'Admin'
|
||||
AND e.name = 'Editor'
|
||||
""")
|
||||
)
|
||||
|
||||
for (ss_id,) in search_spaces:
|
||||
# Get Admin and Editor role IDs for this search space
|
||||
admin_role = connection.execute(
|
||||
sa.text("""
|
||||
SELECT id FROM search_space_roles
|
||||
WHERE search_space_id = :ss_id AND name = 'Admin'
|
||||
"""),
|
||||
{"ss_id": ss_id},
|
||||
).fetchone()
|
||||
# Step 2: Move all invites from Admin roles to corresponding Editor roles (BULK)
|
||||
connection.execute(
|
||||
sa.text("""
|
||||
UPDATE search_space_invites i
|
||||
SET role_id = e.id
|
||||
FROM search_space_roles a
|
||||
JOIN search_space_roles e ON a.search_space_id = e.search_space_id
|
||||
WHERE i.role_id = a.id
|
||||
AND a.name = 'Admin'
|
||||
AND e.name = 'Editor'
|
||||
""")
|
||||
)
|
||||
|
||||
editor_role = connection.execute(
|
||||
sa.text("""
|
||||
SELECT id FROM search_space_roles
|
||||
WHERE search_space_id = :ss_id AND name = 'Editor'
|
||||
"""),
|
||||
{"ss_id": ss_id},
|
||||
).fetchone()
|
||||
# Step 3: Delete all Admin roles (BULK)
|
||||
connection.execute(
|
||||
sa.text("""
|
||||
DELETE FROM search_space_roles
|
||||
WHERE name = 'Admin' AND is_system_role = TRUE
|
||||
""")
|
||||
)
|
||||
|
||||
if admin_role and editor_role:
|
||||
admin_role_id = admin_role[0]
|
||||
editor_role_id = editor_role[0]
|
||||
|
||||
# Step 2: Move all memberships from Admin to Editor
|
||||
connection.execute(
|
||||
sa.text("""
|
||||
UPDATE search_space_memberships
|
||||
SET role_id = :editor_role_id
|
||||
WHERE role_id = :admin_role_id
|
||||
"""),
|
||||
{"editor_role_id": editor_role_id, "admin_role_id": admin_role_id},
|
||||
)
|
||||
|
||||
# Step 3: Move all invites from Admin to Editor
|
||||
connection.execute(
|
||||
sa.text("""
|
||||
UPDATE search_space_invites
|
||||
SET role_id = :editor_role_id
|
||||
WHERE role_id = :admin_role_id
|
||||
"""),
|
||||
{"editor_role_id": editor_role_id, "admin_role_id": admin_role_id},
|
||||
)
|
||||
|
||||
# Step 4: Delete the Admin role
|
||||
connection.execute(
|
||||
sa.text("""
|
||||
DELETE FROM search_space_roles
|
||||
WHERE id = :admin_role_id
|
||||
"""),
|
||||
{"admin_role_id": admin_role_id},
|
||||
)
|
||||
|
||||
# Step 5: Update Editor permissions for all search spaces
|
||||
# Step 4: Update Editor permissions for all search spaces (BULK)
|
||||
editor_perms_literal = (
|
||||
"ARRAY[" + ",".join(f"'{p}'" for p in NEW_EDITOR_PERMISSIONS) + "]::TEXT[]"
|
||||
)
|
||||
|
|
@ -136,7 +115,7 @@ def upgrade():
|
|||
""")
|
||||
)
|
||||
|
||||
# Step 6: Update Viewer permissions for all search spaces
|
||||
# Step 5: Update Viewer permissions for all search spaces (BULK)
|
||||
viewer_perms_literal = (
|
||||
"ARRAY[" + ",".join(f"'{p}'" for p in NEW_VIEWER_PERMISSIONS) + "]::TEXT[]"
|
||||
)
|
||||
|
|
|
|||
|
|
@ -1,296 +1,236 @@
|
|||
import base64
|
||||
import logging
|
||||
from typing import Any
|
||||
"""
|
||||
GitHub connector using gitingest CLI for efficient repository digestion.
|
||||
|
||||
from github3 import exceptions as github_exceptions, login as github_login
|
||||
from github3.exceptions import ForbiddenError, NotFoundError
|
||||
from github3.repos.contents import Contents
|
||||
This connector uses subprocess to call gitingest CLI, completely isolating
|
||||
it from any Python event loop/async complexity that can cause hangs in Celery.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import subprocess
|
||||
import tempfile
|
||||
from dataclasses import dataclass
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# List of common code file extensions to target
|
||||
CODE_EXTENSIONS = {
|
||||
".py",
|
||||
".js",
|
||||
".jsx",
|
||||
".ts",
|
||||
".tsx",
|
||||
".java",
|
||||
".c",
|
||||
".cpp",
|
||||
".h",
|
||||
".hpp",
|
||||
".cs",
|
||||
".go",
|
||||
".rb",
|
||||
".php",
|
||||
".swift",
|
||||
".kt",
|
||||
".scala",
|
||||
".rs",
|
||||
".m",
|
||||
".sh",
|
||||
".bash",
|
||||
".ps1",
|
||||
".lua",
|
||||
".pl",
|
||||
".pm",
|
||||
".r",
|
||||
".dart",
|
||||
".sql",
|
||||
}
|
||||
# Maximum file size in bytes (5MB)
|
||||
MAX_FILE_SIZE = 5 * 1024 * 1024
|
||||
|
||||
# List of common documentation/text file extensions
|
||||
DOC_EXTENSIONS = {
|
||||
".md",
|
||||
".txt",
|
||||
".rst",
|
||||
".adoc",
|
||||
".html",
|
||||
".htm",
|
||||
".xml",
|
||||
".json",
|
||||
".yaml",
|
||||
".yml",
|
||||
".toml",
|
||||
}
|
||||
|
||||
# Maximum file size in bytes (e.g., 1MB)
|
||||
MAX_FILE_SIZE = 1 * 1024 * 1024
|
||||
@dataclass
|
||||
class RepositoryDigest:
|
||||
"""Represents a digested repository from gitingest."""
|
||||
|
||||
repo_full_name: str
|
||||
summary: str
|
||||
tree: str
|
||||
content: str
|
||||
branch: str | None = None
|
||||
|
||||
@property
|
||||
def full_digest(self) -> str:
|
||||
"""Returns the complete digest with tree and content."""
|
||||
return f"# Repository: {self.repo_full_name}\n\n## File Structure\n\n{self.tree}\n\n## File Contents\n\n{self.content}"
|
||||
|
||||
@property
|
||||
def estimated_tokens(self) -> int:
|
||||
"""Rough estimate of tokens (1 token ≈ 4 characters)."""
|
||||
return len(self.full_digest) // 4
|
||||
|
||||
|
||||
class GitHubConnector:
|
||||
"""Connector for interacting with the GitHub API."""
|
||||
"""
|
||||
Connector for ingesting GitHub repositories using gitingest CLI.
|
||||
|
||||
# Directories to skip during file traversal
|
||||
SKIPPED_DIRS = {
|
||||
# Version control
|
||||
".git",
|
||||
# Dependencies
|
||||
"node_modules",
|
||||
"vendor",
|
||||
# Build artifacts / Caches
|
||||
"build",
|
||||
"dist",
|
||||
"target",
|
||||
"__pycache__",
|
||||
# Virtual environments
|
||||
"venv",
|
||||
".venv",
|
||||
"env",
|
||||
# IDE/Editor config
|
||||
".vscode",
|
||||
".idea",
|
||||
".project",
|
||||
".settings",
|
||||
# Temporary / Logs
|
||||
"tmp",
|
||||
"logs",
|
||||
# Add other project-specific irrelevant directories if needed
|
||||
}
|
||||
Uses subprocess to run gitingest, which avoids all async/event loop
|
||||
issues that can occur when mixing gitingest with Celery workers.
|
||||
"""
|
||||
|
||||
def __init__(self, token: str):
|
||||
def __init__(self, token: str | None = None):
|
||||
"""
|
||||
Initializes the GitHub connector.
|
||||
Initialize the GitHub connector.
|
||||
|
||||
Args:
|
||||
token: GitHub Personal Access Token (PAT).
|
||||
token: Optional GitHub Personal Access Token (PAT).
|
||||
Only required for private repositories.
|
||||
"""
|
||||
if not token:
|
||||
raise ValueError("GitHub token cannot be empty.")
|
||||
try:
|
||||
self.gh = github_login(token=token)
|
||||
# Try a simple authenticated call to check token validity
|
||||
self.gh.me()
|
||||
logger.info("Successfully authenticated with GitHub API.")
|
||||
except (github_exceptions.AuthenticationFailed, ForbiddenError) as e:
|
||||
logger.error(f"GitHub authentication failed: {e}")
|
||||
raise ValueError("Invalid GitHub token or insufficient permissions.") from e
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to initialize GitHub client: {e}")
|
||||
raise e
|
||||
self.token = token if token and token.strip() else None
|
||||
if self.token:
|
||||
logger.info("GitHub connector initialized with authentication token.")
|
||||
else:
|
||||
logger.info("GitHub connector initialized without token (public repos only).")
|
||||
|
||||
def get_user_repositories(self) -> list[dict[str, Any]]:
|
||||
"""Fetches repositories accessible by the authenticated user."""
|
||||
repos_data = []
|
||||
try:
|
||||
# type='owner' fetches repos owned by the user
|
||||
# type='member' fetches repos the user is a collaborator on (including orgs)
|
||||
# type='all' fetches both
|
||||
for repo in self.gh.repositories(type="all", sort="updated"):
|
||||
repos_data.append(
|
||||
{
|
||||
"id": repo.id,
|
||||
"name": repo.name,
|
||||
"full_name": repo.full_name,
|
||||
"private": repo.private,
|
||||
"url": repo.html_url,
|
||||
"description": repo.description or "",
|
||||
"last_updated": repo.updated_at if repo.updated_at else None,
|
||||
}
|
||||
)
|
||||
logger.info(f"Fetched {len(repos_data)} repositories.")
|
||||
return repos_data
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to fetch GitHub repositories: {e}")
|
||||
return [] # Return empty list on error
|
||||
|
||||
def get_repository_files(
|
||||
self, repo_full_name: str, path: str = ""
|
||||
) -> list[dict[str, Any]]:
|
||||
def ingest_repository(
|
||||
self,
|
||||
repo_full_name: str,
|
||||
branch: str | None = None,
|
||||
max_file_size: int = MAX_FILE_SIZE,
|
||||
) -> RepositoryDigest | None:
|
||||
"""
|
||||
Recursively fetches details of relevant files (code, docs) within a repository path.
|
||||
Ingest a repository using gitingest CLI via subprocess.
|
||||
|
||||
This approach completely isolates gitingest from Python's event loop,
|
||||
avoiding any async/Celery conflicts.
|
||||
|
||||
Args:
|
||||
repo_full_name: The full name of the repository (e.g., 'owner/repo').
|
||||
path: The starting path within the repository (default is root).
|
||||
branch: Optional specific branch or tag to ingest.
|
||||
max_file_size: Maximum file size in bytes to include.
|
||||
|
||||
Returns:
|
||||
A list of dictionaries, each containing file details (path, sha, url, size).
|
||||
Returns an empty list if the repository or path is not found or on error.
|
||||
RepositoryDigest or None if ingestion fails.
|
||||
"""
|
||||
files_list = []
|
||||
repo_url = f"https://github.com/{repo_full_name}"
|
||||
|
||||
logger.info(f"Starting gitingest CLI for repository: {repo_full_name}")
|
||||
|
||||
try:
|
||||
owner, repo_name = repo_full_name.split("/")
|
||||
repo = self.gh.repository(owner, repo_name)
|
||||
if not repo:
|
||||
logger.warning(f"Repository '{repo_full_name}' not found.")
|
||||
return []
|
||||
contents = repo.directory_contents(
|
||||
directory_path=path
|
||||
) # Use directory_contents for clarity
|
||||
# Create a temporary file for output
|
||||
with tempfile.NamedTemporaryFile(
|
||||
mode="w", suffix=".txt", delete=False
|
||||
) as tmp_file:
|
||||
output_path = tmp_file.name
|
||||
|
||||
# contents returns a list of tuples (name, content_obj)
|
||||
for _item_name, content_item in contents:
|
||||
if not isinstance(content_item, Contents):
|
||||
continue
|
||||
# Build the gitingest CLI command
|
||||
cmd = [
|
||||
"gitingest",
|
||||
repo_url,
|
||||
"--output", output_path,
|
||||
"--max-size", str(max_file_size),
|
||||
# Common exclude patterns
|
||||
"-e", "node_modules/*",
|
||||
"-e", "vendor/*",
|
||||
"-e", ".git/*",
|
||||
"-e", "__pycache__/*",
|
||||
"-e", "dist/*",
|
||||
"-e", "build/*",
|
||||
"-e", "*.lock",
|
||||
"-e", "package-lock.json",
|
||||
]
|
||||
|
||||
if content_item.type == "dir":
|
||||
# Check if the directory name is in the skipped list
|
||||
if content_item.name in self.SKIPPED_DIRS:
|
||||
logger.debug(f"Skipping directory: {content_item.path}")
|
||||
continue # Skip recursion for this directory
|
||||
# Add branch if specified
|
||||
if branch:
|
||||
cmd.extend(["--branch", branch])
|
||||
|
||||
# Recursively fetch contents of subdirectory
|
||||
files_list.extend(
|
||||
self.get_repository_files(
|
||||
repo_full_name, path=content_item.path
|
||||
)
|
||||
)
|
||||
elif content_item.type == "file":
|
||||
# Check if the file extension is relevant and size is within limits
|
||||
file_extension = (
|
||||
"." + content_item.name.split(".")[-1].lower()
|
||||
if "." in content_item.name
|
||||
else ""
|
||||
)
|
||||
is_code = file_extension in CODE_EXTENSIONS
|
||||
is_doc = file_extension in DOC_EXTENSIONS
|
||||
# Set up environment with token if provided
|
||||
env = os.environ.copy()
|
||||
if self.token:
|
||||
env["GITHUB_TOKEN"] = self.token
|
||||
|
||||
if (is_code or is_doc) and content_item.size <= MAX_FILE_SIZE:
|
||||
files_list.append(
|
||||
{
|
||||
"path": content_item.path,
|
||||
"sha": content_item.sha,
|
||||
"url": content_item.html_url,
|
||||
"size": content_item.size,
|
||||
"type": "code" if is_code else "doc",
|
||||
}
|
||||
)
|
||||
elif content_item.size > MAX_FILE_SIZE:
|
||||
logger.debug(
|
||||
f"Skipping large file: {content_item.path} ({content_item.size} bytes)"
|
||||
)
|
||||
else:
|
||||
logger.debug(
|
||||
f"Skipping irrelevant file type: {content_item.path}"
|
||||
)
|
||||
logger.info(f"Running gitingest CLI: {' '.join(cmd[:5])}...")
|
||||
|
||||
except (NotFoundError, ForbiddenError) as e:
|
||||
logger.warning(f"Cannot access path '{path}' in '{repo_full_name}': {e}")
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
f"Failed to get files for {repo_full_name} at path '{path}': {e}"
|
||||
# Run gitingest as subprocess with timeout
|
||||
result = subprocess.run(
|
||||
cmd,
|
||||
env=env,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=900, # 5 minute timeout
|
||||
)
|
||||
# Return what we have collected so far in case of partial failure
|
||||
|
||||
return files_list
|
||||
if result.returncode != 0:
|
||||
logger.error(f"gitingest failed: {result.stderr}")
|
||||
# Clean up temp file
|
||||
if os.path.exists(output_path):
|
||||
os.unlink(output_path)
|
||||
return None
|
||||
|
||||
def get_file_content(self, repo_full_name: str, file_path: str) -> str | None:
|
||||
# Read the output file
|
||||
if not os.path.exists(output_path):
|
||||
logger.error("gitingest did not create output file")
|
||||
return None
|
||||
|
||||
with open(output_path, encoding="utf-8") as f:
|
||||
full_content = f.read()
|
||||
|
||||
# Clean up temp file
|
||||
os.unlink(output_path)
|
||||
|
||||
if not full_content or not full_content.strip():
|
||||
logger.warning(f"No content retrieved from repository: {repo_full_name}")
|
||||
return None
|
||||
|
||||
# Parse the gitingest output
|
||||
# The output format is: summary + tree + content
|
||||
# We'll extract what we can
|
||||
digest = RepositoryDigest(
|
||||
repo_full_name=repo_full_name,
|
||||
summary=f"Repository: {repo_full_name}",
|
||||
tree="", # gitingest CLI combines everything into one file
|
||||
content=full_content,
|
||||
branch=branch,
|
||||
)
|
||||
|
||||
logger.info(
|
||||
f"Successfully ingested {repo_full_name}: "
|
||||
f"~{digest.estimated_tokens} estimated tokens"
|
||||
)
|
||||
return digest
|
||||
|
||||
except subprocess.TimeoutExpired:
|
||||
logger.error(f"gitingest timed out for repository: {repo_full_name}")
|
||||
return None
|
||||
except FileNotFoundError:
|
||||
logger.error(
|
||||
"gitingest CLI not found. Falling back to Python library."
|
||||
)
|
||||
# Fall back to Python library
|
||||
return self._ingest_with_python_library(repo_full_name, branch, max_file_size)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to ingest repository {repo_full_name}: {e}")
|
||||
return None
|
||||
|
||||
def _ingest_with_python_library(
|
||||
self,
|
||||
repo_full_name: str,
|
||||
branch: str | None = None,
|
||||
max_file_size: int = MAX_FILE_SIZE,
|
||||
) -> RepositoryDigest | None:
|
||||
"""
|
||||
Fetches the decoded content of a specific file.
|
||||
|
||||
Args:
|
||||
repo_full_name: The full name of the repository (e.g., 'owner/repo').
|
||||
file_path: The path to the file within the repository.
|
||||
|
||||
Returns:
|
||||
The decoded file content as a string, or None if fetching fails or file is too large.
|
||||
Fallback: Ingest using the Python library directly.
|
||||
"""
|
||||
from gitingest import ingest
|
||||
|
||||
repo_url = f"https://github.com/{repo_full_name}"
|
||||
|
||||
logger.info(f"Using Python gitingest library for: {repo_full_name}")
|
||||
|
||||
try:
|
||||
owner, repo_name = repo_full_name.split("/")
|
||||
repo = self.gh.repository(owner, repo_name)
|
||||
if not repo:
|
||||
logger.warning(
|
||||
f"Repository '{repo_full_name}' not found when fetching file '{file_path}'."
|
||||
)
|
||||
kwargs = {
|
||||
"max_file_size": max_file_size,
|
||||
"exclude_patterns": [
|
||||
"node_modules/*",
|
||||
"vendor/*",
|
||||
".git/*",
|
||||
"__pycache__/*",
|
||||
"dist/*",
|
||||
"build/*",
|
||||
"*.lock",
|
||||
"package-lock.json",
|
||||
],
|
||||
"include_gitignored": False,
|
||||
"include_submodules": False,
|
||||
}
|
||||
|
||||
if self.token:
|
||||
kwargs["token"] = self.token
|
||||
if branch:
|
||||
kwargs["branch"] = branch
|
||||
|
||||
summary, tree, content = ingest(repo_url, **kwargs)
|
||||
|
||||
if not content or not content.strip():
|
||||
logger.warning(f"No content from {repo_full_name}")
|
||||
return None
|
||||
|
||||
content_item = repo.file_contents(
|
||||
path=file_path
|
||||
) # Use file_contents for clarity
|
||||
|
||||
if (
|
||||
not content_item
|
||||
or not isinstance(content_item, Contents)
|
||||
or content_item.type != "file"
|
||||
):
|
||||
logger.warning(
|
||||
f"File '{file_path}' not found or is not a file in '{repo_full_name}'."
|
||||
)
|
||||
return None
|
||||
|
||||
if content_item.size > MAX_FILE_SIZE:
|
||||
logger.warning(
|
||||
f"File '{file_path}' in '{repo_full_name}' exceeds max size ({content_item.size} > {MAX_FILE_SIZE}). Skipping content fetch."
|
||||
)
|
||||
return None
|
||||
|
||||
# Content is base64 encoded
|
||||
if content_item.content:
|
||||
try:
|
||||
decoded_content = base64.b64decode(content_item.content).decode(
|
||||
"utf-8"
|
||||
)
|
||||
return decoded_content
|
||||
except UnicodeDecodeError:
|
||||
logger.warning(
|
||||
f"Could not decode file '{file_path}' in '{repo_full_name}' as UTF-8. Trying with 'latin-1'."
|
||||
)
|
||||
try:
|
||||
# Try a fallback encoding
|
||||
decoded_content = base64.b64decode(content_item.content).decode(
|
||||
"latin-1"
|
||||
)
|
||||
return decoded_content
|
||||
except Exception as decode_err:
|
||||
logger.error(
|
||||
f"Failed to decode file '{file_path}' with fallback encoding: {decode_err}"
|
||||
)
|
||||
return None # Give up if fallback fails
|
||||
else:
|
||||
logger.warning(
|
||||
f"No content returned for file '{file_path}' in '{repo_full_name}'. It might be empty."
|
||||
)
|
||||
return "" # Return empty string for empty files
|
||||
|
||||
except (NotFoundError, ForbiddenError) as e:
|
||||
logger.warning(
|
||||
f"Cannot access file '{file_path}' in '{repo_full_name}': {e}"
|
||||
return RepositoryDigest(
|
||||
repo_full_name=repo_full_name,
|
||||
summary=summary,
|
||||
tree=tree,
|
||||
content=content,
|
||||
branch=branch,
|
||||
)
|
||||
return None
|
||||
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
f"Failed to get content for file '{file_path}' in '{repo_full_name}': {e}"
|
||||
)
|
||||
logger.error(f"Python library failed for {repo_full_name}: {e}")
|
||||
return None
|
||||
|
|
|
|||
|
|
@ -1,5 +1,8 @@
|
|||
"""
|
||||
GitHub connector indexer.
|
||||
GitHub connector indexer using gitingest.
|
||||
|
||||
This indexer processes entire repository digests in one pass, dramatically
|
||||
reducing LLM API calls compared to the previous file-by-file approach.
|
||||
"""
|
||||
|
||||
from datetime import UTC, datetime
|
||||
|
|
@ -8,7 +11,7 @@ from sqlalchemy.exc import SQLAlchemyError
|
|||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.config import config
|
||||
from app.connectors.github_connector import GitHubConnector
|
||||
from app.connectors.github_connector import GitHubConnector, RepositoryDigest
|
||||
from app.db import Document, DocumentType, SearchSourceConnectorType
|
||||
from app.services.llm_service import get_user_long_context_llm
|
||||
from app.services.task_logging_service import TaskLoggingService
|
||||
|
|
@ -26,43 +29,55 @@ from .base import (
|
|||
logger,
|
||||
)
|
||||
|
||||
# Maximum tokens for a single digest before splitting
|
||||
# Most LLMs can handle 128k+ tokens now, but we'll be conservative
|
||||
MAX_DIGEST_CHARS = 500_000 # ~125k tokens
|
||||
|
||||
|
||||
async def index_github_repos(
|
||||
session: AsyncSession,
|
||||
connector_id: int,
|
||||
search_space_id: int,
|
||||
user_id: str,
|
||||
start_date: str | None = None,
|
||||
end_date: str | None = None,
|
||||
start_date: str | None = None, # Ignored - GitHub indexes full repo snapshots
|
||||
end_date: str | None = None, # Ignored - GitHub indexes full repo snapshots
|
||||
update_last_indexed: bool = True,
|
||||
) -> tuple[int, str | None]:
|
||||
"""
|
||||
Index code and documentation files from accessible GitHub repositories.
|
||||
Index GitHub repositories using gitingest for efficient processing.
|
||||
|
||||
This function ingests entire repositories as digests, generates a single
|
||||
summary per repository, and chunks the content for vector storage.
|
||||
|
||||
Note: The start_date and end_date parameters are accepted for API compatibility
|
||||
but are IGNORED. GitHub repositories are indexed as complete snapshots since
|
||||
gitingest captures the current state of the entire codebase.
|
||||
|
||||
Args:
|
||||
session: Database session
|
||||
connector_id: ID of the GitHub connector
|
||||
search_space_id: ID of the search space to store documents in
|
||||
user_id: ID of the user
|
||||
start_date: Start date for filtering (YYYY-MM-DD format) - Note: GitHub indexing processes all files regardless of dates
|
||||
end_date: End date for filtering (YYYY-MM-DD format) - Note: GitHub indexing processes all files regardless of dates
|
||||
start_date: Ignored - kept for API compatibility
|
||||
end_date: Ignored - kept for API compatibility
|
||||
update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
|
||||
|
||||
Returns:
|
||||
Tuple containing (number of documents indexed, error message or None)
|
||||
"""
|
||||
# Note: start_date and end_date are intentionally unused
|
||||
_ = start_date, end_date
|
||||
task_logger = TaskLoggingService(session, search_space_id)
|
||||
|
||||
# Log task start
|
||||
log_entry = await task_logger.log_task_start(
|
||||
task_name="github_repos_indexing",
|
||||
source="connector_indexing_task",
|
||||
message=f"Starting GitHub repositories indexing for connector {connector_id}",
|
||||
message=f"Starting GitHub repositories indexing for connector {connector_id} (using gitingest)",
|
||||
metadata={
|
||||
"connector_id": connector_id,
|
||||
"user_id": str(user_id),
|
||||
"start_date": start_date,
|
||||
"end_date": end_date,
|
||||
"method": "gitingest",
|
||||
},
|
||||
)
|
||||
|
||||
|
|
@ -93,19 +108,11 @@ async def index_github_repos(
|
|||
f"Connector with ID {connector_id} not found or is not a GitHub connector",
|
||||
)
|
||||
|
||||
# 2. Get the GitHub PAT and selected repositories from the connector config
|
||||
github_pat = connector.config.get("GITHUB_PAT")
|
||||
# 2. Get the GitHub PAT (optional) and selected repositories from the connector config
|
||||
# PAT is only required for private repositories - public repos work without it
|
||||
github_pat = connector.config.get("GITHUB_PAT") # Can be None or empty
|
||||
repo_full_names_to_index = connector.config.get("repo_full_names")
|
||||
|
||||
if not github_pat:
|
||||
await task_logger.log_task_failure(
|
||||
log_entry,
|
||||
f"GitHub Personal Access Token (PAT) not found in connector config for connector {connector_id}",
|
||||
"Missing GitHub PAT",
|
||||
{"error_type": "MissingToken"},
|
||||
)
|
||||
return 0, "GitHub Personal Access Token (PAT) not found in connector config"
|
||||
|
||||
if not repo_full_names_to_index or not isinstance(
|
||||
repo_full_names_to_index, list
|
||||
):
|
||||
|
|
@ -117,10 +124,16 @@ async def index_github_repos(
|
|||
)
|
||||
return 0, "'repo_full_names' not found or is not a list in connector config"
|
||||
|
||||
# 3. Initialize GitHub connector client
|
||||
# Log whether we're using authentication
|
||||
if github_pat:
|
||||
logger.info("Using GitHub PAT for authentication (private repos supported)")
|
||||
else:
|
||||
logger.info("No GitHub PAT provided - only public repositories can be indexed")
|
||||
|
||||
# 3. Initialize GitHub connector with gitingest backend
|
||||
await task_logger.log_task_progress(
|
||||
log_entry,
|
||||
f"Initializing GitHub client for connector {connector_id}",
|
||||
f"Initializing gitingest-based GitHub client for connector {connector_id}",
|
||||
{
|
||||
"stage": "client_initialization",
|
||||
"repo_count": len(repo_full_names_to_index),
|
||||
|
|
@ -138,258 +151,57 @@ async def index_github_repos(
|
|||
)
|
||||
return 0, f"Failed to initialize GitHub client: {e!s}"
|
||||
|
||||
# 4. Validate selected repositories
|
||||
# 4. Process each repository with gitingest
|
||||
await task_logger.log_task_progress(
|
||||
log_entry,
|
||||
f"Starting indexing for {len(repo_full_names_to_index)} selected repositories",
|
||||
f"Starting gitingest processing for {len(repo_full_names_to_index)} repositories",
|
||||
{
|
||||
"stage": "repo_processing",
|
||||
"repo_count": len(repo_full_names_to_index),
|
||||
"start_date": start_date,
|
||||
"end_date": end_date,
|
||||
},
|
||||
)
|
||||
|
||||
logger.info(
|
||||
f"Starting indexing for {len(repo_full_names_to_index)} selected repositories."
|
||||
f"Starting gitingest indexing for {len(repo_full_names_to_index)} repositories."
|
||||
)
|
||||
if start_date and end_date:
|
||||
logger.info(
|
||||
f"Date range requested: {start_date} to {end_date} (Note: GitHub indexing processes all files regardless of dates)"
|
||||
)
|
||||
|
||||
# 6. Iterate through selected repositories and index files
|
||||
for repo_full_name in repo_full_names_to_index:
|
||||
if not repo_full_name or not isinstance(repo_full_name, str):
|
||||
logger.warning(f"Skipping invalid repository entry: {repo_full_name}")
|
||||
continue
|
||||
|
||||
logger.info(f"Processing repository: {repo_full_name}")
|
||||
try:
|
||||
files_to_index = github_client.get_repository_files(repo_full_name)
|
||||
if not files_to_index:
|
||||
logger.info(
|
||||
f"No indexable files found in repository: {repo_full_name}"
|
||||
)
|
||||
continue
|
||||
logger.info(f"Ingesting repository: {repo_full_name}")
|
||||
|
||||
logger.info(
|
||||
f"Found {len(files_to_index)} files to process in {repo_full_name}"
|
||||
try:
|
||||
# Run gitingest via subprocess (isolated from event loop)
|
||||
# Using to_thread to not block the async database operations
|
||||
import asyncio
|
||||
|
||||
digest = await asyncio.to_thread(
|
||||
github_client.ingest_repository, repo_full_name
|
||||
)
|
||||
|
||||
for file_info in files_to_index:
|
||||
file_path = file_info.get("path")
|
||||
file_url = file_info.get("url")
|
||||
file_sha = file_info.get("sha")
|
||||
file_type = file_info.get("type") # 'code' or 'doc'
|
||||
full_path_key = f"{repo_full_name}/{file_path}"
|
||||
|
||||
if not file_path or not file_url or not file_sha:
|
||||
logger.warning(
|
||||
f"Skipping file with missing info in {repo_full_name}: {file_info}"
|
||||
)
|
||||
continue
|
||||
|
||||
# Get file content
|
||||
file_content = github_client.get_file_content(
|
||||
repo_full_name, file_path
|
||||
if not digest:
|
||||
logger.warning(
|
||||
f"No digest returned for repository: {repo_full_name}"
|
||||
)
|
||||
errors.append(f"No digest for {repo_full_name}")
|
||||
continue
|
||||
|
||||
if file_content is None:
|
||||
logger.warning(
|
||||
f"Could not retrieve content for {full_path_key}. Skipping."
|
||||
)
|
||||
continue # Skip if content fetch failed
|
||||
# Process the digest and create documents
|
||||
docs_created = await _process_repository_digest(
|
||||
session=session,
|
||||
digest=digest,
|
||||
search_space_id=search_space_id,
|
||||
user_id=user_id,
|
||||
task_logger=task_logger,
|
||||
log_entry=log_entry,
|
||||
)
|
||||
|
||||
# Generate unique identifier hash for this GitHub file
|
||||
unique_identifier_hash = generate_unique_identifier_hash(
|
||||
DocumentType.GITHUB_CONNECTOR, file_sha, search_space_id
|
||||
)
|
||||
|
||||
# Generate content hash
|
||||
content_hash = generate_content_hash(file_content, search_space_id)
|
||||
|
||||
# Check if document with this unique identifier already exists
|
||||
existing_document = await check_document_by_unique_identifier(
|
||||
session, unique_identifier_hash
|
||||
)
|
||||
|
||||
if existing_document:
|
||||
# Document exists - check if content has changed
|
||||
if existing_document.content_hash == content_hash:
|
||||
logger.info(
|
||||
f"Document for GitHub file {full_path_key} unchanged. Skipping."
|
||||
)
|
||||
continue
|
||||
else:
|
||||
# Content has changed - update the existing document
|
||||
logger.info(
|
||||
f"Content changed for GitHub file {full_path_key}. Updating document."
|
||||
)
|
||||
|
||||
# Generate summary with metadata
|
||||
user_llm = await get_user_long_context_llm(
|
||||
session, user_id, search_space_id
|
||||
)
|
||||
if user_llm:
|
||||
file_extension = (
|
||||
file_path.split(".")[-1]
|
||||
if "." in file_path
|
||||
else None
|
||||
)
|
||||
document_metadata = {
|
||||
"file_path": full_path_key,
|
||||
"repository": repo_full_name,
|
||||
"file_type": file_extension or "unknown",
|
||||
"document_type": "GitHub Repository File",
|
||||
"connector_type": "GitHub",
|
||||
}
|
||||
(
|
||||
summary_content,
|
||||
summary_embedding,
|
||||
) = await generate_document_summary(
|
||||
file_content, user_llm, document_metadata
|
||||
)
|
||||
else:
|
||||
summary_content = f"GitHub file: {full_path_key}\n\n{file_content[:1000]}..."
|
||||
summary_embedding = (
|
||||
config.embedding_model_instance.embed(
|
||||
summary_content
|
||||
)
|
||||
)
|
||||
|
||||
# Chunk the content
|
||||
try:
|
||||
if hasattr(config, "code_chunker_instance"):
|
||||
chunks_data = [
|
||||
await create_document_chunks(file_content)
|
||||
][0]
|
||||
else:
|
||||
chunks_data = await create_document_chunks(
|
||||
file_content
|
||||
)
|
||||
except Exception as chunk_err:
|
||||
logger.error(
|
||||
f"Failed to chunk file {full_path_key}: {chunk_err}"
|
||||
)
|
||||
continue
|
||||
|
||||
# Update existing document
|
||||
existing_document.title = f"GitHub - {full_path_key}"
|
||||
existing_document.content = summary_content
|
||||
existing_document.content_hash = content_hash
|
||||
existing_document.embedding = summary_embedding
|
||||
existing_document.document_metadata = {
|
||||
"file_path": file_path,
|
||||
"file_sha": file_sha,
|
||||
"file_url": file_url,
|
||||
"repository": repo_full_name,
|
||||
"indexed_at": datetime.now(UTC).strftime(
|
||||
"%Y-%m-%d %H:%M:%S"
|
||||
),
|
||||
}
|
||||
existing_document.chunks = chunks_data
|
||||
existing_document.updated_at = get_current_timestamp()
|
||||
|
||||
logger.info(
|
||||
f"Successfully updated GitHub file {full_path_key}"
|
||||
)
|
||||
continue
|
||||
|
||||
# Document doesn't exist - create new one
|
||||
# Generate summary with metadata
|
||||
user_llm = await get_user_long_context_llm(
|
||||
session, user_id, search_space_id
|
||||
)
|
||||
if user_llm:
|
||||
# Extract file extension from file path
|
||||
file_extension = (
|
||||
file_path.split(".")[-1] if "." in file_path else None
|
||||
)
|
||||
document_metadata = {
|
||||
"file_path": full_path_key,
|
||||
"repository": repo_full_name,
|
||||
"file_type": file_extension or "unknown",
|
||||
"document_type": "GitHub Repository File",
|
||||
"connector_type": "GitHub",
|
||||
}
|
||||
(
|
||||
summary_content,
|
||||
summary_embedding,
|
||||
) = await generate_document_summary(
|
||||
file_content, user_llm, document_metadata
|
||||
)
|
||||
else:
|
||||
# Fallback to simple summary if no LLM configured
|
||||
summary_content = (
|
||||
f"GitHub file: {full_path_key}\n\n{file_content[:1000]}..."
|
||||
)
|
||||
summary_embedding = config.embedding_model_instance.embed(
|
||||
summary_content
|
||||
)
|
||||
|
||||
# Chunk the content
|
||||
try:
|
||||
chunks_data = [await create_document_chunks(file_content)][0]
|
||||
|
||||
# Use code chunker if available, otherwise regular chunker
|
||||
if hasattr(config, "code_chunker_instance"):
|
||||
chunks_data = [
|
||||
{
|
||||
"content": chunk.text,
|
||||
"embedding": config.embedding_model_instance.embed(
|
||||
chunk.text
|
||||
),
|
||||
}
|
||||
for chunk in config.code_chunker_instance.chunk(
|
||||
file_content
|
||||
)
|
||||
]
|
||||
else:
|
||||
chunks_data = await create_document_chunks(file_content)
|
||||
|
||||
except Exception as chunk_err:
|
||||
logger.error(
|
||||
f"Failed to chunk file {full_path_key}: {chunk_err}"
|
||||
)
|
||||
errors.append(
|
||||
f"Chunking failed for {full_path_key}: {chunk_err}"
|
||||
)
|
||||
continue # Skip this file if chunking fails
|
||||
|
||||
doc_metadata = {
|
||||
"repository_full_name": repo_full_name,
|
||||
"file_path": file_path,
|
||||
"full_path": full_path_key, # For easier lookup
|
||||
"url": file_url,
|
||||
"sha": file_sha,
|
||||
"type": file_type,
|
||||
"indexed_at": datetime.now(UTC).isoformat(),
|
||||
}
|
||||
|
||||
# Create new document
|
||||
logger.info(f"Creating new document for file: {full_path_key}")
|
||||
document = Document(
|
||||
title=f"GitHub - {file_path}",
|
||||
document_type=DocumentType.GITHUB_CONNECTOR,
|
||||
document_metadata=doc_metadata,
|
||||
content=summary_content, # Store summary
|
||||
content_hash=content_hash,
|
||||
unique_identifier_hash=unique_identifier_hash,
|
||||
embedding=summary_embedding,
|
||||
search_space_id=search_space_id,
|
||||
chunks=chunks_data, # Associate chunks directly
|
||||
updated_at=get_current_timestamp(),
|
||||
)
|
||||
session.add(document)
|
||||
documents_processed += 1
|
||||
|
||||
# Batch commit every 10 documents
|
||||
if documents_processed % 10 == 0:
|
||||
logger.info(
|
||||
f"Committing batch: {documents_processed} GitHub files processed so far"
|
||||
)
|
||||
await session.commit()
|
||||
documents_processed += docs_created
|
||||
logger.info(
|
||||
f"Created {docs_created} documents from repository: {repo_full_name}"
|
||||
)
|
||||
|
||||
except Exception as repo_err:
|
||||
logger.error(
|
||||
|
|
@ -397,11 +209,11 @@ async def index_github_repos(
|
|||
)
|
||||
errors.append(f"Failed processing {repo_full_name}: {repo_err}")
|
||||
|
||||
# Final commit for any remaining documents not yet committed in batches
|
||||
logger.info(f"Final commit: Total {documents_processed} GitHub files processed")
|
||||
# Final commit
|
||||
await session.commit()
|
||||
logger.info(
|
||||
f"Finished GitHub indexing for connector {connector_id}. Processed {documents_processed} files."
|
||||
f"Finished GitHub indexing for connector {connector_id}. "
|
||||
f"Created {documents_processed} documents."
|
||||
)
|
||||
|
||||
# Log success
|
||||
|
|
@ -412,6 +224,7 @@ async def index_github_repos(
|
|||
"documents_processed": documents_processed,
|
||||
"errors_count": len(errors),
|
||||
"repo_count": len(repo_full_names_to_index),
|
||||
"method": "gitingest",
|
||||
},
|
||||
)
|
||||
|
||||
|
|
@ -428,6 +241,7 @@ async def index_github_repos(
|
|||
)
|
||||
errors.append(f"Database error: {db_err}")
|
||||
return documents_processed, "; ".join(errors) if errors else str(db_err)
|
||||
|
||||
except Exception as e:
|
||||
await session.rollback()
|
||||
await task_logger.log_task_failure(
|
||||
|
|
@ -445,3 +259,173 @@ async def index_github_repos(
|
|||
|
||||
error_message = "; ".join(errors) if errors else None
|
||||
return documents_processed, error_message
|
||||
|
||||
|
||||
async def _process_repository_digest(
|
||||
session: AsyncSession,
|
||||
digest: RepositoryDigest,
|
||||
search_space_id: int,
|
||||
user_id: str,
|
||||
task_logger: TaskLoggingService,
|
||||
log_entry,
|
||||
) -> int:
|
||||
"""
|
||||
Process a repository digest and create documents.
|
||||
|
||||
For each repository, we create:
|
||||
1. One main document with the repository summary
|
||||
2. Chunks from the full digest content for granular search
|
||||
|
||||
Args:
|
||||
session: Database session
|
||||
digest: The repository digest from gitingest
|
||||
search_space_id: ID of the search space
|
||||
user_id: ID of the user
|
||||
task_logger: Task logging service
|
||||
log_entry: Current log entry
|
||||
|
||||
Returns:
|
||||
Number of documents created
|
||||
"""
|
||||
repo_full_name = digest.repo_full_name
|
||||
documents_created = 0
|
||||
|
||||
# Generate unique identifier based on repo name and content hash
|
||||
# This allows updates when repo content changes
|
||||
full_content = digest.full_digest
|
||||
content_hash = generate_content_hash(full_content, search_space_id)
|
||||
|
||||
# Use repo name as the unique identifier (one document per repo)
|
||||
unique_identifier_hash = generate_unique_identifier_hash(
|
||||
DocumentType.GITHUB_CONNECTOR, repo_full_name, search_space_id
|
||||
)
|
||||
|
||||
# Check if document with this unique identifier already exists
|
||||
existing_document = await check_document_by_unique_identifier(
|
||||
session, unique_identifier_hash
|
||||
)
|
||||
|
||||
if existing_document:
|
||||
# Document exists - check if content has changed
|
||||
if existing_document.content_hash == content_hash:
|
||||
logger.info(
|
||||
f"Repository {repo_full_name} unchanged. Skipping."
|
||||
)
|
||||
return 0
|
||||
else:
|
||||
logger.info(
|
||||
f"Content changed for repository {repo_full_name}. Updating document."
|
||||
)
|
||||
# Delete existing document to replace with new one
|
||||
await session.delete(existing_document)
|
||||
await session.flush()
|
||||
|
||||
# Generate summary using LLM (ONE call per repository!)
|
||||
user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
|
||||
|
||||
document_metadata = {
|
||||
"repository": repo_full_name,
|
||||
"document_type": "GitHub Repository",
|
||||
"connector_type": "GitHub",
|
||||
"ingestion_method": "gitingest",
|
||||
"file_tree": digest.tree[:2000] if len(digest.tree) > 2000 else digest.tree,
|
||||
"estimated_tokens": digest.estimated_tokens,
|
||||
}
|
||||
|
||||
if user_llm:
|
||||
# Prepare content for summarization
|
||||
# Include tree structure and truncated content if too large
|
||||
summary_content = digest.full_digest
|
||||
if len(summary_content) > MAX_DIGEST_CHARS:
|
||||
# Truncate but keep the tree and beginning of content
|
||||
summary_content = (
|
||||
f"# Repository: {repo_full_name}\n\n"
|
||||
f"## File Structure\n\n{digest.tree}\n\n"
|
||||
f"## File Contents (truncated)\n\n{digest.content[:MAX_DIGEST_CHARS - len(digest.tree) - 200]}..."
|
||||
)
|
||||
|
||||
summary_text, summary_embedding = await generate_document_summary(
|
||||
summary_content, user_llm, document_metadata
|
||||
)
|
||||
else:
|
||||
# Fallback to simple summary if no LLM configured
|
||||
summary_text = (
|
||||
f"# GitHub Repository: {repo_full_name}\n\n"
|
||||
f"## Summary\n{digest.summary}\n\n"
|
||||
f"## File Structure\n{digest.tree[:3000]}"
|
||||
)
|
||||
summary_embedding = config.embedding_model_instance.embed(summary_text)
|
||||
|
||||
# Chunk the full digest content for granular search
|
||||
try:
|
||||
# Use the content (not the summary) for chunking
|
||||
# This preserves file-level granularity in search
|
||||
chunks_data = await create_document_chunks(digest.content)
|
||||
except Exception as chunk_err:
|
||||
logger.error(
|
||||
f"Failed to chunk repository {repo_full_name}: {chunk_err}"
|
||||
)
|
||||
# Fall back to a simpler chunking approach
|
||||
chunks_data = await _simple_chunk_content(digest.content)
|
||||
|
||||
# Create the document
|
||||
doc_metadata = {
|
||||
"repository_full_name": repo_full_name,
|
||||
"url": f"https://github.com/{repo_full_name}",
|
||||
"branch": digest.branch,
|
||||
"ingestion_method": "gitingest",
|
||||
"file_tree": digest.tree,
|
||||
"gitingest_summary": digest.summary,
|
||||
"estimated_tokens": digest.estimated_tokens,
|
||||
"indexed_at": datetime.now(UTC).isoformat(),
|
||||
}
|
||||
|
||||
document = Document(
|
||||
title=f"GitHub Repository: {repo_full_name}",
|
||||
document_type=DocumentType.GITHUB_CONNECTOR,
|
||||
document_metadata=doc_metadata,
|
||||
content=summary_text,
|
||||
content_hash=content_hash,
|
||||
unique_identifier_hash=unique_identifier_hash,
|
||||
embedding=summary_embedding,
|
||||
search_space_id=search_space_id,
|
||||
chunks=chunks_data,
|
||||
updated_at=get_current_timestamp(),
|
||||
)
|
||||
|
||||
session.add(document)
|
||||
documents_created += 1
|
||||
|
||||
logger.info(
|
||||
f"Created document for repository {repo_full_name} "
|
||||
f"with {len(chunks_data)} chunks"
|
||||
)
|
||||
|
||||
return documents_created
|
||||
|
||||
|
||||
async def _simple_chunk_content(content: str, chunk_size: int = 4000) -> list:
|
||||
"""
|
||||
Simple fallback chunking when the regular chunker fails.
|
||||
|
||||
Args:
|
||||
content: The content to chunk
|
||||
chunk_size: Size of each chunk in characters
|
||||
|
||||
Returns:
|
||||
List of chunk dictionaries with content and embedding
|
||||
"""
|
||||
from app.db import Chunk
|
||||
|
||||
chunks = []
|
||||
for i in range(0, len(content), chunk_size):
|
||||
chunk_text = content[i : i + chunk_size]
|
||||
if chunk_text.strip():
|
||||
chunks.append(
|
||||
Chunk(
|
||||
content=chunk_text,
|
||||
embedding=config.embedding_model_instance.embed(chunk_text),
|
||||
)
|
||||
)
|
||||
|
||||
return chunks
|
||||
|
|
|
|||
|
|
@ -530,7 +530,10 @@ def validate_connector_config(
|
|||
# "validators": {},
|
||||
# },
|
||||
"GITHUB_CONNECTOR": {
|
||||
"required": ["GITHUB_PAT", "repo_full_names"],
|
||||
# GITHUB_PAT is optional - only required for private repositories
|
||||
# Public repositories can be indexed without authentication
|
||||
"required": ["repo_full_names"],
|
||||
"optional": ["GITHUB_PAT"], # Optional - only needed for private repos
|
||||
"validators": {
|
||||
"repo_full_names": lambda: validate_list_field(
|
||||
"repo_full_names", "repo_full_names"
|
||||
|
|
|
|||
|
|
@ -60,6 +60,7 @@ dependencies = [
|
|||
"mcp>=1.25.0",
|
||||
"starlette>=0.40.0,<0.51.0",
|
||||
"sse-starlette>=3.1.1,<3.1.2",
|
||||
"gitingest>=0.3.1",
|
||||
]
|
||||
|
||||
[dependency-groups]
|
||||
|
|
|
|||
30
surfsense_backend/uv.lock
generated
30
surfsense_backend/uv.lock
generated
|
|
@ -1945,6 +1945,25 @@ wheels = [
|
|||
{ url = "https://files.pythonhosted.org/packages/61/ad/2394d4fb542574678b0ba342daf734d4d811768da3c2ee0c84d509dcb26c/github3.py-4.0.1-py3-none-any.whl", hash = "sha256:a89af7de25650612d1da2f0609622bcdeb07ee8a45a1c06b2d16a05e4234e753", size = 151800, upload-time = "2023-04-26T17:56:25.015Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "gitingest"
|
||||
version = "0.3.1"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "click" },
|
||||
{ name = "httpx" },
|
||||
{ name = "loguru" },
|
||||
{ name = "pathspec" },
|
||||
{ name = "pydantic" },
|
||||
{ name = "python-dotenv" },
|
||||
{ name = "starlette" },
|
||||
{ name = "tiktoken" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/d6/fe/a915f0c32a3d7920206a677f73c185b3eadf4ec151fb05aedd52e64713f7/gitingest-0.3.1.tar.gz", hash = "sha256:4587cab873d4e08bdb16d612bb153c23e0ce59771a1d57a438239c5e39f05ebf", size = 70681, upload-time = "2025-07-31T13:56:19.845Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/00/15/f200ab2e73287e67d1dce6fbacf421552ae9fbafdc5f0cc8dd0d2fe4fc47/gitingest-0.3.1-py3-none-any.whl", hash = "sha256:8143a5e6a7140ede9f680e13d3931ac07c82ac9bd8bab9ad1fba017c8c1e8666", size = 68343, upload-time = "2025-07-31T13:56:17.729Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "google-api-core"
|
||||
version = "2.25.1"
|
||||
|
|
@ -4460,6 +4479,15 @@ wheels = [
|
|||
{ url = "https://files.pythonhosted.org/packages/39/c2/646d2e93e0af70f4e5359d870a63584dacbc324b54d73e6b3267920ff117/pandas-2.3.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:bb3be958022198531eb7ec2008cfc78c5b1eed51af8600c6c5d9160d89d8d249", size = 13231847, upload-time = "2025-06-05T03:27:51.465Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pathspec"
|
||||
version = "1.0.3"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/4c/b2/bb8e495d5262bfec41ab5cb18f522f1012933347fb5d9e62452d446baca2/pathspec-1.0.3.tar.gz", hash = "sha256:bac5cf97ae2c2876e2d25ebb15078eb04d76e4b98921ee31c6f85ade8b59444d", size = 130841, upload-time = "2026-01-09T15:46:46.009Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/32/2b/121e912bd60eebd623f873fd090de0e84f322972ab25a7f9044c056804ed/pathspec-1.0.3-py3-none-any.whl", hash = "sha256:e80767021c1cc524aa3fb14bedda9c34406591343cc42797b386ce7b9354fb6c", size = 55021, upload-time = "2026-01-09T15:46:44.652Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pdf2image"
|
||||
version = "1.17.0"
|
||||
|
|
@ -6484,6 +6512,7 @@ dependencies = [
|
|||
{ name = "firecrawl-py" },
|
||||
{ name = "flower" },
|
||||
{ name = "github3-py" },
|
||||
{ name = "gitingest" },
|
||||
{ name = "google-api-python-client" },
|
||||
{ name = "google-auth-oauthlib" },
|
||||
{ name = "kokoro" },
|
||||
|
|
@ -6549,6 +6578,7 @@ requires-dist = [
|
|||
{ name = "firecrawl-py", specifier = ">=4.9.0" },
|
||||
{ name = "flower", specifier = ">=2.0.1" },
|
||||
{ name = "github3-py", specifier = "==4.0.1" },
|
||||
{ name = "gitingest", specifier = ">=0.3.1" },
|
||||
{ name = "google-api-python-client", specifier = ">=2.156.0" },
|
||||
{ name = "google-auth-oauthlib", specifier = ">=1.2.1" },
|
||||
{ name = "kokoro", specifier = ">=0.9.4" },
|
||||
|
|
|
|||
|
|
@ -24,11 +24,6 @@
|
|||
"enabled": true,
|
||||
"status": "warning",
|
||||
"statusMessage": "Some requests may be blocked if not using Firecrawl."
|
||||
},
|
||||
"GITHUB_CONNECTOR": {
|
||||
"enabled": false,
|
||||
"status": "maintenance",
|
||||
"statusMessage": "Rework in progress."
|
||||
}
|
||||
},
|
||||
"globalSettings": {
|
||||
|
|
|
|||
|
|
@ -1,17 +1,12 @@
|
|||
"use client";
|
||||
|
||||
import { zodResolver } from "@hookform/resolvers/zod";
|
||||
import { Info } from "lucide-react";
|
||||
import { ExternalLink, Info } from "lucide-react";
|
||||
import Link from "next/link";
|
||||
import type { FC } from "react";
|
||||
import { useRef, useState } from "react";
|
||||
import { useForm } from "react-hook-form";
|
||||
import * as z from "zod";
|
||||
import {
|
||||
Accordion,
|
||||
AccordionContent,
|
||||
AccordionItem,
|
||||
AccordionTrigger,
|
||||
} from "@/components/ui/accordion";
|
||||
import { Alert, AlertDescription, AlertTitle } from "@/components/ui/alert";
|
||||
import { Badge } from "@/components/ui/badge";
|
||||
import {
|
||||
|
|
@ -34,8 +29,6 @@ import {
|
|||
} from "@/components/ui/select";
|
||||
import { Switch } from "@/components/ui/switch";
|
||||
import { EnumConnectorName } from "@/contracts/enums/connector";
|
||||
import { DateRangeSelector } from "../../components/date-range-selector";
|
||||
import { getConnectorBenefits } from "../connector-benefits";
|
||||
import type { ConnectFormProps } from "../index";
|
||||
|
||||
const githubConnectorFormSchema = z.object({
|
||||
|
|
@ -44,10 +37,8 @@ const githubConnectorFormSchema = z.object({
|
|||
}),
|
||||
github_pat: z
|
||||
.string()
|
||||
.min(20, {
|
||||
message: "GitHub Personal Access Token seems too short.",
|
||||
})
|
||||
.refine((pat) => pat.startsWith("ghp_") || pat.startsWith("github_pat_"), {
|
||||
.optional()
|
||||
.refine((pat) => !pat || pat.startsWith("ghp_") || pat.startsWith("github_pat_"), {
|
||||
message: "GitHub PAT should start with 'ghp_' or 'github_pat_'",
|
||||
}),
|
||||
repo_full_names: z.string().min(1, {
|
||||
|
|
@ -59,8 +50,6 @@ type GithubConnectorFormValues = z.infer<typeof githubConnectorFormSchema>;
|
|||
|
||||
export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting }) => {
|
||||
const isSubmittingRef = useRef(false);
|
||||
const [startDate, setStartDate] = useState<Date | undefined>(undefined);
|
||||
const [endDate, setEndDate] = useState<Date | undefined>(undefined);
|
||||
const [periodicEnabled, setPeriodicEnabled] = useState(false);
|
||||
const [frequencyMinutes, setFrequencyMinutes] = useState("1440");
|
||||
const form = useForm<GithubConnectorFormValues>({
|
||||
|
|
@ -94,16 +83,18 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
|
|||
name: values.name,
|
||||
connector_type: EnumConnectorName.GITHUB_CONNECTOR,
|
||||
config: {
|
||||
GITHUB_PAT: values.github_pat,
|
||||
GITHUB_PAT: values.github_pat || null, // Optional - only for private repos
|
||||
repo_full_names: repoList,
|
||||
},
|
||||
is_indexable: true,
|
||||
is_active: true,
|
||||
last_indexed_at: null,
|
||||
periodic_indexing_enabled: periodicEnabled,
|
||||
indexing_frequency_minutes: periodicEnabled ? parseInt(frequencyMinutes, 10) : null,
|
||||
next_scheduled_at: null,
|
||||
startDate,
|
||||
endDate,
|
||||
// GitHub indexes full repo snapshots - no date range needed
|
||||
startDate: undefined,
|
||||
endDate: undefined,
|
||||
periodicEnabled,
|
||||
frequencyMinutes,
|
||||
});
|
||||
|
|
@ -117,18 +108,19 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
|
|||
<Alert className="bg-slate-400/5 dark:bg-white/5 border-slate-400/20 p-2 sm:p-3 flex items-center [&>svg]:relative [&>svg]:left-0 [&>svg]:top-0 [&>svg+div]:translate-y-0">
|
||||
<Info className="h-3 w-3 sm:h-4 sm:w-4 shrink-0 ml-1" />
|
||||
<div className="-ml-1">
|
||||
<AlertTitle className="text-xs sm:text-sm">Personal Access Token Required</AlertTitle>
|
||||
<AlertTitle className="text-xs sm:text-sm">Personal Access Token (Optional)</AlertTitle>
|
||||
<AlertDescription className="text-[10px] sm:text-xs !pl-0">
|
||||
You'll need a GitHub Personal Access Token to use this connector. You can create one
|
||||
from{" "}
|
||||
A GitHub PAT is only required for private repositories. Public repos work without a
|
||||
token.{" "}
|
||||
<a
|
||||
href="https://github.com/settings/tokens"
|
||||
href="https://github.com/settings/tokens/new?description=surfsense&scopes=repo"
|
||||
target="_blank"
|
||||
rel="noopener noreferrer"
|
||||
className="font-medium underline underline-offset-4"
|
||||
className="font-medium underline underline-offset-4 inline-flex items-center gap-1.5"
|
||||
>
|
||||
GitHub Settings
|
||||
</a>
|
||||
Get your token
|
||||
<ExternalLink className="h-3 w-3 sm:h-4 sm:w-4" />
|
||||
</a>{" "}
|
||||
</AlertDescription>
|
||||
</div>
|
||||
</Alert>
|
||||
|
|
@ -167,7 +159,10 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
|
|||
name="github_pat"
|
||||
render={({ field }) => (
|
||||
<FormItem>
|
||||
<FormLabel className="text-xs sm:text-sm">GitHub Personal Access Token</FormLabel>
|
||||
<FormLabel className="text-xs sm:text-sm">
|
||||
GitHub Personal Access Token{" "}
|
||||
<span className="text-muted-foreground font-normal">(optional)</span>
|
||||
</FormLabel>
|
||||
<FormControl>
|
||||
<Input
|
||||
type="password"
|
||||
|
|
@ -178,8 +173,8 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
|
|||
/>
|
||||
</FormControl>
|
||||
<FormDescription className="text-[10px] sm:text-xs">
|
||||
Your GitHub PAT will be encrypted and stored securely. It typically starts with
|
||||
"ghp_" or "github_pat_".
|
||||
Only required for private repositories. Leave empty if indexing public repos
|
||||
only.
|
||||
</FormDescription>
|
||||
<FormMessage />
|
||||
</FormItem>
|
||||
|
|
@ -225,15 +220,9 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
|
|||
|
||||
{/* Indexing Configuration */}
|
||||
<div className="space-y-4 pt-4 border-t border-slate-400/20">
|
||||
<h3 className="text-sm sm:text-base font-medium">Indexing Configuration</h3>
|
||||
<h3 className="text-sm sm:text-base font-medium">Sync Configuration</h3>
|
||||
|
||||
{/* Date Range Selector */}
|
||||
<DateRangeSelector
|
||||
startDate={startDate}
|
||||
endDate={endDate}
|
||||
onStartDateChange={setStartDate}
|
||||
onEndDateChange={setEndDate}
|
||||
/>
|
||||
{/* Note: No date range for GitHub - it indexes full repo snapshots */}
|
||||
|
||||
{/* Periodic Sync Config */}
|
||||
<div className="rounded-xl bg-slate-400/5 dark:bg-white/5 p-3 sm:p-6">
|
||||
|
|
@ -301,169 +290,18 @@ export const GithubConnectForm: FC<ConnectFormProps> = ({ onSubmit, isSubmitting
|
|||
</Form>
|
||||
</div>
|
||||
|
||||
{/* What you get section */}
|
||||
{getConnectorBenefits(EnumConnectorName.GITHUB_CONNECTOR) && (
|
||||
<div className="rounded-xl border border-border bg-slate-400/5 dark:bg-white/5 px-3 sm:px-6 py-4 space-y-2">
|
||||
<h4 className="text-xs sm:text-sm font-medium">What you get with GitHub integration:</h4>
|
||||
<ul className="list-disc pl-5 text-[10px] sm:text-xs text-muted-foreground space-y-1">
|
||||
{getConnectorBenefits(EnumConnectorName.GITHUB_CONNECTOR)?.map((benefit) => (
|
||||
<li key={benefit}>{benefit}</li>
|
||||
))}
|
||||
</ul>
|
||||
</div>
|
||||
)}
|
||||
|
||||
{/* Documentation Section */}
|
||||
<Accordion
|
||||
type="single"
|
||||
collapsible
|
||||
className="w-full border border-border rounded-xl bg-slate-400/5 dark:bg-white/5"
|
||||
>
|
||||
<AccordionItem value="documentation" className="border-0">
|
||||
<AccordionTrigger className="text-sm sm:text-base font-medium px-3 sm:px-6 no-underline hover:no-underline">
|
||||
Documentation
|
||||
</AccordionTrigger>
|
||||
<AccordionContent className="px-3 sm:px-6 pb-3 sm:pb-6 space-y-6">
|
||||
<div>
|
||||
<h3 className="text-sm sm:text-base font-semibold mb-2">How it works</h3>
|
||||
<p className="text-[10px] sm:text-xs text-muted-foreground">
|
||||
The GitHub connector uses a Personal Access Token (PAT) to authenticate with the
|
||||
GitHub API. You provide a comma-separated list of repository full names (e.g.,
|
||||
"owner/repo1, owner/repo2") that you want to index. The connector indexes relevant
|
||||
files (code, markdown, text) from the selected repositories.
|
||||
</p>
|
||||
<ul className="mt-2 list-disc pl-5 text-[10px] sm:text-xs text-muted-foreground space-y-1">
|
||||
<li>
|
||||
The connector indexes files based on common code and documentation extensions.
|
||||
</li>
|
||||
<li>Large files (over 1MB) are skipped during indexing.</li>
|
||||
<li>Only specified repositories are indexed.</li>
|
||||
<li>
|
||||
Indexing runs periodically (check connector settings for frequency) to keep
|
||||
content up-to-date.
|
||||
</li>
|
||||
</ul>
|
||||
</div>
|
||||
|
||||
<div className="space-y-4">
|
||||
<div>
|
||||
<h3 className="text-sm sm:text-base font-semibold mb-2">Authorization</h3>
|
||||
<Alert className="bg-slate-400/5 dark:bg-white/5 border-slate-400/20 mb-4">
|
||||
<Info className="h-3 w-3 sm:h-4 sm:w-4" />
|
||||
<AlertTitle className="text-[10px] sm:text-xs">
|
||||
Personal Access Token Required
|
||||
</AlertTitle>
|
||||
<AlertDescription className="text-[9px] sm:text-[10px]">
|
||||
You'll need a GitHub PAT with the appropriate scopes (e.g., 'repo') to fetch
|
||||
repositories. The PAT will be stored securely to enable indexing.
|
||||
</AlertDescription>
|
||||
</Alert>
|
||||
|
||||
<div className="space-y-4 sm:space-y-6">
|
||||
<div>
|
||||
<h4 className="text-[10px] sm:text-xs font-medium mb-2">
|
||||
Step 1: Generate GitHub PAT
|
||||
</h4>
|
||||
<ol className="list-decimal pl-5 space-y-2 text-[10px] sm:text-xs text-muted-foreground">
|
||||
<li>
|
||||
Go to your GitHub{" "}
|
||||
<a
|
||||
href="https://github.com/settings/tokens"
|
||||
target="_blank"
|
||||
rel="noopener noreferrer"
|
||||
className="font-medium underline underline-offset-4"
|
||||
>
|
||||
Developer settings
|
||||
</a>
|
||||
</li>
|
||||
<li>
|
||||
Click on <strong>Personal access tokens</strong>, then choose{" "}
|
||||
<strong>Tokens (classic)</strong> or <strong>Fine-grained tokens</strong>{" "}
|
||||
(recommended if available).
|
||||
</li>
|
||||
<li>
|
||||
Click <strong>Generate new token</strong> (and choose the appropriate type).
|
||||
</li>
|
||||
<li>Give your token a descriptive name (e.g., "SurfSense Connector").</li>
|
||||
<li>Set an expiration date for the token (recommended for security).</li>
|
||||
<li>
|
||||
Under <strong>Select scopes</strong> (for classic tokens) or{" "}
|
||||
<strong>Repository access</strong> (for fine-grained), grant the necessary
|
||||
permissions. At minimum, the <strong>`repo`</strong> scope (or equivalent
|
||||
read access to repositories for fine-grained tokens) is required to read
|
||||
repository content.
|
||||
</li>
|
||||
<li>
|
||||
Click <strong>Generate token</strong>.
|
||||
</li>
|
||||
<li>
|
||||
<strong>Important:</strong> Copy your new PAT immediately. You won't be able
|
||||
to see it again after leaving the page.
|
||||
</li>
|
||||
</ol>
|
||||
</div>
|
||||
|
||||
<div>
|
||||
<h4 className="text-[10px] sm:text-xs font-medium mb-2">
|
||||
Step 2: Specify repositories
|
||||
</h4>
|
||||
<p className="text-[10px] sm:text-xs text-muted-foreground mb-3">
|
||||
Enter a comma-separated list of repository full names in the format
|
||||
"owner/repo1, owner/repo2". The connector will index files from only the
|
||||
specified repositories.
|
||||
</p>
|
||||
<Alert className="bg-slate-400/5 dark:bg-white/5 border-slate-400/20">
|
||||
<Info className="h-3 w-3 sm:h-4 sm:w-4" />
|
||||
<AlertTitle className="text-[10px] sm:text-xs">Repository Access</AlertTitle>
|
||||
<AlertDescription className="text-[9px] sm:text-[10px]">
|
||||
Make sure your PAT has access to all repositories you want to index. Private
|
||||
repositories require appropriate permissions.
|
||||
</AlertDescription>
|
||||
</Alert>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div className="space-y-4">
|
||||
<div>
|
||||
<h3 className="text-sm sm:text-base font-semibold mb-2">Indexing</h3>
|
||||
<ol className="list-decimal pl-5 space-y-2 text-[10px] sm:text-xs text-muted-foreground mb-4">
|
||||
<li>
|
||||
Navigate to the Connector Dashboard and select the <strong>GitHub</strong>{" "}
|
||||
Connector.
|
||||
</li>
|
||||
<li>
|
||||
Enter your <strong>GitHub Personal Access Token</strong> in the form field.
|
||||
</li>
|
||||
<li>
|
||||
Enter a comma-separated list of <strong>Repository Names</strong> (e.g.,
|
||||
"owner/repo1, owner/repo2").
|
||||
</li>
|
||||
<li>
|
||||
Click <strong>Connect</strong> to establish the connection.
|
||||
</li>
|
||||
<li>Once connected, your GitHub repositories will be indexed automatically.</li>
|
||||
</ol>
|
||||
|
||||
<Alert className="bg-slate-400/5 dark:bg-white/5 border-slate-400/20">
|
||||
<Info className="h-3 w-3 sm:h-4 sm:w-4" />
|
||||
<AlertTitle className="text-[10px] sm:text-xs">What Gets Indexed</AlertTitle>
|
||||
<AlertDescription className="text-[9px] sm:text-[10px]">
|
||||
<p className="mb-2">The GitHub connector indexes the following data:</p>
|
||||
<ul className="list-disc pl-5 space-y-1">
|
||||
<li>Code files from selected repositories</li>
|
||||
<li>README files and Markdown documentation</li>
|
||||
<li>Common text-based file formats</li>
|
||||
<li>Repository metadata and structure</li>
|
||||
</ul>
|
||||
</AlertDescription>
|
||||
</Alert>
|
||||
</div>
|
||||
</div>
|
||||
</AccordionContent>
|
||||
</AccordionItem>
|
||||
</Accordion>
|
||||
{/* Documentation Link */}
|
||||
<div>
|
||||
<Link
|
||||
href="/docs/connectors/github"
|
||||
target="_blank"
|
||||
rel="noopener noreferrer"
|
||||
className="text-xs sm:text-sm font-medium underline underline-offset-4 hover:text-primary transition-colors inline-flex items-center gap-1.5"
|
||||
>
|
||||
View GitHub Connector Documentation
|
||||
<ExternalLink className="h-3 w-3 sm:h-4 sm:w-4" />
|
||||
</Link>
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
};
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@
|
|||
|
||||
import { KeyRound } from "lucide-react";
|
||||
import type { FC } from "react";
|
||||
import { useEffect, useState } from "react";
|
||||
import { useEffect, useRef, useState } from "react";
|
||||
import { Badge } from "@/components/ui/badge";
|
||||
import { Input } from "@/components/ui/input";
|
||||
import { Label } from "@/components/ui/label";
|
||||
|
|
@ -12,25 +12,29 @@ export interface GithubConfigProps extends ConnectorConfigProps {
|
|||
onNameChange?: (name: string) => void;
|
||||
}
|
||||
|
||||
// Helper functions moved outside component to avoid useEffect dependency issues
|
||||
const stringToArray = (arr: string[] | string | undefined): string[] => {
|
||||
if (Array.isArray(arr)) return arr;
|
||||
if (typeof arr === "string") {
|
||||
return arr
|
||||
.split(",")
|
||||
.map((item) => item.trim())
|
||||
.filter((item) => item.length > 0);
|
||||
}
|
||||
return [];
|
||||
};
|
||||
|
||||
const arrayToString = (arr: string[]): string => {
|
||||
return arr.join(", ");
|
||||
};
|
||||
|
||||
export const GithubConfig: FC<GithubConfigProps> = ({
|
||||
connector,
|
||||
onConfigChange,
|
||||
onNameChange,
|
||||
}) => {
|
||||
const stringToArray = (arr: string[] | string | undefined): string[] => {
|
||||
if (Array.isArray(arr)) return arr;
|
||||
if (typeof arr === "string") {
|
||||
return arr
|
||||
.split(",")
|
||||
.map((item) => item.trim())
|
||||
.filter((item) => item.length > 0);
|
||||
}
|
||||
return [];
|
||||
};
|
||||
|
||||
const arrayToString = (arr: string[]): string => {
|
||||
return arr.join(", ");
|
||||
};
|
||||
// Track internal changes to prevent useEffect from overwriting user input
|
||||
const isInternalChange = useRef(false);
|
||||
|
||||
const [githubPat, setGithubPat] = useState<string>(
|
||||
(connector.config?.GITHUB_PAT as string) || ""
|
||||
|
|
@ -40,8 +44,13 @@ export const GithubConfig: FC<GithubConfigProps> = ({
|
|||
);
|
||||
const [name, setName] = useState<string>(connector.name || "");
|
||||
|
||||
// Update values when connector changes
|
||||
// Update values when connector changes externally (not from our own input)
|
||||
useEffect(() => {
|
||||
// Skip if this is our own internal change
|
||||
if (isInternalChange.current) {
|
||||
isInternalChange.current = false;
|
||||
return;
|
||||
}
|
||||
const pat = (connector.config?.GITHUB_PAT as string) || "";
|
||||
const repos = arrayToString(stringToArray(connector.config?.repo_full_names));
|
||||
setGithubPat(pat);
|
||||
|
|
@ -50,6 +59,7 @@ export const GithubConfig: FC<GithubConfigProps> = ({
|
|||
}, [connector.config, connector.name]);
|
||||
|
||||
const handleGithubPatChange = (value: string) => {
|
||||
isInternalChange.current = true;
|
||||
setGithubPat(value);
|
||||
if (onConfigChange) {
|
||||
onConfigChange({
|
||||
|
|
@ -60,6 +70,7 @@ export const GithubConfig: FC<GithubConfigProps> = ({
|
|||
};
|
||||
|
||||
const handleRepoFullNamesChange = (value: string) => {
|
||||
isInternalChange.current = true;
|
||||
setRepoFullNames(value);
|
||||
const repoList = stringToArray(value);
|
||||
if (onConfigChange) {
|
||||
|
|
@ -71,6 +82,7 @@ export const GithubConfig: FC<GithubConfigProps> = ({
|
|||
};
|
||||
|
||||
const handleNameChange = (value: string) => {
|
||||
isInternalChange.current = true;
|
||||
setName(value);
|
||||
if (onNameChange) {
|
||||
onNameChange(value);
|
||||
|
|
@ -105,7 +117,7 @@ export const GithubConfig: FC<GithubConfigProps> = ({
|
|||
<div className="space-y-2">
|
||||
<Label className="flex items-center gap-2 text-xs sm:text-sm">
|
||||
<KeyRound className="h-4 w-4" />
|
||||
GitHub Personal Access Token
|
||||
GitHub Personal Access Token (optional)
|
||||
</Label>
|
||||
<Input
|
||||
type="password"
|
||||
|
|
|
|||
|
|
@ -206,9 +206,10 @@ export const ConnectorEditView: FC<ConnectorEditViewProps> = ({
|
|||
{/* Date range selector and periodic sync - only shown for indexable connectors */}
|
||||
{connector.is_indexable && (
|
||||
<>
|
||||
{/* Date range selector - not shown for Google Drive (uses folder selection) or Webcrawler (uses config) */}
|
||||
{/* Date range selector - not shown for Google Drive, Webcrawler, or GitHub (indexes full repo snapshots) */}
|
||||
{connector.connector_type !== "GOOGLE_DRIVE_CONNECTOR" &&
|
||||
connector.connector_type !== "WEBCRAWLER_CONNECTOR" && (
|
||||
connector.connector_type !== "WEBCRAWLER_CONNECTOR" &&
|
||||
connector.connector_type !== "GITHUB_CONNECTOR" && (
|
||||
<DateRangeSelector
|
||||
startDate={startDate}
|
||||
endDate={endDate}
|
||||
|
|
|
|||
|
|
@ -151,9 +151,10 @@ export const IndexingConfigurationView: FC<IndexingConfigurationViewProps> = ({
|
|||
{/* Date range selector and periodic sync - only shown for indexable connectors */}
|
||||
{connector?.is_indexable && (
|
||||
<>
|
||||
{/* Date range selector - not shown for Google Drive (uses folder selection) or Webcrawler (uses config) */}
|
||||
{/* Date range selector - not shown for Google Drive, Webcrawler, or GitHub (indexes full repo snapshots) */}
|
||||
{config.connectorType !== "GOOGLE_DRIVE_CONNECTOR" &&
|
||||
config.connectorType !== "WEBCRAWLER_CONNECTOR" && (
|
||||
config.connectorType !== "WEBCRAWLER_CONNECTOR" &&
|
||||
config.connectorType !== "GITHUB_CONNECTOR" && (
|
||||
<DateRangeSelector
|
||||
startDate={startDate}
|
||||
endDate={endDate}
|
||||
|
|
|
|||
|
|
@ -489,8 +489,8 @@ export function SourceDetailPanel({
|
|||
>
|
||||
{idx + 1}
|
||||
{isCited && (
|
||||
<span className="absolute -top-1 -right-1 w-3 h-3 bg-primary rounded-full border-2 border-background">
|
||||
<Sparkles className="h-2 w-2 text-primary-foreground absolute top-0.5 left-0.5" />
|
||||
<span className="absolute -top-1.5 -right-1.5 flex items-center justify-center w-4 h-4 bg-primary rounded-full border-2 border-background shadow-sm">
|
||||
<Sparkles className="h-2.5 w-2.5 text-primary-foreground" />
|
||||
</span>
|
||||
)}
|
||||
</motion.button>
|
||||
|
|
|
|||
|
|
@ -3,4 +3,81 @@ title: GitHub
|
|||
description: Connect your GitHub repositories to SurfSense
|
||||
---
|
||||
|
||||
# Documentation in progress
|
||||
# GitHub Connector
|
||||
|
||||
Connect your GitHub repositories to SurfSense for code search and AI-powered insights. The connector uses [gitingest](https://gitingest.com) to efficiently index entire codebases.
|
||||
|
||||
## What Gets Indexed
|
||||
|
||||
| Content Type | Examples |
|
||||
|--------------|----------|
|
||||
| Code Files | Python, JavaScript, TypeScript, Go, Rust, Java, etc. |
|
||||
| Documentation | README files, Markdown documents, text files |
|
||||
| Configuration | JSON, YAML, TOML, .env examples, Dockerfiles |
|
||||
|
||||
> ⚠️ Binary files and files larger than 5MB are automatically excluded.
|
||||
|
||||
---
|
||||
|
||||
## Quick Start (Public Repos)
|
||||
|
||||
1. Navigate to **Connectors** → **Add Connector** → **GitHub**
|
||||
2. Enter repository names: `owner/repo` (e.g., `facebook/react, vercel/next.js`)
|
||||
3. Click **Connect GitHub**
|
||||
|
||||
No authentication required for public repositories.
|
||||
|
||||
---
|
||||
|
||||
## Private Repositories
|
||||
|
||||
For private repos, you need a GitHub Personal Access Token (PAT).
|
||||
|
||||
### Generate a PAT
|
||||
|
||||
1. Go to [GitHub's token creation page](https://github.com/settings/tokens/new?description=surfsense&scopes=repo) (pre-filled with `repo` scope)
|
||||
2. Set an expiration
|
||||
3. Click **Generate token** and copy it
|
||||
|
||||
> ⚠️ The token starts with `ghp_`. Store it securely.
|
||||
|
||||
---
|
||||
|
||||
## Connector Configuration
|
||||
|
||||
| Field | Description | Required |
|
||||
|-------|-------------|----------|
|
||||
| **Connector Name** | A friendly name to identify this connector | Yes |
|
||||
| **GitHub Personal Access Token** | Your PAT (only for private repos) | No |
|
||||
| **Repository Names** | Comma-separated list: `owner/repo1, owner/repo2` | Yes |
|
||||
|
||||
---
|
||||
|
||||
## Periodic Sync
|
||||
|
||||
Enable periodic sync to automatically re-index repositories when content changes:
|
||||
|
||||
| Frequency | Use Case |
|
||||
|-----------|----------|
|
||||
| Every 5 minutes | Active development |
|
||||
| Every 15 minutes | Frequent commits |
|
||||
| Every hour | Regular workflow |
|
||||
| Every 6 hours | Less active repos |
|
||||
| Daily | Reference repositories |
|
||||
| Weekly | Stable codebases |
|
||||
|
||||
---
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
**Repository not found**
|
||||
- Verify format is `owner/repo`
|
||||
- For private repos, ensure PAT has access
|
||||
|
||||
**Authentication failed**
|
||||
- Check PAT is valid and not expired
|
||||
- Token should start with `ghp_` or `github_pat_`
|
||||
|
||||
**Rate limit exceeded**
|
||||
- Use a PAT for higher limits (5,000/hour vs 60 unauthenticated)
|
||||
- Reduce sync frequency
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue