diff --git a/surfsense_backend/alembic/versions/73_add_user_memories_table.py b/surfsense_backend/alembic/versions/73_add_user_memories_table.py new file mode 100644 index 000000000..c0a1fb97e --- /dev/null +++ b/surfsense_backend/alembic/versions/73_add_user_memories_table.py @@ -0,0 +1,135 @@ +"""Add user_memories table for AI memory feature + +Revision ID: 73 +Revises: 72 +Create Date: 2026-01-20 + +This migration adds the user_memories table which enables Claude-like memory +functionality - allowing the AI to remember facts, preferences, and context +about users across conversations. +""" + +from collections.abc import Sequence + +from alembic import op +from app.config import config + +# revision identifiers, used by Alembic. +revision: str = "73" +down_revision: str | None = "72" +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None + +# Get embedding dimension from config +EMBEDDING_DIM = config.embedding_model_instance.dimension + + +def upgrade() -> None: + """Create user_memories table and MemoryCategory enum.""" + + # Create the MemoryCategory enum type + op.execute( + """ + DO $$ + BEGIN + IF NOT EXISTS (SELECT 1 FROM pg_type WHERE typname = 'memorycategory') THEN + CREATE TYPE memorycategory AS ENUM ( + 'preference', + 'fact', + 'instruction', + 'context' + ); + END IF; + END$$; + """ + ) + + # Create user_memories table + op.execute( + f""" + DO $$ + BEGIN + IF NOT EXISTS ( + SELECT FROM information_schema.tables + WHERE table_name = 'user_memories' + ) THEN + CREATE TABLE user_memories ( + id SERIAL PRIMARY KEY, + created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(), + user_id UUID NOT NULL REFERENCES "user"(id) ON DELETE CASCADE, + search_space_id INTEGER REFERENCES searchspaces(id) ON DELETE CASCADE, + memory_text TEXT NOT NULL, + category memorycategory NOT NULL DEFAULT 'fact', + embedding vector({EMBEDDING_DIM}), + updated_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW() + ); + END IF; + END$$; + """ + ) + + # Create indexes for efficient querying + op.execute( + """ + DO $$ + BEGIN + -- Index on user_id for filtering memories by user + IF NOT EXISTS ( + SELECT 1 FROM pg_indexes + WHERE tablename = 'user_memories' AND indexname = 'ix_user_memories_user_id' + ) THEN + CREATE INDEX ix_user_memories_user_id ON user_memories(user_id); + END IF; + + -- Index on search_space_id for filtering memories by search space + IF NOT EXISTS ( + SELECT 1 FROM pg_indexes + WHERE tablename = 'user_memories' AND indexname = 'ix_user_memories_search_space_id' + ) THEN + CREATE INDEX ix_user_memories_search_space_id ON user_memories(search_space_id); + END IF; + + -- Index on updated_at for ordering by recency + IF NOT EXISTS ( + SELECT 1 FROM pg_indexes + WHERE tablename = 'user_memories' AND indexname = 'ix_user_memories_updated_at' + ) THEN + CREATE INDEX ix_user_memories_updated_at ON user_memories(updated_at); + END IF; + + -- Index on category for filtering by memory type + IF NOT EXISTS ( + SELECT 1 FROM pg_indexes + WHERE tablename = 'user_memories' AND indexname = 'ix_user_memories_category' + ) THEN + CREATE INDEX ix_user_memories_category ON user_memories(category); + END IF; + + -- Composite index for common query pattern (user + search space) + IF NOT EXISTS ( + SELECT 1 FROM pg_indexes + WHERE tablename = 'user_memories' AND indexname = 'ix_user_memories_user_search_space' + ) THEN + CREATE INDEX ix_user_memories_user_search_space ON user_memories(user_id, search_space_id); + END IF; + END$$; + """ + ) + + # Create vector index for semantic search + op.execute( + """ + CREATE INDEX IF NOT EXISTS user_memories_vector_index + ON user_memories USING hnsw (embedding public.vector_cosine_ops); + """ + ) + + +def downgrade() -> None: + """Drop user_memories table and MemoryCategory enum.""" + + # Drop the table + op.execute("DROP TABLE IF EXISTS user_memories CASCADE;") + + # Drop the enum type + op.execute("DROP TYPE IF EXISTS memorycategory;") diff --git a/surfsense_backend/app/agents/new_chat/chat_deepagent.py b/surfsense_backend/app/agents/new_chat/chat_deepagent.py index 9675521f5..5bc6ac2e2 100644 --- a/surfsense_backend/app/agents/new_chat/chat_deepagent.py +++ b/surfsense_backend/app/agents/new_chat/chat_deepagent.py @@ -34,6 +34,7 @@ async def create_surfsense_deep_agent( db_session: AsyncSession, connector_service: ConnectorService, checkpointer: Checkpointer, + user_id: str | None = None, agent_config: AgentConfig | None = None, enabled_tools: list[str] | None = None, disabled_tools: list[str] | None = None, @@ -49,6 +50,8 @@ async def create_surfsense_deep_agent( - link_preview: Fetch rich previews for URLs - display_image: Display images in chat - scrape_webpage: Extract content from webpages + - save_memory: Store facts/preferences about the user + - recall_memory: Retrieve relevant user memories The agent also includes TodoListMiddleware by default (via create_deep_agent) which provides: - write_todos: Create and update planning/todo lists for complex tasks @@ -64,6 +67,7 @@ async def create_surfsense_deep_agent( connector_service: Initialized connector service for knowledge base search checkpointer: LangGraph checkpointer for conversation state persistence. Use AsyncPostgresSaver for production or MemorySaver for testing. + user_id: The current user's UUID string (required for memory tools) agent_config: Optional AgentConfig from NewLLMConfig for prompt configuration. If None, uses default system prompt with citations enabled. enabled_tools: Explicit list of tool names to enable. If None, all default tools @@ -118,6 +122,7 @@ async def create_surfsense_deep_agent( "db_session": db_session, "connector_service": connector_service, "firecrawl_api_key": firecrawl_api_key, + "user_id": user_id, # Required for memory tools } # Build tools using the async registry (includes MCP tools) diff --git a/surfsense_backend/app/agents/new_chat/system_prompt.py b/surfsense_backend/app/agents/new_chat/system_prompt.py index 76429a830..d8202a8b0 100644 --- a/surfsense_backend/app/agents/new_chat/system_prompt.py +++ b/surfsense_backend/app/agents/new_chat/system_prompt.py @@ -116,6 +116,45 @@ You have access to the following tools: * This makes your response more visual and engaging. * Prioritize showing: diagrams, charts, infographics, key illustrations, or images that help explain the content. * Don't show every image - just the most relevant 1-3 images that enhance understanding. + +6. save_memory: Save facts, preferences, or context about the user for personalized responses. + - Use this when the user explicitly or implicitly shares information worth remembering. + - Trigger scenarios: + * User says "remember this", "keep this in mind", "note that", or similar + * User shares personal preferences (e.g., "I prefer Python over JavaScript") + * User shares facts about themselves (e.g., "I'm a senior developer at Company X") + * User gives standing instructions (e.g., "always respond in bullet points") + * User shares project context (e.g., "I'm working on migrating our codebase to TypeScript") + - Args: + - content: The fact/preference to remember. Phrase it clearly: + * "User prefers dark mode for all interfaces" + * "User is a senior Python developer" + * "User wants responses in bullet point format" + * "User is working on project called ProjectX" + - category: Type of memory: + * "preference": User preferences (coding style, tools, formats) + * "fact": Facts about the user (role, expertise, background) + * "instruction": Standing instructions (response format, communication style) + * "context": Current context (ongoing projects, goals, challenges) + - Returns: Confirmation of saved memory + - IMPORTANT: Only save information that would be genuinely useful for future conversations. + Don't save trivial or temporary information. + +7. recall_memory: Retrieve relevant memories about the user for personalized responses. + - Use this to access stored information about the user. + - Trigger scenarios: + * You need user context to give a better, more personalized answer + * User references something they mentioned before + * User asks "what do you know about me?" or similar + * Personalization would significantly improve response quality + * Before making recommendations that should consider user preferences + - Args: + - query: Optional search query to find specific memories (e.g., "programming preferences") + - category: Optional filter by category ("preference", "fact", "instruction", "context") + - top_k: Number of memories to retrieve (default: 5) + - Returns: Relevant memories formatted as context + - IMPORTANT: Use the recalled memories naturally in your response without explicitly + stating "Based on your memory..." - integrate the context seamlessly. - User: "How do I install SurfSense?" @@ -136,6 +175,23 @@ You have access to the following tools: - User: "What did I discuss on Slack last week about the React migration?" - Call: `search_knowledge_base(query="React migration", connectors_to_search=["SLACK_CONNECTOR"], start_date="YYYY-MM-DD", end_date="YYYY-MM-DD")` +- User: "Remember that I prefer TypeScript over JavaScript" + - Call: `save_memory(content="User prefers TypeScript over JavaScript for development", category="preference")` + +- User: "I'm a data scientist working on ML pipelines" + - Call: `save_memory(content="User is a data scientist working on ML pipelines", category="fact")` + +- User: "Always give me code examples in Python" + - Call: `save_memory(content="User wants code examples to be written in Python", category="instruction")` + +- User: "What programming language should I use for this project?" + - First recall: `recall_memory(query="programming language preferences")` + - Then provide a personalized recommendation based on their preferences + +- User: "What do you know about me?" + - Call: `recall_memory(top_k=10)` + - Then summarize the stored memories + - User: "Give me a podcast about AI trends based on what we discussed" - First search for relevant content, then call: `generate_podcast(source_content="Based on our conversation and search results: [detailed summary of chat + search findings]", podcast_title="AI Trends Podcast")` diff --git a/surfsense_backend/app/agents/new_chat/tools/__init__.py b/surfsense_backend/app/agents/new_chat/tools/__init__.py index b531d9b4d..acbdbcb3a 100644 --- a/surfsense_backend/app/agents/new_chat/tools/__init__.py +++ b/surfsense_backend/app/agents/new_chat/tools/__init__.py @@ -11,6 +11,8 @@ Available tools: - link_preview: Fetch rich previews for URLs - display_image: Display images in chat - scrape_webpage: Extract content from webpages +- save_memory: Store facts/preferences about the user +- recall_memory: Retrieve relevant user memories """ # Registry exports @@ -33,6 +35,7 @@ from .registry import ( ) from .scrape_webpage import create_scrape_webpage_tool from .search_surfsense_docs import create_search_surfsense_docs_tool +from .user_memory import create_recall_memory_tool, create_save_memory_tool __all__ = [ # Registry @@ -43,6 +46,8 @@ __all__ = [ "create_display_image_tool", "create_generate_podcast_tool", "create_link_preview_tool", + "create_recall_memory_tool", + "create_save_memory_tool", "create_scrape_webpage_tool", "create_search_knowledge_base_tool", "create_search_surfsense_docs_tool", diff --git a/surfsense_backend/app/agents/new_chat/tools/registry.py b/surfsense_backend/app/agents/new_chat/tools/registry.py index 6873f864c..e4ce7a6b7 100644 --- a/surfsense_backend/app/agents/new_chat/tools/registry.py +++ b/surfsense_backend/app/agents/new_chat/tools/registry.py @@ -50,6 +50,7 @@ from .mcp_tool import load_mcp_tools from .podcast import create_generate_podcast_tool from .scrape_webpage import create_scrape_webpage_tool from .search_surfsense_docs import create_search_surfsense_docs_tool +from .user_memory import create_recall_memory_tool, create_save_memory_tool # ============================================================================= # Tool Definition @@ -138,6 +139,31 @@ BUILTIN_TOOLS: list[ToolDefinition] = [ requires=["db_session"], ), # ========================================================================= + # USER MEMORY TOOLS - Claude-like memory feature + # ========================================================================= + # Save memory tool - stores facts/preferences about the user + ToolDefinition( + name="save_memory", + description="Save facts, preferences, or context about the user for personalized responses", + factory=lambda deps: create_save_memory_tool( + user_id=deps["user_id"], + search_space_id=deps["search_space_id"], + db_session=deps["db_session"], + ), + requires=["user_id", "search_space_id", "db_session"], + ), + # Recall memory tool - retrieves relevant user memories + ToolDefinition( + name="recall_memory", + description="Recall user memories for personalized and contextual responses", + factory=lambda deps: create_recall_memory_tool( + user_id=deps["user_id"], + search_space_id=deps["search_space_id"], + db_session=deps["db_session"], + ), + requires=["user_id", "search_space_id", "db_session"], + ), + # ========================================================================= # ADD YOUR CUSTOM TOOLS BELOW # ========================================================================= # Example: diff --git a/surfsense_backend/app/agents/new_chat/tools/user_memory.py b/surfsense_backend/app/agents/new_chat/tools/user_memory.py new file mode 100644 index 000000000..23a0b8666 --- /dev/null +++ b/surfsense_backend/app/agents/new_chat/tools/user_memory.py @@ -0,0 +1,352 @@ +""" +User memory tools for the SurfSense agent. + +This module provides tools for storing and retrieving user memories, +enabling personalized AI responses similar to Claude's memory feature. + +Features: +- save_memory: Store facts, preferences, and context about the user +- recall_memory: Retrieve relevant memories using semantic search +""" + +import logging +from typing import Any +from uuid import UUID + +from langchain_core.tools import tool +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from app.config import config +from app.db import MemoryCategory, UserMemory + +logger = logging.getLogger(__name__) + + +# ============================================================================= +# Constants +# ============================================================================= + +# Default number of memories to retrieve +DEFAULT_RECALL_TOP_K = 5 + +# Maximum number of memories per user (to prevent unbounded growth) +MAX_MEMORIES_PER_USER = 100 + + +# ============================================================================= +# Helper Functions +# ============================================================================= + + +def _to_uuid(user_id: str) -> UUID: + """Convert a string user_id to a UUID object.""" + if isinstance(user_id, UUID): + return user_id + return UUID(user_id) + + +async def get_user_memory_count( + db_session: AsyncSession, + user_id: str, + search_space_id: int | None = None, +) -> int: + """Get the count of memories for a user.""" + uuid_user_id = _to_uuid(user_id) + query = select(UserMemory).where(UserMemory.user_id == uuid_user_id) + if search_space_id is not None: + query = query.where( + (UserMemory.search_space_id == search_space_id) + | (UserMemory.search_space_id.is_(None)) + ) + result = await db_session.execute(query) + return len(result.scalars().all()) + + +async def delete_oldest_memory( + db_session: AsyncSession, + user_id: str, + search_space_id: int | None = None, +) -> None: + """Delete the oldest memory for a user to make room for new ones.""" + uuid_user_id = _to_uuid(user_id) + query = ( + select(UserMemory) + .where(UserMemory.user_id == uuid_user_id) + .order_by(UserMemory.updated_at.asc()) + .limit(1) + ) + if search_space_id is not None: + query = query.where( + (UserMemory.search_space_id == search_space_id) + | (UserMemory.search_space_id.is_(None)) + ) + result = await db_session.execute(query) + oldest_memory = result.scalars().first() + if oldest_memory: + await db_session.delete(oldest_memory) + await db_session.commit() + + +def format_memories_for_context(memories: list[dict[str, Any]]) -> str: + """Format retrieved memories into a readable context string for the LLM.""" + if not memories: + return "No relevant memories found for this user." + + parts = [""] + for memory in memories: + category = memory.get("category", "unknown") + text = memory.get("memory_text", "") + updated = memory.get("updated_at", "") + parts.append( + f" {text}" + ) + parts.append("") + + return "\n".join(parts) + + +# ============================================================================= +# Tool Factory Functions +# ============================================================================= + + +def create_save_memory_tool( + user_id: str, + search_space_id: int, + db_session: AsyncSession, +): + """ + Factory function to create the save_memory tool. + + Args: + user_id: The user's UUID + search_space_id: The search space ID (for space-specific memories) + db_session: Database session for executing queries + + Returns: + A configured tool function for saving user memories + """ + + @tool + async def save_memory( + content: str, + category: str = "fact", + ) -> dict[str, Any]: + """ + Save a fact, preference, or context about the user for future reference. + + Use this tool when: + - User explicitly says "remember this", "keep this in mind", or similar + - User shares personal preferences (e.g., "I prefer Python over JavaScript") + - User shares important facts about themselves (name, role, interests, projects) + - User gives standing instructions (e.g., "always respond in bullet points") + - User shares relevant context (e.g., "I'm working on project X") + + The saved information will be available in future conversations to provide + more personalized and contextual responses. + + Args: + content: The fact/preference/context to remember. + Phrase it clearly, e.g., "User prefers dark mode", + "User is a senior Python developer", "User is working on an AI project" + category: Type of memory. One of: + - "preference": User preferences (e.g., coding style, tools, formats) + - "fact": Facts about the user (e.g., name, role, expertise) + - "instruction": Standing instructions (e.g., response format preferences) + - "context": Current context (e.g., ongoing projects, goals) + + Returns: + A dictionary with the save status and memory details + """ + # Normalize and validate category (LLMs may send uppercase) + category = category.lower() if category else "fact" + valid_categories = ["preference", "fact", "instruction", "context"] + if category not in valid_categories: + category = "fact" + + try: + # Convert user_id to UUID + uuid_user_id = _to_uuid(user_id) + + # Check if we've hit the memory limit + memory_count = await get_user_memory_count( + db_session, user_id, search_space_id + ) + if memory_count >= MAX_MEMORIES_PER_USER: + # Delete oldest memory to make room + await delete_oldest_memory(db_session, user_id, search_space_id) + + # Generate embedding for the memory + embedding = config.embedding_model_instance.embed(content) + + # Create new memory using ORM + # The pgvector Vector column type handles embedding conversion automatically + new_memory = UserMemory( + user_id=uuid_user_id, + search_space_id=search_space_id, + memory_text=content, + category=MemoryCategory(category), # Convert string to enum + embedding=embedding, # Pass embedding directly (list or numpy array) + ) + + db_session.add(new_memory) + await db_session.commit() + await db_session.refresh(new_memory) + + return { + "status": "saved", + "memory_id": new_memory.id, + "memory_text": content, + "category": category, + "message": f"I'll remember: {content}", + } + + except Exception as e: + logger.exception(f"Failed to save memory for user {user_id}: {e}") + # Rollback the session to clear any failed transaction state + await db_session.rollback() + return { + "status": "error", + "error": str(e), + "message": "Failed to save memory. Please try again.", + } + + return save_memory + + +def create_recall_memory_tool( + user_id: str, + search_space_id: int, + db_session: AsyncSession, +): + """ + Factory function to create the recall_memory tool. + + Args: + user_id: The user's UUID + search_space_id: The search space ID + db_session: Database session for executing queries + + Returns: + A configured tool function for recalling user memories + """ + + @tool + async def recall_memory( + query: str | None = None, + category: str | None = None, + top_k: int = DEFAULT_RECALL_TOP_K, + ) -> dict[str, Any]: + """ + Recall relevant memories about the user to provide personalized responses. + + Use this tool when: + - You need user context to give a better, more personalized answer + - User asks about their preferences or past information they shared + - User references something they told you before + - Personalization would significantly improve the response quality + - User asks "what do you know about me?" or similar + + Args: + query: Optional search query to find specific memories. + If not provided, returns the most recent memories. + Example: "programming preferences", "current projects" + category: Optional category filter. One of: + "preference", "fact", "instruction", "context" + If not provided, searches all categories. + top_k: Number of memories to retrieve (default: 5, max: 20) + + Returns: + A dictionary containing relevant memories and formatted context + """ + top_k = min(max(top_k, 1), 20) # Clamp between 1 and 20 + + try: + # Convert user_id to UUID + uuid_user_id = _to_uuid(user_id) + + if query: + # Semantic search using embeddings + query_embedding = config.embedding_model_instance.embed(query) + + # Build query with vector similarity + stmt = ( + select(UserMemory) + .where(UserMemory.user_id == uuid_user_id) + .where( + (UserMemory.search_space_id == search_space_id) + | (UserMemory.search_space_id.is_(None)) + ) + ) + + # Add category filter if specified + if category and category in [ + "preference", + "fact", + "instruction", + "context", + ]: + stmt = stmt.where(UserMemory.category == MemoryCategory(category)) + + # Order by vector similarity + stmt = stmt.order_by( + UserMemory.embedding.op("<=>")(query_embedding) + ).limit(top_k) + + else: + # No query - return most recent memories + stmt = ( + select(UserMemory) + .where(UserMemory.user_id == uuid_user_id) + .where( + (UserMemory.search_space_id == search_space_id) + | (UserMemory.search_space_id.is_(None)) + ) + ) + + # Add category filter if specified + if category and category in [ + "preference", + "fact", + "instruction", + "context", + ]: + stmt = stmt.where(UserMemory.category == MemoryCategory(category)) + + stmt = stmt.order_by(UserMemory.updated_at.desc()).limit(top_k) + + result = await db_session.execute(stmt) + memories = result.scalars().all() + + # Format memories for response + memory_list = [ + { + "id": m.id, + "memory_text": m.memory_text, + "category": m.category.value if m.category else "unknown", + "updated_at": m.updated_at.isoformat() if m.updated_at else None, + } + for m in memories + ] + + formatted_context = format_memories_for_context(memory_list) + + return { + "status": "success", + "count": len(memory_list), + "memories": memory_list, + "formatted_context": formatted_context, + } + + except Exception as e: + logger.exception(f"Failed to recall memories for user {user_id}: {e}") + await db_session.rollback() + return { + "status": "error", + "error": str(e), + "memories": [], + "formatted_context": "Failed to recall memories.", + } + + return recall_memory diff --git a/surfsense_backend/app/connectors/github_connector.py b/surfsense_backend/app/connectors/github_connector.py index 647856c6f..6f04ccdba 100644 --- a/surfsense_backend/app/connectors/github_connector.py +++ b/surfsense_backend/app/connectors/github_connector.py @@ -1,296 +1,236 @@ -import base64 -import logging -from typing import Any +""" +GitHub connector using gitingest CLI for efficient repository digestion. -from github3 import exceptions as github_exceptions, login as github_login -from github3.exceptions import ForbiddenError, NotFoundError -from github3.repos.contents import Contents +This connector uses subprocess to call gitingest CLI, completely isolating +it from any Python event loop/async complexity that can cause hangs in Celery. +""" + +import logging +import os +import subprocess +import tempfile +from dataclasses import dataclass logger = logging.getLogger(__name__) -# List of common code file extensions to target -CODE_EXTENSIONS = { - ".py", - ".js", - ".jsx", - ".ts", - ".tsx", - ".java", - ".c", - ".cpp", - ".h", - ".hpp", - ".cs", - ".go", - ".rb", - ".php", - ".swift", - ".kt", - ".scala", - ".rs", - ".m", - ".sh", - ".bash", - ".ps1", - ".lua", - ".pl", - ".pm", - ".r", - ".dart", - ".sql", -} +# Maximum file size in bytes (5MB) +MAX_FILE_SIZE = 5 * 1024 * 1024 -# List of common documentation/text file extensions -DOC_EXTENSIONS = { - ".md", - ".txt", - ".rst", - ".adoc", - ".html", - ".htm", - ".xml", - ".json", - ".yaml", - ".yml", - ".toml", -} -# Maximum file size in bytes (e.g., 1MB) -MAX_FILE_SIZE = 1 * 1024 * 1024 +@dataclass +class RepositoryDigest: + """Represents a digested repository from gitingest.""" + + repo_full_name: str + summary: str + tree: str + content: str + branch: str | None = None + + @property + def full_digest(self) -> str: + """Returns the complete digest with tree and content.""" + return f"# Repository: {self.repo_full_name}\n\n## File Structure\n\n{self.tree}\n\n## File Contents\n\n{self.content}" + + @property + def estimated_tokens(self) -> int: + """Rough estimate of tokens (1 token ≈ 4 characters).""" + return len(self.full_digest) // 4 class GitHubConnector: - """Connector for interacting with the GitHub API.""" + """ + Connector for ingesting GitHub repositories using gitingest CLI. - # Directories to skip during file traversal - SKIPPED_DIRS = { - # Version control - ".git", - # Dependencies - "node_modules", - "vendor", - # Build artifacts / Caches - "build", - "dist", - "target", - "__pycache__", - # Virtual environments - "venv", - ".venv", - "env", - # IDE/Editor config - ".vscode", - ".idea", - ".project", - ".settings", - # Temporary / Logs - "tmp", - "logs", - # Add other project-specific irrelevant directories if needed - } + Uses subprocess to run gitingest, which avoids all async/event loop + issues that can occur when mixing gitingest with Celery workers. + """ - def __init__(self, token: str): + def __init__(self, token: str | None = None): """ - Initializes the GitHub connector. + Initialize the GitHub connector. Args: - token: GitHub Personal Access Token (PAT). + token: Optional GitHub Personal Access Token (PAT). + Only required for private repositories. """ - if not token: - raise ValueError("GitHub token cannot be empty.") - try: - self.gh = github_login(token=token) - # Try a simple authenticated call to check token validity - self.gh.me() - logger.info("Successfully authenticated with GitHub API.") - except (github_exceptions.AuthenticationFailed, ForbiddenError) as e: - logger.error(f"GitHub authentication failed: {e}") - raise ValueError("Invalid GitHub token or insufficient permissions.") from e - except Exception as e: - logger.error(f"Failed to initialize GitHub client: {e}") - raise e + self.token = token if token and token.strip() else None + if self.token: + logger.info("GitHub connector initialized with authentication token.") + else: + logger.info("GitHub connector initialized without token (public repos only).") - def get_user_repositories(self) -> list[dict[str, Any]]: - """Fetches repositories accessible by the authenticated user.""" - repos_data = [] - try: - # type='owner' fetches repos owned by the user - # type='member' fetches repos the user is a collaborator on (including orgs) - # type='all' fetches both - for repo in self.gh.repositories(type="all", sort="updated"): - repos_data.append( - { - "id": repo.id, - "name": repo.name, - "full_name": repo.full_name, - "private": repo.private, - "url": repo.html_url, - "description": repo.description or "", - "last_updated": repo.updated_at if repo.updated_at else None, - } - ) - logger.info(f"Fetched {len(repos_data)} repositories.") - return repos_data - except Exception as e: - logger.error(f"Failed to fetch GitHub repositories: {e}") - return [] # Return empty list on error - - def get_repository_files( - self, repo_full_name: str, path: str = "" - ) -> list[dict[str, Any]]: + def ingest_repository( + self, + repo_full_name: str, + branch: str | None = None, + max_file_size: int = MAX_FILE_SIZE, + ) -> RepositoryDigest | None: """ - Recursively fetches details of relevant files (code, docs) within a repository path. + Ingest a repository using gitingest CLI via subprocess. + + This approach completely isolates gitingest from Python's event loop, + avoiding any async/Celery conflicts. Args: repo_full_name: The full name of the repository (e.g., 'owner/repo'). - path: The starting path within the repository (default is root). + branch: Optional specific branch or tag to ingest. + max_file_size: Maximum file size in bytes to include. Returns: - A list of dictionaries, each containing file details (path, sha, url, size). - Returns an empty list if the repository or path is not found or on error. + RepositoryDigest or None if ingestion fails. """ - files_list = [] + repo_url = f"https://github.com/{repo_full_name}" + + logger.info(f"Starting gitingest CLI for repository: {repo_full_name}") + try: - owner, repo_name = repo_full_name.split("/") - repo = self.gh.repository(owner, repo_name) - if not repo: - logger.warning(f"Repository '{repo_full_name}' not found.") - return [] - contents = repo.directory_contents( - directory_path=path - ) # Use directory_contents for clarity + # Create a temporary file for output + with tempfile.NamedTemporaryFile( + mode="w", suffix=".txt", delete=False + ) as tmp_file: + output_path = tmp_file.name - # contents returns a list of tuples (name, content_obj) - for _item_name, content_item in contents: - if not isinstance(content_item, Contents): - continue + # Build the gitingest CLI command + cmd = [ + "gitingest", + repo_url, + "--output", output_path, + "--max-size", str(max_file_size), + # Common exclude patterns + "-e", "node_modules/*", + "-e", "vendor/*", + "-e", ".git/*", + "-e", "__pycache__/*", + "-e", "dist/*", + "-e", "build/*", + "-e", "*.lock", + "-e", "package-lock.json", + ] - if content_item.type == "dir": - # Check if the directory name is in the skipped list - if content_item.name in self.SKIPPED_DIRS: - logger.debug(f"Skipping directory: {content_item.path}") - continue # Skip recursion for this directory + # Add branch if specified + if branch: + cmd.extend(["--branch", branch]) - # Recursively fetch contents of subdirectory - files_list.extend( - self.get_repository_files( - repo_full_name, path=content_item.path - ) - ) - elif content_item.type == "file": - # Check if the file extension is relevant and size is within limits - file_extension = ( - "." + content_item.name.split(".")[-1].lower() - if "." in content_item.name - else "" - ) - is_code = file_extension in CODE_EXTENSIONS - is_doc = file_extension in DOC_EXTENSIONS + # Set up environment with token if provided + env = os.environ.copy() + if self.token: + env["GITHUB_TOKEN"] = self.token - if (is_code or is_doc) and content_item.size <= MAX_FILE_SIZE: - files_list.append( - { - "path": content_item.path, - "sha": content_item.sha, - "url": content_item.html_url, - "size": content_item.size, - "type": "code" if is_code else "doc", - } - ) - elif content_item.size > MAX_FILE_SIZE: - logger.debug( - f"Skipping large file: {content_item.path} ({content_item.size} bytes)" - ) - else: - logger.debug( - f"Skipping irrelevant file type: {content_item.path}" - ) + logger.info(f"Running gitingest CLI: {' '.join(cmd[:5])}...") - except (NotFoundError, ForbiddenError) as e: - logger.warning(f"Cannot access path '{path}' in '{repo_full_name}': {e}") - except Exception as e: - logger.error( - f"Failed to get files for {repo_full_name} at path '{path}': {e}" + # Run gitingest as subprocess with timeout + result = subprocess.run( + cmd, + env=env, + capture_output=True, + text=True, + timeout=900, # 5 minute timeout ) - # Return what we have collected so far in case of partial failure - return files_list + if result.returncode != 0: + logger.error(f"gitingest failed: {result.stderr}") + # Clean up temp file + if os.path.exists(output_path): + os.unlink(output_path) + return None - def get_file_content(self, repo_full_name: str, file_path: str) -> str | None: + # Read the output file + if not os.path.exists(output_path): + logger.error("gitingest did not create output file") + return None + + with open(output_path, encoding="utf-8") as f: + full_content = f.read() + + # Clean up temp file + os.unlink(output_path) + + if not full_content or not full_content.strip(): + logger.warning(f"No content retrieved from repository: {repo_full_name}") + return None + + # Parse the gitingest output + # The output format is: summary + tree + content + # We'll extract what we can + digest = RepositoryDigest( + repo_full_name=repo_full_name, + summary=f"Repository: {repo_full_name}", + tree="", # gitingest CLI combines everything into one file + content=full_content, + branch=branch, + ) + + logger.info( + f"Successfully ingested {repo_full_name}: " + f"~{digest.estimated_tokens} estimated tokens" + ) + return digest + + except subprocess.TimeoutExpired: + logger.error(f"gitingest timed out for repository: {repo_full_name}") + return None + except FileNotFoundError: + logger.error( + "gitingest CLI not found. Falling back to Python library." + ) + # Fall back to Python library + return self._ingest_with_python_library(repo_full_name, branch, max_file_size) + except Exception as e: + logger.error(f"Failed to ingest repository {repo_full_name}: {e}") + return None + + def _ingest_with_python_library( + self, + repo_full_name: str, + branch: str | None = None, + max_file_size: int = MAX_FILE_SIZE, + ) -> RepositoryDigest | None: """ - Fetches the decoded content of a specific file. - - Args: - repo_full_name: The full name of the repository (e.g., 'owner/repo'). - file_path: The path to the file within the repository. - - Returns: - The decoded file content as a string, or None if fetching fails or file is too large. + Fallback: Ingest using the Python library directly. """ + from gitingest import ingest + + repo_url = f"https://github.com/{repo_full_name}" + + logger.info(f"Using Python gitingest library for: {repo_full_name}") + try: - owner, repo_name = repo_full_name.split("/") - repo = self.gh.repository(owner, repo_name) - if not repo: - logger.warning( - f"Repository '{repo_full_name}' not found when fetching file '{file_path}'." - ) + kwargs = { + "max_file_size": max_file_size, + "exclude_patterns": [ + "node_modules/*", + "vendor/*", + ".git/*", + "__pycache__/*", + "dist/*", + "build/*", + "*.lock", + "package-lock.json", + ], + "include_gitignored": False, + "include_submodules": False, + } + + if self.token: + kwargs["token"] = self.token + if branch: + kwargs["branch"] = branch + + summary, tree, content = ingest(repo_url, **kwargs) + + if not content or not content.strip(): + logger.warning(f"No content from {repo_full_name}") return None - content_item = repo.file_contents( - path=file_path - ) # Use file_contents for clarity - - if ( - not content_item - or not isinstance(content_item, Contents) - or content_item.type != "file" - ): - logger.warning( - f"File '{file_path}' not found or is not a file in '{repo_full_name}'." - ) - return None - - if content_item.size > MAX_FILE_SIZE: - logger.warning( - f"File '{file_path}' in '{repo_full_name}' exceeds max size ({content_item.size} > {MAX_FILE_SIZE}). Skipping content fetch." - ) - return None - - # Content is base64 encoded - if content_item.content: - try: - decoded_content = base64.b64decode(content_item.content).decode( - "utf-8" - ) - return decoded_content - except UnicodeDecodeError: - logger.warning( - f"Could not decode file '{file_path}' in '{repo_full_name}' as UTF-8. Trying with 'latin-1'." - ) - try: - # Try a fallback encoding - decoded_content = base64.b64decode(content_item.content).decode( - "latin-1" - ) - return decoded_content - except Exception as decode_err: - logger.error( - f"Failed to decode file '{file_path}' with fallback encoding: {decode_err}" - ) - return None # Give up if fallback fails - else: - logger.warning( - f"No content returned for file '{file_path}' in '{repo_full_name}'. It might be empty." - ) - return "" # Return empty string for empty files - - except (NotFoundError, ForbiddenError) as e: - logger.warning( - f"Cannot access file '{file_path}' in '{repo_full_name}': {e}" + return RepositoryDigest( + repo_full_name=repo_full_name, + summary=summary, + tree=tree, + content=content, + branch=branch, ) - return None + except Exception as e: - logger.error( - f"Failed to get content for file '{file_path}' in '{repo_full_name}': {e}" - ) + logger.error(f"Python library failed for {repo_full_name}: {e}") return None diff --git a/surfsense_backend/app/db.py b/surfsense_backend/app/db.py index d5aa2b687..b56f37373 100644 --- a/surfsense_backend/app/db.py +++ b/surfsense_backend/app/db.py @@ -474,6 +474,66 @@ class ChatCommentMention(BaseModel, TimestampMixin): mentioned_user = relationship("User") +class MemoryCategory(str, Enum): + """Categories for user memories.""" + + # Using lowercase keys to match PostgreSQL enum values + preference = "preference" # User preferences (e.g., "prefers dark mode") + fact = "fact" # Facts about the user (e.g., "is a Python developer") + instruction = ( + "instruction" # Standing instructions (e.g., "always respond in bullet points") + ) + context = "context" # Contextual information (e.g., "working on project X") + + +class UserMemory(BaseModel, TimestampMixin): + """ + Stores facts, preferences, and context about users for personalized AI responses. + Similar to Claude's memory feature - enables the AI to remember user information + across conversations. + """ + + __tablename__ = "user_memories" + + user_id = Column( + UUID(as_uuid=True), + ForeignKey("user.id", ondelete="CASCADE"), + nullable=False, + index=True, + ) + # Optional association with a search space (if memory is space-specific) + search_space_id = Column( + Integer, + ForeignKey("searchspaces.id", ondelete="CASCADE"), + nullable=True, + index=True, + ) + + # The actual memory content + memory_text = Column(Text, nullable=False) + # Category for organization and filtering + category = Column( + SQLAlchemyEnum(MemoryCategory), + nullable=False, + default=MemoryCategory.fact, + ) + # Vector embedding for semantic search + embedding = Column(Vector(config.embedding_model_instance.dimension)) + + # Track when memory was last updated + updated_at = Column( + TIMESTAMP(timezone=True), + nullable=False, + default=lambda: datetime.now(UTC), + onupdate=lambda: datetime.now(UTC), + index=True, + ) + + # Relationships + user = relationship("User", back_populates="memories") + search_space = relationship("SearchSpace", back_populates="user_memories") + + class Document(BaseModel, TimestampMixin): __tablename__ = "documents" @@ -661,6 +721,14 @@ class SearchSpace(BaseModel, TimestampMixin): cascade="all, delete-orphan", ) + # User memories associated with this search space + user_memories = relationship( + "UserMemory", + back_populates="search_space", + order_by="UserMemory.updated_at.desc()", + cascade="all, delete-orphan", + ) + class SearchSourceConnector(BaseModel, TimestampMixin): __tablename__ = "search_source_connectors" @@ -969,6 +1037,14 @@ if config.AUTH_TYPE == "GOOGLE": passive_deletes=True, ) + # User memories for personalized AI responses + memories = relationship( + "UserMemory", + back_populates="user", + order_by="UserMemory.updated_at.desc()", + cascade="all, delete-orphan", + ) + # Page usage tracking for ETL services pages_limit = Column( Integer, @@ -1012,6 +1088,14 @@ else: passive_deletes=True, ) + # User memories for personalized AI responses + memories = relationship( + "UserMemory", + back_populates="user", + order_by="UserMemory.updated_at.desc()", + cascade="all, delete-orphan", + ) + # Page usage tracking for ETL services pages_limit = Column( Integer, diff --git a/surfsense_backend/app/routes/new_chat_routes.py b/surfsense_backend/app/routes/new_chat_routes.py index 8fddc55c4..4b8600fab 100644 --- a/surfsense_backend/app/routes/new_chat_routes.py +++ b/surfsense_backend/app/routes/new_chat_routes.py @@ -990,6 +990,7 @@ async def handle_new_chat( search_space_id=request.search_space_id, chat_id=request.chat_id, session=session, + user_id=str(user.id), # Pass user ID for memory tools llm_config_id=llm_config_id, attachments=request.attachments, mentioned_document_ids=request.mentioned_document_ids, diff --git a/surfsense_backend/app/services/chat_comments_service.py b/surfsense_backend/app/services/chat_comments_service.py index fa26bf6d5..6f81c0158 100644 --- a/surfsense_backend/app/services/chat_comments_service.py +++ b/surfsense_backend/app/services/chat_comments_service.py @@ -315,6 +315,8 @@ async def create_comment( thread_title=thread.title or "Untitled thread", author_id=str(user.id), author_name=author_name, + author_avatar_url=user.avatar_url, + author_email=user.email, content_preview=content_preview[:200], search_space_id=search_space_id, ) @@ -426,6 +428,8 @@ async def create_reply( thread_title=thread.title or "Untitled thread", author_id=str(user.id), author_name=author_name, + author_avatar_url=user.avatar_url, + author_email=user.email, content_preview=content_preview[:200], search_space_id=search_space_id, ) @@ -565,6 +569,8 @@ async def update_comment( thread_title=thread.title or "Untitled thread", author_id=str(user.id), author_name=author_name, + author_avatar_url=user.avatar_url, + author_email=user.email, content_preview=content_preview[:200], search_space_id=search_space_id, ) diff --git a/surfsense_backend/app/services/notification_service.py b/surfsense_backend/app/services/notification_service.py index 97e0f9457..5f7f568f6 100644 --- a/surfsense_backend/app/services/notification_service.py +++ b/surfsense_backend/app/services/notification_service.py @@ -634,6 +634,8 @@ class MentionNotificationHandler(BaseNotificationHandler): thread_title: str, author_id: str, author_name: str, + author_avatar_url: str | None, + author_email: str, content_preview: str, search_space_id: int, ) -> Notification: @@ -650,6 +652,8 @@ class MentionNotificationHandler(BaseNotificationHandler): thread_title: Title of the chat thread author_id: ID of the comment author author_name: Display name of the comment author + author_avatar_url: Avatar URL of the comment author + author_email: Email of the comment author (for fallback initials) content_preview: First ~100 chars of the comment search_space_id: Search space ID @@ -667,6 +671,8 @@ class MentionNotificationHandler(BaseNotificationHandler): "thread_title": thread_title, "author_id": author_id, "author_name": author_name, + "author_avatar_url": author_avatar_url, + "author_email": author_email, "content_preview": content_preview[:200], } diff --git a/surfsense_backend/app/tasks/chat/stream_new_chat.py b/surfsense_backend/app/tasks/chat/stream_new_chat.py index 85a524108..7d2cf4172 100644 --- a/surfsense_backend/app/tasks/chat/stream_new_chat.py +++ b/surfsense_backend/app/tasks/chat/stream_new_chat.py @@ -149,6 +149,7 @@ async def stream_new_chat( search_space_id: int, chat_id: int, session: AsyncSession, + user_id: str | None = None, llm_config_id: int = -1, attachments: list[ChatAttachment] | None = None, mentioned_document_ids: list[int] | None = None, @@ -166,6 +167,7 @@ async def stream_new_chat( search_space_id: The search space ID chat_id: The chat ID (used as LangGraph thread_id for memory) session: The database session + user_id: The current user's UUID string (for memory tools) llm_config_id: The LLM configuration ID (default: -1 for first global config) messages: Optional chat history from frontend (list of ChatMessage) attachments: Optional attachments with extracted content @@ -243,6 +245,7 @@ async def stream_new_chat( db_session=session, connector_service=connector_service, checkpointer=checkpointer, + user_id=user_id, # Pass user ID for memory tools agent_config=agent_config, # Pass prompt configuration firecrawl_api_key=firecrawl_api_key, # Pass Firecrawl API key if configured ) diff --git a/surfsense_backend/app/tasks/connector_indexers/github_indexer.py b/surfsense_backend/app/tasks/connector_indexers/github_indexer.py index e1844a503..f16ee0156 100644 --- a/surfsense_backend/app/tasks/connector_indexers/github_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/github_indexer.py @@ -1,5 +1,8 @@ """ -GitHub connector indexer. +GitHub connector indexer using gitingest. + +This indexer processes entire repository digests in one pass, dramatically +reducing LLM API calls compared to the previous file-by-file approach. """ from datetime import UTC, datetime @@ -8,7 +11,7 @@ from sqlalchemy.exc import SQLAlchemyError from sqlalchemy.ext.asyncio import AsyncSession from app.config import config -from app.connectors.github_connector import GitHubConnector +from app.connectors.github_connector import GitHubConnector, RepositoryDigest from app.db import Document, DocumentType, SearchSourceConnectorType from app.services.llm_service import get_user_long_context_llm from app.services.task_logging_service import TaskLoggingService @@ -26,43 +29,55 @@ from .base import ( logger, ) +# Maximum tokens for a single digest before splitting +# Most LLMs can handle 128k+ tokens now, but we'll be conservative +MAX_DIGEST_CHARS = 500_000 # ~125k tokens + async def index_github_repos( session: AsyncSession, connector_id: int, search_space_id: int, user_id: str, - start_date: str | None = None, - end_date: str | None = None, + start_date: str | None = None, # Ignored - GitHub indexes full repo snapshots + end_date: str | None = None, # Ignored - GitHub indexes full repo snapshots update_last_indexed: bool = True, ) -> tuple[int, str | None]: """ - Index code and documentation files from accessible GitHub repositories. + Index GitHub repositories using gitingest for efficient processing. + + This function ingests entire repositories as digests, generates a single + summary per repository, and chunks the content for vector storage. + + Note: The start_date and end_date parameters are accepted for API compatibility + but are IGNORED. GitHub repositories are indexed as complete snapshots since + gitingest captures the current state of the entire codebase. Args: session: Database session connector_id: ID of the GitHub connector search_space_id: ID of the search space to store documents in user_id: ID of the user - start_date: Start date for filtering (YYYY-MM-DD format) - Note: GitHub indexing processes all files regardless of dates - end_date: End date for filtering (YYYY-MM-DD format) - Note: GitHub indexing processes all files regardless of dates + start_date: Ignored - kept for API compatibility + end_date: Ignored - kept for API compatibility update_last_indexed: Whether to update the last_indexed_at timestamp (default: True) Returns: Tuple containing (number of documents indexed, error message or None) """ + # Note: start_date and end_date are intentionally unused + _ = start_date, end_date task_logger = TaskLoggingService(session, search_space_id) # Log task start log_entry = await task_logger.log_task_start( task_name="github_repos_indexing", source="connector_indexing_task", - message=f"Starting GitHub repositories indexing for connector {connector_id}", + message=f"Starting GitHub repositories indexing for connector {connector_id} (using gitingest)", metadata={ "connector_id": connector_id, "user_id": str(user_id), - "start_date": start_date, - "end_date": end_date, + "method": "gitingest", }, ) @@ -93,19 +108,11 @@ async def index_github_repos( f"Connector with ID {connector_id} not found or is not a GitHub connector", ) - # 2. Get the GitHub PAT and selected repositories from the connector config - github_pat = connector.config.get("GITHUB_PAT") + # 2. Get the GitHub PAT (optional) and selected repositories from the connector config + # PAT is only required for private repositories - public repos work without it + github_pat = connector.config.get("GITHUB_PAT") # Can be None or empty repo_full_names_to_index = connector.config.get("repo_full_names") - if not github_pat: - await task_logger.log_task_failure( - log_entry, - f"GitHub Personal Access Token (PAT) not found in connector config for connector {connector_id}", - "Missing GitHub PAT", - {"error_type": "MissingToken"}, - ) - return 0, "GitHub Personal Access Token (PAT) not found in connector config" - if not repo_full_names_to_index or not isinstance( repo_full_names_to_index, list ): @@ -117,10 +124,16 @@ async def index_github_repos( ) return 0, "'repo_full_names' not found or is not a list in connector config" - # 3. Initialize GitHub connector client + # Log whether we're using authentication + if github_pat: + logger.info("Using GitHub PAT for authentication (private repos supported)") + else: + logger.info("No GitHub PAT provided - only public repositories can be indexed") + + # 3. Initialize GitHub connector with gitingest backend await task_logger.log_task_progress( log_entry, - f"Initializing GitHub client for connector {connector_id}", + f"Initializing gitingest-based GitHub client for connector {connector_id}", { "stage": "client_initialization", "repo_count": len(repo_full_names_to_index), @@ -138,258 +151,57 @@ async def index_github_repos( ) return 0, f"Failed to initialize GitHub client: {e!s}" - # 4. Validate selected repositories + # 4. Process each repository with gitingest await task_logger.log_task_progress( log_entry, - f"Starting indexing for {len(repo_full_names_to_index)} selected repositories", + f"Starting gitingest processing for {len(repo_full_names_to_index)} repositories", { "stage": "repo_processing", "repo_count": len(repo_full_names_to_index), - "start_date": start_date, - "end_date": end_date, }, ) logger.info( - f"Starting indexing for {len(repo_full_names_to_index)} selected repositories." + f"Starting gitingest indexing for {len(repo_full_names_to_index)} repositories." ) - if start_date and end_date: - logger.info( - f"Date range requested: {start_date} to {end_date} (Note: GitHub indexing processes all files regardless of dates)" - ) - # 6. Iterate through selected repositories and index files for repo_full_name in repo_full_names_to_index: if not repo_full_name or not isinstance(repo_full_name, str): logger.warning(f"Skipping invalid repository entry: {repo_full_name}") continue - logger.info(f"Processing repository: {repo_full_name}") - try: - files_to_index = github_client.get_repository_files(repo_full_name) - if not files_to_index: - logger.info( - f"No indexable files found in repository: {repo_full_name}" - ) - continue + logger.info(f"Ingesting repository: {repo_full_name}") - logger.info( - f"Found {len(files_to_index)} files to process in {repo_full_name}" + try: + # Run gitingest via subprocess (isolated from event loop) + # Using to_thread to not block the async database operations + import asyncio + + digest = await asyncio.to_thread( + github_client.ingest_repository, repo_full_name ) - for file_info in files_to_index: - file_path = file_info.get("path") - file_url = file_info.get("url") - file_sha = file_info.get("sha") - file_type = file_info.get("type") # 'code' or 'doc' - full_path_key = f"{repo_full_name}/{file_path}" - - if not file_path or not file_url or not file_sha: - logger.warning( - f"Skipping file with missing info in {repo_full_name}: {file_info}" - ) - continue - - # Get file content - file_content = github_client.get_file_content( - repo_full_name, file_path + if not digest: + logger.warning( + f"No digest returned for repository: {repo_full_name}" ) + errors.append(f"No digest for {repo_full_name}") + continue - if file_content is None: - logger.warning( - f"Could not retrieve content for {full_path_key}. Skipping." - ) - continue # Skip if content fetch failed + # Process the digest and create documents + docs_created = await _process_repository_digest( + session=session, + digest=digest, + search_space_id=search_space_id, + user_id=user_id, + task_logger=task_logger, + log_entry=log_entry, + ) - # Generate unique identifier hash for this GitHub file - unique_identifier_hash = generate_unique_identifier_hash( - DocumentType.GITHUB_CONNECTOR, file_sha, search_space_id - ) - - # Generate content hash - content_hash = generate_content_hash(file_content, search_space_id) - - # Check if document with this unique identifier already exists - existing_document = await check_document_by_unique_identifier( - session, unique_identifier_hash - ) - - if existing_document: - # Document exists - check if content has changed - if existing_document.content_hash == content_hash: - logger.info( - f"Document for GitHub file {full_path_key} unchanged. Skipping." - ) - continue - else: - # Content has changed - update the existing document - logger.info( - f"Content changed for GitHub file {full_path_key}. Updating document." - ) - - # Generate summary with metadata - user_llm = await get_user_long_context_llm( - session, user_id, search_space_id - ) - if user_llm: - file_extension = ( - file_path.split(".")[-1] - if "." in file_path - else None - ) - document_metadata = { - "file_path": full_path_key, - "repository": repo_full_name, - "file_type": file_extension or "unknown", - "document_type": "GitHub Repository File", - "connector_type": "GitHub", - } - ( - summary_content, - summary_embedding, - ) = await generate_document_summary( - file_content, user_llm, document_metadata - ) - else: - summary_content = f"GitHub file: {full_path_key}\n\n{file_content[:1000]}..." - summary_embedding = ( - config.embedding_model_instance.embed( - summary_content - ) - ) - - # Chunk the content - try: - if hasattr(config, "code_chunker_instance"): - chunks_data = [ - await create_document_chunks(file_content) - ][0] - else: - chunks_data = await create_document_chunks( - file_content - ) - except Exception as chunk_err: - logger.error( - f"Failed to chunk file {full_path_key}: {chunk_err}" - ) - continue - - # Update existing document - existing_document.title = f"GitHub - {full_path_key}" - existing_document.content = summary_content - existing_document.content_hash = content_hash - existing_document.embedding = summary_embedding - existing_document.document_metadata = { - "file_path": file_path, - "file_sha": file_sha, - "file_url": file_url, - "repository": repo_full_name, - "indexed_at": datetime.now(UTC).strftime( - "%Y-%m-%d %H:%M:%S" - ), - } - existing_document.chunks = chunks_data - existing_document.updated_at = get_current_timestamp() - - logger.info( - f"Successfully updated GitHub file {full_path_key}" - ) - continue - - # Document doesn't exist - create new one - # Generate summary with metadata - user_llm = await get_user_long_context_llm( - session, user_id, search_space_id - ) - if user_llm: - # Extract file extension from file path - file_extension = ( - file_path.split(".")[-1] if "." in file_path else None - ) - document_metadata = { - "file_path": full_path_key, - "repository": repo_full_name, - "file_type": file_extension or "unknown", - "document_type": "GitHub Repository File", - "connector_type": "GitHub", - } - ( - summary_content, - summary_embedding, - ) = await generate_document_summary( - file_content, user_llm, document_metadata - ) - else: - # Fallback to simple summary if no LLM configured - summary_content = ( - f"GitHub file: {full_path_key}\n\n{file_content[:1000]}..." - ) - summary_embedding = config.embedding_model_instance.embed( - summary_content - ) - - # Chunk the content - try: - chunks_data = [await create_document_chunks(file_content)][0] - - # Use code chunker if available, otherwise regular chunker - if hasattr(config, "code_chunker_instance"): - chunks_data = [ - { - "content": chunk.text, - "embedding": config.embedding_model_instance.embed( - chunk.text - ), - } - for chunk in config.code_chunker_instance.chunk( - file_content - ) - ] - else: - chunks_data = await create_document_chunks(file_content) - - except Exception as chunk_err: - logger.error( - f"Failed to chunk file {full_path_key}: {chunk_err}" - ) - errors.append( - f"Chunking failed for {full_path_key}: {chunk_err}" - ) - continue # Skip this file if chunking fails - - doc_metadata = { - "repository_full_name": repo_full_name, - "file_path": file_path, - "full_path": full_path_key, # For easier lookup - "url": file_url, - "sha": file_sha, - "type": file_type, - "indexed_at": datetime.now(UTC).isoformat(), - } - - # Create new document - logger.info(f"Creating new document for file: {full_path_key}") - document = Document( - title=f"GitHub - {file_path}", - document_type=DocumentType.GITHUB_CONNECTOR, - document_metadata=doc_metadata, - content=summary_content, # Store summary - content_hash=content_hash, - unique_identifier_hash=unique_identifier_hash, - embedding=summary_embedding, - search_space_id=search_space_id, - chunks=chunks_data, # Associate chunks directly - updated_at=get_current_timestamp(), - ) - session.add(document) - documents_processed += 1 - - # Batch commit every 10 documents - if documents_processed % 10 == 0: - logger.info( - f"Committing batch: {documents_processed} GitHub files processed so far" - ) - await session.commit() + documents_processed += docs_created + logger.info( + f"Created {docs_created} documents from repository: {repo_full_name}" + ) except Exception as repo_err: logger.error( @@ -397,11 +209,11 @@ async def index_github_repos( ) errors.append(f"Failed processing {repo_full_name}: {repo_err}") - # Final commit for any remaining documents not yet committed in batches - logger.info(f"Final commit: Total {documents_processed} GitHub files processed") + # Final commit await session.commit() logger.info( - f"Finished GitHub indexing for connector {connector_id}. Processed {documents_processed} files." + f"Finished GitHub indexing for connector {connector_id}. " + f"Created {documents_processed} documents." ) # Log success @@ -412,6 +224,7 @@ async def index_github_repos( "documents_processed": documents_processed, "errors_count": len(errors), "repo_count": len(repo_full_names_to_index), + "method": "gitingest", }, ) @@ -428,6 +241,7 @@ async def index_github_repos( ) errors.append(f"Database error: {db_err}") return documents_processed, "; ".join(errors) if errors else str(db_err) + except Exception as e: await session.rollback() await task_logger.log_task_failure( @@ -445,3 +259,173 @@ async def index_github_repos( error_message = "; ".join(errors) if errors else None return documents_processed, error_message + + +async def _process_repository_digest( + session: AsyncSession, + digest: RepositoryDigest, + search_space_id: int, + user_id: str, + task_logger: TaskLoggingService, + log_entry, +) -> int: + """ + Process a repository digest and create documents. + + For each repository, we create: + 1. One main document with the repository summary + 2. Chunks from the full digest content for granular search + + Args: + session: Database session + digest: The repository digest from gitingest + search_space_id: ID of the search space + user_id: ID of the user + task_logger: Task logging service + log_entry: Current log entry + + Returns: + Number of documents created + """ + repo_full_name = digest.repo_full_name + documents_created = 0 + + # Generate unique identifier based on repo name and content hash + # This allows updates when repo content changes + full_content = digest.full_digest + content_hash = generate_content_hash(full_content, search_space_id) + + # Use repo name as the unique identifier (one document per repo) + unique_identifier_hash = generate_unique_identifier_hash( + DocumentType.GITHUB_CONNECTOR, repo_full_name, search_space_id + ) + + # Check if document with this unique identifier already exists + existing_document = await check_document_by_unique_identifier( + session, unique_identifier_hash + ) + + if existing_document: + # Document exists - check if content has changed + if existing_document.content_hash == content_hash: + logger.info( + f"Repository {repo_full_name} unchanged. Skipping." + ) + return 0 + else: + logger.info( + f"Content changed for repository {repo_full_name}. Updating document." + ) + # Delete existing document to replace with new one + await session.delete(existing_document) + await session.flush() + + # Generate summary using LLM (ONE call per repository!) + user_llm = await get_user_long_context_llm(session, user_id, search_space_id) + + document_metadata = { + "repository": repo_full_name, + "document_type": "GitHub Repository", + "connector_type": "GitHub", + "ingestion_method": "gitingest", + "file_tree": digest.tree[:2000] if len(digest.tree) > 2000 else digest.tree, + "estimated_tokens": digest.estimated_tokens, + } + + if user_llm: + # Prepare content for summarization + # Include tree structure and truncated content if too large + summary_content = digest.full_digest + if len(summary_content) > MAX_DIGEST_CHARS: + # Truncate but keep the tree and beginning of content + summary_content = ( + f"# Repository: {repo_full_name}\n\n" + f"## File Structure\n\n{digest.tree}\n\n" + f"## File Contents (truncated)\n\n{digest.content[:MAX_DIGEST_CHARS - len(digest.tree) - 200]}..." + ) + + summary_text, summary_embedding = await generate_document_summary( + summary_content, user_llm, document_metadata + ) + else: + # Fallback to simple summary if no LLM configured + summary_text = ( + f"# GitHub Repository: {repo_full_name}\n\n" + f"## Summary\n{digest.summary}\n\n" + f"## File Structure\n{digest.tree[:3000]}" + ) + summary_embedding = config.embedding_model_instance.embed(summary_text) + + # Chunk the full digest content for granular search + try: + # Use the content (not the summary) for chunking + # This preserves file-level granularity in search + chunks_data = await create_document_chunks(digest.content) + except Exception as chunk_err: + logger.error( + f"Failed to chunk repository {repo_full_name}: {chunk_err}" + ) + # Fall back to a simpler chunking approach + chunks_data = await _simple_chunk_content(digest.content) + + # Create the document + doc_metadata = { + "repository_full_name": repo_full_name, + "url": f"https://github.com/{repo_full_name}", + "branch": digest.branch, + "ingestion_method": "gitingest", + "file_tree": digest.tree, + "gitingest_summary": digest.summary, + "estimated_tokens": digest.estimated_tokens, + "indexed_at": datetime.now(UTC).isoformat(), + } + + document = Document( + title=f"GitHub Repository: {repo_full_name}", + document_type=DocumentType.GITHUB_CONNECTOR, + document_metadata=doc_metadata, + content=summary_text, + content_hash=content_hash, + unique_identifier_hash=unique_identifier_hash, + embedding=summary_embedding, + search_space_id=search_space_id, + chunks=chunks_data, + updated_at=get_current_timestamp(), + ) + + session.add(document) + documents_created += 1 + + logger.info( + f"Created document for repository {repo_full_name} " + f"with {len(chunks_data)} chunks" + ) + + return documents_created + + +async def _simple_chunk_content(content: str, chunk_size: int = 4000) -> list: + """ + Simple fallback chunking when the regular chunker fails. + + Args: + content: The content to chunk + chunk_size: Size of each chunk in characters + + Returns: + List of chunk dictionaries with content and embedding + """ + from app.db import Chunk + + chunks = [] + for i in range(0, len(content), chunk_size): + chunk_text = content[i : i + chunk_size] + if chunk_text.strip(): + chunks.append( + Chunk( + content=chunk_text, + embedding=config.embedding_model_instance.embed(chunk_text), + ) + ) + + return chunks diff --git a/surfsense_backend/app/utils/validators.py b/surfsense_backend/app/utils/validators.py index 54e681518..6a87679ec 100644 --- a/surfsense_backend/app/utils/validators.py +++ b/surfsense_backend/app/utils/validators.py @@ -530,7 +530,10 @@ def validate_connector_config( # "validators": {}, # }, "GITHUB_CONNECTOR": { - "required": ["GITHUB_PAT", "repo_full_names"], + # GITHUB_PAT is optional - only required for private repositories + # Public repositories can be indexed without authentication + "required": ["repo_full_names"], + "optional": ["GITHUB_PAT"], # Optional - only needed for private repos "validators": { "repo_full_names": lambda: validate_list_field( "repo_full_names", "repo_full_names" diff --git a/surfsense_backend/pyproject.toml b/surfsense_backend/pyproject.toml index c208ea8e2..f96619607 100644 --- a/surfsense_backend/pyproject.toml +++ b/surfsense_backend/pyproject.toml @@ -60,6 +60,7 @@ dependencies = [ "mcp>=1.25.0", "starlette>=0.40.0,<0.51.0", "sse-starlette>=3.1.1,<3.1.2", + "gitingest>=0.3.1", "composio>=0.10.9", ] diff --git a/surfsense_backend/uv.lock b/surfsense_backend/uv.lock index bc98b250f..96282380b 100644 --- a/surfsense_backend/uv.lock +++ b/surfsense_backend/uv.lock @@ -1978,6 +1978,25 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/61/ad/2394d4fb542574678b0ba342daf734d4d811768da3c2ee0c84d509dcb26c/github3.py-4.0.1-py3-none-any.whl", hash = "sha256:a89af7de25650612d1da2f0609622bcdeb07ee8a45a1c06b2d16a05e4234e753", size = 151800 }, ] +[[package]] +name = "gitingest" +version = "0.3.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "click" }, + { name = "httpx" }, + { name = "loguru" }, + { name = "pathspec" }, + { name = "pydantic" }, + { name = "python-dotenv" }, + { name = "starlette" }, + { name = "tiktoken" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d6/fe/a915f0c32a3d7920206a677f73c185b3eadf4ec151fb05aedd52e64713f7/gitingest-0.3.1.tar.gz", hash = "sha256:4587cab873d4e08bdb16d612bb153c23e0ce59771a1d57a438239c5e39f05ebf", size = 70681, upload-time = "2025-07-31T13:56:19.845Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/00/15/f200ab2e73287e67d1dce6fbacf421552ae9fbafdc5f0cc8dd0d2fe4fc47/gitingest-0.3.1-py3-none-any.whl", hash = "sha256:8143a5e6a7140ede9f680e13d3931ac07c82ac9bd8bab9ad1fba017c8c1e8666", size = 68343, upload-time = "2025-07-31T13:56:17.729Z" }, +] + [[package]] name = "google-api-core" version = "2.25.1" @@ -4493,6 +4512,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/39/c2/646d2e93e0af70f4e5359d870a63584dacbc324b54d73e6b3267920ff117/pandas-2.3.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:bb3be958022198531eb7ec2008cfc78c5b1eed51af8600c6c5d9160d89d8d249", size = 13231847 }, ] +[[package]] +name = "pathspec" +version = "1.0.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/4c/b2/bb8e495d5262bfec41ab5cb18f522f1012933347fb5d9e62452d446baca2/pathspec-1.0.3.tar.gz", hash = "sha256:bac5cf97ae2c2876e2d25ebb15078eb04d76e4b98921ee31c6f85ade8b59444d", size = 130841, upload-time = "2026-01-09T15:46:46.009Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/32/2b/121e912bd60eebd623f873fd090de0e84f322972ab25a7f9044c056804ed/pathspec-1.0.3-py3-none-any.whl", hash = "sha256:e80767021c1cc524aa3fb14bedda9c34406591343cc42797b386ce7b9354fb6c", size = 55021, upload-time = "2026-01-09T15:46:44.652Z" }, +] + [[package]] name = "pdf2image" version = "1.17.0" @@ -6523,6 +6551,7 @@ dependencies = [ { name = "firecrawl-py" }, { name = "flower" }, { name = "github3-py" }, + { name = "gitingest" }, { name = "google-api-python-client" }, { name = "google-auth-oauthlib" }, { name = "kokoro" }, @@ -6589,6 +6618,7 @@ requires-dist = [ { name = "firecrawl-py", specifier = ">=4.9.0" }, { name = "flower", specifier = ">=2.0.1" }, { name = "github3-py", specifier = "==4.0.1" }, + { name = "gitingest", specifier = ">=0.3.1" }, { name = "google-api-python-client", specifier = ">=2.156.0" }, { name = "google-auth-oauthlib", specifier = ">=1.2.1" }, { name = "kokoro", specifier = ">=0.9.4" }, diff --git a/surfsense_web/app/dashboard/[search_space_id]/client-layout.tsx b/surfsense_web/app/dashboard/[search_space_id]/client-layout.tsx index 7b1bb61b0..bbafa9703 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/client-layout.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/client-layout.tsx @@ -16,7 +16,6 @@ import { import { activeSearchSpaceIdAtom } from "@/atoms/search-spaces/search-space-query.atoms"; import { DocumentUploadDialogProvider } from "@/components/assistant-ui/document-upload-popup"; import { DashboardBreadcrumb } from "@/components/dashboard-breadcrumb"; -import { LanguageSwitcher } from "@/components/LanguageSwitcher"; import { LayoutDataProvider } from "@/components/layout"; import { OnboardingTour } from "@/components/onboarding-tour"; import { Card, CardContent, CardDescription, CardHeader, CardTitle } from "@/components/ui/card"; @@ -197,11 +196,7 @@ export function DashboardClientLayout({ return ( - } - languageSwitcher={} - > + }> {children} diff --git a/surfsense_web/app/dashboard/[search_space_id]/new-chat/[[...chat_id]]/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/new-chat/[[...chat_id]]/page.tsx index 43c33ba5a..c20436a5e 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/new-chat/[[...chat_id]]/page.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/new-chat/[[...chat_id]]/page.tsx @@ -32,6 +32,7 @@ import { DisplayImageToolUI } from "@/components/tool-ui/display-image"; import { GeneratePodcastToolUI } from "@/components/tool-ui/generate-podcast"; import { LinkPreviewToolUI } from "@/components/tool-ui/link-preview"; import { ScrapeWebpageToolUI } from "@/components/tool-ui/scrape-webpage"; +import { SaveMemoryToolUI, RecallMemoryToolUI } from "@/components/tool-ui/user-memory"; // import { WriteTodosToolUI } from "@/components/tool-ui/write-todos"; import { getBearerToken } from "@/lib/auth-utils"; import { createAttachmentAdapter, extractAttachmentContent } from "@/lib/chat/attachment-adapter"; @@ -1056,17 +1057,13 @@ export default function NewChatPage() { + + {/* Disabled for now */}
- } + header={} />
diff --git a/surfsense_web/app/dashboard/[search_space_id]/team/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/team/page.tsx index f00982555..6701342de 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/team/page.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/team/page.tsx @@ -778,8 +778,7 @@ function RolesTab({ role.name === "Owner" && "text-amber-600", role.name === "Editor" && "text-blue-600", role.name === "Viewer" && "text-gray-600", - !["Owner", "Editor", "Viewer"].includes(role.name) && - "text-primary" + !["Owner", "Editor", "Viewer"].includes(role.name) && "text-primary" )} /> @@ -1488,7 +1487,8 @@ function CreateRoleDialog({

- Use presets to quickly apply Editor (create/read/update) or Viewer (read-only) permissions + Use presets to quickly apply Editor (create/read/update) or Viewer (read-only) + permissions

@@ -1500,9 +1500,7 @@ function CreateRoleDialog({ return (
-