From f488c106fa120c65005f60c4d4ff9381a74bd046 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Wed, 23 Jul 2025 00:33:04 +0200 Subject: [PATCH 01/17] feat: add a starter implementation of JIRA connector module --- .pre-commit-config.yaml | 4 +- .../app/connectors/jira_connector.py | 90 +++++++++++++++++++ 2 files changed, 92 insertions(+), 2 deletions(-) create mode 100644 surfsense_backend/app/connectors/jira_connector.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0548e6667..78de72540 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -67,7 +67,7 @@ repos: hooks: - id: mypy files: ^surfsense_backend/ - additional_dependencies: [] + additional_dependencies: ['types-requests'] args: [--ignore-missing-imports, --disallow-untyped-defs] - repo: https://github.com/PyCQA/bandit @@ -75,7 +75,7 @@ repos: hooks: - id: bandit files: ^surfsense_backend/ - args: ['-r', '.', '-f', 'json'] + args: ['-r', '-f', 'json'] exclude: ^surfsense_backend/(tests/|alembic/) # Frontend/Extension Hooks (TypeScript/JavaScript) diff --git a/surfsense_backend/app/connectors/jira_connector.py b/surfsense_backend/app/connectors/jira_connector.py new file mode 100644 index 000000000..7f2345311 --- /dev/null +++ b/surfsense_backend/app/connectors/jira_connector.py @@ -0,0 +1,90 @@ +""" +Jira Connector Module + +A module for retrieving data from Jira. +Allows fetching issue lists and their comments, projects and more. +""" + +from typing import Any, Dict, Optional + +import requests + + +class JiraConnector: + """Class for retrieving data from Jira.""" + + def __init__( + self, + base_url: Optional[str] = None, + personal_access_token: Optional[str] = None, + ): + """ + Initialize the JiraConnector class. + + Args: + base_url: Jira instance base URL (e.g., 'https://yourcompany.atlassian.net') (optional) + personal_access_token: Jira personal access token (optional) + """ + self.base_url = base_url + self.personal_access_token = personal_access_token + self.api_version = "3" # Jira Cloud API version + + def set_personal_access_token(self, personal_access_token: str) -> None: + """ + Set the Jira personal access token. + + Args: + personal_access_token: Jira personal access token + """ + self.personal_access_token = personal_access_token + + def get_headers(self) -> Dict[str, str]: + """ + Get headers for Jira API requests. + + Returns: + Dictionary of headers + + Raises: + ValueError: If personal_access_token or base_url have not been set + """ + if not all([self.base_url, self.personal_access_token]): + raise ValueError("Jira personal access token or base URL not initialized.") + + return { + "Content-Type": "application/json", + "Authorization": f"Bearer {self.personal_access_token}", + "Accept": "application/json", + } + + def make_api_request( + self, endpoint: str, params: Optional[Dict[str, Any]] = None + ) -> Dict[str, Any]: + """ + Make a request to the Jira API. 
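+
+        Example (illustrative sketch; assumes a reachable Jira Cloud
+        instance and a valid token, and every value below is a placeholder):
+
+            connector = JiraConnector(
+                base_url="https://yourcompany.atlassian.net",
+                personal_access_token="<token>",
+            )
+            # "project/search" is the endpoint get_all_projects() uses
+            projects = connector.make_api_request("project/search")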
+ + Args: + endpoint: API endpoint (without base URL) + params: Query parameters for the request (optional) + + Returns: + Response data from the API + + Raises: + ValueError: If personal_access_token or base_url have not been set + Exception: If the API request fails + """ + if not all([self.base_url, self.personal_access_token]): + raise ValueError("Jira personal access token or base URL not initialized.") + + url = f"{self.base_url}/rest/api/{self.api_version}/{endpoint}" + headers = self.get_headers() + + response = requests.get(url, headers=headers, params=params, timeout=500) + + if response.status_code == 200: + return response.json() + else: + raise Exception( + f"API request failed with status code {response.status_code}: {response.text}" + ) From c4eab5eaba7c9b44dc40b15f49b01632bd97417b Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Thu, 24 Jul 2025 00:41:44 +0200 Subject: [PATCH 02/17] feat: add jira connector module implementation --- .../app/connectors/jira_connector.py | 374 +++++++++++++++++- 1 file changed, 372 insertions(+), 2 deletions(-) diff --git a/surfsense_backend/app/connectors/jira_connector.py b/surfsense_backend/app/connectors/jira_connector.py index 7f2345311..14b2147e0 100644 --- a/surfsense_backend/app/connectors/jira_connector.py +++ b/surfsense_backend/app/connectors/jira_connector.py @@ -5,7 +5,7 @@ A module for retrieving data from Jira. Allows fetching issue lists and their comments, projects and more. """ -from typing import Any, Dict, Optional +from typing import Any, Dict, List, Optional import requests @@ -25,10 +25,21 @@ class JiraConnector: base_url: Jira instance base URL (e.g., 'https://yourcompany.atlassian.net') (optional) personal_access_token: Jira personal access token (optional) """ - self.base_url = base_url + self.base_url = base_url.rstrip("/") if base_url else None self.personal_access_token = personal_access_token self.api_version = "3" # Jira Cloud API version + def set_credentials(self, base_url: str, personal_access_token: str) -> None: + """ + Set the Jira credentials. + + Args: + base_url: Jira instance base URL + personal_access_token: Jira personal access token + """ + self.base_url = base_url.rstrip("/") + self.personal_access_token = personal_access_token + def set_personal_access_token(self, personal_access_token: str) -> None: """ Set the Jira personal access token. @@ -88,3 +99,362 @@ class JiraConnector: raise Exception( f"API request failed with status code {response.status_code}: {response.text}" ) + + def get_all_projects(self) -> dict[str, Any]: + """ + Fetch all projects from Jira. + + Returns: + List of project objects + + Raises: + ValueError: If credentials have not been set + Exception: If the API request fails + """ + return self.make_api_request("project/search") + + def get_all_issues(self, project_key: Optional[str] = None) -> List[Dict[str, Any]]: + """ + Fetch all issues from Jira. 
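+
+        Example (illustrative; assumes a connector initialized as in the
+        make_api_request example, with "PROJ" as a placeholder project key):
+
+            issues = connector.get_all_issues(project_key="PROJ")
+            print(f"Total issues fetched: {len(issues)}")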
+ + Args: + project_key: Optional project key to filter issues (e.g., 'PROJ') + + Returns: + List of issue objects + + Raises: + ValueError: If credentials have not been set + Exception: If the API request fails + """ + jql = "ORDER BY created DESC" + if project_key: + jql = f'project = "{project_key}" ' + jql + + fields = [ + "summary", + "description", + "status", + "assignee", + "reporter", + "created", + "updated", + "priority", + "issuetype", + "project", + ] + + params = { + "jql": jql, + "fields": ",".join(fields), + "maxResults": 100, + "startAt": 0, + } + + all_issues = [] + start_at = 0 + + while True: + params["startAt"] = start_at + result = self.make_api_request("search", params) + + if not isinstance(result, dict) or "issues" not in result: + raise Exception("Invalid response from Jira API") + + issues = result["issues"] + all_issues.extend(issues) + + print(f"Fetched {len(issues)} issues (startAt={start_at})") + + total = result.get("total", 0) + if start_at + len(issues) >= total: + break + + start_at += len(issues) + + return all_issues + + def get_issues_by_date_range( + self, + start_date: str, + end_date: str, + include_comments: bool = True, + project_key: Optional[str] = None, + ) -> tuple[List[Dict[str, Any]], Optional[str]]: + """ + Fetch issues within a date range. + + Args: + start_date: Start date in YYYY-MM-DD format + end_date: End date in YYYY-MM-DD format (inclusive) + include_comments: Whether to include comments in the response + project_key: Optional project key to filter issues + + Returns: + Tuple containing (issues list, error message or None) + """ + try: + # Build JQL query for date range + # Query issues that were either created OR updated within the date range + date_filter = f"(created >= '{start_date}' AND created <= '{end_date}') OR (updated >= '{start_date}' AND updated <= '{end_date}')" + + jql = f"{date_filter} ORDER BY created DESC" + if project_key: + jql = ( + f'project = "{project_key}" AND {date_filter} ORDER BY created DESC' + ) + + # Define fields to retrieve + fields = [ + "summary", + "description", + "status", + "assignee", + "reporter", + "created", + "updated", + "priority", + "issuetype", + "project", + ] + + if include_comments: + fields.append("comment") + + params = { + "jql": jql, + "fields": ",".join(fields), + "maxResults": 100, + "startAt": 0, + } + + all_issues = [] + start_at = 0 + + while True: + params["startAt"] = start_at + result = self.make_api_request("search", params) + + if not isinstance(result, dict) or "issues" not in result: + return [], "Invalid response from Jira API" + + issues = result["issues"] + all_issues.extend(issues) + + # Check if there are more issues to fetch + total = result.get("total", 0) + if start_at + len(issues) >= total: + break + + start_at += len(issues) + + if not all_issues: + return [], "No issues found in the specified date range." + + return all_issues, None + + except Exception as e: + return [], f"Error fetching issues: {str(e)}" + + def format_issue(self, issue: Dict[str, Any]) -> Dict[str, Any]: + """ + Format an issue for easier consumption. 
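+
+        Example (illustrative; the field values shown are hypothetical):
+
+            raw_issue = connector.get_all_issues(project_key="PROJ")[0]
+            formatted = connector.format_issue(raw_issue)
+            formatted["key"]       # e.g. "PROJ-1"
+            formatted["status"]    # e.g. "In Progress"
+            formatted["assignee"]  # dict of account_id/display_name/email, or None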
+ + Args: + issue: The issue object from Jira API + + Returns: + Formatted issue dictionary + """ + fields = issue.get("fields", {}) + + # Extract basic issue details + formatted = { + "id": issue.get("id", ""), + "key": issue.get("key", ""), + "title": fields.get("summary", ""), + "description": fields.get("description", ""), + "status": ( + fields.get("status", {}).get("name", "Unknown") + if fields.get("status") + else "Unknown" + ), + "status_category": ( + fields.get("status", {}) + .get("statusCategory", {}) + .get("name", "Unknown") + if fields.get("status") + else "Unknown" + ), + "priority": ( + fields.get("priority", {}).get("name", "Unknown") + if fields.get("priority") + else "Unknown" + ), + "issue_type": ( + fields.get("issuetype", {}).get("name", "Unknown") + if fields.get("issuetype") + else "Unknown" + ), + "project": ( + fields.get("project", {}).get("key", "Unknown") + if fields.get("project") + else "Unknown" + ), + "created_at": fields.get("created", ""), + "updated_at": fields.get("updated", ""), + "reporter": ( + { + "account_id": ( + fields.get("reporter", {}).get("accountId", "") + if fields.get("reporter") + else "" + ), + "display_name": ( + fields.get("reporter", {}).get("displayName", "Unknown") + if fields.get("reporter") + else "Unknown" + ), + "email": ( + fields.get("reporter", {}).get("emailAddress", "") + if fields.get("reporter") + else "" + ), + } + if fields.get("reporter") + else {"account_id": "", "display_name": "Unknown", "email": ""} + ), + "assignee": ( + { + "account_id": fields.get("assignee", {}).get("accountId", ""), + "display_name": fields.get("assignee", {}).get( + "displayName", "Unknown" + ), + "email": fields.get("assignee", {}).get("emailAddress", ""), + } + if fields.get("assignee") + else None + ), + "comments": [], + } + + # Extract comments if available + if "comment" in fields and "comments" in fields["comment"]: + for comment in fields["comment"]["comments"]: + formatted_comment = { + "id": comment.get("id", ""), + "body": comment.get("body", ""), + "created_at": comment.get("created", ""), + "updated_at": comment.get("updated", ""), + "author": ( + { + "account_id": ( + comment.get("author", {}).get("accountId", "") + if comment.get("author") + else "" + ), + "display_name": ( + comment.get("author", {}).get("displayName", "Unknown") + if comment.get("author") + else "Unknown" + ), + "email": ( + comment.get("author", {}).get("emailAddress", "") + if comment.get("author") + else "" + ), + } + if comment.get("author") + else {"account_id": "", "display_name": "Unknown", "email": ""} + ), + } + formatted["comments"].append(formatted_comment) + + return formatted + + def format_issue_to_markdown(self, issue: Dict[str, Any]) -> str: + """ + Convert an issue to markdown format. 
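+
+        Example (illustrative; output abridged, issue values hypothetical):
+
+            markdown = connector.format_issue_to_markdown(formatted)
+            # "# PROJ-1: Fix login form\n\n**Status:** In Progress\n..."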
+ + Args: + issue: The issue object (either raw or formatted) + + Returns: + Markdown string representation of the issue + """ + # Format the issue if it's not already formatted + if "key" not in issue: + issue = self.format_issue(issue) + + # Build the markdown content + markdown = ( + f"# {issue.get('key', 'No Key')}: {issue.get('title', 'No Title')}\n\n" + ) + + if issue.get("status"): + markdown += f"**Status:** {issue['status']}\n" + + if issue.get("priority"): + markdown += f"**Priority:** {issue['priority']}\n" + + if issue.get("issue_type"): + markdown += f"**Type:** {issue['issue_type']}\n" + + if issue.get("project"): + markdown += f"**Project:** {issue['project']}\n\n" + + if issue.get("assignee") and issue["assignee"].get("display_name"): + markdown += f"**Assignee:** {issue['assignee']['display_name']}\n" + + if issue.get("reporter") and issue["reporter"].get("display_name"): + markdown += f"**Reporter:** {issue['reporter']['display_name']}\n" + + if issue.get("created_at"): + created_date = self.format_date(issue["created_at"]) + markdown += f"**Created:** {created_date}\n" + + if issue.get("updated_at"): + updated_date = self.format_date(issue["updated_at"]) + markdown += f"**Updated:** {updated_date}\n\n" + + if issue.get("description"): + markdown += f"## Description\n\n{issue['description']}\n\n" + + if issue.get("comments"): + markdown += f"## Comments ({len(issue['comments'])})\n\n" + + for comment in issue["comments"]: + author_name = "Unknown" + if comment.get("author") and comment["author"].get("display_name"): + author_name = comment["author"]["display_name"] + + comment_date = "Unknown date" + if comment.get("created_at"): + comment_date = self.format_date(comment["created_at"]) + + markdown += f"### {author_name} ({comment_date})\n\n{comment.get('body', '')}\n\n---\n\n" + + return markdown + + @staticmethod + def format_date(iso_date: str) -> str: + """ + Format an ISO date string to a more readable format. 
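+
+        Example (doctest-style; datetime.fromisoformat accepts the
+        colon-less offset "+0000" only on Python 3.11+, so on older
+        versions this falls back to returning the input unchanged):
+
+            >>> JiraConnector.format_date("2023-01-01T12:00:00.000+0000")
+            '2023-01-01 12:00:00'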
+ + Args: + iso_date: ISO format date string + + Returns: + Formatted date string + """ + if not iso_date or not isinstance(iso_date, str): + return "Unknown date" + + try: + from datetime import datetime + + # Jira dates are typically in format: 2023-01-01T12:00:00.000+0000 + dt = datetime.fromisoformat(iso_date.replace("Z", "+00:00")) + return dt.strftime("%Y-%m-%d %H:%M:%S") + except ValueError: + return iso_date From 90bfec6e7d513652da98fcab039d2dda052c53c7 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Thu, 24 Jul 2025 11:33:38 +0200 Subject: [PATCH 03/17] add jira to document type enum and search source connector type enum --- surfsense_backend/app/db.py | 283 ++++++++++++++++++++++++++---------- 1 file changed, 207 insertions(+), 76 deletions(-) diff --git a/surfsense_backend/app/db.py b/surfsense_backend/app/db.py index 7caf36533..bd982e4cf 100644 --- a/surfsense_backend/app/db.py +++ b/surfsense_backend/app/db.py @@ -2,30 +2,30 @@ from collections.abc import AsyncGenerator from datetime import datetime, timezone from enum import Enum +from app.config import config +from app.retriver.chunks_hybrid_search import ChucksHybridSearchRetriever +from app.retriver.documents_hybrid_search import DocumentHybridSearchRetriever from fastapi import Depends - from pgvector.sqlalchemy import Vector from sqlalchemy import ( ARRAY, + JSON, + TIMESTAMP, Boolean, Column, - Enum as SQLAlchemyEnum, +) +from sqlalchemy import Enum as SQLAlchemyEnum +from sqlalchemy import ( ForeignKey, Integer, - JSON, String, Text, text, - TIMESTAMP ) from sqlalchemy.dialects.postgresql import UUID from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine from sqlalchemy.orm import DeclarativeBase, Mapped, declared_attr, relationship -from app.config import config -from app.retriver.chunks_hybrid_search import ChucksHybridSearchRetriever -from app.retriver.documents_hybrid_search import DocumentHybridSearchRetriever - if config.AUTH_TYPE == "GOOGLE": from fastapi_users.db import ( SQLAlchemyBaseOAuthAccountTableUUID, @@ -51,9 +51,11 @@ class DocumentType(str, Enum): GITHUB_CONNECTOR = "GITHUB_CONNECTOR" LINEAR_CONNECTOR = "LINEAR_CONNECTOR" DISCORD_CONNECTOR = "DISCORD_CONNECTOR" + JIRA_CONNECTOR = "JIRA_CONNECTOR" + class SearchSourceConnectorType(str, Enum): - SERPER_API = "SERPER_API" # NOT IMPLEMENTED YET : DON'T REMEMBER WHY : MOST PROBABLY BECAUSE WE NEED TO CRAWL THE RESULTS RETURNED BY IT + SERPER_API = "SERPER_API" # NOT IMPLEMENTED YET : DON'T REMEMBER WHY : MOST PROBABLY BECAUSE WE NEED TO CRAWL THE RESULTS RETURNED BY IT TAVILY_API = "TAVILY_API" LINKUP_API = "LINKUP_API" SLACK_CONNECTOR = "SLACK_CONNECTOR" @@ -61,13 +63,16 @@ class SearchSourceConnectorType(str, Enum): GITHUB_CONNECTOR = "GITHUB_CONNECTOR" LINEAR_CONNECTOR = "LINEAR_CONNECTOR" DISCORD_CONNECTOR = "DISCORD_CONNECTOR" - + JIRA_CONNECTOR = "JIRA_CONNECTOR" + + class ChatType(str, Enum): QNA = "QNA" REPORT_GENERAL = "REPORT_GENERAL" REPORT_DEEP = "REPORT_DEEP" REPORT_DEEPER = "REPORT_DEEPER" + class LiteLLMProvider(str, Enum): OPENAI = "OPENAI" ANTHROPIC = "ANTHROPIC" @@ -92,6 +97,7 @@ class LiteLLMProvider(str, Enum): PETALS = "PETALS" CUSTOM = "CUSTOM" + class LogLevel(str, Enum): DEBUG = "DEBUG" INFO = "INFO" @@ -99,18 +105,27 @@ class LogLevel(str, Enum): ERROR = "ERROR" CRITICAL = "CRITICAL" + class LogStatus(str, Enum): IN_PROGRESS = "IN_PROGRESS" SUCCESS = "SUCCESS" FAILED = "FAILED" - + + class Base(DeclarativeBase): pass + class TimestampMixin: @declared_attr def created_at(cls): - return 
Column(TIMESTAMP(timezone=True), nullable=False, default=lambda: datetime.now(timezone.utc), index=True) + return Column( + TIMESTAMP(timezone=True), + nullable=False, + default=lambda: datetime.now(timezone.utc), + index=True, + ) + class BaseModel(Base): __abstract__ = True @@ -118,6 +133,7 @@ class BaseModel(Base): id = Column(Integer, primary_key=True, index=True) + class Chat(BaseModel, TimestampMixin): __tablename__ = "chats" @@ -125,73 +141,115 @@ class Chat(BaseModel, TimestampMixin): title = Column(String, nullable=False, index=True) initial_connectors = Column(ARRAY(String), nullable=True) messages = Column(JSON, nullable=False) - - search_space_id = Column(Integer, ForeignKey('searchspaces.id', ondelete='CASCADE'), nullable=False) - search_space = relationship('SearchSpace', back_populates='chats') + + search_space_id = Column( + Integer, ForeignKey("searchspaces.id", ondelete="CASCADE"), nullable=False + ) + search_space = relationship("SearchSpace", back_populates="chats") + class Document(BaseModel, TimestampMixin): __tablename__ = "documents" - + title = Column(String, nullable=False, index=True) document_type = Column(SQLAlchemyEnum(DocumentType), nullable=False) document_metadata = Column(JSON, nullable=True) - + content = Column(Text, nullable=False) content_hash = Column(String, nullable=False, index=True, unique=True) embedding = Column(Vector(config.embedding_model_instance.dimension)) - - search_space_id = Column(Integer, ForeignKey("searchspaces.id", ondelete='CASCADE'), nullable=False) + + search_space_id = Column( + Integer, ForeignKey("searchspaces.id", ondelete="CASCADE"), nullable=False + ) search_space = relationship("SearchSpace", back_populates="documents") - chunks = relationship("Chunk", back_populates="document", cascade="all, delete-orphan") + chunks = relationship( + "Chunk", back_populates="document", cascade="all, delete-orphan" + ) + class Chunk(BaseModel, TimestampMixin): __tablename__ = "chunks" - + content = Column(Text, nullable=False) embedding = Column(Vector(config.embedding_model_instance.dimension)) - - document_id = Column(Integer, ForeignKey("documents.id", ondelete='CASCADE'), nullable=False) + + document_id = Column( + Integer, ForeignKey("documents.id", ondelete="CASCADE"), nullable=False + ) document = relationship("Document", back_populates="chunks") + class Podcast(BaseModel, TimestampMixin): __tablename__ = "podcasts" - + title = Column(String, nullable=False, index=True) podcast_transcript = Column(JSON, nullable=False, default={}) file_location = Column(String(500), nullable=False, default="") - - search_space_id = Column(Integer, ForeignKey("searchspaces.id", ondelete='CASCADE'), nullable=False) + + search_space_id = Column( + Integer, ForeignKey("searchspaces.id", ondelete="CASCADE"), nullable=False + ) search_space = relationship("SearchSpace", back_populates="podcasts") - + + class SearchSpace(BaseModel, TimestampMixin): __tablename__ = "searchspaces" - + name = Column(String(100), nullable=False, index=True) description = Column(String(500), nullable=True) - - user_id = Column(UUID(as_uuid=True), ForeignKey("user.id", ondelete='CASCADE'), nullable=False) + + user_id = Column( + UUID(as_uuid=True), ForeignKey("user.id", ondelete="CASCADE"), nullable=False + ) user = relationship("User", back_populates="search_spaces") - - documents = relationship("Document", back_populates="search_space", order_by="Document.id", cascade="all, delete-orphan") - podcasts = relationship("Podcast", back_populates="search_space", 
order_by="Podcast.id", cascade="all, delete-orphan") - chats = relationship('Chat', back_populates='search_space', order_by='Chat.id', cascade="all, delete-orphan") - logs = relationship("Log", back_populates="search_space", order_by="Log.id", cascade="all, delete-orphan") - + + documents = relationship( + "Document", + back_populates="search_space", + order_by="Document.id", + cascade="all, delete-orphan", + ) + podcasts = relationship( + "Podcast", + back_populates="search_space", + order_by="Podcast.id", + cascade="all, delete-orphan", + ) + chats = relationship( + "Chat", + back_populates="search_space", + order_by="Chat.id", + cascade="all, delete-orphan", + ) + logs = relationship( + "Log", + back_populates="search_space", + order_by="Log.id", + cascade="all, delete-orphan", + ) + + class SearchSourceConnector(BaseModel, TimestampMixin): __tablename__ = "search_source_connectors" - + name = Column(String(100), nullable=False, index=True) - connector_type = Column(SQLAlchemyEnum(SearchSourceConnectorType), nullable=False, unique=True) + connector_type = Column( + SQLAlchemyEnum(SearchSourceConnectorType), nullable=False, unique=True + ) is_indexable = Column(Boolean, nullable=False, default=False) last_indexed_at = Column(TIMESTAMP(timezone=True), nullable=True) config = Column(JSON, nullable=False) - - user_id = Column(UUID(as_uuid=True), ForeignKey("user.id", ondelete='CASCADE'), nullable=False) + + user_id = Column( + UUID(as_uuid=True), ForeignKey("user.id", ondelete="CASCADE"), nullable=False + ) user = relationship("User", back_populates="search_source_connectors") + class LLMConfig(BaseModel, TimestampMixin): __tablename__ = "llm_configs" - + name = Column(String(100), nullable=False, index=True) # Provider from the enum provider = Column(SQLAlchemyEnum(LiteLLMProvider), nullable=False) @@ -202,78 +260,142 @@ class LLMConfig(BaseModel, TimestampMixin): # API Key should be encrypted before storing api_key = Column(String, nullable=False) api_base = Column(String(500), nullable=True) - + # For any other parameters that litellm supports litellm_params = Column(JSON, nullable=True, default={}) - - user_id = Column(UUID(as_uuid=True), ForeignKey("user.id", ondelete='CASCADE'), nullable=False) + + user_id = Column( + UUID(as_uuid=True), ForeignKey("user.id", ondelete="CASCADE"), nullable=False + ) user = relationship("User", back_populates="llm_configs", foreign_keys=[user_id]) + class Log(BaseModel, TimestampMixin): __tablename__ = "logs" - + level = Column(SQLAlchemyEnum(LogLevel), nullable=False, index=True) status = Column(SQLAlchemyEnum(LogStatus), nullable=False, index=True) message = Column(Text, nullable=False) - source = Column(String(200), nullable=True, index=True) # Service/component that generated the log + source = Column( + String(200), nullable=True, index=True + ) # Service/component that generated the log log_metadata = Column(JSON, nullable=True, default={}) # Additional context data - - search_space_id = Column(Integer, ForeignKey("searchspaces.id", ondelete='CASCADE'), nullable=False) + + search_space_id = Column( + Integer, ForeignKey("searchspaces.id", ondelete="CASCADE"), nullable=False + ) search_space = relationship("SearchSpace", back_populates="logs") + if config.AUTH_TYPE == "GOOGLE": + class OAuthAccount(SQLAlchemyBaseOAuthAccountTableUUID, Base): pass - class User(SQLAlchemyBaseUserTableUUID, Base): oauth_accounts: Mapped[list[OAuthAccount]] = relationship( "OAuthAccount", lazy="joined" ) search_spaces = relationship("SearchSpace", 
back_populates="user") - search_source_connectors = relationship("SearchSourceConnector", back_populates="user") - llm_configs = relationship("LLMConfig", back_populates="user", foreign_keys="LLMConfig.user_id", cascade="all, delete-orphan") + search_source_connectors = relationship( + "SearchSourceConnector", back_populates="user" + ) + llm_configs = relationship( + "LLMConfig", + back_populates="user", + foreign_keys="LLMConfig.user_id", + cascade="all, delete-orphan", + ) - long_context_llm_id = Column(Integer, ForeignKey("llm_configs.id", ondelete="SET NULL"), nullable=True) - fast_llm_id = Column(Integer, ForeignKey("llm_configs.id", ondelete="SET NULL"), nullable=True) - strategic_llm_id = Column(Integer, ForeignKey("llm_configs.id", ondelete="SET NULL"), nullable=True) + long_context_llm_id = Column( + Integer, ForeignKey("llm_configs.id", ondelete="SET NULL"), nullable=True + ) + fast_llm_id = Column( + Integer, ForeignKey("llm_configs.id", ondelete="SET NULL"), nullable=True + ) + strategic_llm_id = Column( + Integer, ForeignKey("llm_configs.id", ondelete="SET NULL"), nullable=True + ) + + long_context_llm = relationship( + "LLMConfig", foreign_keys=[long_context_llm_id], post_update=True + ) + fast_llm = relationship( + "LLMConfig", foreign_keys=[fast_llm_id], post_update=True + ) + strategic_llm = relationship( + "LLMConfig", foreign_keys=[strategic_llm_id], post_update=True + ) - long_context_llm = relationship("LLMConfig", foreign_keys=[long_context_llm_id], post_update=True) - fast_llm = relationship("LLMConfig", foreign_keys=[fast_llm_id], post_update=True) - strategic_llm = relationship("LLMConfig", foreign_keys=[strategic_llm_id], post_update=True) else: + class User(SQLAlchemyBaseUserTableUUID, Base): - search_spaces = relationship("SearchSpace", back_populates="user") - search_source_connectors = relationship("SearchSourceConnector", back_populates="user") - llm_configs = relationship("LLMConfig", back_populates="user", foreign_keys="LLMConfig.user_id", cascade="all, delete-orphan") + search_source_connectors = relationship( + "SearchSourceConnector", back_populates="user" + ) + llm_configs = relationship( + "LLMConfig", + back_populates="user", + foreign_keys="LLMConfig.user_id", + cascade="all, delete-orphan", + ) - long_context_llm_id = Column(Integer, ForeignKey("llm_configs.id", ondelete="SET NULL"), nullable=True) - fast_llm_id = Column(Integer, ForeignKey("llm_configs.id", ondelete="SET NULL"), nullable=True) - strategic_llm_id = Column(Integer, ForeignKey("llm_configs.id", ondelete="SET NULL"), nullable=True) + long_context_llm_id = Column( + Integer, ForeignKey("llm_configs.id", ondelete="SET NULL"), nullable=True + ) + fast_llm_id = Column( + Integer, ForeignKey("llm_configs.id", ondelete="SET NULL"), nullable=True + ) + strategic_llm_id = Column( + Integer, ForeignKey("llm_configs.id", ondelete="SET NULL"), nullable=True + ) - long_context_llm = relationship("LLMConfig", foreign_keys=[long_context_llm_id], post_update=True) - fast_llm = relationship("LLMConfig", foreign_keys=[fast_llm_id], post_update=True) - strategic_llm = relationship("LLMConfig", foreign_keys=[strategic_llm_id], post_update=True) + long_context_llm = relationship( + "LLMConfig", foreign_keys=[long_context_llm_id], post_update=True + ) + fast_llm = relationship( + "LLMConfig", foreign_keys=[fast_llm_id], post_update=True + ) + strategic_llm = relationship( + "LLMConfig", foreign_keys=[strategic_llm_id], post_update=True + ) engine = create_async_engine(DATABASE_URL) async_session_maker = 
async_sessionmaker(engine, expire_on_commit=False) - + async def setup_indexes(): async with engine.begin() as conn: - # Create indexes + # Create indexes # Document Summary Indexes - await conn.execute(text('CREATE INDEX IF NOT EXISTS document_vector_index ON documents USING hnsw (embedding public.vector_cosine_ops)')) - await conn.execute(text('CREATE INDEX IF NOT EXISTS document_search_index ON documents USING gin (to_tsvector(\'english\', content))')) + await conn.execute( + text( + "CREATE INDEX IF NOT EXISTS document_vector_index ON documents USING hnsw (embedding public.vector_cosine_ops)" + ) + ) + await conn.execute( + text( + "CREATE INDEX IF NOT EXISTS document_search_index ON documents USING gin (to_tsvector('english', content))" + ) + ) # Document Chuck Indexes - await conn.execute(text('CREATE INDEX IF NOT EXISTS chucks_vector_index ON chunks USING hnsw (embedding public.vector_cosine_ops)')) - await conn.execute(text('CREATE INDEX IF NOT EXISTS chucks_search_index ON chunks USING gin (to_tsvector(\'english\', content))')) + await conn.execute( + text( + "CREATE INDEX IF NOT EXISTS chucks_vector_index ON chunks USING hnsw (embedding public.vector_cosine_ops)" + ) + ) + await conn.execute( + text( + "CREATE INDEX IF NOT EXISTS chucks_search_index ON chunks USING gin (to_tsvector('english', content))" + ) + ) + async def create_db_and_tables(): async with engine.begin() as conn: - await conn.execute(text('CREATE EXTENSION IF NOT EXISTS vector')) + await conn.execute(text("CREATE EXTENSION IF NOT EXISTS vector")) await conn.run_sync(Base.metadata.create_all) await setup_indexes() @@ -284,14 +406,23 @@ async def get_async_session() -> AsyncGenerator[AsyncSession, None]: if config.AUTH_TYPE == "GOOGLE": + async def get_user_db(session: AsyncSession = Depends(get_async_session)): yield SQLAlchemyUserDatabase(session, User, OAuthAccount) + else: + async def get_user_db(session: AsyncSession = Depends(get_async_session)): yield SQLAlchemyUserDatabase(session, User) - -async def get_chucks_hybrid_search_retriever(session: AsyncSession = Depends(get_async_session)): + + +async def get_chucks_hybrid_search_retriever( + session: AsyncSession = Depends(get_async_session), +): return ChucksHybridSearchRetriever(session) -async def get_documents_hybrid_search_retriever(session: AsyncSession = Depends(get_async_session)): + +async def get_documents_hybrid_search_retriever( + session: AsyncSession = Depends(get_async_session), +): return DocumentHybridSearchRetriever(session) From b2a19af1f75e68b66aeba405657369b589ab05a7 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Thu, 24 Jul 2025 11:37:36 +0200 Subject: [PATCH 04/17] generate new migration / fix migration files --- ..._add_llm_config_table_and_relationships.py | 114 +++++++++++++----- .../alembic/versions/12_add_logs_table.py | 78 +++++++----- .../versions/13_add_jira_connector_enums.py | 57 +++++++++ .../versions/1_add_github_connector_enum.py | 17 ++- .../versions/2_add_linear_connector_enum.py | 23 ++-- ...3_add_linear_connector_to_documenttype_.py | 16 ++- .../alembic/versions/4_add_linkup_api_enum.py | 26 ++-- .../6_change_podcast_content_to_transcript.py | 30 +++-- .../versions/7_remove_is_generated_column.py | 13 +- .../8_add_content_hash_to_documents.py | 66 +++++----- ...discord_connector_enum_and_documenttype.py | 34 +++++- ...1_add_github_connector_to_documenttype_.py | 49 ++++---- 12 files changed, 366 insertions(+), 157 deletions(-) create mode 100644 surfsense_backend/alembic/versions/13_add_jira_connector_enums.py diff --git 
a/surfsense_backend/alembic/versions/11_add_llm_config_table_and_relationships.py b/surfsense_backend/alembic/versions/11_add_llm_config_table_and_relationships.py index 83fdef1f1..d2f04208d 100644 --- a/surfsense_backend/alembic/versions/11_add_llm_config_table_and_relationships.py +++ b/surfsense_backend/alembic/versions/11_add_llm_config_table_and_relationships.py @@ -20,47 +20,97 @@ depends_on: Union[str, Sequence[str], None] = None def upgrade() -> None: """Upgrade schema - add LiteLLMProvider enum, LLMConfig table and user LLM preferences.""" - - # Check if enum type exists and create if it doesn't + + # Create enum only if not exists op.execute(""" DO $$ BEGIN IF NOT EXISTS (SELECT 1 FROM pg_type WHERE typname = 'litellmprovider') THEN - CREATE TYPE litellmprovider AS ENUM ('OPENAI', 'ANTHROPIC', 'GROQ', 'COHERE', 'HUGGINGFACE', 'AZURE_OPENAI', 'GOOGLE', 'AWS_BEDROCK', 'OLLAMA', 'MISTRAL', 'TOGETHER_AI', 'REPLICATE', 'PALM', 'VERTEX_AI', 'ANYSCALE', 'PERPLEXITY', 'DEEPINFRA', 'AI21', 'NLPCLOUD', 'ALEPH_ALPHA', 'PETALS', 'CUSTOM'); + CREATE TYPE litellmprovider AS ENUM ( + 'OPENAI', 'ANTHROPIC', 'GROQ', 'COHERE', 'HUGGINGFACE', + 'AZURE_OPENAI', 'GOOGLE', 'AWS_BEDROCK', 'OLLAMA', 'MISTRAL', + 'TOGETHER_AI', 'REPLICATE', 'PALM', 'VERTEX_AI', 'ANYSCALE', + 'PERPLEXITY', 'DEEPINFRA', 'AI21', 'NLPCLOUD', 'ALEPH_ALPHA', + 'PETALS', 'CUSTOM' + ); END IF; END$$; """) - - # Create llm_configs table using raw SQL to avoid enum creation conflicts + + # Create llm_configs table only if it doesn't already exist op.execute(""" - CREATE TABLE llm_configs ( - id SERIAL PRIMARY KEY, - created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(), - name VARCHAR(100) NOT NULL, - provider litellmprovider NOT NULL, - custom_provider VARCHAR(100), - model_name VARCHAR(100) NOT NULL, - api_key TEXT NOT NULL, - api_base VARCHAR(500), - litellm_params JSONB, - user_id UUID NOT NULL REFERENCES "user"(id) ON DELETE CASCADE - ) + DO $$ + BEGIN + IF NOT EXISTS ( + SELECT FROM information_schema.tables + WHERE table_name = 'llm_configs' + ) THEN + CREATE TABLE llm_configs ( + id SERIAL PRIMARY KEY, + created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(), + name VARCHAR(100) NOT NULL, + provider litellmprovider NOT NULL, + custom_provider VARCHAR(100), + model_name VARCHAR(100) NOT NULL, + api_key TEXT NOT NULL, + api_base VARCHAR(500), + litellm_params JSONB, + user_id UUID NOT NULL REFERENCES "user"(id) ON DELETE CASCADE + ); + END IF; + END$$; """) - - # Create indexes - op.create_index(op.f('ix_llm_configs_id'), 'llm_configs', ['id'], unique=False) - op.create_index(op.f('ix_llm_configs_created_at'), 'llm_configs', ['created_at'], unique=False) - op.create_index(op.f('ix_llm_configs_name'), 'llm_configs', ['name'], unique=False) - - # Add LLM preference columns to user table - op.add_column('user', sa.Column('long_context_llm_id', sa.Integer(), nullable=True)) - op.add_column('user', sa.Column('fast_llm_id', sa.Integer(), nullable=True)) - op.add_column('user', sa.Column('strategic_llm_id', sa.Integer(), nullable=True)) - - # Create foreign key constraints for LLM preferences - op.create_foreign_key(op.f('fk_user_long_context_llm_id_llm_configs'), 'user', 'llm_configs', ['long_context_llm_id'], ['id'], ondelete='SET NULL') - op.create_foreign_key(op.f('fk_user_fast_llm_id_llm_configs'), 'user', 'llm_configs', ['fast_llm_id'], ['id'], ondelete='SET NULL') - op.create_foreign_key(op.f('fk_user_strategic_llm_id_llm_configs'), 'user', 'llm_configs', ['strategic_llm_id'], ['id'], ondelete='SET NULL') + + # 
Create indexes if they don't exist + op.execute(""" + DO $$ + BEGIN + IF NOT EXISTS ( + SELECT 1 FROM pg_indexes + WHERE tablename = 'llm_configs' AND indexname = 'ix_llm_configs_id' + ) THEN + CREATE INDEX ix_llm_configs_id ON llm_configs(id); + END IF; + + IF NOT EXISTS ( + SELECT 1 FROM pg_indexes + WHERE tablename = 'llm_configs' AND indexname = 'ix_llm_configs_created_at' + ) THEN + CREATE INDEX ix_llm_configs_created_at ON llm_configs(created_at); + END IF; + + IF NOT EXISTS ( + SELECT 1 FROM pg_indexes + WHERE tablename = 'llm_configs' AND indexname = 'ix_llm_configs_name' + ) THEN + CREATE INDEX ix_llm_configs_name ON llm_configs(name); + END IF; + END$$; + """) + + # Safely add columns to user table + bind = op.get_bind() + inspector = sa.inspect(bind) + existing_columns = [col["name"] for col in inspector.get_columns("user")] + + with op.batch_alter_table('user') as batch_op: + if 'long_context_llm_id' not in existing_columns: + batch_op.add_column(sa.Column('long_context_llm_id', sa.Integer(), nullable=True)) + batch_op.create_foreign_key(op.f('fk_user_long_context_llm_id_llm_configs'), + 'llm_configs', ['long_context_llm_id'], ['id'], + ondelete='SET NULL') + + if 'fast_llm_id' not in existing_columns: + batch_op.add_column(sa.Column('fast_llm_id', sa.Integer(), nullable=True)) + batch_op.create_foreign_key(op.f('fk_user_fast_llm_id_llm_configs'), + 'llm_configs', ['fast_llm_id'], ['id'], + ondelete='SET NULL') + + if 'strategic_llm_id' not in existing_columns: + batch_op.add_column(sa.Column('strategic_llm_id', sa.Integer(), nullable=True)) + batch_op.create_foreign_key(op.f('fk_user_strategic_llm_id_llm_configs'), + 'llm_configs', ['strategic_llm_id'], ['id'], + ondelete='SET NULL') def downgrade() -> None: diff --git a/surfsense_backend/alembic/versions/12_add_logs_table.py b/surfsense_backend/alembic/versions/12_add_logs_table.py index 0b2cc13c8..c74d0e8f6 100644 --- a/surfsense_backend/alembic/versions/12_add_logs_table.py +++ b/surfsense_backend/alembic/versions/12_add_logs_table.py @@ -8,8 +8,8 @@ from typing import Sequence, Union from alembic import op import sqlalchemy as sa -from sqlalchemy.dialects.postgresql import JSON - +from sqlalchemy.dialects.postgresql import JSONB +from sqlalchemy import inspect # revision identifiers, used by Alembic. 
revision: str = "12" @@ -20,52 +20,72 @@ depends_on: Union[str, Sequence[str], None] = None def upgrade() -> None: """Upgrade schema - add LogLevel and LogStatus enums and logs table.""" - - # Create LogLevel enum + + # Create LogLevel enum if it doesn't exist op.execute(""" - CREATE TYPE loglevel AS ENUM ('DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL') + DO $$ + BEGIN + IF NOT EXISTS (SELECT 1 FROM pg_type WHERE typname = 'loglevel') THEN + CREATE TYPE loglevel AS ENUM ('DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'); + END IF; + END$$; """) - - # Create LogStatus enum + + # Create LogStatus enum if it doesn't exist op.execute(""" - CREATE TYPE logstatus AS ENUM ('IN_PROGRESS', 'SUCCESS', 'FAILED') + DO $$ + BEGIN + IF NOT EXISTS (SELECT 1 FROM pg_type WHERE typname = 'logstatus') THEN + CREATE TYPE logstatus AS ENUM ('IN_PROGRESS', 'SUCCESS', 'FAILED'); + END IF; + END$$; """) - - # Create logs table + + # Create logs table if it doesn't exist op.execute(""" - CREATE TABLE logs ( + CREATE TABLE IF NOT EXISTS logs ( id SERIAL PRIMARY KEY, - created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(), + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), level loglevel NOT NULL, status logstatus NOT NULL, message TEXT NOT NULL, source VARCHAR(200), log_metadata JSONB DEFAULT '{}', search_space_id INTEGER NOT NULL REFERENCES searchspaces(id) ON DELETE CASCADE - ) + ); """) - - # Create indexes - op.create_index(op.f('ix_logs_id'), 'logs', ['id'], unique=False) - op.create_index(op.f('ix_logs_created_at'), 'logs', ['created_at'], unique=False) - op.create_index(op.f('ix_logs_level'), 'logs', ['level'], unique=False) - op.create_index(op.f('ix_logs_status'), 'logs', ['status'], unique=False) - op.create_index(op.f('ix_logs_source'), 'logs', ['source'], unique=False) + + # Get existing indexes + conn = op.get_bind() + inspector = inspect(conn) + existing_indexes = [idx['name'] for idx in inspector.get_indexes('logs')] + + # Create indexes only if they don't already exist + if 'ix_logs_id' not in existing_indexes: + op.create_index('ix_logs_id', 'logs', ['id']) + if 'ix_logs_created_at' not in existing_indexes: + op.create_index('ix_logs_created_at', 'logs', ['created_at']) + if 'ix_logs_level' not in existing_indexes: + op.create_index('ix_logs_level', 'logs', ['level']) + if 'ix_logs_status' not in existing_indexes: + op.create_index('ix_logs_status', 'logs', ['status']) + if 'ix_logs_source' not in existing_indexes: + op.create_index('ix_logs_source', 'logs', ['source']) def downgrade() -> None: """Downgrade schema - remove logs table and enums.""" - + # Drop indexes - op.drop_index(op.f('ix_logs_source'), table_name='logs') - op.drop_index(op.f('ix_logs_status'), table_name='logs') - op.drop_index(op.f('ix_logs_level'), table_name='logs') - op.drop_index(op.f('ix_logs_created_at'), table_name='logs') - op.drop_index(op.f('ix_logs_id'), table_name='logs') - + op.drop_index('ix_logs_source', table_name='logs') + op.drop_index('ix_logs_status', table_name='logs') + op.drop_index('ix_logs_level', table_name='logs') + op.drop_index('ix_logs_created_at', table_name='logs') + op.drop_index('ix_logs_id', table_name='logs') + # Drop logs table op.drop_table('logs') - + # Drop enums op.execute("DROP TYPE IF EXISTS logstatus") - op.execute("DROP TYPE IF EXISTS loglevel") \ No newline at end of file + op.execute("DROP TYPE IF EXISTS loglevel") diff --git a/surfsense_backend/alembic/versions/13_add_jira_connector_enums.py b/surfsense_backend/alembic/versions/13_add_jira_connector_enums.py new file mode 
100644 index 000000000..2623cb957 --- /dev/null +++ b/surfsense_backend/alembic/versions/13_add_jira_connector_enums.py @@ -0,0 +1,57 @@ +"""Add JIRA_CONNECTOR to enums + +Revision ID: 13 +Revises: 12 +""" + +from typing import Sequence, Union +from alembic import op +import sqlalchemy as sa + +# revision identifiers, used by Alembic. +revision: str = '13' +down_revision: Union[str, None] = '12' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Safely add 'JIRA_CONNECTOR' to enum types if missing.""" + + # Add to searchsourceconnectortype enum + op.execute(""" + DO $$ + BEGIN + IF NOT EXISTS ( + SELECT 1 FROM pg_type t + JOIN pg_enum e ON t.oid = e.enumtypid + WHERE t.typname = 'searchsourceconnectortype' AND e.enumlabel = 'JIRA_CONNECTOR' + ) THEN + ALTER TYPE searchsourceconnectortype ADD VALUE 'JIRA_CONNECTOR'; + END IF; + END + $$; + """) + + # Add to documenttype enum + op.execute(""" + DO $$ + BEGIN + IF NOT EXISTS ( + SELECT 1 FROM pg_type t + JOIN pg_enum e ON t.oid = e.enumtypid + WHERE t.typname = 'documenttype' AND e.enumlabel = 'JIRA_CONNECTOR' + ) THEN + ALTER TYPE documenttype ADD VALUE 'JIRA_CONNECTOR'; + END IF; + END + $$; + """) + + +def downgrade() -> None: + """ + Downgrade logic not implemented since PostgreSQL + does not support removing enum values. + """ + pass diff --git a/surfsense_backend/alembic/versions/1_add_github_connector_enum.py b/surfsense_backend/alembic/versions/1_add_github_connector_enum.py index 1902777b6..713621fca 100644 --- a/surfsense_backend/alembic/versions/1_add_github_connector_enum.py +++ b/surfsense_backend/alembic/versions/1_add_github_connector_enum.py @@ -24,7 +24,22 @@ def upgrade() -> None: # Manually add the command to add the enum value # Note: It's generally better to let autogenerate handle this, but we're bypassing it - op.execute("ALTER TYPE searchsourceconnectortype ADD VALUE 'GITHUB_CONNECTOR'") + op.execute(""" +DO $$ +BEGIN + IF NOT EXISTS ( + SELECT 1 + FROM pg_enum + WHERE enumlabel = 'GITHUB_CONNECTOR' + AND enumtypid = ( + SELECT oid FROM pg_type WHERE typname = 'searchsourceconnectortype' + ) + ) THEN + ALTER TYPE searchsourceconnectortype ADD VALUE 'GITHUB_CONNECTOR'; + END IF; +END$$; +""") + # Pass for the rest, as autogenerate didn't run to add other schema details pass diff --git a/surfsense_backend/alembic/versions/2_add_linear_connector_enum.py b/surfsense_backend/alembic/versions/2_add_linear_connector_enum.py index 526c7c3ad..aa0ac3cfc 100644 --- a/surfsense_backend/alembic/versions/2_add_linear_connector_enum.py +++ b/surfsense_backend/alembic/versions/2_add_linear_connector_enum.py @@ -18,14 +18,21 @@ depends_on: Union[str, Sequence[str], None] = None def upgrade() -> None: - # ### commands auto generated by Alembic - please adjust! 
### - - # Manually add the command to add the enum value - op.execute("ALTER TYPE searchsourceconnectortype ADD VALUE 'LINEAR_CONNECTOR'") - - # Pass for the rest, as autogenerate didn't run to add other schema details - pass - # ### end Alembic commands ### + op.execute(""" + DO $$ + BEGIN + IF NOT EXISTS ( + SELECT 1 FROM pg_enum + WHERE enumlabel = 'LINEAR_CONNECTOR' + AND enumtypid = ( + SELECT oid FROM pg_type WHERE typname = 'searchsourceconnectortype' + ) + ) THEN + ALTER TYPE searchsourceconnectortype ADD VALUE 'LINEAR_CONNECTOR'; + END IF; + END$$; + """) +# def downgrade() -> None: diff --git a/surfsense_backend/alembic/versions/3_add_linear_connector_to_documenttype_.py b/surfsense_backend/alembic/versions/3_add_linear_connector_to_documenttype_.py index e71ee2ed4..2b326f2ee 100644 --- a/surfsense_backend/alembic/versions/3_add_linear_connector_to_documenttype_.py +++ b/surfsense_backend/alembic/versions/3_add_linear_connector_to_documenttype_.py @@ -22,7 +22,21 @@ NEW_VALUE = 'LINEAR_CONNECTOR' def upgrade() -> None: """Upgrade schema.""" - op.execute(f"ALTER TYPE {ENUM_NAME} ADD VALUE '{NEW_VALUE}'") + op.execute(f""" + DO $$ + BEGIN + IF NOT EXISTS ( + SELECT 1 FROM pg_enum + WHERE enumlabel = '{NEW_VALUE}' + AND enumtypid = ( + SELECT oid FROM pg_type WHERE typname = '{ENUM_NAME}' + ) + ) THEN + ALTER TYPE {ENUM_NAME} ADD VALUE '{NEW_VALUE}'; + END IF; + END$$; + """) + # Warning: This will delete all rows with the new value diff --git a/surfsense_backend/alembic/versions/4_add_linkup_api_enum.py b/surfsense_backend/alembic/versions/4_add_linkup_api_enum.py index 093bdf067..45d4ecf61 100644 --- a/surfsense_backend/alembic/versions/4_add_linkup_api_enum.py +++ b/surfsense_backend/alembic/versions/4_add_linkup_api_enum.py @@ -18,14 +18,24 @@ depends_on: Union[str, Sequence[str], None] = None def upgrade() -> None: - # ### commands auto generated by Alembic - please adjust! ### - - # Manually add the command to add the enum value - op.execute("ALTER TYPE searchsourceconnectortype ADD VALUE 'LINKUP_API'") - - # Pass for the rest, as autogenerate didn't run to add other schema details - pass - # ### end Alembic commands ### + ENUM_NAME = 'searchsourceconnectortype' + NEW_VALUE = 'LINKUP_API' + + op.execute(f""" + DO $$ + BEGIN + IF NOT EXISTS ( + SELECT 1 FROM pg_enum + WHERE enumlabel = '{NEW_VALUE}' + AND enumtypid = ( + SELECT oid FROM pg_type WHERE typname = '{ENUM_NAME}' + ) + ) THEN + ALTER TYPE {ENUM_NAME} ADD VALUE '{NEW_VALUE}'; + END IF; + END$$; + """) + def downgrade() -> None: diff --git a/surfsense_backend/alembic/versions/6_change_podcast_content_to_transcript.py b/surfsense_backend/alembic/versions/6_change_podcast_content_to_transcript.py index fa7a0f8f6..edc65b7ce 100644 --- a/surfsense_backend/alembic/versions/6_change_podcast_content_to_transcript.py +++ b/surfsense_backend/alembic/versions/6_change_podcast_content_to_transcript.py @@ -9,6 +9,7 @@ from typing import Sequence, Union from alembic import op import sqlalchemy as sa from sqlalchemy.dialects.postgresql import JSON +from sqlalchemy import inspect # revision identifiers, used by Alembic. 
@@ -17,18 +18,25 @@ down_revision: Union[str, None] = '5' branch_labels: Union[str, Sequence[str], None] = None depends_on: Union[str, Sequence[str], None] = None - def upgrade() -> None: - # Drop the old column and create a new one with the new name and type - # We need to do this because PostgreSQL doesn't support direct column renames with type changes - op.add_column('podcasts', sa.Column('podcast_transcript', JSON, nullable=False, server_default='{}')) - - # Copy data from old column to new column - # Convert text to JSON by storing it as a JSON string value - op.execute("UPDATE podcasts SET podcast_transcript = jsonb_build_object('text', podcast_content) WHERE podcast_content != ''") - - # Drop the old column - op.drop_column('podcasts', 'podcast_content') + bind = op.get_bind() + inspector = inspect(bind) + + columns = [col["name"] for col in inspector.get_columns("podcasts")] + if "podcast_transcript" not in columns: + op.add_column('podcasts', sa.Column('podcast_transcript', JSON, nullable=False, server_default='{}')) + + # Copy data from old column to new column + op.execute(""" + UPDATE podcasts + SET podcast_transcript = jsonb_build_object('text', podcast_content) + WHERE podcast_content != '' + """) + + # Drop the old column only if it exists + if "podcast_content" in columns: + op.drop_column('podcasts', 'podcast_content') + def downgrade() -> None: diff --git a/surfsense_backend/alembic/versions/7_remove_is_generated_column.py b/surfsense_backend/alembic/versions/7_remove_is_generated_column.py index 03048a146..9fa4546e1 100644 --- a/surfsense_backend/alembic/versions/7_remove_is_generated_column.py +++ b/surfsense_backend/alembic/versions/7_remove_is_generated_column.py @@ -8,6 +8,7 @@ from typing import Sequence, Union from alembic import op import sqlalchemy as sa +from sqlalchemy import inspect # revision identifiers, used by Alembic. @@ -16,10 +17,16 @@ down_revision: Union[str, None] = '6' branch_labels: Union[str, Sequence[str], None] = None depends_on: Union[str, Sequence[str], None] = None - def upgrade() -> None: - # Drop the is_generated column - op.drop_column('podcasts', 'is_generated') + # Get the current database connection + bind = op.get_bind() + inspector = inspect(bind) + + # Check if the column exists before attempting to drop it + columns = [col["name"] for col in inspector.get_columns("podcasts")] + if "is_generated" in columns: + op.drop_column('podcasts', 'is_generated') + def downgrade() -> None: diff --git a/surfsense_backend/alembic/versions/8_add_content_hash_to_documents.py b/surfsense_backend/alembic/versions/8_add_content_hash_to_documents.py index 64982fc56..ed7ebc594 100644 --- a/surfsense_backend/alembic/versions/8_add_content_hash_to_documents.py +++ b/surfsense_backend/alembic/versions/8_add_content_hash_to_documents.py @@ -4,9 +4,9 @@ Revision ID: 8 Revises: 7 """ from typing import Sequence, Union - from alembic import op import sqlalchemy as sa +from sqlalchemy import inspect # revision identifiers, used by Alembic. 
@@ -17,40 +17,40 @@ depends_on: Union[str, Sequence[str], None] = None def upgrade() -> None: - # Add content_hash column as nullable first to handle existing data - op.add_column('documents', sa.Column('content_hash', sa.String(), nullable=True)) - - # Update existing documents to generate content hashes - # Using SHA-256 hash of the content column with proper UTF-8 encoding - op.execute(""" - UPDATE documents - SET content_hash = encode(sha256(convert_to(content, 'UTF8')), 'hex') - WHERE content_hash IS NULL - """) - - # Handle duplicate content hashes by keeping only the oldest document for each hash - # Delete newer documents with duplicate content hashes - op.execute(""" - DELETE FROM documents - WHERE id NOT IN ( - SELECT MIN(id) - FROM documents - GROUP BY content_hash - ) - """) - - # Now alter the column to match the model: nullable=False, index=True, unique=True - op.alter_column('documents', 'content_hash', - existing_type=sa.String(), - nullable=False) - op.create_index(op.f('ix_documents_content_hash'), 'documents', ['content_hash'], unique=False) - op.create_unique_constraint(op.f('uq_documents_content_hash'), 'documents', ['content_hash']) + bind = op.get_bind() + inspector = inspect(bind) + columns = [col['name'] for col in inspector.get_columns('documents')] + + # Only add the column if it doesn't already exist + if 'content_hash' not in columns: + op.add_column('documents', sa.Column('content_hash', sa.String(), nullable=True)) + + # Populate the content_hash column + op.execute(""" + UPDATE documents + SET content_hash = encode(sha256(convert_to(content, 'UTF8')), 'hex') + WHERE content_hash IS NULL + """) + + op.execute(""" + DELETE FROM documents + WHERE id NOT IN ( + SELECT MIN(id) + FROM documents + GROUP BY content_hash + ) + """) + + op.alter_column('documents', 'content_hash', + existing_type=sa.String(), + nullable=False) + op.create_index(op.f('ix_documents_content_hash'), 'documents', ['content_hash'], unique=False) + op.create_unique_constraint(op.f('uq_documents_content_hash'), 'documents', ['content_hash']) + else: + print("Column 'content_hash' already exists. 
Skipping column creation.") def downgrade() -> None: - # Remove constraints and index first op.drop_constraint(op.f('uq_documents_content_hash'), 'documents', type_='unique') op.drop_index(op.f('ix_documents_content_hash'), table_name='documents') - - # Remove content_hash column from documents table - op.drop_column('documents', 'content_hash') \ No newline at end of file + op.drop_column('documents', 'content_hash') diff --git a/surfsense_backend/alembic/versions/9_add_discord_connector_enum_and_documenttype.py b/surfsense_backend/alembic/versions/9_add_discord_connector_enum_and_documenttype.py index fbf748ae6..ad77dad9d 100644 --- a/surfsense_backend/alembic/versions/9_add_discord_connector_enum_and_documenttype.py +++ b/surfsense_backend/alembic/versions/9_add_discord_connector_enum_and_documenttype.py @@ -24,11 +24,35 @@ DOCUMENT_NEW_VALUE = "DISCORD_CONNECTOR" def upgrade() -> None: - """Upgrade schema - add DISCORD_CONNECTOR to connector and document enum.""" - # Add DISCORD_CONNECTOR to searchsourceconnectortype - op.execute(f"ALTER TYPE {CONNECTOR_ENUM} ADD VALUE '{CONNECTOR_NEW_VALUE}'") - # Add DISCORD_CONNECTOR to documenttype - op.execute(f"ALTER TYPE {DOCUMENT_ENUM} ADD VALUE '{DOCUMENT_NEW_VALUE}'") + """Upgrade schema - add DISCORD_CONNECTOR to connector and document enum safely.""" + # Add DISCORD_CONNECTOR to searchsourceconnectortype only if not exists + op.execute(f""" + DO $$ + BEGIN + IF NOT EXISTS ( + SELECT 1 FROM pg_enum + WHERE enumlabel = '{CONNECTOR_NEW_VALUE}' + AND enumtypid = (SELECT oid FROM pg_type WHERE typname = '{CONNECTOR_ENUM}') + ) THEN + ALTER TYPE {CONNECTOR_ENUM} ADD VALUE '{CONNECTOR_NEW_VALUE}'; + END IF; + END$$; + """) + + # Add DISCORD_CONNECTOR to documenttype only if not exists + op.execute(f""" + DO $$ + BEGIN + IF NOT EXISTS ( + SELECT 1 FROM pg_enum + WHERE enumlabel = '{DOCUMENT_NEW_VALUE}' + AND enumtypid = (SELECT oid FROM pg_type WHERE typname = '{DOCUMENT_ENUM}') + ) THEN + ALTER TYPE {DOCUMENT_ENUM} ADD VALUE '{DOCUMENT_NEW_VALUE}'; + END IF; + END$$; + """) + def downgrade() -> None: diff --git a/surfsense_backend/alembic/versions/e55302644c51_add_github_connector_to_documenttype_.py b/surfsense_backend/alembic/versions/e55302644c51_add_github_connector_to_documenttype_.py index 12d653794..ae0a60a27 100644 --- a/surfsense_backend/alembic/versions/e55302644c51_add_github_connector_to_documenttype_.py +++ b/surfsense_backend/alembic/versions/e55302644c51_add_github_connector_to_documenttype_.py @@ -1,15 +1,8 @@ -"""Add GITHUB_CONNECTOR to DocumentType enum - -Revision ID: e55302644c51 -Revises: 1 - -""" from typing import Sequence, Union from alembic import op import sqlalchemy as sa - # revision identifiers, used by Alembic. 
revision: str = 'e55302644c51' down_revision: Union[str, None] = '1' @@ -17,22 +10,30 @@ branch_labels: Union[str, Sequence[str], None] = None depends_on: Union[str, Sequence[str], None] = None # Define the ENUM type name and the new value -ENUM_NAME = 'documenttype' # Make sure this matches the name in your DB (usually lowercase class name) +ENUM_NAME = 'documenttype' NEW_VALUE = 'GITHUB_CONNECTOR' def upgrade() -> None: """Upgrade schema.""" - op.execute(f"ALTER TYPE {ENUM_NAME} ADD VALUE '{NEW_VALUE}'") - + op.execute(f""" + DO $$ + BEGIN + IF NOT EXISTS ( + SELECT 1 FROM pg_enum + WHERE enumlabel = '{NEW_VALUE}' + AND enumtypid = ( + SELECT oid FROM pg_type WHERE typname = '{ENUM_NAME}' + ) + ) THEN + ALTER TYPE {ENUM_NAME} ADD VALUE '{NEW_VALUE}'; + END IF; + END$$; + """) -# Warning: This will delete all rows with the new value def downgrade() -> None: """Downgrade schema - remove GITHUB_CONNECTOR from enum.""" - - # The old type name old_enum_name = f"{ENUM_NAME}_old" - # Enum values *before* GITHUB_CONNECTOR was added old_values = ( 'EXTENSION', 'CRAWLED_URL', @@ -43,27 +44,23 @@ def downgrade() -> None: ) old_values_sql = ", ".join([f"'{v}'" for v in old_values]) - # Table and column names (adjust if different) table_name = 'documents' column_name = 'document_type' - # 1. Rename the current enum type - op.execute(f"ALTER TYPE {ENUM_NAME} RENAME TO {old_enum_name}") + # 1. Create the new enum type with the old values + op.execute(f"CREATE TYPE {old_enum_name} AS ENUM({old_values_sql})") - # 2. Create the new enum type with the old values - op.execute(f"CREATE TYPE {ENUM_NAME} AS ENUM({old_values_sql})") - - # 3. Update the table: + # 2. Delete rows using the new value op.execute( f"DELETE FROM {table_name} WHERE {column_name}::text = '{NEW_VALUE}'" ) - # 4. Alter the column to use the new enum type (casting old values) + # 3. Alter the column to use the old enum type op.execute( f"ALTER TABLE {table_name} ALTER COLUMN {column_name} " - f"TYPE {ENUM_NAME} USING {column_name}::text::{ENUM_NAME}" + f"TYPE {old_enum_name} USING {column_name}::text::{old_enum_name}" ) - # 5. Drop the old enum type - op.execute(f"DROP TYPE {old_enum_name}") - # ### end Alembic commands ### + # 4. 
Drop the current enum type and rename the old one + op.execute(f"DROP TYPE {ENUM_NAME}") + op.execute(f"ALTER TYPE {old_enum_name} RENAME TO {ENUM_NAME}") From a6fe7e583b9388652b59e711950e1119d4ed129b Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Thu, 24 Jul 2025 11:38:49 +0200 Subject: [PATCH 05/17] generate new migration / fix migration files --- ..._add_llm_config_table_and_relationships.py | 114 +++++++++++------- .../alembic/versions/12_add_logs_table.py | 54 +++++---- .../versions/13_add_jira_connector_enums.py | 18 +-- .../versions/1_add_github_connector_enum.py | 41 ++++--- .../versions/2_add_linear_connector_enum.py | 37 +++--- ...3_add_linear_connector_to_documenttype_.py | 45 ++++--- .../alembic/versions/4_add_linkup_api_enum.py | 40 +++--- .../6_change_podcast_content_to_transcript.py | 44 ++++--- .../versions/7_remove_is_generated_column.py | 17 +-- .../8_add_content_hash_to_documents.py | 60 +++++---- ...discord_connector_enum_and_documenttype.py | 16 +-- ...1_add_github_connector_to_documenttype_.py | 37 +++--- 12 files changed, 303 insertions(+), 220 deletions(-) diff --git a/surfsense_backend/alembic/versions/11_add_llm_config_table_and_relationships.py b/surfsense_backend/alembic/versions/11_add_llm_config_table_and_relationships.py index d2f04208d..007cd704e 100644 --- a/surfsense_backend/alembic/versions/11_add_llm_config_table_and_relationships.py +++ b/surfsense_backend/alembic/versions/11_add_llm_config_table_and_relationships.py @@ -6,10 +6,8 @@ Revises: 10 from typing import Sequence, Union -from alembic import op import sqlalchemy as sa -from sqlalchemy.dialects.postgresql import UUID, JSON - +from alembic import op # revision identifiers, used by Alembic. revision: str = "11" @@ -22,7 +20,8 @@ def upgrade() -> None: """Upgrade schema - add LiteLLMProvider enum, LLMConfig table and user LLM preferences.""" # Create enum only if not exists - op.execute(""" + op.execute( + """ DO $$ BEGIN IF NOT EXISTS (SELECT 1 FROM pg_type WHERE typname = 'litellmprovider') THEN @@ -35,14 +34,16 @@ def upgrade() -> None: ); END IF; END$$; - """) + """ + ) # Create llm_configs table only if it doesn't already exist - op.execute(""" + op.execute( + """ DO $$ BEGIN IF NOT EXISTS ( - SELECT FROM information_schema.tables + SELECT FROM information_schema.tables WHERE table_name = 'llm_configs' ) THEN CREATE TABLE llm_configs ( @@ -59,78 +60,103 @@ def upgrade() -> None: ); END IF; END$$; - """) + """ + ) # Create indexes if they don't exist - op.execute(""" + op.execute( + """ DO $$ BEGIN IF NOT EXISTS ( - SELECT 1 FROM pg_indexes + SELECT 1 FROM pg_indexes WHERE tablename = 'llm_configs' AND indexname = 'ix_llm_configs_id' ) THEN CREATE INDEX ix_llm_configs_id ON llm_configs(id); END IF; IF NOT EXISTS ( - SELECT 1 FROM pg_indexes + SELECT 1 FROM pg_indexes WHERE tablename = 'llm_configs' AND indexname = 'ix_llm_configs_created_at' ) THEN CREATE INDEX ix_llm_configs_created_at ON llm_configs(created_at); END IF; IF NOT EXISTS ( - SELECT 1 FROM pg_indexes + SELECT 1 FROM pg_indexes WHERE tablename = 'llm_configs' AND indexname = 'ix_llm_configs_name' ) THEN CREATE INDEX ix_llm_configs_name ON llm_configs(name); END IF; END$$; - """) + """ + ) # Safely add columns to user table bind = op.get_bind() inspector = sa.inspect(bind) existing_columns = [col["name"] for col in inspector.get_columns("user")] - with op.batch_alter_table('user') as batch_op: - if 'long_context_llm_id' not in existing_columns: - batch_op.add_column(sa.Column('long_context_llm_id', sa.Integer(), nullable=True)) - 
batch_op.create_foreign_key(op.f('fk_user_long_context_llm_id_llm_configs'), - 'llm_configs', ['long_context_llm_id'], ['id'], - ondelete='SET NULL') + with op.batch_alter_table("user") as batch_op: + if "long_context_llm_id" not in existing_columns: + batch_op.add_column( + sa.Column("long_context_llm_id", sa.Integer(), nullable=True) + ) + batch_op.create_foreign_key( + op.f("fk_user_long_context_llm_id_llm_configs"), + "llm_configs", + ["long_context_llm_id"], + ["id"], + ondelete="SET NULL", + ) - if 'fast_llm_id' not in existing_columns: - batch_op.add_column(sa.Column('fast_llm_id', sa.Integer(), nullable=True)) - batch_op.create_foreign_key(op.f('fk_user_fast_llm_id_llm_configs'), - 'llm_configs', ['fast_llm_id'], ['id'], - ondelete='SET NULL') + if "fast_llm_id" not in existing_columns: + batch_op.add_column(sa.Column("fast_llm_id", sa.Integer(), nullable=True)) + batch_op.create_foreign_key( + op.f("fk_user_fast_llm_id_llm_configs"), + "llm_configs", + ["fast_llm_id"], + ["id"], + ondelete="SET NULL", + ) - if 'strategic_llm_id' not in existing_columns: - batch_op.add_column(sa.Column('strategic_llm_id', sa.Integer(), nullable=True)) - batch_op.create_foreign_key(op.f('fk_user_strategic_llm_id_llm_configs'), - 'llm_configs', ['strategic_llm_id'], ['id'], - ondelete='SET NULL') + if "strategic_llm_id" not in existing_columns: + batch_op.add_column( + sa.Column("strategic_llm_id", sa.Integer(), nullable=True) + ) + batch_op.create_foreign_key( + op.f("fk_user_strategic_llm_id_llm_configs"), + "llm_configs", + ["strategic_llm_id"], + ["id"], + ondelete="SET NULL", + ) def downgrade() -> None: """Downgrade schema - remove LLMConfig table and user LLM preferences.""" - + # Drop foreign key constraints - op.drop_constraint(op.f('fk_user_strategic_llm_id_llm_configs'), 'user', type_='foreignkey') - op.drop_constraint(op.f('fk_user_fast_llm_id_llm_configs'), 'user', type_='foreignkey') - op.drop_constraint(op.f('fk_user_long_context_llm_id_llm_configs'), 'user', type_='foreignkey') - + op.drop_constraint( + op.f("fk_user_strategic_llm_id_llm_configs"), "user", type_="foreignkey" + ) + op.drop_constraint( + op.f("fk_user_fast_llm_id_llm_configs"), "user", type_="foreignkey" + ) + op.drop_constraint( + op.f("fk_user_long_context_llm_id_llm_configs"), "user", type_="foreignkey" + ) + # Drop LLM preference columns from user table - op.drop_column('user', 'strategic_llm_id') - op.drop_column('user', 'fast_llm_id') - op.drop_column('user', 'long_context_llm_id') - + op.drop_column("user", "strategic_llm_id") + op.drop_column("user", "fast_llm_id") + op.drop_column("user", "long_context_llm_id") + # Drop indexes and table - op.drop_index(op.f('ix_llm_configs_name'), table_name='llm_configs') - op.drop_index(op.f('ix_llm_configs_created_at'), table_name='llm_configs') - op.drop_index(op.f('ix_llm_configs_id'), table_name='llm_configs') - op.drop_table('llm_configs') - + op.drop_index(op.f("ix_llm_configs_name"), table_name="llm_configs") + op.drop_index(op.f("ix_llm_configs_created_at"), table_name="llm_configs") + op.drop_index(op.f("ix_llm_configs_id"), table_name="llm_configs") + op.drop_table("llm_configs") + # Drop LiteLLMProvider enum - op.execute("DROP TYPE IF EXISTS litellmprovider") \ No newline at end of file + op.execute("DROP TYPE IF EXISTS litellmprovider") diff --git a/surfsense_backend/alembic/versions/12_add_logs_table.py b/surfsense_backend/alembic/versions/12_add_logs_table.py index c74d0e8f6..2fc8b2b02 100644 --- a/surfsense_backend/alembic/versions/12_add_logs_table.py +++ 
b/surfsense_backend/alembic/versions/12_add_logs_table.py @@ -7,8 +7,6 @@ Revises: 11 from typing import Sequence, Union from alembic import op -import sqlalchemy as sa -from sqlalchemy.dialects.postgresql import JSONB from sqlalchemy import inspect # revision identifiers, used by Alembic. @@ -22,27 +20,32 @@ def upgrade() -> None: """Upgrade schema - add LogLevel and LogStatus enums and logs table.""" # Create LogLevel enum if it doesn't exist - op.execute(""" + op.execute( + """ DO $$ BEGIN IF NOT EXISTS (SELECT 1 FROM pg_type WHERE typname = 'loglevel') THEN CREATE TYPE loglevel AS ENUM ('DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'); END IF; END$$; - """) + """ + ) # Create LogStatus enum if it doesn't exist - op.execute(""" + op.execute( + """ DO $$ BEGIN IF NOT EXISTS (SELECT 1 FROM pg_type WHERE typname = 'logstatus') THEN CREATE TYPE logstatus AS ENUM ('IN_PROGRESS', 'SUCCESS', 'FAILED'); END IF; END$$; - """) + """ + ) # Create logs table if it doesn't exist - op.execute(""" + op.execute( + """ CREATE TABLE IF NOT EXISTS logs ( id SERIAL PRIMARY KEY, created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), @@ -53,38 +56,39 @@ def upgrade() -> None: log_metadata JSONB DEFAULT '{}', search_space_id INTEGER NOT NULL REFERENCES searchspaces(id) ON DELETE CASCADE ); - """) + """ + ) # Get existing indexes conn = op.get_bind() inspector = inspect(conn) - existing_indexes = [idx['name'] for idx in inspector.get_indexes('logs')] + existing_indexes = [idx["name"] for idx in inspector.get_indexes("logs")] # Create indexes only if they don't already exist - if 'ix_logs_id' not in existing_indexes: - op.create_index('ix_logs_id', 'logs', ['id']) - if 'ix_logs_created_at' not in existing_indexes: - op.create_index('ix_logs_created_at', 'logs', ['created_at']) - if 'ix_logs_level' not in existing_indexes: - op.create_index('ix_logs_level', 'logs', ['level']) - if 'ix_logs_status' not in existing_indexes: - op.create_index('ix_logs_status', 'logs', ['status']) - if 'ix_logs_source' not in existing_indexes: - op.create_index('ix_logs_source', 'logs', ['source']) + if "ix_logs_id" not in existing_indexes: + op.create_index("ix_logs_id", "logs", ["id"]) + if "ix_logs_created_at" not in existing_indexes: + op.create_index("ix_logs_created_at", "logs", ["created_at"]) + if "ix_logs_level" not in existing_indexes: + op.create_index("ix_logs_level", "logs", ["level"]) + if "ix_logs_status" not in existing_indexes: + op.create_index("ix_logs_status", "logs", ["status"]) + if "ix_logs_source" not in existing_indexes: + op.create_index("ix_logs_source", "logs", ["source"]) def downgrade() -> None: """Downgrade schema - remove logs table and enums.""" # Drop indexes - op.drop_index('ix_logs_source', table_name='logs') - op.drop_index('ix_logs_status', table_name='logs') - op.drop_index('ix_logs_level', table_name='logs') - op.drop_index('ix_logs_created_at', table_name='logs') - op.drop_index('ix_logs_id', table_name='logs') + op.drop_index("ix_logs_source", table_name="logs") + op.drop_index("ix_logs_status", table_name="logs") + op.drop_index("ix_logs_level", table_name="logs") + op.drop_index("ix_logs_created_at", table_name="logs") + op.drop_index("ix_logs_id", table_name="logs") # Drop logs table - op.drop_table('logs') + op.drop_table("logs") # Drop enums op.execute("DROP TYPE IF EXISTS logstatus") diff --git a/surfsense_backend/alembic/versions/13_add_jira_connector_enums.py b/surfsense_backend/alembic/versions/13_add_jira_connector_enums.py index 2623cb957..78d95f17b 100644 --- 
a/surfsense_backend/alembic/versions/13_add_jira_connector_enums.py +++ b/surfsense_backend/alembic/versions/13_add_jira_connector_enums.py @@ -5,12 +5,12 @@ Revises: 12 """ from typing import Sequence, Union + from alembic import op -import sqlalchemy as sa # revision identifiers, used by Alembic. -revision: str = '13' -down_revision: Union[str, None] = '12' +revision: str = "13" +down_revision: Union[str, None] = "12" branch_labels: Union[str, Sequence[str], None] = None depends_on: Union[str, Sequence[str], None] = None @@ -19,7 +19,8 @@ def upgrade() -> None: """Safely add 'JIRA_CONNECTOR' to enum types if missing.""" # Add to searchsourceconnectortype enum - op.execute(""" + op.execute( + """ DO $$ BEGIN IF NOT EXISTS ( @@ -31,10 +32,12 @@ def upgrade() -> None: END IF; END $$; - """) + """ + ) # Add to documenttype enum - op.execute(""" + op.execute( + """ DO $$ BEGIN IF NOT EXISTS ( @@ -46,7 +49,8 @@ def upgrade() -> None: END IF; END $$; - """) + """ + ) def downgrade() -> None: diff --git a/surfsense_backend/alembic/versions/1_add_github_connector_enum.py b/surfsense_backend/alembic/versions/1_add_github_connector_enum.py index 713621fca..d094c9912 100644 --- a/surfsense_backend/alembic/versions/1_add_github_connector_enum.py +++ b/surfsense_backend/alembic/versions/1_add_github_connector_enum.py @@ -1,19 +1,20 @@ """Add GITHUB_CONNECTOR to SearchSourceConnectorType enum Revision ID: 1 -Revises: +Revises: """ + from typing import Sequence, Union from alembic import op -import sqlalchemy as sa + # Import pgvector if needed for other types, though not for this ENUM change # import pgvector # revision identifiers, used by Alembic. -revision: str = '1' +revision: str = "1" down_revision: Union[str, None] = None branch_labels: Union[str, Sequence[str], None] = None depends_on: Union[str, Sequence[str], None] = None @@ -21,10 +22,11 @@ depends_on: Union[str, Sequence[str], None] = None def upgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### - + # Manually add the command to add the enum value # Note: It's generally better to let autogenerate handle this, but we're bypassing it - op.execute(""" + op.execute( + """ DO $$ BEGIN IF NOT EXISTS ( @@ -38,9 +40,9 @@ BEGIN ALTER TYPE searchsourceconnectortype ADD VALUE 'GITHUB_CONNECTOR'; END IF; END$$; -""") +""" + ) - # Pass for the rest, as autogenerate didn't run to add other schema details pass # ### end Alembic commands ### @@ -48,20 +50,25 @@ END$$; def downgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### - + # Downgrading removal of an enum value is complex and potentially dangerous # if the value is in use. Often omitted or requires manual SQL based on context. - # For now, we'll just pass. If you needed to reverse this, you'd likely + # For now, we'll just pass. If you needed to reverse this, you'd likely # have to manually check if 'GITHUB_CONNECTOR' is used in the table # and then potentially recreate the type without it. 
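The manual check mentioned above could look like the following — a hedged sketch, not part of this patch, assuming the search_source_connectors table used throughout these migrations:

    import sqlalchemy as sa
    from alembic import op

    def assert_enum_value_unused(value: str = "GITHUB_CONNECTOR") -> None:
        """Abort the downgrade if any connector row still uses the enum value."""
        count = op.get_bind().execute(
            sa.text(
                "SELECT COUNT(*) FROM search_source_connectors "
                "WHERE connector_type::text = :value"
            ),
            {"value": value},
        ).scalar()
        if count:
            raise RuntimeError(f"{value} is still in use by {count} connector(s); refusing to downgrade")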
- op.execute("ALTER TYPE searchsourceconnectortype RENAME TO searchsourceconnectortype_old") - op.execute("CREATE TYPE searchsourceconnectortype AS ENUM('SERPER_API', 'TAVILY_API', 'SLACK_CONNECTOR', 'NOTION_CONNECTOR')") - op.execute(( - "ALTER TABLE search_source_connectors ALTER COLUMN connector_type TYPE searchsourceconnectortype USING " - "connector_type::text::searchsourceconnectortype" - )) + op.execute( + "ALTER TYPE searchsourceconnectortype RENAME TO searchsourceconnectortype_old" + ) + op.execute( + "CREATE TYPE searchsourceconnectortype AS ENUM('SERPER_API', 'TAVILY_API', 'SLACK_CONNECTOR', 'NOTION_CONNECTOR')" + ) + op.execute( + ( + "ALTER TABLE search_source_connectors ALTER COLUMN connector_type TYPE searchsourceconnectortype USING " + "connector_type::text::searchsourceconnectortype" + ) + ) op.execute("DROP TYPE searchsourceconnectortype_old") - pass - # ### end Alembic commands ### + # ### end Alembic commands ### diff --git a/surfsense_backend/alembic/versions/2_add_linear_connector_enum.py b/surfsense_backend/alembic/versions/2_add_linear_connector_enum.py index aa0ac3cfc..31fcee803 100644 --- a/surfsense_backend/alembic/versions/2_add_linear_connector_enum.py +++ b/surfsense_backend/alembic/versions/2_add_linear_connector_enum.py @@ -4,21 +4,21 @@ Revision ID: 2 Revises: e55302644c51 """ + from typing import Sequence, Union from alembic import op -import sqlalchemy as sa - # revision identifiers, used by Alembic. -revision: str = '2' -down_revision: Union[str, None] = 'e55302644c51' +revision: str = "2" +down_revision: Union[str, None] = "e55302644c51" branch_labels: Union[str, Sequence[str], None] = None depends_on: Union[str, Sequence[str], None] = None def upgrade() -> None: - op.execute(""" + op.execute( + """ DO $$ BEGIN IF NOT EXISTS ( @@ -31,21 +31,30 @@ def upgrade() -> None: ALTER TYPE searchsourceconnectortype ADD VALUE 'LINEAR_CONNECTOR'; END IF; END$$; - """) + """ + ) + + # def downgrade() -> None: # ### commands auto generated by Alembic - please adjust! 
### - + # Downgrading removal of an enum value requires recreating the type - op.execute("ALTER TYPE searchsourceconnectortype RENAME TO searchsourceconnectortype_old") - op.execute("CREATE TYPE searchsourceconnectortype AS ENUM('SERPER_API', 'TAVILY_API', 'SLACK_CONNECTOR', 'NOTION_CONNECTOR', 'GITHUB_CONNECTOR')") - op.execute(( - "ALTER TABLE search_source_connectors ALTER COLUMN connector_type TYPE searchsourceconnectortype USING " - "connector_type::text::searchsourceconnectortype" - )) + op.execute( + "ALTER TYPE searchsourceconnectortype RENAME TO searchsourceconnectortype_old" + ) + op.execute( + "CREATE TYPE searchsourceconnectortype AS ENUM('SERPER_API', 'TAVILY_API', 'SLACK_CONNECTOR', 'NOTION_CONNECTOR', 'GITHUB_CONNECTOR')" + ) + op.execute( + ( + "ALTER TABLE search_source_connectors ALTER COLUMN connector_type TYPE searchsourceconnectortype USING " + "connector_type::text::searchsourceconnectortype" + ) + ) op.execute("DROP TYPE searchsourceconnectortype_old") pass - # ### end Alembic commands ### \ No newline at end of file + # ### end Alembic commands ### diff --git a/surfsense_backend/alembic/versions/3_add_linear_connector_to_documenttype_.py b/surfsense_backend/alembic/versions/3_add_linear_connector_to_documenttype_.py index 2b326f2ee..b108699d4 100644 --- a/surfsense_backend/alembic/versions/3_add_linear_connector_to_documenttype_.py +++ b/surfsense_backend/alembic/versions/3_add_linear_connector_to_documenttype_.py @@ -4,25 +4,26 @@ Revision ID: 3 Revises: 2 """ + from typing import Sequence, Union from alembic import op -import sqlalchemy as sa - # revision identifiers, used by Alembic. -revision: str = '3' -down_revision: Union[str, None] = '2' +revision: str = "3" +down_revision: Union[str, None] = "2" branch_labels: Union[str, Sequence[str], None] = None depends_on: Union[str, Sequence[str], None] = None # Define the ENUM type name and the new value -ENUM_NAME = 'documenttype' # Make sure this matches the name in your DB (usually lowercase class name) -NEW_VALUE = 'LINEAR_CONNECTOR' +ENUM_NAME = "documenttype" # Make sure this matches the name in your DB (usually lowercase class name) +NEW_VALUE = "LINEAR_CONNECTOR" + def upgrade() -> None: """Upgrade schema.""" - op.execute(f""" + op.execute( + f""" DO $$ BEGIN IF NOT EXISTS ( @@ -35,9 +36,9 @@ def upgrade() -> None: ALTER TYPE {ENUM_NAME} ADD VALUE '{NEW_VALUE}'; END IF; END$$; - """) + """ + ) - # Warning: This will delete all rows with the new value def downgrade() -> None: @@ -48,19 +49,19 @@ def downgrade() -> None: # Enum values *before* LINEAR_CONNECTOR was added old_values = ( - 'EXTENSION', - 'CRAWLED_URL', - 'FILE', - 'SLACK_CONNECTOR', - 'NOTION_CONNECTOR', - 'YOUTUBE_VIDEO', - 'GITHUB_CONNECTOR' + "EXTENSION", + "CRAWLED_URL", + "FILE", + "SLACK_CONNECTOR", + "NOTION_CONNECTOR", + "YOUTUBE_VIDEO", + "GITHUB_CONNECTOR", ) old_values_sql = ", ".join([f"'{v}'" for v in old_values]) # Table and column names (adjust if different) - table_name = 'documents' - column_name = 'document_type' + table_name = "documents" + column_name = "document_type" # 1. Rename the current enum type op.execute(f"ALTER TYPE {ENUM_NAME} RENAME TO {old_enum_name}") @@ -68,10 +69,8 @@ def downgrade() -> None: # 2. Create the new enum type with the old values op.execute(f"CREATE TYPE {ENUM_NAME} AS ENUM({old_values_sql})") - # 3. Update the table: - op.execute( - f"DELETE FROM {table_name} WHERE {column_name}::text = '{NEW_VALUE}'" - ) + # 3. 
Update the table: + op.execute(f"DELETE FROM {table_name} WHERE {column_name}::text = '{NEW_VALUE}'") # 4. Alter the column to use the new enum type (casting old values) op.execute( @@ -81,4 +80,4 @@ def downgrade() -> None: # 5. Drop the old enum type op.execute(f"DROP TYPE {old_enum_name}") - # ### end Alembic commands ### \ No newline at end of file + # ### end Alembic commands ### diff --git a/surfsense_backend/alembic/versions/4_add_linkup_api_enum.py b/surfsense_backend/alembic/versions/4_add_linkup_api_enum.py index 45d4ecf61..6720ae71f 100644 --- a/surfsense_backend/alembic/versions/4_add_linkup_api_enum.py +++ b/surfsense_backend/alembic/versions/4_add_linkup_api_enum.py @@ -4,24 +4,24 @@ Revision ID: 4 Revises: 3 """ + from typing import Sequence, Union from alembic import op -import sqlalchemy as sa - # revision identifiers, used by Alembic. -revision: str = '4' -down_revision: Union[str, None] = '3' +revision: str = "4" +down_revision: Union[str, None] = "3" branch_labels: Union[str, Sequence[str], None] = None depends_on: Union[str, Sequence[str], None] = None def upgrade() -> None: - ENUM_NAME = 'searchsourceconnectortype' - NEW_VALUE = 'LINKUP_API' + ENUM_NAME = "searchsourceconnectortype" + NEW_VALUE = "LINKUP_API" - op.execute(f""" + op.execute( + f""" DO $$ BEGIN IF NOT EXISTS ( @@ -34,21 +34,27 @@ def upgrade() -> None: ALTER TYPE {ENUM_NAME} ADD VALUE '{NEW_VALUE}'; END IF; END$$; - """) - + """ + ) def downgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### - + # Downgrading removal of an enum value requires recreating the type - op.execute("ALTER TYPE searchsourceconnectortype RENAME TO searchsourceconnectortype_old") - op.execute("CREATE TYPE searchsourceconnectortype AS ENUM('SERPER_API', 'TAVILY_API', 'SLACK_CONNECTOR', 'NOTION_CONNECTOR', 'GITHUB_CONNECTOR', 'LINEAR_CONNECTOR')") - op.execute(( - "ALTER TABLE search_source_connectors ALTER COLUMN connector_type TYPE searchsourceconnectortype USING " - "connector_type::text::searchsourceconnectortype" - )) + op.execute( + "ALTER TYPE searchsourceconnectortype RENAME TO searchsourceconnectortype_old" + ) + op.execute( + "CREATE TYPE searchsourceconnectortype AS ENUM('SERPER_API', 'TAVILY_API', 'SLACK_CONNECTOR', 'NOTION_CONNECTOR', 'GITHUB_CONNECTOR', 'LINEAR_CONNECTOR')" + ) + op.execute( + ( + "ALTER TABLE search_source_connectors ALTER COLUMN connector_type TYPE searchsourceconnectortype USING " + "connector_type::text::searchsourceconnectortype" + ) + ) op.execute("DROP TYPE searchsourceconnectortype_old") pass - # ### end Alembic commands ### \ No newline at end of file + # ### end Alembic commands ### diff --git a/surfsense_backend/alembic/versions/6_change_podcast_content_to_transcript.py b/surfsense_backend/alembic/versions/6_change_podcast_content_to_transcript.py index edc65b7ce..3f0865f84 100644 --- a/surfsense_backend/alembic/versions/6_change_podcast_content_to_transcript.py +++ b/surfsense_backend/alembic/versions/6_change_podcast_content_to_transcript.py @@ -4,48 +4,58 @@ Revision ID: 6 Revises: 5 """ + from typing import Sequence, Union -from alembic import op import sqlalchemy as sa -from sqlalchemy.dialects.postgresql import JSON +from alembic import op from sqlalchemy import inspect - +from sqlalchemy.dialects.postgresql import JSON # revision identifiers, used by Alembic. 
-revision: str = '6' -down_revision: Union[str, None] = '5' +revision: str = "6" +down_revision: Union[str, None] = "5" branch_labels: Union[str, Sequence[str], None] = None depends_on: Union[str, Sequence[str], None] = None + def upgrade() -> None: bind = op.get_bind() inspector = inspect(bind) columns = [col["name"] for col in inspector.get_columns("podcasts")] if "podcast_transcript" not in columns: - op.add_column('podcasts', sa.Column('podcast_transcript', JSON, nullable=False, server_default='{}')) + op.add_column( + "podcasts", + sa.Column("podcast_transcript", JSON, nullable=False, server_default="{}"), + ) # Copy data from old column to new column - op.execute(""" - UPDATE podcasts - SET podcast_transcript = jsonb_build_object('text', podcast_content) + op.execute( + """ + UPDATE podcasts + SET podcast_transcript = jsonb_build_object('text', podcast_content) WHERE podcast_content != '' - """) + """ + ) # Drop the old column only if it exists if "podcast_content" in columns: - op.drop_column('podcasts', 'podcast_content') - + op.drop_column("podcasts", "podcast_content") def downgrade() -> None: # Add back the original column - op.add_column('podcasts', sa.Column('podcast_content', sa.Text(), nullable=False, server_default='')) - + op.add_column( + "podcasts", + sa.Column("podcast_content", sa.Text(), nullable=False, server_default=""), + ) + # Copy data from JSON column back to text column # Extract the 'text' field if it exists, otherwise use empty string - op.execute("UPDATE podcasts SET podcast_content = COALESCE((podcast_transcript->>'text'), '')") - + op.execute( + "UPDATE podcasts SET podcast_content = COALESCE((podcast_transcript->>'text'), '')" + ) + # Drop the new column - op.drop_column('podcasts', 'podcast_transcript') \ No newline at end of file + op.drop_column("podcasts", "podcast_transcript") diff --git a/surfsense_backend/alembic/versions/7_remove_is_generated_column.py b/surfsense_backend/alembic/versions/7_remove_is_generated_column.py index 9fa4546e1..62b273b62 100644 --- a/surfsense_backend/alembic/versions/7_remove_is_generated_column.py +++ b/surfsense_backend/alembic/versions/7_remove_is_generated_column.py @@ -4,19 +4,20 @@ Revision ID: 7 Revises: 6 """ + from typing import Sequence, Union -from alembic import op import sqlalchemy as sa +from alembic import op from sqlalchemy import inspect - # revision identifiers, used by Alembic. 
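Migration 6 above wraps each legacy transcript in a one-key JSON object on upgrade and unwraps it on downgrade. In Python terms the round trip looks like this — a sketch mirroring the SQL, not code from the patch:

    def to_transcript(podcast_content: str) -> dict:
        # Mirrors: jsonb_build_object('text', podcast_content) WHERE podcast_content != ''
        return {"text": podcast_content} if podcast_content else {}

    def to_content(podcast_transcript: dict) -> str:
        # Mirrors: COALESCE((podcast_transcript->>'text'), '')
        return podcast_transcript.get("text") or ""

    assert to_content(to_transcript("episode text")) == "episode text"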
-revision: str = '7' -down_revision: Union[str, None] = '6' +revision: str = "7" +down_revision: Union[str, None] = "6" branch_labels: Union[str, Sequence[str], None] = None depends_on: Union[str, Sequence[str], None] = None + def upgrade() -> None: # Get the current database connection bind = op.get_bind() @@ -25,10 +26,12 @@ def upgrade() -> None: # Check if the column exists before attempting to drop it columns = [col["name"] for col in inspector.get_columns("podcasts")] if "is_generated" in columns: - op.drop_column('podcasts', 'is_generated') - + op.drop_column("podcasts", "is_generated") def downgrade() -> None: # Add back the is_generated column with its original constraints - op.add_column('podcasts', sa.Column('is_generated', sa.Boolean(), nullable=False, server_default='false')) \ No newline at end of file + op.add_column( + "podcasts", + sa.Column("is_generated", sa.Boolean(), nullable=False, server_default="false"), + ) diff --git a/surfsense_backend/alembic/versions/8_add_content_hash_to_documents.py b/surfsense_backend/alembic/versions/8_add_content_hash_to_documents.py index ed7ebc594..976c6d316 100644 --- a/surfsense_backend/alembic/versions/8_add_content_hash_to_documents.py +++ b/surfsense_backend/alembic/versions/8_add_content_hash_to_documents.py @@ -3,15 +3,16 @@ Revision ID: 8 Revises: 7 """ + from typing import Sequence, Union -from alembic import op + import sqlalchemy as sa +from alembic import op from sqlalchemy import inspect - # revision identifiers, used by Alembic. -revision: str = '8' -down_revision: Union[str, None] = '7' +revision: str = "8" +down_revision: Union[str, None] = "7" branch_labels: Union[str, Sequence[str], None] = None depends_on: Union[str, Sequence[str], None] = None @@ -19,38 +20,51 @@ depends_on: Union[str, Sequence[str], None] = None def upgrade() -> None: bind = op.get_bind() inspector = inspect(bind) - columns = [col['name'] for col in inspector.get_columns('documents')] + columns = [col["name"] for col in inspector.get_columns("documents")] # Only add the column if it doesn't already exist - if 'content_hash' not in columns: - op.add_column('documents', sa.Column('content_hash', sa.String(), nullable=True)) + if "content_hash" not in columns: + op.add_column( + "documents", sa.Column("content_hash", sa.String(), nullable=True) + ) # Populate the content_hash column - op.execute(""" - UPDATE documents + op.execute( + """ + UPDATE documents SET content_hash = encode(sha256(convert_to(content, 'UTF8')), 'hex') WHERE content_hash IS NULL - """) + """ + ) - op.execute(""" - DELETE FROM documents + op.execute( + """ + DELETE FROM documents WHERE id NOT IN ( - SELECT MIN(id) - FROM documents + SELECT MIN(id) + FROM documents GROUP BY content_hash ) - """) + """ + ) - op.alter_column('documents', 'content_hash', - existing_type=sa.String(), - nullable=False) - op.create_index(op.f('ix_documents_content_hash'), 'documents', ['content_hash'], unique=False) - op.create_unique_constraint(op.f('uq_documents_content_hash'), 'documents', ['content_hash']) + op.alter_column( + "documents", "content_hash", existing_type=sa.String(), nullable=False + ) + op.create_index( + op.f("ix_documents_content_hash"), + "documents", + ["content_hash"], + unique=False, + ) + op.create_unique_constraint( + op.f("uq_documents_content_hash"), "documents", ["content_hash"] + ) else: print("Column 'content_hash' already exists. 
Skipping column creation.") def downgrade() -> None: - op.drop_constraint(op.f('uq_documents_content_hash'), 'documents', type_='unique') - op.drop_index(op.f('ix_documents_content_hash'), table_name='documents') - op.drop_column('documents', 'content_hash') + op.drop_constraint(op.f("uq_documents_content_hash"), "documents", type_="unique") + op.drop_index(op.f("ix_documents_content_hash"), table_name="documents") + op.drop_column("documents", "content_hash") diff --git a/surfsense_backend/alembic/versions/9_add_discord_connector_enum_and_documenttype.py b/surfsense_backend/alembic/versions/9_add_discord_connector_enum_and_documenttype.py index ad77dad9d..4dec11230 100644 --- a/surfsense_backend/alembic/versions/9_add_discord_connector_enum_and_documenttype.py +++ b/surfsense_backend/alembic/versions/9_add_discord_connector_enum_and_documenttype.py @@ -7,8 +7,6 @@ Revises: 8 from typing import Sequence, Union from alembic import op -import sqlalchemy as sa - # revision identifiers, used by Alembic. revision: str = "9" @@ -26,7 +24,8 @@ DOCUMENT_NEW_VALUE = "DISCORD_CONNECTOR" def upgrade() -> None: """Upgrade schema - add DISCORD_CONNECTOR to connector and document enum safely.""" # Add DISCORD_CONNECTOR to searchsourceconnectortype only if not exists - op.execute(f""" + op.execute( + f""" DO $$ BEGIN IF NOT EXISTS ( @@ -37,10 +36,12 @@ def upgrade() -> None: ALTER TYPE {CONNECTOR_ENUM} ADD VALUE '{CONNECTOR_NEW_VALUE}'; END IF; END$$; - """) + """ + ) # Add DISCORD_CONNECTOR to documenttype only if not exists - op.execute(f""" + op.execute( + f""" DO $$ BEGIN IF NOT EXISTS ( @@ -51,8 +52,8 @@ def upgrade() -> None: ALTER TYPE {DOCUMENT_ENUM} ADD VALUE '{DOCUMENT_NEW_VALUE}'; END IF; END$$; - """) - + """ + ) def downgrade() -> None: @@ -109,7 +110,6 @@ def downgrade() -> None: # 4. Drop the old connector enum type op.execute(f"DROP TYPE {old_connector_enum_name}") - # Document Enum Downgrade Steps # 1. Rename the current document enum type op.execute(f"ALTER TYPE {DOCUMENT_ENUM} RENAME TO {old_document_enum_name}") diff --git a/surfsense_backend/alembic/versions/e55302644c51_add_github_connector_to_documenttype_.py b/surfsense_backend/alembic/versions/e55302644c51_add_github_connector_to_documenttype_.py index ae0a60a27..9c93eb7c2 100644 --- a/surfsense_backend/alembic/versions/e55302644c51_add_github_connector_to_documenttype_.py +++ b/surfsense_backend/alembic/versions/e55302644c51_add_github_connector_to_documenttype_.py @@ -1,21 +1,22 @@ from typing import Sequence, Union from alembic import op -import sqlalchemy as sa # revision identifiers, used by Alembic. 
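The content_hash backfill in migration 8 above has to agree with whatever the application computes at ingest time; a minimal Python equivalent of the SQL expression — a sketch, the real helper in the codebase may differ — is:

    import hashlib

    def generate_content_hash(content: str) -> str:
        # Matches: encode(sha256(convert_to(content, 'UTF8')), 'hex')
        return hashlib.sha256(content.encode("utf-8")).hexdigest()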
-revision: str = 'e55302644c51' -down_revision: Union[str, None] = '1' +revision: str = "e55302644c51" +down_revision: Union[str, None] = "1" branch_labels: Union[str, Sequence[str], None] = None depends_on: Union[str, Sequence[str], None] = None # Define the ENUM type name and the new value -ENUM_NAME = 'documenttype' -NEW_VALUE = 'GITHUB_CONNECTOR' +ENUM_NAME = "documenttype" +NEW_VALUE = "GITHUB_CONNECTOR" + def upgrade() -> None: """Upgrade schema.""" - op.execute(f""" + op.execute( + f""" DO $$ BEGIN IF NOT EXISTS ( @@ -28,32 +29,32 @@ def upgrade() -> None: ALTER TYPE {ENUM_NAME} ADD VALUE '{NEW_VALUE}'; END IF; END$$; - """) + """ + ) + def downgrade() -> None: """Downgrade schema - remove GITHUB_CONNECTOR from enum.""" old_enum_name = f"{ENUM_NAME}_old" old_values = ( - 'EXTENSION', - 'CRAWLED_URL', - 'FILE', - 'SLACK_CONNECTOR', - 'NOTION_CONNECTOR', - 'YOUTUBE_VIDEO' + "EXTENSION", + "CRAWLED_URL", + "FILE", + "SLACK_CONNECTOR", + "NOTION_CONNECTOR", + "YOUTUBE_VIDEO", ) old_values_sql = ", ".join([f"'{v}'" for v in old_values]) - table_name = 'documents' - column_name = 'document_type' + table_name = "documents" + column_name = "document_type" # 1. Create the new enum type with the old values op.execute(f"CREATE TYPE {old_enum_name} AS ENUM({old_values_sql})") # 2. Delete rows using the new value - op.execute( - f"DELETE FROM {table_name} WHERE {column_name}::text = '{NEW_VALUE}'" - ) + op.execute(f"DELETE FROM {table_name} WHERE {column_name}::text = '{NEW_VALUE}'") # 3. Alter the column to use the old enum type op.execute( From ca986930058250d9f98e1c1e9738b233bb53b894 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Thu, 24 Jul 2025 11:52:06 +0200 Subject: [PATCH 06/17] update connector indexing / update connector service --- .../app/services/connector_service.py | 114 ++++++++ .../app/tasks/connectors_indexing_tasks.py | 256 ++++++++++++++++++ 2 files changed, 370 insertions(+) diff --git a/surfsense_backend/app/services/connector_service.py b/surfsense_backend/app/services/connector_service.py index f53fd4dfc..8c6f99c5f 100644 --- a/surfsense_backend/app/services/connector_service.py +++ b/surfsense_backend/app/services/connector_service.py @@ -857,6 +857,120 @@ class ConnectorService: return result_object, linear_chunks + async def search_jira(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS) -> tuple: + """ + Search for Jira issues and comments and return both the source information and langchain documents + + Args: + user_query: The user's query + user_id: The user's ID + search_space_id: The search space ID to search in + top_k: Maximum number of results to return + search_mode: Search mode (CHUNKS or DOCUMENTS) + + Returns: + tuple: (sources_info, langchain_documents) + """ + if search_mode == SearchMode.CHUNKS: + jira_chunks = await self.chunk_retriever.hybrid_search( + query_text=user_query, + top_k=top_k, + user_id=user_id, + search_space_id=search_space_id, + document_type="JIRA_CONNECTOR" + ) + elif search_mode == SearchMode.DOCUMENTS: + jira_chunks = await self.document_retriever.hybrid_search( + query_text=user_query, + top_k=top_k, + user_id=user_id, + search_space_id=search_space_id, + document_type="JIRA_CONNECTOR" + ) + # Transform document retriever results to match expected format + jira_chunks = self._transform_document_results(jira_chunks) + + # Early return if no results + if not jira_chunks: + return { + "id": 10, + "name": "Jira Issues", + "type": "JIRA_CONNECTOR", + "sources": [], + }, 
[] + + # Process each chunk and create sources directly without deduplication + sources_list = [] + async with self.counter_lock: + for _i, chunk in enumerate(jira_chunks): + # Extract document metadata + document = chunk.get('document', {}) + metadata = document.get('metadata', {}) + + # Extract Jira-specific metadata + issue_key = metadata.get('issue_key', '') + issue_title = metadata.get('issue_title', 'Untitled Issue') + status = metadata.get('status', '') + priority = metadata.get('priority', '') + issue_type = metadata.get('issue_type', '') + comment_count = metadata.get('comment_count', 0) + + # Create a more descriptive title for Jira issues + title = f"Jira: {issue_key} - {issue_title}" + if status: + title += f" ({status})" + + # Create a more descriptive description for Jira issues + description = chunk.get('content', '')[:100] + if len(description) == 100: + description += "..." + + # Add priority and type info to description + info_parts = [] + if priority: + info_parts.append(f"Priority: {priority}") + if issue_type: + info_parts.append(f"Type: {issue_type}") + if comment_count: + info_parts.append(f"Comments: {comment_count}") + + if info_parts: + if description: + description += f" | {' | '.join(info_parts)}" + else: + description = ' | '.join(info_parts) + + # For URL, we could construct a URL to the Jira issue if we have the base URL + # For now, use a generic placeholder + url = "" + if issue_key and metadata.get('base_url'): + url = f"{metadata.get('base_url')}/browse/{issue_key}" + + source = { + "id": document.get('id', self.source_id_counter), + "title": title, + "description": description, + "url": url, + "issue_key": issue_key, + "status": status, + "priority": priority, + "issue_type": issue_type, + "comment_count": comment_count + } + + self.source_id_counter += 1 + sources_list.append(source) + + # Create result object + result_object = { + "id": 10, # Assign a unique ID for the Jira connector + "name": "Jira Issues", + "type": "JIRA_CONNECTOR", + "sources": sources_list, + } + + return result_object, jira_chunks + async def search_linkup(self, user_query: str, user_id: str, mode: str = "standard") -> tuple: """ Search using Linkup API and return both the source information and documents diff --git a/surfsense_backend/app/tasks/connectors_indexing_tasks.py b/surfsense_backend/app/tasks/connectors_indexing_tasks.py index e0b3cd1e0..ab3bc858c 100644 --- a/surfsense_backend/app/tasks/connectors_indexing_tasks.py +++ b/surfsense_backend/app/tasks/connectors_indexing_tasks.py @@ -13,6 +13,7 @@ from app.connectors.notion_history import NotionHistoryConnector from app.connectors.github_connector import GitHubConnector from app.connectors.linear_connector import LinearConnector from app.connectors.discord_connector import DiscordConnector +from app.connectors.jira_connector import JiraConnector from slack_sdk.errors import SlackApiError import logging import asyncio @@ -1651,3 +1652,258 @@ async def index_discord_messages( ) logger.error(f"Failed to index Discord messages: {str(e)}", exc_info=True) return 0, f"Failed to index Discord messages: {str(e)}" + + +async def index_jira_issues( + session: AsyncSession, + connector_id: int, + search_space_id: int, + user_id: str, + start_date: str = None, + end_date: str = None, + update_last_indexed: bool = True +) -> Tuple[int, Optional[str]]: + """ + Index Jira issues and comments. 
+ + Args: + session: Database session + connector_id: ID of the Jira connector + search_space_id: ID of the search space to store documents in + user_id: User ID + start_date: Start date for indexing (YYYY-MM-DD format) + end_date: End date for indexing (YYYY-MM-DD format) + update_last_indexed: Whether to update the last_indexed_at timestamp (default: True) + + Returns: + Tuple containing (number of documents indexed, error message or None) + """ + task_logger = TaskLoggingService(session, search_space_id) + + # Log task start + log_entry = await task_logger.log_task_start( + task_name="jira_issues_indexing", + source="connector_indexing_task", + message=f"Starting Jira issues indexing for connector {connector_id}", + metadata={"connector_id": connector_id, "user_id": str(user_id), "start_date": start_date, "end_date": end_date} + ) + + try: + # Get the connector from the database + result = await session.execute( + select(SearchSourceConnector).where(SearchSourceConnector.id == connector_id) + ) + connector = result.scalar_one_or_none() + + if not connector: + await task_logger.log_task_failure( + log_entry, + f"Connector with ID {connector_id} not found", + "Connector not found", + {"error_type": "ConnectorNotFound"} + ) + return 0, f"Connector with ID {connector_id} not found" + + # Get the Jira credentials from the connector config + jira_token = connector.config.get("JIRA_PERSONAL_ACCESS_TOKEN") + jira_base_url = connector.config.get("JIRA_BASE_URL") + + if not jira_token or not jira_base_url: + await task_logger.log_task_failure( + log_entry, + f"Jira credentials not found in connector config for connector {connector_id}", + "Missing Jira credentials", + {"error_type": "MissingCredentials"} + ) + return 0, "Jira credentials not found in connector config" + + # Initialize Jira client + await task_logger.log_task_progress( + log_entry, + f"Initializing Jira client for connector {connector_id}", + {"stage": "client_initialization"} + ) + + jira_client = JiraConnector(base_url=jira_base_url, personal_access_token=jira_token) + + # Calculate date range + if start_date is None or end_date is None: + # Fall back to calculating dates based on last_indexed_at + calculated_end_date = datetime.now() + + if connector.last_indexed_at: + calculated_start_date = connector.last_indexed_at + else: + # If never indexed, go back 30 days + calculated_start_date = calculated_end_date - timedelta(days=30) + + start_date_str = calculated_start_date.strftime('%Y-%m-%d') + end_date_str = calculated_end_date.strftime('%Y-%m-%d') + else: + start_date_str = start_date + end_date_str = end_date + + await task_logger.log_task_progress( + log_entry, + f"Fetching Jira issues from {start_date_str} to {end_date_str}", + {"stage": "fetching_issues", "start_date": start_date_str, "end_date": end_date_str} + ) + + # Get issues within date range + try: + issues, error = jira_client.get_issues_by_date_range( + start_date=start_date_str, + end_date=end_date_str, + include_comments=True + ) + + if error: + logger.error(f"Failed to get Jira issues: {error}") + + # Don't treat "No issues found" as an error that should stop indexing + if "No issues found" in error: + logger.info("No issues found is not a critical error, continuing with update") + if update_last_indexed: + connector.last_indexed_at = datetime.now() + await session.commit() + logger.info(f"Updated last_indexed_at to {connector.last_indexed_at} despite no issues found") + + await task_logger.log_task_completion( + log_entry, + f"No Jira issues found in date 
range {start_date_str} to {end_date_str}", + {"indexed_count": 0} + ) + return 0, None + else: + await task_logger.log_task_failure( + log_entry, + f"Failed to get Jira issues: {error}", + "API Error", + {"error_type": "APIError"} + ) + return 0, f"Failed to get Jira issues: {error}" + + logger.info(f"Retrieved {len(issues)} issues from Jira API") + + await task_logger.log_task_progress( + log_entry, + f"Retrieved {len(issues)} issues from Jira API", + {"stage": "processing_issues", "issue_count": len(issues)} + ) + + except Exception as e: + await task_logger.log_task_failure( + log_entry, + f"Error fetching Jira issues: {str(e)}", + "Fetch Error", + {"error_type": type(e).__name__} + ) + logger.error(f"Error fetching Jira issues: {str(e)}", exc_info=True) + return 0, f"Error fetching Jira issues: {str(e)}" + + # Process and index each issue + indexed_count = 0 + + for issue in issues: + try: + # Format the issue for better readability + formatted_issue = jira_client.format_issue(issue) + + # Convert to markdown + issue_markdown = jira_client.format_issue_to_markdown(formatted_issue) + + # Create document metadata + metadata = { + "issue_key": formatted_issue.get("key", ""), + "issue_title": formatted_issue.get("title", ""), + "status": formatted_issue.get("status", ""), + "priority": formatted_issue.get("priority", ""), + "issue_type": formatted_issue.get("issue_type", ""), + "project": formatted_issue.get("project", ""), + "assignee": formatted_issue.get("assignee", {}).get("display_name", "") if formatted_issue.get("assignee") else "", + "reporter": formatted_issue.get("reporter", {}).get("display_name", ""), + "created_at": formatted_issue.get("created_at", ""), + "updated_at": formatted_issue.get("updated_at", ""), + "comment_count": len(formatted_issue.get("comments", [])), + "connector_id": connector_id, + "source": "jira", + "base_url": jira_base_url + } + + # Generate content hash + content_hash = generate_content_hash(issue_markdown) + + # Check if document already exists + existing_doc_result = await session.execute( + select(Document).where(Document.content_hash == content_hash) + ) + existing_doc = existing_doc_result.scalar_one_or_none() + + if existing_doc: + logger.debug(f"Document with hash {content_hash} already exists, skipping") + continue + + # Create new document + document = Document( + title=f"Jira: {formatted_issue.get('key', 'Unknown')} - {formatted_issue.get('title', 'Untitled')}", + document_type=DocumentType.JIRA_CONNECTOR, + document_metadata=metadata, + content=issue_markdown, + content_hash=content_hash, + search_space_id=search_space_id + ) + + # Generate embedding + embedding = await config.embedding_model_instance.get_embedding(issue_markdown) + document.embedding = embedding + + session.add(document) + await session.flush() # Flush to get the document ID + + # Create chunks for the document + chunks = await config.chunking_model_instance.chunk_document(issue_markdown) + + for chunk_content in chunks: + chunk_embedding = await config.embedding_model_instance.get_embedding(chunk_content) + + chunk = Chunk( + content=chunk_content, + embedding=chunk_embedding, + document_id=document.id + ) + session.add(chunk) + + indexed_count += 1 + logger.debug(f"Indexed Jira issue: {formatted_issue.get('key', 'Unknown')}") + + except Exception as e: + logger.error(f"Error processing Jira issue {issue.get('key', 'Unknown')}: {str(e)}", exc_info=True) + continue + + # Commit all changes + await session.commit() + + # Update last_indexed_at timestamp + if 
update_last_indexed: + connector.last_indexed_at = datetime.now() + await session.commit() + logger.info(f"Updated last_indexed_at to {connector.last_indexed_at}") + + await task_logger.log_task_completion( + log_entry, + f"Successfully indexed {indexed_count} Jira issues", + {"indexed_count": indexed_count} + ) + + logger.info(f"Successfully indexed {indexed_count} Jira issues") + return indexed_count, None + + except Exception as e: + await task_logger.log_task_failure( + log_entry, + f"Failed to index Jira issues: {str(e)}", + str(e), + {"error_type": type(e).__name__} + ) + logger.error(f"Failed to index Jira issues: {str(e)}", exc_info=True) + return 0, f"Failed to index Jira issues: {str(e)}" From cd05a06a9152f790d1ccbe8f3fe24442e7c9c396 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Thu, 24 Jul 2025 11:52:21 +0200 Subject: [PATCH 07/17] update connector indexing / update connector service --- .../app/agents/researcher/nodes.py | 54 +- .../agents/researcher/qna_agent/prompts.py | 2 + .../app/agents/researcher/utils.py | 4 + .../app/connectors/test_jira_connector.py | 218 +++ .../routes/search_source_connectors_routes.py | 52 +- .../app/schemas/search_source_connector.py | 13 + .../app/services/connector_service.py | 636 ++++---- .../app/tasks/connectors_indexing_tasks.py | 1297 +++++++++++------ 8 files changed, 1544 insertions(+), 732 deletions(-) create mode 100644 surfsense_backend/app/connectors/test_jira_connector.py diff --git a/surfsense_backend/app/agents/researcher/nodes.py b/surfsense_backend/app/agents/researcher/nodes.py index 30d572a60..0fb9dc3ee 100644 --- a/surfsense_backend/app/agents/researcher/nodes.py +++ b/surfsense_backend/app/agents/researcher/nodes.py @@ -172,20 +172,41 @@ async def fetch_documents_by_ids( channel_id = metadata.get('channel_id', '') guild_id = metadata.get('guild_id', '') message_date = metadata.get('start_date', '') - + title = f"Discord: {channel_name}" if message_date: title += f" ({message_date})" - + description = doc.content[:100] + "..." if len(doc.content) > 100 else doc.content - + if guild_id and channel_id: url = f"https://discord.com/channels/{guild_id}/{channel_id}" elif channel_id: url = f"https://discord.com/channels/@me/{channel_id}" else: url = "" - + + elif doc_type == "JIRA_CONNECTOR": + # Extract Jira-specific metadata + issue_key = metadata.get('issue_key', 'Unknown Issue') + issue_title = metadata.get('issue_title', 'Untitled Issue') + status = metadata.get('status', '') + priority = metadata.get('priority', '') + issue_type = metadata.get('issue_type', '') + + title = f"Jira: {issue_key} - {issue_title}" + if status: + title += f" ({status})" + + description = doc.content[:100] + "..." 
if len(doc.content) > 100 else doc.content
+
+                # Construct Jira URL if we have the base URL
+                base_url = metadata.get('base_url', '')
+                if base_url and issue_key:
+                    url = f"{base_url}/browse/{issue_key}"
+                else:
+                    url = ""
+
             elif doc_type == "EXTENSION":
                 # Extract Extension-specific metadata
                 webpage_title = metadata.get('VisitedWebPageTitle', doc.title)
@@ -227,6 +248,7 @@ async def fetch_documents_by_ids(
                 "GITHUB_CONNECTOR": "GitHub (Selected)",
                 "YOUTUBE_VIDEO": "YouTube Videos (Selected)",
                 "DISCORD_CONNECTOR": "Discord (Selected)",
+                "JIRA_CONNECTOR": "Jira Issues (Selected)",
                 "EXTENSION": "Browser Extension (Selected)",
                 "CRAWLED_URL": "Web Pages (Selected)",
                 "FILE": "Files (Selected)"
@@ -741,6 +763,30 @@ async def fetch_relevant_documents(
                     }
                 )
 
+            elif connector == "JIRA_CONNECTOR":
+                source_object, jira_chunks = await connector_service.search_jira(
+                    user_query=reformulated_query,
+                    user_id=user_id,
+                    search_space_id=search_space_id,
+                    top_k=top_k,
+                    search_mode=search_mode
+                )
+
+                # Add to sources and raw documents
+                if source_object:
+                    all_sources.append(source_object)
+                all_raw_documents.extend(jira_chunks)
+
+                # Stream found document count
+                if streaming_service and writer:
+                    writer(
+                        {
+                            "yield_value": streaming_service.format_terminal_info_delta(
+                                f"🎫 Found {len(jira_chunks)} Jira issues related to your query"
+                            )
+                        }
+                    )
+
         except Exception as e:
             error_message = f"Error searching connector {connector}: {str(e)}"
             print(error_message)
diff --git a/surfsense_backend/app/agents/researcher/qna_agent/prompts.py b/surfsense_backend/app/agents/researcher/qna_agent/prompts.py
index eed07224b..d726dfd1d 100644
--- a/surfsense_backend/app/agents/researcher/qna_agent/prompts.py
+++ b/surfsense_backend/app/agents/researcher/qna_agent/prompts.py
@@ -15,6 +15,7 @@ You are SurfSense, an advanced AI research assistant that provides detailed, wel
 - YOUTUBE_VIDEO: "YouTube video transcripts and metadata" (personally saved videos)
 - GITHUB_CONNECTOR: "GitHub repository content and issues" (personal repositories and interactions)
 - LINEAR_CONNECTOR: "Linear project issues and discussions" (personal project management)
+- JIRA_CONNECTOR: "Jira project issues, tickets, and comments" (personal project tracking)
 - DISCORD_CONNECTOR: "Discord server messages and channels" (personal community interactions)
 - TAVILY_API: "Tavily search API results" (personalized search results)
 - LINKUP_API: "Linkup search API results" (personalized search results)
diff --git a/surfsense_backend/app/agents/researcher/utils.py b/surfsense_backend/app/agents/researcher/utils.py
index c4991cc9f..647e00003 100644
--- a/surfsense_backend/app/agents/researcher/utils.py
+++ b/surfsense_backend/app/agents/researcher/utils.py
@@ -33,6 +33,8 @@ def get_connector_emoji(connector_name: str) -> str:
         "NOTION_CONNECTOR": "📘",
         "GITHUB_CONNECTOR": "🐙",
         "LINEAR_CONNECTOR": "📊",
+        "JIRA_CONNECTOR": "🎫",
+        "DISCORD_CONNECTOR": "🗨️",
         "TAVILY_API": "🔍",
         "LINKUP_API": "🔗"
     }
@@ -50,6 +52,8 @@ def get_connector_friendly_name(connector_name: str) -> str:
         "NOTION_CONNECTOR": "Notion",
         "GITHUB_CONNECTOR": "GitHub",
         "LINEAR_CONNECTOR": "Linear",
+        "JIRA_CONNECTOR": "Jira",
+        "DISCORD_CONNECTOR": "Discord",
         "TAVILY_API": "Tavily Search",
         "LINKUP_API": "Linkup Search"
     }
diff --git a/surfsense_backend/app/connectors/test_jira_connector.py b/surfsense_backend/app/connectors/test_jira_connector.py
new file mode 100644
index 000000000..c9b755152
---
/dev/null +++ b/surfsense_backend/app/connectors/test_jira_connector.py @@ -0,0 +1,218 @@ +import unittest +from unittest.mock import patch, Mock +from datetime import datetime + +# Import the JiraConnector +from .jira_connector import JiraConnector + + +class TestJiraConnector(unittest.TestCase): + + def setUp(self): + """Set up test fixtures.""" + self.base_url = "https://test.atlassian.net" + self.token = "test_token" + self.connector = JiraConnector(base_url=self.base_url, personal_access_token=self.token) + + def test_init(self): + """Test JiraConnector initialization.""" + self.assertEqual(self.connector.base_url, self.base_url) + self.assertEqual(self.connector.personal_access_token, self.token) + self.assertEqual(self.connector.api_version, "3") + + def test_init_with_trailing_slash(self): + """Test JiraConnector initialization with trailing slash in URL.""" + connector = JiraConnector(base_url="https://test.atlassian.net/", personal_access_token=self.token) + self.assertEqual(connector.base_url, "https://test.atlassian.net") + + def test_set_credentials(self): + """Test setting credentials.""" + new_url = "https://newtest.atlassian.net/" + new_token = "new_token" + + self.connector.set_credentials(new_url, new_token) + + self.assertEqual(self.connector.base_url, "https://newtest.atlassian.net") + self.assertEqual(self.connector.personal_access_token, new_token) + + def test_get_headers(self): + """Test header generation.""" + headers = self.connector.get_headers() + + self.assertIn('Content-Type', headers) + self.assertIn('Authorization', headers) + self.assertIn('Accept', headers) + self.assertEqual(headers['Content-Type'], 'application/json') + self.assertEqual(headers['Accept'], 'application/json') + self.assertTrue(headers['Authorization'].startswith('Bearer ')) + + def test_get_headers_no_credentials(self): + """Test header generation without credentials.""" + connector = JiraConnector() + + with self.assertRaises(ValueError) as context: + connector.get_headers() + + self.assertIn("Jira credentials not initialized", str(context.exception)) + + @patch('requests.get') + def test_make_api_request_success(self, mock_get): + """Test successful API request.""" + mock_response = Mock() + mock_response.status_code = 200 + mock_response.json.return_value = {"test": "data"} + mock_get.return_value = mock_response + + result = self.connector.make_api_request("test/endpoint") + + self.assertEqual(result, {"test": "data"}) + mock_get.assert_called_once() + + @patch('requests.get') + def test_make_api_request_failure(self, mock_get): + """Test failed API request.""" + mock_response = Mock() + mock_response.status_code = 401 + mock_response.text = "Unauthorized" + mock_get.return_value = mock_response + + with self.assertRaises(Exception) as context: + self.connector.make_api_request("test/endpoint") + + self.assertIn("API request failed with status code 401", str(context.exception)) + + @patch.object(JiraConnector, 'make_api_request') + def test_get_all_projects(self, mock_api_request): + """Test getting all projects.""" + mock_api_request.return_value = { + "values": [ + {"id": "1", "key": "TEST", "name": "Test Project"}, + {"id": "2", "key": "DEMO", "name": "Demo Project"} + ] + } + + projects = self.connector.get_all_projects() + + self.assertEqual(len(projects), 2) + self.assertEqual(projects[0]["key"], "TEST") + self.assertEqual(projects[1]["key"], "DEMO") + mock_api_request.assert_called_once_with("project") + + @patch.object(JiraConnector, 'make_api_request') + def 
test_get_all_issues(self, mock_api_request): + """Test getting all issues.""" + mock_api_request.return_value = { + "issues": [ + { + "id": "1", + "key": "TEST-1", + "fields": { + "summary": "Test Issue", + "description": "Test Description", + "status": {"name": "Open"}, + "priority": {"name": "High"}, + "issuetype": {"name": "Bug"}, + "project": {"key": "TEST"}, + "created": "2023-01-01T10:00:00.000+0000", + "updated": "2023-01-01T12:00:00.000+0000" + } + } + ], + "total": 1 + } + + issues = self.connector.get_all_issues() + + self.assertEqual(len(issues), 1) + self.assertEqual(issues[0]["key"], "TEST-1") + self.assertEqual(issues[0]["fields"]["summary"], "Test Issue") + + def test_format_issue(self): + """Test issue formatting.""" + raw_issue = { + "id": "1", + "key": "TEST-1", + "fields": { + "summary": "Test Issue", + "description": "Test Description", + "status": {"name": "Open", "statusCategory": {"name": "To Do"}}, + "priority": {"name": "High"}, + "issuetype": {"name": "Bug"}, + "project": {"key": "TEST"}, + "created": "2023-01-01T10:00:00.000+0000", + "updated": "2023-01-01T12:00:00.000+0000", + "reporter": { + "accountId": "123", + "displayName": "John Doe", + "emailAddress": "john@example.com" + }, + "assignee": { + "accountId": "456", + "displayName": "Jane Smith", + "emailAddress": "jane@example.com" + } + } + } + + formatted = self.connector.format_issue(raw_issue) + + self.assertEqual(formatted["id"], "1") + self.assertEqual(formatted["key"], "TEST-1") + self.assertEqual(formatted["title"], "Test Issue") + self.assertEqual(formatted["status"], "Open") + self.assertEqual(formatted["priority"], "High") + self.assertEqual(formatted["issue_type"], "Bug") + self.assertEqual(formatted["project"], "TEST") + self.assertEqual(formatted["reporter"]["display_name"], "John Doe") + self.assertEqual(formatted["assignee"]["display_name"], "Jane Smith") + + def test_format_date(self): + """Test date formatting.""" + iso_date = "2023-01-01T10:30:00.000+0000" + formatted_date = JiraConnector.format_date(iso_date) + + self.assertEqual(formatted_date, "2023-01-01 10:30:00") + + def test_format_date_invalid(self): + """Test date formatting with invalid input.""" + formatted_date = JiraConnector.format_date("invalid-date") + self.assertEqual(formatted_date, "invalid-date") + + formatted_date = JiraConnector.format_date("") + self.assertEqual(formatted_date, "Unknown date") + + formatted_date = JiraConnector.format_date(None) + self.assertEqual(formatted_date, "Unknown date") + + def test_format_issue_to_markdown(self): + """Test issue to markdown conversion.""" + formatted_issue = { + "key": "TEST-1", + "title": "Test Issue", + "status": "Open", + "priority": "High", + "issue_type": "Bug", + "project": "TEST", + "assignee": {"display_name": "Jane Smith"}, + "reporter": {"display_name": "John Doe"}, + "created_at": "2023-01-01T10:00:00.000+0000", + "updated_at": "2023-01-01T12:00:00.000+0000", + "description": "Test Description", + "comments": [] + } + + markdown = self.connector.format_issue_to_markdown(formatted_issue) + + self.assertIn("# TEST-1: Test Issue", markdown) + self.assertIn("**Status:** Open", markdown) + self.assertIn("**Priority:** High", markdown) + self.assertIn("**Type:** Bug", markdown) + self.assertIn("**Project:** TEST", markdown) + self.assertIn("**Assignee:** Jane Smith", markdown) + self.assertIn("**Reporter:** John Doe", markdown) + self.assertIn("## Description", markdown) + self.assertIn("Test Description", markdown) + + +if __name__ == '__main__': + unittest.main() 
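To see how the connector exercised by these tests is driven end to end, here is a hedged usage sketch — the instance URL and token are placeholders, and get_issues_by_date_range is the same call the indexing task below relies on:

    from app.connectors.jira_connector import JiraConnector

    connector = JiraConnector(
        base_url="https://yourcompany.atlassian.net",  # placeholder instance URL
        personal_access_token="YOUR_PAT",  # placeholder token
    )

    projects = connector.get_all_projects()
    issues, error = connector.get_issues_by_date_range(
        start_date="2025-07-01",
        end_date="2025-07-24",
        include_comments=True,
    )
    if error:
        print(f"Jira fetch failed: {error}")
    else:
        print(f"Fetched {len(issues)} issues across {len(projects)} projects")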
diff --git a/surfsense_backend/app/routes/search_source_connectors_routes.py b/surfsense_backend/app/routes/search_source_connectors_routes.py index 54f97d6ac..33366ff1e 100644 --- a/surfsense_backend/app/routes/search_source_connectors_routes.py +++ b/surfsense_backend/app/routes/search_source_connectors_routes.py @@ -19,7 +19,7 @@ from app.schemas import SearchSourceConnectorCreate, SearchSourceConnectorUpdate from app.users import current_active_user from app.utils.check_ownership import check_ownership from pydantic import BaseModel, Field, ValidationError -from app.tasks.connectors_indexing_tasks import index_slack_messages, index_notion_pages, index_github_repos, index_linear_issues, index_discord_messages +from app.tasks.connectors_indexing_tasks import index_slack_messages, index_notion_pages, index_github_repos, index_linear_issues, index_discord_messages, index_jira_issues from app.connectors.github_connector import GitHubConnector from datetime import datetime, timedelta import logging @@ -284,6 +284,7 @@ async def index_connector_content( - NOTION_CONNECTOR: Indexes pages from all accessible Notion pages - GITHUB_CONNECTOR: Indexes code and documentation from GitHub repositories - LINEAR_CONNECTOR: Indexes issues and comments from Linear + - JIRA_CONNECTOR: Indexes issues and comments from Jira - DISCORD_CONNECTOR: Indexes messages from all accessible Discord channels Args: @@ -349,6 +350,12 @@ async def index_connector_content( background_tasks.add_task(run_linear_indexing_with_new_session, connector_id, search_space_id, str(user.id), indexing_from, indexing_to) response_message = "Linear indexing started in the background." + elif connector.connector_type == SearchSourceConnectorType.JIRA_CONNECTOR: + # Run indexing in background + logger.info(f"Triggering Jira indexing for connector {connector_id} into search space {search_space_id} from {indexing_from} to {indexing_to}") + background_tasks.add_task(run_jira_indexing_with_new_session, connector_id, search_space_id, str(user.id), indexing_from, indexing_to) + response_message = "Jira indexing started in the background." 
+ elif connector.connector_type == SearchSourceConnectorType.DISCORD_CONNECTOR: # Run indexing in background logger.info( @@ -647,4 +654,45 @@ async def run_discord_indexing( else: logger.error(f"Discord indexing failed or no documents processed: {error_or_warning}") except Exception as e: - logger.error(f"Error in background Discord indexing task: {str(e)}") \ No newline at end of file + logger.error(f"Error in background Discord indexing task: {str(e)}") + + +# Add new helper functions for Jira indexing +async def run_jira_indexing_with_new_session( + connector_id: int, + search_space_id: int, + user_id: str, + start_date: str, + end_date: str +): + """Wrapper to run Jira indexing with its own database session.""" + logger.info(f"Background task started: Indexing Jira connector {connector_id} into space {search_space_id} from {start_date} to {end_date}") + async with async_session_maker() as session: + await run_jira_indexing(session, connector_id, search_space_id, user_id, start_date, end_date) + logger.info(f"Background task finished: Indexing Jira connector {connector_id}") + +async def run_jira_indexing( + session: AsyncSession, + connector_id: int, + search_space_id: int, + user_id: str, + start_date: str, + end_date: str +): + """Runs the Jira indexing task and updates the timestamp.""" + try: + indexed_count, error_message = await index_jira_issues( + session, connector_id, search_space_id, user_id, start_date, end_date, update_last_indexed=False + ) + if error_message: + logger.error(f"Jira indexing failed for connector {connector_id}: {error_message}") + # Optionally update status in DB to indicate failure + else: + logger.info(f"Jira indexing successful for connector {connector_id}. Indexed {indexed_count} documents.") + # Update the last indexed timestamp only on success + await update_connector_last_indexed(session, connector_id) + await session.commit() # Commit timestamp update + except Exception as e: + await session.rollback() + logger.error(f"Critical error in run_jira_indexing for connector {connector_id}: {e}", exc_info=True) + # Optionally update status in DB to indicate failure \ No newline at end of file diff --git a/surfsense_backend/app/schemas/search_source_connector.py b/surfsense_backend/app/schemas/search_source_connector.py index 1225d54fc..17f1867b1 100644 --- a/surfsense_backend/app/schemas/search_source_connector.py +++ b/surfsense_backend/app/schemas/search_source_connector.py @@ -101,6 +101,19 @@ class SearchSourceConnectorBase(BaseModel): # Ensure the bot token is not empty if not config.get("DISCORD_BOT_TOKEN"): raise ValueError("DISCORD_BOT_TOKEN cannot be empty") + elif connector_type == SearchSourceConnectorType.JIRA_CONNECTOR: + # For JIRA_CONNECTOR, allow JIRA_PERSONAL_ACCESS_TOKEN and JIRA_BASE_URL + allowed_keys = ["JIRA_PERSONAL_ACCESS_TOKEN", "JIRA_BASE_URL"] + if set(config.keys()) != set(allowed_keys): + raise ValueError(f"For JIRA_CONNECTOR connector type, config must only contain these keys: {allowed_keys}") + + # Ensure the token is not empty + if not config.get("JIRA_PERSONAL_ACCESS_TOKEN"): + raise ValueError("JIRA_PERSONAL_ACCESS_TOKEN cannot be empty") + + # Ensure the base URL is not empty + if not config.get("JIRA_BASE_URL"): + raise ValueError("JIRA_BASE_URL cannot be empty") return config diff --git a/surfsense_backend/app/services/connector_service.py b/surfsense_backend/app/services/connector_service.py index 8c6f99c5f..45293661e 100644 --- a/surfsense_backend/app/services/connector_service.py +++ 
b/surfsense_backend/app/services/connector_service.py @@ -1,15 +1,21 @@ -from typing import List, Dict, Optional import asyncio -from sqlalchemy.ext.asyncio import AsyncSession -from sqlalchemy.future import select -from app.retriver.chunks_hybrid_search import ChucksHybridSearchRetriever -from app.retriver.documents_hybrid_search import DocumentHybridSearchRetriever -from app.db import SearchSourceConnector, SearchSourceConnectorType, Chunk, Document, SearchSpace -from tavily import TavilyClient -from linkup import LinkupClient -from sqlalchemy import func +from typing import Dict, List, Optional from app.agents.researcher.configuration import SearchMode +from app.db import ( + Chunk, + Document, + SearchSourceConnector, + SearchSourceConnectorType, + SearchSpace, +) +from app.retriver.chunks_hybrid_search import ChucksHybridSearchRetriever +from app.retriver.documents_hybrid_search import DocumentHybridSearchRetriever +from linkup import LinkupClient +from sqlalchemy import func +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.future import select +from tavily import TavilyClient class ConnectorService: @@ -18,9 +24,13 @@ class ConnectorService: self.chunk_retriever = ChucksHybridSearchRetriever(session) self.document_retriever = DocumentHybridSearchRetriever(session) self.user_id = user_id - self.source_id_counter = 100000 # High starting value to avoid collisions with existing IDs - self.counter_lock = asyncio.Lock() # Lock to protect counter in multithreaded environments - + self.source_id_counter = ( + 100000 # High starting value to avoid collisions with existing IDs + ) + self.counter_lock = ( + asyncio.Lock() + ) # Lock to protect counter in multithreaded environments + async def initialize_counter(self): """ Initialize the source_id_counter based on the total number of chunks for the user. 
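# ---------------------------------------------------------------------------
# Editorial sketch (not part of the patch): the source_id_counter/counter_lock
# pattern from ConnectorService above, reduced to its essentials. Concurrent
# searches share one counter, so each read-and-increment happens under the
# asyncio.Lock to guarantee every source gets a distinct ID.
import asyncio

class _CounterDemo:
    def __init__(self) -> None:
        self.source_id_counter = 100000  # high floor, as in ConnectorService
        self.counter_lock = asyncio.Lock()

    async def next_id(self) -> int:
        async with self.counter_lock:
            value = self.source_id_counter
            self.source_id_counter += 1
            return value

async def _demo() -> None:
    counter = _CounterDemo()
    ids = await asyncio.gather(*(counter.next_id() for _ in range(1000)))
    assert len(set(ids)) == 1000  # no duplicate IDs under concurrency

asyncio.run(_demo())
# ---------------------------------------------------------------------------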
@@ -38,16 +48,25 @@ class ConnectorService: ) chunk_count = result.scalar() or 0 self.source_id_counter = chunk_count + 1 - print(f"Initialized source_id_counter to {self.source_id_counter} for user {self.user_id}") + print( + f"Initialized source_id_counter to {self.source_id_counter} for user {self.user_id}" + ) except Exception as e: print(f"Error initializing source_id_counter: {str(e)}") # Fallback to default value self.source_id_counter = 1 - - async def search_crawled_urls(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS) -> tuple: + + async def search_crawled_urls( + self, + user_query: str, + user_id: str, + search_space_id: int, + top_k: int = 20, + search_mode: SearchMode = SearchMode.CHUNKS, + ) -> tuple: """ Search for crawled URLs and return both the source information and langchain documents - + Returns: tuple: (sources_info, langchain_documents) """ @@ -57,7 +76,7 @@ class ConnectorService: top_k=top_k, user_id=user_id, search_space_id=search_space_id, - document_type="CRAWLED_URL" + document_type="CRAWLED_URL", ) elif search_mode == SearchMode.DOCUMENTS: crawled_urls_chunks = await self.document_retriever.hybrid_search( @@ -65,7 +84,7 @@ class ConnectorService: top_k=top_k, user_id=user_id, search_space_id=search_space_id, - document_type="CRAWLED_URL" + document_type="CRAWLED_URL", ) # Transform document retriever results to match expected format crawled_urls_chunks = self._transform_document_results(crawled_urls_chunks) @@ -84,20 +103,23 @@ class ConnectorService: async with self.counter_lock: for _i, chunk in enumerate(crawled_urls_chunks): # Extract document metadata - document = chunk.get('document', {}) - metadata = document.get('metadata', {}) + document = chunk.get("document", {}) + metadata = document.get("metadata", {}) # Create a source entry source = { - "id": document.get('id', self.source_id_counter), - "title": document.get('title', 'Untitled Document'), - "description": metadata.get('og:description', metadata.get('ogDescription', chunk.get('content', '')[:100])), - "url": metadata.get('url', '') + "id": document.get("id", self.source_id_counter), + "title": document.get("title", "Untitled Document"), + "description": metadata.get( + "og:description", + metadata.get("ogDescription", chunk.get("content", "")[:100]), + ), + "url": metadata.get("url", ""), } self.source_id_counter += 1 sources_list.append(source) - + # Create result object result_object = { "id": 1, @@ -105,13 +127,20 @@ class ConnectorService: "type": "CRAWLED_URL", "sources": sources_list, } - + return result_object, crawled_urls_chunks - - async def search_files(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS) -> tuple: + + async def search_files( + self, + user_query: str, + user_id: str, + search_space_id: int, + top_k: int = 20, + search_mode: SearchMode = SearchMode.CHUNKS, + ) -> tuple: """ Search for files and return both the source information and langchain documents - + Returns: tuple: (sources_info, langchain_documents) """ @@ -121,7 +150,7 @@ class ConnectorService: top_k=top_k, user_id=user_id, search_space_id=search_space_id, - document_type="FILE" + document_type="FILE", ) elif search_mode == SearchMode.DOCUMENTS: files_chunks = await self.document_retriever.hybrid_search( @@ -129,11 +158,11 @@ class ConnectorService: top_k=top_k, user_id=user_id, search_space_id=search_space_id, - document_type="FILE" + document_type="FILE", ) # 
Transform document retriever results to match expected format files_chunks = self._transform_document_results(files_chunks) - + # Early return if no results if not files_chunks: return { @@ -148,20 +177,23 @@ class ConnectorService: async with self.counter_lock: for _i, chunk in enumerate(files_chunks): # Extract document metadata - document = chunk.get('document', {}) - metadata = document.get('metadata', {}) + document = chunk.get("document", {}) + metadata = document.get("metadata", {}) # Create a source entry source = { - "id": document.get('id', self.source_id_counter), - "title": document.get('title', 'Untitled Document'), - "description": metadata.get('og:description', metadata.get('ogDescription', chunk.get('content', '')[:100])), - "url": metadata.get('url', '') + "id": document.get("id", self.source_id_counter), + "title": document.get("title", "Untitled Document"), + "description": metadata.get( + "og:description", + metadata.get("ogDescription", chunk.get("content", "")[:100]), + ), + "url": metadata.get("url", ""), } self.source_id_counter += 1 sources_list.append(source) - + # Create result object result_object = { "id": 2, @@ -169,69 +201,76 @@ class ConnectorService: "type": "FILE", "sources": sources_list, } - + return result_object, files_chunks - + def _transform_document_results(self, document_results: List[Dict]) -> List[Dict]: """ Transform results from document_retriever.hybrid_search() to match the format expected by the processing code. - + Args: document_results: Results from document_retriever.hybrid_search() - + Returns: List of transformed results in the format expected by the processing code """ transformed_results = [] for doc in document_results: - transformed_results.append({ - 'document': { - 'id': doc.get('document_id'), - 'title': doc.get('title', 'Untitled Document'), - 'document_type': doc.get('document_type'), - 'metadata': doc.get('metadata', {}), - }, - 'content': doc.get('chunks_content', doc.get('content', '')), - 'score': doc.get('score', 0.0) - }) + transformed_results.append( + { + "document": { + "id": doc.get("document_id"), + "title": doc.get("title", "Untitled Document"), + "document_type": doc.get("document_type"), + "metadata": doc.get("metadata", {}), + }, + "content": doc.get("chunks_content", doc.get("content", "")), + "score": doc.get("score", 0.0), + } + ) return transformed_results - - async def get_connector_by_type(self, user_id: str, connector_type: SearchSourceConnectorType) -> Optional[SearchSourceConnector]: + + async def get_connector_by_type( + self, user_id: str, connector_type: SearchSourceConnectorType + ) -> Optional[SearchSourceConnector]: """ Get a connector by type for a specific user - + Args: user_id: The user's ID connector_type: The connector type to retrieve - + Returns: Optional[SearchSourceConnector]: The connector if found, None otherwise """ result = await self.session.execute( - select(SearchSourceConnector) - .filter( + select(SearchSourceConnector).filter( SearchSourceConnector.user_id == user_id, - SearchSourceConnector.connector_type == connector_type + SearchSourceConnector.connector_type == connector_type, ) ) return result.scalars().first() - - async def search_tavily(self, user_query: str, user_id: str, top_k: int = 20) -> tuple: + + async def search_tavily( + self, user_query: str, user_id: str, top_k: int = 20 + ) -> tuple: """ Search using Tavily API and return both the source information and documents - + Args: user_query: The user's query user_id: The user's ID top_k: Maximum number of results 
to return - + Returns: tuple: (sources_info, documents) """ # Get Tavily connector configuration - tavily_connector = await self.get_connector_by_type(user_id, SearchSourceConnectorType.TAVILY_API) - + tavily_connector = await self.get_connector_by_type( + user_id, SearchSourceConnectorType.TAVILY_API + ) + if not tavily_connector: # Return empty results if no Tavily connector is configured return { @@ -240,22 +279,22 @@ class ConnectorService: "type": "TAVILY_API", "sources": [], }, [] - + # Initialize Tavily client with API key from connector config tavily_api_key = tavily_connector.config.get("TAVILY_API_KEY") tavily_client = TavilyClient(api_key=tavily_api_key) - + # Perform search with Tavily try: response = tavily_client.search( query=user_query, max_results=top_k, - search_depth="advanced" # Use advanced search for better results + search_depth="advanced", # Use advanced search for better results ) - + # Extract results from Tavily response tavily_results = response.get("results", []) - + # Early return if no results if not tavily_results: return { @@ -264,23 +303,22 @@ class ConnectorService: "type": "TAVILY_API", "sources": [], }, [] - + # Process each result and create sources directly without deduplication sources_list = [] documents = [] - + async with self.counter_lock: for i, result in enumerate(tavily_results): - # Create a source entry source = { "id": self.source_id_counter, "title": result.get("title", "Tavily Result"), "description": result.get("content", "")[:100], - "url": result.get("url", "") + "url": result.get("url", ""), } sources_list.append(source) - + # Create a document entry document = { "chunk_id": f"tavily_chunk_{i}", @@ -293,9 +331,9 @@ class ConnectorService: "metadata": { "url": result.get("url", ""), "published_date": result.get("published_date", ""), - "source": "TAVILY_API" - } - } + "source": "TAVILY_API", + }, + }, } documents.append(document) self.source_id_counter += 1 @@ -307,9 +345,9 @@ class ConnectorService: "type": "TAVILY_API", "sources": sources_list, } - + return result_object, documents - + except Exception as e: # Log the error and return empty results print(f"Error searching with Tavily: {str(e)}") @@ -319,11 +357,18 @@ class ConnectorService: "type": "TAVILY_API", "sources": [], }, [] - - async def search_slack(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS) -> tuple: + + async def search_slack( + self, + user_query: str, + user_id: str, + search_space_id: int, + top_k: int = 20, + search_mode: SearchMode = SearchMode.CHUNKS, + ) -> tuple: """ Search for slack and return both the source information and langchain documents - + Returns: tuple: (sources_info, langchain_documents) """ @@ -333,7 +378,7 @@ class ConnectorService: top_k=top_k, user_id=user_id, search_space_id=search_space_id, - document_type="SLACK_CONNECTOR" + document_type="SLACK_CONNECTOR", ) elif search_mode == SearchMode.DOCUMENTS: slack_chunks = await self.document_retriever.hybrid_search( @@ -341,11 +386,11 @@ class ConnectorService: top_k=top_k, user_id=user_id, search_space_id=search_space_id, - document_type="SLACK_CONNECTOR" + document_type="SLACK_CONNECTOR", ) # Transform document retriever results to match expected format slack_chunks = self._transform_document_results(slack_chunks) - + # Early return if no results if not slack_chunks: return { @@ -360,31 +405,31 @@ class ConnectorService: async with self.counter_lock: for _i, chunk in enumerate(slack_chunks): # Extract document metadata - 
document = chunk.get('document', {}) - metadata = document.get('metadata', {}) + document = chunk.get("document", {}) + metadata = document.get("metadata", {}) # Create a mapped source entry with Slack-specific metadata - channel_name = metadata.get('channel_name', 'Unknown Channel') - channel_id = metadata.get('channel_id', '') - message_date = metadata.get('start_date', '') - + channel_name = metadata.get("channel_name", "Unknown Channel") + channel_id = metadata.get("channel_id", "") + message_date = metadata.get("start_date", "") + # Create a more descriptive title for Slack messages title = f"Slack: {channel_name}" if message_date: title += f" ({message_date})" - + # Create a more descriptive description for Slack messages - description = chunk.get('content', '')[:100] + description = chunk.get("content", "")[:100] if len(description) == 100: description += "..." - + # For URL, we can use a placeholder or construct a URL to the Slack channel if available url = "" if channel_id: url = f"https://slack.com/app_redirect?channel={channel_id}" source = { - "id": document.get('id', self.source_id_counter), + "id": document.get("id", self.source_id_counter), "title": title, "description": description, "url": url, @@ -392,7 +437,7 @@ class ConnectorService: self.source_id_counter += 1 sources_list.append(source) - + # Create result object result_object = { "id": 4, @@ -400,19 +445,26 @@ class ConnectorService: "type": "SLACK_CONNECTOR", "sources": sources_list, } - + return result_object, slack_chunks - - async def search_notion(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS) -> tuple: + + async def search_notion( + self, + user_query: str, + user_id: str, + search_space_id: int, + top_k: int = 20, + search_mode: SearchMode = SearchMode.CHUNKS, + ) -> tuple: """ Search for Notion pages and return both the source information and langchain documents - + Args: user_query: The user's query user_id: The user's ID search_space_id: The search space ID to search in top_k: Maximum number of results to return - + Returns: tuple: (sources_info, langchain_documents) """ @@ -422,7 +474,7 @@ class ConnectorService: top_k=top_k, user_id=user_id, search_space_id=search_space_id, - document_type="NOTION_CONNECTOR" + document_type="NOTION_CONNECTOR", ) elif search_mode == SearchMode.DOCUMENTS: notion_chunks = await self.document_retriever.hybrid_search( @@ -430,11 +482,11 @@ class ConnectorService: top_k=top_k, user_id=user_id, search_space_id=search_space_id, - document_type="NOTION_CONNECTOR" + document_type="NOTION_CONNECTOR", ) # Transform document retriever results to match expected format notion_chunks = self._transform_document_results(notion_chunks) - + # Early return if no results if not notion_chunks: return { @@ -449,24 +501,24 @@ class ConnectorService: async with self.counter_lock: for _i, chunk in enumerate(notion_chunks): # Extract document metadata - document = chunk.get('document', {}) - metadata = document.get('metadata', {}) + document = chunk.get("document", {}) + metadata = document.get("metadata", {}) # Create a mapped source entry with Notion-specific metadata - page_title = metadata.get('page_title', 'Untitled Page') - page_id = metadata.get('page_id', '') - indexed_at = metadata.get('indexed_at', '') - + page_title = metadata.get("page_title", "Untitled Page") + page_id = metadata.get("page_id", "") + indexed_at = metadata.get("indexed_at", "") + # Create a more descriptive title for Notion pages title = f"Notion: 
{page_title}" if indexed_at: title += f" (indexed: {indexed_at})" - + # Create a more descriptive description for Notion pages - description = chunk.get('content', '')[:100] + description = chunk.get("content", "")[:100] if len(description) == 100: description += "..." - + # For URL, we can use a placeholder or construct a URL to the Notion page if available url = "" if page_id: @@ -474,7 +526,7 @@ class ConnectorService: url = f"https://notion.so/{page_id.replace('-', '')}" source = { - "id": document.get('id', self.source_id_counter), + "id": document.get("id", self.source_id_counter), "title": title, "description": description, "url": url, @@ -482,7 +534,7 @@ class ConnectorService: self.source_id_counter += 1 sources_list.append(source) - + # Create result object result_object = { "id": 5, @@ -490,19 +542,26 @@ class ConnectorService: "type": "NOTION_CONNECTOR", "sources": sources_list, } - + return result_object, notion_chunks - - async def search_extension(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS) -> tuple: + + async def search_extension( + self, + user_query: str, + user_id: str, + search_space_id: int, + top_k: int = 20, + search_mode: SearchMode = SearchMode.CHUNKS, + ) -> tuple: """ Search for extension data and return both the source information and langchain documents - + Args: user_query: The user's query user_id: The user's ID search_space_id: The search space ID to search in top_k: Maximum number of results to return - + Returns: tuple: (sources_info, langchain_documents) """ @@ -512,7 +571,7 @@ class ConnectorService: top_k=top_k, user_id=user_id, search_space_id=search_space_id, - document_type="EXTENSION" + document_type="EXTENSION", ) elif search_mode == SearchMode.DOCUMENTS: extension_chunks = await self.document_retriever.hybrid_search( @@ -520,7 +579,7 @@ class ConnectorService: top_k=top_k, user_id=user_id, search_space_id=search_space_id, - document_type="EXTENSION" + document_type="EXTENSION", ) # Transform document retriever results to match expected format extension_chunks = self._transform_document_results(extension_chunks) @@ -539,33 +598,39 @@ class ConnectorService: async with self.counter_lock: for i, chunk in enumerate(extension_chunks): # Extract document metadata - document = chunk.get('document', {}) - metadata = document.get('metadata', {}) + document = chunk.get("document", {}) + metadata = document.get("metadata", {}) # Extract extension-specific metadata - webpage_title = metadata.get('VisitedWebPageTitle', 'Untitled Page') - webpage_url = metadata.get('VisitedWebPageURL', '') - visit_date = metadata.get('VisitedWebPageDateWithTimeInISOString', '') - visit_duration = metadata.get('VisitedWebPageVisitDurationInMilliseconds', '') - browsing_session_id = metadata.get('BrowsingSessionId', '') - + webpage_title = metadata.get("VisitedWebPageTitle", "Untitled Page") + webpage_url = metadata.get("VisitedWebPageURL", "") + visit_date = metadata.get("VisitedWebPageDateWithTimeInISOString", "") + visit_duration = metadata.get( + "VisitedWebPageVisitDurationInMilliseconds", "" + ) + browsing_session_id = metadata.get("BrowsingSessionId", "") + # Create a more descriptive title for extension data title = webpage_title if visit_date: # Format the date for display (simplified) try: # Just extract the date part for display - formatted_date = visit_date.split('T')[0] if 'T' in visit_date else visit_date + formatted_date = ( + visit_date.split("T")[0] + if "T" in visit_date + else 
visit_date + ) title += f" (visited: {formatted_date})" except: # Fallback if date parsing fails title += f" (visited: {visit_date})" - + # Create a more descriptive description for extension data - description = chunk.get('content', '')[:100] + description = chunk.get("content", "")[:100] if len(description) == 100: description += "..." - + # Add visit duration if available if visit_duration: try: @@ -573,8 +638,8 @@ class ConnectorService: if duration_seconds < 60: duration_text = f"{duration_seconds:.1f} seconds" else: - duration_text = f"{duration_seconds/60:.1f} minutes" - + duration_text = f"{duration_seconds / 60:.1f} minutes" + if description: description += f" | Duration: {duration_text}" except: @@ -582,15 +647,15 @@ class ConnectorService: pass source = { - "id": document.get('id', self.source_id_counter), + "id": document.get("id", self.source_id_counter), "title": title, "description": description, - "url": webpage_url + "url": webpage_url, } self.source_id_counter += 1 sources_list.append(source) - + # Create result object result_object = { "id": 6, @@ -598,19 +663,26 @@ class ConnectorService: "type": "EXTENSION", "sources": sources_list, } - + return result_object, extension_chunks - - async def search_youtube(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS) -> tuple: + + async def search_youtube( + self, + user_query: str, + user_id: str, + search_space_id: int, + top_k: int = 20, + search_mode: SearchMode = SearchMode.CHUNKS, + ) -> tuple: """ Search for YouTube videos and return both the source information and langchain documents - + Args: user_query: The user's query user_id: The user's ID search_space_id: The search space ID to search in top_k: Maximum number of results to return - + Returns: tuple: (sources_info, langchain_documents) """ @@ -620,7 +692,7 @@ class ConnectorService: top_k=top_k, user_id=user_id, search_space_id=search_space_id, - document_type="YOUTUBE_VIDEO" + document_type="YOUTUBE_VIDEO", ) elif search_mode == SearchMode.DOCUMENTS: youtube_chunks = await self.document_retriever.hybrid_search( @@ -628,11 +700,11 @@ class ConnectorService: top_k=top_k, user_id=user_id, search_space_id=search_space_id, - document_type="YOUTUBE_VIDEO" + document_type="YOUTUBE_VIDEO", ) # Transform document retriever results to match expected format youtube_chunks = self._transform_document_results(youtube_chunks) - + # Early return if no results if not youtube_chunks: return { @@ -647,40 +719,42 @@ class ConnectorService: async with self.counter_lock: for _i, chunk in enumerate(youtube_chunks): # Extract document metadata - document = chunk.get('document', {}) - metadata = document.get('metadata', {}) + document = chunk.get("document", {}) + metadata = document.get("metadata", {}) # Extract YouTube-specific metadata - video_title = metadata.get('video_title', 'Untitled Video') - video_id = metadata.get('video_id', '') - channel_name = metadata.get('channel_name', '') + video_title = metadata.get("video_title", "Untitled Video") + video_id = metadata.get("video_id", "") + channel_name = metadata.get("channel_name", "") # published_date = metadata.get('published_date', '') - + # Create a more descriptive title for YouTube videos title = video_title if channel_name: title += f" - {channel_name}" - + # Create a more descriptive description for YouTube videos - description = metadata.get('description', chunk.get('content', '')[:100]) + description = metadata.get( + "description", chunk.get("content", 
"")[:100] + ) if len(description) == 100: description += "..." - + # For URL, construct a URL to the YouTube video url = f"https://www.youtube.com/watch?v={video_id}" if video_id else "" source = { - "id": document.get('id', self.source_id_counter), + "id": document.get("id", self.source_id_counter), "title": title, "description": description, "url": url, "video_id": video_id, # Additional field for YouTube videos - "channel_name": channel_name # Additional field for YouTube videos + "channel_name": channel_name, # Additional field for YouTube videos } self.source_id_counter += 1 sources_list.append(source) - + # Create result object result_object = { "id": 7, # Assign a unique ID for the YouTube connector @@ -688,13 +762,20 @@ class ConnectorService: "type": "YOUTUBE_VIDEO", "sources": sources_list, } - + return result_object, youtube_chunks - async def search_github(self, user_query: str, user_id: int, search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS) -> tuple: + async def search_github( + self, + user_query: str, + user_id: int, + search_space_id: int, + top_k: int = 20, + search_mode: SearchMode = SearchMode.CHUNKS, + ) -> tuple: """ Search for GitHub documents and return both the source information and langchain documents - + Returns: tuple: (sources_info, langchain_documents) """ @@ -704,7 +785,7 @@ class ConnectorService: top_k=top_k, user_id=user_id, search_space_id=search_space_id, - document_type="GITHUB_CONNECTOR" + document_type="GITHUB_CONNECTOR", ) elif search_mode == SearchMode.DOCUMENTS: github_chunks = await self.document_retriever.hybrid_search( @@ -712,11 +793,11 @@ class ConnectorService: top_k=top_k, user_id=user_id, search_space_id=search_space_id, - document_type="GITHUB_CONNECTOR" + document_type="GITHUB_CONNECTOR", ) # Transform document retriever results to match expected format github_chunks = self._transform_document_results(github_chunks) - + # Early return if no results if not github_chunks: return { @@ -731,20 +812,24 @@ class ConnectorService: async with self.counter_lock: for _i, chunk in enumerate(github_chunks): # Extract document metadata - document = chunk.get('document', {}) - metadata = document.get('metadata', {}) + document = chunk.get("document", {}) + metadata = document.get("metadata", {}) # Create a source entry source = { - "id": document.get('id', self.source_id_counter), - "title": document.get('title', 'GitHub Document'), # Use specific title if available - "description": metadata.get('description', chunk.get('content', '')[:100]), # Use description or content preview - "url": metadata.get('url', '') # Use URL if available in metadata + "id": document.get("id", self.source_id_counter), + "title": document.get( + "title", "GitHub Document" + ), # Use specific title if available + "description": metadata.get( + "description", chunk.get("content", "")[:100] + ), # Use description or content preview + "url": metadata.get("url", ""), # Use URL if available in metadata } self.source_id_counter += 1 sources_list.append(source) - + # Create result object result_object = { "id": 8, @@ -752,19 +837,26 @@ class ConnectorService: "type": "GITHUB_CONNECTOR", "sources": sources_list, } - + return result_object, github_chunks - async def search_linear(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS) -> tuple: + async def search_linear( + self, + user_query: str, + user_id: str, + search_space_id: int, + top_k: int = 20, + search_mode: SearchMode = 
SearchMode.CHUNKS, + ) -> tuple: """ Search for Linear issues and comments and return both the source information and langchain documents - + Args: user_query: The user's query user_id: The user's ID search_space_id: The search space ID to search in top_k: Maximum number of results to return - + Returns: tuple: (sources_info, langchain_documents) """ @@ -774,7 +866,7 @@ class ConnectorService: top_k=top_k, user_id=user_id, search_space_id=search_space_id, - document_type="LINEAR_CONNECTOR" + document_type="LINEAR_CONNECTOR", ) elif search_mode == SearchMode.DOCUMENTS: linear_chunks = await self.document_retriever.hybrid_search( @@ -782,7 +874,7 @@ class ConnectorService: top_k=top_k, user_id=user_id, search_space_id=search_space_id, - document_type="LINEAR_CONNECTOR" + document_type="LINEAR_CONNECTOR", ) # Transform document retriever results to match expected format linear_chunks = self._transform_document_results(linear_chunks) @@ -801,32 +893,32 @@ class ConnectorService: async with self.counter_lock: for _i, chunk in enumerate(linear_chunks): # Extract document metadata - document = chunk.get('document', {}) - metadata = document.get('metadata', {}) + document = chunk.get("document", {}) + metadata = document.get("metadata", {}) # Extract Linear-specific metadata - issue_identifier = metadata.get('issue_identifier', '') - issue_title = metadata.get('issue_title', 'Untitled Issue') - issue_state = metadata.get('state', '') - comment_count = metadata.get('comment_count', 0) - + issue_identifier = metadata.get("issue_identifier", "") + issue_title = metadata.get("issue_title", "Untitled Issue") + issue_state = metadata.get("state", "") + comment_count = metadata.get("comment_count", 0) + # Create a more descriptive title for Linear issues title = f"Linear: {issue_identifier} - {issue_title}" if issue_state: title += f" ({issue_state})" - + # Create a more descriptive description for Linear issues - description = chunk.get('content', '')[:100] + description = chunk.get("content", "")[:100] if len(description) == 100: description += "..." 
- + # Add comment count info to description if comment_count: if description: description += f" | Comments: {comment_count}" else: description = f"Comments: {comment_count}" - + # For URL, we could construct a URL to the Linear issue if we have the workspace info # For now, use a generic placeholder url = "" @@ -835,18 +927,18 @@ class ConnectorService: url = f"https://linear.app/issue/{issue_identifier}" source = { - "id": document.get('id', self.source_id_counter), + "id": document.get("id", self.source_id_counter), "title": title, "description": description, "url": url, "issue_identifier": issue_identifier, "state": issue_state, - "comment_count": comment_count + "comment_count": comment_count, } self.source_id_counter += 1 sources_list.append(source) - + # Create result object result_object = { "id": 9, # Assign a unique ID for the Linear connector @@ -854,10 +946,17 @@ class ConnectorService: "type": "LINEAR_CONNECTOR", "sources": sources_list, } - + return result_object, linear_chunks - async def search_jira(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS) -> tuple: + async def search_jira( + self, + user_query: str, + user_id: str, + search_space_id: int, + top_k: int = 20, + search_mode: SearchMode = SearchMode.CHUNKS, + ) -> tuple: """ Search for Jira issues and comments and return both the source information and langchain documents @@ -877,7 +976,7 @@ class ConnectorService: top_k=top_k, user_id=user_id, search_space_id=search_space_id, - document_type="JIRA_CONNECTOR" + document_type="JIRA_CONNECTOR", ) elif search_mode == SearchMode.DOCUMENTS: jira_chunks = await self.document_retriever.hybrid_search( @@ -885,7 +984,7 @@ class ConnectorService: top_k=top_k, user_id=user_id, search_space_id=search_space_id, - document_type="JIRA_CONNECTOR" + document_type="JIRA_CONNECTOR", ) # Transform document retriever results to match expected format jira_chunks = self._transform_document_results(jira_chunks) @@ -904,16 +1003,16 @@ class ConnectorService: async with self.counter_lock: for _i, chunk in enumerate(jira_chunks): # Extract document metadata - document = chunk.get('document', {}) - metadata = document.get('metadata', {}) + document = chunk.get("document", {}) + metadata = document.get("metadata", {}) # Extract Jira-specific metadata - issue_key = metadata.get('issue_key', '') - issue_title = metadata.get('issue_title', 'Untitled Issue') - status = metadata.get('status', '') - priority = metadata.get('priority', '') - issue_type = metadata.get('issue_type', '') - comment_count = metadata.get('comment_count', 0) + issue_key = metadata.get("issue_key", "") + issue_title = metadata.get("issue_title", "Untitled Issue") + status = metadata.get("status", "") + priority = metadata.get("priority", "") + issue_type = metadata.get("issue_type", "") + comment_count = metadata.get("comment_count", 0) # Create a more descriptive title for Jira issues title = f"Jira: {issue_key} - {issue_title}" @@ -921,7 +1020,7 @@ class ConnectorService: title += f" ({status})" # Create a more descriptive description for Jira issues - description = chunk.get('content', '')[:100] + description = chunk.get("content", "")[:100] if len(description) == 100: description += "..." 
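# ---------------------------------------------------------------------------
# Editorial sketch (not part of the patch): how search_jira above assembles a
# source entry from chunk metadata. The metadata keys ("issue_key", "status",
# "base_url", ...) mirror the ones used in this diff; the sample values are
# illustrative only.
def build_jira_source(metadata: dict, content: str, source_id: int) -> dict:
    issue_key = metadata.get("issue_key", "")
    title = f"Jira: {issue_key} - {metadata.get('issue_title', 'Untitled Issue')}"
    if metadata.get("status"):
        title += f" ({metadata['status']})"
    description = content[:100]
    if len(description) == 100:
        description += "..."  # signal that the preview was truncated
    url = ""
    if issue_key and metadata.get("base_url"):
        url = f"{metadata.get('base_url')}/browse/{issue_key}"
    return {"id": source_id, "title": title, "description": description, "url": url}

print(build_jira_source(
    {"issue_key": "TEST-1", "issue_title": "Login fails", "status": "Open",
     "base_url": "https://yourcompany.atlassian.net"},
    "Users report intermittent login failures after the SSO rollout.",
    100001,
))
# ---------------------------------------------------------------------------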
@@ -938,16 +1037,16 @@ class ConnectorService: if description: description += f" | {' | '.join(info_parts)}" else: - description = ' | '.join(info_parts) + description = " | ".join(info_parts) # For URL, we could construct a URL to the Jira issue if we have the base URL # For now, use a generic placeholder url = "" - if issue_key and metadata.get('base_url'): + if issue_key and metadata.get("base_url"): url = f"{metadata.get('base_url')}/browse/{issue_key}" source = { - "id": document.get('id', self.source_id_counter), + "id": document.get("id", self.source_id_counter), "title": title, "description": description, "url": url, @@ -955,7 +1054,7 @@ class ConnectorService: "status": status, "priority": priority, "issue_type": issue_type, - "comment_count": comment_count + "comment_count": comment_count, } self.source_id_counter += 1 @@ -971,21 +1070,25 @@ class ConnectorService: return result_object, jira_chunks - async def search_linkup(self, user_query: str, user_id: str, mode: str = "standard") -> tuple: + async def search_linkup( + self, user_query: str, user_id: str, mode: str = "standard" + ) -> tuple: """ Search using Linkup API and return both the source information and documents - + Args: user_query: The user's query user_id: The user's ID mode: Search depth mode, can be "standard" or "deep" - + Returns: tuple: (sources_info, documents) """ # Get Linkup connector configuration - linkup_connector = await self.get_connector_by_type(user_id, SearchSourceConnectorType.LINKUP_API) - + linkup_connector = await self.get_connector_by_type( + user_id, SearchSourceConnectorType.LINKUP_API + ) + if not linkup_connector: # Return empty results if no Linkup connector is configured return { @@ -994,11 +1097,11 @@ class ConnectorService: "type": "LINKUP_API", "sources": [], }, [] - + # Initialize Linkup client with API key from connector config linkup_api_key = linkup_connector.config.get("LINKUP_API_KEY") linkup_client = LinkupClient(api_key=linkup_api_key) - + # Perform search with Linkup try: response = linkup_client.search( @@ -1006,10 +1109,10 @@ class ConnectorService: depth=mode, # Use the provided mode ("standard" or "deep") output_type="searchResults", # Default to search results ) - + # Extract results from Linkup response - access as attribute instead of using .get() - linkup_results = response.results if hasattr(response, 'results') else [] - + linkup_results = response.results if hasattr(response, "results") else [] + # Only proceed if we have results if not linkup_results: return { @@ -1018,41 +1121,49 @@ class ConnectorService: "type": "LINKUP_API", "sources": [], }, [] - + # Process each result and create sources directly without deduplication sources_list = [] documents = [] - + async with self.counter_lock: for i, result in enumerate(linkup_results): # Only process results that have content - if not hasattr(result, 'content') or not result.content: + if not hasattr(result, "content") or not result.content: continue - + # Create a source entry source = { "id": self.source_id_counter, - "title": result.name if hasattr(result, 'name') else "Linkup Result", - "description": result.content[:100] if hasattr(result, 'content') else "", - "url": result.url if hasattr(result, 'url') else "" + "title": ( + result.name if hasattr(result, "name") else "Linkup Result" + ), + "description": ( + result.content[:100] if hasattr(result, "content") else "" + ), + "url": result.url if hasattr(result, "url") else "", } sources_list.append(source) - + # Create a document entry document = { "chunk_id": 
f"linkup_chunk_{i}", - "content": result.content if hasattr(result, 'content') else "", + "content": result.content if hasattr(result, "content") else "", "score": 1.0, # Default score since not provided by Linkup "document": { "id": self.source_id_counter, - "title": result.name if hasattr(result, 'name') else "Linkup Result", + "title": ( + result.name + if hasattr(result, "name") + else "Linkup Result" + ), "document_type": "LINKUP_API", "metadata": { - "url": result.url if hasattr(result, 'url') else "", - "type": result.type if hasattr(result, 'type') else "", - "source": "LINKUP_API" - } - } + "url": result.url if hasattr(result, "url") else "", + "type": result.type if hasattr(result, "type") else "", + "source": "LINKUP_API", + }, + }, } documents.append(document) self.source_id_counter += 1 @@ -1064,9 +1175,9 @@ class ConnectorService: "type": "LINKUP_API", "sources": sources_list, } - + return result_object, documents - + except Exception as e: # Log the error and return empty results print(f"Error searching with Linkup: {str(e)}") @@ -1076,17 +1187,24 @@ class ConnectorService: "type": "LINKUP_API", "sources": [], }, [] - - async def search_discord(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS) -> tuple: + + async def search_discord( + self, + user_query: str, + user_id: str, + search_space_id: int, + top_k: int = 20, + search_mode: SearchMode = SearchMode.CHUNKS, + ) -> tuple: """ Search for Discord messages and return both the source information and langchain documents - + Args: user_query: The user's query user_id: The user's ID search_space_id: The search space ID to search in top_k: Maximum number of results to return - + Returns: tuple: (sources_info, langchain_documents) """ @@ -1096,7 +1214,7 @@ class ConnectorService: top_k=top_k, user_id=user_id, search_space_id=search_space_id, - document_type="DISCORD_CONNECTOR" + document_type="DISCORD_CONNECTOR", ) elif search_mode == SearchMode.DOCUMENTS: discord_chunks = await self.document_retriever.hybrid_search( @@ -1104,11 +1222,11 @@ class ConnectorService: top_k=top_k, user_id=user_id, search_space_id=search_space_id, - document_type="DISCORD_CONNECTOR" + document_type="DISCORD_CONNECTOR", ) # Transform document retriever results to match expected format discord_chunks = self._transform_document_results(discord_chunks) - + # Early return if no results if not discord_chunks: return { @@ -1123,26 +1241,26 @@ class ConnectorService: async with self.counter_lock: for i, chunk in enumerate(discord_chunks): # Extract document metadata - document = chunk.get('document', {}) - metadata = document.get('metadata', {}) + document = chunk.get("document", {}) + metadata = document.get("metadata", {}) # Create a mapped source entry with Discord-specific metadata - channel_name = metadata.get('channel_name', 'Unknown Channel') - channel_id = metadata.get('channel_id', '') - message_date = metadata.get('start_date', '') - + channel_name = metadata.get("channel_name", "Unknown Channel") + channel_id = metadata.get("channel_id", "") + message_date = metadata.get("start_date", "") + # Create a more descriptive title for Discord messages title = f"Discord: {channel_name}" if message_date: title += f" ({message_date})" - + # Create a more descriptive description for Discord messages - description = chunk.get('content', '')[:100] + description = chunk.get("content", "")[:100] if len(description) == 100: description += "..." 
- + url = "" - guild_id = metadata.get('guild_id', '') + guild_id = metadata.get("guild_id", "") if guild_id and channel_id: url = f"https://discord.com/channels/{guild_id}/{channel_id}" elif channel_id: @@ -1150,7 +1268,7 @@ class ConnectorService: url = f"https://discord.com/channels/@me/{channel_id}" source = { - "id": document.get('id', self.source_id_counter), + "id": document.get("id", self.source_id_counter), "title": title, "description": description, "url": url, @@ -1158,7 +1276,7 @@ class ConnectorService: self.source_id_counter += 1 sources_list.append(source) - + # Create result object result_object = { "id": 11, @@ -1166,7 +1284,5 @@ class ConnectorService: "type": "DISCORD_CONNECTOR", "sources": sources_list, } - + return result_object, discord_chunks - - diff --git a/surfsense_backend/app/tasks/connectors_indexing_tasks.py b/surfsense_backend/app/tasks/connectors_indexing_tasks.py index ab3bc858c..f4ae13971 100644 --- a/surfsense_backend/app/tasks/connectors_indexing_tasks.py +++ b/surfsense_backend/app/tasks/connectors_indexing_tasks.py @@ -1,28 +1,35 @@ -from typing import Optional, Tuple -from sqlalchemy.ext.asyncio import AsyncSession -from sqlalchemy.exc import SQLAlchemyError -from sqlalchemy.future import select +import asyncio +import logging from datetime import datetime, timedelta, timezone -from app.db import Document, DocumentType, Chunk, SearchSourceConnector, SearchSourceConnectorType, SearchSpace +from typing import Optional, Tuple + from app.config import config +from app.connectors.discord_connector import DiscordConnector +from app.connectors.github_connector import GitHubConnector +from app.connectors.jira_connector import JiraConnector +from app.connectors.linear_connector import LinearConnector +from app.connectors.notion_history import NotionHistoryConnector +from app.connectors.slack_history import SlackHistory +from app.db import ( + Chunk, + Document, + DocumentType, + SearchSourceConnector, + SearchSourceConnectorType, +) from app.prompts import SUMMARY_PROMPT_TEMPLATE from app.services.llm_service import get_user_long_context_llm from app.services.task_logging_service import TaskLoggingService -from app.connectors.slack_history import SlackHistory -from app.connectors.notion_history import NotionHistoryConnector -from app.connectors.github_connector import GitHubConnector -from app.connectors.linear_connector import LinearConnector -from app.connectors.discord_connector import DiscordConnector -from app.connectors.jira_connector import JiraConnector -from slack_sdk.errors import SlackApiError -import logging -import asyncio - from app.utils.document_converters import generate_content_hash +from slack_sdk.errors import SlackApiError +from sqlalchemy.exc import SQLAlchemyError +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.future import select # Set up logging logger = logging.getLogger(__name__) + async def index_slack_messages( session: AsyncSession, connector_id: int, @@ -30,56 +37,64 @@ async def index_slack_messages( user_id: str, start_date: str = None, end_date: str = None, - update_last_indexed: bool = True + update_last_indexed: bool = True, ) -> Tuple[int, Optional[str]]: """ Index Slack messages from all accessible channels. 
- + Args: session: Database session connector_id: ID of the Slack connector search_space_id: ID of the search space to store documents in update_last_indexed: Whether to update the last_indexed_at timestamp (default: True) - + Returns: Tuple containing (number of documents indexed, error message or None) """ task_logger = TaskLoggingService(session, search_space_id) - + # Log task start log_entry = await task_logger.log_task_start( task_name="slack_messages_indexing", source="connector_indexing_task", message=f"Starting Slack messages indexing for connector {connector_id}", - metadata={"connector_id": connector_id, "user_id": str(user_id), "start_date": start_date, "end_date": end_date} + metadata={ + "connector_id": connector_id, + "user_id": str(user_id), + "start_date": start_date, + "end_date": end_date, + }, ) - + try: # Get the connector await task_logger.log_task_progress( log_entry, f"Retrieving Slack connector {connector_id} from database", - {"stage": "connector_retrieval"} + {"stage": "connector_retrieval"}, ) - + result = await session.execute( - select(SearchSourceConnector) - .filter( + select(SearchSourceConnector).filter( SearchSourceConnector.id == connector_id, - SearchSourceConnector.connector_type == SearchSourceConnectorType.SLACK_CONNECTOR + SearchSourceConnector.connector_type + == SearchSourceConnectorType.SLACK_CONNECTOR, ) ) connector = result.scalars().first() - + if not connector: await task_logger.log_task_failure( log_entry, f"Connector with ID {connector_id} not found or is not a Slack connector", "Connector not found", - {"error_type": "ConnectorNotFound"} + {"error_type": "ConnectorNotFound"}, ) - return 0, f"Connector with ID {connector_id} not found or is not a Slack connector" - + return ( + 0, + f"Connector with ID {connector_id} not found or is not a Slack connector", + ) + # Get the Slack token from the connector config slack_token = connector.config.get("SLACK_BOT_TOKEN") if not slack_token: @@ -87,62 +102,86 @@ async def index_slack_messages( log_entry, f"Slack token not found in connector config for connector {connector_id}", "Missing Slack token", - {"error_type": "MissingToken"} + {"error_type": "MissingToken"}, ) return 0, "Slack token not found in connector config" - + # Initialize Slack client await task_logger.log_task_progress( log_entry, f"Initializing Slack client for connector {connector_id}", - {"stage": "client_initialization"} + {"stage": "client_initialization"}, ) - + slack_client = SlackHistory(token=slack_token) - + # Calculate date range await task_logger.log_task_progress( log_entry, - f"Calculating date range for Slack indexing", - {"stage": "date_calculation", "provided_start_date": start_date, "provided_end_date": end_date} + "Calculating date range for Slack indexing", + { + "stage": "date_calculation", + "provided_start_date": start_date, + "provided_end_date": end_date, + }, ) - + if start_date is None or end_date is None: # Fall back to calculating dates based on last_indexed_at calculated_end_date = datetime.now() - + # Use last_indexed_at as start date if available, otherwise use 365 days ago if connector.last_indexed_at: # Convert dates to be comparable (both timezone-naive) - last_indexed_naive = connector.last_indexed_at.replace(tzinfo=None) if connector.last_indexed_at.tzinfo else connector.last_indexed_at - + last_indexed_naive = ( + connector.last_indexed_at.replace(tzinfo=None) + if connector.last_indexed_at.tzinfo + else connector.last_indexed_at + ) + # Check if last_indexed_at is in the future or after end_date 
if last_indexed_naive > calculated_end_date: - logger.warning(f"Last indexed date ({last_indexed_naive.strftime('%Y-%m-%d')}) is in the future. Using 365 days ago instead.") + logger.warning( + f"Last indexed date ({last_indexed_naive.strftime('%Y-%m-%d')}) is in the future. Using 365 days ago instead." + ) calculated_start_date = calculated_end_date - timedelta(days=365) else: calculated_start_date = last_indexed_naive - logger.info(f"Using last_indexed_at ({calculated_start_date.strftime('%Y-%m-%d')}) as start date") + logger.info( + f"Using last_indexed_at ({calculated_start_date.strftime('%Y-%m-%d')}) as start date" + ) else: - calculated_start_date = calculated_end_date - timedelta(days=365) # Use 365 days as default - logger.info(f"No last_indexed_at found, using {calculated_start_date.strftime('%Y-%m-%d')} (365 days ago) as start date") - + calculated_start_date = calculated_end_date - timedelta( + days=365 + ) # Use 365 days as default + logger.info( + f"No last_indexed_at found, using {calculated_start_date.strftime('%Y-%m-%d')} (365 days ago) as start date" + ) + # Use calculated dates if not provided - start_date_str = start_date if start_date else calculated_start_date.strftime("%Y-%m-%d") - end_date_str = end_date if end_date else calculated_end_date.strftime("%Y-%m-%d") + start_date_str = ( + start_date if start_date else calculated_start_date.strftime("%Y-%m-%d") + ) + end_date_str = ( + end_date if end_date else calculated_end_date.strftime("%Y-%m-%d") + ) else: # Use provided dates start_date_str = start_date end_date_str = end_date - + logger.info(f"Indexing Slack messages from {start_date_str} to {end_date_str}") - + await task_logger.log_task_progress( log_entry, f"Fetching Slack channels from {start_date_str} to {end_date_str}", - {"stage": "fetch_channels", "start_date": start_date_str, "end_date": end_date_str} + { + "stage": "fetch_channels", + "start_date": start_date_str, + "end_date": end_date_str, + }, ) - + # Get all channels try: channels = slack_client.get_all_channels() @@ -151,133 +190,162 @@ async def index_slack_messages( log_entry, f"Failed to get Slack channels for connector {connector_id}", str(e), - {"error_type": "ChannelFetchError"} + {"error_type": "ChannelFetchError"}, ) return 0, f"Failed to get Slack channels: {str(e)}" - + if not channels: await task_logger.log_task_success( log_entry, f"No Slack channels found for connector {connector_id}", - {"channels_found": 0} + {"channels_found": 0}, ) return 0, "No Slack channels found" - + # Track the number of documents indexed documents_indexed = 0 documents_skipped = 0 skipped_channels = [] - + await task_logger.log_task_progress( log_entry, f"Starting to process {len(channels)} Slack channels", - {"stage": "process_channels", "total_channels": len(channels)} + {"stage": "process_channels", "total_channels": len(channels)}, ) - + # Process each channel - for channel_obj in channels: # Modified loop to iterate over list of channel objects + for ( + channel_obj + ) in channels: # Modified loop to iterate over list of channel objects channel_id = channel_obj["id"] channel_name = channel_obj["name"] is_private = channel_obj["is_private"] - is_member = channel_obj["is_member"] # This might be False for public channels too + is_member = channel_obj[ + "is_member" + ] # This might be False for public channels too try: # If it's a private channel and the bot is not a member, skip. # For public channels, if they are listed by conversations.list, the bot can typically read history. 
# The `not_in_channel` error in get_conversation_history will be the ultimate gatekeeper if history is inaccessible. if is_private and not is_member: - logger.warning(f"Bot is not a member of private channel {channel_name} ({channel_id}). Skipping.") - skipped_channels.append(f"{channel_name} (private, bot not a member)") + logger.warning( + f"Bot is not a member of private channel {channel_name} ({channel_id}). Skipping." + ) + skipped_channels.append( + f"{channel_name} (private, bot not a member)" + ) documents_skipped += 1 continue - + # Get messages for this channel - # The get_history_by_date_range now uses get_conversation_history, + # The get_history_by_date_range now uses get_conversation_history, # which handles 'not_in_channel' by returning [] and logging. messages, error = slack_client.get_history_by_date_range( channel_id=channel_id, start_date=start_date_str, end_date=end_date_str, - limit=1000 # Limit to 1000 messages per channel + limit=1000, # Limit to 1000 messages per channel ) - + if error: - logger.warning(f"Error getting messages from channel {channel_name}: {error}") + logger.warning( + f"Error getting messages from channel {channel_name}: {error}" + ) skipped_channels.append(f"{channel_name} (error: {error})") documents_skipped += 1 continue # Skip this channel if there's an error - + if not messages: - logger.info(f"No messages found in channel {channel_name} for the specified date range.") + logger.info( + f"No messages found in channel {channel_name} for the specified date range." + ) documents_skipped += 1 continue # Skip if no messages - + # Format messages with user info formatted_messages = [] for msg in messages: # Skip bot messages and system messages - if msg.get("subtype") in ["bot_message", "channel_join", "channel_leave"]: + if msg.get("subtype") in [ + "bot_message", + "channel_join", + "channel_leave", + ]: continue - - formatted_msg = slack_client.format_message(msg, include_user_info=True) + + formatted_msg = slack_client.format_message( + msg, include_user_info=True + ) formatted_messages.append(formatted_msg) - + if not formatted_messages: - logger.info(f"No valid messages found in channel {channel_name} after filtering.") + logger.info( + f"No valid messages found in channel {channel_name} after filtering." 
+ ) documents_skipped += 1 continue # Skip if no valid messages after filtering - + # Convert messages to markdown format channel_content = f"# Slack Channel: {channel_name}\n\n" - + for msg in formatted_messages: user_name = msg.get("user_name", "Unknown User") timestamp = msg.get("datetime", "Unknown Time") text = msg.get("text", "") - - channel_content += f"## {user_name} ({timestamp})\n\n{text}\n\n---\n\n" - + + channel_content += ( + f"## {user_name} ({timestamp})\n\n{text}\n\n---\n\n" + ) + # Format document metadata metadata_sections = [ - ("METADATA", [ - f"CHANNEL_NAME: {channel_name}", - f"CHANNEL_ID: {channel_id}", - # f"START_DATE: {start_date_str}", - # f"END_DATE: {end_date_str}", - f"MESSAGE_COUNT: {len(formatted_messages)}" - ]), - ("CONTENT", [ - "FORMAT: markdown", - "TEXT_START", - channel_content, - "TEXT_END" - ]) + ( + "METADATA", + [ + f"CHANNEL_NAME: {channel_name}", + f"CHANNEL_ID: {channel_id}", + # f"START_DATE: {start_date_str}", + # f"END_DATE: {end_date_str}", + f"MESSAGE_COUNT: {len(formatted_messages)}", + ], + ), + ( + "CONTENT", + ["FORMAT: markdown", "TEXT_START", channel_content, "TEXT_END"], + ), ] - + # Build the document string document_parts = [] document_parts.append("") - + for section_title, section_content in metadata_sections: document_parts.append(f"<{section_title}>") document_parts.extend(section_content) document_parts.append(f"") - + document_parts.append("") - combined_document_string = '\n'.join(document_parts) - content_hash = generate_content_hash(combined_document_string, search_space_id) + combined_document_string = "\n".join(document_parts) + content_hash = generate_content_hash( + combined_document_string, search_space_id + ) # Check if document with this content hash already exists existing_doc_by_hash_result = await session.execute( select(Document).where(Document.content_hash == content_hash) ) - existing_document_by_hash = existing_doc_by_hash_result.scalars().first() - + existing_document_by_hash = ( + existing_doc_by_hash_result.scalars().first() + ) + if existing_document_by_hash: - logger.info(f"Document with content hash {content_hash} already exists for channel {channel_name}. Skipping processing.") + logger.info( + f"Document with content hash {content_hash} already exists for channel {channel_name}. Skipping processing." 
+ ) documents_skipped += 1 continue - + # Get user's long context LLM user_llm = await get_user_long_context_llm(session, user_id) if not user_llm: @@ -285,19 +353,26 @@ async def index_slack_messages( skipped_channels.append(f"{channel_name} (no LLM configured)") documents_skipped += 1 continue - + # Generate summary summary_chain = SUMMARY_PROMPT_TEMPLATE | user_llm - summary_result = await summary_chain.ainvoke({"document": combined_document_string}) + summary_result = await summary_chain.ainvoke( + {"document": combined_document_string} + ) summary_content = summary_result.content - summary_embedding = config.embedding_model_instance.embed(summary_content) - + summary_embedding = config.embedding_model_instance.embed( + summary_content + ) + # Process chunks chunks = [ - Chunk(content=chunk.text, embedding=config.embedding_model_instance.embed(chunk.text)) + Chunk( + content=chunk.text, + embedding=config.embedding_model_instance.embed(chunk.text), + ) for chunk in config.chunker_instance.chunk(channel_content) ] - + # Create and store new document document = Document( search_space_id=search_space_id, @@ -309,20 +384,24 @@ async def index_slack_messages( "start_date": start_date_str, "end_date": end_date_str, "message_count": len(formatted_messages), - "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S") + "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), }, content=summary_content, embedding=summary_embedding, chunks=chunks, content_hash=content_hash, ) - + session.add(document) documents_indexed += 1 - logger.info(f"Successfully indexed new channel {channel_name} with {len(formatted_messages)} messages") - + logger.info( + f"Successfully indexed new channel {channel_name} with {len(formatted_messages)} messages" + ) + except SlackApiError as slack_error: - logger.error(f"Slack API error for channel {channel_name}: {str(slack_error)}") + logger.error( + f"Slack API error for channel {channel_name}: {str(slack_error)}" + ) skipped_channels.append(f"{channel_name} (Slack API error)") documents_skipped += 1 continue # Skip this channel and continue with others @@ -331,23 +410,23 @@ async def index_slack_messages( skipped_channels.append(f"{channel_name} (processing error)") documents_skipped += 1 continue # Skip this channel and continue with others - + # Update the last_indexed_at timestamp for the connector only if requested # and if we successfully indexed at least one channel total_processed = documents_indexed if update_last_indexed and total_processed > 0: connector.last_indexed_at = datetime.now() - + # Commit all changes await session.commit() - + # Prepare result message result_message = None if skipped_channels: result_message = f"Processed {total_processed} channels. Skipped {len(skipped_channels)} channels: {', '.join(skipped_channels)}" else: result_message = f"Processed {total_processed} channels." 
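Each connector in this module repeats the same per-document pipeline: hash, dedupe, summarize, embed, chunk. A condensed sketch of that shape using the helpers this file already imports (the function name is illustrative, and required fields such as document_type and document_metadata are omitted for brevity):

async def build_channel_document(session, content, title, search_space_id, llm):
    # Dedupe on a content hash scoped to the search space.
    content_hash = generate_content_hash(content, search_space_id)
    existing = await session.execute(
        select(Document).where(Document.content_hash == content_hash)
    )
    if existing.scalars().first():
        return None  # identical content already indexed; skip re-embedding

    # Summarize with the user's long-context LLM, then embed and chunk.
    summary = await (SUMMARY_PROMPT_TEMPLATE | llm).ainvoke({"document": content})
    return Document(
        search_space_id=search_space_id,
        title=title,
        content=summary.content,
        content_hash=content_hash,
        embedding=config.embedding_model_instance.embed(summary.content),
        chunks=[
            Chunk(content=c.text, embedding=config.embedding_model_instance.embed(c.text))
            for c in config.chunker_instance.chunk(content)
        ],
    )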
- + # Log success await task_logger.log_task_success( log_entry, @@ -357,20 +436,22 @@ async def index_slack_messages( "documents_indexed": documents_indexed, "documents_skipped": documents_skipped, "skipped_channels_count": len(skipped_channels), - "result_message": result_message - } + "result_message": result_message, + }, + ) + + logger.info( + f"Slack indexing completed: {documents_indexed} new channels, {documents_skipped} skipped" ) - - logger.info(f"Slack indexing completed: {documents_indexed} new channels, {documents_skipped} skipped") return total_processed, result_message - + except SQLAlchemyError as db_error: await session.rollback() await task_logger.log_task_failure( log_entry, f"Database error during Slack indexing for connector {connector_id}", str(db_error), - {"error_type": "SQLAlchemyError"} + {"error_type": "SQLAlchemyError"}, ) logger.error(f"Database error: {str(db_error)}") return 0, f"Database error: {str(db_error)}" @@ -380,11 +461,12 @@ async def index_slack_messages( log_entry, f"Failed to index Slack messages for connector {connector_id}", str(e), - {"error_type": type(e).__name__} + {"error_type": type(e).__name__}, ) logger.error(f"Failed to index Slack messages: {str(e)}") return 0, f"Failed to index Slack messages: {str(e)}" + async def index_notion_pages( session: AsyncSession, connector_id: int, @@ -392,56 +474,64 @@ async def index_notion_pages( user_id: str, start_date: str = None, end_date: str = None, - update_last_indexed: bool = True + update_last_indexed: bool = True, ) -> Tuple[int, Optional[str]]: """ Index Notion pages from all accessible pages. - + Args: session: Database session connector_id: ID of the Notion connector search_space_id: ID of the search space to store documents in update_last_indexed: Whether to update the last_indexed_at timestamp (default: True) - + Returns: Tuple containing (number of documents indexed, error message or None) """ task_logger = TaskLoggingService(session, search_space_id) - + # Log task start log_entry = await task_logger.log_task_start( task_name="notion_pages_indexing", source="connector_indexing_task", message=f"Starting Notion pages indexing for connector {connector_id}", - metadata={"connector_id": connector_id, "user_id": str(user_id), "start_date": start_date, "end_date": end_date} + metadata={ + "connector_id": connector_id, + "user_id": str(user_id), + "start_date": start_date, + "end_date": end_date, + }, ) - + try: # Get the connector await task_logger.log_task_progress( log_entry, f"Retrieving Notion connector {connector_id} from database", - {"stage": "connector_retrieval"} + {"stage": "connector_retrieval"}, ) - + result = await session.execute( - select(SearchSourceConnector) - .filter( + select(SearchSourceConnector).filter( SearchSourceConnector.id == connector_id, - SearchSourceConnector.connector_type == SearchSourceConnectorType.NOTION_CONNECTOR + SearchSourceConnector.connector_type + == SearchSourceConnectorType.NOTION_CONNECTOR, ) ) connector = result.scalars().first() - + if not connector: await task_logger.log_task_failure( log_entry, f"Connector with ID {connector_id} not found or is not a Notion connector", "Connector not found", - {"error_type": "ConnectorNotFound"} + {"error_type": "ConnectorNotFound"}, ) - return 0, f"Connector with ID {connector_id} not found or is not a Notion connector" - + return ( + 0, + f"Connector with ID {connector_id} not found or is not a Notion connector", + ) + # Get the Notion token from the connector config notion_token = 
connector.config.get("NOTION_INTEGRATION_TOKEN") if not notion_token: @@ -449,103 +539,119 @@ async def index_notion_pages( log_entry, f"Notion integration token not found in connector config for connector {connector_id}", "Missing Notion token", - {"error_type": "MissingToken"} + {"error_type": "MissingToken"}, ) return 0, "Notion integration token not found in connector config" - + # Initialize Notion client await task_logger.log_task_progress( log_entry, f"Initializing Notion client for connector {connector_id}", - {"stage": "client_initialization"} + {"stage": "client_initialization"}, ) - + logger.info(f"Initializing Notion client for connector {connector_id}") notion_client = NotionHistoryConnector(token=notion_token) - + # Calculate date range if start_date is None or end_date is None: # Fall back to calculating dates calculated_end_date = datetime.now() - calculated_start_date = calculated_end_date - timedelta(days=365) # Check for last 1 year of pages - + calculated_start_date = calculated_end_date - timedelta( + days=365 + ) # Check for last 1 year of pages + # Use calculated dates if not provided if start_date is None: start_date_iso = calculated_start_date.strftime("%Y-%m-%dT%H:%M:%SZ") else: # Convert YYYY-MM-DD to ISO format - start_date_iso = datetime.strptime(start_date, "%Y-%m-%d").strftime("%Y-%m-%dT%H:%M:%SZ") - + start_date_iso = datetime.strptime(start_date, "%Y-%m-%d").strftime( + "%Y-%m-%dT%H:%M:%SZ" + ) + if end_date is None: end_date_iso = calculated_end_date.strftime("%Y-%m-%dT%H:%M:%SZ") else: # Convert YYYY-MM-DD to ISO format - end_date_iso = datetime.strptime(end_date, "%Y-%m-%d").strftime("%Y-%m-%dT%H:%M:%SZ") + end_date_iso = datetime.strptime(end_date, "%Y-%m-%d").strftime( + "%Y-%m-%dT%H:%M:%SZ" + ) else: # Convert provided dates to ISO format for Notion API - start_date_iso = datetime.strptime(start_date, "%Y-%m-%d").strftime("%Y-%m-%dT%H:%M:%SZ") - end_date_iso = datetime.strptime(end_date, "%Y-%m-%d").strftime("%Y-%m-%dT%H:%M:%SZ") - + start_date_iso = datetime.strptime(start_date, "%Y-%m-%d").strftime( + "%Y-%m-%dT%H:%M:%SZ" + ) + end_date_iso = datetime.strptime(end_date, "%Y-%m-%d").strftime( + "%Y-%m-%dT%H:%M:%SZ" + ) + logger.info(f"Fetching Notion pages from {start_date_iso} to {end_date_iso}") - + await task_logger.log_task_progress( log_entry, f"Fetching Notion pages from {start_date_iso} to {end_date_iso}", - {"stage": "fetch_pages", "start_date": start_date_iso, "end_date": end_date_iso} + { + "stage": "fetch_pages", + "start_date": start_date_iso, + "end_date": end_date_iso, + }, ) - + # Get all pages try: - pages = notion_client.get_all_pages(start_date=start_date_iso, end_date=end_date_iso) + pages = notion_client.get_all_pages( + start_date=start_date_iso, end_date=end_date_iso + ) logger.info(f"Found {len(pages)} Notion pages") except Exception as e: await task_logger.log_task_failure( log_entry, f"Failed to get Notion pages for connector {connector_id}", str(e), - {"error_type": "PageFetchError"} + {"error_type": "PageFetchError"}, ) logger.error(f"Error fetching Notion pages: {str(e)}", exc_info=True) return 0, f"Failed to get Notion pages: {str(e)}" - + if not pages: await task_logger.log_task_success( log_entry, f"No Notion pages found for connector {connector_id}", - {"pages_found": 0} + {"pages_found": 0}, ) logger.info("No Notion pages found to index") return 0, "No Notion pages found" - + # Track the number of documents indexed documents_indexed = 0 documents_skipped = 0 skipped_pages = [] - + await task_logger.log_task_progress( 
log_entry, f"Starting to process {len(pages)} Notion pages", - {"stage": "process_pages", "total_pages": len(pages)} + {"stage": "process_pages", "total_pages": len(pages)}, ) - + # Process each page for page in pages: try: page_id = page.get("page_id") page_title = page.get("title", f"Untitled page ({page_id})") page_content = page.get("content", []) - + logger.info(f"Processing Notion page: {page_title} ({page_id})") - + if not page_content: logger.info(f"No content found in page {page_title}. Skipping.") skipped_pages.append(f"{page_title} (no content)") documents_skipped += 1 continue - + # Convert page content to markdown format markdown_content = f"# Notion Page: {page_title}\n\n" - + # Process blocks recursively def process_blocks(blocks, level=0): result = "" @@ -553,10 +659,10 @@ async def index_notion_pages( block_type = block.get("type") block_content = block.get("content", "") children = block.get("children", []) - + # Add indentation based on level indent = " " * level - + # Format based on block type if block_type in ["paragraph", "text"]: result += f"{indent}{block_content}\n\n" @@ -586,54 +692,62 @@ async def index_notion_pages( # Default for other block types if block_content: result += f"{indent}{block_content}\n\n" - + # Process children recursively if children: result += process_blocks(children, level + 1) - + return result - - logger.debug(f"Converting {len(page_content)} blocks to markdown for page {page_title}") + + logger.debug( + f"Converting {len(page_content)} blocks to markdown for page {page_title}" + ) markdown_content += process_blocks(page_content) - + # Format document metadata metadata_sections = [ - ("METADATA", [ - f"PAGE_TITLE: {page_title}", - f"PAGE_ID: {page_id}" - ]), - ("CONTENT", [ - "FORMAT: markdown", - "TEXT_START", - markdown_content, - "TEXT_END" - ]) + ("METADATA", [f"PAGE_TITLE: {page_title}", f"PAGE_ID: {page_id}"]), + ( + "CONTENT", + [ + "FORMAT: markdown", + "TEXT_START", + markdown_content, + "TEXT_END", + ], + ), ] - + # Build the document string document_parts = [] document_parts.append("") - + for section_title, section_content in metadata_sections: document_parts.append(f"<{section_title}>") document_parts.extend(section_content) document_parts.append(f"") - + document_parts.append("") - combined_document_string = '\n'.join(document_parts) - content_hash = generate_content_hash(combined_document_string, search_space_id) + combined_document_string = "\n".join(document_parts) + content_hash = generate_content_hash( + combined_document_string, search_space_id + ) # Check if document with this content hash already exists existing_doc_by_hash_result = await session.execute( select(Document).where(Document.content_hash == content_hash) ) - existing_document_by_hash = existing_doc_by_hash_result.scalars().first() - + existing_document_by_hash = ( + existing_doc_by_hash_result.scalars().first() + ) + if existing_document_by_hash: - logger.info(f"Document with content hash {content_hash} already exists for page {page_title}. Skipping processing.") + logger.info( + f"Document with content hash {content_hash} already exists for page {page_title}. Skipping processing." 
+ ) documents_skipped += 1 continue - + # Get user's long context LLM user_llm = await get_user_long_context_llm(session, user_id) if not user_llm: @@ -641,21 +755,28 @@ async def index_notion_pages( skipped_pages.append(f"{page_title} (no LLM configured)") documents_skipped += 1 continue - + # Generate summary logger.debug(f"Generating summary for page {page_title}") summary_chain = SUMMARY_PROMPT_TEMPLATE | user_llm - summary_result = await summary_chain.ainvoke({"document": combined_document_string}) + summary_result = await summary_chain.ainvoke( + {"document": combined_document_string} + ) summary_content = summary_result.content - summary_embedding = config.embedding_model_instance.embed(summary_content) - + summary_embedding = config.embedding_model_instance.embed( + summary_content + ) + # Process chunks logger.debug(f"Chunking content for page {page_title}") chunks = [ - Chunk(content=chunk.text, embedding=config.embedding_model_instance.embed(chunk.text)) + Chunk( + content=chunk.text, + embedding=config.embedding_model_instance.embed(chunk.text), + ) for chunk in config.chunker_instance.chunk(markdown_content) ] - + # Create and store new document document = Document( search_space_id=search_space_id, @@ -664,41 +785,46 @@ async def index_notion_pages( document_metadata={ "page_title": page_title, "page_id": page_id, - "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S") + "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), }, content=summary_content, content_hash=content_hash, embedding=summary_embedding, - chunks=chunks + chunks=chunks, ) - + session.add(document) documents_indexed += 1 logger.info(f"Successfully indexed new Notion page: {page_title}") - + except Exception as e: - logger.error(f"Error processing Notion page {page.get('title', 'Unknown')}: {str(e)}", exc_info=True) - skipped_pages.append(f"{page.get('title', 'Unknown')} (processing error)") + logger.error( + f"Error processing Notion page {page.get('title', 'Unknown')}: {str(e)}", + exc_info=True, + ) + skipped_pages.append( + f"{page.get('title', 'Unknown')} (processing error)" + ) documents_skipped += 1 continue # Skip this page and continue with others - + # Update the last_indexed_at timestamp for the connector only if requested # and if we successfully indexed at least one page total_processed = documents_indexed if update_last_indexed and total_processed > 0: connector.last_indexed_at = datetime.now() logger.info(f"Updated last_indexed_at for connector {connector_id}") - + # Commit all changes await session.commit() - + # Prepare result message result_message = None if skipped_pages: result_message = f"Processed {total_processed} pages. Skipped {len(skipped_pages)} pages: {', '.join(skipped_pages)}" else: result_message = f"Processed {total_processed} pages." 
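For reference, process_blocks above flattens Notion's nested block tree depth-first, indenting children by nesting level. A tiny worked example with the block shape consumed by that loop (exact spacing depends on the indent string):

blocks = [
    {
        "type": "paragraph",
        "content": "Top-level note",
        "children": [
            {"type": "paragraph", "content": "Nested detail", "children": []},
        ],
    },
]
# process_blocks(blocks) would render roughly:
#
#   Top-level note
#
#     Nested detail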
- + # Log success await task_logger.log_task_success( log_entry, @@ -708,22 +834,26 @@ async def index_notion_pages( "documents_indexed": documents_indexed, "documents_skipped": documents_skipped, "skipped_pages_count": len(skipped_pages), - "result_message": result_message - } + "result_message": result_message, + }, + ) + + logger.info( + f"Notion indexing completed: {documents_indexed} new pages, {documents_skipped} skipped" ) - - logger.info(f"Notion indexing completed: {documents_indexed} new pages, {documents_skipped} skipped") return total_processed, result_message - + except SQLAlchemyError as db_error: await session.rollback() await task_logger.log_task_failure( log_entry, f"Database error during Notion indexing for connector {connector_id}", str(db_error), - {"error_type": "SQLAlchemyError"} + {"error_type": "SQLAlchemyError"}, + ) + logger.error( + f"Database error during Notion indexing: {str(db_error)}", exc_info=True ) - logger.error(f"Database error during Notion indexing: {str(db_error)}", exc_info=True) return 0, f"Database error: {str(db_error)}" except Exception as e: await session.rollback() @@ -731,11 +861,12 @@ async def index_notion_pages( log_entry, f"Failed to index Notion pages for connector {connector_id}", str(e), - {"error_type": type(e).__name__} + {"error_type": type(e).__name__}, ) logger.error(f"Failed to index Notion pages: {str(e)}", exc_info=True) return 0, f"Failed to index Notion pages: {str(e)}" + async def index_github_repos( session: AsyncSession, connector_id: int, @@ -743,7 +874,7 @@ async def index_github_repos( user_id: str, start_date: str = None, end_date: str = None, - update_last_indexed: bool = True + update_last_indexed: bool = True, ) -> Tuple[int, Optional[str]]: """ Index code and documentation files from accessible GitHub repositories. @@ -758,15 +889,20 @@ async def index_github_repos( Tuple containing (number of documents indexed, error message or None) """ task_logger = TaskLoggingService(session, search_space_id) - + # Log task start log_entry = await task_logger.log_task_start( task_name="github_repos_indexing", source="connector_indexing_task", message=f"Starting GitHub repositories indexing for connector {connector_id}", - metadata={"connector_id": connector_id, "user_id": str(user_id), "start_date": start_date, "end_date": end_date} + metadata={ + "connector_id": connector_id, + "user_id": str(user_id), + "start_date": start_date, + "end_date": end_date, + }, ) - + documents_processed = 0 errors = [] @@ -775,14 +911,14 @@ async def index_github_repos( await task_logger.log_task_progress( log_entry, f"Retrieving GitHub connector {connector_id} from database", - {"stage": "connector_retrieval"} + {"stage": "connector_retrieval"}, ) - + result = await session.execute( - select(SearchSourceConnector) - .filter( + select(SearchSourceConnector).filter( SearchSourceConnector.id == connector_id, - SearchSourceConnector.connector_type == SearchSourceConnectorType.GITHUB_CONNECTOR + SearchSourceConnector.connector_type + == SearchSourceConnectorType.GITHUB_CONNECTOR, ) ) connector = result.scalars().first() @@ -792,9 +928,12 @@ async def index_github_repos( log_entry, f"Connector with ID {connector_id} not found or is not a GitHub connector", "Connector not found", - {"error_type": "ConnectorNotFound"} + {"error_type": "ConnectorNotFound"}, + ) + return ( + 0, + f"Connector with ID {connector_id} not found or is not a GitHub connector", ) - return 0, f"Connector with ID {connector_id} not found or is not a GitHub connector" # 2. 
Get the GitHub PAT and selected repositories from the connector config github_pat = connector.config.get("GITHUB_PAT") @@ -805,16 +944,18 @@ async def index_github_repos( log_entry, f"GitHub Personal Access Token (PAT) not found in connector config for connector {connector_id}", "Missing GitHub PAT", - {"error_type": "MissingToken"} + {"error_type": "MissingToken"}, ) return 0, "GitHub Personal Access Token (PAT) not found in connector config" - - if not repo_full_names_to_index or not isinstance(repo_full_names_to_index, list): + + if not repo_full_names_to_index or not isinstance( + repo_full_names_to_index, list + ): await task_logger.log_task_failure( log_entry, f"'repo_full_names' not found or is not a list in connector config for connector {connector_id}", "Invalid repo configuration", - {"error_type": "InvalidConfiguration"} + {"error_type": "InvalidConfiguration"}, ) return 0, "'repo_full_names' not found or is not a list in connector config" @@ -822,9 +963,12 @@ async def index_github_repos( await task_logger.log_task_progress( log_entry, f"Initializing GitHub client for connector {connector_id}", - {"stage": "client_initialization", "repo_count": len(repo_full_names_to_index)} + { + "stage": "client_initialization", + "repo_count": len(repo_full_names_to_index), + }, ) - + try: github_client = GitHubConnector(token=github_pat) except ValueError as e: @@ -832,7 +976,7 @@ async def index_github_repos( log_entry, f"Failed to initialize GitHub client for connector {connector_id}", str(e), - {"error_type": "ClientInitializationError"} + {"error_type": "ClientInitializationError"}, ) return 0, f"Failed to initialize GitHub client: {str(e)}" @@ -842,12 +986,21 @@ async def index_github_repos( await task_logger.log_task_progress( log_entry, f"Starting indexing for {len(repo_full_names_to_index)} selected repositories", - {"stage": "repo_processing", "repo_count": len(repo_full_names_to_index), "start_date": start_date, "end_date": end_date} + { + "stage": "repo_processing", + "repo_count": len(repo_full_names_to_index), + "start_date": start_date, + "end_date": end_date, + }, + ) + + logger.info( + f"Starting indexing for {len(repo_full_names_to_index)} selected repositories." ) - - logger.info(f"Starting indexing for {len(repo_full_names_to_index)} selected repositories.") if start_date and end_date: - logger.info(f"Date range requested: {start_date} to {end_date} (Note: GitHub indexing processes all files regardless of dates)") + logger.info( + f"Date range requested: {start_date} to {end_date} (Note: GitHub indexing processes all files regardless of dates)" + ) # 6. 
Iterate through selected repositories and index files for repo_full_name in repo_full_names_to_index: @@ -859,65 +1012,92 @@ async def index_github_repos( try: files_to_index = github_client.get_repository_files(repo_full_name) if not files_to_index: - logger.info(f"No indexable files found in repository: {repo_full_name}") + logger.info( + f"No indexable files found in repository: {repo_full_name}" + ) continue - logger.info(f"Found {len(files_to_index)} files to process in {repo_full_name}") + logger.info( + f"Found {len(files_to_index)} files to process in {repo_full_name}" + ) for file_info in files_to_index: file_path = file_info.get("path") file_url = file_info.get("url") file_sha = file_info.get("sha") - file_type = file_info.get("type") # 'code' or 'doc' + file_type = file_info.get("type") # 'code' or 'doc' full_path_key = f"{repo_full_name}/{file_path}" if not file_path or not file_url or not file_sha: - logger.warning(f"Skipping file with missing info in {repo_full_name}: {file_info}") + logger.warning( + f"Skipping file with missing info in {repo_full_name}: {file_info}" + ) continue # Get file content - file_content = github_client.get_file_content(repo_full_name, file_path) + file_content = github_client.get_file_content( + repo_full_name, file_path + ) if file_content is None: - logger.warning(f"Could not retrieve content for {full_path_key}. Skipping.") - continue # Skip if content fetch failed - + logger.warning( + f"Could not retrieve content for {full_path_key}. Skipping." + ) + continue # Skip if content fetch failed + content_hash = generate_content_hash(file_content, search_space_id) # Check if document with this content hash already exists existing_doc_by_hash_result = await session.execute( select(Document).where(Document.content_hash == content_hash) ) - existing_document_by_hash = existing_doc_by_hash_result.scalars().first() - + existing_document_by_hash = ( + existing_doc_by_hash_result.scalars().first() + ) + if existing_document_by_hash: - logger.info(f"Document with content hash {content_hash} already exists for file {full_path_key}. Skipping processing.") + logger.info( + f"Document with content hash {content_hash} already exists for file {full_path_key}. Skipping processing." + ) continue - + # Use file_content directly for chunking, maybe summary for main content? # For now, let's use the full content for both, might need refinement - summary_content = f"GitHub file: {full_path_key}\n\n{file_content[:1000]}..." # Simple summary - summary_embedding = config.embedding_model_instance.embed(summary_content) + summary_content = f"GitHub file: {full_path_key}\n\n{file_content[:1000]}..." 
# Simple summary + summary_embedding = config.embedding_model_instance.embed( + summary_content + ) # Chunk the content try: chunks_data = [ - Chunk(content=chunk.text, embedding=config.embedding_model_instance.embed(chunk.text)) - for chunk in config.code_chunker_instance.chunk(file_content) + Chunk( + content=chunk.text, + embedding=config.embedding_model_instance.embed( + chunk.text + ), + ) + for chunk in config.code_chunker_instance.chunk( + file_content + ) ] except Exception as chunk_err: - logger.error(f"Failed to chunk file {full_path_key}: {chunk_err}") - errors.append(f"Chunking failed for {full_path_key}: {chunk_err}") - continue # Skip this file if chunking fails + logger.error( + f"Failed to chunk file {full_path_key}: {chunk_err}" + ) + errors.append( + f"Chunking failed for {full_path_key}: {chunk_err}" + ) + continue # Skip this file if chunking fails doc_metadata = { "repository_full_name": repo_full_name, "file_path": file_path, - "full_path": full_path_key, # For easier lookup + "full_path": full_path_key, # For easier lookup "url": file_url, "sha": file_sha, "type": file_type, - "indexed_at": datetime.now(timezone.utc).isoformat() + "indexed_at": datetime.now(timezone.utc).isoformat(), } # Create new document @@ -926,22 +1106,26 @@ async def index_github_repos( title=f"GitHub - {file_path}", document_type=DocumentType.GITHUB_CONNECTOR, document_metadata=doc_metadata, - content=summary_content, # Store summary + content=summary_content, # Store summary content_hash=content_hash, embedding=summary_embedding, search_space_id=search_space_id, - chunks=chunks_data # Associate chunks directly + chunks=chunks_data, # Associate chunks directly ) session.add(document) documents_processed += 1 except Exception as repo_err: - logger.error(f"Failed to process repository {repo_full_name}: {repo_err}") + logger.error( + f"Failed to process repository {repo_full_name}: {repo_err}" + ) errors.append(f"Failed processing {repo_full_name}: {repo_err}") - + # Commit all changes at the end await session.commit() - logger.info(f"Finished GitHub indexing for connector {connector_id}. Processed {documents_processed} files.") + logger.info( + f"Finished GitHub indexing for connector {connector_id}. Processed {documents_processed} files." 
+ ) # Log success await task_logger.log_task_success( @@ -950,8 +1134,8 @@ async def index_github_repos( { "documents_processed": documents_processed, "errors_count": len(errors), - "repo_count": len(repo_full_names_to_index) - } + "repo_count": len(repo_full_names_to_index), + }, ) except SQLAlchemyError as db_err: @@ -960,9 +1144,11 @@ async def index_github_repos( log_entry, f"Database error during GitHub indexing for connector {connector_id}", str(db_err), - {"error_type": "SQLAlchemyError"} + {"error_type": "SQLAlchemyError"}, + ) + logger.error( + f"Database error during GitHub indexing for connector {connector_id}: {db_err}" ) - logger.error(f"Database error during GitHub indexing for connector {connector_id}: {db_err}") errors.append(f"Database error: {db_err}") return documents_processed, "; ".join(errors) if errors else str(db_err) except Exception as e: @@ -971,15 +1157,19 @@ async def index_github_repos( log_entry, f"Unexpected error during GitHub indexing for connector {connector_id}", str(e), - {"error_type": type(e).__name__} + {"error_type": type(e).__name__}, + ) + logger.error( + f"Unexpected error during GitHub indexing for connector {connector_id}: {e}", + exc_info=True, ) - logger.error(f"Unexpected error during GitHub indexing for connector {connector_id}: {e}", exc_info=True) errors.append(f"Unexpected error: {e}") return documents_processed, "; ".join(errors) if errors else str(e) error_message = "; ".join(errors) if errors else None return documents_processed, error_message + async def index_linear_issues( session: AsyncSession, connector_id: int, @@ -987,56 +1177,64 @@ async def index_linear_issues( user_id: str, start_date: str = None, end_date: str = None, - update_last_indexed: bool = True + update_last_indexed: bool = True, ) -> Tuple[int, Optional[str]]: """ Index Linear issues and comments. 
- + Args: session: Database session connector_id: ID of the Linear connector search_space_id: ID of the search space to store documents in update_last_indexed: Whether to update the last_indexed_at timestamp (default: True) - + Returns: Tuple containing (number of documents indexed, error message or None) """ task_logger = TaskLoggingService(session, search_space_id) - + # Log task start log_entry = await task_logger.log_task_start( task_name="linear_issues_indexing", source="connector_indexing_task", message=f"Starting Linear issues indexing for connector {connector_id}", - metadata={"connector_id": connector_id, "user_id": str(user_id), "start_date": start_date, "end_date": end_date} + metadata={ + "connector_id": connector_id, + "user_id": str(user_id), + "start_date": start_date, + "end_date": end_date, + }, ) - + try: # Get the connector await task_logger.log_task_progress( log_entry, f"Retrieving Linear connector {connector_id} from database", - {"stage": "connector_retrieval"} + {"stage": "connector_retrieval"}, ) - + result = await session.execute( - select(SearchSourceConnector) - .filter( + select(SearchSourceConnector).filter( SearchSourceConnector.id == connector_id, - SearchSourceConnector.connector_type == SearchSourceConnectorType.LINEAR_CONNECTOR + SearchSourceConnector.connector_type + == SearchSourceConnectorType.LINEAR_CONNECTOR, ) ) connector = result.scalars().first() - + if not connector: await task_logger.log_task_failure( log_entry, f"Connector with ID {connector_id} not found or is not a Linear connector", "Connector not found", - {"error_type": "ConnectorNotFound"} + {"error_type": "ConnectorNotFound"}, ) - return 0, f"Connector with ID {connector_id} not found or is not a Linear connector" - + return ( + 0, + f"Connector with ID {connector_id} not found or is not a Linear connector", + ) + # Get the Linear token from the connector config linear_token = connector.config.get("LINEAR_API_KEY") if not linear_token: @@ -1044,135 +1242,167 @@ async def index_linear_issues( log_entry, f"Linear API token not found in connector config for connector {connector_id}", "Missing Linear token", - {"error_type": "MissingToken"} + {"error_type": "MissingToken"}, ) return 0, "Linear API token not found in connector config" - + # Initialize Linear client await task_logger.log_task_progress( log_entry, f"Initializing Linear client for connector {connector_id}", - {"stage": "client_initialization"} + {"stage": "client_initialization"}, ) - + linear_client = LinearConnector(token=linear_token) - + # Calculate date range if start_date is None or end_date is None: # Fall back to calculating dates based on last_indexed_at calculated_end_date = datetime.now() - + # Use last_indexed_at as start date if available, otherwise use 365 days ago if connector.last_indexed_at: # Convert dates to be comparable (both timezone-naive) - last_indexed_naive = connector.last_indexed_at.replace(tzinfo=None) if connector.last_indexed_at.tzinfo else connector.last_indexed_at - + last_indexed_naive = ( + connector.last_indexed_at.replace(tzinfo=None) + if connector.last_indexed_at.tzinfo + else connector.last_indexed_at + ) + # Check if last_indexed_at is in the future or after end_date if last_indexed_naive > calculated_end_date: - logger.warning(f"Last indexed date ({last_indexed_naive.strftime('%Y-%m-%d')}) is in the future. Using 365 days ago instead.") + logger.warning( + f"Last indexed date ({last_indexed_naive.strftime('%Y-%m-%d')}) is in the future. Using 365 days ago instead." 
+ ) calculated_start_date = calculated_end_date - timedelta(days=365) else: calculated_start_date = last_indexed_naive - logger.info(f"Using last_indexed_at ({calculated_start_date.strftime('%Y-%m-%d')}) as start date") + logger.info( + f"Using last_indexed_at ({calculated_start_date.strftime('%Y-%m-%d')}) as start date" + ) else: - calculated_start_date = calculated_end_date - timedelta(days=365) # Use 365 days as default - logger.info(f"No last_indexed_at found, using {calculated_start_date.strftime('%Y-%m-%d')} (365 days ago) as start date") - + calculated_start_date = calculated_end_date - timedelta( + days=365 + ) # Use 365 days as default + logger.info( + f"No last_indexed_at found, using {calculated_start_date.strftime('%Y-%m-%d')} (365 days ago) as start date" + ) + # Use calculated dates if not provided - start_date_str = start_date if start_date else calculated_start_date.strftime("%Y-%m-%d") - end_date_str = end_date if end_date else calculated_end_date.strftime("%Y-%m-%d") + start_date_str = ( + start_date if start_date else calculated_start_date.strftime("%Y-%m-%d") + ) + end_date_str = ( + end_date if end_date else calculated_end_date.strftime("%Y-%m-%d") + ) else: # Use provided dates start_date_str = start_date end_date_str = end_date - + logger.info(f"Fetching Linear issues from {start_date_str} to {end_date_str}") - + await task_logger.log_task_progress( log_entry, f"Fetching Linear issues from {start_date_str} to {end_date_str}", - {"stage": "fetch_issues", "start_date": start_date_str, "end_date": end_date_str} + { + "stage": "fetch_issues", + "start_date": start_date_str, + "end_date": end_date_str, + }, ) - + # Get issues within date range try: issues, error = linear_client.get_issues_by_date_range( - start_date=start_date_str, - end_date=end_date_str, - include_comments=True + start_date=start_date_str, end_date=end_date_str, include_comments=True ) - + if error: logger.error(f"Failed to get Linear issues: {error}") - + # Don't treat "No issues found" as an error that should stop indexing if "No issues found" in error: - logger.info("No issues found is not a critical error, continuing with update") + logger.info( + "No issues found is not a critical error, continuing with update" + ) if update_last_indexed: connector.last_indexed_at = datetime.now() await session.commit() - logger.info(f"Updated last_indexed_at to {connector.last_indexed_at} despite no issues found") + logger.info( + f"Updated last_indexed_at to {connector.last_indexed_at} despite no issues found" + ) return 0, None else: return 0, f"Failed to get Linear issues: {error}" - + logger.info(f"Retrieved {len(issues)} issues from Linear API") - + except Exception as e: logger.error(f"Exception when calling Linear API: {str(e)}", exc_info=True) return 0, f"Failed to get Linear issues: {str(e)}" - + if not issues: logger.info("No Linear issues found for the specified date range") if update_last_indexed: connector.last_indexed_at = datetime.now() await session.commit() - logger.info(f"Updated last_indexed_at to {connector.last_indexed_at} despite no issues found") + logger.info( + f"Updated last_indexed_at to {connector.last_indexed_at} despite no issues found" + ) return 0, None # Return None instead of error message when no issues found - + # Log issue IDs and titles for debugging logger.info("Issues retrieved from Linear API:") for idx, issue in enumerate(issues[:10]): # Log first 10 issues - logger.info(f" {idx+1}. 
{issue.get('identifier', 'Unknown')} - {issue.get('title', 'Unknown')} - Created: {issue.get('createdAt', 'Unknown')} - Updated: {issue.get('updatedAt', 'Unknown')}") + logger.info( + f" {idx + 1}. {issue.get('identifier', 'Unknown')} - {issue.get('title', 'Unknown')} - Created: {issue.get('createdAt', 'Unknown')} - Updated: {issue.get('updatedAt', 'Unknown')}" + ) if len(issues) > 10: logger.info(f" ...and {len(issues) - 10} more issues") - + # Track the number of documents indexed documents_indexed = 0 documents_skipped = 0 skipped_issues = [] - + await task_logger.log_task_progress( log_entry, f"Starting to process {len(issues)} Linear issues", - {"stage": "process_issues", "total_issues": len(issues)} + {"stage": "process_issues", "total_issues": len(issues)}, ) - + # Process each issue for issue in issues: try: issue_id = issue.get("id") issue_identifier = issue.get("identifier", "") issue_title = issue.get("title", "") - + if not issue_id or not issue_title: - logger.warning(f"Skipping issue with missing ID or title: {issue_id or 'Unknown'}") - skipped_issues.append(f"{issue_identifier or 'Unknown'} (missing data)") + logger.warning( + f"Skipping issue with missing ID or title: {issue_id or 'Unknown'}" + ) + skipped_issues.append( + f"{issue_identifier or 'Unknown'} (missing data)" + ) documents_skipped += 1 continue - + # Format the issue first to get well-structured data formatted_issue = linear_client.format_issue(issue) - + # Convert issue to markdown format issue_content = linear_client.format_issue_to_markdown(formatted_issue) - + if not issue_content: - logger.warning(f"Skipping issue with no content: {issue_identifier} - {issue_title}") + logger.warning( + f"Skipping issue with no content: {issue_identifier} - {issue_title}" + ) skipped_issues.append(f"{issue_identifier} (no content)") documents_skipped += 1 continue - + # Create a short summary for the embedding # This avoids using the LLM and just uses the issue data directly state = formatted_issue.get("state", "Unknown") @@ -1180,40 +1410,51 @@ async def index_linear_issues( # Truncate description if it's too long for the summary if description and len(description) > 500: description = description[:497] + "..." - + # Create a simple summary from the issue data summary_content = f"Linear Issue {issue_identifier}: {issue_title}\n\nStatus: {state}\n\n" if description: summary_content += f"Description: {description}\n\n" - + # Add comment count comment_count = len(formatted_issue.get("comments", [])) summary_content += f"Comments: {comment_count}" - + content_hash = generate_content_hash(issue_content, search_space_id) # Check if document with this content hash already exists existing_doc_by_hash_result = await session.execute( select(Document).where(Document.content_hash == content_hash) ) - existing_document_by_hash = existing_doc_by_hash_result.scalars().first() - + existing_document_by_hash = ( + existing_doc_by_hash_result.scalars().first() + ) + if existing_document_by_hash: - logger.info(f"Document with content hash {content_hash} already exists for issue {issue_identifier}. Skipping processing.") + logger.info( + f"Document with content hash {content_hash} already exists for issue {issue_identifier}. Skipping processing." 
+ ) documents_skipped += 1 continue - + # Generate embedding for the summary - summary_embedding = config.embedding_model_instance.embed(summary_content) - + summary_embedding = config.embedding_model_instance.embed( + summary_content + ) + # Process chunks - using the full issue content with comments chunks = [ - Chunk(content=chunk.text, embedding=config.embedding_model_instance.embed(chunk.text)) + Chunk( + content=chunk.text, + embedding=config.embedding_model_instance.embed(chunk.text), + ) for chunk in config.chunker_instance.chunk(issue_content) ] - + # Create and store new document - logger.info(f"Creating new document for issue {issue_identifier} - {issue_title}") + logger.info( + f"Creating new document for issue {issue_identifier} - {issue_title}" + ) document = Document( search_space_id=search_space_id, title=f"Linear - {issue_identifier}: {issue_title}", @@ -1224,34 +1465,41 @@ async def index_linear_issues( "issue_title": issue_title, "state": state, "comment_count": comment_count, - "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S") + "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), }, content=summary_content, content_hash=content_hash, embedding=summary_embedding, - chunks=chunks + chunks=chunks, ) - + session.add(document) documents_indexed += 1 - logger.info(f"Successfully indexed new issue {issue_identifier} - {issue_title}") - + logger.info( + f"Successfully indexed new issue {issue_identifier} - {issue_title}" + ) + except Exception as e: - logger.error(f"Error processing issue {issue.get('identifier', 'Unknown')}: {str(e)}", exc_info=True) - skipped_issues.append(f"{issue.get('identifier', 'Unknown')} (processing error)") + logger.error( + f"Error processing issue {issue.get('identifier', 'Unknown')}: {str(e)}", + exc_info=True, + ) + skipped_issues.append( + f"{issue.get('identifier', 'Unknown')} (processing error)" + ) documents_skipped += 1 continue # Skip this issue and continue with others - + # Update the last_indexed_at timestamp for the connector only if requested total_processed = documents_indexed if update_last_indexed: connector.last_indexed_at = datetime.now() logger.info(f"Updated last_indexed_at to {connector.last_indexed_at}") - + # Commit all changes await session.commit() - logger.info(f"Successfully committed all Linear document changes to database") - + logger.info("Successfully committed all Linear document changes to database") + # Log success await task_logger.log_task_success( log_entry, @@ -1260,20 +1508,25 @@ async def index_linear_issues( "issues_processed": total_processed, "documents_indexed": documents_indexed, "documents_skipped": documents_skipped, - "skipped_issues_count": len(skipped_issues) - } + "skipped_issues_count": len(skipped_issues), + }, ) - - logger.info(f"Linear indexing completed: {documents_indexed} new issues, {documents_skipped} skipped") - return total_processed, None # Return None as the error message to indicate success - + + logger.info( + f"Linear indexing completed: {documents_indexed} new issues, {documents_skipped} skipped" + ) + return ( + total_processed, + None, + ) # Return None as the error message to indicate success + except SQLAlchemyError as db_error: await session.rollback() await task_logger.log_task_failure( log_entry, f"Database error during Linear indexing for connector {connector_id}", str(db_error), - {"error_type": "SQLAlchemyError"} + {"error_type": "SQLAlchemyError"}, ) logger.error(f"Database error: {str(db_error)}", exc_info=True) return 0, f"Database error: 
{str(db_error)}" @@ -1283,11 +1536,12 @@ async def index_linear_issues( log_entry, f"Failed to index Linear issues for connector {connector_id}", str(e), - {"error_type": type(e).__name__} + {"error_type": type(e).__name__}, ) logger.error(f"Failed to index Linear issues: {str(e)}", exc_info=True) return 0, f"Failed to index Linear issues: {str(e)}" + async def index_discord_messages( session: AsyncSession, connector_id: int, @@ -1295,7 +1549,7 @@ async def index_discord_messages( user_id: str, start_date: str = None, end_date: str = None, - update_last_indexed: bool = True + update_last_indexed: bool = True, ) -> Tuple[int, Optional[str]]: """ Index Discord messages from all accessible channels. @@ -1310,28 +1564,33 @@ async def index_discord_messages( Tuple containing (number of documents indexed, error message or None) """ task_logger = TaskLoggingService(session, search_space_id) - + # Log task start log_entry = await task_logger.log_task_start( task_name="discord_messages_indexing", source="connector_indexing_task", message=f"Starting Discord messages indexing for connector {connector_id}", - metadata={"connector_id": connector_id, "user_id": str(user_id), "start_date": start_date, "end_date": end_date} + metadata={ + "connector_id": connector_id, + "user_id": str(user_id), + "start_date": start_date, + "end_date": end_date, + }, ) - + try: # Get the connector await task_logger.log_task_progress( log_entry, f"Retrieving Discord connector {connector_id} from database", - {"stage": "connector_retrieval"} + {"stage": "connector_retrieval"}, ) - + result = await session.execute( - select(SearchSourceConnector) - .filter( + select(SearchSourceConnector).filter( SearchSourceConnector.id == connector_id, - SearchSourceConnector.connector_type == SearchSourceConnectorType.DISCORD_CONNECTOR + SearchSourceConnector.connector_type + == SearchSourceConnectorType.DISCORD_CONNECTOR, ) ) connector = result.scalars().first() @@ -1341,9 +1600,12 @@ async def index_discord_messages( log_entry, f"Connector with ID {connector_id} not found or is not a Discord connector", "Connector not found", - {"error_type": "ConnectorNotFound"} + {"error_type": "ConnectorNotFound"}, + ) + return ( + 0, + f"Connector with ID {connector_id} not found or is not a Discord connector", ) - return 0, f"Connector with ID {connector_id} not found or is not a Discord connector" # Get the Discord token from the connector config discord_token = connector.config.get("DISCORD_BOT_TOKEN") @@ -1352,7 +1614,7 @@ async def index_discord_messages( log_entry, f"Discord token not found in connector config for connector {connector_id}", "Missing Discord token", - {"error_type": "MissingToken"} + {"error_type": "MissingToken"}, ) return 0, "Discord token not found in connector config" @@ -1362,9 +1624,9 @@ async def index_discord_messages( await task_logger.log_task_progress( log_entry, f"Initializing Discord client for connector {connector_id}", - {"stage": "client_initialization"} + {"stage": "client_initialization"}, ) - + discord_client = DiscordConnector(token=discord_token) # Calculate date range @@ -1374,30 +1636,54 @@ async def index_discord_messages( # Use last_indexed_at as start date if available, otherwise use 365 days ago if connector.last_indexed_at: - calculated_start_date = connector.last_indexed_at.replace(tzinfo=timezone.utc) - logger.info(f"Using last_indexed_at ({calculated_start_date.strftime('%Y-%m-%d')}) as start date") + calculated_start_date = connector.last_indexed_at.replace( + tzinfo=timezone.utc + ) + 
logger.info( + f"Using last_indexed_at ({calculated_start_date.strftime('%Y-%m-%d')}) as start date" + ) else: calculated_start_date = calculated_end_date - timedelta(days=365) - logger.info(f"No last_indexed_at found, using {calculated_start_date.strftime('%Y-%m-%d')} (365 days ago) as start date") + logger.info( + f"No last_indexed_at found, using {calculated_start_date.strftime('%Y-%m-%d')} (365 days ago) as start date" + ) # Use calculated dates if not provided, convert to ISO format for Discord API if start_date is None: start_date_iso = calculated_start_date.isoformat() else: # Convert YYYY-MM-DD to ISO format - start_date_iso = datetime.strptime(start_date, "%Y-%m-%d").replace(tzinfo=timezone.utc).isoformat() - + start_date_iso = ( + datetime.strptime(start_date, "%Y-%m-%d") + .replace(tzinfo=timezone.utc) + .isoformat() + ) + if end_date is None: end_date_iso = calculated_end_date.isoformat() else: - # Convert YYYY-MM-DD to ISO format - end_date_iso = datetime.strptime(end_date, "%Y-%m-%d").replace(tzinfo=timezone.utc).isoformat() + # Convert YYYY-MM-DD to ISO format + end_date_iso = ( + datetime.strptime(end_date, "%Y-%m-%d") + .replace(tzinfo=timezone.utc) + .isoformat() + ) else: # Convert provided dates to ISO format for Discord API - start_date_iso = datetime.strptime(start_date, "%Y-%m-%d").replace(tzinfo=timezone.utc).isoformat() - end_date_iso = datetime.strptime(end_date, "%Y-%m-%d").replace(tzinfo=timezone.utc).isoformat() - - logger.info(f"Indexing Discord messages from {start_date_iso} to {end_date_iso}") + start_date_iso = ( + datetime.strptime(start_date, "%Y-%m-%d") + .replace(tzinfo=timezone.utc) + .isoformat() + ) + end_date_iso = ( + datetime.strptime(end_date, "%Y-%m-%d") + .replace(tzinfo=timezone.utc) + .isoformat() + ) + + logger.info( + f"Indexing Discord messages from {start_date_iso} to {end_date_iso}" + ) documents_indexed = 0 documents_skipped = 0 @@ -1407,9 +1693,9 @@ async def index_discord_messages( await task_logger.log_task_progress( log_entry, f"Starting Discord bot and fetching guilds for connector {connector_id}", - {"stage": "fetch_guilds"} + {"stage": "fetch_guilds"}, ) - + logger.info("Starting Discord bot to fetch guilds") discord_client._bot_task = asyncio.create_task(discord_client.start_bot()) await discord_client._wait_until_ready() @@ -1422,7 +1708,7 @@ async def index_discord_messages( log_entry, f"Failed to get Discord guilds for connector {connector_id}", str(e), - {"error_type": "GuildFetchError"} + {"error_type": "GuildFetchError"}, ) logger.error(f"Failed to get Discord guilds: {str(e)}", exc_info=True) await discord_client.close_bot() @@ -1431,7 +1717,7 @@ async def index_discord_messages( await task_logger.log_task_success( log_entry, f"No Discord guilds found for connector {connector_id}", - {"guilds_found": 0} + {"guilds_found": 0}, ) logger.info("No Discord guilds found to index") await discord_client.close_bot() @@ -1441,9 +1727,9 @@ async def index_discord_messages( await task_logger.log_task_progress( log_entry, f"Starting to process {len(guilds)} Discord guilds", - {"stage": "process_guilds", "total_guilds": len(guilds)} + {"stage": "process_guilds", "total_guilds": len(guilds)}, ) - + for guild in guilds: guild_id = guild["id"] guild_name = guild["name"] @@ -1467,13 +1753,19 @@ async def index_discord_messages( end_date=end_date_iso, ) except Exception as e: - logger.error(f"Failed to get messages for channel {channel_name}: {str(e)}") - skipped_channels.append(f"{guild_name}#{channel_name} (fetch error)") + logger.error( 
+ f"Failed to get messages for channel {channel_name}: {str(e)}" + ) + skipped_channels.append( + f"{guild_name}#{channel_name} (fetch error)" + ) documents_skipped += 1 continue if not messages: - logger.info(f"No messages found in channel {channel_name} for the specified date range.") + logger.info( + f"No messages found in channel {channel_name} for the specified date range." + ) documents_skipped += 1 continue @@ -1486,33 +1778,45 @@ async def index_discord_messages( formatted_messages.append(msg) if not formatted_messages: - logger.info(f"No valid messages found in channel {channel_name} after filtering.") + logger.info( + f"No valid messages found in channel {channel_name} after filtering." + ) documents_skipped += 1 continue # Convert messages to markdown format - channel_content = f"# Discord Channel: {guild_name} / {channel_name}\n\n" + channel_content = ( + f"# Discord Channel: {guild_name} / {channel_name}\n\n" + ) for msg in formatted_messages: user_name = msg.get("author_name", "Unknown User") timestamp = msg.get("created_at", "Unknown Time") text = msg.get("content", "") - channel_content += f"## {user_name} ({timestamp})\n\n{text}\n\n---\n\n" + channel_content += ( + f"## {user_name} ({timestamp})\n\n{text}\n\n---\n\n" + ) # Format document metadata metadata_sections = [ - ("METADATA", [ - f"GUILD_NAME: {guild_name}", - f"GUILD_ID: {guild_id}", - f"CHANNEL_NAME: {channel_name}", - f"CHANNEL_ID: {channel_id}", - f"MESSAGE_COUNT: {len(formatted_messages)}" - ]), - ("CONTENT", [ - "FORMAT: markdown", - "TEXT_START", - channel_content, - "TEXT_END" - ]) + ( + "METADATA", + [ + f"GUILD_NAME: {guild_name}", + f"GUILD_ID: {guild_id}", + f"CHANNEL_NAME: {channel_name}", + f"CHANNEL_ID: {channel_id}", + f"MESSAGE_COUNT: {len(formatted_messages)}", + ], + ), + ( + "CONTENT", + [ + "FORMAT: markdown", + "TEXT_START", + channel_content, + "TEXT_END", + ], + ), ] # Build the document string @@ -1523,31 +1827,43 @@ async def index_discord_messages( document_parts.extend(section_content) document_parts.append(f"") document_parts.append("") - combined_document_string = '\n'.join(document_parts) - content_hash = generate_content_hash(combined_document_string, search_space_id) + combined_document_string = "\n".join(document_parts) + content_hash = generate_content_hash( + combined_document_string, search_space_id + ) # Check if document with this content hash already exists existing_doc_by_hash_result = await session.execute( select(Document).where(Document.content_hash == content_hash) ) - existing_document_by_hash = existing_doc_by_hash_result.scalars().first() + existing_document_by_hash = ( + existing_doc_by_hash_result.scalars().first() + ) if existing_document_by_hash: - logger.info(f"Document with content hash {content_hash} already exists for channel {guild_name}#{channel_name}. Skipping processing.") + logger.info( + f"Document with content hash {content_hash} already exists for channel {guild_name}#{channel_name}. Skipping processing." 
+ ) documents_skipped += 1 continue # Get user's long context LLM user_llm = await get_user_long_context_llm(session, user_id) if not user_llm: - logger.error(f"No long context LLM configured for user {user_id}") - skipped_channels.append(f"{guild_name}#{channel_name} (no LLM configured)") + logger.error( + f"No long context LLM configured for user {user_id}" + ) + skipped_channels.append( + f"{guild_name}#{channel_name} (no LLM configured)" + ) documents_skipped += 1 continue # Generate summary using summary_chain summary_chain = SUMMARY_PROMPT_TEMPLATE | user_llm - summary_result = await summary_chain.ainvoke({"document": combined_document_string}) + summary_result = await summary_chain.ainvoke( + {"document": combined_document_string} + ) summary_content = summary_result.content summary_embedding = await asyncio.to_thread( config.embedding_model_instance.embed, summary_content @@ -1555,14 +1871,17 @@ async def index_discord_messages( # Process chunks raw_chunks = await asyncio.to_thread( - config.chunker_instance.chunk, - channel_content + config.chunker_instance.chunk, channel_content ) - chunk_texts = [chunk.text for chunk in raw_chunks if chunk.text.strip()] + chunk_texts = [ + chunk.text for chunk in raw_chunks if chunk.text.strip() + ] chunk_embeddings = await asyncio.to_thread( - lambda texts: [config.embedding_model_instance.embed(t) for t in texts], - chunk_texts + lambda texts: [ + config.embedding_model_instance.embed(t) for t in texts + ], + chunk_texts, ) chunks = [ @@ -1583,20 +1902,26 @@ async def index_discord_messages( "message_count": len(formatted_messages), "start_date": start_date_iso, "end_date": end_date_iso, - "indexed_at": datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S") + "indexed_at": datetime.now(timezone.utc).strftime( + "%Y-%m-%d %H:%M:%S" + ), }, content=summary_content, content_hash=content_hash, embedding=summary_embedding, - chunks=chunks + chunks=chunks, ) session.add(document) documents_indexed += 1 - logger.info(f"Successfully indexed new channel {guild_name}#{channel_name} with {len(formatted_messages)} messages") + logger.info( + f"Successfully indexed new channel {guild_name}#{channel_name} with {len(formatted_messages)} messages" + ) except Exception as e: - logger.error(f"Error processing guild {guild_name}: {str(e)}", exc_info=True) + logger.error( + f"Error processing guild {guild_name}: {str(e)}", exc_info=True + ) skipped_channels.append(f"{guild_name} (processing error)") documents_skipped += 1 continue @@ -1625,11 +1950,13 @@ async def index_discord_messages( "documents_skipped": documents_skipped, "skipped_channels_count": len(skipped_channels), "guilds_processed": len(guilds), - "result_message": result_message - } + "result_message": result_message, + }, ) - logger.info(f"Discord indexing completed: {documents_indexed} new channels, {documents_skipped} skipped") + logger.info( + f"Discord indexing completed: {documents_indexed} new channels, {documents_skipped} skipped" + ) return documents_indexed, result_message except SQLAlchemyError as db_error: @@ -1638,9 +1965,11 @@ async def index_discord_messages( log_entry, f"Database error during Discord indexing for connector {connector_id}", str(db_error), - {"error_type": "SQLAlchemyError"} + {"error_type": "SQLAlchemyError"}, + ) + logger.error( + f"Database error during Discord indexing: {str(db_error)}", exc_info=True ) - logger.error(f"Database error during Discord indexing: {str(db_error)}", exc_info=True) return 0, f"Database error: {str(db_error)}" except Exception as e: await 
session.rollback() @@ -1648,7 +1977,7 @@ async def index_discord_messages( log_entry, f"Failed to index Discord messages for connector {connector_id}", str(e), - {"error_type": type(e).__name__} + {"error_type": type(e).__name__}, ) logger.error(f"Failed to index Discord messages: {str(e)}", exc_info=True) return 0, f"Failed to index Discord messages: {str(e)}" @@ -1661,7 +1990,7 @@ async def index_jira_issues( user_id: str, start_date: str = None, end_date: str = None, - update_last_indexed: bool = True + update_last_indexed: bool = True, ) -> Tuple[int, Optional[str]]: """ Index Jira issues and comments. @@ -1685,13 +2014,20 @@ async def index_jira_issues( task_name="jira_issues_indexing", source="connector_indexing_task", message=f"Starting Jira issues indexing for connector {connector_id}", - metadata={"connector_id": connector_id, "user_id": str(user_id), "start_date": start_date, "end_date": end_date} + metadata={ + "connector_id": connector_id, + "user_id": str(user_id), + "start_date": start_date, + "end_date": end_date, + }, ) try: # Get the connector from the database result = await session.execute( - select(SearchSourceConnector).where(SearchSourceConnector.id == connector_id) + select(SearchSourceConnector).where( + SearchSourceConnector.id == connector_id + ) ) connector = result.scalar_one_or_none() @@ -1700,7 +2036,7 @@ async def index_jira_issues( log_entry, f"Connector with ID {connector_id} not found", "Connector not found", - {"error_type": "ConnectorNotFound"} + {"error_type": "ConnectorNotFound"}, ) return 0, f"Connector with ID {connector_id} not found" @@ -1713,7 +2049,7 @@ async def index_jira_issues( log_entry, f"Jira credentials not found in connector config for connector {connector_id}", "Missing Jira credentials", - {"error_type": "MissingCredentials"} + {"error_type": "MissingCredentials"}, ) return 0, "Jira credentials not found in connector config" @@ -1721,10 +2057,12 @@ async def index_jira_issues( await task_logger.log_task_progress( log_entry, f"Initializing Jira client for connector {connector_id}", - {"stage": "client_initialization"} + {"stage": "client_initialization"}, ) - jira_client = JiraConnector(base_url=jira_base_url, personal_access_token=jira_token) + jira_client = JiraConnector( + base_url=jira_base_url, personal_access_token=jira_token + ) # Calculate date range if start_date is None or end_date is None: @@ -1737,8 +2075,8 @@ async def index_jira_issues( # If never indexed, go back 30 days calculated_start_date = calculated_end_date - timedelta(days=30) - start_date_str = calculated_start_date.strftime('%Y-%m-%d') - end_date_str = calculated_end_date.strftime('%Y-%m-%d') + start_date_str = calculated_start_date.strftime("%Y-%m-%d") + end_date_str = calculated_end_date.strftime("%Y-%m-%d") else: start_date_str = start_date end_date_str = end_date @@ -1746,15 +2084,17 @@ async def index_jira_issues( await task_logger.log_task_progress( log_entry, f"Fetching Jira issues from {start_date_str} to {end_date_str}", - {"stage": "fetching_issues", "start_date": start_date_str, "end_date": end_date_str} + { + "stage": "fetching_issues", + "start_date": start_date_str, + "end_date": end_date_str, + }, ) # Get issues within date range try: issues, error = jira_client.get_issues_by_date_range( - start_date=start_date_str, - end_date=end_date_str, - include_comments=True + start_date=start_date_str, end_date=end_date_str, include_comments=True ) if error: @@ -1762,16 +2102,20 @@ async def index_jira_issues( # Don't treat "No issues found" as an error 
that should stop indexing if "No issues found" in error: - logger.info("No issues found is not a critical error, continuing with update") + logger.info( + "No issues found is not a critical error, continuing with update" + ) if update_last_indexed: connector.last_indexed_at = datetime.now() await session.commit() - logger.info(f"Updated last_indexed_at to {connector.last_indexed_at} despite no issues found") + logger.info( + f"Updated last_indexed_at to {connector.last_indexed_at} despite no issues found" + ) await task_logger.log_task_completion( log_entry, f"No Jira issues found in date range {start_date_str} to {end_date_str}", - {"indexed_count": 0} + {"indexed_count": 0}, ) return 0, None else: @@ -1779,7 +2123,7 @@ async def index_jira_issues( log_entry, f"Failed to get Jira issues: {error}", "API Error", - {"error_type": "APIError"} + {"error_type": "APIError"}, ) return 0, f"Failed to get Jira issues: {error}" @@ -1788,7 +2132,7 @@ async def index_jira_issues( await task_logger.log_task_progress( log_entry, f"Retrieved {len(issues)} issues from Jira API", - {"stage": "processing_issues", "issue_count": len(issues)} + {"stage": "processing_issues", "issue_count": len(issues)}, ) except Exception as e: @@ -1796,7 +2140,7 @@ async def index_jira_issues( log_entry, f"Error fetching Jira issues: {str(e)}", "Fetch Error", - {"error_type": type(e).__name__} + {"error_type": type(e).__name__}, ) logger.error(f"Error fetching Jira issues: {str(e)}", exc_info=True) return 0, f"Error fetching Jira issues: {str(e)}" @@ -1820,14 +2164,20 @@ async def index_jira_issues( "priority": formatted_issue.get("priority", ""), "issue_type": formatted_issue.get("issue_type", ""), "project": formatted_issue.get("project", ""), - "assignee": formatted_issue.get("assignee", {}).get("display_name", "") if formatted_issue.get("assignee") else "", - "reporter": formatted_issue.get("reporter", {}).get("display_name", ""), + "assignee": ( + formatted_issue.get("assignee", {}).get("display_name", "") + if formatted_issue.get("assignee") + else "" + ), + "reporter": formatted_issue.get("reporter", {}).get( + "display_name", "" + ), "created_at": formatted_issue.get("created_at", ""), "updated_at": formatted_issue.get("updated_at", ""), "comment_count": len(formatted_issue.get("comments", [])), "connector_id": connector_id, "source": "jira", - "base_url": jira_base_url + "base_url": jira_base_url, } # Generate content hash @@ -1840,7 +2190,9 @@ async def index_jira_issues( existing_doc = existing_doc_result.scalar_one_or_none() if existing_doc: - logger.debug(f"Document with hash {content_hash} already exists, skipping") + logger.debug( + f"Document with hash {content_hash} already exists, skipping" + ) continue # Create new document @@ -1850,34 +2202,47 @@ async def index_jira_issues( document_metadata=metadata, content=issue_markdown, content_hash=content_hash, - search_space_id=search_space_id + search_space_id=search_space_id, ) # Generate embedding - embedding = await config.embedding_model_instance.get_embedding(issue_markdown) + embedding = await config.embedding_model_instance.get_embedding( + issue_markdown + ) document.embedding = embedding session.add(document) await session.flush() # Flush to get the document ID # Create chunks for the document - chunks = await config.chunking_model_instance.chunk_document(issue_markdown) + chunks = await config.chunking_model_instance.chunk_document( + issue_markdown + ) for chunk_content in chunks: - chunk_embedding = await 
config.embedding_model_instance.get_embedding(chunk_content) + chunk_embedding = ( + await config.embedding_model_instance.get_embedding( + chunk_content + ) + ) chunk = Chunk( content=chunk_content, embedding=chunk_embedding, - document_id=document.id + document_id=document.id, ) session.add(chunk) indexed_count += 1 - logger.debug(f"Indexed Jira issue: {formatted_issue.get('key', 'Unknown')}") + logger.debug( + f"Indexed Jira issue: {formatted_issue.get('key', 'Unknown')}" + ) except Exception as e: - logger.error(f"Error processing Jira issue {issue.get('key', 'Unknown')}: {str(e)}", exc_info=True) + logger.error( + f"Error processing Jira issue {issue.get('key', 'Unknown')}: {str(e)}", + exc_info=True, + ) continue # Commit all changes @@ -1892,7 +2257,7 @@ async def index_jira_issues( await task_logger.log_task_completion( log_entry, f"Successfully indexed {indexed_count} Jira issues", - {"indexed_count": indexed_count} + {"indexed_count": indexed_count}, ) logger.info(f"Successfully indexed {indexed_count} Jira issues") @@ -1903,7 +2268,7 @@ async def index_jira_issues( log_entry, f"Failed to index Jira issues: {str(e)}", str(e), - {"error_type": type(e).__name__} + {"error_type": type(e).__name__}, ) logger.error(f"Failed to index Jira issues: {str(e)}", exc_info=True) return 0, f"Failed to index Jira issues: {str(e)}" From 6bced733b27f1417ef9265d51e8d719c4a017f47 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Thu, 24 Jul 2025 11:58:41 +0200 Subject: [PATCH 08/17] add jira connector implementation in the web --- ...cb2962bf19c1099cfe708e42daa0097f94976.json | 1 + .../agents/researcher/qna_agent/prompts.py | 1 - .../connectors/[connector_id]/edit/page.tsx | 397 +- .../connectors/[connector_id]/page.tsx | 122 +- .../connectors/add/jira-connector/page.tsx | 448 +++ .../[search_space_id]/connectors/add/page.tsx | 165 +- .../researcher/[chat_id]/page.tsx | 3512 +++++++++-------- .../components/chat/ConnectorComponents.tsx | 202 +- surfsense_web/lib/connectors/utils.ts | 25 +- 9 files changed, 2740 insertions(+), 2133 deletions(-) create mode 100644 node_modules/.cache/prettier/.prettier-caches/a2ecb2962bf19c1099cfe708e42daa0097f94976.json create mode 100644 surfsense_web/app/dashboard/[search_space_id]/connectors/add/jira-connector/page.tsx diff --git a/node_modules/.cache/prettier/.prettier-caches/a2ecb2962bf19c1099cfe708e42daa0097f94976.json b/node_modules/.cache/prettier/.prettier-caches/a2ecb2962bf19c1099cfe708e42daa0097f94976.json new file mode 100644 index 000000000..502adfcc4 --- /dev/null +++ b/node_modules/.cache/prettier/.prettier-caches/a2ecb2962bf19c1099cfe708e42daa0097f94976.json @@ -0,0 +1 @@ +{"2d0ec64d93969318101ee479b664221b32241665":{"files":{"surfsense_web/lib/connectors/utils.ts":["RXwmTdu3JAyxa1ApFuYJiSRHfZo=",true],"surfsense_web/app/dashboard/[search_space_id]/connectors/add/page.tsx":["jZynb8hLm5uq1viyFK9UMcRClD8=",true],"surfsense_web/app/dashboard/[search_space_id]/researcher/[chat_id]/page.tsx":["LEFIcQIvBUtbTE9PuuJI0WqzdVw=",true]},"modified":1753351069225}} \ No newline at end of file diff --git a/surfsense_backend/app/agents/researcher/qna_agent/prompts.py b/surfsense_backend/app/agents/researcher/qna_agent/prompts.py index d726dfd1d..0c5ebc158 100644 --- a/surfsense_backend/app/agents/researcher/qna_agent/prompts.py +++ b/surfsense_backend/app/agents/researcher/qna_agent/prompts.py @@ -17,7 +17,6 @@ You are SurfSense, an advanced AI research assistant that provides detailed, wel - LINEAR_CONNECTOR: "Linear project issues and discussions" (personal project 
management) - JIRA_CONNECTOR: "Jira project issues, tickets, and comments" (personal project tracking) - DISCORD_CONNECTOR: "Discord server conversations and shared content" (personal community communications) -- DISCORD_CONNECTOR: "Discord server messages and channels" (personal community interactions) - TAVILY_API: "Tavily search API results" (personalized search results) - LINKUP_API: "Linkup search API results" (personalized search results) diff --git a/surfsense_web/app/dashboard/[search_space_id]/connectors/[connector_id]/edit/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/connectors/[connector_id]/edit/page.tsx index 34db58f67..4292b7efa 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/connectors/[connector_id]/edit/page.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/connectors/[connector_id]/edit/page.tsx @@ -9,12 +9,12 @@ import { ArrowLeft, Check, Loader2, Github } from "lucide-react"; import { Form } from "@/components/ui/form"; import { Button } from "@/components/ui/button"; import { - Card, - CardContent, - CardDescription, - CardFooter, - CardHeader, - CardTitle, + Card, + CardContent, + CardDescription, + CardFooter, + CardHeader, + CardTitle, } from "@/components/ui/card"; // Import Utils, Types, Hook, and Components @@ -27,201 +27,220 @@ import { EditSimpleTokenForm } from "@/components/editConnector/EditSimpleTokenF import { getConnectorIcon } from "@/components/chat"; export default function EditConnectorPage() { - const router = useRouter(); - const params = useParams(); - const searchSpaceId = params.search_space_id as string; - // Ensure connectorId is parsed safely - const connectorIdParam = params.connector_id as string; - const connectorId = connectorIdParam ? parseInt(connectorIdParam, 10) : NaN; + const router = useRouter(); + const params = useParams(); + const searchSpaceId = params.search_space_id as string; + // Ensure connectorId is parsed safely + const connectorIdParam = params.connector_id as string; + const connectorId = connectorIdParam ? 
parseInt(connectorIdParam, 10) : NaN; - // Use the custom hook to manage state and logic - const { - connectorsLoading, - connector, - isSaving, - editForm, - patForm, // Needed for GitHub child component - handleSaveChanges, - // GitHub specific props for the child component - editMode, - setEditMode, // Pass down if needed by GitHub component - originalPat, - currentSelectedRepos, - fetchedRepos, - setFetchedRepos, - newSelectedRepos, - setNewSelectedRepos, - isFetchingRepos, - handleFetchRepositories, - handleRepoSelectionChange, - } = useConnectorEditPage(connectorId, searchSpaceId); + // Use the custom hook to manage state and logic + const { + connectorsLoading, + connector, + isSaving, + editForm, + patForm, // Needed for GitHub child component + handleSaveChanges, + // GitHub specific props for the child component + editMode, + setEditMode, // Pass down if needed by GitHub component + originalPat, + currentSelectedRepos, + fetchedRepos, + setFetchedRepos, + newSelectedRepos, + setNewSelectedRepos, + isFetchingRepos, + handleFetchRepositories, + handleRepoSelectionChange, + } = useConnectorEditPage(connectorId, searchSpaceId); - // Redirect if connectorId is not a valid number after parsing - useEffect(() => { - if (isNaN(connectorId)) { - toast.error("Invalid Connector ID."); - router.push(`/dashboard/${searchSpaceId}/connectors`); - } - }, [connectorId, router, searchSpaceId]); + // Redirect if connectorId is not a valid number after parsing + useEffect(() => { + if (isNaN(connectorId)) { + toast.error("Invalid Connector ID."); + router.push(`/dashboard/${searchSpaceId}/connectors`); + } + }, [connectorId, router, searchSpaceId]); - // Loading State - if (connectorsLoading || !connector) { - // Handle NaN case before showing skeleton - if (isNaN(connectorId)) return null; - return ; - } + // Loading State + if (connectorsLoading || !connector) { + // Handle NaN case before showing skeleton + if (isNaN(connectorId)) return null; + return ; + } - // Main Render using data/handlers from the hook - return ( -
- + // Main Render using data/handlers from the hook + return ( +
+ - - - - - {getConnectorIcon(connector.connector_type)} - Edit {getConnectorTypeDisplay(connector.connector_type)} Connector - - - Modify connector name and configuration. - - + + + + + {getConnectorIcon(connector.connector_type)} + Edit {getConnectorTypeDisplay(connector.connector_type)} Connector + + + Modify connector name and configuration. + + -
- {/* Pass hook's handleSaveChanges */} - - - {/* Pass form control from hook */} - + + {/* Pass hook's handleSaveChanges */} + + + {/* Pass form control from hook */} + -
+
-

Configuration

+

Configuration

- {/* == GitHub == */} - {connector.connector_type === "GITHUB_CONNECTOR" && ( - - )} + {/* == GitHub == */} + {connector.connector_type === "GITHUB_CONNECTOR" && ( + + )} - {/* == Slack == */} - {connector.connector_type === "SLACK_CONNECTOR" && ( - - )} - {/* == Notion == */} - {connector.connector_type === "NOTION_CONNECTOR" && ( - - )} - {/* == Serper == */} - {connector.connector_type === "SERPER_API" && ( - - )} - {/* == Tavily == */} - {connector.connector_type === "TAVILY_API" && ( - - )} + {/* == Slack == */} + {connector.connector_type === "SLACK_CONNECTOR" && ( + + )} + {/* == Notion == */} + {connector.connector_type === "NOTION_CONNECTOR" && ( + + )} + {/* == Serper == */} + {connector.connector_type === "SERPER_API" && ( + + )} + {/* == Tavily == */} + {connector.connector_type === "TAVILY_API" && ( + + )} - {/* == Linear == */} - {connector.connector_type === "LINEAR_CONNECTOR" && ( - - )} + {/* == Linear == */} + {connector.connector_type === "LINEAR_CONNECTOR" && ( + + )} - {/* == Linkup == */} - {connector.connector_type === "LINKUP_API" && ( - - )} + {/* == Jira == */} + {connector.connector_type === "JIRA_CONNECTOR" && ( +
+ + +
+ )} - {/* == Discord == */} - {connector.connector_type === "DISCORD_CONNECTOR" && ( - - )} + {/* == Linkup == */} + {connector.connector_type === "LINKUP_API" && ( + + )} -
- - - - - -
-
-
- ); + {/* == Discord == */} + {connector.connector_type === "DISCORD_CONNECTOR" && ( + + )} + + + + + + + + +
+ ); } diff --git a/surfsense_web/app/dashboard/[search_space_id]/connectors/[connector_id]/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/connectors/[connector_id]/page.tsx index 898644429..9ed3f94b9 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/connectors/[connector_id]/page.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/connectors/[connector_id]/page.tsx @@ -9,7 +9,10 @@ import * as z from "zod"; import { toast } from "sonner"; import { ArrowLeft, Check, Info, Loader2 } from "lucide-react"; -import { useSearchSourceConnectors, SearchSourceConnector } from "@/hooks/useSearchSourceConnectors"; +import { + useSearchSourceConnectors, + SearchSourceConnector, +} from "@/hooks/useSearchSourceConnectors"; import { Form, FormControl, @@ -28,11 +31,7 @@ import { CardHeader, CardTitle, } from "@/components/ui/card"; -import { - Alert, - AlertDescription, - AlertTitle, -} from "@/components/ui/alert"; +import { Alert, AlertDescription, AlertTitle } from "@/components/ui/alert"; // Define the form schema with Zod const apiConnectorFormSchema = z.object({ @@ -47,13 +46,15 @@ const apiConnectorFormSchema = z.object({ // Helper function to get connector type display name const getConnectorTypeDisplay = (type: string): string => { const typeMap: Record = { - "SERPER_API": "Serper API", - "TAVILY_API": "Tavily API", - "SLACK_CONNECTOR": "Slack Connector", - "NOTION_CONNECTOR": "Notion Connector", - "GITHUB_CONNECTOR": "GitHub Connector", - "DISCORD_CONNECTOR": "Discord Connector", - "LINKUP_API": "Linkup", + SERPER_API: "Serper API", + TAVILY_API: "Tavily API", + SLACK_CONNECTOR: "Slack Connector", + NOTION_CONNECTOR: "Notion Connector", + GITHUB_CONNECTOR: "GitHub Connector", + LINEAR_CONNECTOR: "Linear Connector", + JIRA_CONNECTOR: "Jira Connector", + DISCORD_CONNECTOR: "Discord Connector", + LINKUP_API: "Linkup", // Add other connector types here as needed }; return typeMap[type] || type; @@ -67,9 +68,11 @@ export default function EditConnectorPage() { const params = useParams(); const searchSpaceId = params.search_space_id as string; const connectorId = parseInt(params.connector_id as string, 10); - + const { connectors, updateConnector } = useSearchSourceConnectors(); - const [connector, setConnector] = useState(null); + const [connector, setConnector] = useState( + null, + ); const [isLoading, setIsLoading] = useState(true); const [isSubmitting, setIsSubmitting] = useState(false); // console.log("connector", connector); @@ -85,24 +88,24 @@ export default function EditConnectorPage() { // Get API key field name based on connector type const getApiKeyFieldName = (connectorType: string): string => { const fieldMap: Record = { - "SERPER_API": "SERPER_API_KEY", - "TAVILY_API": "TAVILY_API_KEY", - "SLACK_CONNECTOR": "SLACK_BOT_TOKEN", - "NOTION_CONNECTOR": "NOTION_INTEGRATION_TOKEN", - "GITHUB_CONNECTOR": "GITHUB_PAT", - "DISCORD_CONNECTOR": "DISCORD_BOT_TOKEN", - "LINKUP_API": "LINKUP_API_KEY" + SERPER_API: "SERPER_API_KEY", + TAVILY_API: "TAVILY_API_KEY", + SLACK_CONNECTOR: "SLACK_BOT_TOKEN", + NOTION_CONNECTOR: "NOTION_INTEGRATION_TOKEN", + GITHUB_CONNECTOR: "GITHUB_PAT", + DISCORD_CONNECTOR: "DISCORD_BOT_TOKEN", + LINKUP_API: "LINKUP_API_KEY", }; return fieldMap[connectorType] || ""; }; // Find connector in the list useEffect(() => { - const currentConnector = connectors.find(c => c.id === connectorId); - + const currentConnector = connectors.find((c) => c.id === connectorId); + if (currentConnector) { setConnector(currentConnector); - + // Check if connector type is 
supported const apiKeyField = getApiKeyFieldName(currentConnector.connector_type); if (apiKeyField) { @@ -115,7 +118,7 @@ export default function EditConnectorPage() { toast.error("This connector type is not supported for editing"); router.push(`/dashboard/${searchSpaceId}/connectors`); } - + setIsLoading(false); } else if (!isLoading && connectors.length > 0) { // If connectors are loaded but this one isn't found @@ -127,11 +130,11 @@ export default function EditConnectorPage() { // Handle form submission const onSubmit = async (values: ApiConnectorFormValues) => { if (!connector) return; - + setIsSubmitting(true); try { const apiKeyField = getApiKeyFieldName(connector.connector_type); - + // Only update the API key if a new one was provided const updatedConfig = { ...connector.config }; if (values.api_key) { @@ -150,7 +153,9 @@ export default function EditConnectorPage() { router.push(`/dashboard/${searchSpaceId}/connectors`); } catch (error) { console.error("Error updating connector:", error); - toast.error(error instanceof Error ? error.message : "Failed to update connector"); + toast.error( + error instanceof Error ? error.message : "Failed to update connector", + ); } finally { setIsSubmitting(false); } @@ -186,24 +191,30 @@ export default function EditConnectorPage() { - Edit {connector ? getConnectorTypeDisplay(connector.connector_type) : ""} Connector + Edit{" "} + {connector + ? getConnectorTypeDisplay(connector.connector_type) + : ""}{" "} + Connector - - Update your connector settings. - + Update your connector settings. API Key Security - Your API key is stored securely. For security reasons, we don't display your existing API key. - If you don't update the API key field, your existing key will be preserved. + Your API key is stored securely. For security reasons, we don't + display your existing API key. If you don't update the API key + field, your existing key will be preserved.
- + ( - {connector?.connector_type === "SLACK_CONNECTOR" - ? "Slack Bot Token" - : connector?.connector_type === "NOTION_CONNECTOR" - ? "Notion Integration Token" + {connector?.connector_type === "SLACK_CONNECTOR" + ? "Slack Bot Token" + : connector?.connector_type === "NOTION_CONNECTOR" + ? "Notion Integration Token" : connector?.connector_type === "GITHUB_CONNECTOR" ? "GitHub Personal Access Token (PAT)" : connector?.connector_type === "LINKUP_API" @@ -238,27 +249,28 @@ export default function EditConnectorPage() { : "API Key"} - - {connector?.connector_type === "SLACK_CONNECTOR" - ? "Enter a new Slack Bot Token or leave blank to keep your existing token." - : connector?.connector_type === "NOTION_CONNECTOR" - ? "Enter a new Notion Integration Token or leave blank to keep your existing token." + {connector?.connector_type === "SLACK_CONNECTOR" + ? "Enter a new Slack Bot Token or leave blank to keep your existing token." + : connector?.connector_type === "NOTION_CONNECTOR" + ? "Enter a new Notion Integration Token or leave blank to keep your existing token." : connector?.connector_type === "GITHUB_CONNECTOR" ? "Enter a new GitHub PAT or leave blank to keep your existing token." : connector?.connector_type === "LINKUP_API" @@ -271,8 +283,8 @@ export default function EditConnectorPage() { />
-
); -} +} diff --git a/surfsense_web/app/dashboard/[search_space_id]/connectors/add/jira-connector/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/connectors/add/jira-connector/page.tsx new file mode 100644 index 000000000..625adfa0d --- /dev/null +++ b/surfsense_web/app/dashboard/[search_space_id]/connectors/add/jira-connector/page.tsx @@ -0,0 +1,448 @@ +"use client"; + +import { useState } from "react"; +import { useRouter, useParams } from "next/navigation"; +import { motion } from "framer-motion"; +import { zodResolver } from "@hookform/resolvers/zod"; +import { useForm } from "react-hook-form"; +import * as z from "zod"; +import { toast } from "sonner"; +import { ArrowLeft, Check, Info, Loader2 } from "lucide-react"; + +import { useSearchSourceConnectors } from "@/hooks/useSearchSourceConnectors"; +import { + Form, + FormControl, + FormDescription, + FormField, + FormItem, + FormLabel, + FormMessage, +} from "@/components/ui/form"; +import { Input } from "@/components/ui/input"; +import { Button } from "@/components/ui/button"; +import { + Card, + CardContent, + CardDescription, + CardFooter, + CardHeader, + CardTitle, +} from "@/components/ui/card"; +import { Alert, AlertDescription, AlertTitle } from "@/components/ui/alert"; +import { + Accordion, + AccordionContent, + AccordionItem, + AccordionTrigger, +} from "@/components/ui/accordion"; +import { Tabs, TabsContent, TabsList, TabsTrigger } from "@/components/ui/tabs"; + +// Define the form schema with Zod +const jiraConnectorFormSchema = z.object({ + name: z.string().min(3, { + message: "Connector name must be at least 3 characters.", + }), + base_url: z + .string() + .url({ + message: + "Please enter a valid Jira URL (e.g., https://yourcompany.atlassian.net)", + }) + .refine( + (url) => { + return url.includes("atlassian.net") || url.includes("jira"); + }, + { + message: "Please enter a valid Jira instance URL", + }, + ), + personal_access_token: z.string().min(10, { + message: "Jira Personal Access Token is required and must be valid.", + }), +}); + +// Define the type for the form values +type JiraConnectorFormValues = z.infer; + +export default function JiraConnectorPage() { + const router = useRouter(); + const params = useParams(); + const searchSpaceId = params.search_space_id as string; + const [isSubmitting, setIsSubmitting] = useState(false); + const { createConnector } = useSearchSourceConnectors(); + + // Initialize the form + const form = useForm({ + resolver: zodResolver(jiraConnectorFormSchema), + defaultValues: { + name: "Jira Connector", + base_url: "", + personal_access_token: "", + }, + }); + + // Handle form submission + const onSubmit = async (values: JiraConnectorFormValues) => { + setIsSubmitting(true); + try { + await createConnector({ + name: values.name, + connector_type: "JIRA_CONNECTOR", + config: { + JIRA_BASE_URL: values.base_url, + JIRA_PERSONAL_ACCESS_TOKEN: values.personal_access_token, + }, + is_indexable: true, + last_indexed_at: null, + }); + + toast.success("Jira connector created successfully!"); + + // Navigate back to connectors page + router.push(`/dashboard/${searchSpaceId}/connectors`); + } catch (error) { + console.error("Error creating connector:", error); + toast.error( + error instanceof Error ? error.message : "Failed to create connector", + ); + } finally { + setIsSubmitting(false); + } + }; + + return ( +
+ Tabs: Connect | Documentation
+ Connect Jira Instance
+ Integrate with Jira to search and retrieve information from your issues, tickets, and comments. This connector can index your Jira content for search.
+ Jira Personal Access Token Required: You'll need a Jira Personal Access Token to use this connector. You can create one from Atlassian Account Settings.
+ Form field: Connector Name. A friendly name to identify this connector.
+ Form field: Jira Instance URL. Your Jira instance URL. For Atlassian Cloud, this is typically https://yourcompany.atlassian.net
+ Form field: Personal Access Token. Your Jira Personal Access Token will be encrypted and stored securely.
+ What you get with Jira integration:
+ • Search through all your Jira issues and tickets
+ • Access issue descriptions, comments, and full discussion threads
+ • Connect your team's project management directly to your search space
+ • Keep your search results up-to-date with the latest Jira content
+ • Index your Jira issues for enhanced search capabilities
+ • Search by issue keys, status, priority, and assignee information
+ Jira Connector Documentation
+ Learn how to set up and use the Jira connector to index your project management data.
+ How it works
+ The Jira connector uses the Jira REST API to fetch all issues and comments that the Personal Access Token has access to within your Jira instance.
+ • For follow-up indexing runs, the connector retrieves only issues and comments that have been updated since the last indexing attempt.
+ • Indexing is configured to run periodically, so updates should appear in your search results within minutes.
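The date-windowed fetch itself lives in the backend connector module rather than in this page. As a rough sketch (the function name is assumed, not code from this patch), the incremental query handed to Jira's search API for one indexing window could be built like this:

    // Sketch: JQL for one incremental indexing window. Jira's "updated"
    // field accepts YYYY-MM-DD date literals, so a [startDate, endDate]
    // window selects issues touched in that range.
    function buildIncrementalJql(startDate: string, endDate: string): string {
      return `updated >= "${startDate}" AND updated <= "${endDate}" ORDER BY updated ASC`;
    }

    // e.g. buildIncrementalJql("2025-06-24", "2025-07-24")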
+
+ Authorization
+ Read-Only Access is Sufficient: you only need read access for this connector to work. The Personal Access Token will only be used to read your Jira data.
+ Step 1: Create a Personal Access Token
+ 1. Log in to your Atlassian account
+ 2. Navigate to https://id.atlassian.com/manage-profile/security/api-tokens
+ 3. Click "Create API token"
+ 4. Enter a label for your token (like "SurfSense Connector")
+ 5. Click "Create"
+ 6. Copy the generated token, as it will only be shown once
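Before saving the connector, a freshly created token can be sanity-checked with a single call to Jira's `myself` endpoint. This is an illustrative sketch, not part of the patch; it assumes the same Bearer header scheme the backend connector module sends:

    // Sketch: returns true when the base URL and token resolve to a user.
    async function verifyJiraToken(baseUrl: string, token: string): Promise<boolean> {
      const response = await fetch(`${baseUrl.replace(/\/+$/, "")}/rest/api/3/myself`, {
        headers: { Authorization: `Bearer ${token}`, Accept: "application/json" },
      });
      return response.ok; // 401/403 indicate a bad or expired token
    }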
+
+ +
+

+ Step 2: Grant necessary access
+ The Personal Access Token will have access to all projects and issues that your user account can see. Make sure your account has appropriate permissions for the projects you want to index.
+ Data Privacy: only issues, comments, and basic metadata will be indexed. Jira attachments and linked files are not indexed by this connector.
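To see exactly which projects a token can reach before indexing, Jira Cloud's project search endpoint can be queried directly. A minimal sketch under the same Bearer-auth assumption as above (not part of this patch):

    // Sketch: list the project keys visible to the token.
    async function listVisibleProjects(baseUrl: string, token: string): Promise<string[]> {
      const response = await fetch(`${baseUrl.replace(/\/+$/, "")}/rest/api/3/project/search`, {
        headers: { Authorization: `Bearer ${token}`, Accept: "application/json" },
      });
      if (!response.ok) {
        throw new Error(`Project lookup failed with status ${response.status}`);
      }
      // First page only; the endpoint is paginated via startAt/maxResults.
      const data = (await response.json()) as { values: Array<{ key: string }> };
      return data.values.map((project) => project.key);
    }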
+
+ Indexing
+ 1. Navigate to the Connector Dashboard and select the Jira Connector.
+ 2. Enter your Jira Instance URL (e.g., https://yourcompany.atlassian.net).
+ 3. Place your Personal Access Token in the form field.
+ 4. Click Connect to establish the connection.
+ 5. Once connected, your Jira issues will be indexed automatically.
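Submitting the form issues the createConnector call shown earlier in this patch; for reference, the record it creates looks roughly like this (token value is a placeholder):

    // Shape of the connector record created by the form's onSubmit handler.
    const jiraConnectorPayload = {
      name: "Jira Connector",
      connector_type: "JIRA_CONNECTOR",
      config: {
        JIRA_BASE_URL: "https://yourcompany.atlassian.net",
        JIRA_PERSONAL_ACCESS_TOKEN: "<personal-access-token>",
      },
      is_indexable: true,
      last_indexed_at: null,
    };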
+ What Gets Indexed
+ The Jira connector indexes the following data:
+ • Issue keys and summaries (e.g., PROJ-123)
+ • Issue descriptions
+ • Issue comments and discussion threads
+ • Issue status, priority, and type information
+ • Assignee and reporter information
+ • Project information
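On the backend, each indexed issue becomes one document whose metadata carries the fields below. The keys match the metadata dict built in the index_jira_issues hunk earlier in this series; the values here are illustrative only:

    // Sketch of the per-issue document metadata written by the indexing task.
    const exampleIssueMetadata = {
      status: "In Progress",
      priority: "High",
      issue_type: "Bug",
      project: "PROJ",
      assignee: "Jane Doe", // display name; empty string when unassigned
      reporter: "John Smith",
      created_at: "2025-07-01T09:00:00.000+0000",
      updated_at: "2025-07-20T14:30:00.000+0000",
      comment_count: 4,
      connector_id: 42,
      source: "jira",
      base_url: "https://yourcompany.atlassian.net",
    };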
+
+ ); +} diff --git a/surfsense_web/app/dashboard/[search_space_id]/connectors/add/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/connectors/add/page.tsx index afcc0af00..3d0e59d9b 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/connectors/add/page.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/connectors/add/page.tsx @@ -1,8 +1,17 @@ "use client"; import { Badge } from "@/components/ui/badge"; import { Button } from "@/components/ui/button"; -import { Card, CardContent, CardFooter, CardHeader } from "@/components/ui/card"; -import { Collapsible, CollapsibleContent, CollapsibleTrigger } from "@/components/ui/collapsible"; +import { + Card, + CardContent, + CardFooter, + CardHeader, +} from "@/components/ui/card"; +import { + Collapsible, + CollapsibleContent, + CollapsibleTrigger, +} from "@/components/ui/collapsible"; import { IconBrandDiscord, IconBrandGithub, @@ -67,23 +76,26 @@ const connectorCategories: ConnectorCategory[] = [ { id: "slack-connector", title: "Slack", - description: "Connect to your Slack workspace to access messages and channels.", + description: + "Connect to your Slack workspace to access messages and channels.", icon: , status: "available", }, { id: "ms-teams", title: "Microsoft Teams", - description: "Connect to Microsoft Teams to access your team's conversations.", + description: + "Connect to Microsoft Teams to access your team's conversations.", icon: , status: "coming-soon", }, { id: "discord-connector", title: "Discord", - description: "Connect to Discord servers to access messages and channels.", + description: + "Connect to Discord servers to access messages and channels.", icon: , - status: "available" + status: "available", }, ], }, @@ -94,16 +106,18 @@ const connectorCategories: ConnectorCategory[] = [ { id: "linear-connector", title: "Linear", - description: "Connect to Linear to search issues, comments and project data.", + description: + "Connect to Linear to search issues, comments and project data.", icon: , status: "available", }, { id: "jira-connector", title: "Jira", - description: "Connect to Jira to search issues, tickets and project data.", + description: + "Connect to Jira to search issues, tickets and project data.", icon: , - status: "coming-soon", + status: "available", }, ], }, @@ -114,14 +128,16 @@ const connectorCategories: ConnectorCategory[] = [ { id: "notion-connector", title: "Notion", - description: "Connect to your Notion workspace to access pages and databases.", + description: + "Connect to your Notion workspace to access pages and databases.", icon: , status: "available", }, { id: "github-connector", title: "GitHub", - description: "Connect a GitHub PAT to index code and docs from accessible repositories.", + description: + "Connect a GitHub PAT to index code and docs from accessible repositories.", icon: , status: "available", }, @@ -141,7 +157,8 @@ const connectorCategories: ConnectorCategory[] = [ { id: "zoom", title: "Zoom", - description: "Connect to Zoom to access meeting recordings and transcripts.", + description: + "Connect to Zoom to access meeting recordings and transcripts.", icon: , status: "coming-soon", }, @@ -152,7 +169,7 @@ const connectorCategories: ConnectorCategory[] = [ // Animation variants const fadeIn = { hidden: { opacity: 0 }, - visible: { opacity: 1, transition: { duration: 0.4 } } + visible: { opacity: 1, transition: { duration: 0.4 } }, }; const staggerContainer = { @@ -160,43 +177,49 @@ const staggerContainer = { visible: { opacity: 1, transition: { - staggerChildren: 
0.1 - } - } + staggerChildren: 0.1, + }, + }, }; const cardVariants = { hidden: { opacity: 0, y: 20 }, - visible: { - opacity: 1, + visible: { + opacity: 1, y: 0, - transition: { + transition: { type: "spring", stiffness: 260, - damping: 20 - } + damping: 20, + }, }, - hover: { + hover: { scale: 1.02, - boxShadow: "0 10px 15px -3px rgba(0, 0, 0, 0.1), 0 4px 6px -2px rgba(0, 0, 0, 0.05)", - transition: { + boxShadow: + "0 10px 15px -3px rgba(0, 0, 0, 0.1), 0 4px 6px -2px rgba(0, 0, 0, 0.05)", + transition: { type: "spring", stiffness: 400, - damping: 10 - } - } + damping: 10, + }, + }, }; export default function ConnectorsPage() { const params = useParams(); const searchSpaceId = params.search_space_id as string; - const [expandedCategories, setExpandedCategories] = useState(["search-engines", "knowledge-bases", "project-management", "team-chats"]); + const [expandedCategories, setExpandedCategories] = useState([ + "search-engines", + "knowledge-bases", + "project-management", + "team-chats", + ]); const toggleCategory = (categoryId: string) => { - setExpandedCategories(prev => - prev.includes(categoryId) - ? prev.filter(id => id !== categoryId) - : [...prev, categoryId] + setExpandedCategories((prev) => + prev.includes(categoryId) + ? prev.filter((id) => id !== categoryId) + : [...prev, categoryId], ); }; @@ -205,9 +228,9 @@ export default function ConnectorsPage() { @@ -215,18 +238,19 @@ export default function ConnectorsPage() { Connect Your Tools

- Integrate with your favorite services to enhance your research capabilities. + Integrate with your favorite services to enhance your research + capabilities.

- {connectorCategories.map((category) => ( -

{category.title}

- - + -
-

{connector.title}

+

+ {connector.title} +

{connector.status === "coming-soon" && ( - + Coming soon )} {connector.status === "connected" && ( - + Connected )}
- +

{connector.description}

- + - {connector.status === 'available' && ( - - )} - {connector.status === 'coming-soon' && ( - )} - {connector.status === 'connected' && ( - )} diff --git a/surfsense_web/app/dashboard/[search_space_id]/researcher/[chat_id]/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/researcher/[chat_id]/page.tsx index 8a0bde74f..e92db282e 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/researcher/[chat_id]/page.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/researcher/[chat_id]/page.tsx @@ -1,77 +1,77 @@ "use client"; import React, { - useRef, - useEffect, - useState, - useMemo, - useCallback, + useRef, + useEffect, + useState, + useMemo, + useCallback, } from "react"; import { useChat } from "@ai-sdk/react"; import { useParams } from "next/navigation"; import { - Loader2, - X, - Search, - ExternalLink, - ChevronLeft, - ChevronRight, - Check, - ArrowDown, - CircleUser, - Database, - SendHorizontal, - FileText, - Grid3x3, - FolderOpen, - Upload, - ChevronDown, - Filter, - Brain, - Zap, + Loader2, + X, + Search, + ExternalLink, + ChevronLeft, + ChevronRight, + Check, + ArrowDown, + CircleUser, + Database, + SendHorizontal, + FileText, + Grid3x3, + FolderOpen, + Upload, + ChevronDown, + Filter, + Brain, + Zap, } from "lucide-react"; import { Card, CardContent, CardHeader, CardTitle } from "@/components/ui/card"; import { Button } from "@/components/ui/button"; import { Input } from "@/components/ui/input"; import { Tabs, TabsContent, TabsList, TabsTrigger } from "@/components/ui/tabs"; import { - Dialog, - DialogContent, - DialogDescription, - DialogHeader, - DialogTitle, - DialogTrigger, - DialogFooter, + Dialog, + DialogContent, + DialogDescription, + DialogHeader, + DialogTitle, + DialogTrigger, + DialogFooter, } from "@/components/ui/dialog"; import { - DropdownMenu, - DropdownMenuContent, - DropdownMenuItem, - DropdownMenuLabel, - DropdownMenuSeparator, - DropdownMenuTrigger, + DropdownMenu, + DropdownMenuContent, + DropdownMenuItem, + DropdownMenuLabel, + DropdownMenuSeparator, + DropdownMenuTrigger, } from "@/components/ui/dropdown-menu"; import { - Select, - SelectContent, - SelectItem, - SelectTrigger, - SelectValue, + Select, + SelectContent, + SelectItem, + SelectTrigger, + SelectValue, } from "@/components/ui/select"; import { Badge } from "@/components/ui/badge"; import { Skeleton } from "@/components/ui/skeleton"; import { - ConnectorButton as ConnectorButtonComponent, - getConnectorIcon, - getFilteredSources as getFilteredSourcesUtil, - getPaginatedDialogSources as getPaginatedDialogSourcesUtil, - useScrollToBottom, - updateScrollIndicators as updateScrollIndicatorsUtil, - useScrollIndicators, - scrollTabsLeft as scrollTabsLeftUtil, - scrollTabsRight as scrollTabsRightUtil, - Source, - ResearchMode, - ResearchModeControl, + ConnectorButton as ConnectorButtonComponent, + getConnectorIcon, + getFilteredSources as getFilteredSourcesUtil, + getPaginatedDialogSources as getPaginatedDialogSourcesUtil, + useScrollToBottom, + updateScrollIndicators as updateScrollIndicatorsUtil, + useScrollIndicators, + scrollTabsLeft as scrollTabsLeftUtil, + scrollTabsRight as scrollTabsRightUtil, + Source, + ResearchMode, + ResearchModeControl, } from "@/components/chat"; import { MarkdownViewer } from "@/components/markdown-viewer"; import { Logo } from "@/components/Logo"; @@ -80,446 +80,447 @@ import { useDocuments } from "@/hooks/use-documents"; import { useLLMConfigs, useLLMPreferences } from "@/hooks/use-llm-configs"; interface SourceItem { - id: number; - title: string; - 
description: string; - url: string; - connectorType?: string; + id: number; + title: string; + description: string; + url: string; + connectorType?: string; } interface ConnectorSource { - id: number; - name: string; - type: string; - sources: SourceItem[]; + id: number; + name: string; + type: string; + sources: SourceItem[]; } type DocumentType = - | "EXTENSION" - | "CRAWLED_URL" - | "SLACK_CONNECTOR" - | "NOTION_CONNECTOR" - | "FILE" - | "YOUTUBE_VIDEO" - | "GITHUB_CONNECTOR" - | "LINEAR_CONNECTOR" - | "DISCORD_CONNECTOR"; + | "EXTENSION" + | "CRAWLED_URL" + | "SLACK_CONNECTOR" + | "NOTION_CONNECTOR" + | "FILE" + | "YOUTUBE_VIDEO" + | "GITHUB_CONNECTOR" + | "LINEAR_CONNECTOR" + | "JIRA_CONNECTOR" + | "DISCORD_CONNECTOR"; /** * Skeleton loader for document items */ const DocumentSkeleton = () => ( -
- -
- - - -
- -
+
+ +
+ + + +
+ +
); /** * Enhanced document type filter dropdown */ const DocumentTypeFilter = ({ - value, - onChange, - counts, + value, + onChange, + counts, }: { - value: DocumentType | "ALL"; - onChange: (value: DocumentType | "ALL") => void; - counts: Record; + value: DocumentType | "ALL"; + onChange: (value: DocumentType | "ALL") => void; + counts: Record; }) => { - const getTypeLabel = (type: DocumentType | "ALL") => { - if (type === "ALL") return "All Types"; - return type - .replace(/_/g, " ") - .toLowerCase() - .replace(/\b\w/g, (l) => l.toUpperCase()); - }; + const getTypeLabel = (type: DocumentType | "ALL") => { + if (type === "ALL") return "All Types"; + return type + .replace(/_/g, " ") + .toLowerCase() + .replace(/\b\w/g, (l) => l.toUpperCase()); + }; - const getTypeIcon = (type: DocumentType | "ALL") => { - if (type === "ALL") return ; - return getConnectorIcon(type); - }; + const getTypeIcon = (type: DocumentType | "ALL") => { + if (type === "ALL") return ; + return getConnectorIcon(type); + }; - return ( - - - - - - Document Types - - {Object.entries(counts).map(([type, count]) => ( - onChange(type as DocumentType | "ALL")} - className="flex items-center justify-between" - > -
- {getTypeIcon(type as DocumentType | "ALL")} - {getTypeLabel(type as DocumentType | "ALL")} -
- - {count} - -
- ))} -
-
- ); + return ( + + + + + + Document Types + + {Object.entries(counts).map(([type, count]) => ( + onChange(type as DocumentType | "ALL")} + className="flex items-center justify-between" + > +
+ {getTypeIcon(type as DocumentType | "ALL")} + {getTypeLabel(type as DocumentType | "ALL")} +
+ + {count} + +
+ ))} +
+
+ ); }; /** * Button that displays selected connectors and opens connector selection dialog */ const ConnectorButton = ({ - selectedConnectors, - onClick, + selectedConnectors, + onClick, }: { - selectedConnectors: string[]; - onClick: () => void; + selectedConnectors: string[]; + onClick: () => void; }) => { - const { connectorSourceItems } = useSearchSourceConnectors(); + const { connectorSourceItems } = useSearchSourceConnectors(); - return ( - - ); + return ( + + ); }; /** * Button that displays selected documents count and opens document selection dialog */ const DocumentSelectorButton = ({ - selectedDocuments, - onClick, - documentsCount, + selectedDocuments, + onClick, + documentsCount, }: { - selectedDocuments: number[]; - onClick: () => void; - documentsCount: number; + selectedDocuments: number[]; + onClick: () => void; + documentsCount: number; }) => { - return ( -
- - {selectedDocuments.length > 0 && ( - - {selectedDocuments.length > 99 ? "99+" : selectedDocuments.length} - - )} - {selectedDocuments.length === 0 && ( - - 0 - - )} -
- ); + return ( +
+ + {selectedDocuments.length > 0 && ( + + {selectedDocuments.length > 99 ? "99+" : selectedDocuments.length} + + )} + {selectedDocuments.length === 0 && ( + + 0 + + )} +
+ ); }; // Create a wrapper component for the sources dialog content const SourcesDialogContent = ({ - connector, - sourceFilter, - expandedSources, - sourcesPage, - setSourcesPage, - setSourceFilter, - setExpandedSources, - isLoadingMore, + connector, + sourceFilter, + expandedSources, + sourcesPage, + setSourcesPage, + setSourceFilter, + setExpandedSources, + isLoadingMore, }: { - connector: any; - sourceFilter: string; - expandedSources: boolean; - sourcesPage: number; - setSourcesPage: React.Dispatch>; - setSourceFilter: React.Dispatch>; - setExpandedSources: React.Dispatch>; - isLoadingMore: boolean; + connector: any; + sourceFilter: string; + expandedSources: boolean; + sourcesPage: number; + setSourcesPage: React.Dispatch>; + setSourceFilter: React.Dispatch>; + setExpandedSources: React.Dispatch>; + isLoadingMore: boolean; }) => { - // Safely access sources with fallbacks - const sources = connector?.sources || []; + // Safely access sources with fallbacks + const sources = connector?.sources || []; - // Safe versions of utility functions - const getFilteredSourcesSafe = () => { - if (!sources.length) return []; - return getFilteredSourcesUtil(connector, sourceFilter); - }; + // Safe versions of utility functions + const getFilteredSourcesSafe = () => { + if (!sources.length) return []; + return getFilteredSourcesUtil(connector, sourceFilter); + }; - const getPaginatedSourcesSafe = () => { - if (!sources.length) return []; - return getPaginatedDialogSourcesUtil( - connector, - sourceFilter, - expandedSources, - sourcesPage, - 5, // SOURCES_PER_PAGE - ); - }; + const getPaginatedSourcesSafe = () => { + if (!sources.length) return []; + return getPaginatedDialogSourcesUtil( + connector, + sourceFilter, + expandedSources, + sourcesPage, + 5, // SOURCES_PER_PAGE + ); + }; - const filteredSources = getFilteredSourcesSafe() || []; - const paginatedSources = getPaginatedSourcesSafe() || []; + const filteredSources = getFilteredSourcesSafe() || []; + const paginatedSources = getPaginatedSourcesSafe() || []; - // Description text - const descriptionText = sourceFilter - ? `Found ${filteredSources.length} sources matching "${sourceFilter}"` - : `Viewing ${paginatedSources.length} of ${sources.length} sources`; + // Description text + const descriptionText = sourceFilter + ? `Found ${filteredSources.length} sources matching "${sourceFilter}"` + : `Viewing ${paginatedSources.length} of ${sources.length} sources`; - if (paginatedSources.length === 0) { - return ( -
- -

No sources found matching "{sourceFilter}"

- -
- ); - } + if (paginatedSources.length === 0) { + return ( +
+ +

No sources found matching "{sourceFilter}"

+ +
+ ); + } - return ( - <> - - - {getConnectorIcon(connector.type)} - {connector.name} Sources - - - {descriptionText} - - + return ( + <> + + + {getConnectorIcon(connector.type)} + {connector.name} Sources + + + {descriptionText} + + -
- - { - setSourceFilter(e.target.value); - setSourcesPage(1); - setExpandedSources(false); - }} - /> - {sourceFilter && ( - - )} -
+
+ + { + setSourceFilter(e.target.value); + setSourcesPage(1); + setExpandedSources(false); + }} + /> + {sourceFilter && ( + + )} +
-
- {paginatedSources.map((source: any, index: number) => ( - -
-
- {getConnectorIcon(connector.type)} -
-
-

{source.title}

-

- {source.description} -

-
- -
-
- ))} +
+ {paginatedSources.map((source: any, index: number) => ( + +
+
+ {getConnectorIcon(connector.type)} +
+
+

{source.title}

+

+ {source.description} +

+
+ +
+
+ ))} - {!expandedSources && - paginatedSources.length < filteredSources.length && ( - - )} + {!expandedSources && + paginatedSources.length < filteredSources.length && ( + + )} - {expandedSources && filteredSources.length > 10 && ( -
- Showing all {filteredSources.length} sources -
- )} -
- - ); + {expandedSources && filteredSources.length > 10 && ( +
+ Showing all {filteredSources.length} sources +
+ )} +
+ + ); }; const ChatPage = () => { - const [token, setToken] = React.useState(null); - const [dialogOpenId, setDialogOpenId] = useState(null); - const [sourcesPage, setSourcesPage] = useState(1); - const [expandedSources, setExpandedSources] = useState(false); - const [canScrollLeft, setCanScrollLeft] = useState(false); - const [canScrollRight, setCanScrollRight] = useState(true); - const [sourceFilter, setSourceFilter] = useState(""); - const tabsListRef = useRef(null); - const [terminalExpanded, setTerminalExpanded] = useState(false); - const [selectedConnectors, setSelectedConnectors] = useState([]); - const [searchMode, setSearchMode] = useState<"DOCUMENTS" | "CHUNKS">( - "DOCUMENTS", - ); - const [researchMode, setResearchMode] = useState("QNA"); - const [currentTime, setCurrentTime] = useState(""); - const [currentDate, setCurrentDate] = useState(""); - const terminalMessagesRef = useRef(null); - const { connectorSourceItems, isLoading: isLoadingConnectors } = - useSearchSourceConnectors(); - const { llmConfigs } = useLLMConfigs(); - const { preferences, updatePreferences } = useLLMPreferences(); + const [token, setToken] = React.useState(null); + const [dialogOpenId, setDialogOpenId] = useState(null); + const [sourcesPage, setSourcesPage] = useState(1); + const [expandedSources, setExpandedSources] = useState(false); + const [canScrollLeft, setCanScrollLeft] = useState(false); + const [canScrollRight, setCanScrollRight] = useState(true); + const [sourceFilter, setSourceFilter] = useState(""); + const tabsListRef = useRef(null); + const [terminalExpanded, setTerminalExpanded] = useState(false); + const [selectedConnectors, setSelectedConnectors] = useState([]); + const [searchMode, setSearchMode] = useState<"DOCUMENTS" | "CHUNKS">( + "DOCUMENTS", + ); + const [researchMode, setResearchMode] = useState("QNA"); + const [currentTime, setCurrentTime] = useState(""); + const [currentDate, setCurrentDate] = useState(""); + const terminalMessagesRef = useRef(null); + const { connectorSourceItems, isLoading: isLoadingConnectors } = + useSearchSourceConnectors(); + const { llmConfigs } = useLLMConfigs(); + const { preferences, updatePreferences } = useLLMPreferences(); - const INITIAL_SOURCES_DISPLAY = 3; + const INITIAL_SOURCES_DISPLAY = 3; - const { search_space_id, chat_id } = useParams(); + const { search_space_id, chat_id } = useParams(); - // Document selection state - const [selectedDocuments, setSelectedDocuments] = useState([]); - const [documentFilter, setDocumentFilter] = useState(""); - const [debouncedDocumentFilter, setDebouncedDocumentFilter] = useState(""); - const [documentTypeFilter, setDocumentTypeFilter] = useState< - DocumentType | "ALL" - >("ALL"); - const [documentsPage, setDocumentsPage] = useState(1); - const [documentsPerPage] = useState(10); - const { - documents, - loading: isLoadingDocuments, - error: documentsError, - } = useDocuments(Number(search_space_id)); + // Document selection state + const [selectedDocuments, setSelectedDocuments] = useState([]); + const [documentFilter, setDocumentFilter] = useState(""); + const [debouncedDocumentFilter, setDebouncedDocumentFilter] = useState(""); + const [documentTypeFilter, setDocumentTypeFilter] = useState< + DocumentType | "ALL" + >("ALL"); + const [documentsPage, setDocumentsPage] = useState(1); + const [documentsPerPage] = useState(10); + const { + documents, + loading: isLoadingDocuments, + error: documentsError, + } = useDocuments(Number(search_space_id)); - // Debounced search effect (proper implementation) - 
useEffect(() => { - const handler = setTimeout(() => { - setDebouncedDocumentFilter(documentFilter); - setDocumentsPage(1); // Reset page when search changes - }, 300); + // Debounced search effect (proper implementation) + useEffect(() => { + const handler = setTimeout(() => { + setDebouncedDocumentFilter(documentFilter); + setDocumentsPage(1); // Reset page when search changes + }, 300); - return () => { - clearTimeout(handler); - }; - }, [documentFilter]); + return () => { + clearTimeout(handler); + }; + }, [documentFilter]); - // Memoized filtered and paginated documents - const filteredDocuments = useMemo(() => { - if (!documents) return []; + // Memoized filtered and paginated documents + const filteredDocuments = useMemo(() => { + if (!documents) return []; - return documents.filter((doc) => { - const matchesSearch = - doc.title - .toLowerCase() - .includes(debouncedDocumentFilter.toLowerCase()) || - doc.content - .toLowerCase() - .includes(debouncedDocumentFilter.toLowerCase()); - const matchesType = - documentTypeFilter === "ALL" || - doc.document_type === documentTypeFilter; - return matchesSearch && matchesType; - }); - }, [documents, debouncedDocumentFilter, documentTypeFilter]); + return documents.filter((doc) => { + const matchesSearch = + doc.title + .toLowerCase() + .includes(debouncedDocumentFilter.toLowerCase()) || + doc.content + .toLowerCase() + .includes(debouncedDocumentFilter.toLowerCase()); + const matchesType = + documentTypeFilter === "ALL" || + doc.document_type === documentTypeFilter; + return matchesSearch && matchesType; + }); + }, [documents, debouncedDocumentFilter, documentTypeFilter]); - const paginatedDocuments = useMemo(() => { - const startIndex = (documentsPage - 1) * documentsPerPage; - return filteredDocuments.slice(startIndex, startIndex + documentsPerPage); - }, [filteredDocuments, documentsPage, documentsPerPage]); + const paginatedDocuments = useMemo(() => { + const startIndex = (documentsPage - 1) * documentsPerPage; + return filteredDocuments.slice(startIndex, startIndex + documentsPerPage); + }, [filteredDocuments, documentsPage, documentsPerPage]); - const totalPages = Math.ceil(filteredDocuments.length / documentsPerPage); + const totalPages = Math.ceil(filteredDocuments.length / documentsPerPage); - // Document type counts for filter dropdown - const documentTypeCounts = useMemo(() => { - if (!documents) return {}; + // Document type counts for filter dropdown + const documentTypeCounts = useMemo(() => { + if (!documents) return {}; - const counts: Record = { ALL: documents.length }; - documents.forEach((doc) => { - counts[doc.document_type] = (counts[doc.document_type] || 0) + 1; - }); - return counts; - }, [documents]); + const counts: Record = { ALL: documents.length }; + documents.forEach((doc) => { + counts[doc.document_type] = (counts[doc.document_type] || 0) + 1; + }); + return counts; + }, [documents]); - // Callback to handle document selection - const handleDocumentToggle = useCallback((documentId: number) => { - setSelectedDocuments((prev) => - prev.includes(documentId) - ? prev.filter((id) => id !== documentId) - : [...prev, documentId], - ); - }, []); + // Callback to handle document selection + const handleDocumentToggle = useCallback((documentId: number) => { + setSelectedDocuments((prev) => + prev.includes(documentId) + ? 
prev.filter((id) => id !== documentId) + : [...prev, documentId], + ); + }, []); - // Function to scroll terminal to bottom - const scrollTerminalToBottom = () => { - if (terminalMessagesRef.current) { - terminalMessagesRef.current.scrollTop = - terminalMessagesRef.current.scrollHeight; - } - }; + // Function to scroll terminal to bottom + const scrollTerminalToBottom = () => { + if (terminalMessagesRef.current) { + terminalMessagesRef.current.scrollTop = + terminalMessagesRef.current.scrollHeight; + } + }; - // Get token from localStorage on client side only - React.useEffect(() => { - setToken(localStorage.getItem("surfsense_bearer_token")); - }, []); + // Get token from localStorage on client side only + React.useEffect(() => { + setToken(localStorage.getItem("surfsense_bearer_token")); + }, []); - // Set the current time only on the client side after initial render - useEffect(() => { - setCurrentDate(new Date().toISOString().split("T")[0]); - setCurrentTime(new Date().toTimeString().split(" ")[0]); - }, []); + // Set the current time only on the client side after initial render + useEffect(() => { + setCurrentDate(new Date().toISOString().split("T")[0]); + setCurrentTime(new Date().toTimeString().split(" ")[0]); + }, []); - // Add this CSS to remove input shadow and improve the UI - useEffect(() => { - if (typeof document !== "undefined") { - const style = document.createElement("style"); - style.innerHTML = ` + // Add this CSS to remove input shadow and improve the UI + useEffect(() => { + if (typeof document !== "undefined") { + const style = document.createElement("style"); + style.innerHTML = ` .no-shadow-input { box-shadow: none !important; } @@ -617,825 +618,860 @@ const ChatPage = () => { background: hsl(var(--muted-foreground) / 0.5); } `; - document.head.appendChild(style); + document.head.appendChild(style); - return () => { - document.head.removeChild(style); - }; - } - }, []); + return () => { + document.head.removeChild(style); + }; + } + }, []); - const { - messages, - input, - handleInputChange, - handleSubmit: handleChatSubmit, - status, - setMessages, - } = useChat({ - api: `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/chat`, - streamProtocol: "data", - headers: { - ...(token && { Authorization: `Bearer ${token}` }), - }, - body: { - data: { - search_space_id: search_space_id, - selected_connectors: selectedConnectors, - research_mode: researchMode, - search_mode: searchMode, - document_ids_to_add_in_context: selectedDocuments, - }, - }, - onError: (error) => { - console.error("Chat error:", error); - // You can add additional error handling here if needed - }, - }); + const { + messages, + input, + handleInputChange, + handleSubmit: handleChatSubmit, + status, + setMessages, + } = useChat({ + api: `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/chat`, + streamProtocol: "data", + headers: { + ...(token && { Authorization: `Bearer ${token}` }), + }, + body: { + data: { + search_space_id: search_space_id, + selected_connectors: selectedConnectors, + research_mode: researchMode, + search_mode: searchMode, + document_ids_to_add_in_context: selectedDocuments, + }, + }, + onError: (error) => { + console.error("Chat error:", error); + // You can add additional error handling here if needed + }, + }); - // Fetch chat details when component mounts - useEffect(() => { - const fetchChatDetails = async () => { - try { - if (!token) return; // Wait for token to be set + // Fetch chat details when component mounts + useEffect(() => { + const fetchChatDetails = async 
() => { + try { + if (!token) return; // Wait for token to be set - // console.log('Fetching chat details for chat ID:', chat_id); + // console.log('Fetching chat details for chat ID:', chat_id); - const response = await fetch( - `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/chats/${Number(chat_id)}`, - { - method: "GET", - headers: { - "Content-Type": "application/json", - Authorization: `Bearer ${token}`, - }, - }, - ); + const response = await fetch( + `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/chats/${Number(chat_id)}`, + { + method: "GET", + headers: { + "Content-Type": "application/json", + Authorization: `Bearer ${token}`, + }, + }, + ); - if (!response.ok) { - throw new Error( - `Failed to fetch chat details: ${response.statusText}`, - ); - } + if (!response.ok) { + throw new Error( + `Failed to fetch chat details: ${response.statusText}`, + ); + } - const chatData = await response.json(); - // console.log('Chat details fetched:', chatData); + const chatData = await response.json(); + // console.log('Chat details fetched:', chatData); - // Set research mode from chat data - if (chatData.type) { - setResearchMode(chatData.type as ResearchMode); - } + // Set research mode from chat data + if (chatData.type) { + setResearchMode(chatData.type as ResearchMode); + } - // Set connectors from chat data - if ( - chatData.initial_connectors && - Array.isArray(chatData.initial_connectors) - ) { - setSelectedConnectors(chatData.initial_connectors); - } + // Set connectors from chat data + if ( + chatData.initial_connectors && + Array.isArray(chatData.initial_connectors) + ) { + setSelectedConnectors(chatData.initial_connectors); + } - // Set messages from chat data - if (chatData.messages && Array.isArray(chatData.messages)) { - setMessages(chatData.messages); - } - } catch (err) { - console.error("Error fetching chat details:", err); - } - }; + // Set messages from chat data + if (chatData.messages && Array.isArray(chatData.messages)) { + setMessages(chatData.messages); + } + } catch (err) { + console.error("Error fetching chat details:", err); + } + }; - if (token) { - fetchChatDetails(); - } - }, [token, chat_id, setMessages]); + if (token) { + fetchChatDetails(); + } + }, [token, chat_id, setMessages]); - // Update chat when a conversation exchange is complete - useEffect(() => { - const updateChat = async () => { - try { - // Only update when: - // 1. Status is ready (not loading) - // 2. We have messages - // 3. Last message is from assistant (completed response) - if ( - status === "ready" && - messages.length > 0 && - messages[messages.length - 1]?.role === "assistant" - ) { - const token = localStorage.getItem("surfsense_bearer_token"); - if (!token) return; + // Update chat when a conversation exchange is complete + useEffect(() => { + const updateChat = async () => { + try { + // Only update when: + // 1. Status is ready (not loading) + // 2. We have messages + // 3. 
Last message is from assistant (completed response) + if ( + status === "ready" && + messages.length > 0 && + messages[messages.length - 1]?.role === "assistant" + ) { + const token = localStorage.getItem("surfsense_bearer_token"); + if (!token) return; - // Find the first user message to use as title - const userMessages = messages.filter((msg) => msg.role === "user"); - if (userMessages.length === 0) return; + // Find the first user message to use as title + const userMessages = messages.filter((msg) => msg.role === "user"); + if (userMessages.length === 0) return; - // Use the first user message as the title - const title = userMessages[0].content; + // Use the first user message as the title + const title = userMessages[0].content; - // console.log('Updating chat with title:', title); + // console.log('Updating chat with title:', title); - // Update the chat - const response = await fetch( - `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/chats/${Number(chat_id)}`, - { - method: "PUT", - headers: { - "Content-Type": "application/json", - Authorization: `Bearer ${token}`, - }, - body: JSON.stringify({ - type: researchMode, - title: title, - initial_connectors: selectedConnectors, - messages: messages, - search_space_id: Number(search_space_id), - }), - }, - ); + // Update the chat + const response = await fetch( + `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/chats/${Number(chat_id)}`, + { + method: "PUT", + headers: { + "Content-Type": "application/json", + Authorization: `Bearer ${token}`, + }, + body: JSON.stringify({ + type: researchMode, + title: title, + initial_connectors: selectedConnectors, + messages: messages, + search_space_id: Number(search_space_id), + }), + }, + ); - if (!response.ok) { - throw new Error(`Failed to update chat: ${response.statusText}`); - } + if (!response.ok) { + throw new Error(`Failed to update chat: ${response.statusText}`); + } - // console.log('Chat updated successfully'); - } - } catch (err) { - console.error("Error updating chat:", err); - } - }; + // console.log('Chat updated successfully'); + } + } catch (err) { + console.error("Error updating chat:", err); + } + }; - updateChat(); - }, [ - messages, - status, - chat_id, - researchMode, - selectedConnectors, - search_space_id, - ]); + updateChat(); + }, [ + messages, + status, + chat_id, + researchMode, + selectedConnectors, + search_space_id, + ]); - // Check and scroll terminal when terminal info is available - useEffect(() => { - // Modified to trigger during streaming as well (removed status check) - if (messages.length === 0) return; + // Check and scroll terminal when terminal info is available + useEffect(() => { + // Modified to trigger during streaming as well (removed status check) + if (messages.length === 0) return; - // Find the latest assistant message - const assistantMessages = messages.filter( - (msg) => msg.role === "assistant", - ); - if (assistantMessages.length === 0) return; + // Find the latest assistant message + const assistantMessages = messages.filter( + (msg) => msg.role === "assistant", + ); + if (assistantMessages.length === 0) return; - const latestAssistantMessage = - assistantMessages[assistantMessages.length - 1]; - if (!latestAssistantMessage?.annotations) return; + const latestAssistantMessage = + assistantMessages[assistantMessages.length - 1]; + if (!latestAssistantMessage?.annotations) return; - // Check for terminal info annotations - const annotations = latestAssistantMessage.annotations as any[]; - const terminalInfoAnnotations = 
annotations.filter( - (a) => a.type === "TERMINAL_INFO", - ); + // Check for terminal info annotations + const annotations = latestAssistantMessage.annotations as any[]; + const terminalInfoAnnotations = annotations.filter( + (a) => a.type === "TERMINAL_INFO", + ); - if (terminalInfoAnnotations.length > 0) { - // Always scroll to bottom when terminal info is updated, even during streaming - scrollTerminalToBottom(); - } - }, [messages]); // Removed status from dependencies to ensure it triggers during streaming + if (terminalInfoAnnotations.length > 0) { + // Always scroll to bottom when terminal info is updated, even during streaming + scrollTerminalToBottom(); + } + }, [messages]); // Removed status from dependencies to ensure it triggers during streaming - // Pure function to get connector sources for a specific message - const getMessageConnectorSources = (message: any): any[] => { - if (!message || message.role !== "assistant" || !message.annotations) - return []; + // Pure function to get connector sources for a specific message + const getMessageConnectorSources = (message: any): any[] => { + if (!message || message.role !== "assistant" || !message.annotations) + return []; - // Find all SOURCES annotations - const annotations = message.annotations as any[]; - const sourcesAnnotations = annotations.filter((a) => a.type === "SOURCES"); + // Find all SOURCES annotations + const annotations = message.annotations as any[]; + const sourcesAnnotations = annotations.filter((a) => a.type === "SOURCES"); - // Get the latest SOURCES annotation - if (sourcesAnnotations.length === 0) return []; - const latestSourcesAnnotation = - sourcesAnnotations[sourcesAnnotations.length - 1]; + // Get the latest SOURCES annotation + if (sourcesAnnotations.length === 0) return []; + const latestSourcesAnnotation = + sourcesAnnotations[sourcesAnnotations.length - 1]; - if (!latestSourcesAnnotation.content) return []; + if (!latestSourcesAnnotation.content) return []; - return latestSourcesAnnotation.content; - }; + return latestSourcesAnnotation.content; + }; - // Custom handleSubmit function to include selected connectors and answer type - const handleSubmit = (e: React.FormEvent) => { - e.preventDefault(); + // Custom handleSubmit function to include selected connectors and answer type + const handleSubmit = (e: React.FormEvent) => { + e.preventDefault(); - if (!input.trim() || status !== "ready") return; + if (!input.trim() || status !== "ready") return; - // Validation: require at least one connector OR at least one document - // Note: Fast LLM selection updates user preferences automatically - // if (selectedConnectors.length === 0 && selectedDocuments.length === 0) { - // alert("Please select at least one connector or document"); - // return; - // } + // Validation: require at least one connector OR at least one document + // Note: Fast LLM selection updates user preferences automatically + // if (selectedConnectors.length === 0 && selectedDocuments.length === 0) { + // alert("Please select at least one connector or document"); + // return; + // } - // Call the original handleSubmit from useChat - handleChatSubmit(e); - }; + // Call the original handleSubmit from useChat + handleChatSubmit(e); + }; - // Reference to the messages container for auto-scrolling - const messagesEndRef = useRef(null); + // Reference to the messages container for auto-scrolling + const messagesEndRef = useRef(null); - // Function to scroll to bottom - const scrollToBottom = () => { - messagesEndRef.current?.scrollIntoView({ 
behavior: "smooth" }); - }; + // Function to scroll to bottom + const scrollToBottom = () => { + messagesEndRef.current?.scrollIntoView({ behavior: "smooth" }); + }; - // Scroll to bottom when messages change - useEffect(() => { - scrollToBottom(); - }, [messages]); + // Scroll to bottom when messages change + useEffect(() => { + scrollToBottom(); + }, [messages]); - // Reset sources page when new messages arrive - useEffect(() => { - // Reset pagination when we get new messages - setSourcesPage(1); - setExpandedSources(false); - }, [messages]); + // Reset sources page when new messages arrive + useEffect(() => { + // Reset pagination when we get new messages + setSourcesPage(1); + setExpandedSources(false); + }, [messages]); - // Scroll terminal to bottom when expanded - useEffect(() => { - if (terminalExpanded) { - setTimeout(scrollTerminalToBottom, 300); // Wait for transition to complete - } - }, [terminalExpanded]); + // Scroll terminal to bottom when expanded + useEffect(() => { + if (terminalExpanded) { + setTimeout(scrollTerminalToBottom, 300); // Wait for transition to complete + } + }, [terminalExpanded]); - // Function to check scroll position and update indicators - const updateScrollIndicators = () => { - updateScrollIndicatorsUtil( - tabsListRef as React.RefObject, - setCanScrollLeft, - setCanScrollRight, - ); - }; + // Function to check scroll position and update indicators + const updateScrollIndicators = () => { + updateScrollIndicatorsUtil( + tabsListRef as React.RefObject, + setCanScrollLeft, + setCanScrollRight, + ); + }; - // Initialize scroll indicators - const updateIndicators = useScrollIndicators( - tabsListRef as React.RefObject, - setCanScrollLeft, - setCanScrollRight, - ); + // Initialize scroll indicators + const updateIndicators = useScrollIndicators( + tabsListRef as React.RefObject, + setCanScrollLeft, + setCanScrollRight, + ); - // Function to scroll tabs list left - const scrollTabsLeft = () => { - scrollTabsLeftUtil( - tabsListRef as React.RefObject, - updateIndicators, - ); - }; + // Function to scroll tabs list left + const scrollTabsLeft = () => { + scrollTabsLeftUtil( + tabsListRef as React.RefObject, + updateIndicators, + ); + }; - // Function to scroll tabs list right - const scrollTabsRight = () => { - scrollTabsRightUtil( - tabsListRef as React.RefObject, - updateIndicators, - ); - }; + // Function to scroll tabs list right + const scrollTabsRight = () => { + scrollTabsRightUtil( + tabsListRef as React.RefObject, + updateIndicators, + ); + }; - // Use the scroll to bottom hook - useScrollToBottom(messagesEndRef as React.RefObject, [ - messages, - ]); + // Use the scroll to bottom hook + useScrollToBottom(messagesEndRef as React.RefObject, [ + messages, + ]); - // Function to get a citation source by ID - const getCitationSource = React.useCallback( - (citationId: number, messageIndex?: number): Source | null => { - if (!messages || messages.length === 0) return null; + // Function to get a citation source by ID + const getCitationSource = React.useCallback( + (citationId: number, messageIndex?: number): Source | null => { + if (!messages || messages.length === 0) return null; - // If no specific message index is provided, use the latest assistant message - if (messageIndex === undefined) { - // Find the latest assistant message - const assistantMessages = messages.filter( - (msg) => msg.role === "assistant", - ); - if (assistantMessages.length === 0) return null; + // If no specific message index is provided, use the latest assistant message + if 
(messageIndex === undefined) { + // Find the latest assistant message + const assistantMessages = messages.filter( + (msg) => msg.role === "assistant", + ); + if (assistantMessages.length === 0) return null; - const latestAssistantMessage = - assistantMessages[assistantMessages.length - 1]; + const latestAssistantMessage = + assistantMessages[assistantMessages.length - 1]; - // Use our helper function to get sources - const sources = getMessageConnectorSources(latestAssistantMessage); - if (sources.length === 0) return null; + // Use our helper function to get sources + const sources = getMessageConnectorSources(latestAssistantMessage); + if (sources.length === 0) return null; - // Flatten all sources from all connectors - const allSources: Source[] = []; - sources.forEach((connector: ConnectorSource) => { - if (connector.sources && Array.isArray(connector.sources)) { - connector.sources.forEach((source: SourceItem) => { - allSources.push({ - id: source.id, - title: source.title, - description: source.description, - url: source.url, - connectorType: connector.type, - }); - }); - } - }); + // Flatten all sources from all connectors + const allSources: Source[] = []; + sources.forEach((connector: ConnectorSource) => { + if (connector.sources && Array.isArray(connector.sources)) { + connector.sources.forEach((source: SourceItem) => { + allSources.push({ + id: source.id, + title: source.title, + description: source.description, + url: source.url, + connectorType: connector.type, + }); + }); + } + }); - // Find the source with the matching ID - const foundSource = allSources.find( - (source) => source.id === citationId, - ); + // Find the source with the matching ID + const foundSource = allSources.find( + (source) => source.id === citationId, + ); - return foundSource || null; - } else { - // Use the specific message by index - const message = messages[messageIndex]; + return foundSource || null; + } else { + // Use the specific message by index + const message = messages[messageIndex]; - // Use our helper function to get sources - const sources = getMessageConnectorSources(message); - if (sources.length === 0) return null; + // Use our helper function to get sources + const sources = getMessageConnectorSources(message); + if (sources.length === 0) return null; - // Flatten all sources from all connectors - const allSources: Source[] = []; - sources.forEach((connector: ConnectorSource) => { - if (connector.sources && Array.isArray(connector.sources)) { - connector.sources.forEach((source: SourceItem) => { - allSources.push({ - id: source.id, - title: source.title, - description: source.description, - url: source.url, - connectorType: connector.type, - }); - }); - } - }); + // Flatten all sources from all connectors + const allSources: Source[] = []; + sources.forEach((connector: ConnectorSource) => { + if (connector.sources && Array.isArray(connector.sources)) { + connector.sources.forEach((source: SourceItem) => { + allSources.push({ + id: source.id, + title: source.title, + description: source.description, + url: source.url, + connectorType: connector.type, + }); + }); + } + }); - // Find the source with the matching ID - const foundSource = allSources.find( - (source) => source.id === citationId, - ); + // Find the source with the matching ID + const foundSource = allSources.find( + (source) => source.id === citationId, + ); - return foundSource || null; - } - }, - [messages], - ); + return foundSource || null; + } + }, + [messages], + ); - // Pure function for rendering terminal content - 
no hooks allowed here - const renderTerminalContent = (message: any) => { - if (!message.annotations) return null; + // Pure function for rendering terminal content - no hooks allowed here + const renderTerminalContent = (message: any) => { + if (!message.annotations) return null; - // Get all TERMINAL_INFO annotations content - const terminalInfoAnnotations = (message.annotations as any[]).map(item => { - if(item.type === "TERMINAL_INFO") { - return item.content.map((a: any) => a.text) - - } - }).flat().filter(Boolean) + // Get all TERMINAL_INFO annotations content + const terminalInfoAnnotations = (message.annotations as any[]) + .map((item) => { + if (item.type === "TERMINAL_INFO") { + return item.content.map((a: any) => a.text); + } + }) + .flat() + .filter(Boolean); - // Render the content of the latest TERMINAL_INFO annotation - return terminalInfoAnnotations.map((item: any, idx: number) => ( -
- - [{String(idx).padStart(2, "0")}: - {String(Math.floor(idx * 2)).padStart(2, "0")}] - - {">"} - ( +
+ + [{String(idx).padStart(2, "0")}: + {String(Math.floor(idx * 2)).padStart(2, "0")}] + + {">"} + - {item} - -
- )); - }; + > + {item} +
+
+ )); + }; - return ( - <> -
- {messages.length === 0 && ( -
- -
- Surf{""} -
-
- Sense -
-
-
-
- )} - {messages?.map((message, index) => { - if (message.role === "user") { - return ( -
- -
- - - getCitationSource(id, index)} - className="text-sm" - /> - - -
-
- ); - } + return ( + <> +
+ {messages.length === 0 && ( +
+ +
+ Surf{""} +
+
+ Sense +
+
+
+
+ )} + {messages?.map((message, index) => { + if (message.role === "user") { + return ( +
+ +
+ + + getCitationSource(id, index)} + className="text-sm" + /> + + +
+
+ ); + } - if (message.role === "assistant") { - return ( -
- - - - Answer - - - - {/* Status Messages Section */} - -
-
-
-
setTerminalExpanded(false)} - >
-
-
setTerminalExpanded(true)} - >
-
- - surfsense-research-terminal - -
-
+ if (message.role === "assistant") { + return ( +
+ + + + Answer + + + + {/* Status Messages Section */} + +
+
+
+
setTerminalExpanded(false)} + >
+
+
setTerminalExpanded(true)} + >
+
+ + surfsense-research-terminal + +
+
-
-
- Last login: {currentDate} {currentTime} -
-
- - researcher@surfsense - - : - ~/research - $ - surfsense-researcher -
+
+
+ Last login: {currentDate} {currentTime} +
+
+ + researcher@surfsense + + : + ~/research + $ + surfsense-researcher +
- {renderTerminalContent(message)} + {renderTerminalContent(message)} -
- - [00:13] - - - researcher@surfsense - - : - ~/research - $ -
-
+
+ + [00:13] + + + researcher@surfsense + + : + ~/research + $ +
+
- {/* Terminal scroll button */} -
- -
-
- + {/* Terminal scroll button */} +
+ +
+
+
- {/* Sources Section with Connector Tabs */} -
-
- - Sources -
+ {/* Sources Section with Connector Tabs */} +
+
+ + Sources +
- {(() => { - // Get sources for this specific message - const messageConnectorSources = - getMessageConnectorSources(message); + {(() => { + // Get sources for this specific message + const messageConnectorSources = + getMessageConnectorSources(message); - if (messageConnectorSources.length === 0) { - return ( -
- -
- ); - } + if (messageConnectorSources.length === 0) { + return ( +
+ +
+ ); + } - // Use these message-specific sources for the Tabs component - return ( - 0 - ? messageConnectorSources[0].type - : undefined - } - className="w-full" - > -
-
- + // Use these message-specific sources for the Tabs component + return ( + 0 + ? messageConnectorSources[0].type + : undefined + } + className="w-full" + > +
+
+ -
-
- - {messageConnectorSources.map( - (connector) => ( - - {getConnectorIcon(connector.type)} - - {connector.name.split(" ")[0]} - - - {connector.sources?.length || 0} - - - ), - )} - -
-
+
+
+ + {messageConnectorSources.map( + (connector) => ( + + {getConnectorIcon(connector.type)} + + {connector.name.split(" ")[0]} + + + {connector.sources?.length || 0} + + + ), + )} + +
+
- -
-
+ +
+
- {messageConnectorSources.map((connector) => ( - -
- {connector.sources - ?.slice(0, INITIAL_SOURCES_DISPLAY) - ?.map((source: any, index: number) => ( - -
-
- {getConnectorIcon(connector.type)} -
-
-
- {source.title} -
-
- {source.description} -
-
- -
-
- ))} + {messageConnectorSources.map((connector) => ( + +
+ {connector.sources + ?.slice(0, INITIAL_SOURCES_DISPLAY) + ?.map((source: any, index: number) => ( + +
+
+ {getConnectorIcon(connector.type)} +
+
+
+ {source.title} +
+
+ {source.description} +
+
+ +
+
+ ))} - {connector.sources?.length > - INITIAL_SOURCES_DISPLAY && ( - - setDialogOpenId( - open ? connector.id : null, - ) - } - > - - - - - - - - )} -
-
- ))} - - ); - })()} -
+ {connector.sources?.length > + INITIAL_SOURCES_DISPLAY && ( + + setDialogOpenId( + open ? connector.id : null, + ) + } + > + + + + + + + + )} +
+ + ))} + + ); + })()} +
- {/* Answer Section */} -
- { -
- {message.annotations && - (() => { - // Get all ANSWER annotations - const answerAnnotations = ( - message.annotations as any[] - ).filter((a) => a.type === "ANSWER"); + {/* Answer Section */} +
+ { +
+ {message.annotations && + (() => { + // Get all ANSWER annotations + const answerAnnotations = ( + message.annotations as any[] + ).filter((a) => a.type === "ANSWER"); - // Get the latest ANSWER annotation - const latestAnswer = - answerAnnotations.length > 0 - ? answerAnnotations[ - answerAnnotations.length - 1 - ] - : null; + // Get the latest ANSWER annotation + const latestAnswer = + answerAnnotations.length > 0 + ? answerAnnotations[ + answerAnnotations.length - 1 + ] + : null; - // If we have a latest ANSWER annotation with content, render it - if ( - latestAnswer?.content && - latestAnswer.content.length > 0 - ) { - return ( - - getCitationSource(id, index) - } - type="ai" - /> - ); - } + // If we have a latest ANSWER annotation with content, render it + if ( + latestAnswer?.content && + latestAnswer.content.length > 0 + ) { + return ( + + getCitationSource(id, index) + } + type="ai" + /> + ); + } - // Fallback to the message content if no ANSWER annotation is available - return getCitationSource(id, index)} - type="ai" - />; - })()} + // Fallback to the message content if no ANSWER annotation is available + return ( + + getCitationSource(id, index) + } + type="ai" + /> + ); + })()}
}
{/* Further Questions Section */} - {message.annotations && (() => { - // Get all FURTHER_QUESTIONS annotations - const furtherQuestionsAnnotations = (message.annotations as any[]) - .filter(a => a.type === 'FURTHER_QUESTIONS'); + {message.annotations && + (() => { + // Get all FURTHER_QUESTIONS annotations + const furtherQuestionsAnnotations = ( + message.annotations as any[] + ).filter((a) => a.type === "FURTHER_QUESTIONS"); - // Get the latest FURTHER_QUESTIONS annotation - const latestFurtherQuestions = furtherQuestionsAnnotations.length > 0 - ? furtherQuestionsAnnotations[furtherQuestionsAnnotations.length - 1] - : null; + // Get the latest FURTHER_QUESTIONS annotation + const latestFurtherQuestions = + furtherQuestionsAnnotations.length > 0 + ? furtherQuestionsAnnotations[ + furtherQuestionsAnnotations.length - 1 + ] + : null; - // Only render if we have questions - if (!latestFurtherQuestions?.content || latestFurtherQuestions.content.length === 0) { - return null; - } + // Only render if we have questions + if ( + !latestFurtherQuestions?.content || + latestFurtherQuestions.content.length === 0 + ) { + return null; + } - const furtherQuestions = latestFurtherQuestions.content; + const furtherQuestions = latestFurtherQuestions.content; - return ( -
- {/* Main container with improved styling */} -
- {/* Header with better visual separation */} -
-
-
- - - - Follow-up Questions -
- - {furtherQuestions.length} suggestion{furtherQuestions.length !== 1 ? 's' : ''} - + return ( +
+ {/* Main container with improved styling */} +
+ {/* Header with better visual separation */} +
+
+
+ + + + Follow-up Questions +
+ + {furtherQuestions.length} suggestion + {furtherQuestions.length !== 1 ? "s" : ""} + +
-
- {/* Questions container with enhanced scrolling */} -
-
- {/* Left fade gradient */} -
- - {/* Right fade gradient */} -
- - {/* Scrollable container */} -
-
- {furtherQuestions.map((question: any, qIndex: number) => ( - - ))} + {/* Questions container with enhanced scrolling */} +
+
+ {/* Left fade gradient */} +
+ + {/* Right fade gradient */} +
+ + {/* Scrollable container */} +
+
+ {furtherQuestions.map( + (question: any, qIndex: number) => ( + + ), + )} +
-
- ); - })()} + ); + })()} {/* Scroll to bottom button */}
- -
-
- {/* Enhanced Document Selection Dialog */} - - - {}} - documentsCount={documents?.length || 0} - /> - - - - -
- - Select Documents - - {selectedDocuments.length} selected - -
- -
- - Choose documents to include in your research context. Use - filters and search to find specific documents. - -
+ {/* New Chat Input Form */} +
+
+ + {/* Send button */} + +
+
+
+ {/* Enhanced Document Selection Dialog */} + + + {}} + documentsCount={documents?.length || 0} + /> + + + + +
+ + Select Documents + + {selectedDocuments.length} selected + +
+ +
+ + Choose documents to include in your research context. Use + filters and search to find specific documents. + +
- {/* Enhanced Search and Filter Controls */} -
-
- {/* Search Input */} -
- - setDocumentFilter(e.target.value)} - /> - {documentFilter && ( - - )} -
+ {/* Enhanced Search and Filter Controls */} +
+
+ {/* Search Input */} +
+ + setDocumentFilter(e.target.value)} + /> + {documentFilter && ( + + )} +
- {/* Document Type Filter */} - { - setDocumentTypeFilter(newType); - setDocumentsPage(1); // Reset to page 1 when filter changes - }} - counts={documentTypeCounts} - /> -
+ {/* Document Type Filter */} + { + setDocumentTypeFilter(newType); + setDocumentsPage(1); // Reset to page 1 when filter changes + }} + counts={documentTypeCounts} + /> +
- {/* Results Summary */} -
- - {isLoadingDocuments - ? "Loading documents..." - : `Showing ${paginatedDocuments.length} of ${filteredDocuments.length} documents`} - - {filteredDocuments.length > 0 && ( - - Page {documentsPage} of {totalPages} - - )} -
-
+ {/* Results Summary */} +
+ + {isLoadingDocuments + ? "Loading documents..." + : `Showing ${paginatedDocuments.length} of ${filteredDocuments.length} documents`} + + {filteredDocuments.length > 0 && ( + + Page {documentsPage} of {totalPages} + + )} +
+
- {/* Document List with Proper Scrolling */} -
-
- {isLoadingDocuments ? ( - // Enhanced skeleton loading - Array.from({ length: 6 }, (_, i) => ( - - )) - ) : documentsError ? ( -
-
- -
-
- Error loading documents -
-
- Please try refreshing the page -
-
- ) : filteredDocuments.length === 0 ? ( -
-
- -
-
- No documents found -
-
- {documentFilter || documentTypeFilter !== "ALL" - ? "Try adjusting your search or filters" - : "Upload documents to get started"} -
- {!documentFilter && documentTypeFilter === "ALL" && ( - - )} -
- ) : ( - // Enhanced document list - paginatedDocuments.map((document) => { - const isSelected = selectedDocuments.includes( - document.id, - ); - const typeLabel = document.document_type - .replace(/_/g, " ") - .toLowerCase(); + {/* Document List with Proper Scrolling */} +
+
+ {isLoadingDocuments ? ( + // Enhanced skeleton loading + Array.from({ length: 6 }, (_, i) => ( + + )) + ) : documentsError ? ( +
+
+ +
+
+ Error loading documents +
+
+ Please try refreshing the page +
+
+ ) : filteredDocuments.length === 0 ? ( +
+
+ +
+
+ No documents found +
+
+ {documentFilter || documentTypeFilter !== "ALL" + ? "Try adjusting your search or filters" + : "Upload documents to get started"} +
+ {!documentFilter && documentTypeFilter === "ALL" && ( + + )} +
+ ) : ( + // Enhanced document list + paginatedDocuments.map((document) => { + const isSelected = selectedDocuments.includes( + document.id, + ); + const typeLabel = document.document_type + .replace(/_/g, " ") + .toLowerCase(); - return ( -
handleDocumentToggle(document.id)} - > -
-
- {getConnectorIcon(document.document_type)} -
-
-
-
-
- {document.title} -
- {isSelected && ( -
-
- -
-
- )} -
-
- - {typeLabel} - - - {new Date( - document.created_at, - ).toLocaleDateString()} - -
-
- {document.content.substring(0, 200)}... -
-
-
- ); - }) - )} -
-
+ return ( +
handleDocumentToggle(document.id)} + > +
+
+ {getConnectorIcon(document.document_type)} +
+
+
+
+
+ {document.title} +
+ {isSelected && ( +
+
+ +
+
+ )} +
+
+ + {typeLabel} + + + {new Date( + document.created_at, + ).toLocaleDateString()} + +
+
+ {document.content.substring(0, 200)}... +
+
+
+ ); + }) + )} +
+
- {/* Enhanced Pagination Controls */} - {totalPages > 1 && ( -
-
- -
- {Array.from( - { length: Math.min(5, totalPages) }, - (_, i) => { - const page = - documentsPage <= 3 - ? i + 1 - : documentsPage - 2 + i; - if (page > totalPages) return null; - return ( - - ); - }, - )} - {totalPages > 5 && documentsPage < totalPages - 2 && ( - <> - - ... - - - - )} -
- -
-
- )} + {/* Enhanced Pagination Controls */} + {totalPages > 1 && ( +
+
+ +
+ {Array.from( + { length: Math.min(5, totalPages) }, + (_, i) => { + const page = + documentsPage <= 3 + ? i + 1 + : documentsPage - 2 + i; + if (page > totalPages) return null; + return ( + + ); + }, + )} + {totalPages > 5 && documentsPage < totalPages - 2 && ( + <> + + ... + + + + )} +
+ +
+
+ )} - {/* Enhanced Footer */} - -
- - {selectedDocuments.length} of {filteredDocuments.length}{" "} - document{selectedDocuments.length !== 1 ? "s" : ""}{" "} - selected - -
-
- - + - + -
-
-
-
+ if (allSelected) { + setSelectedDocuments((prev) => + prev.filter((id) => !allFilteredIds.includes(id)), + ); + } else { + setSelectedDocuments((prev) => [ + ...new Set([...prev, ...allFilteredIds]), + ]); + } + }} + disabled={filteredDocuments.length === 0} + > + {filteredDocuments.every((doc) => + selectedDocuments.includes(doc.id), + ) + ? "Deselect" + : "Select"}{" "} + All Filtered + +
+ + +
- {/* Connector Selection Dialog */} - - - {}} - /> - - - - Select Connectors - - Choose which data sources to include in your research - - + {/* Connector Selection Dialog */} + + + {}} + /> + + + + Select Connectors + + Choose which data sources to include in your research + + - {/* Connector selection grid */} -
- {isLoadingConnectors ? ( -
- -
- ) : ( - connectorSourceItems.map((connector) => { - const isSelected = selectedConnectors.includes( - connector.type, - ); + {/* Connector selection grid */} +
+ {isLoadingConnectors ? ( +
+ +
+ ) : ( + connectorSourceItems.map((connector) => { + const isSelected = selectedConnectors.includes( + connector.type, + ); - return ( -
{ - setSelectedConnectors( - isSelected - ? selectedConnectors.filter( - (type) => type !== connector.type, - ) - : [...selectedConnectors, connector.type], - ); - }} - role="checkbox" - aria-checked={isSelected} - tabIndex={0} - > -
- {getConnectorIcon(connector.type)} -
- - {connector.name} - - {isSelected && ( - - )} -
- ); - }) - )} -
+ return ( +
{ + setSelectedConnectors( + isSelected + ? selectedConnectors.filter( + (type) => type !== connector.type, + ) + : [...selectedConnectors, connector.type], + ); + }} + role="checkbox" + aria-checked={isSelected} + tabIndex={0} + > +
+ {getConnectorIcon(connector.type)} +
+ + {connector.name} + + {isSelected && ( + + )} +
+ ); + }) + )} +
- -
- - -
-
-
-
+ +
+ + +
+
+
+
- {/* Search Mode Control */} -
- - -
+ {/* Search Mode Control */} +
+ + +
- {/* Research Mode Control */} -
- -
+ {/* Research Mode Control */} +
+ +
- {/* Fast LLM Selector */} -
- -
-
-
-
+ {/* Fast LLM Selector */} +
+ +
+
+
+
- {/* Reference for auto-scrolling */} -
-
- - ); + {/* Reference for auto-scrolling */} +
+
+ + ); }; export default ChatPage; diff --git a/surfsense_web/components/chat/ConnectorComponents.tsx b/surfsense_web/components/chat/ConnectorComponents.tsx index 4d0aa11ef..d7c977b98 100644 --- a/surfsense_web/components/chat/ConnectorComponents.tsx +++ b/surfsense_web/components/chat/ConnectorComponents.tsx @@ -1,6 +1,6 @@ -import React from 'react'; -import { - ChevronDown, +import React from "react"; +import { + ChevronDown, Plus, Search, Globe, @@ -12,78 +12,99 @@ import { Webhook, MessageCircle, FileText, -} from 'lucide-react'; -import { IconBrandNotion, IconBrandSlack, IconBrandYoutube, IconBrandGithub, IconLayoutKanban, IconLinkPlus, IconBrandDiscord } from "@tabler/icons-react"; -import { Button } from '@/components/ui/button'; -import { Connector, ResearchMode } from './types'; +} from "lucide-react"; +import { + IconBrandNotion, + IconBrandSlack, + IconBrandYoutube, + IconBrandGithub, + IconLayoutKanban, + IconLinkPlus, + IconBrandDiscord, + IconTicket, +} from "@tabler/icons-react"; +import { Button } from "@/components/ui/button"; +import { Connector, ResearchMode } from "./types"; // Helper function to get connector icon export const getConnectorIcon = (connectorType: string) => { const iconProps = { className: "h-4 w-4" }; - - switch(connectorType) { - case 'LINKUP_API': + + switch (connectorType) { + case "LINKUP_API": return ; - case 'LINEAR_CONNECTOR': + case "LINEAR_CONNECTOR": return ; - case 'GITHUB_CONNECTOR': + case "GITHUB_CONNECTOR": return ; - case 'YOUTUBE_VIDEO': + case "YOUTUBE_VIDEO": return ; - case 'CRAWLED_URL': + case "CRAWLED_URL": return ; - case 'FILE': - return ; - case 'EXTENSION': - return ; - case 'SERPER_API': - case 'TAVILY_API': + case "FILE": + return ; + case "EXTENSION": + return ; + case "SERPER_API": + case "TAVILY_API": return ; - case 'SLACK_CONNECTOR': + case "SLACK_CONNECTOR": return ; - case 'NOTION_CONNECTOR': + case "NOTION_CONNECTOR": return ; - case 'DISCORD_CONNECTOR': + case "DISCORD_CONNECTOR": return ; - case 'DEEP': + case "JIRA_CONNECTOR": + return ; + case "DEEP": return ; - case 'DEEPER': + case "DEEPER": return ; - case 'DEEPEST': + case "DEEPEST": return ; default: return ; } }; -export const researcherOptions: { value: ResearchMode; label: string; icon: React.ReactNode }[] = [ +export const researcherOptions: { + value: ResearchMode; + label: string; + icon: React.ReactNode; +}[] = [ { - value: 'QNA', - label: 'Q/A', - icon: getConnectorIcon('GENERAL') + value: "QNA", + label: "Q/A", + icon: getConnectorIcon("GENERAL"), }, { - value: 'REPORT_GENERAL', - label: 'General', - icon: getConnectorIcon('GENERAL') + value: "REPORT_GENERAL", + label: "General", + icon: getConnectorIcon("GENERAL"), }, { - value: 'REPORT_DEEP', - label: 'Deep', - icon: getConnectorIcon('DEEP') + value: "REPORT_DEEP", + label: "Deep", + icon: getConnectorIcon("DEEP"), }, { - value: 'REPORT_DEEPER', - label: 'Deeper', - icon: getConnectorIcon('DEEPER') + value: "REPORT_DEEPER", + label: "Deeper", + icon: getConnectorIcon("DEEPER"), }, -] +]; /** * Displays a small icon for a connector type */ -export const ConnectorIcon = ({ type, index = 0 }: { type: string; index?: number }) => ( -
( +
@@ -109,24 +130,30 @@ type ConnectorButtonProps = { /** * Button that displays selected connectors and opens connector selection dialog */ -export const ConnectorButton = ({ selectedConnectors, onClick, connectorSources }: ConnectorButtonProps) => { +export const ConnectorButton = ({ + selectedConnectors, + onClick, + connectorSources, +}: ConnectorButtonProps) => { const totalConnectors = connectorSources.length; const selectedCount = selectedConnectors.length; const progressPercentage = (selectedCount / totalConnectors) * 100; - + // Get the name of a single selected connector const getSingleConnectorName = () => { - const connector = connectorSources.find(c => c.type === selectedConnectors[0]); - return connector?.name || ''; + const connector = connectorSources.find( + (c) => c.type === selectedConnectors[0], + ); + return connector?.name || ""; }; - + // Get display text based on selection count const getDisplayText = () => { if (selectedCount === totalConnectors) return "All Connectors"; if (selectedCount === 1) return getSingleConnectorName(); return `${selectedCount} Connectors`; }; - + // Render the empty state (no connectors selected) const renderEmptyState = () => ( <> @@ -134,7 +161,7 @@ export const ConnectorButton = ({ selectedConnectors, onClick, connectorSources Select Connectors ); - + // Render the selected connectors preview const renderSelectedConnectors = () => ( <> @@ -143,32 +170,36 @@ export const ConnectorButton = ({ selectedConnectors, onClick, connectorSources {selectedConnectors.slice(0, 3).map((type, index) => ( ))} - + {/* Show count indicator if more than 3 connectors are selected */} {selectedCount > 3 && }
- + {/* Display text */} {getDisplayText()} ); - + return (
); -}; \ No newline at end of file +}; diff --git a/surfsense_web/lib/connectors/utils.ts b/surfsense_web/lib/connectors/utils.ts index 022459be8..b53ffee64 100644 --- a/surfsense_web/lib/connectors/utils.ts +++ b/surfsense_web/lib/connectors/utils.ts @@ -1,14 +1,15 @@ // Helper function to get connector type display name export const getConnectorTypeDisplay = (type: string): string => { - const typeMap: Record = { - "SERPER_API": "Serper API", - "TAVILY_API": "Tavily API", - "SLACK_CONNECTOR": "Slack", - "NOTION_CONNECTOR": "Notion", - "GITHUB_CONNECTOR": "GitHub", - "LINEAR_CONNECTOR": "Linear", - "DISCORD_CONNECTOR": "Discord", - "LINKUP_API": "Linkup", - }; - return typeMap[type] || type; -}; + const typeMap: Record = { + SERPER_API: "Serper API", + TAVILY_API: "Tavily API", + SLACK_CONNECTOR: "Slack", + NOTION_CONNECTOR: "Notion", + GITHUB_CONNECTOR: "GitHub", + LINEAR_CONNECTOR: "Linear", + JIRA_CONNECTOR: "Jira", + DISCORD_CONNECTOR: "Discord", + LINKUP_API: "Linkup", + }; + return typeMap[type] || type; +}; From 2bb013ef9d07b0302b099bf2dd74a5afecc5ee4d Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Thu, 24 Jul 2025 12:02:29 +0200 Subject: [PATCH 09/17] update the promt --- .../a2ecb2962bf19c1099cfe708e42daa0097f94976.json | 1 - surfsense_backend/app/agents/researcher/qna_agent/prompts.py | 2 +- surfsense_backend/app/connectors/jira_connector.py | 3 ++- 3 files changed, 3 insertions(+), 3 deletions(-) delete mode 100644 node_modules/.cache/prettier/.prettier-caches/a2ecb2962bf19c1099cfe708e42daa0097f94976.json diff --git a/node_modules/.cache/prettier/.prettier-caches/a2ecb2962bf19c1099cfe708e42daa0097f94976.json b/node_modules/.cache/prettier/.prettier-caches/a2ecb2962bf19c1099cfe708e42daa0097f94976.json deleted file mode 100644 index 502adfcc4..000000000 --- a/node_modules/.cache/prettier/.prettier-caches/a2ecb2962bf19c1099cfe708e42daa0097f94976.json +++ /dev/null @@ -1 +0,0 @@ -{"2d0ec64d93969318101ee479b664221b32241665":{"files":{"surfsense_web/lib/connectors/utils.ts":["RXwmTdu3JAyxa1ApFuYJiSRHfZo=",true],"surfsense_web/app/dashboard/[search_space_id]/connectors/add/page.tsx":["jZynb8hLm5uq1viyFK9UMcRClD8=",true],"surfsense_web/app/dashboard/[search_space_id]/researcher/[chat_id]/page.tsx":["LEFIcQIvBUtbTE9PuuJI0WqzdVw=",true]},"modified":1753351069225}} \ No newline at end of file diff --git a/surfsense_backend/app/agents/researcher/qna_agent/prompts.py b/surfsense_backend/app/agents/researcher/qna_agent/prompts.py index 0c5ebc158..3f4d97558 100644 --- a/surfsense_backend/app/agents/researcher/qna_agent/prompts.py +++ b/surfsense_backend/app/agents/researcher/qna_agent/prompts.py @@ -72,7 +72,7 @@ You are SurfSense, an advanced AI research assistant that provides detailed, wel Python's asyncio library provides tools for writing concurrent code using the async/await syntax. It's particularly useful for I/O-bound and high-level structured network code. - + 12 diff --git a/surfsense_backend/app/connectors/jira_connector.py b/surfsense_backend/app/connectors/jira_connector.py index 14b2147e0..5b54bb745 100644 --- a/surfsense_backend/app/connectors/jira_connector.py +++ b/surfsense_backend/app/connectors/jira_connector.py @@ -5,7 +5,8 @@ A module for retrieving data from Jira. Allows fetching issue lists and their comments, projects and more. 
""" -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Tuple +from datetime import datetime import requests From 7af65a5a6630ada0df306e0bedd6bb8b3cd8377e Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Thu, 24 Jul 2025 12:03:30 +0200 Subject: [PATCH 10/17] update the promt --- surfsense_backend/app/connectors/jira_connector.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/surfsense_backend/app/connectors/jira_connector.py b/surfsense_backend/app/connectors/jira_connector.py index 5b54bb745..65cb6575f 100644 --- a/surfsense_backend/app/connectors/jira_connector.py +++ b/surfsense_backend/app/connectors/jira_connector.py @@ -452,8 +452,6 @@ class JiraConnector: return "Unknown date" try: - from datetime import datetime - # Jira dates are typically in format: 2023-01-01T12:00:00.000+0000 dt = datetime.fromisoformat(iso_date.replace("Z", "+00:00")) return dt.strftime("%Y-%m-%d %H:%M:%S") From 4984aab3f16801c0b9c1f2c7affe4f1bb74893db Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Thu, 24 Jul 2025 22:45:47 +0200 Subject: [PATCH 11/17] update auth type --- .../app/connectors/jira_connector.py | 74 ++++++++---- .../app/connectors/test_jira_connector.py | 108 ++++++++++-------- .../app/schemas/search_source_connector.py | 106 +++++++++++------ .../app/tasks/connectors_indexing_tasks.py | 20 ++-- .../connectors/[connector_id]/edit/page.tsx | 15 ++- .../connectors/add/jira-connector/page.tsx | 62 +++++++--- 6 files changed, 245 insertions(+), 140 deletions(-) diff --git a/surfsense_backend/app/connectors/jira_connector.py b/surfsense_backend/app/connectors/jira_connector.py index 65cb6575f..2325a66fc 100644 --- a/surfsense_backend/app/connectors/jira_connector.py +++ b/surfsense_backend/app/connectors/jira_connector.py @@ -5,8 +5,10 @@ A module for retrieving data from Jira. Allows fetching issue lists and their comments, projects and more. """ -from typing import Any, Dict, List, Optional, Tuple +import base64 +import json from datetime import datetime +from typing import Any, Dict, List, Optional import requests @@ -17,55 +19,76 @@ class JiraConnector: def __init__( self, base_url: Optional[str] = None, - personal_access_token: Optional[str] = None, + email: Optional[str] = None, + api_token: Optional[str] = None, ): """ Initialize the JiraConnector class. Args: base_url: Jira instance base URL (e.g., 'https://yourcompany.atlassian.net') (optional) - personal_access_token: Jira personal access token (optional) + email: Jira account email address (optional) + api_token: Jira API token (optional) """ self.base_url = base_url.rstrip("/") if base_url else None - self.personal_access_token = personal_access_token + self.email = email + self.api_token = api_token self.api_version = "3" # Jira Cloud API version - def set_credentials(self, base_url: str, personal_access_token: str) -> None: + def set_credentials(self, base_url: str, email: str, api_token: str) -> None: """ Set the Jira credentials. Args: base_url: Jira instance base URL - personal_access_token: Jira personal access token + email: Jira account email address + api_token: Jira API token """ self.base_url = base_url.rstrip("/") - self.personal_access_token = personal_access_token + self.email = email + self.api_token = api_token - def set_personal_access_token(self, personal_access_token: str) -> None: + def set_email(self, email: str) -> None: """ - Set the Jira personal access token. + Set the Jira account email. 
Args: - personal_access_token: Jira personal access token + email: Jira account email address """ - self.personal_access_token = personal_access_token + self.email = email + + def set_api_token(self, api_token: str) -> None: + """ + Set the Jira API token. + + Args: + api_token: Jira API token + """ + self.api_token = api_token def get_headers(self) -> Dict[str, str]: """ - Get headers for Jira API requests. + Get headers for Jira API requests using Basic Authentication. Returns: Dictionary of headers Raises: - ValueError: If personal_access_token or base_url have not been set + ValueError: If email, api_token, or base_url have not been set """ - if not all([self.base_url, self.personal_access_token]): - raise ValueError("Jira personal access token or base URL not initialized.") + if not all([self.base_url, self.email, self.api_token]): + raise ValueError( + "Jira credentials not initialized. Call set_credentials() first." + ) + + # Create Basic Auth header using email:api_token + auth_str = f"{self.email}:{self.api_token}" + auth_bytes = auth_str.encode("utf-8") + auth_header = "Basic " + base64.b64encode(auth_bytes).decode("ascii") return { "Content-Type": "application/json", - "Authorization": f"Bearer {self.personal_access_token}", + "Authorization": auth_header, "Accept": "application/json", } @@ -83,17 +106,21 @@ class JiraConnector: Response data from the API Raises: - ValueError: If personal_access_token or base_url have not been set + ValueError: If email, api_token, or base_url have not been set Exception: If the API request fails """ - if not all([self.base_url, self.personal_access_token]): - raise ValueError("Jira personal access token or base URL not initialized.") + if not all([self.base_url, self.email, self.api_token]): + raise ValueError( + "Jira credentials not initialized. Call set_credentials() first." 
+ ) url = f"{self.base_url}/rest/api/{self.api_version}/{endpoint}" headers = self.get_headers() response = requests.get(url, headers=headers, params=params, timeout=500) + print(json.dumps(response.json(), indent=2)) + if response.status_code == 200: return response.json() else: @@ -197,9 +224,11 @@ class JiraConnector: try: # Build JQL query for date range # Query issues that were either created OR updated within the date range - date_filter = f"(created >= '{start_date}' AND created <= '{end_date}') OR (updated >= '{start_date}' AND updated <= '{end_date}')" + date_filter = ( + f"(createdDate >= '{start_date}' AND createdDate <= '{end_date}')" + ) - jql = f"{date_filter} ORDER BY created DESC" + jql = f"{date_filter}" if project_key: jql = ( f'project = "{project_key}" AND {date_filter} ORDER BY created DESC' @@ -234,8 +263,11 @@ class JiraConnector: while True: params["startAt"] = start_at + print(json.dumps(params, indent=2)) result = self.make_api_request("search", params) + print(json.dumps(result, indent=2)) + if not isinstance(result, dict) or "issues" not in result: return [], "Invalid response from Jira API" diff --git a/surfsense_backend/app/connectors/test_jira_connector.py b/surfsense_backend/app/connectors/test_jira_connector.py index c9b755152..a4b33b01a 100644 --- a/surfsense_backend/app/connectors/test_jira_connector.py +++ b/surfsense_backend/app/connectors/test_jira_connector.py @@ -1,104 +1,112 @@ import unittest -from unittest.mock import patch, Mock -from datetime import datetime +from unittest.mock import Mock, patch # Import the JiraConnector from .jira_connector import JiraConnector class TestJiraConnector(unittest.TestCase): - def setUp(self): """Set up test fixtures.""" self.base_url = "https://test.atlassian.net" - self.token = "test_token" - self.connector = JiraConnector(base_url=self.base_url, personal_access_token=self.token) + self.email = "test@example.com" + self.api_token = "test_api_token" + self.connector = JiraConnector( + base_url=self.base_url, email=self.email, api_token=self.api_token + ) def test_init(self): """Test JiraConnector initialization.""" self.assertEqual(self.connector.base_url, self.base_url) - self.assertEqual(self.connector.personal_access_token, self.token) + self.assertEqual(self.connector.email, self.email) + self.assertEqual(self.connector.api_token, self.api_token) self.assertEqual(self.connector.api_version, "3") def test_init_with_trailing_slash(self): """Test JiraConnector initialization with trailing slash in URL.""" - connector = JiraConnector(base_url="https://test.atlassian.net/", personal_access_token=self.token) + connector = JiraConnector( + base_url="https://test.atlassian.net/", + email=self.email, + api_token=self.api_token, + ) self.assertEqual(connector.base_url, "https://test.atlassian.net") def test_set_credentials(self): """Test setting credentials.""" new_url = "https://newtest.atlassian.net/" - new_token = "new_token" - - self.connector.set_credentials(new_url, new_token) - + new_email = "new@example.com" + new_token = "new_api_token" + + self.connector.set_credentials(new_url, new_email, new_token) + self.assertEqual(self.connector.base_url, "https://newtest.atlassian.net") - self.assertEqual(self.connector.personal_access_token, new_token) + self.assertEqual(self.connector.email, new_email) + self.assertEqual(self.connector.api_token, new_token) def test_get_headers(self): """Test header generation.""" headers = self.connector.get_headers() - - self.assertIn('Content-Type', headers) - 
self.assertIn('Authorization', headers) - self.assertIn('Accept', headers) - self.assertEqual(headers['Content-Type'], 'application/json') - self.assertEqual(headers['Accept'], 'application/json') - self.assertTrue(headers['Authorization'].startswith('Bearer ')) + + self.assertIn("Content-Type", headers) + self.assertIn("Authorization", headers) + self.assertIn("Accept", headers) + self.assertEqual(headers["Content-Type"], "application/json") + self.assertEqual(headers["Accept"], "application/json") + self.assertTrue(headers["Authorization"].startswith("Basic ")) def test_get_headers_no_credentials(self): """Test header generation without credentials.""" connector = JiraConnector() - + with self.assertRaises(ValueError) as context: connector.get_headers() - + self.assertIn("Jira credentials not initialized", str(context.exception)) - @patch('requests.get') + @patch("requests.get") def test_make_api_request_success(self, mock_get): """Test successful API request.""" mock_response = Mock() mock_response.status_code = 200 mock_response.json.return_value = {"test": "data"} mock_get.return_value = mock_response - + result = self.connector.make_api_request("test/endpoint") - + self.assertEqual(result, {"test": "data"}) mock_get.assert_called_once() - @patch('requests.get') + @patch("requests.get") def test_make_api_request_failure(self, mock_get): """Test failed API request.""" mock_response = Mock() mock_response.status_code = 401 mock_response.text = "Unauthorized" mock_get.return_value = mock_response - + with self.assertRaises(Exception) as context: self.connector.make_api_request("test/endpoint") - + self.assertIn("API request failed with status code 401", str(context.exception)) - @patch.object(JiraConnector, 'make_api_request') + @patch.object(JiraConnector, "make_api_request") def test_get_all_projects(self, mock_api_request): """Test getting all projects.""" mock_api_request.return_value = { "values": [ {"id": "1", "key": "TEST", "name": "Test Project"}, - {"id": "2", "key": "DEMO", "name": "Demo Project"} + {"id": "2", "key": "DEMO", "name": "Demo Project"}, ] } - + projects = self.connector.get_all_projects() - + self.assertEqual(len(projects), 2) self.assertEqual(projects[0]["key"], "TEST") self.assertEqual(projects[1]["key"], "DEMO") mock_api_request.assert_called_once_with("project") - @patch.object(JiraConnector, 'make_api_request') + @patch.object(JiraConnector, "make_api_request") def test_get_all_issues(self, mock_api_request): """Test getting all issues.""" mock_api_request.return_value = { @@ -114,15 +122,15 @@ class TestJiraConnector(unittest.TestCase): "issuetype": {"name": "Bug"}, "project": {"key": "TEST"}, "created": "2023-01-01T10:00:00.000+0000", - "updated": "2023-01-01T12:00:00.000+0000" - } + "updated": "2023-01-01T12:00:00.000+0000", + }, } ], - "total": 1 + "total": 1, } - + issues = self.connector.get_all_issues() - + self.assertEqual(len(issues), 1) self.assertEqual(issues[0]["key"], "TEST-1") self.assertEqual(issues[0]["fields"]["summary"], "Test Issue") @@ -144,18 +152,18 @@ class TestJiraConnector(unittest.TestCase): "reporter": { "accountId": "123", "displayName": "John Doe", - "emailAddress": "john@example.com" + "emailAddress": "john@example.com", }, "assignee": { "accountId": "456", "displayName": "Jane Smith", - "emailAddress": "jane@example.com" - } - } + "emailAddress": "jane@example.com", + }, + }, } - + formatted = self.connector.format_issue(raw_issue) - + self.assertEqual(formatted["id"], "1") self.assertEqual(formatted["key"], "TEST-1") 
        self.assertEqual(formatted["title"], "Test Issue")
@@ -170,17 +178,17 @@ class TestJiraConnector(unittest.TestCase):
         """Test date formatting."""
         iso_date = "2023-01-01T10:30:00.000+0000"
         formatted_date = JiraConnector.format_date(iso_date)
-
+
         self.assertEqual(formatted_date, "2023-01-01 10:30:00")
 
     def test_format_date_invalid(self):
         """Test date formatting with invalid input."""
         formatted_date = JiraConnector.format_date("invalid-date")
         self.assertEqual(formatted_date, "invalid-date")
-
+
         formatted_date = JiraConnector.format_date("")
         self.assertEqual(formatted_date, "Unknown date")
-
+
         formatted_date = JiraConnector.format_date(None)
         self.assertEqual(formatted_date, "Unknown date")
 
@@ -198,11 +206,11 @@ class TestJiraConnector(unittest.TestCase):
             "created_at": "2023-01-01T10:00:00.000+0000",
             "updated_at": "2023-01-01T12:00:00.000+0000",
             "description": "Test Description",
-            "comments": []
+            "comments": [],
         }
-
+
         markdown = self.connector.format_issue_to_markdown(formatted_issue)
-
+
         self.assertIn("# TEST-1: Test Issue", markdown)
         self.assertIn("**Status:** Open", markdown)
         self.assertIn("**Priority:** High", markdown)
@@ -214,5 +222,5 @@ class TestJiraConnector(unittest.TestCase):
         self.assertIn("Test Description", markdown)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/surfsense_backend/app/schemas/search_source_connector.py b/surfsense_backend/app/schemas/search_source_connector.py
index 17f1867b1..8c444a8fc 100644
--- a/surfsense_backend/app/schemas/search_source_connector.py
+++ b/surfsense_backend/app/schemas/search_source_connector.py
@@ -1,9 +1,12 @@
-from datetime import datetime
 import uuid
-from typing import Dict, Any, Optional
-from pydantic import BaseModel, field_validator, ConfigDict
-from .base import IDModel, TimestampModel
+from datetime import datetime
+from typing import Any, Dict, Optional
+
 from app.db import SearchSourceConnectorType
+from pydantic import BaseModel, ConfigDict, field_validator
+
+from .base import IDModel, TimestampModel
+
 
 class SearchSourceConnectorBase(BaseModel):
     name: str
@@ -11,105 +14,129 @@ class SearchSourceConnectorBase(BaseModel):
     is_indexable: bool
     last_indexed_at: Optional[datetime] = None
     config: Dict[str, Any]
-
-    @field_validator('config')
+
+    @field_validator("config")
     @classmethod
-    def validate_config_for_connector_type(cls, config: Dict[str, Any], values: Dict[str, Any]) -> Dict[str, Any]:
-        connector_type = values.data.get('connector_type')
-
+    def validate_config_for_connector_type(
+        cls, config: Dict[str, Any], values: Dict[str, Any]
+    ) -> Dict[str, Any]:
+        connector_type = values.data.get("connector_type")
+
         if connector_type == SearchSourceConnectorType.SERPER_API:
             # For SERPER_API, only allow SERPER_API_KEY
             allowed_keys = ["SERPER_API_KEY"]
             if set(config.keys()) != set(allowed_keys):
-                raise ValueError(f"For SERPER_API connector type, config must only contain these keys: {allowed_keys}")
-
+                raise ValueError(
+                    f"For SERPER_API connector type, config must only contain these keys: {allowed_keys}"
+                )
+
             # Ensure the API key is not empty
             if not config.get("SERPER_API_KEY"):
                 raise ValueError("SERPER_API_KEY cannot be empty")
-
+
         elif connector_type == SearchSourceConnectorType.TAVILY_API:
             # For TAVILY_API, only allow TAVILY_API_KEY
             allowed_keys = ["TAVILY_API_KEY"]
             if set(config.keys()) != set(allowed_keys):
-                raise ValueError(f"For TAVILY_API connector type, config must only contain these keys: {allowed_keys}")
-
+                raise ValueError(
+                    f"For TAVILY_API connector type, config must only contain these keys: {allowed_keys}"
+                )
+
             # Ensure the API key is not empty
             if not config.get("TAVILY_API_KEY"):
                 raise ValueError("TAVILY_API_KEY cannot be empty")
-
+
         elif connector_type == SearchSourceConnectorType.LINKUP_API:
             # For LINKUP_API, only allow LINKUP_API_KEY
             allowed_keys = ["LINKUP_API_KEY"]
             if set(config.keys()) != set(allowed_keys):
-                raise ValueError(f"For LINKUP_API connector type, config must only contain these keys: {allowed_keys}")
-
+                raise ValueError(
+                    f"For LINKUP_API connector type, config must only contain these keys: {allowed_keys}"
+                )
+
             # Ensure the API key is not empty
             if not config.get("LINKUP_API_KEY"):
                 raise ValueError("LINKUP_API_KEY cannot be empty")
-
+
         elif connector_type == SearchSourceConnectorType.SLACK_CONNECTOR:
             # For SLACK_CONNECTOR, only allow SLACK_BOT_TOKEN
             allowed_keys = ["SLACK_BOT_TOKEN"]
             if set(config.keys()) != set(allowed_keys):
-                raise ValueError(f"For SLACK_CONNECTOR connector type, config must only contain these keys: {allowed_keys}")
+                raise ValueError(
+                    f"For SLACK_CONNECTOR connector type, config must only contain these keys: {allowed_keys}"
+                )
 
             # Ensure the bot token is not empty
             if not config.get("SLACK_BOT_TOKEN"):
                 raise ValueError("SLACK_BOT_TOKEN cannot be empty")
-
+
         elif connector_type == SearchSourceConnectorType.NOTION_CONNECTOR:
             # For NOTION_CONNECTOR, only allow NOTION_INTEGRATION_TOKEN
             allowed_keys = ["NOTION_INTEGRATION_TOKEN"]
             if set(config.keys()) != set(allowed_keys):
-                raise ValueError(f"For NOTION_CONNECTOR connector type, config must only contain these keys: {allowed_keys}")
-
+                raise ValueError(
+                    f"For NOTION_CONNECTOR connector type, config must only contain these keys: {allowed_keys}"
+                )
+
             # Ensure the integration token is not empty
             if not config.get("NOTION_INTEGRATION_TOKEN"):
                 raise ValueError("NOTION_INTEGRATION_TOKEN cannot be empty")
-
+
         elif connector_type == SearchSourceConnectorType.GITHUB_CONNECTOR:
             # For GITHUB_CONNECTOR, only allow GITHUB_PAT and repo_full_names
             allowed_keys = ["GITHUB_PAT", "repo_full_names"]
             if set(config.keys()) != set(allowed_keys):
-                raise ValueError(f"For GITHUB_CONNECTOR connector type, config must only contain these keys: {allowed_keys}")
-
+                raise ValueError(
+                    f"For GITHUB_CONNECTOR connector type, config must only contain these keys: {allowed_keys}"
+                )
+
             # Ensure the token is not empty
             if not config.get("GITHUB_PAT"):
                 raise ValueError("GITHUB_PAT cannot be empty")
-
+
             # Ensure the repo_full_names is present and is a non-empty list
             repo_full_names = config.get("repo_full_names")
             if not isinstance(repo_full_names, list) or not repo_full_names:
                 raise ValueError("repo_full_names must be a non-empty list of strings")
-
+
         elif connector_type == SearchSourceConnectorType.LINEAR_CONNECTOR:
             # For LINEAR_CONNECTOR, only allow LINEAR_API_KEY
             allowed_keys = ["LINEAR_API_KEY"]
             if set(config.keys()) != set(allowed_keys):
-                raise ValueError(f"For LINEAR_CONNECTOR connector type, config must only contain these keys: {allowed_keys}")
-
+                raise ValueError(
+                    f"For LINEAR_CONNECTOR connector type, config must only contain these keys: {allowed_keys}"
+                )
+
             # Ensure the token is not empty
             if not config.get("LINEAR_API_KEY"):
                 raise ValueError("LINEAR_API_KEY cannot be empty")
-
+
         elif connector_type == SearchSourceConnectorType.DISCORD_CONNECTOR:
             # For DISCORD_CONNECTOR, only allow DISCORD_BOT_TOKEN
             allowed_keys = ["DISCORD_BOT_TOKEN"]
             if set(config.keys()) != set(allowed_keys):
-                raise ValueError(f"For DISCORD_CONNECTOR connector type, config must only contain these keys: {allowed_keys}")
+                raise ValueError(
+                    f"For DISCORD_CONNECTOR connector type, config must only contain these keys: {allowed_keys}"
+                )
 
             # Ensure the bot token is not empty
             if not config.get("DISCORD_BOT_TOKEN"):
                 raise ValueError("DISCORD_BOT_TOKEN cannot be empty")
 
         elif connector_type == SearchSourceConnectorType.JIRA_CONNECTOR:
-            # For JIRA_CONNECTOR, allow JIRA_PERSONAL_ACCESS_TOKEN and JIRA_BASE_URL
-            allowed_keys = ["JIRA_PERSONAL_ACCESS_TOKEN", "JIRA_BASE_URL"]
+            # For JIRA_CONNECTOR, require JIRA_EMAIL, JIRA_API_TOKEN and JIRA_BASE_URL
+            allowed_keys = ["JIRA_EMAIL", "JIRA_API_TOKEN", "JIRA_BASE_URL"]
             if set(config.keys()) != set(allowed_keys):
-                raise ValueError(f"For JIRA_CONNECTOR connector type, config must only contain these keys: {allowed_keys}")
+                raise ValueError(
+                    f"For JIRA_CONNECTOR connector type, config must only contain these keys: {allowed_keys}"
+                )
 
-            # Ensure the token is not empty
-            if not config.get("JIRA_PERSONAL_ACCESS_TOKEN"):
-                raise ValueError("JIRA_PERSONAL_ACCESS_TOKEN cannot be empty")
+            # Ensure the email is not empty
+            if not config.get("JIRA_EMAIL"):
+                raise ValueError("JIRA_EMAIL cannot be empty")
+
+            # Ensure the API token is not empty
+            if not config.get("JIRA_API_TOKEN"):
+                raise ValueError("JIRA_API_TOKEN cannot be empty")
 
             # Ensure the base URL is not empty
             if not config.get("JIRA_BASE_URL"):
@@ -117,9 +144,11 @@ class SearchSourceConnectorBase(BaseModel):
 
         return config
 
+
 class SearchSourceConnectorCreate(SearchSourceConnectorBase):
     pass
 
+
 class SearchSourceConnectorUpdate(BaseModel):
     name: Optional[str] = None
     connector_type: Optional[SearchSourceConnectorType] = None
@@ -127,7 +156,8 @@ class SearchSourceConnectorUpdate(BaseModel):
     last_indexed_at: Optional[datetime] = None
     config: Optional[Dict[str, Any]] = None
 
+
 class SearchSourceConnectorRead(SearchSourceConnectorBase, IDModel, TimestampModel):
     user_id: uuid.UUID
 
-    model_config = ConfigDict(from_attributes=True)
+    model_config = ConfigDict(from_attributes=True)
diff --git a/surfsense_backend/app/tasks/connectors_indexing_tasks.py b/surfsense_backend/app/tasks/connectors_indexing_tasks.py
index f4ae13971..b01a2a118 100644
--- a/surfsense_backend/app/tasks/connectors_indexing_tasks.py
+++ b/surfsense_backend/app/tasks/connectors_indexing_tasks.py
@@ -1,4 +1,5 @@
 import asyncio
+import json
 import logging
 from datetime import datetime, timedelta, timezone
 from typing import Optional, Tuple
@@ -2041,10 +2042,11 @@ async def index_jira_issues(
             return 0, f"Connector with ID {connector_id} not found"
 
         # Get the Jira credentials from the connector config
-        jira_token = connector.config.get("JIRA_PERSONAL_ACCESS_TOKEN")
+        jira_email = connector.config.get("JIRA_EMAIL")
+        jira_api_token = connector.config.get("JIRA_API_TOKEN")
         jira_base_url = connector.config.get("JIRA_BASE_URL")
 
-        if not jira_token or not jira_base_url:
+        if not jira_email or not jira_api_token or not jira_base_url:
             await task_logger.log_task_failure(
                 log_entry,
                 f"Jira credentials not found in connector config for connector {connector_id}",
@@ -2061,7 +2063,7 @@
         )
 
         jira_client = JiraConnector(
-            base_url=jira_base_url, personal_access_token=jira_token
+            base_url=jira_base_url, email=jira_email, api_token=jira_api_token
         )
 
         # Calculate date range
@@ -2097,6 +2099,8 @@
             start_date=start_date_str, end_date=end_date_str, include_comments=True
         )
 
+        logger.debug("Jira issues payload: %s", json.dumps(issues, indent=2))
+
         if error:
             logger.error(f"Failed to get Jira issues: {error}")
 
@@ -2112,10 +2116,10 @@
f"Updated last_indexed_at to {connector.last_indexed_at} despite no issues found" ) - await task_logger.log_task_completion( + await task_logger.log_task_success( log_entry, f"No Jira issues found in date range {start_date_str} to {end_date_str}", - {"indexed_count": 0}, + {"issues_found": 0}, ) return 0, None else: @@ -2132,7 +2136,7 @@ async def index_jira_issues( await task_logger.log_task_progress( log_entry, f"Retrieved {len(issues)} issues from Jira API", - {"stage": "processing_issues", "issue_count": len(issues)}, + {"stage": "processing_issues", "issues_found": len(issues)}, ) except Exception as e: @@ -2254,10 +2258,10 @@ async def index_jira_issues( await session.commit() logger.info(f"Updated last_indexed_at to {connector.last_indexed_at}") - await task_logger.log_task_completion( + await task_logger.log_task_success( log_entry, f"Successfully indexed {indexed_count} Jira issues", - {"indexed_count": indexed_count}, + {"issues_indexed": indexed_count}, ) logger.info(f"Successfully indexed {indexed_count} Jira issues") diff --git a/surfsense_web/app/dashboard/[search_space_id]/connectors/[connector_id]/edit/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/connectors/[connector_id]/edit/page.tsx index 4292b7efa..918a625d5 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/connectors/[connector_id]/edit/page.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/connectors/[connector_id]/edit/page.tsx @@ -193,10 +193,17 @@ export default function EditConnectorPage() { /> +
 					)}
diff --git a/surfsense_web/app/dashboard/[search_space_id]/connectors/add/jira-connector/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/connectors/add/jira-connector/page.tsx
index 625adfa0d..23e128f1f 100644
--- a/surfsense_web/app/dashboard/[search_space_id]/connectors/add/jira-connector/page.tsx
+++ b/surfsense_web/app/dashboard/[search_space_id]/connectors/add/jira-connector/page.tsx
@@ -57,8 +57,11 @@ const jiraConnectorFormSchema = z.object({
 			message: "Please enter a valid Jira instance URL",
 		},
 	),
-	personal_access_token: z.string().min(10, {
-		message: "Jira Personal Access Token is required and must be valid.",
+	email: z.string().email({
+		message: "Please enter a valid email address.",
+	}),
+	api_token: z.string().min(10, {
+		message: "Jira API Token is required and must be valid.",
 	}),
 });
 
@@ -78,7 +81,8 @@ export default function JiraConnectorPage() {
 		defaultValues: {
 			name: "Jira Connector",
 			base_url: "",
-			personal_access_token: "",
+			email: "",
+			api_token: "",
 		},
 	});
 
@@ -91,7 +95,8 @@
 				connector_type: "JIRA_CONNECTOR",
 				config: {
 					JIRA_BASE_URL: values.base_url,
-					JIRA_PERSONAL_ACCESS_TOKEN: values.personal_access_token,
+					JIRA_EMAIL: values.email,
+					JIRA_API_TOKEN: values.api_token,
 				},
 				is_indexable: true,
 				last_indexed_at: null,
@@ -210,20 +215,40 @@
 						(
-							Personal Access Token
+							Email Address
-								Your Jira Personal Access Token will be encrypted
-								and stored securely.
+								Your Atlassian account email address.
+
+
+
+						)}
+					/>
+
+						(
+
+							API Token
+
+
+
+
+								Your Jira API Token will be encrypted and stored securely.
@@ -296,8 +321,8 @@ export default function JiraConnectorPage() {

How it works

-							The Jira connector uses the Jira REST API to fetch all
-							issues and comments that the Personal Access Token has
+							The Jira connector uses the Jira REST API with Basic Authentication
+							to fetch all issues and comments that your account has
 							access to within your Jira instance.
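
A note on the Basic Authentication scheme this patch series switches to: Jira Cloud
expects the Authorization header to carry base64("<email>:<api_token>"), which is why
the connector config now requires JIRA_EMAIL alongside JIRA_API_TOKEN and why the
updated test_get_headers asserts startswith("Basic ") instead of "Bearer ". The sketch
below shows that header construction; build_basic_auth_headers is an illustrative name,
not the connector's actual method, and the real get_headers in jira_connector.py may
differ in detail.

import base64


def build_basic_auth_headers(email: str, api_token: str) -> dict[str, str]:
    """Minimal sketch of the Basic auth headers Jira Cloud expects."""
    # RFC 7617 Basic auth: base64-encode "<email>:<api_token>"
    credentials = base64.b64encode(f"{email}:{api_token}".encode()).decode()
    return {
        "Content-Type": "application/json",
        "Accept": "application/json",
        "Authorization": f"Basic {credentials}",
    }


# Mirrors the updated assertion in test_get_headers:
headers = build_basic_auth_headers("user@example.com", "example-token")
assert headers["Authorization"].startswith("Basic ")

For contrast, a Personal Access Token on Jira Server/Data Center is sent as
"Bearer <token>" with no email, which is what the earlier revision of this module did;
Jira Cloud API tokens only authenticate through the Basic scheme shown here.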