diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0548e6667..78de72540 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -67,7 +67,7 @@ repos: hooks: - id: mypy files: ^surfsense_backend/ - additional_dependencies: [] + additional_dependencies: ['types-requests'] args: [--ignore-missing-imports, --disallow-untyped-defs] - repo: https://github.com/PyCQA/bandit @@ -75,7 +75,7 @@ repos: hooks: - id: bandit files: ^surfsense_backend/ - args: ['-r', '.', '-f', 'json'] + args: ['-r', '-f', 'json'] exclude: ^surfsense_backend/(tests/|alembic/) # Frontend/Extension Hooks (TypeScript/JavaScript) diff --git a/node_modules/.cache/prettier/.prettier-caches/a2ecb2962bf19c1099cfe708e42daa0097f94976.json b/node_modules/.cache/prettier/.prettier-caches/a2ecb2962bf19c1099cfe708e42daa0097f94976.json new file mode 100644 index 000000000..e744e3ae9 --- /dev/null +++ b/node_modules/.cache/prettier/.prettier-caches/a2ecb2962bf19c1099cfe708e42daa0097f94976.json @@ -0,0 +1 @@ +{"2d0ec64d93969318101ee479b664221b32241665":{"files":{"surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/page.tsx":["EHKKvlOK0vfy0GgHwlG/J2Bx5rw=",true]},"modified":1753426633288}} \ No newline at end of file diff --git a/surfsense_backend/alembic/versions/11_add_llm_config_table_and_relationships.py b/surfsense_backend/alembic/versions/11_add_llm_config_table_and_relationships.py index 83fdef1f1..007cd704e 100644 --- a/surfsense_backend/alembic/versions/11_add_llm_config_table_and_relationships.py +++ b/surfsense_backend/alembic/versions/11_add_llm_config_table_and_relationships.py @@ -6,10 +6,8 @@ Revises: 10 from typing import Sequence, Union -from alembic import op import sqlalchemy as sa -from sqlalchemy.dialects.postgresql import UUID, JSON - +from alembic import op # revision identifiers, used by Alembic. 
revision: str = "11" @@ -20,67 +18,145 @@ depends_on: Union[str, Sequence[str], None] = None def upgrade() -> None: """Upgrade schema - add LiteLLMProvider enum, LLMConfig table and user LLM preferences.""" - - # Check if enum type exists and create if it doesn't - op.execute(""" + + # Create enum only if not exists + op.execute( + """ DO $$ BEGIN IF NOT EXISTS (SELECT 1 FROM pg_type WHERE typname = 'litellmprovider') THEN - CREATE TYPE litellmprovider AS ENUM ('OPENAI', 'ANTHROPIC', 'GROQ', 'COHERE', 'HUGGINGFACE', 'AZURE_OPENAI', 'GOOGLE', 'AWS_BEDROCK', 'OLLAMA', 'MISTRAL', 'TOGETHER_AI', 'REPLICATE', 'PALM', 'VERTEX_AI', 'ANYSCALE', 'PERPLEXITY', 'DEEPINFRA', 'AI21', 'NLPCLOUD', 'ALEPH_ALPHA', 'PETALS', 'CUSTOM'); + CREATE TYPE litellmprovider AS ENUM ( + 'OPENAI', 'ANTHROPIC', 'GROQ', 'COHERE', 'HUGGINGFACE', + 'AZURE_OPENAI', 'GOOGLE', 'AWS_BEDROCK', 'OLLAMA', 'MISTRAL', + 'TOGETHER_AI', 'REPLICATE', 'PALM', 'VERTEX_AI', 'ANYSCALE', + 'PERPLEXITY', 'DEEPINFRA', 'AI21', 'NLPCLOUD', 'ALEPH_ALPHA', + 'PETALS', 'CUSTOM' + ); END IF; END$$; - """) - - # Create llm_configs table using raw SQL to avoid enum creation conflicts - op.execute(""" - CREATE TABLE llm_configs ( - id SERIAL PRIMARY KEY, - created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(), - name VARCHAR(100) NOT NULL, - provider litellmprovider NOT NULL, - custom_provider VARCHAR(100), - model_name VARCHAR(100) NOT NULL, - api_key TEXT NOT NULL, - api_base VARCHAR(500), - litellm_params JSONB, - user_id UUID NOT NULL REFERENCES "user"(id) ON DELETE CASCADE - ) - """) - - # Create indexes - op.create_index(op.f('ix_llm_configs_id'), 'llm_configs', ['id'], unique=False) - op.create_index(op.f('ix_llm_configs_created_at'), 'llm_configs', ['created_at'], unique=False) - op.create_index(op.f('ix_llm_configs_name'), 'llm_configs', ['name'], unique=False) - - # Add LLM preference columns to user table - op.add_column('user', sa.Column('long_context_llm_id', sa.Integer(), nullable=True)) - 
op.add_column('user', sa.Column('fast_llm_id', sa.Integer(), nullable=True)) - op.add_column('user', sa.Column('strategic_llm_id', sa.Integer(), nullable=True)) - - # Create foreign key constraints for LLM preferences - op.create_foreign_key(op.f('fk_user_long_context_llm_id_llm_configs'), 'user', 'llm_configs', ['long_context_llm_id'], ['id'], ondelete='SET NULL') - op.create_foreign_key(op.f('fk_user_fast_llm_id_llm_configs'), 'user', 'llm_configs', ['fast_llm_id'], ['id'], ondelete='SET NULL') - op.create_foreign_key(op.f('fk_user_strategic_llm_id_llm_configs'), 'user', 'llm_configs', ['strategic_llm_id'], ['id'], ondelete='SET NULL') + """ + ) + + # Create llm_configs table only if it doesn't already exist + op.execute( + """ + DO $$ + BEGIN + IF NOT EXISTS ( + SELECT FROM information_schema.tables + WHERE table_name = 'llm_configs' + ) THEN + CREATE TABLE llm_configs ( + id SERIAL PRIMARY KEY, + created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(), + name VARCHAR(100) NOT NULL, + provider litellmprovider NOT NULL, + custom_provider VARCHAR(100), + model_name VARCHAR(100) NOT NULL, + api_key TEXT NOT NULL, + api_base VARCHAR(500), + litellm_params JSONB, + user_id UUID NOT NULL REFERENCES "user"(id) ON DELETE CASCADE + ); + END IF; + END$$; + """ + ) + + # Create indexes if they don't exist + op.execute( + """ + DO $$ + BEGIN + IF NOT EXISTS ( + SELECT 1 FROM pg_indexes + WHERE tablename = 'llm_configs' AND indexname = 'ix_llm_configs_id' + ) THEN + CREATE INDEX ix_llm_configs_id ON llm_configs(id); + END IF; + + IF NOT EXISTS ( + SELECT 1 FROM pg_indexes + WHERE tablename = 'llm_configs' AND indexname = 'ix_llm_configs_created_at' + ) THEN + CREATE INDEX ix_llm_configs_created_at ON llm_configs(created_at); + END IF; + + IF NOT EXISTS ( + SELECT 1 FROM pg_indexes + WHERE tablename = 'llm_configs' AND indexname = 'ix_llm_configs_name' + ) THEN + CREATE INDEX ix_llm_configs_name ON llm_configs(name); + END IF; + END$$; + """ + ) + + # Safely add columns to 
user table + bind = op.get_bind() + inspector = sa.inspect(bind) + existing_columns = [col["name"] for col in inspector.get_columns("user")] + + with op.batch_alter_table("user") as batch_op: + if "long_context_llm_id" not in existing_columns: + batch_op.add_column( + sa.Column("long_context_llm_id", sa.Integer(), nullable=True) + ) + batch_op.create_foreign_key( + op.f("fk_user_long_context_llm_id_llm_configs"), + "llm_configs", + ["long_context_llm_id"], + ["id"], + ondelete="SET NULL", + ) + + if "fast_llm_id" not in existing_columns: + batch_op.add_column(sa.Column("fast_llm_id", sa.Integer(), nullable=True)) + batch_op.create_foreign_key( + op.f("fk_user_fast_llm_id_llm_configs"), + "llm_configs", + ["fast_llm_id"], + ["id"], + ondelete="SET NULL", + ) + + if "strategic_llm_id" not in existing_columns: + batch_op.add_column( + sa.Column("strategic_llm_id", sa.Integer(), nullable=True) + ) + batch_op.create_foreign_key( + op.f("fk_user_strategic_llm_id_llm_configs"), + "llm_configs", + ["strategic_llm_id"], + ["id"], + ondelete="SET NULL", + ) def downgrade() -> None: """Downgrade schema - remove LLMConfig table and user LLM preferences.""" - + # Drop foreign key constraints - op.drop_constraint(op.f('fk_user_strategic_llm_id_llm_configs'), 'user', type_='foreignkey') - op.drop_constraint(op.f('fk_user_fast_llm_id_llm_configs'), 'user', type_='foreignkey') - op.drop_constraint(op.f('fk_user_long_context_llm_id_llm_configs'), 'user', type_='foreignkey') - + op.drop_constraint( + op.f("fk_user_strategic_llm_id_llm_configs"), "user", type_="foreignkey" + ) + op.drop_constraint( + op.f("fk_user_fast_llm_id_llm_configs"), "user", type_="foreignkey" + ) + op.drop_constraint( + op.f("fk_user_long_context_llm_id_llm_configs"), "user", type_="foreignkey" + ) + # Drop LLM preference columns from user table - op.drop_column('user', 'strategic_llm_id') - op.drop_column('user', 'fast_llm_id') - op.drop_column('user', 'long_context_llm_id') - + op.drop_column("user", 
"strategic_llm_id") + op.drop_column("user", "fast_llm_id") + op.drop_column("user", "long_context_llm_id") + # Drop indexes and table - op.drop_index(op.f('ix_llm_configs_name'), table_name='llm_configs') - op.drop_index(op.f('ix_llm_configs_created_at'), table_name='llm_configs') - op.drop_index(op.f('ix_llm_configs_id'), table_name='llm_configs') - op.drop_table('llm_configs') - + op.drop_index(op.f("ix_llm_configs_name"), table_name="llm_configs") + op.drop_index(op.f("ix_llm_configs_created_at"), table_name="llm_configs") + op.drop_index(op.f("ix_llm_configs_id"), table_name="llm_configs") + op.drop_table("llm_configs") + # Drop LiteLLMProvider enum - op.execute("DROP TYPE IF EXISTS litellmprovider") \ No newline at end of file + op.execute("DROP TYPE IF EXISTS litellmprovider") diff --git a/surfsense_backend/alembic/versions/12_add_logs_table.py b/surfsense_backend/alembic/versions/12_add_logs_table.py index 0b2cc13c8..2fc8b2b02 100644 --- a/surfsense_backend/alembic/versions/12_add_logs_table.py +++ b/surfsense_backend/alembic/versions/12_add_logs_table.py @@ -7,9 +7,7 @@ Revises: 11 from typing import Sequence, Union from alembic import op -import sqlalchemy as sa -from sqlalchemy.dialects.postgresql import JSON - +from sqlalchemy import inspect # revision identifiers, used by Alembic. 
revision: str = "12" @@ -20,52 +18,78 @@ depends_on: Union[str, Sequence[str], None] = None def upgrade() -> None: """Upgrade schema - add LogLevel and LogStatus enums and logs table.""" - - # Create LogLevel enum - op.execute(""" - CREATE TYPE loglevel AS ENUM ('DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL') - """) - - # Create LogStatus enum - op.execute(""" - CREATE TYPE logstatus AS ENUM ('IN_PROGRESS', 'SUCCESS', 'FAILED') - """) - - # Create logs table - op.execute(""" - CREATE TABLE logs ( + + # Create LogLevel enum if it doesn't exist + op.execute( + """ + DO $$ + BEGIN + IF NOT EXISTS (SELECT 1 FROM pg_type WHERE typname = 'loglevel') THEN + CREATE TYPE loglevel AS ENUM ('DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'); + END IF; + END$$; + """ + ) + + # Create LogStatus enum if it doesn't exist + op.execute( + """ + DO $$ + BEGIN + IF NOT EXISTS (SELECT 1 FROM pg_type WHERE typname = 'logstatus') THEN + CREATE TYPE logstatus AS ENUM ('IN_PROGRESS', 'SUCCESS', 'FAILED'); + END IF; + END$$; + """ + ) + + # Create logs table if it doesn't exist + op.execute( + """ + CREATE TABLE IF NOT EXISTS logs ( id SERIAL PRIMARY KEY, - created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(), + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), level loglevel NOT NULL, status logstatus NOT NULL, message TEXT NOT NULL, source VARCHAR(200), log_metadata JSONB DEFAULT '{}', search_space_id INTEGER NOT NULL REFERENCES searchspaces(id) ON DELETE CASCADE - ) - """) - - # Create indexes - op.create_index(op.f('ix_logs_id'), 'logs', ['id'], unique=False) - op.create_index(op.f('ix_logs_created_at'), 'logs', ['created_at'], unique=False) - op.create_index(op.f('ix_logs_level'), 'logs', ['level'], unique=False) - op.create_index(op.f('ix_logs_status'), 'logs', ['status'], unique=False) - op.create_index(op.f('ix_logs_source'), 'logs', ['source'], unique=False) + ); + """ + ) + + # Get existing indexes + conn = op.get_bind() + inspector = inspect(conn) + existing_indexes = 
[idx["name"] for idx in inspector.get_indexes("logs")] + + # Create indexes only if they don't already exist + if "ix_logs_id" not in existing_indexes: + op.create_index("ix_logs_id", "logs", ["id"]) + if "ix_logs_created_at" not in existing_indexes: + op.create_index("ix_logs_created_at", "logs", ["created_at"]) + if "ix_logs_level" not in existing_indexes: + op.create_index("ix_logs_level", "logs", ["level"]) + if "ix_logs_status" not in existing_indexes: + op.create_index("ix_logs_status", "logs", ["status"]) + if "ix_logs_source" not in existing_indexes: + op.create_index("ix_logs_source", "logs", ["source"]) def downgrade() -> None: """Downgrade schema - remove logs table and enums.""" - + # Drop indexes - op.drop_index(op.f('ix_logs_source'), table_name='logs') - op.drop_index(op.f('ix_logs_status'), table_name='logs') - op.drop_index(op.f('ix_logs_level'), table_name='logs') - op.drop_index(op.f('ix_logs_created_at'), table_name='logs') - op.drop_index(op.f('ix_logs_id'), table_name='logs') - + op.drop_index("ix_logs_source", table_name="logs") + op.drop_index("ix_logs_status", table_name="logs") + op.drop_index("ix_logs_level", table_name="logs") + op.drop_index("ix_logs_created_at", table_name="logs") + op.drop_index("ix_logs_id", table_name="logs") + # Drop logs table - op.drop_table('logs') - + op.drop_table("logs") + # Drop enums op.execute("DROP TYPE IF EXISTS logstatus") - op.execute("DROP TYPE IF EXISTS loglevel") \ No newline at end of file + op.execute("DROP TYPE IF EXISTS loglevel") diff --git a/surfsense_backend/alembic/versions/13_add_jira_connector_enums.py b/surfsense_backend/alembic/versions/13_add_jira_connector_enums.py new file mode 100644 index 000000000..78d95f17b --- /dev/null +++ b/surfsense_backend/alembic/versions/13_add_jira_connector_enums.py @@ -0,0 +1,61 @@ +"""Add JIRA_CONNECTOR to enums + +Revision ID: 13 +Revises: 12 +""" + +from typing import Sequence, Union + +from alembic import op + +# revision identifiers, used by 
Alembic. +revision: str = "13" +down_revision: Union[str, None] = "12" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Safely add 'JIRA_CONNECTOR' to enum types if missing.""" + + # Add to searchsourceconnectortype enum + op.execute( + """ + DO $$ + BEGIN + IF NOT EXISTS ( + SELECT 1 FROM pg_type t + JOIN pg_enum e ON t.oid = e.enumtypid + WHERE t.typname = 'searchsourceconnectortype' AND e.enumlabel = 'JIRA_CONNECTOR' + ) THEN + ALTER TYPE searchsourceconnectortype ADD VALUE 'JIRA_CONNECTOR'; + END IF; + END + $$; + """ + ) + + # Add to documenttype enum + op.execute( + """ + DO $$ + BEGIN + IF NOT EXISTS ( + SELECT 1 FROM pg_type t + JOIN pg_enum e ON t.oid = e.enumtypid + WHERE t.typname = 'documenttype' AND e.enumlabel = 'JIRA_CONNECTOR' + ) THEN + ALTER TYPE documenttype ADD VALUE 'JIRA_CONNECTOR'; + END IF; + END + $$; + """ + ) + + +def downgrade() -> None: + """ + Downgrade logic not implemented since PostgreSQL + does not support removing enum values. + """ + pass diff --git a/surfsense_backend/alembic/versions/1_add_github_connector_enum.py b/surfsense_backend/alembic/versions/1_add_github_connector_enum.py index 1902777b6..d094c9912 100644 --- a/surfsense_backend/alembic/versions/1_add_github_connector_enum.py +++ b/surfsense_backend/alembic/versions/1_add_github_connector_enum.py @@ -1,19 +1,20 @@ """Add GITHUB_CONNECTOR to SearchSourceConnectorType enum Revision ID: 1 -Revises: +Revises: """ + from typing import Sequence, Union from alembic import op -import sqlalchemy as sa + # Import pgvector if needed for other types, though not for this ENUM change # import pgvector # revision identifiers, used by Alembic. 
-revision: str = '1' +revision: str = "1" down_revision: Union[str, None] = None branch_labels: Union[str, Sequence[str], None] = None depends_on: Union[str, Sequence[str], None] = None @@ -21,11 +22,27 @@ depends_on: Union[str, Sequence[str], None] = None def upgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### - + # Manually add the command to add the enum value # Note: It's generally better to let autogenerate handle this, but we're bypassing it - op.execute("ALTER TYPE searchsourceconnectortype ADD VALUE 'GITHUB_CONNECTOR'") - + op.execute( + """ +DO $$ +BEGIN + IF NOT EXISTS ( + SELECT 1 + FROM pg_enum + WHERE enumlabel = 'GITHUB_CONNECTOR' + AND enumtypid = ( + SELECT oid FROM pg_type WHERE typname = 'searchsourceconnectortype' + ) + ) THEN + ALTER TYPE searchsourceconnectortype ADD VALUE 'GITHUB_CONNECTOR'; + END IF; +END$$; +""" + ) + # Pass for the rest, as autogenerate didn't run to add other schema details pass # ### end Alembic commands ### @@ -33,20 +50,25 @@ def upgrade() -> None: def downgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### - + # Downgrading removal of an enum value is complex and potentially dangerous # if the value is in use. Often omitted or requires manual SQL based on context. - # For now, we'll just pass. If you needed to reverse this, you'd likely + # For now, we'll just pass. If you needed to reverse this, you'd likely # have to manually check if 'GITHUB_CONNECTOR' is used in the table # and then potentially recreate the type without it. 
- op.execute("ALTER TYPE searchsourceconnectortype RENAME TO searchsourceconnectortype_old") - op.execute("CREATE TYPE searchsourceconnectortype AS ENUM('SERPER_API', 'TAVILY_API', 'SLACK_CONNECTOR', 'NOTION_CONNECTOR')") - op.execute(( - "ALTER TABLE search_source_connectors ALTER COLUMN connector_type TYPE searchsourceconnectortype USING " - "connector_type::text::searchsourceconnectortype" - )) + op.execute( + "ALTER TYPE searchsourceconnectortype RENAME TO searchsourceconnectortype_old" + ) + op.execute( + "CREATE TYPE searchsourceconnectortype AS ENUM('SERPER_API', 'TAVILY_API', 'SLACK_CONNECTOR', 'NOTION_CONNECTOR')" + ) + op.execute( + ( + "ALTER TABLE search_source_connectors ALTER COLUMN connector_type TYPE searchsourceconnectortype USING " + "connector_type::text::searchsourceconnectortype" + ) + ) op.execute("DROP TYPE searchsourceconnectortype_old") - pass - # ### end Alembic commands ### + # ### end Alembic commands ### diff --git a/surfsense_backend/alembic/versions/2_add_linear_connector_enum.py b/surfsense_backend/alembic/versions/2_add_linear_connector_enum.py index 526c7c3ad..31fcee803 100644 --- a/surfsense_backend/alembic/versions/2_add_linear_connector_enum.py +++ b/surfsense_backend/alembic/versions/2_add_linear_connector_enum.py @@ -4,41 +4,57 @@ Revision ID: 2 Revises: e55302644c51 """ + from typing import Sequence, Union from alembic import op -import sqlalchemy as sa - # revision identifiers, used by Alembic. -revision: str = '2' -down_revision: Union[str, None] = 'e55302644c51' +revision: str = "2" +down_revision: Union[str, None] = "e55302644c51" branch_labels: Union[str, Sequence[str], None] = None depends_on: Union[str, Sequence[str], None] = None def upgrade() -> None: - # ### commands auto generated by Alembic - please adjust! 
### - - # Manually add the command to add the enum value - op.execute("ALTER TYPE searchsourceconnectortype ADD VALUE 'LINEAR_CONNECTOR'") - - # Pass for the rest, as autogenerate didn't run to add other schema details - pass - # ### end Alembic commands ### + op.execute( + """ + DO $$ + BEGIN + IF NOT EXISTS ( + SELECT 1 FROM pg_enum + WHERE enumlabel = 'LINEAR_CONNECTOR' + AND enumtypid = ( + SELECT oid FROM pg_type WHERE typname = 'searchsourceconnectortype' + ) + ) THEN + ALTER TYPE searchsourceconnectortype ADD VALUE 'LINEAR_CONNECTOR'; + END IF; + END$$; + """ + ) + + +# def downgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### - + # Downgrading removal of an enum value requires recreating the type - op.execute("ALTER TYPE searchsourceconnectortype RENAME TO searchsourceconnectortype_old") - op.execute("CREATE TYPE searchsourceconnectortype AS ENUM('SERPER_API', 'TAVILY_API', 'SLACK_CONNECTOR', 'NOTION_CONNECTOR', 'GITHUB_CONNECTOR')") - op.execute(( - "ALTER TABLE search_source_connectors ALTER COLUMN connector_type TYPE searchsourceconnectortype USING " - "connector_type::text::searchsourceconnectortype" - )) + op.execute( + "ALTER TYPE searchsourceconnectortype RENAME TO searchsourceconnectortype_old" + ) + op.execute( + "CREATE TYPE searchsourceconnectortype AS ENUM('SERPER_API', 'TAVILY_API', 'SLACK_CONNECTOR', 'NOTION_CONNECTOR', 'GITHUB_CONNECTOR')" + ) + op.execute( + ( + "ALTER TABLE search_source_connectors ALTER COLUMN connector_type TYPE searchsourceconnectortype USING " + "connector_type::text::searchsourceconnectortype" + ) + ) op.execute("DROP TYPE searchsourceconnectortype_old") pass - # ### end Alembic commands ### \ No newline at end of file + # ### end Alembic commands ### diff --git a/surfsense_backend/alembic/versions/3_add_linear_connector_to_documenttype_.py b/surfsense_backend/alembic/versions/3_add_linear_connector_to_documenttype_.py index e71ee2ed4..b108699d4 100644 --- 
a/surfsense_backend/alembic/versions/3_add_linear_connector_to_documenttype_.py +++ b/surfsense_backend/alembic/versions/3_add_linear_connector_to_documenttype_.py @@ -4,26 +4,41 @@ Revision ID: 3 Revises: 2 """ + from typing import Sequence, Union from alembic import op -import sqlalchemy as sa - # revision identifiers, used by Alembic. -revision: str = '3' -down_revision: Union[str, None] = '2' +revision: str = "3" +down_revision: Union[str, None] = "2" branch_labels: Union[str, Sequence[str], None] = None depends_on: Union[str, Sequence[str], None] = None # Define the ENUM type name and the new value -ENUM_NAME = 'documenttype' # Make sure this matches the name in your DB (usually lowercase class name) -NEW_VALUE = 'LINEAR_CONNECTOR' +ENUM_NAME = "documenttype" # Make sure this matches the name in your DB (usually lowercase class name) +NEW_VALUE = "LINEAR_CONNECTOR" + def upgrade() -> None: """Upgrade schema.""" - op.execute(f"ALTER TYPE {ENUM_NAME} ADD VALUE '{NEW_VALUE}'") - + op.execute( + f""" + DO $$ + BEGIN + IF NOT EXISTS ( + SELECT 1 FROM pg_enum + WHERE enumlabel = '{NEW_VALUE}' + AND enumtypid = ( + SELECT oid FROM pg_type WHERE typname = '{ENUM_NAME}' + ) + ) THEN + ALTER TYPE {ENUM_NAME} ADD VALUE '{NEW_VALUE}'; + END IF; + END$$; + """ + ) + # Warning: This will delete all rows with the new value def downgrade() -> None: @@ -34,19 +49,19 @@ def downgrade() -> None: # Enum values *before* LINEAR_CONNECTOR was added old_values = ( - 'EXTENSION', - 'CRAWLED_URL', - 'FILE', - 'SLACK_CONNECTOR', - 'NOTION_CONNECTOR', - 'YOUTUBE_VIDEO', - 'GITHUB_CONNECTOR' + "EXTENSION", + "CRAWLED_URL", + "FILE", + "SLACK_CONNECTOR", + "NOTION_CONNECTOR", + "YOUTUBE_VIDEO", + "GITHUB_CONNECTOR", ) old_values_sql = ", ".join([f"'{v}'" for v in old_values]) # Table and column names (adjust if different) - table_name = 'documents' - column_name = 'document_type' + table_name = "documents" + column_name = "document_type" # 1. 
Rename the current enum type op.execute(f"ALTER TYPE {ENUM_NAME} RENAME TO {old_enum_name}") @@ -54,10 +69,8 @@ def downgrade() -> None: # 2. Create the new enum type with the old values op.execute(f"CREATE TYPE {ENUM_NAME} AS ENUM({old_values_sql})") - # 3. Update the table: - op.execute( - f"DELETE FROM {table_name} WHERE {column_name}::text = '{NEW_VALUE}'" - ) + # 3. Update the table: + op.execute(f"DELETE FROM {table_name} WHERE {column_name}::text = '{NEW_VALUE}'") # 4. Alter the column to use the new enum type (casting old values) op.execute( @@ -67,4 +80,4 @@ def downgrade() -> None: # 5. Drop the old enum type op.execute(f"DROP TYPE {old_enum_name}") - # ### end Alembic commands ### \ No newline at end of file + # ### end Alembic commands ### diff --git a/surfsense_backend/alembic/versions/4_add_linkup_api_enum.py b/surfsense_backend/alembic/versions/4_add_linkup_api_enum.py index 093bdf067..6720ae71f 100644 --- a/surfsense_backend/alembic/versions/4_add_linkup_api_enum.py +++ b/surfsense_backend/alembic/versions/4_add_linkup_api_enum.py @@ -4,41 +4,57 @@ Revision ID: 4 Revises: 3 """ + from typing import Sequence, Union from alembic import op -import sqlalchemy as sa - # revision identifiers, used by Alembic. -revision: str = '4' -down_revision: Union[str, None] = '3' +revision: str = "4" +down_revision: Union[str, None] = "3" branch_labels: Union[str, Sequence[str], None] = None depends_on: Union[str, Sequence[str], None] = None def upgrade() -> None: - # ### commands auto generated by Alembic - please adjust! 
### - - # Manually add the command to add the enum value - op.execute("ALTER TYPE searchsourceconnectortype ADD VALUE 'LINKUP_API'") - - # Pass for the rest, as autogenerate didn't run to add other schema details - pass - # ### end Alembic commands ### + ENUM_NAME = "searchsourceconnectortype" + NEW_VALUE = "LINKUP_API" + + op.execute( + f""" + DO $$ + BEGIN + IF NOT EXISTS ( + SELECT 1 FROM pg_enum + WHERE enumlabel = '{NEW_VALUE}' + AND enumtypid = ( + SELECT oid FROM pg_type WHERE typname = '{ENUM_NAME}' + ) + ) THEN + ALTER TYPE {ENUM_NAME} ADD VALUE '{NEW_VALUE}'; + END IF; + END$$; + """ + ) def downgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### - + # Downgrading removal of an enum value requires recreating the type - op.execute("ALTER TYPE searchsourceconnectortype RENAME TO searchsourceconnectortype_old") - op.execute("CREATE TYPE searchsourceconnectortype AS ENUM('SERPER_API', 'TAVILY_API', 'SLACK_CONNECTOR', 'NOTION_CONNECTOR', 'GITHUB_CONNECTOR', 'LINEAR_CONNECTOR')") - op.execute(( - "ALTER TABLE search_source_connectors ALTER COLUMN connector_type TYPE searchsourceconnectortype USING " - "connector_type::text::searchsourceconnectortype" - )) + op.execute( + "ALTER TYPE searchsourceconnectortype RENAME TO searchsourceconnectortype_old" + ) + op.execute( + "CREATE TYPE searchsourceconnectortype AS ENUM('SERPER_API', 'TAVILY_API', 'SLACK_CONNECTOR', 'NOTION_CONNECTOR', 'GITHUB_CONNECTOR', 'LINEAR_CONNECTOR')" + ) + op.execute( + ( + "ALTER TABLE search_source_connectors ALTER COLUMN connector_type TYPE searchsourceconnectortype USING " + "connector_type::text::searchsourceconnectortype" + ) + ) op.execute("DROP TYPE searchsourceconnectortype_old") pass - # ### end Alembic commands ### \ No newline at end of file + # ### end Alembic commands ### diff --git a/surfsense_backend/alembic/versions/6_change_podcast_content_to_transcript.py b/surfsense_backend/alembic/versions/6_change_podcast_content_to_transcript.py index 
fa7a0f8f6..3f0865f84 100644 --- a/surfsense_backend/alembic/versions/6_change_podcast_content_to_transcript.py +++ b/surfsense_backend/alembic/versions/6_change_podcast_content_to_transcript.py @@ -4,40 +4,58 @@ Revision ID: 6 Revises: 5 """ + from typing import Sequence, Union -from alembic import op import sqlalchemy as sa +from alembic import op +from sqlalchemy import inspect from sqlalchemy.dialects.postgresql import JSON - # revision identifiers, used by Alembic. -revision: str = '6' -down_revision: Union[str, None] = '5' +revision: str = "6" +down_revision: Union[str, None] = "5" branch_labels: Union[str, Sequence[str], None] = None depends_on: Union[str, Sequence[str], None] = None def upgrade() -> None: - # Drop the old column and create a new one with the new name and type - # We need to do this because PostgreSQL doesn't support direct column renames with type changes - op.add_column('podcasts', sa.Column('podcast_transcript', JSON, nullable=False, server_default='{}')) - - # Copy data from old column to new column - # Convert text to JSON by storing it as a JSON string value - op.execute("UPDATE podcasts SET podcast_transcript = jsonb_build_object('text', podcast_content) WHERE podcast_content != ''") - - # Drop the old column - op.drop_column('podcasts', 'podcast_content') + bind = op.get_bind() + inspector = inspect(bind) + + columns = [col["name"] for col in inspector.get_columns("podcasts")] + if "podcast_transcript" not in columns: + op.add_column( + "podcasts", + sa.Column("podcast_transcript", JSON, nullable=False, server_default="{}"), + ) + + # Copy data from old column to new column + op.execute( + """ + UPDATE podcasts + SET podcast_transcript = jsonb_build_object('text', podcast_content) + WHERE podcast_content != '' + """ + ) + + # Drop the old column only if it exists + if "podcast_content" in columns: + op.drop_column("podcasts", "podcast_content") def downgrade() -> None: # Add back the original column - op.add_column('podcasts', 
sa.Column('podcast_content', sa.Text(), nullable=False, server_default='')) - + op.add_column( + "podcasts", + sa.Column("podcast_content", sa.Text(), nullable=False, server_default=""), + ) + # Copy data from JSON column back to text column # Extract the 'text' field if it exists, otherwise use empty string - op.execute("UPDATE podcasts SET podcast_content = COALESCE((podcast_transcript->>'text'), '')") - + op.execute( + "UPDATE podcasts SET podcast_content = COALESCE((podcast_transcript->>'text'), '')" + ) + # Drop the new column - op.drop_column('podcasts', 'podcast_transcript') \ No newline at end of file + op.drop_column("podcasts", "podcast_transcript") diff --git a/surfsense_backend/alembic/versions/7_remove_is_generated_column.py b/surfsense_backend/alembic/versions/7_remove_is_generated_column.py index 03048a146..62b273b62 100644 --- a/surfsense_backend/alembic/versions/7_remove_is_generated_column.py +++ b/surfsense_backend/alembic/versions/7_remove_is_generated_column.py @@ -4,24 +4,34 @@ Revision ID: 7 Revises: 6 """ + from typing import Sequence, Union -from alembic import op import sqlalchemy as sa - +from alembic import op +from sqlalchemy import inspect # revision identifiers, used by Alembic. 
-revision: str = '7' -down_revision: Union[str, None] = '6' +revision: str = "7" +down_revision: Union[str, None] = "6" branch_labels: Union[str, Sequence[str], None] = None depends_on: Union[str, Sequence[str], None] = None def upgrade() -> None: - # Drop the is_generated column - op.drop_column('podcasts', 'is_generated') + # Get the current database connection + bind = op.get_bind() + inspector = inspect(bind) + + # Check if the column exists before attempting to drop it + columns = [col["name"] for col in inspector.get_columns("podcasts")] + if "is_generated" in columns: + op.drop_column("podcasts", "is_generated") def downgrade() -> None: # Add back the is_generated column with its original constraints - op.add_column('podcasts', sa.Column('is_generated', sa.Boolean(), nullable=False, server_default='false')) \ No newline at end of file + op.add_column( + "podcasts", + sa.Column("is_generated", sa.Boolean(), nullable=False, server_default="false"), + ) diff --git a/surfsense_backend/alembic/versions/8_add_content_hash_to_documents.py b/surfsense_backend/alembic/versions/8_add_content_hash_to_documents.py index 64982fc56..976c6d316 100644 --- a/surfsense_backend/alembic/versions/8_add_content_hash_to_documents.py +++ b/surfsense_backend/alembic/versions/8_add_content_hash_to_documents.py @@ -3,54 +3,68 @@ Revision ID: 8 Revises: 7 """ + from typing import Sequence, Union -from alembic import op import sqlalchemy as sa - +from alembic import op +from sqlalchemy import inspect # revision identifiers, used by Alembic. 
-revision: str = '8' -down_revision: Union[str, None] = '7' +revision: str = "8" +down_revision: Union[str, None] = "7" branch_labels: Union[str, Sequence[str], None] = None depends_on: Union[str, Sequence[str], None] = None def upgrade() -> None: - # Add content_hash column as nullable first to handle existing data - op.add_column('documents', sa.Column('content_hash', sa.String(), nullable=True)) - - # Update existing documents to generate content hashes - # Using SHA-256 hash of the content column with proper UTF-8 encoding - op.execute(""" - UPDATE documents - SET content_hash = encode(sha256(convert_to(content, 'UTF8')), 'hex') - WHERE content_hash IS NULL - """) - - # Handle duplicate content hashes by keeping only the oldest document for each hash - # Delete newer documents with duplicate content hashes - op.execute(""" - DELETE FROM documents - WHERE id NOT IN ( - SELECT MIN(id) - FROM documents - GROUP BY content_hash + bind = op.get_bind() + inspector = inspect(bind) + columns = [col["name"] for col in inspector.get_columns("documents")] + + # Only add the column if it doesn't already exist + if "content_hash" not in columns: + op.add_column( + "documents", sa.Column("content_hash", sa.String(), nullable=True) ) - """) - - # Now alter the column to match the model: nullable=False, index=True, unique=True - op.alter_column('documents', 'content_hash', - existing_type=sa.String(), - nullable=False) - op.create_index(op.f('ix_documents_content_hash'), 'documents', ['content_hash'], unique=False) - op.create_unique_constraint(op.f('uq_documents_content_hash'), 'documents', ['content_hash']) + + # Populate the content_hash column + op.execute( + """ + UPDATE documents + SET content_hash = encode(sha256(convert_to(content, 'UTF8')), 'hex') + WHERE content_hash IS NULL + """ + ) + + op.execute( + """ + DELETE FROM documents + WHERE id NOT IN ( + SELECT MIN(id) + FROM documents + GROUP BY content_hash + ) + """ + ) + + op.alter_column( + "documents", 
"content_hash", existing_type=sa.String(), nullable=False + ) + op.create_index( + op.f("ix_documents_content_hash"), + "documents", + ["content_hash"], + unique=False, + ) + op.create_unique_constraint( + op.f("uq_documents_content_hash"), "documents", ["content_hash"] + ) + else: + print("Column 'content_hash' already exists. Skipping column creation.") def downgrade() -> None: - # Remove constraints and index first - op.drop_constraint(op.f('uq_documents_content_hash'), 'documents', type_='unique') - op.drop_index(op.f('ix_documents_content_hash'), table_name='documents') - - # Remove content_hash column from documents table - op.drop_column('documents', 'content_hash') \ No newline at end of file + op.drop_constraint(op.f("uq_documents_content_hash"), "documents", type_="unique") + op.drop_index(op.f("ix_documents_content_hash"), table_name="documents") + op.drop_column("documents", "content_hash") diff --git a/surfsense_backend/alembic/versions/9_add_discord_connector_enum_and_documenttype.py b/surfsense_backend/alembic/versions/9_add_discord_connector_enum_and_documenttype.py index fbf748ae6..4dec11230 100644 --- a/surfsense_backend/alembic/versions/9_add_discord_connector_enum_and_documenttype.py +++ b/surfsense_backend/alembic/versions/9_add_discord_connector_enum_and_documenttype.py @@ -7,8 +7,6 @@ Revises: 8 from typing import Sequence, Union from alembic import op -import sqlalchemy as sa - # revision identifiers, used by Alembic. 
revision: str = "9" @@ -24,11 +22,38 @@ DOCUMENT_NEW_VALUE = "DISCORD_CONNECTOR" def upgrade() -> None: - """Upgrade schema - add DISCORD_CONNECTOR to connector and document enum.""" - # Add DISCORD_CONNECTOR to searchsourceconnectortype - op.execute(f"ALTER TYPE {CONNECTOR_ENUM} ADD VALUE '{CONNECTOR_NEW_VALUE}'") - # Add DISCORD_CONNECTOR to documenttype - op.execute(f"ALTER TYPE {DOCUMENT_ENUM} ADD VALUE '{DOCUMENT_NEW_VALUE}'") + """Upgrade schema - add DISCORD_CONNECTOR to connector and document enum safely.""" + # Add DISCORD_CONNECTOR to searchsourceconnectortype only if not exists + op.execute( + f""" + DO $$ + BEGIN + IF NOT EXISTS ( + SELECT 1 FROM pg_enum + WHERE enumlabel = '{CONNECTOR_NEW_VALUE}' + AND enumtypid = (SELECT oid FROM pg_type WHERE typname = '{CONNECTOR_ENUM}') + ) THEN + ALTER TYPE {CONNECTOR_ENUM} ADD VALUE '{CONNECTOR_NEW_VALUE}'; + END IF; + END$$; + """ + ) + + # Add DISCORD_CONNECTOR to documenttype only if not exists + op.execute( + f""" + DO $$ + BEGIN + IF NOT EXISTS ( + SELECT 1 FROM pg_enum + WHERE enumlabel = '{DOCUMENT_NEW_VALUE}' + AND enumtypid = (SELECT oid FROM pg_type WHERE typname = '{DOCUMENT_ENUM}') + ) THEN + ALTER TYPE {DOCUMENT_ENUM} ADD VALUE '{DOCUMENT_NEW_VALUE}'; + END IF; + END$$; + """ + ) def downgrade() -> None: @@ -85,7 +110,6 @@ def downgrade() -> None: # 4. Drop the old connector enum type op.execute(f"DROP TYPE {old_connector_enum_name}") - # Document Enum Downgrade Steps # 1. 
Rename the current document enum type op.execute(f"ALTER TYPE {DOCUMENT_ENUM} RENAME TO {old_document_enum_name}") diff --git a/surfsense_backend/alembic/versions/e55302644c51_add_github_connector_to_documenttype_.py b/surfsense_backend/alembic/versions/e55302644c51_add_github_connector_to_documenttype_.py index 12d653794..9c93eb7c2 100644 --- a/surfsense_backend/alembic/versions/e55302644c51_add_github_connector_to_documenttype_.py +++ b/surfsense_backend/alembic/versions/e55302644c51_add_github_connector_to_documenttype_.py @@ -1,69 +1,67 @@ -"""Add GITHUB_CONNECTOR to DocumentType enum - -Revision ID: e55302644c51 -Revises: 1 - -""" from typing import Sequence, Union from alembic import op -import sqlalchemy as sa - # revision identifiers, used by Alembic. -revision: str = 'e55302644c51' -down_revision: Union[str, None] = '1' +revision: str = "e55302644c51" +down_revision: Union[str, None] = "1" branch_labels: Union[str, Sequence[str], None] = None depends_on: Union[str, Sequence[str], None] = None # Define the ENUM type name and the new value -ENUM_NAME = 'documenttype' # Make sure this matches the name in your DB (usually lowercase class name) -NEW_VALUE = 'GITHUB_CONNECTOR' +ENUM_NAME = "documenttype" +NEW_VALUE = "GITHUB_CONNECTOR" + def upgrade() -> None: """Upgrade schema.""" - op.execute(f"ALTER TYPE {ENUM_NAME} ADD VALUE '{NEW_VALUE}'") - + op.execute( + f""" + DO $$ + BEGIN + IF NOT EXISTS ( + SELECT 1 FROM pg_enum + WHERE enumlabel = '{NEW_VALUE}' + AND enumtypid = ( + SELECT oid FROM pg_type WHERE typname = '{ENUM_NAME}' + ) + ) THEN + ALTER TYPE {ENUM_NAME} ADD VALUE '{NEW_VALUE}'; + END IF; + END$$; + """ + ) + -# Warning: This will delete all rows with the new value def downgrade() -> None: """Downgrade schema - remove GITHUB_CONNECTOR from enum.""" - - # The old type name old_enum_name = f"{ENUM_NAME}_old" - # Enum values *before* GITHUB_CONNECTOR was added old_values = ( - 'EXTENSION', - 'CRAWLED_URL', - 'FILE', - 'SLACK_CONNECTOR', - 
'NOTION_CONNECTOR', - 'YOUTUBE_VIDEO' + "EXTENSION", + "CRAWLED_URL", + "FILE", + "SLACK_CONNECTOR", + "NOTION_CONNECTOR", + "YOUTUBE_VIDEO", ) old_values_sql = ", ".join([f"'{v}'" for v in old_values]) - # Table and column names (adjust if different) - table_name = 'documents' - column_name = 'document_type' + table_name = "documents" + column_name = "document_type" - # 1. Rename the current enum type - op.execute(f"ALTER TYPE {ENUM_NAME} RENAME TO {old_enum_name}") + # 1. Create the new enum type with the old values + op.execute(f"CREATE TYPE {old_enum_name} AS ENUM({old_values_sql})") - # 2. Create the new enum type with the old values - op.execute(f"CREATE TYPE {ENUM_NAME} AS ENUM({old_values_sql})") + # 2. Delete rows using the new value + op.execute(f"DELETE FROM {table_name} WHERE {column_name}::text = '{NEW_VALUE}'") - # 3. Update the table: - op.execute( - f"DELETE FROM {table_name} WHERE {column_name}::text = '{NEW_VALUE}'" - ) - - # 4. Alter the column to use the new enum type (casting old values) + # 3. Alter the column to use the old enum type op.execute( f"ALTER TABLE {table_name} ALTER COLUMN {column_name} " - f"TYPE {ENUM_NAME} USING {column_name}::text::{ENUM_NAME}" + f"TYPE {old_enum_name} USING {column_name}::text::{old_enum_name}" ) - # 5. Drop the old enum type - op.execute(f"DROP TYPE {old_enum_name}") - # ### end Alembic commands ### + # 4. 
Drop the current enum type and rename the old one + op.execute(f"DROP TYPE {ENUM_NAME}") + op.execute(f"ALTER TYPE {old_enum_name} RENAME TO {ENUM_NAME}") diff --git a/surfsense_backend/app/agents/researcher/nodes.py b/surfsense_backend/app/agents/researcher/nodes.py index 30d572a60..7919465c3 100644 --- a/surfsense_backend/app/agents/researcher/nodes.py +++ b/surfsense_backend/app/agents/researcher/nodes.py @@ -2,78 +2,79 @@ import asyncio import json from typing import Any, Dict, List +from app.db import Document, SearchSpace from app.services.connector_service import ConnectorService +from app.services.query_service import QueryService from langchain_core.messages import HumanMessage, SystemMessage from langchain_core.runnables import RunnableConfig - -from sqlalchemy.ext.asyncio import AsyncSession - -from .configuration import Configuration, SearchMode -from .prompts import get_answer_outline_system_prompt, get_further_questions_system_prompt -from .state import State -from .sub_section_writer.graph import graph as sub_section_writer_graph -from .sub_section_writer.configuration import SubSectionType -from .qna_agent.graph import graph as qna_agent_graph -from .utils import AnswerOutline, get_connector_emoji, get_connector_friendly_name - -from app.services.query_service import QueryService - from langgraph.types import StreamWriter +from sqlalchemy.ext.asyncio import AsyncSession # Additional imports for document fetching from sqlalchemy.future import select -from app.db import Document, SearchSpace + +from .configuration import Configuration, SearchMode +from .prompts import ( + get_answer_outline_system_prompt, + get_further_questions_system_prompt, +) +from .qna_agent.graph import graph as qna_agent_graph +from .state import State +from .sub_section_writer.configuration import SubSectionType +from .sub_section_writer.graph import graph as sub_section_writer_graph +from .utils import AnswerOutline, get_connector_emoji, get_connector_friendly_name + async 
def fetch_documents_by_ids( - document_ids: List[int], - user_id: str, - db_session: AsyncSession + document_ids: List[int], user_id: str, db_session: AsyncSession ) -> tuple[List[Dict[str, Any]], List[Dict[str, Any]]]: """ Fetch documents by their IDs with ownership check using DOCUMENTS mode approach. - + This function ensures that only documents belonging to the user are fetched, providing security by checking ownership through SearchSpace association. Similar to SearchMode.DOCUMENTS, it fetches full documents and concatenates their chunks. Also creates source objects for UI display, grouped by document type. - + Args: document_ids: List of document IDs to fetch user_id: The user ID to check ownership db_session: The database session - + Returns: Tuple of (source_objects, document_chunks) - similar to ConnectorService pattern """ if not document_ids: return [], [] - + try: # Query documents with ownership check result = await db_session.execute( select(Document) .join(SearchSpace) - .filter( - Document.id.in_(document_ids), - SearchSpace.user_id == user_id - ) + .filter(Document.id.in_(document_ids), SearchSpace.user_id == user_id) ) documents = result.scalars().all() - + # Group documents by type for source object creation documents_by_type = {} formatted_documents = [] - + for doc in documents: # Fetch associated chunks for this document (similar to DocumentHybridSearchRetriever) from app.db import Chunk - chunks_query = select(Chunk).where(Chunk.document_id == doc.id).order_by(Chunk.id) + + chunks_query = ( + select(Chunk).where(Chunk.document_id == doc.id).order_by(Chunk.id) + ) chunks_result = await db_session.execute(chunks_query) chunks = chunks_result.scalars().all() - + # Concatenate chunks content (similar to SearchMode.DOCUMENTS approach) - concatenated_chunks_content = " ".join([chunk.content for chunk in chunks]) if chunks else doc.content - + concatenated_chunks_content = ( + " ".join([chunk.content for chunk in chunks]) if chunks else doc.content 
+ ) + # Format to match connector service return format formatted_doc = { "chunk_id": f"user_doc_{doc.id}", @@ -82,143 +83,250 @@ async def fetch_documents_by_ids( "document": { "id": doc.id, "title": doc.title, - "document_type": doc.document_type.value if doc.document_type else "UNKNOWN", + "document_type": ( + doc.document_type.value if doc.document_type else "UNKNOWN" + ), "metadata": doc.document_metadata or {}, }, - "source": doc.document_type.value if doc.document_type else "UNKNOWN" + "source": doc.document_type.value if doc.document_type else "UNKNOWN", } formatted_documents.append(formatted_doc) - + # Group by document type for source objects doc_type = doc.document_type.value if doc.document_type else "UNKNOWN" if doc_type not in documents_by_type: documents_by_type[doc_type] = [] documents_by_type[doc_type].append(doc) - + # Create source objects for each document type (similar to ConnectorService) source_objects = [] - connector_id_counter = 100 # Start from 100 to avoid conflicts with regular connectors - + connector_id_counter = ( + 100 # Start from 100 to avoid conflicts with regular connectors + ) + for doc_type, docs in documents_by_type.items(): sources_list = [] - + for doc in docs: metadata = doc.document_metadata or {} - + # Create type-specific source formatting (similar to ConnectorService) if doc_type == "LINEAR_CONNECTOR": # Extract Linear-specific metadata - issue_identifier = metadata.get('issue_identifier', '') - issue_title = metadata.get('issue_title', doc.title) - issue_state = metadata.get('state', '') - comment_count = metadata.get('comment_count', 0) - + issue_identifier = metadata.get("issue_identifier", "") + issue_title = metadata.get("issue_title", doc.title) + issue_state = metadata.get("state", "") + comment_count = metadata.get("comment_count", 0) + # Create a more descriptive title for Linear issues - title = f"Linear: {issue_identifier} - {issue_title}" if issue_identifier else f"Linear: {issue_title}" + title = ( + 
f"Linear: {issue_identifier} - {issue_title}" + if issue_identifier + else f"Linear: {issue_title}" + ) if issue_state: title += f" ({issue_state})" - + # Create description - description = doc.content[:100] + "..." if len(doc.content) > 100 else doc.content + description = ( + doc.content[:100] + "..." + if len(doc.content) > 100 + else doc.content + ) if comment_count: description += f" | Comments: {comment_count}" - + # Create URL - url = f"https://linear.app/issue/{issue_identifier}" if issue_identifier else "" - + url = ( + f"https://linear.app/issue/{issue_identifier}" + if issue_identifier + else "" + ) + elif doc_type == "SLACK_CONNECTOR": # Extract Slack-specific metadata - channel_name = metadata.get('channel_name', 'Unknown Channel') - channel_id = metadata.get('channel_id', '') - message_date = metadata.get('start_date', '') - + channel_name = metadata.get("channel_name", "Unknown Channel") + channel_id = metadata.get("channel_id", "") + message_date = metadata.get("start_date", "") + title = f"Slack: {channel_name}" if message_date: title += f" ({message_date})" - - description = doc.content[:100] + "..." if len(doc.content) > 100 else doc.content - url = f"https://slack.com/app_redirect?channel={channel_id}" if channel_id else "" - + + description = ( + doc.content[:100] + "..." + if len(doc.content) > 100 + else doc.content + ) + url = ( + f"https://slack.com/app_redirect?channel={channel_id}" + if channel_id + else "" + ) + elif doc_type == "NOTION_CONNECTOR": # Extract Notion-specific metadata - page_title = metadata.get('page_title', doc.title) - page_id = metadata.get('page_id', '') - + page_title = metadata.get("page_title", doc.title) + page_id = metadata.get("page_id", "") + title = f"Notion: {page_title}" - description = doc.content[:100] + "..." if len(doc.content) > 100 else doc.content - url = f"https://notion.so/{page_id.replace('-', '')}" if page_id else "" - + description = ( + doc.content[:100] + "..." 
+ if len(doc.content) > 100 + else doc.content + ) + url = ( + f"https://notion.so/{page_id.replace('-', '')}" + if page_id + else "" + ) + elif doc_type == "GITHUB_CONNECTOR": title = f"GitHub: {doc.title}" - description = metadata.get('description', doc.content[:100] + "..." if len(doc.content) > 100 else doc.content) - url = metadata.get('url', '') - + description = metadata.get( + "description", + ( + doc.content[:100] + "..." + if len(doc.content) > 100 + else doc.content + ), + ) + url = metadata.get("url", "") + elif doc_type == "YOUTUBE_VIDEO": # Extract YouTube-specific metadata - video_title = metadata.get('video_title', doc.title) - video_id = metadata.get('video_id', '') - channel_name = metadata.get('channel_name', '') - + video_title = metadata.get("video_title", doc.title) + video_id = metadata.get("video_id", "") + channel_name = metadata.get("channel_name", "") + title = video_title if channel_name: title += f" - {channel_name}" - - description = metadata.get('description', doc.content[:100] + "..." if len(doc.content) > 100 else doc.content) - url = f"https://www.youtube.com/watch?v={video_id}" if video_id else "" - + + description = metadata.get( + "description", + ( + doc.content[:100] + "..." + if len(doc.content) > 100 + else doc.content + ), + ) + url = ( + f"https://www.youtube.com/watch?v={video_id}" + if video_id + else "" + ) + elif doc_type == "DISCORD_CONNECTOR": # Extract Discord-specific metadata - channel_name = metadata.get('channel_name', 'Unknown Channel') - channel_id = metadata.get('channel_id', '') - guild_id = metadata.get('guild_id', '') - message_date = metadata.get('start_date', '') - + channel_name = metadata.get("channel_name", "Unknown Channel") + channel_id = metadata.get("channel_id", "") + guild_id = metadata.get("guild_id", "") + message_date = metadata.get("start_date", "") + title = f"Discord: {channel_name}" if message_date: title += f" ({message_date})" - - description = doc.content[:100] + "..." 
if len(doc.content) > 100 else doc.content - + + description = ( + doc.content[:100] + "..." + if len(doc.content) > 100 + else doc.content + ) + if guild_id and channel_id: url = f"https://discord.com/channels/{guild_id}/{channel_id}" elif channel_id: url = f"https://discord.com/channels/@me/{channel_id}" else: url = "" - + + elif doc_type == "JIRA_CONNECTOR": + # Extract Jira-specific metadata + issue_key = metadata.get("issue_key", "Unknown Issue") + issue_title = metadata.get("issue_title", "Untitled Issue") + status = metadata.get("status", "") + priority = metadata.get("priority", "") + issue_type = metadata.get("issue_type", "") + + title = f"Jira: {issue_key} - {issue_title}" + if status: + title += f" ({status})" + + description = ( + doc.content[:100] + "..." + if len(doc.content) > 100 + else doc.content + ) + if priority: + description += f" | Priority: {priority}" + if issue_type: + description += f" | Type: {issue_type}" + + # Construct Jira URL if we have the base URL + base_url = metadata.get("base_url", "") + if base_url and issue_key: + url = f"{base_url}/browse/{issue_key}" + else: + url = "" + elif doc_type == "EXTENSION": # Extract Extension-specific metadata - webpage_title = metadata.get('VisitedWebPageTitle', doc.title) - webpage_url = metadata.get('VisitedWebPageURL', '') - visit_date = metadata.get('VisitedWebPageDateWithTimeInISOString', '') - + webpage_title = metadata.get("VisitedWebPageTitle", doc.title) + webpage_url = metadata.get("VisitedWebPageURL", "") + visit_date = metadata.get( + "VisitedWebPageDateWithTimeInISOString", "" + ) + title = webpage_title if visit_date: - formatted_date = visit_date.split('T')[0] if 'T' in visit_date else visit_date + formatted_date = ( + visit_date.split("T")[0] + if "T" in visit_date + else visit_date + ) title += f" (visited: {formatted_date})" - - description = doc.content[:100] + "..." if len(doc.content) > 100 else doc.content + + description = ( + doc.content[:100] + "..." 
+ if len(doc.content) > 100 + else doc.content + ) url = webpage_url - + elif doc_type == "CRAWLED_URL": title = doc.title - description = metadata.get('og:description', metadata.get('ogDescription', doc.content[:100] + "..." if len(doc.content) > 100 else doc.content)) - url = metadata.get('url', '') - + description = metadata.get( + "og:description", + metadata.get( + "ogDescription", + ( + doc.content[:100] + "..." + if len(doc.content) > 100 + else doc.content + ), + ), + ) + url = metadata.get("url", "") + else: # FILE and other types title = doc.title - description = doc.content[:100] + "..." if len(doc.content) > 100 else doc.content - url = metadata.get('url', '') - + description = ( + doc.content[:100] + "..." + if len(doc.content) > 100 + else doc.content + ) + url = metadata.get("url", "") + # Create source entry source = { "id": doc.id, "title": title, "description": description, - "url": url + "url": url, } sources_list.append(source) - + # Create source object for this document type friendly_type_names = { "LINEAR_CONNECTOR": "Linear Issues (Selected)", @@ -227,11 +335,12 @@ async def fetch_documents_by_ids( "GITHUB_CONNECTOR": "GitHub (Selected)", "YOUTUBE_VIDEO": "YouTube Videos (Selected)", "DISCORD_CONNECTOR": "Discord (Selected)", + "JIRA_CONNECTOR": "Jira Issues (Selected)", "EXTENSION": "Browser Extension (Selected)", "CRAWLED_URL": "Web Pages (Selected)", - "FILE": "Files (Selected)" + "FILE": "Files (Selected)", } - + source_object = { "id": connector_id_counter, "name": friendly_type_names.get(doc_type, f"{doc_type} (Selected)"), @@ -240,31 +349,34 @@ async def fetch_documents_by_ids( } source_objects.append(source_object) connector_id_counter += 1 - - print(f"Fetched {len(formatted_documents)} user-selected documents (with concatenated chunks) from {len(document_ids)} requested IDs") + + print( + f"Fetched {len(formatted_documents)} user-selected documents (with concatenated chunks) from {len(document_ids)} requested IDs" + ) 
print(f"Created {len(source_objects)} source objects for UI display") - + return source_objects, formatted_documents - + except Exception as e: print(f"Error fetching documents by IDs: {str(e)}") return [], [] -async def write_answer_outline(state: State, config: RunnableConfig, writer: StreamWriter) -> Dict[str, Any]: +async def write_answer_outline( + state: State, config: RunnableConfig, writer: StreamWriter +) -> Dict[str, Any]: """ Create a structured answer outline based on the user query. - + This node takes the user query and number of sections from the configuration and uses an LLM to generate a comprehensive outline with logical sections and research questions for each section. - + Returns: Dict containing the answer outline in the "answer_outline" key for state update. """ from app.services.llm_service import get_user_strategic_llm - from app.db import get_async_session - + streaming_service = state.streaming_service writer( @@ -299,10 +411,10 @@ async def write_answer_outline(state: State, config: RunnableConfig, writer: Str # Create the human message content human_message_content = f""" Now Please create an answer outline for the following query: - + User Query: {reformulated_query} Number of Sections: {num_sections} - + Remember to format your response as valid JSON exactly matching this structure: {{ "answer_outline": [ @@ -316,7 +428,7 @@ async def write_answer_outline(state: State, config: RunnableConfig, writer: Str }} ] }} - + Your output MUST be valid JSON in exactly this format. Do not include any other text or explanation. 
""" @@ -331,9 +443,9 @@ async def write_answer_outline(state: State, config: RunnableConfig, writer: Str # Create messages for the LLM messages = [ SystemMessage(content=get_answer_outline_system_prompt()), - HumanMessage(content=human_message_content) + HumanMessage(content=human_message_content), ] - + # Call the LLM directly without using structured output writer( { @@ -344,26 +456,28 @@ async def write_answer_outline(state: State, config: RunnableConfig, writer: Str ) response = await llm.ainvoke(messages) - + # Parse the JSON response manually try: # Extract JSON content from the response content = response.content - + # Find the JSON in the content (handle case where LLM might add additional text) - json_start = content.find('{') - json_end = content.rfind('}') + 1 + json_start = content.find("{") + json_end = content.rfind("}") + 1 if json_start >= 0 and json_end > json_start: json_str = content[json_start:json_end] - + # Parse the JSON string parsed_data = json.loads(json_str) - + # Convert to Pydantic model answer_outline = AnswerOutline(**parsed_data) - - total_questions = sum(len(section.questions) for section in answer_outline.answer_outline) - + + total_questions = sum( + len(section.questions) for section in answer_outline.answer_outline + ) + writer( { "yield_value": streaming_service.format_terminal_info_delta( @@ -407,16 +521,16 @@ async def fetch_relevant_documents( top_k: int = 10, connector_service: ConnectorService = None, search_mode: SearchMode = SearchMode.CHUNKS, - user_selected_sources: List[Dict[str, Any]] = None + user_selected_sources: List[Dict[str, Any]] = None, ) -> List[Dict[str, Any]]: """ Fetch relevant documents for research questions using the provided connectors. - + This function searches across multiple data sources for information related to the research questions. 
It provides user-friendly feedback during the search process by displaying connector names (like "Web Search" instead of "TAVILY_API") and adding relevant emojis to indicate the type of source being searched. - + Args: research_questions: List of research questions to find documents for user_id: The user ID @@ -427,19 +541,21 @@ async def fetch_relevant_documents( state: The current state containing the streaming service top_k: Number of top results to retrieve per connector per question connector_service: An initialized connector service to use for searching - + Returns: List of relevant documents """ # Initialize services # connector_service = ConnectorService(db_session) - + # Only use streaming if both writer and state are provided streaming_service = state.streaming_service if state is not None else None # Stream initial status update if streaming_service and writer: - connector_names = [get_connector_friendly_name(connector) for connector in connectors_to_search] + connector_names = [ + get_connector_friendly_name(connector) for connector in connectors_to_search + ] connector_names_str = ", ".join(connector_names) writer( { @@ -451,7 +567,7 @@ async def fetch_relevant_documents( all_raw_documents = [] # Store all raw documents all_sources = [] # Store all sources - + for i, user_query in enumerate(research_questions): # Stream question being researched if streaming_service and writer: @@ -465,7 +581,7 @@ async def fetch_relevant_documents( # Use original research question as the query reformulated_query = user_query - + # Process each selected connector for connector in connectors_to_search: # Stream connector being searched @@ -482,19 +598,22 @@ async def fetch_relevant_documents( try: if connector == "YOUTUBE_VIDEO": - source_object, youtube_chunks = await connector_service.search_youtube( + ( + source_object, + youtube_chunks, + ) = await connector_service.search_youtube( user_query=reformulated_query, user_id=user_id, search_space_id=search_space_id, 
top_k=top_k, - search_mode=search_mode + search_mode=search_mode, ) - + # Add to sources and raw documents if source_object: all_sources.append(source_object) all_raw_documents.extend(youtube_chunks) - + # Stream found document count if streaming_service and writer: writer( @@ -506,19 +625,22 @@ async def fetch_relevant_documents( ) elif connector == "EXTENSION": - source_object, extension_chunks = await connector_service.search_extension( + ( + source_object, + extension_chunks, + ) = await connector_service.search_extension( user_query=reformulated_query, user_id=user_id, search_space_id=search_space_id, top_k=top_k, - search_mode=search_mode + search_mode=search_mode, ) - + # Add to sources and raw documents if source_object: all_sources.append(source_object) all_raw_documents.extend(extension_chunks) - + # Stream found document count if streaming_service and writer: writer( @@ -530,19 +652,22 @@ async def fetch_relevant_documents( ) elif connector == "CRAWLED_URL": - source_object, crawled_urls_chunks = await connector_service.search_crawled_urls( + ( + source_object, + crawled_urls_chunks, + ) = await connector_service.search_crawled_urls( user_query=reformulated_query, user_id=user_id, search_space_id=search_space_id, top_k=top_k, - search_mode=search_mode + search_mode=search_mode, ) - + # Add to sources and raw documents if source_object: all_sources.append(source_object) all_raw_documents.extend(crawled_urls_chunks) - + # Stream found document count if streaming_service and writer: writer( @@ -559,14 +684,14 @@ async def fetch_relevant_documents( user_id=user_id, search_space_id=search_space_id, top_k=top_k, - search_mode=search_mode + search_mode=search_mode, ) - + # Add to sources and raw documents if source_object: all_sources.append(source_object) all_raw_documents.extend(files_chunks) - + # Stream found document count if streaming_service and writer: writer( @@ -583,14 +708,14 @@ async def fetch_relevant_documents( user_id=user_id, 
search_space_id=search_space_id, top_k=top_k, - search_mode=search_mode + search_mode=search_mode, ) - + # Add to sources and raw documents if source_object: all_sources.append(source_object) all_raw_documents.extend(slack_chunks) - + # Stream found document count if streaming_service and writer: writer( @@ -602,19 +727,22 @@ async def fetch_relevant_documents( ) elif connector == "NOTION_CONNECTOR": - source_object, notion_chunks = await connector_service.search_notion( + ( + source_object, + notion_chunks, + ) = await connector_service.search_notion( user_query=reformulated_query, user_id=user_id, search_space_id=search_space_id, top_k=top_k, - search_mode=search_mode + search_mode=search_mode, ) - + # Add to sources and raw documents if source_object: all_sources.append(source_object) all_raw_documents.extend(notion_chunks) - + # Stream found document count if streaming_service and writer: writer( @@ -626,19 +754,22 @@ async def fetch_relevant_documents( ) elif connector == "GITHUB_CONNECTOR": - source_object, github_chunks = await connector_service.search_github( + ( + source_object, + github_chunks, + ) = await connector_service.search_github( user_query=reformulated_query, user_id=user_id, search_space_id=search_space_id, top_k=top_k, - search_mode=search_mode + search_mode=search_mode, ) - + # Add to sources and raw documents if source_object: all_sources.append(source_object) all_raw_documents.extend(github_chunks) - + # Stream found document count if streaming_service and writer: writer( @@ -650,19 +781,22 @@ async def fetch_relevant_documents( ) elif connector == "LINEAR_CONNECTOR": - source_object, linear_chunks = await connector_service.search_linear( + ( + source_object, + linear_chunks, + ) = await connector_service.search_linear( user_query=reformulated_query, user_id=user_id, search_space_id=search_space_id, top_k=top_k, - search_mode=search_mode + search_mode=search_mode, ) - + # Add to sources and raw documents if source_object: 
all_sources.append(source_object) all_raw_documents.extend(linear_chunks) - + # Stream found document count if streaming_service and writer: writer( @@ -674,17 +808,18 @@ async def fetch_relevant_documents( ) elif connector == "TAVILY_API": - source_object, tavily_chunks = await connector_service.search_tavily( - user_query=reformulated_query, - user_id=user_id, - top_k=top_k + ( + source_object, + tavily_chunks, + ) = await connector_service.search_tavily( + user_query=reformulated_query, user_id=user_id, top_k=top_k ) - + # Add to sources and raw documents if source_object: all_sources.append(source_object) all_raw_documents.extend(tavily_chunks) - + # Stream found document count if streaming_service and writer: writer( @@ -701,14 +836,14 @@ async def fetch_relevant_documents( source_object, linkup_chunks = await connector_service.search_linkup( user_query=reformulated_query, user_id=user_id, - mode=linkup_mode - ) - + mode=linkup_mode, + ) + # Add to sources and raw documents if source_object: all_sources.append(source_object) - all_raw_documents.extend(linkup_chunks) - + all_raw_documents.extend(linkup_chunks) + # Stream found document count if streaming_service and writer: writer( @@ -720,12 +855,15 @@ async def fetch_relevant_documents( ) elif connector == "DISCORD_CONNECTOR": - source_object, discord_chunks = await connector_service.search_discord( + ( + source_object, + discord_chunks, + ) = await connector_service.search_discord( user_query=reformulated_query, user_id=user_id, search_space_id=search_space_id, top_k=top_k, - search_mode=search_mode + search_mode=search_mode, ) # Add to sources and raw documents if source_object: @@ -741,10 +879,34 @@ async def fetch_relevant_documents( } ) + elif connector == "JIRA_CONNECTOR": + source_object, jira_chunks = await connector_service.search_jira( + user_query=reformulated_query, + user_id=user_id, + search_space_id=search_space_id, + top_k=top_k, + search_mode=search_mode, + ) + + # Add to sources and raw 
documents + if source_object: + all_sources.append(source_object) + all_raw_documents.extend(jira_chunks) + + # Stream found document count + if streaming_service and writer: + writer( + { + "yield_value": streaming_service.format_terminal_info_delta( + f"🎫 Found {len(jira_chunks)} Jira issues related to your query" + ) + } + ) + except Exception as e: error_message = f"Error searching connector {connector}: {str(e)}" print(error_message) - + # Stream error message if streaming_service and writer: friendly_name = get_connector_friendly_name(connector) @@ -758,17 +920,17 @@ async def fetch_relevant_documents( # Continue with other connectors on error continue - + # Deduplicate source objects by ID before streaming deduplicated_sources = [] seen_source_keys = set() - + # First add user-selected sources (if any) if user_selected_sources: for source_obj in user_selected_sources: - source_id = source_obj.get('id') - source_type = source_obj.get('type') - + source_id = source_obj.get("id") + source_type = source_obj.get("type") + if source_id and source_type: source_key = f"{source_type}_{source_id}" if source_key not in seen_source_keys: @@ -776,14 +938,14 @@ async def fetch_relevant_documents( deduplicated_sources.append(source_obj) else: deduplicated_sources.append(source_obj) - + # Then add connector sources for source_obj in all_sources: # Use combination of source ID and type as a unique identifier # This ensures we don't accidentally deduplicate sources from different connectors - source_id = source_obj.get('id') - source_type = source_obj.get('type') - + source_id = source_obj.get("id") + source_type = source_obj.get("type") + if source_id and source_type: source_key = f"{source_type}_{source_id}" current_sources_count = len(source_obj.get('sources', [])) @@ -831,28 +993,36 @@ async def fetch_relevant_documents( # After all sources are collected and deduplicated, stream them if streaming_service and writer: - writer({"yield_value": 
streaming_service.format_sources_delta(deduplicated_sources)}) + writer( + { + "yield_value": streaming_service.format_sources_delta( + deduplicated_sources + ) + } + ) # Deduplicate raw documents based on chunk_id or content seen_chunk_ids = set() seen_content_hashes = set() deduplicated_docs = [] - + for doc in all_raw_documents: chunk_id = doc.get("chunk_id") content = doc.get("content", "") content_hash = hash(content) - + # Skip if we've seen this chunk_id or content before - if (chunk_id and chunk_id in seen_chunk_ids) or content_hash in seen_content_hashes: + if ( + chunk_id and chunk_id in seen_chunk_ids + ) or content_hash in seen_content_hashes: continue - + # Add to our tracking sets and keep this document if chunk_id: seen_chunk_ids.add(chunk_id) seen_content_hashes.add(content_hash) deduplicated_docs.append(doc) - + # Stream info about deduplicated documents if streaming_service and writer: writer( @@ -867,14 +1037,16 @@ async def fetch_relevant_documents( return deduplicated_docs -async def process_sections(state: State, config: RunnableConfig, writer: StreamWriter) -> Dict[str, Any]: +async def process_sections( + state: State, config: RunnableConfig, writer: StreamWriter +) -> Dict[str, Any]: """ Process all sections in parallel and combine the results. - - This node takes the answer outline from the previous step, fetches relevant documents - for all questions across all sections once, and then processes each section in parallel + + This node takes the answer outline from the previous step, fetches relevant documents + for all questions across all sections once, and then processes each section in parallel using the sub_section_writer graph with the shared document pool. - + Returns: Dict containing the final written report in the "final_written_report" key. 
""" @@ -882,7 +1054,7 @@ async def process_sections(state: State, config: RunnableConfig, writer: StreamW configuration = Configuration.from_runnable_config(config) answer_outline = state.answer_outline streaming_service = state.streaming_service - + # Initialize a dictionary to track content for all sections # This is used to maintain section content while streaming multiple sections section_contents = {} @@ -896,19 +1068,19 @@ async def process_sections(state: State, config: RunnableConfig, writer: StreamW ) print(f"Processing sections from outline: {answer_outline is not None}") - + if not answer_outline: error_message = "No answer outline was provided. Cannot generate report." writer({"yield_value": streaming_service.format_error(error_message)}) return { "final_written_report": "No answer outline was provided. Cannot generate final report." } - + # Collect all questions from all sections all_questions = [] for section in answer_outline.answer_outline: all_questions.extend(section.questions) - + print(f"Collected {len(all_questions)} questions from all sections") writer( { @@ -935,11 +1107,11 @@ async def process_sections(state: State, config: RunnableConfig, writer: StreamW TOP_K = 30 else: TOP_K = 10 - + relevant_documents = [] user_selected_documents = [] user_selected_sources = [] - + try: # First, fetch user-selected documents if any if configuration.document_ids_to_add_in_context: @@ -951,12 +1123,15 @@ async def process_sections(state: State, config: RunnableConfig, writer: StreamW } ) - user_selected_sources, user_selected_documents = await fetch_documents_by_ids( + ( + user_selected_sources, + user_selected_documents, + ) = await fetch_documents_by_ids( document_ids=configuration.document_ids_to_add_in_context, user_id=configuration.user_id, - db_session=state.db_session + db_session=state.db_session, ) - + if user_selected_documents: writer( { @@ -967,9 +1142,11 @@ async def process_sections(state: State, config: RunnableConfig, writer: StreamW ) # 
Create connector service using state db_session - connector_service = ConnectorService(state.db_session, user_id=configuration.user_id) + connector_service = ConnectorService( + state.db_session, user_id=configuration.user_id + ) await connector_service.initialize_counter() - + relevant_documents = await fetch_relevant_documents( research_questions=all_questions, user_id=configuration.user_id, @@ -981,7 +1158,7 @@ async def process_sections(state: State, config: RunnableConfig, writer: StreamW top_k=TOP_K, connector_service=connector_service, search_mode=configuration.search_mode, - user_selected_sources=user_selected_sources + user_selected_sources=user_selected_sources, ) except Exception as e: error_message = f"Error fetching relevant documents: {str(e)}" @@ -990,12 +1167,14 @@ async def process_sections(state: State, config: RunnableConfig, writer: StreamW # Log the error and continue with an empty list of documents # This allows the process to continue, but the report might lack information relevant_documents = [] - + # Combine user-selected documents with connector-fetched documents all_documents = user_selected_documents + relevant_documents - + print(f"Fetched {len(relevant_documents)} relevant documents for all sections") - print(f"Added {len(user_selected_documents)} user-selected documents for all sections") + print( + f"Added {len(user_selected_documents)} user-selected documents for all sections" + ) print(f"Total documents for sections: {len(all_documents)}") writer( @@ -1023,14 +1202,14 @@ async def process_sections(state: State, config: RunnableConfig, writer: StreamW sub_section_type = SubSectionType.END else: sub_section_type = SubSectionType.MIDDLE - + # Initialize the section_contents entry for this section section_contents[i] = { "title": section.section_title, "content": "", - "index": i + "index": i, } - + section_tasks.append( process_section_with_documents( section_id=i, @@ -1043,10 +1222,10 @@ async def process_sections(state: State, 
config: RunnableConfig, writer: StreamW state=state, writer=writer, sub_section_type=sub_section_type, - section_contents=section_contents + section_contents=section_contents, ) ) - + # Run all section processing tasks in parallel print(f"Running {len(section_tasks)} section processing tasks in parallel") writer( @@ -1058,7 +1237,7 @@ async def process_sections(state: State, config: RunnableConfig, writer: StreamW ) section_results = await asyncio.gather(*section_tasks, return_exceptions=True) - + # Handle any exceptions in the results writer( { @@ -1078,22 +1257,25 @@ async def process_sections(state: State, config: RunnableConfig, writer: StreamW processed_results.append(error_message) else: processed_results.append(result) - + # Combine the results into a final report with section titles final_report = [] - for i, (section, content) in enumerate(zip(answer_outline.answer_outline, processed_results)): + for i, (section, content) in enumerate( + zip(answer_outline.answer_outline, processed_results) + ): # Skip adding the section header since the content already contains the title final_report.append(content) - final_report.append("\n") - + final_report.append("\n") + # Stream each section with its title writer( { - "yield_value": state.streaming_service.format_text_chunk(f"# {section.section_title}\n\n{content}") + "yield_value": state.streaming_service.format_text_chunk( + f"# {section.section_title}\n\n{content}" + ) } ) - # Join all sections with newlines final_written_report = "\n".join(final_report) print(f"Generated final report with {len(final_report)} parts") @@ -1110,26 +1292,26 @@ async def process_sections(state: State, config: RunnableConfig, writer: StreamW # Since all sections used the same document pool, we can use it directly return { "final_written_report": final_written_report, - "reranked_documents": all_documents + "reranked_documents": all_documents, } async def process_section_with_documents( section_id: int, - section_title: str, + 
section_title: str, section_questions: List[str], - user_id: str, - search_space_id: int, + user_id: str, + search_space_id: int, relevant_documents: List[Dict[str, Any]], user_query: str, state: State = None, writer: StreamWriter = None, sub_section_type: SubSectionType = SubSectionType.MIDDLE, - section_contents: Dict[int, Dict[str, Any]] = None + section_contents: Dict[int, Dict[str, Any]] = None, ) -> str: """ Process a single section using pre-fetched documents. - + Args: section_id: The ID of the section section_title: The title of the section @@ -1141,14 +1323,14 @@ async def process_section_with_documents( writer: StreamWriter for sending progress updates sub_section_type: The type of section (start, middle, end) section_contents: Dictionary to track content across multiple sections - + Returns: The written section content """ try: # Use the provided documents documents_to_use = relevant_documents - + # Send status update via streaming if available if state and state.streaming_service and writer: writer( @@ -1175,7 +1357,7 @@ async def process_section_with_documents( {"content": f"No specific information was found for: {question}"} for question in section_questions ] - + # Call the sub_section_writer graph with the appropriate config config = { "configurable": { @@ -1188,13 +1370,10 @@ async def process_section_with_documents( "search_space_id": search_space_id, } } - + # Create the initial state with db_session and chat_history - sub_state = { - "db_session": state.db_session, - "chat_history": state.chat_history - } - + sub_state = {"db_session": state.db_session, "chat_history": state.chat_history} + # Invoke the sub-section writer graph with streaming print(f"Invoking sub_section_writer for: {section_title}") if state and state.streaming_service and writer: @@ -1208,17 +1387,19 @@ async def process_section_with_documents( # Variables to track streaming state complete_content = "" # Tracks the complete content received so far - - async for chunk_type, 
chunk in sub_section_writer_graph.astream(sub_state, config, stream_mode=["values"]): + + async for chunk_type, chunk in sub_section_writer_graph.astream( + sub_state, config, stream_mode=["values"] + ): if "final_answer" in chunk: new_content = chunk["final_answer"] if new_content and new_content != complete_content: # Extract only the new content (delta) - delta = new_content[len(complete_content):] - + delta = new_content[len(complete_content) :] + # Update what we've processed so far complete_content = new_content - + # Only stream if there's actual new content if delta and state and state.streaming_service and writer: # Update terminal with real-time progress indicator @@ -1232,26 +1413,29 @@ async def process_section_with_documents( # Update section_contents with just the new delta section_contents[section_id]["content"] += delta - + # Build UI-friendly content for all sections complete_answer = [] for i in range(len(section_contents)): if i in section_contents and section_contents[i]["content"]: # Add section header - complete_answer.append(f"# {section_contents[i]['title']}") + complete_answer.append( + f"# {section_contents[i]['title']}" + ) complete_answer.append("") # Empty line after title - + # Add section content - content_lines = section_contents[i]["content"].split("\n") + content_lines = section_contents[i]["content"].split( + "\n" + ) complete_answer.extend(content_lines) complete_answer.append("") # Empty line after content - # Set default if no content was received if not complete_content: complete_content = "No content was generated for this section." 
section_contents[section_id]["content"] = complete_content - + # Final terminal update if state and state.streaming_service and writer: writer( @@ -1265,7 +1449,7 @@ async def process_section_with_documents( return complete_content except Exception as e: print(f"Error processing section '{section_title}': {str(e)}") - + # Send error update via streaming if available if state and state.streaming_service and writer: writer( @@ -1279,37 +1463,46 @@ async def process_section_with_documents( return f"Error processing section: {section_title}. Details: {str(e)}" -async def reformulate_user_query(state: State, config: RunnableConfig, writer: StreamWriter) -> Dict[str, Any]: +async def reformulate_user_query( + state: State, config: RunnableConfig, writer: StreamWriter +) -> Dict[str, Any]: """ Reforms the user query based on the chat history. """ - + configuration = Configuration.from_runnable_config(config) user_query = configuration.user_query - chat_history_str = await QueryService.langchain_chat_history_to_str(state.chat_history) - if len(state.chat_history) == 0: + chat_history_str = await QueryService.langchain_chat_history_to_str( + state.chat_history + ) + if len(state.chat_history) == 0: reformulated_query = user_query else: - reformulated_query = await QueryService.reformulate_query_with_chat_history(user_query=user_query, session=state.db_session, user_id=configuration.user_id, chat_history_str=chat_history_str) - - return { - "reformulated_query": reformulated_query - } + reformulated_query = await QueryService.reformulate_query_with_chat_history( + user_query=user_query, + session=state.db_session, + user_id=configuration.user_id, + chat_history_str=chat_history_str, + ) + + return {"reformulated_query": reformulated_query} -async def handle_qna_workflow(state: State, config: RunnableConfig, writer: StreamWriter) -> Dict[str, Any]: +async def handle_qna_workflow( + state: State, config: RunnableConfig, writer: StreamWriter +) -> Dict[str, Any]: """ Handle the 
QNA research workflow. - + This node fetches relevant documents for the user query and then uses the QNA agent to generate a comprehensive answer with proper citations. - + Returns: Dict containing the final answer in the "final_written_report" key for consistency. """ streaming_service = state.streaming_service configuration = Configuration.from_runnable_config(config) - + reformulated_query = state.reformulated_query user_query = configuration.user_query @@ -1340,11 +1533,11 @@ async def handle_qna_workflow(state: State, config: RunnableConfig, writer: Stre # Use a reasonable top_k for QNA - not too many documents to avoid overwhelming the LLM TOP_K = 15 - + relevant_documents = [] user_selected_documents = [] user_selected_sources = [] - + try: # First, fetch user-selected documents if any if configuration.document_ids_to_add_in_context: @@ -1356,12 +1549,15 @@ async def handle_qna_workflow(state: State, config: RunnableConfig, writer: Stre } ) - user_selected_sources, user_selected_documents = await fetch_documents_by_ids( + ( + user_selected_sources, + user_selected_documents, + ) = await fetch_documents_by_ids( document_ids=configuration.document_ids_to_add_in_context, user_id=configuration.user_id, - db_session=state.db_session + db_session=state.db_session, ) - + if user_selected_documents: writer( { @@ -1372,12 +1568,14 @@ async def handle_qna_workflow(state: State, config: RunnableConfig, writer: Stre ) # Create connector service using state db_session - connector_service = ConnectorService(state.db_session, user_id=configuration.user_id) + connector_service = ConnectorService( + state.db_session, user_id=configuration.user_id + ) await connector_service.initialize_counter() - + # Use the reformulated query as a single research question research_questions = [reformulated_query, user_query] - + relevant_documents = await fetch_relevant_documents( research_questions=research_questions, user_id=configuration.user_id, @@ -1389,7 +1587,7 @@ async def 
handle_qna_workflow(state: State, config: RunnableConfig, writer: Stre top_k=TOP_K, connector_service=connector_service, search_mode=configuration.search_mode, - user_selected_sources=user_selected_sources + user_selected_sources=user_selected_sources, ) except Exception as e: error_message = f"Error fetching relevant documents for QNA: {str(e)}" @@ -1397,10 +1595,10 @@ async def handle_qna_workflow(state: State, config: RunnableConfig, writer: Stre writer({"yield_value": streaming_service.format_error(error_message)}) # Continue with empty documents - the QNA agent will handle this gracefully relevant_documents = [] - + # Combine user-selected documents with connector-fetched documents all_documents = user_selected_documents + relevant_documents - + print(f"Fetched {len(relevant_documents)} relevant documents for QNA") print(f"Added {len(user_selected_documents)} user-selected documents for QNA") print(f"Total documents for QNA: {len(all_documents)}") @@ -1420,16 +1618,13 @@ async def handle_qna_workflow(state: State, config: RunnableConfig, writer: Stre "reformulated_query": reformulated_query, "relevant_documents": all_documents, # Use combined documents "user_id": configuration.user_id, - "search_space_id": configuration.search_space_id + "search_space_id": configuration.search_space_id, } } - + # Create the state for the QNA agent (it has a different state structure) - qna_state = { - "db_session": state.db_session, - "chat_history": state.chat_history - } - + qna_state = {"db_session": state.db_session, "chat_history": state.chat_history} + try: writer( { @@ -1442,16 +1637,18 @@ async def handle_qna_workflow(state: State, config: RunnableConfig, writer: Stre # Track streaming content for real-time updates complete_content = "" captured_reranked_documents = [] - + # Call the QNA agent with streaming - async for _chunk_type, chunk in qna_agent_graph.astream(qna_state, qna_config, stream_mode=["values"]): + async for _chunk_type, chunk in 
qna_agent_graph.astream( + qna_state, qna_config, stream_mode=["values"] + ): if "final_answer" in chunk: new_content = chunk["final_answer"] if new_content and new_content != complete_content: # Extract only the new content (delta) - delta = new_content[len(complete_content):] + delta = new_content[len(complete_content) :] complete_content = new_content - + # Stream the real-time answer if there's new content if delta: # Update terminal with progress @@ -1471,7 +1668,7 @@ async def handle_qna_workflow(state: State, config: RunnableConfig, writer: Stre # Capture reranked documents from QNA agent for further question generation if "reranked_documents" in chunk: captured_reranked_documents = chunk["reranked_documents"] - + # Set default if no content was received if not complete_content: complete_content = "I couldn't find relevant information in your knowledge base to answer this question." @@ -1487,9 +1684,9 @@ async def handle_qna_workflow(state: State, config: RunnableConfig, writer: Stre # Return the final answer and captured reranked documents for further question generation return { "final_written_report": complete_content, - "reranked_documents": captured_reranked_documents + "reranked_documents": captured_reranked_documents, } - + except Exception as e: error_message = f"Error generating QNA answer: {str(e)}" print(error_message) @@ -1498,27 +1695,29 @@ async def handle_qna_workflow(state: State, config: RunnableConfig, writer: Stre return {"final_written_report": f"Error generating answer: {str(e)}"} -async def generate_further_questions(state: State, config: RunnableConfig, writer: StreamWriter) -> Dict[str, Any]: +async def generate_further_questions( + state: State, config: RunnableConfig, writer: StreamWriter +) -> Dict[str, Any]: """ Generate contextually relevant follow-up questions based on chat history and available documents. 
- + This node takes the chat history and reranked documents from sub-agents (qna_agent or sub_section_writer) and uses an LLM to generate follow-up questions that would naturally extend the conversation and provide additional value to the user. - + Returns: Dict containing the further questions in the "further_questions" key for state update. """ from app.services.llm_service import get_user_fast_llm - + # Get configuration and state data configuration = Configuration.from_runnable_config(config) chat_history = state.chat_history user_id = configuration.user_id streaming_service = state.streaming_service - + # Get reranked documents from the state (will be populated by sub-agents) - reranked_documents = getattr(state, 'reranked_documents', None) or [] + reranked_documents = getattr(state, "reranked_documents", None) or [] writer( { @@ -1538,11 +1737,11 @@ async def generate_further_questions(state: State, config: RunnableConfig, write # Stream empty further questions to UI writer({"yield_value": streaming_service.format_further_questions_delta([])}) return {"further_questions": []} - + # Format chat history for the prompt chat_history_xml = "\n" for message in chat_history: - if hasattr(message, 'type'): + if hasattr(message, "type"): if message.type == "human": chat_history_xml += f"{message.content}\n" elif message.type == "ai": @@ -1551,7 +1750,7 @@ async def generate_further_questions(state: State, config: RunnableConfig, write # Handle other message types if needed chat_history_xml += f"{str(message)}\n" chat_history_xml += "" - + # Format available documents for the prompt documents_xml = "\n" for i, doc in enumerate(reranked_documents): @@ -1559,24 +1758,24 @@ async def generate_further_questions(state: State, config: RunnableConfig, write source_id = document_info.get("id", f"doc_{i}") source_type = document_info.get("document_type", "UNKNOWN") content = doc.get("content", "") - - documents_xml += f"\n" - documents_xml += f"\n" + + documents_xml += "\n" + 
documents_xml += "\n" documents_xml += f"{source_id}\n" documents_xml += f"{source_type}\n" - documents_xml += f"\n" + documents_xml += "\n" documents_xml += f"\n{content}\n" - documents_xml += f"\n" + documents_xml += "\n" documents_xml += "" - + # Create the human message content human_message_content = f""" {chat_history_xml} - + {documents_xml} - + Based on the chat history and available documents above, generate 3-5 contextually relevant follow-up questions that would naturally extend the conversation and provide additional value to the user. Make sure the questions can be reasonably answered using the available documents or knowledge base. - + Your response MUST be valid JSON in exactly this format: {{ "further_questions": [ @@ -1590,7 +1789,7 @@ async def generate_further_questions(state: State, config: RunnableConfig, write }} ] }} - + Do not include any other text or explanation. Only return the JSON. """ @@ -1605,25 +1804,25 @@ async def generate_further_questions(state: State, config: RunnableConfig, write # Create messages for the LLM messages = [ SystemMessage(content=get_further_questions_system_prompt()), - HumanMessage(content=human_message_content) + HumanMessage(content=human_message_content), ] - + try: # Call the LLM response = await llm.ainvoke(messages) - + # Parse the JSON response content = response.content - + # Find the JSON in the content - json_start = content.find('{') - json_end = content.rfind('}') + 1 + json_start = content.find("{") + json_end = content.rfind("}") + 1 if json_start >= 0 and json_end > json_start: json_str = content[json_start:json_end] - + # Parse the JSON string parsed_data = json.loads(json_str) - + # Extract the further_questions array further_questions = parsed_data.get("further_questions", []) @@ -1645,7 +1844,7 @@ async def generate_further_questions(state: State, config: RunnableConfig, write ) print(f"Successfully generated {len(further_questions)} further questions") - + return {"further_questions": 
further_questions} else: # If JSON structure not found, return empty list @@ -1666,7 +1865,7 @@ async def generate_further_questions(state: State, config: RunnableConfig, write {"yield_value": streaming_service.format_further_questions_delta([])} ) return {"further_questions": []} - + except (json.JSONDecodeError, ValueError) as e: # Log the error and return empty list error_message = f"Error parsing further questions response: {str(e)}" @@ -1678,7 +1877,7 @@ async def generate_further_questions(state: State, config: RunnableConfig, write # Stream empty further questions to UI writer({"yield_value": streaming_service.format_further_questions_delta([])}) return {"further_questions": []} - + except Exception as e: # Handle any other errors error_message = f"Error generating further questions: {str(e)}" diff --git a/surfsense_backend/app/agents/researcher/qna_agent/prompts.py b/surfsense_backend/app/agents/researcher/qna_agent/prompts.py index eed07224b..3f4d97558 100644 --- a/surfsense_backend/app/agents/researcher/qna_agent/prompts.py +++ b/surfsense_backend/app/agents/researcher/qna_agent/prompts.py @@ -15,7 +15,8 @@ You are SurfSense, an advanced AI research assistant that provides detailed, wel - YOUTUBE_VIDEO: "YouTube video transcripts and metadata" (personally saved videos) - GITHUB_CONNECTOR: "GitHub repository content and issues" (personal repositories and interactions) - LINEAR_CONNECTOR: "Linear project issues and discussions" (personal project management) -- DISCORD_CONNECTOR: "Discord server messages and channels" (personal community interactions) +- JIRA_CONNECTOR: "Jira project issues, tickets, and comments" (personal project tracking) +- DISCORD_CONNECTOR: "Discord server conversations and shared content" (personal community communications) - TAVILY_API: "Tavily search API results" (personalized search results) - LINKUP_API: "Linkup search API results" (personalized search results) @@ -71,7 +72,7 @@ You are SurfSense, an advanced AI research assistant 
that provides detailed, wel Python's asyncio library provides tools for writing concurrent code using the async/await syntax. It's particularly useful for I/O-bound and high-level structured network code. - + 12 diff --git a/surfsense_backend/app/agents/researcher/utils.py b/surfsense_backend/app/agents/researcher/utils.py index c4991cc9f..647e00003 100644 --- a/surfsense_backend/app/agents/researcher/utils.py +++ b/surfsense_backend/app/agents/researcher/utils.py @@ -33,6 +33,8 @@ def get_connector_emoji(connector_name: str) -> str: "NOTION_CONNECTOR": "📘", "GITHUB_CONNECTOR": "🐙", "LINEAR_CONNECTOR": "📊", + "JIRA_CONNECTOR": "🎫", + "DISCORD_CONNECTOR": "🗨️", "TAVILY_API": "🔍", "LINKUP_API": "🔗" } @@ -50,6 +52,8 @@ def get_connector_friendly_name(connector_name: str) -> str: "NOTION_CONNECTOR": "Notion", "GITHUB_CONNECTOR": "GitHub", "LINEAR_CONNECTOR": "Linear", + "JIRA_CONNECTOR": "Jira", + "DISCORD_CONNECTOR": "Discord", "TAVILY_API": "Tavily Search", "LINKUP_API": "Linkup Search" } diff --git a/surfsense_backend/app/connectors/jira_connector.py b/surfsense_backend/app/connectors/jira_connector.py new file mode 100644 index 000000000..b30640be5 --- /dev/null +++ b/surfsense_backend/app/connectors/jira_connector.py @@ -0,0 +1,487 @@ +""" +Jira Connector Module + +A module for retrieving data from Jira. +Allows fetching issue lists and their comments, projects and more. +""" + +import base64 +from datetime import datetime +from typing import Any, Dict, List, Optional + +import requests + + +class JiraConnector: + """Class for retrieving data from Jira.""" + + def __init__( + self, + base_url: Optional[str] = None, + email: Optional[str] = None, + api_token: Optional[str] = None, + ): + """ + Initialize the JiraConnector class. 
+ + Args: + base_url: Jira instance base URL (e.g., 'https://yourcompany.atlassian.net') (optional) + email: Jira account email address (optional) + api_token: Jira API token (optional) + """ + self.base_url = base_url.rstrip("/") if base_url else None + self.email = email + self.api_token = api_token + self.api_version = "3" # Jira Cloud API version + + def set_credentials(self, base_url: str, email: str, api_token: str) -> None: + """ + Set the Jira credentials. + + Args: + base_url: Jira instance base URL + email: Jira account email address + api_token: Jira API token + """ + self.base_url = base_url.rstrip("/") + self.email = email + self.api_token = api_token + + def set_email(self, email: str) -> None: + """ + Set the Jira account email. + + Args: + email: Jira account email address + """ + self.email = email + + def set_api_token(self, api_token: str) -> None: + """ + Set the Jira API token. + + Args: + api_token: Jira API token + """ + self.api_token = api_token + + def get_headers(self) -> Dict[str, str]: + """ + Get headers for Jira API requests using Basic Authentication. + + Returns: + Dictionary of headers + + Raises: + ValueError: If email, api_token, or base_url have not been set + """ + if not all([self.base_url, self.email, self.api_token]): + raise ValueError( + "Jira credentials not initialized. Call set_credentials() first." + ) + + # Create Basic Auth header using email:api_token + auth_str = f"{self.email}:{self.api_token}" + auth_bytes = auth_str.encode("utf-8") + auth_header = "Basic " + base64.b64encode(auth_bytes).decode("ascii") + + return { + "Content-Type": "application/json", + "Authorization": auth_header, + "Accept": "application/json", + } + + def make_api_request( + self, endpoint: str, params: Optional[Dict[str, Any]] = None + ) -> Dict[str, Any]: + """ + Make a request to the Jira API. 
+ + Args: + endpoint: API endpoint (without base URL) + params: Query parameters for the request (optional) + + Returns: + Response data from the API + + Raises: + ValueError: If email, api_token, or base_url have not been set + Exception: If the API request fails + """ + if not all([self.base_url, self.email, self.api_token]): + raise ValueError( + "Jira credentials not initialized. Call set_credentials() first." + ) + + url = f"{self.base_url}/rest/api/{self.api_version}/{endpoint}" + headers = self.get_headers() + + response = requests.get(url, headers=headers, params=params, timeout=500) + + if response.status_code == 200: + return response.json() + else: + raise Exception( + f"API request failed with status code {response.status_code}: {response.text}" + ) + + def get_all_projects(self) -> dict[str, Any]: + """ + Fetch all projects from Jira. + + Returns: + List of project objects + + Raises: + ValueError: If credentials have not been set + Exception: If the API request fails + """ + return self.make_api_request("project/search") + + def get_all_issues(self, project_key: Optional[str] = None) -> List[Dict[str, Any]]: + """ + Fetch all issues from Jira. 
+ + Args: + project_key: Optional project key to filter issues (e.g., 'PROJ') + + Returns: + List of issue objects + + Raises: + ValueError: If credentials have not been set + Exception: If the API request fails + """ + jql = "ORDER BY created DESC" + if project_key: + jql = f'project = "{project_key}" ' + jql + + fields = [ + "summary", + "description", + "status", + "assignee", + "reporter", + "created", + "updated", + "priority", + "issuetype", + "project", + ] + + params = { + "jql": jql, + "fields": ",".join(fields), + "maxResults": 100, + "startAt": 0, + } + + all_issues = [] + start_at = 0 + + while True: + params["startAt"] = start_at + result = self.make_api_request("search", params) + + if not isinstance(result, dict) or "issues" not in result: + raise Exception("Invalid response from Jira API") + + issues = result["issues"] + all_issues.extend(issues) + + print(f"Fetched {len(issues)} issues (startAt={start_at})") + + total = result.get("total", 0) + if start_at + len(issues) >= total: + break + + start_at += len(issues) + + return all_issues + + def get_issues_by_date_range( + self, + start_date: str, + end_date: str, + include_comments: bool = True, + project_key: Optional[str] = None, + ) -> tuple[List[Dict[str, Any]], Optional[str]]: + """ + Fetch issues within a date range. 
+ + Args: + start_date: Start date in YYYY-MM-DD format + end_date: End date in YYYY-MM-DD format (inclusive) + include_comments: Whether to include comments in the response + project_key: Optional project key to filter issues + + Returns: + Tuple containing (issues list, error message or None) + """ + try: + # Build JQL query for date range + # Query issues that were either created OR updated within the date range + date_filter = ( + f"(createdDate >= '{start_date}' AND createdDate <= '{end_date}')" + ) + # TODO : This JQL needs some improvement to work as expected + + jql = f"{date_filter}" + if project_key: + jql = ( + f'project = "{project_key}" AND {date_filter} ORDER BY created DESC' + ) + + # Define fields to retrieve + fields = [ + "summary", + "description", + "status", + "assignee", + "reporter", + "created", + "updated", + "priority", + "issuetype", + "project", + ] + + if include_comments: + fields.append("comment") + + params = { + # "jql": "", TODO : Add a JQL query to filter from a date range + "fields": ",".join(fields), + "maxResults": 100, + "startAt": 0, + } + + all_issues = [] + start_at = 0 + + while True: + params["startAt"] = start_at + + result = self.make_api_request("search", params) + + if not isinstance(result, dict) or "issues" not in result: + return [], "Invalid response from Jira API" + + issues = result["issues"] + all_issues.extend(issues) + + # Check if there are more issues to fetch + total = result.get("total", 0) + if start_at + len(issues) >= total: + break + + start_at += len(issues) + + if not all_issues: + return [], "No issues found in the specified date range." + + return all_issues, None + + except Exception as e: + return [], f"Error fetching issues: {str(e)}" + + def format_issue(self, issue: Dict[str, Any]) -> Dict[str, Any]: + """ + Format an issue for easier consumption. 
+ + Args: + issue: The issue object from Jira API + + Returns: + Formatted issue dictionary + """ + fields = issue.get("fields", {}) + + # Extract basic issue details + formatted = { + "id": issue.get("id", ""), + "key": issue.get("key", ""), + "title": fields.get("summary", ""), + "description": fields.get("description", ""), + "status": ( + fields.get("status", {}).get("name", "Unknown") + if fields.get("status") + else "Unknown" + ), + "status_category": ( + fields.get("status", {}) + .get("statusCategory", {}) + .get("name", "Unknown") + if fields.get("status") + else "Unknown" + ), + "priority": ( + fields.get("priority", {}).get("name", "Unknown") + if fields.get("priority") + else "Unknown" + ), + "issue_type": ( + fields.get("issuetype", {}).get("name", "Unknown") + if fields.get("issuetype") + else "Unknown" + ), + "project": ( + fields.get("project", {}).get("key", "Unknown") + if fields.get("project") + else "Unknown" + ), + "created_at": fields.get("created", ""), + "updated_at": fields.get("updated", ""), + "reporter": ( + { + "account_id": ( + fields.get("reporter", {}).get("accountId", "") + if fields.get("reporter") + else "" + ), + "display_name": ( + fields.get("reporter", {}).get("displayName", "Unknown") + if fields.get("reporter") + else "Unknown" + ), + "email": ( + fields.get("reporter", {}).get("emailAddress", "") + if fields.get("reporter") + else "" + ), + } + if fields.get("reporter") + else {"account_id": "", "display_name": "Unknown", "email": ""} + ), + "assignee": ( + { + "account_id": fields.get("assignee", {}).get("accountId", ""), + "display_name": fields.get("assignee", {}).get( + "displayName", "Unknown" + ), + "email": fields.get("assignee", {}).get("emailAddress", ""), + } + if fields.get("assignee") + else None + ), + "comments": [], + } + + # Extract comments if available + if "comment" in fields and "comments" in fields["comment"]: + for comment in fields["comment"]["comments"]: + formatted_comment = { + "id": 
comment.get("id", ""), + "body": comment.get("body", ""), + "created_at": comment.get("created", ""), + "updated_at": comment.get("updated", ""), + "author": ( + { + "account_id": ( + comment.get("author", {}).get("accountId", "") + if comment.get("author") + else "" + ), + "display_name": ( + comment.get("author", {}).get("displayName", "Unknown") + if comment.get("author") + else "Unknown" + ), + "email": ( + comment.get("author", {}).get("emailAddress", "") + if comment.get("author") + else "" + ), + } + if comment.get("author") + else {"account_id": "", "display_name": "Unknown", "email": ""} + ), + } + formatted["comments"].append(formatted_comment) + + return formatted + + def format_issue_to_markdown(self, issue: Dict[str, Any]) -> str: + """ + Convert an issue to markdown format. + + Args: + issue: The issue object (either raw or formatted) + + Returns: + Markdown string representation of the issue + """ + # Format the issue if it's not already formatted + if "key" not in issue: + issue = self.format_issue(issue) + + # Build the markdown content + markdown = ( + f"# {issue.get('key', 'No Key')}: {issue.get('title', 'No Title')}\n\n" + ) + + if issue.get("status"): + markdown += f"**Status:** {issue['status']}\n" + + if issue.get("priority"): + markdown += f"**Priority:** {issue['priority']}\n" + + if issue.get("issue_type"): + markdown += f"**Type:** {issue['issue_type']}\n" + + if issue.get("project"): + markdown += f"**Project:** {issue['project']}\n\n" + + if issue.get("assignee") and issue["assignee"].get("display_name"): + markdown += f"**Assignee:** {issue['assignee']['display_name']}\n" + + if issue.get("reporter") and issue["reporter"].get("display_name"): + markdown += f"**Reporter:** {issue['reporter']['display_name']}\n" + + if issue.get("created_at"): + created_date = self.format_date(issue["created_at"]) + markdown += f"**Created:** {created_date}\n" + + if issue.get("updated_at"): + updated_date = self.format_date(issue["updated_at"]) + 
markdown += f"**Updated:** {updated_date}\n\n" + + if issue.get("description"): + markdown += f"## Description\n\n{issue['description']}\n\n" + + if issue.get("comments"): + markdown += f"## Comments ({len(issue['comments'])})\n\n" + + for comment in issue["comments"]: + author_name = "Unknown" + if comment.get("author") and comment["author"].get("display_name"): + author_name = comment["author"]["display_name"] + + comment_date = "Unknown date" + if comment.get("created_at"): + comment_date = self.format_date(comment["created_at"]) + + markdown += f"### {author_name} ({comment_date})\n\n{comment.get('body', '')}\n\n---\n\n" + + return markdown + + @staticmethod + def format_date(iso_date: str) -> str: + """ + Format an ISO date string to a more readable format. + + Args: + iso_date: ISO format date string + + Returns: + Formatted date string + """ + if not iso_date or not isinstance(iso_date, str): + return "Unknown date" + + try: + # Jira dates are typically in format: 2023-01-01T12:00:00.000+0000 + dt = datetime.fromisoformat(iso_date.replace("Z", "+00:00")) + return dt.strftime("%Y-%m-%d %H:%M:%S") + except ValueError: + return iso_date diff --git a/surfsense_backend/app/db.py b/surfsense_backend/app/db.py index 7caf36533..bd982e4cf 100644 --- a/surfsense_backend/app/db.py +++ b/surfsense_backend/app/db.py @@ -2,30 +2,30 @@ from collections.abc import AsyncGenerator from datetime import datetime, timezone from enum import Enum +from app.config import config +from app.retriver.chunks_hybrid_search import ChucksHybridSearchRetriever +from app.retriver.documents_hybrid_search import DocumentHybridSearchRetriever from fastapi import Depends - from pgvector.sqlalchemy import Vector from sqlalchemy import ( ARRAY, + JSON, + TIMESTAMP, Boolean, Column, - Enum as SQLAlchemyEnum, +) +from sqlalchemy import Enum as SQLAlchemyEnum +from sqlalchemy import ( ForeignKey, Integer, - JSON, String, Text, text, - TIMESTAMP ) from sqlalchemy.dialects.postgresql import UUID from 
sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine from sqlalchemy.orm import DeclarativeBase, Mapped, declared_attr, relationship -from app.config import config -from app.retriver.chunks_hybrid_search import ChucksHybridSearchRetriever -from app.retriver.documents_hybrid_search import DocumentHybridSearchRetriever - if config.AUTH_TYPE == "GOOGLE": from fastapi_users.db import ( SQLAlchemyBaseOAuthAccountTableUUID, @@ -51,9 +51,11 @@ class DocumentType(str, Enum): GITHUB_CONNECTOR = "GITHUB_CONNECTOR" LINEAR_CONNECTOR = "LINEAR_CONNECTOR" DISCORD_CONNECTOR = "DISCORD_CONNECTOR" + JIRA_CONNECTOR = "JIRA_CONNECTOR" + class SearchSourceConnectorType(str, Enum): - SERPER_API = "SERPER_API" # NOT IMPLEMENTED YET : DON'T REMEMBER WHY : MOST PROBABLY BECAUSE WE NEED TO CRAWL THE RESULTS RETURNED BY IT + SERPER_API = "SERPER_API" # NOT IMPLEMENTED YET : DON'T REMEMBER WHY : MOST PROBABLY BECAUSE WE NEED TO CRAWL THE RESULTS RETURNED BY IT TAVILY_API = "TAVILY_API" LINKUP_API = "LINKUP_API" SLACK_CONNECTOR = "SLACK_CONNECTOR" @@ -61,13 +63,16 @@ class SearchSourceConnectorType(str, Enum): GITHUB_CONNECTOR = "GITHUB_CONNECTOR" LINEAR_CONNECTOR = "LINEAR_CONNECTOR" DISCORD_CONNECTOR = "DISCORD_CONNECTOR" - + JIRA_CONNECTOR = "JIRA_CONNECTOR" + + class ChatType(str, Enum): QNA = "QNA" REPORT_GENERAL = "REPORT_GENERAL" REPORT_DEEP = "REPORT_DEEP" REPORT_DEEPER = "REPORT_DEEPER" + class LiteLLMProvider(str, Enum): OPENAI = "OPENAI" ANTHROPIC = "ANTHROPIC" @@ -92,6 +97,7 @@ class LiteLLMProvider(str, Enum): PETALS = "PETALS" CUSTOM = "CUSTOM" + class LogLevel(str, Enum): DEBUG = "DEBUG" INFO = "INFO" @@ -99,18 +105,27 @@ class LogLevel(str, Enum): ERROR = "ERROR" CRITICAL = "CRITICAL" + class LogStatus(str, Enum): IN_PROGRESS = "IN_PROGRESS" SUCCESS = "SUCCESS" FAILED = "FAILED" - + + class Base(DeclarativeBase): pass + class TimestampMixin: @declared_attr def created_at(cls): - return Column(TIMESTAMP(timezone=True), nullable=False, 
default=lambda: datetime.now(timezone.utc), index=True) + return Column( + TIMESTAMP(timezone=True), + nullable=False, + default=lambda: datetime.now(timezone.utc), + index=True, + ) + class BaseModel(Base): __abstract__ = True @@ -118,6 +133,7 @@ class BaseModel(Base): id = Column(Integer, primary_key=True, index=True) + class Chat(BaseModel, TimestampMixin): __tablename__ = "chats" @@ -125,73 +141,115 @@ class Chat(BaseModel, TimestampMixin): title = Column(String, nullable=False, index=True) initial_connectors = Column(ARRAY(String), nullable=True) messages = Column(JSON, nullable=False) - - search_space_id = Column(Integer, ForeignKey('searchspaces.id', ondelete='CASCADE'), nullable=False) - search_space = relationship('SearchSpace', back_populates='chats') + + search_space_id = Column( + Integer, ForeignKey("searchspaces.id", ondelete="CASCADE"), nullable=False + ) + search_space = relationship("SearchSpace", back_populates="chats") + class Document(BaseModel, TimestampMixin): __tablename__ = "documents" - + title = Column(String, nullable=False, index=True) document_type = Column(SQLAlchemyEnum(DocumentType), nullable=False) document_metadata = Column(JSON, nullable=True) - + content = Column(Text, nullable=False) content_hash = Column(String, nullable=False, index=True, unique=True) embedding = Column(Vector(config.embedding_model_instance.dimension)) - - search_space_id = Column(Integer, ForeignKey("searchspaces.id", ondelete='CASCADE'), nullable=False) + + search_space_id = Column( + Integer, ForeignKey("searchspaces.id", ondelete="CASCADE"), nullable=False + ) search_space = relationship("SearchSpace", back_populates="documents") - chunks = relationship("Chunk", back_populates="document", cascade="all, delete-orphan") + chunks = relationship( + "Chunk", back_populates="document", cascade="all, delete-orphan" + ) + class Chunk(BaseModel, TimestampMixin): __tablename__ = "chunks" - + content = Column(Text, nullable=False) embedding = 
Column(Vector(config.embedding_model_instance.dimension)) - - document_id = Column(Integer, ForeignKey("documents.id", ondelete='CASCADE'), nullable=False) + + document_id = Column( + Integer, ForeignKey("documents.id", ondelete="CASCADE"), nullable=False + ) document = relationship("Document", back_populates="chunks") + class Podcast(BaseModel, TimestampMixin): __tablename__ = "podcasts" - + title = Column(String, nullable=False, index=True) podcast_transcript = Column(JSON, nullable=False, default={}) file_location = Column(String(500), nullable=False, default="") - - search_space_id = Column(Integer, ForeignKey("searchspaces.id", ondelete='CASCADE'), nullable=False) + + search_space_id = Column( + Integer, ForeignKey("searchspaces.id", ondelete="CASCADE"), nullable=False + ) search_space = relationship("SearchSpace", back_populates="podcasts") - + + class SearchSpace(BaseModel, TimestampMixin): __tablename__ = "searchspaces" - + name = Column(String(100), nullable=False, index=True) description = Column(String(500), nullable=True) - - user_id = Column(UUID(as_uuid=True), ForeignKey("user.id", ondelete='CASCADE'), nullable=False) + + user_id = Column( + UUID(as_uuid=True), ForeignKey("user.id", ondelete="CASCADE"), nullable=False + ) user = relationship("User", back_populates="search_spaces") - - documents = relationship("Document", back_populates="search_space", order_by="Document.id", cascade="all, delete-orphan") - podcasts = relationship("Podcast", back_populates="search_space", order_by="Podcast.id", cascade="all, delete-orphan") - chats = relationship('Chat', back_populates='search_space', order_by='Chat.id', cascade="all, delete-orphan") - logs = relationship("Log", back_populates="search_space", order_by="Log.id", cascade="all, delete-orphan") - + + documents = relationship( + "Document", + back_populates="search_space", + order_by="Document.id", + cascade="all, delete-orphan", + ) + podcasts = relationship( + "Podcast", + back_populates="search_space", + 
order_by="Podcast.id", + cascade="all, delete-orphan", + ) + chats = relationship( + "Chat", + back_populates="search_space", + order_by="Chat.id", + cascade="all, delete-orphan", + ) + logs = relationship( + "Log", + back_populates="search_space", + order_by="Log.id", + cascade="all, delete-orphan", + ) + + class SearchSourceConnector(BaseModel, TimestampMixin): __tablename__ = "search_source_connectors" - + name = Column(String(100), nullable=False, index=True) - connector_type = Column(SQLAlchemyEnum(SearchSourceConnectorType), nullable=False, unique=True) + connector_type = Column( + SQLAlchemyEnum(SearchSourceConnectorType), nullable=False, unique=True + ) is_indexable = Column(Boolean, nullable=False, default=False) last_indexed_at = Column(TIMESTAMP(timezone=True), nullable=True) config = Column(JSON, nullable=False) - - user_id = Column(UUID(as_uuid=True), ForeignKey("user.id", ondelete='CASCADE'), nullable=False) + + user_id = Column( + UUID(as_uuid=True), ForeignKey("user.id", ondelete="CASCADE"), nullable=False + ) user = relationship("User", back_populates="search_source_connectors") + class LLMConfig(BaseModel, TimestampMixin): __tablename__ = "llm_configs" - + name = Column(String(100), nullable=False, index=True) # Provider from the enum provider = Column(SQLAlchemyEnum(LiteLLMProvider), nullable=False) @@ -202,78 +260,142 @@ class LLMConfig(BaseModel, TimestampMixin): # API Key should be encrypted before storing api_key = Column(String, nullable=False) api_base = Column(String(500), nullable=True) - + # For any other parameters that litellm supports litellm_params = Column(JSON, nullable=True, default={}) - - user_id = Column(UUID(as_uuid=True), ForeignKey("user.id", ondelete='CASCADE'), nullable=False) + + user_id = Column( + UUID(as_uuid=True), ForeignKey("user.id", ondelete="CASCADE"), nullable=False + ) user = relationship("User", back_populates="llm_configs", foreign_keys=[user_id]) + class Log(BaseModel, TimestampMixin): __tablename__ = "logs" 
- + level = Column(SQLAlchemyEnum(LogLevel), nullable=False, index=True) status = Column(SQLAlchemyEnum(LogStatus), nullable=False, index=True) message = Column(Text, nullable=False) - source = Column(String(200), nullable=True, index=True) # Service/component that generated the log + source = Column( + String(200), nullable=True, index=True + ) # Service/component that generated the log log_metadata = Column(JSON, nullable=True, default={}) # Additional context data - - search_space_id = Column(Integer, ForeignKey("searchspaces.id", ondelete='CASCADE'), nullable=False) + + search_space_id = Column( + Integer, ForeignKey("searchspaces.id", ondelete="CASCADE"), nullable=False + ) search_space = relationship("SearchSpace", back_populates="logs") + if config.AUTH_TYPE == "GOOGLE": + class OAuthAccount(SQLAlchemyBaseOAuthAccountTableUUID, Base): pass - class User(SQLAlchemyBaseUserTableUUID, Base): oauth_accounts: Mapped[list[OAuthAccount]] = relationship( "OAuthAccount", lazy="joined" ) search_spaces = relationship("SearchSpace", back_populates="user") - search_source_connectors = relationship("SearchSourceConnector", back_populates="user") - llm_configs = relationship("LLMConfig", back_populates="user", foreign_keys="LLMConfig.user_id", cascade="all, delete-orphan") + search_source_connectors = relationship( + "SearchSourceConnector", back_populates="user" + ) + llm_configs = relationship( + "LLMConfig", + back_populates="user", + foreign_keys="LLMConfig.user_id", + cascade="all, delete-orphan", + ) - long_context_llm_id = Column(Integer, ForeignKey("llm_configs.id", ondelete="SET NULL"), nullable=True) - fast_llm_id = Column(Integer, ForeignKey("llm_configs.id", ondelete="SET NULL"), nullable=True) - strategic_llm_id = Column(Integer, ForeignKey("llm_configs.id", ondelete="SET NULL"), nullable=True) + long_context_llm_id = Column( + Integer, ForeignKey("llm_configs.id", ondelete="SET NULL"), nullable=True + ) + fast_llm_id = Column( + Integer, 
ForeignKey("llm_configs.id", ondelete="SET NULL"), nullable=True + ) + strategic_llm_id = Column( + Integer, ForeignKey("llm_configs.id", ondelete="SET NULL"), nullable=True + ) + + long_context_llm = relationship( + "LLMConfig", foreign_keys=[long_context_llm_id], post_update=True + ) + fast_llm = relationship( + "LLMConfig", foreign_keys=[fast_llm_id], post_update=True + ) + strategic_llm = relationship( + "LLMConfig", foreign_keys=[strategic_llm_id], post_update=True + ) - long_context_llm = relationship("LLMConfig", foreign_keys=[long_context_llm_id], post_update=True) - fast_llm = relationship("LLMConfig", foreign_keys=[fast_llm_id], post_update=True) - strategic_llm = relationship("LLMConfig", foreign_keys=[strategic_llm_id], post_update=True) else: + class User(SQLAlchemyBaseUserTableUUID, Base): - search_spaces = relationship("SearchSpace", back_populates="user") - search_source_connectors = relationship("SearchSourceConnector", back_populates="user") - llm_configs = relationship("LLMConfig", back_populates="user", foreign_keys="LLMConfig.user_id", cascade="all, delete-orphan") + search_source_connectors = relationship( + "SearchSourceConnector", back_populates="user" + ) + llm_configs = relationship( + "LLMConfig", + back_populates="user", + foreign_keys="LLMConfig.user_id", + cascade="all, delete-orphan", + ) - long_context_llm_id = Column(Integer, ForeignKey("llm_configs.id", ondelete="SET NULL"), nullable=True) - fast_llm_id = Column(Integer, ForeignKey("llm_configs.id", ondelete="SET NULL"), nullable=True) - strategic_llm_id = Column(Integer, ForeignKey("llm_configs.id", ondelete="SET NULL"), nullable=True) + long_context_llm_id = Column( + Integer, ForeignKey("llm_configs.id", ondelete="SET NULL"), nullable=True + ) + fast_llm_id = Column( + Integer, ForeignKey("llm_configs.id", ondelete="SET NULL"), nullable=True + ) + strategic_llm_id = Column( + Integer, ForeignKey("llm_configs.id", ondelete="SET NULL"), nullable=True + ) - long_context_llm = 
relationship("LLMConfig", foreign_keys=[long_context_llm_id], post_update=True) - fast_llm = relationship("LLMConfig", foreign_keys=[fast_llm_id], post_update=True) - strategic_llm = relationship("LLMConfig", foreign_keys=[strategic_llm_id], post_update=True) + long_context_llm = relationship( + "LLMConfig", foreign_keys=[long_context_llm_id], post_update=True + ) + fast_llm = relationship( + "LLMConfig", foreign_keys=[fast_llm_id], post_update=True + ) + strategic_llm = relationship( + "LLMConfig", foreign_keys=[strategic_llm_id], post_update=True + ) engine = create_async_engine(DATABASE_URL) async_session_maker = async_sessionmaker(engine, expire_on_commit=False) - + async def setup_indexes(): async with engine.begin() as conn: - # Create indexes + # Create indexes # Document Summary Indexes - await conn.execute(text('CREATE INDEX IF NOT EXISTS document_vector_index ON documents USING hnsw (embedding public.vector_cosine_ops)')) - await conn.execute(text('CREATE INDEX IF NOT EXISTS document_search_index ON documents USING gin (to_tsvector(\'english\', content))')) + await conn.execute( + text( + "CREATE INDEX IF NOT EXISTS document_vector_index ON documents USING hnsw (embedding public.vector_cosine_ops)" + ) + ) + await conn.execute( + text( + "CREATE INDEX IF NOT EXISTS document_search_index ON documents USING gin (to_tsvector('english', content))" + ) + ) # Document Chuck Indexes - await conn.execute(text('CREATE INDEX IF NOT EXISTS chucks_vector_index ON chunks USING hnsw (embedding public.vector_cosine_ops)')) - await conn.execute(text('CREATE INDEX IF NOT EXISTS chucks_search_index ON chunks USING gin (to_tsvector(\'english\', content))')) + await conn.execute( + text( + "CREATE INDEX IF NOT EXISTS chucks_vector_index ON chunks USING hnsw (embedding public.vector_cosine_ops)" + ) + ) + await conn.execute( + text( + "CREATE INDEX IF NOT EXISTS chucks_search_index ON chunks USING gin (to_tsvector('english', content))" + ) + ) + async def 
create_db_and_tables(): async with engine.begin() as conn: - await conn.execute(text('CREATE EXTENSION IF NOT EXISTS vector')) + await conn.execute(text("CREATE EXTENSION IF NOT EXISTS vector")) await conn.run_sync(Base.metadata.create_all) await setup_indexes() @@ -284,14 +406,23 @@ async def get_async_session() -> AsyncGenerator[AsyncSession, None]: if config.AUTH_TYPE == "GOOGLE": + async def get_user_db(session: AsyncSession = Depends(get_async_session)): yield SQLAlchemyUserDatabase(session, User, OAuthAccount) + else: + async def get_user_db(session: AsyncSession = Depends(get_async_session)): yield SQLAlchemyUserDatabase(session, User) - -async def get_chucks_hybrid_search_retriever(session: AsyncSession = Depends(get_async_session)): + + +async def get_chucks_hybrid_search_retriever( + session: AsyncSession = Depends(get_async_session), +): return ChucksHybridSearchRetriever(session) -async def get_documents_hybrid_search_retriever(session: AsyncSession = Depends(get_async_session)): + +async def get_documents_hybrid_search_retriever( + session: AsyncSession = Depends(get_async_session), +): return DocumentHybridSearchRetriever(session) diff --git a/surfsense_backend/app/routes/search_source_connectors_routes.py b/surfsense_backend/app/routes/search_source_connectors_routes.py index 54f97d6ac..838b81a49 100644 --- a/surfsense_backend/app/routes/search_source_connectors_routes.py +++ b/surfsense_backend/app/routes/search_source_connectors_routes.py @@ -9,35 +9,58 @@ POST /search-source-connectors/{connector_id}/index - Index content from a conne Note: Each user can have only one connector of each type (SERPER_API, TAVILY_API, SLACK_CONNECTOR, NOTION_CONNECTOR, GITHUB_CONNECTOR, LINEAR_CONNECTOR, DISCORD_CONNECTOR). 
""" -from fastapi import APIRouter, Depends, HTTPException, Query, BackgroundTasks, Body -from sqlalchemy.ext.asyncio import AsyncSession -from sqlalchemy.future import select -from sqlalchemy.exc import IntegrityError -from typing import List, Dict, Any -from app.db import get_async_session, User, SearchSourceConnector, SearchSourceConnectorType, SearchSpace, async_session_maker -from app.schemas import SearchSourceConnectorCreate, SearchSourceConnectorUpdate, SearchSourceConnectorRead, SearchSourceConnectorBase + +import logging +from datetime import datetime, timedelta +from typing import Any, Dict, List + +from app.connectors.github_connector import GitHubConnector +from app.db import ( + SearchSourceConnector, + SearchSourceConnectorType, + SearchSpace, + User, + async_session_maker, + get_async_session, +) +from app.schemas import ( + SearchSourceConnectorBase, + SearchSourceConnectorCreate, + SearchSourceConnectorRead, + SearchSourceConnectorUpdate, +) +from app.tasks.connectors_indexing_tasks import ( + index_discord_messages, + index_github_repos, + index_jira_issues, + index_linear_issues, + index_notion_pages, + index_slack_messages, +) from app.users import current_active_user from app.utils.check_ownership import check_ownership +from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, Query from pydantic import BaseModel, Field, ValidationError -from app.tasks.connectors_indexing_tasks import index_slack_messages, index_notion_pages, index_github_repos, index_linear_issues, index_discord_messages -from app.connectors.github_connector import GitHubConnector -from datetime import datetime, timedelta -import logging +from sqlalchemy.exc import IntegrityError +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.future import select # Set up logging logger = logging.getLogger(__name__) router = APIRouter() + # Use Pydantic's BaseModel here class GitHubPATRequest(BaseModel): github_pat: str = Field(..., description="GitHub 
Personal Access Token") + # --- New Endpoint to list GitHub Repositories --- @router.post("/github/repositories/", response_model=List[Dict[str, Any]]) async def list_github_repositories( pat_request: GitHubPATRequest, - user: User = Depends(current_active_user) # Ensure the user is logged in + user: User = Depends(current_active_user), # Ensure the user is logged in ): """ Fetches a list of repositories accessible by the provided GitHub PAT. @@ -54,35 +77,39 @@ async def list_github_repositories( logger.error(f"GitHub PAT validation failed for user {user.id}: {str(e)}") raise HTTPException(status_code=400, detail=f"Invalid GitHub PAT: {str(e)}") except Exception as e: - logger.error(f"Failed to fetch GitHub repositories for user {user.id}: {str(e)}") - raise HTTPException(status_code=500, detail="Failed to fetch GitHub repositories.") + logger.error( + f"Failed to fetch GitHub repositories for user {user.id}: {str(e)}" + ) + raise HTTPException( + status_code=500, detail="Failed to fetch GitHub repositories." + ) + @router.post("/search-source-connectors/", response_model=SearchSourceConnectorRead) async def create_search_source_connector( connector: SearchSourceConnectorCreate, session: AsyncSession = Depends(get_async_session), - user: User = Depends(current_active_user) + user: User = Depends(current_active_user), ): """ Create a new search source connector. - + Each user can have only one connector of each type (SERPER_API, TAVILY_API, SLACK_CONNECTOR, etc.). The config must contain the appropriate keys for the connector type. 
""" try: # Check if a connector with the same type already exists for this user result = await session.execute( - select(SearchSourceConnector) - .filter( + select(SearchSourceConnector).filter( SearchSourceConnector.user_id == user.id, - SearchSourceConnector.connector_type == connector.connector_type + SearchSourceConnector.connector_type == connector.connector_type, ) ) existing_connector = result.scalars().first() if existing_connector: raise HTTPException( status_code=409, - detail=f"A connector with type {connector.connector_type} already exists. Each user can have only one connector of each type." + detail=f"A connector with type {connector.connector_type} already exists. Each user can have only one connector of each type.", ) db_connector = SearchSourceConnector(**connector.model_dump(), user_id=user.id) session.add(db_connector) @@ -91,15 +118,12 @@ async def create_search_source_connector( return db_connector except ValidationError as e: await session.rollback() - raise HTTPException( - status_code=422, - detail=f"Validation error: {str(e)}" - ) + raise HTTPException(status_code=422, detail=f"Validation error: {str(e)}") except IntegrityError as e: await session.rollback() raise HTTPException( status_code=409, - detail=f"Integrity error: A connector with this type already exists. {str(e)}" + detail=f"Integrity error: A connector with this type already exists. 
{str(e)}", ) except HTTPException: await session.rollback() @@ -109,38 +133,44 @@ async def create_search_source_connector( await session.rollback() raise HTTPException( status_code=500, - detail=f"Failed to create search source connector: {str(e)}" + detail=f"Failed to create search source connector: {str(e)}", ) -@router.get("/search-source-connectors/", response_model=List[SearchSourceConnectorRead]) + +@router.get( + "/search-source-connectors/", response_model=List[SearchSourceConnectorRead] +) async def read_search_source_connectors( skip: int = 0, limit: int = 100, search_space_id: int = None, session: AsyncSession = Depends(get_async_session), - user: User = Depends(current_active_user) + user: User = Depends(current_active_user), ): """List all search source connectors for the current user.""" try: - query = select(SearchSourceConnector).filter(SearchSourceConnector.user_id == user.id) - - # No need to filter by search_space_id as connectors are user-owned, not search space specific - - result = await session.execute( - query.offset(skip).limit(limit) + query = select(SearchSourceConnector).filter( + SearchSourceConnector.user_id == user.id ) + + # No need to filter by search_space_id as connectors are user-owned, not search space specific + + result = await session.execute(query.offset(skip).limit(limit)) return result.scalars().all() except Exception as e: raise HTTPException( status_code=500, - detail=f"Failed to fetch search source connectors: {str(e)}" + detail=f"Failed to fetch search source connectors: {str(e)}", ) -@router.get("/search-source-connectors/{connector_id}", response_model=SearchSourceConnectorRead) + +@router.get( + "/search-source-connectors/{connector_id}", response_model=SearchSourceConnectorRead +) async def read_search_source_connector( connector_id: int, session: AsyncSession = Depends(get_async_session), - user: User = Depends(current_active_user) + user: User = Depends(current_active_user), ): """Get a specific search source 
connector by ID.""" try: @@ -149,31 +179,37 @@ async def read_search_source_connector( raise except Exception as e: raise HTTPException( - status_code=500, - detail=f"Failed to fetch search source connector: {str(e)}" + status_code=500, detail=f"Failed to fetch search source connector: {str(e)}" ) -@router.put("/search-source-connectors/{connector_id}", response_model=SearchSourceConnectorRead) + +@router.put( + "/search-source-connectors/{connector_id}", response_model=SearchSourceConnectorRead +) async def update_search_source_connector( connector_id: int, connector_update: SearchSourceConnectorUpdate, session: AsyncSession = Depends(get_async_session), - user: User = Depends(current_active_user) + user: User = Depends(current_active_user), ): """ Update a search source connector. Handles partial updates, including merging changes into the 'config' field. """ - db_connector = await check_ownership(session, SearchSourceConnector, connector_id, user) - + db_connector = await check_ownership( + session, SearchSourceConnector, connector_id, user + ) + # Convert the sparse update data (only fields present in request) to a dict update_data = connector_update.model_dump(exclude_unset=True) # Special handling for 'config' field if "config" in update_data: - incoming_config = update_data["config"] # Config data from the request - existing_config = db_connector.config if db_connector.config else {} # Current config from DB - + incoming_config = update_data["config"] # Config data from the request + existing_config = ( + db_connector.config if db_connector.config else {} + ) # Current config from DB + # Merge incoming config into existing config # This preserves existing keys (like GITHUB_PAT) if they are not in the incoming data merged_config = existing_config.copy() @@ -182,26 +218,29 @@ async def update_search_source_connector( # -- Validation after merging -- # Validate the *merged* config based on the connector type # We need the connector type - use the one from the 
update if provided, else the existing one - current_connector_type = connector_update.connector_type if connector_update.connector_type is not None else db_connector.connector_type - + current_connector_type = ( + connector_update.connector_type + if connector_update.connector_type is not None + else db_connector.connector_type + ) + try: # We can reuse the base validator by creating a temporary base model instance # Note: This assumes 'name' and 'is_indexable' are not crucial for config validation itself temp_data_for_validation = { - "name": db_connector.name, # Use existing name + "name": db_connector.name, # Use existing name "connector_type": current_connector_type, - "is_indexable": db_connector.is_indexable, # Use existing value - "last_indexed_at": db_connector.last_indexed_at, # Not used by validator - "config": merged_config + "is_indexable": db_connector.is_indexable, # Use existing value + "last_indexed_at": db_connector.last_indexed_at, # Not used by validator + "config": merged_config, } SearchSourceConnectorBase.model_validate(temp_data_for_validation) except ValidationError as e: # Raise specific validation error for the merged config raise HTTPException( - status_code=422, - detail=f"Validation error for merged config: {str(e)}" + status_code=422, detail=f"Validation error for merged config: {str(e)}" ) - + # If validation passes, update the main update_data dict with the merged config update_data["config"] = merged_config @@ -210,20 +249,19 @@ async def update_search_source_connector( # Prevent changing connector_type if it causes a duplicate (check moved here) if key == "connector_type" and value != db_connector.connector_type: result = await session.execute( - select(SearchSourceConnector) - .filter( + select(SearchSourceConnector).filter( SearchSourceConnector.user_id == user.id, SearchSourceConnector.connector_type == value, - SearchSourceConnector.id != connector_id + SearchSourceConnector.id != connector_id, ) ) existing_connector = 
result.scalars().first() if existing_connector: raise HTTPException( status_code=409, - detail=f"A connector with type {value} already exists. Each user can have only one connector of each type." + detail=f"A connector with type {value} already exists. Each user can have only one connector of each type.", ) - + setattr(db_connector, key, value) try: @@ -234,26 +272,31 @@ async def update_search_source_connector( await session.rollback() # This might occur if connector_type constraint is violated somehow after the check raise HTTPException( - status_code=409, - detail=f"Database integrity error during update: {str(e)}" + status_code=409, detail=f"Database integrity error during update: {str(e)}" ) except Exception as e: await session.rollback() - logger.error(f"Failed to update search source connector {connector_id}: {e}", exc_info=True) + logger.error( + f"Failed to update search source connector {connector_id}: {e}", + exc_info=True, + ) raise HTTPException( status_code=500, - detail=f"Failed to update search source connector: {str(e)}" + detail=f"Failed to update search source connector: {str(e)}", ) + @router.delete("/search-source-connectors/{connector_id}", response_model=dict) async def delete_search_source_connector( connector_id: int, session: AsyncSession = Depends(get_async_session), - user: User = Depends(current_active_user) + user: User = Depends(current_active_user), ): """Delete a search source connector.""" try: - db_connector = await check_ownership(session, SearchSourceConnector, connector_id, user) + db_connector = await check_ownership( + session, SearchSourceConnector, connector_id, user + ) await session.delete(db_connector) await session.commit() return {"message": "Search source connector deleted successfully"} @@ -263,48 +306,64 @@ async def delete_search_source_connector( await session.rollback() raise HTTPException( status_code=500, - detail=f"Failed to delete search source connector: {str(e)}" + detail=f"Failed to delete search source 
connector: {str(e)}", ) -@router.post("/search-source-connectors/{connector_id}/index", response_model=Dict[str, Any]) + +@router.post( + "/search-source-connectors/{connector_id}/index", response_model=Dict[str, Any] +) async def index_connector_content( connector_id: int, - search_space_id: int = Query(..., description="ID of the search space to store indexed content"), - start_date: str = Query(None, description="Start date for indexing (YYYY-MM-DD format). If not provided, uses last_indexed_at or defaults to 365 days ago"), - end_date: str = Query(None, description="End date for indexing (YYYY-MM-DD format). If not provided, uses today's date"), + search_space_id: int = Query( + ..., description="ID of the search space to store indexed content" + ), + start_date: str = Query( + None, + description="Start date for indexing (YYYY-MM-DD format). If not provided, uses last_indexed_at or defaults to 365 days ago", + ), + end_date: str = Query( + None, + description="End date for indexing (YYYY-MM-DD format). If not provided, uses today's date", + ), session: AsyncSession = Depends(get_async_session), user: User = Depends(current_active_user), - background_tasks: BackgroundTasks = None + background_tasks: BackgroundTasks = None, ): """ Index content from a connector to a search space. 
- + Currently supports: - SLACK_CONNECTOR: Indexes messages from all accessible Slack channels - NOTION_CONNECTOR: Indexes pages from all accessible Notion pages - GITHUB_CONNECTOR: Indexes code and documentation from GitHub repositories - LINEAR_CONNECTOR: Indexes issues and comments from Linear + - JIRA_CONNECTOR: Indexes issues and comments from Jira - DISCORD_CONNECTOR: Indexes messages from all accessible Discord channels - + Args: connector_id: ID of the connector to use search_space_id: ID of the search space to store indexed content background_tasks: FastAPI background tasks - + Returns: Dictionary with indexing status """ try: # Check if the connector belongs to the user - connector = await check_ownership(session, SearchSourceConnector, connector_id, user) - + connector = await check_ownership( + session, SearchSourceConnector, connector_id, user + ) + # Check if the search space belongs to the user - search_space = await check_ownership(session, SearchSpace, search_space_id, user) - + search_space = await check_ownership( + session, SearchSpace, search_space_id, user + ) + # Handle different connector types response_message = "" today_str = datetime.now().strftime("%Y-%m-%d") - + # Determine the actual date range to use if start_date is None: # Use last_indexed_at or default to 365 days ago @@ -316,10 +375,12 @@ async def index_connector_content( else: indexing_from = connector.last_indexed_at.strftime("%Y-%m-%d") else: - indexing_from = (datetime.now() - timedelta(days=365)).strftime("%Y-%m-%d") + indexing_from = (datetime.now() - timedelta(days=365)).strftime( + "%Y-%m-%d" + ) else: indexing_from = start_date - + if end_date is None: indexing_to = today_str else: @@ -327,99 +388,162 @@ async def index_connector_content( if connector.connector_type == SearchSourceConnectorType.SLACK_CONNECTOR: # Run indexing in background - logger.info(f"Triggering Slack indexing for connector {connector_id} into search space {search_space_id} from {indexing_from} to 
{indexing_to}") - background_tasks.add_task(run_slack_indexing_with_new_session, connector_id, search_space_id, str(user.id), indexing_from, indexing_to) + logger.info( + f"Triggering Slack indexing for connector {connector_id} into search space {search_space_id} from {indexing_from} to {indexing_to}" + ) + background_tasks.add_task( + run_slack_indexing_with_new_session, + connector_id, + search_space_id, + str(user.id), + indexing_from, + indexing_to, + ) response_message = "Slack indexing started in the background." elif connector.connector_type == SearchSourceConnectorType.NOTION_CONNECTOR: # Run indexing in background - logger.info(f"Triggering Notion indexing for connector {connector_id} into search space {search_space_id} from {indexing_from} to {indexing_to}") - background_tasks.add_task(run_notion_indexing_with_new_session, connector_id, search_space_id, str(user.id), indexing_from, indexing_to) + logger.info( + f"Triggering Notion indexing for connector {connector_id} into search space {search_space_id} from {indexing_from} to {indexing_to}" + ) + background_tasks.add_task( + run_notion_indexing_with_new_session, + connector_id, + search_space_id, + str(user.id), + indexing_from, + indexing_to, + ) response_message = "Notion indexing started in the background." 
- + elif connector.connector_type == SearchSourceConnectorType.GITHUB_CONNECTOR: # Run indexing in background - logger.info(f"Triggering GitHub indexing for connector {connector_id} into search space {search_space_id} from {indexing_from} to {indexing_to}") - background_tasks.add_task(run_github_indexing_with_new_session, connector_id, search_space_id, str(user.id), indexing_from, indexing_to) + logger.info( + f"Triggering GitHub indexing for connector {connector_id} into search space {search_space_id} from {indexing_from} to {indexing_to}" + ) + background_tasks.add_task( + run_github_indexing_with_new_session, + connector_id, + search_space_id, + str(user.id), + indexing_from, + indexing_to, + ) response_message = "GitHub indexing started in the background." - + elif connector.connector_type == SearchSourceConnectorType.LINEAR_CONNECTOR: # Run indexing in background - logger.info(f"Triggering Linear indexing for connector {connector_id} into search space {search_space_id} from {indexing_from} to {indexing_to}") - background_tasks.add_task(run_linear_indexing_with_new_session, connector_id, search_space_id, str(user.id), indexing_from, indexing_to) + logger.info( + f"Triggering Linear indexing for connector {connector_id} into search space {search_space_id} from {indexing_from} to {indexing_to}" + ) + background_tasks.add_task( + run_linear_indexing_with_new_session, + connector_id, + search_space_id, + str(user.id), + indexing_from, + indexing_to, + ) response_message = "Linear indexing started in the background." 
+ elif connector.connector_type == SearchSourceConnectorType.JIRA_CONNECTOR: + # Run indexing in background + logger.info( + f"Triggering Jira indexing for connector {connector_id} into search space {search_space_id} from {indexing_from} to {indexing_to}" + ) + background_tasks.add_task( + run_jira_indexing_with_new_session, + connector_id, + search_space_id, + str(user.id), + indexing_from, + indexing_to, + ) + response_message = "Jira indexing started in the background." + elif connector.connector_type == SearchSourceConnectorType.DISCORD_CONNECTOR: # Run indexing in background logger.info( f"Triggering Discord indexing for connector {connector_id} into search space {search_space_id} from {indexing_from} to {indexing_to}" ) background_tasks.add_task( - run_discord_indexing_with_new_session, connector_id, search_space_id, str(user.id), indexing_from, indexing_to + run_discord_indexing_with_new_session, + connector_id, + search_space_id, + str(user.id), + indexing_from, + indexing_to, ) response_message = "Discord indexing started in the background." 
else: raise HTTPException( status_code=400, - detail=f"Indexing not supported for connector type: {connector.connector_type}" + detail=f"Indexing not supported for connector type: {connector.connector_type}", ) return { - "message": response_message, - "connector_id": connector_id, + "message": response_message, + "connector_id": connector_id, "search_space_id": search_space_id, "indexing_from": indexing_from, - "indexing_to": indexing_to + "indexing_to": indexing_to, } except HTTPException: raise except Exception as e: - logger.error(f"Failed to initiate indexing for connector {connector_id}: {e}", exc_info=True) - raise HTTPException( - status_code=500, - detail=f"Failed to initiate indexing: {str(e)}" + logger.error( + f"Failed to initiate indexing for connector {connector_id}: {e}", + exc_info=True, ) - -async def update_connector_last_indexed( - session: AsyncSession, - connector_id: int -): + raise HTTPException( + status_code=500, detail=f"Failed to initiate indexing: {str(e)}" + ) + + +async def update_connector_last_indexed(session: AsyncSession, connector_id: int): """ Update the last_indexed_at timestamp for a connector. 
- + Args: session: Database session connector_id: ID of the connector to update """ try: result = await session.execute( - select(SearchSourceConnector) - .filter(SearchSourceConnector.id == connector_id) + select(SearchSourceConnector).filter( + SearchSourceConnector.id == connector_id + ) ) connector = result.scalars().first() - + if connector: connector.last_indexed_at = datetime.now() await session.commit() logger.info(f"Updated last_indexed_at for connector {connector_id}") except Exception as e: - logger.error(f"Failed to update last_indexed_at for connector {connector_id}: {str(e)}") + logger.error( + f"Failed to update last_indexed_at for connector {connector_id}: {str(e)}" + ) await session.rollback() + async def run_slack_indexing_with_new_session( connector_id: int, search_space_id: int, user_id: str, start_date: str, - end_date: str + end_date: str, ): """ Create a new session and run the Slack indexing task. This prevents session leaks by creating a dedicated session for the background task. """ async with async_session_maker() as session: - await run_slack_indexing(session, connector_id, search_space_id, user_id, start_date, end_date) + await run_slack_indexing( + session, connector_id, search_space_id, user_id, start_date, end_date + ) + async def run_slack_indexing( session: AsyncSession, @@ -427,11 +551,11 @@ async def run_slack_indexing( search_space_id: int, user_id: str, start_date: str, - end_date: str + end_date: str, ): """ Background task to run Slack indexing. 
- + Args: session: Database session connector_id: ID of the Slack connector @@ -449,31 +573,39 @@ async def run_slack_indexing( user_id=user_id, start_date=start_date, end_date=end_date, - update_last_indexed=False # Don't update timestamp in the indexing function + update_last_indexed=False, # Don't update timestamp in the indexing function ) - + # Only update last_indexed_at if indexing was successful (either new docs or updated docs) if documents_processed > 0: await update_connector_last_indexed(session, connector_id) - logger.info(f"Slack indexing completed successfully: {documents_processed} documents processed") + logger.info( + f"Slack indexing completed successfully: {documents_processed} documents processed" + ) else: - logger.error(f"Slack indexing failed or no documents processed: {error_or_warning}") + logger.error( + f"Slack indexing failed or no documents processed: {error_or_warning}" + ) except Exception as e: logger.error(f"Error in background Slack indexing task: {str(e)}") + async def run_notion_indexing_with_new_session( connector_id: int, search_space_id: int, user_id: str, start_date: str, - end_date: str + end_date: str, ): """ Create a new session and run the Notion indexing task. This prevents session leaks by creating a dedicated session for the background task. """ async with async_session_maker() as session: - await run_notion_indexing(session, connector_id, search_space_id, user_id, start_date, end_date) + await run_notion_indexing( + session, connector_id, search_space_id, user_id, start_date, end_date + ) + async def run_notion_indexing( session: AsyncSession, @@ -481,11 +613,11 @@ async def run_notion_indexing( search_space_id: int, user_id: str, start_date: str, - end_date: str + end_date: str, ): """ Background task to run Notion indexing. 
- + Args: session: Database session connector_id: ID of the Notion connector @@ -503,112 +635,158 @@ async def run_notion_indexing( user_id=user_id, start_date=start_date, end_date=end_date, - update_last_indexed=False # Don't update timestamp in the indexing function + update_last_indexed=False, # Don't update timestamp in the indexing function ) - + # Only update last_indexed_at if indexing was successful (either new docs or updated docs) if documents_processed > 0: await update_connector_last_indexed(session, connector_id) - logger.info(f"Notion indexing completed successfully: {documents_processed} documents processed") + logger.info( + f"Notion indexing completed successfully: {documents_processed} documents processed" + ) else: - logger.error(f"Notion indexing failed or no documents processed: {error_or_warning}") + logger.error( + f"Notion indexing failed or no documents processed: {error_or_warning}" + ) except Exception as e: logger.error(f"Error in background Notion indexing task: {str(e)}") + # Add new helper functions for GitHub indexing async def run_github_indexing_with_new_session( connector_id: int, search_space_id: int, user_id: str, start_date: str, - end_date: str + end_date: str, ): """Wrapper to run GitHub indexing with its own database session.""" - logger.info(f"Background task started: Indexing GitHub connector {connector_id} into space {search_space_id} from {start_date} to {end_date}") + logger.info( + f"Background task started: Indexing GitHub connector {connector_id} into space {search_space_id} from {start_date} to {end_date}" + ) async with async_session_maker() as session: - await run_github_indexing(session, connector_id, search_space_id, user_id, start_date, end_date) + await run_github_indexing( + session, connector_id, search_space_id, user_id, start_date, end_date + ) logger.info(f"Background task finished: Indexing GitHub connector {connector_id}") + async def run_github_indexing( session: AsyncSession, connector_id: int, 
search_space_id: int, user_id: str, start_date: str, - end_date: str + end_date: str, ): """Runs the GitHub indexing task and updates the timestamp.""" try: indexed_count, error_message = await index_github_repos( - session, connector_id, search_space_id, user_id, start_date, end_date, update_last_indexed=False + session, + connector_id, + search_space_id, + user_id, + start_date, + end_date, + update_last_indexed=False, ) if error_message: - logger.error(f"GitHub indexing failed for connector {connector_id}: {error_message}") + logger.error( + f"GitHub indexing failed for connector {connector_id}: {error_message}" + ) # Optionally update status in DB to indicate failure else: - logger.info(f"GitHub indexing successful for connector {connector_id}. Indexed {indexed_count} documents.") + logger.info( + f"GitHub indexing successful for connector {connector_id}. Indexed {indexed_count} documents." + ) # Update the last indexed timestamp only on success await update_connector_last_indexed(session, connector_id) - await session.commit() # Commit timestamp update + await session.commit() # Commit timestamp update except Exception as e: await session.rollback() - logger.error(f"Critical error in run_github_indexing for connector {connector_id}: {e}", exc_info=True) + logger.error( + f"Critical error in run_github_indexing for connector {connector_id}: {e}", + exc_info=True, + ) # Optionally update status in DB to indicate failure + # Add new helper functions for Linear indexing async def run_linear_indexing_with_new_session( connector_id: int, search_space_id: int, user_id: str, start_date: str, - end_date: str + end_date: str, ): """Wrapper to run Linear indexing with its own database session.""" - logger.info(f"Background task started: Indexing Linear connector {connector_id} into space {search_space_id} from {start_date} to {end_date}") + logger.info( + f"Background task started: Indexing Linear connector {connector_id} into space {search_space_id} from {start_date} to 
{end_date}" + ) async with async_session_maker() as session: - await run_linear_indexing(session, connector_id, search_space_id, user_id, start_date, end_date) + await run_linear_indexing( + session, connector_id, search_space_id, user_id, start_date, end_date + ) logger.info(f"Background task finished: Indexing Linear connector {connector_id}") + async def run_linear_indexing( session: AsyncSession, connector_id: int, search_space_id: int, user_id: str, start_date: str, - end_date: str + end_date: str, ): """Runs the Linear indexing task and updates the timestamp.""" try: indexed_count, error_message = await index_linear_issues( - session, connector_id, search_space_id, user_id, start_date, end_date, update_last_indexed=False + session, + connector_id, + search_space_id, + user_id, + start_date, + end_date, + update_last_indexed=False, ) if error_message: - logger.error(f"Linear indexing failed for connector {connector_id}: {error_message}") + logger.error( + f"Linear indexing failed for connector {connector_id}: {error_message}" + ) # Optionally update status in DB to indicate failure else: - logger.info(f"Linear indexing successful for connector {connector_id}. Indexed {indexed_count} documents.") + logger.info( + f"Linear indexing successful for connector {connector_id}. Indexed {indexed_count} documents." 
+ ) # Update the last indexed timestamp only on success await update_connector_last_indexed(session, connector_id) - await session.commit() # Commit timestamp update + await session.commit() # Commit timestamp update except Exception as e: await session.rollback() - logger.error(f"Critical error in run_linear_indexing for connector {connector_id}: {e}", exc_info=True) + logger.error( + f"Critical error in run_linear_indexing for connector {connector_id}: {e}", + exc_info=True, + ) # Optionally update status in DB to indicate failure + # Add new helper functions for discord indexing async def run_discord_indexing_with_new_session( connector_id: int, search_space_id: int, user_id: str, start_date: str, - end_date: str + end_date: str, ): """ Create a new session and run the Discord indexing task. This prevents session leaks by creating a dedicated session for the background task. """ async with async_session_maker() as session: - await run_discord_indexing(session, connector_id, search_space_id, user_id, start_date, end_date) + await run_discord_indexing( + session, connector_id, search_space_id, user_id, start_date, end_date + ) + async def run_discord_indexing( session: AsyncSession, @@ -616,7 +794,7 @@ async def run_discord_indexing( search_space_id: int, user_id: str, start_date: str, - end_date: str + end_date: str, ): """ Background task to run Discord indexing. 
@@ -637,14 +815,76 @@ async def run_discord_indexing( user_id=user_id, start_date=start_date, end_date=end_date, - update_last_indexed=False # Don't update timestamp in the indexing function + update_last_indexed=False, # Don't update timestamp in the indexing function ) # Only update last_indexed_at if indexing was successful (either new docs or updated docs) if documents_processed > 0: await update_connector_last_indexed(session, connector_id) - logger.info(f"Discord indexing completed successfully: {documents_processed} documents processed") + logger.info( + f"Discord indexing completed successfully: {documents_processed} documents processed" + ) else: - logger.error(f"Discord indexing failed or no documents processed: {error_or_warning}") + logger.error( + f"Discord indexing failed or no documents processed: {error_or_warning}" + ) except Exception as e: - logger.error(f"Error in background Discord indexing task: {str(e)}") \ No newline at end of file + logger.error(f"Error in background Discord indexing task: {str(e)}") + + +# Add new helper functions for Jira indexing +async def run_jira_indexing_with_new_session( + connector_id: int, + search_space_id: int, + user_id: str, + start_date: str, + end_date: str, +): + """Wrapper to run Jira indexing with its own database session.""" + logger.info( + f"Background task started: Indexing Jira connector {connector_id} into space {search_space_id} from {start_date} to {end_date}" + ) + async with async_session_maker() as session: + await run_jira_indexing( + session, connector_id, search_space_id, user_id, start_date, end_date + ) + logger.info(f"Background task finished: Indexing Jira connector {connector_id}") + + +async def run_jira_indexing( + session: AsyncSession, + connector_id: int, + search_space_id: int, + user_id: str, + start_date: str, + end_date: str, +): + """Runs the Jira indexing task and updates the timestamp.""" + try: + indexed_count, error_message = await index_jira_issues( + session, + 
connector_id, + search_space_id, + user_id, + start_date, + end_date, + update_last_indexed=False, + ) + if error_message: + logger.error( + f"Jira indexing failed for connector {connector_id}: {error_message}" + ) + # Optionally update status in DB to indicate failure + else: + logger.info( + f"Jira indexing successful for connector {connector_id}. Indexed {indexed_count} documents." + ) + # Update the last indexed timestamp only on success + await update_connector_last_indexed(session, connector_id) + await session.commit() # Commit timestamp update + except Exception as e: + logger.error( + f"Critical error in run_jira_indexing for connector {connector_id}: {e}", + exc_info=True, + ) + # Optionally update status in DB to indicate failure diff --git a/surfsense_backend/app/schemas/search_source_connector.py b/surfsense_backend/app/schemas/search_source_connector.py index 1225d54fc..8c444a8fc 100644 --- a/surfsense_backend/app/schemas/search_source_connector.py +++ b/surfsense_backend/app/schemas/search_source_connector.py @@ -1,9 +1,12 @@ -from datetime import datetime import uuid -from typing import Dict, Any, Optional -from pydantic import BaseModel, field_validator, ConfigDict -from .base import IDModel, TimestampModel +from datetime import datetime +from typing import Any, Dict, Optional + from app.db import SearchSourceConnectorType +from pydantic import BaseModel, ConfigDict, field_validator + +from .base import IDModel, TimestampModel + class SearchSourceConnectorBase(BaseModel): name: str @@ -11,102 +14,141 @@ class SearchSourceConnectorBase(BaseModel): is_indexable: bool last_indexed_at: Optional[datetime] = None config: Dict[str, Any] - - @field_validator('config') + + @field_validator("config") @classmethod - def validate_config_for_connector_type(cls, config: Dict[str, Any], values: Dict[str, Any]) -> Dict[str, Any]: - connector_type = values.data.get('connector_type') - + def validate_config_for_connector_type( + cls, config: Dict[str, Any], values: 
Dict[str, Any] + ) -> Dict[str, Any]: + connector_type = values.data.get("connector_type") + if connector_type == SearchSourceConnectorType.SERPER_API: # For SERPER_API, only allow SERPER_API_KEY allowed_keys = ["SERPER_API_KEY"] if set(config.keys()) != set(allowed_keys): - raise ValueError(f"For SERPER_API connector type, config must only contain these keys: {allowed_keys}") - + raise ValueError( + f"For SERPER_API connector type, config must only contain these keys: {allowed_keys}" + ) + # Ensure the API key is not empty if not config.get("SERPER_API_KEY"): raise ValueError("SERPER_API_KEY cannot be empty") - + elif connector_type == SearchSourceConnectorType.TAVILY_API: # For TAVILY_API, only allow TAVILY_API_KEY allowed_keys = ["TAVILY_API_KEY"] if set(config.keys()) != set(allowed_keys): - raise ValueError(f"For TAVILY_API connector type, config must only contain these keys: {allowed_keys}") - + raise ValueError( + f"For TAVILY_API connector type, config must only contain these keys: {allowed_keys}" + ) + # Ensure the API key is not empty if not config.get("TAVILY_API_KEY"): raise ValueError("TAVILY_API_KEY cannot be empty") - + elif connector_type == SearchSourceConnectorType.LINKUP_API: # For LINKUP_API, only allow LINKUP_API_KEY allowed_keys = ["LINKUP_API_KEY"] if set(config.keys()) != set(allowed_keys): - raise ValueError(f"For LINKUP_API connector type, config must only contain these keys: {allowed_keys}") - + raise ValueError( + f"For LINKUP_API connector type, config must only contain these keys: {allowed_keys}" + ) + # Ensure the API key is not empty if not config.get("LINKUP_API_KEY"): raise ValueError("LINKUP_API_KEY cannot be empty") - + elif connector_type == SearchSourceConnectorType.SLACK_CONNECTOR: # For SLACK_CONNECTOR, only allow SLACK_BOT_TOKEN allowed_keys = ["SLACK_BOT_TOKEN"] if set(config.keys()) != set(allowed_keys): - raise ValueError(f"For SLACK_CONNECTOR connector type, config must only contain these keys: {allowed_keys}") + raise 
ValueError( + f"For SLACK_CONNECTOR connector type, config must only contain these keys: {allowed_keys}" + ) # Ensure the bot token is not empty if not config.get("SLACK_BOT_TOKEN"): raise ValueError("SLACK_BOT_TOKEN cannot be empty") - + elif connector_type == SearchSourceConnectorType.NOTION_CONNECTOR: # For NOTION_CONNECTOR, only allow NOTION_INTEGRATION_TOKEN allowed_keys = ["NOTION_INTEGRATION_TOKEN"] if set(config.keys()) != set(allowed_keys): - raise ValueError(f"For NOTION_CONNECTOR connector type, config must only contain these keys: {allowed_keys}") - + raise ValueError( + f"For NOTION_CONNECTOR connector type, config must only contain these keys: {allowed_keys}" + ) + # Ensure the integration token is not empty if not config.get("NOTION_INTEGRATION_TOKEN"): raise ValueError("NOTION_INTEGRATION_TOKEN cannot be empty") - + elif connector_type == SearchSourceConnectorType.GITHUB_CONNECTOR: # For GITHUB_CONNECTOR, only allow GITHUB_PAT and repo_full_names allowed_keys = ["GITHUB_PAT", "repo_full_names"] if set(config.keys()) != set(allowed_keys): - raise ValueError(f"For GITHUB_CONNECTOR connector type, config must only contain these keys: {allowed_keys}") - + raise ValueError( + f"For GITHUB_CONNECTOR connector type, config must only contain these keys: {allowed_keys}" + ) + # Ensure the token is not empty if not config.get("GITHUB_PAT"): raise ValueError("GITHUB_PAT cannot be empty") - + # Ensure the repo_full_names is present and is a non-empty list repo_full_names = config.get("repo_full_names") if not isinstance(repo_full_names, list) or not repo_full_names: raise ValueError("repo_full_names must be a non-empty list of strings") - + elif connector_type == SearchSourceConnectorType.LINEAR_CONNECTOR: # For LINEAR_CONNECTOR, only allow LINEAR_API_KEY allowed_keys = ["LINEAR_API_KEY"] if set(config.keys()) != set(allowed_keys): - raise ValueError(f"For LINEAR_CONNECTOR connector type, config must only contain these keys: {allowed_keys}") - + raise 
ValueError( + f"For LINEAR_CONNECTOR connector type, config must only contain these keys: {allowed_keys}" + ) + # Ensure the token is not empty if not config.get("LINEAR_API_KEY"): raise ValueError("LINEAR_API_KEY cannot be empty") - + elif connector_type == SearchSourceConnectorType.DISCORD_CONNECTOR: # For DISCORD_CONNECTOR, only allow DISCORD_BOT_TOKEN allowed_keys = ["DISCORD_BOT_TOKEN"] if set(config.keys()) != set(allowed_keys): - raise ValueError(f"For DISCORD_CONNECTOR connector type, config must only contain these keys: {allowed_keys}") + raise ValueError( + f"For DISCORD_CONNECTOR connector type, config must only contain these keys: {allowed_keys}" + ) # Ensure the bot token is not empty if not config.get("DISCORD_BOT_TOKEN"): raise ValueError("DISCORD_BOT_TOKEN cannot be empty") + elif connector_type == SearchSourceConnectorType.JIRA_CONNECTOR: + # For JIRA_CONNECTOR, require JIRA_EMAIL, JIRA_API_TOKEN and JIRA_BASE_URL + allowed_keys = ["JIRA_EMAIL", "JIRA_API_TOKEN", "JIRA_BASE_URL"] + if set(config.keys()) != set(allowed_keys): + raise ValueError( + f"For JIRA_CONNECTOR connector type, config must only contain these keys: {allowed_keys}" + ) + + # Ensure the email is not empty + if not config.get("JIRA_EMAIL"): + raise ValueError("JIRA_EMAIL cannot be empty") + + # Ensure the API token is not empty + if not config.get("JIRA_API_TOKEN"): + raise ValueError("JIRA_API_TOKEN cannot be empty") + + # Ensure the base URL is not empty + if not config.get("JIRA_BASE_URL"): + raise ValueError("JIRA_BASE_URL cannot be empty") return config + class SearchSourceConnectorCreate(SearchSourceConnectorBase): pass + class SearchSourceConnectorUpdate(BaseModel): name: Optional[str] = None connector_type: Optional[SearchSourceConnectorType] = None @@ -114,7 +156,8 @@ class SearchSourceConnectorUpdate(BaseModel): last_indexed_at: Optional[datetime] = None config: Optional[Dict[str, Any]] = None + class SearchSourceConnectorRead(SearchSourceConnectorBase, IDModel, 
TimestampModel): user_id: uuid.UUID - model_config = ConfigDict(from_attributes=True) + model_config = ConfigDict(from_attributes=True) diff --git a/surfsense_backend/app/services/connector_service.py b/surfsense_backend/app/services/connector_service.py index f53fd4dfc..b0071ba75 100644 --- a/surfsense_backend/app/services/connector_service.py +++ b/surfsense_backend/app/services/connector_service.py @@ -1,15 +1,21 @@ -from typing import List, Dict, Optional import asyncio -from sqlalchemy.ext.asyncio import AsyncSession -from sqlalchemy.future import select -from app.retriver.chunks_hybrid_search import ChucksHybridSearchRetriever -from app.retriver.documents_hybrid_search import DocumentHybridSearchRetriever -from app.db import SearchSourceConnector, SearchSourceConnectorType, Chunk, Document, SearchSpace -from tavily import TavilyClient -from linkup import LinkupClient -from sqlalchemy import func +from typing import Dict, List, Optional from app.agents.researcher.configuration import SearchMode +from app.db import ( + Chunk, + Document, + SearchSourceConnector, + SearchSourceConnectorType, + SearchSpace, +) +from app.retriver.chunks_hybrid_search import ChucksHybridSearchRetriever +from app.retriver.documents_hybrid_search import DocumentHybridSearchRetriever +from linkup import LinkupClient +from sqlalchemy import func +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.future import select +from tavily import TavilyClient class ConnectorService: @@ -18,9 +24,13 @@ class ConnectorService: self.chunk_retriever = ChucksHybridSearchRetriever(session) self.document_retriever = DocumentHybridSearchRetriever(session) self.user_id = user_id - self.source_id_counter = 100000 # High starting value to avoid collisions with existing IDs - self.counter_lock = asyncio.Lock() # Lock to protect counter in multithreaded environments - + self.source_id_counter = ( + 100000 # High starting value to avoid collisions with existing IDs + ) + self.counter_lock = ( + 
asyncio.Lock() + ) # Lock to protect counter in multithreaded environments + async def initialize_counter(self): """ Initialize the source_id_counter based on the total number of chunks for the user. @@ -38,16 +48,25 @@ class ConnectorService: ) chunk_count = result.scalar() or 0 self.source_id_counter = chunk_count + 1 - print(f"Initialized source_id_counter to {self.source_id_counter} for user {self.user_id}") + print( + f"Initialized source_id_counter to {self.source_id_counter} for user {self.user_id}" + ) except Exception as e: print(f"Error initializing source_id_counter: {str(e)}") # Fallback to default value self.source_id_counter = 1 - - async def search_crawled_urls(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS) -> tuple: + + async def search_crawled_urls( + self, + user_query: str, + user_id: str, + search_space_id: int, + top_k: int = 20, + search_mode: SearchMode = SearchMode.CHUNKS, + ) -> tuple: """ Search for crawled URLs and return both the source information and langchain documents - + Returns: tuple: (sources_info, langchain_documents) """ @@ -57,7 +76,7 @@ class ConnectorService: top_k=top_k, user_id=user_id, search_space_id=search_space_id, - document_type="CRAWLED_URL" + document_type="CRAWLED_URL", ) elif search_mode == SearchMode.DOCUMENTS: crawled_urls_chunks = await self.document_retriever.hybrid_search( @@ -65,7 +84,7 @@ class ConnectorService: top_k=top_k, user_id=user_id, search_space_id=search_space_id, - document_type="CRAWLED_URL" + document_type="CRAWLED_URL", ) # Transform document retriever results to match expected format crawled_urls_chunks = self._transform_document_results(crawled_urls_chunks) @@ -84,20 +103,23 @@ class ConnectorService: async with self.counter_lock: for _i, chunk in enumerate(crawled_urls_chunks): # Extract document metadata - document = chunk.get('document', {}) - metadata = document.get('metadata', {}) + document = 
chunk.get("document", {}) + metadata = document.get("metadata", {}) # Create a source entry source = { - "id": document.get('id', self.source_id_counter), - "title": document.get('title', 'Untitled Document'), - "description": metadata.get('og:description', metadata.get('ogDescription', chunk.get('content', '')[:100])), - "url": metadata.get('url', '') + "id": document.get("id", self.source_id_counter), + "title": document.get("title", "Untitled Document"), + "description": metadata.get( + "og:description", + metadata.get("ogDescription", chunk.get("content", "")[:100]), + ), + "url": metadata.get("url", ""), } self.source_id_counter += 1 sources_list.append(source) - + # Create result object result_object = { "id": 1, @@ -105,13 +127,20 @@ class ConnectorService: "type": "CRAWLED_URL", "sources": sources_list, } - + return result_object, crawled_urls_chunks - - async def search_files(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS) -> tuple: + + async def search_files( + self, + user_query: str, + user_id: str, + search_space_id: int, + top_k: int = 20, + search_mode: SearchMode = SearchMode.CHUNKS, + ) -> tuple: """ Search for files and return both the source information and langchain documents - + Returns: tuple: (sources_info, langchain_documents) """ @@ -121,7 +150,7 @@ class ConnectorService: top_k=top_k, user_id=user_id, search_space_id=search_space_id, - document_type="FILE" + document_type="FILE", ) elif search_mode == SearchMode.DOCUMENTS: files_chunks = await self.document_retriever.hybrid_search( @@ -129,11 +158,11 @@ class ConnectorService: top_k=top_k, user_id=user_id, search_space_id=search_space_id, - document_type="FILE" + document_type="FILE", ) # Transform document retriever results to match expected format files_chunks = self._transform_document_results(files_chunks) - + # Early return if no results if not files_chunks: return { @@ -148,20 +177,23 @@ class ConnectorService: 
async with self.counter_lock: for _i, chunk in enumerate(files_chunks): # Extract document metadata - document = chunk.get('document', {}) - metadata = document.get('metadata', {}) + document = chunk.get("document", {}) + metadata = document.get("metadata", {}) # Create a source entry source = { - "id": document.get('id', self.source_id_counter), - "title": document.get('title', 'Untitled Document'), - "description": metadata.get('og:description', metadata.get('ogDescription', chunk.get('content', '')[:100])), - "url": metadata.get('url', '') + "id": document.get("id", self.source_id_counter), + "title": document.get("title", "Untitled Document"), + "description": metadata.get( + "og:description", + metadata.get("ogDescription", chunk.get("content", "")[:100]), + ), + "url": metadata.get("url", ""), } self.source_id_counter += 1 sources_list.append(source) - + # Create result object result_object = { "id": 2, @@ -169,69 +201,76 @@ class ConnectorService: "type": "FILE", "sources": sources_list, } - + return result_object, files_chunks - + def _transform_document_results(self, document_results: List[Dict]) -> List[Dict]: """ Transform results from document_retriever.hybrid_search() to match the format expected by the processing code. 
- + Args: document_results: Results from document_retriever.hybrid_search() - + Returns: List of transformed results in the format expected by the processing code """ transformed_results = [] for doc in document_results: - transformed_results.append({ - 'document': { - 'id': doc.get('document_id'), - 'title': doc.get('title', 'Untitled Document'), - 'document_type': doc.get('document_type'), - 'metadata': doc.get('metadata', {}), - }, - 'content': doc.get('chunks_content', doc.get('content', '')), - 'score': doc.get('score', 0.0) - }) + transformed_results.append( + { + "document": { + "id": doc.get("document_id"), + "title": doc.get("title", "Untitled Document"), + "document_type": doc.get("document_type"), + "metadata": doc.get("metadata", {}), + }, + "content": doc.get("chunks_content", doc.get("content", "")), + "score": doc.get("score", 0.0), + } + ) return transformed_results - - async def get_connector_by_type(self, user_id: str, connector_type: SearchSourceConnectorType) -> Optional[SearchSourceConnector]: + + async def get_connector_by_type( + self, user_id: str, connector_type: SearchSourceConnectorType + ) -> Optional[SearchSourceConnector]: """ Get a connector by type for a specific user - + Args: user_id: The user's ID connector_type: The connector type to retrieve - + Returns: Optional[SearchSourceConnector]: The connector if found, None otherwise """ result = await self.session.execute( - select(SearchSourceConnector) - .filter( + select(SearchSourceConnector).filter( SearchSourceConnector.user_id == user_id, - SearchSourceConnector.connector_type == connector_type + SearchSourceConnector.connector_type == connector_type, ) ) return result.scalars().first() - - async def search_tavily(self, user_query: str, user_id: str, top_k: int = 20) -> tuple: + + async def search_tavily( + self, user_query: str, user_id: str, top_k: int = 20 + ) -> tuple: """ Search using Tavily API and return both the source information and documents - + Args: user_query: The 
user's query user_id: The user's ID top_k: Maximum number of results to return - + Returns: tuple: (sources_info, documents) """ # Get Tavily connector configuration - tavily_connector = await self.get_connector_by_type(user_id, SearchSourceConnectorType.TAVILY_API) - + tavily_connector = await self.get_connector_by_type( + user_id, SearchSourceConnectorType.TAVILY_API + ) + if not tavily_connector: # Return empty results if no Tavily connector is configured return { @@ -240,22 +279,22 @@ class ConnectorService: "type": "TAVILY_API", "sources": [], }, [] - + # Initialize Tavily client with API key from connector config tavily_api_key = tavily_connector.config.get("TAVILY_API_KEY") tavily_client = TavilyClient(api_key=tavily_api_key) - + # Perform search with Tavily try: response = tavily_client.search( query=user_query, max_results=top_k, - search_depth="advanced" # Use advanced search for better results + search_depth="advanced", # Use advanced search for better results ) - + # Extract results from Tavily response tavily_results = response.get("results", []) - + # Early return if no results if not tavily_results: return { @@ -264,23 +303,22 @@ class ConnectorService: "type": "TAVILY_API", "sources": [], }, [] - + # Process each result and create sources directly without deduplication sources_list = [] documents = [] - + async with self.counter_lock: for i, result in enumerate(tavily_results): - # Create a source entry source = { "id": self.source_id_counter, "title": result.get("title", "Tavily Result"), "description": result.get("content", "")[:100], - "url": result.get("url", "") + "url": result.get("url", ""), } sources_list.append(source) - + # Create a document entry document = { "chunk_id": f"tavily_chunk_{i}", @@ -293,9 +331,9 @@ class ConnectorService: "metadata": { "url": result.get("url", ""), "published_date": result.get("published_date", ""), - "source": "TAVILY_API" - } - } + "source": "TAVILY_API", + }, + }, } documents.append(document) 
self.source_id_counter += 1 @@ -307,9 +345,9 @@ class ConnectorService: "type": "TAVILY_API", "sources": sources_list, } - + return result_object, documents - + except Exception as e: # Log the error and return empty results print(f"Error searching with Tavily: {str(e)}") @@ -319,11 +357,18 @@ class ConnectorService: "type": "TAVILY_API", "sources": [], }, [] - - async def search_slack(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS) -> tuple: + + async def search_slack( + self, + user_query: str, + user_id: str, + search_space_id: int, + top_k: int = 20, + search_mode: SearchMode = SearchMode.CHUNKS, + ) -> tuple: """ Search for slack and return both the source information and langchain documents - + Returns: tuple: (sources_info, langchain_documents) """ @@ -333,7 +378,7 @@ class ConnectorService: top_k=top_k, user_id=user_id, search_space_id=search_space_id, - document_type="SLACK_CONNECTOR" + document_type="SLACK_CONNECTOR", ) elif search_mode == SearchMode.DOCUMENTS: slack_chunks = await self.document_retriever.hybrid_search( @@ -341,11 +386,11 @@ class ConnectorService: top_k=top_k, user_id=user_id, search_space_id=search_space_id, - document_type="SLACK_CONNECTOR" + document_type="SLACK_CONNECTOR", ) # Transform document retriever results to match expected format slack_chunks = self._transform_document_results(slack_chunks) - + # Early return if no results if not slack_chunks: return { @@ -360,31 +405,31 @@ class ConnectorService: async with self.counter_lock: for _i, chunk in enumerate(slack_chunks): # Extract document metadata - document = chunk.get('document', {}) - metadata = document.get('metadata', {}) + document = chunk.get("document", {}) + metadata = document.get("metadata", {}) # Create a mapped source entry with Slack-specific metadata - channel_name = metadata.get('channel_name', 'Unknown Channel') - channel_id = metadata.get('channel_id', '') - message_date = 
metadata.get('start_date', '') - + channel_name = metadata.get("channel_name", "Unknown Channel") + channel_id = metadata.get("channel_id", "") + message_date = metadata.get("start_date", "") + # Create a more descriptive title for Slack messages title = f"Slack: {channel_name}" if message_date: title += f" ({message_date})" - + # Create a more descriptive description for Slack messages - description = chunk.get('content', '')[:100] + description = chunk.get("content", "")[:100] if len(description) == 100: description += "..." - + # For URL, we can use a placeholder or construct a URL to the Slack channel if available url = "" if channel_id: url = f"https://slack.com/app_redirect?channel={channel_id}" source = { - "id": document.get('id', self.source_id_counter), + "id": document.get("id", self.source_id_counter), "title": title, "description": description, "url": url, @@ -392,7 +437,7 @@ class ConnectorService: self.source_id_counter += 1 sources_list.append(source) - + # Create result object result_object = { "id": 4, @@ -400,19 +445,26 @@ class ConnectorService: "type": "SLACK_CONNECTOR", "sources": sources_list, } - + return result_object, slack_chunks - - async def search_notion(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS) -> tuple: + + async def search_notion( + self, + user_query: str, + user_id: str, + search_space_id: int, + top_k: int = 20, + search_mode: SearchMode = SearchMode.CHUNKS, + ) -> tuple: """ Search for Notion pages and return both the source information and langchain documents - + Args: user_query: The user's query user_id: The user's ID search_space_id: The search space ID to search in top_k: Maximum number of results to return - + Returns: tuple: (sources_info, langchain_documents) """ @@ -422,7 +474,7 @@ class ConnectorService: top_k=top_k, user_id=user_id, search_space_id=search_space_id, - document_type="NOTION_CONNECTOR" + document_type="NOTION_CONNECTOR", ) 
elif search_mode == SearchMode.DOCUMENTS: notion_chunks = await self.document_retriever.hybrid_search( @@ -430,11 +482,11 @@ class ConnectorService: top_k=top_k, user_id=user_id, search_space_id=search_space_id, - document_type="NOTION_CONNECTOR" + document_type="NOTION_CONNECTOR", ) # Transform document retriever results to match expected format notion_chunks = self._transform_document_results(notion_chunks) - + # Early return if no results if not notion_chunks: return { @@ -449,24 +501,24 @@ class ConnectorService: async with self.counter_lock: for _i, chunk in enumerate(notion_chunks): # Extract document metadata - document = chunk.get('document', {}) - metadata = document.get('metadata', {}) + document = chunk.get("document", {}) + metadata = document.get("metadata", {}) # Create a mapped source entry with Notion-specific metadata - page_title = metadata.get('page_title', 'Untitled Page') - page_id = metadata.get('page_id', '') - indexed_at = metadata.get('indexed_at', '') - + page_title = metadata.get("page_title", "Untitled Page") + page_id = metadata.get("page_id", "") + indexed_at = metadata.get("indexed_at", "") + # Create a more descriptive title for Notion pages title = f"Notion: {page_title}" if indexed_at: title += f" (indexed: {indexed_at})" - + # Create a more descriptive description for Notion pages - description = chunk.get('content', '')[:100] + description = chunk.get("content", "")[:100] if len(description) == 100: description += "..." 
- + # For URL, we can use a placeholder or construct a URL to the Notion page if available url = "" if page_id: @@ -474,7 +526,7 @@ class ConnectorService: url = f"https://notion.so/{page_id.replace('-', '')}" source = { - "id": document.get('id', self.source_id_counter), + "id": document.get("id", self.source_id_counter), "title": title, "description": description, "url": url, @@ -482,7 +534,7 @@ class ConnectorService: self.source_id_counter += 1 sources_list.append(source) - + # Create result object result_object = { "id": 5, @@ -490,19 +542,26 @@ class ConnectorService: "type": "NOTION_CONNECTOR", "sources": sources_list, } - + return result_object, notion_chunks - - async def search_extension(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS) -> tuple: + + async def search_extension( + self, + user_query: str, + user_id: str, + search_space_id: int, + top_k: int = 20, + search_mode: SearchMode = SearchMode.CHUNKS, + ) -> tuple: """ Search for extension data and return both the source information and langchain documents - + Args: user_query: The user's query user_id: The user's ID search_space_id: The search space ID to search in top_k: Maximum number of results to return - + Returns: tuple: (sources_info, langchain_documents) """ @@ -512,7 +571,7 @@ class ConnectorService: top_k=top_k, user_id=user_id, search_space_id=search_space_id, - document_type="EXTENSION" + document_type="EXTENSION", ) elif search_mode == SearchMode.DOCUMENTS: extension_chunks = await self.document_retriever.hybrid_search( @@ -520,7 +579,7 @@ class ConnectorService: top_k=top_k, user_id=user_id, search_space_id=search_space_id, - document_type="EXTENSION" + document_type="EXTENSION", ) # Transform document retriever results to match expected format extension_chunks = self._transform_document_results(extension_chunks) @@ -539,33 +598,39 @@ class ConnectorService: async with self.counter_lock: for i, chunk in 
enumerate(extension_chunks): # Extract document metadata - document = chunk.get('document', {}) - metadata = document.get('metadata', {}) + document = chunk.get("document", {}) + metadata = document.get("metadata", {}) # Extract extension-specific metadata - webpage_title = metadata.get('VisitedWebPageTitle', 'Untitled Page') - webpage_url = metadata.get('VisitedWebPageURL', '') - visit_date = metadata.get('VisitedWebPageDateWithTimeInISOString', '') - visit_duration = metadata.get('VisitedWebPageVisitDurationInMilliseconds', '') - browsing_session_id = metadata.get('BrowsingSessionId', '') - + webpage_title = metadata.get("VisitedWebPageTitle", "Untitled Page") + webpage_url = metadata.get("VisitedWebPageURL", "") + visit_date = metadata.get("VisitedWebPageDateWithTimeInISOString", "") + visit_duration = metadata.get( + "VisitedWebPageVisitDurationInMilliseconds", "" + ) + browsing_session_id = metadata.get("BrowsingSessionId", "") + # Create a more descriptive title for extension data title = webpage_title if visit_date: # Format the date for display (simplified) try: # Just extract the date part for display - formatted_date = visit_date.split('T')[0] if 'T' in visit_date else visit_date + formatted_date = ( + visit_date.split("T")[0] + if "T" in visit_date + else visit_date + ) title += f" (visited: {formatted_date})" except: # Fallback if date parsing fails title += f" (visited: {visit_date})" - + # Create a more descriptive description for extension data - description = chunk.get('content', '')[:100] + description = chunk.get("content", "")[:100] if len(description) == 100: description += "..." 
- + # Add visit duration if available if visit_duration: try: @@ -573,8 +638,8 @@ class ConnectorService: if duration_seconds < 60: duration_text = f"{duration_seconds:.1f} seconds" else: - duration_text = f"{duration_seconds/60:.1f} minutes" - + duration_text = f"{duration_seconds / 60:.1f} minutes" + if description: description += f" | Duration: {duration_text}" except: @@ -582,15 +647,15 @@ class ConnectorService: pass source = { - "id": document.get('id', self.source_id_counter), + "id": document.get("id", self.source_id_counter), "title": title, "description": description, - "url": webpage_url + "url": webpage_url, } self.source_id_counter += 1 sources_list.append(source) - + # Create result object result_object = { "id": 6, @@ -598,19 +663,26 @@ class ConnectorService: "type": "EXTENSION", "sources": sources_list, } - + return result_object, extension_chunks - - async def search_youtube(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS) -> tuple: + + async def search_youtube( + self, + user_query: str, + user_id: str, + search_space_id: int, + top_k: int = 20, + search_mode: SearchMode = SearchMode.CHUNKS, + ) -> tuple: """ Search for YouTube videos and return both the source information and langchain documents - + Args: user_query: The user's query user_id: The user's ID search_space_id: The search space ID to search in top_k: Maximum number of results to return - + Returns: tuple: (sources_info, langchain_documents) """ @@ -620,7 +692,7 @@ class ConnectorService: top_k=top_k, user_id=user_id, search_space_id=search_space_id, - document_type="YOUTUBE_VIDEO" + document_type="YOUTUBE_VIDEO", ) elif search_mode == SearchMode.DOCUMENTS: youtube_chunks = await self.document_retriever.hybrid_search( @@ -628,11 +700,11 @@ class ConnectorService: top_k=top_k, user_id=user_id, search_space_id=search_space_id, - document_type="YOUTUBE_VIDEO" + document_type="YOUTUBE_VIDEO", ) # Transform document 
retriever results to match expected format youtube_chunks = self._transform_document_results(youtube_chunks) - + # Early return if no results if not youtube_chunks: return { @@ -647,40 +719,42 @@ class ConnectorService: async with self.counter_lock: for _i, chunk in enumerate(youtube_chunks): # Extract document metadata - document = chunk.get('document', {}) - metadata = document.get('metadata', {}) + document = chunk.get("document", {}) + metadata = document.get("metadata", {}) # Extract YouTube-specific metadata - video_title = metadata.get('video_title', 'Untitled Video') - video_id = metadata.get('video_id', '') - channel_name = metadata.get('channel_name', '') + video_title = metadata.get("video_title", "Untitled Video") + video_id = metadata.get("video_id", "") + channel_name = metadata.get("channel_name", "") # published_date = metadata.get('published_date', '') - + # Create a more descriptive title for YouTube videos title = video_title if channel_name: title += f" - {channel_name}" - + # Create a more descriptive description for YouTube videos - description = metadata.get('description', chunk.get('content', '')[:100]) + description = metadata.get( + "description", chunk.get("content", "")[:100] + ) if len(description) == 100: description += "..." 
- + # For URL, construct a URL to the YouTube video url = f"https://www.youtube.com/watch?v={video_id}" if video_id else "" source = { - "id": document.get('id', self.source_id_counter), + "id": document.get("id", self.source_id_counter), "title": title, "description": description, "url": url, "video_id": video_id, # Additional field for YouTube videos - "channel_name": channel_name # Additional field for YouTube videos + "channel_name": channel_name, # Additional field for YouTube videos } self.source_id_counter += 1 sources_list.append(source) - + # Create result object result_object = { "id": 7, # Assign a unique ID for the YouTube connector @@ -688,13 +762,20 @@ class ConnectorService: "type": "YOUTUBE_VIDEO", "sources": sources_list, } - + return result_object, youtube_chunks - async def search_github(self, user_query: str, user_id: int, search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS) -> tuple: + async def search_github( + self, + user_query: str, + user_id: int, + search_space_id: int, + top_k: int = 20, + search_mode: SearchMode = SearchMode.CHUNKS, + ) -> tuple: """ Search for GitHub documents and return both the source information and langchain documents - + Returns: tuple: (sources_info, langchain_documents) """ @@ -704,7 +785,7 @@ class ConnectorService: top_k=top_k, user_id=user_id, search_space_id=search_space_id, - document_type="GITHUB_CONNECTOR" + document_type="GITHUB_CONNECTOR", ) elif search_mode == SearchMode.DOCUMENTS: github_chunks = await self.document_retriever.hybrid_search( @@ -712,11 +793,11 @@ class ConnectorService: top_k=top_k, user_id=user_id, search_space_id=search_space_id, - document_type="GITHUB_CONNECTOR" + document_type="GITHUB_CONNECTOR", ) # Transform document retriever results to match expected format github_chunks = self._transform_document_results(github_chunks) - + # Early return if no results if not github_chunks: return { @@ -731,20 +812,24 @@ class ConnectorService: async with 
self.counter_lock: for _i, chunk in enumerate(github_chunks): # Extract document metadata - document = chunk.get('document', {}) - metadata = document.get('metadata', {}) + document = chunk.get("document", {}) + metadata = document.get("metadata", {}) # Create a source entry source = { - "id": document.get('id', self.source_id_counter), - "title": document.get('title', 'GitHub Document'), # Use specific title if available - "description": metadata.get('description', chunk.get('content', '')[:100]), # Use description or content preview - "url": metadata.get('url', '') # Use URL if available in metadata + "id": document.get("id", self.source_id_counter), + "title": document.get( + "title", "GitHub Document" + ), # Use specific title if available + "description": metadata.get( + "description", chunk.get("content", "")[:100] + ), # Use description or content preview + "url": metadata.get("url", ""), # Use URL if available in metadata } self.source_id_counter += 1 sources_list.append(source) - + # Create result object result_object = { "id": 8, @@ -752,19 +837,26 @@ class ConnectorService: "type": "GITHUB_CONNECTOR", "sources": sources_list, } - + return result_object, github_chunks - async def search_linear(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS) -> tuple: + async def search_linear( + self, + user_query: str, + user_id: str, + search_space_id: int, + top_k: int = 20, + search_mode: SearchMode = SearchMode.CHUNKS, + ) -> tuple: """ Search for Linear issues and comments and return both the source information and langchain documents - + Args: user_query: The user's query user_id: The user's ID search_space_id: The search space ID to search in top_k: Maximum number of results to return - + Returns: tuple: (sources_info, langchain_documents) """ @@ -774,7 +866,7 @@ class ConnectorService: top_k=top_k, user_id=user_id, search_space_id=search_space_id, - document_type="LINEAR_CONNECTOR" + 
document_type="LINEAR_CONNECTOR", ) elif search_mode == SearchMode.DOCUMENTS: linear_chunks = await self.document_retriever.hybrid_search( @@ -782,7 +874,7 @@ class ConnectorService: top_k=top_k, user_id=user_id, search_space_id=search_space_id, - document_type="LINEAR_CONNECTOR" + document_type="LINEAR_CONNECTOR", ) # Transform document retriever results to match expected format linear_chunks = self._transform_document_results(linear_chunks) @@ -801,32 +893,32 @@ class ConnectorService: async with self.counter_lock: for _i, chunk in enumerate(linear_chunks): # Extract document metadata - document = chunk.get('document', {}) - metadata = document.get('metadata', {}) + document = chunk.get("document", {}) + metadata = document.get("metadata", {}) # Extract Linear-specific metadata - issue_identifier = metadata.get('issue_identifier', '') - issue_title = metadata.get('issue_title', 'Untitled Issue') - issue_state = metadata.get('state', '') - comment_count = metadata.get('comment_count', 0) - + issue_identifier = metadata.get("issue_identifier", "") + issue_title = metadata.get("issue_title", "Untitled Issue") + issue_state = metadata.get("state", "") + comment_count = metadata.get("comment_count", 0) + # Create a more descriptive title for Linear issues title = f"Linear: {issue_identifier} - {issue_title}" if issue_state: title += f" ({issue_state})" - + # Create a more descriptive description for Linear issues - description = chunk.get('content', '')[:100] + description = chunk.get("content", "")[:100] if len(description) == 100: description += "..." 
- + # Add comment count info to description if comment_count: if description: description += f" | Comments: {comment_count}" else: description = f"Comments: {comment_count}" - + # For URL, we could construct a URL to the Linear issue if we have the workspace info # For now, use a generic placeholder url = "" @@ -835,18 +927,18 @@ class ConnectorService: url = f"https://linear.app/issue/{issue_identifier}" source = { - "id": document.get('id', self.source_id_counter), + "id": document.get("id", self.source_id_counter), "title": title, "description": description, "url": url, "issue_identifier": issue_identifier, "state": issue_state, - "comment_count": comment_count + "comment_count": comment_count, } self.source_id_counter += 1 sources_list.append(source) - + # Create result object result_object = { "id": 9, # Assign a unique ID for the Linear connector @@ -854,24 +946,149 @@ class ConnectorService: "type": "LINEAR_CONNECTOR", "sources": sources_list, } - + return result_object, linear_chunks - async def search_linkup(self, user_query: str, user_id: str, mode: str = "standard") -> tuple: + async def search_jira( + self, + user_query: str, + user_id: str, + search_space_id: int, + top_k: int = 20, + search_mode: SearchMode = SearchMode.CHUNKS, + ) -> tuple: + """ + Search for Jira issues and comments and return both the source information and langchain documents + + Args: + user_query: The user's query + user_id: The user's ID + search_space_id: The search space ID to search in + top_k: Maximum number of results to return + search_mode: Search mode (CHUNKS or DOCUMENTS) + + Returns: + tuple: (sources_info, langchain_documents) + """ + if search_mode == SearchMode.CHUNKS: + jira_chunks = await self.chunk_retriever.hybrid_search( + query_text=user_query, + top_k=top_k, + user_id=user_id, + search_space_id=search_space_id, + document_type="JIRA_CONNECTOR", + ) + elif search_mode == SearchMode.DOCUMENTS: + jira_chunks = await self.document_retriever.hybrid_search( + 
query_text=user_query, + top_k=top_k, + user_id=user_id, + search_space_id=search_space_id, + document_type="JIRA_CONNECTOR", + ) + # Transform document retriever results to match expected format + jira_chunks = self._transform_document_results(jira_chunks) + + # Early return if no results + if not jira_chunks: + return { + "id": 30, + "name": "Jira Issues", + "type": "JIRA_CONNECTOR", + "sources": [], + }, [] + + # Process each chunk and create sources directly without deduplication + sources_list = [] + async with self.counter_lock: + for _i, chunk in enumerate(jira_chunks): + # Extract document metadata + document = chunk.get("document", {}) + metadata = document.get("metadata", {}) + + # Extract Jira-specific metadata + issue_key = metadata.get("issue_key", "") + issue_title = metadata.get("issue_title", "Untitled Issue") + status = metadata.get("status", "") + priority = metadata.get("priority", "") + issue_type = metadata.get("issue_type", "") + comment_count = metadata.get("comment_count", 0) + + # Create a more descriptive title for Jira issues + title = f"Jira: {issue_key} - {issue_title}" + if status: + title += f" ({status})" + + # Create a more descriptive description for Jira issues + description = chunk.get("content", "")[:100] + if len(description) == 100: + description += "..." 
+ + # Add priority and type info to description + info_parts = [] + if priority: + info_parts.append(f"Priority: {priority}") + if issue_type: + info_parts.append(f"Type: {issue_type}") + if comment_count: + info_parts.append(f"Comments: {comment_count}") + + if info_parts: + if description: + description += f" | {' | '.join(info_parts)}" + else: + description = " | ".join(info_parts) + + # For URL, we could construct a URL to the Jira issue if we have the base URL + # For now, use a generic placeholder + url = "" + if issue_key and metadata.get("base_url"): + url = f"{metadata.get('base_url')}/browse/{issue_key}" + + source = { + "id": document.get("id", self.source_id_counter), + "title": title, + "description": description, + "url": url, + "issue_key": issue_key, + "status": status, + "priority": priority, + "issue_type": issue_type, + "comment_count": comment_count, + } + + self.source_id_counter += 1 + sources_list.append(source) + + # Create result object + result_object = { + "id": 10, # Assign a unique ID for the Jira connector + "name": "Jira Issues", + "type": "JIRA_CONNECTOR", + "sources": sources_list, + } + + return result_object, jira_chunks + + async def search_linkup( + self, user_query: str, user_id: str, mode: str = "standard" + ) -> tuple: """ Search using Linkup API and return both the source information and documents - + Args: user_query: The user's query user_id: The user's ID mode: Search depth mode, can be "standard" or "deep" - + Returns: tuple: (sources_info, documents) """ # Get Linkup connector configuration - linkup_connector = await self.get_connector_by_type(user_id, SearchSourceConnectorType.LINKUP_API) - + linkup_connector = await self.get_connector_by_type( + user_id, SearchSourceConnectorType.LINKUP_API + ) + if not linkup_connector: # Return empty results if no Linkup connector is configured return { @@ -880,11 +1097,11 @@ class ConnectorService: "type": "LINKUP_API", "sources": [], }, [] - + # Initialize Linkup client with API 
key from connector config linkup_api_key = linkup_connector.config.get("LINKUP_API_KEY") linkup_client = LinkupClient(api_key=linkup_api_key) - + # Perform search with Linkup try: response = linkup_client.search( @@ -892,10 +1109,10 @@ class ConnectorService: depth=mode, # Use the provided mode ("standard" or "deep") output_type="searchResults", # Default to search results ) - + # Extract results from Linkup response - access as attribute instead of using .get() - linkup_results = response.results if hasattr(response, 'results') else [] - + linkup_results = response.results if hasattr(response, "results") else [] + # Only proceed if we have results if not linkup_results: return { @@ -904,41 +1121,49 @@ class ConnectorService: "type": "LINKUP_API", "sources": [], }, [] - + # Process each result and create sources directly without deduplication sources_list = [] documents = [] - + async with self.counter_lock: for i, result in enumerate(linkup_results): # Only process results that have content - if not hasattr(result, 'content') or not result.content: + if not hasattr(result, "content") or not result.content: continue - + # Create a source entry source = { "id": self.source_id_counter, - "title": result.name if hasattr(result, 'name') else "Linkup Result", - "description": result.content[:100] if hasattr(result, 'content') else "", - "url": result.url if hasattr(result, 'url') else "" + "title": ( + result.name if hasattr(result, "name") else "Linkup Result" + ), + "description": ( + result.content[:100] if hasattr(result, "content") else "" + ), + "url": result.url if hasattr(result, "url") else "", } sources_list.append(source) - + # Create a document entry document = { "chunk_id": f"linkup_chunk_{i}", - "content": result.content if hasattr(result, 'content') else "", + "content": result.content if hasattr(result, "content") else "", "score": 1.0, # Default score since not provided by Linkup "document": { "id": self.source_id_counter, - "title": result.name if 
hasattr(result, 'name') else "Linkup Result", + "title": ( + result.name + if hasattr(result, "name") + else "Linkup Result" + ), "document_type": "LINKUP_API", "metadata": { - "url": result.url if hasattr(result, 'url') else "", - "type": result.type if hasattr(result, 'type') else "", - "source": "LINKUP_API" - } - } + "url": result.url if hasattr(result, "url") else "", + "type": result.type if hasattr(result, "type") else "", + "source": "LINKUP_API", + }, + }, } documents.append(document) self.source_id_counter += 1 @@ -950,9 +1175,9 @@ class ConnectorService: "type": "LINKUP_API", "sources": sources_list, } - + return result_object, documents - + except Exception as e: # Log the error and return empty results print(f"Error searching with Linkup: {str(e)}") @@ -962,17 +1187,24 @@ class ConnectorService: "type": "LINKUP_API", "sources": [], }, [] - - async def search_discord(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS) -> tuple: + + async def search_discord( + self, + user_query: str, + user_id: str, + search_space_id: int, + top_k: int = 20, + search_mode: SearchMode = SearchMode.CHUNKS, + ) -> tuple: """ Search for Discord messages and return both the source information and langchain documents - + Args: user_query: The user's query user_id: The user's ID search_space_id: The search space ID to search in top_k: Maximum number of results to return - + Returns: tuple: (sources_info, langchain_documents) """ @@ -982,7 +1214,7 @@ class ConnectorService: top_k=top_k, user_id=user_id, search_space_id=search_space_id, - document_type="DISCORD_CONNECTOR" + document_type="DISCORD_CONNECTOR", ) elif search_mode == SearchMode.DOCUMENTS: discord_chunks = await self.document_retriever.hybrid_search( @@ -990,11 +1222,11 @@ class ConnectorService: top_k=top_k, user_id=user_id, search_space_id=search_space_id, - document_type="DISCORD_CONNECTOR" + document_type="DISCORD_CONNECTOR", ) # Transform 
document retriever results to match expected format discord_chunks = self._transform_document_results(discord_chunks) - + # Early return if no results if not discord_chunks: return { @@ -1009,26 +1241,26 @@ class ConnectorService: async with self.counter_lock: for i, chunk in enumerate(discord_chunks): # Extract document metadata - document = chunk.get('document', {}) - metadata = document.get('metadata', {}) + document = chunk.get("document", {}) + metadata = document.get("metadata", {}) # Create a mapped source entry with Discord-specific metadata - channel_name = metadata.get('channel_name', 'Unknown Channel') - channel_id = metadata.get('channel_id', '') - message_date = metadata.get('start_date', '') - + channel_name = metadata.get("channel_name", "Unknown Channel") + channel_id = metadata.get("channel_id", "") + message_date = metadata.get("start_date", "") + # Create a more descriptive title for Discord messages title = f"Discord: {channel_name}" if message_date: title += f" ({message_date})" - + # Create a more descriptive description for Discord messages - description = chunk.get('content', '')[:100] + description = chunk.get("content", "")[:100] if len(description) == 100: description += "..." 
- + url = "" - guild_id = metadata.get('guild_id', '') + guild_id = metadata.get("guild_id", "") if guild_id and channel_id: url = f"https://discord.com/channels/{guild_id}/{channel_id}" elif channel_id: @@ -1036,7 +1268,7 @@ class ConnectorService: url = f"https://discord.com/channels/@me/{channel_id}" source = { - "id": document.get('id', self.source_id_counter), + "id": document.get("id", self.source_id_counter), "title": title, "description": description, "url": url, @@ -1044,7 +1276,7 @@ class ConnectorService: self.source_id_counter += 1 sources_list.append(source) - + # Create result object result_object = { "id": 11, @@ -1052,7 +1284,5 @@ class ConnectorService: "type": "DISCORD_CONNECTOR", "sources": sources_list, } - + return result_object, discord_chunks - - diff --git a/surfsense_backend/app/tasks/connectors_indexing_tasks.py b/surfsense_backend/app/tasks/connectors_indexing_tasks.py index e0b3cd1e0..fae64178f 100644 --- a/surfsense_backend/app/tasks/connectors_indexing_tasks.py +++ b/surfsense_backend/app/tasks/connectors_indexing_tasks.py @@ -1,27 +1,35 @@ -from typing import Optional, Tuple -from sqlalchemy.ext.asyncio import AsyncSession -from sqlalchemy.exc import SQLAlchemyError -from sqlalchemy.future import select +import asyncio +import logging from datetime import datetime, timedelta, timezone -from app.db import Document, DocumentType, Chunk, SearchSourceConnector, SearchSourceConnectorType, SearchSpace +from typing import Optional, Tuple + from app.config import config +from app.connectors.discord_connector import DiscordConnector +from app.connectors.github_connector import GitHubConnector +from app.connectors.jira_connector import JiraConnector +from app.connectors.linear_connector import LinearConnector +from app.connectors.notion_history import NotionHistoryConnector +from app.connectors.slack_history import SlackHistory +from app.db import ( + Chunk, + Document, + DocumentType, + SearchSourceConnector, + SearchSourceConnectorType, +) 
from app.prompts import SUMMARY_PROMPT_TEMPLATE from app.services.llm_service import get_user_long_context_llm from app.services.task_logging_service import TaskLoggingService -from app.connectors.slack_history import SlackHistory -from app.connectors.notion_history import NotionHistoryConnector -from app.connectors.github_connector import GitHubConnector -from app.connectors.linear_connector import LinearConnector -from app.connectors.discord_connector import DiscordConnector -from slack_sdk.errors import SlackApiError -import logging -import asyncio - from app.utils.document_converters import generate_content_hash +from slack_sdk.errors import SlackApiError +from sqlalchemy.exc import SQLAlchemyError +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.future import select # Set up logging logger = logging.getLogger(__name__) + async def index_slack_messages( session: AsyncSession, connector_id: int, @@ -29,56 +37,64 @@ async def index_slack_messages( user_id: str, start_date: str = None, end_date: str = None, - update_last_indexed: bool = True + update_last_indexed: bool = True, ) -> Tuple[int, Optional[str]]: """ Index Slack messages from all accessible channels. 
- + Args: session: Database session connector_id: ID of the Slack connector search_space_id: ID of the search space to store documents in update_last_indexed: Whether to update the last_indexed_at timestamp (default: True) - + Returns: Tuple containing (number of documents indexed, error message or None) """ task_logger = TaskLoggingService(session, search_space_id) - + # Log task start log_entry = await task_logger.log_task_start( task_name="slack_messages_indexing", source="connector_indexing_task", message=f"Starting Slack messages indexing for connector {connector_id}", - metadata={"connector_id": connector_id, "user_id": str(user_id), "start_date": start_date, "end_date": end_date} + metadata={ + "connector_id": connector_id, + "user_id": str(user_id), + "start_date": start_date, + "end_date": end_date, + }, ) - + try: # Get the connector await task_logger.log_task_progress( log_entry, f"Retrieving Slack connector {connector_id} from database", - {"stage": "connector_retrieval"} + {"stage": "connector_retrieval"}, ) - + result = await session.execute( - select(SearchSourceConnector) - .filter( + select(SearchSourceConnector).filter( SearchSourceConnector.id == connector_id, - SearchSourceConnector.connector_type == SearchSourceConnectorType.SLACK_CONNECTOR + SearchSourceConnector.connector_type + == SearchSourceConnectorType.SLACK_CONNECTOR, ) ) connector = result.scalars().first() - + if not connector: await task_logger.log_task_failure( log_entry, f"Connector with ID {connector_id} not found or is not a Slack connector", "Connector not found", - {"error_type": "ConnectorNotFound"} + {"error_type": "ConnectorNotFound"}, ) - return 0, f"Connector with ID {connector_id} not found or is not a Slack connector" - + return ( + 0, + f"Connector with ID {connector_id} not found or is not a Slack connector", + ) + # Get the Slack token from the connector config slack_token = connector.config.get("SLACK_BOT_TOKEN") if not slack_token: @@ -86,62 +102,86 @@ async def 
index_slack_messages( log_entry, f"Slack token not found in connector config for connector {connector_id}", "Missing Slack token", - {"error_type": "MissingToken"} + {"error_type": "MissingToken"}, ) return 0, "Slack token not found in connector config" - + # Initialize Slack client await task_logger.log_task_progress( log_entry, f"Initializing Slack client for connector {connector_id}", - {"stage": "client_initialization"} + {"stage": "client_initialization"}, ) - + slack_client = SlackHistory(token=slack_token) - + # Calculate date range await task_logger.log_task_progress( log_entry, - f"Calculating date range for Slack indexing", - {"stage": "date_calculation", "provided_start_date": start_date, "provided_end_date": end_date} + "Calculating date range for Slack indexing", + { + "stage": "date_calculation", + "provided_start_date": start_date, + "provided_end_date": end_date, + }, ) - + if start_date is None or end_date is None: # Fall back to calculating dates based on last_indexed_at calculated_end_date = datetime.now() - + # Use last_indexed_at as start date if available, otherwise use 365 days ago if connector.last_indexed_at: # Convert dates to be comparable (both timezone-naive) - last_indexed_naive = connector.last_indexed_at.replace(tzinfo=None) if connector.last_indexed_at.tzinfo else connector.last_indexed_at - + last_indexed_naive = ( + connector.last_indexed_at.replace(tzinfo=None) + if connector.last_indexed_at.tzinfo + else connector.last_indexed_at + ) + # Check if last_indexed_at is in the future or after end_date if last_indexed_naive > calculated_end_date: - logger.warning(f"Last indexed date ({last_indexed_naive.strftime('%Y-%m-%d')}) is in the future. Using 365 days ago instead.") + logger.warning( + f"Last indexed date ({last_indexed_naive.strftime('%Y-%m-%d')}) is in the future. Using 365 days ago instead." 
+ ) calculated_start_date = calculated_end_date - timedelta(days=365) else: calculated_start_date = last_indexed_naive - logger.info(f"Using last_indexed_at ({calculated_start_date.strftime('%Y-%m-%d')}) as start date") + logger.info( + f"Using last_indexed_at ({calculated_start_date.strftime('%Y-%m-%d')}) as start date" + ) else: - calculated_start_date = calculated_end_date - timedelta(days=365) # Use 365 days as default - logger.info(f"No last_indexed_at found, using {calculated_start_date.strftime('%Y-%m-%d')} (365 days ago) as start date") - + calculated_start_date = calculated_end_date - timedelta( + days=365 + ) # Use 365 days as default + logger.info( + f"No last_indexed_at found, using {calculated_start_date.strftime('%Y-%m-%d')} (365 days ago) as start date" + ) + # Use calculated dates if not provided - start_date_str = start_date if start_date else calculated_start_date.strftime("%Y-%m-%d") - end_date_str = end_date if end_date else calculated_end_date.strftime("%Y-%m-%d") + start_date_str = ( + start_date if start_date else calculated_start_date.strftime("%Y-%m-%d") + ) + end_date_str = ( + end_date if end_date else calculated_end_date.strftime("%Y-%m-%d") + ) else: # Use provided dates start_date_str = start_date end_date_str = end_date - + logger.info(f"Indexing Slack messages from {start_date_str} to {end_date_str}") - + await task_logger.log_task_progress( log_entry, f"Fetching Slack channels from {start_date_str} to {end_date_str}", - {"stage": "fetch_channels", "start_date": start_date_str, "end_date": end_date_str} + { + "stage": "fetch_channels", + "start_date": start_date_str, + "end_date": end_date_str, + }, ) - + # Get all channels try: channels = slack_client.get_all_channels() @@ -150,133 +190,162 @@ async def index_slack_messages( log_entry, f"Failed to get Slack channels for connector {connector_id}", str(e), - {"error_type": "ChannelFetchError"} + {"error_type": "ChannelFetchError"}, ) return 0, f"Failed to get Slack channels: {str(e)}" 
- + if not channels: await task_logger.log_task_success( log_entry, f"No Slack channels found for connector {connector_id}", - {"channels_found": 0} + {"channels_found": 0}, ) return 0, "No Slack channels found" - + # Track the number of documents indexed documents_indexed = 0 documents_skipped = 0 skipped_channels = [] - + await task_logger.log_task_progress( log_entry, f"Starting to process {len(channels)} Slack channels", - {"stage": "process_channels", "total_channels": len(channels)} + {"stage": "process_channels", "total_channels": len(channels)}, ) - + # Process each channel - for channel_obj in channels: # Modified loop to iterate over list of channel objects + for ( + channel_obj + ) in channels: # Modified loop to iterate over list of channel objects channel_id = channel_obj["id"] channel_name = channel_obj["name"] is_private = channel_obj["is_private"] - is_member = channel_obj["is_member"] # This might be False for public channels too + is_member = channel_obj[ + "is_member" + ] # This might be False for public channels too try: # If it's a private channel and the bot is not a member, skip. # For public channels, if they are listed by conversations.list, the bot can typically read history. # The `not_in_channel` error in get_conversation_history will be the ultimate gatekeeper if history is inaccessible. if is_private and not is_member: - logger.warning(f"Bot is not a member of private channel {channel_name} ({channel_id}). Skipping.") - skipped_channels.append(f"{channel_name} (private, bot not a member)") + logger.warning( + f"Bot is not a member of private channel {channel_name} ({channel_id}). Skipping." + ) + skipped_channels.append( + f"{channel_name} (private, bot not a member)" + ) documents_skipped += 1 continue - + # Get messages for this channel - # The get_history_by_date_range now uses get_conversation_history, + # The get_history_by_date_range now uses get_conversation_history, # which handles 'not_in_channel' by returning [] and logging. 
messages, error = slack_client.get_history_by_date_range( channel_id=channel_id, start_date=start_date_str, end_date=end_date_str, - limit=1000 # Limit to 1000 messages per channel + limit=1000, # Limit to 1000 messages per channel ) - + if error: - logger.warning(f"Error getting messages from channel {channel_name}: {error}") + logger.warning( + f"Error getting messages from channel {channel_name}: {error}" + ) skipped_channels.append(f"{channel_name} (error: {error})") documents_skipped += 1 continue # Skip this channel if there's an error - + if not messages: - logger.info(f"No messages found in channel {channel_name} for the specified date range.") + logger.info( + f"No messages found in channel {channel_name} for the specified date range." + ) documents_skipped += 1 continue # Skip if no messages - + # Format messages with user info formatted_messages = [] for msg in messages: # Skip bot messages and system messages - if msg.get("subtype") in ["bot_message", "channel_join", "channel_leave"]: + if msg.get("subtype") in [ + "bot_message", + "channel_join", + "channel_leave", + ]: continue - - formatted_msg = slack_client.format_message(msg, include_user_info=True) + + formatted_msg = slack_client.format_message( + msg, include_user_info=True + ) formatted_messages.append(formatted_msg) - + if not formatted_messages: - logger.info(f"No valid messages found in channel {channel_name} after filtering.") + logger.info( + f"No valid messages found in channel {channel_name} after filtering." 
+ ) documents_skipped += 1 continue # Skip if no valid messages after filtering - + # Convert messages to markdown format channel_content = f"# Slack Channel: {channel_name}\n\n" - + for msg in formatted_messages: user_name = msg.get("user_name", "Unknown User") timestamp = msg.get("datetime", "Unknown Time") text = msg.get("text", "") - - channel_content += f"## {user_name} ({timestamp})\n\n{text}\n\n---\n\n" - + + channel_content += ( + f"## {user_name} ({timestamp})\n\n{text}\n\n---\n\n" + ) + # Format document metadata metadata_sections = [ - ("METADATA", [ - f"CHANNEL_NAME: {channel_name}", - f"CHANNEL_ID: {channel_id}", - # f"START_DATE: {start_date_str}", - # f"END_DATE: {end_date_str}", - f"MESSAGE_COUNT: {len(formatted_messages)}" - ]), - ("CONTENT", [ - "FORMAT: markdown", - "TEXT_START", - channel_content, - "TEXT_END" - ]) + ( + "METADATA", + [ + f"CHANNEL_NAME: {channel_name}", + f"CHANNEL_ID: {channel_id}", + # f"START_DATE: {start_date_str}", + # f"END_DATE: {end_date_str}", + f"MESSAGE_COUNT: {len(formatted_messages)}", + ], + ), + ( + "CONTENT", + ["FORMAT: markdown", "TEXT_START", channel_content, "TEXT_END"], + ), ] - + # Build the document string document_parts = [] document_parts.append("") - + for section_title, section_content in metadata_sections: document_parts.append(f"<{section_title}>") document_parts.extend(section_content) document_parts.append(f"") - + document_parts.append("") - combined_document_string = '\n'.join(document_parts) - content_hash = generate_content_hash(combined_document_string, search_space_id) + combined_document_string = "\n".join(document_parts) + content_hash = generate_content_hash( + combined_document_string, search_space_id + ) # Check if document with this content hash already exists existing_doc_by_hash_result = await session.execute( select(Document).where(Document.content_hash == content_hash) ) - existing_document_by_hash = existing_doc_by_hash_result.scalars().first() - + existing_document_by_hash = ( + 
existing_doc_by_hash_result.scalars().first() + ) + if existing_document_by_hash: - logger.info(f"Document with content hash {content_hash} already exists for channel {channel_name}. Skipping processing.") + logger.info( + f"Document with content hash {content_hash} already exists for channel {channel_name}. Skipping processing." + ) documents_skipped += 1 continue - + # Get user's long context LLM user_llm = await get_user_long_context_llm(session, user_id) if not user_llm: @@ -284,19 +353,26 @@ async def index_slack_messages( skipped_channels.append(f"{channel_name} (no LLM configured)") documents_skipped += 1 continue - + # Generate summary summary_chain = SUMMARY_PROMPT_TEMPLATE | user_llm - summary_result = await summary_chain.ainvoke({"document": combined_document_string}) + summary_result = await summary_chain.ainvoke( + {"document": combined_document_string} + ) summary_content = summary_result.content - summary_embedding = config.embedding_model_instance.embed(summary_content) - + summary_embedding = config.embedding_model_instance.embed( + summary_content + ) + # Process chunks chunks = [ - Chunk(content=chunk.text, embedding=config.embedding_model_instance.embed(chunk.text)) + Chunk( + content=chunk.text, + embedding=config.embedding_model_instance.embed(chunk.text), + ) for chunk in config.chunker_instance.chunk(channel_content) ] - + # Create and store new document document = Document( search_space_id=search_space_id, @@ -308,20 +384,24 @@ async def index_slack_messages( "start_date": start_date_str, "end_date": end_date_str, "message_count": len(formatted_messages), - "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S") + "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), }, content=summary_content, embedding=summary_embedding, chunks=chunks, content_hash=content_hash, ) - + session.add(document) documents_indexed += 1 - logger.info(f"Successfully indexed new channel {channel_name} with {len(formatted_messages)} messages") - + 
logger.info( + f"Successfully indexed new channel {channel_name} with {len(formatted_messages)} messages" + ) + except SlackApiError as slack_error: - logger.error(f"Slack API error for channel {channel_name}: {str(slack_error)}") + logger.error( + f"Slack API error for channel {channel_name}: {str(slack_error)}" + ) skipped_channels.append(f"{channel_name} (Slack API error)") documents_skipped += 1 continue # Skip this channel and continue with others @@ -330,23 +410,23 @@ async def index_slack_messages( skipped_channels.append(f"{channel_name} (processing error)") documents_skipped += 1 continue # Skip this channel and continue with others - + # Update the last_indexed_at timestamp for the connector only if requested # and if we successfully indexed at least one channel total_processed = documents_indexed if update_last_indexed and total_processed > 0: connector.last_indexed_at = datetime.now() - + # Commit all changes await session.commit() - + # Prepare result message result_message = None if skipped_channels: result_message = f"Processed {total_processed} channels. Skipped {len(skipped_channels)} channels: {', '.join(skipped_channels)}" else: result_message = f"Processed {total_processed} channels." 
- + # Log success await task_logger.log_task_success( log_entry, @@ -356,20 +436,22 @@ async def index_slack_messages( "documents_indexed": documents_indexed, "documents_skipped": documents_skipped, "skipped_channels_count": len(skipped_channels), - "result_message": result_message - } + "result_message": result_message, + }, + ) + + logger.info( + f"Slack indexing completed: {documents_indexed} new channels, {documents_skipped} skipped" ) - - logger.info(f"Slack indexing completed: {documents_indexed} new channels, {documents_skipped} skipped") return total_processed, result_message - + except SQLAlchemyError as db_error: await session.rollback() await task_logger.log_task_failure( log_entry, f"Database error during Slack indexing for connector {connector_id}", str(db_error), - {"error_type": "SQLAlchemyError"} + {"error_type": "SQLAlchemyError"}, ) logger.error(f"Database error: {str(db_error)}") return 0, f"Database error: {str(db_error)}" @@ -379,11 +461,12 @@ async def index_slack_messages( log_entry, f"Failed to index Slack messages for connector {connector_id}", str(e), - {"error_type": type(e).__name__} + {"error_type": type(e).__name__}, ) logger.error(f"Failed to index Slack messages: {str(e)}") return 0, f"Failed to index Slack messages: {str(e)}" + async def index_notion_pages( session: AsyncSession, connector_id: int, @@ -391,56 +474,64 @@ async def index_notion_pages( user_id: str, start_date: str = None, end_date: str = None, - update_last_indexed: bool = True + update_last_indexed: bool = True, ) -> Tuple[int, Optional[str]]: """ Index Notion pages from all accessible pages. 
- + Args: session: Database session connector_id: ID of the Notion connector search_space_id: ID of the search space to store documents in update_last_indexed: Whether to update the last_indexed_at timestamp (default: True) - + Returns: Tuple containing (number of documents indexed, error message or None) """ task_logger = TaskLoggingService(session, search_space_id) - + # Log task start log_entry = await task_logger.log_task_start( task_name="notion_pages_indexing", source="connector_indexing_task", message=f"Starting Notion pages indexing for connector {connector_id}", - metadata={"connector_id": connector_id, "user_id": str(user_id), "start_date": start_date, "end_date": end_date} + metadata={ + "connector_id": connector_id, + "user_id": str(user_id), + "start_date": start_date, + "end_date": end_date, + }, ) - + try: # Get the connector await task_logger.log_task_progress( log_entry, f"Retrieving Notion connector {connector_id} from database", - {"stage": "connector_retrieval"} + {"stage": "connector_retrieval"}, ) - + result = await session.execute( - select(SearchSourceConnector) - .filter( + select(SearchSourceConnector).filter( SearchSourceConnector.id == connector_id, - SearchSourceConnector.connector_type == SearchSourceConnectorType.NOTION_CONNECTOR + SearchSourceConnector.connector_type + == SearchSourceConnectorType.NOTION_CONNECTOR, ) ) connector = result.scalars().first() - + if not connector: await task_logger.log_task_failure( log_entry, f"Connector with ID {connector_id} not found or is not a Notion connector", "Connector not found", - {"error_type": "ConnectorNotFound"} + {"error_type": "ConnectorNotFound"}, ) - return 0, f"Connector with ID {connector_id} not found or is not a Notion connector" - + return ( + 0, + f"Connector with ID {connector_id} not found or is not a Notion connector", + ) + # Get the Notion token from the connector config notion_token = connector.config.get("NOTION_INTEGRATION_TOKEN") if not notion_token: @@ -448,103 
+539,119 @@ async def index_notion_pages( log_entry, f"Notion integration token not found in connector config for connector {connector_id}", "Missing Notion token", - {"error_type": "MissingToken"} + {"error_type": "MissingToken"}, ) return 0, "Notion integration token not found in connector config" - + # Initialize Notion client await task_logger.log_task_progress( log_entry, f"Initializing Notion client for connector {connector_id}", - {"stage": "client_initialization"} + {"stage": "client_initialization"}, ) - + logger.info(f"Initializing Notion client for connector {connector_id}") notion_client = NotionHistoryConnector(token=notion_token) - + # Calculate date range if start_date is None or end_date is None: # Fall back to calculating dates calculated_end_date = datetime.now() - calculated_start_date = calculated_end_date - timedelta(days=365) # Check for last 1 year of pages - + calculated_start_date = calculated_end_date - timedelta( + days=365 + ) # Check for last 1 year of pages + # Use calculated dates if not provided if start_date is None: start_date_iso = calculated_start_date.strftime("%Y-%m-%dT%H:%M:%SZ") else: # Convert YYYY-MM-DD to ISO format - start_date_iso = datetime.strptime(start_date, "%Y-%m-%d").strftime("%Y-%m-%dT%H:%M:%SZ") - + start_date_iso = datetime.strptime(start_date, "%Y-%m-%d").strftime( + "%Y-%m-%dT%H:%M:%SZ" + ) + if end_date is None: end_date_iso = calculated_end_date.strftime("%Y-%m-%dT%H:%M:%SZ") else: # Convert YYYY-MM-DD to ISO format - end_date_iso = datetime.strptime(end_date, "%Y-%m-%d").strftime("%Y-%m-%dT%H:%M:%SZ") + end_date_iso = datetime.strptime(end_date, "%Y-%m-%d").strftime( + "%Y-%m-%dT%H:%M:%SZ" + ) else: # Convert provided dates to ISO format for Notion API - start_date_iso = datetime.strptime(start_date, "%Y-%m-%d").strftime("%Y-%m-%dT%H:%M:%SZ") - end_date_iso = datetime.strptime(end_date, "%Y-%m-%d").strftime("%Y-%m-%dT%H:%M:%SZ") - + start_date_iso = datetime.strptime(start_date, "%Y-%m-%d").strftime( + 
"%Y-%m-%dT%H:%M:%SZ" + ) + end_date_iso = datetime.strptime(end_date, "%Y-%m-%d").strftime( + "%Y-%m-%dT%H:%M:%SZ" + ) + logger.info(f"Fetching Notion pages from {start_date_iso} to {end_date_iso}") - + await task_logger.log_task_progress( log_entry, f"Fetching Notion pages from {start_date_iso} to {end_date_iso}", - {"stage": "fetch_pages", "start_date": start_date_iso, "end_date": end_date_iso} + { + "stage": "fetch_pages", + "start_date": start_date_iso, + "end_date": end_date_iso, + }, ) - + # Get all pages try: - pages = notion_client.get_all_pages(start_date=start_date_iso, end_date=end_date_iso) + pages = notion_client.get_all_pages( + start_date=start_date_iso, end_date=end_date_iso + ) logger.info(f"Found {len(pages)} Notion pages") except Exception as e: await task_logger.log_task_failure( log_entry, f"Failed to get Notion pages for connector {connector_id}", str(e), - {"error_type": "PageFetchError"} + {"error_type": "PageFetchError"}, ) logger.error(f"Error fetching Notion pages: {str(e)}", exc_info=True) return 0, f"Failed to get Notion pages: {str(e)}" - + if not pages: await task_logger.log_task_success( log_entry, f"No Notion pages found for connector {connector_id}", - {"pages_found": 0} + {"pages_found": 0}, ) logger.info("No Notion pages found to index") return 0, "No Notion pages found" - + # Track the number of documents indexed documents_indexed = 0 documents_skipped = 0 skipped_pages = [] - + await task_logger.log_task_progress( log_entry, f"Starting to process {len(pages)} Notion pages", - {"stage": "process_pages", "total_pages": len(pages)} + {"stage": "process_pages", "total_pages": len(pages)}, ) - + # Process each page for page in pages: try: page_id = page.get("page_id") page_title = page.get("title", f"Untitled page ({page_id})") page_content = page.get("content", []) - + logger.info(f"Processing Notion page: {page_title} ({page_id})") - + if not page_content: logger.info(f"No content found in page {page_title}. 
Skipping.") skipped_pages.append(f"{page_title} (no content)") documents_skipped += 1 continue - + # Convert page content to markdown format markdown_content = f"# Notion Page: {page_title}\n\n" - + # Process blocks recursively def process_blocks(blocks, level=0): result = "" @@ -552,10 +659,10 @@ async def index_notion_pages( block_type = block.get("type") block_content = block.get("content", "") children = block.get("children", []) - + # Add indentation based on level indent = " " * level - + # Format based on block type if block_type in ["paragraph", "text"]: result += f"{indent}{block_content}\n\n" @@ -585,54 +692,62 @@ async def index_notion_pages( # Default for other block types if block_content: result += f"{indent}{block_content}\n\n" - + # Process children recursively if children: result += process_blocks(children, level + 1) - + return result - - logger.debug(f"Converting {len(page_content)} blocks to markdown for page {page_title}") + + logger.debug( + f"Converting {len(page_content)} blocks to markdown for page {page_title}" + ) markdown_content += process_blocks(page_content) - + # Format document metadata metadata_sections = [ - ("METADATA", [ - f"PAGE_TITLE: {page_title}", - f"PAGE_ID: {page_id}" - ]), - ("CONTENT", [ - "FORMAT: markdown", - "TEXT_START", - markdown_content, - "TEXT_END" - ]) + ("METADATA", [f"PAGE_TITLE: {page_title}", f"PAGE_ID: {page_id}"]), + ( + "CONTENT", + [ + "FORMAT: markdown", + "TEXT_START", + markdown_content, + "TEXT_END", + ], + ), ] - + # Build the document string document_parts = [] document_parts.append("") - + for section_title, section_content in metadata_sections: document_parts.append(f"<{section_title}>") document_parts.extend(section_content) document_parts.append(f"") - + document_parts.append("") - combined_document_string = '\n'.join(document_parts) - content_hash = generate_content_hash(combined_document_string, search_space_id) + combined_document_string = "\n".join(document_parts) + content_hash = 
generate_content_hash( + combined_document_string, search_space_id + ) # Check if document with this content hash already exists existing_doc_by_hash_result = await session.execute( select(Document).where(Document.content_hash == content_hash) ) - existing_document_by_hash = existing_doc_by_hash_result.scalars().first() - + existing_document_by_hash = ( + existing_doc_by_hash_result.scalars().first() + ) + if existing_document_by_hash: - logger.info(f"Document with content hash {content_hash} already exists for page {page_title}. Skipping processing.") + logger.info( + f"Document with content hash {content_hash} already exists for page {page_title}. Skipping processing." + ) documents_skipped += 1 continue - + # Get user's long context LLM user_llm = await get_user_long_context_llm(session, user_id) if not user_llm: @@ -640,21 +755,28 @@ async def index_notion_pages( skipped_pages.append(f"{page_title} (no LLM configured)") documents_skipped += 1 continue - + # Generate summary logger.debug(f"Generating summary for page {page_title}") summary_chain = SUMMARY_PROMPT_TEMPLATE | user_llm - summary_result = await summary_chain.ainvoke({"document": combined_document_string}) + summary_result = await summary_chain.ainvoke( + {"document": combined_document_string} + ) summary_content = summary_result.content - summary_embedding = config.embedding_model_instance.embed(summary_content) - + summary_embedding = config.embedding_model_instance.embed( + summary_content + ) + # Process chunks logger.debug(f"Chunking content for page {page_title}") chunks = [ - Chunk(content=chunk.text, embedding=config.embedding_model_instance.embed(chunk.text)) + Chunk( + content=chunk.text, + embedding=config.embedding_model_instance.embed(chunk.text), + ) for chunk in config.chunker_instance.chunk(markdown_content) ] - + # Create and store new document document = Document( search_space_id=search_space_id, @@ -663,41 +785,46 @@ async def index_notion_pages( document_metadata={ "page_title": 
page_title, "page_id": page_id, - "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S") + "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), }, content=summary_content, content_hash=content_hash, embedding=summary_embedding, - chunks=chunks + chunks=chunks, ) - + session.add(document) documents_indexed += 1 logger.info(f"Successfully indexed new Notion page: {page_title}") - + except Exception as e: - logger.error(f"Error processing Notion page {page.get('title', 'Unknown')}: {str(e)}", exc_info=True) - skipped_pages.append(f"{page.get('title', 'Unknown')} (processing error)") + logger.error( + f"Error processing Notion page {page.get('title', 'Unknown')}: {str(e)}", + exc_info=True, + ) + skipped_pages.append( + f"{page.get('title', 'Unknown')} (processing error)" + ) documents_skipped += 1 continue # Skip this page and continue with others - + # Update the last_indexed_at timestamp for the connector only if requested # and if we successfully indexed at least one page total_processed = documents_indexed if update_last_indexed and total_processed > 0: connector.last_indexed_at = datetime.now() logger.info(f"Updated last_indexed_at for connector {connector_id}") - + # Commit all changes await session.commit() - + # Prepare result message result_message = None if skipped_pages: result_message = f"Processed {total_processed} pages. Skipped {len(skipped_pages)} pages: {', '.join(skipped_pages)}" else: result_message = f"Processed {total_processed} pages." 
- + # Log success await task_logger.log_task_success( log_entry, @@ -707,22 +834,26 @@ async def index_notion_pages( "documents_indexed": documents_indexed, "documents_skipped": documents_skipped, "skipped_pages_count": len(skipped_pages), - "result_message": result_message - } + "result_message": result_message, + }, + ) + + logger.info( + f"Notion indexing completed: {documents_indexed} new pages, {documents_skipped} skipped" ) - - logger.info(f"Notion indexing completed: {documents_indexed} new pages, {documents_skipped} skipped") return total_processed, result_message - + except SQLAlchemyError as db_error: await session.rollback() await task_logger.log_task_failure( log_entry, f"Database error during Notion indexing for connector {connector_id}", str(db_error), - {"error_type": "SQLAlchemyError"} + {"error_type": "SQLAlchemyError"}, + ) + logger.error( + f"Database error during Notion indexing: {str(db_error)}", exc_info=True ) - logger.error(f"Database error during Notion indexing: {str(db_error)}", exc_info=True) return 0, f"Database error: {str(db_error)}" except Exception as e: await session.rollback() @@ -730,11 +861,12 @@ async def index_notion_pages( log_entry, f"Failed to index Notion pages for connector {connector_id}", str(e), - {"error_type": type(e).__name__} + {"error_type": type(e).__name__}, ) logger.error(f"Failed to index Notion pages: {str(e)}", exc_info=True) return 0, f"Failed to index Notion pages: {str(e)}" + async def index_github_repos( session: AsyncSession, connector_id: int, @@ -742,7 +874,7 @@ async def index_github_repos( user_id: str, start_date: str = None, end_date: str = None, - update_last_indexed: bool = True + update_last_indexed: bool = True, ) -> Tuple[int, Optional[str]]: """ Index code and documentation files from accessible GitHub repositories. 
@@ -757,15 +889,20 @@ async def index_github_repos( Tuple containing (number of documents indexed, error message or None) """ task_logger = TaskLoggingService(session, search_space_id) - + # Log task start log_entry = await task_logger.log_task_start( task_name="github_repos_indexing", source="connector_indexing_task", message=f"Starting GitHub repositories indexing for connector {connector_id}", - metadata={"connector_id": connector_id, "user_id": str(user_id), "start_date": start_date, "end_date": end_date} + metadata={ + "connector_id": connector_id, + "user_id": str(user_id), + "start_date": start_date, + "end_date": end_date, + }, ) - + documents_processed = 0 errors = [] @@ -774,14 +911,14 @@ async def index_github_repos( await task_logger.log_task_progress( log_entry, f"Retrieving GitHub connector {connector_id} from database", - {"stage": "connector_retrieval"} + {"stage": "connector_retrieval"}, ) - + result = await session.execute( - select(SearchSourceConnector) - .filter( + select(SearchSourceConnector).filter( SearchSourceConnector.id == connector_id, - SearchSourceConnector.connector_type == SearchSourceConnectorType.GITHUB_CONNECTOR + SearchSourceConnector.connector_type + == SearchSourceConnectorType.GITHUB_CONNECTOR, ) ) connector = result.scalars().first() @@ -791,9 +928,12 @@ async def index_github_repos( log_entry, f"Connector with ID {connector_id} not found or is not a GitHub connector", "Connector not found", - {"error_type": "ConnectorNotFound"} + {"error_type": "ConnectorNotFound"}, + ) + return ( + 0, + f"Connector with ID {connector_id} not found or is not a GitHub connector", ) - return 0, f"Connector with ID {connector_id} not found or is not a GitHub connector" # 2. 
Get the GitHub PAT and selected repositories from the connector config github_pat = connector.config.get("GITHUB_PAT") @@ -804,16 +944,18 @@ async def index_github_repos( log_entry, f"GitHub Personal Access Token (PAT) not found in connector config for connector {connector_id}", "Missing GitHub PAT", - {"error_type": "MissingToken"} + {"error_type": "MissingToken"}, ) return 0, "GitHub Personal Access Token (PAT) not found in connector config" - - if not repo_full_names_to_index or not isinstance(repo_full_names_to_index, list): + + if not repo_full_names_to_index or not isinstance( + repo_full_names_to_index, list + ): await task_logger.log_task_failure( log_entry, f"'repo_full_names' not found or is not a list in connector config for connector {connector_id}", "Invalid repo configuration", - {"error_type": "InvalidConfiguration"} + {"error_type": "InvalidConfiguration"}, ) return 0, "'repo_full_names' not found or is not a list in connector config" @@ -821,9 +963,12 @@ async def index_github_repos( await task_logger.log_task_progress( log_entry, f"Initializing GitHub client for connector {connector_id}", - {"stage": "client_initialization", "repo_count": len(repo_full_names_to_index)} + { + "stage": "client_initialization", + "repo_count": len(repo_full_names_to_index), + }, ) - + try: github_client = GitHubConnector(token=github_pat) except ValueError as e: @@ -831,7 +976,7 @@ async def index_github_repos( log_entry, f"Failed to initialize GitHub client for connector {connector_id}", str(e), - {"error_type": "ClientInitializationError"} + {"error_type": "ClientInitializationError"}, ) return 0, f"Failed to initialize GitHub client: {str(e)}" @@ -841,12 +986,21 @@ async def index_github_repos( await task_logger.log_task_progress( log_entry, f"Starting indexing for {len(repo_full_names_to_index)} selected repositories", - {"stage": "repo_processing", "repo_count": len(repo_full_names_to_index), "start_date": start_date, "end_date": end_date} + { + "stage": 
"repo_processing", + "repo_count": len(repo_full_names_to_index), + "start_date": start_date, + "end_date": end_date, + }, + ) + + logger.info( + f"Starting indexing for {len(repo_full_names_to_index)} selected repositories." ) - - logger.info(f"Starting indexing for {len(repo_full_names_to_index)} selected repositories.") if start_date and end_date: - logger.info(f"Date range requested: {start_date} to {end_date} (Note: GitHub indexing processes all files regardless of dates)") + logger.info( + f"Date range requested: {start_date} to {end_date} (Note: GitHub indexing processes all files regardless of dates)" + ) # 6. Iterate through selected repositories and index files for repo_full_name in repo_full_names_to_index: @@ -858,65 +1012,92 @@ async def index_github_repos( try: files_to_index = github_client.get_repository_files(repo_full_name) if not files_to_index: - logger.info(f"No indexable files found in repository: {repo_full_name}") + logger.info( + f"No indexable files found in repository: {repo_full_name}" + ) continue - logger.info(f"Found {len(files_to_index)} files to process in {repo_full_name}") + logger.info( + f"Found {len(files_to_index)} files to process in {repo_full_name}" + ) for file_info in files_to_index: file_path = file_info.get("path") file_url = file_info.get("url") file_sha = file_info.get("sha") - file_type = file_info.get("type") # 'code' or 'doc' + file_type = file_info.get("type") # 'code' or 'doc' full_path_key = f"{repo_full_name}/{file_path}" if not file_path or not file_url or not file_sha: - logger.warning(f"Skipping file with missing info in {repo_full_name}: {file_info}") + logger.warning( + f"Skipping file with missing info in {repo_full_name}: {file_info}" + ) continue # Get file content - file_content = github_client.get_file_content(repo_full_name, file_path) + file_content = github_client.get_file_content( + repo_full_name, file_path + ) if file_content is None: - logger.warning(f"Could not retrieve content for 
{full_path_key}. Skipping.") - continue # Skip if content fetch failed - + logger.warning( + f"Could not retrieve content for {full_path_key}. Skipping." + ) + continue # Skip if content fetch failed + content_hash = generate_content_hash(file_content, search_space_id) # Check if document with this content hash already exists existing_doc_by_hash_result = await session.execute( select(Document).where(Document.content_hash == content_hash) ) - existing_document_by_hash = existing_doc_by_hash_result.scalars().first() - + existing_document_by_hash = ( + existing_doc_by_hash_result.scalars().first() + ) + if existing_document_by_hash: - logger.info(f"Document with content hash {content_hash} already exists for file {full_path_key}. Skipping processing.") + logger.info( + f"Document with content hash {content_hash} already exists for file {full_path_key}. Skipping processing." + ) continue - + # Use file_content directly for chunking, maybe summary for main content? # For now, let's use the full content for both, might need refinement - summary_content = f"GitHub file: {full_path_key}\n\n{file_content[:1000]}..." # Simple summary - summary_embedding = config.embedding_model_instance.embed(summary_content) + summary_content = f"GitHub file: {full_path_key}\n\n{file_content[:1000]}..." 
# Simple summary + summary_embedding = config.embedding_model_instance.embed( + summary_content + ) # Chunk the content try: chunks_data = [ - Chunk(content=chunk.text, embedding=config.embedding_model_instance.embed(chunk.text)) - for chunk in config.code_chunker_instance.chunk(file_content) + Chunk( + content=chunk.text, + embedding=config.embedding_model_instance.embed( + chunk.text + ), + ) + for chunk in config.code_chunker_instance.chunk( + file_content + ) ] except Exception as chunk_err: - logger.error(f"Failed to chunk file {full_path_key}: {chunk_err}") - errors.append(f"Chunking failed for {full_path_key}: {chunk_err}") - continue # Skip this file if chunking fails + logger.error( + f"Failed to chunk file {full_path_key}: {chunk_err}" + ) + errors.append( + f"Chunking failed for {full_path_key}: {chunk_err}" + ) + continue # Skip this file if chunking fails doc_metadata = { "repository_full_name": repo_full_name, "file_path": file_path, - "full_path": full_path_key, # For easier lookup + "full_path": full_path_key, # For easier lookup "url": file_url, "sha": file_sha, "type": file_type, - "indexed_at": datetime.now(timezone.utc).isoformat() + "indexed_at": datetime.now(timezone.utc).isoformat(), } # Create new document @@ -925,22 +1106,26 @@ async def index_github_repos( title=f"GitHub - {file_path}", document_type=DocumentType.GITHUB_CONNECTOR, document_metadata=doc_metadata, - content=summary_content, # Store summary + content=summary_content, # Store summary content_hash=content_hash, embedding=summary_embedding, search_space_id=search_space_id, - chunks=chunks_data # Associate chunks directly + chunks=chunks_data, # Associate chunks directly ) session.add(document) documents_processed += 1 except Exception as repo_err: - logger.error(f"Failed to process repository {repo_full_name}: {repo_err}") + logger.error( + f"Failed to process repository {repo_full_name}: {repo_err}" + ) errors.append(f"Failed processing {repo_full_name}: {repo_err}") - + # 
Commit all changes at the end await session.commit() - logger.info(f"Finished GitHub indexing for connector {connector_id}. Processed {documents_processed} files.") + logger.info( + f"Finished GitHub indexing for connector {connector_id}. Processed {documents_processed} files." + ) # Log success await task_logger.log_task_success( @@ -949,8 +1134,8 @@ async def index_github_repos( { "documents_processed": documents_processed, "errors_count": len(errors), - "repo_count": len(repo_full_names_to_index) - } + "repo_count": len(repo_full_names_to_index), + }, ) except SQLAlchemyError as db_err: @@ -959,9 +1144,11 @@ async def index_github_repos( log_entry, f"Database error during GitHub indexing for connector {connector_id}", str(db_err), - {"error_type": "SQLAlchemyError"} + {"error_type": "SQLAlchemyError"}, + ) + logger.error( + f"Database error during GitHub indexing for connector {connector_id}: {db_err}" ) - logger.error(f"Database error during GitHub indexing for connector {connector_id}: {db_err}") errors.append(f"Database error: {db_err}") return documents_processed, "; ".join(errors) if errors else str(db_err) except Exception as e: @@ -970,15 +1157,19 @@ async def index_github_repos( log_entry, f"Unexpected error during GitHub indexing for connector {connector_id}", str(e), - {"error_type": type(e).__name__} + {"error_type": type(e).__name__}, + ) + logger.error( + f"Unexpected error during GitHub indexing for connector {connector_id}: {e}", + exc_info=True, ) - logger.error(f"Unexpected error during GitHub indexing for connector {connector_id}: {e}", exc_info=True) errors.append(f"Unexpected error: {e}") return documents_processed, "; ".join(errors) if errors else str(e) error_message = "; ".join(errors) if errors else None return documents_processed, error_message + async def index_linear_issues( session: AsyncSession, connector_id: int, @@ -986,56 +1177,64 @@ async def index_linear_issues( user_id: str, start_date: str = None, end_date: str = None, - 
update_last_indexed: bool = True + update_last_indexed: bool = True, ) -> Tuple[int, Optional[str]]: """ Index Linear issues and comments. - + Args: session: Database session connector_id: ID of the Linear connector search_space_id: ID of the search space to store documents in update_last_indexed: Whether to update the last_indexed_at timestamp (default: True) - + Returns: Tuple containing (number of documents indexed, error message or None) """ task_logger = TaskLoggingService(session, search_space_id) - + # Log task start log_entry = await task_logger.log_task_start( task_name="linear_issues_indexing", source="connector_indexing_task", message=f"Starting Linear issues indexing for connector {connector_id}", - metadata={"connector_id": connector_id, "user_id": str(user_id), "start_date": start_date, "end_date": end_date} + metadata={ + "connector_id": connector_id, + "user_id": str(user_id), + "start_date": start_date, + "end_date": end_date, + }, ) - + try: # Get the connector await task_logger.log_task_progress( log_entry, f"Retrieving Linear connector {connector_id} from database", - {"stage": "connector_retrieval"} + {"stage": "connector_retrieval"}, ) - + result = await session.execute( - select(SearchSourceConnector) - .filter( + select(SearchSourceConnector).filter( SearchSourceConnector.id == connector_id, - SearchSourceConnector.connector_type == SearchSourceConnectorType.LINEAR_CONNECTOR + SearchSourceConnector.connector_type + == SearchSourceConnectorType.LINEAR_CONNECTOR, ) ) connector = result.scalars().first() - + if not connector: await task_logger.log_task_failure( log_entry, f"Connector with ID {connector_id} not found or is not a Linear connector", "Connector not found", - {"error_type": "ConnectorNotFound"} + {"error_type": "ConnectorNotFound"}, ) - return 0, f"Connector with ID {connector_id} not found or is not a Linear connector" - + return ( + 0, + f"Connector with ID {connector_id} not found or is not a Linear connector", + ) + # Get the 
Linear token from the connector config linear_token = connector.config.get("LINEAR_API_KEY") if not linear_token: @@ -1043,135 +1242,167 @@ async def index_linear_issues( log_entry, f"Linear API token not found in connector config for connector {connector_id}", "Missing Linear token", - {"error_type": "MissingToken"} + {"error_type": "MissingToken"}, ) return 0, "Linear API token not found in connector config" - + # Initialize Linear client await task_logger.log_task_progress( log_entry, f"Initializing Linear client for connector {connector_id}", - {"stage": "client_initialization"} + {"stage": "client_initialization"}, ) - + linear_client = LinearConnector(token=linear_token) - + # Calculate date range if start_date is None or end_date is None: # Fall back to calculating dates based on last_indexed_at calculated_end_date = datetime.now() - + # Use last_indexed_at as start date if available, otherwise use 365 days ago if connector.last_indexed_at: # Convert dates to be comparable (both timezone-naive) - last_indexed_naive = connector.last_indexed_at.replace(tzinfo=None) if connector.last_indexed_at.tzinfo else connector.last_indexed_at - + last_indexed_naive = ( + connector.last_indexed_at.replace(tzinfo=None) + if connector.last_indexed_at.tzinfo + else connector.last_indexed_at + ) + # Check if last_indexed_at is in the future or after end_date if last_indexed_naive > calculated_end_date: - logger.warning(f"Last indexed date ({last_indexed_naive.strftime('%Y-%m-%d')}) is in the future. Using 365 days ago instead.") + logger.warning( + f"Last indexed date ({last_indexed_naive.strftime('%Y-%m-%d')}) is in the future. Using 365 days ago instead." 
+ ) calculated_start_date = calculated_end_date - timedelta(days=365) else: calculated_start_date = last_indexed_naive - logger.info(f"Using last_indexed_at ({calculated_start_date.strftime('%Y-%m-%d')}) as start date") + logger.info( + f"Using last_indexed_at ({calculated_start_date.strftime('%Y-%m-%d')}) as start date" + ) else: - calculated_start_date = calculated_end_date - timedelta(days=365) # Use 365 days as default - logger.info(f"No last_indexed_at found, using {calculated_start_date.strftime('%Y-%m-%d')} (365 days ago) as start date") - + calculated_start_date = calculated_end_date - timedelta( + days=365 + ) # Use 365 days as default + logger.info( + f"No last_indexed_at found, using {calculated_start_date.strftime('%Y-%m-%d')} (365 days ago) as start date" + ) + # Use calculated dates if not provided - start_date_str = start_date if start_date else calculated_start_date.strftime("%Y-%m-%d") - end_date_str = end_date if end_date else calculated_end_date.strftime("%Y-%m-%d") + start_date_str = ( + start_date if start_date else calculated_start_date.strftime("%Y-%m-%d") + ) + end_date_str = ( + end_date if end_date else calculated_end_date.strftime("%Y-%m-%d") + ) else: # Use provided dates start_date_str = start_date end_date_str = end_date - + logger.info(f"Fetching Linear issues from {start_date_str} to {end_date_str}") - + await task_logger.log_task_progress( log_entry, f"Fetching Linear issues from {start_date_str} to {end_date_str}", - {"stage": "fetch_issues", "start_date": start_date_str, "end_date": end_date_str} + { + "stage": "fetch_issues", + "start_date": start_date_str, + "end_date": end_date_str, + }, ) - + # Get issues within date range try: issues, error = linear_client.get_issues_by_date_range( - start_date=start_date_str, - end_date=end_date_str, - include_comments=True + start_date=start_date_str, end_date=end_date_str, include_comments=True ) - + if error: logger.error(f"Failed to get Linear issues: {error}") - + # Don't treat "No 
issues found" as an error that should stop indexing if "No issues found" in error: - logger.info("No issues found is not a critical error, continuing with update") + logger.info( + "No issues found is not a critical error, continuing with update" + ) if update_last_indexed: connector.last_indexed_at = datetime.now() await session.commit() - logger.info(f"Updated last_indexed_at to {connector.last_indexed_at} despite no issues found") + logger.info( + f"Updated last_indexed_at to {connector.last_indexed_at} despite no issues found" + ) return 0, None else: return 0, f"Failed to get Linear issues: {error}" - + logger.info(f"Retrieved {len(issues)} issues from Linear API") - + except Exception as e: logger.error(f"Exception when calling Linear API: {str(e)}", exc_info=True) return 0, f"Failed to get Linear issues: {str(e)}" - + if not issues: logger.info("No Linear issues found for the specified date range") if update_last_indexed: connector.last_indexed_at = datetime.now() await session.commit() - logger.info(f"Updated last_indexed_at to {connector.last_indexed_at} despite no issues found") + logger.info( + f"Updated last_indexed_at to {connector.last_indexed_at} despite no issues found" + ) return 0, None # Return None instead of error message when no issues found - + # Log issue IDs and titles for debugging logger.info("Issues retrieved from Linear API:") for idx, issue in enumerate(issues[:10]): # Log first 10 issues - logger.info(f" {idx+1}. {issue.get('identifier', 'Unknown')} - {issue.get('title', 'Unknown')} - Created: {issue.get('createdAt', 'Unknown')} - Updated: {issue.get('updatedAt', 'Unknown')}") + logger.info( + f" {idx + 1}. 
{issue.get('identifier', 'Unknown')} - {issue.get('title', 'Unknown')} - Created: {issue.get('createdAt', 'Unknown')} - Updated: {issue.get('updatedAt', 'Unknown')}" + ) if len(issues) > 10: logger.info(f" ...and {len(issues) - 10} more issues") - + # Track the number of documents indexed documents_indexed = 0 documents_skipped = 0 skipped_issues = [] - + await task_logger.log_task_progress( log_entry, f"Starting to process {len(issues)} Linear issues", - {"stage": "process_issues", "total_issues": len(issues)} + {"stage": "process_issues", "total_issues": len(issues)}, ) - + # Process each issue for issue in issues: try: - issue_id = issue.get("id") - issue_identifier = issue.get("identifier", "") - issue_title = issue.get("title", "") - + issue_id = issue.get("id") + issue_identifier = issue.get("identifier", "") + issue_title = issue.get("title", "") + if not issue_id or not issue_title: - logger.warning(f"Skipping issue with missing ID or title: {issue_id or 'Unknown'}") - skipped_issues.append(f"{issue_identifier or 'Unknown'} (missing data)") + logger.warning( + f"Skipping issue with missing ID or title: {issue_id or 'Unknown'}" + ) + skipped_issues.append( + f"{issue_identifier or 'Unknown'} (missing data)" + ) documents_skipped += 1 continue - + # Format the issue first to get well-structured data formatted_issue = linear_client.format_issue(issue) - + # Convert issue to markdown format issue_content = linear_client.format_issue_to_markdown(formatted_issue) - + if not issue_content: - logger.warning(f"Skipping issue with no content: {issue_identifier} - {issue_title}") + logger.warning( + f"Skipping issue with no content: {issue_identifier} - {issue_title}" + ) skipped_issues.append(f"{issue_identifier} (no content)") documents_skipped += 1 continue - + # Create a short summary for the embedding # This avoids using the LLM and just uses the issue data directly state = formatted_issue.get("state", "Unknown") @@ -1179,40 +1410,51 @@ async def index_linear_issues( #
Truncate description if it's too long for the summary if description and len(description) > 500: description = description[:497] + "..." - + # Create a simple summary from the issue data summary_content = f"Linear Issue {issue_identifier}: {issue_title}\n\nStatus: {state}\n\n" if description: summary_content += f"Description: {description}\n\n" - + # Add comment count comment_count = len(formatted_issue.get("comments", [])) summary_content += f"Comments: {comment_count}" - + content_hash = generate_content_hash(issue_content, search_space_id) # Check if document with this content hash already exists existing_doc_by_hash_result = await session.execute( select(Document).where(Document.content_hash == content_hash) ) - existing_document_by_hash = existing_doc_by_hash_result.scalars().first() - + existing_document_by_hash = ( + existing_doc_by_hash_result.scalars().first() + ) + if existing_document_by_hash: - logger.info(f"Document with content hash {content_hash} already exists for issue {issue_identifier}. Skipping processing.") + logger.info( + f"Document with content hash {content_hash} already exists for issue {issue_identifier}. Skipping processing." 
+ ) documents_skipped += 1 continue - + # Generate embedding for the summary - summary_embedding = config.embedding_model_instance.embed(summary_content) - + summary_embedding = config.embedding_model_instance.embed( + summary_content + ) + # Process chunks - using the full issue content with comments chunks = [ - Chunk(content=chunk.text, embedding=config.embedding_model_instance.embed(chunk.text)) + Chunk( + content=chunk.text, + embedding=config.embedding_model_instance.embed(chunk.text), + ) for chunk in config.chunker_instance.chunk(issue_content) ] - + # Create and store new document - logger.info(f"Creating new document for issue {issue_identifier} - {issue_title}") + logger.info( + f"Creating new document for issue {issue_identifier} - {issue_title}" + ) document = Document( search_space_id=search_space_id, title=f"Linear - {issue_identifier}: {issue_title}", @@ -1223,34 +1465,41 @@ async def index_linear_issues( "issue_title": issue_title, "state": state, "comment_count": comment_count, - "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S") + "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), }, content=summary_content, content_hash=content_hash, embedding=summary_embedding, - chunks=chunks + chunks=chunks, ) - + session.add(document) documents_indexed += 1 - logger.info(f"Successfully indexed new issue {issue_identifier} - {issue_title}") - + logger.info( + f"Successfully indexed new issue {issue_identifier} - {issue_title}" + ) + except Exception as e: - logger.error(f"Error processing issue {issue.get('identifier', 'Unknown')}: {str(e)}", exc_info=True) - skipped_issues.append(f"{issue.get('identifier', 'Unknown')} (processing error)") + logger.error( + f"Error processing issue {issue.get('identifier', 'Unknown')}: {str(e)}", + exc_info=True, + ) + skipped_issues.append( + f"{issue.get('identifier', 'Unknown')} (processing error)" + ) documents_skipped += 1 continue # Skip this issue and continue with others - + # Update the 
last_indexed_at timestamp for the connector only if requested total_processed = documents_indexed if update_last_indexed: connector.last_indexed_at = datetime.now() logger.info(f"Updated last_indexed_at to {connector.last_indexed_at}") - + # Commit all changes await session.commit() - logger.info(f"Successfully committed all Linear document changes to database") - + logger.info("Successfully committed all Linear document changes to database") + # Log success await task_logger.log_task_success( log_entry, @@ -1259,20 +1508,25 @@ async def index_linear_issues( "issues_processed": total_processed, "documents_indexed": documents_indexed, "documents_skipped": documents_skipped, - "skipped_issues_count": len(skipped_issues) - } + "skipped_issues_count": len(skipped_issues), + }, ) - - logger.info(f"Linear indexing completed: {documents_indexed} new issues, {documents_skipped} skipped") - return total_processed, None # Return None as the error message to indicate success - + + logger.info( + f"Linear indexing completed: {documents_indexed} new issues, {documents_skipped} skipped" + ) + return ( + total_processed, + None, + ) # Return None as the error message to indicate success + except SQLAlchemyError as db_error: await session.rollback() await task_logger.log_task_failure( log_entry, f"Database error during Linear indexing for connector {connector_id}", str(db_error), - {"error_type": "SQLAlchemyError"} + {"error_type": "SQLAlchemyError"}, ) logger.error(f"Database error: {str(db_error)}", exc_info=True) return 0, f"Database error: {str(db_error)}" @@ -1282,11 +1536,12 @@ async def index_linear_issues( log_entry, f"Failed to index Linear issues for connector {connector_id}", str(e), - {"error_type": type(e).__name__} + {"error_type": type(e).__name__}, ) logger.error(f"Failed to index Linear issues: {str(e)}", exc_info=True) return 0, f"Failed to index Linear issues: {str(e)}" + async def index_discord_messages( session: AsyncSession, connector_id: int, @@ -1294,7 
+1549,7 @@ async def index_discord_messages( user_id: str, start_date: str = None, end_date: str = None, - update_last_indexed: bool = True + update_last_indexed: bool = True, ) -> Tuple[int, Optional[str]]: """ Index Discord messages from all accessible channels. @@ -1309,28 +1564,33 @@ async def index_discord_messages( Tuple containing (number of documents indexed, error message or None) """ task_logger = TaskLoggingService(session, search_space_id) - + # Log task start log_entry = await task_logger.log_task_start( task_name="discord_messages_indexing", source="connector_indexing_task", message=f"Starting Discord messages indexing for connector {connector_id}", - metadata={"connector_id": connector_id, "user_id": str(user_id), "start_date": start_date, "end_date": end_date} + metadata={ + "connector_id": connector_id, + "user_id": str(user_id), + "start_date": start_date, + "end_date": end_date, + }, ) - + try: # Get the connector await task_logger.log_task_progress( log_entry, f"Retrieving Discord connector {connector_id} from database", - {"stage": "connector_retrieval"} + {"stage": "connector_retrieval"}, ) - + result = await session.execute( - select(SearchSourceConnector) - .filter( + select(SearchSourceConnector).filter( SearchSourceConnector.id == connector_id, - SearchSourceConnector.connector_type == SearchSourceConnectorType.DISCORD_CONNECTOR + SearchSourceConnector.connector_type + == SearchSourceConnectorType.DISCORD_CONNECTOR, ) ) connector = result.scalars().first() @@ -1340,9 +1600,12 @@ async def index_discord_messages( log_entry, f"Connector with ID {connector_id} not found or is not a Discord connector", "Connector not found", - {"error_type": "ConnectorNotFound"} + {"error_type": "ConnectorNotFound"}, + ) + return ( + 0, + f"Connector with ID {connector_id} not found or is not a Discord connector", ) - return 0, f"Connector with ID {connector_id} not found or is not a Discord connector" # Get the Discord token from the connector config 
discord_token = connector.config.get("DISCORD_BOT_TOKEN") @@ -1351,7 +1614,7 @@ async def index_discord_messages( log_entry, f"Discord token not found in connector config for connector {connector_id}", "Missing Discord token", - {"error_type": "MissingToken"} + {"error_type": "MissingToken"}, ) return 0, "Discord token not found in connector config" @@ -1361,9 +1624,9 @@ async def index_discord_messages( await task_logger.log_task_progress( log_entry, f"Initializing Discord client for connector {connector_id}", - {"stage": "client_initialization"} + {"stage": "client_initialization"}, ) - + discord_client = DiscordConnector(token=discord_token) # Calculate date range @@ -1373,30 +1636,54 @@ async def index_discord_messages( # Use last_indexed_at as start date if available, otherwise use 365 days ago if connector.last_indexed_at: - calculated_start_date = connector.last_indexed_at.replace(tzinfo=timezone.utc) - logger.info(f"Using last_indexed_at ({calculated_start_date.strftime('%Y-%m-%d')}) as start date") + calculated_start_date = connector.last_indexed_at.replace( + tzinfo=timezone.utc + ) + logger.info( + f"Using last_indexed_at ({calculated_start_date.strftime('%Y-%m-%d')}) as start date" + ) else: calculated_start_date = calculated_end_date - timedelta(days=365) - logger.info(f"No last_indexed_at found, using {calculated_start_date.strftime('%Y-%m-%d')} (365 days ago) as start date") + logger.info( + f"No last_indexed_at found, using {calculated_start_date.strftime('%Y-%m-%d')} (365 days ago) as start date" + ) # Use calculated dates if not provided, convert to ISO format for Discord API if start_date is None: start_date_iso = calculated_start_date.isoformat() else: # Convert YYYY-MM-DD to ISO format - start_date_iso = datetime.strptime(start_date, "%Y-%m-%d").replace(tzinfo=timezone.utc).isoformat() - + start_date_iso = ( + datetime.strptime(start_date, "%Y-%m-%d") + .replace(tzinfo=timezone.utc) + .isoformat() + ) + if end_date is None: end_date_iso = 
calculated_end_date.isoformat() else: - # Convert YYYY-MM-DD to ISO format - end_date_iso = datetime.strptime(end_date, "%Y-%m-%d").replace(tzinfo=timezone.utc).isoformat() + # Convert YYYY-MM-DD to ISO format + end_date_iso = ( + datetime.strptime(end_date, "%Y-%m-%d") + .replace(tzinfo=timezone.utc) + .isoformat() + ) else: # Convert provided dates to ISO format for Discord API - start_date_iso = datetime.strptime(start_date, "%Y-%m-%d").replace(tzinfo=timezone.utc).isoformat() - end_date_iso = datetime.strptime(end_date, "%Y-%m-%d").replace(tzinfo=timezone.utc).isoformat() - - logger.info(f"Indexing Discord messages from {start_date_iso} to {end_date_iso}") + start_date_iso = ( + datetime.strptime(start_date, "%Y-%m-%d") + .replace(tzinfo=timezone.utc) + .isoformat() + ) + end_date_iso = ( + datetime.strptime(end_date, "%Y-%m-%d") + .replace(tzinfo=timezone.utc) + .isoformat() + ) + + logger.info( + f"Indexing Discord messages from {start_date_iso} to {end_date_iso}" + ) documents_indexed = 0 documents_skipped = 0 @@ -1406,9 +1693,9 @@ async def index_discord_messages( await task_logger.log_task_progress( log_entry, f"Starting Discord bot and fetching guilds for connector {connector_id}", - {"stage": "fetch_guilds"} + {"stage": "fetch_guilds"}, ) - + logger.info("Starting Discord bot to fetch guilds") discord_client._bot_task = asyncio.create_task(discord_client.start_bot()) await discord_client._wait_until_ready() @@ -1421,7 +1708,7 @@ async def index_discord_messages( log_entry, f"Failed to get Discord guilds for connector {connector_id}", str(e), - {"error_type": "GuildFetchError"} + {"error_type": "GuildFetchError"}, ) logger.error(f"Failed to get Discord guilds: {str(e)}", exc_info=True) await discord_client.close_bot() @@ -1430,7 +1717,7 @@ async def index_discord_messages( await task_logger.log_task_success( log_entry, f"No Discord guilds found for connector {connector_id}", - {"guilds_found": 0} + {"guilds_found": 0}, ) logger.info("No Discord guilds 
found to index") await discord_client.close_bot() @@ -1440,9 +1727,9 @@ async def index_discord_messages( await task_logger.log_task_progress( log_entry, f"Starting to process {len(guilds)} Discord guilds", - {"stage": "process_guilds", "total_guilds": len(guilds)} + {"stage": "process_guilds", "total_guilds": len(guilds)}, ) - + for guild in guilds: guild_id = guild["id"] guild_name = guild["name"] @@ -1466,13 +1753,19 @@ async def index_discord_messages( end_date=end_date_iso, ) except Exception as e: - logger.error(f"Failed to get messages for channel {channel_name}: {str(e)}") - skipped_channels.append(f"{guild_name}#{channel_name} (fetch error)") + logger.error( + f"Failed to get messages for channel {channel_name}: {str(e)}" + ) + skipped_channels.append( + f"{guild_name}#{channel_name} (fetch error)" + ) documents_skipped += 1 continue if not messages: - logger.info(f"No messages found in channel {channel_name} for the specified date range.") + logger.info( + f"No messages found in channel {channel_name} for the specified date range." + ) documents_skipped += 1 continue @@ -1485,33 +1778,45 @@ async def index_discord_messages( formatted_messages.append(msg) if not formatted_messages: - logger.info(f"No valid messages found in channel {channel_name} after filtering.") + logger.info( + f"No valid messages found in channel {channel_name} after filtering." 
+ ) documents_skipped += 1 continue # Convert messages to markdown format - channel_content = f"# Discord Channel: {guild_name} / {channel_name}\n\n" + channel_content = ( + f"# Discord Channel: {guild_name} / {channel_name}\n\n" + ) for msg in formatted_messages: user_name = msg.get("author_name", "Unknown User") timestamp = msg.get("created_at", "Unknown Time") text = msg.get("content", "") - channel_content += f"## {user_name} ({timestamp})\n\n{text}\n\n---\n\n" + channel_content += ( + f"## {user_name} ({timestamp})\n\n{text}\n\n---\n\n" + ) # Format document metadata metadata_sections = [ - ("METADATA", [ - f"GUILD_NAME: {guild_name}", - f"GUILD_ID: {guild_id}", - f"CHANNEL_NAME: {channel_name}", - f"CHANNEL_ID: {channel_id}", - f"MESSAGE_COUNT: {len(formatted_messages)}" - ]), - ("CONTENT", [ - "FORMAT: markdown", - "TEXT_START", - channel_content, - "TEXT_END" - ]) + ( + "METADATA", + [ + f"GUILD_NAME: {guild_name}", + f"GUILD_ID: {guild_id}", + f"CHANNEL_NAME: {channel_name}", + f"CHANNEL_ID: {channel_id}", + f"MESSAGE_COUNT: {len(formatted_messages)}", + ], + ), + ( + "CONTENT", + [ + "FORMAT: markdown", + "TEXT_START", + channel_content, + "TEXT_END", + ], + ), ] # Build the document string @@ -1522,31 +1827,43 @@ async def index_discord_messages( document_parts.extend(section_content) document_parts.append(f"") document_parts.append("") - combined_document_string = '\n'.join(document_parts) - content_hash = generate_content_hash(combined_document_string, search_space_id) + combined_document_string = "\n".join(document_parts) + content_hash = generate_content_hash( + combined_document_string, search_space_id + ) # Check if document with this content hash already exists existing_doc_by_hash_result = await session.execute( select(Document).where(Document.content_hash == content_hash) ) - existing_document_by_hash = existing_doc_by_hash_result.scalars().first() + existing_document_by_hash = ( + existing_doc_by_hash_result.scalars().first() + ) if 
existing_document_by_hash: - logger.info(f"Document with content hash {content_hash} already exists for channel {guild_name}#{channel_name}. Skipping processing.") + logger.info( + f"Document with content hash {content_hash} already exists for channel {guild_name}#{channel_name}. Skipping processing." + ) documents_skipped += 1 continue # Get user's long context LLM user_llm = await get_user_long_context_llm(session, user_id) if not user_llm: - logger.error(f"No long context LLM configured for user {user_id}") - skipped_channels.append(f"{guild_name}#{channel_name} (no LLM configured)") + logger.error( + f"No long context LLM configured for user {user_id}" + ) + skipped_channels.append( + f"{guild_name}#{channel_name} (no LLM configured)" + ) documents_skipped += 1 continue # Generate summary using summary_chain summary_chain = SUMMARY_PROMPT_TEMPLATE | user_llm - summary_result = await summary_chain.ainvoke({"document": combined_document_string}) + summary_result = await summary_chain.ainvoke( + {"document": combined_document_string} + ) summary_content = summary_result.content summary_embedding = await asyncio.to_thread( config.embedding_model_instance.embed, summary_content @@ -1554,14 +1871,17 @@ async def index_discord_messages( # Process chunks raw_chunks = await asyncio.to_thread( - config.chunker_instance.chunk, - channel_content + config.chunker_instance.chunk, channel_content ) - chunk_texts = [chunk.text for chunk in raw_chunks if chunk.text.strip()] + chunk_texts = [ + chunk.text for chunk in raw_chunks if chunk.text.strip() + ] chunk_embeddings = await asyncio.to_thread( - lambda texts: [config.embedding_model_instance.embed(t) for t in texts], - chunk_texts + lambda texts: [ + config.embedding_model_instance.embed(t) for t in texts + ], + chunk_texts, ) chunks = [ @@ -1582,20 +1902,26 @@ async def index_discord_messages( "message_count": len(formatted_messages), "start_date": start_date_iso, "end_date": end_date_iso, - "indexed_at": 
datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S") + "indexed_at": datetime.now(timezone.utc).strftime( + "%Y-%m-%d %H:%M:%S" + ), }, content=summary_content, content_hash=content_hash, embedding=summary_embedding, - chunks=chunks + chunks=chunks, ) session.add(document) documents_indexed += 1 - logger.info(f"Successfully indexed new channel {guild_name}#{channel_name} with {len(formatted_messages)} messages") + logger.info( + f"Successfully indexed new channel {guild_name}#{channel_name} with {len(formatted_messages)} messages" + ) except Exception as e: - logger.error(f"Error processing guild {guild_name}: {str(e)}", exc_info=True) + logger.error( + f"Error processing guild {guild_name}: {str(e)}", exc_info=True + ) skipped_channels.append(f"{guild_name} (processing error)") documents_skipped += 1 continue @@ -1624,11 +1950,13 @@ async def index_discord_messages( "documents_skipped": documents_skipped, "skipped_channels_count": len(skipped_channels), "guilds_processed": len(guilds), - "result_message": result_message - } + "result_message": result_message, + }, ) - logger.info(f"Discord indexing completed: {documents_indexed} new channels, {documents_skipped} skipped") + logger.info( + f"Discord indexing completed: {documents_indexed} new channels, {documents_skipped} skipped" + ) return documents_indexed, result_message except SQLAlchemyError as db_error: @@ -1637,9 +1965,11 @@ async def index_discord_messages( log_entry, f"Database error during Discord indexing for connector {connector_id}", str(db_error), - {"error_type": "SQLAlchemyError"} + {"error_type": "SQLAlchemyError"}, + ) + logger.error( + f"Database error during Discord indexing: {str(db_error)}", exc_info=True ) - logger.error(f"Database error during Discord indexing: {str(db_error)}", exc_info=True) return 0, f"Database error: {str(db_error)}" except Exception as e: await session.rollback() @@ -1647,7 +1977,357 @@ async def index_discord_messages( log_entry, f"Failed to index Discord messages 
# NOTE(review): unrelated diff context that shares this source line with the
# definition below (tail of the index_discord_messages hunk), carried over verbatim:
# for connector {connector_id}", str(e), - {"error_type": type(e).__name__} + {"error_type": type(e).__name__}, ) logger.error(f"Failed to index Discord messages: {str(e)}", exc_info=True) return 0, f"Failed to index Discord messages: {str(e)}"


async def index_jira_issues(
    session: AsyncSession,
    connector_id: int,
    search_space_id: int,
    user_id: str,
    start_date: Optional[str] = None,
    end_date: Optional[str] = None,
    update_last_indexed: bool = True,
) -> Tuple[int, Optional[str]]:
    """
    Index Jira issues and comments.

    Looks up the Jira connector, fetches the issues in the requested date
    range, and stores one ``Document`` (short summary + embedded chunks of the
    full markdown rendering) per issue whose content hash is not already
    present. Every outcome is recorded through ``TaskLoggingService``.

    Args:
        session: Database session
        connector_id: ID of the Jira connector
        search_space_id: ID of the search space to store documents in
        user_id: User ID (only recorded in the task log metadata here)
        start_date: Start date for indexing (YYYY-MM-DD format). When either
            date is missing, the missing one is derived from the connector's
            ``last_indexed_at`` (start) / the current time (end), with a
            365-day look-back as the fallback start.
        end_date: End date for indexing (YYYY-MM-DD format)
        update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)

    Returns:
        Tuple containing (number of documents indexed, error message or None).
        ``(0, None)`` is returned when the API reports that no issues exist in
        the range. Failures never raise; they are reported as ``(0, message)``.
    """
    task_logger = TaskLoggingService(session, search_space_id)

    # Log task start
    log_entry = await task_logger.log_task_start(
        task_name="jira_issues_indexing",
        source="connector_indexing_task",
        message=f"Starting Jira issues indexing for connector {connector_id}",
        metadata={
            "connector_id": connector_id,
            "user_id": str(user_id),
            "start_date": start_date,
            "end_date": end_date,
        },
    )

    try:
        # Get the connector from the database
        result = await session.execute(
            select(SearchSourceConnector).filter(
                SearchSourceConnector.id == connector_id,
                SearchSourceConnector.connector_type
                == SearchSourceConnectorType.JIRA_CONNECTOR,
            )
        )
        connector = result.scalars().first()

        if not connector:
            await task_logger.log_task_failure(
                log_entry,
                f"Connector with ID {connector_id} not found",
                "Connector not found",
                {"error_type": "ConnectorNotFound"},
            )
            return 0, f"Connector with ID {connector_id} not found"

        # Get the Jira credentials from the connector config
        jira_email = connector.config.get("JIRA_EMAIL")
        jira_api_token = connector.config.get("JIRA_API_TOKEN")
        jira_base_url = connector.config.get("JIRA_BASE_URL")

        if not jira_email or not jira_api_token or not jira_base_url:
            await task_logger.log_task_failure(
                log_entry,
                f"Jira credentials not found in connector config for connector {connector_id}",
                "Missing Jira credentials",
                {"error_type": "MissingCredentials"},
            )
            return 0, "Jira credentials not found in connector config"

        # Initialize Jira client
        await task_logger.log_task_progress(
            log_entry,
            f"Initializing Jira client for connector {connector_id}",
            {"stage": "client_initialization"},
        )

        jira_client = JiraConnector(
            base_url=jira_base_url, email=jira_email, api_token=jira_api_token
        )

        # Calculate date range
        if start_date is None or end_date is None:
            # Fall back to calculating dates based on last_indexed_at
            calculated_end_date = datetime.now()

            # Use last_indexed_at as start date if available, otherwise use 365 days ago
            if connector.last_indexed_at:
                # Convert dates to be comparable (both timezone-naive)
                last_indexed_naive = (
                    connector.last_indexed_at.replace(tzinfo=None)
                    if connector.last_indexed_at.tzinfo
                    else connector.last_indexed_at
                )

                # Check if last_indexed_at is in the future or after end_date
                if last_indexed_naive > calculated_end_date:
                    logger.warning(
                        f"Last indexed date ({last_indexed_naive.strftime('%Y-%m-%d')}) is in the future. Using 365 days ago instead."
                    )
                    calculated_start_date = calculated_end_date - timedelta(days=365)
                else:
                    calculated_start_date = last_indexed_naive
                    logger.info(
                        f"Using last_indexed_at ({calculated_start_date.strftime('%Y-%m-%d')}) as start date"
                    )
            else:
                # Use 365 days as default
                calculated_start_date = calculated_end_date - timedelta(days=365)
                logger.info(
                    f"No last_indexed_at found, using {calculated_start_date.strftime('%Y-%m-%d')} (365 days ago) as start date"
                )

            # Use calculated dates if not provided
            start_date_str = (
                start_date if start_date else calculated_start_date.strftime("%Y-%m-%d")
            )
            end_date_str = (
                end_date if end_date else calculated_end_date.strftime("%Y-%m-%d")
            )
        else:
            # Use provided dates
            start_date_str = start_date
            end_date_str = end_date

        await task_logger.log_task_progress(
            log_entry,
            f"Fetching Jira issues from {start_date_str} to {end_date_str}",
            {
                "stage": "fetching_issues",
                "start_date": start_date_str,
                "end_date": end_date_str,
            },
        )

        # Get issues within date range. Only the API call itself is guarded
        # here, so database errors raised further down reach the outer
        # handlers (which roll the session back) instead of being reported as
        # fetch errors.
        try:
            issues, error = jira_client.get_issues_by_date_range(
                start_date=start_date_str, end_date=end_date_str, include_comments=True
            )
        except Exception as e:
            logger.error(f"Error fetching Jira issues: {str(e)}", exc_info=True)
            # Close the task log entry; every other failure path does the same.
            await task_logger.log_task_failure(
                log_entry,
                f"Error fetching Jira issues for connector {connector_id}",
                str(e),
                {"error_type": type(e).__name__},
            )
            return 0, f"Error fetching Jira issues: {str(e)}"

        if error:
            # Don't treat "No issues found" as an error that should stop indexing
            if "No issues found" in error:
                logger.info(
                    "No issues found is not a critical error, continuing with update"
                )
                if update_last_indexed:
                    connector.last_indexed_at = datetime.now()
                    await session.commit()
                    logger.info(
                        f"Updated last_indexed_at to {connector.last_indexed_at} despite no issues found"
                    )

                await task_logger.log_task_success(
                    log_entry,
                    f"No Jira issues found in date range {start_date_str} to {end_date_str}",
                    {"issues_found": 0},
                )
                return 0, None

            logger.error(f"Failed to get Jira issues: {error}")
            await task_logger.log_task_failure(
                log_entry,
                f"Failed to get Jira issues: {error}",
                "API Error",
                {"error_type": "APIError"},
            )
            return 0, f"Failed to get Jira issues: {error}"

        # A missing result without an error message is handled as "nothing to index".
        issues = issues or []
        logger.info(f"Retrieved {len(issues)} issues from Jira API")

        # Process and index each issue
        documents_indexed = 0
        skipped_issues = []
        documents_skipped = 0

        for issue in issues:
            try:
                issue_id = issue.get("key")
                issue_identifier = issue.get("key", "")

                # The Jira REST API exposes the human-readable title as
                # fields.summary. Fall back to the numeric "id" (what this
                # code used before) when the payload has no summary.
                issue_fields = issue.get("fields")
                issue_summary = (
                    issue_fields.get("summary")
                    if isinstance(issue_fields, dict)
                    else None
                )
                issue_title = issue_summary or issue.get("id", "")

                if not issue_id or not issue_title:
                    logger.warning(
                        f"Skipping issue with missing ID or title: {issue_id or 'Unknown'}"
                    )
                    skipped_issues.append(
                        f"{issue_identifier or 'Unknown'} (missing data)"
                    )
                    documents_skipped += 1
                    continue

                # Format the issue for better readability
                formatted_issue = jira_client.format_issue(issue)

                # Convert to markdown
                issue_content = jira_client.format_issue_to_markdown(formatted_issue)

                if not issue_content:
                    logger.warning(
                        f"Skipping issue with no content: {issue_identifier} - {issue_title}"
                    )
                    skipped_issues.append(f"{issue_identifier} (no content)")
                    documents_skipped += 1
                    continue

                # Create a simple summary
                summary_content = f"Jira Issue {issue_identifier}: {issue_title}\n\nStatus: {formatted_issue.get('status', 'Unknown')}\n\n"
                if formatted_issue.get("description"):
                    summary_content += (
                        f"Description: {formatted_issue.get('description')}\n\n"
                    )

                # Add comment count
                comment_count = len(formatted_issue.get("comments", []))
                summary_content += f"Comments: {comment_count}"

                # Generate content hash
                content_hash = generate_content_hash(issue_content, search_space_id)

                # Check if document already exists
                existing_doc_by_hash_result = await session.execute(
                    select(Document).where(Document.content_hash == content_hash)
                )
                existing_document_by_hash = (
                    existing_doc_by_hash_result.scalars().first()
                )

                if existing_document_by_hash:
                    logger.info(
                        f"Document with content hash {content_hash} already exists for issue {issue_identifier}. Skipping processing."
                    )
                    documents_skipped += 1
                    continue

                # Embedding and chunking are synchronous calls; run them in a
                # worker thread (as index_discord_messages does) so the event
                # loop is not blocked while they compute.
                summary_embedding = await asyncio.to_thread(
                    config.embedding_model_instance.embed, summary_content
                )

                # Process chunks - using the full issue content with comments
                raw_chunks = await asyncio.to_thread(
                    config.chunker_instance.chunk, issue_content
                )
                chunk_texts = [chunk.text for chunk in raw_chunks]
                chunk_embeddings = await asyncio.to_thread(
                    lambda texts: [
                        config.embedding_model_instance.embed(t) for t in texts
                    ],
                    chunk_texts,
                )
                chunks = [
                    Chunk(content=chunk_text, embedding=chunk_embedding)
                    for chunk_text, chunk_embedding in zip(
                        chunk_texts, chunk_embeddings
                    )
                ]

                # Create and store new document
                logger.info(
                    f"Creating new document for issue {issue_identifier} - {issue_title}"
                )
                document = Document(
                    search_space_id=search_space_id,
                    title=f"Jira - {issue_identifier}: {issue_title}",
                    document_type=DocumentType.JIRA_CONNECTOR,
                    document_metadata={
                        "issue_id": issue_id,
                        "issue_identifier": issue_identifier,
                        "issue_title": issue_title,
                        "state": formatted_issue.get("status", "Unknown"),
                        "comment_count": comment_count,
                        "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                    },
                    content=summary_content,
                    content_hash=content_hash,
                    embedding=summary_embedding,
                    chunks=chunks,
                )

                session.add(document)
                documents_indexed += 1
                logger.info(
                    f"Successfully indexed new issue {issue_identifier} - {issue_title}"
                )

            except Exception as e:
                # Report the issue by its "key" (the field read above); the
                # previous lookup of "identifier" never matched anything, so
                # every failure was logged as "Unknown".
                failed_key = (
                    issue.get("key") if isinstance(issue, dict) else None
                ) or "Unknown"
                logger.error(
                    f"Error processing issue {failed_key}: {str(e)}",
                    exc_info=True,
                )
                skipped_issues.append(f"{failed_key} (processing error)")
                documents_skipped += 1
                continue  # Skip this issue and continue with others

        # Update the last_indexed_at timestamp for the connector only if requested
        total_processed = documents_indexed
        if update_last_indexed:
            connector.last_indexed_at = datetime.now()
            logger.info(f"Updated last_indexed_at to {connector.last_indexed_at}")

        # Commit all changes
        await session.commit()
        logger.info("Successfully committed all JIRA document changes to database")

        # Log success
        await task_logger.log_task_success(
            log_entry,
            f"Successfully completed JIRA indexing for connector {connector_id}",
            {
                "issues_processed": total_processed,
                "documents_indexed": documents_indexed,
                "documents_skipped": documents_skipped,
                "skipped_issues_count": len(skipped_issues),
            },
        )

        logger.info(
            f"JIRA indexing completed: {documents_indexed} new issues, {documents_skipped} skipped"
        )
        # Return None as the error message to indicate success
        return total_processed, None

    except SQLAlchemyError as db_error:
        await session.rollback()
        await task_logger.log_task_failure(
            log_entry,
            f"Database error during JIRA indexing for connector {connector_id}",
            str(db_error),
            {"error_type": "SQLAlchemyError"},
        )
        logger.error(f"Database error: {str(db_error)}", exc_info=True)
        return 0, f"Database error: {str(db_error)}"
    except Exception as e:
        await session.rollback()
        await task_logger.log_task_failure(
            log_entry,
            f"Failed to index JIRA issues for connector {connector_id}",
            str(e),
            {"error_type": type(e).__name__},
        )
        logger.error(f"Failed to index JIRA issues: {str(e)}", exc_info=True)
        return 0, f"Failed to index JIRA issues: {str(e)}"


# NOTE(review): unrelated diff context that shares this source line with the
# definition above (header and first hunk of the edit/page.tsx diff), carried over verbatim:
# diff --git a/surfsense_web/app/dashboard/[search_space_id]/connectors/[connector_id]/edit/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/connectors/[connector_id]/edit/page.tsx index 34db58f67..918a625d5 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/connectors/[connector_id]/edit/page.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/connectors/[connector_id]/edit/page.tsx @@ -9,12 +9,12 @@ import { ArrowLeft, Check, Loader2, Github } from "lucide-react"; import { Form } from "@/components/ui/form"; import { Button } from "@/components/ui/button"; import { - Card, - CardContent, - CardDescription, - CardFooter, - CardHeader, - CardTitle, + Card, + CardContent, + CardDescription, + CardFooter, + CardHeader, + CardTitle, } from
"@/components/ui/card"; // Import Utils, Types, Hook, and Components @@ -27,201 +27,227 @@ import { EditSimpleTokenForm } from "@/components/editConnector/EditSimpleTokenF import { getConnectorIcon } from "@/components/chat"; export default function EditConnectorPage() { - const router = useRouter(); - const params = useParams(); - const searchSpaceId = params.search_space_id as string; - // Ensure connectorId is parsed safely - const connectorIdParam = params.connector_id as string; - const connectorId = connectorIdParam ? parseInt(connectorIdParam, 10) : NaN; + const router = useRouter(); + const params = useParams(); + const searchSpaceId = params.search_space_id as string; + // Ensure connectorId is parsed safely + const connectorIdParam = params.connector_id as string; + const connectorId = connectorIdParam ? parseInt(connectorIdParam, 10) : NaN; - // Use the custom hook to manage state and logic - const { - connectorsLoading, - connector, - isSaving, - editForm, - patForm, // Needed for GitHub child component - handleSaveChanges, - // GitHub specific props for the child component - editMode, - setEditMode, // Pass down if needed by GitHub component - originalPat, - currentSelectedRepos, - fetchedRepos, - setFetchedRepos, - newSelectedRepos, - setNewSelectedRepos, - isFetchingRepos, - handleFetchRepositories, - handleRepoSelectionChange, - } = useConnectorEditPage(connectorId, searchSpaceId); + // Use the custom hook to manage state and logic + const { + connectorsLoading, + connector, + isSaving, + editForm, + patForm, // Needed for GitHub child component + handleSaveChanges, + // GitHub specific props for the child component + editMode, + setEditMode, // Pass down if needed by GitHub component + originalPat, + currentSelectedRepos, + fetchedRepos, + setFetchedRepos, + newSelectedRepos, + setNewSelectedRepos, + isFetchingRepos, + handleFetchRepositories, + handleRepoSelectionChange, + } = useConnectorEditPage(connectorId, searchSpaceId); - // Redirect if 
connectorId is not a valid number after parsing - useEffect(() => { - if (isNaN(connectorId)) { - toast.error("Invalid Connector ID."); - router.push(`/dashboard/${searchSpaceId}/connectors`); - } - }, [connectorId, router, searchSpaceId]); + // Redirect if connectorId is not a valid number after parsing + useEffect(() => { + if (isNaN(connectorId)) { + toast.error("Invalid Connector ID."); + router.push(`/dashboard/${searchSpaceId}/connectors`); + } + }, [connectorId, router, searchSpaceId]); - // Loading State - if (connectorsLoading || !connector) { - // Handle NaN case before showing skeleton - if (isNaN(connectorId)) return null; - return ; - } + // Loading State + if (connectorsLoading || !connector) { + // Handle NaN case before showing skeleton + if (isNaN(connectorId)) return null; + return ; + } - // Main Render using data/handlers from the hook - return ( -
- + // Main Render using data/handlers from the hook + return ( +
+ - - - - - {getConnectorIcon(connector.connector_type)} - Edit {getConnectorTypeDisplay(connector.connector_type)} Connector - - - Modify connector name and configuration. - - + + + + + {getConnectorIcon(connector.connector_type)} + Edit {getConnectorTypeDisplay(connector.connector_type)} Connector + + + Modify connector name and configuration. + + -
- {/* Pass hook's handleSaveChanges */} - - - {/* Pass form control from hook */} - + + {/* Pass hook's handleSaveChanges */} + + + {/* Pass form control from hook */} + -
+
-

Configuration

+

Configuration

- {/* == GitHub == */} - {connector.connector_type === "GITHUB_CONNECTOR" && ( - - )} + {/* == GitHub == */} + {connector.connector_type === "GITHUB_CONNECTOR" && ( + + )} - {/* == Slack == */} - {connector.connector_type === "SLACK_CONNECTOR" && ( - - )} - {/* == Notion == */} - {connector.connector_type === "NOTION_CONNECTOR" && ( - - )} - {/* == Serper == */} - {connector.connector_type === "SERPER_API" && ( - - )} - {/* == Tavily == */} - {connector.connector_type === "TAVILY_API" && ( - - )} + {/* == Slack == */} + {connector.connector_type === "SLACK_CONNECTOR" && ( + + )} + {/* == Notion == */} + {connector.connector_type === "NOTION_CONNECTOR" && ( + + )} + {/* == Serper == */} + {connector.connector_type === "SERPER_API" && ( + + )} + {/* == Tavily == */} + {connector.connector_type === "TAVILY_API" && ( + + )} - {/* == Linear == */} - {connector.connector_type === "LINEAR_CONNECTOR" && ( - - )} + {/* == Linear == */} + {connector.connector_type === "LINEAR_CONNECTOR" && ( + + )} - {/* == Linkup == */} - {connector.connector_type === "LINKUP_API" && ( - - )} + {/* == Jira == */} + {connector.connector_type === "JIRA_CONNECTOR" && ( +
+ + + +
+ )} - {/* == Discord == */} - {connector.connector_type === "DISCORD_CONNECTOR" && ( - - )} + {/* == Linkup == */} + {connector.connector_type === "LINKUP_API" && ( + + )} -
- - - - - -
-
-
- ); + {/* == Discord == */} + {connector.connector_type === "DISCORD_CONNECTOR" && ( + + )} + + + + + + + + +
+ ); } diff --git a/surfsense_web/app/dashboard/[search_space_id]/connectors/[connector_id]/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/connectors/[connector_id]/page.tsx index 898644429..9ed3f94b9 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/connectors/[connector_id]/page.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/connectors/[connector_id]/page.tsx @@ -9,7 +9,10 @@ import * as z from "zod"; import { toast } from "sonner"; import { ArrowLeft, Check, Info, Loader2 } from "lucide-react"; -import { useSearchSourceConnectors, SearchSourceConnector } from "@/hooks/useSearchSourceConnectors"; +import { + useSearchSourceConnectors, + SearchSourceConnector, +} from "@/hooks/useSearchSourceConnectors"; import { Form, FormControl, @@ -28,11 +31,7 @@ import { CardHeader, CardTitle, } from "@/components/ui/card"; -import { - Alert, - AlertDescription, - AlertTitle, -} from "@/components/ui/alert"; +import { Alert, AlertDescription, AlertTitle } from "@/components/ui/alert"; // Define the form schema with Zod const apiConnectorFormSchema = z.object({ @@ -47,13 +46,15 @@ const apiConnectorFormSchema = z.object({ // Helper function to get connector type display name const getConnectorTypeDisplay = (type: string): string => { const typeMap: Record = { - "SERPER_API": "Serper API", - "TAVILY_API": "Tavily API", - "SLACK_CONNECTOR": "Slack Connector", - "NOTION_CONNECTOR": "Notion Connector", - "GITHUB_CONNECTOR": "GitHub Connector", - "DISCORD_CONNECTOR": "Discord Connector", - "LINKUP_API": "Linkup", + SERPER_API: "Serper API", + TAVILY_API: "Tavily API", + SLACK_CONNECTOR: "Slack Connector", + NOTION_CONNECTOR: "Notion Connector", + GITHUB_CONNECTOR: "GitHub Connector", + LINEAR_CONNECTOR: "Linear Connector", + JIRA_CONNECTOR: "Jira Connector", + DISCORD_CONNECTOR: "Discord Connector", + LINKUP_API: "Linkup", // Add other connector types here as needed }; return typeMap[type] || type; @@ -67,9 +68,11 @@ export default function 
EditConnectorPage() { const params = useParams(); const searchSpaceId = params.search_space_id as string; const connectorId = parseInt(params.connector_id as string, 10); - + const { connectors, updateConnector } = useSearchSourceConnectors(); - const [connector, setConnector] = useState(null); + const [connector, setConnector] = useState( + null, + ); const [isLoading, setIsLoading] = useState(true); const [isSubmitting, setIsSubmitting] = useState(false); // console.log("connector", connector); @@ -85,24 +88,24 @@ export default function EditConnectorPage() { // Get API key field name based on connector type const getApiKeyFieldName = (connectorType: string): string => { const fieldMap: Record = { - "SERPER_API": "SERPER_API_KEY", - "TAVILY_API": "TAVILY_API_KEY", - "SLACK_CONNECTOR": "SLACK_BOT_TOKEN", - "NOTION_CONNECTOR": "NOTION_INTEGRATION_TOKEN", - "GITHUB_CONNECTOR": "GITHUB_PAT", - "DISCORD_CONNECTOR": "DISCORD_BOT_TOKEN", - "LINKUP_API": "LINKUP_API_KEY" + SERPER_API: "SERPER_API_KEY", + TAVILY_API: "TAVILY_API_KEY", + SLACK_CONNECTOR: "SLACK_BOT_TOKEN", + NOTION_CONNECTOR: "NOTION_INTEGRATION_TOKEN", + GITHUB_CONNECTOR: "GITHUB_PAT", + DISCORD_CONNECTOR: "DISCORD_BOT_TOKEN", + LINKUP_API: "LINKUP_API_KEY", }; return fieldMap[connectorType] || ""; }; // Find connector in the list useEffect(() => { - const currentConnector = connectors.find(c => c.id === connectorId); - + const currentConnector = connectors.find((c) => c.id === connectorId); + if (currentConnector) { setConnector(currentConnector); - + // Check if connector type is supported const apiKeyField = getApiKeyFieldName(currentConnector.connector_type); if (apiKeyField) { @@ -115,7 +118,7 @@ export default function EditConnectorPage() { toast.error("This connector type is not supported for editing"); router.push(`/dashboard/${searchSpaceId}/connectors`); } - + setIsLoading(false); } else if (!isLoading && connectors.length > 0) { // If connectors are loaded but this one isn't found @@ -127,11 
+130,11 @@ export default function EditConnectorPage() { // Handle form submission const onSubmit = async (values: ApiConnectorFormValues) => { if (!connector) return; - + setIsSubmitting(true); try { const apiKeyField = getApiKeyFieldName(connector.connector_type); - + // Only update the API key if a new one was provided const updatedConfig = { ...connector.config }; if (values.api_key) { @@ -150,7 +153,9 @@ export default function EditConnectorPage() { router.push(`/dashboard/${searchSpaceId}/connectors`); } catch (error) { console.error("Error updating connector:", error); - toast.error(error instanceof Error ? error.message : "Failed to update connector"); + toast.error( + error instanceof Error ? error.message : "Failed to update connector", + ); } finally { setIsSubmitting(false); } @@ -186,24 +191,30 @@ export default function EditConnectorPage() { - Edit {connector ? getConnectorTypeDisplay(connector.connector_type) : ""} Connector + Edit{" "} + {connector + ? getConnectorTypeDisplay(connector.connector_type) + : ""}{" "} + Connector - - Update your connector settings. - + Update your connector settings. API Key Security - Your API key is stored securely. For security reasons, we don't display your existing API key. - If you don't update the API key field, your existing key will be preserved. + Your API key is stored securely. For security reasons, we don't + display your existing API key. If you don't update the API key + field, your existing key will be preserved.
- + ( - {connector?.connector_type === "SLACK_CONNECTOR" - ? "Slack Bot Token" - : connector?.connector_type === "NOTION_CONNECTOR" - ? "Notion Integration Token" + {connector?.connector_type === "SLACK_CONNECTOR" + ? "Slack Bot Token" + : connector?.connector_type === "NOTION_CONNECTOR" + ? "Notion Integration Token" : connector?.connector_type === "GITHUB_CONNECTOR" ? "GitHub Personal Access Token (PAT)" : connector?.connector_type === "LINKUP_API" @@ -238,27 +249,28 @@ export default function EditConnectorPage() { : "API Key"} - - {connector?.connector_type === "SLACK_CONNECTOR" - ? "Enter a new Slack Bot Token or leave blank to keep your existing token." - : connector?.connector_type === "NOTION_CONNECTOR" - ? "Enter a new Notion Integration Token or leave blank to keep your existing token." + {connector?.connector_type === "SLACK_CONNECTOR" + ? "Enter a new Slack Bot Token or leave blank to keep your existing token." + : connector?.connector_type === "NOTION_CONNECTOR" + ? "Enter a new Notion Integration Token or leave blank to keep your existing token." : connector?.connector_type === "GITHUB_CONNECTOR" ? "Enter a new GitHub PAT or leave blank to keep your existing token." : connector?.connector_type === "LINKUP_API" @@ -271,8 +283,8 @@ export default function EditConnectorPage() { />
-
); -} +} diff --git a/surfsense_web/app/dashboard/[search_space_id]/connectors/add/jira-connector/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/connectors/add/jira-connector/page.tsx new file mode 100644 index 000000000..23e128f1f --- /dev/null +++ b/surfsense_web/app/dashboard/[search_space_id]/connectors/add/jira-connector/page.tsx @@ -0,0 +1,472 @@ +"use client"; + +import { useState } from "react"; +import { useRouter, useParams } from "next/navigation"; +import { motion } from "framer-motion"; +import { zodResolver } from "@hookform/resolvers/zod"; +import { useForm } from "react-hook-form"; +import * as z from "zod"; +import { toast } from "sonner"; +import { ArrowLeft, Check, Info, Loader2 } from "lucide-react"; + +import { useSearchSourceConnectors } from "@/hooks/useSearchSourceConnectors"; +import { + Form, + FormControl, + FormDescription, + FormField, + FormItem, + FormLabel, + FormMessage, +} from "@/components/ui/form"; +import { Input } from "@/components/ui/input"; +import { Button } from "@/components/ui/button"; +import { + Card, + CardContent, + CardDescription, + CardFooter, + CardHeader, + CardTitle, +} from "@/components/ui/card"; +import { Alert, AlertDescription, AlertTitle } from "@/components/ui/alert"; +import { + Accordion, + AccordionContent, + AccordionItem, + AccordionTrigger, +} from "@/components/ui/accordion"; +import { Tabs, TabsContent, TabsList, TabsTrigger } from "@/components/ui/tabs"; + +// Define the form schema with Zod +const jiraConnectorFormSchema = z.object({ + name: z.string().min(3, { + message: "Connector name must be at least 3 characters.", + }), + base_url: z + .string() + .url({ + message: + "Please enter a valid Jira URL (e.g., https://yourcompany.atlassian.net)", + }) + .refine( + (url) => { + return url.includes("atlassian.net") || url.includes("jira"); + }, + { + message: "Please enter a valid Jira instance URL", + }, + ), + email: z.string().email({ + message: "Please enter a valid email 
address.", + }), + api_token: z.string().min(10, { + message: "Jira API Token is required and must be valid.", + }), +}); + +// Define the type for the form values +type JiraConnectorFormValues = z.infer; + +export default function JiraConnectorPage() { + const router = useRouter(); + const params = useParams(); + const searchSpaceId = params.search_space_id as string; + const [isSubmitting, setIsSubmitting] = useState(false); + const { createConnector } = useSearchSourceConnectors(); + + // Initialize the form + const form = useForm({ + resolver: zodResolver(jiraConnectorFormSchema), + defaultValues: { + name: "Jira Connector", + base_url: "", + email: "", + api_token: "", + }, + }); + + // Handle form submission + const onSubmit = async (values: JiraConnectorFormValues) => { + setIsSubmitting(true); + try { + await createConnector({ + name: values.name, + connector_type: "JIRA_CONNECTOR", + config: { + JIRA_BASE_URL: values.base_url, + JIRA_EMAIL: values.email, + JIRA_API_TOKEN: values.api_token, + }, + is_indexable: true, + last_indexed_at: null, + }); + + toast.success("Jira connector created successfully!"); + + // Navigate back to connectors page + router.push(`/dashboard/${searchSpaceId}/connectors`); + } catch (error) { + console.error("Error creating connector:", error); + toast.error( + error instanceof Error ? error.message : "Failed to create connector", + ); + } finally { + setIsSubmitting(false); + } + }; + + return ( +
+ + + + + + Connect + Documentation + + + + + + + Connect Jira Instance + + + Integrate with Jira to search and retrieve information from + your issues, tickets, and comments. This connector can index + your Jira content for search. + + + + + + Jira Personal Access Token Required + + You'll need a Jira Personal Access Token to use this + connector. You can create one from{" "} + + Atlassian Account Settings + + + + + + + ( + + Connector Name + + + + + A friendly name to identify this connector. + + + + )} + /> + + ( + + Jira Instance URL + + + + + Your Jira instance URL. For Atlassian Cloud, this is + typically https://yourcompany.atlassian.net + + + + )} + /> + + ( + + Email Address + + + + + Your Atlassian account email address. + + + + )} + /> + + ( + + API Token + + + + + Your Jira API Token will be encrypted and stored securely. + + + + )} + /> + +
+ +
+ + +
+ +

+ What you get with Jira integration: +

+
    +
  • Search through all your Jira issues and tickets
  • +
  • + Access issue descriptions, comments, and full discussion + threads +
  • +
  • + Connect your team's project management directly to your + search space +
  • +
  • + Keep your search results up-to-date with latest Jira content +
  • +
  • + Index your Jira issues for enhanced search capabilities +
  • +
  • + Search by issue keys, status, priority, and assignee + information +
  • +
+
+
+
+ + + + + + Jira Connector Documentation + + + Learn how to set up and use the Jira connector to index your + project management data. + + + +
+

How it works

+

+ The Jira connector uses the Jira REST API with Basic Authentication + to fetch all issues and comments that your account has + access to within your Jira instance. +

+
    +
  • + For follow up indexing runs, the connector retrieves + issues and comments that have been updated since the last + indexing attempt. +
  • +
  • + Indexing is configured to run periodically, so updates + should appear in your search results within minutes. +
  • +
+
+ + + + + Authorization + + + + + Read-Only Access is Sufficient + + You only need read access for this connector to work. + The API Token will only be used to read your Jira data. + + + +
+
+

+ Step 1: Create an API Token +

+
    +
  1. Log in to your Atlassian account
  2. +
  3. + Navigate to{" "} + + https://id.atlassian.com/manage-profile/security/api-tokens + +
  4. +
  5. + Click Create API token +
  6. +
  7. + Enter a label for your token (like "SurfSense + Connector") +
  8. +
  9. + Click Create +
  10. +
  11. + Copy the generated token as it will only be shown + once +
  12. +
+
+ +
+

+ Step 2: Grant necessary access +

+

+ The API Token will have access to all projects and + issues that your user account can see. Make sure your + account has appropriate permissions for the projects + you want to index. +

+ + + Data Privacy + + Only issues, comments, and basic metadata will be + indexed. Jira attachments and linked files are not + indexed by this connector. + + +
+
+
+
+ + + + Indexing + + +
    +
  1. + Navigate to the Connector Dashboard and select the{" "} + Jira Connector. +
  2. +
  3. + Enter your Jira Instance URL (e.g., + https://yourcompany.atlassian.net) +
  4. +
  5. + Place your Personal Access Token in + the form field. +
  6. +
  7. + Click Connect to establish the + connection. +
  8. +
  9. + Once connected, your Jira issues will be indexed + automatically. +
  10. +
+ + + + What Gets Indexed + +

+ The Jira connector indexes the following data: +

+
    +
  • Issue keys and summaries (e.g., PROJ-123)
  • +
  • Issue descriptions
  • +
  • Issue comments and discussion threads
  • +
  • + Issue status, priority, and type information +
  • +
  • Assignee and reporter information
  • +
  • Project information
  • +
+
+
+
+
+
+
+
+
+
+
+
+ ); +} diff --git a/surfsense_web/app/dashboard/[search_space_id]/connectors/add/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/connectors/add/page.tsx index afcc0af00..3d0e59d9b 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/connectors/add/page.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/connectors/add/page.tsx @@ -1,8 +1,17 @@ "use client"; import { Badge } from "@/components/ui/badge"; import { Button } from "@/components/ui/button"; -import { Card, CardContent, CardFooter, CardHeader } from "@/components/ui/card"; -import { Collapsible, CollapsibleContent, CollapsibleTrigger } from "@/components/ui/collapsible"; +import { + Card, + CardContent, + CardFooter, + CardHeader, +} from "@/components/ui/card"; +import { + Collapsible, + CollapsibleContent, + CollapsibleTrigger, +} from "@/components/ui/collapsible"; import { IconBrandDiscord, IconBrandGithub, @@ -67,23 +76,26 @@ const connectorCategories: ConnectorCategory[] = [ { id: "slack-connector", title: "Slack", - description: "Connect to your Slack workspace to access messages and channels.", + description: + "Connect to your Slack workspace to access messages and channels.", icon: , status: "available", }, { id: "ms-teams", title: "Microsoft Teams", - description: "Connect to Microsoft Teams to access your team's conversations.", + description: + "Connect to Microsoft Teams to access your team's conversations.", icon: , status: "coming-soon", }, { id: "discord-connector", title: "Discord", - description: "Connect to Discord servers to access messages and channels.", + description: + "Connect to Discord servers to access messages and channels.", icon: , - status: "available" + status: "available", }, ], }, @@ -94,16 +106,18 @@ const connectorCategories: ConnectorCategory[] = [ { id: "linear-connector", title: "Linear", - description: "Connect to Linear to search issues, comments and project data.", + description: + "Connect to Linear to search issues, comments and project 
data.", icon: , status: "available", }, { id: "jira-connector", title: "Jira", - description: "Connect to Jira to search issues, tickets and project data.", + description: + "Connect to Jira to search issues, tickets and project data.", icon: , - status: "coming-soon", + status: "available", }, ], }, @@ -114,14 +128,16 @@ const connectorCategories: ConnectorCategory[] = [ { id: "notion-connector", title: "Notion", - description: "Connect to your Notion workspace to access pages and databases.", + description: + "Connect to your Notion workspace to access pages and databases.", icon: , status: "available", }, { id: "github-connector", title: "GitHub", - description: "Connect a GitHub PAT to index code and docs from accessible repositories.", + description: + "Connect a GitHub PAT to index code and docs from accessible repositories.", icon: , status: "available", }, @@ -141,7 +157,8 @@ const connectorCategories: ConnectorCategory[] = [ { id: "zoom", title: "Zoom", - description: "Connect to Zoom to access meeting recordings and transcripts.", + description: + "Connect to Zoom to access meeting recordings and transcripts.", icon: , status: "coming-soon", }, @@ -152,7 +169,7 @@ const connectorCategories: ConnectorCategory[] = [ // Animation variants const fadeIn = { hidden: { opacity: 0 }, - visible: { opacity: 1, transition: { duration: 0.4 } } + visible: { opacity: 1, transition: { duration: 0.4 } }, }; const staggerContainer = { @@ -160,43 +177,49 @@ const staggerContainer = { visible: { opacity: 1, transition: { - staggerChildren: 0.1 - } - } + staggerChildren: 0.1, + }, + }, }; const cardVariants = { hidden: { opacity: 0, y: 20 }, - visible: { - opacity: 1, + visible: { + opacity: 1, y: 0, - transition: { + transition: { type: "spring", stiffness: 260, - damping: 20 - } + damping: 20, + }, }, - hover: { + hover: { scale: 1.02, - boxShadow: "0 10px 15px -3px rgba(0, 0, 0, 0.1), 0 4px 6px -2px rgba(0, 0, 0, 0.05)", - transition: { + boxShadow: + "0 10px 15px -3px 
rgba(0, 0, 0, 0.1), 0 4px 6px -2px rgba(0, 0, 0, 0.05)", + transition: { type: "spring", stiffness: 400, - damping: 10 - } - } + damping: 10, + }, + }, }; export default function ConnectorsPage() { const params = useParams(); const searchSpaceId = params.search_space_id as string; - const [expandedCategories, setExpandedCategories] = useState(["search-engines", "knowledge-bases", "project-management", "team-chats"]); + const [expandedCategories, setExpandedCategories] = useState([ + "search-engines", + "knowledge-bases", + "project-management", + "team-chats", + ]); const toggleCategory = (categoryId: string) => { - setExpandedCategories(prev => - prev.includes(categoryId) - ? prev.filter(id => id !== categoryId) - : [...prev, categoryId] + setExpandedCategories((prev) => + prev.includes(categoryId) + ? prev.filter((id) => id !== categoryId) + : [...prev, categoryId], ); }; @@ -205,9 +228,9 @@ export default function ConnectorsPage() { @@ -215,18 +238,19 @@ export default function ConnectorsPage() { Connect Your Tools

- Integrate with your favorite services to enhance your research capabilities. + Integrate with your favorite services to enhance your research + capabilities.

- {connectorCategories.map((category) => ( -

{category.title}

- - + -
-

{connector.title}

+

+ {connector.title} +

{connector.status === "coming-soon" && ( - + Coming soon )} {connector.status === "connected" && ( - + Connected )}
- +

{connector.description}

- + - {connector.status === 'available' && ( - - )} - {connector.status === 'coming-soon' && ( - )} - {connector.status === 'connected' && ( - )} diff --git a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/page.tsx index 7d9aa3cee..1b66684db 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/page.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/page.tsx @@ -3,88 +3,111 @@ import { DocumentViewer } from "@/components/document-viewer"; import { JsonMetadataViewer } from "@/components/json-metadata-viewer"; import { - AlertDialog, - AlertDialogAction, - AlertDialogCancel, - AlertDialogContent, - AlertDialogDescription, - AlertDialogFooter, - AlertDialogHeader, - AlertDialogTitle, - AlertDialogTrigger, + AlertDialog, + AlertDialogAction, + AlertDialogCancel, + AlertDialogContent, + AlertDialogDescription, + AlertDialogFooter, + AlertDialogHeader, + AlertDialogTitle, + AlertDialogTrigger, } from "@/components/ui/alert-dialog"; import { Button } from "@/components/ui/button"; import { Checkbox } from "@/components/ui/checkbox"; import { - DropdownMenu, - DropdownMenuCheckboxItem, - DropdownMenuContent, - DropdownMenuItem, - DropdownMenuLabel, - DropdownMenuSeparator, - DropdownMenuTrigger, + DropdownMenu, + DropdownMenuCheckboxItem, + DropdownMenuContent, + DropdownMenuItem, + DropdownMenuLabel, + DropdownMenuSeparator, + DropdownMenuTrigger, } from "@/components/ui/dropdown-menu"; import { Input } from "@/components/ui/input"; import { Label } from "@/components/ui/label"; -import { Pagination, PaginationContent, PaginationItem } from "@/components/ui/pagination"; -import { Popover, PopoverContent, PopoverTrigger } from "@/components/ui/popover"; import { - Select, - SelectContent, - SelectItem, - SelectTrigger, - SelectValue, + Pagination, + PaginationContent, + PaginationItem, +} from "@/components/ui/pagination"; +import { 
+ Popover, + PopoverContent, + PopoverTrigger, +} from "@/components/ui/popover"; +import { + Select, + SelectContent, + SelectItem, + SelectTrigger, + SelectValue, } from "@/components/ui/select"; import { - Table, - TableBody, - TableCell, - TableHead, - TableHeader, - TableRow, + Table, + TableBody, + TableCell, + TableHead, + TableHeader, + TableRow, } from "@/components/ui/table"; import { useDocuments } from "@/hooks/use-documents"; import { cn } from "@/lib/utils"; -import { IconBrandDiscord, IconBrandGithub, IconBrandNotion, IconBrandSlack, IconBrandYoutube, IconLayoutKanban } from "@tabler/icons-react"; import { - ColumnDef, - ColumnFiltersState, - FilterFn, - PaginationState, - Row, - SortingState, - VisibilityState, - flexRender, - getCoreRowModel, - getFacetedUniqueValues, - getFilteredRowModel, - getPaginationRowModel, - getSortedRowModel, - useReactTable, + IconBrandDiscord, + IconBrandGithub, + IconBrandNotion, + IconBrandSlack, + IconBrandYoutube, + IconLayoutKanban, + IconTicket, +} from "@tabler/icons-react"; +import { + ColumnDef, + ColumnFiltersState, + FilterFn, + PaginationState, + Row, + SortingState, + VisibilityState, + flexRender, + getCoreRowModel, + getFacetedUniqueValues, + getFilteredRowModel, + getPaginationRowModel, + getSortedRowModel, + useReactTable, } from "@tanstack/react-table"; import { AnimatePresence, motion } from "framer-motion"; import { - AlertCircle, - ChevronDown, - ChevronFirst, - ChevronLast, - ChevronLeft, - ChevronRight, - ChevronUp, - CircleAlert, - CircleX, - Columns3, - File, - FileX, - Filter, - Globe, - ListFilter, - MoreHorizontal, - Trash, - Webhook + AlertCircle, + ChevronDown, + ChevronFirst, + ChevronLast, + ChevronLeft, + ChevronRight, + ChevronUp, + CircleAlert, + CircleX, + Columns3, + File, + FileX, + Filter, + Globe, + ListFilter, + MoreHorizontal, + Trash, + Webhook, } from "lucide-react"; import { useParams } from "next/navigation"; -import React, { useContext, useEffect, useId, useMemo, useRef, 
useState } from "react"; +import React, { + useContext, + useEffect, + useId, + useMemo, + useRef, + useState, +} from "react"; import ReactMarkdown from "react-markdown"; import rehypeRaw from "rehype-raw"; import rehypeSanitize from "rehype-sanitize"; @@ -93,938 +116,1065 @@ import { toast } from "sonner"; // Define animation variants for reuse const fadeInScale = { - hidden: { opacity: 0, scale: 0.95 }, - visible: { - opacity: 1, - scale: 1, - transition: { type: "spring", stiffness: 300, damping: 30 } - }, - exit: { - opacity: 0, - scale: 0.95, - transition: { duration: 0.15 } - } + hidden: { opacity: 0, scale: 0.95 }, + visible: { + opacity: 1, + scale: 1, + transition: { type: "spring", stiffness: 300, damping: 30 }, + }, + exit: { + opacity: 0, + scale: 0.95, + transition: { duration: 0.15 }, + }, }; type Document = { - id: number; - title: string; - document_type: "EXTENSION" | "CRAWLED_URL" | "SLACK_CONNECTOR" | "NOTION_CONNECTOR" | "FILE" | "YOUTUBE_VIDEO" | "LINEAR_CONNECTOR" | "DISCORD_CONNECTOR"; - document_metadata: any; - content: string; - created_at: string; - search_space_id: number; + id: number; + title: string; + document_type: + | "EXTENSION" + | "CRAWLED_URL" + | "SLACK_CONNECTOR" + | "NOTION_CONNECTOR" + | "FILE" + | "YOUTUBE_VIDEO" + | "LINEAR_CONNECTOR" + | "DISCORD_CONNECTOR"; + document_metadata: any; + content: string; + created_at: string; + search_space_id: number; }; // Custom filter function for multi-column searching -const multiColumnFilterFn: FilterFn = (row, columnId, filterValue) => { - const searchableRowContent = `${row.original.title}`.toLowerCase(); - const searchTerm = (filterValue ?? "").toLowerCase(); - return searchableRowContent.includes(searchTerm); +const multiColumnFilterFn: FilterFn = ( + row, + columnId, + filterValue, +) => { + const searchableRowContent = `${row.original.title}`.toLowerCase(); + const searchTerm = (filterValue ?? 
"").toLowerCase(); + return searchableRowContent.includes(searchTerm); }; -const statusFilterFn: FilterFn = (row, columnId, filterValue: string[]) => { - if (!filterValue?.length) return true; - const status = row.getValue(columnId) as string; - return filterValue.includes(status); +const statusFilterFn: FilterFn = ( + row, + columnId, + filterValue: string[], +) => { + if (!filterValue?.length) return true; + const status = row.getValue(columnId) as string; + return filterValue.includes(status); }; // Add document type icons mapping const documentTypeIcons = { - EXTENSION: Webhook, - CRAWLED_URL: Globe, - SLACK_CONNECTOR: IconBrandSlack, - NOTION_CONNECTOR: IconBrandNotion, - FILE: File, - YOUTUBE_VIDEO: IconBrandYoutube, - GITHUB_CONNECTOR: IconBrandGithub, - LINEAR_CONNECTOR: IconLayoutKanban, - DISCORD_CONNECTOR: IconBrandDiscord, + EXTENSION: Webhook, + CRAWLED_URL: Globe, + SLACK_CONNECTOR: IconBrandSlack, + NOTION_CONNECTOR: IconBrandNotion, + FILE: File, + YOUTUBE_VIDEO: IconBrandYoutube, + GITHUB_CONNECTOR: IconBrandGithub, + LINEAR_CONNECTOR: IconLayoutKanban, + JIRA_CONNECTOR: IconTicket, + DISCORD_CONNECTOR: IconBrandDiscord, } as const; const columns: ColumnDef[] = [ - { - id: "select", - header: ({ table }) => ( - table.toggleAllPageRowsSelected(!!value)} - aria-label="Select all" - /> - ), - cell: ({ row }) => ( - row.toggleSelected(!!value)} - aria-label="Select row" - /> - ), - size: 28, - enableSorting: false, - enableHiding: false, + { + id: "select", + header: ({ table }) => ( + table.toggleAllPageRowsSelected(!!value)} + aria-label="Select all" + /> + ), + cell: ({ row }) => ( + row.toggleSelected(!!value)} + aria-label="Select row" + /> + ), + size: 28, + enableSorting: false, + enableHiding: false, + }, + { + header: "Title", + accessorKey: "title", + cell: ({ row }) => { + const Icon = documentTypeIcons[row.original.document_type]; + return ( + + + {row.getValue("title")} + + ); }, - { - header: "Title", - accessorKey: "title", - cell: ({ 
row }) => { - const Icon = documentTypeIcons[row.original.document_type]; - return ( - - - {row.getValue("title")} - - ); - }, - size: 250, + size: 250, + }, + { + header: "Type", + accessorKey: "document_type", + cell: ({ row }) => { + const type = row.getValue( + "document_type", + ) as keyof typeof documentTypeIcons; + const Icon = documentTypeIcons[type]; + return ( +
+
+ +
+ + {type + .split("_") + .map((word) => word.charAt(0) + word.slice(1).toLowerCase()) + .join(" ")} + +
+ ); }, - { - header: "Type", - accessorKey: "document_type", - cell: ({ row }) => { - const type = row.getValue("document_type") as keyof typeof documentTypeIcons; - const Icon = documentTypeIcons[type]; - return ( -
-
- -
- - {type.split('_').map(word => word.charAt(0) + word.slice(1).toLowerCase()).join(' ')} - -
- ); - }, - size: 180, + size: 180, + }, + { + header: "Content Summary", + accessorKey: "content", + cell: ({ row }) => { + const content = row.getValue("content") as string; + const title = row.getValue("title") as string; + + // Create a truncated preview (first 150 characters) + const previewContent = + content.length > 150 ? content.substring(0, 150) + "..." : content; + + return ( + + ); }, - { - header: "Content Summary", - accessorKey: "content", - cell: ({ row }) => { - const content = row.getValue("content") as string; - const title = row.getValue("title") as string; - - // Create a truncated preview (first 150 characters) - const previewContent = content.length > 150 - ? content.substring(0, 150) + "..." - : content; - - return ( - - ); - }, - size: 300, - }, - { - header: "Created At", - accessorKey: "created_at", - cell: ({ row }) => { - const date = new Date(row.getValue("created_at")); - return date.toLocaleDateString(); - }, - size: 120, - }, - { - id: "actions", - header: () => Actions, - cell: ({ row }) => , - size: 60, - enableHiding: false, + size: 300, + }, + { + header: "Created At", + accessorKey: "created_at", + cell: ({ row }) => { + const date = new Date(row.getValue("created_at")); + return date.toLocaleDateString(); }, + size: 120, + }, + { + id: "actions", + header: () => Actions, + cell: ({ row }) => , + size: 60, + enableHiding: false, + }, ]; // Create a context to share the deleteDocument function const DocumentsContext = React.createContext<{ - deleteDocument: (id: number) => Promise; - refreshDocuments: () => Promise; + deleteDocument: (id: number) => Promise; + refreshDocuments: () => Promise; } | null>(null); export default function DocumentsTable() { - const id = useId(); - const params = useParams(); - const searchSpaceId = Number(params.search_space_id); - const { documents, loading, error, refreshDocuments, deleteDocument } = useDocuments(searchSpaceId); - - // console.log("Search Space ID:", searchSpaceId); - // 
console.log("Documents loaded:", documents?.length); - - useEffect(() => { - console.log("Delete document function available:", !!deleteDocument); - }, [deleteDocument]); - - const [columnFilters, setColumnFilters] = useState([]); - const [columnVisibility, setColumnVisibility] = useState({}); - const [pagination, setPagination] = useState({ - pageIndex: 0, - pageSize: 10, - }); - const inputRef = useRef(null); + const id = useId(); + const params = useParams(); + const searchSpaceId = Number(params.search_space_id); + const { documents, loading, error, refreshDocuments, deleteDocument } = + useDocuments(searchSpaceId); - const [sorting, setSorting] = useState([ - { - id: "title", - desc: false, - }, - ]); + // console.log("Search Space ID:", searchSpaceId); + // console.log("Documents loaded:", documents?.length); - const [data, setData] = useState([]); - - useEffect(() => { - if (documents) { - setData(documents); - } - }, [documents]); + useEffect(() => { + console.log("Delete document function available:", !!deleteDocument); + }, [deleteDocument]); - const handleDeleteRows = async () => { - const selectedRows = table.getSelectedRowModel().rows; - // console.log("Deleting selected rows:", selectedRows.length); - - if (selectedRows.length === 0) { - toast.error("No rows selected"); - return; - } - - // Create an array of promises for each delete operation - const deletePromises = selectedRows.map(row => { - // console.log("Deleting row with ID:", row.original.id); - return deleteDocument(row.original.id); - }); - - try { - // Execute all delete operations - const results = await Promise.all(deletePromises); - // console.log("Delete results:", results); - - // Check if all deletions were successful - const allSuccessful = results.every(result => result === true); - - if (allSuccessful) { - toast.success(`Successfully deleted ${selectedRows.length} document(s)`); - } else { - toast.error("Some documents could not be deleted"); - } - - // Refresh the documents list 
after all deletions - await refreshDocuments(); - table.resetRowSelection(); - } catch (error: any) { - console.error("Error deleting documents:", error); - toast.error("Error deleting documents"); - } - }; + const [columnFilters, setColumnFilters] = useState([]); + const [columnVisibility, setColumnVisibility] = useState({}); + const [pagination, setPagination] = useState({ + pageIndex: 0, + pageSize: 10, + }); + const inputRef = useRef(null); - const table = useReactTable({ - data, - columns, - getCoreRowModel: getCoreRowModel(), - getSortedRowModel: getSortedRowModel(), - onSortingChange: setSorting, - enableSortingRemoval: false, - getPaginationRowModel: getPaginationRowModel(), - onPaginationChange: setPagination, - onColumnFiltersChange: setColumnFilters, - onColumnVisibilityChange: setColumnVisibility, - getFilteredRowModel: getFilteredRowModel(), - getFacetedUniqueValues: getFacetedUniqueValues(), - state: { - sorting, - pagination, - columnFilters, - columnVisibility, - }, + const [sorting, setSorting] = useState([ + { + id: "title", + desc: false, + }, + ]); + + const [data, setData] = useState([]); + + useEffect(() => { + if (documents) { + setData(documents); + } + }, [documents]); + + const handleDeleteRows = async () => { + const selectedRows = table.getSelectedRowModel().rows; + // console.log("Deleting selected rows:", selectedRows.length); + + if (selectedRows.length === 0) { + toast.error("No rows selected"); + return; + } + + // Create an array of promises for each delete operation + const deletePromises = selectedRows.map((row) => { + // console.log("Deleting row with ID:", row.original.id); + return deleteDocument(row.original.id); }); - // Get unique status values - const uniqueStatusValues = useMemo(() => { - const statusColumn = table.getColumn("document_type"); + try { + // Execute all delete operations + const results = await Promise.all(deletePromises); + // console.log("Delete results:", results); - if (!statusColumn) return []; + // 
Check if all deletions were successful + const allSuccessful = results.every((result) => result === true); - const values = Array.from(statusColumn.getFacetedUniqueValues().keys()); + if (allSuccessful) { + toast.success( + `Successfully deleted ${selectedRows.length} document(s)`, + ); + } else { + toast.error("Some documents could not be deleted"); + } - return values.sort(); - }, [table.getColumn("document_type")?.getFacetedUniqueValues()]); + // Refresh the documents list after all deletions + await refreshDocuments(); + table.resetRowSelection(); + } catch (error: any) { + console.error("Error deleting documents:", error); + toast.error("Error deleting documents"); + } + }; - // Get counts for each status - const statusCounts = useMemo(() => { - const statusColumn = table.getColumn("document_type"); - if (!statusColumn) return new Map(); - return statusColumn.getFacetedUniqueValues(); - }, [table.getColumn("document_type")?.getFacetedUniqueValues()]); + const table = useReactTable({ + data, + columns, + getCoreRowModel: getCoreRowModel(), + getSortedRowModel: getSortedRowModel(), + onSortingChange: setSorting, + enableSortingRemoval: false, + getPaginationRowModel: getPaginationRowModel(), + onPaginationChange: setPagination, + onColumnFiltersChange: setColumnFilters, + onColumnVisibilityChange: setColumnVisibility, + getFilteredRowModel: getFilteredRowModel(), + getFacetedUniqueValues: getFacetedUniqueValues(), + state: { + sorting, + pagination, + columnFilters, + columnVisibility, + }, + }); - const selectedStatuses = useMemo(() => { - const filterValue = table.getColumn("document_type")?.getFilterValue() as string[]; - return filterValue ?? 
[]; - }, [table.getColumn("document_type")?.getFilterValue()]); + // Get unique status values + const uniqueStatusValues = useMemo(() => { + const statusColumn = table.getColumn("document_type"); - const handleStatusChange = (checked: boolean, value: string) => { - const filterValue = table.getColumn("document_type")?.getFilterValue() as string[]; - const newFilterValue = filterValue ? [...filterValue] : []; + if (!statusColumn) return []; - if (checked) { - newFilterValue.push(value); - } else { - const index = newFilterValue.indexOf(value); - if (index > -1) { - newFilterValue.splice(index, 1); - } - } + const values = Array.from(statusColumn.getFacetedUniqueValues().keys()); - table.getColumn("document_type")?.setFilterValue(newFilterValue.length ? newFilterValue : undefined); - }; + return values.sort(); + }, [table.getColumn("document_type")?.getFacetedUniqueValues()]); - return ( - Promise.resolve(false)), - refreshDocuments: refreshDocuments || (() => Promise.resolve()) - }}> + // Get counts for each status + const statusCounts = useMemo(() => { + const statusColumn = table.getColumn("document_type"); + if (!statusColumn) return new Map(); + return statusColumn.getFacetedUniqueValues(); + }, [table.getColumn("document_type")?.getFacetedUniqueValues()]); + + const selectedStatuses = useMemo(() => { + const filterValue = table + .getColumn("document_type") + ?.getFilterValue() as string[]; + return filterValue ?? []; + }, [table.getColumn("document_type")?.getFilterValue()]); + + const handleStatusChange = (checked: boolean, value: string) => { + const filterValue = table + .getColumn("document_type") + ?.getFilterValue() as string[]; + const newFilterValue = filterValue ? [...filterValue] : []; + + if (checked) { + newFilterValue.push(value); + } else { + const index = newFilterValue.indexOf(value); + if (index > -1) { + newFilterValue.splice(index, 1); + } + } + + table + .getColumn("document_type") + ?.setFilterValue(newFilterValue.length ? 
newFilterValue : undefined); + }; + + return ( + Promise.resolve(false)), + refreshDocuments: refreshDocuments || (() => Promise.resolve()), + }} + > + + {/* Filters */} + +
+ {/* Filter by name or email */} - {/* Filters */} - + table.getColumn("title")?.setFilterValue(e.target.value) + } + placeholder="Filter by title..." + type="text" + aria-label="Filter by title" + /> + + + {Boolean(table.getColumn("title")?.getFilterValue()) && ( + { + table.getColumn("title")?.setFilterValue(""); + if (inputRef.current) { + inputRef.current.focus(); + } + }} + initial={{ opacity: 0, rotate: -90 }} + animate={{ opacity: 1, rotate: 0 }} + exit={{ opacity: 0, rotate: 90 }} + whileHover={{ scale: 1.1 }} + whileTap={{ scale: 0.9 }} > -
- {/* Filter by name or email */} - - table.getColumn("title")?.setFilterValue(e.target.value)} - placeholder="Filter by title..." - type="text" - aria-label="Filter by title" - /> - - - {Boolean(table.getColumn("title")?.getFilterValue()) && ( - { - table.getColumn("title")?.setFilterValue(""); - if (inputRef.current) { - inputRef.current.focus(); - } - }} - initial={{ opacity: 0, rotate: -90 }} - animate={{ opacity: 1, rotate: 0 }} - exit={{ opacity: 0, rotate: 90 }} - whileHover={{ scale: 1.1 }} - whileTap={{ scale: 0.9 }} - > - - )} - - {/* Filter by status */} - - - - - - - - -
-
Filters
-
- - {uniqueStatusValues.map((value, i) => ( - - handleStatusChange(checked, value)} - /> - - - ))} - -
-
-
-
-
- {/* Toggle columns visibility */} - - - - - - - - - Toggle columns - {table - .getAllColumns() - .filter((column) => column.getCanHide()) - .map((column) => { - return ( - column.toggleVisibility(!!value)} - onSelect={(event) => event.preventDefault()} - > - {column.id} - - ); - })} - - - +
+
- {/* Table */} - + {loading ? ( +
+
+
+

+ Loading documents... +

+
+
+ ) : error ? ( +
+
+ +

+ Error loading documents +

+ -
-
- ) : data.length === 0 ? ( -
-
- -

No documents found

-
-
- ) : ( - - - {table.getHeaderGroups().map((headerGroup) => ( - - {headerGroup.headers.map((header) => { - return ( - - {header.isPlaceholder ? null : header.column.getCanSort() ? ( -
{ - // Enhanced keyboard handling for sorting - if ( - header.column.getCanSort() && - (e.key === "Enter" || e.key === " ") - ) { - e.preventDefault(); - header.column.getToggleSortingHandler()?.(e); - } - }} - tabIndex={header.column.getCanSort() ? 0 : undefined} - > - {flexRender(header.column.columnDef.header, header.getContext())} - {{ - asc: ( -
- ) : ( - flexRender(header.column.columnDef.header, header.getContext()) - )} -
- ); - })} -
- ))} -
- - - {table.getRowModel().rows?.length ? ( - table.getRowModel().rows.map((row, index) => ( - - {row.getVisibleCells().map((cell) => ( - - {flexRender(cell.column.columnDef.cell, cell.getContext())} - - ))} - - )) - ) : ( - - - No documents found. - - - )} - - -
- )} -
- - {/* Pagination */} -
- {/* Results per page */} - - - - - {/* Page number information */} - -

- - {table.getState().pagination.pageIndex * table.getState().pagination.pageSize + 1}- - {Math.min( - Math.max( - table.getState().pagination.pageIndex * table.getState().pagination.pageSize + - table.getState().pagination.pageSize, - 0, - ), - table.getRowCount(), - )} - {" "} - of {table.getRowCount().toString()} -

-
+ {header.isPlaceholder ? null : header.column.getCanSort() ? ( +
{ + // Enhanced keyboard handling for sorting + if ( + header.column.getCanSort() && + (e.key === "Enter" || e.key === " ") + ) { + e.preventDefault(); + header.column.getToggleSortingHandler()?.(e); + } + }} + tabIndex={ + header.column.getCanSort() ? 0 : undefined + } + > + {flexRender( + header.column.columnDef.header, + header.getContext(), + )} + {{ + asc: ( +
+ ) : ( + flexRender( + header.column.columnDef.header, + header.getContext(), + ) + )} + + ); + })} + + ))} + + + + {table.getRowModel().rows?.length ? ( + table.getRowModel().rows.map((row, index) => ( + + {row.getVisibleCells().map((cell) => ( + + {flexRender( + cell.column.columnDef.cell, + cell.getContext(), + )} + + ))} + + )) + ) : ( + + + No documents found. + + + )} + + + + )} + - {/* Pagination buttons */} -
- - - {/* First page button */} - - - - - - {/* Previous page button */} - - - - - - {/* Next page button */} - - - - - - {/* Last page button */} - - - - - - - -
-
-
- - ); + {/* Pagination */} +
+ {/* Results per page */} + + + + + {/* Page number information */} + +

+ + {table.getState().pagination.pageIndex * + table.getState().pagination.pageSize + + 1} + - + {Math.min( + Math.max( + table.getState().pagination.pageIndex * + table.getState().pagination.pageSize + + table.getState().pagination.pageSize, + 0, + ), + table.getRowCount(), + )} + {" "} + of{" "} + + {table.getRowCount().toString()} + +

+
+ + {/* Pagination buttons */} +
+ + + {/* First page button */} + + + + + + {/* Previous page button */} + + + + + + {/* Next page button */} + + + + + + {/* Last page button */} + + + + + + + +
+
+ + + ); } function RowActions({ row }: { row: Row }) { - const [isOpen, setIsOpen] = useState(false); - const [isDeleting, setIsDeleting] = useState(false); - const { deleteDocument, refreshDocuments } = useContext(DocumentsContext)!; - const document = row.original; + const [isOpen, setIsOpen] = useState(false); + const [isDeleting, setIsDeleting] = useState(false); + const { deleteDocument, refreshDocuments } = useContext(DocumentsContext)!; + const document = row.original; - const handleDelete = async () => { - setIsDeleting(true); - try { - await deleteDocument(document.id); - toast.success("Document deleted successfully"); - await refreshDocuments(); - } catch (error) { - console.error("Error deleting document:", error); - toast.error("Failed to delete document"); - } finally { - setIsDeleting(false); - setIsOpen(false); - } - }; + const handleDelete = async () => { + setIsDeleting(true); + try { + await deleteDocument(document.id); + toast.success("Document deleted successfully"); + await refreshDocuments(); + } catch (error) { + console.error("Error deleting document:", error); + toast.error("Failed to delete document"); + } finally { + setIsDeleting(false); + setIsOpen(false); + } + }; - return ( -
- - - - - - e.preventDefault()}> - View Metadata - - } - /> - - - - { - e.preventDefault(); - setIsOpen(true); - }} - > - Delete - - - - - Are you sure? - - This action cannot be undone. This will permanently delete the document. - - - - Cancel - { - e.preventDefault(); - handleDelete(); - }} - disabled={isDeleting} - > - {isDeleting ? "Deleting..." : "Delete"} - - - - - - -
- ); + return ( +
+ + + + + + e.preventDefault()}> + View Metadata + + } + /> + + + + { + e.preventDefault(); + setIsOpen(true); + }} + > + Delete + + + + + Are you sure? + + This action cannot be undone. This will permanently delete the + document. + + + + Cancel + { + e.preventDefault(); + handleDelete(); + }} + disabled={isDeleting} + > + {isDeleting ? "Deleting..." : "Delete"} + + + + + + +
+ ); } export { DocumentsTable }; - diff --git a/surfsense_web/app/dashboard/[search_space_id]/researcher/[chat_id]/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/researcher/[chat_id]/page.tsx index 8a0bde74f..e92db282e 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/researcher/[chat_id]/page.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/researcher/[chat_id]/page.tsx @@ -1,77 +1,77 @@ "use client"; import React, { - useRef, - useEffect, - useState, - useMemo, - useCallback, + useRef, + useEffect, + useState, + useMemo, + useCallback, } from "react"; import { useChat } from "@ai-sdk/react"; import { useParams } from "next/navigation"; import { - Loader2, - X, - Search, - ExternalLink, - ChevronLeft, - ChevronRight, - Check, - ArrowDown, - CircleUser, - Database, - SendHorizontal, - FileText, - Grid3x3, - FolderOpen, - Upload, - ChevronDown, - Filter, - Brain, - Zap, + Loader2, + X, + Search, + ExternalLink, + ChevronLeft, + ChevronRight, + Check, + ArrowDown, + CircleUser, + Database, + SendHorizontal, + FileText, + Grid3x3, + FolderOpen, + Upload, + ChevronDown, + Filter, + Brain, + Zap, } from "lucide-react"; import { Card, CardContent, CardHeader, CardTitle } from "@/components/ui/card"; import { Button } from "@/components/ui/button"; import { Input } from "@/components/ui/input"; import { Tabs, TabsContent, TabsList, TabsTrigger } from "@/components/ui/tabs"; import { - Dialog, - DialogContent, - DialogDescription, - DialogHeader, - DialogTitle, - DialogTrigger, - DialogFooter, + Dialog, + DialogContent, + DialogDescription, + DialogHeader, + DialogTitle, + DialogTrigger, + DialogFooter, } from "@/components/ui/dialog"; import { - DropdownMenu, - DropdownMenuContent, - DropdownMenuItem, - DropdownMenuLabel, - DropdownMenuSeparator, - DropdownMenuTrigger, + DropdownMenu, + DropdownMenuContent, + DropdownMenuItem, + DropdownMenuLabel, + DropdownMenuSeparator, + DropdownMenuTrigger, } from "@/components/ui/dropdown-menu"; import { - 
Select, - SelectContent, - SelectItem, - SelectTrigger, - SelectValue, + Select, + SelectContent, + SelectItem, + SelectTrigger, + SelectValue, } from "@/components/ui/select"; import { Badge } from "@/components/ui/badge"; import { Skeleton } from "@/components/ui/skeleton"; import { - ConnectorButton as ConnectorButtonComponent, - getConnectorIcon, - getFilteredSources as getFilteredSourcesUtil, - getPaginatedDialogSources as getPaginatedDialogSourcesUtil, - useScrollToBottom, - updateScrollIndicators as updateScrollIndicatorsUtil, - useScrollIndicators, - scrollTabsLeft as scrollTabsLeftUtil, - scrollTabsRight as scrollTabsRightUtil, - Source, - ResearchMode, - ResearchModeControl, + ConnectorButton as ConnectorButtonComponent, + getConnectorIcon, + getFilteredSources as getFilteredSourcesUtil, + getPaginatedDialogSources as getPaginatedDialogSourcesUtil, + useScrollToBottom, + updateScrollIndicators as updateScrollIndicatorsUtil, + useScrollIndicators, + scrollTabsLeft as scrollTabsLeftUtil, + scrollTabsRight as scrollTabsRightUtil, + Source, + ResearchMode, + ResearchModeControl, } from "@/components/chat"; import { MarkdownViewer } from "@/components/markdown-viewer"; import { Logo } from "@/components/Logo"; @@ -80,446 +80,447 @@ import { useDocuments } from "@/hooks/use-documents"; import { useLLMConfigs, useLLMPreferences } from "@/hooks/use-llm-configs"; interface SourceItem { - id: number; - title: string; - description: string; - url: string; - connectorType?: string; + id: number; + title: string; + description: string; + url: string; + connectorType?: string; } interface ConnectorSource { - id: number; - name: string; - type: string; - sources: SourceItem[]; + id: number; + name: string; + type: string; + sources: SourceItem[]; } type DocumentType = - | "EXTENSION" - | "CRAWLED_URL" - | "SLACK_CONNECTOR" - | "NOTION_CONNECTOR" - | "FILE" - | "YOUTUBE_VIDEO" - | "GITHUB_CONNECTOR" - | "LINEAR_CONNECTOR" - | "DISCORD_CONNECTOR"; + | "EXTENSION" + | 
"CRAWLED_URL" + | "SLACK_CONNECTOR" + | "NOTION_CONNECTOR" + | "FILE" + | "YOUTUBE_VIDEO" + | "GITHUB_CONNECTOR" + | "LINEAR_CONNECTOR" + | "JIRA_CONNECTOR" + | "DISCORD_CONNECTOR"; /** * Skeleton loader for document items */ const DocumentSkeleton = () => ( -
- -
- - - -
- -
+
+ +
+ + + +
+ +
); /** * Enhanced document type filter dropdown */ const DocumentTypeFilter = ({ - value, - onChange, - counts, + value, + onChange, + counts, }: { - value: DocumentType | "ALL"; - onChange: (value: DocumentType | "ALL") => void; - counts: Record; + value: DocumentType | "ALL"; + onChange: (value: DocumentType | "ALL") => void; + counts: Record; }) => { - const getTypeLabel = (type: DocumentType | "ALL") => { - if (type === "ALL") return "All Types"; - return type - .replace(/_/g, " ") - .toLowerCase() - .replace(/\b\w/g, (l) => l.toUpperCase()); - }; + const getTypeLabel = (type: DocumentType | "ALL") => { + if (type === "ALL") return "All Types"; + return type + .replace(/_/g, " ") + .toLowerCase() + .replace(/\b\w/g, (l) => l.toUpperCase()); + }; - const getTypeIcon = (type: DocumentType | "ALL") => { - if (type === "ALL") return ; - return getConnectorIcon(type); - }; + const getTypeIcon = (type: DocumentType | "ALL") => { + if (type === "ALL") return ; + return getConnectorIcon(type); + }; - return ( - - - - - - Document Types - - {Object.entries(counts).map(([type, count]) => ( - onChange(type as DocumentType | "ALL")} - className="flex items-center justify-between" - > -
- {getTypeIcon(type as DocumentType | "ALL")} - {getTypeLabel(type as DocumentType | "ALL")} -
- - {count} - -
- ))} -
-
- ); + return ( + + + + + + Document Types + + {Object.entries(counts).map(([type, count]) => ( + onChange(type as DocumentType | "ALL")} + className="flex items-center justify-between" + > +
+ {getTypeIcon(type as DocumentType | "ALL")} + {getTypeLabel(type as DocumentType | "ALL")} +
+ + {count} + +
+ ))} +
+
+ ); }; /** * Button that displays selected connectors and opens connector selection dialog */ const ConnectorButton = ({ - selectedConnectors, - onClick, + selectedConnectors, + onClick, }: { - selectedConnectors: string[]; - onClick: () => void; + selectedConnectors: string[]; + onClick: () => void; }) => { - const { connectorSourceItems } = useSearchSourceConnectors(); + const { connectorSourceItems } = useSearchSourceConnectors(); - return ( - - ); + return ( + + ); }; /** * Button that displays selected documents count and opens document selection dialog */ const DocumentSelectorButton = ({ - selectedDocuments, - onClick, - documentsCount, + selectedDocuments, + onClick, + documentsCount, }: { - selectedDocuments: number[]; - onClick: () => void; - documentsCount: number; + selectedDocuments: number[]; + onClick: () => void; + documentsCount: number; }) => { - return ( -
- - {selectedDocuments.length > 0 && ( - - {selectedDocuments.length > 99 ? "99+" : selectedDocuments.length} - - )} - {selectedDocuments.length === 0 && ( - - 0 - - )} -
- ); + return ( +
+ + {selectedDocuments.length > 0 && ( + + {selectedDocuments.length > 99 ? "99+" : selectedDocuments.length} + + )} + {selectedDocuments.length === 0 && ( + + 0 + + )} +
+ ); }; // Create a wrapper component for the sources dialog content const SourcesDialogContent = ({ - connector, - sourceFilter, - expandedSources, - sourcesPage, - setSourcesPage, - setSourceFilter, - setExpandedSources, - isLoadingMore, + connector, + sourceFilter, + expandedSources, + sourcesPage, + setSourcesPage, + setSourceFilter, + setExpandedSources, + isLoadingMore, }: { - connector: any; - sourceFilter: string; - expandedSources: boolean; - sourcesPage: number; - setSourcesPage: React.Dispatch>; - setSourceFilter: React.Dispatch>; - setExpandedSources: React.Dispatch>; - isLoadingMore: boolean; + connector: any; + sourceFilter: string; + expandedSources: boolean; + sourcesPage: number; + setSourcesPage: React.Dispatch>; + setSourceFilter: React.Dispatch>; + setExpandedSources: React.Dispatch>; + isLoadingMore: boolean; }) => { - // Safely access sources with fallbacks - const sources = connector?.sources || []; + // Safely access sources with fallbacks + const sources = connector?.sources || []; - // Safe versions of utility functions - const getFilteredSourcesSafe = () => { - if (!sources.length) return []; - return getFilteredSourcesUtil(connector, sourceFilter); - }; + // Safe versions of utility functions + const getFilteredSourcesSafe = () => { + if (!sources.length) return []; + return getFilteredSourcesUtil(connector, sourceFilter); + }; - const getPaginatedSourcesSafe = () => { - if (!sources.length) return []; - return getPaginatedDialogSourcesUtil( - connector, - sourceFilter, - expandedSources, - sourcesPage, - 5, // SOURCES_PER_PAGE - ); - }; + const getPaginatedSourcesSafe = () => { + if (!sources.length) return []; + return getPaginatedDialogSourcesUtil( + connector, + sourceFilter, + expandedSources, + sourcesPage, + 5, // SOURCES_PER_PAGE + ); + }; - const filteredSources = getFilteredSourcesSafe() || []; - const paginatedSources = getPaginatedSourcesSafe() || []; + const filteredSources = getFilteredSourcesSafe() || []; + const 
paginatedSources = getPaginatedSourcesSafe() || []; - // Description text - const descriptionText = sourceFilter - ? `Found ${filteredSources.length} sources matching "${sourceFilter}"` - : `Viewing ${paginatedSources.length} of ${sources.length} sources`; + // Description text + const descriptionText = sourceFilter + ? `Found ${filteredSources.length} sources matching "${sourceFilter}"` + : `Viewing ${paginatedSources.length} of ${sources.length} sources`; - if (paginatedSources.length === 0) { - return ( -
- -

No sources found matching "{sourceFilter}"

- -
- ); - } + if (paginatedSources.length === 0) { + return ( +
+ +

No sources found matching "{sourceFilter}"

+ +
+ ); + } - return ( - <> - - - {getConnectorIcon(connector.type)} - {connector.name} Sources - - - {descriptionText} - - + return ( + <> + + + {getConnectorIcon(connector.type)} + {connector.name} Sources + + + {descriptionText} + + -
- - { - setSourceFilter(e.target.value); - setSourcesPage(1); - setExpandedSources(false); - }} - /> - {sourceFilter && ( - - )} -
+
+ + { + setSourceFilter(e.target.value); + setSourcesPage(1); + setExpandedSources(false); + }} + /> + {sourceFilter && ( + + )} +
-
- {paginatedSources.map((source: any, index: number) => ( - -
-
- {getConnectorIcon(connector.type)} -
-
-

{source.title}

-

- {source.description} -

-
- -
-
- ))} +
+ {paginatedSources.map((source: any, index: number) => ( + +
+
+ {getConnectorIcon(connector.type)} +
+
+

{source.title}

+

+ {source.description} +

+
+ +
+
+ ))} - {!expandedSources && - paginatedSources.length < filteredSources.length && ( - - )} + {!expandedSources && + paginatedSources.length < filteredSources.length && ( + + )} - {expandedSources && filteredSources.length > 10 && ( -
- Showing all {filteredSources.length} sources -
- )} -
- - ); + {expandedSources && filteredSources.length > 10 && ( +
+ Showing all {filteredSources.length} sources +
+ )} +
+ + ); }; const ChatPage = () => { - const [token, setToken] = React.useState(null); - const [dialogOpenId, setDialogOpenId] = useState(null); - const [sourcesPage, setSourcesPage] = useState(1); - const [expandedSources, setExpandedSources] = useState(false); - const [canScrollLeft, setCanScrollLeft] = useState(false); - const [canScrollRight, setCanScrollRight] = useState(true); - const [sourceFilter, setSourceFilter] = useState(""); - const tabsListRef = useRef(null); - const [terminalExpanded, setTerminalExpanded] = useState(false); - const [selectedConnectors, setSelectedConnectors] = useState([]); - const [searchMode, setSearchMode] = useState<"DOCUMENTS" | "CHUNKS">( - "DOCUMENTS", - ); - const [researchMode, setResearchMode] = useState("QNA"); - const [currentTime, setCurrentTime] = useState(""); - const [currentDate, setCurrentDate] = useState(""); - const terminalMessagesRef = useRef(null); - const { connectorSourceItems, isLoading: isLoadingConnectors } = - useSearchSourceConnectors(); - const { llmConfigs } = useLLMConfigs(); - const { preferences, updatePreferences } = useLLMPreferences(); + const [token, setToken] = React.useState(null); + const [dialogOpenId, setDialogOpenId] = useState(null); + const [sourcesPage, setSourcesPage] = useState(1); + const [expandedSources, setExpandedSources] = useState(false); + const [canScrollLeft, setCanScrollLeft] = useState(false); + const [canScrollRight, setCanScrollRight] = useState(true); + const [sourceFilter, setSourceFilter] = useState(""); + const tabsListRef = useRef(null); + const [terminalExpanded, setTerminalExpanded] = useState(false); + const [selectedConnectors, setSelectedConnectors] = useState([]); + const [searchMode, setSearchMode] = useState<"DOCUMENTS" | "CHUNKS">( + "DOCUMENTS", + ); + const [researchMode, setResearchMode] = useState("QNA"); + const [currentTime, setCurrentTime] = useState(""); + const [currentDate, setCurrentDate] = useState(""); + const terminalMessagesRef = useRef(null); 
+ const { connectorSourceItems, isLoading: isLoadingConnectors } = + useSearchSourceConnectors(); + const { llmConfigs } = useLLMConfigs(); + const { preferences, updatePreferences } = useLLMPreferences(); - const INITIAL_SOURCES_DISPLAY = 3; + const INITIAL_SOURCES_DISPLAY = 3; - const { search_space_id, chat_id } = useParams(); + const { search_space_id, chat_id } = useParams(); - // Document selection state - const [selectedDocuments, setSelectedDocuments] = useState([]); - const [documentFilter, setDocumentFilter] = useState(""); - const [debouncedDocumentFilter, setDebouncedDocumentFilter] = useState(""); - const [documentTypeFilter, setDocumentTypeFilter] = useState< - DocumentType | "ALL" - >("ALL"); - const [documentsPage, setDocumentsPage] = useState(1); - const [documentsPerPage] = useState(10); - const { - documents, - loading: isLoadingDocuments, - error: documentsError, - } = useDocuments(Number(search_space_id)); + // Document selection state + const [selectedDocuments, setSelectedDocuments] = useState([]); + const [documentFilter, setDocumentFilter] = useState(""); + const [debouncedDocumentFilter, setDebouncedDocumentFilter] = useState(""); + const [documentTypeFilter, setDocumentTypeFilter] = useState< + DocumentType | "ALL" + >("ALL"); + const [documentsPage, setDocumentsPage] = useState(1); + const [documentsPerPage] = useState(10); + const { + documents, + loading: isLoadingDocuments, + error: documentsError, + } = useDocuments(Number(search_space_id)); - // Debounced search effect (proper implementation) - useEffect(() => { - const handler = setTimeout(() => { - setDebouncedDocumentFilter(documentFilter); - setDocumentsPage(1); // Reset page when search changes - }, 300); + // Debounced search effect (proper implementation) + useEffect(() => { + const handler = setTimeout(() => { + setDebouncedDocumentFilter(documentFilter); + setDocumentsPage(1); // Reset page when search changes + }, 300); - return () => { - clearTimeout(handler); - }; - }, 
[documentFilter]); + return () => { + clearTimeout(handler); + }; + }, [documentFilter]); - // Memoized filtered and paginated documents - const filteredDocuments = useMemo(() => { - if (!documents) return []; + // Memoized filtered and paginated documents + const filteredDocuments = useMemo(() => { + if (!documents) return []; - return documents.filter((doc) => { - const matchesSearch = - doc.title - .toLowerCase() - .includes(debouncedDocumentFilter.toLowerCase()) || - doc.content - .toLowerCase() - .includes(debouncedDocumentFilter.toLowerCase()); - const matchesType = - documentTypeFilter === "ALL" || - doc.document_type === documentTypeFilter; - return matchesSearch && matchesType; - }); - }, [documents, debouncedDocumentFilter, documentTypeFilter]); + return documents.filter((doc) => { + const matchesSearch = + doc.title + .toLowerCase() + .includes(debouncedDocumentFilter.toLowerCase()) || + doc.content + .toLowerCase() + .includes(debouncedDocumentFilter.toLowerCase()); + const matchesType = + documentTypeFilter === "ALL" || + doc.document_type === documentTypeFilter; + return matchesSearch && matchesType; + }); + }, [documents, debouncedDocumentFilter, documentTypeFilter]); - const paginatedDocuments = useMemo(() => { - const startIndex = (documentsPage - 1) * documentsPerPage; - return filteredDocuments.slice(startIndex, startIndex + documentsPerPage); - }, [filteredDocuments, documentsPage, documentsPerPage]); + const paginatedDocuments = useMemo(() => { + const startIndex = (documentsPage - 1) * documentsPerPage; + return filteredDocuments.slice(startIndex, startIndex + documentsPerPage); + }, [filteredDocuments, documentsPage, documentsPerPage]); - const totalPages = Math.ceil(filteredDocuments.length / documentsPerPage); + const totalPages = Math.ceil(filteredDocuments.length / documentsPerPage); - // Document type counts for filter dropdown - const documentTypeCounts = useMemo(() => { - if (!documents) return {}; + // Document type counts for filter 
dropdown + const documentTypeCounts = useMemo(() => { + if (!documents) return {}; - const counts: Record = { ALL: documents.length }; - documents.forEach((doc) => { - counts[doc.document_type] = (counts[doc.document_type] || 0) + 1; - }); - return counts; - }, [documents]); + const counts: Record = { ALL: documents.length }; + documents.forEach((doc) => { + counts[doc.document_type] = (counts[doc.document_type] || 0) + 1; + }); + return counts; + }, [documents]); - // Callback to handle document selection - const handleDocumentToggle = useCallback((documentId: number) => { - setSelectedDocuments((prev) => - prev.includes(documentId) - ? prev.filter((id) => id !== documentId) - : [...prev, documentId], - ); - }, []); + // Callback to handle document selection + const handleDocumentToggle = useCallback((documentId: number) => { + setSelectedDocuments((prev) => + prev.includes(documentId) + ? prev.filter((id) => id !== documentId) + : [...prev, documentId], + ); + }, []); - // Function to scroll terminal to bottom - const scrollTerminalToBottom = () => { - if (terminalMessagesRef.current) { - terminalMessagesRef.current.scrollTop = - terminalMessagesRef.current.scrollHeight; - } - }; + // Function to scroll terminal to bottom + const scrollTerminalToBottom = () => { + if (terminalMessagesRef.current) { + terminalMessagesRef.current.scrollTop = + terminalMessagesRef.current.scrollHeight; + } + }; - // Get token from localStorage on client side only - React.useEffect(() => { - setToken(localStorage.getItem("surfsense_bearer_token")); - }, []); + // Get token from localStorage on client side only + React.useEffect(() => { + setToken(localStorage.getItem("surfsense_bearer_token")); + }, []); - // Set the current time only on the client side after initial render - useEffect(() => { - setCurrentDate(new Date().toISOString().split("T")[0]); - setCurrentTime(new Date().toTimeString().split(" ")[0]); - }, []); + // Set the current time only on the client side after initial 
render + useEffect(() => { + setCurrentDate(new Date().toISOString().split("T")[0]); + setCurrentTime(new Date().toTimeString().split(" ")[0]); + }, []); - // Add this CSS to remove input shadow and improve the UI - useEffect(() => { - if (typeof document !== "undefined") { - const style = document.createElement("style"); - style.innerHTML = ` + // Add this CSS to remove input shadow and improve the UI + useEffect(() => { + if (typeof document !== "undefined") { + const style = document.createElement("style"); + style.innerHTML = ` .no-shadow-input { box-shadow: none !important; } @@ -617,825 +618,860 @@ const ChatPage = () => { background: hsl(var(--muted-foreground) / 0.5); } `; - document.head.appendChild(style); + document.head.appendChild(style); - return () => { - document.head.removeChild(style); - }; - } - }, []); + return () => { + document.head.removeChild(style); + }; + } + }, []); - const { - messages, - input, - handleInputChange, - handleSubmit: handleChatSubmit, - status, - setMessages, - } = useChat({ - api: `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/chat`, - streamProtocol: "data", - headers: { - ...(token && { Authorization: `Bearer ${token}` }), - }, - body: { - data: { - search_space_id: search_space_id, - selected_connectors: selectedConnectors, - research_mode: researchMode, - search_mode: searchMode, - document_ids_to_add_in_context: selectedDocuments, - }, - }, - onError: (error) => { - console.error("Chat error:", error); - // You can add additional error handling here if needed - }, - }); + const { + messages, + input, + handleInputChange, + handleSubmit: handleChatSubmit, + status, + setMessages, + } = useChat({ + api: `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/chat`, + streamProtocol: "data", + headers: { + ...(token && { Authorization: `Bearer ${token}` }), + }, + body: { + data: { + search_space_id: search_space_id, + selected_connectors: selectedConnectors, + research_mode: researchMode, + search_mode: 
searchMode, + document_ids_to_add_in_context: selectedDocuments, + }, + }, + onError: (error) => { + console.error("Chat error:", error); + // You can add additional error handling here if needed + }, + }); - // Fetch chat details when component mounts - useEffect(() => { - const fetchChatDetails = async () => { - try { - if (!token) return; // Wait for token to be set + // Fetch chat details when component mounts + useEffect(() => { + const fetchChatDetails = async () => { + try { + if (!token) return; // Wait for token to be set - // console.log('Fetching chat details for chat ID:', chat_id); + // console.log('Fetching chat details for chat ID:', chat_id); - const response = await fetch( - `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/chats/${Number(chat_id)}`, - { - method: "GET", - headers: { - "Content-Type": "application/json", - Authorization: `Bearer ${token}`, - }, - }, - ); + const response = await fetch( + `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/chats/${Number(chat_id)}`, + { + method: "GET", + headers: { + "Content-Type": "application/json", + Authorization: `Bearer ${token}`, + }, + }, + ); - if (!response.ok) { - throw new Error( - `Failed to fetch chat details: ${response.statusText}`, - ); - } + if (!response.ok) { + throw new Error( + `Failed to fetch chat details: ${response.statusText}`, + ); + } - const chatData = await response.json(); - // console.log('Chat details fetched:', chatData); + const chatData = await response.json(); + // console.log('Chat details fetched:', chatData); - // Set research mode from chat data - if (chatData.type) { - setResearchMode(chatData.type as ResearchMode); - } + // Set research mode from chat data + if (chatData.type) { + setResearchMode(chatData.type as ResearchMode); + } - // Set connectors from chat data - if ( - chatData.initial_connectors && - Array.isArray(chatData.initial_connectors) - ) { - setSelectedConnectors(chatData.initial_connectors); - } + // Set connectors from chat data 
+ if ( + chatData.initial_connectors && + Array.isArray(chatData.initial_connectors) + ) { + setSelectedConnectors(chatData.initial_connectors); + } - // Set messages from chat data - if (chatData.messages && Array.isArray(chatData.messages)) { - setMessages(chatData.messages); - } - } catch (err) { - console.error("Error fetching chat details:", err); - } - }; + // Set messages from chat data + if (chatData.messages && Array.isArray(chatData.messages)) { + setMessages(chatData.messages); + } + } catch (err) { + console.error("Error fetching chat details:", err); + } + }; - if (token) { - fetchChatDetails(); - } - }, [token, chat_id, setMessages]); + if (token) { + fetchChatDetails(); + } + }, [token, chat_id, setMessages]); - // Update chat when a conversation exchange is complete - useEffect(() => { - const updateChat = async () => { - try { - // Only update when: - // 1. Status is ready (not loading) - // 2. We have messages - // 3. Last message is from assistant (completed response) - if ( - status === "ready" && - messages.length > 0 && - messages[messages.length - 1]?.role === "assistant" - ) { - const token = localStorage.getItem("surfsense_bearer_token"); - if (!token) return; + // Update chat when a conversation exchange is complete + useEffect(() => { + const updateChat = async () => { + try { + // Only update when: + // 1. Status is ready (not loading) + // 2. We have messages + // 3. 
Last message is from assistant (completed response) + if ( + status === "ready" && + messages.length > 0 && + messages[messages.length - 1]?.role === "assistant" + ) { + const token = localStorage.getItem("surfsense_bearer_token"); + if (!token) return; - // Find the first user message to use as title - const userMessages = messages.filter((msg) => msg.role === "user"); - if (userMessages.length === 0) return; + // Find the first user message to use as title + const userMessages = messages.filter((msg) => msg.role === "user"); + if (userMessages.length === 0) return; - // Use the first user message as the title - const title = userMessages[0].content; + // Use the first user message as the title + const title = userMessages[0].content; - // console.log('Updating chat with title:', title); + // console.log('Updating chat with title:', title); - // Update the chat - const response = await fetch( - `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/chats/${Number(chat_id)}`, - { - method: "PUT", - headers: { - "Content-Type": "application/json", - Authorization: `Bearer ${token}`, - }, - body: JSON.stringify({ - type: researchMode, - title: title, - initial_connectors: selectedConnectors, - messages: messages, - search_space_id: Number(search_space_id), - }), - }, - ); + // Update the chat + const response = await fetch( + `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/chats/${Number(chat_id)}`, + { + method: "PUT", + headers: { + "Content-Type": "application/json", + Authorization: `Bearer ${token}`, + }, + body: JSON.stringify({ + type: researchMode, + title: title, + initial_connectors: selectedConnectors, + messages: messages, + search_space_id: Number(search_space_id), + }), + }, + ); - if (!response.ok) { - throw new Error(`Failed to update chat: ${response.statusText}`); - } + if (!response.ok) { + throw new Error(`Failed to update chat: ${response.statusText}`); + } - // console.log('Chat updated successfully'); - } - } catch (err) { - 
console.error("Error updating chat:", err); - } - }; + // console.log('Chat updated successfully'); + } + } catch (err) { + console.error("Error updating chat:", err); + } + }; - updateChat(); - }, [ - messages, - status, - chat_id, - researchMode, - selectedConnectors, - search_space_id, - ]); + updateChat(); + }, [ + messages, + status, + chat_id, + researchMode, + selectedConnectors, + search_space_id, + ]); - // Check and scroll terminal when terminal info is available - useEffect(() => { - // Modified to trigger during streaming as well (removed status check) - if (messages.length === 0) return; + // Check and scroll terminal when terminal info is available + useEffect(() => { + // Modified to trigger during streaming as well (removed status check) + if (messages.length === 0) return; - // Find the latest assistant message - const assistantMessages = messages.filter( - (msg) => msg.role === "assistant", - ); - if (assistantMessages.length === 0) return; + // Find the latest assistant message + const assistantMessages = messages.filter( + (msg) => msg.role === "assistant", + ); + if (assistantMessages.length === 0) return; - const latestAssistantMessage = - assistantMessages[assistantMessages.length - 1]; - if (!latestAssistantMessage?.annotations) return; + const latestAssistantMessage = + assistantMessages[assistantMessages.length - 1]; + if (!latestAssistantMessage?.annotations) return; - // Check for terminal info annotations - const annotations = latestAssistantMessage.annotations as any[]; - const terminalInfoAnnotations = annotations.filter( - (a) => a.type === "TERMINAL_INFO", - ); + // Check for terminal info annotations + const annotations = latestAssistantMessage.annotations as any[]; + const terminalInfoAnnotations = annotations.filter( + (a) => a.type === "TERMINAL_INFO", + ); - if (terminalInfoAnnotations.length > 0) { - // Always scroll to bottom when terminal info is updated, even during streaming - scrollTerminalToBottom(); - } - }, 
[messages]); // Removed status from dependencies to ensure it triggers during streaming + if (terminalInfoAnnotations.length > 0) { + // Always scroll to bottom when terminal info is updated, even during streaming + scrollTerminalToBottom(); + } + }, [messages]); // Removed status from dependencies to ensure it triggers during streaming - // Pure function to get connector sources for a specific message - const getMessageConnectorSources = (message: any): any[] => { - if (!message || message.role !== "assistant" || !message.annotations) - return []; + // Pure function to get connector sources for a specific message + const getMessageConnectorSources = (message: any): any[] => { + if (!message || message.role !== "assistant" || !message.annotations) + return []; - // Find all SOURCES annotations - const annotations = message.annotations as any[]; - const sourcesAnnotations = annotations.filter((a) => a.type === "SOURCES"); + // Find all SOURCES annotations + const annotations = message.annotations as any[]; + const sourcesAnnotations = annotations.filter((a) => a.type === "SOURCES"); - // Get the latest SOURCES annotation - if (sourcesAnnotations.length === 0) return []; - const latestSourcesAnnotation = - sourcesAnnotations[sourcesAnnotations.length - 1]; + // Get the latest SOURCES annotation + if (sourcesAnnotations.length === 0) return []; + const latestSourcesAnnotation = + sourcesAnnotations[sourcesAnnotations.length - 1]; - if (!latestSourcesAnnotation.content) return []; + if (!latestSourcesAnnotation.content) return []; - return latestSourcesAnnotation.content; - }; + return latestSourcesAnnotation.content; + }; - // Custom handleSubmit function to include selected connectors and answer type - const handleSubmit = (e: React.FormEvent) => { - e.preventDefault(); + // Custom handleSubmit function to include selected connectors and answer type + const handleSubmit = (e: React.FormEvent) => { + e.preventDefault(); - if (!input.trim() || status !== "ready") 
return; + if (!input.trim() || status !== "ready") return; - // Validation: require at least one connector OR at least one document - // Note: Fast LLM selection updates user preferences automatically - // if (selectedConnectors.length === 0 && selectedDocuments.length === 0) { - // alert("Please select at least one connector or document"); - // return; - // } + // Validation: require at least one connector OR at least one document + // Note: Fast LLM selection updates user preferences automatically + // if (selectedConnectors.length === 0 && selectedDocuments.length === 0) { + // alert("Please select at least one connector or document"); + // return; + // } - // Call the original handleSubmit from useChat - handleChatSubmit(e); - }; + // Call the original handleSubmit from useChat + handleChatSubmit(e); + }; - // Reference to the messages container for auto-scrolling - const messagesEndRef = useRef(null); + // Reference to the messages container for auto-scrolling + const messagesEndRef = useRef(null); - // Function to scroll to bottom - const scrollToBottom = () => { - messagesEndRef.current?.scrollIntoView({ behavior: "smooth" }); - }; + // Function to scroll to bottom + const scrollToBottom = () => { + messagesEndRef.current?.scrollIntoView({ behavior: "smooth" }); + }; - // Scroll to bottom when messages change - useEffect(() => { - scrollToBottom(); - }, [messages]); + // Scroll to bottom when messages change + useEffect(() => { + scrollToBottom(); + }, [messages]); - // Reset sources page when new messages arrive - useEffect(() => { - // Reset pagination when we get new messages - setSourcesPage(1); - setExpandedSources(false); - }, [messages]); + // Reset sources page when new messages arrive + useEffect(() => { + // Reset pagination when we get new messages + setSourcesPage(1); + setExpandedSources(false); + }, [messages]); - // Scroll terminal to bottom when expanded - useEffect(() => { - if (terminalExpanded) { - setTimeout(scrollTerminalToBottom, 300); 
// Wait for transition to complete - } - }, [terminalExpanded]); + // Scroll terminal to bottom when expanded + useEffect(() => { + if (terminalExpanded) { + setTimeout(scrollTerminalToBottom, 300); // Wait for transition to complete + } + }, [terminalExpanded]); - // Function to check scroll position and update indicators - const updateScrollIndicators = () => { - updateScrollIndicatorsUtil( - tabsListRef as React.RefObject, - setCanScrollLeft, - setCanScrollRight, - ); - }; + // Function to check scroll position and update indicators + const updateScrollIndicators = () => { + updateScrollIndicatorsUtil( + tabsListRef as React.RefObject, + setCanScrollLeft, + setCanScrollRight, + ); + }; - // Initialize scroll indicators - const updateIndicators = useScrollIndicators( - tabsListRef as React.RefObject, - setCanScrollLeft, - setCanScrollRight, - ); + // Initialize scroll indicators + const updateIndicators = useScrollIndicators( + tabsListRef as React.RefObject, + setCanScrollLeft, + setCanScrollRight, + ); - // Function to scroll tabs list left - const scrollTabsLeft = () => { - scrollTabsLeftUtil( - tabsListRef as React.RefObject, - updateIndicators, - ); - }; + // Function to scroll tabs list left + const scrollTabsLeft = () => { + scrollTabsLeftUtil( + tabsListRef as React.RefObject, + updateIndicators, + ); + }; - // Function to scroll tabs list right - const scrollTabsRight = () => { - scrollTabsRightUtil( - tabsListRef as React.RefObject, - updateIndicators, - ); - }; + // Function to scroll tabs list right + const scrollTabsRight = () => { + scrollTabsRightUtil( + tabsListRef as React.RefObject, + updateIndicators, + ); + }; - // Use the scroll to bottom hook - useScrollToBottom(messagesEndRef as React.RefObject, [ - messages, - ]); + // Use the scroll to bottom hook + useScrollToBottom(messagesEndRef as React.RefObject, [ + messages, + ]); - // Function to get a citation source by ID - const getCitationSource = React.useCallback( - (citationId: number, 
messageIndex?: number): Source | null => { - if (!messages || messages.length === 0) return null; + // Function to get a citation source by ID + const getCitationSource = React.useCallback( + (citationId: number, messageIndex?: number): Source | null => { + if (!messages || messages.length === 0) return null; - // If no specific message index is provided, use the latest assistant message - if (messageIndex === undefined) { - // Find the latest assistant message - const assistantMessages = messages.filter( - (msg) => msg.role === "assistant", - ); - if (assistantMessages.length === 0) return null; + // If no specific message index is provided, use the latest assistant message + if (messageIndex === undefined) { + // Find the latest assistant message + const assistantMessages = messages.filter( + (msg) => msg.role === "assistant", + ); + if (assistantMessages.length === 0) return null; - const latestAssistantMessage = - assistantMessages[assistantMessages.length - 1]; + const latestAssistantMessage = + assistantMessages[assistantMessages.length - 1]; - // Use our helper function to get sources - const sources = getMessageConnectorSources(latestAssistantMessage); - if (sources.length === 0) return null; + // Use our helper function to get sources + const sources = getMessageConnectorSources(latestAssistantMessage); + if (sources.length === 0) return null; - // Flatten all sources from all connectors - const allSources: Source[] = []; - sources.forEach((connector: ConnectorSource) => { - if (connector.sources && Array.isArray(connector.sources)) { - connector.sources.forEach((source: SourceItem) => { - allSources.push({ - id: source.id, - title: source.title, - description: source.description, - url: source.url, - connectorType: connector.type, - }); - }); - } - }); + // Flatten all sources from all connectors + const allSources: Source[] = []; + sources.forEach((connector: ConnectorSource) => { + if (connector.sources && Array.isArray(connector.sources)) { + 
connector.sources.forEach((source: SourceItem) => { + allSources.push({ + id: source.id, + title: source.title, + description: source.description, + url: source.url, + connectorType: connector.type, + }); + }); + } + }); - // Find the source with the matching ID - const foundSource = allSources.find( - (source) => source.id === citationId, - ); + // Find the source with the matching ID + const foundSource = allSources.find( + (source) => source.id === citationId, + ); - return foundSource || null; - } else { - // Use the specific message by index - const message = messages[messageIndex]; + return foundSource || null; + } else { + // Use the specific message by index + const message = messages[messageIndex]; - // Use our helper function to get sources - const sources = getMessageConnectorSources(message); - if (sources.length === 0) return null; + // Use our helper function to get sources + const sources = getMessageConnectorSources(message); + if (sources.length === 0) return null; - // Flatten all sources from all connectors - const allSources: Source[] = []; - sources.forEach((connector: ConnectorSource) => { - if (connector.sources && Array.isArray(connector.sources)) { - connector.sources.forEach((source: SourceItem) => { - allSources.push({ - id: source.id, - title: source.title, - description: source.description, - url: source.url, - connectorType: connector.type, - }); - }); - } - }); + // Flatten all sources from all connectors + const allSources: Source[] = []; + sources.forEach((connector: ConnectorSource) => { + if (connector.sources && Array.isArray(connector.sources)) { + connector.sources.forEach((source: SourceItem) => { + allSources.push({ + id: source.id, + title: source.title, + description: source.description, + url: source.url, + connectorType: connector.type, + }); + }); + } + }); - // Find the source with the matching ID - const foundSource = allSources.find( - (source) => source.id === citationId, - ); + // Find the source with the matching 
ID + const foundSource = allSources.find( + (source) => source.id === citationId, + ); - return foundSource || null; - } - }, - [messages], - ); + return foundSource || null; + } + }, + [messages], + ); - // Pure function for rendering terminal content - no hooks allowed here - const renderTerminalContent = (message: any) => { - if (!message.annotations) return null; + // Pure function for rendering terminal content - no hooks allowed here + const renderTerminalContent = (message: any) => { + if (!message.annotations) return null; - // Get all TERMINAL_INFO annotations content - const terminalInfoAnnotations = (message.annotations as any[]).map(item => { - if(item.type === "TERMINAL_INFO") { - return item.content.map((a: any) => a.text) - - } - }).flat().filter(Boolean) + // Get all TERMINAL_INFO annotations content + const terminalInfoAnnotations = (message.annotations as any[]) + .map((item) => { + if (item.type === "TERMINAL_INFO") { + return item.content.map((a: any) => a.text); + } + }) + .flat() + .filter(Boolean); - // Render the content of the latest TERMINAL_INFO annotation - return terminalInfoAnnotations.map((item: any, idx: number) => ( -
- - [{String(idx).padStart(2, "0")}: - {String(Math.floor(idx * 2)).padStart(2, "0")}] - - {">"} - ( +
+ + [{String(idx).padStart(2, "0")}: + {String(Math.floor(idx * 2)).padStart(2, "0")}] + + {">"} + - {item} - -
- )); - }; + > + {item} +
+
+ )); + }; - return ( - <> -
- {messages.length === 0 && ( -

- -
- Surf{""} -
-
- Sense -
-
-
-

- )} - {messages?.map((message, index) => { - if (message.role === "user") { - return ( -
- -
- - - getCitationSource(id, index)} - className="text-sm" - /> - - -
-
- ); - } + return ( + <> +
+ {messages.length === 0 && ( +

+ +
+ Surf{""} +
+
+ Sense +
+
+
+

+ )} + {messages?.map((message, index) => { + if (message.role === "user") { + return ( +
+ +
+ + + getCitationSource(id, index)} + className="text-sm" + /> + + +
+
+ ); + } - if (message.role === "assistant") { - return ( -
- - - - Answer - - - - {/* Status Messages Section */} - -
-
-
-
setTerminalExpanded(false)} - >
-
-
setTerminalExpanded(true)} - >
-
- - surfsense-research-terminal - -
-
+ if (message.role === "assistant") { + return ( +
+ + + + Answer + + + + {/* Status Messages Section */} + +
+
+
+
setTerminalExpanded(false)} + >
+
+
setTerminalExpanded(true)} + >
+
+ + surfsense-research-terminal + +
+
-
-
- Last login: {currentDate} {currentTime} -
-
- - researcher@surfsense - - : - ~/research - $ - surfsense-researcher -
+
+
+ Last login: {currentDate} {currentTime} +
+
+ + researcher@surfsense + + : + ~/research + $ + surfsense-researcher +
- {renderTerminalContent(message)} + {renderTerminalContent(message)} -
- - [00:13] - - - researcher@surfsense - - : - ~/research - $ -
-
+
+ + [00:13] + + + researcher@surfsense + + : + ~/research + $ +
+
- {/* Terminal scroll button */} -
- -
-
- + {/* Terminal scroll button */} +
+ +
+
+
- {/* Sources Section with Connector Tabs */} -
-
- - Sources -
+ {/* Sources Section with Connector Tabs */} +
+
+ + Sources +
- {(() => { - // Get sources for this specific message - const messageConnectorSources = - getMessageConnectorSources(message); + {(() => { + // Get sources for this specific message + const messageConnectorSources = + getMessageConnectorSources(message); - if (messageConnectorSources.length === 0) { - return ( -
- -
- ); - } + if (messageConnectorSources.length === 0) { + return ( +
+ +
+ ); + } - // Use these message-specific sources for the Tabs component - return ( - 0 - ? messageConnectorSources[0].type - : undefined - } - className="w-full" - > -
-
- + // Use these message-specific sources for the Tabs component + return ( + 0 + ? messageConnectorSources[0].type + : undefined + } + className="w-full" + > +
+
+ -
-
- - {messageConnectorSources.map( - (connector) => ( - - {getConnectorIcon(connector.type)} - - {connector.name.split(" ")[0]} - - - {connector.sources?.length || 0} - - - ), - )} - -
-
+
+
+ + {messageConnectorSources.map( + (connector) => ( + + {getConnectorIcon(connector.type)} + + {connector.name.split(" ")[0]} + + + {connector.sources?.length || 0} + + + ), + )} + +
+
- -
-
+ +
+
- {messageConnectorSources.map((connector) => ( - -
- {connector.sources - ?.slice(0, INITIAL_SOURCES_DISPLAY) - ?.map((source: any, index: number) => ( - -
-
- {getConnectorIcon(connector.type)} -
-
-

- {source.title} -

-

- {source.description} -

-
- -
-
- ))} + {messageConnectorSources.map((connector) => ( + +
+ {connector.sources + ?.slice(0, INITIAL_SOURCES_DISPLAY) + ?.map((source: any, index: number) => ( + +
+
+ {getConnectorIcon(connector.type)} +
+
+

+ {source.title} +

+

+ {source.description} +

+
+ +
+
+ ))} - {connector.sources?.length > - INITIAL_SOURCES_DISPLAY && ( - - setDialogOpenId( - open ? connector.id : null, - ) - } - > - - - - - - - - )} -
-
- ))} - - ); - })()} -
+ {connector.sources?.length > + INITIAL_SOURCES_DISPLAY && ( + + setDialogOpenId( + open ? connector.id : null, + ) + } + > + + + + + + + + )} +
+ + ))} + + ); + })()} +
- {/* Answer Section */} -
- { -
- {message.annotations && - (() => { - // Get all ANSWER annotations - const answerAnnotations = ( - message.annotations as any[] - ).filter((a) => a.type === "ANSWER"); + {/* Answer Section */} +
+ { +
+ {message.annotations && + (() => { + // Get all ANSWER annotations + const answerAnnotations = ( + message.annotations as any[] + ).filter((a) => a.type === "ANSWER"); - // Get the latest ANSWER annotation - const latestAnswer = - answerAnnotations.length > 0 - ? answerAnnotations[ - answerAnnotations.length - 1 - ] - : null; + // Get the latest ANSWER annotation + const latestAnswer = + answerAnnotations.length > 0 + ? answerAnnotations[ + answerAnnotations.length - 1 + ] + : null; - // If we have a latest ANSWER annotation with content, render it - if ( - latestAnswer?.content && - latestAnswer.content.length > 0 - ) { - return ( - - getCitationSource(id, index) - } - type="ai" - /> - ); - } + // If we have a latest ANSWER annotation with content, render it + if ( + latestAnswer?.content && + latestAnswer.content.length > 0 + ) { + return ( + + getCitationSource(id, index) + } + type="ai" + /> + ); + } - // Fallback to the message content if no ANSWER annotation is available - return getCitationSource(id, index)} - type="ai" - />; - })()} + // Fallback to the message content if no ANSWER annotation is available + return ( + + getCitationSource(id, index) + } + type="ai" + /> + ); + })()}
}
{/* Further Questions Section */} - {message.annotations && (() => { - // Get all FURTHER_QUESTIONS annotations - const furtherQuestionsAnnotations = (message.annotations as any[]) - .filter(a => a.type === 'FURTHER_QUESTIONS'); + {message.annotations && + (() => { + // Get all FURTHER_QUESTIONS annotations + const furtherQuestionsAnnotations = ( + message.annotations as any[] + ).filter((a) => a.type === "FURTHER_QUESTIONS"); - // Get the latest FURTHER_QUESTIONS annotation - const latestFurtherQuestions = furtherQuestionsAnnotations.length > 0 - ? furtherQuestionsAnnotations[furtherQuestionsAnnotations.length - 1] - : null; + // Get the latest FURTHER_QUESTIONS annotation + const latestFurtherQuestions = + furtherQuestionsAnnotations.length > 0 + ? furtherQuestionsAnnotations[ + furtherQuestionsAnnotations.length - 1 + ] + : null; - // Only render if we have questions - if (!latestFurtherQuestions?.content || latestFurtherQuestions.content.length === 0) { - return null; - } + // Only render if we have questions + if ( + !latestFurtherQuestions?.content || + latestFurtherQuestions.content.length === 0 + ) { + return null; + } - const furtherQuestions = latestFurtherQuestions.content; + const furtherQuestions = latestFurtherQuestions.content; - return ( -
- {/* Main container with improved styling */} -
- {/* Header with better visual separation */} -
-
-

- - - - Follow-up Questions -

- - {furtherQuestions.length} suggestion{furtherQuestions.length !== 1 ? 's' : ''} - + return ( +
+ {/* Main container with improved styling */} +
+ {/* Header with better visual separation */} +
+
+

+ + + + Follow-up Questions +

+ + {furtherQuestions.length} suggestion + {furtherQuestions.length !== 1 ? "s" : ""} + +
-
- {/* Questions container with enhanced scrolling */} -
-
- {/* Left fade gradient */} -
- - {/* Right fade gradient */} -
- - {/* Scrollable container */} -
-
- {furtherQuestions.map((question: any, qIndex: number) => ( - - ))} + {/* Questions container with enhanced scrolling */} +
+
+ {/* Left fade gradient */} +
+ + {/* Right fade gradient */} +
+ + {/* Scrollable container */} +
+
+ {furtherQuestions.map( + (question: any, qIndex: number) => ( + + ), + )} +
-
- ); - })()} + ); + })()} {/* Scroll to bottom button */}
- -
-
- {/* Enhanced Document Selection Dialog */} - - - {}} - documentsCount={documents?.length || 0} - /> - - - - -
- - Select Documents - - {selectedDocuments.length} selected - -
- -
- - Choose documents to include in your research context. Use - filters and search to find specific documents. - -
+ {/* New Chat Input Form */} +
+
+ + {/* Send button */} + +
+
+
+ {/* Enhanced Document Selection Dialog */} + + + {}} + documentsCount={documents?.length || 0} + /> + + + + +
+ + Select Documents + + {selectedDocuments.length} selected + +
+ +
+ + Choose documents to include in your research context. Use + filters and search to find specific documents. + +
- {/* Enhanced Search and Filter Controls */} -
-
- {/* Search Input */} -
- - setDocumentFilter(e.target.value)} - /> - {documentFilter && ( - - )} -
+ {/* Enhanced Search and Filter Controls */} +
+
+ {/* Search Input */} +
+ + setDocumentFilter(e.target.value)} + /> + {documentFilter && ( + + )} +
- {/* Document Type Filter */} - { - setDocumentTypeFilter(newType); - setDocumentsPage(1); // Reset to page 1 when filter changes - }} - counts={documentTypeCounts} - /> -
+ {/* Document Type Filter */} + { + setDocumentTypeFilter(newType); + setDocumentsPage(1); // Reset to page 1 when filter changes + }} + counts={documentTypeCounts} + /> +
- {/* Results Summary */} -
- - {isLoadingDocuments - ? "Loading documents..." - : `Showing ${paginatedDocuments.length} of ${filteredDocuments.length} documents`} - - {filteredDocuments.length > 0 && ( - - Page {documentsPage} of {totalPages} - - )} -
-
+ {/* Results Summary */} +
+ + {isLoadingDocuments + ? "Loading documents..." + : `Showing ${paginatedDocuments.length} of ${filteredDocuments.length} documents`} + + {filteredDocuments.length > 0 && ( + + Page {documentsPage} of {totalPages} + + )} +
+
- {/* Document List with Proper Scrolling */} -
-
- {isLoadingDocuments ? ( - // Enhanced skeleton loading - Array.from({ length: 6 }, (_, i) => ( - - )) - ) : documentsError ? ( -
-
- -
-

- Error loading documents -

-

- Please try refreshing the page -

-
- ) : filteredDocuments.length === 0 ? ( -
-
- -
-

- No documents found -

-

- {documentFilter || documentTypeFilter !== "ALL" - ? "Try adjusting your search or filters" - : "Upload documents to get started"} -

- {!documentFilter && documentTypeFilter === "ALL" && ( - - )} -
- ) : ( - // Enhanced document list - paginatedDocuments.map((document) => { - const isSelected = selectedDocuments.includes( - document.id, - ); - const typeLabel = document.document_type - .replace(/_/g, " ") - .toLowerCase(); + {/* Document List with Proper Scrolling */} +
+
+ {isLoadingDocuments ? ( + // Enhanced skeleton loading + Array.from({ length: 6 }, (_, i) => ( + + )) + ) : documentsError ? ( +
+
+ +
+

+ Error loading documents +

+

+ Please try refreshing the page +

+
+ ) : filteredDocuments.length === 0 ? ( +
+
+ +
+

+ No documents found +

+

+ {documentFilter || documentTypeFilter !== "ALL" + ? "Try adjusting your search or filters" + : "Upload documents to get started"} +

+ {!documentFilter && documentTypeFilter === "ALL" && ( + + )} +
+ ) : ( + // Enhanced document list + paginatedDocuments.map((document) => { + const isSelected = selectedDocuments.includes( + document.id, + ); + const typeLabel = document.document_type + .replace(/_/g, " ") + .toLowerCase(); - return ( -
handleDocumentToggle(document.id)} - > -
-
- {getConnectorIcon(document.document_type)} -
-
-
-
-

- {document.title} -

- {isSelected && ( -
-
- -
-
- )} -
-
- - {typeLabel} - - - {new Date( - document.created_at, - ).toLocaleDateString()} - -
-

- {document.content.substring(0, 200)}... -

-
-
- ); - }) - )} -
-
+ return ( +
handleDocumentToggle(document.id)} + > +
+
+ {getConnectorIcon(document.document_type)} +
+
+
+
+

+ {document.title} +

+ {isSelected && ( +
+
+ +
+
+ )} +
+
+ + {typeLabel} + + + {new Date( + document.created_at, + ).toLocaleDateString()} + +
+

+ {document.content.substring(0, 200)}... +

+
+
+ ); + }) + )} +
+
- {/* Enhanced Pagination Controls */} - {totalPages > 1 && ( -
-
- -
- {Array.from( - { length: Math.min(5, totalPages) }, - (_, i) => { - const page = - documentsPage <= 3 - ? i + 1 - : documentsPage - 2 + i; - if (page > totalPages) return null; - return ( - - ); - }, - )} - {totalPages > 5 && documentsPage < totalPages - 2 && ( - <> - - ... - - - - )} -
- -
-
- )} + {/* Enhanced Pagination Controls */} + {totalPages > 1 && ( +
+
+ +
+ {Array.from( + { length: Math.min(5, totalPages) }, + (_, i) => { + const page = + documentsPage <= 3 + ? i + 1 + : documentsPage - 2 + i; + if (page > totalPages) return null; + return ( + + ); + }, + )} + {totalPages > 5 && documentsPage < totalPages - 2 && ( + <> + + ... + + + + )} +
+ +
+
+ )} - {/* Enhanced Footer */} - -
- - {selectedDocuments.length} of {filteredDocuments.length}{" "} - document{selectedDocuments.length !== 1 ? "s" : ""}{" "} - selected - -
-
- - + - + -
-
-
-
+ if (allSelected) { + setSelectedDocuments((prev) => + prev.filter((id) => !allFilteredIds.includes(id)), + ); + } else { + setSelectedDocuments((prev) => [ + ...new Set([...prev, ...allFilteredIds]), + ]); + } + }} + disabled={filteredDocuments.length === 0} + > + {filteredDocuments.every((doc) => + selectedDocuments.includes(doc.id), + ) + ? "Deselect" + : "Select"}{" "} + All Filtered + +
+ + +
- {/* Connector Selection Dialog */} - - - {}} - /> - - - - Select Connectors - - Choose which data sources to include in your research - - + {/* Connector Selection Dialog */} + + + {}} + /> + + + + Select Connectors + + Choose which data sources to include in your research + + - {/* Connector selection grid */} -
- {isLoadingConnectors ? ( -
- -
- ) : ( - connectorSourceItems.map((connector) => { - const isSelected = selectedConnectors.includes( - connector.type, - ); + {/* Connector selection grid */} +
+ {isLoadingConnectors ? ( +
+ +
+ ) : ( + connectorSourceItems.map((connector) => { + const isSelected = selectedConnectors.includes( + connector.type, + ); - return ( -
{ - setSelectedConnectors( - isSelected - ? selectedConnectors.filter( - (type) => type !== connector.type, - ) - : [...selectedConnectors, connector.type], - ); - }} - role="checkbox" - aria-checked={isSelected} - tabIndex={0} - > -
- {getConnectorIcon(connector.type)} -
- - {connector.name} - - {isSelected && ( - - )} -
- ); - }) - )} -
+ return ( +
{ + setSelectedConnectors( + isSelected + ? selectedConnectors.filter( + (type) => type !== connector.type, + ) + : [...selectedConnectors, connector.type], + ); + }} + role="checkbox" + aria-checked={isSelected} + tabIndex={0} + > +
+ {getConnectorIcon(connector.type)} +
+ + {connector.name} + + {isSelected && ( + + )} +
+ ); + }) + )} +
- -
- - -
-
-
-
+ +
+ + +
+
+
+
- {/* Search Mode Control */} -
- - -
+ {/* Search Mode Control */} +
+ + +
- {/* Research Mode Control */} -
- -
+ {/* Research Mode Control */} +
+ +
- {/* Fast LLM Selector */} -
- -
-
-
-
+ {/* Fast LLM Selector */} +
+ +
+
+
+
- {/* Reference for auto-scrolling */} -
-
- - ); + {/* Reference for auto-scrolling */} +
+
+ + ); }; export default ChatPage; diff --git a/surfsense_web/components/chat/ConnectorComponents.tsx b/surfsense_web/components/chat/ConnectorComponents.tsx index 4d0aa11ef..d7c977b98 100644 --- a/surfsense_web/components/chat/ConnectorComponents.tsx +++ b/surfsense_web/components/chat/ConnectorComponents.tsx @@ -1,6 +1,6 @@ -import React from 'react'; -import { - ChevronDown, +import React from "react"; +import { + ChevronDown, Plus, Search, Globe, @@ -12,78 +12,99 @@ import { Webhook, MessageCircle, FileText, -} from 'lucide-react'; -import { IconBrandNotion, IconBrandSlack, IconBrandYoutube, IconBrandGithub, IconLayoutKanban, IconLinkPlus, IconBrandDiscord } from "@tabler/icons-react"; -import { Button } from '@/components/ui/button'; -import { Connector, ResearchMode } from './types'; +} from "lucide-react"; +import { + IconBrandNotion, + IconBrandSlack, + IconBrandYoutube, + IconBrandGithub, + IconLayoutKanban, + IconLinkPlus, + IconBrandDiscord, + IconTicket, +} from "@tabler/icons-react"; +import { Button } from "@/components/ui/button"; +import { Connector, ResearchMode } from "./types"; // Helper function to get connector icon export const getConnectorIcon = (connectorType: string) => { const iconProps = { className: "h-4 w-4" }; - - switch(connectorType) { - case 'LINKUP_API': + + switch (connectorType) { + case "LINKUP_API": return ; - case 'LINEAR_CONNECTOR': + case "LINEAR_CONNECTOR": return ; - case 'GITHUB_CONNECTOR': + case "GITHUB_CONNECTOR": return ; - case 'YOUTUBE_VIDEO': + case "YOUTUBE_VIDEO": return ; - case 'CRAWLED_URL': + case "CRAWLED_URL": return ; - case 'FILE': - return ; - case 'EXTENSION': - return ; - case 'SERPER_API': - case 'TAVILY_API': + case "FILE": + return ; + case "EXTENSION": + return ; + case "SERPER_API": + case "TAVILY_API": return ; - case 'SLACK_CONNECTOR': + case "SLACK_CONNECTOR": return ; - case 'NOTION_CONNECTOR': + case "NOTION_CONNECTOR": return ; - case 'DISCORD_CONNECTOR': + case "DISCORD_CONNECTOR": return ; 
- case 'DEEP': + case "JIRA_CONNECTOR": + return ; + case "DEEP": return ; - case 'DEEPER': + case "DEEPER": return ; - case 'DEEPEST': + case "DEEPEST": return ; default: return ; } }; -export const researcherOptions: { value: ResearchMode; label: string; icon: React.ReactNode }[] = [ +export const researcherOptions: { + value: ResearchMode; + label: string; + icon: React.ReactNode; +}[] = [ { - value: 'QNA', - label: 'Q/A', - icon: getConnectorIcon('GENERAL') + value: "QNA", + label: "Q/A", + icon: getConnectorIcon("GENERAL"), }, { - value: 'REPORT_GENERAL', - label: 'General', - icon: getConnectorIcon('GENERAL') + value: "REPORT_GENERAL", + label: "General", + icon: getConnectorIcon("GENERAL"), }, { - value: 'REPORT_DEEP', - label: 'Deep', - icon: getConnectorIcon('DEEP') + value: "REPORT_DEEP", + label: "Deep", + icon: getConnectorIcon("DEEP"), }, { - value: 'REPORT_DEEPER', - label: 'Deeper', - icon: getConnectorIcon('DEEPER') + value: "REPORT_DEEPER", + label: "Deeper", + icon: getConnectorIcon("DEEPER"), }, -] +]; /** * Displays a small icon for a connector type */ -export const ConnectorIcon = ({ type, index = 0 }: { type: string; index?: number }) => ( -
( +
@@ -109,24 +130,30 @@ type ConnectorButtonProps = { /** * Button that displays selected connectors and opens connector selection dialog */ -export const ConnectorButton = ({ selectedConnectors, onClick, connectorSources }: ConnectorButtonProps) => { +export const ConnectorButton = ({ + selectedConnectors, + onClick, + connectorSources, +}: ConnectorButtonProps) => { const totalConnectors = connectorSources.length; const selectedCount = selectedConnectors.length; const progressPercentage = (selectedCount / totalConnectors) * 100; - + // Get the name of a single selected connector const getSingleConnectorName = () => { - const connector = connectorSources.find(c => c.type === selectedConnectors[0]); - return connector?.name || ''; + const connector = connectorSources.find( + (c) => c.type === selectedConnectors[0], + ); + return connector?.name || ""; }; - + // Get display text based on selection count const getDisplayText = () => { if (selectedCount === totalConnectors) return "All Connectors"; if (selectedCount === 1) return getSingleConnectorName(); return `${selectedCount} Connectors`; }; - + // Render the empty state (no connectors selected) const renderEmptyState = () => ( <> @@ -134,7 +161,7 @@ export const ConnectorButton = ({ selectedConnectors, onClick, connectorSources Select Connectors ); - + // Render the selected connectors preview const renderSelectedConnectors = () => ( <> @@ -143,32 +170,36 @@ export const ConnectorButton = ({ selectedConnectors, onClick, connectorSources {selectedConnectors.slice(0, 3).map((type, index) => ( ))} - + {/* Show count indicator if more than 3 connectors are selected */} {selectedCount > 3 && }
- + {/* Display text */} {getDisplayText()} ); - + return (
); -}; \ No newline at end of file +}; diff --git a/surfsense_web/lib/connectors/utils.ts b/surfsense_web/lib/connectors/utils.ts index 022459be8..b53ffee64 100644 --- a/surfsense_web/lib/connectors/utils.ts +++ b/surfsense_web/lib/connectors/utils.ts @@ -1,14 +1,15 @@ // Helper function to get connector type display name export const getConnectorTypeDisplay = (type: string): string => { - const typeMap: Record = { - "SERPER_API": "Serper API", - "TAVILY_API": "Tavily API", - "SLACK_CONNECTOR": "Slack", - "NOTION_CONNECTOR": "Notion", - "GITHUB_CONNECTOR": "GitHub", - "LINEAR_CONNECTOR": "Linear", - "DISCORD_CONNECTOR": "Discord", - "LINKUP_API": "Linkup", - }; - return typeMap[type] || type; -}; + const typeMap: Record = { + SERPER_API: "Serper API", + TAVILY_API: "Tavily API", + SLACK_CONNECTOR: "Slack", + NOTION_CONNECTOR: "Notion", + GITHUB_CONNECTOR: "GitHub", + LINEAR_CONNECTOR: "Linear", + JIRA_CONNECTOR: "Jira", + DISCORD_CONNECTOR: "Discord", + LINKUP_API: "Linkup", + }; + return typeMap[type] || type; +};