mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-06 20:15:17 +02:00
Merge pull request #404 from MODSetter/dev
feat: add unique identifier hash for documents to prevent duplicates across various connectors
This commit is contained in:
commit
d868bae134
22 changed files with 1631 additions and 356 deletions
|
|
@ -0,0 +1,54 @@
|
||||||
|
"""Add unique_identifier_hash column to documents table
|
||||||
|
|
||||||
|
Revision ID: 29
|
||||||
|
Revises: 28
|
||||||
|
"""
|
||||||
|
|
||||||
|
from collections.abc import Sequence
|
||||||
|
|
||||||
|
import sqlalchemy as sa
|
||||||
|
from sqlalchemy import inspect
|
||||||
|
|
||||||
|
from alembic import op
|
||||||
|
|
||||||
|
# revision identifiers, used by Alembic.
|
||||||
|
revision: str = "29"
|
||||||
|
down_revision: str | None = "28"
|
||||||
|
branch_labels: str | Sequence[str] | None = None
|
||||||
|
depends_on: str | Sequence[str] | None = None
|
||||||
|
|
||||||
|
|
||||||
|
def upgrade() -> None:
|
||||||
|
bind = op.get_bind()
|
||||||
|
inspector = inspect(bind)
|
||||||
|
columns = [col["name"] for col in inspector.get_columns("documents")]
|
||||||
|
|
||||||
|
# Only add the column if it doesn't already exist
|
||||||
|
if "unique_identifier_hash" not in columns:
|
||||||
|
op.add_column(
|
||||||
|
"documents",
|
||||||
|
sa.Column("unique_identifier_hash", sa.String(), nullable=True),
|
||||||
|
)
|
||||||
|
op.create_index(
|
||||||
|
op.f("ix_documents_unique_identifier_hash"),
|
||||||
|
"documents",
|
||||||
|
["unique_identifier_hash"],
|
||||||
|
unique=False,
|
||||||
|
)
|
||||||
|
op.create_unique_constraint(
|
||||||
|
op.f("uq_documents_unique_identifier_hash"),
|
||||||
|
"documents",
|
||||||
|
["unique_identifier_hash"],
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
print(
|
||||||
|
"Column 'unique_identifier_hash' already exists. Skipping column creation."
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def downgrade() -> None:
|
||||||
|
op.drop_constraint(
|
||||||
|
op.f("uq_documents_unique_identifier_hash"), "documents", type_="unique"
|
||||||
|
)
|
||||||
|
op.drop_index(op.f("ix_documents_unique_identifier_hash"), table_name="documents")
|
||||||
|
op.drop_column("documents", "unique_identifier_hash")
|
||||||
|
|
@ -174,6 +174,7 @@ class Document(BaseModel, TimestampMixin):
|
||||||
|
|
||||||
content = Column(Text, nullable=False)
|
content = Column(Text, nullable=False)
|
||||||
content_hash = Column(String, nullable=False, index=True, unique=True)
|
content_hash = Column(String, nullable=False, index=True, unique=True)
|
||||||
|
unique_identifier_hash = Column(String, nullable=True, index=True, unique=True)
|
||||||
embedding = Column(Vector(config.embedding_model_instance.dimension))
|
embedding = Column(Vector(config.embedding_model_instance.dimension))
|
||||||
|
|
||||||
search_space_id = Column(
|
search_space_id = Column(
|
||||||
|
|
|
||||||
|
|
@ -16,11 +16,12 @@ from app.utils.document_converters import (
|
||||||
create_document_chunks,
|
create_document_chunks,
|
||||||
generate_content_hash,
|
generate_content_hash,
|
||||||
generate_document_summary,
|
generate_document_summary,
|
||||||
|
generate_unique_identifier_hash,
|
||||||
)
|
)
|
||||||
|
|
||||||
from .base import (
|
from .base import (
|
||||||
calculate_date_range,
|
calculate_date_range,
|
||||||
check_duplicate_document_by_hash,
|
check_document_by_unique_identifier,
|
||||||
get_connector_by_id,
|
get_connector_by_id,
|
||||||
logger,
|
logger,
|
||||||
update_connector_last_indexed,
|
update_connector_last_indexed,
|
||||||
|
|
@ -240,25 +241,100 @@ async def index_airtable_records(
|
||||||
documents_skipped += 1
|
documents_skipped += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
record_id = record.get("id", "Unknown")
|
||||||
|
|
||||||
|
# Generate unique identifier hash for this Airtable record
|
||||||
|
unique_identifier_hash = generate_unique_identifier_hash(
|
||||||
|
DocumentType.AIRTABLE_CONNECTOR,
|
||||||
|
record_id,
|
||||||
|
search_space_id,
|
||||||
|
)
|
||||||
|
|
||||||
# Generate content hash
|
# Generate content hash
|
||||||
content_hash = generate_content_hash(
|
content_hash = generate_content_hash(
|
||||||
markdown_content, search_space_id
|
markdown_content, search_space_id
|
||||||
)
|
)
|
||||||
|
|
||||||
# Check if document already exists
|
# Check if document with this unique identifier already exists
|
||||||
existing_document_by_hash = (
|
existing_document = (
|
||||||
await check_duplicate_document_by_hash(
|
await check_document_by_unique_identifier(
|
||||||
session, content_hash
|
session, unique_identifier_hash
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
if existing_document_by_hash:
|
if existing_document:
|
||||||
logger.info(
|
# Document exists - check if content has changed
|
||||||
f"Document with content hash {content_hash} already exists for message {record.get('id')}. Skipping processing."
|
if existing_document.content_hash == content_hash:
|
||||||
)
|
logger.info(
|
||||||
documents_skipped += 1
|
f"Document for Airtable record {record_id} unchanged. Skipping."
|
||||||
continue
|
)
|
||||||
|
documents_skipped += 1
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
# Content has changed - update the existing document
|
||||||
|
logger.info(
|
||||||
|
f"Content changed for Airtable record {record_id}. Updating document."
|
||||||
|
)
|
||||||
|
|
||||||
|
# Generate document summary
|
||||||
|
user_llm = await get_user_long_context_llm(
|
||||||
|
session, user_id, search_space_id
|
||||||
|
)
|
||||||
|
|
||||||
|
if user_llm:
|
||||||
|
document_metadata = {
|
||||||
|
"record_id": record_id,
|
||||||
|
"created_time": record.get(
|
||||||
|
"CREATED_TIME()", ""
|
||||||
|
),
|
||||||
|
"document_type": "Airtable Record",
|
||||||
|
"connector_type": "Airtable",
|
||||||
|
}
|
||||||
|
(
|
||||||
|
summary_content,
|
||||||
|
summary_embedding,
|
||||||
|
) = await generate_document_summary(
|
||||||
|
markdown_content,
|
||||||
|
user_llm,
|
||||||
|
document_metadata,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
summary_content = (
|
||||||
|
f"Airtable Record: {record_id}\n\n"
|
||||||
|
)
|
||||||
|
summary_embedding = (
|
||||||
|
config.embedding_model_instance.embed(
|
||||||
|
summary_content
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
# Process chunks
|
||||||
|
chunks = await create_document_chunks(
|
||||||
|
markdown_content
|
||||||
|
)
|
||||||
|
|
||||||
|
# Update existing document
|
||||||
|
existing_document.title = (
|
||||||
|
f"Airtable Record: {record_id}"
|
||||||
|
)
|
||||||
|
existing_document.content = summary_content
|
||||||
|
existing_document.content_hash = content_hash
|
||||||
|
existing_document.embedding = summary_embedding
|
||||||
|
existing_document.document_metadata = {
|
||||||
|
"record_id": record_id,
|
||||||
|
"created_time": record.get(
|
||||||
|
"CREATED_TIME()", ""
|
||||||
|
),
|
||||||
|
}
|
||||||
|
existing_document.chunks = chunks
|
||||||
|
|
||||||
|
documents_indexed += 1
|
||||||
|
logger.info(
|
||||||
|
f"Successfully updated Airtable record {record_id}"
|
||||||
|
)
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Document doesn't exist - create new one
|
||||||
# Generate document summary
|
# Generate document summary
|
||||||
user_llm = await get_user_long_context_llm(
|
user_llm = await get_user_long_context_llm(
|
||||||
session, user_id, search_space_id
|
session, user_id, search_space_id
|
||||||
|
|
@ -266,7 +342,7 @@ async def index_airtable_records(
|
||||||
|
|
||||||
if user_llm:
|
if user_llm:
|
||||||
document_metadata = {
|
document_metadata = {
|
||||||
"record_id": record.get("id", "Unknown"),
|
"record_id": record_id,
|
||||||
"created_time": record.get("CREATED_TIME()", ""),
|
"created_time": record.get("CREATED_TIME()", ""),
|
||||||
"document_type": "Airtable Record",
|
"document_type": "Airtable Record",
|
||||||
"connector_type": "Airtable",
|
"connector_type": "Airtable",
|
||||||
|
|
@ -279,7 +355,7 @@ async def index_airtable_records(
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
# Fallback to simple summary if no LLM configured
|
# Fallback to simple summary if no LLM configured
|
||||||
summary_content = f"Airtable Record: {record.get('id', 'Unknown')}\n\n"
|
summary_content = f"Airtable Record: {record_id}\n\n"
|
||||||
summary_embedding = (
|
summary_embedding = (
|
||||||
config.embedding_model_instance.embed(
|
config.embedding_model_instance.embed(
|
||||||
summary_content
|
summary_content
|
||||||
|
|
@ -291,18 +367,19 @@ async def index_airtable_records(
|
||||||
|
|
||||||
# Create and store new document
|
# Create and store new document
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Creating new document for Airtable record: {record.get('id', 'Unknown')}"
|
f"Creating new document for Airtable record: {record_id}"
|
||||||
)
|
)
|
||||||
document = Document(
|
document = Document(
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
title=f"Airtable Record: {record.get('id', 'Unknown')}",
|
title=f"Airtable Record: {record_id}",
|
||||||
document_type=DocumentType.AIRTABLE_CONNECTOR,
|
document_type=DocumentType.AIRTABLE_CONNECTOR,
|
||||||
document_metadata={
|
document_metadata={
|
||||||
"record_id": record.get("id", "Unknown"),
|
"record_id": record_id,
|
||||||
"created_time": record.get("CREATED_TIME()", ""),
|
"created_time": record.get("CREATED_TIME()", ""),
|
||||||
},
|
},
|
||||||
content=summary_content,
|
content=summary_content,
|
||||||
content_hash=content_hash,
|
content_hash=content_hash,
|
||||||
|
unique_identifier_hash=unique_identifier_hash,
|
||||||
embedding=summary_embedding,
|
embedding=summary_embedding,
|
||||||
chunks=chunks,
|
chunks=chunks,
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -37,6 +37,30 @@ async def check_duplicate_document_by_hash(
|
||||||
return existing_doc_result.scalars().first()
|
return existing_doc_result.scalars().first()
|
||||||
|
|
||||||
|
|
||||||
|
async def check_document_by_unique_identifier(
|
||||||
|
session: AsyncSession, unique_identifier_hash: str
|
||||||
|
) -> Document | None:
|
||||||
|
"""
|
||||||
|
Check if a document with the given unique identifier hash already exists.
|
||||||
|
Eagerly loads chunks to avoid lazy loading issues during updates.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
session: Database session
|
||||||
|
unique_identifier_hash: Hash of the unique identifier from the source system
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Existing document if found, None otherwise
|
||||||
|
"""
|
||||||
|
from sqlalchemy.orm import selectinload
|
||||||
|
|
||||||
|
existing_doc_result = await session.execute(
|
||||||
|
select(Document)
|
||||||
|
.options(selectinload(Document.chunks))
|
||||||
|
.where(Document.unique_identifier_hash == unique_identifier_hash)
|
||||||
|
)
|
||||||
|
return existing_doc_result.scalars().first()
|
||||||
|
|
||||||
|
|
||||||
async def get_connector_by_id(
|
async def get_connector_by_id(
|
||||||
session: AsyncSession, connector_id: int, connector_type: SearchSourceConnectorType
|
session: AsyncSession, connector_id: int, connector_type: SearchSourceConnectorType
|
||||||
) -> SearchSourceConnector | None:
|
) -> SearchSourceConnector | None:
|
||||||
|
|
|
||||||
|
|
@ -16,10 +16,11 @@ from app.utils.document_converters import (
|
||||||
create_document_chunks,
|
create_document_chunks,
|
||||||
generate_content_hash,
|
generate_content_hash,
|
||||||
generate_document_summary,
|
generate_document_summary,
|
||||||
|
generate_unique_identifier_hash,
|
||||||
)
|
)
|
||||||
|
|
||||||
from .base import (
|
from .base import (
|
||||||
check_duplicate_document_by_hash,
|
check_document_by_unique_identifier,
|
||||||
get_connector_by_id,
|
get_connector_by_id,
|
||||||
logger,
|
logger,
|
||||||
update_connector_last_indexed,
|
update_connector_last_indexed,
|
||||||
|
|
@ -209,18 +210,92 @@ async def index_clickup_tasks(
|
||||||
documents_skipped += 1
|
documents_skipped += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Hash for duplicates
|
# Generate unique identifier hash for this ClickUp task
|
||||||
content_hash = generate_content_hash(task_content, search_space_id)
|
unique_identifier_hash = generate_unique_identifier_hash(
|
||||||
existing_document_by_hash = await check_duplicate_document_by_hash(
|
DocumentType.CLICKUP_CONNECTOR, task_id, search_space_id
|
||||||
session, content_hash
|
|
||||||
)
|
)
|
||||||
if existing_document_by_hash:
|
|
||||||
logger.info(
|
|
||||||
f"Document with content hash {content_hash} already exists for task {task_name}. Skipping processing."
|
|
||||||
)
|
|
||||||
documents_skipped += 1
|
|
||||||
continue
|
|
||||||
|
|
||||||
|
# Generate content hash
|
||||||
|
content_hash = generate_content_hash(task_content, search_space_id)
|
||||||
|
|
||||||
|
# Check if document with this unique identifier already exists
|
||||||
|
existing_document = await check_document_by_unique_identifier(
|
||||||
|
session, unique_identifier_hash
|
||||||
|
)
|
||||||
|
|
||||||
|
if existing_document:
|
||||||
|
# Document exists - check if content has changed
|
||||||
|
if existing_document.content_hash == content_hash:
|
||||||
|
logger.info(
|
||||||
|
f"Document for ClickUp task {task_name} unchanged. Skipping."
|
||||||
|
)
|
||||||
|
documents_skipped += 1
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
# Content has changed - update the existing document
|
||||||
|
logger.info(
|
||||||
|
f"Content changed for ClickUp task {task_name}. Updating document."
|
||||||
|
)
|
||||||
|
|
||||||
|
# Generate summary with metadata
|
||||||
|
user_llm = await get_user_long_context_llm(
|
||||||
|
session, user_id, search_space_id
|
||||||
|
)
|
||||||
|
|
||||||
|
if user_llm:
|
||||||
|
document_metadata = {
|
||||||
|
"task_id": task_id,
|
||||||
|
"task_name": task_name,
|
||||||
|
"task_status": task_status,
|
||||||
|
"task_priority": task_priority,
|
||||||
|
"task_list": task_list_name,
|
||||||
|
"task_space": task_space_name,
|
||||||
|
"assignees": len(task_assignees),
|
||||||
|
"document_type": "ClickUp Task",
|
||||||
|
"connector_type": "ClickUp",
|
||||||
|
}
|
||||||
|
(
|
||||||
|
summary_content,
|
||||||
|
summary_embedding,
|
||||||
|
) = await generate_document_summary(
|
||||||
|
task_content, user_llm, document_metadata
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
summary_content = task_content
|
||||||
|
summary_embedding = (
|
||||||
|
config.embedding_model_instance.embed(task_content)
|
||||||
|
)
|
||||||
|
|
||||||
|
# Process chunks
|
||||||
|
chunks = await create_document_chunks(task_content)
|
||||||
|
|
||||||
|
# Update existing document
|
||||||
|
existing_document.title = f"Task - {task_name}"
|
||||||
|
existing_document.content = summary_content
|
||||||
|
existing_document.content_hash = content_hash
|
||||||
|
existing_document.embedding = summary_embedding
|
||||||
|
existing_document.document_metadata = {
|
||||||
|
"task_id": task_id,
|
||||||
|
"task_name": task_name,
|
||||||
|
"task_status": task_status,
|
||||||
|
"task_priority": task_priority,
|
||||||
|
"task_assignees": task_assignees,
|
||||||
|
"task_due_date": task_due_date,
|
||||||
|
"task_created": task_created,
|
||||||
|
"task_updated": task_updated,
|
||||||
|
"indexed_at": datetime.now().strftime(
|
||||||
|
"%Y-%m-%d %H:%M:%S"
|
||||||
|
),
|
||||||
|
}
|
||||||
|
existing_document.chunks = chunks
|
||||||
|
|
||||||
|
documents_indexed += 1
|
||||||
|
logger.info(
|
||||||
|
f"Successfully updated ClickUp task {task_name}"
|
||||||
|
)
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Document doesn't exist - create new one
|
||||||
# Generate summary with metadata
|
# Generate summary with metadata
|
||||||
user_llm = await get_user_long_context_llm(
|
user_llm = await get_user_long_context_llm(
|
||||||
session, user_id, search_space_id
|
session, user_id, search_space_id
|
||||||
|
|
@ -270,6 +345,7 @@ async def index_clickup_tasks(
|
||||||
},
|
},
|
||||||
content=summary_content,
|
content=summary_content,
|
||||||
content_hash=content_hash,
|
content_hash=content_hash,
|
||||||
|
unique_identifier_hash=unique_identifier_hash,
|
||||||
embedding=summary_embedding,
|
embedding=summary_embedding,
|
||||||
chunks=chunks,
|
chunks=chunks,
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -16,11 +16,12 @@ from app.utils.document_converters import (
|
||||||
create_document_chunks,
|
create_document_chunks,
|
||||||
generate_content_hash,
|
generate_content_hash,
|
||||||
generate_document_summary,
|
generate_document_summary,
|
||||||
|
generate_unique_identifier_hash,
|
||||||
)
|
)
|
||||||
|
|
||||||
from .base import (
|
from .base import (
|
||||||
calculate_date_range,
|
calculate_date_range,
|
||||||
check_duplicate_document_by_hash,
|
check_document_by_unique_identifier,
|
||||||
get_connector_by_id,
|
get_connector_by_id,
|
||||||
logger,
|
logger,
|
||||||
update_connector_last_indexed,
|
update_connector_last_indexed,
|
||||||
|
|
@ -217,26 +218,97 @@ async def index_confluence_pages(
|
||||||
documents_skipped += 1
|
documents_skipped += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
# Generate unique identifier hash for this Confluence page
|
||||||
|
unique_identifier_hash = generate_unique_identifier_hash(
|
||||||
|
DocumentType.CONFLUENCE_CONNECTOR, page_id, search_space_id
|
||||||
|
)
|
||||||
|
|
||||||
# Generate content hash
|
# Generate content hash
|
||||||
content_hash = generate_content_hash(full_content, search_space_id)
|
content_hash = generate_content_hash(full_content, search_space_id)
|
||||||
|
|
||||||
# Check if document already exists
|
# Check if document with this unique identifier already exists
|
||||||
existing_document_by_hash = await check_duplicate_document_by_hash(
|
existing_document = await check_document_by_unique_identifier(
|
||||||
session, content_hash
|
session, unique_identifier_hash
|
||||||
)
|
)
|
||||||
|
|
||||||
if existing_document_by_hash:
|
comment_count = len(comments)
|
||||||
logger.info(
|
|
||||||
f"Document with content hash {content_hash} already exists for page {page_title}. Skipping processing."
|
|
||||||
)
|
|
||||||
documents_skipped += 1
|
|
||||||
continue
|
|
||||||
|
|
||||||
|
if existing_document:
|
||||||
|
# Document exists - check if content has changed
|
||||||
|
if existing_document.content_hash == content_hash:
|
||||||
|
logger.info(
|
||||||
|
f"Document for Confluence page {page_title} unchanged. Skipping."
|
||||||
|
)
|
||||||
|
documents_skipped += 1
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
# Content has changed - update the existing document
|
||||||
|
logger.info(
|
||||||
|
f"Content changed for Confluence page {page_title}. Updating document."
|
||||||
|
)
|
||||||
|
|
||||||
|
# Generate summary with metadata
|
||||||
|
user_llm = await get_user_long_context_llm(
|
||||||
|
session, user_id, search_space_id
|
||||||
|
)
|
||||||
|
|
||||||
|
if user_llm:
|
||||||
|
document_metadata = {
|
||||||
|
"page_title": page_title,
|
||||||
|
"page_id": page_id,
|
||||||
|
"space_id": space_id,
|
||||||
|
"comment_count": comment_count,
|
||||||
|
"document_type": "Confluence Page",
|
||||||
|
"connector_type": "Confluence",
|
||||||
|
}
|
||||||
|
(
|
||||||
|
summary_content,
|
||||||
|
summary_embedding,
|
||||||
|
) = await generate_document_summary(
|
||||||
|
full_content, user_llm, document_metadata
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
summary_content = f"Confluence Page: {page_title}\n\nSpace ID: {space_id}\n\n"
|
||||||
|
if page_content:
|
||||||
|
content_preview = page_content[:1000]
|
||||||
|
if len(page_content) > 1000:
|
||||||
|
content_preview += "..."
|
||||||
|
summary_content += (
|
||||||
|
f"Content Preview: {content_preview}\n\n"
|
||||||
|
)
|
||||||
|
summary_content += f"Comments: {comment_count}"
|
||||||
|
summary_embedding = config.embedding_model_instance.embed(
|
||||||
|
summary_content
|
||||||
|
)
|
||||||
|
|
||||||
|
# Process chunks
|
||||||
|
chunks = await create_document_chunks(full_content)
|
||||||
|
|
||||||
|
# Update existing document
|
||||||
|
existing_document.title = f"Confluence - {page_title}"
|
||||||
|
existing_document.content = summary_content
|
||||||
|
existing_document.content_hash = content_hash
|
||||||
|
existing_document.embedding = summary_embedding
|
||||||
|
existing_document.document_metadata = {
|
||||||
|
"page_id": page_id,
|
||||||
|
"page_title": page_title,
|
||||||
|
"space_id": space_id,
|
||||||
|
"comment_count": comment_count,
|
||||||
|
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
||||||
|
}
|
||||||
|
existing_document.chunks = chunks
|
||||||
|
|
||||||
|
documents_indexed += 1
|
||||||
|
logger.info(
|
||||||
|
f"Successfully updated Confluence page {page_title}"
|
||||||
|
)
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Document doesn't exist - create new one
|
||||||
# Generate summary with metadata
|
# Generate summary with metadata
|
||||||
user_llm = await get_user_long_context_llm(
|
user_llm = await get_user_long_context_llm(
|
||||||
session, user_id, search_space_id
|
session, user_id, search_space_id
|
||||||
)
|
)
|
||||||
comment_count = len(comments)
|
|
||||||
|
|
||||||
if user_llm:
|
if user_llm:
|
||||||
document_metadata = {
|
document_metadata = {
|
||||||
|
|
@ -287,6 +359,7 @@ async def index_confluence_pages(
|
||||||
},
|
},
|
||||||
content=summary_content,
|
content=summary_content,
|
||||||
content_hash=content_hash,
|
content_hash=content_hash,
|
||||||
|
unique_identifier_hash=unique_identifier_hash,
|
||||||
embedding=summary_embedding,
|
embedding=summary_embedding,
|
||||||
chunks=chunks,
|
chunks=chunks,
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -16,11 +16,12 @@ from app.utils.document_converters import (
|
||||||
create_document_chunks,
|
create_document_chunks,
|
||||||
generate_content_hash,
|
generate_content_hash,
|
||||||
generate_document_summary,
|
generate_document_summary,
|
||||||
|
generate_unique_identifier_hash,
|
||||||
)
|
)
|
||||||
|
|
||||||
from .base import (
|
from .base import (
|
||||||
build_document_metadata_string,
|
build_document_metadata_string,
|
||||||
check_duplicate_document_by_hash,
|
check_document_by_unique_identifier,
|
||||||
get_connector_by_id,
|
get_connector_by_id,
|
||||||
logger,
|
logger,
|
||||||
update_connector_last_indexed,
|
update_connector_last_indexed,
|
||||||
|
|
@ -307,23 +308,98 @@ async def index_discord_messages(
|
||||||
combined_document_string = build_document_metadata_string(
|
combined_document_string = build_document_metadata_string(
|
||||||
metadata_sections
|
metadata_sections
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Generate unique identifier hash for this Discord channel
|
||||||
|
unique_identifier_hash = generate_unique_identifier_hash(
|
||||||
|
DocumentType.DISCORD_CONNECTOR, channel_id, search_space_id
|
||||||
|
)
|
||||||
|
|
||||||
|
# Generate content hash
|
||||||
content_hash = generate_content_hash(
|
content_hash = generate_content_hash(
|
||||||
combined_document_string, search_space_id
|
combined_document_string, search_space_id
|
||||||
)
|
)
|
||||||
|
|
||||||
# Skip duplicates by hash
|
# Check if document with this unique identifier already exists
|
||||||
existing_document_by_hash = (
|
existing_document = await check_document_by_unique_identifier(
|
||||||
await check_duplicate_document_by_hash(
|
session, unique_identifier_hash
|
||||||
session, content_hash
|
|
||||||
)
|
|
||||||
)
|
)
|
||||||
if existing_document_by_hash:
|
|
||||||
logger.info(
|
|
||||||
f"Document with content hash {content_hash} already exists for channel {guild_name}#{channel_name}. Skipping processing."
|
|
||||||
)
|
|
||||||
documents_skipped += 1
|
|
||||||
continue
|
|
||||||
|
|
||||||
|
if existing_document:
|
||||||
|
# Document exists - check if content has changed
|
||||||
|
if existing_document.content_hash == content_hash:
|
||||||
|
logger.info(
|
||||||
|
f"Document for Discord channel {guild_name}#{channel_name} unchanged. Skipping."
|
||||||
|
)
|
||||||
|
documents_skipped += 1
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
# Content has changed - update the existing document
|
||||||
|
logger.info(
|
||||||
|
f"Content changed for Discord channel {guild_name}#{channel_name}. Updating document."
|
||||||
|
)
|
||||||
|
|
||||||
|
# Get user's long context LLM
|
||||||
|
user_llm = await get_user_long_context_llm(
|
||||||
|
session, user_id, search_space_id
|
||||||
|
)
|
||||||
|
if not user_llm:
|
||||||
|
logger.error(
|
||||||
|
f"No long context LLM configured for user {user_id}"
|
||||||
|
)
|
||||||
|
skipped_channels.append(
|
||||||
|
f"{guild_name}#{channel_name} (no LLM configured)"
|
||||||
|
)
|
||||||
|
documents_skipped += 1
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Generate summary with metadata
|
||||||
|
document_metadata = {
|
||||||
|
"guild_name": guild_name,
|
||||||
|
"channel_name": channel_name,
|
||||||
|
"message_count": len(formatted_messages),
|
||||||
|
"document_type": "Discord Channel Messages",
|
||||||
|
"connector_type": "Discord",
|
||||||
|
}
|
||||||
|
(
|
||||||
|
summary_content,
|
||||||
|
summary_embedding,
|
||||||
|
) = await generate_document_summary(
|
||||||
|
combined_document_string,
|
||||||
|
user_llm,
|
||||||
|
document_metadata,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Chunks from channel content
|
||||||
|
chunks = await create_document_chunks(channel_content)
|
||||||
|
|
||||||
|
# Update existing document
|
||||||
|
existing_document.title = (
|
||||||
|
f"Discord - {guild_name}#{channel_name}"
|
||||||
|
)
|
||||||
|
existing_document.content = summary_content
|
||||||
|
existing_document.content_hash = content_hash
|
||||||
|
existing_document.embedding = summary_embedding
|
||||||
|
existing_document.document_metadata = {
|
||||||
|
"guild_name": guild_name,
|
||||||
|
"guild_id": guild_id,
|
||||||
|
"channel_name": channel_name,
|
||||||
|
"channel_id": channel_id,
|
||||||
|
"message_count": len(formatted_messages),
|
||||||
|
"start_date": start_date_iso,
|
||||||
|
"end_date": end_date_iso,
|
||||||
|
"indexed_at": datetime.now(UTC).strftime(
|
||||||
|
"%Y-%m-%d %H:%M:%S"
|
||||||
|
),
|
||||||
|
}
|
||||||
|
existing_document.chunks = chunks
|
||||||
|
|
||||||
|
documents_indexed += 1
|
||||||
|
logger.info(
|
||||||
|
f"Successfully updated Discord channel {guild_name}#{channel_name}"
|
||||||
|
)
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Document doesn't exist - create new one
|
||||||
# Get user's long context LLM
|
# Get user's long context LLM
|
||||||
user_llm = await get_user_long_context_llm(
|
user_llm = await get_user_long_context_llm(
|
||||||
session, user_id, search_space_id
|
session, user_id, search_space_id
|
||||||
|
|
@ -375,6 +451,7 @@ async def index_discord_messages(
|
||||||
},
|
},
|
||||||
content=summary_content,
|
content=summary_content,
|
||||||
content_hash=content_hash,
|
content_hash=content_hash,
|
||||||
|
unique_identifier_hash=unique_identifier_hash,
|
||||||
embedding=summary_embedding,
|
embedding=summary_embedding,
|
||||||
chunks=chunks,
|
chunks=chunks,
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -16,10 +16,11 @@ from app.utils.document_converters import (
|
||||||
create_document_chunks,
|
create_document_chunks,
|
||||||
generate_content_hash,
|
generate_content_hash,
|
||||||
generate_document_summary,
|
generate_document_summary,
|
||||||
|
generate_unique_identifier_hash,
|
||||||
)
|
)
|
||||||
|
|
||||||
from .base import (
|
from .base import (
|
||||||
check_duplicate_document_by_hash,
|
check_document_by_unique_identifier,
|
||||||
get_connector_by_id,
|
get_connector_by_id,
|
||||||
logger,
|
logger,
|
||||||
)
|
)
|
||||||
|
|
@ -199,19 +200,101 @@ async def index_github_repos(
|
||||||
)
|
)
|
||||||
continue # Skip if content fetch failed
|
continue # Skip if content fetch failed
|
||||||
|
|
||||||
content_hash = generate_content_hash(file_content, search_space_id)
|
# Generate unique identifier hash for this GitHub file
|
||||||
|
unique_identifier_hash = generate_unique_identifier_hash(
|
||||||
# Check if document with this content hash already exists
|
DocumentType.GITHUB_CONNECTOR, file_sha, search_space_id
|
||||||
existing_document_by_hash = await check_duplicate_document_by_hash(
|
|
||||||
session, content_hash
|
|
||||||
)
|
)
|
||||||
|
|
||||||
if existing_document_by_hash:
|
# Generate content hash
|
||||||
logger.info(
|
content_hash = generate_content_hash(file_content, search_space_id)
|
||||||
f"Document with content hash {content_hash} already exists for file {full_path_key}. Skipping processing."
|
|
||||||
)
|
|
||||||
continue
|
|
||||||
|
|
||||||
|
# Check if document with this unique identifier already exists
|
||||||
|
existing_document = await check_document_by_unique_identifier(
|
||||||
|
session, unique_identifier_hash
|
||||||
|
)
|
||||||
|
|
||||||
|
if existing_document:
|
||||||
|
# Document exists - check if content has changed
|
||||||
|
if existing_document.content_hash == content_hash:
|
||||||
|
logger.info(
|
||||||
|
f"Document for GitHub file {full_path_key} unchanged. Skipping."
|
||||||
|
)
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
# Content has changed - update the existing document
|
||||||
|
logger.info(
|
||||||
|
f"Content changed for GitHub file {full_path_key}. Updating document."
|
||||||
|
)
|
||||||
|
|
||||||
|
# Generate summary with metadata
|
||||||
|
user_llm = await get_user_long_context_llm(
|
||||||
|
session, user_id, search_space_id
|
||||||
|
)
|
||||||
|
if user_llm:
|
||||||
|
file_extension = (
|
||||||
|
file_path.split(".")[-1]
|
||||||
|
if "." in file_path
|
||||||
|
else None
|
||||||
|
)
|
||||||
|
document_metadata = {
|
||||||
|
"file_path": full_path_key,
|
||||||
|
"repository": repo_full_name,
|
||||||
|
"file_type": file_extension or "unknown",
|
||||||
|
"document_type": "GitHub Repository File",
|
||||||
|
"connector_type": "GitHub",
|
||||||
|
}
|
||||||
|
(
|
||||||
|
summary_content,
|
||||||
|
summary_embedding,
|
||||||
|
) = await generate_document_summary(
|
||||||
|
file_content, user_llm, document_metadata
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
summary_content = f"GitHub file: {full_path_key}\n\n{file_content[:1000]}..."
|
||||||
|
summary_embedding = (
|
||||||
|
config.embedding_model_instance.embed(
|
||||||
|
summary_content
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
# Chunk the content
|
||||||
|
try:
|
||||||
|
if hasattr(config, "code_chunker_instance"):
|
||||||
|
chunks_data = [
|
||||||
|
await create_document_chunks(file_content)
|
||||||
|
][0]
|
||||||
|
else:
|
||||||
|
chunks_data = await create_document_chunks(
|
||||||
|
file_content
|
||||||
|
)
|
||||||
|
except Exception as chunk_err:
|
||||||
|
logger.error(
|
||||||
|
f"Failed to chunk file {full_path_key}: {chunk_err}"
|
||||||
|
)
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Update existing document
|
||||||
|
existing_document.title = f"GitHub - {full_path_key}"
|
||||||
|
existing_document.content = summary_content
|
||||||
|
existing_document.content_hash = content_hash
|
||||||
|
existing_document.embedding = summary_embedding
|
||||||
|
existing_document.document_metadata = {
|
||||||
|
"file_path": file_path,
|
||||||
|
"file_sha": file_sha,
|
||||||
|
"file_url": file_url,
|
||||||
|
"repository": repo_full_name,
|
||||||
|
"indexed_at": datetime.now(UTC).strftime(
|
||||||
|
"%Y-%m-%d %H:%M:%S"
|
||||||
|
),
|
||||||
|
}
|
||||||
|
existing_document.chunks = chunks_data
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
f"Successfully updated GitHub file {full_path_key}"
|
||||||
|
)
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Document doesn't exist - create new one
|
||||||
# Generate summary with metadata
|
# Generate summary with metadata
|
||||||
user_llm = await get_user_long_context_llm(
|
user_llm = await get_user_long_context_llm(
|
||||||
session, user_id, search_space_id
|
session, user_id, search_space_id
|
||||||
|
|
@ -290,6 +373,7 @@ async def index_github_repos(
|
||||||
document_metadata=doc_metadata,
|
document_metadata=doc_metadata,
|
||||||
content=summary_content, # Store summary
|
content=summary_content, # Store summary
|
||||||
content_hash=content_hash,
|
content_hash=content_hash,
|
||||||
|
unique_identifier_hash=unique_identifier_hash,
|
||||||
embedding=summary_embedding,
|
embedding=summary_embedding,
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
chunks=chunks_data, # Associate chunks directly
|
chunks=chunks_data, # Associate chunks directly
|
||||||
|
|
|
||||||
|
|
@ -17,9 +17,11 @@ from app.utils.document_converters import (
|
||||||
create_document_chunks,
|
create_document_chunks,
|
||||||
generate_content_hash,
|
generate_content_hash,
|
||||||
generate_document_summary,
|
generate_document_summary,
|
||||||
|
generate_unique_identifier_hash,
|
||||||
)
|
)
|
||||||
|
|
||||||
from .base import (
|
from .base import (
|
||||||
|
check_document_by_unique_identifier,
|
||||||
get_connector_by_id,
|
get_connector_by_id,
|
||||||
logger,
|
logger,
|
||||||
update_connector_last_indexed,
|
update_connector_last_indexed,
|
||||||
|
|
@ -248,23 +250,99 @@ async def index_google_calendar_events(
|
||||||
location = event.get("location", "")
|
location = event.get("location", "")
|
||||||
description = event.get("description", "")
|
description = event.get("description", "")
|
||||||
|
|
||||||
|
# Generate unique identifier hash for this Google Calendar event
|
||||||
|
unique_identifier_hash = generate_unique_identifier_hash(
|
||||||
|
DocumentType.GOOGLE_CALENDAR_CONNECTOR, event_id, search_space_id
|
||||||
|
)
|
||||||
|
|
||||||
|
# Generate content hash
|
||||||
content_hash = generate_content_hash(event_markdown, search_space_id)
|
content_hash = generate_content_hash(event_markdown, search_space_id)
|
||||||
|
|
||||||
# Duplicate check via simple query using helper in base
|
# Check if document with this unique identifier already exists
|
||||||
from .base import (
|
existing_document = await check_document_by_unique_identifier(
|
||||||
check_duplicate_document_by_hash, # local import to avoid circular at module import
|
session, unique_identifier_hash
|
||||||
)
|
)
|
||||||
|
|
||||||
existing_document_by_hash = await check_duplicate_document_by_hash(
|
if existing_document:
|
||||||
session, content_hash
|
# Document exists - check if content has changed
|
||||||
)
|
if existing_document.content_hash == content_hash:
|
||||||
if existing_document_by_hash:
|
logger.info(
|
||||||
logger.info(
|
f"Document for Google Calendar event {event_summary} unchanged. Skipping."
|
||||||
f"Document with content hash {content_hash} already exists for event {event_summary}. Skipping processing."
|
)
|
||||||
)
|
documents_skipped += 1
|
||||||
documents_skipped += 1
|
continue
|
||||||
continue
|
else:
|
||||||
|
# Content has changed - update the existing document
|
||||||
|
logger.info(
|
||||||
|
f"Content changed for Google Calendar event {event_summary}. Updating document."
|
||||||
|
)
|
||||||
|
|
||||||
|
# Generate summary with metadata
|
||||||
|
user_llm = await get_user_long_context_llm(
|
||||||
|
session, user_id, search_space_id
|
||||||
|
)
|
||||||
|
|
||||||
|
if user_llm:
|
||||||
|
document_metadata = {
|
||||||
|
"event_id": event_id,
|
||||||
|
"event_summary": event_summary,
|
||||||
|
"calendar_id": calendar_id,
|
||||||
|
"start_time": start_time,
|
||||||
|
"end_time": end_time,
|
||||||
|
"location": location or "No location",
|
||||||
|
"document_type": "Google Calendar Event",
|
||||||
|
"connector_type": "Google Calendar",
|
||||||
|
}
|
||||||
|
(
|
||||||
|
summary_content,
|
||||||
|
summary_embedding,
|
||||||
|
) = await generate_document_summary(
|
||||||
|
event_markdown, user_llm, document_metadata
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
summary_content = (
|
||||||
|
f"Google Calendar Event: {event_summary}\n\n"
|
||||||
|
)
|
||||||
|
summary_content += f"Calendar: {calendar_id}\n"
|
||||||
|
summary_content += f"Start: {start_time}\n"
|
||||||
|
summary_content += f"End: {end_time}\n"
|
||||||
|
if location:
|
||||||
|
summary_content += f"Location: {location}\n"
|
||||||
|
if description:
|
||||||
|
desc_preview = description[:1000]
|
||||||
|
if len(description) > 1000:
|
||||||
|
desc_preview += "..."
|
||||||
|
summary_content += f"Description: {desc_preview}\n"
|
||||||
|
summary_embedding = config.embedding_model_instance.embed(
|
||||||
|
summary_content
|
||||||
|
)
|
||||||
|
|
||||||
|
# Process chunks
|
||||||
|
chunks = await create_document_chunks(event_markdown)
|
||||||
|
|
||||||
|
# Update existing document
|
||||||
|
existing_document.title = f"Calendar Event - {event_summary}"
|
||||||
|
existing_document.content = summary_content
|
||||||
|
existing_document.content_hash = content_hash
|
||||||
|
existing_document.embedding = summary_embedding
|
||||||
|
existing_document.document_metadata = {
|
||||||
|
"event_id": event_id,
|
||||||
|
"event_summary": event_summary,
|
||||||
|
"calendar_id": calendar_id,
|
||||||
|
"start_time": start_time,
|
||||||
|
"end_time": end_time,
|
||||||
|
"location": location,
|
||||||
|
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
||||||
|
}
|
||||||
|
existing_document.chunks = chunks
|
||||||
|
|
||||||
|
documents_indexed += 1
|
||||||
|
logger.info(
|
||||||
|
f"Successfully updated Google Calendar event {event_summary}"
|
||||||
|
)
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Document doesn't exist - create new one
|
||||||
# Generate summary with metadata
|
# Generate summary with metadata
|
||||||
user_llm = await get_user_long_context_llm(
|
user_llm = await get_user_long_context_llm(
|
||||||
session, user_id, search_space_id
|
session, user_id, search_space_id
|
||||||
|
|
@ -320,6 +398,7 @@ async def index_google_calendar_events(
|
||||||
},
|
},
|
||||||
content=summary_content,
|
content=summary_content,
|
||||||
content_hash=content_hash,
|
content_hash=content_hash,
|
||||||
|
unique_identifier_hash=unique_identifier_hash,
|
||||||
embedding=summary_embedding,
|
embedding=summary_embedding,
|
||||||
chunks=chunks,
|
chunks=chunks,
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -21,10 +21,11 @@ from app.utils.document_converters import (
|
||||||
create_document_chunks,
|
create_document_chunks,
|
||||||
generate_content_hash,
|
generate_content_hash,
|
||||||
generate_document_summary,
|
generate_document_summary,
|
||||||
|
generate_unique_identifier_hash,
|
||||||
)
|
)
|
||||||
|
|
||||||
from .base import (
|
from .base import (
|
||||||
check_duplicate_document_by_hash,
|
check_document_by_unique_identifier,
|
||||||
get_connector_by_id,
|
get_connector_by_id,
|
||||||
logger,
|
logger,
|
||||||
update_connector_last_indexed,
|
update_connector_last_indexed,
|
||||||
|
|
@ -194,21 +195,85 @@ async def index_google_gmail_messages(
|
||||||
documents_skipped += 1
|
documents_skipped += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
# Generate unique identifier hash for this Gmail message
|
||||||
|
unique_identifier_hash = generate_unique_identifier_hash(
|
||||||
|
DocumentType.GOOGLE_GMAIL_CONNECTOR, message_id, search_space_id
|
||||||
|
)
|
||||||
|
|
||||||
# Generate content hash
|
# Generate content hash
|
||||||
content_hash = generate_content_hash(markdown_content, search_space_id)
|
content_hash = generate_content_hash(markdown_content, search_space_id)
|
||||||
|
|
||||||
# Check if document already exists
|
# Check if document with this unique identifier already exists
|
||||||
existing_document_by_hash = await check_duplicate_document_by_hash(
|
existing_document = await check_document_by_unique_identifier(
|
||||||
session, content_hash
|
session, unique_identifier_hash
|
||||||
)
|
)
|
||||||
|
|
||||||
if existing_document_by_hash:
|
if existing_document:
|
||||||
logger.info(
|
# Document exists - check if content has changed
|
||||||
f"Document with content hash {content_hash} already exists for message {message_id}. Skipping processing."
|
if existing_document.content_hash == content_hash:
|
||||||
)
|
logger.info(
|
||||||
documents_skipped += 1
|
f"Document for Gmail message {subject} unchanged. Skipping."
|
||||||
continue
|
)
|
||||||
|
documents_skipped += 1
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
# Content has changed - update the existing document
|
||||||
|
logger.info(
|
||||||
|
f"Content changed for Gmail message {subject}. Updating document."
|
||||||
|
)
|
||||||
|
|
||||||
|
# Generate summary with metadata
|
||||||
|
user_llm = await get_user_long_context_llm(
|
||||||
|
session, user_id, search_space_id
|
||||||
|
)
|
||||||
|
|
||||||
|
if user_llm:
|
||||||
|
document_metadata = {
|
||||||
|
"message_id": message_id,
|
||||||
|
"thread_id": thread_id,
|
||||||
|
"subject": subject,
|
||||||
|
"sender": sender,
|
||||||
|
"date": date_str,
|
||||||
|
"document_type": "Gmail Message",
|
||||||
|
"connector_type": "Google Gmail",
|
||||||
|
}
|
||||||
|
(
|
||||||
|
summary_content,
|
||||||
|
summary_embedding,
|
||||||
|
) = await generate_document_summary(
|
||||||
|
markdown_content, user_llm, document_metadata
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
summary_content = f"Google Gmail Message: {subject}\n\n"
|
||||||
|
summary_content += f"Sender: {sender}\n"
|
||||||
|
summary_content += f"Date: {date_str}\n"
|
||||||
|
summary_embedding = config.embedding_model_instance.embed(
|
||||||
|
summary_content
|
||||||
|
)
|
||||||
|
|
||||||
|
# Process chunks
|
||||||
|
chunks = await create_document_chunks(markdown_content)
|
||||||
|
|
||||||
|
# Update existing document
|
||||||
|
existing_document.title = f"Gmail: {subject}"
|
||||||
|
existing_document.content = summary_content
|
||||||
|
existing_document.content_hash = content_hash
|
||||||
|
existing_document.embedding = summary_embedding
|
||||||
|
existing_document.document_metadata = {
|
||||||
|
"message_id": message_id,
|
||||||
|
"thread_id": thread_id,
|
||||||
|
"subject": subject,
|
||||||
|
"sender": sender,
|
||||||
|
"date": date_str,
|
||||||
|
"connector_id": connector_id,
|
||||||
|
}
|
||||||
|
existing_document.chunks = chunks
|
||||||
|
|
||||||
|
documents_indexed += 1
|
||||||
|
logger.info(f"Successfully updated Gmail message {subject}")
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Document doesn't exist - create new one
|
||||||
# Generate summary with metadata
|
# Generate summary with metadata
|
||||||
user_llm = await get_user_long_context_llm(
|
user_llm = await get_user_long_context_llm(
|
||||||
session, user_id, search_space_id
|
session, user_id, search_space_id
|
||||||
|
|
@ -258,6 +323,7 @@ async def index_google_gmail_messages(
|
||||||
},
|
},
|
||||||
content=summary_content,
|
content=summary_content,
|
||||||
content_hash=content_hash,
|
content_hash=content_hash,
|
||||||
|
unique_identifier_hash=unique_identifier_hash,
|
||||||
embedding=summary_embedding,
|
embedding=summary_embedding,
|
||||||
chunks=chunks,
|
chunks=chunks,
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -16,11 +16,12 @@ from app.utils.document_converters import (
|
||||||
create_document_chunks,
|
create_document_chunks,
|
||||||
generate_content_hash,
|
generate_content_hash,
|
||||||
generate_document_summary,
|
generate_document_summary,
|
||||||
|
generate_unique_identifier_hash,
|
||||||
)
|
)
|
||||||
|
|
||||||
from .base import (
|
from .base import (
|
||||||
calculate_date_range,
|
calculate_date_range,
|
||||||
check_duplicate_document_by_hash,
|
check_document_by_unique_identifier,
|
||||||
get_connector_by_id,
|
get_connector_by_id,
|
||||||
logger,
|
logger,
|
||||||
update_connector_last_indexed,
|
update_connector_last_indexed,
|
||||||
|
|
@ -200,26 +201,96 @@ async def index_jira_issues(
|
||||||
documents_skipped += 1
|
documents_skipped += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
# Generate unique identifier hash for this Jira issue
|
||||||
|
unique_identifier_hash = generate_unique_identifier_hash(
|
||||||
|
DocumentType.JIRA_CONNECTOR, issue_id, search_space_id
|
||||||
|
)
|
||||||
|
|
||||||
# Generate content hash
|
# Generate content hash
|
||||||
content_hash = generate_content_hash(issue_content, search_space_id)
|
content_hash = generate_content_hash(issue_content, search_space_id)
|
||||||
|
|
||||||
# Check if document already exists
|
# Check if document with this unique identifier already exists
|
||||||
existing_document_by_hash = await check_duplicate_document_by_hash(
|
existing_document = await check_document_by_unique_identifier(
|
||||||
session, content_hash
|
session, unique_identifier_hash
|
||||||
)
|
)
|
||||||
|
|
||||||
if existing_document_by_hash:
|
comment_count = len(formatted_issue.get("comments", []))
|
||||||
logger.info(
|
|
||||||
f"Document with content hash {content_hash} already exists for issue {issue_identifier}. Skipping processing."
|
|
||||||
)
|
|
||||||
documents_skipped += 1
|
|
||||||
continue
|
|
||||||
|
|
||||||
|
if existing_document:
|
||||||
|
# Document exists - check if content has changed
|
||||||
|
if existing_document.content_hash == content_hash:
|
||||||
|
logger.info(
|
||||||
|
f"Document for Jira issue {issue_identifier} unchanged. Skipping."
|
||||||
|
)
|
||||||
|
documents_skipped += 1
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
# Content has changed - update the existing document
|
||||||
|
logger.info(
|
||||||
|
f"Content changed for Jira issue {issue_identifier}. Updating document."
|
||||||
|
)
|
||||||
|
|
||||||
|
# Generate summary with metadata
|
||||||
|
user_llm = await get_user_long_context_llm(
|
||||||
|
session, user_id, search_space_id
|
||||||
|
)
|
||||||
|
|
||||||
|
if user_llm:
|
||||||
|
document_metadata = {
|
||||||
|
"issue_key": issue_identifier,
|
||||||
|
"issue_title": issue_title,
|
||||||
|
"status": formatted_issue.get("status", "Unknown"),
|
||||||
|
"priority": formatted_issue.get("priority", "Unknown"),
|
||||||
|
"comment_count": comment_count,
|
||||||
|
"document_type": "Jira Issue",
|
||||||
|
"connector_type": "Jira",
|
||||||
|
}
|
||||||
|
(
|
||||||
|
summary_content,
|
||||||
|
summary_embedding,
|
||||||
|
) = await generate_document_summary(
|
||||||
|
issue_content, user_llm, document_metadata
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
summary_content = f"Jira Issue {issue_identifier}: {issue_title}\n\nStatus: {formatted_issue.get('status', 'Unknown')}\n\n"
|
||||||
|
if formatted_issue.get("description"):
|
||||||
|
summary_content += f"Description: {formatted_issue.get('description')}\n\n"
|
||||||
|
summary_content += f"Comments: {comment_count}"
|
||||||
|
summary_embedding = config.embedding_model_instance.embed(
|
||||||
|
summary_content
|
||||||
|
)
|
||||||
|
|
||||||
|
# Process chunks
|
||||||
|
chunks = await create_document_chunks(issue_content)
|
||||||
|
|
||||||
|
# Update existing document
|
||||||
|
existing_document.title = (
|
||||||
|
f"Jira - {issue_identifier}: {issue_title}"
|
||||||
|
)
|
||||||
|
existing_document.content = summary_content
|
||||||
|
existing_document.content_hash = content_hash
|
||||||
|
existing_document.embedding = summary_embedding
|
||||||
|
existing_document.document_metadata = {
|
||||||
|
"issue_id": issue_id,
|
||||||
|
"issue_identifier": issue_identifier,
|
||||||
|
"issue_title": issue_title,
|
||||||
|
"state": formatted_issue.get("status", "Unknown"),
|
||||||
|
"comment_count": comment_count,
|
||||||
|
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
||||||
|
}
|
||||||
|
existing_document.chunks = chunks
|
||||||
|
|
||||||
|
documents_indexed += 1
|
||||||
|
logger.info(
|
||||||
|
f"Successfully updated Jira issue {issue_identifier}"
|
||||||
|
)
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Document doesn't exist - create new one
|
||||||
# Generate summary with metadata
|
# Generate summary with metadata
|
||||||
user_llm = await get_user_long_context_llm(
|
user_llm = await get_user_long_context_llm(
|
||||||
session, user_id, search_space_id
|
session, user_id, search_space_id
|
||||||
)
|
)
|
||||||
comment_count = len(formatted_issue.get("comments", []))
|
|
||||||
|
|
||||||
if user_llm:
|
if user_llm:
|
||||||
document_metadata = {
|
document_metadata = {
|
||||||
|
|
@ -270,6 +341,7 @@ async def index_jira_issues(
|
||||||
},
|
},
|
||||||
content=summary_content,
|
content=summary_content,
|
||||||
content_hash=content_hash,
|
content_hash=content_hash,
|
||||||
|
unique_identifier_hash=unique_identifier_hash,
|
||||||
embedding=summary_embedding,
|
embedding=summary_embedding,
|
||||||
chunks=chunks,
|
chunks=chunks,
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -16,11 +16,12 @@ from app.utils.document_converters import (
|
||||||
create_document_chunks,
|
create_document_chunks,
|
||||||
generate_content_hash,
|
generate_content_hash,
|
||||||
generate_document_summary,
|
generate_document_summary,
|
||||||
|
generate_unique_identifier_hash,
|
||||||
)
|
)
|
||||||
|
|
||||||
from .base import (
|
from .base import (
|
||||||
calculate_date_range,
|
calculate_date_range,
|
||||||
check_duplicate_document_by_hash,
|
check_document_by_unique_identifier,
|
||||||
get_connector_by_id,
|
get_connector_by_id,
|
||||||
logger,
|
logger,
|
||||||
update_connector_last_indexed,
|
update_connector_last_indexed,
|
||||||
|
|
@ -213,27 +214,101 @@ async def index_linear_issues(
|
||||||
documents_skipped += 1
|
documents_skipped += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
content_hash = generate_content_hash(issue_content, search_space_id)
|
# Generate unique identifier hash for this Linear issue
|
||||||
|
unique_identifier_hash = generate_unique_identifier_hash(
|
||||||
# Check if document with this content hash already exists
|
DocumentType.LINEAR_CONNECTOR, issue_id, search_space_id
|
||||||
existing_document_by_hash = await check_duplicate_document_by_hash(
|
|
||||||
session, content_hash
|
|
||||||
)
|
)
|
||||||
|
|
||||||
if existing_document_by_hash:
|
# Generate content hash
|
||||||
logger.info(
|
content_hash = generate_content_hash(issue_content, search_space_id)
|
||||||
f"Document with content hash {content_hash} already exists for issue {issue_identifier}. Skipping processing."
|
|
||||||
)
|
|
||||||
documents_skipped += 1
|
|
||||||
continue
|
|
||||||
|
|
||||||
|
# Check if document with this unique identifier already exists
|
||||||
|
existing_document = await check_document_by_unique_identifier(
|
||||||
|
session, unique_identifier_hash
|
||||||
|
)
|
||||||
|
|
||||||
|
state = formatted_issue.get("state", "Unknown")
|
||||||
|
description = formatted_issue.get("description", "")
|
||||||
|
comment_count = len(formatted_issue.get("comments", []))
|
||||||
|
|
||||||
|
if existing_document:
|
||||||
|
# Document exists - check if content has changed
|
||||||
|
if existing_document.content_hash == content_hash:
|
||||||
|
logger.info(
|
||||||
|
f"Document for Linear issue {issue_identifier} unchanged. Skipping."
|
||||||
|
)
|
||||||
|
documents_skipped += 1
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
# Content has changed - update the existing document
|
||||||
|
logger.info(
|
||||||
|
f"Content changed for Linear issue {issue_identifier}. Updating document."
|
||||||
|
)
|
||||||
|
|
||||||
|
# Generate summary with metadata
|
||||||
|
user_llm = await get_user_long_context_llm(
|
||||||
|
session, user_id, search_space_id
|
||||||
|
)
|
||||||
|
|
||||||
|
if user_llm:
|
||||||
|
document_metadata = {
|
||||||
|
"issue_id": issue_identifier,
|
||||||
|
"issue_title": issue_title,
|
||||||
|
"state": state,
|
||||||
|
"priority": formatted_issue.get("priority", "Unknown"),
|
||||||
|
"comment_count": comment_count,
|
||||||
|
"document_type": "Linear Issue",
|
||||||
|
"connector_type": "Linear",
|
||||||
|
}
|
||||||
|
(
|
||||||
|
summary_content,
|
||||||
|
summary_embedding,
|
||||||
|
) = await generate_document_summary(
|
||||||
|
issue_content, user_llm, document_metadata
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# Fallback to simple summary if no LLM configured
|
||||||
|
if description and len(description) > 1000:
|
||||||
|
description = description[:997] + "..."
|
||||||
|
summary_content = f"Linear Issue {issue_identifier}: {issue_title}\n\nStatus: {state}\n\n"
|
||||||
|
if description:
|
||||||
|
summary_content += f"Description: {description}\n\n"
|
||||||
|
summary_content += f"Comments: {comment_count}"
|
||||||
|
summary_embedding = config.embedding_model_instance.embed(
|
||||||
|
summary_content
|
||||||
|
)
|
||||||
|
|
||||||
|
# Process chunks
|
||||||
|
chunks = await create_document_chunks(issue_content)
|
||||||
|
|
||||||
|
# Update existing document
|
||||||
|
existing_document.title = (
|
||||||
|
f"Linear - {issue_identifier}: {issue_title}"
|
||||||
|
)
|
||||||
|
existing_document.content = summary_content
|
||||||
|
existing_document.content_hash = content_hash
|
||||||
|
existing_document.embedding = summary_embedding
|
||||||
|
existing_document.document_metadata = {
|
||||||
|
"issue_id": issue_id,
|
||||||
|
"issue_identifier": issue_identifier,
|
||||||
|
"issue_title": issue_title,
|
||||||
|
"state": state,
|
||||||
|
"comment_count": comment_count,
|
||||||
|
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
||||||
|
}
|
||||||
|
existing_document.chunks = chunks
|
||||||
|
|
||||||
|
documents_indexed += 1
|
||||||
|
logger.info(
|
||||||
|
f"Successfully updated Linear issue {issue_identifier}"
|
||||||
|
)
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Document doesn't exist - create new one
|
||||||
# Generate summary with metadata
|
# Generate summary with metadata
|
||||||
user_llm = await get_user_long_context_llm(
|
user_llm = await get_user_long_context_llm(
|
||||||
session, user_id, search_space_id
|
session, user_id, search_space_id
|
||||||
)
|
)
|
||||||
state = formatted_issue.get("state", "Unknown")
|
|
||||||
description = formatted_issue.get("description", "")
|
|
||||||
comment_count = len(formatted_issue.get("comments", []))
|
|
||||||
|
|
||||||
if user_llm:
|
if user_llm:
|
||||||
document_metadata = {
|
document_metadata = {
|
||||||
|
|
@ -285,6 +360,7 @@ async def index_linear_issues(
|
||||||
},
|
},
|
||||||
content=summary_content,
|
content=summary_content,
|
||||||
content_hash=content_hash,
|
content_hash=content_hash,
|
||||||
|
unique_identifier_hash=unique_identifier_hash,
|
||||||
embedding=summary_embedding,
|
embedding=summary_embedding,
|
||||||
chunks=chunks,
|
chunks=chunks,
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -16,9 +16,11 @@ from app.utils.document_converters import (
|
||||||
create_document_chunks,
|
create_document_chunks,
|
||||||
generate_content_hash,
|
generate_content_hash,
|
||||||
generate_document_summary,
|
generate_document_summary,
|
||||||
|
generate_unique_identifier_hash,
|
||||||
)
|
)
|
||||||
|
|
||||||
from .base import (
|
from .base import (
|
||||||
|
check_document_by_unique_identifier,
|
||||||
get_connector_by_id,
|
get_connector_by_id,
|
||||||
logger,
|
logger,
|
||||||
update_connector_last_indexed,
|
update_connector_last_indexed,
|
||||||
|
|
@ -254,21 +256,108 @@ async def index_luma_events(
|
||||||
description = event_data.get("description", "")
|
description = event_data.get("description", "")
|
||||||
cover_url = event_data.get("cover_url", "")
|
cover_url = event_data.get("cover_url", "")
|
||||||
|
|
||||||
|
# Generate unique identifier hash for this Luma event
|
||||||
|
unique_identifier_hash = generate_unique_identifier_hash(
|
||||||
|
DocumentType.LUMA_CONNECTOR, event_id, search_space_id
|
||||||
|
)
|
||||||
|
|
||||||
|
# Generate content hash
|
||||||
content_hash = generate_content_hash(event_markdown, search_space_id)
|
content_hash = generate_content_hash(event_markdown, search_space_id)
|
||||||
|
|
||||||
# Duplicate check via simple query using helper in base
|
# Check if document with this unique identifier already exists
|
||||||
from .base import check_duplicate_document_by_hash
|
existing_document = await check_document_by_unique_identifier(
|
||||||
|
session, unique_identifier_hash
|
||||||
existing_document_by_hash = await check_duplicate_document_by_hash(
|
|
||||||
session, content_hash
|
|
||||||
)
|
)
|
||||||
if existing_document_by_hash:
|
|
||||||
logger.info(
|
|
||||||
f"Document with content hash {content_hash} already exists for event {event_name}. Skipping processing."
|
|
||||||
)
|
|
||||||
documents_skipped += 1
|
|
||||||
continue
|
|
||||||
|
|
||||||
|
if existing_document:
|
||||||
|
# Document exists - check if content has changed
|
||||||
|
if existing_document.content_hash == content_hash:
|
||||||
|
logger.info(
|
||||||
|
f"Document for Luma event {event_name} unchanged. Skipping."
|
||||||
|
)
|
||||||
|
documents_skipped += 1
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
# Content has changed - update the existing document
|
||||||
|
logger.info(
|
||||||
|
f"Content changed for Luma event {event_name}. Updating document."
|
||||||
|
)
|
||||||
|
|
||||||
|
# Generate summary with metadata
|
||||||
|
user_llm = await get_user_long_context_llm(
|
||||||
|
session, user_id, search_space_id
|
||||||
|
)
|
||||||
|
|
||||||
|
if user_llm:
|
||||||
|
document_metadata = {
|
||||||
|
"event_id": event_id,
|
||||||
|
"event_name": event_name,
|
||||||
|
"event_url": event_url,
|
||||||
|
"start_at": start_at,
|
||||||
|
"end_at": end_at,
|
||||||
|
"timezone": timezone,
|
||||||
|
"location": location or "No location",
|
||||||
|
"city": city,
|
||||||
|
"hosts": host_names,
|
||||||
|
"document_type": "Luma Event",
|
||||||
|
"connector_type": "Luma",
|
||||||
|
}
|
||||||
|
(
|
||||||
|
summary_content,
|
||||||
|
summary_embedding,
|
||||||
|
) = await generate_document_summary(
|
||||||
|
event_markdown, user_llm, document_metadata
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
summary_content = f"Luma Event: {event_name}\n\n"
|
||||||
|
if event_url:
|
||||||
|
summary_content += f"URL: {event_url}\n"
|
||||||
|
summary_content += f"Start: {start_at}\n"
|
||||||
|
summary_content += f"End: {end_at}\n"
|
||||||
|
if timezone:
|
||||||
|
summary_content += f"Timezone: {timezone}\n"
|
||||||
|
if location:
|
||||||
|
summary_content += f"Location: {location}\n"
|
||||||
|
if city:
|
||||||
|
summary_content += f"City: {city}\n"
|
||||||
|
if host_names:
|
||||||
|
summary_content += f"Hosts: {host_names}\n"
|
||||||
|
if description:
|
||||||
|
desc_preview = description[:1000]
|
||||||
|
if len(description) > 1000:
|
||||||
|
desc_preview += "..."
|
||||||
|
summary_content += f"Description: {desc_preview}\n"
|
||||||
|
summary_embedding = config.embedding_model_instance.embed(
|
||||||
|
summary_content
|
||||||
|
)
|
||||||
|
|
||||||
|
# Process chunks
|
||||||
|
chunks = await create_document_chunks(event_markdown)
|
||||||
|
|
||||||
|
# Update existing document
|
||||||
|
existing_document.title = f"Luma Event - {event_name}"
|
||||||
|
existing_document.content = summary_content
|
||||||
|
existing_document.content_hash = content_hash
|
||||||
|
existing_document.embedding = summary_embedding
|
||||||
|
existing_document.document_metadata = {
|
||||||
|
"event_id": event_id,
|
||||||
|
"event_name": event_name,
|
||||||
|
"event_url": event_url,
|
||||||
|
"start_at": start_at,
|
||||||
|
"end_at": end_at,
|
||||||
|
"timezone": timezone,
|
||||||
|
"location": location,
|
||||||
|
"city": city,
|
||||||
|
"hosts": host_names,
|
||||||
|
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
||||||
|
}
|
||||||
|
existing_document.chunks = chunks
|
||||||
|
|
||||||
|
documents_indexed += 1
|
||||||
|
logger.info(f"Successfully updated Luma event {event_name}")
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Document doesn't exist - create new one
|
||||||
# Generate summary with metadata
|
# Generate summary with metadata
|
||||||
user_llm = await get_user_long_context_llm(
|
user_llm = await get_user_long_context_llm(
|
||||||
session, user_id, search_space_id
|
session, user_id, search_space_id
|
||||||
|
|
@ -340,6 +429,7 @@ async def index_luma_events(
|
||||||
},
|
},
|
||||||
content=summary_content,
|
content=summary_content,
|
||||||
content_hash=content_hash,
|
content_hash=content_hash,
|
||||||
|
unique_identifier_hash=unique_identifier_hash,
|
||||||
embedding=summary_embedding,
|
embedding=summary_embedding,
|
||||||
chunks=chunks,
|
chunks=chunks,
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -15,11 +15,12 @@ from app.utils.document_converters import (
|
||||||
create_document_chunks,
|
create_document_chunks,
|
||||||
generate_content_hash,
|
generate_content_hash,
|
||||||
generate_document_summary,
|
generate_document_summary,
|
||||||
|
generate_unique_identifier_hash,
|
||||||
)
|
)
|
||||||
|
|
||||||
from .base import (
|
from .base import (
|
||||||
build_document_metadata_string,
|
build_document_metadata_string,
|
||||||
check_duplicate_document_by_hash,
|
check_document_by_unique_identifier,
|
||||||
get_connector_by_id,
|
get_connector_by_id,
|
||||||
logger,
|
logger,
|
||||||
update_connector_last_indexed,
|
update_connector_last_indexed,
|
||||||
|
|
@ -282,22 +283,82 @@ async def index_notion_pages(
|
||||||
combined_document_string = build_document_metadata_string(
|
combined_document_string = build_document_metadata_string(
|
||||||
metadata_sections
|
metadata_sections
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Generate unique identifier hash for this Notion page
|
||||||
|
unique_identifier_hash = generate_unique_identifier_hash(
|
||||||
|
DocumentType.NOTION_CONNECTOR, page_id, search_space_id
|
||||||
|
)
|
||||||
|
|
||||||
|
# Generate content hash
|
||||||
content_hash = generate_content_hash(
|
content_hash = generate_content_hash(
|
||||||
combined_document_string, search_space_id
|
combined_document_string, search_space_id
|
||||||
)
|
)
|
||||||
|
|
||||||
# Check if document with this content hash already exists
|
# Check if document with this unique identifier already exists
|
||||||
existing_document_by_hash = await check_duplicate_document_by_hash(
|
existing_document = await check_document_by_unique_identifier(
|
||||||
session, content_hash
|
session, unique_identifier_hash
|
||||||
)
|
)
|
||||||
|
|
||||||
if existing_document_by_hash:
|
if existing_document:
|
||||||
logger.info(
|
# Document exists - check if content has changed
|
||||||
f"Document with content hash {content_hash} already exists for page {page_title}. Skipping processing."
|
if existing_document.content_hash == content_hash:
|
||||||
)
|
logger.info(
|
||||||
documents_skipped += 1
|
f"Document for Notion page {page_title} unchanged. Skipping."
|
||||||
continue
|
)
|
||||||
|
documents_skipped += 1
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
# Content has changed - update the existing document
|
||||||
|
logger.info(
|
||||||
|
f"Content changed for Notion page {page_title}. Updating document."
|
||||||
|
)
|
||||||
|
|
||||||
|
# Get user's long context LLM
|
||||||
|
user_llm = await get_user_long_context_llm(
|
||||||
|
session, user_id, search_space_id
|
||||||
|
)
|
||||||
|
if not user_llm:
|
||||||
|
logger.error(
|
||||||
|
f"No long context LLM configured for user {user_id}"
|
||||||
|
)
|
||||||
|
skipped_pages.append(f"{page_title} (no LLM configured)")
|
||||||
|
documents_skipped += 1
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Generate summary with metadata
|
||||||
|
document_metadata = {
|
||||||
|
"page_title": page_title,
|
||||||
|
"page_id": page_id,
|
||||||
|
"document_type": "Notion Page",
|
||||||
|
"connector_type": "Notion",
|
||||||
|
}
|
||||||
|
(
|
||||||
|
summary_content,
|
||||||
|
summary_embedding,
|
||||||
|
) = await generate_document_summary(
|
||||||
|
markdown_content, user_llm, document_metadata
|
||||||
|
)
|
||||||
|
|
||||||
|
# Process chunks
|
||||||
|
chunks = await create_document_chunks(markdown_content)
|
||||||
|
|
||||||
|
# Update existing document
|
||||||
|
existing_document.title = f"Notion - {page_title}"
|
||||||
|
existing_document.content = summary_content
|
||||||
|
existing_document.content_hash = content_hash
|
||||||
|
existing_document.embedding = summary_embedding
|
||||||
|
existing_document.document_metadata = {
|
||||||
|
"page_title": page_title,
|
||||||
|
"page_id": page_id,
|
||||||
|
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
||||||
|
}
|
||||||
|
existing_document.chunks = chunks
|
||||||
|
|
||||||
|
documents_indexed += 1
|
||||||
|
logger.info(f"Successfully updated Notion page: {page_title}")
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Document doesn't exist - create new one
|
||||||
# Get user's long context LLM
|
# Get user's long context LLM
|
||||||
user_llm = await get_user_long_context_llm(
|
user_llm = await get_user_long_context_llm(
|
||||||
session, user_id, search_space_id
|
session, user_id, search_space_id
|
||||||
|
|
@ -336,6 +397,7 @@ async def index_notion_pages(
|
||||||
},
|
},
|
||||||
content=summary_content,
|
content=summary_content,
|
||||||
content_hash=content_hash,
|
content_hash=content_hash,
|
||||||
|
unique_identifier_hash=unique_identifier_hash,
|
||||||
embedding=summary_embedding,
|
embedding=summary_embedding,
|
||||||
chunks=chunks,
|
chunks=chunks,
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -15,12 +15,13 @@ from app.services.task_logging_service import TaskLoggingService
|
||||||
from app.utils.document_converters import (
|
from app.utils.document_converters import (
|
||||||
create_document_chunks,
|
create_document_chunks,
|
||||||
generate_content_hash,
|
generate_content_hash,
|
||||||
|
generate_unique_identifier_hash,
|
||||||
)
|
)
|
||||||
|
|
||||||
from .base import (
|
from .base import (
|
||||||
build_document_metadata_markdown,
|
build_document_metadata_markdown,
|
||||||
calculate_date_range,
|
calculate_date_range,
|
||||||
check_duplicate_document_by_hash,
|
check_document_by_unique_identifier,
|
||||||
get_connector_by_id,
|
get_connector_by_id,
|
||||||
logger,
|
logger,
|
||||||
update_connector_last_indexed,
|
update_connector_last_indexed,
|
||||||
|
|
@ -235,6 +236,7 @@ async def index_slack_messages(
|
||||||
|
|
||||||
for msg in formatted_messages:
|
for msg in formatted_messages:
|
||||||
timestamp = msg.get("datetime", "Unknown Time")
|
timestamp = msg.get("datetime", "Unknown Time")
|
||||||
|
msg_ts = msg.get("ts", timestamp) # Get original Slack timestamp
|
||||||
msg_user_name = msg.get("user_name", "Unknown User")
|
msg_user_name = msg.get("user_name", "Unknown User")
|
||||||
msg_user_email = msg.get("user_email", "Unknown Email")
|
msg_user_email = msg.get("user_email", "Unknown Email")
|
||||||
msg_text = msg.get("text", "")
|
msg_text = msg.get("text", "")
|
||||||
|
|
@ -261,22 +263,68 @@ async def index_slack_messages(
|
||||||
combined_document_string = build_document_metadata_markdown(
|
combined_document_string = build_document_metadata_markdown(
|
||||||
metadata_sections
|
metadata_sections
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Generate unique identifier hash for this Slack message
|
||||||
|
unique_identifier = f"{channel_id}_{msg_ts}"
|
||||||
|
unique_identifier_hash = generate_unique_identifier_hash(
|
||||||
|
DocumentType.SLACK_CONNECTOR, unique_identifier, search_space_id
|
||||||
|
)
|
||||||
|
|
||||||
|
# Generate content hash
|
||||||
content_hash = generate_content_hash(
|
content_hash = generate_content_hash(
|
||||||
combined_document_string, search_space_id
|
combined_document_string, search_space_id
|
||||||
)
|
)
|
||||||
|
|
||||||
# Check if document with this content hash already exists
|
# Check if document with this unique identifier already exists
|
||||||
existing_document_by_hash = await check_duplicate_document_by_hash(
|
existing_document = await check_document_by_unique_identifier(
|
||||||
session, content_hash
|
session, unique_identifier_hash
|
||||||
)
|
)
|
||||||
|
|
||||||
if existing_document_by_hash:
|
if existing_document:
|
||||||
logger.info(
|
# Document exists - check if content has changed
|
||||||
f"Document with content hash {content_hash} already exists for channel {channel_name}. Skipping processing."
|
if existing_document.content_hash == content_hash:
|
||||||
)
|
logger.info(
|
||||||
documents_skipped += 1
|
f"Document for Slack message {msg_ts} in channel {channel_name} unchanged. Skipping."
|
||||||
continue
|
)
|
||||||
|
documents_skipped += 1
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
# Content has changed - update the existing document
|
||||||
|
logger.info(
|
||||||
|
f"Content changed for Slack message {msg_ts} in channel {channel_name}. Updating document."
|
||||||
|
)
|
||||||
|
|
||||||
|
# Update chunks and embedding
|
||||||
|
chunks = await create_document_chunks(
|
||||||
|
combined_document_string
|
||||||
|
)
|
||||||
|
doc_embedding = config.embedding_model_instance.embed(
|
||||||
|
combined_document_string
|
||||||
|
)
|
||||||
|
|
||||||
|
# Update existing document
|
||||||
|
existing_document.content = combined_document_string
|
||||||
|
existing_document.content_hash = content_hash
|
||||||
|
existing_document.embedding = doc_embedding
|
||||||
|
existing_document.document_metadata = {
|
||||||
|
"channel_name": channel_name,
|
||||||
|
"channel_id": channel_id,
|
||||||
|
"start_date": start_date_str,
|
||||||
|
"end_date": end_date_str,
|
||||||
|
"message_count": len(formatted_messages),
|
||||||
|
"indexed_at": datetime.now().strftime(
|
||||||
|
"%Y-%m-%d %H:%M:%S"
|
||||||
|
),
|
||||||
|
}
|
||||||
|
|
||||||
|
# Delete old chunks and add new ones
|
||||||
|
existing_document.chunks = chunks
|
||||||
|
|
||||||
|
documents_indexed += 1
|
||||||
|
logger.info(f"Successfully updated Slack message {msg_ts}")
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Document doesn't exist - create new one
|
||||||
# Process chunks
|
# Process chunks
|
||||||
chunks = await create_document_chunks(combined_document_string)
|
chunks = await create_document_chunks(combined_document_string)
|
||||||
doc_embedding = config.embedding_model_instance.embed(
|
doc_embedding = config.embedding_model_instance.embed(
|
||||||
|
|
@ -300,6 +348,7 @@ async def index_slack_messages(
|
||||||
embedding=doc_embedding,
|
embedding=doc_embedding,
|
||||||
chunks=chunks,
|
chunks=chunks,
|
||||||
content_hash=content_hash,
|
content_hash=content_hash,
|
||||||
|
unique_identifier_hash=unique_identifier_hash,
|
||||||
)
|
)
|
||||||
|
|
||||||
session.add(document)
|
session.add(document)
|
||||||
|
|
|
||||||
|
|
@ -29,3 +29,27 @@ async def check_duplicate_document(
|
||||||
select(Document).where(Document.content_hash == content_hash)
|
select(Document).where(Document.content_hash == content_hash)
|
||||||
)
|
)
|
||||||
return existing_doc_result.scalars().first()
|
return existing_doc_result.scalars().first()
|
||||||
|
|
||||||
|
|
||||||
|
async def check_document_by_unique_identifier(
|
||||||
|
session: AsyncSession, unique_identifier_hash: str
|
||||||
|
) -> Document | None:
|
||||||
|
"""
|
||||||
|
Check if a document with the given unique identifier hash already exists.
|
||||||
|
Eagerly loads chunks to avoid lazy loading issues during updates.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
session: Database session
|
||||||
|
unique_identifier_hash: Hash of the unique identifier from the source
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Existing document if found, None otherwise
|
||||||
|
"""
|
||||||
|
from sqlalchemy.orm import selectinload
|
||||||
|
|
||||||
|
existing_doc_result = await session.execute(
|
||||||
|
select(Document)
|
||||||
|
.options(selectinload(Document.chunks))
|
||||||
|
.where(Document.unique_identifier_hash == unique_identifier_hash)
|
||||||
|
)
|
||||||
|
return existing_doc_result.scalars().first()
|
||||||
|
|
|
||||||
|
|
@ -15,10 +15,11 @@ from app.utils.document_converters import (
|
||||||
create_document_chunks,
|
create_document_chunks,
|
||||||
generate_content_hash,
|
generate_content_hash,
|
||||||
generate_document_summary,
|
generate_document_summary,
|
||||||
|
generate_unique_identifier_hash,
|
||||||
)
|
)
|
||||||
|
|
||||||
from .base import (
|
from .base import (
|
||||||
check_duplicate_document,
|
check_document_by_unique_identifier,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -85,25 +86,42 @@ async def add_extension_received_document(
|
||||||
|
|
||||||
document_parts.append("</DOCUMENT>")
|
document_parts.append("</DOCUMENT>")
|
||||||
combined_document_string = "\n".join(document_parts)
|
combined_document_string = "\n".join(document_parts)
|
||||||
|
|
||||||
|
# Generate unique identifier hash for this extension document (using URL)
|
||||||
|
unique_identifier_hash = generate_unique_identifier_hash(
|
||||||
|
DocumentType.EXTENSION, content.metadata.VisitedWebPageURL, search_space_id
|
||||||
|
)
|
||||||
|
|
||||||
|
# Generate content hash
|
||||||
content_hash = generate_content_hash(combined_document_string, search_space_id)
|
content_hash = generate_content_hash(combined_document_string, search_space_id)
|
||||||
|
|
||||||
# Check if document with this content hash already exists
|
# Check if document with this unique identifier already exists
|
||||||
existing_document = await check_duplicate_document(session, content_hash)
|
existing_document = await check_document_by_unique_identifier(
|
||||||
if existing_document:
|
session, unique_identifier_hash
|
||||||
await task_logger.log_task_success(
|
)
|
||||||
log_entry,
|
|
||||||
f"Extension document already exists: {content.metadata.VisitedWebPageTitle}",
|
|
||||||
{
|
|
||||||
"duplicate_detected": True,
|
|
||||||
"existing_document_id": existing_document.id,
|
|
||||||
},
|
|
||||||
)
|
|
||||||
logging.info(
|
|
||||||
f"Document with content hash {content_hash} already exists. Skipping processing."
|
|
||||||
)
|
|
||||||
return existing_document
|
|
||||||
|
|
||||||
# Get user's long context LLM
|
if existing_document:
|
||||||
|
# Document exists - check if content has changed
|
||||||
|
if existing_document.content_hash == content_hash:
|
||||||
|
await task_logger.log_task_success(
|
||||||
|
log_entry,
|
||||||
|
f"Extension document unchanged: {content.metadata.VisitedWebPageTitle}",
|
||||||
|
{
|
||||||
|
"duplicate_detected": True,
|
||||||
|
"existing_document_id": existing_document.id,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
logging.info(
|
||||||
|
f"Document for URL {content.metadata.VisitedWebPageURL} unchanged. Skipping."
|
||||||
|
)
|
||||||
|
return existing_document
|
||||||
|
else:
|
||||||
|
# Content has changed - update the existing document
|
||||||
|
logging.info(
|
||||||
|
f"Content changed for URL {content.metadata.VisitedWebPageURL}. Updating document."
|
||||||
|
)
|
||||||
|
|
||||||
|
# Get user's long context LLM (needed for both create and update)
|
||||||
user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
|
user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
|
||||||
if not user_llm:
|
if not user_llm:
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
|
|
@ -127,21 +145,36 @@ async def add_extension_received_document(
|
||||||
# Process chunks
|
# Process chunks
|
||||||
chunks = await create_document_chunks(content.pageContent)
|
chunks = await create_document_chunks(content.pageContent)
|
||||||
|
|
||||||
# Create and store document
|
# Update or create document
|
||||||
document = Document(
|
if existing_document:
|
||||||
search_space_id=search_space_id,
|
# Update existing document
|
||||||
title=content.metadata.VisitedWebPageTitle,
|
existing_document.title = content.metadata.VisitedWebPageTitle
|
||||||
document_type=DocumentType.EXTENSION,
|
existing_document.content = summary_content
|
||||||
document_metadata=content.metadata.model_dump(),
|
existing_document.content_hash = content_hash
|
||||||
content=summary_content,
|
existing_document.embedding = summary_embedding
|
||||||
embedding=summary_embedding,
|
existing_document.document_metadata = content.metadata.model_dump()
|
||||||
chunks=chunks,
|
existing_document.chunks = chunks
|
||||||
content_hash=content_hash,
|
|
||||||
)
|
|
||||||
|
|
||||||
session.add(document)
|
await session.commit()
|
||||||
await session.commit()
|
await session.refresh(existing_document)
|
||||||
await session.refresh(document)
|
document = existing_document
|
||||||
|
else:
|
||||||
|
# Create new document
|
||||||
|
document = Document(
|
||||||
|
search_space_id=search_space_id,
|
||||||
|
title=content.metadata.VisitedWebPageTitle,
|
||||||
|
document_type=DocumentType.EXTENSION,
|
||||||
|
document_metadata=content.metadata.model_dump(),
|
||||||
|
content=summary_content,
|
||||||
|
embedding=summary_embedding,
|
||||||
|
chunks=chunks,
|
||||||
|
content_hash=content_hash,
|
||||||
|
unique_identifier_hash=unique_identifier_hash,
|
||||||
|
)
|
||||||
|
|
||||||
|
session.add(document)
|
||||||
|
await session.commit()
|
||||||
|
await session.refresh(document)
|
||||||
|
|
||||||
# Log success
|
# Log success
|
||||||
await task_logger.log_task_success(
|
await task_logger.log_task_success(
|
||||||
|
|
|
||||||
|
|
@ -15,10 +15,11 @@ from app.utils.document_converters import (
|
||||||
create_document_chunks,
|
create_document_chunks,
|
||||||
generate_content_hash,
|
generate_content_hash,
|
||||||
generate_document_summary,
|
generate_document_summary,
|
||||||
|
generate_unique_identifier_hash,
|
||||||
)
|
)
|
||||||
|
|
||||||
from .base import (
|
from .base import (
|
||||||
check_duplicate_document,
|
check_document_by_unique_identifier,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -47,19 +48,31 @@ async def add_received_file_document_using_unstructured(
|
||||||
unstructured_processed_elements
|
unstructured_processed_elements
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Generate unique identifier hash for this file
|
||||||
|
unique_identifier_hash = generate_unique_identifier_hash(
|
||||||
|
DocumentType.FILE, file_name, search_space_id
|
||||||
|
)
|
||||||
|
|
||||||
|
# Generate content hash
|
||||||
content_hash = generate_content_hash(file_in_markdown, search_space_id)
|
content_hash = generate_content_hash(file_in_markdown, search_space_id)
|
||||||
|
|
||||||
# Check if document with this content hash already exists
|
# Check if document with this unique identifier already exists
|
||||||
existing_document = await check_duplicate_document(session, content_hash)
|
existing_document = await check_document_by_unique_identifier(
|
||||||
|
session, unique_identifier_hash
|
||||||
|
)
|
||||||
|
|
||||||
if existing_document:
|
if existing_document:
|
||||||
logging.info(
|
# Document exists - check if content has changed
|
||||||
f"Document with content hash {content_hash} already exists. Skipping processing."
|
if existing_document.content_hash == content_hash:
|
||||||
)
|
logging.info(f"Document for file {file_name} unchanged. Skipping.")
|
||||||
return existing_document
|
return existing_document
|
||||||
|
else:
|
||||||
|
# Content has changed - update the existing document
|
||||||
|
logging.info(
|
||||||
|
f"Content changed for file {file_name}. Updating document."
|
||||||
|
)
|
||||||
|
|
||||||
# TODO: Check if file_markdown exceeds token limit of embedding model
|
# Get user's long context LLM (needed for both create and update)
|
||||||
|
|
||||||
# Get user's long context LLM
|
|
||||||
user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
|
user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
|
||||||
if not user_llm:
|
if not user_llm:
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
|
|
@ -79,24 +92,42 @@ async def add_received_file_document_using_unstructured(
|
||||||
# Process chunks
|
# Process chunks
|
||||||
chunks = await create_document_chunks(file_in_markdown)
|
chunks = await create_document_chunks(file_in_markdown)
|
||||||
|
|
||||||
# Create and store document
|
# Update or create document
|
||||||
document = Document(
|
if existing_document:
|
||||||
search_space_id=search_space_id,
|
# Update existing document
|
||||||
title=file_name,
|
existing_document.title = file_name
|
||||||
document_type=DocumentType.FILE,
|
existing_document.content = summary_content
|
||||||
document_metadata={
|
existing_document.content_hash = content_hash
|
||||||
|
existing_document.embedding = summary_embedding
|
||||||
|
existing_document.document_metadata = {
|
||||||
"FILE_NAME": file_name,
|
"FILE_NAME": file_name,
|
||||||
"ETL_SERVICE": "UNSTRUCTURED",
|
"ETL_SERVICE": "UNSTRUCTURED",
|
||||||
},
|
}
|
||||||
content=summary_content,
|
existing_document.chunks = chunks
|
||||||
embedding=summary_embedding,
|
|
||||||
chunks=chunks,
|
|
||||||
content_hash=content_hash,
|
|
||||||
)
|
|
||||||
|
|
||||||
session.add(document)
|
await session.commit()
|
||||||
await session.commit()
|
await session.refresh(existing_document)
|
||||||
await session.refresh(document)
|
document = existing_document
|
||||||
|
else:
|
||||||
|
# Create new document
|
||||||
|
document = Document(
|
||||||
|
search_space_id=search_space_id,
|
||||||
|
title=file_name,
|
||||||
|
document_type=DocumentType.FILE,
|
||||||
|
document_metadata={
|
||||||
|
"FILE_NAME": file_name,
|
||||||
|
"ETL_SERVICE": "UNSTRUCTURED",
|
||||||
|
},
|
||||||
|
content=summary_content,
|
||||||
|
embedding=summary_embedding,
|
||||||
|
chunks=chunks,
|
||||||
|
content_hash=content_hash,
|
||||||
|
unique_identifier_hash=unique_identifier_hash,
|
||||||
|
)
|
||||||
|
|
||||||
|
session.add(document)
|
||||||
|
await session.commit()
|
||||||
|
await session.refresh(document)
|
||||||
|
|
||||||
return document
|
return document
|
||||||
except SQLAlchemyError as db_error:
|
except SQLAlchemyError as db_error:
|
||||||
|
|
@ -131,17 +162,31 @@ async def add_received_file_document_using_llamacloud(
|
||||||
# Combine all markdown documents into one
|
# Combine all markdown documents into one
|
||||||
file_in_markdown = llamacloud_markdown_document
|
file_in_markdown = llamacloud_markdown_document
|
||||||
|
|
||||||
|
# Generate unique identifier hash for this file
|
||||||
|
unique_identifier_hash = generate_unique_identifier_hash(
|
||||||
|
DocumentType.FILE, file_name, search_space_id
|
||||||
|
)
|
||||||
|
|
||||||
|
# Generate content hash
|
||||||
content_hash = generate_content_hash(file_in_markdown, search_space_id)
|
content_hash = generate_content_hash(file_in_markdown, search_space_id)
|
||||||
|
|
||||||
# Check if document with this content hash already exists
|
# Check if document with this unique identifier already exists
|
||||||
existing_document = await check_duplicate_document(session, content_hash)
|
existing_document = await check_document_by_unique_identifier(
|
||||||
if existing_document:
|
session, unique_identifier_hash
|
||||||
logging.info(
|
)
|
||||||
f"Document with content hash {content_hash} already exists. Skipping processing."
|
|
||||||
)
|
|
||||||
return existing_document
|
|
||||||
|
|
||||||
# Get user's long context LLM
|
if existing_document:
|
||||||
|
# Document exists - check if content has changed
|
||||||
|
if existing_document.content_hash == content_hash:
|
||||||
|
logging.info(f"Document for file {file_name} unchanged. Skipping.")
|
||||||
|
return existing_document
|
||||||
|
else:
|
||||||
|
# Content has changed - update the existing document
|
||||||
|
logging.info(
|
||||||
|
f"Content changed for file {file_name}. Updating document."
|
||||||
|
)
|
||||||
|
|
||||||
|
# Get user's long context LLM (needed for both create and update)
|
||||||
user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
|
user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
|
||||||
if not user_llm:
|
if not user_llm:
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
|
|
@ -161,24 +206,42 @@ async def add_received_file_document_using_llamacloud(
|
||||||
# Process chunks
|
# Process chunks
|
||||||
chunks = await create_document_chunks(file_in_markdown)
|
chunks = await create_document_chunks(file_in_markdown)
|
||||||
|
|
||||||
# Create and store document
|
# Update or create document
|
||||||
document = Document(
|
if existing_document:
|
||||||
search_space_id=search_space_id,
|
# Update existing document
|
||||||
title=file_name,
|
existing_document.title = file_name
|
||||||
document_type=DocumentType.FILE,
|
existing_document.content = summary_content
|
||||||
document_metadata={
|
existing_document.content_hash = content_hash
|
||||||
|
existing_document.embedding = summary_embedding
|
||||||
|
existing_document.document_metadata = {
|
||||||
"FILE_NAME": file_name,
|
"FILE_NAME": file_name,
|
||||||
"ETL_SERVICE": "LLAMACLOUD",
|
"ETL_SERVICE": "LLAMACLOUD",
|
||||||
},
|
}
|
||||||
content=summary_content,
|
existing_document.chunks = chunks
|
||||||
embedding=summary_embedding,
|
|
||||||
chunks=chunks,
|
|
||||||
content_hash=content_hash,
|
|
||||||
)
|
|
||||||
|
|
||||||
session.add(document)
|
await session.commit()
|
||||||
await session.commit()
|
await session.refresh(existing_document)
|
||||||
await session.refresh(document)
|
document = existing_document
|
||||||
|
else:
|
||||||
|
# Create new document
|
||||||
|
document = Document(
|
||||||
|
search_space_id=search_space_id,
|
||||||
|
title=file_name,
|
||||||
|
document_type=DocumentType.FILE,
|
||||||
|
document_metadata={
|
||||||
|
"FILE_NAME": file_name,
|
||||||
|
"ETL_SERVICE": "LLAMACLOUD",
|
||||||
|
},
|
||||||
|
content=summary_content,
|
||||||
|
embedding=summary_embedding,
|
||||||
|
chunks=chunks,
|
||||||
|
content_hash=content_hash,
|
||||||
|
unique_identifier_hash=unique_identifier_hash,
|
||||||
|
)
|
||||||
|
|
||||||
|
session.add(document)
|
||||||
|
await session.commit()
|
||||||
|
await session.refresh(document)
|
||||||
|
|
||||||
return document
|
return document
|
||||||
except SQLAlchemyError as db_error:
|
except SQLAlchemyError as db_error:
|
||||||
|
|
@ -214,17 +277,31 @@ async def add_received_file_document_using_docling(
|
||||||
try:
|
try:
|
||||||
file_in_markdown = docling_markdown_document
|
file_in_markdown = docling_markdown_document
|
||||||
|
|
||||||
|
# Generate unique identifier hash for this file
|
||||||
|
unique_identifier_hash = generate_unique_identifier_hash(
|
||||||
|
DocumentType.FILE, file_name, search_space_id
|
||||||
|
)
|
||||||
|
|
||||||
|
# Generate content hash
|
||||||
content_hash = generate_content_hash(file_in_markdown, search_space_id)
|
content_hash = generate_content_hash(file_in_markdown, search_space_id)
|
||||||
|
|
||||||
# Check if document with this content hash already exists
|
# Check if document with this unique identifier already exists
|
||||||
existing_document = await check_duplicate_document(session, content_hash)
|
existing_document = await check_document_by_unique_identifier(
|
||||||
if existing_document:
|
session, unique_identifier_hash
|
||||||
logging.info(
|
)
|
||||||
f"Document with content hash {content_hash} already exists. Skipping processing."
|
|
||||||
)
|
|
||||||
return existing_document
|
|
||||||
|
|
||||||
# Get user's long context LLM
|
if existing_document:
|
||||||
|
# Document exists - check if content has changed
|
||||||
|
if existing_document.content_hash == content_hash:
|
||||||
|
logging.info(f"Document for file {file_name} unchanged. Skipping.")
|
||||||
|
return existing_document
|
||||||
|
else:
|
||||||
|
# Content has changed - update the existing document
|
||||||
|
logging.info(
|
||||||
|
f"Content changed for file {file_name}. Updating document."
|
||||||
|
)
|
||||||
|
|
||||||
|
# Get user's long context LLM (needed for both create and update)
|
||||||
user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
|
user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
|
||||||
if not user_llm:
|
if not user_llm:
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
|
|
@ -268,20 +345,38 @@ async def add_received_file_document_using_docling(
|
||||||
# Process chunks
|
# Process chunks
|
||||||
chunks = await create_document_chunks(file_in_markdown)
|
chunks = await create_document_chunks(file_in_markdown)
|
||||||
|
|
||||||
# Create and store document
|
# Update or create document
|
||||||
document = Document(
|
if existing_document:
|
||||||
search_space_id=search_space_id,
|
# Update existing document
|
||||||
title=file_name,
|
existing_document.title = file_name
|
||||||
document_type=DocumentType.FILE,
|
existing_document.content = enhanced_summary_content
|
||||||
document_metadata={
|
existing_document.content_hash = content_hash
|
||||||
|
existing_document.embedding = summary_embedding
|
||||||
|
existing_document.document_metadata = {
|
||||||
"FILE_NAME": file_name,
|
"FILE_NAME": file_name,
|
||||||
"ETL_SERVICE": "DOCLING",
|
"ETL_SERVICE": "DOCLING",
|
||||||
},
|
}
|
||||||
content=enhanced_summary_content,
|
existing_document.chunks = chunks
|
||||||
embedding=summary_embedding,
|
|
||||||
chunks=chunks,
|
await session.commit()
|
||||||
content_hash=content_hash,
|
await session.refresh(existing_document)
|
||||||
)
|
document = existing_document
|
||||||
|
else:
|
||||||
|
# Create new document
|
||||||
|
document = Document(
|
||||||
|
search_space_id=search_space_id,
|
||||||
|
title=file_name,
|
||||||
|
document_type=DocumentType.FILE,
|
||||||
|
document_metadata={
|
||||||
|
"FILE_NAME": file_name,
|
||||||
|
"ETL_SERVICE": "DOCLING",
|
||||||
|
},
|
||||||
|
content=enhanced_summary_content,
|
||||||
|
embedding=summary_embedding,
|
||||||
|
chunks=chunks,
|
||||||
|
content_hash=content_hash,
|
||||||
|
unique_identifier_hash=unique_identifier_hash,
|
||||||
|
)
|
||||||
|
|
||||||
session.add(document)
|
session.add(document)
|
||||||
await session.commit()
|
await session.commit()
|
||||||
|
|
|
||||||
|
|
@ -14,10 +14,11 @@ from app.utils.document_converters import (
|
||||||
create_document_chunks,
|
create_document_chunks,
|
||||||
generate_content_hash,
|
generate_content_hash,
|
||||||
generate_document_summary,
|
generate_document_summary,
|
||||||
|
generate_unique_identifier_hash,
|
||||||
)
|
)
|
||||||
|
|
||||||
from .base import (
|
from .base import (
|
||||||
check_duplicate_document,
|
check_document_by_unique_identifier,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -56,25 +57,41 @@ async def add_received_markdown_file_document(
|
||||||
)
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
# Generate unique identifier hash for this markdown file
|
||||||
|
unique_identifier_hash = generate_unique_identifier_hash(
|
||||||
|
DocumentType.FILE, file_name, search_space_id
|
||||||
|
)
|
||||||
|
|
||||||
|
# Generate content hash
|
||||||
content_hash = generate_content_hash(file_in_markdown, search_space_id)
|
content_hash = generate_content_hash(file_in_markdown, search_space_id)
|
||||||
|
|
||||||
# Check if document with this content hash already exists
|
# Check if document with this unique identifier already exists
|
||||||
existing_document = await check_duplicate_document(session, content_hash)
|
existing_document = await check_document_by_unique_identifier(
|
||||||
if existing_document:
|
session, unique_identifier_hash
|
||||||
await task_logger.log_task_success(
|
)
|
||||||
log_entry,
|
|
||||||
f"Markdown file document already exists: {file_name}",
|
|
||||||
{
|
|
||||||
"duplicate_detected": True,
|
|
||||||
"existing_document_id": existing_document.id,
|
|
||||||
},
|
|
||||||
)
|
|
||||||
logging.info(
|
|
||||||
f"Document with content hash {content_hash} already exists. Skipping processing."
|
|
||||||
)
|
|
||||||
return existing_document
|
|
||||||
|
|
||||||
# Get user's long context LLM
|
if existing_document:
|
||||||
|
# Document exists - check if content has changed
|
||||||
|
if existing_document.content_hash == content_hash:
|
||||||
|
await task_logger.log_task_success(
|
||||||
|
log_entry,
|
||||||
|
f"Markdown file document unchanged: {file_name}",
|
||||||
|
{
|
||||||
|
"duplicate_detected": True,
|
||||||
|
"existing_document_id": existing_document.id,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
logging.info(
|
||||||
|
f"Document for markdown file {file_name} unchanged. Skipping."
|
||||||
|
)
|
||||||
|
return existing_document
|
||||||
|
else:
|
||||||
|
# Content has changed - update the existing document
|
||||||
|
logging.info(
|
||||||
|
f"Content changed for markdown file {file_name}. Updating document."
|
||||||
|
)
|
||||||
|
|
||||||
|
# Get user's long context LLM (needed for both create and update)
|
||||||
user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
|
user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
|
||||||
if not user_llm:
|
if not user_llm:
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
|
|
@ -93,23 +110,40 @@ async def add_received_markdown_file_document(
|
||||||
# Process chunks
|
# Process chunks
|
||||||
chunks = await create_document_chunks(file_in_markdown)
|
chunks = await create_document_chunks(file_in_markdown)
|
||||||
|
|
||||||
# Create and store document
|
# Update or create document
|
||||||
document = Document(
|
if existing_document:
|
||||||
search_space_id=search_space_id,
|
# Update existing document
|
||||||
title=file_name,
|
existing_document.title = file_name
|
||||||
document_type=DocumentType.FILE,
|
existing_document.content = summary_content
|
||||||
document_metadata={
|
existing_document.content_hash = content_hash
|
||||||
|
existing_document.embedding = summary_embedding
|
||||||
|
existing_document.document_metadata = {
|
||||||
"FILE_NAME": file_name,
|
"FILE_NAME": file_name,
|
||||||
},
|
}
|
||||||
content=summary_content,
|
existing_document.chunks = chunks
|
||||||
embedding=summary_embedding,
|
|
||||||
chunks=chunks,
|
|
||||||
content_hash=content_hash,
|
|
||||||
)
|
|
||||||
|
|
||||||
session.add(document)
|
await session.commit()
|
||||||
await session.commit()
|
await session.refresh(existing_document)
|
||||||
await session.refresh(document)
|
document = existing_document
|
||||||
|
else:
|
||||||
|
# Create new document
|
||||||
|
document = Document(
|
||||||
|
search_space_id=search_space_id,
|
||||||
|
title=file_name,
|
||||||
|
document_type=DocumentType.FILE,
|
||||||
|
document_metadata={
|
||||||
|
"FILE_NAME": file_name,
|
||||||
|
},
|
||||||
|
content=summary_content,
|
||||||
|
embedding=summary_embedding,
|
||||||
|
chunks=chunks,
|
||||||
|
content_hash=content_hash,
|
||||||
|
unique_identifier_hash=unique_identifier_hash,
|
||||||
|
)
|
||||||
|
|
||||||
|
session.add(document)
|
||||||
|
await session.commit()
|
||||||
|
await session.refresh(document)
|
||||||
|
|
||||||
# Log success
|
# Log success
|
||||||
await task_logger.log_task_success(
|
await task_logger.log_task_success(
|
||||||
|
|
|
||||||
|
|
@ -17,10 +17,11 @@ from app.utils.document_converters import (
|
||||||
create_document_chunks,
|
create_document_chunks,
|
||||||
generate_content_hash,
|
generate_content_hash,
|
||||||
generate_document_summary,
|
generate_document_summary,
|
||||||
|
generate_unique_identifier_hash,
|
||||||
)
|
)
|
||||||
|
|
||||||
from .base import (
|
from .base import (
|
||||||
check_duplicate_document,
|
check_document_by_unique_identifier,
|
||||||
md,
|
md,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -129,31 +130,49 @@ async def add_crawled_url_document(
|
||||||
|
|
||||||
document_parts.append("</DOCUMENT>")
|
document_parts.append("</DOCUMENT>")
|
||||||
combined_document_string = "\n".join(document_parts)
|
combined_document_string = "\n".join(document_parts)
|
||||||
content_hash = generate_content_hash(combined_document_string, search_space_id)
|
|
||||||
|
|
||||||
# Check for duplicates
|
# Generate unique identifier hash for this URL
|
||||||
await task_logger.log_task_progress(
|
unique_identifier_hash = generate_unique_identifier_hash(
|
||||||
log_entry,
|
DocumentType.CRAWLED_URL, url, search_space_id
|
||||||
f"Checking for duplicate content: {url}",
|
|
||||||
{"stage": "duplicate_check", "content_hash": content_hash},
|
|
||||||
)
|
)
|
||||||
|
|
||||||
existing_document = await check_duplicate_document(session, content_hash)
|
# Generate content hash
|
||||||
if existing_document:
|
content_hash = generate_content_hash(combined_document_string, search_space_id)
|
||||||
await task_logger.log_task_success(
|
|
||||||
log_entry,
|
|
||||||
f"Document already exists for URL: {url}",
|
|
||||||
{
|
|
||||||
"duplicate_detected": True,
|
|
||||||
"existing_document_id": existing_document.id,
|
|
||||||
},
|
|
||||||
)
|
|
||||||
logging.info(
|
|
||||||
f"Document with content hash {content_hash} already exists. Skipping processing."
|
|
||||||
)
|
|
||||||
return existing_document
|
|
||||||
|
|
||||||
# Get LLM for summary generation
|
# Check if document with this unique identifier already exists
|
||||||
|
await task_logger.log_task_progress(
|
||||||
|
log_entry,
|
||||||
|
f"Checking for existing URL: {url}",
|
||||||
|
{"stage": "duplicate_check", "url": url},
|
||||||
|
)
|
||||||
|
|
||||||
|
existing_document = await check_document_by_unique_identifier(
|
||||||
|
session, unique_identifier_hash
|
||||||
|
)
|
||||||
|
|
||||||
|
if existing_document:
|
||||||
|
# Document exists - check if content has changed
|
||||||
|
if existing_document.content_hash == content_hash:
|
||||||
|
await task_logger.log_task_success(
|
||||||
|
log_entry,
|
||||||
|
f"URL document unchanged: {url}",
|
||||||
|
{
|
||||||
|
"duplicate_detected": True,
|
||||||
|
"existing_document_id": existing_document.id,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
logging.info(f"Document for URL {url} unchanged. Skipping.")
|
||||||
|
return existing_document
|
||||||
|
else:
|
||||||
|
# Content has changed - update the existing document
|
||||||
|
logging.info(f"Content changed for URL {url}. Updating document.")
|
||||||
|
await task_logger.log_task_progress(
|
||||||
|
log_entry,
|
||||||
|
f"Updating URL document: {url}",
|
||||||
|
{"stage": "document_update", "url": url},
|
||||||
|
)
|
||||||
|
|
||||||
|
# Get LLM for summary generation (needed for both create and update)
|
||||||
await task_logger.log_task_progress(
|
await task_logger.log_task_progress(
|
||||||
log_entry,
|
log_entry,
|
||||||
f"Preparing for summary generation: {url}",
|
f"Preparing for summary generation: {url}",
|
||||||
|
|
@ -194,27 +213,50 @@ async def add_crawled_url_document(
|
||||||
|
|
||||||
chunks = await create_document_chunks(content_in_markdown)
|
chunks = await create_document_chunks(content_in_markdown)
|
||||||
|
|
||||||
# Create and store document
|
# Update or create document
|
||||||
await task_logger.log_task_progress(
|
if existing_document:
|
||||||
log_entry,
|
# Update existing document
|
||||||
f"Creating document in database for URL: {url}",
|
await task_logger.log_task_progress(
|
||||||
{"stage": "document_creation", "chunks_count": len(chunks)},
|
log_entry,
|
||||||
)
|
f"Updating document in database for URL: {url}",
|
||||||
|
{"stage": "document_update", "chunks_count": len(chunks)},
|
||||||
|
)
|
||||||
|
|
||||||
document = Document(
|
existing_document.title = (
|
||||||
search_space_id=search_space_id,
|
url_crawled[0].metadata["title"]
|
||||||
title=url_crawled[0].metadata["title"]
|
if isinstance(crawl_loader, FireCrawlLoader)
|
||||||
if isinstance(crawl_loader, FireCrawlLoader)
|
else url_crawled[0].metadata["source"]
|
||||||
else url_crawled[0].metadata["source"],
|
)
|
||||||
document_type=DocumentType.CRAWLED_URL,
|
existing_document.content = summary_content
|
||||||
document_metadata=url_crawled[0].metadata,
|
existing_document.content_hash = content_hash
|
||||||
content=summary_content,
|
existing_document.embedding = summary_embedding
|
||||||
embedding=summary_embedding,
|
existing_document.document_metadata = url_crawled[0].metadata
|
||||||
chunks=chunks,
|
existing_document.chunks = chunks
|
||||||
content_hash=content_hash,
|
|
||||||
)
|
|
||||||
|
|
||||||
session.add(document)
|
document = existing_document
|
||||||
|
else:
|
||||||
|
# Create new document
|
||||||
|
await task_logger.log_task_progress(
|
||||||
|
log_entry,
|
||||||
|
f"Creating document in database for URL: {url}",
|
||||||
|
{"stage": "document_creation", "chunks_count": len(chunks)},
|
||||||
|
)
|
||||||
|
|
||||||
|
document = Document(
|
||||||
|
search_space_id=search_space_id,
|
||||||
|
title=url_crawled[0].metadata["title"]
|
||||||
|
if isinstance(crawl_loader, FireCrawlLoader)
|
||||||
|
else url_crawled[0].metadata["source"],
|
||||||
|
document_type=DocumentType.CRAWLED_URL,
|
||||||
|
document_metadata=url_crawled[0].metadata,
|
||||||
|
content=summary_content,
|
||||||
|
embedding=summary_embedding,
|
||||||
|
chunks=chunks,
|
||||||
|
content_hash=content_hash,
|
||||||
|
unique_identifier_hash=unique_identifier_hash,
|
||||||
|
)
|
||||||
|
|
||||||
|
session.add(document)
|
||||||
await session.commit()
|
await session.commit()
|
||||||
await session.refresh(document)
|
await session.refresh(document)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -17,10 +17,11 @@ from app.utils.document_converters import (
|
||||||
create_document_chunks,
|
create_document_chunks,
|
||||||
generate_content_hash,
|
generate_content_hash,
|
||||||
generate_document_summary,
|
generate_document_summary,
|
||||||
|
generate_unique_identifier_hash,
|
||||||
)
|
)
|
||||||
|
|
||||||
from .base import (
|
from .base import (
|
||||||
check_duplicate_document,
|
check_document_by_unique_identifier,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -201,32 +202,54 @@ async def add_youtube_video_document(
|
||||||
|
|
||||||
document_parts.append("</DOCUMENT>")
|
document_parts.append("</DOCUMENT>")
|
||||||
combined_document_string = "\n".join(document_parts)
|
combined_document_string = "\n".join(document_parts)
|
||||||
content_hash = generate_content_hash(combined_document_string, search_space_id)
|
|
||||||
|
|
||||||
# Check for duplicates
|
# Generate unique identifier hash for this YouTube video
|
||||||
await task_logger.log_task_progress(
|
unique_identifier_hash = generate_unique_identifier_hash(
|
||||||
log_entry,
|
DocumentType.YOUTUBE_VIDEO, video_id, search_space_id
|
||||||
f"Checking for duplicate video content: {video_id}",
|
|
||||||
{"stage": "duplicate_check", "content_hash": content_hash},
|
|
||||||
)
|
)
|
||||||
|
|
||||||
existing_document = await check_duplicate_document(session, content_hash)
|
# Generate content hash
|
||||||
if existing_document:
|
content_hash = generate_content_hash(combined_document_string, search_space_id)
|
||||||
await task_logger.log_task_success(
|
|
||||||
log_entry,
|
|
||||||
f"YouTube video document already exists: {video_data.get('title', 'YouTube Video')}",
|
|
||||||
{
|
|
||||||
"duplicate_detected": True,
|
|
||||||
"existing_document_id": existing_document.id,
|
|
||||||
"video_id": video_id,
|
|
||||||
},
|
|
||||||
)
|
|
||||||
logging.info(
|
|
||||||
f"Document with content hash {content_hash} already exists. Skipping processing."
|
|
||||||
)
|
|
||||||
return existing_document
|
|
||||||
|
|
||||||
# Get LLM for summary generation
|
# Check if document with this unique identifier already exists
|
||||||
|
await task_logger.log_task_progress(
|
||||||
|
log_entry,
|
||||||
|
f"Checking for existing video: {video_id}",
|
||||||
|
{"stage": "duplicate_check", "video_id": video_id},
|
||||||
|
)
|
||||||
|
|
||||||
|
existing_document = await check_document_by_unique_identifier(
|
||||||
|
session, unique_identifier_hash
|
||||||
|
)
|
||||||
|
|
||||||
|
if existing_document:
|
||||||
|
# Document exists - check if content has changed
|
||||||
|
if existing_document.content_hash == content_hash:
|
||||||
|
await task_logger.log_task_success(
|
||||||
|
log_entry,
|
||||||
|
f"YouTube video document unchanged: {video_data.get('title', 'YouTube Video')}",
|
||||||
|
{
|
||||||
|
"duplicate_detected": True,
|
||||||
|
"existing_document_id": existing_document.id,
|
||||||
|
"video_id": video_id,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
logging.info(
|
||||||
|
f"Document for YouTube video {video_id} unchanged. Skipping."
|
||||||
|
)
|
||||||
|
return existing_document
|
||||||
|
else:
|
||||||
|
# Content has changed - update the existing document
|
||||||
|
logging.info(
|
||||||
|
f"Content changed for YouTube video {video_id}. Updating document."
|
||||||
|
)
|
||||||
|
await task_logger.log_task_progress(
|
||||||
|
log_entry,
|
||||||
|
f"Updating YouTube video document: {video_data.get('title', 'YouTube Video')}",
|
||||||
|
{"stage": "document_update", "video_id": video_id},
|
||||||
|
)
|
||||||
|
|
||||||
|
# Get LLM for summary generation (needed for both create and update)
|
||||||
await task_logger.log_task_progress(
|
await task_logger.log_task_progress(
|
||||||
log_entry,
|
log_entry,
|
||||||
f"Preparing for summary generation: {video_data.get('title', 'YouTube Video')}",
|
f"Preparing for summary generation: {video_data.get('title', 'YouTube Video')}",
|
||||||
|
|
@ -270,33 +293,60 @@ async def add_youtube_video_document(
|
||||||
|
|
||||||
chunks = await create_document_chunks(combined_document_string)
|
chunks = await create_document_chunks(combined_document_string)
|
||||||
|
|
||||||
# Create document
|
# Update or create document
|
||||||
await task_logger.log_task_progress(
|
if existing_document:
|
||||||
log_entry,
|
# Update existing document
|
||||||
f"Creating YouTube video document in database: {video_data.get('title', 'YouTube Video')}",
|
await task_logger.log_task_progress(
|
||||||
{"stage": "document_creation", "chunks_count": len(chunks)},
|
log_entry,
|
||||||
)
|
f"Updating YouTube video document in database: {video_data.get('title', 'YouTube Video')}",
|
||||||
|
{"stage": "document_update", "chunks_count": len(chunks)},
|
||||||
|
)
|
||||||
|
|
||||||
document = Document(
|
existing_document.title = video_data.get("title", "YouTube Video")
|
||||||
title=video_data.get("title", "YouTube Video"),
|
existing_document.content = summary_content
|
||||||
document_type=DocumentType.YOUTUBE_VIDEO,
|
existing_document.content_hash = content_hash
|
||||||
document_metadata={
|
existing_document.embedding = summary_embedding
|
||||||
|
existing_document.document_metadata = {
|
||||||
"url": url,
|
"url": url,
|
||||||
"video_id": video_id,
|
"video_id": video_id,
|
||||||
"video_title": video_data.get("title", "YouTube Video"),
|
"video_title": video_data.get("title", "YouTube Video"),
|
||||||
"author": video_data.get("author_name", "Unknown"),
|
"author": video_data.get("author_name", "Unknown"),
|
||||||
"thumbnail": video_data.get("thumbnail_url", ""),
|
"thumbnail": video_data.get("thumbnail_url", ""),
|
||||||
},
|
}
|
||||||
content=summary_content,
|
existing_document.chunks = chunks
|
||||||
embedding=summary_embedding,
|
|
||||||
chunks=chunks,
|
|
||||||
search_space_id=search_space_id,
|
|
||||||
content_hash=content_hash,
|
|
||||||
)
|
|
||||||
|
|
||||||
session.add(document)
|
await session.commit()
|
||||||
await session.commit()
|
await session.refresh(existing_document)
|
||||||
await session.refresh(document)
|
document = existing_document
|
||||||
|
else:
|
||||||
|
# Create new document
|
||||||
|
await task_logger.log_task_progress(
|
||||||
|
log_entry,
|
||||||
|
f"Creating YouTube video document in database: {video_data.get('title', 'YouTube Video')}",
|
||||||
|
{"stage": "document_creation", "chunks_count": len(chunks)},
|
||||||
|
)
|
||||||
|
|
||||||
|
document = Document(
|
||||||
|
title=video_data.get("title", "YouTube Video"),
|
||||||
|
document_type=DocumentType.YOUTUBE_VIDEO,
|
||||||
|
document_metadata={
|
||||||
|
"url": url,
|
||||||
|
"video_id": video_id,
|
||||||
|
"video_title": video_data.get("title", "YouTube Video"),
|
||||||
|
"author": video_data.get("author_name", "Unknown"),
|
||||||
|
"thumbnail": video_data.get("thumbnail_url", ""),
|
||||||
|
},
|
||||||
|
content=summary_content,
|
||||||
|
embedding=summary_embedding,
|
||||||
|
chunks=chunks,
|
||||||
|
search_space_id=search_space_id,
|
||||||
|
content_hash=content_hash,
|
||||||
|
unique_identifier_hash=unique_identifier_hash,
|
||||||
|
)
|
||||||
|
|
||||||
|
session.add(document)
|
||||||
|
await session.commit()
|
||||||
|
await session.refresh(document)
|
||||||
|
|
||||||
# Log success
|
# Log success
|
||||||
await task_logger.log_task_success(
|
await task_logger.log_task_success(
|
||||||
|
|
|
||||||
|
|
@ -3,7 +3,7 @@ import hashlib
|
||||||
from litellm import get_model_info, token_counter
|
from litellm import get_model_info, token_counter
|
||||||
|
|
||||||
from app.config import config
|
from app.config import config
|
||||||
from app.db import Chunk
|
from app.db import Chunk, DocumentType
|
||||||
from app.prompts import SUMMARY_PROMPT_TEMPLATE
|
from app.prompts import SUMMARY_PROMPT_TEMPLATE
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -308,3 +308,40 @@ def generate_content_hash(content: str, search_space_id: int) -> str:
|
||||||
"""Generate SHA-256 hash for the given content combined with search space ID."""
|
"""Generate SHA-256 hash for the given content combined with search space ID."""
|
||||||
combined_data = f"{search_space_id}:{content}"
|
combined_data = f"{search_space_id}:{content}"
|
||||||
return hashlib.sha256(combined_data.encode("utf-8")).hexdigest()
|
return hashlib.sha256(combined_data.encode("utf-8")).hexdigest()
|
||||||
|
|
||||||
|
|
||||||
|
def generate_unique_identifier_hash(
|
||||||
|
document_type: DocumentType,
|
||||||
|
unique_identifier: str | int | float,
|
||||||
|
search_space_id: int,
|
||||||
|
) -> str:
|
||||||
|
"""
|
||||||
|
Generate SHA-256 hash for a unique document identifier from connector sources.
|
||||||
|
|
||||||
|
This function creates a consistent hash based on the document type, its unique
|
||||||
|
identifier from the source system, and the search space ID. This helps prevent
|
||||||
|
duplicate documents when syncing from various connectors like Slack, Notion, Jira, etc.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
document_type: The type of document (e.g., SLACK_CONNECTOR, NOTION_CONNECTOR)
|
||||||
|
unique_identifier: The unique ID from the source system (e.g., message ID, page ID)
|
||||||
|
search_space_id: The search space this document belongs to
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
str: SHA-256 hash string representing the unique document identifier
|
||||||
|
|
||||||
|
Example:
|
||||||
|
>>> generate_unique_identifier_hash(
|
||||||
|
... DocumentType.SLACK_CONNECTOR,
|
||||||
|
... "1234567890.123456",
|
||||||
|
... 42
|
||||||
|
... )
|
||||||
|
'a1b2c3d4e5f6...'
|
||||||
|
"""
|
||||||
|
# Convert unique_identifier to string to handle different types
|
||||||
|
identifier_str = str(unique_identifier)
|
||||||
|
|
||||||
|
# Combine document type value, unique identifier, and search space ID
|
||||||
|
combined_data = f"{document_type.value}:{identifier_str}:{search_space_id}"
|
||||||
|
|
||||||
|
return hashlib.sha256(combined_data.encode("utf-8")).hexdigest()
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue