mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-28 21:49:40 +02:00
Merge remote-tracking branch 'upstream/dev' into feat/inbox
This commit is contained in:
commit
614761bb17
64 changed files with 2604 additions and 730 deletions
|
|
@ -0,0 +1,114 @@
|
|||
"""Add public chat sharing and cloning features to new_chat_threads
|
||||
|
||||
Revision ID: 81
|
||||
Revises: 80
|
||||
Create Date: 2026-01-23
|
||||
|
||||
Adds columns for:
|
||||
1. Public sharing via tokenized URLs (public_share_token, public_share_enabled)
|
||||
2. Clone tracking for audit (cloned_from_thread_id, cloned_at)
|
||||
3. History bootstrap flag for cloned chats (needs_history_bootstrap)
|
||||
4. Clone pending flag for two-phase clone (clone_pending)
|
||||
"""
|
||||
|
||||
from collections.abc import Sequence
|
||||
|
||||
from alembic import op
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision: str = "81"
|
||||
down_revision: str | None = "80"
|
||||
branch_labels: str | Sequence[str] | None = None
|
||||
depends_on: str | Sequence[str] | None = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
"""Add public sharing and cloning columns to new_chat_threads."""
|
||||
|
||||
op.execute(
|
||||
"""
|
||||
ALTER TABLE new_chat_threads
|
||||
ADD COLUMN IF NOT EXISTS public_share_token VARCHAR(64);
|
||||
"""
|
||||
)
|
||||
|
||||
op.execute(
|
||||
"""
|
||||
ALTER TABLE new_chat_threads
|
||||
ADD COLUMN IF NOT EXISTS public_share_enabled BOOLEAN NOT NULL DEFAULT FALSE;
|
||||
"""
|
||||
)
|
||||
|
||||
op.execute(
|
||||
"""
|
||||
CREATE UNIQUE INDEX IF NOT EXISTS ix_new_chat_threads_public_share_token
|
||||
ON new_chat_threads(public_share_token)
|
||||
WHERE public_share_token IS NOT NULL;
|
||||
"""
|
||||
)
|
||||
|
||||
op.execute(
|
||||
"""
|
||||
CREATE INDEX IF NOT EXISTS ix_new_chat_threads_public_share_enabled
|
||||
ON new_chat_threads(public_share_enabled)
|
||||
WHERE public_share_enabled = TRUE;
|
||||
"""
|
||||
)
|
||||
|
||||
op.execute(
|
||||
"""
|
||||
ALTER TABLE new_chat_threads
|
||||
ADD COLUMN IF NOT EXISTS cloned_from_thread_id INTEGER
|
||||
REFERENCES new_chat_threads(id) ON DELETE SET NULL;
|
||||
"""
|
||||
)
|
||||
|
||||
op.execute(
|
||||
"""
|
||||
ALTER TABLE new_chat_threads
|
||||
ADD COLUMN IF NOT EXISTS cloned_at TIMESTAMP WITH TIME ZONE;
|
||||
"""
|
||||
)
|
||||
|
||||
op.execute(
|
||||
"""
|
||||
ALTER TABLE new_chat_threads
|
||||
ADD COLUMN IF NOT EXISTS needs_history_bootstrap BOOLEAN NOT NULL DEFAULT FALSE;
|
||||
"""
|
||||
)
|
||||
|
||||
op.execute(
|
||||
"""
|
||||
ALTER TABLE new_chat_threads
|
||||
ADD COLUMN IF NOT EXISTS clone_pending BOOLEAN NOT NULL DEFAULT FALSE;
|
||||
"""
|
||||
)
|
||||
|
||||
op.execute(
|
||||
"""
|
||||
CREATE INDEX IF NOT EXISTS ix_new_chat_threads_cloned_from_thread_id
|
||||
ON new_chat_threads(cloned_from_thread_id)
|
||||
WHERE cloned_from_thread_id IS NOT NULL;
|
||||
"""
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
"""Remove public sharing and cloning columns from new_chat_threads."""
|
||||
|
||||
op.execute("DROP INDEX IF EXISTS ix_new_chat_threads_cloned_from_thread_id")
|
||||
op.execute("ALTER TABLE new_chat_threads DROP COLUMN IF EXISTS clone_pending")
|
||||
op.execute(
|
||||
"ALTER TABLE new_chat_threads DROP COLUMN IF EXISTS needs_history_bootstrap"
|
||||
)
|
||||
op.execute("ALTER TABLE new_chat_threads DROP COLUMN IF EXISTS cloned_at")
|
||||
op.execute(
|
||||
"ALTER TABLE new_chat_threads DROP COLUMN IF EXISTS cloned_from_thread_id"
|
||||
)
|
||||
|
||||
op.execute("DROP INDEX IF EXISTS ix_new_chat_threads_public_share_enabled")
|
||||
op.execute("DROP INDEX IF EXISTS ix_new_chat_threads_public_share_token")
|
||||
op.execute(
|
||||
"ALTER TABLE new_chat_threads DROP COLUMN IF EXISTS public_share_enabled"
|
||||
)
|
||||
op.execute("ALTER TABLE new_chat_threads DROP COLUMN IF EXISTS public_share_token")
|
||||
|
|
@ -0,0 +1,62 @@
|
|||
"""Add status and thread_id to podcasts
|
||||
|
||||
Revision ID: 82
|
||||
Revises: 81
|
||||
Create Date: 2026-01-27
|
||||
|
||||
Adds status enum and thread_id FK to podcasts.
|
||||
"""
|
||||
|
||||
from collections.abc import Sequence
|
||||
|
||||
from alembic import op
|
||||
|
||||
revision: str = "82"
|
||||
down_revision: str | None = "81"
|
||||
branch_labels: str | Sequence[str] | None = None
|
||||
depends_on: str | Sequence[str] | None = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
op.execute(
|
||||
"""
|
||||
CREATE TYPE podcast_status AS ENUM ('pending', 'generating', 'ready', 'failed');
|
||||
"""
|
||||
)
|
||||
|
||||
op.execute(
|
||||
"""
|
||||
ALTER TABLE podcasts
|
||||
ADD COLUMN IF NOT EXISTS status podcast_status NOT NULL DEFAULT 'ready';
|
||||
"""
|
||||
)
|
||||
|
||||
op.execute(
|
||||
"""
|
||||
ALTER TABLE podcasts
|
||||
ADD COLUMN IF NOT EXISTS thread_id INTEGER
|
||||
REFERENCES new_chat_threads(id) ON DELETE SET NULL;
|
||||
"""
|
||||
)
|
||||
|
||||
op.execute(
|
||||
"""
|
||||
CREATE INDEX IF NOT EXISTS ix_podcasts_thread_id
|
||||
ON podcasts(thread_id);
|
||||
"""
|
||||
)
|
||||
|
||||
op.execute(
|
||||
"""
|
||||
CREATE INDEX IF NOT EXISTS ix_podcasts_status
|
||||
ON podcasts(status);
|
||||
"""
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
op.execute("DROP INDEX IF EXISTS ix_podcasts_status")
|
||||
op.execute("DROP INDEX IF EXISTS ix_podcasts_thread_id")
|
||||
op.execute("ALTER TABLE podcasts DROP COLUMN IF EXISTS thread_id")
|
||||
op.execute("ALTER TABLE podcasts DROP COLUMN IF EXISTS status")
|
||||
op.execute("DROP TYPE IF EXISTS podcast_status")
|
||||
|
|
@ -120,6 +120,7 @@ async def create_surfsense_deep_agent(
|
|||
connector_service: ConnectorService,
|
||||
checkpointer: Checkpointer,
|
||||
user_id: str | None = None,
|
||||
thread_id: int | None = None,
|
||||
agent_config: AgentConfig | None = None,
|
||||
enabled_tools: list[str] | None = None,
|
||||
disabled_tools: list[str] | None = None,
|
||||
|
|
@ -232,6 +233,7 @@ async def create_surfsense_deep_agent(
|
|||
"connector_service": connector_service,
|
||||
"firecrawl_api_key": firecrawl_api_key,
|
||||
"user_id": user_id, # Required for memory tools
|
||||
"thread_id": thread_id, # For podcast tool
|
||||
# Dynamic connector/document type discovery for knowledge base tool
|
||||
"available_connectors": available_connectors,
|
||||
"available_document_types": available_document_types,
|
||||
|
|
|
|||
|
|
@ -18,6 +18,8 @@ import redis
|
|||
from langchain_core.tools import tool
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.db import Podcast, PodcastStatus
|
||||
|
||||
# Redis connection for tracking active podcast tasks
|
||||
# Uses the same Redis instance as Celery
|
||||
REDIS_URL = os.getenv("CELERY_BROKER_URL", "redis://localhost:6379/0")
|
||||
|
|
@ -32,50 +34,44 @@ def get_redis_client() -> redis.Redis:
|
|||
return _redis_client
|
||||
|
||||
|
||||
def get_active_podcast_key(search_space_id: int) -> str:
|
||||
"""Generate Redis key for tracking active podcast task."""
|
||||
return f"podcast:active:{search_space_id}"
|
||||
def _redis_key(search_space_id: int) -> str:
|
||||
return f"podcast:generating:{search_space_id}"
|
||||
|
||||
|
||||
def get_active_podcast_task(search_space_id: int) -> str | None:
|
||||
"""Check if there's an active podcast task for this search space."""
|
||||
def get_generating_podcast_id(search_space_id: int) -> int | None:
|
||||
"""Get the podcast ID currently being generated for this search space."""
|
||||
try:
|
||||
client = get_redis_client()
|
||||
return client.get(get_active_podcast_key(search_space_id))
|
||||
value = client.get(_redis_key(search_space_id))
|
||||
return int(value) if value else None
|
||||
except Exception:
|
||||
# If Redis is unavailable, allow the request (fail open)
|
||||
return None
|
||||
|
||||
|
||||
def set_active_podcast_task(search_space_id: int, task_id: str) -> None:
|
||||
"""Mark a podcast task as active for this search space."""
|
||||
def set_generating_podcast(search_space_id: int, podcast_id: int) -> None:
|
||||
"""Mark a podcast as currently generating for this search space."""
|
||||
try:
|
||||
client = get_redis_client()
|
||||
# Set with 30-minute expiry as safety net (podcast should complete before this)
|
||||
client.setex(get_active_podcast_key(search_space_id), 1800, task_id)
|
||||
client.setex(_redis_key(search_space_id), 1800, str(podcast_id))
|
||||
except Exception as e:
|
||||
print(f"[generate_podcast] Warning: Could not set active task in Redis: {e}")
|
||||
|
||||
|
||||
def clear_active_podcast_task(search_space_id: int) -> None:
|
||||
"""Clear the active podcast task for this search space."""
|
||||
try:
|
||||
client = get_redis_client()
|
||||
client.delete(get_active_podcast_key(search_space_id))
|
||||
except Exception as e:
|
||||
print(f"[generate_podcast] Warning: Could not clear active task in Redis: {e}")
|
||||
print(f"[generate_podcast] Warning: Could not set generating podcast in Redis: {e}")
|
||||
|
||||
|
||||
def create_generate_podcast_tool(
|
||||
search_space_id: int,
|
||||
db_session: AsyncSession,
|
||||
thread_id: int | None = None,
|
||||
):
|
||||
"""
|
||||
Factory function to create the generate_podcast tool with injected dependencies.
|
||||
|
||||
Pre-creates podcast record with pending status so podcast_id is available
|
||||
immediately for frontend polling.
|
||||
|
||||
Args:
|
||||
search_space_id: The user's search space ID
|
||||
db_session: Database session (not used - Celery creates its own)
|
||||
db_session: Database session for creating the podcast record
|
||||
thread_id: The chat thread ID for associating the podcast
|
||||
|
||||
Returns:
|
||||
A configured tool function for generating podcasts
|
||||
|
|
@ -98,76 +94,71 @@ def create_generate_podcast_tool(
|
|||
- "Make a podcast about..."
|
||||
- "Turn this into a podcast"
|
||||
|
||||
The tool will start generating a podcast in the background.
|
||||
The podcast will be available once generation completes.
|
||||
|
||||
IMPORTANT: Only one podcast can be generated at a time. If a podcast
|
||||
is already being generated, this tool will return a message asking
|
||||
the user to wait.
|
||||
|
||||
Args:
|
||||
source_content: The text content to convert into a podcast.
|
||||
This can be a summary, research findings, or any text
|
||||
the user wants transformed into an audio podcast.
|
||||
podcast_title: Title for the podcast (default: "SurfSense Podcast")
|
||||
user_prompt: Optional instructions for podcast style, tone, or format.
|
||||
For example: "Make it casual and fun" or "Focus on the key insights"
|
||||
|
||||
Returns:
|
||||
A dictionary containing:
|
||||
- status: "processing" (task submitted), "already_generating", or "error"
|
||||
- task_id: The Celery task ID for polling status (if processing)
|
||||
- status: PodcastStatus value (pending, generating, or failed)
|
||||
- podcast_id: The podcast ID for polling (when status is pending or generating)
|
||||
- title: The podcast title
|
||||
- message: Status message for the user
|
||||
- message: Status message (or "error" field if status is failed)
|
||||
"""
|
||||
try:
|
||||
# Check if a podcast is already being generated for this search space
|
||||
active_task_id = get_active_podcast_task(search_space_id)
|
||||
if active_task_id:
|
||||
generating_podcast_id = get_generating_podcast_id(search_space_id)
|
||||
if generating_podcast_id:
|
||||
print(
|
||||
f"[generate_podcast] Blocked duplicate request. Active task: {active_task_id}"
|
||||
f"[generate_podcast] Blocked duplicate request. Generating podcast: {generating_podcast_id}"
|
||||
)
|
||||
return {
|
||||
"status": "already_generating",
|
||||
"task_id": active_task_id,
|
||||
"status": PodcastStatus.GENERATING.value,
|
||||
"podcast_id": generating_podcast_id,
|
||||
"title": podcast_title,
|
||||
"message": "A podcast is already being generated. Please wait for it to complete before requesting another one.",
|
||||
"message": "A podcast is already being generated. Please wait for it to complete.",
|
||||
}
|
||||
|
||||
# Import Celery task here to avoid circular imports
|
||||
podcast = Podcast(
|
||||
title=podcast_title,
|
||||
status=PodcastStatus.PENDING,
|
||||
search_space_id=search_space_id,
|
||||
thread_id=thread_id,
|
||||
)
|
||||
db_session.add(podcast)
|
||||
await db_session.commit()
|
||||
await db_session.refresh(podcast)
|
||||
|
||||
from app.tasks.celery_tasks.podcast_tasks import (
|
||||
generate_content_podcast_task,
|
||||
)
|
||||
|
||||
# Submit Celery task for background processing
|
||||
task = generate_content_podcast_task.delay(
|
||||
podcast_id=podcast.id,
|
||||
source_content=source_content,
|
||||
search_space_id=search_space_id,
|
||||
podcast_title=podcast_title,
|
||||
user_prompt=user_prompt,
|
||||
)
|
||||
|
||||
# Mark this task as active
|
||||
set_active_podcast_task(search_space_id, task.id)
|
||||
set_generating_podcast(search_space_id, podcast.id)
|
||||
|
||||
print(f"[generate_podcast] Submitted Celery task: {task.id}")
|
||||
print(f"[generate_podcast] Created podcast {podcast.id}, task: {task.id}")
|
||||
|
||||
# Return immediately with task_id for polling
|
||||
return {
|
||||
"status": "processing",
|
||||
"task_id": task.id,
|
||||
"status": PodcastStatus.PENDING.value,
|
||||
"podcast_id": podcast.id,
|
||||
"title": podcast_title,
|
||||
"message": "Podcast generation started. This may take a few minutes.",
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
error_message = str(e)
|
||||
print(f"[generate_podcast] Error submitting task: {error_message}")
|
||||
print(f"[generate_podcast] Error: {error_message}")
|
||||
return {
|
||||
"status": "error",
|
||||
"status": PodcastStatus.FAILED.value,
|
||||
"error": error_message,
|
||||
"title": podcast_title,
|
||||
"task_id": None,
|
||||
"podcast_id": None,
|
||||
}
|
||||
|
||||
return generate_podcast
|
||||
|
|
|
|||
|
|
@ -107,8 +107,9 @@ BUILTIN_TOOLS: list[ToolDefinition] = [
|
|||
factory=lambda deps: create_generate_podcast_tool(
|
||||
search_space_id=deps["search_space_id"],
|
||||
db_session=deps["db_session"],
|
||||
thread_id=deps["thread_id"],
|
||||
),
|
||||
requires=["search_space_id", "db_session"],
|
||||
requires=["search_space_id", "db_session", "thread_id"],
|
||||
),
|
||||
# Link preview tool - fetches Open Graph metadata for URLs
|
||||
ToolDefinition(
|
||||
|
|
|
|||
|
|
@ -93,6 +93,13 @@ class SearchSourceConnectorType(str, Enum):
|
|||
COMPOSIO_GOOGLE_CALENDAR_CONNECTOR = "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR"
|
||||
|
||||
|
||||
class PodcastStatus(str, Enum):
|
||||
PENDING = "pending"
|
||||
GENERATING = "generating"
|
||||
READY = "ready"
|
||||
FAILED = "failed"
|
||||
|
||||
|
||||
class LiteLLMProvider(str, Enum):
|
||||
"""
|
||||
Enum for LLM providers supported by LiteLLM.
|
||||
|
|
@ -397,6 +404,47 @@ class NewChatThread(BaseModel, TimestampMixin):
|
|||
index=True,
|
||||
)
|
||||
|
||||
# Public sharing - cryptographic token for public URL access
|
||||
public_share_token = Column(
|
||||
String(64),
|
||||
nullable=True,
|
||||
unique=True,
|
||||
index=True,
|
||||
)
|
||||
# Whether public sharing is currently enabled for this thread
|
||||
public_share_enabled = Column(
|
||||
Boolean,
|
||||
nullable=False,
|
||||
default=False,
|
||||
server_default="false",
|
||||
)
|
||||
|
||||
# Clone tracking - for audit and history bootstrap
|
||||
cloned_from_thread_id = Column(
|
||||
Integer,
|
||||
ForeignKey("new_chat_threads.id", ondelete="SET NULL"),
|
||||
nullable=True,
|
||||
index=True,
|
||||
)
|
||||
cloned_at = Column(
|
||||
TIMESTAMP(timezone=True),
|
||||
nullable=True,
|
||||
)
|
||||
# Flag to bootstrap LangGraph checkpointer with DB messages on first message
|
||||
needs_history_bootstrap = Column(
|
||||
Boolean,
|
||||
nullable=False,
|
||||
default=False,
|
||||
server_default="false",
|
||||
)
|
||||
# Flag indicating content clone is pending (two-phase clone)
|
||||
clone_pending = Column(
|
||||
Boolean,
|
||||
nullable=False,
|
||||
default=False,
|
||||
server_default="false",
|
||||
)
|
||||
|
||||
# Relationships
|
||||
search_space = relationship("SearchSpace", back_populates="new_chat_threads")
|
||||
created_by = relationship("User", back_populates="new_chat_threads")
|
||||
|
|
@ -709,14 +757,34 @@ class Podcast(BaseModel, TimestampMixin):
|
|||
__tablename__ = "podcasts"
|
||||
|
||||
title = Column(String(500), nullable=False)
|
||||
podcast_transcript = Column(JSONB, nullable=True) # List of transcript entries
|
||||
file_location = Column(Text, nullable=True) # Path to the audio file
|
||||
podcast_transcript = Column(JSONB, nullable=True)
|
||||
file_location = Column(Text, nullable=True)
|
||||
status = Column(
|
||||
SQLAlchemyEnum(
|
||||
PodcastStatus,
|
||||
name="podcast_status",
|
||||
create_type=False,
|
||||
values_callable=lambda x: [e.value for e in x],
|
||||
),
|
||||
nullable=False,
|
||||
default=PodcastStatus.READY,
|
||||
server_default="ready",
|
||||
index=True,
|
||||
)
|
||||
|
||||
search_space_id = Column(
|
||||
Integer, ForeignKey("searchspaces.id", ondelete="CASCADE"), nullable=False
|
||||
)
|
||||
search_space = relationship("SearchSpace", back_populates="podcasts")
|
||||
|
||||
thread_id = Column(
|
||||
Integer,
|
||||
ForeignKey("new_chat_threads.id", ondelete="SET NULL"),
|
||||
nullable=True,
|
||||
index=True,
|
||||
)
|
||||
thread = relationship("NewChatThread")
|
||||
|
||||
|
||||
class SearchSpace(BaseModel, TimestampMixin):
|
||||
__tablename__ = "searchspaces"
|
||||
|
|
|
|||
|
|
@ -31,6 +31,7 @@ from .notes_routes import router as notes_router
|
|||
from .notifications_routes import router as notifications_router
|
||||
from .notion_add_connector_route import router as notion_add_connector_router
|
||||
from .podcasts_routes import router as podcasts_router
|
||||
from .public_chat_routes import router as public_chat_router
|
||||
from .rbac_routes import router as rbac_router
|
||||
from .search_source_connectors_routes import router as search_source_connectors_router
|
||||
from .search_spaces_routes import router as search_spaces_router
|
||||
|
|
@ -68,4 +69,5 @@ router.include_router(circleback_webhook_router) # Circleback meeting webhooks
|
|||
router.include_router(surfsense_docs_router) # Surfsense documentation for citations
|
||||
router.include_router(notifications_router) # Notifications with Electric SQL sync
|
||||
router.include_router(composio_router) # Composio OAuth and toolkit management
|
||||
router.include_router(public_chat_router) # Public chat sharing and cloning
|
||||
router.include_router(incentive_tasks_router) # Incentive tasks for earning free pages
|
||||
|
|
|
|||
|
|
@ -37,6 +37,7 @@ from app.db import (
|
|||
get_async_session,
|
||||
)
|
||||
from app.schemas.new_chat import (
|
||||
CompleteCloneResponse,
|
||||
NewChatMessageAppend,
|
||||
NewChatMessageRead,
|
||||
NewChatRequest,
|
||||
|
|
@ -45,11 +46,14 @@ from app.schemas.new_chat import (
|
|||
NewChatThreadUpdate,
|
||||
NewChatThreadVisibilityUpdate,
|
||||
NewChatThreadWithMessages,
|
||||
PublicShareToggleRequest,
|
||||
PublicShareToggleResponse,
|
||||
RegenerateRequest,
|
||||
ThreadHistoryLoadResponse,
|
||||
ThreadListItem,
|
||||
ThreadListResponse,
|
||||
)
|
||||
from app.services.public_chat_service import toggle_public_share
|
||||
from app.tasks.chat.stream_new_chat import stream_new_chat
|
||||
from app.users import current_active_user
|
||||
from app.utils.rbac import check_permission
|
||||
|
|
@ -215,6 +219,7 @@ async def list_threads(
|
|||
visibility=thread.visibility,
|
||||
created_by_id=thread.created_by_id,
|
||||
is_own_thread=is_own_thread,
|
||||
public_share_enabled=thread.public_share_enabled,
|
||||
created_at=thread.created_at,
|
||||
updated_at=thread.updated_at,
|
||||
)
|
||||
|
|
@ -316,6 +321,7 @@ async def search_threads(
|
|||
thread.created_by_id == user.id
|
||||
or (thread.created_by_id is None and is_search_space_owner)
|
||||
),
|
||||
public_share_enabled=thread.public_share_enabled,
|
||||
created_at=thread.created_at,
|
||||
updated_at=thread.updated_at,
|
||||
)
|
||||
|
|
@ -664,6 +670,62 @@ async def delete_thread(
|
|||
) from None
|
||||
|
||||
|
||||
@router.post("/threads/{thread_id}/complete-clone", response_model=CompleteCloneResponse)
|
||||
async def complete_clone(
|
||||
thread_id: int,
|
||||
session: AsyncSession = Depends(get_async_session),
|
||||
user: User = Depends(current_active_user),
|
||||
):
|
||||
"""
|
||||
Complete the cloning process for a thread.
|
||||
|
||||
Copies messages and podcasts from the source thread.
|
||||
Sets clone_pending=False and needs_history_bootstrap=True when done.
|
||||
|
||||
Requires authentication and ownership of the thread.
|
||||
"""
|
||||
from app.services.public_chat_service import complete_clone_content
|
||||
|
||||
try:
|
||||
result = await session.execute(
|
||||
select(NewChatThread).filter(NewChatThread.id == thread_id)
|
||||
)
|
||||
thread = result.scalars().first()
|
||||
|
||||
if not thread:
|
||||
raise HTTPException(status_code=404, detail="Thread not found")
|
||||
|
||||
if thread.created_by_id != user.id:
|
||||
raise HTTPException(status_code=403, detail="Not authorized")
|
||||
|
||||
if not thread.clone_pending:
|
||||
raise HTTPException(status_code=400, detail="Clone already completed")
|
||||
|
||||
if not thread.cloned_from_thread_id:
|
||||
raise HTTPException(status_code=400, detail="No source thread to clone from")
|
||||
|
||||
message_count = await complete_clone_content(
|
||||
session=session,
|
||||
target_thread=thread,
|
||||
source_thread_id=thread.cloned_from_thread_id,
|
||||
target_search_space_id=thread.search_space_id,
|
||||
)
|
||||
|
||||
return CompleteCloneResponse(
|
||||
status="success",
|
||||
message_count=message_count,
|
||||
)
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
await session.rollback()
|
||||
raise HTTPException(
|
||||
status_code=500,
|
||||
detail=f"An unexpected error occurred while completing clone: {e!s}",
|
||||
) from None
|
||||
|
||||
|
||||
@router.patch("/threads/{thread_id}/visibility", response_model=NewChatThreadRead)
|
||||
async def update_thread_visibility(
|
||||
thread_id: int,
|
||||
|
|
@ -729,6 +791,32 @@ async def update_thread_visibility(
|
|||
) from None
|
||||
|
||||
|
||||
@router.patch(
|
||||
"/threads/{thread_id}/public-share", response_model=PublicShareToggleResponse
|
||||
)
|
||||
async def update_thread_public_share(
|
||||
thread_id: int,
|
||||
request: Request,
|
||||
toggle_request: PublicShareToggleRequest,
|
||||
session: AsyncSession = Depends(get_async_session),
|
||||
user: User = Depends(current_active_user),
|
||||
):
|
||||
"""
|
||||
Enable or disable public sharing for a thread.
|
||||
|
||||
Only the creator of the thread can manage public sharing.
|
||||
When enabled, returns a public URL that anyone can use to view the chat.
|
||||
"""
|
||||
base_url = str(request.base_url).rstrip("/")
|
||||
return await toggle_public_share(
|
||||
session=session,
|
||||
thread_id=thread_id,
|
||||
enabled=toggle_request.enabled,
|
||||
user=user,
|
||||
base_url=base_url,
|
||||
)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Message Endpoints
|
||||
# =============================================================================
|
||||
|
|
@ -996,6 +1084,7 @@ async def handle_new_chat(
|
|||
attachments=request.attachments,
|
||||
mentioned_document_ids=request.mentioned_document_ids,
|
||||
mentioned_surfsense_doc_ids=request.mentioned_surfsense_doc_ids,
|
||||
needs_history_bootstrap=thread.needs_history_bootstrap,
|
||||
),
|
||||
media_type="text/event-stream",
|
||||
headers={
|
||||
|
|
@ -1223,6 +1312,7 @@ async def regenerate_response(
|
|||
mentioned_document_ids=request.mentioned_document_ids,
|
||||
mentioned_surfsense_doc_ids=request.mentioned_surfsense_doc_ids,
|
||||
checkpoint_id=target_checkpoint_id,
|
||||
needs_history_bootstrap=thread.needs_history_bootstrap,
|
||||
):
|
||||
yield chunk
|
||||
# If we get here, streaming completed successfully
|
||||
|
|
|
|||
|
|
@ -1,21 +1,19 @@
|
|||
"""
|
||||
Podcast routes for task status polling and audio retrieval.
|
||||
Podcast routes for CRUD operations and audio streaming.
|
||||
|
||||
These routes support the podcast generation feature in new-chat.
|
||||
Note: The old Chat-based podcast generation has been removed.
|
||||
Frontend polls GET /podcasts/{podcast_id} to check status field.
|
||||
"""
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
from celery.result import AsyncResult
|
||||
from fastapi import APIRouter, Depends, HTTPException
|
||||
from fastapi.responses import StreamingResponse
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.exc import SQLAlchemyError
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.celery_app import celery_app
|
||||
from app.db import (
|
||||
Permission,
|
||||
Podcast,
|
||||
|
|
@ -25,7 +23,7 @@ from app.db import (
|
|||
get_async_session,
|
||||
)
|
||||
from app.schemas import PodcastRead
|
||||
from app.users import current_active_user
|
||||
from app.users import current_active_user, current_optional_user
|
||||
from app.utils.rbac import check_permission
|
||||
|
||||
router = APIRouter()
|
||||
|
|
@ -84,12 +82,17 @@ async def read_podcasts(
|
|||
async def read_podcast(
|
||||
podcast_id: int,
|
||||
session: AsyncSession = Depends(get_async_session),
|
||||
user: User = Depends(current_active_user),
|
||||
user: User | None = Depends(current_optional_user),
|
||||
):
|
||||
"""
|
||||
Get a specific podcast by ID.
|
||||
Requires PODCASTS_READ permission for the search space.
|
||||
|
||||
Access is allowed if:
|
||||
- User is authenticated with PODCASTS_READ permission, OR
|
||||
- Podcast belongs to a publicly shared thread
|
||||
"""
|
||||
from app.services.public_chat_service import is_podcast_publicly_accessible
|
||||
|
||||
try:
|
||||
result = await session.execute(select(Podcast).filter(Podcast.id == podcast_id))
|
||||
podcast = result.scalars().first()
|
||||
|
|
@ -100,16 +103,20 @@ async def read_podcast(
|
|||
detail="Podcast not found",
|
||||
)
|
||||
|
||||
# Check permission for the search space
|
||||
await check_permission(
|
||||
session,
|
||||
user,
|
||||
podcast.search_space_id,
|
||||
Permission.PODCASTS_READ.value,
|
||||
"You don't have permission to read podcasts in this search space",
|
||||
)
|
||||
is_public = await is_podcast_publicly_accessible(session, podcast_id)
|
||||
|
||||
return podcast
|
||||
if not is_public:
|
||||
if not user:
|
||||
raise HTTPException(status_code=401, detail="Authentication required")
|
||||
await check_permission(
|
||||
session,
|
||||
user,
|
||||
podcast.search_space_id,
|
||||
Permission.PODCASTS_READ.value,
|
||||
"You don't have permission to read podcasts in this search space",
|
||||
)
|
||||
|
||||
return PodcastRead.from_orm_with_entries(podcast)
|
||||
except HTTPException as he:
|
||||
raise he
|
||||
except SQLAlchemyError:
|
||||
|
|
@ -161,46 +168,49 @@ async def delete_podcast(
|
|||
async def stream_podcast(
|
||||
podcast_id: int,
|
||||
session: AsyncSession = Depends(get_async_session),
|
||||
user: User = Depends(current_active_user),
|
||||
user: User | None = Depends(current_optional_user),
|
||||
):
|
||||
"""
|
||||
Stream a podcast audio file.
|
||||
Requires PODCASTS_READ permission for the search space.
|
||||
|
||||
Access is allowed if:
|
||||
- User is authenticated with PODCASTS_READ permission, OR
|
||||
- Podcast belongs to a publicly shared thread
|
||||
|
||||
Note: Both /stream and /audio endpoints are supported for compatibility.
|
||||
"""
|
||||
from app.services.public_chat_service import is_podcast_publicly_accessible
|
||||
|
||||
try:
|
||||
result = await session.execute(select(Podcast).filter(Podcast.id == podcast_id))
|
||||
podcast = result.scalars().first()
|
||||
|
||||
if not podcast:
|
||||
raise HTTPException(
|
||||
status_code=404,
|
||||
detail="Podcast not found",
|
||||
raise HTTPException(status_code=404, detail="Podcast not found")
|
||||
|
||||
is_public = await is_podcast_publicly_accessible(session, podcast_id)
|
||||
|
||||
if not is_public:
|
||||
if not user:
|
||||
raise HTTPException(status_code=401, detail="Authentication required")
|
||||
|
||||
await check_permission(
|
||||
session,
|
||||
user,
|
||||
podcast.search_space_id,
|
||||
Permission.PODCASTS_READ.value,
|
||||
"You don't have permission to access podcasts in this search space",
|
||||
)
|
||||
|
||||
# Check permission for the search space
|
||||
await check_permission(
|
||||
session,
|
||||
user,
|
||||
podcast.search_space_id,
|
||||
Permission.PODCASTS_READ.value,
|
||||
"You don't have permission to access podcasts in this search space",
|
||||
)
|
||||
|
||||
# Get the file path
|
||||
file_path = podcast.file_location
|
||||
|
||||
# Check if the file exists
|
||||
if not file_path or not os.path.isfile(file_path):
|
||||
raise HTTPException(status_code=404, detail="Podcast audio file not found")
|
||||
|
||||
# Define a generator function to stream the file
|
||||
def iterfile():
|
||||
with open(file_path, mode="rb") as file_like:
|
||||
yield from file_like
|
||||
|
||||
# Return a streaming response with appropriate headers
|
||||
return StreamingResponse(
|
||||
iterfile(),
|
||||
media_type="audio/mpeg",
|
||||
|
|
@ -216,62 +226,3 @@ async def stream_podcast(
|
|||
raise HTTPException(
|
||||
status_code=500, detail=f"Error streaming podcast: {e!s}"
|
||||
) from e
|
||||
|
||||
|
||||
@router.get("/podcasts/task/{task_id}/status")
|
||||
async def get_podcast_task_status(
|
||||
task_id: str,
|
||||
user: User = Depends(current_active_user),
|
||||
):
|
||||
"""
|
||||
Get the status of a podcast generation task.
|
||||
Used by new-chat frontend to poll for completion.
|
||||
|
||||
Returns:
|
||||
- status: "processing" | "success" | "error"
|
||||
- podcast_id: (only if status == "success")
|
||||
- title: (only if status == "success")
|
||||
- error: (only if status == "error")
|
||||
"""
|
||||
try:
|
||||
result = AsyncResult(task_id, app=celery_app)
|
||||
|
||||
if result.ready():
|
||||
# Task completed
|
||||
if result.successful():
|
||||
task_result = result.result
|
||||
if isinstance(task_result, dict):
|
||||
if task_result.get("status") == "success":
|
||||
return {
|
||||
"status": "success",
|
||||
"podcast_id": task_result.get("podcast_id"),
|
||||
"title": task_result.get("title"),
|
||||
"transcript_entries": task_result.get("transcript_entries"),
|
||||
}
|
||||
else:
|
||||
return {
|
||||
"status": "error",
|
||||
"error": task_result.get("error", "Unknown error"),
|
||||
}
|
||||
else:
|
||||
return {
|
||||
"status": "error",
|
||||
"error": "Unexpected task result format",
|
||||
}
|
||||
else:
|
||||
# Task failed
|
||||
return {
|
||||
"status": "error",
|
||||
"error": str(result.result) if result.result else "Task failed",
|
||||
}
|
||||
else:
|
||||
# Task still processing
|
||||
return {
|
||||
"status": "processing",
|
||||
"state": result.state,
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
raise HTTPException(
|
||||
status_code=500, detail=f"Error checking task status: {e!s}"
|
||||
) from e
|
||||
|
|
|
|||
82
surfsense_backend/app/routes/public_chat_routes.py
Normal file
82
surfsense_backend/app/routes/public_chat_routes.py
Normal file
|
|
@ -0,0 +1,82 @@
|
|||
"""
|
||||
Routes for public chat access (unauthenticated and mixed-auth endpoints).
|
||||
"""
|
||||
|
||||
from datetime import UTC, datetime
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.db import ChatVisibility, NewChatThread, User, get_async_session
|
||||
from app.schemas.new_chat import (
|
||||
CloneInitResponse,
|
||||
PublicChatResponse,
|
||||
)
|
||||
from app.services.public_chat_service import (
|
||||
get_public_chat,
|
||||
get_thread_by_share_token,
|
||||
get_user_default_search_space,
|
||||
)
|
||||
from app.users import current_active_user
|
||||
|
||||
router = APIRouter(prefix="/public", tags=["public"])
|
||||
|
||||
|
||||
@router.get("/{share_token}", response_model=PublicChatResponse)
|
||||
async def read_public_chat(
|
||||
share_token: str,
|
||||
session: AsyncSession = Depends(get_async_session),
|
||||
):
|
||||
"""
|
||||
Get a public chat by share token.
|
||||
|
||||
No authentication required.
|
||||
Returns sanitized content (citations stripped).
|
||||
"""
|
||||
return await get_public_chat(session, share_token)
|
||||
|
||||
|
||||
@router.post("/{share_token}/clone", response_model=CloneInitResponse)
|
||||
async def clone_public_chat_endpoint(
|
||||
share_token: str,
|
||||
session: AsyncSession = Depends(get_async_session),
|
||||
user: User = Depends(current_active_user),
|
||||
):
|
||||
"""
|
||||
Initialize cloning a public chat to the user's account.
|
||||
|
||||
Creates an empty thread with clone_pending=True.
|
||||
Frontend should redirect to the new thread and call /complete-clone.
|
||||
|
||||
Requires authentication.
|
||||
"""
|
||||
source_thread = await get_thread_by_share_token(session, share_token)
|
||||
|
||||
if not source_thread:
|
||||
raise HTTPException(status_code=404, detail="Chat not found or no longer public")
|
||||
|
||||
target_search_space_id = await get_user_default_search_space(session, user.id)
|
||||
|
||||
if target_search_space_id is None:
|
||||
raise HTTPException(status_code=400, detail="No search space found for user")
|
||||
|
||||
new_thread = NewChatThread(
|
||||
title=source_thread.title,
|
||||
archived=False,
|
||||
visibility=ChatVisibility.PRIVATE,
|
||||
search_space_id=target_search_space_id,
|
||||
created_by_id=user.id,
|
||||
public_share_enabled=False,
|
||||
cloned_from_thread_id=source_thread.id,
|
||||
cloned_at=datetime.now(UTC),
|
||||
clone_pending=True,
|
||||
)
|
||||
session.add(new_thread)
|
||||
await session.commit()
|
||||
await session.refresh(new_thread)
|
||||
|
||||
return CloneInitResponse(
|
||||
thread_id=new_thread.id,
|
||||
search_space_id=target_search_space_id,
|
||||
share_token=share_token,
|
||||
)
|
||||
|
|
@ -123,7 +123,9 @@ async def list_all_permissions(
|
|||
for perm in Permission:
|
||||
# Extract category from permission value (e.g., "documents:read" -> "documents")
|
||||
category = perm.value.split(":")[0] if ":" in perm.value else "general"
|
||||
description = PERMISSION_DESCRIPTIONS.get(perm.value, f"Permission for {perm.value}")
|
||||
description = PERMISSION_DESCRIPTIONS.get(
|
||||
perm.value, f"Permission for {perm.value}"
|
||||
)
|
||||
|
||||
permissions.append(
|
||||
PermissionInfo(
|
||||
|
|
|
|||
|
|
@ -95,6 +95,9 @@ class NewChatThreadRead(NewChatThreadBase, IDModel):
|
|||
search_space_id: int
|
||||
visibility: ChatVisibility
|
||||
created_by_id: UUID | None = None
|
||||
public_share_enabled: bool = False
|
||||
public_share_token: str | None = None
|
||||
clone_pending: bool = False
|
||||
created_at: datetime
|
||||
updated_at: datetime
|
||||
|
||||
|
|
@ -133,7 +136,8 @@ class ThreadListItem(BaseModel):
|
|||
archived: bool
|
||||
visibility: ChatVisibility
|
||||
created_by_id: UUID | None = None
|
||||
is_own_thread: bool = False # True if the current user created this thread
|
||||
is_own_thread: bool = False
|
||||
public_share_enabled: bool = False
|
||||
created_at: datetime = Field(alias="createdAt")
|
||||
updated_at: datetime = Field(alias="updatedAt")
|
||||
|
||||
|
|
@ -204,3 +208,63 @@ class RegenerateRequest(BaseModel):
|
|||
attachments: list[ChatAttachment] | None = None
|
||||
mentioned_document_ids: list[int] | None = None
|
||||
mentioned_surfsense_doc_ids: list[int] | None = None
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Public Sharing Schemas
|
||||
# =============================================================================
|
||||
|
||||
|
||||
class PublicShareToggleRequest(BaseModel):
|
||||
"""Request to enable/disable public sharing for a thread."""
|
||||
|
||||
enabled: bool
|
||||
|
||||
|
||||
class PublicShareToggleResponse(BaseModel):
|
||||
"""Response after toggling public sharing."""
|
||||
|
||||
enabled: bool
|
||||
public_url: str | None = None
|
||||
share_token: str | None = None
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Public Chat View Schemas (for unauthenticated access)
|
||||
# =============================================================================
|
||||
|
||||
|
||||
class PublicAuthor(BaseModel):
|
||||
display_name: str | None = None
|
||||
avatar_url: str | None = None
|
||||
|
||||
|
||||
class PublicChatMessage(BaseModel):
|
||||
role: NewChatMessageRole
|
||||
content: Any
|
||||
author: PublicAuthor | None = None
|
||||
created_at: datetime
|
||||
|
||||
|
||||
class PublicChatThread(BaseModel):
|
||||
title: str
|
||||
created_at: datetime
|
||||
|
||||
|
||||
class PublicChatResponse(BaseModel):
|
||||
thread: PublicChatThread
|
||||
messages: list[PublicChatMessage]
|
||||
|
||||
|
||||
class CloneInitResponse(BaseModel):
|
||||
|
||||
|
||||
thread_id: int
|
||||
search_space_id: int
|
||||
share_token: str
|
||||
|
||||
|
||||
class CompleteCloneResponse(BaseModel):
|
||||
|
||||
status: str
|
||||
message_count: int
|
||||
|
|
|
|||
|
|
@ -1,11 +1,19 @@
|
|||
"""Podcast schemas for API responses."""
|
||||
|
||||
from datetime import datetime
|
||||
from enum import Enum
|
||||
from typing import Any
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
|
||||
class PodcastStatusEnum(str, Enum):
|
||||
PENDING = "pending"
|
||||
GENERATING = "generating"
|
||||
READY = "ready"
|
||||
FAILED = "failed"
|
||||
|
||||
|
||||
class PodcastBase(BaseModel):
|
||||
"""Base podcast schema."""
|
||||
|
||||
|
|
@ -33,7 +41,24 @@ class PodcastRead(PodcastBase):
|
|||
"""Schema for reading a podcast."""
|
||||
|
||||
id: int
|
||||
status: PodcastStatusEnum = PodcastStatusEnum.READY
|
||||
created_at: datetime
|
||||
transcript_entries: int | None = None
|
||||
|
||||
class Config:
|
||||
from_attributes = True
|
||||
|
||||
@classmethod
|
||||
def from_orm_with_entries(cls, obj):
|
||||
"""Create PodcastRead with transcript_entries computed."""
|
||||
data = {
|
||||
"id": obj.id,
|
||||
"title": obj.title,
|
||||
"podcast_transcript": obj.podcast_transcript,
|
||||
"file_location": obj.file_location,
|
||||
"search_space_id": obj.search_space_id,
|
||||
"status": obj.status,
|
||||
"created_at": obj.created_at,
|
||||
"transcript_entries": len(obj.podcast_transcript) if obj.podcast_transcript else None,
|
||||
}
|
||||
return cls(**data)
|
||||
|
|
|
|||
376
surfsense_backend/app/services/public_chat_service.py
Normal file
376
surfsense_backend/app/services/public_chat_service.py
Normal file
|
|
@ -0,0 +1,376 @@
|
|||
"""
|
||||
Service layer for public chat sharing and cloning.
|
||||
"""
|
||||
|
||||
import re
|
||||
import secrets
|
||||
from uuid import UUID
|
||||
|
||||
from fastapi import HTTPException
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
from sqlalchemy.orm import selectinload
|
||||
|
||||
from app.db import NewChatThread, User
|
||||
|
||||
UI_TOOLS = {
|
||||
"display_image",
|
||||
"link_preview",
|
||||
"generate_podcast",
|
||||
"scrape_webpage",
|
||||
"multi_link_preview",
|
||||
}
|
||||
|
||||
|
||||
def strip_citations(text: str) -> str:
|
||||
"""
|
||||
Remove [citation:X] and [citation:doc-X] patterns from text.
|
||||
Preserves newlines to maintain markdown formatting.
|
||||
"""
|
||||
# Remove citation patterns
|
||||
text = re.sub(r"[\[【]\u200B?citation:(doc-)?\d+\u200B?[\]】]", "", text)
|
||||
# Collapse multiple spaces/tabs (but NOT newlines) into single space
|
||||
text = re.sub(r"[^\S\n]+", " ", text)
|
||||
# Normalize excessive blank lines (3+ newlines → 2)
|
||||
text = re.sub(r"\n{3,}", "\n\n", text)
|
||||
# Clean up spaces around newlines
|
||||
text = re.sub(r" *\n *", "\n", text)
|
||||
return text.strip()
|
||||
|
||||
|
||||
def sanitize_content_for_public(content: list | str | None) -> list:
|
||||
"""
|
||||
Filter message content for public view.
|
||||
Strips citations and filters to UI-relevant tools.
|
||||
"""
|
||||
if content is None:
|
||||
return []
|
||||
|
||||
if isinstance(content, str):
|
||||
clean_text = strip_citations(content)
|
||||
return [{"type": "text", "text": clean_text}] if clean_text else []
|
||||
|
||||
if not isinstance(content, list):
|
||||
return []
|
||||
|
||||
sanitized = []
|
||||
for part in content:
|
||||
if not isinstance(part, dict):
|
||||
continue
|
||||
|
||||
part_type = part.get("type")
|
||||
|
||||
if part_type == "text":
|
||||
clean_text = strip_citations(part.get("text", ""))
|
||||
if clean_text:
|
||||
sanitized.append({"type": "text", "text": clean_text})
|
||||
|
||||
elif part_type == "tool-call":
|
||||
tool_name = part.get("toolName")
|
||||
if tool_name not in UI_TOOLS:
|
||||
continue
|
||||
sanitized.append(part)
|
||||
|
||||
return sanitized
|
||||
|
||||
|
||||
async def get_author_display(
|
||||
session: AsyncSession,
|
||||
author_id: UUID | None,
|
||||
user_cache: dict[UUID, dict],
|
||||
) -> dict | None:
|
||||
"""Transform author UUID to display info."""
|
||||
if author_id is None:
|
||||
return None
|
||||
|
||||
if author_id not in user_cache:
|
||||
result = await session.execute(select(User).filter(User.id == author_id))
|
||||
user = result.scalars().first()
|
||||
if user:
|
||||
user_cache[author_id] = {
|
||||
"display_name": user.display_name or "User",
|
||||
"avatar_url": user.avatar_url,
|
||||
}
|
||||
else:
|
||||
user_cache[author_id] = {
|
||||
"display_name": "Unknown User",
|
||||
"avatar_url": None,
|
||||
}
|
||||
|
||||
return user_cache[author_id]
|
||||
|
||||
|
||||
async def toggle_public_share(
|
||||
session: AsyncSession,
|
||||
thread_id: int,
|
||||
enabled: bool,
|
||||
user: User,
|
||||
base_url: str,
|
||||
) -> dict:
|
||||
"""
|
||||
Enable or disable public sharing for a thread.
|
||||
|
||||
Only the thread owner can toggle public sharing.
|
||||
When enabling, generates a new token if one doesn't exist.
|
||||
When disabling, keeps the token for potential re-enable.
|
||||
"""
|
||||
result = await session.execute(
|
||||
select(NewChatThread).filter(NewChatThread.id == thread_id)
|
||||
)
|
||||
thread = result.scalars().first()
|
||||
|
||||
if not thread:
|
||||
raise HTTPException(status_code=404, detail="Thread not found")
|
||||
|
||||
if thread.created_by_id != user.id:
|
||||
raise HTTPException(
|
||||
status_code=403,
|
||||
detail="Only the creator of this chat can manage public sharing",
|
||||
)
|
||||
|
||||
if enabled and not thread.public_share_token:
|
||||
thread.public_share_token = secrets.token_urlsafe(48)
|
||||
|
||||
thread.public_share_enabled = enabled
|
||||
|
||||
await session.commit()
|
||||
await session.refresh(thread)
|
||||
|
||||
if enabled:
|
||||
return {
|
||||
"enabled": True,
|
||||
"public_url": f"{base_url}/public/{thread.public_share_token}",
|
||||
"share_token": thread.public_share_token,
|
||||
}
|
||||
|
||||
return {
|
||||
"enabled": False,
|
||||
"public_url": None,
|
||||
"share_token": None,
|
||||
}
|
||||
|
||||
|
||||
async def get_public_chat(
|
||||
session: AsyncSession,
|
||||
share_token: str,
|
||||
) -> dict:
|
||||
"""
|
||||
Get a public chat by share token.
|
||||
|
||||
Returns sanitized content suitable for public viewing.
|
||||
"""
|
||||
result = await session.execute(
|
||||
select(NewChatThread)
|
||||
.options(selectinload(NewChatThread.messages))
|
||||
.filter(
|
||||
NewChatThread.public_share_token == share_token,
|
||||
NewChatThread.public_share_enabled.is_(True),
|
||||
)
|
||||
)
|
||||
thread = result.scalars().first()
|
||||
|
||||
if not thread:
|
||||
raise HTTPException(status_code=404, detail="Not found")
|
||||
|
||||
user_cache: dict[UUID, dict] = {}
|
||||
|
||||
messages = []
|
||||
for msg in sorted(thread.messages, key=lambda m: m.created_at):
|
||||
author = await get_author_display(session, msg.author_id, user_cache)
|
||||
sanitized_content = sanitize_content_for_public(msg.content)
|
||||
|
||||
messages.append(
|
||||
{
|
||||
"role": msg.role,
|
||||
"content": sanitized_content,
|
||||
"author": author,
|
||||
"created_at": msg.created_at,
|
||||
}
|
||||
)
|
||||
|
||||
return {
|
||||
"thread": {
|
||||
"title": thread.title,
|
||||
"created_at": thread.created_at,
|
||||
},
|
||||
"messages": messages,
|
||||
}
|
||||
|
||||
|
||||
async def get_thread_by_share_token(
|
||||
session: AsyncSession,
|
||||
share_token: str,
|
||||
) -> NewChatThread | None:
|
||||
"""Get a thread by its public share token if sharing is enabled."""
|
||||
result = await session.execute(
|
||||
select(NewChatThread)
|
||||
.options(selectinload(NewChatThread.messages))
|
||||
.filter(
|
||||
NewChatThread.public_share_token == share_token,
|
||||
NewChatThread.public_share_enabled.is_(True),
|
||||
)
|
||||
)
|
||||
return result.scalars().first()
|
||||
|
||||
|
||||
async def get_user_default_search_space(
|
||||
session: AsyncSession,
|
||||
user_id: UUID,
|
||||
) -> int | None:
|
||||
"""
|
||||
Get user's default search space for cloning.
|
||||
|
||||
Returns the first search space where user is owner, or None if not found.
|
||||
"""
|
||||
from app.db import SearchSpaceMembership
|
||||
|
||||
result = await session.execute(
|
||||
select(SearchSpaceMembership)
|
||||
.filter(
|
||||
SearchSpaceMembership.user_id == user_id,
|
||||
SearchSpaceMembership.is_owner.is_(True),
|
||||
)
|
||||
.limit(1)
|
||||
)
|
||||
membership = result.scalars().first()
|
||||
|
||||
if membership:
|
||||
return membership.search_space_id
|
||||
|
||||
return None
|
||||
|
||||
|
||||
async def complete_clone_content(
|
||||
session: AsyncSession,
|
||||
target_thread: NewChatThread,
|
||||
source_thread_id: int,
|
||||
target_search_space_id: int,
|
||||
) -> int:
|
||||
"""
|
||||
Copy messages and podcasts from source thread to target thread.
|
||||
|
||||
Sets clone_pending=False and needs_history_bootstrap=True when done.
|
||||
Returns the number of messages copied.
|
||||
"""
|
||||
from app.db import NewChatMessage
|
||||
|
||||
result = await session.execute(
|
||||
select(NewChatThread)
|
||||
.options(selectinload(NewChatThread.messages))
|
||||
.filter(NewChatThread.id == source_thread_id)
|
||||
)
|
||||
source_thread = result.scalars().first()
|
||||
|
||||
if not source_thread:
|
||||
raise ValueError("Source thread not found")
|
||||
|
||||
podcast_id_map: dict[int, int] = {}
|
||||
message_count = 0
|
||||
|
||||
for msg in sorted(source_thread.messages, key=lambda m: m.created_at):
|
||||
new_content = sanitize_content_for_public(msg.content)
|
||||
|
||||
if isinstance(new_content, list):
|
||||
for part in new_content:
|
||||
if (
|
||||
isinstance(part, dict)
|
||||
and part.get("type") == "tool-call"
|
||||
and part.get("toolName") == "generate_podcast"
|
||||
):
|
||||
result_data = part.get("result", {})
|
||||
old_podcast_id = result_data.get("podcast_id")
|
||||
if old_podcast_id and old_podcast_id not in podcast_id_map:
|
||||
new_podcast_id = await _clone_podcast(
|
||||
session,
|
||||
old_podcast_id,
|
||||
target_search_space_id,
|
||||
target_thread.id,
|
||||
)
|
||||
if new_podcast_id:
|
||||
podcast_id_map[old_podcast_id] = new_podcast_id
|
||||
|
||||
if old_podcast_id and old_podcast_id in podcast_id_map:
|
||||
result_data["podcast_id"] = podcast_id_map[old_podcast_id]
|
||||
|
||||
new_message = NewChatMessage(
|
||||
thread_id=target_thread.id,
|
||||
role=msg.role,
|
||||
content=new_content,
|
||||
author_id=msg.author_id,
|
||||
created_at=msg.created_at,
|
||||
)
|
||||
session.add(new_message)
|
||||
message_count += 1
|
||||
|
||||
target_thread.clone_pending = False
|
||||
target_thread.needs_history_bootstrap = True
|
||||
|
||||
await session.commit()
|
||||
|
||||
return message_count
|
||||
|
||||
|
||||
async def _clone_podcast(
|
||||
session: AsyncSession,
|
||||
podcast_id: int,
|
||||
target_search_space_id: int,
|
||||
target_thread_id: int,
|
||||
) -> int | None:
|
||||
"""Clone a podcast record and its audio file. Only clones ready podcasts."""
|
||||
import shutil
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
|
||||
from app.db import Podcast, PodcastStatus
|
||||
|
||||
result = await session.execute(select(Podcast).filter(Podcast.id == podcast_id))
|
||||
original = result.scalars().first()
|
||||
if not original or original.status != PodcastStatus.READY:
|
||||
return None
|
||||
|
||||
new_file_path = None
|
||||
if original.file_location:
|
||||
original_path = Path(original.file_location)
|
||||
if original_path.exists():
|
||||
new_filename = f"{uuid.uuid4()}_podcast.mp3"
|
||||
new_dir = Path("podcasts")
|
||||
new_dir.mkdir(parents=True, exist_ok=True)
|
||||
new_file_path = str(new_dir / new_filename)
|
||||
shutil.copy2(original.file_location, new_file_path)
|
||||
|
||||
new_podcast = Podcast(
|
||||
title=original.title,
|
||||
podcast_transcript=original.podcast_transcript,
|
||||
file_location=new_file_path,
|
||||
status=PodcastStatus.READY,
|
||||
search_space_id=target_search_space_id,
|
||||
thread_id=target_thread_id,
|
||||
)
|
||||
session.add(new_podcast)
|
||||
await session.flush()
|
||||
|
||||
return new_podcast.id
|
||||
|
||||
|
||||
async def is_podcast_publicly_accessible(
|
||||
session: AsyncSession,
|
||||
podcast_id: int,
|
||||
) -> bool:
|
||||
"""
|
||||
Check if a podcast belongs to a publicly shared thread.
|
||||
|
||||
Uses the thread_id foreign key for efficient lookup.
|
||||
"""
|
||||
from app.db import Podcast
|
||||
|
||||
result = await session.execute(
|
||||
select(Podcast)
|
||||
.options(selectinload(Podcast.thread))
|
||||
.filter(Podcast.id == podcast_id)
|
||||
)
|
||||
podcast = result.scalars().first()
|
||||
|
||||
if not podcast or not podcast.thread:
|
||||
return False
|
||||
|
||||
return podcast.thread.public_share_enabled
|
||||
|
|
@ -4,15 +4,15 @@ import asyncio
|
|||
import logging
|
||||
import sys
|
||||
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine
|
||||
from sqlalchemy.pool import NullPool
|
||||
|
||||
# Import for content-based podcast (new-chat)
|
||||
from app.agents.podcaster.graph import graph as podcaster_graph
|
||||
from app.agents.podcaster.state import State as PodcasterState
|
||||
from app.celery_app import celery_app
|
||||
from app.config import config
|
||||
from app.db import Podcast
|
||||
from app.db import Podcast, PodcastStatus
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
|
@ -44,8 +44,8 @@ def get_celery_session_maker():
|
|||
# =============================================================================
|
||||
|
||||
|
||||
def _clear_active_podcast_redis_key(search_space_id: int) -> None:
|
||||
"""Clear the active podcast task key from Redis when task completes."""
|
||||
def _clear_generating_podcast(search_space_id: int) -> None:
|
||||
"""Clear the generating podcast marker from Redis when task completes."""
|
||||
import os
|
||||
|
||||
import redis
|
||||
|
|
@ -53,34 +53,24 @@ def _clear_active_podcast_redis_key(search_space_id: int) -> None:
|
|||
try:
|
||||
redis_url = os.getenv("CELERY_BROKER_URL", "redis://localhost:6379/0")
|
||||
client = redis.from_url(redis_url, decode_responses=True)
|
||||
key = f"podcast:active:{search_space_id}"
|
||||
key = f"podcast:generating:{search_space_id}"
|
||||
client.delete(key)
|
||||
logger.info(f"Cleared active podcast key for search_space_id={search_space_id}")
|
||||
logger.info(f"Cleared generating podcast key for search_space_id={search_space_id}")
|
||||
except Exception as e:
|
||||
logger.warning(f"Could not clear active podcast key: {e}")
|
||||
logger.warning(f"Could not clear generating podcast key: {e}")
|
||||
|
||||
|
||||
@celery_app.task(name="generate_content_podcast", bind=True)
|
||||
def generate_content_podcast_task(
|
||||
self,
|
||||
podcast_id: int,
|
||||
source_content: str,
|
||||
search_space_id: int,
|
||||
podcast_title: str = "SurfSense Podcast",
|
||||
user_prompt: str | None = None,
|
||||
) -> dict:
|
||||
"""
|
||||
Celery task to generate podcast from source content (for new-chat).
|
||||
|
||||
This task generates a podcast directly from provided content.
|
||||
|
||||
Args:
|
||||
source_content: The text content to convert into a podcast
|
||||
search_space_id: ID of the search space
|
||||
podcast_title: Title for the podcast
|
||||
user_prompt: Optional instructions for podcast style/tone
|
||||
|
||||
Returns:
|
||||
dict with podcast_id on success, or error info on failure
|
||||
Celery task to generate podcast from source content.
|
||||
Updates existing podcast record created by the tool.
|
||||
"""
|
||||
loop = asyncio.new_event_loop()
|
||||
asyncio.set_event_loop(loop)
|
||||
|
|
@ -88,9 +78,9 @@ def generate_content_podcast_task(
|
|||
try:
|
||||
result = loop.run_until_complete(
|
||||
_generate_content_podcast(
|
||||
podcast_id,
|
||||
source_content,
|
||||
search_space_id,
|
||||
podcast_title,
|
||||
user_prompt,
|
||||
)
|
||||
)
|
||||
|
|
@ -98,46 +88,69 @@ def generate_content_podcast_task(
|
|||
return result
|
||||
except Exception as e:
|
||||
logger.error(f"Error generating content podcast: {e!s}")
|
||||
return {"status": "error", "error": str(e)}
|
||||
loop.run_until_complete(_mark_podcast_failed(podcast_id))
|
||||
return {"status": "failed", "podcast_id": podcast_id}
|
||||
finally:
|
||||
# Always clear the active podcast key when task completes (success or failure)
|
||||
_clear_active_podcast_redis_key(search_space_id)
|
||||
_clear_generating_podcast(search_space_id)
|
||||
asyncio.set_event_loop(None)
|
||||
loop.close()
|
||||
|
||||
|
||||
async def _generate_content_podcast(
|
||||
source_content: str,
|
||||
search_space_id: int,
|
||||
podcast_title: str = "SurfSense Podcast",
|
||||
user_prompt: str | None = None,
|
||||
) -> dict:
|
||||
"""Generate content-based podcast with new session."""
|
||||
async def _mark_podcast_failed(podcast_id: int) -> None:
|
||||
"""Mark a podcast as failed in the database."""
|
||||
async with get_celery_session_maker()() as session:
|
||||
try:
|
||||
# Configure the podcaster graph
|
||||
result = await session.execute(
|
||||
select(Podcast).filter(Podcast.id == podcast_id)
|
||||
)
|
||||
podcast = result.scalars().first()
|
||||
if podcast:
|
||||
podcast.status = PodcastStatus.FAILED
|
||||
await session.commit()
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to mark podcast as failed: {e}")
|
||||
|
||||
|
||||
async def _generate_content_podcast(
|
||||
podcast_id: int,
|
||||
source_content: str,
|
||||
search_space_id: int,
|
||||
user_prompt: str | None = None,
|
||||
) -> dict:
|
||||
"""Generate content-based podcast and update existing record."""
|
||||
async with get_celery_session_maker()() as session:
|
||||
result = await session.execute(
|
||||
select(Podcast).filter(Podcast.id == podcast_id)
|
||||
)
|
||||
podcast = result.scalars().first()
|
||||
|
||||
if not podcast:
|
||||
raise ValueError(f"Podcast {podcast_id} not found")
|
||||
|
||||
try:
|
||||
podcast.status = PodcastStatus.GENERATING
|
||||
await session.commit()
|
||||
|
||||
graph_config = {
|
||||
"configurable": {
|
||||
"podcast_title": podcast_title,
|
||||
"podcast_title": podcast.title,
|
||||
"search_space_id": search_space_id,
|
||||
"user_prompt": user_prompt,
|
||||
}
|
||||
}
|
||||
|
||||
# Initialize the podcaster state with the source content
|
||||
initial_state = PodcasterState(
|
||||
source_content=source_content,
|
||||
db_session=session,
|
||||
)
|
||||
|
||||
# Run the podcaster graph
|
||||
result = await podcaster_graph.ainvoke(initial_state, config=graph_config)
|
||||
graph_result = await podcaster_graph.ainvoke(
|
||||
initial_state, config=graph_config
|
||||
)
|
||||
|
||||
# Extract results
|
||||
podcast_transcript = result.get("podcast_transcript", [])
|
||||
file_path = result.get("final_podcast_file_path", "")
|
||||
podcast_transcript = graph_result.get("podcast_transcript", [])
|
||||
file_path = graph_result.get("final_podcast_file_path", "")
|
||||
|
||||
# Convert transcript to serializable format
|
||||
serializable_transcript = []
|
||||
for entry in podcast_transcript:
|
||||
if hasattr(entry, "speaker_id"):
|
||||
|
|
@ -152,27 +165,22 @@ async def _generate_content_podcast(
|
|||
}
|
||||
)
|
||||
|
||||
# Save podcast to database
|
||||
podcast = Podcast(
|
||||
title=podcast_title,
|
||||
podcast_transcript=serializable_transcript,
|
||||
file_location=file_path,
|
||||
search_space_id=search_space_id,
|
||||
)
|
||||
session.add(podcast)
|
||||
podcast.podcast_transcript = serializable_transcript
|
||||
podcast.file_location = file_path
|
||||
podcast.status = PodcastStatus.READY
|
||||
await session.commit()
|
||||
await session.refresh(podcast)
|
||||
|
||||
logger.info(f"Successfully generated content podcast: {podcast.id}")
|
||||
logger.info(f"Successfully generated podcast: {podcast.id}")
|
||||
|
||||
return {
|
||||
"status": "success",
|
||||
"status": "ready",
|
||||
"podcast_id": podcast.id,
|
||||
"title": podcast_title,
|
||||
"title": podcast.title,
|
||||
"transcript_entries": len(serializable_transcript),
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error in _generate_content_podcast: {e!s}")
|
||||
await session.rollback()
|
||||
podcast.status = PodcastStatus.FAILED
|
||||
await session.commit()
|
||||
raise
|
||||
|
|
|
|||
|
|
@ -34,6 +34,7 @@ from app.services.chat_session_state_service import (
|
|||
)
|
||||
from app.services.connector_service import ConnectorService
|
||||
from app.services.new_streaming_service import VercelStreamingService
|
||||
from app.utils.content_utils import bootstrap_history_from_db
|
||||
|
||||
|
||||
def format_attachments_as_context(attachments: list[ChatAttachment]) -> str:
|
||||
|
|
@ -205,13 +206,13 @@ async def stream_new_chat(
|
|||
mentioned_document_ids: list[int] | None = None,
|
||||
mentioned_surfsense_doc_ids: list[int] | None = None,
|
||||
checkpoint_id: str | None = None,
|
||||
needs_history_bootstrap: bool = False,
|
||||
) -> AsyncGenerator[str, None]:
|
||||
"""
|
||||
Stream chat responses from the new SurfSense deep agent.
|
||||
|
||||
This uses the Vercel AI SDK Data Stream Protocol (SSE format) for streaming.
|
||||
The chat_id is used as LangGraph's thread_id for memory/checkpointing.
|
||||
Message history can be passed from the frontend for context.
|
||||
|
||||
Args:
|
||||
user_query: The user's query
|
||||
|
|
@ -221,6 +222,7 @@ async def stream_new_chat(
|
|||
user_id: The current user's UUID string (for memory tools and session state)
|
||||
llm_config_id: The LLM configuration ID (default: -1 for first global config)
|
||||
attachments: Optional attachments with extracted content
|
||||
needs_history_bootstrap: If True, load message history from DB (for cloned chats)
|
||||
mentioned_document_ids: Optional list of document IDs mentioned with @ in the chat
|
||||
mentioned_surfsense_doc_ids: Optional list of SurfSense doc IDs mentioned with @ in the chat
|
||||
checkpoint_id: Optional checkpoint ID to rewind/fork from (for edit/reload operations)
|
||||
|
|
@ -300,13 +302,29 @@ async def stream_new_chat(
|
|||
connector_service=connector_service,
|
||||
checkpointer=checkpointer,
|
||||
user_id=user_id, # Pass user ID for memory tools
|
||||
thread_id=chat_id, # Pass chat ID for podcast association
|
||||
agent_config=agent_config, # Pass prompt configuration
|
||||
firecrawl_api_key=firecrawl_api_key, # Pass Firecrawl API key if configured
|
||||
)
|
||||
|
||||
# Build input with message history from frontend
|
||||
# Build input with message history
|
||||
langchain_messages = []
|
||||
|
||||
# Bootstrap history for cloned chats (no LangGraph checkpoint exists yet)
|
||||
if needs_history_bootstrap:
|
||||
langchain_messages = await bootstrap_history_from_db(session, chat_id)
|
||||
|
||||
# Clear the flag so we don't bootstrap again on next message
|
||||
from app.db import NewChatThread
|
||||
|
||||
thread_result = await session.execute(
|
||||
select(NewChatThread).filter(NewChatThread.id == chat_id)
|
||||
)
|
||||
thread = thread_result.scalars().first()
|
||||
if thread:
|
||||
thread.needs_history_bootstrap = False
|
||||
await session.commit()
|
||||
|
||||
# Fetch mentioned documents if any (with chunks for proper citations)
|
||||
mentioned_documents: list[Document] = []
|
||||
if mentioned_document_ids:
|
||||
|
|
|
|||
|
|
@ -37,18 +37,30 @@ from .base import (
|
|||
from .markdown_processor import add_received_markdown_file_document
|
||||
|
||||
# Constants for LlamaCloud retry configuration
|
||||
LLAMACLOUD_MAX_RETRIES = 3
|
||||
LLAMACLOUD_BASE_DELAY = 5 # Base delay in seconds for exponential backoff
|
||||
LLAMACLOUD_MAX_RETRIES = 5 # Increased from 3 for large file resilience
|
||||
LLAMACLOUD_BASE_DELAY = 10 # Base delay in seconds for exponential backoff
|
||||
LLAMACLOUD_MAX_DELAY = 120 # Maximum delay between retries (2 minutes)
|
||||
LLAMACLOUD_RETRYABLE_EXCEPTIONS = (
|
||||
ssl.SSLError,
|
||||
httpx.ConnectError,
|
||||
httpx.ConnectTimeout,
|
||||
httpx.ReadTimeout,
|
||||
httpx.WriteTimeout,
|
||||
httpx.RemoteProtocolError,
|
||||
httpx.LocalProtocolError,
|
||||
ConnectionError,
|
||||
ConnectionResetError,
|
||||
TimeoutError,
|
||||
OSError, # Catches various network-level errors
|
||||
)
|
||||
|
||||
# Timeout calculation constants
|
||||
UPLOAD_BYTES_PER_SECOND_SLOW = 100 * 1024 # 100 KB/s (conservative for slow connections)
|
||||
MIN_UPLOAD_TIMEOUT = 120 # Minimum 2 minutes for any file
|
||||
MAX_UPLOAD_TIMEOUT = 1800 # Maximum 30 minutes for very large files
|
||||
BASE_JOB_TIMEOUT = 600 # 10 minutes base for job processing
|
||||
PER_PAGE_JOB_TIMEOUT = 60 # 1 minute per page for processing
|
||||
|
||||
|
||||
def get_google_drive_unique_identifier(
|
||||
connector: dict | None,
|
||||
|
|
@ -204,6 +216,48 @@ async def find_existing_document_with_migration(
|
|||
return existing_document
|
||||
|
||||
|
||||
def calculate_upload_timeout(file_size_bytes: int) -> float:
|
||||
"""
|
||||
Calculate appropriate upload timeout based on file size.
|
||||
|
||||
Assumes a conservative slow connection speed to handle worst-case scenarios.
|
||||
|
||||
Args:
|
||||
file_size_bytes: Size of the file in bytes
|
||||
|
||||
Returns:
|
||||
Timeout in seconds
|
||||
"""
|
||||
# Calculate time needed at slow connection speed
|
||||
# Add 50% buffer for network variability and SSL overhead
|
||||
estimated_time = (file_size_bytes / UPLOAD_BYTES_PER_SECOND_SLOW) * 1.5
|
||||
|
||||
# Clamp to reasonable bounds
|
||||
return max(MIN_UPLOAD_TIMEOUT, min(estimated_time, MAX_UPLOAD_TIMEOUT))
|
||||
|
||||
|
||||
def calculate_job_timeout(estimated_pages: int, file_size_bytes: int) -> float:
|
||||
"""
|
||||
Calculate job processing timeout based on page count and file size.
|
||||
|
||||
Args:
|
||||
estimated_pages: Estimated number of pages
|
||||
file_size_bytes: Size of the file in bytes
|
||||
|
||||
Returns:
|
||||
Timeout in seconds
|
||||
"""
|
||||
# Base timeout + time per page
|
||||
page_based_timeout = BASE_JOB_TIMEOUT + (estimated_pages * PER_PAGE_JOB_TIMEOUT)
|
||||
|
||||
# Also consider file size (large images take longer to process)
|
||||
# ~1 minute per 10MB of file size
|
||||
size_based_timeout = BASE_JOB_TIMEOUT + (file_size_bytes / (10 * 1024 * 1024)) * 60
|
||||
|
||||
# Use the larger of the two estimates
|
||||
return max(page_based_timeout, size_based_timeout)
|
||||
|
||||
|
||||
async def parse_with_llamacloud_retry(
|
||||
file_path: str,
|
||||
estimated_pages: int,
|
||||
|
|
@ -213,6 +267,9 @@ async def parse_with_llamacloud_retry(
|
|||
"""
|
||||
Parse a file with LlamaCloud with retry logic for transient SSL/connection errors.
|
||||
|
||||
Uses dynamic timeout calculations based on file size and page count to handle
|
||||
very large files reliably.
|
||||
|
||||
Args:
|
||||
file_path: Path to the file to parse
|
||||
estimated_pages: Estimated number of pages for timeout calculation
|
||||
|
|
@ -225,25 +282,37 @@ async def parse_with_llamacloud_retry(
|
|||
Raises:
|
||||
Exception: If all retries fail
|
||||
"""
|
||||
import os
|
||||
import random
|
||||
|
||||
from llama_cloud_services import LlamaParse
|
||||
from llama_cloud_services.parse.utils import ResultType
|
||||
|
||||
# Calculate timeouts based on estimated pages
|
||||
# Base timeout of 300 seconds + 30 seconds per page for large documents
|
||||
base_timeout = 300
|
||||
per_page_timeout = 30
|
||||
job_timeout = base_timeout + (estimated_pages * per_page_timeout)
|
||||
|
||||
# Create custom httpx client with larger timeouts for file uploads
|
||||
# The SSL error often occurs during large file uploads, so we need generous timeouts
|
||||
# Get file size for timeout calculations
|
||||
file_size_bytes = os.path.getsize(file_path)
|
||||
file_size_mb = file_size_bytes / (1024 * 1024)
|
||||
|
||||
# Calculate dynamic timeouts based on file size and page count
|
||||
upload_timeout = calculate_upload_timeout(file_size_bytes)
|
||||
job_timeout = calculate_job_timeout(estimated_pages, file_size_bytes)
|
||||
|
||||
# HTTP client timeouts - scaled based on file size
|
||||
# Write timeout is critical for large file uploads
|
||||
custom_timeout = httpx.Timeout(
|
||||
connect=60.0, # 60 seconds to establish connection
|
||||
read=300.0, # 5 minutes to read response
|
||||
write=300.0, # 5 minutes to write/upload (important for large files)
|
||||
pool=60.0, # 60 seconds to acquire connection from pool
|
||||
connect=120.0, # 2 minutes to establish connection (handles slow DNS, etc.)
|
||||
read=upload_timeout, # Dynamic based on file size
|
||||
write=upload_timeout, # Dynamic based on file size (upload time)
|
||||
pool=120.0, # 2 minutes to acquire connection from pool
|
||||
)
|
||||
|
||||
logging.info(
|
||||
f"LlamaCloud upload configured: file_size={file_size_mb:.1f}MB, "
|
||||
f"pages={estimated_pages}, upload_timeout={upload_timeout:.0f}s, "
|
||||
f"job_timeout={job_timeout:.0f}s"
|
||||
)
|
||||
|
||||
last_exception = None
|
||||
attempt_errors = []
|
||||
|
||||
for attempt in range(1, LLAMACLOUD_MAX_RETRIES + 1):
|
||||
try:
|
||||
|
|
@ -257,46 +326,67 @@ async def parse_with_llamacloud_retry(
|
|||
language="en",
|
||||
result_type=ResultType.MD,
|
||||
# Timeout settings for large files
|
||||
max_timeout=max(2000, job_timeout), # Overall max timeout
|
||||
max_timeout=int(max(2000, job_timeout + upload_timeout)),
|
||||
job_timeout_in_seconds=job_timeout,
|
||||
job_timeout_extra_time_per_page_in_seconds=per_page_timeout,
|
||||
job_timeout_extra_time_per_page_in_seconds=PER_PAGE_JOB_TIMEOUT,
|
||||
# Use our custom client with larger timeouts
|
||||
custom_client=custom_client,
|
||||
)
|
||||
|
||||
# Parse the file asynchronously
|
||||
result = await parser.aparse(file_path)
|
||||
|
||||
# Success - log if we had previous failures
|
||||
if attempt > 1:
|
||||
logging.info(
|
||||
f"LlamaCloud upload succeeded on attempt {attempt} after "
|
||||
f"{len(attempt_errors)} failures"
|
||||
)
|
||||
|
||||
return result
|
||||
|
||||
except LLAMACLOUD_RETRYABLE_EXCEPTIONS as e:
|
||||
last_exception = e
|
||||
error_type = type(e).__name__
|
||||
error_msg = str(e)[:200]
|
||||
attempt_errors.append(f"Attempt {attempt}: {error_type} - {error_msg}")
|
||||
|
||||
if attempt < LLAMACLOUD_MAX_RETRIES:
|
||||
# Calculate exponential backoff delay
|
||||
delay = LLAMACLOUD_BASE_DELAY * (2 ** (attempt - 1))
|
||||
# Calculate exponential backoff with jitter
|
||||
# Base delay doubles each attempt, capped at max delay
|
||||
base_delay = min(
|
||||
LLAMACLOUD_BASE_DELAY * (2 ** (attempt - 1)),
|
||||
LLAMACLOUD_MAX_DELAY
|
||||
)
|
||||
# Add random jitter (±25%) to prevent thundering herd
|
||||
jitter = base_delay * 0.25 * (2 * random.random() - 1)
|
||||
delay = base_delay + jitter
|
||||
|
||||
if task_logger and log_entry:
|
||||
await task_logger.log_task_progress(
|
||||
log_entry,
|
||||
f"LlamaCloud upload failed (attempt {attempt}/{LLAMACLOUD_MAX_RETRIES}), retrying in {delay}s",
|
||||
f"LlamaCloud upload failed (attempt {attempt}/{LLAMACLOUD_MAX_RETRIES}), retrying in {delay:.0f}s",
|
||||
{
|
||||
"error_type": error_type,
|
||||
"error_message": str(e)[:200],
|
||||
"error_message": error_msg,
|
||||
"attempt": attempt,
|
||||
"retry_delay": delay,
|
||||
"file_size_mb": round(file_size_mb, 1),
|
||||
"upload_timeout": upload_timeout,
|
||||
},
|
||||
)
|
||||
else:
|
||||
logging.warning(
|
||||
f"LlamaCloud upload failed (attempt {attempt}/{LLAMACLOUD_MAX_RETRIES}): {error_type}. "
|
||||
f"Retrying in {delay}s..."
|
||||
f"LlamaCloud upload failed (attempt {attempt}/{LLAMACLOUD_MAX_RETRIES}): "
|
||||
f"{error_type}. File: {file_size_mb:.1f}MB. Retrying in {delay:.0f}s..."
|
||||
)
|
||||
|
||||
await asyncio.sleep(delay)
|
||||
else:
|
||||
logging.error(
|
||||
f"LlamaCloud upload failed after {LLAMACLOUD_MAX_RETRIES} attempts: {error_type} - {e}"
|
||||
f"LlamaCloud upload failed after {LLAMACLOUD_MAX_RETRIES} attempts. "
|
||||
f"File size: {file_size_mb:.1f}MB, Pages: {estimated_pages}. "
|
||||
f"Errors: {'; '.join(attempt_errors)}"
|
||||
)
|
||||
|
||||
except Exception:
|
||||
|
|
@ -304,7 +394,10 @@ async def parse_with_llamacloud_retry(
|
|||
raise
|
||||
|
||||
# All retries exhausted
|
||||
raise last_exception or RuntimeError("LlamaCloud parsing failed after all retries")
|
||||
raise last_exception or RuntimeError(
|
||||
f"LlamaCloud parsing failed after {LLAMACLOUD_MAX_RETRIES} retries. "
|
||||
f"File size: {file_size_mb:.1f}MB"
|
||||
)
|
||||
|
||||
|
||||
async def add_received_file_document_using_unstructured(
|
||||
|
|
|
|||
|
|
@ -229,3 +229,4 @@ auth_backend = AuthenticationBackend(
|
|||
fastapi_users = FastAPIUsers[User, uuid.UUID](get_user_manager, [auth_backend])
|
||||
|
||||
current_active_user = fastapi_users.current_user(active=True)
|
||||
current_optional_user = fastapi_users.current_user(active=True, optional=True)
|
||||
|
|
|
|||
75
surfsense_backend/app/utils/content_utils.py
Normal file
75
surfsense_backend/app/utils/content_utils.py
Normal file
|
|
@ -0,0 +1,75 @@
|
|||
"""
|
||||
Utilities for working with message content.
|
||||
|
||||
Message content in new_chat_messages can be stored in various formats:
|
||||
- String: Simple text content
|
||||
- List: Array of content parts [{"type": "text", "text": "..."}, {"type": "tool-call", ...}]
|
||||
- Dict: Single content object
|
||||
|
||||
These utilities help extract and transform content for different use cases.
|
||||
"""
|
||||
|
||||
from langchain_core.messages import AIMessage, HumanMessage
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
|
||||
def extract_text_content(content: str | dict | list) -> str:
|
||||
"""Extract plain text content from various message formats."""
|
||||
if isinstance(content, str):
|
||||
return content
|
||||
if isinstance(content, dict):
|
||||
# Handle dict with 'text' key
|
||||
if "text" in content:
|
||||
return content["text"]
|
||||
return str(content)
|
||||
if isinstance(content, list):
|
||||
# Handle list of parts (e.g., [{"type": "text", "text": "..."}])
|
||||
texts = []
|
||||
for part in content:
|
||||
if isinstance(part, dict) and part.get("type") == "text":
|
||||
texts.append(part.get("text", ""))
|
||||
elif isinstance(part, str):
|
||||
texts.append(part)
|
||||
return "\n".join(texts) if texts else ""
|
||||
return ""
|
||||
|
||||
|
||||
async def bootstrap_history_from_db(
|
||||
session: AsyncSession,
|
||||
thread_id: int,
|
||||
) -> list[HumanMessage | AIMessage]:
|
||||
"""
|
||||
Load message history from database and convert to LangChain format.
|
||||
|
||||
Used for cloned chats where the LangGraph checkpointer has no state,
|
||||
but we have messages in the database that should be used as context.
|
||||
|
||||
Args:
|
||||
session: Database session
|
||||
thread_id: The chat thread ID
|
||||
|
||||
Returns:
|
||||
List of LangChain messages (HumanMessage/AIMessage)
|
||||
"""
|
||||
from app.db import NewChatMessage
|
||||
|
||||
result = await session.execute(
|
||||
select(NewChatMessage)
|
||||
.filter(NewChatMessage.thread_id == thread_id)
|
||||
.order_by(NewChatMessage.created_at)
|
||||
)
|
||||
db_messages = result.scalars().all()
|
||||
|
||||
langchain_messages: list[HumanMessage | AIMessage] = []
|
||||
|
||||
for msg in db_messages:
|
||||
text_content = extract_text_content(msg.content)
|
||||
if not text_content:
|
||||
continue
|
||||
if msg.role == "user":
|
||||
langchain_messages.append(HumanMessage(content=text_content))
|
||||
elif msg.role == "assistant":
|
||||
langchain_messages.append(AIMessage(content=text_content))
|
||||
|
||||
return langchain_messages
|
||||
Loading…
Add table
Add a link
Reference in a new issue