mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-10 16:22:38 +02:00
Harden vision LLM fallback, folder upload validation, and export memory
This commit is contained in:
parent
e164fe0612
commit
4ccdd80e26
4 changed files with 79 additions and 15 deletions
|
|
@ -80,7 +80,14 @@ class EtlPipelineService:
|
||||||
request.filename,
|
request.filename,
|
||||||
)
|
)
|
||||||
|
|
||||||
return await self._extract_document(request)
|
try:
|
||||||
|
return await self._extract_document(request)
|
||||||
|
except (EtlUnsupportedFileError, EtlServiceUnavailableError):
|
||||||
|
raise EtlUnsupportedFileError(
|
||||||
|
f"Cannot process image {request.filename}: vision LLM "
|
||||||
|
f"{'failed' if self._vision_llm else 'not configured'} and "
|
||||||
|
f"document parser does not support this format"
|
||||||
|
) from None
|
||||||
|
|
||||||
async def _extract_document(self, request: EtlRequest) -> EtlResult:
|
async def _extract_document(self, request: EtlRequest) -> EtlResult:
|
||||||
from pathlib import PurePosixPath
|
from pathlib import PurePosixPath
|
||||||
|
|
|
||||||
|
|
@ -1,3 +1,4 @@
|
||||||
|
import asyncio
|
||||||
import base64
|
import base64
|
||||||
import os
|
import os
|
||||||
|
|
||||||
|
|
@ -13,6 +14,8 @@ _MAX_IMAGE_BYTES = (
|
||||||
5 * 1024 * 1024
|
5 * 1024 * 1024
|
||||||
) # 5 MB (Anthropic Claude's limit, the most restrictive)
|
) # 5 MB (Anthropic Claude's limit, the most restrictive)
|
||||||
|
|
||||||
|
_INVOKE_TIMEOUT_SECONDS = 120
|
||||||
|
|
||||||
_EXT_TO_MIME: dict[str, str] = {
|
_EXT_TO_MIME: dict[str, str] = {
|
||||||
".png": "image/png",
|
".png": "image/png",
|
||||||
".jpg": "image/jpeg",
|
".jpg": "image/jpeg",
|
||||||
|
|
@ -52,7 +55,9 @@ async def parse_with_vision_llm(file_path: str, filename: str, llm) -> str:
|
||||||
{"type": "image_url", "image_url": {"url": data_url}},
|
{"type": "image_url", "image_url": {"url": data_url}},
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
response = await llm.ainvoke([message])
|
response = await asyncio.wait_for(
|
||||||
|
llm.ainvoke([message]), timeout=_INVOKE_TIMEOUT_SECONDS
|
||||||
|
)
|
||||||
text = response.content if hasattr(response, "content") else str(response)
|
text = response.content if hasattr(response, "content") else str(response)
|
||||||
if not text or not text.strip():
|
if not text or not text.strip():
|
||||||
raise ValueError(f"Vision LLM returned empty content for {filename}")
|
raise ValueError(f"Vision LLM returned empty content for {filename}")
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,7 @@
|
||||||
import asyncio
|
import asyncio
|
||||||
|
|
||||||
from fastapi import APIRouter, Depends, Form, HTTPException, Query, UploadFile
|
from fastapi import APIRouter, Depends, Form, HTTPException, Query, UploadFile
|
||||||
from pydantic import BaseModel as PydanticBaseModel
|
from pydantic import BaseModel as PydanticBaseModel, Field
|
||||||
from sqlalchemy.ext.asyncio import AsyncSession
|
from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
from sqlalchemy.future import select
|
from sqlalchemy.future import select
|
||||||
from sqlalchemy.orm import selectinload
|
from sqlalchemy.orm import selectinload
|
||||||
|
|
@ -1395,10 +1395,13 @@ class FolderMtimeCheckFile(PydanticBaseModel):
|
||||||
mtime: float
|
mtime: float
|
||||||
|
|
||||||
|
|
||||||
|
_MAX_MTIME_CHECK_FILES = 10_000
|
||||||
|
|
||||||
|
|
||||||
class FolderMtimeCheckRequest(PydanticBaseModel):
|
class FolderMtimeCheckRequest(PydanticBaseModel):
|
||||||
folder_name: str
|
folder_name: str
|
||||||
search_space_id: int
|
search_space_id: int
|
||||||
files: list[FolderMtimeCheckFile]
|
files: list[FolderMtimeCheckFile] = Field(max_length=_MAX_MTIME_CHECK_FILES)
|
||||||
|
|
||||||
|
|
||||||
class FolderUnlinkRequest(PydanticBaseModel):
|
class FolderUnlinkRequest(PydanticBaseModel):
|
||||||
|
|
@ -1531,6 +1534,23 @@ async def folder_upload(
|
||||||
f"exceeds the {MAX_FILE_SIZE_BYTES // (1024 * 1024)} MB per-file limit.",
|
f"exceeds the {MAX_FILE_SIZE_BYTES // (1024 * 1024)} MB per-file limit.",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
from app.services.folder_service import MAX_FOLDER_DEPTH
|
||||||
|
|
||||||
|
max_subfolder_depth = max((p.count("/") for p in rel_paths if "/" in p), default=0)
|
||||||
|
if 1 + max_subfolder_depth > MAX_FOLDER_DEPTH:
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=400,
|
||||||
|
detail=f"Folder structure too deep: {1 + max_subfolder_depth} levels "
|
||||||
|
f"exceeds the maximum of {MAX_FOLDER_DEPTH}.",
|
||||||
|
)
|
||||||
|
|
||||||
|
if root_folder_id:
|
||||||
|
root_folder = await session.get(Folder, root_folder_id)
|
||||||
|
if not root_folder or root_folder.search_space_id != search_space_id:
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=404, detail="Root folder not found in this search space"
|
||||||
|
)
|
||||||
|
|
||||||
if not root_folder_id:
|
if not root_folder_id:
|
||||||
watched_metadata = {
|
watched_metadata = {
|
||||||
"watched": True,
|
"watched": True,
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,6 @@
|
||||||
"""Service for exporting knowledge base content as a ZIP archive."""
|
"""Service for exporting knowledge base content as a ZIP archive."""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import tempfile
|
import tempfile
|
||||||
|
|
@ -106,23 +107,38 @@ async def build_export_zip(
|
||||||
|
|
||||||
folder_path_map = _build_folder_path_map(folders)
|
folder_path_map = _build_folder_path_map(folders)
|
||||||
|
|
||||||
doc_query = select(Document).where(Document.search_space_id == search_space_id)
|
batch_size = 100
|
||||||
|
|
||||||
|
base_doc_query = select(Document).where(Document.search_space_id == search_space_id)
|
||||||
if target_folder_ids is not None:
|
if target_folder_ids is not None:
|
||||||
doc_query = doc_query.where(Document.folder_id.in_(target_folder_ids))
|
base_doc_query = base_doc_query.where(Document.folder_id.in_(target_folder_ids))
|
||||||
doc_result = await session.execute(doc_query)
|
base_doc_query = base_doc_query.order_by(Document.id)
|
||||||
documents = list(doc_result.scalars().all())
|
|
||||||
|
|
||||||
fd, tmp_path = tempfile.mkstemp(suffix=".zip")
|
fd, tmp_path = tempfile.mkstemp(suffix=".zip")
|
||||||
os.close(fd)
|
os.close(fd)
|
||||||
|
|
||||||
try:
|
used_paths: dict[str, int] = {}
|
||||||
used_paths: dict[str, int] = {}
|
skipped_docs: list[str] = []
|
||||||
skipped_docs: list[str] = []
|
is_first_batch = True
|
||||||
|
|
||||||
|
try:
|
||||||
|
offset = 0
|
||||||
|
while True:
|
||||||
|
batch_query = base_doc_query.limit(batch_size).offset(offset)
|
||||||
|
batch_result = await session.execute(batch_query)
|
||||||
|
documents = list(batch_result.scalars().all())
|
||||||
|
if not documents:
|
||||||
|
break
|
||||||
|
|
||||||
|
entries: list[tuple[str, str]] = []
|
||||||
|
|
||||||
with zipfile.ZipFile(tmp_path, "w", zipfile.ZIP_DEFLATED) as zf:
|
|
||||||
for doc in documents:
|
for doc in documents:
|
||||||
status = doc.status or {}
|
status = doc.status or {}
|
||||||
state = status.get("state", "ready") if isinstance(status, dict) else "ready"
|
state = (
|
||||||
|
status.get("state", "ready")
|
||||||
|
if isinstance(status, dict)
|
||||||
|
else "ready"
|
||||||
|
)
|
||||||
if state in ("pending", "processing"):
|
if state in ("pending", "processing"):
|
||||||
skipped_docs.append(doc.title or "Untitled")
|
skipped_docs.append(doc.title or "Untitled")
|
||||||
continue
|
continue
|
||||||
|
|
@ -137,7 +153,9 @@ async def build_export_zip(
|
||||||
dir_path = ""
|
dir_path = ""
|
||||||
|
|
||||||
base_name = _sanitize_filename(doc.title or "Untitled")
|
base_name = _sanitize_filename(doc.title or "Untitled")
|
||||||
file_path = f"{dir_path}/{base_name}.md" if dir_path else f"{base_name}.md"
|
file_path = (
|
||||||
|
f"{dir_path}/{base_name}.md" if dir_path else f"{base_name}.md"
|
||||||
|
)
|
||||||
|
|
||||||
if file_path in used_paths:
|
if file_path in used_paths:
|
||||||
used_paths[file_path] += 1
|
used_paths[file_path] += 1
|
||||||
|
|
@ -149,7 +167,21 @@ async def build_export_zip(
|
||||||
)
|
)
|
||||||
used_paths[file_path] = used_paths.get(file_path, 0) + 1
|
used_paths[file_path] = used_paths.get(file_path, 0) + 1
|
||||||
|
|
||||||
zf.writestr(file_path, markdown)
|
entries.append((file_path, markdown))
|
||||||
|
|
||||||
|
if entries:
|
||||||
|
mode = "w" if is_first_batch else "a"
|
||||||
|
batch_entries = entries
|
||||||
|
|
||||||
|
def _write_batch(m: str = mode, e: list = batch_entries) -> None:
|
||||||
|
with zipfile.ZipFile(tmp_path, m, zipfile.ZIP_DEFLATED) as zf:
|
||||||
|
for path, content in e:
|
||||||
|
zf.writestr(path, content)
|
||||||
|
|
||||||
|
await asyncio.to_thread(_write_batch)
|
||||||
|
is_first_batch = False
|
||||||
|
|
||||||
|
offset += batch_size
|
||||||
|
|
||||||
export_name = "knowledge-base"
|
export_name = "knowledge-base"
|
||||||
if folder_id is not None and folder_id in folder_path_map:
|
if folder_id is not None and folder_id in folder_path_map:
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue