mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-29 19:35:20 +02:00
Route uploaded images to vision LLM with document-parser fallback
This commit is contained in:
parent
78fa2d926a
commit
7e90a8ed3c
7 changed files with 199 additions and 5 deletions
|
|
@ -15,6 +15,9 @@ from app.etl_pipeline.parsers.plaintext import read_plaintext
|
|||
class EtlPipelineService:
|
||||
"""Single pipeline for extracting markdown from files. All callers use this."""
|
||||
|
||||
def __init__(self, *, vision_llm=None):
|
||||
self._vision_llm = vision_llm
|
||||
|
||||
async def extract(self, request: EtlRequest) -> EtlResult:
|
||||
category = classify_file(request.filename)
|
||||
|
||||
|
|
@ -47,6 +50,28 @@ class EtlPipelineService:
|
|||
content_type="audio",
|
||||
)
|
||||
|
||||
if category == FileCategory.IMAGE:
|
||||
return await self._extract_image(request)
|
||||
|
||||
return await self._extract_document(request)
|
||||
|
||||
async def _extract_image(self, request: EtlRequest) -> EtlResult:
    """Turn an image file into markdown.

    When a vision LLM is set on the instance, the image is passed to
    ``parse_with_vision_llm`` and the result is reported with
    ``etl_service="VISION_LLM"`` and ``content_type="image"``. Otherwise
    the request is handed unchanged to ``_extract_document``.
    """
    vision_llm = self._vision_llm
    if not vision_llm:
        logging.info(
            "No vision LLM provided, falling back to document parser for %s",
            request.filename,
        )
        return await self._extract_document(request)

    # Function-scope import: the parser module is loaded only on this path.
    from app.etl_pipeline.parsers.vision_llm import parse_with_vision_llm

    markdown = await parse_with_vision_llm(
        request.file_path, request.filename, vision_llm
    )
    return EtlResult(
        markdown_content=markdown,
        etl_service="VISION_LLM",
        content_type="image",
    )
|
||||
|
||||
async def _extract_document(self, request: EtlRequest) -> EtlResult:
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@ from pathlib import PurePosixPath
|
|||
|
||||
from app.utils.file_extensions import (
|
||||
DOCUMENT_EXTENSIONS,
|
||||
IMAGE_EXTENSIONS,
|
||||
get_document_extensions_for_service,
|
||||
)
|
||||
|
||||
|
|
@ -105,6 +106,7 @@ class FileCategory(Enum):
|
|||
PLAINTEXT = "plaintext"
|
||||
AUDIO = "audio"
|
||||
DIRECT_CONVERT = "direct_convert"
|
||||
IMAGE = "image"
|
||||
UNSUPPORTED = "unsupported"
|
||||
DOCUMENT = "document"
|
||||
|
||||
|
|
@ -117,6 +119,8 @@ def classify_file(filename: str) -> FileCategory:
|
|||
return FileCategory.AUDIO
|
||||
if suffix in DIRECT_CONVERT_EXTENSIONS:
|
||||
return FileCategory.DIRECT_CONVERT
|
||||
if suffix in IMAGE_EXTENSIONS:
|
||||
return FileCategory.IMAGE
|
||||
if suffix in DOCUMENT_EXTENSIONS:
|
||||
return FileCategory.DOCUMENT
|
||||
return FileCategory.UNSUPPORTED
|
||||
|
|
@ -126,12 +130,14 @@ def should_skip_for_service(filename: str, etl_service: str | None) -> bool:
|
|||
"""Return True if *filename* cannot be processed by *etl_service*.
|
||||
|
||||
Plaintext, audio, and direct-convert files are parser-agnostic and never
|
||||
skipped. Document files are checked against the per-parser extension set.
|
||||
skipped. Image and document files are checked against the per-parser
|
||||
extension set (images fall back to the document parser when no vision LLM
|
||||
is available, so the same service constraint applies).
|
||||
"""
|
||||
category = classify_file(filename)
|
||||
if category == FileCategory.UNSUPPORTED:
|
||||
return True
|
||||
if category == FileCategory.DOCUMENT:
|
||||
if category in (FileCategory.DOCUMENT, FileCategory.IMAGE):
|
||||
suffix = PurePosixPath(filename).suffix.lower()
|
||||
return suffix not in get_document_extensions_for_service(etl_service)
|
||||
return False
|
||||
|
|
|
|||
37
surfsense_backend/app/etl_pipeline/parsers/vision_llm.py
Normal file
37
surfsense_backend/app/etl_pipeline/parsers/vision_llm.py
Normal file
|
|
@ -0,0 +1,37 @@
|
|||
import base64
|
||||
import mimetypes
|
||||
|
||||
from langchain_core.messages import HumanMessage
|
||||
|
||||
# Text part of the message sent to the vision model together with the image.
_PROMPT = "".join(
    (
        "Analyze this image thoroughly and produce a detailed markdown description.\n\n",
        "Include:\n",
        "- All visible text, transcribed verbatim\n",
        "- Description of diagrams, charts, tables, or visual structures\n",
        "- Key subjects, objects, or scenes depicted\n\n",
        "Output only the markdown content, no preamble.",
    )
)
|
||||
|
||||
|
||||
# Suffix -> MIME type used when ``mimetypes`` cannot resolve an image suffix.
# The ``mimetypes`` database differs between Python versions and platforms, so
# without this table a known image format could be mislabelled as PNG.
_IMAGE_MIME_FALLBACKS = {
    ".png": "image/png",
    ".jpg": "image/jpeg",
    ".jpeg": "image/jpeg",
    ".gif": "image/gif",
    ".bmp": "image/bmp",
    ".tif": "image/tiff",
    ".tiff": "image/tiff",
    ".webp": "image/webp",
    ".svg": "image/svg+xml",
    ".heic": "image/heic",
    ".heif": "image/heif",
}


def _image_to_data_url(file_path: str) -> str:
    """Return the file at *file_path* encoded as a ``data:`` URL.

    The MIME type is taken from ``mimetypes.guess_type`` when it yields an
    ``image/*`` type. If it does not, the suffix is looked up in
    ``_IMAGE_MIME_FALLBACKS``; only when that also fails is ``image/png``
    used as a last resort.

    Args:
        file_path: Path of the image file to read.

    Returns:
        A string of the form ``data:<mime>;base64,<payload>``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    mime_type, _ = mimetypes.guess_type(file_path)
    if not mime_type or not mime_type.startswith("image/"):
        lowered = str(file_path).lower()
        mime_type = next(
            (
                mime
                for suffix, mime in _IMAGE_MIME_FALLBACKS.items()
                if lowered.endswith(suffix)
            ),
            "image/png",
        )
    with open(file_path, "rb") as f:
        encoded = base64.b64encode(f.read()).decode("ascii")
    return f"data:{mime_type};base64,{encoded}"
|
||||
|
||||
|
||||
def _message_text(content) -> str:
    """Flatten a model response payload into one string.

    A ``str`` is returned unchanged. For a ``list``, plain strings and
    ``{"type": "text", "text": ...}`` dict blocks are concatenated in order
    and every other block is ignored. ``None`` becomes ``""``; anything
    else is converted with ``str()``.
    """
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        parts = []
        for block in content:
            if isinstance(block, str):
                parts.append(block)
            elif isinstance(block, dict) and block.get("type") == "text":
                parts.append(str(block.get("text", "")))
        return "".join(parts)
    return "" if content is None else str(content)


async def parse_with_vision_llm(file_path: str, filename: str, llm) -> str:
    """Ask *llm* for a markdown description of the image at *file_path*.

    Args:
        file_path: Path of the image; it is read and embedded as a data URL.
        filename: Name used only in the error message.
        llm: Object exposing an awaitable ``ainvoke(messages)``.

    Returns:
        The model's text output with surrounding whitespace stripped.

    Raises:
        ValueError: If the model returns no text (empty or whitespace only).
    """
    import asyncio

    # Reading and base64-encoding the file is blocking work; run it in a
    # worker thread so the event loop stays responsive.
    data_url = await asyncio.to_thread(_image_to_data_url, file_path)
    message = HumanMessage(
        content=[
            {"type": "text", "text": _PROMPT},
            {"type": "image_url", "image_url": {"url": data_url}},
        ]
    )
    response = await llm.ainvoke([message])
    raw = response.content if hasattr(response, "content") else response
    # ``content`` may be a list of content blocks rather than a plain string;
    # normalise it before calling string methods.
    text = _message_text(raw).strip()
    if not text:
        raise ValueError(f"Vision LLM returned empty content for {filename}")
    return text
|
||||
|
|
@ -333,6 +333,7 @@ async def process_file_in_background(
|
|||
async def _extract_file_content(
|
||||
file_path: str,
|
||||
filename: str,
|
||||
search_space_id: int,
|
||||
session: AsyncSession,
|
||||
user_id: str,
|
||||
task_logger: TaskLoggingService,
|
||||
|
|
@ -360,6 +361,7 @@ async def _extract_file_content(
|
|||
FileCategory.PLAINTEXT: "Reading file",
|
||||
FileCategory.DIRECT_CONVERT: "Converting file",
|
||||
FileCategory.AUDIO: "Transcribing audio",
|
||||
FileCategory.IMAGE: "Analyzing image",
|
||||
FileCategory.UNSUPPORTED: "Unsupported file type",
|
||||
FileCategory.DOCUMENT: "Extracting content",
|
||||
}
|
||||
|
|
@ -383,7 +385,13 @@ async def _extract_file_content(
|
|||
estimated_pages = _estimate_pages_safe(page_limit_service, file_path)
|
||||
await page_limit_service.check_page_limit(user_id, estimated_pages)
|
||||
|
||||
result = await EtlPipelineService().extract(
|
||||
vision_llm = None
|
||||
if category == FileCategory.IMAGE:
|
||||
from app.services.llm_service import get_vision_llm
|
||||
|
||||
vision_llm = await get_vision_llm(session, search_space_id)
|
||||
|
||||
result = await EtlPipelineService(vision_llm=vision_llm).extract(
|
||||
EtlRequest(
|
||||
file_path=file_path,
|
||||
filename=filename,
|
||||
|
|
@ -439,6 +447,7 @@ async def process_file_in_background_with_document(
|
|||
markdown_content, etl_service = await _extract_file_content(
|
||||
file_path,
|
||||
filename,
|
||||
search_space_id,
|
||||
session,
|
||||
user_id,
|
||||
task_logger,
|
||||
|
|
|
|||
|
|
@ -7,10 +7,33 @@ Extensions already covered by PLAINTEXT_EXTENSIONS, AUDIO_EXTENSIONS, or
|
|||
DIRECT_CONVERT_EXTENSIONS in file_classifier are NOT repeated here -- these
|
||||
sets are exclusively for the "document" ETL path (Docling / LlamaParse /
|
||||
Unstructured).
|
||||
|
||||
Image extensions intentionally remain in the per-parser sets for fallback
|
||||
compatibility. IMAGE_EXTENSIONS is used only for routing classification.
|
||||
"""
|
||||
|
||||
from pathlib import PurePosixPath
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Image extensions (used by file_classifier for routing to vision LLM)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Lower-case suffixes (leading dot included) treated as image files.
IMAGE_EXTENSIONS: frozenset[str] = frozenset(
    (
        ".png",
        ".jpg",
        ".jpeg",
        ".gif",
        ".bmp",
        ".tiff",
        ".tif",
        ".webp",
        ".svg",
        ".heic",
        ".heif",
    )
)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Per-parser document extension sets (from official documentation)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
|
|
|||
|
|
@ -549,8 +549,11 @@ def test_unsupported_extensions_classified_correctly(filename):
|
|||
("doc.docx", "document"),
|
||||
("slides.pptx", "document"),
|
||||
("sheet.xlsx", "document"),
|
||||
("photo.png", "document"),
|
||||
("photo.jpg", "document"),
|
||||
("photo.png", "image"),
|
||||
("photo.jpg", "image"),
|
||||
("photo.webp", "image"),
|
||||
("photo.gif", "image"),
|
||||
("photo.heic", "image"),
|
||||
("book.epub", "document"),
|
||||
("letter.odt", "document"),
|
||||
("readme.md", "plaintext"),
|
||||
|
|
@ -680,3 +683,57 @@ async def test_extract_eml_with_docling_raises_unsupported(tmp_path, mocker):
|
|||
await EtlPipelineService().extract(
|
||||
EtlRequest(file_path=str(eml_file), filename="mail.eml")
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Image extraction via vision LLM
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
async def test_extract_image_with_vision_llm(tmp_path):
    """When a vision LLM is supplied, image extraction goes through it."""
    from unittest.mock import AsyncMock, MagicMock

    # PNG signature followed by zero padding.
    image_path = tmp_path / "photo.png"
    image_path.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 50)

    vision_llm = AsyncMock()
    vision_llm.ainvoke.return_value = MagicMock(
        content="# A photo of a sunset over the ocean"
    )

    request = EtlRequest(file_path=str(image_path), filename="photo.png")
    result = await EtlPipelineService(vision_llm=vision_llm).extract(request)

    assert result.markdown_content == "# A photo of a sunset over the ocean"
    assert result.etl_service == "VISION_LLM"
    assert result.content_type == "image"
    vision_llm.ainvoke.assert_called_once()
|
||||
|
||||
|
||||
async def test_extract_image_falls_back_to_document_without_vision_llm(
    tmp_path, mocker
):
    """An image goes through the document parser when no vision LLM is given."""
    # PNG signature followed by zero padding.
    scan_path = tmp_path / "scan.png"
    scan_path.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 50)

    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
    docling_stub = mocker.AsyncMock()
    docling_stub.process_document.return_value = {"content": "# OCR text from image"}
    mocker.patch(
        "app.services.docling_service.create_docling_service",
        return_value=docling_stub,
    )

    request = EtlRequest(file_path=str(scan_path), filename="scan.png")
    result = await EtlPipelineService().extract(request)

    assert result.markdown_content == "# OCR text from image"
    assert result.etl_service == "DOCLING"
    assert result.content_type == "document"
|
||||
|
|
|
|||
|
|
@ -154,3 +154,40 @@ def test_get_extensions_for_none_returns_union():
|
|||
)
|
||||
|
||||
assert get_document_extensions_for_service(None) == DOCUMENT_EXTENSIONS
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# IMAGE_EXTENSIONS
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "ext",
    (
        ".png",
        ".jpg",
        ".jpeg",
        ".gif",
        ".bmp",
        ".tiff",
        ".tif",
        ".webp",
        ".svg",
        ".heic",
        ".heif",
    ),
)
def test_image_extensions_contains_expected(ext):
    """Each listed suffix is a member of IMAGE_EXTENSIONS."""
    from app.utils import file_extensions

    assert ext in file_extensions.IMAGE_EXTENSIONS
|
||||
|
||||
|
||||
def test_image_extensions_are_subset_of_document_extensions():
    """Every routing image suffix must also appear in DOCUMENT_EXTENSIONS."""
    from app.utils.file_extensions import DOCUMENT_EXTENSIONS, IMAGE_EXTENSIONS

    # Anything left over here is an image suffix missing from DOCUMENT_EXTENSIONS.
    missing = IMAGE_EXTENSIONS.difference(DOCUMENT_EXTENSIONS)
    assert len(missing) == 0, (
        f"Image extensions missing from document sets (breaks fallback): {missing}"
    )
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue