mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-26 21:39:43 +02:00
Route uploaded images to vision LLM with document-parser fallback
This commit is contained in:
parent
78fa2d926a
commit
7e90a8ed3c
7 changed files with 199 additions and 5 deletions
|
|
@ -15,6 +15,9 @@ from app.etl_pipeline.parsers.plaintext import read_plaintext
|
|||
class EtlPipelineService:
|
||||
"""Single pipeline for extracting markdown from files. All callers use this."""
|
||||
|
||||
def __init__(self, *, vision_llm=None):
|
||||
self._vision_llm = vision_llm
|
||||
|
||||
async def extract(self, request: EtlRequest) -> EtlResult:
|
||||
category = classify_file(request.filename)
|
||||
|
||||
|
|
@ -47,6 +50,28 @@ class EtlPipelineService:
|
|||
content_type="audio",
|
||||
)
|
||||
|
||||
if category == FileCategory.IMAGE:
|
||||
return await self._extract_image(request)
|
||||
|
||||
return await self._extract_document(request)
|
||||
|
||||
async def _extract_image(self, request: EtlRequest) -> EtlResult:
|
||||
if self._vision_llm:
|
||||
from app.etl_pipeline.parsers.vision_llm import parse_with_vision_llm
|
||||
|
||||
content = await parse_with_vision_llm(
|
||||
request.file_path, request.filename, self._vision_llm
|
||||
)
|
||||
return EtlResult(
|
||||
markdown_content=content,
|
||||
etl_service="VISION_LLM",
|
||||
content_type="image",
|
||||
)
|
||||
|
||||
logging.info(
|
||||
"No vision LLM provided, falling back to document parser for %s",
|
||||
request.filename,
|
||||
)
|
||||
return await self._extract_document(request)
|
||||
|
||||
async def _extract_document(self, request: EtlRequest) -> EtlResult:
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@ from pathlib import PurePosixPath
|
|||
|
||||
from app.utils.file_extensions import (
|
||||
DOCUMENT_EXTENSIONS,
|
||||
IMAGE_EXTENSIONS,
|
||||
get_document_extensions_for_service,
|
||||
)
|
||||
|
||||
|
|
@ -105,6 +106,7 @@ class FileCategory(Enum):
|
|||
PLAINTEXT = "plaintext"
|
||||
AUDIO = "audio"
|
||||
DIRECT_CONVERT = "direct_convert"
|
||||
IMAGE = "image"
|
||||
UNSUPPORTED = "unsupported"
|
||||
DOCUMENT = "document"
|
||||
|
||||
|
|
@ -117,6 +119,8 @@ def classify_file(filename: str) -> FileCategory:
|
|||
return FileCategory.AUDIO
|
||||
if suffix in DIRECT_CONVERT_EXTENSIONS:
|
||||
return FileCategory.DIRECT_CONVERT
|
||||
if suffix in IMAGE_EXTENSIONS:
|
||||
return FileCategory.IMAGE
|
||||
if suffix in DOCUMENT_EXTENSIONS:
|
||||
return FileCategory.DOCUMENT
|
||||
return FileCategory.UNSUPPORTED
|
||||
|
|
@ -126,12 +130,14 @@ def should_skip_for_service(filename: str, etl_service: str | None) -> bool:
|
|||
"""Return True if *filename* cannot be processed by *etl_service*.
|
||||
|
||||
Plaintext, audio, and direct-convert files are parser-agnostic and never
|
||||
skipped. Document files are checked against the per-parser extension set.
|
||||
skipped. Image and document files are checked against the per-parser
|
||||
extension set (images fall back to the document parser when no vision LLM
|
||||
is available, so the same service constraint applies).
|
||||
"""
|
||||
category = classify_file(filename)
|
||||
if category == FileCategory.UNSUPPORTED:
|
||||
return True
|
||||
if category == FileCategory.DOCUMENT:
|
||||
if category in (FileCategory.DOCUMENT, FileCategory.IMAGE):
|
||||
suffix = PurePosixPath(filename).suffix.lower()
|
||||
return suffix not in get_document_extensions_for_service(etl_service)
|
||||
return False
|
||||
|
|
|
|||
37
surfsense_backend/app/etl_pipeline/parsers/vision_llm.py
Normal file
37
surfsense_backend/app/etl_pipeline/parsers/vision_llm.py
Normal file
|
|
@ -0,0 +1,37 @@
|
|||
import base64
|
||||
import mimetypes
|
||||
|
||||
from langchain_core.messages import HumanMessage
|
||||
|
||||
_PROMPT = (
|
||||
"Analyze this image thoroughly and produce a detailed markdown description.\n\n"
|
||||
"Include:\n"
|
||||
"- All visible text, transcribed verbatim\n"
|
||||
"- Description of diagrams, charts, tables, or visual structures\n"
|
||||
"- Key subjects, objects, or scenes depicted\n\n"
|
||||
"Output only the markdown content, no preamble."
|
||||
)
|
||||
|
||||
|
||||
def _image_to_data_url(file_path: str) -> str:
|
||||
mime_type, _ = mimetypes.guess_type(file_path)
|
||||
if not mime_type or not mime_type.startswith("image/"):
|
||||
mime_type = "image/png"
|
||||
with open(file_path, "rb") as f:
|
||||
encoded = base64.b64encode(f.read()).decode("ascii")
|
||||
return f"data:{mime_type};base64,{encoded}"
|
||||
|
||||
|
||||
async def parse_with_vision_llm(file_path: str, filename: str, llm) -> str:
|
||||
data_url = _image_to_data_url(file_path)
|
||||
message = HumanMessage(
|
||||
content=[
|
||||
{"type": "text", "text": _PROMPT},
|
||||
{"type": "image_url", "image_url": {"url": data_url}},
|
||||
]
|
||||
)
|
||||
response = await llm.ainvoke([message])
|
||||
text = response.content if hasattr(response, "content") else str(response)
|
||||
if not text or not text.strip():
|
||||
raise ValueError(f"Vision LLM returned empty content for {filename}")
|
||||
return text.strip()
|
||||
|
|
@ -333,6 +333,7 @@ async def process_file_in_background(
|
|||
async def _extract_file_content(
|
||||
file_path: str,
|
||||
filename: str,
|
||||
search_space_id: int,
|
||||
session: AsyncSession,
|
||||
user_id: str,
|
||||
task_logger: TaskLoggingService,
|
||||
|
|
@ -360,6 +361,7 @@ async def _extract_file_content(
|
|||
FileCategory.PLAINTEXT: "Reading file",
|
||||
FileCategory.DIRECT_CONVERT: "Converting file",
|
||||
FileCategory.AUDIO: "Transcribing audio",
|
||||
FileCategory.IMAGE: "Analyzing image",
|
||||
FileCategory.UNSUPPORTED: "Unsupported file type",
|
||||
FileCategory.DOCUMENT: "Extracting content",
|
||||
}
|
||||
|
|
@ -383,7 +385,13 @@ async def _extract_file_content(
|
|||
estimated_pages = _estimate_pages_safe(page_limit_service, file_path)
|
||||
await page_limit_service.check_page_limit(user_id, estimated_pages)
|
||||
|
||||
result = await EtlPipelineService().extract(
|
||||
vision_llm = None
|
||||
if category == FileCategory.IMAGE:
|
||||
from app.services.llm_service import get_vision_llm
|
||||
|
||||
vision_llm = await get_vision_llm(session, search_space_id)
|
||||
|
||||
result = await EtlPipelineService(vision_llm=vision_llm).extract(
|
||||
EtlRequest(
|
||||
file_path=file_path,
|
||||
filename=filename,
|
||||
|
|
@ -439,6 +447,7 @@ async def process_file_in_background_with_document(
|
|||
markdown_content, etl_service = await _extract_file_content(
|
||||
file_path,
|
||||
filename,
|
||||
search_space_id,
|
||||
session,
|
||||
user_id,
|
||||
task_logger,
|
||||
|
|
|
|||
|
|
@ -7,10 +7,33 @@ Extensions already covered by PLAINTEXT_EXTENSIONS, AUDIO_EXTENSIONS, or
|
|||
DIRECT_CONVERT_EXTENSIONS in file_classifier are NOT repeated here -- these
|
||||
sets are exclusively for the "document" ETL path (Docling / LlamaParse /
|
||||
Unstructured).
|
||||
|
||||
Image extensions intentionally remain in the per-parser sets for fallback
|
||||
compatibility. IMAGE_EXTENSIONS is used only for routing classification.
|
||||
"""
|
||||
|
||||
from pathlib import PurePosixPath
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Image extensions (used by file_classifier for routing to vision LLM)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
IMAGE_EXTENSIONS: frozenset[str] = frozenset(
|
||||
{
|
||||
".png",
|
||||
".jpg",
|
||||
".jpeg",
|
||||
".gif",
|
||||
".bmp",
|
||||
".tiff",
|
||||
".tif",
|
||||
".webp",
|
||||
".svg",
|
||||
".heic",
|
||||
".heif",
|
||||
}
|
||||
)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Per-parser document extension sets (from official documentation)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue