mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-08 23:32:40 +02:00
Route uploaded images to vision LLM with document-parser fallback
This commit is contained in:
parent
78fa2d926a
commit
7e90a8ed3c
7 changed files with 199 additions and 5 deletions
37
surfsense_backend/app/etl_pipeline/parsers/vision_llm.py
Normal file
37
surfsense_backend/app/etl_pipeline/parsers/vision_llm.py
Normal file
|
|
@ -0,0 +1,37 @@
|
|||
import asyncio
import base64
import mimetypes

from langchain_core.messages import HumanMessage
|
||||
|
||||
_PROMPT = (
|
||||
"Analyze this image thoroughly and produce a detailed markdown description.\n\n"
|
||||
"Include:\n"
|
||||
"- All visible text, transcribed verbatim\n"
|
||||
"- Description of diagrams, charts, tables, or visual structures\n"
|
||||
"- Key subjects, objects, or scenes depicted\n\n"
|
||||
"Output only the markdown content, no preamble."
|
||||
)
|
||||
|
||||
|
||||
def _image_to_data_url(file_path: str) -> str:
|
||||
mime_type, _ = mimetypes.guess_type(file_path)
|
||||
if not mime_type or not mime_type.startswith("image/"):
|
||||
mime_type = "image/png"
|
||||
with open(file_path, "rb") as f:
|
||||
encoded = base64.b64encode(f.read()).decode("ascii")
|
||||
return f"data:{mime_type};base64,{encoded}"
|
||||
|
||||
|
||||
async def parse_with_vision_llm(file_path: str, filename: str, llm) -> str:
|
||||
data_url = _image_to_data_url(file_path)
|
||||
message = HumanMessage(
|
||||
content=[
|
||||
{"type": "text", "text": _PROMPT},
|
||||
{"type": "image_url", "image_url": {"url": data_url}},
|
||||
]
|
||||
)
|
||||
response = await llm.ainvoke([message])
|
||||
text = response.content if hasattr(response, "content") else str(response)
|
||||
if not text or not text.strip():
|
||||
raise ValueError(f"Vision LLM returned empty content for {filename}")
|
||||
return text.strip()
|
||||
Loading…
Add table
Add a link
Reference in a new issue