# SurfSense/surfsense_backend/app/etl_pipeline/parsers/vision_llm.py
#
# Image parser: sends an image to a vision-capable LLM and returns the
# model's markdown description of it.

import base64
import mimetypes
from langchain_core.messages import HumanMessage
# Instruction text sent alongside the image; asks the model for a
# markdown-only description with no preamble.
_PROMPT = "".join(
    [
        "Analyze this image thoroughly and produce a detailed markdown description.\n\n",
        "Include:\n",
        "- All visible text, transcribed verbatim\n",
        "- Description of diagrams, charts, tables, or visual structures\n",
        "- Key subjects, objects, or scenes depicted\n\n",
        "Output only the markdown content, no preamble.",
    ]
)
def _image_to_data_url(file_path: str) -> str:
    """Encode the file at *file_path* as a base64 ``data:`` URL.

    The MIME type is guessed from the file name. When no type can be
    guessed, or the guess is not an ``image/*`` type, ``image/png`` is
    used instead.

    Args:
        file_path: Path of the file to read (opened in binary mode).

    Returns:
        A string of the form ``data:<mime>;base64,<payload>``.
    """
    guessed = mimetypes.guess_type(file_path)[0]
    is_image = bool(guessed) and guessed.startswith("image/")
    media_type = guessed if is_image else "image/png"

    with open(file_path, "rb") as image_file:
        raw_bytes = image_file.read()

    payload = base64.b64encode(raw_bytes).decode("ascii")
    return "data:" + media_type + ";base64," + payload
def _content_to_text(content) -> str:
    """Flatten a chat-message ``content`` value into a plain string.

    LangChain message content may be a plain string or a list of content
    blocks (bare strings or dicts such as ``{"type": "text", "text": ...}``).
    Only textual blocks are kept and they are concatenated in order; other
    block types are ignored.
    """
    if content is None:
        return ""
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        pieces = []
        for block in content:
            if isinstance(block, str):
                pieces.append(block)
            elif (
                isinstance(block, dict)
                and block.get("type") == "text"
                and isinstance(block.get("text"), str)
            ):
                pieces.append(block["text"])
        return "".join(pieces)
    return str(content)


async def parse_with_vision_llm(file_path: str, filename: str, llm) -> str:
    """Ask a vision-capable LLM for a markdown description of an image.

    The image is embedded as a base64 data URL and sent together with
    ``_PROMPT`` in a single ``HumanMessage``.

    Args:
        file_path: Path of the image file to send.
        filename: Name used only in the error message.
        llm: Object exposing an async ``ainvoke(messages)`` method
            (presumably a LangChain chat model -- confirm against callers).

    Returns:
        The model's reply with surrounding whitespace stripped.

    Raises:
        ValueError: If the reply is empty or whitespace-only.

    NOTE(review): ``_image_to_data_url`` reads the whole file synchronously,
    so large images briefly block the event loop.
    """
    data_url = _image_to_data_url(file_path)
    message = HumanMessage(
        content=[
            {"type": "text", "text": _PROMPT},
            {"type": "image_url", "image_url": {"url": data_url}},
        ]
    )
    response = await llm.ainvoke([message])
    raw_content = response.content if hasattr(response, "content") else str(response)

    # Content may be a list of blocks rather than a str; calling .strip() on a
    # list would raise AttributeError, so normalise to text first.
    text = _content_to_text(raw_content).strip()
    if not text:
        raise ValueError(f"Vision LLM returned empty content for {filename}")
    return text