2026-04-09 14:33:33 +02:00
|
|
|
import base64
|
|
|
|
|
import mimetypes
|
2026-04-09 15:17:08 +02:00
|
|
|
import os
|
2026-04-09 14:33:33 +02:00
|
|
|
|
|
|
|
|
from langchain_core.messages import HumanMessage
|
|
|
|
|
|
|
|
|
|
# Instruction text passed to the vision model as the text part of the
# message, next to the image itself. Built from three sentences, each
# carrying its own trailing space so they join into one paragraph.
_PROMPT = "".join(
    [
        "Describe this image in markdown. ",
        "Transcribe any visible text verbatim. ",
        "Be concise but complete — let the image content guide the level of detail.",
    ]
)
|
|
|
|
|
|
2026-04-09 15:17:08 +02:00
|
|
|
# 5 MB (Anthropic Claude's limit, the most restrictive).
# The check in _image_to_data_url compares this against the raw on-disk size;
# the base64 payload that is actually sent is roughly 4/3 of that.
# NOTE(review): confirm whether the provider measures the limit on the raw
# bytes or on the encoded payload.
_MAX_IMAGE_BYTES = 5 * 1024 * 1024

# Leading byte signatures of common raster formats, used when the file
# extension does not reveal the image type.
_IMAGE_SIGNATURES = (
    (b"\x89PNG\r\n\x1a\n", "image/png"),
    (b"\xff\xd8\xff", "image/jpeg"),
    (b"GIF87a", "image/gif"),
    (b"GIF89a", "image/gif"),
)


def _sniff_image_mime(data: bytes, default: str = "image/png") -> str:
    """Guess an image MIME type from the first bytes of *data*.

    Returns *default* when no known signature matches.
    """
    for signature, mime_type in _IMAGE_SIGNATURES:
        if data.startswith(signature):
            return mime_type
    # WebP is a RIFF container: "RIFF" <4-byte size> "WEBP".
    if data[:4] == b"RIFF" and data[8:12] == b"WEBP":
        return "image/webp"
    return default


def _image_to_data_url(file_path: str) -> str:
    """Read an image file and return it as a base64 ``data:`` URL.

    The MIME type is taken from the file extension when that yields an
    ``image/*`` type. Otherwise (no extension, unknown extension, or a
    non-image guess) the file's leading bytes are inspected, and
    ``image/png`` is used only if no known signature matches.

    Args:
        file_path: Path of the image file on disk.

    Returns:
        A string of the form ``data:<mime>;base64,<payload>``.

    Raises:
        ValueError: If the file is larger than ``_MAX_IMAGE_BYTES``.
        OSError: If the file cannot be stat'ed or opened.
    """
    # Reject oversized files before reading them into memory.
    file_size = os.path.getsize(file_path)
    if file_size > _MAX_IMAGE_BYTES:
        raise ValueError(
            f"Image too large for vision LLM ({file_size / (1024 * 1024):.1f} MB, "
            f"limit {_MAX_IMAGE_BYTES // (1024 * 1024)} MB): {file_path}"
        )

    with open(file_path, "rb") as f:
        data = f.read()

    mime_type, _ = mimetypes.guess_type(file_path)
    if not mime_type or not mime_type.startswith("image/"):
        # The extension was no help: trust the bytes rather than blindly
        # labelling everything as PNG.
        mime_type = _sniff_image_mime(data)

    encoded = base64.b64encode(data).decode("ascii")
    return f"data:{mime_type};base64,{encoded}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _content_to_text(content) -> str:
    """Flatten a chat-message ``content`` value into plain text.

    Message content may be a plain string or a list of content blocks
    (plain strings and/or dicts). Only textual blocks are kept; other
    block types are ignored.
    """
    if content is None:
        return ""
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        parts = []
        for block in content:
            if isinstance(block, str):
                parts.append(block)
            elif isinstance(block, dict) and block.get("type") == "text":
                text = block.get("text")
                if isinstance(text, str):
                    parts.append(text)
        return "".join(parts)
    return str(content)


async def parse_with_vision_llm(file_path: str, filename: str, llm) -> str:
    """Ask a vision-capable chat model to describe an image as markdown.

    The image is embedded as a base64 data URL and sent together with
    ``_PROMPT`` in a single human message.

    Args:
        file_path: Path of the image file to send.
        filename: Name used in the error message when the model returns
            nothing.
        llm: Object exposing an awaitable ``ainvoke(messages)`` method.

    Returns:
        The model's textual answer with surrounding whitespace removed.

    Raises:
        ValueError: If the image exceeds the size limit enforced by
            ``_image_to_data_url``, or if the model returns empty or
            whitespace-only content.
    """
    data_url = _image_to_data_url(file_path)
    message = HumanMessage(
        content=[
            {"type": "text", "text": _PROMPT},
            {"type": "image_url", "image_url": {"url": data_url}},
        ]
    )
    response = await llm.ainvoke([message])

    raw = response.content if hasattr(response, "content") else str(response)
    # ``content`` is not guaranteed to be a str; some providers return a list
    # of content blocks, on which ``.strip()`` would raise AttributeError.
    text = _content_to_text(raw).strip()
    if not text:
        raise ValueError(f"Vision LLM returned empty content for {filename}")
    return text
|