mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-17 18:35:19 +02:00
141 lines
5.2 KiB
Python
141 lines
5.2 KiB
Python
"""Stub DoclingService.process_document for E2E.

The real ``DoclingService.process_document`` calls
``DocumentConverter.convert(file_path)`` which lazily downloads the
``docling-project/docling-layout-heron`` model from Hugging Face Hub.
The hermetic E2E container sets ``HF_HUB_OFFLINE=1`` (see
``docker/docker-compose.e2e.yml``), so that download fails with
``LocalEntryNotFoundError`` and the indexing Celery task retries until
the Playwright test hits its ~4-minute step timeout. In CI that is the
difference between the suite finishing and the 30-minute job timeout
killing the run before any report can upload.

Stubbing ``process_document`` bypasses ``DocumentConverter.convert()``
entirely. ``DoclingService.__init__`` is intentionally left untouched
because constructing ``DocumentConverter(...)`` is cheap and offline —
it is only ``.convert()`` that triggers the model download that fails
offline.

Every canary PDF under ``tests/e2e/fakes/fixtures/binary/`` is produced
by ``generate_canary_pdfs.py`` and embeds its canary token as plain
``(text) Tj`` PDF text operators. Extracting those operators gives us
the canary string back, which is what the Playwright assertions look
for in the resulting Document row.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import re
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Matches the `(escaped text) Tj` text-show operator emitted by
# generate_canary_pdfs.py. Inside the parens, the escape rules are:
#   \\  -> backslash
#   \(  -> literal (
#   \)  -> literal )
# The character class [^\\()] consumes any non-escape byte; \\. consumes
# an escape sequence. Sufficient for our synthetic fixtures.
_TJ_PATTERN = re.compile(rb"\(((?:[^\\()]|\\.)*)\)\s*Tj")

# Matches exactly the three escape sequences listed above and captures the
# escaped byte. Any other backslash sequence is deliberately left alone so
# it survives verbatim in the extracted text.
_TJ_ESCAPE_PATTERN = re.compile(rb"\\([\\()])")


def _extract_text_from_synthetic_pdf(file_path: str) -> str:
    """Pull every ``(text) Tj`` payload out of a fixture PDF in order.

    The file is scanned as raw bytes with ``_TJ_PATTERN``; each payload has
    its ``\\\\``, ``\\(`` and ``\\)`` escapes resolved and is decoded as
    UTF-8, falling back to Latin-1 when the bytes are not valid UTF-8.

    We do not try to handle arbitrary PDFs because the fake is only ever
    invoked against fixtures we generate ourselves.

    Args:
        file_path: Path of the PDF to scan.

    Returns:
        The decoded payloads joined with newlines, in file order. An empty
        string if the file cannot be read or contains no matching operator.
    """
    try:
        data = Path(file_path).read_bytes()
    except OSError as exc:
        logger.warning("[fake-docling] could not read %s: %s", file_path, exc)
        return ""

    lines: list[str] = []
    for match in _TJ_PATTERN.finditer(data):
        # Resolve escapes in a single left-to-right pass. A chain of
        # ``bytes.replace`` calls needs a sentinel byte to protect ``\\``
        # from the later ``\(`` / ``\)`` passes, and any payload that
        # already contains that sentinel byte would be corrupted.
        text = _TJ_ESCAPE_PATTERN.sub(rb"\1", match.group(1))
        try:
            lines.append(text.decode("utf-8"))
        except UnicodeDecodeError:
            # Latin-1 maps every byte to a code point, so this cannot fail.
            lines.append(text.decode("latin-1"))
    return "\n".join(lines)
|
|
|
|
|
|
async def fake_process_document(
    self,
    file_path: str,
    filename: str | None = None,
) -> dict[str, Any]:
    """Stand in for ``DoclingService.process_document`` during E2E runs.

    The text comes from ``_extract_text_from_synthetic_pdf`` and is wrapped
    in a small Markdown document headed by ``filename`` (or the basename of
    ``file_path`` when no filename is given). The result is meant to match
    the dict shape of the production method, so callers
    (``app/etl_pipeline/parsers/docling.py``) can keep reading
    ``result["content"]`` without changes.

    Args:
        self: The service instance; unused by the fake.
        file_path: Path of the document to "process".
        filename: Optional display name used for the Markdown heading.

    Returns:
        A dict with ``content`` and ``full_text`` (the same Markdown
        string), ``service_used``, ``status`` and ``processing_notes``.
    """
    body = _extract_text_from_synthetic_pdf(file_path)
    title = filename if filename else Path(file_path).name

    if not body:
        # Placeholder body so the indexing pipeline does not error out on
        # an unexpected payload. A failing canary assertion is a much
        # clearer failure mode than a hard parser exception.
        body = "(empty docling fake — no text-show operators found)"

    markdown = f"# {title}\n\n{body}\n"

    logger.info(
        "[fake-docling] returning %d chars for %s", len(markdown), title
    )

    result: dict[str, Any] = {
        "content": markdown,
        "full_text": markdown,
        "service_used": "docling-fake",
        "status": "success",
        "processing_notes": "e2e fake DoclingService — no real PDF parsing",
    }
    return result
|
|
|
|
|
|
def install(patches: list[Any]) -> None:
    """Swap ``DoclingService.process_document`` for the fake, class-wide.

    The replacement is applied with ``unittest.mock.patch`` on the class
    attribute instead of on individual call sites: consumers obtain the
    method through ``create_docling_service()`` → ``DoclingService()`` →
    instance method dispatch, so patching the class covers the one consumer
    known today (``app/etl_pipeline/parsers/docling.py``) and any added
    later.

    Args:
        patches: List that receives the started patcher object; this
            function only appends to it.

    Raises:
        RuntimeError: If the target cannot be patched. This fails loud
            rather than warning, because a silent passthrough means real
            Docling + ``HF_HUB_OFFLINE=1`` = 4 minutes of CI hang per test.
    """
    from unittest.mock import patch as _patch

    target = "app.services.docling_service.DoclingService.process_document"
    try:
        patcher = _patch(target, fake_process_document)
        patcher.start()
        patches.append(patcher)
        logger.info("[fake-docling] patched %s", target)
    except (ModuleNotFoundError, AttributeError) as exc:
        # Point the maintainer at this file, since the binding site moved.
        message = (
            f"Could not patch Docling binding {target!r}: {exc!s}. "
            "Update surfsense_backend/tests/e2e/fakes/docling_service.py "
            "to point at the new binding site."
        )
        raise RuntimeError(message) from exc
|