SurfSense/surfsense_backend/tests/e2e/fakes/docling_service.py
2026-05-12 04:00:04 +05:30

141 lines
5.2 KiB
Python

"""Stub DoclingService.process_document for E2E.
The real ``DoclingService.process_document`` calls
``DocumentConverter.convert(file_path)`` which lazily downloads the
``docling-project/docling-layout-heron`` model from Hugging Face Hub.
The hermetic E2E container sets ``HF_HUB_OFFLINE=1`` (see
``docker/docker-compose.e2e.yml``), so that download fails with
``LocalEntryNotFoundError`` and the indexing Celery task retries until
the Playwright test hits its ~4-minute step timeout. In CI that is the
difference between the suite finishing and the 30-minute job timeout
killing the run before any report can upload.
Stubbing ``process_document`` bypasses ``DocumentConverter.convert()``
entirely. ``DoclingService.__init__`` is intentionally left untouched
because constructing ``DocumentConverter(...)`` is cheap and offline —
it is only ``.convert()`` that triggers the offline-model download.
Every canary PDF under ``tests/e2e/fakes/fixtures/binary/`` is produced
by ``generate_canary_pdfs.py`` and embeds its canary token as plain
``(text) Tj`` PDF text operators. Extracting those operators gives us
the canary string back, which is what the Playwright assertions look
for in the resulting Document row.
"""
from __future__ import annotations
import logging
import re
from pathlib import Path
from typing import Any
logger = logging.getLogger(__name__)
# Matches the `(escaped text) Tj` text-show operator emitted by
# generate_canary_pdfs.py. Inside the parens, the escape rules are:
# \\ -> backslash
# \( -> literal (
# \) -> literal )
# The character class [^\\()] consumes any non-escape byte; \\. consumes
# an escape sequence. Sufficient for our synthetic fixtures.
_TJ_PATTERN = re.compile(rb"\(((?:[^\\()]|\\.)*)\)\s*Tj")
def _extract_text_from_synthetic_pdf(file_path: str) -> str:
"""Pull every ``(text) Tj`` payload out of a fixture PDF in order.
Returns an empty string if the file cannot be read. We do not try to
handle arbitrary PDFs because the fake is only ever invoked against
fixtures we generate ourselves.
"""
try:
data = Path(file_path).read_bytes()
except OSError as exc:
logger.warning("[fake-docling] could not read %s: %s", file_path, exc)
return ""
lines: list[str] = []
for match in _TJ_PATTERN.finditer(data):
raw = match.group(1)
# Order-sensitive unescape via sentinel: protect `\\` first so
# the subsequent `\(` / `\)` passes do not corrupt it.
text = (
raw.replace(rb"\\", b"\x00")
.replace(rb"\(", b"(")
.replace(rb"\)", b")")
.replace(b"\x00", b"\\")
)
try:
lines.append(text.decode("utf-8"))
except UnicodeDecodeError:
lines.append(text.decode("latin-1"))
return "\n".join(lines)
async def fake_process_document(
self,
file_path: str,
filename: str | None = None,
) -> dict[str, Any]:
"""Drop-in replacement for ``DoclingService.process_document``.
Returns the same dict shape as the production method so callers
(``app/etl_pipeline/parsers/docling.py``) can keep reading
``result["content"]`` without changes.
"""
extracted = _extract_text_from_synthetic_pdf(file_path)
display_name = filename or Path(file_path).name
if extracted:
content = f"# {display_name}\n\n{extracted}\n"
else:
# Empty fallback so the indexing pipeline does not error out on
# an unexpected payload. A failing canary assertion is a much
# clearer failure mode than a hard parser exception.
content = (
f"# {display_name}\n\n(empty docling fake — no text-show operators found)\n"
)
logger.info(
"[fake-docling] returning %d chars for %s",
len(content),
display_name,
)
return {
"content": content,
"full_text": content,
"service_used": "docling-fake",
"status": "success",
"processing_notes": "e2e fake DoclingService — no real PDF parsing",
}
def install(patches: list[Any]) -> None:
"""Patch ``DoclingService.process_document`` at the class level.
Patching the class method (rather than each call site) is correct
here because every consumer goes through
``create_docling_service()`` → ``DoclingService()`` → instance method
dispatch, so the descriptor protocol picks up our replacement. There
is exactly one such consumer today
(``app/etl_pipeline/parsers/docling.py``), but patching the class is
future-proof.
Fails loud rather than warning, because a silent passthrough means
real Docling + ``HF_HUB_OFFLINE=1`` = 4 minutes of CI hang per test.
"""
from unittest.mock import patch as _patch
target = "app.services.docling_service.DoclingService.process_document"
try:
p = _patch(target, fake_process_document)
p.start()
patches.append(p)
logger.info("[fake-docling] patched %s", target)
except (ModuleNotFoundError, AttributeError) as exc:
raise RuntimeError(
f"Could not patch Docling binding {target!r}: {exc!s}. "
f"Update surfsense_backend/tests/e2e/fakes/docling_service.py "
f"to point at the new binding site."
) from exc