mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-17 18:35:19 +02:00
chore: add fake DoclingService for E2E tests and integrate into runtime fakes
This commit is contained in:
parent
b247ff37df
commit
c052fc9304
3 changed files with 143 additions and 0 deletions
139
surfsense_backend/tests/e2e/fakes/docling_service.py
Normal file
139
surfsense_backend/tests/e2e/fakes/docling_service.py
Normal file
|
|
@ -0,0 +1,139 @@
|
|||
"""Stub DoclingService.process_document for E2E.
|
||||
|
||||
The real ``DoclingService.process_document`` calls
|
||||
``DocumentConverter.convert(file_path)`` which lazily downloads the
|
||||
``docling-project/docling-layout-heron`` model from Hugging Face Hub.
|
||||
The hermetic E2E container sets ``HF_HUB_OFFLINE=1`` (see
|
||||
``docker/docker-compose.e2e.yml``), so that download fails with
|
||||
``LocalEntryNotFoundError`` and the indexing Celery task retries until
|
||||
the Playwright test hits its ~4-minute step timeout. In CI that is the
|
||||
difference between the suite finishing and the 30-minute job timeout
|
||||
killing the run before any report can upload.
|
||||
|
||||
Stubbing ``process_document`` bypasses ``DocumentConverter.convert()``
|
||||
entirely. ``DoclingService.__init__`` is intentionally left untouched
|
||||
because constructing ``DocumentConverter(...)`` is cheap and offline —
|
||||
it is only ``.convert()`` that triggers the offline-model download.
|
||||
|
||||
Every canary PDF under ``tests/e2e/fakes/fixtures/binary/`` is produced
|
||||
by ``generate_canary_pdfs.py`` and embeds its canary token as plain
|
||||
``(text) Tj`` PDF text operators. Extracting those operators gives us
|
||||
the canary string back, which is what the Playwright assertions look
|
||||
for in the resulting Document row.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Matches the `(escaped text) Tj` text-show operator emitted by
|
||||
# generate_canary_pdfs.py. Inside the parens, the escape rules are:
|
||||
# \\ -> backslash
|
||||
# \( -> literal (
|
||||
# \) -> literal )
|
||||
# The character class [^\\()] consumes any non-escape byte; \\. consumes
|
||||
# an escape sequence. Sufficient for our synthetic fixtures.
|
||||
_TJ_PATTERN = re.compile(rb"\(((?:[^\\()]|\\.)*)\)\s*Tj")
|
||||
|
||||
|
||||
def _extract_text_from_synthetic_pdf(file_path: str) -> str:
|
||||
"""Pull every ``(text) Tj`` payload out of a fixture PDF in order.
|
||||
|
||||
Returns an empty string if the file cannot be read. We do not try to
|
||||
handle arbitrary PDFs because the fake is only ever invoked against
|
||||
fixtures we generate ourselves.
|
||||
"""
|
||||
try:
|
||||
data = Path(file_path).read_bytes()
|
||||
except OSError as exc:
|
||||
logger.warning("[fake-docling] could not read %s: %s", file_path, exc)
|
||||
return ""
|
||||
|
||||
lines: list[str] = []
|
||||
for match in _TJ_PATTERN.finditer(data):
|
||||
raw = match.group(1)
|
||||
# Order-sensitive unescape via sentinel: protect `\\` first so
|
||||
# the subsequent `\(` / `\)` passes do not corrupt it.
|
||||
text = (
|
||||
raw.replace(rb"\\", b"\x00")
|
||||
.replace(rb"\(", b"(")
|
||||
.replace(rb"\)", b")")
|
||||
.replace(b"\x00", b"\\")
|
||||
)
|
||||
try:
|
||||
lines.append(text.decode("utf-8"))
|
||||
except UnicodeDecodeError:
|
||||
lines.append(text.decode("latin-1"))
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
async def fake_process_document(
|
||||
self,
|
||||
file_path: str,
|
||||
filename: str | None = None,
|
||||
) -> dict[str, Any]:
|
||||
"""Drop-in replacement for ``DoclingService.process_document``.
|
||||
|
||||
Returns the same dict shape as the production method so callers
|
||||
(``app/etl_pipeline/parsers/docling.py``) can keep reading
|
||||
``result["content"]`` without changes.
|
||||
"""
|
||||
extracted = _extract_text_from_synthetic_pdf(file_path)
|
||||
display_name = filename or Path(file_path).name
|
||||
|
||||
if extracted:
|
||||
content = f"# {display_name}\n\n{extracted}\n"
|
||||
else:
|
||||
# Empty fallback so the indexing pipeline does not error out on
|
||||
# an unexpected payload. A failing canary assertion is a much
|
||||
# clearer failure mode than a hard parser exception.
|
||||
content = f"# {display_name}\n\n(empty docling fake — no text-show operators found)\n"
|
||||
|
||||
logger.info(
|
||||
"[fake-docling] returning %d chars for %s",
|
||||
len(content),
|
||||
display_name,
|
||||
)
|
||||
|
||||
return {
|
||||
"content": content,
|
||||
"full_text": content,
|
||||
"service_used": "docling-fake",
|
||||
"status": "success",
|
||||
"processing_notes": "e2e fake DoclingService — no real PDF parsing",
|
||||
}
|
||||
|
||||
|
||||
def install(patches: list[Any]) -> None:
|
||||
"""Patch ``DoclingService.process_document`` at the class level.
|
||||
|
||||
Patching the class method (rather than each call site) is correct
|
||||
here because every consumer goes through
|
||||
``create_docling_service()`` → ``DoclingService()`` → instance method
|
||||
dispatch, so the descriptor protocol picks up our replacement. There
|
||||
is exactly one such consumer today
|
||||
(``app/etl_pipeline/parsers/docling.py``), but patching the class is
|
||||
future-proof.
|
||||
|
||||
Fails loud rather than warning, because a silent passthrough means
|
||||
real Docling + ``HF_HUB_OFFLINE=1`` = 4 minutes of CI hang per test.
|
||||
"""
|
||||
from unittest.mock import patch as _patch
|
||||
|
||||
target = "app.services.docling_service.DoclingService.process_document"
|
||||
try:
|
||||
p = _patch(target, fake_process_document)
|
||||
p.start()
|
||||
patches.append(p)
|
||||
logger.info("[fake-docling] patched %s", target)
|
||||
except (ModuleNotFoundError, AttributeError) as exc:
|
||||
raise RuntimeError(
|
||||
f"Could not patch Docling binding {target!r}: {exc!s}. "
|
||||
f"Update surfsense_backend/tests/e2e/fakes/docling_service.py "
|
||||
f"to point at the new binding site."
|
||||
) from exc
|
||||
|
|
@ -209,6 +209,7 @@ def _install_runtime_fakes() -> None:
|
|||
clickup_module as _fake_clickup_module,
|
||||
confluence_indexer as _fake_confluence_indexer,
|
||||
confluence_oauth as _fake_confluence_oauth,
|
||||
docling_service as _fake_docling_service,
|
||||
dropbox_api as _fake_dropbox_api,
|
||||
embeddings as _fake_embeddings,
|
||||
jira_module as _fake_jira_module,
|
||||
|
|
@ -222,6 +223,7 @@ def _install_runtime_fakes() -> None:
|
|||
)
|
||||
|
||||
_fake_embeddings.install(_active_patches)
|
||||
_fake_docling_service.install(_active_patches)
|
||||
_fake_confluence_oauth.install(_active_patches)
|
||||
_fake_confluence_indexer.install(_active_patches)
|
||||
_fake_native_google.install(_active_patches)
|
||||
|
|
|
|||
|
|
@ -117,6 +117,7 @@ from tests.e2e.fakes import ( # noqa: E402
|
|||
clickup_module as _fake_clickup_module,
|
||||
confluence_indexer as _fake_confluence_indexer,
|
||||
confluence_oauth as _fake_confluence_oauth,
|
||||
docling_service as _fake_docling_service,
|
||||
dropbox_api as _fake_dropbox_api,
|
||||
embeddings as _fake_embeddings,
|
||||
jira_module as _fake_jira_module,
|
||||
|
|
@ -197,6 +198,7 @@ def _patch_llm_bindings() -> None:
|
|||
|
||||
_patch_llm_bindings()
|
||||
_fake_embeddings.install(_active_patches)
|
||||
_fake_docling_service.install(_active_patches)
|
||||
_fake_confluence_oauth.install(_active_patches)
|
||||
_fake_confluence_indexer.install(_active_patches)
|
||||
_fake_native_google.install(_active_patches)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue