test(e2e): add canary PDFs and reproducer for connector docling coverage

2026-05-12 09:12:40 +02:00 · 2026-05-09 05:02:04 +05:30 · 2026-05-09 05:02:04 +05:30 · 523563b948
commit 523563b948
parent 03ce8c1b81
5 changed files with 84 additions and 0 deletions
--- a/surfsense_backend/tests/e2e/fakes/fixtures/binary/composio-drive-canary.pdf
+++ b/surfsense_backend/tests/e2e/fakes/fixtures/binary/composio-drive-canary.pdf
--- a/surfsense_backend/tests/e2e/fakes/fixtures/binary/drive-canary.pdf
+++ b/surfsense_backend/tests/e2e/fakes/fixtures/binary/drive-canary.pdf
--- a/surfsense_backend/tests/e2e/fakes/fixtures/binary/dropbox-canary.pdf
+++ b/surfsense_backend/tests/e2e/fakes/fixtures/binary/dropbox-canary.pdf
--- a/surfsense_backend/tests/e2e/fakes/fixtures/binary/generate_canary_pdfs.py
+++ b/surfsense_backend/tests/e2e/fakes/fixtures/binary/generate_canary_pdfs.py
@ -0,0 +1,84 @@
+"""Generate deterministic one-page PDFs for connector E2E fixtures."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+# Maps each output filename to the three text lines drawn on its single page.
+# main() writes one PDF per entry, passing the tuple to _build_pdf() unchanged.
+# The third line of every entry is a distinct SURFSENSE_E2E_CANARY_TOKEN_*
+# string (presumably what the connector E2E tests look for in the extracted
+# text -- confirm against the tests that consume these fixtures).
+PDF_FIXTURES = {
+    "drive-canary.pdf": (
+        "Native Drive PDF Canary",
+        "This one-page text-layer PDF proves native Drive Docling coverage.",
+        "SURFSENSE_E2E_CANARY_TOKEN_DRIVE_PDF_001",
+    ),
+    "onedrive-canary.pdf": (
+        "OneDrive PDF Canary",
+        "This one-page text-layer PDF proves OneDrive Docling coverage.",
+        "SURFSENSE_E2E_CANARY_TOKEN_ONEDRIVE_PDF_001",
+    ),
+    "dropbox-canary.pdf": (
+        "Dropbox PDF Canary",
+        "This one-page text-layer PDF proves Dropbox Docling coverage.",
+        "SURFSENSE_E2E_CANARY_TOKEN_DROPBOX_PDF_001",
+    ),
+    "composio-drive-canary.pdf": (
+        "Composio Drive PDF Canary",
+        "This one-page text-layer PDF proves Composio Drive Docling coverage.",
+        "SURFSENSE_E2E_CANARY_TOKEN_COMPOSIO_DRIVE_PDF_001",
+    ),
+}
+
+
+def _escape_pdf_text(text: str) -> str:
+    """Return *text* escaped for use inside a PDF literal string ``( ... )``.
+
+    Backslashes and both parentheses are each prefixed with a backslash;
+    every other character is passed through unchanged.
+    """
+    # A single translate() pass maps each source character once, so the
+    # backslashes it inserts are never themselves re-escaped.
+    escapes = str.maketrans({"\\": "\\\\", "(": "\\(", ")": "\\)"})
+    return text.translate(escapes)
+
+
+def _build_pdf(lines: tuple[str, str, str]) -> bytes:
+    """Assemble a minimal single-page PDF 1.4 file that shows *lines* as text.
+
+    The file holds five indirect objects: the catalog, the page tree, one
+    612x792 page, a Helvetica Type1 font and the page's content stream. Text
+    is set in 12 pt starting at (72, 760); each following line is moved 18
+    units further down.
+
+    Args:
+        lines: The text lines to show, top to bottom.
+
+    Returns:
+        The complete PDF file as bytes, including the cross-reference table
+        and trailer. The same input always yields the same bytes.
+
+    Raises:
+        UnicodeEncodeError: If a line contains non-ASCII characters.
+    """
+    # Pair every show operator with a preceding line move, then drop the
+    # leading move: the first line is already positioned by "72 760 Td".
+    draw_ops: list[str] = []
+    for text in lines:
+        draw_ops += ["0 -18 Td", f"({_escape_pdf_text(text)}) Tj"]
+    draw_ops = draw_ops[1:]
+    content = "\n".join(
+        ("BT", "/F1 12 Tf", "72 760 Td", *draw_ops, "ET")
+    ).encode("ascii")
+
+    # Object bodies in object-number order (1..5); the "N 0 R" references
+    # below rely on this ordering.
+    object_bodies = [
+        b"<< /Type /Catalog /Pages 2 0 R >>",
+        b"<< /Type /Pages /Kids [3 0 R] /Count 1 >>",
+        b"<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] "
+        b"/Resources << /Font << /F1 4 0 R >> >> /Contents 5 0 R >>",
+        b"<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>",
+        b"<< /Length %d >>\nstream\n%b\nendstream" % (len(content), content),
+    ]
+
+    document = bytearray(b"%PDF-1.4\n")
+    object_offsets = []
+    for number, body in enumerate(object_bodies, start=1):
+        # The xref table needs the byte offset at which each object starts.
+        object_offsets.append(len(document))
+        document += b"%d 0 obj\n%b\nendobj\n" % (number, body)
+
+    # Entry 0 is the free-list head, so the table has one more entry than
+    # there are objects. Every entry is exactly 20 bytes long.
+    entry_count = len(object_bodies) + 1
+    startxref = len(document)
+    document += b"xref\n0 %d\n" % entry_count
+    document += b"0000000000 65535 f \n"
+    document += b"".join(
+        b"%010d 00000 n \n" % offset for offset in object_offsets
+    )
+    document += b"trailer\n<< /Size %d /Root 1 0 R >>\n" % entry_count
+    document += b"startxref\n%d\n" % startxref
+    document += b"%%EOF\n"
+    return bytes(document)
+
+
+def main() -> None:
+    """Write every PDF described in PDF_FIXTURES next to this script.
+
+    Files with the same names in the script's directory are overwritten.
+    """
+    here = Path(__file__).parent
+    for name, page_lines in PDF_FIXTURES.items():
+        pdf_bytes = _build_pdf(page_lines)
+        (here / name).write_bytes(pdf_bytes)
+
+
+# Only regenerate the fixture files when run as a script; importing this
+# module writes nothing.
+if __name__ == "__main__":
+    main()
--- a/surfsense_backend/tests/e2e/fakes/fixtures/binary/onedrive-canary.pdf
+++ b/surfsense_backend/tests/e2e/fakes/fixtures/binary/onedrive-canary.pdf