From 523563b94800cd5748334eaa48f40255f3e54582 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Sat, 9 May 2026 05:02:04 +0530 Subject: [PATCH] test(e2e): add canary PDFs and reproducer for connector docling coverage --- .../fixtures/binary/composio-drive-canary.pdf | Bin 0 -> 748 bytes .../fakes/fixtures/binary/drive-canary.pdf | Bin 0 -> 735 bytes .../fakes/fixtures/binary/dropbox-canary.pdf | Bin 0 -> 727 bytes .../fixtures/binary/generate_canary_pdfs.py | 84 ++++++++++++++++++ .../fakes/fixtures/binary/onedrive-canary.pdf | Bin 0 -> 730 bytes 5 files changed, 84 insertions(+) create mode 100644 surfsense_backend/tests/e2e/fakes/fixtures/binary/composio-drive-canary.pdf create mode 100644 surfsense_backend/tests/e2e/fakes/fixtures/binary/drive-canary.pdf create mode 100644 surfsense_backend/tests/e2e/fakes/fixtures/binary/dropbox-canary.pdf create mode 100644 surfsense_backend/tests/e2e/fakes/fixtures/binary/generate_canary_pdfs.py create mode 100644 surfsense_backend/tests/e2e/fakes/fixtures/binary/onedrive-canary.pdf diff --git a/surfsense_backend/tests/e2e/fakes/fixtures/binary/composio-drive-canary.pdf b/surfsense_backend/tests/e2e/fakes/fixtures/binary/composio-drive-canary.pdf new file mode 100644 index 0000000000000000000000000000000000000000..e4ed9f001149e98ed54a75e7be9239fb52326e1f GIT binary patch literal 748 zcmZWn!A|2a5WV{==2B_*P!gw200|+{G|?iKrb+_R3auj3PK!~9E8DGFzJAAUSxT{T zaAw|n^JYA~={WE6vmxUUf_i>sS62|!yA?u^3M*Q*gkUO`Xdog$3D?&QySW3p|F48S zF1V9(15X17|4S-r-C5vAM3?zabY(OWbYCa9$GH?EwS}jU@ClFL{4yfF5{+8x85wd; zb43h);V^U+sgC=oY-dT$KzxXP*mkI|)xAbgt)H!L+cnQ27zu;>%r9&=Xyr^C_25)L zRDNT(v@Z~a0|zixqj+PZnguxrvRyCOIhjvFsORiQs@_)0NCjgpHwZL+NJS^~?kCi* zEQCJ4@VNR#8c-efS2TRExV8OO>`;5gN~;YG;6!VzX07a&Fr%a<*4c?<^-$(jHmR~E zi?SxYncS3rn!32lCQVx0PmAidXvXF3W7d$XCJedfXSJK$p{v8wZ#6E+Yr?|+2a-4* z#IS(R*d_QZbf2;K;#-V!nx#8OxM)b*coG}%qnOn~wy6EGR`MMop~XL(1j(b@fp^ftyh5>7};z`0j0f0Wn A>i_@% literal 0 HcmV?d00001 diff --git a/surfsense_backend/tests/e2e/fakes/fixtures/binary/dropbox-canary.pdf b/surfsense_backend/tests/e2e/fakes/fixtures/binary/dropbox-canary.pdf new file mode 100644 index 0000000000000000000000000000000000000000..14ebc02d8b5487c25fbf08d179965931ed65435c GIT binary patch literal 727 zcmZWn(N4lZ5PZL{*cTFg!1hX8At59JZA?@kZA~=9GzTnLQ+wok81U=eJ3s+1eQ4)q zW@oqCm<;14zwsFd7v$`Nb-UnXyETF{5L%RS1r=a_`^D2Z=w3bg7!a* ya~hjDCp^!m#k;^-`~=fV6eTLVmuc~h5Yi4vX;$j6kF^#`+nj;#vqmEt$Lt4i48$t{ literal 0 HcmV?d00001 diff --git a/surfsense_backend/tests/e2e/fakes/fixtures/binary/generate_canary_pdfs.py b/surfsense_backend/tests/e2e/fakes/fixtures/binary/generate_canary_pdfs.py new file mode 100644 index 000000000..9fb336058 --- /dev/null +++ b/surfsense_backend/tests/e2e/fakes/fixtures/binary/generate_canary_pdfs.py @@ -0,0 +1,84 @@ +"""Generate deterministic one-page PDFs for connector E2E fixtures.""" + +from __future__ import annotations + +from pathlib import Path + +PDF_FIXTURES = { + "drive-canary.pdf": ( + "Native Drive PDF Canary", + "This one-page text-layer PDF proves native Drive Docling coverage.", + "SURFSENSE_E2E_CANARY_TOKEN_DRIVE_PDF_001", + ), + "onedrive-canary.pdf": ( + "OneDrive PDF Canary", + "This one-page text-layer PDF proves OneDrive Docling coverage.", + "SURFSENSE_E2E_CANARY_TOKEN_ONEDRIVE_PDF_001", + ), + "dropbox-canary.pdf": ( + "Dropbox PDF Canary", + "This one-page text-layer PDF proves Dropbox Docling coverage.", + "SURFSENSE_E2E_CANARY_TOKEN_DROPBOX_PDF_001", + ), + "composio-drive-canary.pdf": ( + "Composio Drive PDF Canary", + "This one-page text-layer PDF proves Composio Drive Docling coverage.", + "SURFSENSE_E2E_CANARY_TOKEN_COMPOSIO_DRIVE_PDF_001", + ), +} + + +def _escape_pdf_text(text: str) -> str: + return text.replace("\\", "\\\\").replace("(", "\\(").replace(")", "\\)") + + +def _build_pdf(lines: tuple[str, str, str]) -> bytes: + text_ops = ["BT", "/F1 12 Tf", "72 760 Td"] + for index, line in enumerate(lines): + if index: + text_ops.append("0 -18 Td") + text_ops.append(f"({_escape_pdf_text(line)}) Tj") + text_ops.append("ET") + stream = "\n".join(text_ops).encode("ascii") + + objects = [ + b"<< /Type /Catalog /Pages 2 0 R >>", + b"<< /Type /Pages /Kids [3 0 R] /Count 1 >>", + b"<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] " + b"/Resources << /Font << /F1 4 0 R >> >> /Contents 5 0 R >>", + b"<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>", + b"<< /Length " + + str(len(stream)).encode("ascii") + + b" >>\nstream\n" + + stream + + b"\nendstream", + ] + + pdf = bytearray(b"%PDF-1.4\n") + offsets = [0] + for obj_number, obj in enumerate(objects, start=1): + offsets.append(len(pdf)) + pdf.extend(f"{obj_number} 0 obj\n".encode("ascii")) + pdf.extend(obj) + pdf.extend(b"\nendobj\n") + + xref_offset = len(pdf) + pdf.extend(f"xref\n0 {len(objects) + 1}\n".encode("ascii")) + pdf.extend(b"0000000000 65535 f \n") + for offset in offsets[1:]: + pdf.extend(f"{offset:010d} 00000 n \n".encode("ascii")) + pdf.extend( + f"trailer\n<< /Size {len(objects) + 1} /Root 1 0 R >>\n" + f"startxref\n{xref_offset}\n%%EOF\n".encode("ascii") + ) + return bytes(pdf) + + +def main() -> None: + out_dir = Path(__file__).parent + for filename, lines in PDF_FIXTURES.items(): + (out_dir / filename).write_bytes(_build_pdf(lines)) + + +if __name__ == "__main__": + main() diff --git a/surfsense_backend/tests/e2e/fakes/fixtures/binary/onedrive-canary.pdf b/surfsense_backend/tests/e2e/fakes/fixtures/binary/onedrive-canary.pdf new file mode 100644 index 0000000000000000000000000000000000000000..ed0636b31c613ed780b57fee703f424b246dfd29 GIT binary patch literal 730 zcmZWnVNb#^5dGd?abHOE18yB`U}8)Jx@c4|+cYu8EENu%nJwuG0e`*Mf{NPg!}j{# zyLY#1-Hp;VKj|_KA?TMkc6J6qv0WkrLuqBL7ZBXZ1vU^7A%pXChHB=B?)$f3#|y5j zSp!cUM}H<4y;2tVfzaLj7rHVAiMp#3+~TZ~1HFN#koXCYpm!RPUxtldnJEQwNmES< z|H7f`DpDQyP+4Wkt$}!te#LgI-_^ZGkgr~>Z`(D`AsEO8cQ=<`W5)}VZ6gMtMq2Z~+HS&zaeArv!qG*LJ*)f$234nwJ=*&ajj#zJWGzGoD# zRRdaKdr4yliyPan)$RTJ*_LXfGNR-_$PnN^7|~!#W)wGh(tNO7+++M zWih!H<8m?a8;gLBH4u<-jp633kw=J11^g1-gzgT?)S z80R!OcMf<_mv-+D*5L=3HL|MF_|?p-4}^r)Kx?;E`;~00G}h;gyFJ!wiAl;n0le76 A>i_@% literal 0 HcmV?d00001