From 8536bac29a5eb4f22586dd4a2d69cc6d35ef9eb2 Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Wed, 6 May 2026 17:58:58 +0530
Subject: [PATCH] test(backend): enhance Drive file filtering and add unit tests for _drive_list_files

---
Reviewer notes (this region is discarded by `git am`):
- The fake Drive listing now filters its fixture by the query string. Only three
  clauses are recognised: `trashed = false`, `mimeType != '<value>'` and
  `mimeType = '<value>'`. For each mimeType anchor, the first single-quoted
  value that follows it is used.
- journey.spec.ts asserts the canary token through the editor-content endpoint
  (`source_markdown`) instead of `Document.content`, and additionally checks
  `chunk_count > 0`.
- `getEditorContent` throws on any non-OK response and is typed as returning
  `Promise<EditorContent>`; the JSON payload is cast, not validated.

 .../tests/e2e/fakes/composio_module.py        | 43 ++++++++++++++++++-
 .../tests/e2e/fakes/fixtures/drive_files.json | 15 +++++++
 .../unit/e2e_fakes/test_drive_list_files.py   | 38 ++++++++++++++++
 .../connectors/composio/drive/journey.spec.ts | 38 +++++++---------
 surfsense_web/tests/helpers/api/documents.ts  | 29 +++++++++++++
 5 files changed, 139 insertions(+), 24 deletions(-)
 create mode 100644 surfsense_backend/tests/unit/e2e_fakes/test_drive_list_files.py

diff --git a/surfsense_backend/tests/e2e/fakes/composio_module.py b/surfsense_backend/tests/e2e/fakes/composio_module.py
index 6e9561a0f..87e1968ba 100644
--- a/surfsense_backend/tests/e2e/fakes/composio_module.py
+++ b/surfsense_backend/tests/e2e/fakes/composio_module.py
@@ -320,15 +320,54 @@ def _drive_list_files(args: dict[str, Any]) -> dict[str, Any]:
     except IndexError:
         folder_id = "root"
 
-    files = _DRIVE_FIXTURE.get(folder_id, [])
+    files = _filter_drive_files_for_query(q, _DRIVE_FIXTURE.get(folder_id, []))
     return {
         "data": {
-            "files": list(files),
+            "files": files,
             "nextPageToken": None,
         }
     }
 
 
+def _extract_quoted_value(q: str, anchor: str) -> str | None:
+    anchor_idx = q.find(anchor)
+    if anchor_idx == -1:
+        return None
+
+    after_anchor = q[anchor_idx + len(anchor) :]
+    first_quote_idx = after_anchor.find("'")
+    if first_quote_idx == -1:
+        return None
+
+    after_first_quote = after_anchor[first_quote_idx + 1 :]
+    second_quote_idx = after_first_quote.find("'")
+    if second_quote_idx == -1:
+        return None
+
+    return after_first_quote[:second_quote_idx]
+
+
+def _filter_drive_files_for_query(q: str, files: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    filtered = list(files)
+
+    if "trashed = false" in q:
+        filtered = [entry for entry in filtered if entry.get("trashed") is not True]
+
+    excluded_mime_type = _extract_quoted_value(q, "mimeType !=")
+    if excluded_mime_type:
+        filtered = [
+            entry for entry in filtered if entry.get("mimeType") != excluded_mime_type
+        ]
+
+    included_mime_type = _extract_quoted_value(q, "mimeType =")
+    if included_mime_type:
+        filtered = [
+            entry for entry in filtered if entry.get("mimeType") == included_mime_type
+        ]
+
+    return filtered
+
+
 def _drive_download_file(args: dict[str, Any]) -> dict[str, Any]:
     """Mimic GOOGLEDRIVE_DOWNLOAD_FILE.
 
diff --git a/surfsense_backend/tests/e2e/fakes/fixtures/drive_files.json b/surfsense_backend/tests/e2e/fakes/fixtures/drive_files.json
index 9d010a2e9..1c5c38838 100644
--- a/surfsense_backend/tests/e2e/fakes/fixtures/drive_files.json
+++ b/surfsense_backend/tests/e2e/fakes/fixtures/drive_files.json
@@ -34,6 +34,21 @@
       "mimeType": "text/csv",
       "modifiedTime": "2025-01-25T13:45:00.000Z",
       "createdTime": "2025-01-25T13:45:00.000Z"
+    },
+    {
+      "id": "fake-shortcut-canary",
+      "name": "Shortcut to Canary",
+      "mimeType": "application/vnd.google-apps.shortcut",
+      "modifiedTime": "2025-02-10T12:00:00.000Z",
+      "createdTime": "2025-02-10T12:00:00.000Z"
+    },
+    {
+      "id": "fake-file-trashed",
+      "name": "trashed-e2e-note.txt",
+      "mimeType": "text/plain",
+      "modifiedTime": "2025-02-11T09:00:00.000Z",
+      "createdTime": "2025-02-11T09:00:00.000Z",
+      "trashed": true
     }
   ],
   "fake-folder-projects": [
diff --git a/surfsense_backend/tests/unit/e2e_fakes/test_drive_list_files.py b/surfsense_backend/tests/unit/e2e_fakes/test_drive_list_files.py
new file mode 100644
index 000000000..a111869a4
--- /dev/null
+++ b/surfsense_backend/tests/unit/e2e_fakes/test_drive_list_files.py
@@ -0,0 +1,38 @@
+from tests.e2e.fakes.composio_module import _drive_list_files
+
+
+def _ids(result: dict) -> set[str]:
+    return {item["id"] for item in result["data"]["files"]}
+
+
+def test_drive_list_files_filters_shortcuts_and_trashed_items():
+    result = _drive_list_files(
+        {
+            "q": (
+                "'root' in parents and trashed = false and "
+                "mimeType != 'application/vnd.google-apps.shortcut'"
+            )
+        }
+    )
+
+    ids = _ids(result)
+
+    assert "fake-file-canary" in ids
+    assert "fake-shortcut-canary" not in ids
+    assert "fake-file-trashed" not in ids
+
+
+def test_drive_list_files_filters_to_exact_mime_type():
+    result = _drive_list_files(
+        {"q": "'root' in parents and trashed = false and mimeType = 'text/plain'"}
+    )
+
+    assert _ids(result) == {"fake-file-canary"}
+
+
+def test_drive_list_files_uses_requested_parent_folder():
+    result = _drive_list_files(
+        {"q": "'fake-folder-projects' in parents and trashed = false"}
+    )
+
+    assert _ids(result) == {"fake-file-roadmap"}
diff --git a/surfsense_web/tests/connectors/composio/drive/journey.spec.ts b/surfsense_web/tests/connectors/composio/drive/journey.spec.ts
index e97532f6a..8db271e43 100644
--- a/surfsense_web/tests/connectors/composio/drive/journey.spec.ts
+++ b/surfsense_web/tests/connectors/composio/drive/journey.spec.ts
@@ -1,6 +1,6 @@
 import { composioDriveTest as test, expect } from "../../../fixtures";
 import { listConnectors, triggerIndex, updateConnectorConfig } from "../../../helpers/api/connectors";
-import { listDocuments } from "../../../helpers/api/documents";
+import { getEditorContent, listDocuments } from "../../../helpers/api/documents";
 import { CANARY_TOKENS, FAKE_DRIVE_FILES } from "../../../helpers/canary";
 import { openConnectorPopup } from "../../../helpers/ui/connector-popup";
 import {
@@ -9,19 +9,11 @@ import {
 } from "../../../helpers/waits/indexing";
 
 /**
- * Composio Drive user journey.
+ * Proves the Drive wiring from OAuth fixture -> selection persistence ->
+ * indexing -> stored source_markdown -> editor-content retrieval.
  *
- * User expectation:
- *   "I connect Google Drive, choose the files/folders I care about,
- *    wait for indexing, and then my Drive content is available in SurfSense."
- *
- * The OAuth connection is handled by the composioDriveConnector fixture so
- * this test can focus on the user-visible expectation. The spec still touches
- * the browser (dashboard + connector dialog) and then uses API helpers for
- * selection/indexing to keep the expensive pipeline assertion deterministic.
- *
- * If this passes, the seam from Composio connection -> selection persistence ->
- * Celery indexing -> document storage is wired correctly.
+ * UI-driven file selection, chat retrieval, and LLM/embedding quality are
+ * covered by later phases or narrower tests.
  */
 test.describe("Composio Drive journey", () => {
 	test(
@@ -33,11 +25,9 @@ test.describe("Composio Drive journey", () => {
 				waitUntil: "domcontentloaded",
 			});
 			await openConnectorPopup(page);
-			await expect(
-				page
-					.getByRole("dialog", { name: "Manage Connectors" })
-					.getByText("Search your Drive files via Composio")
-			).toBeVisible();
+			const connectorDialog = page.getByRole("dialog", { name: "Manage Connectors" });
+			await expect(connectorDialog).toBeVisible();
+			await expect(connectorDialog.getByRole("button", { name: "Manage" })).toBeVisible();
 
 			await updateConnectorConfig(request, apiToken, composioDriveConnector.id, {
 				...composioDriveConnector.config,
@@ -89,13 +79,17 @@ test.describe("Composio Drive journey", () => {
 
 			const canaryDoc = docs.find((d) => d.title === FAKE_DRIVE_FILES.canary.name);
 			expect(canaryDoc, "canary document must exist after indexing").toBeDefined();
+			if (!canaryDoc) throw new Error("unreachable: canaryDoc asserted defined above");
 
-			const content = canaryDoc!.content ?? "";
+			// content holds the LLM summary; the raw file body lives in source_markdown.
+			// editor-content is the same endpoint the UI hits when opening a document.
+			const editor = await getEditorContent(request, apiToken, searchSpace.id, canaryDoc.id);
 			expect(
-				content,
-				`canary token ${CANARY_TOKENS.driveCanaryFile} should appear in Document.content; ` +
-					`got first 200 chars: ${content.slice(0, 200)}`
+				editor.source_markdown,
+				`canary token ${CANARY_TOKENS.driveCanaryFile} should appear in editor source_markdown; ` +
+					`got first 200 chars: ${editor.source_markdown.slice(0, 200)}`
 			).toContain(CANARY_TOKENS.driveCanaryFile);
+			expect(editor.chunk_count).toBeGreaterThan(0);
 
 			const refreshedConnectors = await listConnectors(request, apiToken, searchSpace.id);
 			const refreshed = refreshedConnectors.find((c) => c.id === composioDriveConnector.id);
diff --git a/surfsense_web/tests/helpers/api/documents.ts b/surfsense_web/tests/helpers/api/documents.ts
index 2276327bf..051728935 100644
--- a/surfsense_web/tests/helpers/api/documents.ts
+++ b/surfsense_web/tests/helpers/api/documents.ts
@@ -38,3 +38,32 @@ export function isDocumentReady(doc: DocumentRow): boolean {
 		typeof doc.status === "string" ? doc.status : doc.status?.state;
 	return state === "ready" || state === "READY";
 }
+
+export type EditorContent = {
+	document_id: number;
+	title: string;
+	document_type: string;
+	source_markdown: string;
+	content_size_bytes: number;
+	chunk_count: number;
+	truncated: boolean;
+};
+
+// Same endpoint the UI hits when a user opens a document in the dashboard.
+export async function getEditorContent(
+	request: APIRequestContext,
+	token: string,
+	searchSpaceId: number,
+	documentId: number
+): Promise<EditorContent> {
+	const response = await request.get(
+		`${BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/documents/${documentId}/editor-content`,
+		{ headers: authHeaders(token) }
+	);
+	if (!response.ok()) {
+		throw new Error(
+			`getEditorContent failed (${response.status()}): ${await response.text()}`
+		);
+	}
+	return (await response.json()) as EditorContent;
+}