test(backend): add E2E harness foundation (entrypoints, middleware, LLM/embedding fakes)

This commit is contained in:
Anish Sarkar 2026-05-06 17:17:42 +05:30
parent c720866a67
commit 58ba95fad2
9 changed files with 550 additions and 0 deletions

View file

@ -0,0 +1,125 @@
"""E2E Celery worker entrypoint.
Same sys.modules hijack + LLM/embedding patches as run_backend.py,
applied before importing the production celery_app. Celery workers
run in a separate Python interpreter, so the patches must be applied
here too they would NOT carry over from the FastAPI process.
Production is unaffected: celery_worker.py at the repo root is the
production entrypoint and never imports this file.
Usage:
cd surfsense_backend
uv run python tests/e2e/run_celery.py
"""
from __future__ import annotations
import logging
import os
import sys
_THIS_DIR = os.path.dirname(os.path.abspath(__file__))
_BACKEND_ROOT = os.path.abspath(os.path.join(_THIS_DIR, "..", ".."))
if _BACKEND_ROOT not in sys.path:
sys.path.insert(0, _BACKEND_ROOT)
# ---------------------------------------------------------------------------
# 1) Hijack sys.modules BEFORE production celery imports anything.
# ---------------------------------------------------------------------------
# Any later `import composio` (direct or transitive) now resolves to the
# fake module instead of the real SDK.
import tests.e2e.fakes.composio_module as _fake_composio  # noqa: E402

sys.modules["composio"] = _fake_composio

# ---------------------------------------------------------------------------
# 2) Logging + dotenv.
# ---------------------------------------------------------------------------
# Load .env before the production app reads its settings at import time.
from dotenv import load_dotenv  # noqa: E402

load_dotenv()

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger("surfsense.e2e.celery")

# Loud banner so a misconfigured deployment running this entrypoint is
# immediately visible in the worker logs.
logger.warning(
    "*** SURFSENSE E2E CELERY WORKER — fake Composio + LLM + embeddings, "
    "this MUST NOT be reachable in production. ***"
)
# ---------------------------------------------------------------------------
# 3) Import the production celery_app. All task modules load here.
# ---------------------------------------------------------------------------
# This import must come AFTER the sys.modules hijack above: importing the
# app pulls in every registered task module (and their `composio` imports).
from app.celery_app import celery_app  # noqa: E402

# ---------------------------------------------------------------------------
# 4) Patch LLM + embedding bindings inside the worker process.
# ---------------------------------------------------------------------------
from unittest.mock import patch  # noqa: E402

from tests.e2e.fakes import embeddings as _fake_embeddings  # noqa: E402
from tests.e2e.fakes.llm import fake_get_user_long_context_llm  # noqa: E402

# Started patcher objects are collected here; keeping references alive
# keeps the patches active for the lifetime of the worker process.
_active_patches: list = []
def _patch_llm_bindings() -> None:
    """Swap the fake LLM factory into every module that imported it by name.

    Each module that did ``from ... import get_user_long_context_llm`` holds
    its own binding, so each import site is patched individually. Best-effort:
    a site that fails to patch is logged and skipped, never fatal.
    """
    import_sites = (
        "app.services.llm_service.get_user_long_context_llm",
        "app.tasks.connector_indexers.google_drive_indexer.get_user_long_context_llm",
        "app.tasks.connector_indexers.google_gmail_indexer.get_user_long_context_llm",
        "app.tasks.connector_indexers.local_folder_indexer.get_user_long_context_llm",
        "app.tasks.document_processors.file_processors.get_user_long_context_llm",
    )
    for site in import_sites:
        try:
            patcher = patch(site, fake_get_user_long_context_llm)
            patcher.start()
        except (ModuleNotFoundError, AttributeError) as exc:
            logger.warning(
                "[fake-llm] could not patch %s in celery worker: %s.",
                site,
                exc,
            )
        else:
            _active_patches.append(patcher)
            logger.info("[fake-llm] patched %s in celery worker", site)
# Apply the fake LLM and embedding bindings at import time, before the
# worker starts consuming tasks.
_patch_llm_bindings()
_fake_embeddings.install(_active_patches)
# ---------------------------------------------------------------------------
# 5) Start the worker.
# ---------------------------------------------------------------------------
def _main() -> None:
    """Run the Celery worker in-process with E2E-friendly options.

    Consumes the default queue plus its ``.connectors`` sibling, mirroring
    production, so Drive indexing tasks are picked up. Gossip and mingle
    are disabled since this is a single-worker test setup.
    """
    default_queue = os.getenv("CELERY_TASK_DEFAULT_QUEUE", "surfsense")
    worker_argv = [
        "worker",
        "--loglevel=info",
        f"--queues={default_queue},{default_queue}.connectors",
        "--concurrency=2",
        "--without-gossip",
        "--without-mingle",
    ]
    celery_app.worker_main(argv=worker_argv)


if __name__ == "__main__":
    _main()