SurfSense/surfsense_backend/tests/unit/test_obsidian_plugin_indexer.py
Anish Sarkar 6ac5256431 feat: implement background processing for binary attachments in Obsidian plugin
- Added a new Celery task for indexing non-markdown attachments.
- Enhanced the Obsidian plugin schema to support binary attachments.
- Updated routes to enqueue binary attachments for background processing.
- Improved metadata handling for binary attachments during indexing.
- Added tests for binary attachment processing and validation.
2026-04-22 23:00:34 +05:30

161 lines
4.4 KiB
Python

from __future__ import annotations
import base64
from datetime import UTC, datetime
import pytest
from app.etl_pipeline.etl_document import EtlResult
from app.schemas.obsidian_plugin import HeadingRef, NotePayload
from app.services.obsidian_plugin_indexer import (
_build_metadata,
_extract_binary_attachment_markdown,
_is_image_attachment,
_require_extracted_attachment_content,
)
def test_build_metadata_serializes_headings_to_plain_json() -> None:
    """Check that note headings appear in the metadata as plain dicts.

    A markdown note carrying one level-1 heading is passed through
    ``_build_metadata``; the ``"headings"`` entry of the result must equal
    a list holding a single ``{"heading": ..., "level": ...}`` dictionary.
    """
    timestamp = datetime.now(UTC)
    note_fields = {
        "vault_id": "vault-1",
        "path": "notes.md",
        "name": "notes",
        "extension": "md",
        "content": "# Notes",
        "headings": [HeadingRef(heading="Notes", level=1)],
        "content_hash": "abc123",
        "mtime": timestamp,
        "ctime": timestamp,
    }
    note = NotePayload(**note_fields)

    result = _build_metadata(note, vault_name="My Vault", connector_id=42)

    expected_headings = [{"heading": "Notes", "level": 1}]
    assert result["headings"] == expected_headings
def test_build_metadata_marks_binary_attachment_fields() -> None:
    """Check that binary-attachment fields are carried into the metadata.

    A PNG payload flagged ``is_binary=True`` with a ``mime_type`` is passed
    through ``_build_metadata``; both values must show up unchanged in the
    resulting metadata mapping.
    """
    timestamp = datetime.now(UTC)
    attachment = NotePayload(
        vault_id="vault-1",
        path="assets/diagram.png",
        name="diagram",
        extension="png",
        content="",
        content_hash="abc123",
        mtime=timestamp,
        ctime=timestamp,
        is_binary=True,
        mime_type="image/png",
    )

    result = _build_metadata(
        attachment,
        vault_name="My Vault",
        connector_id=42,
    )

    # Identity check on purpose: the flag must be the boolean True itself.
    assert result["is_binary"] is True
    assert result["mime_type"] == "image/png"
@pytest.mark.asyncio
async def test_extract_binary_attachment_markdown_handles_invalid_base64() -> None:
    """Check the extraction result for a payload with undecodable base64.

    The attachment's ``binary_base64`` is not valid base64. The extractor
    is expected to return empty content and to report the status
    ``"invalid_binary_payload"`` in its metadata.
    """
    timestamp = datetime.now(UTC)
    broken_attachment = NotePayload(
        vault_id="vault-1",
        path="assets/diagram.png",
        name="diagram",
        extension="png",
        content="",
        content_hash="abc123",
        mtime=timestamp,
        ctime=timestamp,
        is_binary=True,
        binary_base64="not-valid-base64!!",
    )

    extraction = await _extract_binary_attachment_markdown(
        broken_attachment,
        vision_llm=None,
    )
    text, extraction_meta = extraction

    assert text == ""
    assert extraction_meta["attachment_extraction_status"] == "invalid_binary_payload"
@pytest.mark.asyncio
async def test_extract_binary_attachment_markdown_uses_etl(monkeypatch) -> None:
    """Check that a valid binary attachment is routed through the ETL step.

    ``_run_etl_extract`` in the indexer module is replaced with a stub that
    verifies the arguments it receives and returns a fixed ``EtlResult``.
    The extractor must then surface the stub's markdown as content and
    record an ``"ok"`` status plus the ETL service name in the metadata.
    """
    timestamp = datetime.now(UTC)
    encoded_pdf = base64.b64encode(b"%PDF-1.7 fake bytes").decode("ascii")
    pdf_attachment = NotePayload(
        vault_id="vault-1",
        path="assets/spec.pdf",
        name="spec",
        extension="pdf",
        content="",
        content_hash="abc123",
        mtime=timestamp,
        ctime=timestamp,
        is_binary=True,
        binary_base64=encoded_pdf,
    )

    async def _stub_etl_extract(*, file_path, filename, vision_llm):  # noqa: ANN001
        # The stub doubles as a spy: it checks what the extractor passes in.
        assert filename == "spec.pdf"
        assert file_path
        assert vision_llm is None
        canned_result = EtlResult(
            markdown_content="Extracted content",
            etl_service="TEST_ETL",
            content_type="document",
        )
        return canned_result

    monkeypatch.setattr(
        "app.services.obsidian_plugin_indexer._run_etl_extract",
        _stub_etl_extract,
    )

    extraction = await _extract_binary_attachment_markdown(
        pdf_attachment,
        vision_llm=None,
    )
    text, extraction_meta = extraction

    assert text == "Extracted content"
    assert extraction_meta["attachment_extraction_status"] == "ok"
    assert extraction_meta["attachment_etl_service"] == "TEST_ETL"
def test_is_image_attachment_detects_image_extensions() -> None:
    """Check image detection for an upper-case PNG and a PDF payload.

    ``_is_image_attachment`` must return ``True`` for the payload whose
    extension is ``"PNG"`` and ``False`` for the one whose extension is
    ``"pdf"``.
    """
    timestamp = datetime.now(UTC)
    # Fields that are the same for both payloads under test.
    shared_fields = {
        "vault_id": "vault-1",
        "content": "",
        "content_hash": "abc123",
        "mtime": timestamp,
        "ctime": timestamp,
        "is_binary": True,
    }

    screenshot = NotePayload(
        path="assets/screenshot.PNG",
        name="screenshot",
        extension="PNG",
        **shared_fields,
    )
    spec_document = NotePayload(
        path="assets/spec.pdf",
        name="spec",
        extension="pdf",
        **shared_fields,
    )

    assert _is_image_attachment(screenshot) is True
    assert _is_image_attachment(spec_document) is False
def test_require_extracted_attachment_content_rejects_empty_content() -> None:
    """Check that whitespace-only extracted content is rejected.

    Calling ``_require_extracted_attachment_content`` with content made of
    a single space must raise ``RuntimeError`` whose message matches the
    failure text for the given attachment path.
    """
    # pytest.raises treats this as a regex searched against the message.
    expected_message = "Attachment extraction failed for assets/img.png"
    failed_meta = {"attachment_extraction_status": "etl_failed"}

    with pytest.raises(RuntimeError, match=expected_message):
        _require_extracted_attachment_content(
            content=" ",
            etl_meta=failed_meta,
            path="assets/img.png",
        )