mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-04-25 00:36:31 +02:00
- Added a new Celery task for indexing non-markdown attachments. - Enhanced the Obsidian plugin schema to support binary attachments. - Updated routes to enqueue binary attachments for background processing. - Improved metadata handling for binary attachments during indexing. - Added tests for binary attachment processing and validation.
161 lines
4.4 KiB
Python
161 lines
4.4 KiB
Python
from __future__ import annotations
|
|
|
|
import base64
|
|
from datetime import UTC, datetime
|
|
|
|
import pytest
|
|
|
|
from app.etl_pipeline.etl_document import EtlResult
|
|
from app.schemas.obsidian_plugin import HeadingRef, NotePayload
|
|
from app.services.obsidian_plugin_indexer import (
|
|
_build_metadata,
|
|
_extract_binary_attachment_markdown,
|
|
_is_image_attachment,
|
|
_require_extracted_attachment_content,
|
|
)
|
|
|
|
|
|
def test_build_metadata_serializes_headings_to_plain_json() -> None:
    """``_build_metadata`` exposes note headings as a list of plain dicts.

    A markdown note carrying one ``HeadingRef`` is passed through
    ``_build_metadata``; the ``"headings"`` entry of the result must compare
    equal to the equivalent list of ``{"heading", "level"}`` dictionaries.
    """
    timestamp = datetime.now(UTC)
    note = NotePayload(
        vault_id="vault-1",
        path="notes.md",
        name="notes",
        extension="md",
        content="# Notes",
        headings=[HeadingRef(heading="Notes", level=1)],
        content_hash="abc123",
        mtime=timestamp,
        ctime=timestamp,
    )

    built = _build_metadata(note, vault_name="My Vault", connector_id=42)

    expected_headings = [{"heading": "Notes", "level": 1}]
    assert built["headings"] == expected_headings
|
|
|
|
|
|
def test_build_metadata_marks_binary_attachment_fields() -> None:
    """``_build_metadata`` carries ``is_binary`` and ``mime_type`` through.

    The payload describes a PNG attachment flagged as binary; the resulting
    metadata must report ``is_binary`` as ``True`` and echo the MIME type.
    """
    timestamp = datetime.now(UTC)
    # Collect the fields first so the attachment description reads as data.
    attachment_fields = {
        "vault_id": "vault-1",
        "path": "assets/diagram.png",
        "name": "diagram",
        "extension": "png",
        "content": "",
        "content_hash": "abc123",
        "mtime": timestamp,
        "ctime": timestamp,
        "is_binary": True,
        "mime_type": "image/png",
    }
    attachment = NotePayload(**attachment_fields)

    built = _build_metadata(attachment, vault_name="My Vault", connector_id=42)

    assert built["is_binary"] is True
    assert built["mime_type"] == "image/png"
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_extract_binary_attachment_markdown_handles_invalid_base64() -> None:
    """A payload whose ``binary_base64`` is not valid base64 is reported, not raised.

    The extractor is expected to return empty content together with an
    ``attachment_extraction_status`` of ``"invalid_binary_payload"``.
    """
    timestamp = datetime.now(UTC)
    broken_attachment = NotePayload(
        vault_id="vault-1",
        path="assets/diagram.png",
        name="diagram",
        extension="png",
        content="",
        content_hash="abc123",
        mtime=timestamp,
        ctime=timestamp,
        is_binary=True,
        binary_base64="not-valid-base64!!",
    )

    extraction = await _extract_binary_attachment_markdown(
        broken_attachment, vision_llm=None
    )
    markdown, extraction_meta = extraction

    assert markdown == ""
    assert extraction_meta["attachment_extraction_status"] == "invalid_binary_payload"
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_extract_binary_attachment_markdown_uses_etl(monkeypatch) -> None:
    """A decodable attachment is routed through ``_run_etl_extract``.

    The indexer's ``_run_etl_extract`` is replaced with a stub that checks the
    keyword arguments it receives and returns a canned ``EtlResult``. The
    extractor must surface the stub's markdown and record an ``"ok"`` status
    plus the stub's ETL service name in the returned metadata.
    """
    timestamp = datetime.now(UTC)
    encoded_pdf = base64.b64encode(b"%PDF-1.7 fake bytes").decode("ascii")
    pdf_attachment = NotePayload(
        vault_id="vault-1",
        path="assets/spec.pdf",
        name="spec",
        extension="pdf",
        content="",
        content_hash="abc123",
        mtime=timestamp,
        ctime=timestamp,
        is_binary=True,
        binary_base64=encoded_pdf,
    )

    # Parameter names stay as-is: the stub is invoked with keyword arguments.
    async def _stub_etl_extract(*, file_path, filename, vision_llm):  # noqa: ANN001
        assert filename == "spec.pdf"
        assert file_path
        assert vision_llm is None
        canned = EtlResult(
            markdown_content="Extracted content",
            etl_service="TEST_ETL",
            content_type="document",
        )
        return canned

    monkeypatch.setattr(
        "app.services.obsidian_plugin_indexer._run_etl_extract",
        _stub_etl_extract,
    )

    extraction = await _extract_binary_attachment_markdown(
        pdf_attachment, vision_llm=None
    )
    markdown, extraction_meta = extraction

    assert markdown == "Extracted content"
    assert extraction_meta["attachment_extraction_status"] == "ok"
    assert extraction_meta["attachment_etl_service"] == "TEST_ETL"
|
|
|
|
|
|
def test_is_image_attachment_detects_image_extensions() -> None:
    """``_is_image_attachment`` accepts an upper-case ``PNG`` and rejects ``pdf``.

    Both payloads are binary attachments that differ only in path, name and
    extension, so a small local builder supplies the shared fields.
    """
    timestamp = datetime.now(UTC)

    def _binary_attachment(path: str, name: str, extension: str) -> NotePayload:
        # Everything except the three arguments is identical across cases.
        return NotePayload(
            vault_id="vault-1",
            path=path,
            name=name,
            extension=extension,
            content="",
            content_hash="abc123",
            mtime=timestamp,
            ctime=timestamp,
            is_binary=True,
        )

    screenshot = _binary_attachment("assets/screenshot.PNG", "screenshot", "PNG")
    spec_document = _binary_attachment("assets/spec.pdf", "spec", "pdf")

    assert _is_image_attachment(screenshot) is True
    assert _is_image_attachment(spec_document) is False
|
|
|
|
|
|
def test_require_extracted_attachment_content_rejects_empty_content() -> None:
    """Whitespace-only content makes the guard raise ``RuntimeError``.

    The error message is matched (as a regular expression, per
    ``pytest.raises``) against the failure text that names the attachment path.
    """
    expected_message = "Attachment extraction failed for assets/img.png"
    failed_meta = {"attachment_extraction_status": "etl_failed"}

    with pytest.raises(RuntimeError, match=expected_message):
        _require_extracted_attachment_content(
            content="   ",
            etl_meta=failed_meta,
            path="assets/img.png",
        )
|