mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-04-28 18:36:23 +02:00
feat: implement background processing for binary attachments in Obsidian plugin
- Added a new Celery task for indexing non-markdown attachments.
- Enhanced the Obsidian plugin schema to support binary attachments.
- Updated routes to enqueue binary attachments for background processing.
- Improved metadata handling for binary attachments during indexing.
- Added tests for binary attachment processing and validation.
This commit is contained in:
parent
5047527b47
commit
6ac5256431
11 changed files with 519 additions and 68 deletions
|
|
@ -469,3 +469,51 @@ class TestWireContractSmoke:
|
|||
assert stats_resp.vault_id == vault_id
|
||||
assert stats_resp.files_synced == 0
|
||||
assert stats_resp.last_sync_at is None
|
||||
|
||||
async def test_sync_queues_binary_attachments(
    self, db_session: AsyncSession, db_user: User, db_search_space: SearchSpace
):
    """Check that a binary attachment in a sync batch is queued, not upserted.

    Connects a fresh vault, then syncs a batch holding an ``ok.md`` note and
    an ``image.png`` payload flagged as binary, with ``upsert_note`` and
    ``_queue_obsidian_attachment`` patched in the routes module. Both items
    must count as indexed, the attachment must report status ``"queued"``,
    ``upsert_note`` must be awaited exactly once, and the queue helper must
    be called exactly once.
    """
    vault_id = str(uuid.uuid4())
    connect_request = ConnectRequest(
        vault_id=vault_id,
        vault_name="Queue Vault",
        search_space_id=db_search_space.id,
        vault_fingerprint="fp-" + uuid.uuid4().hex,
    )
    await obsidian_connect(connect_request, user=db_user, session=db_session)

    # Minimal stand-in for the object returned by the patched upsert_note.
    fake_doc = type("FakeDoc", (), {"id": 12345})()

    # Start from a regular note payload and turn it into a binary attachment.
    attachment = _make_note_payload(vault_id, "image.png", "hash-bin")
    binary_overrides = {
        "extension": "png",
        "is_binary": True,
        "binary_base64": "aGVsbG8=",
        "content": "",
    }
    for field_name, field_value in binary_overrides.items():
        setattr(attachment, field_name, field_value)

    upsert_patch = patch(
        "app.routes.obsidian_plugin_routes.upsert_note",
        new=AsyncMock(return_value=fake_doc),
    )
    queue_patch = patch(
        "app.routes.obsidian_plugin_routes._queue_obsidian_attachment"
    )
    with upsert_patch as upsert_mock, queue_patch as queue_mock:
        batch = SyncBatchRequest(
            vault_id=vault_id,
            notes=[
                _make_note_payload(vault_id, "ok.md", "hash-ok"),
                attachment,
            ],
        )
        sync_resp = await obsidian_sync(batch, user=db_user, session=db_session)

    assert sync_resp.indexed == 2
    assert sync_resp.failed == 0
    status_by_path = {item.path: item.status for item in sync_resp.items}
    assert status_by_path == {"ok.md": "ok", "image.png": "queued"}
    assert upsert_mock.await_count == 1
    queue_mock.assert_called_once()
|
||||
|
|
|
|||
|
|
@ -1,9 +1,18 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
from datetime import UTC, datetime
|
||||
|
||||
import pytest
|
||||
|
||||
from app.etl_pipeline.etl_document import EtlResult
|
||||
from app.schemas.obsidian_plugin import HeadingRef, NotePayload
|
||||
from app.services.obsidian_plugin_indexer import _build_metadata
|
||||
from app.services.obsidian_plugin_indexer import (
|
||||
_build_metadata,
|
||||
_extract_binary_attachment_markdown,
|
||||
_is_image_attachment,
|
||||
_require_extracted_attachment_content,
|
||||
)
|
||||
|
||||
|
||||
def test_build_metadata_serializes_headings_to_plain_json() -> None:
|
||||
|
|
@ -23,3 +32,130 @@ def test_build_metadata_serializes_headings_to_plain_json() -> None:
|
|||
metadata = _build_metadata(payload, vault_name="My Vault", connector_id=42)
|
||||
|
||||
assert metadata["headings"] == [{"heading": "Notes", "level": 1}]
|
||||
|
||||
|
||||
def test_build_metadata_marks_binary_attachment_fields() -> None:
    """Check that ``_build_metadata`` exposes ``is_binary`` and ``mime_type``.

    Builds a PNG attachment payload flagged as binary and asserts that both
    values appear unchanged in the resulting metadata dict.
    """
    timestamp = datetime.now(UTC)
    attachment_fields = {
        "vault_id": "vault-1",
        "path": "assets/diagram.png",
        "name": "diagram",
        "extension": "png",
        "content": "",
        "content_hash": "abc123",
        "mtime": timestamp,
        "ctime": timestamp,
        "is_binary": True,
        "mime_type": "image/png",
    }
    attachment = NotePayload(**attachment_fields)

    built = _build_metadata(attachment, vault_name="My Vault", connector_id=42)

    assert built["is_binary"] is True
    assert built["mime_type"] == "image/png"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_extract_binary_attachment_markdown_handles_invalid_base64() -> None:
    """Check the extractor's result for a malformed base64 payload.

    The extractor is expected to return empty content and to record
    ``"invalid_binary_payload"`` under ``attachment_extraction_status``.
    """
    timestamp = datetime.now(UTC)
    broken_fields = {
        "vault_id": "vault-1",
        "path": "assets/diagram.png",
        "name": "diagram",
        "extension": "png",
        "content": "",
        "content_hash": "abc123",
        "mtime": timestamp,
        "ctime": timestamp,
        "is_binary": True,
        "binary_base64": "not-valid-base64!!",
    }
    broken_attachment = NotePayload(**broken_fields)

    extraction = await _extract_binary_attachment_markdown(
        broken_attachment, vision_llm=None
    )
    content, metadata = extraction

    assert content == ""
    assert metadata["attachment_extraction_status"] == "invalid_binary_payload"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_extract_binary_attachment_markdown_uses_etl(monkeypatch) -> None:
    """Check that a valid binary payload is routed through ``_run_etl_extract``.

    ``_run_etl_extract`` is replaced with a stub that verifies the arguments
    it receives and returns a canned ``EtlResult``. The extractor must surface
    the stub's markdown and record the status and ETL service in metadata.
    """
    timestamp = datetime.now(UTC)
    encoded_pdf = base64.b64encode(b"%PDF-1.7 fake bytes").decode("ascii")
    pdf_attachment = NotePayload(
        vault_id="vault-1",
        path="assets/spec.pdf",
        name="spec",
        extension="pdf",
        content="",
        content_hash="abc123",
        mtime=timestamp,
        ctime=timestamp,
        is_binary=True,
        binary_base64=encoded_pdf,
    )

    async def _stub_etl_extract(  # noqa: ANN001
        *, file_path, filename, vision_llm
    ):
        # The extractor must pass the attachment's file name, a non-empty
        # file path, and the vision_llm value it was called with.
        assert filename == "spec.pdf"
        assert file_path
        assert vision_llm is None
        canned_result = EtlResult(
            markdown_content="Extracted content",
            etl_service="TEST_ETL",
            content_type="document",
        )
        return canned_result

    monkeypatch.setattr(
        "app.services.obsidian_plugin_indexer._run_etl_extract",
        _stub_etl_extract,
    )

    extraction = await _extract_binary_attachment_markdown(
        pdf_attachment, vision_llm=None
    )
    content, metadata = extraction

    assert content == "Extracted content"
    assert metadata["attachment_extraction_status"] == "ok"
    assert metadata["attachment_etl_service"] == "TEST_ETL"
|
||||
|
||||
|
||||
def test_is_image_attachment_detects_image_extensions() -> None:
    """Check ``_is_image_attachment`` on a PNG payload and a PDF payload.

    The PNG payload uses an upper-case extension (``"PNG"``) and must be
    reported as an image; the PDF payload must not.
    """
    timestamp = datetime.now(UTC)
    # Fields that are the same for both payloads under test.
    shared_fields = {
        "vault_id": "vault-1",
        "content": "",
        "content_hash": "abc123",
        "mtime": timestamp,
        "ctime": timestamp,
        "is_binary": True,
    }
    png_attachment = NotePayload(
        path="assets/screenshot.PNG",
        name="screenshot",
        extension="PNG",
        **shared_fields,
    )
    pdf_attachment = NotePayload(
        path="assets/spec.pdf",
        name="spec",
        extension="pdf",
        **shared_fields,
    )

    assert _is_image_attachment(png_attachment) is True
    assert _is_image_attachment(pdf_attachment) is False
|
||||
|
||||
|
||||
def test_require_extracted_attachment_content_rejects_empty_content() -> None:
    """Check that blank extracted content raises a path-specific ``RuntimeError``.

    Passing whitespace-only content together with a failed extraction status
    must raise an error whose message names the attachment path.
    """
    expected_message = "Attachment extraction failed for assets/img.png"
    failed_meta = {"attachment_extraction_status": "etl_failed"}

    with pytest.raises(RuntimeError, match=expected_message):
        _require_extracted_attachment_content(
            content=" ",
            etl_meta=failed_meta,
            path="assets/img.png",
        )
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue