mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-04-27 09:46:25 +02:00
feat: implement background processing for binary attachments in Obsidian plugin
- Added a new Celery task for indexing non-markdown attachments. - Enhanced the Obsidian plugin schema to support binary attachments. - Updated routes to enqueue binary attachments for background processing. - Improved metadata handling for binary attachments during indexing. - Added tests for binary attachment processing and validation.
This commit is contained in:
parent
5047527b47
commit
6ac5256431
11 changed files with 519 additions and 68 deletions
|
|
@ -1,9 +1,18 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
from datetime import UTC, datetime
|
||||
|
||||
import pytest
|
||||
|
||||
from app.etl_pipeline.etl_document import EtlResult
|
||||
from app.schemas.obsidian_plugin import HeadingRef, NotePayload
|
||||
from app.services.obsidian_plugin_indexer import _build_metadata
|
||||
from app.services.obsidian_plugin_indexer import (
|
||||
_build_metadata,
|
||||
_extract_binary_attachment_markdown,
|
||||
_is_image_attachment,
|
||||
_require_extracted_attachment_content,
|
||||
)
|
||||
|
||||
|
||||
def test_build_metadata_serializes_headings_to_plain_json() -> None:
|
||||
|
|
@ -23,3 +32,130 @@ def test_build_metadata_serializes_headings_to_plain_json() -> None:
|
|||
metadata = _build_metadata(payload, vault_name="My Vault", connector_id=42)
|
||||
|
||||
assert metadata["headings"] == [{"heading": "Notes", "level": 1}]
|
||||
|
||||
|
||||
def test_build_metadata_marks_binary_attachment_fields() -> None:
|
||||
now = datetime.now(UTC)
|
||||
payload = NotePayload(
|
||||
vault_id="vault-1",
|
||||
path="assets/diagram.png",
|
||||
name="diagram",
|
||||
extension="png",
|
||||
content="",
|
||||
content_hash="abc123",
|
||||
mtime=now,
|
||||
ctime=now,
|
||||
is_binary=True,
|
||||
mime_type="image/png",
|
||||
)
|
||||
|
||||
metadata = _build_metadata(payload, vault_name="My Vault", connector_id=42)
|
||||
|
||||
assert metadata["is_binary"] is True
|
||||
assert metadata["mime_type"] == "image/png"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_extract_binary_attachment_markdown_handles_invalid_base64() -> None:
|
||||
now = datetime.now(UTC)
|
||||
payload = NotePayload(
|
||||
vault_id="vault-1",
|
||||
path="assets/diagram.png",
|
||||
name="diagram",
|
||||
extension="png",
|
||||
content="",
|
||||
content_hash="abc123",
|
||||
mtime=now,
|
||||
ctime=now,
|
||||
is_binary=True,
|
||||
binary_base64="not-valid-base64!!",
|
||||
)
|
||||
|
||||
content, metadata = await _extract_binary_attachment_markdown(
|
||||
payload, vision_llm=None
|
||||
)
|
||||
|
||||
assert content == ""
|
||||
assert metadata["attachment_extraction_status"] == "invalid_binary_payload"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_extract_binary_attachment_markdown_uses_etl(monkeypatch) -> None:
|
||||
now = datetime.now(UTC)
|
||||
payload = NotePayload(
|
||||
vault_id="vault-1",
|
||||
path="assets/spec.pdf",
|
||||
name="spec",
|
||||
extension="pdf",
|
||||
content="",
|
||||
content_hash="abc123",
|
||||
mtime=now,
|
||||
ctime=now,
|
||||
is_binary=True,
|
||||
binary_base64=base64.b64encode(b"%PDF-1.7 fake bytes").decode("ascii"),
|
||||
)
|
||||
|
||||
async def _fake_run_etl_extract( # noqa: ANN001
|
||||
*, file_path, filename, vision_llm
|
||||
):
|
||||
assert filename == "spec.pdf"
|
||||
assert file_path
|
||||
assert vision_llm is None
|
||||
return EtlResult(
|
||||
markdown_content="Extracted content",
|
||||
etl_service="TEST_ETL",
|
||||
content_type="document",
|
||||
)
|
||||
|
||||
monkeypatch.setattr(
|
||||
"app.services.obsidian_plugin_indexer._run_etl_extract",
|
||||
_fake_run_etl_extract,
|
||||
)
|
||||
|
||||
content, metadata = await _extract_binary_attachment_markdown(
|
||||
payload, vision_llm=None
|
||||
)
|
||||
|
||||
assert content == "Extracted content"
|
||||
assert metadata["attachment_extraction_status"] == "ok"
|
||||
assert metadata["attachment_etl_service"] == "TEST_ETL"
|
||||
|
||||
|
||||
def test_is_image_attachment_detects_image_extensions() -> None:
|
||||
now = datetime.now(UTC)
|
||||
image_payload = NotePayload(
|
||||
vault_id="vault-1",
|
||||
path="assets/screenshot.PNG",
|
||||
name="screenshot",
|
||||
extension="PNG",
|
||||
content="",
|
||||
content_hash="abc123",
|
||||
mtime=now,
|
||||
ctime=now,
|
||||
is_binary=True,
|
||||
)
|
||||
pdf_payload = NotePayload(
|
||||
vault_id="vault-1",
|
||||
path="assets/spec.pdf",
|
||||
name="spec",
|
||||
extension="pdf",
|
||||
content="",
|
||||
content_hash="abc123",
|
||||
mtime=now,
|
||||
ctime=now,
|
||||
is_binary=True,
|
||||
)
|
||||
|
||||
assert _is_image_attachment(image_payload) is True
|
||||
assert _is_image_attachment(pdf_payload) is False
|
||||
|
||||
|
||||
def test_require_extracted_attachment_content_rejects_empty_content() -> None:
|
||||
with pytest.raises(
|
||||
RuntimeError, match="Attachment extraction failed for assets/img.png"
|
||||
):
|
||||
_require_extracted_attachment_content(
|
||||
content=" ",
|
||||
etl_meta={"attachment_extraction_status": "etl_failed"},
|
||||
path="assets/img.png",
|
||||
)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue