"""Tests for the Obsidian plugin indexer helpers and NotePayload validation."""

from __future__ import annotations

import base64
from datetime import UTC, datetime

import pytest
from pydantic import ValidationError

from app.etl_pipeline.etl_document import EtlResult
from app.schemas.obsidian_plugin import HeadingRef, NotePayload
from app.services.obsidian_plugin_indexer import (
    _build_metadata,
    _extract_binary_attachment_markdown,
    _is_image_attachment,
    _require_extracted_attachment_content,
)

# Base64 of the 8-byte PNG signature; used wherever a test needs a payload
# that decodes cleanly.
_FAKE_PNG_B64 = base64.b64encode(b"\x89PNG\r\n\x1a\n").decode("ascii")


def _payload(path: str, **fields: object) -> NotePayload:
    """Build a ``NotePayload`` for *path* with the shared boilerplate filled in.

    ``name`` and ``extension`` are taken from the last path segment (text
    before and after its final dot). ``mtime`` and ``ctime`` share a single
    timezone-aware "now". Anything passed in *fields* overrides or extends
    these defaults.
    """
    stem, _, suffix = path.rsplit("/", 1)[-1].rpartition(".")
    stamp = datetime.now(UTC)
    defaults: dict[str, object] = {
        "vault_id": "vault-1",
        "path": path,
        "name": stem,
        "extension": suffix,
        "content": "",
        "content_hash": "abc123",
        "mtime": stamp,
        "ctime": stamp,
    }
    return NotePayload(**(defaults | fields))


def _attachment(
    path: str,
    *,
    mime_type: str = "image/png",
    binary_base64: str = _FAKE_PNG_B64,
) -> NotePayload:
    """Build a binary ``NotePayload`` (``is_binary=True``) for *path*."""
    return _payload(
        path,
        is_binary=True,
        binary_base64=binary_base64,
        mime_type=mime_type,
    )


def test_build_metadata_serializes_headings_to_plain_json() -> None:
    """Heading models come out of ``_build_metadata`` as plain dicts."""
    note = _payload(
        "notes.md",
        content="# Notes",
        headings=[HeadingRef(heading="Notes", level=1)],
    )

    built = _build_metadata(note, vault_name="My Vault", connector_id=42)

    assert built["headings"] == [{"heading": "Notes", "level": 1}]


def test_build_metadata_marks_binary_attachment_fields() -> None:
    """Binary payloads carry ``is_binary`` and ``mime_type`` into the metadata."""
    attachment = _attachment("assets/diagram.png")

    built = _build_metadata(attachment, vault_name="My Vault", connector_id=42)

    assert built["is_binary"] is True
    assert built["mime_type"] == "image/png"


@pytest.mark.asyncio
async def test_extract_binary_attachment_markdown_handles_invalid_base64() -> None:
    """Undecodable base64 yields empty content and an explicit status."""
    broken = _attachment("assets/diagram.png", binary_base64="not-valid-base64!!")

    text, meta = await _extract_binary_attachment_markdown(broken, vision_llm=None)

    assert text == ""
    assert meta["attachment_extraction_status"] == "invalid_binary_payload"


@pytest.mark.asyncio
async def test_extract_binary_attachment_markdown_uses_etl(monkeypatch) -> None:
    """Extraction goes through ``_run_etl_extract`` and surfaces its result."""
    pdf = _attachment(
        "assets/spec.pdf",
        binary_base64=base64.b64encode(b"%PDF-1.7 fake bytes").decode("ascii"),
        mime_type="application/pdf",
    )

    async def _stub_etl(  # noqa: ANN001
        *, file_path, filename, vision_llm
    ):
        # Check the arguments the indexer passes before returning a canned result.
        assert filename == "spec.pdf"
        assert file_path
        assert vision_llm is None
        return EtlResult(
            markdown_content="Extracted content",
            etl_service="TEST_ETL",
            content_type="document",
        )

    monkeypatch.setattr(
        "app.services.obsidian_plugin_indexer._run_etl_extract",
        _stub_etl,
    )

    text, meta = await _extract_binary_attachment_markdown(pdf, vision_llm=None)

    assert text == "Extracted content"
    assert meta["attachment_extraction_status"] == "ok"
    assert meta["attachment_etl_service"] == "TEST_ETL"


def test_is_image_attachment_detects_image_extensions() -> None:
    """An upper-case ``PNG`` attachment is an image; a PDF attachment is not."""
    picture = _attachment("assets/screenshot.PNG")
    document = _attachment("assets/spec.pdf", mime_type="application/pdf")

    assert _is_image_attachment(picture) is True
    assert _is_image_attachment(document) is False


def test_note_payload_rejects_binary_without_base64() -> None:
    """``is_binary=True`` without ``binary_base64`` fails validation."""
    with pytest.raises(ValidationError, match="binary_base64 is required"):
        _payload("assets/diagram.png", is_binary=True, mime_type="image/png")


def test_note_payload_rejects_binary_without_mime_type() -> None:
    """``is_binary=True`` without ``mime_type`` fails validation."""
    with pytest.raises(ValidationError, match="mime_type is required"):
        _payload("assets/diagram.png", is_binary=True, binary_base64=_FAKE_PNG_B64)


def test_note_payload_rejects_markdown_with_binary_fields() -> None:
    """A non-binary note that still carries ``binary_base64`` fails validation."""
    expected = "binary_base64 and mime_type must be omitted when is_binary is False"
    with pytest.raises(ValidationError, match=expected):
        _payload("notes.md", content="# Notes", binary_base64=_FAKE_PNG_B64)


def test_require_extracted_attachment_content_rejects_empty_content() -> None:
    """Whitespace-only extracted content raises ``RuntimeError`` naming the path."""
    with pytest.raises(
        RuntimeError, match="Attachment extraction failed for assets/img.png"
    ):
        _require_extracted_attachment_content(
            content=" ",
            etl_meta={"attachment_extraction_status": "etl_failed"},
            path="assets/img.png",
        )