diff --git a/surfsense_backend/app/routes/obsidian_plugin_routes.py b/surfsense_backend/app/routes/obsidian_plugin_routes.py index bc8b2d88b..9d41be2fb 100644 --- a/surfsense_backend/app/routes/obsidian_plugin_routes.py +++ b/surfsense_backend/app/routes/obsidian_plugin_routes.py @@ -26,6 +26,8 @@ from app.db import ( get_async_session, ) from app.schemas.obsidian_plugin import ( + ALLOWED_ATTACHMENT_EXTENSIONS, + ATTACHMENT_MIME_TYPES, ConnectRequest, ConnectResponse, DeleteAck, @@ -465,6 +467,31 @@ async def obsidian_sync( for note in payload.notes: try: if note.is_binary: + ext = note.extension.lstrip(".").lower() + if ext not in ALLOWED_ATTACHMENT_EXTENSIONS: + failed += 1 + items.append( + SyncAckItem( + path=note.path, + status="error", + error=f"unsupported attachment extension: .{ext}", + ) + ) + continue + expected_mime = ATTACHMENT_MIME_TYPES[ext] + if note.mime_type != expected_mime: + failed += 1 + items.append( + SyncAckItem( + path=note.path, + status="error", + error=( + f"mime_type '{note.mime_type}' does not match " + f"extension .{ext}" + ), + ) + ) + continue _queue_obsidian_attachment( connector_id=connector.id, note_payload=note.model_dump(mode="json"), diff --git a/surfsense_backend/app/schemas/obsidian_plugin.py b/surfsense_backend/app/schemas/obsidian_plugin.py index 6231960af..7fdc2d932 100644 --- a/surfsense_backend/app/schemas/obsidian_plugin.py +++ b/surfsense_backend/app/schemas/obsidian_plugin.py @@ -10,11 +10,26 @@ from __future__ import annotations from datetime import datetime from typing import Any, Literal -from pydantic import BaseModel, ConfigDict, Field +from pydantic import BaseModel, ConfigDict, Field, model_validator _PLUGIN_MODEL_CONFIG = ConfigDict(extra="ignore") +# Source of truth for the attachment whitelist. Mirrors MIME_BY_EXTENSION in +# surfsense_obsidian/src/sync-engine.ts — keep in sync. +ATTACHMENT_MIME_TYPES: dict[str, str] = { + "pdf": "application/pdf", + "png": "image/png", + "jpg": "image/jpeg", + "jpeg": "image/jpeg", + "gif": "image/gif", + "webp": "image/webp", + "svg": "image/svg+xml", + "txt": "text/plain", +} +ALLOWED_ATTACHMENT_EXTENSIONS: frozenset[str] = frozenset(ATTACHMENT_MIME_TYPES) + + class _PluginBase(BaseModel): """Base schema carrying the shared forward-compatibility config.""" @@ -78,6 +93,19 @@ class NotePayload(_PluginBase): mtime: datetime ctime: datetime + @model_validator(mode="after") + def _enforce_binary_invariants(self) -> NotePayload: + if self.is_binary: + if not self.binary_base64: + raise ValueError("binary_base64 is required when is_binary is True") + if not self.mime_type: + raise ValueError("mime_type is required when is_binary is True") + elif self.binary_base64 is not None or self.mime_type is not None: + raise ValueError( + "binary_base64 and mime_type must be omitted when is_binary is False", + ) + return self + class SyncBatchRequest(_PluginBase): """Batch upsert; plugin sends 10-20 notes per request.""" diff --git a/surfsense_backend/app/services/obsidian_plugin_indexer.py b/surfsense_backend/app/services/obsidian_plugin_indexer.py index 2f487dd14..f024d36ed 100644 --- a/surfsense_backend/app/services/obsidian_plugin_indexer.py +++ b/surfsense_backend/app/services/obsidian_plugin_indexer.py @@ -119,8 +119,7 @@ def _build_metadata( } if payload.is_binary: meta["is_binary"] = True - if payload.mime_type: - meta["mime_type"] = payload.mime_type + meta["mime_type"] = payload.mime_type if extra: meta.update(extra) return meta @@ -154,9 +153,6 @@ def _build_document_string( async def _extract_binary_attachment_markdown( payload: NotePayload, *, vision_llm ) -> tuple[str, dict[str, Any]]: - if not payload.binary_base64: - return "", {"attachment_extraction_status": "missing_binary_payload"} - try: raw_bytes = base64.b64decode(payload.binary_base64, validate=True) except Exception: @@ -208,7 +204,7 @@ async def _run_etl_extract(*, file_path: str, filename: str, vision_llm): def _is_image_attachment(payload: NotePayload) -> bool: ext = payload.extension.lower().lstrip(".") - return ext in {"png", "jpg", "jpeg", "gif", "webp", "bmp", "tiff", "svg"} + return ext in {"png", "jpg", "jpeg", "gif", "webp", "svg"} async def _resolve_attachment_vision_llm( diff --git a/surfsense_backend/tests/integration/test_obsidian_plugin_routes.py b/surfsense_backend/tests/integration/test_obsidian_plugin_routes.py index 9093a9eaf..9d84afc12 100644 --- a/surfsense_backend/tests/integration/test_obsidian_plugin_routes.py +++ b/surfsense_backend/tests/integration/test_obsidian_plugin_routes.py @@ -490,6 +490,7 @@ class TestWireContractSmoke: binary_note.extension = "png" binary_note.is_binary = True binary_note.binary_base64 = "aGVsbG8=" + binary_note.mime_type = "image/png" binary_note.content = "" with ( @@ -517,3 +518,107 @@ class TestWireContractSmoke: assert statuses == {"ok.md": "ok", "image.png": "queued"} assert upsert_mock.await_count == 1 queue_mock.assert_called_once() + + async def test_sync_rejects_unsupported_attachment_extension( + self, db_session: AsyncSession, db_user: User, db_search_space: SearchSpace + ): + vault_id = str(uuid.uuid4()) + await obsidian_connect( + ConnectRequest( + vault_id=vault_id, + vault_name="Reject Vault", + search_space_id=db_search_space.id, + vault_fingerprint="fp-" + uuid.uuid4().hex, + ), + user=db_user, + session=db_session, + ) + + fake_doc = type("FakeDoc", (), {"id": 12345})() + bad_note = _make_note_payload(vault_id, "photo.heic", "hash-heic") + bad_note.extension = "heic" + bad_note.is_binary = True + bad_note.binary_base64 = "aGVsbG8=" + bad_note.mime_type = "image/heic" + bad_note.content = "" + + with ( + patch( + "app.routes.obsidian_plugin_routes.upsert_note", + new=AsyncMock(return_value=fake_doc), + ), + patch("app.routes.obsidian_plugin_routes._queue_obsidian_attachment") as queue_mock, + ): + sync_resp = await obsidian_sync( + SyncBatchRequest( + vault_id=vault_id, + notes=[ + _make_note_payload(vault_id, "ok.md", "hash-ok"), + bad_note, + ], + ), + user=db_user, + session=db_session, + ) + + assert sync_resp.indexed == 1 + assert sync_resp.failed == 1 + items_by_path = {it.path: it for it in sync_resp.items} + assert items_by_path["ok.md"].status == "ok" + assert items_by_path["photo.heic"].status == "error" + assert "unsupported attachment extension" in ( + items_by_path["photo.heic"].error or "" + ) + queue_mock.assert_not_called() + + async def test_sync_rejects_mime_extension_mismatch( + self, db_session: AsyncSession, db_user: User, db_search_space: SearchSpace + ): + vault_id = str(uuid.uuid4()) + await obsidian_connect( + ConnectRequest( + vault_id=vault_id, + vault_name="Mismatch Vault", + search_space_id=db_search_space.id, + vault_fingerprint="fp-" + uuid.uuid4().hex, + ), + user=db_user, + session=db_session, + ) + + fake_doc = type("FakeDoc", (), {"id": 12345})() + mismatched = _make_note_payload(vault_id, "image.png", "hash-png") + mismatched.extension = "png" + mismatched.is_binary = True + mismatched.binary_base64 = "aGVsbG8=" + mismatched.mime_type = "application/pdf" + mismatched.content = "" + + with ( + patch( + "app.routes.obsidian_plugin_routes.upsert_note", + new=AsyncMock(return_value=fake_doc), + ), + patch("app.routes.obsidian_plugin_routes._queue_obsidian_attachment") as queue_mock, + ): + sync_resp = await obsidian_sync( + SyncBatchRequest( + vault_id=vault_id, + notes=[ + _make_note_payload(vault_id, "ok.md", "hash-ok"), + mismatched, + ], + ), + user=db_user, + session=db_session, + ) + + assert sync_resp.indexed == 1 + assert sync_resp.failed == 1 + items_by_path = {it.path: it for it in sync_resp.items} + assert items_by_path["ok.md"].status == "ok" + assert items_by_path["image.png"].status == "error" + assert "does not match extension" in ( + items_by_path["image.png"].error or "" + ) + queue_mock.assert_not_called() diff --git a/surfsense_backend/tests/unit/test_obsidian_plugin_indexer.py b/surfsense_backend/tests/unit/test_obsidian_plugin_indexer.py index 1aecc208a..7ab3c52e0 100644 --- a/surfsense_backend/tests/unit/test_obsidian_plugin_indexer.py +++ b/surfsense_backend/tests/unit/test_obsidian_plugin_indexer.py @@ -4,6 +4,7 @@ import base64 from datetime import UTC, datetime import pytest +from pydantic import ValidationError from app.etl_pipeline.etl_document import EtlResult from app.schemas.obsidian_plugin import HeadingRef, NotePayload @@ -15,6 +16,9 @@ from app.services.obsidian_plugin_indexer import ( ) +_FAKE_PNG_B64 = base64.b64encode(b"\x89PNG\r\n\x1a\n").decode("ascii") + + def test_build_metadata_serializes_headings_to_plain_json() -> None: now = datetime.now(UTC) payload = NotePayload( @@ -46,6 +50,7 @@ def test_build_metadata_marks_binary_attachment_fields() -> None: mtime=now, ctime=now, is_binary=True, + binary_base64=_FAKE_PNG_B64, mime_type="image/png", ) @@ -69,6 +74,7 @@ async def test_extract_binary_attachment_markdown_handles_invalid_base64() -> No ctime=now, is_binary=True, binary_base64="not-valid-base64!!", + mime_type="image/png", ) content, metadata = await _extract_binary_attachment_markdown( @@ -93,6 +99,7 @@ async def test_extract_binary_attachment_markdown_uses_etl(monkeypatch) -> None: ctime=now, is_binary=True, binary_base64=base64.b64encode(b"%PDF-1.7 fake bytes").decode("ascii"), + mime_type="application/pdf", ) async def _fake_run_etl_extract( # noqa: ANN001 @@ -133,6 +140,8 @@ def test_is_image_attachment_detects_image_extensions() -> None: mtime=now, ctime=now, is_binary=True, + binary_base64=_FAKE_PNG_B64, + mime_type="image/png", ) pdf_payload = NotePayload( vault_id="vault-1", @@ -144,12 +153,67 @@ def test_is_image_attachment_detects_image_extensions() -> None: mtime=now, ctime=now, is_binary=True, + binary_base64=_FAKE_PNG_B64, + mime_type="application/pdf", ) assert _is_image_attachment(image_payload) is True assert _is_image_attachment(pdf_payload) is False +def test_note_payload_rejects_binary_without_base64() -> None: + now = datetime.now(UTC) + with pytest.raises(ValidationError, match="binary_base64 is required"): + NotePayload( + vault_id="vault-1", + path="assets/diagram.png", + name="diagram", + extension="png", + content="", + content_hash="abc123", + mtime=now, + ctime=now, + is_binary=True, + mime_type="image/png", + ) + + +def test_note_payload_rejects_binary_without_mime_type() -> None: + now = datetime.now(UTC) + with pytest.raises(ValidationError, match="mime_type is required"): + NotePayload( + vault_id="vault-1", + path="assets/diagram.png", + name="diagram", + extension="png", + content="", + content_hash="abc123", + mtime=now, + ctime=now, + is_binary=True, + binary_base64=_FAKE_PNG_B64, + ) + + +def test_note_payload_rejects_markdown_with_binary_fields() -> None: + now = datetime.now(UTC) + with pytest.raises( + ValidationError, + match="binary_base64 and mime_type must be omitted when is_binary is False", + ): + NotePayload( + vault_id="vault-1", + path="notes.md", + name="notes", + extension="md", + content="# Notes", + content_hash="abc123", + mtime=now, + ctime=now, + binary_base64=_FAKE_PNG_B64, + ) + + def test_require_extracted_attachment_content_rejects_empty_content() -> None: with pytest.raises( RuntimeError, match="Attachment extraction failed for assets/img.png" diff --git a/surfsense_obsidian/src/attachments-confirm-modal.ts b/surfsense_obsidian/src/attachments-confirm-modal.ts index 54a0a4afd..1a79fd2bd 100644 --- a/surfsense_obsidian/src/attachments-confirm-modal.ts +++ b/surfsense_obsidian/src/attachments-confirm-modal.ts @@ -16,10 +16,10 @@ export class AttachmentsConfirmModal extends Modal { this.contentEl.empty(); new Setting(this.contentEl).setDesc( - "Syncing attachments (images, PDFs, and other non-Markdown files) can make indexing slower, especially on large vaults.", + "Syncing attachments (images & PDFs) can make indexing slower, especially on large vaults." ); new Setting(this.contentEl).setDesc( - "You can disable this anytime in settings if syncing becomes too slow.", + "Syncing attachments can make indexing slower on large vaults. You can disable this anytime.", ); new Setting(this.contentEl) diff --git a/surfsense_obsidian/src/settings.ts b/surfsense_obsidian/src/settings.ts index acb650790..eb1e5b9f5 100644 --- a/surfsense_obsidian/src/settings.ts +++ b/surfsense_obsidian/src/settings.ts @@ -140,7 +140,7 @@ export class SurfSenseSettingTab extends PluginSettingTab { new Setting(containerEl) .setName("Sync interval") .setDesc( - "How often to check for changes made outside Obsidian. Set to off to only sync manually.", + "How often to check for changes made outside Obsidian.", ) .addDropdown((drop) => { const options: Array<[number, string]> = [ @@ -205,7 +205,7 @@ export class SurfSenseSettingTab extends PluginSettingTab { new Setting(containerEl) .setName("Include attachments") .setDesc( - "Also sync non-Markdown files such as images and PDFs.", + "Also sync non-Markdown files such as images and PDFs. Other file types are skipped.", ) .addToggle((toggle) => toggle @@ -231,7 +231,7 @@ export class SurfSenseSettingTab extends PluginSettingTab { new Setting(containerEl) .setName("Sync only on WiFi") .setDesc( - "Pause automatic syncing on cellular. Note: only Android can detect network type — on iOS this toggle has no effect.", + "Pause automatic syncing on cellular. Note: only Android can detect network type, on iOS this toggle has no effect.", ) .addToggle((toggle) => toggle diff --git a/surfsense_obsidian/src/sync-engine.ts b/surfsense_obsidian/src/sync-engine.ts index ccf0c485f..fafb8135b 100644 --- a/surfsense_obsidian/src/sync-engine.ts +++ b/surfsense_obsidian/src/sync-engine.ts @@ -393,7 +393,7 @@ export class SyncEngine { ctime: file.stat.ctime, is_binary: true, binary_base64: binaryBase64, - mime_type: mimeTypeFromExtension(file.extension), + mime_type: mimeTypeFor(file.extension), }; } @@ -560,9 +560,10 @@ export class SyncEngine { private shouldTrack(file: TAbstractFile): boolean { if (!isTFile(file)) return false; + if (this.isMarkdown(file)) return true; const settings = this.deps.getSettings(); - if (!settings.includeAttachments && !this.isMarkdown(file)) return false; - return true; + if (!settings.includeAttachments) return false; + return ALLOWED_ATTACHMENT_EXTENSIONS.has(file.extension.toLowerCase()); } private isExcluded(path: string, settings: SyncEngineSettings): boolean { @@ -599,23 +600,29 @@ function arrayBufferToBase64(buf: ArrayBuffer): string { return btoa(binary); } -function mimeTypeFromExtension(extension: string): string | undefined { - const ext = extension.toLowerCase(); - const mimeByExt: Record = { - pdf: "application/pdf", - png: "image/png", - jpg: "image/jpeg", - jpeg: "image/jpeg", - gif: "image/gif", - webp: "image/webp", - svg: "image/svg+xml", - txt: "text/plain", - csv: "text/csv", - docx: "application/vnd.openxmlformats-officedocument.wordprocessingml.document", - pptx: "application/vnd.openxmlformats-officedocument.presentationml.presentation", - xlsx: "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", - }; - return mimeByExt[ext]; +/** Source of truth for the attachment whitelist. Mirrors ATTACHMENT_MIME_TYPES on the backend. */ +export const MIME_BY_EXTENSION = { + pdf: "application/pdf", + png: "image/png", + jpg: "image/jpeg", + jpeg: "image/jpeg", + gif: "image/gif", + webp: "image/webp", + svg: "image/svg+xml", + txt: "text/plain", +} as const satisfies Record; + +export const ALLOWED_ATTACHMENT_EXTENSIONS: ReadonlySet = new Set( + Object.keys(MIME_BY_EXTENSION), +); + +function mimeTypeFor(extension: string): string { + const ext = extension.toLowerCase() as keyof typeof MIME_BY_EXTENSION; + const mime = MIME_BY_EXTENSION[ext]; + if (!mime) { + throw new Error(`Unsupported attachment extension: .${extension}`); + } + return mime; } function formatRelative(ts: number): string { diff --git a/surfsense_obsidian/src/types.ts b/surfsense_obsidian/src/types.ts index d4e1d6040..2cb040463 100644 --- a/surfsense_obsidian/src/types.ts +++ b/surfsense_obsidian/src/types.ts @@ -67,7 +67,7 @@ export interface RenameItem { export type QueueItem = UpsertItem | DeleteItem | RenameItem; -export interface NotePayload { +interface NotePayloadBase { vault_id: string; path: string; name: string; @@ -85,15 +85,23 @@ export interface NotePayload { size: number; mtime: number; ctime: number; - /** Non-markdown attachment marker; enables backend ETL path. */ - is_binary?: boolean; - /** Base64-encoded file bytes for binary attachments. */ - binary_base64?: string; - /** Optional MIME type hint for backend parsers. */ - mime_type?: string; - [key: string]: unknown; } +export interface MarkdownNotePayload extends NotePayloadBase { + is_binary?: false; +} + +export interface BinaryNotePayload extends NotePayloadBase { + /** Non-markdown attachment marker; enables backend ETL path. */ + is_binary: true; + /** Base64-encoded file bytes for binary attachments. */ + binary_base64: string; + /** Canonical MIME type for the extension; required by the backend. */ + mime_type: string; +} + +export type NotePayload = MarkdownNotePayload | BinaryNotePayload; + export interface HeadingRef { heading: string; level: number;