feat(obsidian_plugin): validate binary attachments and enforce MIME type checks

This commit is contained in:
Anish Sarkar 2026-04-25 00:23:17 +05:30
parent 3b9be79d65
commit e84dc87c5b
9 changed files with 275 additions and 40 deletions

View file

@ -26,6 +26,8 @@ from app.db import (
get_async_session, get_async_session,
) )
from app.schemas.obsidian_plugin import ( from app.schemas.obsidian_plugin import (
ALLOWED_ATTACHMENT_EXTENSIONS,
ATTACHMENT_MIME_TYPES,
ConnectRequest, ConnectRequest,
ConnectResponse, ConnectResponse,
DeleteAck, DeleteAck,
@ -465,6 +467,31 @@ async def obsidian_sync(
for note in payload.notes: for note in payload.notes:
try: try:
if note.is_binary: if note.is_binary:
ext = note.extension.lstrip(".").lower()
if ext not in ALLOWED_ATTACHMENT_EXTENSIONS:
failed += 1
items.append(
SyncAckItem(
path=note.path,
status="error",
error=f"unsupported attachment extension: .{ext}",
)
)
continue
expected_mime = ATTACHMENT_MIME_TYPES[ext]
if note.mime_type != expected_mime:
failed += 1
items.append(
SyncAckItem(
path=note.path,
status="error",
error=(
f"mime_type '{note.mime_type}' does not match "
f"extension .{ext}"
),
)
)
continue
_queue_obsidian_attachment( _queue_obsidian_attachment(
connector_id=connector.id, connector_id=connector.id,
note_payload=note.model_dump(mode="json"), note_payload=note.model_dump(mode="json"),

View file

@ -10,11 +10,26 @@ from __future__ import annotations
from datetime import datetime from datetime import datetime
from typing import Any, Literal from typing import Any, Literal
from pydantic import BaseModel, ConfigDict, Field from pydantic import BaseModel, ConfigDict, Field, model_validator
_PLUGIN_MODEL_CONFIG = ConfigDict(extra="ignore") _PLUGIN_MODEL_CONFIG = ConfigDict(extra="ignore")
# Source of truth for the attachment whitelist. Mirrors MIME_BY_EXTENSION in
# surfsense_obsidian/src/sync-engine.ts — keep in sync.
ATTACHMENT_MIME_TYPES: dict[str, str] = {
"pdf": "application/pdf",
"png": "image/png",
"jpg": "image/jpeg",
"jpeg": "image/jpeg",
"gif": "image/gif",
"webp": "image/webp",
"svg": "image/svg+xml",
"txt": "text/plain",
}
ALLOWED_ATTACHMENT_EXTENSIONS: frozenset[str] = frozenset(ATTACHMENT_MIME_TYPES)
class _PluginBase(BaseModel): class _PluginBase(BaseModel):
"""Base schema carrying the shared forward-compatibility config.""" """Base schema carrying the shared forward-compatibility config."""
@ -78,6 +93,19 @@ class NotePayload(_PluginBase):
mtime: datetime mtime: datetime
ctime: datetime ctime: datetime
@model_validator(mode="after")
def _enforce_binary_invariants(self) -> NotePayload:
if self.is_binary:
if not self.binary_base64:
raise ValueError("binary_base64 is required when is_binary is True")
if not self.mime_type:
raise ValueError("mime_type is required when is_binary is True")
elif self.binary_base64 is not None or self.mime_type is not None:
raise ValueError(
"binary_base64 and mime_type must be omitted when is_binary is False",
)
return self
class SyncBatchRequest(_PluginBase): class SyncBatchRequest(_PluginBase):
"""Batch upsert; plugin sends 10-20 notes per request.""" """Batch upsert; plugin sends 10-20 notes per request."""

View file

@ -119,8 +119,7 @@ def _build_metadata(
} }
if payload.is_binary: if payload.is_binary:
meta["is_binary"] = True meta["is_binary"] = True
if payload.mime_type: meta["mime_type"] = payload.mime_type
meta["mime_type"] = payload.mime_type
if extra: if extra:
meta.update(extra) meta.update(extra)
return meta return meta
@ -154,9 +153,6 @@ def _build_document_string(
async def _extract_binary_attachment_markdown( async def _extract_binary_attachment_markdown(
payload: NotePayload, *, vision_llm payload: NotePayload, *, vision_llm
) -> tuple[str, dict[str, Any]]: ) -> tuple[str, dict[str, Any]]:
if not payload.binary_base64:
return "", {"attachment_extraction_status": "missing_binary_payload"}
try: try:
raw_bytes = base64.b64decode(payload.binary_base64, validate=True) raw_bytes = base64.b64decode(payload.binary_base64, validate=True)
except Exception: except Exception:
@ -208,7 +204,7 @@ async def _run_etl_extract(*, file_path: str, filename: str, vision_llm):
def _is_image_attachment(payload: NotePayload) -> bool: def _is_image_attachment(payload: NotePayload) -> bool:
ext = payload.extension.lower().lstrip(".") ext = payload.extension.lower().lstrip(".")
return ext in {"png", "jpg", "jpeg", "gif", "webp", "bmp", "tiff", "svg"} return ext in {"png", "jpg", "jpeg", "gif", "webp", "svg"}
async def _resolve_attachment_vision_llm( async def _resolve_attachment_vision_llm(

View file

@ -490,6 +490,7 @@ class TestWireContractSmoke:
binary_note.extension = "png" binary_note.extension = "png"
binary_note.is_binary = True binary_note.is_binary = True
binary_note.binary_base64 = "aGVsbG8=" binary_note.binary_base64 = "aGVsbG8="
binary_note.mime_type = "image/png"
binary_note.content = "" binary_note.content = ""
with ( with (
@ -517,3 +518,107 @@ class TestWireContractSmoke:
assert statuses == {"ok.md": "ok", "image.png": "queued"} assert statuses == {"ok.md": "ok", "image.png": "queued"}
assert upsert_mock.await_count == 1 assert upsert_mock.await_count == 1
queue_mock.assert_called_once() queue_mock.assert_called_once()
async def test_sync_rejects_unsupported_attachment_extension(
self, db_session: AsyncSession, db_user: User, db_search_space: SearchSpace
):
vault_id = str(uuid.uuid4())
await obsidian_connect(
ConnectRequest(
vault_id=vault_id,
vault_name="Reject Vault",
search_space_id=db_search_space.id,
vault_fingerprint="fp-" + uuid.uuid4().hex,
),
user=db_user,
session=db_session,
)
fake_doc = type("FakeDoc", (), {"id": 12345})()
bad_note = _make_note_payload(vault_id, "photo.heic", "hash-heic")
bad_note.extension = "heic"
bad_note.is_binary = True
bad_note.binary_base64 = "aGVsbG8="
bad_note.mime_type = "image/heic"
bad_note.content = ""
with (
patch(
"app.routes.obsidian_plugin_routes.upsert_note",
new=AsyncMock(return_value=fake_doc),
),
patch("app.routes.obsidian_plugin_routes._queue_obsidian_attachment") as queue_mock,
):
sync_resp = await obsidian_sync(
SyncBatchRequest(
vault_id=vault_id,
notes=[
_make_note_payload(vault_id, "ok.md", "hash-ok"),
bad_note,
],
),
user=db_user,
session=db_session,
)
assert sync_resp.indexed == 1
assert sync_resp.failed == 1
items_by_path = {it.path: it for it in sync_resp.items}
assert items_by_path["ok.md"].status == "ok"
assert items_by_path["photo.heic"].status == "error"
assert "unsupported attachment extension" in (
items_by_path["photo.heic"].error or ""
)
queue_mock.assert_not_called()
async def test_sync_rejects_mime_extension_mismatch(
self, db_session: AsyncSession, db_user: User, db_search_space: SearchSpace
):
vault_id = str(uuid.uuid4())
await obsidian_connect(
ConnectRequest(
vault_id=vault_id,
vault_name="Mismatch Vault",
search_space_id=db_search_space.id,
vault_fingerprint="fp-" + uuid.uuid4().hex,
),
user=db_user,
session=db_session,
)
fake_doc = type("FakeDoc", (), {"id": 12345})()
mismatched = _make_note_payload(vault_id, "image.png", "hash-png")
mismatched.extension = "png"
mismatched.is_binary = True
mismatched.binary_base64 = "aGVsbG8="
mismatched.mime_type = "application/pdf"
mismatched.content = ""
with (
patch(
"app.routes.obsidian_plugin_routes.upsert_note",
new=AsyncMock(return_value=fake_doc),
),
patch("app.routes.obsidian_plugin_routes._queue_obsidian_attachment") as queue_mock,
):
sync_resp = await obsidian_sync(
SyncBatchRequest(
vault_id=vault_id,
notes=[
_make_note_payload(vault_id, "ok.md", "hash-ok"),
mismatched,
],
),
user=db_user,
session=db_session,
)
assert sync_resp.indexed == 1
assert sync_resp.failed == 1
items_by_path = {it.path: it for it in sync_resp.items}
assert items_by_path["ok.md"].status == "ok"
assert items_by_path["image.png"].status == "error"
assert "does not match extension" in (
items_by_path["image.png"].error or ""
)
queue_mock.assert_not_called()

View file

@ -4,6 +4,7 @@ import base64
from datetime import UTC, datetime from datetime import UTC, datetime
import pytest import pytest
from pydantic import ValidationError
from app.etl_pipeline.etl_document import EtlResult from app.etl_pipeline.etl_document import EtlResult
from app.schemas.obsidian_plugin import HeadingRef, NotePayload from app.schemas.obsidian_plugin import HeadingRef, NotePayload
@ -15,6 +16,9 @@ from app.services.obsidian_plugin_indexer import (
) )
_FAKE_PNG_B64 = base64.b64encode(b"\x89PNG\r\n\x1a\n").decode("ascii")
def test_build_metadata_serializes_headings_to_plain_json() -> None: def test_build_metadata_serializes_headings_to_plain_json() -> None:
now = datetime.now(UTC) now = datetime.now(UTC)
payload = NotePayload( payload = NotePayload(
@ -46,6 +50,7 @@ def test_build_metadata_marks_binary_attachment_fields() -> None:
mtime=now, mtime=now,
ctime=now, ctime=now,
is_binary=True, is_binary=True,
binary_base64=_FAKE_PNG_B64,
mime_type="image/png", mime_type="image/png",
) )
@ -69,6 +74,7 @@ async def test_extract_binary_attachment_markdown_handles_invalid_base64() -> No
ctime=now, ctime=now,
is_binary=True, is_binary=True,
binary_base64="not-valid-base64!!", binary_base64="not-valid-base64!!",
mime_type="image/png",
) )
content, metadata = await _extract_binary_attachment_markdown( content, metadata = await _extract_binary_attachment_markdown(
@ -93,6 +99,7 @@ async def test_extract_binary_attachment_markdown_uses_etl(monkeypatch) -> None:
ctime=now, ctime=now,
is_binary=True, is_binary=True,
binary_base64=base64.b64encode(b"%PDF-1.7 fake bytes").decode("ascii"), binary_base64=base64.b64encode(b"%PDF-1.7 fake bytes").decode("ascii"),
mime_type="application/pdf",
) )
async def _fake_run_etl_extract( # noqa: ANN001 async def _fake_run_etl_extract( # noqa: ANN001
@ -133,6 +140,8 @@ def test_is_image_attachment_detects_image_extensions() -> None:
mtime=now, mtime=now,
ctime=now, ctime=now,
is_binary=True, is_binary=True,
binary_base64=_FAKE_PNG_B64,
mime_type="image/png",
) )
pdf_payload = NotePayload( pdf_payload = NotePayload(
vault_id="vault-1", vault_id="vault-1",
@ -144,12 +153,67 @@ def test_is_image_attachment_detects_image_extensions() -> None:
mtime=now, mtime=now,
ctime=now, ctime=now,
is_binary=True, is_binary=True,
binary_base64=_FAKE_PNG_B64,
mime_type="application/pdf",
) )
assert _is_image_attachment(image_payload) is True assert _is_image_attachment(image_payload) is True
assert _is_image_attachment(pdf_payload) is False assert _is_image_attachment(pdf_payload) is False
def test_note_payload_rejects_binary_without_base64() -> None:
now = datetime.now(UTC)
with pytest.raises(ValidationError, match="binary_base64 is required"):
NotePayload(
vault_id="vault-1",
path="assets/diagram.png",
name="diagram",
extension="png",
content="",
content_hash="abc123",
mtime=now,
ctime=now,
is_binary=True,
mime_type="image/png",
)
def test_note_payload_rejects_binary_without_mime_type() -> None:
now = datetime.now(UTC)
with pytest.raises(ValidationError, match="mime_type is required"):
NotePayload(
vault_id="vault-1",
path="assets/diagram.png",
name="diagram",
extension="png",
content="",
content_hash="abc123",
mtime=now,
ctime=now,
is_binary=True,
binary_base64=_FAKE_PNG_B64,
)
def test_note_payload_rejects_markdown_with_binary_fields() -> None:
now = datetime.now(UTC)
with pytest.raises(
ValidationError,
match="binary_base64 and mime_type must be omitted when is_binary is False",
):
NotePayload(
vault_id="vault-1",
path="notes.md",
name="notes",
extension="md",
content="# Notes",
content_hash="abc123",
mtime=now,
ctime=now,
binary_base64=_FAKE_PNG_B64,
)
def test_require_extracted_attachment_content_rejects_empty_content() -> None: def test_require_extracted_attachment_content_rejects_empty_content() -> None:
with pytest.raises( with pytest.raises(
RuntimeError, match="Attachment extraction failed for assets/img.png" RuntimeError, match="Attachment extraction failed for assets/img.png"

View file

@ -16,10 +16,10 @@ export class AttachmentsConfirmModal extends Modal {
this.contentEl.empty(); this.contentEl.empty();
new Setting(this.contentEl).setDesc( new Setting(this.contentEl).setDesc(
"Syncing attachments (images, PDFs, and other non-Markdown files) can make indexing slower, especially on large vaults.", "Syncing attachments (images & PDFs) can make indexing slower, especially on large vaults."
); );
new Setting(this.contentEl).setDesc( new Setting(this.contentEl).setDesc(
"You can disable this anytime in settings if syncing becomes too slow.", "Syncing attachments can make indexing slower on large vaults. You can disable this anytime.",
); );
new Setting(this.contentEl) new Setting(this.contentEl)

View file

@ -140,7 +140,7 @@ export class SurfSenseSettingTab extends PluginSettingTab {
new Setting(containerEl) new Setting(containerEl)
.setName("Sync interval") .setName("Sync interval")
.setDesc( .setDesc(
"How often to check for changes made outside Obsidian. Set to off to only sync manually.", "How often to check for changes made outside Obsidian.",
) )
.addDropdown((drop) => { .addDropdown((drop) => {
const options: Array<[number, string]> = [ const options: Array<[number, string]> = [
@ -205,7 +205,7 @@ export class SurfSenseSettingTab extends PluginSettingTab {
new Setting(containerEl) new Setting(containerEl)
.setName("Include attachments") .setName("Include attachments")
.setDesc( .setDesc(
"Also sync non-Markdown files such as images and PDFs.", "Also sync non-Markdown files such as images and PDFs. Other file types are skipped.",
) )
.addToggle((toggle) => .addToggle((toggle) =>
toggle toggle
@ -231,7 +231,7 @@ export class SurfSenseSettingTab extends PluginSettingTab {
new Setting(containerEl) new Setting(containerEl)
.setName("Sync only on WiFi") .setName("Sync only on WiFi")
.setDesc( .setDesc(
"Pause automatic syncing on cellular. Note: only Android can detect network type on iOS this toggle has no effect.", "Pause automatic syncing on cellular. Note: only Android can detect network type, on iOS this toggle has no effect.",
) )
.addToggle((toggle) => .addToggle((toggle) =>
toggle toggle

View file

@ -393,7 +393,7 @@ export class SyncEngine {
ctime: file.stat.ctime, ctime: file.stat.ctime,
is_binary: true, is_binary: true,
binary_base64: binaryBase64, binary_base64: binaryBase64,
mime_type: mimeTypeFromExtension(file.extension), mime_type: mimeTypeFor(file.extension),
}; };
} }
@ -560,9 +560,10 @@ export class SyncEngine {
private shouldTrack(file: TAbstractFile): boolean { private shouldTrack(file: TAbstractFile): boolean {
if (!isTFile(file)) return false; if (!isTFile(file)) return false;
if (this.isMarkdown(file)) return true;
const settings = this.deps.getSettings(); const settings = this.deps.getSettings();
if (!settings.includeAttachments && !this.isMarkdown(file)) return false; if (!settings.includeAttachments) return false;
return true; return ALLOWED_ATTACHMENT_EXTENSIONS.has(file.extension.toLowerCase());
} }
private isExcluded(path: string, settings: SyncEngineSettings): boolean { private isExcluded(path: string, settings: SyncEngineSettings): boolean {
@ -599,23 +600,29 @@ function arrayBufferToBase64(buf: ArrayBuffer): string {
return btoa(binary); return btoa(binary);
} }
function mimeTypeFromExtension(extension: string): string | undefined { /** Source of truth for the attachment whitelist. Mirrors ATTACHMENT_MIME_TYPES on the backend. */
const ext = extension.toLowerCase(); export const MIME_BY_EXTENSION = {
const mimeByExt: Record<string, string> = { pdf: "application/pdf",
pdf: "application/pdf", png: "image/png",
png: "image/png", jpg: "image/jpeg",
jpg: "image/jpeg", jpeg: "image/jpeg",
jpeg: "image/jpeg", gif: "image/gif",
gif: "image/gif", webp: "image/webp",
webp: "image/webp", svg: "image/svg+xml",
svg: "image/svg+xml", txt: "text/plain",
txt: "text/plain", } as const satisfies Record<string, string>;
csv: "text/csv",
docx: "application/vnd.openxmlformats-officedocument.wordprocessingml.document", export const ALLOWED_ATTACHMENT_EXTENSIONS: ReadonlySet<string> = new Set(
pptx: "application/vnd.openxmlformats-officedocument.presentationml.presentation", Object.keys(MIME_BY_EXTENSION),
xlsx: "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", );
};
return mimeByExt[ext]; function mimeTypeFor(extension: string): string {
const ext = extension.toLowerCase() as keyof typeof MIME_BY_EXTENSION;
const mime = MIME_BY_EXTENSION[ext];
if (!mime) {
throw new Error(`Unsupported attachment extension: .${extension}`);
}
return mime;
} }
function formatRelative(ts: number): string { function formatRelative(ts: number): string {

View file

@ -67,7 +67,7 @@ export interface RenameItem {
export type QueueItem = UpsertItem | DeleteItem | RenameItem; export type QueueItem = UpsertItem | DeleteItem | RenameItem;
export interface NotePayload { interface NotePayloadBase {
vault_id: string; vault_id: string;
path: string; path: string;
name: string; name: string;
@ -85,15 +85,23 @@ export interface NotePayload {
size: number; size: number;
mtime: number; mtime: number;
ctime: number; ctime: number;
/** Non-markdown attachment marker; enables backend ETL path. */
is_binary?: boolean;
/** Base64-encoded file bytes for binary attachments. */
binary_base64?: string;
/** Optional MIME type hint for backend parsers. */
mime_type?: string;
[key: string]: unknown;
} }
export interface MarkdownNotePayload extends NotePayloadBase {
is_binary?: false;
}
export interface BinaryNotePayload extends NotePayloadBase {
/** Non-markdown attachment marker; enables backend ETL path. */
is_binary: true;
/** Base64-encoded file bytes for binary attachments. */
binary_base64: string;
/** Canonical MIME type for the extension; required by the backend. */
mime_type: string;
}
export type NotePayload = MarkdownNotePayload | BinaryNotePayload;
export interface HeadingRef { export interface HeadingRef {
heading: string; heading: string;
level: number; level: number;