feat(obsidian_plugin): validate binary attachments and enforce MIME type checks

This commit is contained in:
Anish Sarkar 2026-04-25 00:23:17 +05:30
parent 3b9be79d65
commit e84dc87c5b
9 changed files with 275 additions and 40 deletions

View file

@ -26,6 +26,8 @@ from app.db import (
get_async_session,
)
from app.schemas.obsidian_plugin import (
ALLOWED_ATTACHMENT_EXTENSIONS,
ATTACHMENT_MIME_TYPES,
ConnectRequest,
ConnectResponse,
DeleteAck,
@ -465,6 +467,31 @@ async def obsidian_sync(
for note in payload.notes:
try:
if note.is_binary:
ext = note.extension.lstrip(".").lower()
if ext not in ALLOWED_ATTACHMENT_EXTENSIONS:
failed += 1
items.append(
SyncAckItem(
path=note.path,
status="error",
error=f"unsupported attachment extension: .{ext}",
)
)
continue
expected_mime = ATTACHMENT_MIME_TYPES[ext]
if note.mime_type != expected_mime:
failed += 1
items.append(
SyncAckItem(
path=note.path,
status="error",
error=(
f"mime_type '{note.mime_type}' does not match "
f"extension .{ext}"
),
)
)
continue
_queue_obsidian_attachment(
connector_id=connector.id,
note_payload=note.model_dump(mode="json"),

View file

@ -10,11 +10,26 @@ from __future__ import annotations
from datetime import datetime
from typing import Any, Literal
from pydantic import BaseModel, ConfigDict, Field
from pydantic import BaseModel, ConfigDict, Field, model_validator
_PLUGIN_MODEL_CONFIG = ConfigDict(extra="ignore")
# Source of truth for the attachment whitelist. Mirrors MIME_BY_EXTENSION in
# surfsense_obsidian/src/sync-engine.ts — keep in sync.
ATTACHMENT_MIME_TYPES: dict[str, str] = {
"pdf": "application/pdf",
"png": "image/png",
"jpg": "image/jpeg",
"jpeg": "image/jpeg",
"gif": "image/gif",
"webp": "image/webp",
"svg": "image/svg+xml",
"txt": "text/plain",
}
ALLOWED_ATTACHMENT_EXTENSIONS: frozenset[str] = frozenset(ATTACHMENT_MIME_TYPES)
class _PluginBase(BaseModel):
"""Base schema carrying the shared forward-compatibility config."""
@ -78,6 +93,19 @@ class NotePayload(_PluginBase):
mtime: datetime
ctime: datetime
@model_validator(mode="after")
def _enforce_binary_invariants(self) -> NotePayload:
if self.is_binary:
if not self.binary_base64:
raise ValueError("binary_base64 is required when is_binary is True")
if not self.mime_type:
raise ValueError("mime_type is required when is_binary is True")
elif self.binary_base64 is not None or self.mime_type is not None:
raise ValueError(
"binary_base64 and mime_type must be omitted when is_binary is False",
)
return self
class SyncBatchRequest(_PluginBase):
"""Batch upsert; plugin sends 10-20 notes per request."""

View file

@ -119,8 +119,7 @@ def _build_metadata(
}
if payload.is_binary:
meta["is_binary"] = True
if payload.mime_type:
meta["mime_type"] = payload.mime_type
meta["mime_type"] = payload.mime_type
if extra:
meta.update(extra)
return meta
@ -154,9 +153,6 @@ def _build_document_string(
async def _extract_binary_attachment_markdown(
payload: NotePayload, *, vision_llm
) -> tuple[str, dict[str, Any]]:
if not payload.binary_base64:
return "", {"attachment_extraction_status": "missing_binary_payload"}
try:
raw_bytes = base64.b64decode(payload.binary_base64, validate=True)
except Exception:
@ -208,7 +204,7 @@ async def _run_etl_extract(*, file_path: str, filename: str, vision_llm):
def _is_image_attachment(payload: NotePayload) -> bool:
ext = payload.extension.lower().lstrip(".")
return ext in {"png", "jpg", "jpeg", "gif", "webp", "bmp", "tiff", "svg"}
return ext in {"png", "jpg", "jpeg", "gif", "webp", "svg"}
async def _resolve_attachment_vision_llm(

View file

@ -490,6 +490,7 @@ class TestWireContractSmoke:
binary_note.extension = "png"
binary_note.is_binary = True
binary_note.binary_base64 = "aGVsbG8="
binary_note.mime_type = "image/png"
binary_note.content = ""
with (
@ -517,3 +518,107 @@ class TestWireContractSmoke:
assert statuses == {"ok.md": "ok", "image.png": "queued"}
assert upsert_mock.await_count == 1
queue_mock.assert_called_once()
async def test_sync_rejects_unsupported_attachment_extension(
self, db_session: AsyncSession, db_user: User, db_search_space: SearchSpace
):
vault_id = str(uuid.uuid4())
await obsidian_connect(
ConnectRequest(
vault_id=vault_id,
vault_name="Reject Vault",
search_space_id=db_search_space.id,
vault_fingerprint="fp-" + uuid.uuid4().hex,
),
user=db_user,
session=db_session,
)
fake_doc = type("FakeDoc", (), {"id": 12345})()
bad_note = _make_note_payload(vault_id, "photo.heic", "hash-heic")
bad_note.extension = "heic"
bad_note.is_binary = True
bad_note.binary_base64 = "aGVsbG8="
bad_note.mime_type = "image/heic"
bad_note.content = ""
with (
patch(
"app.routes.obsidian_plugin_routes.upsert_note",
new=AsyncMock(return_value=fake_doc),
),
patch("app.routes.obsidian_plugin_routes._queue_obsidian_attachment") as queue_mock,
):
sync_resp = await obsidian_sync(
SyncBatchRequest(
vault_id=vault_id,
notes=[
_make_note_payload(vault_id, "ok.md", "hash-ok"),
bad_note,
],
),
user=db_user,
session=db_session,
)
assert sync_resp.indexed == 1
assert sync_resp.failed == 1
items_by_path = {it.path: it for it in sync_resp.items}
assert items_by_path["ok.md"].status == "ok"
assert items_by_path["photo.heic"].status == "error"
assert "unsupported attachment extension" in (
items_by_path["photo.heic"].error or ""
)
queue_mock.assert_not_called()
async def test_sync_rejects_mime_extension_mismatch(
self, db_session: AsyncSession, db_user: User, db_search_space: SearchSpace
):
vault_id = str(uuid.uuid4())
await obsidian_connect(
ConnectRequest(
vault_id=vault_id,
vault_name="Mismatch Vault",
search_space_id=db_search_space.id,
vault_fingerprint="fp-" + uuid.uuid4().hex,
),
user=db_user,
session=db_session,
)
fake_doc = type("FakeDoc", (), {"id": 12345})()
mismatched = _make_note_payload(vault_id, "image.png", "hash-png")
mismatched.extension = "png"
mismatched.is_binary = True
mismatched.binary_base64 = "aGVsbG8="
mismatched.mime_type = "application/pdf"
mismatched.content = ""
with (
patch(
"app.routes.obsidian_plugin_routes.upsert_note",
new=AsyncMock(return_value=fake_doc),
),
patch("app.routes.obsidian_plugin_routes._queue_obsidian_attachment") as queue_mock,
):
sync_resp = await obsidian_sync(
SyncBatchRequest(
vault_id=vault_id,
notes=[
_make_note_payload(vault_id, "ok.md", "hash-ok"),
mismatched,
],
),
user=db_user,
session=db_session,
)
assert sync_resp.indexed == 1
assert sync_resp.failed == 1
items_by_path = {it.path: it for it in sync_resp.items}
assert items_by_path["ok.md"].status == "ok"
assert items_by_path["image.png"].status == "error"
assert "does not match extension" in (
items_by_path["image.png"].error or ""
)
queue_mock.assert_not_called()

View file

@ -4,6 +4,7 @@ import base64
from datetime import UTC, datetime
import pytest
from pydantic import ValidationError
from app.etl_pipeline.etl_document import EtlResult
from app.schemas.obsidian_plugin import HeadingRef, NotePayload
@ -15,6 +16,9 @@ from app.services.obsidian_plugin_indexer import (
)
_FAKE_PNG_B64 = base64.b64encode(b"\x89PNG\r\n\x1a\n").decode("ascii")
def test_build_metadata_serializes_headings_to_plain_json() -> None:
now = datetime.now(UTC)
payload = NotePayload(
@ -46,6 +50,7 @@ def test_build_metadata_marks_binary_attachment_fields() -> None:
mtime=now,
ctime=now,
is_binary=True,
binary_base64=_FAKE_PNG_B64,
mime_type="image/png",
)
@ -69,6 +74,7 @@ async def test_extract_binary_attachment_markdown_handles_invalid_base64() -> No
ctime=now,
is_binary=True,
binary_base64="not-valid-base64!!",
mime_type="image/png",
)
content, metadata = await _extract_binary_attachment_markdown(
@ -93,6 +99,7 @@ async def test_extract_binary_attachment_markdown_uses_etl(monkeypatch) -> None:
ctime=now,
is_binary=True,
binary_base64=base64.b64encode(b"%PDF-1.7 fake bytes").decode("ascii"),
mime_type="application/pdf",
)
async def _fake_run_etl_extract( # noqa: ANN001
@ -133,6 +140,8 @@ def test_is_image_attachment_detects_image_extensions() -> None:
mtime=now,
ctime=now,
is_binary=True,
binary_base64=_FAKE_PNG_B64,
mime_type="image/png",
)
pdf_payload = NotePayload(
vault_id="vault-1",
@ -144,12 +153,67 @@ def test_is_image_attachment_detects_image_extensions() -> None:
mtime=now,
ctime=now,
is_binary=True,
binary_base64=_FAKE_PNG_B64,
mime_type="application/pdf",
)
assert _is_image_attachment(image_payload) is True
assert _is_image_attachment(pdf_payload) is False
def test_note_payload_rejects_binary_without_base64() -> None:
now = datetime.now(UTC)
with pytest.raises(ValidationError, match="binary_base64 is required"):
NotePayload(
vault_id="vault-1",
path="assets/diagram.png",
name="diagram",
extension="png",
content="",
content_hash="abc123",
mtime=now,
ctime=now,
is_binary=True,
mime_type="image/png",
)
def test_note_payload_rejects_binary_without_mime_type() -> None:
now = datetime.now(UTC)
with pytest.raises(ValidationError, match="mime_type is required"):
NotePayload(
vault_id="vault-1",
path="assets/diagram.png",
name="diagram",
extension="png",
content="",
content_hash="abc123",
mtime=now,
ctime=now,
is_binary=True,
binary_base64=_FAKE_PNG_B64,
)
def test_note_payload_rejects_markdown_with_binary_fields() -> None:
now = datetime.now(UTC)
with pytest.raises(
ValidationError,
match="binary_base64 and mime_type must be omitted when is_binary is False",
):
NotePayload(
vault_id="vault-1",
path="notes.md",
name="notes",
extension="md",
content="# Notes",
content_hash="abc123",
mtime=now,
ctime=now,
binary_base64=_FAKE_PNG_B64,
)
def test_require_extracted_attachment_content_rejects_empty_content() -> None:
with pytest.raises(
RuntimeError, match="Attachment extraction failed for assets/img.png"

View file

@ -16,10 +16,10 @@ export class AttachmentsConfirmModal extends Modal {
this.contentEl.empty();
new Setting(this.contentEl).setDesc(
"Syncing attachments (images, PDFs, and other non-Markdown files) can make indexing slower, especially on large vaults.",
"Syncing attachments (images & PDFs) can make indexing slower, especially on large vaults."
);
new Setting(this.contentEl).setDesc(
"You can disable this anytime in settings if syncing becomes too slow.",
"Syncing attachments can make indexing slower on large vaults. You can disable this anytime.",
);
new Setting(this.contentEl)

View file

@ -140,7 +140,7 @@ export class SurfSenseSettingTab extends PluginSettingTab {
new Setting(containerEl)
.setName("Sync interval")
.setDesc(
"How often to check for changes made outside Obsidian. Set to off to only sync manually.",
"How often to check for changes made outside Obsidian.",
)
.addDropdown((drop) => {
const options: Array<[number, string]> = [
@ -205,7 +205,7 @@ export class SurfSenseSettingTab extends PluginSettingTab {
new Setting(containerEl)
.setName("Include attachments")
.setDesc(
"Also sync non-Markdown files such as images and PDFs.",
"Also sync non-Markdown files such as images and PDFs. Other file types are skipped.",
)
.addToggle((toggle) =>
toggle
@ -231,7 +231,7 @@ export class SurfSenseSettingTab extends PluginSettingTab {
new Setting(containerEl)
.setName("Sync only on WiFi")
.setDesc(
"Pause automatic syncing on cellular. Note: only Android can detect network type on iOS this toggle has no effect.",
"Pause automatic syncing on cellular. Note: only Android can detect network type, on iOS this toggle has no effect.",
)
.addToggle((toggle) =>
toggle

View file

@ -393,7 +393,7 @@ export class SyncEngine {
ctime: file.stat.ctime,
is_binary: true,
binary_base64: binaryBase64,
mime_type: mimeTypeFromExtension(file.extension),
mime_type: mimeTypeFor(file.extension),
};
}
@ -560,9 +560,10 @@ export class SyncEngine {
private shouldTrack(file: TAbstractFile): boolean {
if (!isTFile(file)) return false;
if (this.isMarkdown(file)) return true;
const settings = this.deps.getSettings();
if (!settings.includeAttachments && !this.isMarkdown(file)) return false;
return true;
if (!settings.includeAttachments) return false;
return ALLOWED_ATTACHMENT_EXTENSIONS.has(file.extension.toLowerCase());
}
private isExcluded(path: string, settings: SyncEngineSettings): boolean {
@ -599,23 +600,29 @@ function arrayBufferToBase64(buf: ArrayBuffer): string {
return btoa(binary);
}
function mimeTypeFromExtension(extension: string): string | undefined {
const ext = extension.toLowerCase();
const mimeByExt: Record<string, string> = {
pdf: "application/pdf",
png: "image/png",
jpg: "image/jpeg",
jpeg: "image/jpeg",
gif: "image/gif",
webp: "image/webp",
svg: "image/svg+xml",
txt: "text/plain",
csv: "text/csv",
docx: "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
pptx: "application/vnd.openxmlformats-officedocument.presentationml.presentation",
xlsx: "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
};
return mimeByExt[ext];
/** Source of truth for the attachment whitelist. Mirrors ATTACHMENT_MIME_TYPES on the backend. */
export const MIME_BY_EXTENSION = {
pdf: "application/pdf",
png: "image/png",
jpg: "image/jpeg",
jpeg: "image/jpeg",
gif: "image/gif",
webp: "image/webp",
svg: "image/svg+xml",
txt: "text/plain",
} as const satisfies Record<string, string>;
export const ALLOWED_ATTACHMENT_EXTENSIONS: ReadonlySet<string> = new Set(
Object.keys(MIME_BY_EXTENSION),
);
function mimeTypeFor(extension: string): string {
const ext = extension.toLowerCase() as keyof typeof MIME_BY_EXTENSION;
const mime = MIME_BY_EXTENSION[ext];
if (!mime) {
throw new Error(`Unsupported attachment extension: .${extension}`);
}
return mime;
}
function formatRelative(ts: number): string {

View file

@ -67,7 +67,7 @@ export interface RenameItem {
export type QueueItem = UpsertItem | DeleteItem | RenameItem;
export interface NotePayload {
interface NotePayloadBase {
vault_id: string;
path: string;
name: string;
@ -85,15 +85,23 @@ export interface NotePayload {
size: number;
mtime: number;
ctime: number;
/** Non-markdown attachment marker; enables backend ETL path. */
is_binary?: boolean;
/** Base64-encoded file bytes for binary attachments. */
binary_base64?: string;
/** Optional MIME type hint for backend parsers. */
mime_type?: string;
[key: string]: unknown;
}
export interface MarkdownNotePayload extends NotePayloadBase {
is_binary?: false;
}
export interface BinaryNotePayload extends NotePayloadBase {
/** Non-markdown attachment marker; enables backend ETL path. */
is_binary: true;
/** Base64-encoded file bytes for binary attachments. */
binary_base64: string;
/** Canonical MIME type for the extension; required by the backend. */
mime_type: string;
}
export type NotePayload = MarkdownNotePayload | BinaryNotePayload;
export interface HeadingRef {
heading: string;
level: number;