mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-02 19:55:18 +02:00
feat(obsidian_plugin): validate binary attachments and enforce MIME type checks
This commit is contained in:
parent
3b9be79d65
commit
e84dc87c5b
9 changed files with 275 additions and 40 deletions
|
|
@ -26,6 +26,8 @@ from app.db import (
|
|||
get_async_session,
|
||||
)
|
||||
from app.schemas.obsidian_plugin import (
|
||||
ALLOWED_ATTACHMENT_EXTENSIONS,
|
||||
ATTACHMENT_MIME_TYPES,
|
||||
ConnectRequest,
|
||||
ConnectResponse,
|
||||
DeleteAck,
|
||||
|
|
@ -465,6 +467,31 @@ async def obsidian_sync(
|
|||
for note in payload.notes:
|
||||
try:
|
||||
if note.is_binary:
|
||||
ext = note.extension.lstrip(".").lower()
|
||||
if ext not in ALLOWED_ATTACHMENT_EXTENSIONS:
|
||||
failed += 1
|
||||
items.append(
|
||||
SyncAckItem(
|
||||
path=note.path,
|
||||
status="error",
|
||||
error=f"unsupported attachment extension: .{ext}",
|
||||
)
|
||||
)
|
||||
continue
|
||||
expected_mime = ATTACHMENT_MIME_TYPES[ext]
|
||||
if note.mime_type != expected_mime:
|
||||
failed += 1
|
||||
items.append(
|
||||
SyncAckItem(
|
||||
path=note.path,
|
||||
status="error",
|
||||
error=(
|
||||
f"mime_type '{note.mime_type}' does not match "
|
||||
f"extension .{ext}"
|
||||
),
|
||||
)
|
||||
)
|
||||
continue
|
||||
_queue_obsidian_attachment(
|
||||
connector_id=connector.id,
|
||||
note_payload=note.model_dump(mode="json"),
|
||||
|
|
|
|||
|
|
@ -10,11 +10,26 @@ from __future__ import annotations
|
|||
from datetime import datetime
|
||||
from typing import Any, Literal
|
||||
|
||||
from pydantic import BaseModel, ConfigDict, Field
|
||||
from pydantic import BaseModel, ConfigDict, Field, model_validator
|
||||
|
||||
_PLUGIN_MODEL_CONFIG = ConfigDict(extra="ignore")
|
||||
|
||||
|
||||
# Source of truth for the attachment whitelist. Mirrors MIME_BY_EXTENSION in
|
||||
# surfsense_obsidian/src/sync-engine.ts — keep in sync.
|
||||
ATTACHMENT_MIME_TYPES: dict[str, str] = {
|
||||
"pdf": "application/pdf",
|
||||
"png": "image/png",
|
||||
"jpg": "image/jpeg",
|
||||
"jpeg": "image/jpeg",
|
||||
"gif": "image/gif",
|
||||
"webp": "image/webp",
|
||||
"svg": "image/svg+xml",
|
||||
"txt": "text/plain",
|
||||
}
|
||||
ALLOWED_ATTACHMENT_EXTENSIONS: frozenset[str] = frozenset(ATTACHMENT_MIME_TYPES)
|
||||
|
||||
|
||||
class _PluginBase(BaseModel):
|
||||
"""Base schema carrying the shared forward-compatibility config."""
|
||||
|
||||
|
|
@ -78,6 +93,19 @@ class NotePayload(_PluginBase):
|
|||
mtime: datetime
|
||||
ctime: datetime
|
||||
|
||||
@model_validator(mode="after")
|
||||
def _enforce_binary_invariants(self) -> NotePayload:
|
||||
if self.is_binary:
|
||||
if not self.binary_base64:
|
||||
raise ValueError("binary_base64 is required when is_binary is True")
|
||||
if not self.mime_type:
|
||||
raise ValueError("mime_type is required when is_binary is True")
|
||||
elif self.binary_base64 is not None or self.mime_type is not None:
|
||||
raise ValueError(
|
||||
"binary_base64 and mime_type must be omitted when is_binary is False",
|
||||
)
|
||||
return self
|
||||
|
||||
|
||||
class SyncBatchRequest(_PluginBase):
|
||||
"""Batch upsert; plugin sends 10-20 notes per request."""
|
||||
|
|
|
|||
|
|
@ -119,8 +119,7 @@ def _build_metadata(
|
|||
}
|
||||
if payload.is_binary:
|
||||
meta["is_binary"] = True
|
||||
if payload.mime_type:
|
||||
meta["mime_type"] = payload.mime_type
|
||||
meta["mime_type"] = payload.mime_type
|
||||
if extra:
|
||||
meta.update(extra)
|
||||
return meta
|
||||
|
|
@ -154,9 +153,6 @@ def _build_document_string(
|
|||
async def _extract_binary_attachment_markdown(
|
||||
payload: NotePayload, *, vision_llm
|
||||
) -> tuple[str, dict[str, Any]]:
|
||||
if not payload.binary_base64:
|
||||
return "", {"attachment_extraction_status": "missing_binary_payload"}
|
||||
|
||||
try:
|
||||
raw_bytes = base64.b64decode(payload.binary_base64, validate=True)
|
||||
except Exception:
|
||||
|
|
@ -208,7 +204,7 @@ async def _run_etl_extract(*, file_path: str, filename: str, vision_llm):
|
|||
|
||||
def _is_image_attachment(payload: NotePayload) -> bool:
|
||||
ext = payload.extension.lower().lstrip(".")
|
||||
return ext in {"png", "jpg", "jpeg", "gif", "webp", "bmp", "tiff", "svg"}
|
||||
return ext in {"png", "jpg", "jpeg", "gif", "webp", "svg"}
|
||||
|
||||
|
||||
async def _resolve_attachment_vision_llm(
|
||||
|
|
|
|||
|
|
@ -490,6 +490,7 @@ class TestWireContractSmoke:
|
|||
binary_note.extension = "png"
|
||||
binary_note.is_binary = True
|
||||
binary_note.binary_base64 = "aGVsbG8="
|
||||
binary_note.mime_type = "image/png"
|
||||
binary_note.content = ""
|
||||
|
||||
with (
|
||||
|
|
@ -517,3 +518,107 @@ class TestWireContractSmoke:
|
|||
assert statuses == {"ok.md": "ok", "image.png": "queued"}
|
||||
assert upsert_mock.await_count == 1
|
||||
queue_mock.assert_called_once()
|
||||
|
||||
async def test_sync_rejects_unsupported_attachment_extension(
|
||||
self, db_session: AsyncSession, db_user: User, db_search_space: SearchSpace
|
||||
):
|
||||
vault_id = str(uuid.uuid4())
|
||||
await obsidian_connect(
|
||||
ConnectRequest(
|
||||
vault_id=vault_id,
|
||||
vault_name="Reject Vault",
|
||||
search_space_id=db_search_space.id,
|
||||
vault_fingerprint="fp-" + uuid.uuid4().hex,
|
||||
),
|
||||
user=db_user,
|
||||
session=db_session,
|
||||
)
|
||||
|
||||
fake_doc = type("FakeDoc", (), {"id": 12345})()
|
||||
bad_note = _make_note_payload(vault_id, "photo.heic", "hash-heic")
|
||||
bad_note.extension = "heic"
|
||||
bad_note.is_binary = True
|
||||
bad_note.binary_base64 = "aGVsbG8="
|
||||
bad_note.mime_type = "image/heic"
|
||||
bad_note.content = ""
|
||||
|
||||
with (
|
||||
patch(
|
||||
"app.routes.obsidian_plugin_routes.upsert_note",
|
||||
new=AsyncMock(return_value=fake_doc),
|
||||
),
|
||||
patch("app.routes.obsidian_plugin_routes._queue_obsidian_attachment") as queue_mock,
|
||||
):
|
||||
sync_resp = await obsidian_sync(
|
||||
SyncBatchRequest(
|
||||
vault_id=vault_id,
|
||||
notes=[
|
||||
_make_note_payload(vault_id, "ok.md", "hash-ok"),
|
||||
bad_note,
|
||||
],
|
||||
),
|
||||
user=db_user,
|
||||
session=db_session,
|
||||
)
|
||||
|
||||
assert sync_resp.indexed == 1
|
||||
assert sync_resp.failed == 1
|
||||
items_by_path = {it.path: it for it in sync_resp.items}
|
||||
assert items_by_path["ok.md"].status == "ok"
|
||||
assert items_by_path["photo.heic"].status == "error"
|
||||
assert "unsupported attachment extension" in (
|
||||
items_by_path["photo.heic"].error or ""
|
||||
)
|
||||
queue_mock.assert_not_called()
|
||||
|
||||
async def test_sync_rejects_mime_extension_mismatch(
|
||||
self, db_session: AsyncSession, db_user: User, db_search_space: SearchSpace
|
||||
):
|
||||
vault_id = str(uuid.uuid4())
|
||||
await obsidian_connect(
|
||||
ConnectRequest(
|
||||
vault_id=vault_id,
|
||||
vault_name="Mismatch Vault",
|
||||
search_space_id=db_search_space.id,
|
||||
vault_fingerprint="fp-" + uuid.uuid4().hex,
|
||||
),
|
||||
user=db_user,
|
||||
session=db_session,
|
||||
)
|
||||
|
||||
fake_doc = type("FakeDoc", (), {"id": 12345})()
|
||||
mismatched = _make_note_payload(vault_id, "image.png", "hash-png")
|
||||
mismatched.extension = "png"
|
||||
mismatched.is_binary = True
|
||||
mismatched.binary_base64 = "aGVsbG8="
|
||||
mismatched.mime_type = "application/pdf"
|
||||
mismatched.content = ""
|
||||
|
||||
with (
|
||||
patch(
|
||||
"app.routes.obsidian_plugin_routes.upsert_note",
|
||||
new=AsyncMock(return_value=fake_doc),
|
||||
),
|
||||
patch("app.routes.obsidian_plugin_routes._queue_obsidian_attachment") as queue_mock,
|
||||
):
|
||||
sync_resp = await obsidian_sync(
|
||||
SyncBatchRequest(
|
||||
vault_id=vault_id,
|
||||
notes=[
|
||||
_make_note_payload(vault_id, "ok.md", "hash-ok"),
|
||||
mismatched,
|
||||
],
|
||||
),
|
||||
user=db_user,
|
||||
session=db_session,
|
||||
)
|
||||
|
||||
assert sync_resp.indexed == 1
|
||||
assert sync_resp.failed == 1
|
||||
items_by_path = {it.path: it for it in sync_resp.items}
|
||||
assert items_by_path["ok.md"].status == "ok"
|
||||
assert items_by_path["image.png"].status == "error"
|
||||
assert "does not match extension" in (
|
||||
items_by_path["image.png"].error or ""
|
||||
)
|
||||
queue_mock.assert_not_called()
|
||||
|
|
|
|||
|
|
@ -4,6 +4,7 @@ import base64
|
|||
from datetime import UTC, datetime
|
||||
|
||||
import pytest
|
||||
from pydantic import ValidationError
|
||||
|
||||
from app.etl_pipeline.etl_document import EtlResult
|
||||
from app.schemas.obsidian_plugin import HeadingRef, NotePayload
|
||||
|
|
@ -15,6 +16,9 @@ from app.services.obsidian_plugin_indexer import (
|
|||
)
|
||||
|
||||
|
||||
_FAKE_PNG_B64 = base64.b64encode(b"\x89PNG\r\n\x1a\n").decode("ascii")
|
||||
|
||||
|
||||
def test_build_metadata_serializes_headings_to_plain_json() -> None:
|
||||
now = datetime.now(UTC)
|
||||
payload = NotePayload(
|
||||
|
|
@ -46,6 +50,7 @@ def test_build_metadata_marks_binary_attachment_fields() -> None:
|
|||
mtime=now,
|
||||
ctime=now,
|
||||
is_binary=True,
|
||||
binary_base64=_FAKE_PNG_B64,
|
||||
mime_type="image/png",
|
||||
)
|
||||
|
||||
|
|
@ -69,6 +74,7 @@ async def test_extract_binary_attachment_markdown_handles_invalid_base64() -> No
|
|||
ctime=now,
|
||||
is_binary=True,
|
||||
binary_base64="not-valid-base64!!",
|
||||
mime_type="image/png",
|
||||
)
|
||||
|
||||
content, metadata = await _extract_binary_attachment_markdown(
|
||||
|
|
@ -93,6 +99,7 @@ async def test_extract_binary_attachment_markdown_uses_etl(monkeypatch) -> None:
|
|||
ctime=now,
|
||||
is_binary=True,
|
||||
binary_base64=base64.b64encode(b"%PDF-1.7 fake bytes").decode("ascii"),
|
||||
mime_type="application/pdf",
|
||||
)
|
||||
|
||||
async def _fake_run_etl_extract( # noqa: ANN001
|
||||
|
|
@ -133,6 +140,8 @@ def test_is_image_attachment_detects_image_extensions() -> None:
|
|||
mtime=now,
|
||||
ctime=now,
|
||||
is_binary=True,
|
||||
binary_base64=_FAKE_PNG_B64,
|
||||
mime_type="image/png",
|
||||
)
|
||||
pdf_payload = NotePayload(
|
||||
vault_id="vault-1",
|
||||
|
|
@ -144,12 +153,67 @@ def test_is_image_attachment_detects_image_extensions() -> None:
|
|||
mtime=now,
|
||||
ctime=now,
|
||||
is_binary=True,
|
||||
binary_base64=_FAKE_PNG_B64,
|
||||
mime_type="application/pdf",
|
||||
)
|
||||
|
||||
assert _is_image_attachment(image_payload) is True
|
||||
assert _is_image_attachment(pdf_payload) is False
|
||||
|
||||
|
||||
def test_note_payload_rejects_binary_without_base64() -> None:
|
||||
now = datetime.now(UTC)
|
||||
with pytest.raises(ValidationError, match="binary_base64 is required"):
|
||||
NotePayload(
|
||||
vault_id="vault-1",
|
||||
path="assets/diagram.png",
|
||||
name="diagram",
|
||||
extension="png",
|
||||
content="",
|
||||
content_hash="abc123",
|
||||
mtime=now,
|
||||
ctime=now,
|
||||
is_binary=True,
|
||||
mime_type="image/png",
|
||||
)
|
||||
|
||||
|
||||
def test_note_payload_rejects_binary_without_mime_type() -> None:
|
||||
now = datetime.now(UTC)
|
||||
with pytest.raises(ValidationError, match="mime_type is required"):
|
||||
NotePayload(
|
||||
vault_id="vault-1",
|
||||
path="assets/diagram.png",
|
||||
name="diagram",
|
||||
extension="png",
|
||||
content="",
|
||||
content_hash="abc123",
|
||||
mtime=now,
|
||||
ctime=now,
|
||||
is_binary=True,
|
||||
binary_base64=_FAKE_PNG_B64,
|
||||
)
|
||||
|
||||
|
||||
def test_note_payload_rejects_markdown_with_binary_fields() -> None:
|
||||
now = datetime.now(UTC)
|
||||
with pytest.raises(
|
||||
ValidationError,
|
||||
match="binary_base64 and mime_type must be omitted when is_binary is False",
|
||||
):
|
||||
NotePayload(
|
||||
vault_id="vault-1",
|
||||
path="notes.md",
|
||||
name="notes",
|
||||
extension="md",
|
||||
content="# Notes",
|
||||
content_hash="abc123",
|
||||
mtime=now,
|
||||
ctime=now,
|
||||
binary_base64=_FAKE_PNG_B64,
|
||||
)
|
||||
|
||||
|
||||
def test_require_extracted_attachment_content_rejects_empty_content() -> None:
|
||||
with pytest.raises(
|
||||
RuntimeError, match="Attachment extraction failed for assets/img.png"
|
||||
|
|
|
|||
|
|
@ -16,10 +16,10 @@ export class AttachmentsConfirmModal extends Modal {
|
|||
this.contentEl.empty();
|
||||
|
||||
new Setting(this.contentEl).setDesc(
|
||||
"Syncing attachments (images, PDFs, and other non-Markdown files) can make indexing slower, especially on large vaults.",
|
||||
"Syncing attachments (images & PDFs) can make indexing slower, especially on large vaults."
|
||||
);
|
||||
new Setting(this.contentEl).setDesc(
|
||||
"You can disable this anytime in settings if syncing becomes too slow.",
|
||||
"Syncing attachments can make indexing slower on large vaults. You can disable this anytime.",
|
||||
);
|
||||
|
||||
new Setting(this.contentEl)
|
||||
|
|
|
|||
|
|
@ -140,7 +140,7 @@ export class SurfSenseSettingTab extends PluginSettingTab {
|
|||
new Setting(containerEl)
|
||||
.setName("Sync interval")
|
||||
.setDesc(
|
||||
"How often to check for changes made outside Obsidian. Set to off to only sync manually.",
|
||||
"How often to check for changes made outside Obsidian.",
|
||||
)
|
||||
.addDropdown((drop) => {
|
||||
const options: Array<[number, string]> = [
|
||||
|
|
@ -205,7 +205,7 @@ export class SurfSenseSettingTab extends PluginSettingTab {
|
|||
new Setting(containerEl)
|
||||
.setName("Include attachments")
|
||||
.setDesc(
|
||||
"Also sync non-Markdown files such as images and PDFs.",
|
||||
"Also sync non-Markdown files such as images and PDFs. Other file types are skipped.",
|
||||
)
|
||||
.addToggle((toggle) =>
|
||||
toggle
|
||||
|
|
@ -231,7 +231,7 @@ export class SurfSenseSettingTab extends PluginSettingTab {
|
|||
new Setting(containerEl)
|
||||
.setName("Sync only on WiFi")
|
||||
.setDesc(
|
||||
"Pause automatic syncing on cellular. Note: only Android can detect network type — on iOS this toggle has no effect.",
|
||||
"Pause automatic syncing on cellular. Note: only Android can detect network type, on iOS this toggle has no effect.",
|
||||
)
|
||||
.addToggle((toggle) =>
|
||||
toggle
|
||||
|
|
|
|||
|
|
@ -393,7 +393,7 @@ export class SyncEngine {
|
|||
ctime: file.stat.ctime,
|
||||
is_binary: true,
|
||||
binary_base64: binaryBase64,
|
||||
mime_type: mimeTypeFromExtension(file.extension),
|
||||
mime_type: mimeTypeFor(file.extension),
|
||||
};
|
||||
}
|
||||
|
||||
|
|
@ -560,9 +560,10 @@ export class SyncEngine {
|
|||
|
||||
private shouldTrack(file: TAbstractFile): boolean {
|
||||
if (!isTFile(file)) return false;
|
||||
if (this.isMarkdown(file)) return true;
|
||||
const settings = this.deps.getSettings();
|
||||
if (!settings.includeAttachments && !this.isMarkdown(file)) return false;
|
||||
return true;
|
||||
if (!settings.includeAttachments) return false;
|
||||
return ALLOWED_ATTACHMENT_EXTENSIONS.has(file.extension.toLowerCase());
|
||||
}
|
||||
|
||||
private isExcluded(path: string, settings: SyncEngineSettings): boolean {
|
||||
|
|
@ -599,23 +600,29 @@ function arrayBufferToBase64(buf: ArrayBuffer): string {
|
|||
return btoa(binary);
|
||||
}
|
||||
|
||||
function mimeTypeFromExtension(extension: string): string | undefined {
|
||||
const ext = extension.toLowerCase();
|
||||
const mimeByExt: Record<string, string> = {
|
||||
pdf: "application/pdf",
|
||||
png: "image/png",
|
||||
jpg: "image/jpeg",
|
||||
jpeg: "image/jpeg",
|
||||
gif: "image/gif",
|
||||
webp: "image/webp",
|
||||
svg: "image/svg+xml",
|
||||
txt: "text/plain",
|
||||
csv: "text/csv",
|
||||
docx: "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
pptx: "application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
||||
xlsx: "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
||||
};
|
||||
return mimeByExt[ext];
|
||||
/** Source of truth for the attachment whitelist. Mirrors ATTACHMENT_MIME_TYPES on the backend. */
|
||||
export const MIME_BY_EXTENSION = {
|
||||
pdf: "application/pdf",
|
||||
png: "image/png",
|
||||
jpg: "image/jpeg",
|
||||
jpeg: "image/jpeg",
|
||||
gif: "image/gif",
|
||||
webp: "image/webp",
|
||||
svg: "image/svg+xml",
|
||||
txt: "text/plain",
|
||||
} as const satisfies Record<string, string>;
|
||||
|
||||
export const ALLOWED_ATTACHMENT_EXTENSIONS: ReadonlySet<string> = new Set(
|
||||
Object.keys(MIME_BY_EXTENSION),
|
||||
);
|
||||
|
||||
function mimeTypeFor(extension: string): string {
|
||||
const ext = extension.toLowerCase() as keyof typeof MIME_BY_EXTENSION;
|
||||
const mime = MIME_BY_EXTENSION[ext];
|
||||
if (!mime) {
|
||||
throw new Error(`Unsupported attachment extension: .${extension}`);
|
||||
}
|
||||
return mime;
|
||||
}
|
||||
|
||||
function formatRelative(ts: number): string {
|
||||
|
|
|
|||
|
|
@ -67,7 +67,7 @@ export interface RenameItem {
|
|||
|
||||
export type QueueItem = UpsertItem | DeleteItem | RenameItem;
|
||||
|
||||
export interface NotePayload {
|
||||
interface NotePayloadBase {
|
||||
vault_id: string;
|
||||
path: string;
|
||||
name: string;
|
||||
|
|
@ -85,15 +85,23 @@ export interface NotePayload {
|
|||
size: number;
|
||||
mtime: number;
|
||||
ctime: number;
|
||||
/** Non-markdown attachment marker; enables backend ETL path. */
|
||||
is_binary?: boolean;
|
||||
/** Base64-encoded file bytes for binary attachments. */
|
||||
binary_base64?: string;
|
||||
/** Optional MIME type hint for backend parsers. */
|
||||
mime_type?: string;
|
||||
[key: string]: unknown;
|
||||
}
|
||||
|
||||
export interface MarkdownNotePayload extends NotePayloadBase {
|
||||
is_binary?: false;
|
||||
}
|
||||
|
||||
export interface BinaryNotePayload extends NotePayloadBase {
|
||||
/** Non-markdown attachment marker; enables backend ETL path. */
|
||||
is_binary: true;
|
||||
/** Base64-encoded file bytes for binary attachments. */
|
||||
binary_base64: string;
|
||||
/** Canonical MIME type for the extension; required by the backend. */
|
||||
mime_type: string;
|
||||
}
|
||||
|
||||
export type NotePayload = MarkdownNotePayload | BinaryNotePayload;
|
||||
|
||||
export interface HeadingRef {
|
||||
heading: string;
|
||||
level: number;
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue