diff --git a/surfsense_backend/alembic/versions/164_remove_inactive_users.py b/surfsense_backend/alembic/versions/164_remove_inactive_users.py index bacac7f4a..3ce23e204 100644 --- a/surfsense_backend/alembic/versions/164_remove_inactive_users.py +++ b/surfsense_backend/alembic/versions/164_remove_inactive_users.py @@ -68,7 +68,7 @@ def upgrade() -> None: # has NULL last_login -> the migration is idempotent and resumable. op.execute(f"DROP TABLE IF EXISTS {USER_SCRATCH};") op.execute( - f'CREATE UNLOGGED TABLE {USER_SCRATCH} AS ' + f"CREATE UNLOGGED TABLE {USER_SCRATCH} AS " 'SELECT id FROM "user" WHERE last_login IS NULL;' ) op.execute(f"ALTER TABLE {USER_SCRATCH} ADD PRIMARY KEY (id);") diff --git a/surfsense_backend/alembic/versions/165_add_chunk_position.py b/surfsense_backend/alembic/versions/165_add_chunk_position.py index 933663de8..f830170b5 100644 --- a/surfsense_backend/alembic/versions/165_add_chunk_position.py +++ b/surfsense_backend/alembic/versions/165_add_chunk_position.py @@ -87,11 +87,11 @@ def upgrade() -> None: ).scalar() or 0 ) - total_rows_display = f"~{total_rows:,}" if total_rows > 0 else "an unknown number of" + total_rows_display = ( + f"~{total_rows:,}" if total_rows > 0 else "an unknown number of" + ) - bounds = bind.execute( - sa.text("SELECT min(id), max(id) FROM chunks") - ).one() + bounds = bind.execute(sa.text("SELECT min(id), max(id) FROM chunks")).one() min_id, max_id = bounds[0], bounds[1] if min_id is None: @@ -167,9 +167,7 @@ def upgrade() -> None: op.execute(f"DROP TABLE IF EXISTS {SCRATCH_TABLE};") logger.info("creating index ix_chunks_position...") - op.execute( - "CREATE INDEX IF NOT EXISTS ix_chunks_position ON chunks(position);" - ) + op.execute("CREATE INDEX IF NOT EXISTS ix_chunks_position ON chunks(position);") logger.info("creating index ix_chunks_document_id_position...") op.execute( "CREATE INDEX IF NOT EXISTS ix_chunks_document_id_position " diff --git a/surfsense_backend/app/config/__init__.py b/surfsense_backend/app/config/__init__.py index 90182fc1e..9f834dd9d 100644 --- a/surfsense_backend/app/config/__init__.py +++ b/surfsense_backend/app/config/__init__.py @@ -927,7 +927,9 @@ class Config: AZURE_DI_KEY = os.getenv("AZURE_DI_KEY") # ETL parse cache: reuse parser output for identical bytes across workspaces. - ETL_CACHE_ENABLED = os.getenv("ETL_CACHE_ENABLED", "false").strip().lower() == "true" + ETL_CACHE_ENABLED = ( + os.getenv("ETL_CACHE_ENABLED", "false").strip().lower() == "true" + ) # Bump to invalidate every cached entry after a parser/behaviour change. ETL_CACHE_PARSER_VERSION = int(os.getenv("ETL_CACHE_PARSER_VERSION", "1")) ETL_CACHE_TTL_DAYS = int(os.getenv("ETL_CACHE_TTL_DAYS", "90")) @@ -948,7 +950,9 @@ class Config: os.getenv("EMBEDDING_CACHE_CHUNKER_VERSION", "1") ) EMBEDDING_CACHE_TTL_DAYS = int(os.getenv("EMBEDDING_CACHE_TTL_DAYS", "90")) - EMBEDDING_CACHE_MAX_TOTAL_MB = int(os.getenv("EMBEDDING_CACHE_MAX_TOTAL_MB", "5120")) + EMBEDDING_CACHE_MAX_TOTAL_MB = int( + os.getenv("EMBEDDING_CACHE_MAX_TOTAL_MB", "5120") + ) EMBEDDING_CACHE_EVICTION_BATCH = int( os.getenv("EMBEDDING_CACHE_EVICTION_BATCH", "500") ) diff --git a/surfsense_backend/app/etl_pipeline/cache/cached_extraction.py b/surfsense_backend/app/etl_pipeline/cache/cached_extraction.py index b6a9e5531..de4186b69 100644 --- a/surfsense_backend/app/etl_pipeline/cache/cached_extraction.py +++ b/surfsense_backend/app/etl_pipeline/cache/cached_extraction.py @@ -20,9 +20,7 @@ logger = logging.getLogger(__name__) _HASH_CHUNK = 1024 * 1024 -async def extract_with_cache( - request: EtlRequest, *, vision_llm=None -) -> EtlResult: +async def extract_with_cache(request: EtlRequest, *, vision_llm=None) -> EtlResult: """Drop-in for ``EtlPipelineService.extract`` that reuses prior parser output.""" settings = load_etl_cache_settings() diff --git a/surfsense_backend/app/etl_pipeline/cache/eviction/task.py b/surfsense_backend/app/etl_pipeline/cache/eviction/task.py index dcda10f61..61433f8a7 100644 --- a/surfsense_backend/app/etl_pipeline/cache/eviction/task.py +++ b/surfsense_backend/app/etl_pipeline/cache/eviction/task.py @@ -34,7 +34,9 @@ async def _evict() -> None: index = CachedParseRepository(session) cutoff = datetime.now(UTC) - timedelta(days=settings.ttl_days) - expired = await index.select_expired(cutoff=cutoff, limit=settings.eviction_batch) + expired = await index.select_expired( + cutoff=cutoff, limit=settings.eviction_batch + ) await _drop(index, store, expired, phase="ttl") total = await index.total_size_bytes() diff --git a/surfsense_backend/app/etl_pipeline/cache/storage/backend.py b/surfsense_backend/app/etl_pipeline/cache/storage/backend.py index ac7501984..4f68ac0d3 100644 --- a/surfsense_backend/app/etl_pipeline/cache/storage/backend.py +++ b/surfsense_backend/app/etl_pipeline/cache/storage/backend.py @@ -38,7 +38,9 @@ def resolve_cache_backend() -> StorageBackend: if backend == "local": if not settings.storage_local_root: - raise ValueError("ETL_CACHE_STORAGE_LOCAL_PATH is required for local cache.") + raise ValueError( + "ETL_CACHE_STORAGE_LOCAL_PATH is required for local cache." + ) from app.file_storage.backends.local import LocalFileBackend return LocalFileBackend(settings.storage_local_root) diff --git a/surfsense_backend/app/indexing_pipeline/cache/serialization.py b/surfsense_backend/app/indexing_pipeline/cache/serialization.py index f9d53b471..fde0acd00 100644 --- a/surfsense_backend/app/indexing_pipeline/cache/serialization.py +++ b/surfsense_backend/app/indexing_pipeline/cache/serialization.py @@ -31,7 +31,9 @@ def serialize(embedding_set: EmbeddingSet) -> bytes: for chunk in embedding_set.chunks: vector = np.asarray(chunk.embedding, dtype=np.float32).reshape(-1) if vector.shape[0] != dim: - raise ValueError("All vectors in an embedding set must share one dimension.") + raise ValueError( + "All vectors in an embedding set must share one dimension." + ) rows.append(vector) texts.append(chunk.text) @@ -67,5 +69,7 @@ def deserialize(blob: bytes) -> EmbeddingSet: return EmbeddingSet( summary_embedding=matrix[0], - chunks=[CachedChunk(text=texts[i], embedding=matrix[i + 1]) for i in range(count)], + chunks=[ + CachedChunk(text=texts[i], embedding=matrix[i + 1]) for i in range(count) + ], ) diff --git a/surfsense_backend/app/indexing_pipeline/cache/storage/embedding_store.py b/surfsense_backend/app/indexing_pipeline/cache/storage/embedding_store.py index 58c4a6cc1..7b0329b4e 100644 --- a/surfsense_backend/app/indexing_pipeline/cache/storage/embedding_store.py +++ b/surfsense_backend/app/indexing_pipeline/cache/storage/embedding_store.py @@ -22,13 +22,13 @@ class EmbeddingCacheStore: def backend_name(self) -> str: return self._backend.backend_name - async def save(self, key: EmbeddingKey, embedding_set: EmbeddingSet) -> tuple[str, int]: + async def save( + self, key: EmbeddingKey, embedding_set: EmbeddingSet + ) -> tuple[str, int]: """Persist the embedding set and return its storage key and byte size.""" blob = serialize(embedding_set) storage_key = build_embedding_object_key(key) - await self._backend.put( - storage_key, blob, content_type=_EMBEDDING_CONTENT_TYPE - ) + await self._backend.put(storage_key, blob, content_type=_EMBEDDING_CONTENT_TYPE) return storage_key, len(blob) async def load(self, storage_key: str) -> EmbeddingSet: diff --git a/surfsense_backend/app/routes/editor_routes.py b/surfsense_backend/app/routes/editor_routes.py index 5ab8c931c..8250fff98 100644 --- a/surfsense_backend/app/routes/editor_routes.py +++ b/surfsense_backend/app/routes/editor_routes.py @@ -86,8 +86,7 @@ async def get_editor_content( size_bytes = len(md.encode("utf-8")) line_count = md.count("\n") + 1 too_large = ( - size_bytes > EDITOR_PLATE_MAX_BYTES - or line_count > EDITOR_PLATE_MAX_LINES + size_bytes > EDITOR_PLATE_MAX_BYTES or line_count > EDITOR_PLATE_MAX_LINES ) viewer_mode = "monaco" if too_large else "plate" return { diff --git a/surfsense_backend/tests/integration/etl_pipeline/cache/test_cached_extraction.py b/surfsense_backend/tests/integration/etl_pipeline/cache/test_cached_extraction.py index 0b4a3dcf0..f9acd02d5 100644 --- a/surfsense_backend/tests/integration/etl_pipeline/cache/test_cached_extraction.py +++ b/surfsense_backend/tests/integration/etl_pipeline/cache/test_cached_extraction.py @@ -66,9 +66,7 @@ async def test_identical_uploads_are_parsed_once_then_served_from_cache( assert second.content_type == "application/pdf" -async def test_disabled_cache_parses_every_time( - tmp_path, monkeypatch, counting_parser -): +async def test_disabled_cache_parses_every_time(tmp_path, monkeypatch, counting_parser): monkeypatch.setattr(config, "ETL_CACHE_ENABLED", False) monkeypatch.setattr(config, "ETL_SERVICE", "LLAMACLOUD") diff --git a/surfsense_backend/tests/integration/etl_pipeline/cache/test_cached_parse_repository.py b/surfsense_backend/tests/integration/etl_pipeline/cache/test_cached_parse_repository.py index 72e977f11..4665c44c8 100644 --- a/surfsense_backend/tests/integration/etl_pipeline/cache/test_cached_parse_repository.py +++ b/surfsense_backend/tests/integration/etl_pipeline/cache/test_cached_parse_repository.py @@ -18,9 +18,7 @@ pytestmark = pytest.mark.integration def _key(sha: str) -> ParseKey: - return ParseKey.for_document( - sha, etl_service="LLAMACLOUD", mode="basic", version=1 - ) + return ParseKey.for_document(sha, etl_service="LLAMACLOUD", mode="basic", version=1) async def _insert(repo, *, sha, size=100, storage_key=None): diff --git a/surfsense_backend/tests/integration/etl_pipeline/cache/test_etl_cache_service.py b/surfsense_backend/tests/integration/etl_pipeline/cache/test_etl_cache_service.py index df74c97d4..e6041d63e 100644 --- a/surfsense_backend/tests/integration/etl_pipeline/cache/test_etl_cache_service.py +++ b/surfsense_backend/tests/integration/etl_pipeline/cache/test_etl_cache_service.py @@ -17,9 +17,7 @@ pytestmark = pytest.mark.integration def _key(sha: str = "c" * 64) -> ParseKey: - return ParseKey.for_document( - sha, etl_service="LLAMACLOUD", mode="basic", version=1 - ) + return ParseKey.for_document(sha, etl_service="LLAMACLOUD", mode="basic", version=1) async def test_recall_is_a_miss_for_an_unknown_key(db_session, cache_local_storage): diff --git a/surfsense_backend/tests/integration/etl_pipeline/cache/test_eviction_task.py b/surfsense_backend/tests/integration/etl_pipeline/cache/test_eviction_task.py index e25cfaef0..939ac74a5 100644 --- a/surfsense_backend/tests/integration/etl_pipeline/cache/test_eviction_task.py +++ b/surfsense_backend/tests/integration/etl_pipeline/cache/test_eviction_task.py @@ -20,9 +20,7 @@ pytestmark = pytest.mark.integration def _key(sha: str) -> ParseKey: - return ParseKey.for_document( - sha, etl_service="LLAMACLOUD", mode="basic", version=1 - ) + return ParseKey.for_document(sha, etl_service="LLAMACLOUD", mode="basic", version=1) def _result(markdown: str) -> EtlResult: @@ -48,7 +46,9 @@ async def test_expired_entries_are_pruned( monkeypatch, cache_local_storage, clean_cache_table ): monkeypatch.setattr(config, "ETL_CACHE_ENABLED", True) - monkeypatch.setattr(config, "ETL_CACHE_TTL_DAYS", -1) # cutoff in the future -> stale + monkeypatch.setattr( + config, "ETL_CACHE_TTL_DAYS", -1 + ) # cutoff in the future -> stale monkeypatch.setattr(config, "ETL_CACHE_MAX_TOTAL_MB", 10_000) # size phase no-op key = _key("a" * 64) diff --git a/surfsense_backend/tests/integration/indexing_pipeline/cache/test_embedding_cache_service.py b/surfsense_backend/tests/integration/indexing_pipeline/cache/test_embedding_cache_service.py index 2f4cd4a89..548208131 100644 --- a/surfsense_backend/tests/integration/indexing_pipeline/cache/test_embedding_cache_service.py +++ b/surfsense_backend/tests/integration/indexing_pipeline/cache/test_embedding_cache_service.py @@ -38,8 +38,12 @@ async def test_remembered_set_recalls_as_equivalent_vectors( stored = EmbeddingSet( summary_embedding=np.array([0.1, 0.2, 0.3, 0.4], dtype=np.float32), chunks=[ - CachedChunk("first chunk", np.array([1.0, 0.0, 0.0, 0.0], dtype=np.float32)), - CachedChunk("second chunk", np.array([0.0, 1.0, 0.0, 0.0], dtype=np.float32)), + CachedChunk( + "first chunk", np.array([1.0, 0.0, 0.0, 0.0], dtype=np.float32) + ), + CachedChunk( + "second chunk", np.array([0.0, 1.0, 0.0, 0.0], dtype=np.float32) + ), ], ) diff --git a/surfsense_backend/tests/integration/test_connector_index_authz.py b/surfsense_backend/tests/integration/test_connector_index_authz.py index 906dea8f9..cea2407cc 100644 --- a/surfsense_backend/tests/integration/test_connector_index_authz.py +++ b/surfsense_backend/tests/integration/test_connector_index_authz.py @@ -71,7 +71,10 @@ async def _make_connector( connector_type=connector_type, # A stored credential the indexer would use — the thing a cross-tenant # index must never be able to abuse. - config={"GITHUB_PAT": "victim-secret-pat", "repo_full_names": ["octocat/Hello-World"]}, + config={ + "GITHUB_PAT": "victim-secret-pat", + "repo_full_names": ["octocat/Hello-World"], + }, is_indexable=True, search_space_id=space.id, user_id=owner.id, diff --git a/surfsense_backend/tests/unit/indexing_pipeline/cache/test_serialization.py b/surfsense_backend/tests/unit/indexing_pipeline/cache/test_serialization.py index 8db87bf1b..f8cff6355 100644 --- a/surfsense_backend/tests/unit/indexing_pipeline/cache/test_serialization.py +++ b/surfsense_backend/tests/unit/indexing_pipeline/cache/test_serialization.py @@ -23,7 +23,9 @@ def test_round_trip_preserves_texts_and_vectors(): assert [c.text for c in restored.chunks] == [c.text for c in original.chunks] assert restored.chunk_count == 3 - assert np.allclose(restored.summary_embedding, original.summary_embedding, atol=1e-6) + assert np.allclose( + restored.summary_embedding, original.summary_embedding, atol=1e-6 + ) for got, want in zip(restored.chunks, original.chunks, strict=True): assert np.allclose(got.embedding, want.embedding, atol=1e-6)