mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-18 21:15:16 +02:00
chore: linting
This commit is contained in:
parent
c6d42fc7c8
commit
55f91a29d5
16 changed files with 50 additions and 40 deletions
|
|
@ -68,7 +68,7 @@ def upgrade() -> None:
|
|||
# has NULL last_login -> the migration is idempotent and resumable.
|
||||
op.execute(f"DROP TABLE IF EXISTS {USER_SCRATCH};")
|
||||
op.execute(
|
||||
f'CREATE UNLOGGED TABLE {USER_SCRATCH} AS '
|
||||
f"CREATE UNLOGGED TABLE {USER_SCRATCH} AS "
|
||||
'SELECT id FROM "user" WHERE last_login IS NULL;'
|
||||
)
|
||||
op.execute(f"ALTER TABLE {USER_SCRATCH} ADD PRIMARY KEY (id);")
|
||||
|
|
|
|||
|
|
@ -87,11 +87,11 @@ def upgrade() -> None:
|
|||
).scalar()
|
||||
or 0
|
||||
)
|
||||
total_rows_display = f"~{total_rows:,}" if total_rows > 0 else "an unknown number of"
|
||||
total_rows_display = (
|
||||
f"~{total_rows:,}" if total_rows > 0 else "an unknown number of"
|
||||
)
|
||||
|
||||
bounds = bind.execute(
|
||||
sa.text("SELECT min(id), max(id) FROM chunks")
|
||||
).one()
|
||||
bounds = bind.execute(sa.text("SELECT min(id), max(id) FROM chunks")).one()
|
||||
min_id, max_id = bounds[0], bounds[1]
|
||||
|
||||
if min_id is None:
|
||||
|
|
@ -167,9 +167,7 @@ def upgrade() -> None:
|
|||
op.execute(f"DROP TABLE IF EXISTS {SCRATCH_TABLE};")
|
||||
|
||||
logger.info("creating index ix_chunks_position...")
|
||||
op.execute(
|
||||
"CREATE INDEX IF NOT EXISTS ix_chunks_position ON chunks(position);"
|
||||
)
|
||||
op.execute("CREATE INDEX IF NOT EXISTS ix_chunks_position ON chunks(position);")
|
||||
logger.info("creating index ix_chunks_document_id_position...")
|
||||
op.execute(
|
||||
"CREATE INDEX IF NOT EXISTS ix_chunks_document_id_position "
|
||||
|
|
|
|||
|
|
@ -927,7 +927,9 @@ class Config:
|
|||
AZURE_DI_KEY = os.getenv("AZURE_DI_KEY")
|
||||
|
||||
# ETL parse cache: reuse parser output for identical bytes across workspaces.
|
||||
ETL_CACHE_ENABLED = os.getenv("ETL_CACHE_ENABLED", "false").strip().lower() == "true"
|
||||
ETL_CACHE_ENABLED = (
|
||||
os.getenv("ETL_CACHE_ENABLED", "false").strip().lower() == "true"
|
||||
)
|
||||
# Bump to invalidate every cached entry after a parser/behaviour change.
|
||||
ETL_CACHE_PARSER_VERSION = int(os.getenv("ETL_CACHE_PARSER_VERSION", "1"))
|
||||
ETL_CACHE_TTL_DAYS = int(os.getenv("ETL_CACHE_TTL_DAYS", "90"))
|
||||
|
|
@ -948,7 +950,9 @@ class Config:
|
|||
os.getenv("EMBEDDING_CACHE_CHUNKER_VERSION", "1")
|
||||
)
|
||||
EMBEDDING_CACHE_TTL_DAYS = int(os.getenv("EMBEDDING_CACHE_TTL_DAYS", "90"))
|
||||
EMBEDDING_CACHE_MAX_TOTAL_MB = int(os.getenv("EMBEDDING_CACHE_MAX_TOTAL_MB", "5120"))
|
||||
EMBEDDING_CACHE_MAX_TOTAL_MB = int(
|
||||
os.getenv("EMBEDDING_CACHE_MAX_TOTAL_MB", "5120")
|
||||
)
|
||||
EMBEDDING_CACHE_EVICTION_BATCH = int(
|
||||
os.getenv("EMBEDDING_CACHE_EVICTION_BATCH", "500")
|
||||
)
|
||||
|
|
|
|||
|
|
@ -20,9 +20,7 @@ logger = logging.getLogger(__name__)
|
|||
_HASH_CHUNK = 1024 * 1024
|
||||
|
||||
|
||||
async def extract_with_cache(
|
||||
request: EtlRequest, *, vision_llm=None
|
||||
) -> EtlResult:
|
||||
async def extract_with_cache(request: EtlRequest, *, vision_llm=None) -> EtlResult:
|
||||
"""Drop-in for ``EtlPipelineService.extract`` that reuses prior parser output."""
|
||||
settings = load_etl_cache_settings()
|
||||
|
||||
|
|
|
|||
|
|
@ -34,7 +34,9 @@ async def _evict() -> None:
|
|||
index = CachedParseRepository(session)
|
||||
|
||||
cutoff = datetime.now(UTC) - timedelta(days=settings.ttl_days)
|
||||
expired = await index.select_expired(cutoff=cutoff, limit=settings.eviction_batch)
|
||||
expired = await index.select_expired(
|
||||
cutoff=cutoff, limit=settings.eviction_batch
|
||||
)
|
||||
await _drop(index, store, expired, phase="ttl")
|
||||
|
||||
total = await index.total_size_bytes()
|
||||
|
|
|
|||
|
|
@ -38,7 +38,9 @@ def resolve_cache_backend() -> StorageBackend:
|
|||
|
||||
if backend == "local":
|
||||
if not settings.storage_local_root:
|
||||
raise ValueError("ETL_CACHE_STORAGE_LOCAL_PATH is required for local cache.")
|
||||
raise ValueError(
|
||||
"ETL_CACHE_STORAGE_LOCAL_PATH is required for local cache."
|
||||
)
|
||||
from app.file_storage.backends.local import LocalFileBackend
|
||||
|
||||
return LocalFileBackend(settings.storage_local_root)
|
||||
|
|
|
|||
|
|
@ -31,7 +31,9 @@ def serialize(embedding_set: EmbeddingSet) -> bytes:
|
|||
for chunk in embedding_set.chunks:
|
||||
vector = np.asarray(chunk.embedding, dtype=np.float32).reshape(-1)
|
||||
if vector.shape[0] != dim:
|
||||
raise ValueError("All vectors in an embedding set must share one dimension.")
|
||||
raise ValueError(
|
||||
"All vectors in an embedding set must share one dimension."
|
||||
)
|
||||
rows.append(vector)
|
||||
texts.append(chunk.text)
|
||||
|
||||
|
|
@ -67,5 +69,7 @@ def deserialize(blob: bytes) -> EmbeddingSet:
|
|||
|
||||
return EmbeddingSet(
|
||||
summary_embedding=matrix[0],
|
||||
chunks=[CachedChunk(text=texts[i], embedding=matrix[i + 1]) for i in range(count)],
|
||||
chunks=[
|
||||
CachedChunk(text=texts[i], embedding=matrix[i + 1]) for i in range(count)
|
||||
],
|
||||
)
|
||||
|
|
|
|||
|
|
@ -22,13 +22,13 @@ class EmbeddingCacheStore:
|
|||
def backend_name(self) -> str:
|
||||
return self._backend.backend_name
|
||||
|
||||
async def save(self, key: EmbeddingKey, embedding_set: EmbeddingSet) -> tuple[str, int]:
|
||||
async def save(
|
||||
self, key: EmbeddingKey, embedding_set: EmbeddingSet
|
||||
) -> tuple[str, int]:
|
||||
"""Persist the embedding set and return its storage key and byte size."""
|
||||
blob = serialize(embedding_set)
|
||||
storage_key = build_embedding_object_key(key)
|
||||
await self._backend.put(
|
||||
storage_key, blob, content_type=_EMBEDDING_CONTENT_TYPE
|
||||
)
|
||||
await self._backend.put(storage_key, blob, content_type=_EMBEDDING_CONTENT_TYPE)
|
||||
return storage_key, len(blob)
|
||||
|
||||
async def load(self, storage_key: str) -> EmbeddingSet:
|
||||
|
|
|
|||
|
|
@ -86,8 +86,7 @@ async def get_editor_content(
|
|||
size_bytes = len(md.encode("utf-8"))
|
||||
line_count = md.count("\n") + 1
|
||||
too_large = (
|
||||
size_bytes > EDITOR_PLATE_MAX_BYTES
|
||||
or line_count > EDITOR_PLATE_MAX_LINES
|
||||
size_bytes > EDITOR_PLATE_MAX_BYTES or line_count > EDITOR_PLATE_MAX_LINES
|
||||
)
|
||||
viewer_mode = "monaco" if too_large else "plate"
|
||||
return {
|
||||
|
|
|
|||
|
|
@ -66,9 +66,7 @@ async def test_identical_uploads_are_parsed_once_then_served_from_cache(
|
|||
assert second.content_type == "application/pdf"
|
||||
|
||||
|
||||
async def test_disabled_cache_parses_every_time(
|
||||
tmp_path, monkeypatch, counting_parser
|
||||
):
|
||||
async def test_disabled_cache_parses_every_time(tmp_path, monkeypatch, counting_parser):
|
||||
monkeypatch.setattr(config, "ETL_CACHE_ENABLED", False)
|
||||
monkeypatch.setattr(config, "ETL_SERVICE", "LLAMACLOUD")
|
||||
|
||||
|
|
|
|||
|
|
@ -18,9 +18,7 @@ pytestmark = pytest.mark.integration
|
|||
|
||||
|
||||
def _key(sha: str) -> ParseKey:
|
||||
return ParseKey.for_document(
|
||||
sha, etl_service="LLAMACLOUD", mode="basic", version=1
|
||||
)
|
||||
return ParseKey.for_document(sha, etl_service="LLAMACLOUD", mode="basic", version=1)
|
||||
|
||||
|
||||
async def _insert(repo, *, sha, size=100, storage_key=None):
|
||||
|
|
|
|||
|
|
@ -17,9 +17,7 @@ pytestmark = pytest.mark.integration
|
|||
|
||||
|
||||
def _key(sha: str = "c" * 64) -> ParseKey:
|
||||
return ParseKey.for_document(
|
||||
sha, etl_service="LLAMACLOUD", mode="basic", version=1
|
||||
)
|
||||
return ParseKey.for_document(sha, etl_service="LLAMACLOUD", mode="basic", version=1)
|
||||
|
||||
|
||||
async def test_recall_is_a_miss_for_an_unknown_key(db_session, cache_local_storage):
|
||||
|
|
|
|||
|
|
@ -20,9 +20,7 @@ pytestmark = pytest.mark.integration
|
|||
|
||||
|
||||
def _key(sha: str) -> ParseKey:
|
||||
return ParseKey.for_document(
|
||||
sha, etl_service="LLAMACLOUD", mode="basic", version=1
|
||||
)
|
||||
return ParseKey.for_document(sha, etl_service="LLAMACLOUD", mode="basic", version=1)
|
||||
|
||||
|
||||
def _result(markdown: str) -> EtlResult:
|
||||
|
|
@ -48,7 +46,9 @@ async def test_expired_entries_are_pruned(
|
|||
monkeypatch, cache_local_storage, clean_cache_table
|
||||
):
|
||||
monkeypatch.setattr(config, "ETL_CACHE_ENABLED", True)
|
||||
monkeypatch.setattr(config, "ETL_CACHE_TTL_DAYS", -1) # cutoff in the future -> stale
|
||||
monkeypatch.setattr(
|
||||
config, "ETL_CACHE_TTL_DAYS", -1
|
||||
) # cutoff in the future -> stale
|
||||
monkeypatch.setattr(config, "ETL_CACHE_MAX_TOTAL_MB", 10_000) # size phase no-op
|
||||
|
||||
key = _key("a" * 64)
|
||||
|
|
|
|||
|
|
@ -38,8 +38,12 @@ async def test_remembered_set_recalls_as_equivalent_vectors(
|
|||
stored = EmbeddingSet(
|
||||
summary_embedding=np.array([0.1, 0.2, 0.3, 0.4], dtype=np.float32),
|
||||
chunks=[
|
||||
CachedChunk("first chunk", np.array([1.0, 0.0, 0.0, 0.0], dtype=np.float32)),
|
||||
CachedChunk("second chunk", np.array([0.0, 1.0, 0.0, 0.0], dtype=np.float32)),
|
||||
CachedChunk(
|
||||
"first chunk", np.array([1.0, 0.0, 0.0, 0.0], dtype=np.float32)
|
||||
),
|
||||
CachedChunk(
|
||||
"second chunk", np.array([0.0, 1.0, 0.0, 0.0], dtype=np.float32)
|
||||
),
|
||||
],
|
||||
)
|
||||
|
||||
|
|
|
|||
|
|
@ -71,7 +71,10 @@ async def _make_connector(
|
|||
connector_type=connector_type,
|
||||
# A stored credential the indexer would use — the thing a cross-tenant
|
||||
# index must never be able to abuse.
|
||||
config={"GITHUB_PAT": "victim-secret-pat", "repo_full_names": ["octocat/Hello-World"]},
|
||||
config={
|
||||
"GITHUB_PAT": "victim-secret-pat",
|
||||
"repo_full_names": ["octocat/Hello-World"],
|
||||
},
|
||||
is_indexable=True,
|
||||
search_space_id=space.id,
|
||||
user_id=owner.id,
|
||||
|
|
|
|||
|
|
@ -23,7 +23,9 @@ def test_round_trip_preserves_texts_and_vectors():
|
|||
|
||||
assert [c.text for c in restored.chunks] == [c.text for c in original.chunks]
|
||||
assert restored.chunk_count == 3
|
||||
assert np.allclose(restored.summary_embedding, original.summary_embedding, atol=1e-6)
|
||||
assert np.allclose(
|
||||
restored.summary_embedding, original.summary_embedding, atol=1e-6
|
||||
)
|
||||
for got, want in zip(restored.chunks, original.chunks, strict=True):
|
||||
assert np.allclose(got.embedding, want.embedding, atol=1e-6)
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue