chore: linting

This commit is contained in:
DESKTOP-RTLN3BA\$punk 2026-06-17 22:31:36 -07:00
parent c6d42fc7c8
commit 55f91a29d5
16 changed files with 50 additions and 40 deletions

View file

@@ -68,7 +68,7 @@ def upgrade() -> None:
# has NULL last_login -> the migration is idempotent and resumable.
op.execute(f"DROP TABLE IF EXISTS {USER_SCRATCH};")
op.execute(
f'CREATE UNLOGGED TABLE {USER_SCRATCH} AS '
f"CREATE UNLOGGED TABLE {USER_SCRATCH} AS "
'SELECT id FROM "user" WHERE last_login IS NULL;'
)
op.execute(f"ALTER TABLE {USER_SCRATCH} ADD PRIMARY KEY (id);")

View file

@@ -87,11 +87,11 @@ def upgrade() -> None:
).scalar()
or 0
)
total_rows_display = f"~{total_rows:,}" if total_rows > 0 else "an unknown number of"
total_rows_display = (
f"~{total_rows:,}" if total_rows > 0 else "an unknown number of"
)
bounds = bind.execute(
sa.text("SELECT min(id), max(id) FROM chunks")
).one()
bounds = bind.execute(sa.text("SELECT min(id), max(id) FROM chunks")).one()
min_id, max_id = bounds[0], bounds[1]
if min_id is None:
@@ -167,9 +167,7 @@ def upgrade() -> None:
op.execute(f"DROP TABLE IF EXISTS {SCRATCH_TABLE};")
logger.info("creating index ix_chunks_position...")
op.execute(
"CREATE INDEX IF NOT EXISTS ix_chunks_position ON chunks(position);"
)
op.execute("CREATE INDEX IF NOT EXISTS ix_chunks_position ON chunks(position);")
logger.info("creating index ix_chunks_document_id_position...")
op.execute(
"CREATE INDEX IF NOT EXISTS ix_chunks_document_id_position "

View file

@@ -927,7 +927,9 @@ class Config:
AZURE_DI_KEY = os.getenv("AZURE_DI_KEY")
# ETL parse cache: reuse parser output for identical bytes across workspaces.
ETL_CACHE_ENABLED = os.getenv("ETL_CACHE_ENABLED", "false").strip().lower() == "true"
ETL_CACHE_ENABLED = (
os.getenv("ETL_CACHE_ENABLED", "false").strip().lower() == "true"
)
# Bump to invalidate every cached entry after a parser/behaviour change.
ETL_CACHE_PARSER_VERSION = int(os.getenv("ETL_CACHE_PARSER_VERSION", "1"))
ETL_CACHE_TTL_DAYS = int(os.getenv("ETL_CACHE_TTL_DAYS", "90"))
@@ -948,7 +950,9 @@ class Config:
os.getenv("EMBEDDING_CACHE_CHUNKER_VERSION", "1")
)
EMBEDDING_CACHE_TTL_DAYS = int(os.getenv("EMBEDDING_CACHE_TTL_DAYS", "90"))
EMBEDDING_CACHE_MAX_TOTAL_MB = int(os.getenv("EMBEDDING_CACHE_MAX_TOTAL_MB", "5120"))
EMBEDDING_CACHE_MAX_TOTAL_MB = int(
os.getenv("EMBEDDING_CACHE_MAX_TOTAL_MB", "5120")
)
EMBEDDING_CACHE_EVICTION_BATCH = int(
os.getenv("EMBEDDING_CACHE_EVICTION_BATCH", "500")
)

View file

@@ -20,9 +20,7 @@ logger = logging.getLogger(__name__)
_HASH_CHUNK = 1024 * 1024
async def extract_with_cache(
request: EtlRequest, *, vision_llm=None
) -> EtlResult:
async def extract_with_cache(request: EtlRequest, *, vision_llm=None) -> EtlResult:
"""Drop-in for ``EtlPipelineService.extract`` that reuses prior parser output."""
settings = load_etl_cache_settings()

View file

@@ -34,7 +34,9 @@ async def _evict() -> None:
index = CachedParseRepository(session)
cutoff = datetime.now(UTC) - timedelta(days=settings.ttl_days)
expired = await index.select_expired(cutoff=cutoff, limit=settings.eviction_batch)
expired = await index.select_expired(
cutoff=cutoff, limit=settings.eviction_batch
)
await _drop(index, store, expired, phase="ttl")
total = await index.total_size_bytes()

View file

@@ -38,7 +38,9 @@ def resolve_cache_backend() -> StorageBackend:
if backend == "local":
if not settings.storage_local_root:
raise ValueError("ETL_CACHE_STORAGE_LOCAL_PATH is required for local cache.")
raise ValueError(
"ETL_CACHE_STORAGE_LOCAL_PATH is required for local cache."
)
from app.file_storage.backends.local import LocalFileBackend
return LocalFileBackend(settings.storage_local_root)

View file

@@ -31,7 +31,9 @@ def serialize(embedding_set: EmbeddingSet) -> bytes:
for chunk in embedding_set.chunks:
vector = np.asarray(chunk.embedding, dtype=np.float32).reshape(-1)
if vector.shape[0] != dim:
raise ValueError("All vectors in an embedding set must share one dimension.")
raise ValueError(
"All vectors in an embedding set must share one dimension."
)
rows.append(vector)
texts.append(chunk.text)
@@ -67,5 +69,7 @@ def deserialize(blob: bytes) -> EmbeddingSet:
return EmbeddingSet(
summary_embedding=matrix[0],
chunks=[CachedChunk(text=texts[i], embedding=matrix[i + 1]) for i in range(count)],
chunks=[
CachedChunk(text=texts[i], embedding=matrix[i + 1]) for i in range(count)
],
)

View file

@@ -22,13 +22,13 @@ class EmbeddingCacheStore:
def backend_name(self) -> str:
return self._backend.backend_name
async def save(self, key: EmbeddingKey, embedding_set: EmbeddingSet) -> tuple[str, int]:
async def save(
self, key: EmbeddingKey, embedding_set: EmbeddingSet
) -> tuple[str, int]:
"""Persist the embedding set and return its storage key and byte size."""
blob = serialize(embedding_set)
storage_key = build_embedding_object_key(key)
await self._backend.put(
storage_key, blob, content_type=_EMBEDDING_CONTENT_TYPE
)
await self._backend.put(storage_key, blob, content_type=_EMBEDDING_CONTENT_TYPE)
return storage_key, len(blob)
async def load(self, storage_key: str) -> EmbeddingSet:

View file

@@ -86,8 +86,7 @@ async def get_editor_content(
size_bytes = len(md.encode("utf-8"))
line_count = md.count("\n") + 1
too_large = (
size_bytes > EDITOR_PLATE_MAX_BYTES
or line_count > EDITOR_PLATE_MAX_LINES
size_bytes > EDITOR_PLATE_MAX_BYTES or line_count > EDITOR_PLATE_MAX_LINES
)
viewer_mode = "monaco" if too_large else "plate"
return {

View file

@@ -66,9 +66,7 @@ async def test_identical_uploads_are_parsed_once_then_served_from_cache(
assert second.content_type == "application/pdf"
async def test_disabled_cache_parses_every_time(
tmp_path, monkeypatch, counting_parser
):
async def test_disabled_cache_parses_every_time(tmp_path, monkeypatch, counting_parser):
monkeypatch.setattr(config, "ETL_CACHE_ENABLED", False)
monkeypatch.setattr(config, "ETL_SERVICE", "LLAMACLOUD")

View file

@@ -18,9 +18,7 @@ pytestmark = pytest.mark.integration
def _key(sha: str) -> ParseKey:
return ParseKey.for_document(
sha, etl_service="LLAMACLOUD", mode="basic", version=1
)
return ParseKey.for_document(sha, etl_service="LLAMACLOUD", mode="basic", version=1)
async def _insert(repo, *, sha, size=100, storage_key=None):

View file

@@ -17,9 +17,7 @@ pytestmark = pytest.mark.integration
def _key(sha: str = "c" * 64) -> ParseKey:
return ParseKey.for_document(
sha, etl_service="LLAMACLOUD", mode="basic", version=1
)
return ParseKey.for_document(sha, etl_service="LLAMACLOUD", mode="basic", version=1)
async def test_recall_is_a_miss_for_an_unknown_key(db_session, cache_local_storage):

View file

@@ -20,9 +20,7 @@ pytestmark = pytest.mark.integration
def _key(sha: str) -> ParseKey:
return ParseKey.for_document(
sha, etl_service="LLAMACLOUD", mode="basic", version=1
)
return ParseKey.for_document(sha, etl_service="LLAMACLOUD", mode="basic", version=1)
def _result(markdown: str) -> EtlResult:
@@ -48,7 +46,9 @@ async def test_expired_entries_are_pruned(
monkeypatch, cache_local_storage, clean_cache_table
):
monkeypatch.setattr(config, "ETL_CACHE_ENABLED", True)
monkeypatch.setattr(config, "ETL_CACHE_TTL_DAYS", -1) # cutoff in the future -> stale
monkeypatch.setattr(
config, "ETL_CACHE_TTL_DAYS", -1
) # cutoff in the future -> stale
monkeypatch.setattr(config, "ETL_CACHE_MAX_TOTAL_MB", 10_000) # size phase no-op
key = _key("a" * 64)

View file

@@ -38,8 +38,12 @@ async def test_remembered_set_recalls_as_equivalent_vectors(
stored = EmbeddingSet(
summary_embedding=np.array([0.1, 0.2, 0.3, 0.4], dtype=np.float32),
chunks=[
CachedChunk("first chunk", np.array([1.0, 0.0, 0.0, 0.0], dtype=np.float32)),
CachedChunk("second chunk", np.array([0.0, 1.0, 0.0, 0.0], dtype=np.float32)),
CachedChunk(
"first chunk", np.array([1.0, 0.0, 0.0, 0.0], dtype=np.float32)
),
CachedChunk(
"second chunk", np.array([0.0, 1.0, 0.0, 0.0], dtype=np.float32)
),
],
)

View file

@@ -71,7 +71,10 @@ async def _make_connector(
connector_type=connector_type,
# A stored credential the indexer would use — the thing a cross-tenant
# index must never be able to abuse.
config={"GITHUB_PAT": "victim-secret-pat", "repo_full_names": ["octocat/Hello-World"]},
config={
"GITHUB_PAT": "victim-secret-pat",
"repo_full_names": ["octocat/Hello-World"],
},
is_indexable=True,
search_space_id=space.id,
user_id=owner.id,

View file

@@ -23,7 +23,9 @@ def test_round_trip_preserves_texts_and_vectors():
assert [c.text for c in restored.chunks] == [c.text for c in original.chunks]
assert restored.chunk_count == 3
assert np.allclose(restored.summary_embedding, original.summary_embedding, atol=1e-6)
assert np.allclose(
restored.summary_embedding, original.summary_embedding, atol=1e-6
)
for got, want in zip(restored.chunks, original.chunks, strict=True):
assert np.allclose(got.embedding, want.embedding, atol=1e-6)