Co-Authored-By: Claude <noreply@anthropic.com> Co-Authored-By: XNLLLLH <XNLLLLH@users.noreply.github.com>
270 lines
11 KiB
Python
270 lines
11 KiB
Python
"""Plan 05-10 — asyncio-backed coalescing write queue for LanceDB.
|
|
|
|
Motivation (from 05-08 diagnosis + 05-10 plan): each synchronous
|
|
``tbl.add([row])`` call against a LanceDB table allocates roughly
|
|
~0.3 MB of pyarrow working-set overhead that is sub-linear per call
|
|
but linear in call count. Seeding the store record-by-record (one
|
|
call per record) drives peak RSS to ~1.3 GB at N=5k. This module
|
|
coalesces inserts inside a 100 ms window (or ``max_batch`` records,
|
|
whichever fires first) and forwards them as a single ``await
|
|
tbl.add(batch)`` call. At N=10k with max_batch=128 the buffer
|
|
overhead drops from ~3 GB (10000 * 0.3 MB) to ~24 MB (79 * 0.3 MB).
|
|
|
|
Contract (see ``tests/test_write_queue.py`` for the machine-checked
|
|
version):
|
|
|
|
- ``enqueue(record)`` returns an ``asyncio.Future`` that resolves
|
|
only after the record's batch has landed on disk. Callers that
|
|
want sync-equivalent durability **must** await the future.
|
|
- A single ``tbl.add(batch)`` call carries all records coalesced
|
|
inside one window, up to ``max_batch``.
|
|
- ``stop()`` drains pending records and flushes them synchronously
|
|
before returning. Enqueues after ``stop()`` raise ``RuntimeError``.
|
|
- Back-pressure: when the buffer is already at ``max_queue_size``
|
|
the next ``enqueue()`` awaits the next flush before accepting —
|
|
never unbounded memory growth.
|
|
- Flush failures propagate: if ``tbl.add(batch)`` raises, every
|
|
pending Future in that batch resolves with that exception. The
|
|
queue itself stays running so subsequent enqueues still work.
|
|
- ``on_flushed(batch)`` (optional) fires once per successful flush,
|
|
synchronously inside the loop, **before** futures are resolved.
|
|
The callback receives the exact list of records in the order
|
|
they were flushed — use this to mirror writes to a secondary
|
|
index (Plan 05-12 runtime-graph hook).
|
|
|
|
Constitutional invariants:
|
|
- C3 (no paid-API): pure stdlib + a LanceDB async table handle.
|
|
- C6 (LanceDB authoritative): nothing in this module short-circuits
|
|
the write; ``tbl.add(batch)`` is the only persistence path.
|
|
- (no drift): a resolved Future means the batch reached
|
|
disk. An exception means no Future in that batch reached disk;
|
|
the caller is expected to retry or surface the error.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
from typing import Any, Callable, Optional
|
|
|
|
__all__ = ["AsyncWriteQueue"]
|
|
|
|
|
|
class AsyncWriteQueue:
|
|
"""Coalescing write queue on top of a LanceDB AsyncTable.
|
|
|
|
The table object only needs to expose ``await add(batch)`` — the
|
|
tests ship a minimal ``MockAsyncTable`` that satisfies this shape.
|
|
|
|
Parameters
|
|
----------
|
|
table
|
|
LanceDB ``AsyncTable`` (or any object with ``async def
|
|
add(self, batch: list[dict]) -> None``).
|
|
coalesce_ms
|
|
Flush window in milliseconds. On every iteration of the
|
|
coalesce loop we wait at most this long for the next record
|
|
before flushing whatever we have.
|
|
max_batch
|
|
Hard cap on records per ``tbl.add`` call. Reached before the
|
|
timeout, triggers an immediate flush.
|
|
max_queue_size
|
|
Hard cap on buffered (queued + pending) records. The
|
|
``enqueue()`` call awaits the next flush once the cap is hit.
|
|
on_flushed
|
|
Optional callback ``callable(batch: list) -> None`` fired
|
|
after each successful flush, inside the queue's event loop,
|
|
before pending futures are resolved. Exceptions raised by the
|
|
callback are swallowed (logged as a no-op) so a bad hook can
|
|
never break the write path.
|
|
"""
|
|
|
|
def __init__(
|
|
self,
|
|
table: Any,
|
|
*,
|
|
coalesce_ms: int = 100,
|
|
max_batch: int = 128,
|
|
max_queue_size: int = 4096,
|
|
on_flushed: Optional[Callable[[list], None]] = None,
|
|
) -> None:
|
|
self._table = table
|
|
self._coalesce_s: float = max(coalesce_ms, 1) / 1000.0
|
|
self._max_batch: int = max(max_batch, 1)
|
|
self._max_queue_size: int = max(max_queue_size, 1)
|
|
self._on_flushed = on_flushed
|
|
|
|
# Runtime state (set in start()).
|
|
self._loop: Optional[asyncio.AbstractEventLoop] = None
|
|
self._queue: Optional[asyncio.Queue] = None
|
|
# Event set after every flush so back-pressured enqueues can wake.
|
|
self._flush_event: Optional[asyncio.Event] = None
|
|
self._coalesce_task: Optional[asyncio.Task] = None
|
|
self._stopping: bool = False
|
|
self._stopped: bool = False
|
|
|
|
# ------------------------------------------------------------------ lifecycle
|
|
|
|
async def start(self) -> None:
|
|
"""Attach to the current loop and spin up the coalesce task."""
|
|
if self._coalesce_task is not None:
|
|
return
|
|
self._loop = asyncio.get_running_loop()
|
|
self._queue = asyncio.Queue()
|
|
self._flush_event = asyncio.Event()
|
|
self._stopping = False
|
|
self._stopped = False
|
|
self._coalesce_task = asyncio.create_task(
|
|
self._coalesce_loop(), name="iai-mcp-write-coalesce"
|
|
)
|
|
|
|
async def stop(self) -> None:
|
|
"""Drain pending records, flush them, then shut the loop down.
|
|
|
|
Idempotent: calling stop() on an already-stopped queue is a
|
|
no-op.
|
|
"""
|
|
if self._stopped:
|
|
return
|
|
self._stopping = True
|
|
assert self._queue is not None
|
|
# Sentinel wakes the coalesce loop out of its wait_for on an
|
|
# otherwise-empty queue.
|
|
await self._queue.put(_SENTINEL)
|
|
if self._coalesce_task is not None:
|
|
await self._coalesce_task
|
|
self._coalesce_task = None
|
|
self._stopped = True
|
|
|
|
# ------------------------------------------------------------------ enqueue
|
|
|
|
async def enqueue(self, record: Any) -> asyncio.Future:
|
|
"""Append ``record`` to the coalesce buffer.
|
|
|
|
Returns a Future that resolves to ``None`` after the record's
|
|
batch has been flushed (``tbl.add`` returned), or resolves
|
|
with the exception raised by ``tbl.add`` for that batch.
|
|
|
|
Blocks (awaits) when the queue is already at ``max_queue_size``
|
|
until a flush frees a slot.
|
|
"""
|
|
if self._stopped or self._stopping:
|
|
raise RuntimeError("AsyncWriteQueue is stopped; cannot enqueue")
|
|
assert self._queue is not None and self._flush_event is not None
|
|
|
|
# Back-pressure: wait for a flush if we're already at the cap.
|
|
# Use a loop because multiple concurrent enqueues may race on
|
|
# the same wake-up.
|
|
while self._queue.qsize() >= self._max_queue_size:
|
|
self._flush_event.clear()
|
|
await self._flush_event.wait()
|
|
|
|
fut: asyncio.Future = self._loop.create_future() # type: ignore[union-attr]
|
|
await self._queue.put(_Pending(record=record, future=fut))
|
|
return fut
|
|
|
|
# ------------------------------------------------------------------ internals
|
|
|
|
async def _coalesce_loop(self) -> None:
|
|
"""Main loop: collect up to ``max_batch`` records per window,
|
|
then flush. Exits after the sentinel drain when ``stop()``
|
|
is called.
|
|
"""
|
|
assert self._queue is not None and self._flush_event is not None
|
|
while True:
|
|
batch: list[_Pending] = []
|
|
# First item: block indefinitely until we get something or
|
|
# the sentinel arrives.
|
|
first = await self._queue.get()
|
|
if first is _SENTINEL:
|
|
# Drain any stragglers that snuck in before the sentinel.
|
|
while not self._queue.empty():
|
|
item = self._queue.get_nowait()
|
|
if item is _SENTINEL:
|
|
continue
|
|
batch.append(item)
|
|
if batch:
|
|
await self._flush(batch)
|
|
return
|
|
batch.append(first)
|
|
|
|
# Fill the batch within the coalesce window.
|
|
deadline = self._loop.time() + self._coalesce_s # type: ignore[union-attr]
|
|
while len(batch) < self._max_batch:
|
|
remaining = deadline - self._loop.time() # type: ignore[union-attr]
|
|
if remaining <= 0:
|
|
break
|
|
try:
|
|
item = await asyncio.wait_for(
|
|
self._queue.get(), timeout=remaining
|
|
)
|
|
except asyncio.TimeoutError:
|
|
break
|
|
if item is _SENTINEL:
|
|
# Flush what we have, then re-enter the outer loop
|
|
# to let the sentinel branch above handle shutdown.
|
|
await self._flush(batch)
|
|
# Re-queue the sentinel so the outer loop sees it.
|
|
await self._queue.put(_SENTINEL)
|
|
batch = []
|
|
break
|
|
batch.append(item)
|
|
|
|
if batch:
|
|
await self._flush(batch)
|
|
|
|
async def _flush(self, batch: list[_Pending]) -> None:
|
|
"""Push a batch through ``tbl.add`` and resolve each Future."""
|
|
records = [p.record for p in batch]
|
|
try:
|
|
await self._table.add(records)
|
|
except BaseException as exc: # noqa: BLE001
|
|
for p in batch:
|
|
if not p.future.done():
|
|
p.future.set_exception(exc)
|
|
self._notify_flushed()
|
|
return
|
|
|
|
# Hook first (synchronous, in-loop) — so graph-sync observes
|
|
# the write before any caller that awaits the future can race
|
|
# against the in-RAM graph.
|
|
if self._on_flushed is not None:
|
|
try:
|
|
self._on_flushed(records)
|
|
except Exception:
|
|
# Invariant: a bad hook can never break the write
|
|
# path. Swallow; structured logging lives in the
|
|
# hook owner (store._fire_graph_sync_hook already
|
|
# handles this for the graph-sync case).
|
|
pass
|
|
for p in batch:
|
|
if not p.future.done():
|
|
p.future.set_result(None)
|
|
self._notify_flushed()
|
|
|
|
def _notify_flushed(self) -> None:
|
|
"""Wake any enqueue() calls that are back-pressured."""
|
|
if self._flush_event is not None and not self._flush_event.is_set():
|
|
self._flush_event.set()
|
|
|
|
|
|
# ---------------------------------------------------------------------- internals
|
|
|
|
|
|
class _Pending:
|
|
"""Record + the Future its caller is awaiting. Tiny wrapper so we
|
|
can drop it onto asyncio.Queue without worrying about dataclass
|
|
equality semantics (Futures don't hash)."""
|
|
|
|
__slots__ = ("record", "future")
|
|
|
|
def __init__(self, record: Any, future: asyncio.Future) -> None:
|
|
self.record = record
|
|
self.future = future
|
|
|
|
|
|
class _Sentinel:
|
|
"""Marker object for graceful shutdown."""
|
|
|
|
__repr__ = lambda self: "<AsyncWriteQueue.sentinel>" # noqa: E731
|
|
|
|
|
|
_SENTINEL = _Sentinel()
|