mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-22 21:28:12 +02:00
53 lines
1.9 KiB
Python
53 lines
1.9 KiB
Python
"""Recall and remember parser output, coordinating the index and blob store."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
|
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
|
|
from app.etl_pipeline.cache.persistence import CachedParseRepository
|
|
from app.etl_pipeline.cache.schemas import ParseKey
|
|
from app.etl_pipeline.cache.storage import MarkdownCacheStore
|
|
from app.etl_pipeline.etl_document import EtlResult
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class EtlCacheService:
|
|
def __init__(self, session: AsyncSession) -> None:
|
|
self._index = CachedParseRepository(session)
|
|
self._store = MarkdownCacheStore()
|
|
|
|
async def recall(self, key: ParseKey) -> EtlResult | None:
|
|
"""Return the cached result, or None on a miss."""
|
|
row = await self._index.get(key)
|
|
if row is None:
|
|
return None
|
|
|
|
try:
|
|
markdown = await self._store.load(row.storage_key)
|
|
except Exception:
|
|
# Index points at a blob that is gone; treat as a miss and re-parse.
|
|
logger.warning("Cache blob missing: %s", row.storage_key, exc_info=True)
|
|
return None
|
|
|
|
await self._index.mark_used(row.id)
|
|
return EtlResult(
|
|
markdown_content=markdown,
|
|
etl_service=row.etl_service,
|
|
actual_pages=row.actual_pages,
|
|
content_type=row.content_type,
|
|
)
|
|
|
|
async def remember(self, key: ParseKey, result: EtlResult) -> None:
|
|
"""Store a freshly parsed result for future reuse."""
|
|
storage_key = await self._store.save(key, result.markdown_content)
|
|
await self._index.insert(
|
|
key=key,
|
|
content_type=result.content_type,
|
|
actual_pages=result.actual_pages,
|
|
storage_backend=self._store.backend_name,
|
|
storage_key=storage_key,
|
|
size_bytes=len(result.markdown_content.encode("utf-8")),
|
|
)
|