mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-26 21:39:43 +02:00
feat(etl-cache): route all file-based sources through the parse cache
Every file ingestion path (Dropbox, Google Drive / Composio Drive, OneDrive, local folder, Obsidian, and the legacy upload handlers) now parses via the extract_with_cache facade instead of calling EtlPipelineService.extract directly, so identical bytes are deduplicated globally regardless of source. The vision_llm argument is passed through to the facade at every call site, keeping the existing cacheability gate intact.
This commit is contained in:
parent
99cf212c31
commit
0fb1d3d37b
6 changed files with 33 additions and 25 deletions
|
|
@ -90,11 +90,12 @@ async def download_and_extract_content(
|
||||||
if error:
|
if error:
|
||||||
return None, metadata, error
|
return None, metadata, error
|
||||||
|
|
||||||
|
from app.etl_pipeline.cache import extract_with_cache
|
||||||
from app.etl_pipeline.etl_document import EtlRequest
|
from app.etl_pipeline.etl_document import EtlRequest
|
||||||
from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
|
|
||||||
|
|
||||||
result = await EtlPipelineService(vision_llm=vision_llm).extract(
|
result = await extract_with_cache(
|
||||||
EtlRequest(file_path=temp_file_path, filename=file_name)
|
EtlRequest(file_path=temp_file_path, filename=file_name),
|
||||||
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
markdown = result.markdown_content
|
markdown = result.markdown_content
|
||||||
return markdown, metadata, None
|
return markdown, metadata, None
|
||||||
|
|
|
||||||
|
|
@ -122,12 +122,13 @@ async def download_and_extract_content(
|
||||||
async def _parse_file_to_markdown(
|
async def _parse_file_to_markdown(
|
||||||
file_path: str, filename: str, *, vision_llm=None
|
file_path: str, filename: str, *, vision_llm=None
|
||||||
) -> str:
|
) -> str:
|
||||||
"""Parse a local file to markdown using the unified ETL pipeline."""
|
"""Parse a local file to markdown via the cache-aware ETL pipeline."""
|
||||||
|
from app.etl_pipeline.cache import extract_with_cache
|
||||||
from app.etl_pipeline.etl_document import EtlRequest
|
from app.etl_pipeline.etl_document import EtlRequest
|
||||||
from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
|
|
||||||
|
|
||||||
result = await EtlPipelineService(vision_llm=vision_llm).extract(
|
result = await extract_with_cache(
|
||||||
EtlRequest(file_path=file_path, filename=filename)
|
EtlRequest(file_path=file_path, filename=filename),
|
||||||
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
return result.markdown_content
|
return result.markdown_content
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -84,11 +84,12 @@ async def download_and_extract_content(
|
||||||
async def _parse_file_to_markdown(
|
async def _parse_file_to_markdown(
|
||||||
file_path: str, filename: str, *, vision_llm=None
|
file_path: str, filename: str, *, vision_llm=None
|
||||||
) -> str:
|
) -> str:
|
||||||
"""Parse a local file to markdown using the unified ETL pipeline."""
|
"""Parse a local file to markdown via the cache-aware ETL pipeline."""
|
||||||
|
from app.etl_pipeline.cache import extract_with_cache
|
||||||
from app.etl_pipeline.etl_document import EtlRequest
|
from app.etl_pipeline.etl_document import EtlRequest
|
||||||
from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
|
|
||||||
|
|
||||||
result = await EtlPipelineService(vision_llm=vision_llm).extract(
|
result = await extract_with_cache(
|
||||||
EtlRequest(file_path=file_path, filename=filename)
|
EtlRequest(file_path=file_path, filename=filename),
|
||||||
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
return result.markdown_content
|
return result.markdown_content
|
||||||
|
|
|
||||||
|
|
@ -199,11 +199,12 @@ async def _extract_binary_attachment_markdown(
|
||||||
|
|
||||||
async def _run_etl_extract(*, file_path: str, filename: str, vision_llm):
|
async def _run_etl_extract(*, file_path: str, filename: str, vision_llm):
|
||||||
"""Lazy-load ETL dependencies to avoid module-import cycles."""
|
"""Lazy-load ETL dependencies to avoid module-import cycles."""
|
||||||
|
from app.etl_pipeline.cache import extract_with_cache
|
||||||
from app.etl_pipeline.etl_document import EtlRequest
|
from app.etl_pipeline.etl_document import EtlRequest
|
||||||
from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
|
|
||||||
|
|
||||||
return await EtlPipelineService(vision_llm=vision_llm).extract(
|
return await extract_with_cache(
|
||||||
EtlRequest(file_path=file_path, filename=filename)
|
EtlRequest(file_path=file_path, filename=filename),
|
||||||
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -162,12 +162,13 @@ async def _read_file_content(
|
||||||
All file types (plaintext, audio, direct-convert, document, image) are
|
All file types (plaintext, audio, direct-convert, document, image) are
|
||||||
handled by ``EtlPipelineService``.
|
handled by ``EtlPipelineService``.
|
||||||
"""
|
"""
|
||||||
|
from app.etl_pipeline.cache import extract_with_cache
|
||||||
from app.etl_pipeline.etl_document import EtlRequest, ProcessingMode
|
from app.etl_pipeline.etl_document import EtlRequest, ProcessingMode
|
||||||
from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
|
|
||||||
|
|
||||||
mode = ProcessingMode.coerce(processing_mode)
|
mode = ProcessingMode.coerce(processing_mode)
|
||||||
result = await EtlPipelineService(vision_llm=vision_llm).extract(
|
result = await extract_with_cache(
|
||||||
EtlRequest(file_path=file_path, filename=filename, processing_mode=mode)
|
EtlRequest(file_path=file_path, filename=filename, processing_mode=mode),
|
||||||
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
return result.markdown_content
|
return result.markdown_content
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,8 +1,9 @@
|
||||||
"""
|
"""
|
||||||
File document processors orchestrating content extraction and indexing.
|
File document processors orchestrating content extraction and indexing.
|
||||||
|
|
||||||
Delegates content extraction to ``app.etl_pipeline.EtlPipelineService`` and
|
Delegates content extraction to the cache-aware ``extract_with_cache`` facade
|
||||||
keeps only orchestration concerns (notifications, logging, page limits, saving).
|
(over ``EtlPipelineService``) and keeps only orchestration concerns
|
||||||
|
(notifications, logging, page limits, saving).
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
@ -116,8 +117,8 @@ async def _log_page_divergence(
|
||||||
|
|
||||||
async def _process_non_document_upload(ctx: _ProcessingContext) -> Document | None:
|
async def _process_non_document_upload(ctx: _ProcessingContext) -> Document | None:
|
||||||
"""Extract content from a non-document file (plaintext/direct_convert/audio/image) via the unified ETL pipeline."""
|
"""Extract content from a non-document file (plaintext/direct_convert/audio/image) via the unified ETL pipeline."""
|
||||||
|
from app.etl_pipeline.cache import extract_with_cache
|
||||||
from app.etl_pipeline.etl_document import EtlRequest
|
from app.etl_pipeline.etl_document import EtlRequest
|
||||||
from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
|
|
||||||
|
|
||||||
await _notify(ctx, "parsing", "Processing file")
|
await _notify(ctx, "parsing", "Processing file")
|
||||||
await ctx.task_logger.log_task_progress(
|
await ctx.task_logger.log_task_progress(
|
||||||
|
|
@ -136,8 +137,9 @@ async def _process_non_document_upload(ctx: _ProcessingContext) -> Document | No
|
||||||
|
|
||||||
vision_llm = await get_vision_llm(ctx.session, ctx.search_space_id)
|
vision_llm = await get_vision_llm(ctx.session, ctx.search_space_id)
|
||||||
|
|
||||||
etl_result = await EtlPipelineService(vision_llm=vision_llm).extract(
|
etl_result = await extract_with_cache(
|
||||||
EtlRequest(file_path=ctx.file_path, filename=ctx.filename)
|
EtlRequest(file_path=ctx.file_path, filename=ctx.filename),
|
||||||
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
|
|
||||||
with contextlib.suppress(Exception):
|
with contextlib.suppress(Exception):
|
||||||
|
|
@ -183,8 +185,8 @@ async def _process_non_document_upload(ctx: _ProcessingContext) -> Document | No
|
||||||
|
|
||||||
async def _process_document_upload(ctx: _ProcessingContext) -> Document | None:
|
async def _process_document_upload(ctx: _ProcessingContext) -> Document | None:
|
||||||
"""Route a document file to the configured ETL service via the unified pipeline."""
|
"""Route a document file to the configured ETL service via the unified pipeline."""
|
||||||
|
from app.etl_pipeline.cache import extract_with_cache
|
||||||
from app.etl_pipeline.etl_document import EtlRequest, ProcessingMode
|
from app.etl_pipeline.etl_document import EtlRequest, ProcessingMode
|
||||||
from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
|
|
||||||
from app.services.etl_credit_service import (
|
from app.services.etl_credit_service import (
|
||||||
EtlCreditService,
|
EtlCreditService,
|
||||||
InsufficientCreditsError,
|
InsufficientCreditsError,
|
||||||
|
|
@ -237,13 +239,14 @@ async def _process_document_upload(ctx: _ProcessingContext) -> Document | None:
|
||||||
|
|
||||||
vision_llm = await get_vision_llm(ctx.session, ctx.search_space_id)
|
vision_llm = await get_vision_llm(ctx.session, ctx.search_space_id)
|
||||||
|
|
||||||
etl_result = await EtlPipelineService(vision_llm=vision_llm).extract(
|
etl_result = await extract_with_cache(
|
||||||
EtlRequest(
|
EtlRequest(
|
||||||
file_path=ctx.file_path,
|
file_path=ctx.file_path,
|
||||||
filename=ctx.filename,
|
filename=ctx.filename,
|
||||||
estimated_pages=estimated_pages,
|
estimated_pages=estimated_pages,
|
||||||
processing_mode=mode,
|
processing_mode=mode,
|
||||||
)
|
),
|
||||||
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
|
|
||||||
with contextlib.suppress(Exception):
|
with contextlib.suppress(Exception):
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue