mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-04-30 11:26:24 +02:00
refactor: enhance Google Drive indexer to support file extension filtering, improving file handling and error reporting
This commit is contained in:
parent
0fb92b7c56
commit
f03bf05aaa
3 changed files with 46 additions and 15 deletions
|
|
@ -319,31 +319,23 @@ def _mock_etl_parsing(monkeypatch):
|
|||
|
||||
# -- LlamaParse mock (external API) --------------------------------
|
||||
|
||||
class _FakeMarkdownDoc:
|
||||
def __init__(self, text: str):
|
||||
self.text = text
|
||||
|
||||
class _FakeLlamaParseResult:
|
||||
async def aget_markdown_documents(self, *, split_by_page=False):
|
||||
return [_FakeMarkdownDoc(_MOCK_ETL_MARKDOWN)]
|
||||
|
||||
async def _fake_llamacloud_parse(**kwargs):
|
||||
_reject_empty(kwargs["file_path"])
|
||||
return _FakeLlamaParseResult()
|
||||
async def _fake_llamacloud_parse(file_path: str, estimated_pages: int) -> str:
|
||||
_reject_empty(file_path)
|
||||
return _MOCK_ETL_MARKDOWN
|
||||
|
||||
monkeypatch.setattr(
|
||||
"app.tasks.document_processors.file_processors.parse_with_llamacloud_retry",
|
||||
"app.etl_pipeline.parsers.llamacloud.parse_with_llamacloud",
|
||||
_fake_llamacloud_parse,
|
||||
)
|
||||
|
||||
# -- Docling mock (heavy library boundary) -------------------------
|
||||
|
||||
async def _fake_docling_parse(file_path: str, filename: str):
|
||||
async def _fake_docling_parse(file_path: str, filename: str) -> str:
|
||||
_reject_empty(file_path)
|
||||
return _MOCK_ETL_MARKDOWN
|
||||
|
||||
monkeypatch.setattr(
|
||||
"app.tasks.document_processors.file_processors.parse_with_docling",
|
||||
"app.etl_pipeline.parsers.docling.parse_with_docling",
|
||||
_fake_docling_parse,
|
||||
)
|
||||
|
||||
|
|
|
|||
|
|
@ -272,6 +272,23 @@ def full_scan_mocks(mock_dropbox_client, monkeypatch):
|
|||
download_and_index_mock = AsyncMock(return_value=(0, 0))
|
||||
monkeypatch.setattr(_mod, "_download_and_index", download_and_index_mock)
|
||||
|
||||
from app.services.page_limit_service import PageLimitService as _RealPLS
|
||||
|
||||
mock_page_limit_instance = MagicMock()
|
||||
mock_page_limit_instance.get_page_usage = AsyncMock(return_value=(0, 999_999))
|
||||
mock_page_limit_instance.update_page_usage = AsyncMock()
|
||||
|
||||
class _MockPageLimitService:
|
||||
estimate_pages_from_metadata = staticmethod(
|
||||
_RealPLS.estimate_pages_from_metadata
|
||||
)
|
||||
|
||||
def __init__(self, session):
|
||||
self.get_page_usage = mock_page_limit_instance.get_page_usage
|
||||
self.update_page_usage = mock_page_limit_instance.update_page_usage
|
||||
|
||||
monkeypatch.setattr(_mod, "PageLimitService", _MockPageLimitService)
|
||||
|
||||
return {
|
||||
"dropbox_client": mock_dropbox_client,
|
||||
"session": mock_session,
|
||||
|
|
@ -377,6 +394,23 @@ def selected_files_mocks(mock_dropbox_client, monkeypatch):
|
|||
download_and_index_mock = AsyncMock(return_value=(0, 0))
|
||||
monkeypatch.setattr(_mod, "_download_and_index", download_and_index_mock)
|
||||
|
||||
from app.services.page_limit_service import PageLimitService as _RealPLS
|
||||
|
||||
mock_page_limit_instance = MagicMock()
|
||||
mock_page_limit_instance.get_page_usage = AsyncMock(return_value=(0, 999_999))
|
||||
mock_page_limit_instance.update_page_usage = AsyncMock()
|
||||
|
||||
class _MockPageLimitService:
|
||||
estimate_pages_from_metadata = staticmethod(
|
||||
_RealPLS.estimate_pages_from_metadata
|
||||
)
|
||||
|
||||
def __init__(self, session):
|
||||
self.get_page_usage = mock_page_limit_instance.get_page_usage
|
||||
self.update_page_usage = mock_page_limit_instance.update_page_usage
|
||||
|
||||
monkeypatch.setattr(_mod, "PageLimitService", _MockPageLimitService)
|
||||
|
||||
return {
|
||||
"dropbox_client": mock_dropbox_client,
|
||||
"session": mock_session,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue