mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-08 20:25:19 +02:00
refactor: enhance Google Drive indexer to support file extension filtering, improving file handling and error reporting
This commit is contained in:
parent
0fb92b7c56
commit
f03bf05aaa
3 changed files with 46 additions and 15 deletions
|
|
@@ -25,7 +25,10 @@ from app.connectors.google_drive import (
|
|||
get_files_in_folder,
|
||||
get_start_page_token,
|
||||
)
|
||||
from app.connectors.google_drive.file_types import should_skip_file as skip_mime
|
||||
from app.connectors.google_drive.file_types import (
|
||||
should_skip_by_extension,
|
||||
should_skip_file as skip_mime,
|
||||
)
|
||||
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
|
||||
from app.indexing_pipeline.connector_document import ConnectorDocument
|
||||
from app.indexing_pipeline.document_hashing import compute_identifier_hash
|
||||
|
|
@@ -78,6 +81,8 @@ async def _should_skip_file(
|
|||
|
||||
if skip_mime(mime_type):
|
||||
return True, "folder/shortcut"
|
||||
if should_skip_by_extension(file_name):
|
||||
return True, "unsupported extension"
|
||||
if not file_id:
|
||||
return True, "missing file_id"
|
||||
|
||||
|
|
|
|||
|
|
@@ -319,31 +319,23 @@ def _mock_etl_parsing(monkeypatch):
|
|||
|
||||
# -- LlamaParse mock (external API) --------------------------------
|
||||
|
||||
class _FakeMarkdownDoc:
|
||||
def __init__(self, text: str):
|
||||
self.text = text
|
||||
|
||||
class _FakeLlamaParseResult:
|
||||
async def aget_markdown_documents(self, *, split_by_page=False):
|
||||
return [_FakeMarkdownDoc(_MOCK_ETL_MARKDOWN)]
|
||||
|
||||
async def _fake_llamacloud_parse(**kwargs):
|
||||
_reject_empty(kwargs["file_path"])
|
||||
return _FakeLlamaParseResult()
|
||||
async def _fake_llamacloud_parse(file_path: str, estimated_pages: int) -> str:
|
||||
_reject_empty(file_path)
|
||||
return _MOCK_ETL_MARKDOWN
|
||||
|
||||
monkeypatch.setattr(
|
||||
"app.tasks.document_processors.file_processors.parse_with_llamacloud_retry",
|
||||
"app.etl_pipeline.parsers.llamacloud.parse_with_llamacloud",
|
||||
_fake_llamacloud_parse,
|
||||
)
|
||||
|
||||
# -- Docling mock (heavy library boundary) -------------------------
|
||||
|
||||
async def _fake_docling_parse(file_path: str, filename: str):
|
||||
async def _fake_docling_parse(file_path: str, filename: str) -> str:
|
||||
_reject_empty(file_path)
|
||||
return _MOCK_ETL_MARKDOWN
|
||||
|
||||
monkeypatch.setattr(
|
||||
"app.tasks.document_processors.file_processors.parse_with_docling",
|
||||
"app.etl_pipeline.parsers.docling.parse_with_docling",
|
||||
_fake_docling_parse,
|
||||
)
|
||||
|
||||
|
|
|
|||
|
|
@@ -272,6 +272,23 @@ def full_scan_mocks(mock_dropbox_client, monkeypatch):
|
|||
download_and_index_mock = AsyncMock(return_value=(0, 0))
|
||||
monkeypatch.setattr(_mod, "_download_and_index", download_and_index_mock)
|
||||
|
||||
from app.services.page_limit_service import PageLimitService as _RealPLS
|
||||
|
||||
mock_page_limit_instance = MagicMock()
|
||||
mock_page_limit_instance.get_page_usage = AsyncMock(return_value=(0, 999_999))
|
||||
mock_page_limit_instance.update_page_usage = AsyncMock()
|
||||
|
||||
class _MockPageLimitService:
|
||||
estimate_pages_from_metadata = staticmethod(
|
||||
_RealPLS.estimate_pages_from_metadata
|
||||
)
|
||||
|
||||
def __init__(self, session):
|
||||
self.get_page_usage = mock_page_limit_instance.get_page_usage
|
||||
self.update_page_usage = mock_page_limit_instance.update_page_usage
|
||||
|
||||
monkeypatch.setattr(_mod, "PageLimitService", _MockPageLimitService)
|
||||
|
||||
return {
|
||||
"dropbox_client": mock_dropbox_client,
|
||||
"session": mock_session,
|
||||
|
|
@@ -377,6 +394,23 @@ def selected_files_mocks(mock_dropbox_client, monkeypatch):
|
|||
download_and_index_mock = AsyncMock(return_value=(0, 0))
|
||||
monkeypatch.setattr(_mod, "_download_and_index", download_and_index_mock)
|
||||
|
||||
from app.services.page_limit_service import PageLimitService as _RealPLS
|
||||
|
||||
mock_page_limit_instance = MagicMock()
|
||||
mock_page_limit_instance.get_page_usage = AsyncMock(return_value=(0, 999_999))
|
||||
mock_page_limit_instance.update_page_usage = AsyncMock()
|
||||
|
||||
class _MockPageLimitService:
|
||||
estimate_pages_from_metadata = staticmethod(
|
||||
_RealPLS.estimate_pages_from_metadata
|
||||
)
|
||||
|
||||
def __init__(self, session):
|
||||
self.get_page_usage = mock_page_limit_instance.get_page_usage
|
||||
self.update_page_usage = mock_page_limit_instance.update_page_usage
|
||||
|
||||
monkeypatch.setattr(_mod, "PageLimitService", _MockPageLimitService)
|
||||
|
||||
return {
|
||||
"dropbox_client": mock_dropbox_client,
|
||||
"session": mock_session,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue