diff --git a/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py b/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py index 539cfdd32..58c9f5003 100644 --- a/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py @@ -49,8 +49,6 @@ PLAINTEXT_EXTENSIONS = frozenset( ".markdown", ".txt", ".text", - ".csv", - ".tsv", ".json", ".jsonl", ".yaml", @@ -60,8 +58,6 @@ PLAINTEXT_EXTENSIONS = frozenset( ".cfg", ".conf", ".xml", - ".html", - ".htm", ".css", ".scss", ".less", @@ -149,6 +145,9 @@ AUDIO_EXTENSIONS = frozenset( ) +DIRECT_CONVERT_EXTENSIONS = frozenset({".csv", ".tsv", ".html", ".htm"}) + + def _is_plaintext_file(filename: str) -> bool: return Path(filename).suffix.lower() in PLAINTEXT_EXTENSIONS @@ -157,9 +156,17 @@ def _is_audio_file(filename: str) -> bool: return Path(filename).suffix.lower() in AUDIO_EXTENSIONS +def _is_direct_convert_file(filename: str) -> bool: + return Path(filename).suffix.lower() in DIRECT_CONVERT_EXTENSIONS + + def _needs_etl(filename: str) -> bool: - """File is not plaintext and not audio — requires ETL service to parse.""" - return not _is_plaintext_file(filename) and not _is_audio_file(filename) + """File is not plaintext, not audio, and not direct-convert — requires ETL.""" + return ( + not _is_plaintext_file(filename) + and not _is_audio_file(filename) + and not _is_direct_convert_file(filename) + ) HeartbeatCallbackType = Callable[[int], Awaitable[None]] @@ -260,6 +267,13 @@ async def _read_file_content(file_path: str, filename: str) -> str: if _is_plaintext_file(filename): return _read_plaintext_file(file_path) + if _is_direct_convert_file(filename): + from app.tasks.document_processors._direct_converters import ( + convert_file_directly, + ) + + return convert_file_directly(file_path, filename) + if _is_audio_file(filename): etl_service = config.ETL_SERVICE if hasattr(config, "ETL_SERVICE") else None stt_service_val = config.STT_SERVICE if hasattr(config, "STT_SERVICE") else None diff --git a/surfsense_backend/tests/integration/indexing_pipeline/test_local_folder_pipeline.py b/surfsense_backend/tests/integration/indexing_pipeline/test_local_folder_pipeline.py index 4062c3a3b..4c900bf51 100644 --- a/surfsense_backend/tests/integration/indexing_pipeline/test_local_folder_pipeline.py +++ b/surfsense_backend/tests/integration/indexing_pipeline/test_local_folder_pipeline.py @@ -804,3 +804,158 @@ class TestPipelineIntegration: ) assert len(docs) == 1 assert DocumentStatus.is_state(docs[0].status, DocumentStatus.READY) + + +# ==================================================================== +# Tier 7: Direct Converters (DC1-DC4) +# ==================================================================== + + +class TestDirectConvert: + @pytest.mark.usefixtures(*UNIFIED_FIXTURES) + async def test_dc1_csv_produces_markdown_table( + self, + db_session: AsyncSession, + db_user: User, + db_search_space: SearchSpace, + tmp_path: Path, + ): + """DC1: CSV file is indexed as a markdown table, not raw comma-separated text.""" + from app.tasks.connector_indexers.local_folder_indexer import index_local_folder + + (tmp_path / "data.csv").write_text("name,age,city\nAlice,30,NYC\nBob,25,LA\n") + + count, _skipped, _root_folder_id, err = await index_local_folder( + session=db_session, + search_space_id=db_search_space.id, + user_id=str(db_user.id), + folder_path=str(tmp_path), + folder_name="test-folder", + ) + + assert err is None + assert count == 1 + + doc = ( + await db_session.execute( + select(Document).where( + Document.document_type == DocumentType.LOCAL_FOLDER_FILE, + Document.search_space_id == db_search_space.id, + ) + ) + ).scalar_one() + + assert "| name" in doc.source_markdown + assert "| Alice" in doc.source_markdown + assert "name,age,city" not in doc.source_markdown + + @pytest.mark.usefixtures(*UNIFIED_FIXTURES) + async def test_dc2_tsv_produces_markdown_table( + self, + db_session: AsyncSession, + db_user: User, + db_search_space: SearchSpace, + tmp_path: Path, + ): + """DC2: TSV file is indexed as a markdown table.""" + from app.tasks.connector_indexers.local_folder_indexer import index_local_folder + + (tmp_path / "data.tsv").write_text("name\tage\tcity\nAlice\t30\tNYC\nBob\t25\tLA\n") + + count, _skipped, _root_folder_id, err = await index_local_folder( + session=db_session, + search_space_id=db_search_space.id, + user_id=str(db_user.id), + folder_path=str(tmp_path), + folder_name="test-folder", + ) + + assert err is None + assert count == 1 + + doc = ( + await db_session.execute( + select(Document).where( + Document.document_type == DocumentType.LOCAL_FOLDER_FILE, + Document.search_space_id == db_search_space.id, + ) + ) + ).scalar_one() + + assert "| name" in doc.source_markdown + assert "| Alice" in doc.source_markdown + + @pytest.mark.usefixtures(*UNIFIED_FIXTURES) + async def test_dc3_html_produces_clean_markdown( + self, + db_session: AsyncSession, + db_user: User, + db_search_space: SearchSpace, + tmp_path: Path, + ): + """DC3: HTML file is indexed as clean markdown, not raw HTML.""" + from app.tasks.connector_indexers.local_folder_indexer import index_local_folder + + (tmp_path / "page.html").write_text( + "
Hello world
" + ) + + count, _skipped, _root_folder_id, err = await index_local_folder( + session=db_session, + search_space_id=db_search_space.id, + user_id=str(db_user.id), + folder_path=str(tmp_path), + folder_name="test-folder", + ) + + assert err is None + assert count == 1 + + doc = ( + await db_session.execute( + select(Document).where( + Document.document_type == DocumentType.LOCAL_FOLDER_FILE, + Document.search_space_id == db_search_space.id, + ) + ) + ).scalar_one() + + assert "Title" in doc.source_markdown + assert "