mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-29 19:35:20 +02:00
feat: add direct conversion support for CSV, TSV, and HTML files in local folder indexing
This commit is contained in:
parent
6f4c0d5e6c
commit
b759bb36a9
2 changed files with 175 additions and 6 deletions
|
|
@ -49,8 +49,6 @@ PLAINTEXT_EXTENSIONS = frozenset(
|
|||
".markdown",
|
||||
".txt",
|
||||
".text",
|
||||
".csv",
|
||||
".tsv",
|
||||
".json",
|
||||
".jsonl",
|
||||
".yaml",
|
||||
|
|
@ -60,8 +58,6 @@ PLAINTEXT_EXTENSIONS = frozenset(
|
|||
".cfg",
|
||||
".conf",
|
||||
".xml",
|
||||
".html",
|
||||
".htm",
|
||||
".css",
|
||||
".scss",
|
||||
".less",
|
||||
|
|
@ -149,6 +145,9 @@ AUDIO_EXTENSIONS = frozenset(
|
|||
)
|
||||
|
||||
|
||||
# Lower-case suffixes (leading dot included) of files that are routed to
# convert_file_directly instead of the plaintext reader or the ETL service.
DIRECT_CONVERT_EXTENSIONS = frozenset((".csv", ".tsv", ".html", ".htm"))
|
||||
|
||||
|
||||
def _is_plaintext_file(filename: str) -> bool:
    """Return True when *filename*'s final suffix is a plaintext extension.

    The suffix is lower-cased before the lookup, so the check is
    case-insensitive.
    """
    extension = Path(filename).suffix.lower()
    return extension in PLAINTEXT_EXTENSIONS
|
||||
|
||||
|
|
@ -157,9 +156,17 @@ def _is_audio_file(filename: str) -> bool:
|
|||
return Path(filename).suffix.lower() in AUDIO_EXTENSIONS
|
||||
|
||||
|
||||
def _is_direct_convert_file(filename: str) -> bool:
    """Return True when *filename*'s final suffix is a direct-convert extension.

    The suffix is lower-cased before the lookup, so the check is
    case-insensitive.
    """
    extension = Path(filename).suffix.lower()
    return extension in DIRECT_CONVERT_EXTENSIONS
|
||||
|
||||
|
||||
def _needs_etl(filename: str) -> bool:
    """File is not plaintext, not audio, and not direct-convert — requires ETL.

    Args:
        filename: File name or path; only its final suffix is inspected by
            the helper predicates.

    Returns:
        True when none of the plaintext, audio, or direct-convert predicates
        match, i.e. the file has to go through the ETL service.
    """
    # Direct-convert files (CSV/TSV/HTML) must be excluded here as well,
    # otherwise they would still be classified as ETL work.
    return (
        not _is_plaintext_file(filename)
        and not _is_audio_file(filename)
        and not _is_direct_convert_file(filename)
    )
|
||||
|
||||
|
||||
# Type alias: a callable that takes one int and returns an awaitable
# resolving to None.
# NOTE(review): what the int represents is not visible in this hunk —
# confirm against the call sites.
HeartbeatCallbackType = Callable[[int], Awaitable[None]]
|
||||
|
|
@ -260,6 +267,13 @@ async def _read_file_content(file_path: str, filename: str) -> str:
|
|||
if _is_plaintext_file(filename):
|
||||
return _read_plaintext_file(file_path)
|
||||
|
||||
if _is_direct_convert_file(filename):
|
||||
from app.tasks.document_processors._direct_converters import (
|
||||
convert_file_directly,
|
||||
)
|
||||
|
||||
return convert_file_directly(file_path, filename)
|
||||
|
||||
if _is_audio_file(filename):
|
||||
etl_service = config.ETL_SERVICE if hasattr(config, "ETL_SERVICE") else None
|
||||
stt_service_val = config.STT_SERVICE if hasattr(config, "STT_SERVICE") else None
|
||||
|
|
|
|||
|
|
@ -804,3 +804,158 @@ class TestPipelineIntegration:
|
|||
)
|
||||
assert len(docs) == 1
|
||||
assert DocumentStatus.is_state(docs[0].status, DocumentStatus.READY)
|
||||
|
||||
|
||||
# ====================================================================
|
||||
# Tier 7: Direct Converters (DC1-DC4)
|
||||
# ====================================================================
|
||||
|
||||
|
||||
class TestDirectConvert:
    """Local-folder indexing of CSV, TSV and HTML files (DC1-DC4)."""

    @staticmethod
    async def _index_folder(db_session, db_user, db_search_space, tmp_path, **extra):
        """Index *tmp_path* as "test-folder" and return the indexer's result tuple.

        Any *extra* keyword arguments are forwarded to ``index_local_folder``.
        """
        from app.tasks.connector_indexers.local_folder_indexer import index_local_folder

        return await index_local_folder(
            session=db_session,
            search_space_id=db_search_space.id,
            user_id=str(db_user.id),
            folder_path=str(tmp_path),
            folder_name="test-folder",
            **extra,
        )

    @staticmethod
    async def _sole_local_document(db_session, db_search_space):
        """Return the single LOCAL_FOLDER_FILE document in the search space."""
        query = select(Document).where(
            Document.document_type == DocumentType.LOCAL_FOLDER_FILE,
            Document.search_space_id == db_search_space.id,
        )
        result = await db_session.execute(query)
        return result.scalar_one()

    @pytest.mark.usefixtures(*UNIFIED_FIXTURES)
    async def test_dc1_csv_produces_markdown_table(
        self,
        db_session: AsyncSession,
        db_user: User,
        db_search_space: SearchSpace,
        tmp_path: Path,
    ):
        """DC1: a CSV file is stored as a markdown table, not raw comma-separated text."""
        (tmp_path / "data.csv").write_text("name,age,city\nAlice,30,NYC\nBob,25,LA\n")

        count, _skipped, _root_folder_id, err = await self._index_folder(
            db_session, db_user, db_search_space, tmp_path
        )
        assert err is None
        assert count == 1

        document = await self._sole_local_document(db_session, db_search_space)
        markdown = document.source_markdown
        assert "| name" in markdown
        assert "| Alice" in markdown
        assert "name,age,city" not in markdown

    @pytest.mark.usefixtures(*UNIFIED_FIXTURES)
    async def test_dc2_tsv_produces_markdown_table(
        self,
        db_session: AsyncSession,
        db_user: User,
        db_search_space: SearchSpace,
        tmp_path: Path,
    ):
        """DC2: a TSV file is stored as a markdown table."""
        (tmp_path / "data.tsv").write_text("name\tage\tcity\nAlice\t30\tNYC\nBob\t25\tLA\n")

        count, _skipped, _root_folder_id, err = await self._index_folder(
            db_session, db_user, db_search_space, tmp_path
        )
        assert err is None
        assert count == 1

        document = await self._sole_local_document(db_session, db_search_space)
        markdown = document.source_markdown
        assert "| name" in markdown
        assert "| Alice" in markdown

    @pytest.mark.usefixtures(*UNIFIED_FIXTURES)
    async def test_dc3_html_produces_clean_markdown(
        self,
        db_session: AsyncSession,
        db_user: User,
        db_search_space: SearchSpace,
        tmp_path: Path,
    ):
        """DC3: an HTML file is stored as clean markdown, not raw HTML."""
        (tmp_path / "page.html").write_text(
            "<h1>Title</h1><p>Hello world</p>"
        )

        count, _skipped, _root_folder_id, err = await self._index_folder(
            db_session, db_user, db_search_space, tmp_path
        )
        assert err is None
        assert count == 1

        document = await self._sole_local_document(db_session, db_search_space)
        markdown = document.source_markdown
        assert "Title" in markdown
        assert "<h1>" not in markdown

    @pytest.mark.usefixtures(*UNIFIED_FIXTURES)
    async def test_dc4_csv_single_file_mode(
        self,
        db_session: AsyncSession,
        db_user: User,
        db_search_space: SearchSpace,
        tmp_path: Path,
    ):
        """DC4: a CSV indexed via single-file batch mode is also stored as a markdown table."""
        csv_path = tmp_path / "data.csv"
        csv_path.write_text("name,age,city\nAlice,30,NYC\nBob,25,LA\n")

        count, _skipped, _root_folder_id, err = await self._index_folder(
            db_session,
            db_user,
            db_search_space,
            tmp_path,
            target_file_paths=[str(csv_path)],
        )
        assert err is None
        assert count == 1

        document = await self._sole_local_document(db_session, db_search_space)
        markdown = document.source_markdown
        assert "| name" in markdown
        assert "name,age,city" not in markdown
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue