mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-10 00:02:40 +02:00
Pass vision LLM through local folder indexer call chain
This commit is contained in:
parent
7e90a8ed3c
commit
afd3c2cde2
1 changed file with 29 additions and 9 deletions
|
|
@@ -153,16 +153,16 @@ def scan_folder(
|
||||||
return files
|
return files
|
||||||
|
|
||||||
|
|
||||||
async def _read_file_content(file_path: str, filename: str) -> str:
|
async def _read_file_content(file_path: str, filename: str, *, vision_llm=None) -> str:
|
||||||
"""Read file content via the unified ETL pipeline.
|
"""Read file content via the unified ETL pipeline.
|
||||||
|
|
||||||
All file types (plaintext, audio, direct-convert, document) are handled
|
All file types (plaintext, audio, direct-convert, document, image) are
|
||||||
by ``EtlPipelineService``.
|
handled by ``EtlPipelineService``.
|
||||||
"""
|
"""
|
||||||
from app.etl_pipeline.etl_document import EtlRequest
|
from app.etl_pipeline.etl_document import EtlRequest
|
||||||
from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
|
from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
|
||||||
|
|
||||||
result = await EtlPipelineService().extract(
|
result = await EtlPipelineService(vision_llm=vision_llm).extract(
|
||||||
EtlRequest(file_path=file_path, filename=filename)
|
EtlRequest(file_path=file_path, filename=filename)
|
||||||
)
|
)
|
||||||
return result.markdown_content
|
return result.markdown_content
|
||||||
|
|
@@ -199,12 +199,14 @@ async def _compute_file_content_hash(
|
||||||
file_path: str,
|
file_path: str,
|
||||||
filename: str,
|
filename: str,
|
||||||
search_space_id: int,
|
search_space_id: int,
|
||||||
|
*,
|
||||||
|
vision_llm=None,
|
||||||
) -> tuple[str, str]:
|
) -> tuple[str, str]:
|
||||||
"""Read a file (via ETL if needed) and compute its content hash.
|
"""Read a file (via ETL if needed) and compute its content hash.
|
||||||
|
|
||||||
Returns (content_text, content_hash).
|
Returns (content_text, content_hash).
|
||||||
"""
|
"""
|
||||||
content = await _read_file_content(file_path, filename)
|
content = await _read_file_content(file_path, filename, vision_llm=vision_llm)
|
||||||
return content, _content_hash(content, search_space_id)
|
return content, _content_hash(content, search_space_id)
|
||||||
|
|
||||||
|
|
||||||
|
|
@@ -635,6 +637,10 @@ async def index_local_folder(
|
||||||
|
|
||||||
page_limit_service = PageLimitService(session)
|
page_limit_service = PageLimitService(session)
|
||||||
|
|
||||||
|
from app.services.llm_service import get_vision_llm
|
||||||
|
|
||||||
|
vision_llm = await get_vision_llm(session, search_space_id)
|
||||||
|
|
||||||
# ================================================================
|
# ================================================================
|
||||||
# PHASE 1: Pre-filter files (mtime / content-hash), version changed
|
# PHASE 1: Pre-filter files (mtime / content-hash), version changed
|
||||||
# ================================================================
|
# ================================================================
|
||||||
|
|
@@ -704,7 +710,10 @@ async def index_local_folder(
|
||||||
|
|
||||||
try:
|
try:
|
||||||
content, content_hash = await _compute_file_content_hash(
|
content, content_hash = await _compute_file_content_hash(
|
||||||
file_path_abs, file_info["relative_path"], search_space_id
|
file_path_abs,
|
||||||
|
file_info["relative_path"],
|
||||||
|
search_space_id,
|
||||||
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
except Exception as read_err:
|
except Exception as read_err:
|
||||||
logger.warning(f"Could not read {file_path_abs}: {read_err}")
|
logger.warning(f"Could not read {file_path_abs}: {read_err}")
|
||||||
|
|
@@ -738,7 +747,10 @@ async def index_local_folder(
|
||||||
|
|
||||||
try:
|
try:
|
||||||
content, content_hash = await _compute_file_content_hash(
|
content, content_hash = await _compute_file_content_hash(
|
||||||
file_path_abs, file_info["relative_path"], search_space_id
|
file_path_abs,
|
||||||
|
file_info["relative_path"],
|
||||||
|
search_space_id,
|
||||||
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
except Exception as read_err:
|
except Exception as read_err:
|
||||||
logger.warning(f"Could not read {file_path_abs}: {read_err}")
|
logger.warning(f"Could not read {file_path_abs}: {read_err}")
|
||||||
|
|
@@ -1080,9 +1092,13 @@ async def _index_single_file(
|
||||||
except PageLimitExceededError as e:
|
except PageLimitExceededError as e:
|
||||||
return 0, 1, f"Page limit exceeded: {e}"
|
return 0, 1, f"Page limit exceeded: {e}"
|
||||||
|
|
||||||
|
from app.services.llm_service import get_vision_llm
|
||||||
|
|
||||||
|
vision_llm = await get_vision_llm(session, search_space_id)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
content, content_hash = await _compute_file_content_hash(
|
content, content_hash = await _compute_file_content_hash(
|
||||||
str(full_path), full_path.name, search_space_id
|
str(full_path), full_path.name, search_space_id, vision_llm=vision_llm
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return 0, 1, f"Could not read file: {e}"
|
return 0, 1, f"Could not read file: {e}"
|
||||||
|
|
@@ -1300,6 +1316,10 @@ async def index_uploaded_files(
|
||||||
pipeline = IndexingPipelineService(session)
|
pipeline = IndexingPipelineService(session)
|
||||||
llm = await get_user_long_context_llm(session, user_id, search_space_id)
|
llm = await get_user_long_context_llm(session, user_id, search_space_id)
|
||||||
|
|
||||||
|
from app.services.llm_service import get_vision_llm
|
||||||
|
|
||||||
|
vision_llm = await get_vision_llm(session, search_space_id)
|
||||||
|
|
||||||
indexed_count = 0
|
indexed_count = 0
|
||||||
failed_count = 0
|
failed_count = 0
|
||||||
errors: list[str] = []
|
errors: list[str] = []
|
||||||
|
|
@@ -1347,7 +1367,7 @@ async def index_uploaded_files(
|
||||||
|
|
||||||
try:
|
try:
|
||||||
content, content_hash = await _compute_file_content_hash(
|
content, content_hash = await _compute_file_content_hash(
|
||||||
temp_path, filename, search_space_id
|
temp_path, filename, search_space_id, vision_llm=vision_llm
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"Could not read {relative_path}: {e}")
|
logger.warning(f"Could not read {relative_path}: {e}")
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue