mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-04-26 01:06:23 +02:00
fix: llamacloud v2 impl
This commit is contained in:
parent
2f793e7a69
commit
0e4285095c
2 changed files with 42 additions and 25 deletions
|
|
@ -13,6 +13,9 @@ BASE_DIR = Path(__file__).resolve().parent.parent.parent
|
||||||
env_file = BASE_DIR / ".env"
|
env_file = BASE_DIR / ".env"
|
||||||
load_dotenv(env_file)
|
load_dotenv(env_file)
|
||||||
|
|
||||||
|
os.environ.setdefault("OR_APP_NAME", "SurfSense")
|
||||||
|
os.environ.setdefault("OR_SITE_URL", "https://surfsense.com")
|
||||||
|
|
||||||
|
|
||||||
def is_ffmpeg_installed():
|
def is_ffmpeg_installed():
|
||||||
"""
|
"""
|
||||||
|
|
|
||||||
|
|
@ -16,16 +16,37 @@ from app.etl_pipeline.constants import (
|
||||||
calculate_upload_timeout,
|
calculate_upload_timeout,
|
||||||
)
|
)
|
||||||
|
|
||||||
LLAMA_TIER_BY_MODE = {
|
LLAMA_PARSE_MODE_MAP = {
|
||||||
"basic": "cost_effective",
|
"basic": "parse_page_with_llm",
|
||||||
"premium": "agentic_plus",
|
"premium": "parse_page_with_agent",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _document_text(doc) -> str:
    """Return the string held by one list element, or its ``str()`` form.

    Looks for a string ``text`` attribute first, then a string
    ``page_content`` attribute; anything else is stringified.
    """
    for attr in ("text", "page_content"):
        value = getattr(doc, attr, None)
        if isinstance(value, str):
            return value
    return str(doc)


def _extract_content(result) -> str:
    """Pull markdown text out of whatever object LlamaParse.aparse returns.

    Lookup order:

    1. ``result.get_markdown_documents(split_by_page=False)`` - the ``text``
       of the first document, when it is non-blank.
    2. ``result.pages`` - the non-empty ``md`` attributes of the pages,
       joined with blank lines.
    3. A ``list`` result - the text of every element, joined with blank
       lines.
    4. Anything else - ``str(result)``.

    Args:
        result: The object returned by the parser; its concrete type is not
            assumed, only probed with ``hasattr``/``getattr``/``isinstance``.

    Returns:
        The extracted text. An empty (or blank) string means nothing usable
        was found; callers are expected to treat that as a failure.
    """
    # str(None) would yield the literal "None", which looks like real content
    # to a caller that only checks for empty/whitespace output.
    if result is None:
        return ""

    if hasattr(result, "get_markdown_documents"):
        markdown_docs = result.get_markdown_documents(split_by_page=False)
        combined = getattr(markdown_docs[0], "text", None) if markdown_docs else None
        if isinstance(combined, str) and combined.strip():
            return combined

        # The combined document was missing or blank: fall back to per-page
        # markdown before giving up on this result object.
        pages = getattr(result, "pages", None)
        if pages:
            return "\n\n".join(p.md for p in pages if getattr(p, "md", None))

        # Hand a blank combined text back unchanged so the caller's
        # empty-content check fires instead of receiving a repr() string.
        if isinstance(combined, str):
            return combined

    if isinstance(result, list):
        # Join every element; returning only result[0].text would silently
        # drop the remaining documents.
        return "\n\n".join(_document_text(doc) for doc in result)

    return str(result)
|
||||||
|
|
||||||
|
|
||||||
async def parse_with_llamacloud(
|
async def parse_with_llamacloud(
|
||||||
file_path: str, estimated_pages: int, processing_mode: str = "basic"
|
file_path: str, estimated_pages: int, processing_mode: str = "basic"
|
||||||
) -> str:
|
) -> str:
|
||||||
from llama_cloud_services import LlamaParse
|
from llama_cloud_services import LlamaParse
|
||||||
|
from llama_cloud_services.parse.base import JobFailedException
|
||||||
from llama_cloud_services.parse.utils import ResultType
|
from llama_cloud_services.parse.utils import ResultType
|
||||||
|
|
||||||
file_size_bytes = os.path.getsize(file_path)
|
file_size_bytes = os.path.getsize(file_path)
|
||||||
|
|
@ -41,12 +62,13 @@ async def parse_with_llamacloud(
|
||||||
pool=120.0,
|
pool=120.0,
|
||||||
)
|
)
|
||||||
|
|
||||||
tier = LLAMA_TIER_BY_MODE.get(processing_mode, "cost_effective")
|
parse_mode = LLAMA_PARSE_MODE_MAP.get(processing_mode, "parse_page_with_llm")
|
||||||
|
|
||||||
logging.info(
|
logging.info(
|
||||||
f"LlamaCloud upload configured: file_size={file_size_mb:.1f}MB, "
|
f"LlamaCloud upload configured: file_size={file_size_mb:.1f}MB, "
|
||||||
f"pages={estimated_pages}, upload_timeout={upload_timeout:.0f}s, "
|
f"pages={estimated_pages}, upload_timeout={upload_timeout:.0f}s, "
|
||||||
f"job_timeout={job_timeout:.0f}s, tier={tier} (mode={processing_mode})"
|
f"job_timeout={job_timeout:.0f}s, parse_mode={parse_mode} "
|
||||||
|
f"(mode={processing_mode})"
|
||||||
)
|
)
|
||||||
|
|
||||||
last_exception = None
|
last_exception = None
|
||||||
|
|
@ -61,11 +83,12 @@ async def parse_with_llamacloud(
|
||||||
verbose=True,
|
verbose=True,
|
||||||
language="en",
|
language="en",
|
||||||
result_type=ResultType.MD,
|
result_type=ResultType.MD,
|
||||||
|
parse_mode=parse_mode,
|
||||||
|
ignore_errors=False,
|
||||||
max_timeout=int(max(2000, job_timeout + upload_timeout)),
|
max_timeout=int(max(2000, job_timeout + upload_timeout)),
|
||||||
job_timeout_in_seconds=job_timeout,
|
job_timeout_in_seconds=job_timeout,
|
||||||
job_timeout_extra_time_per_page_in_seconds=PER_PAGE_JOB_TIMEOUT,
|
job_timeout_extra_time_per_page_in_seconds=PER_PAGE_JOB_TIMEOUT,
|
||||||
custom_client=custom_client,
|
custom_client=custom_client,
|
||||||
tier=tier,
|
|
||||||
)
|
)
|
||||||
result = await parser.aparse(file_path)
|
result = await parser.aparse(file_path)
|
||||||
|
|
||||||
|
|
@ -75,27 +98,18 @@ async def parse_with_llamacloud(
|
||||||
f"{len(attempt_errors)} failures"
|
f"{len(attempt_errors)} failures"
|
||||||
)
|
)
|
||||||
|
|
||||||
if hasattr(result, "get_markdown_documents"):
|
content = _extract_content(result)
|
||||||
markdown_docs = result.get_markdown_documents(split_by_page=False)
|
if not content or not content.strip():
|
||||||
if markdown_docs and hasattr(markdown_docs[0], "text"):
|
raise RuntimeError(
|
||||||
return markdown_docs[0].text
|
"LlamaCloud returned empty/whitespace-only content"
|
||||||
if hasattr(result, "pages") and result.pages:
|
|
||||||
return "\n\n".join(
|
|
||||||
p.md for p in result.pages if hasattr(p, "md") and p.md
|
|
||||||
)
|
|
||||||
return str(result)
|
|
||||||
|
|
||||||
if isinstance(result, list):
|
|
||||||
if result and hasattr(result[0], "text"):
|
|
||||||
return result[0].text
|
|
||||||
return "\n\n".join(
|
|
||||||
doc.page_content if hasattr(doc, "page_content") else str(doc)
|
|
||||||
for doc in result
|
|
||||||
)
|
)
|
||||||
|
return content
|
||||||
|
|
||||||
return str(result)
|
except (
|
||||||
|
*LLAMACLOUD_RETRYABLE_EXCEPTIONS,
|
||||||
except LLAMACLOUD_RETRYABLE_EXCEPTIONS as e:
|
RuntimeError,
|
||||||
|
JobFailedException,
|
||||||
|
) as e:
|
||||||
last_exception = e
|
last_exception = e
|
||||||
error_type = type(e).__name__
|
error_type = type(e).__name__
|
||||||
error_msg = str(e)[:200]
|
error_msg = str(e)[:200]
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue