From 0e4285095cf8db55df52592053e67a90049b747a Mon Sep 17 00:00:00 2001 From: "DESKTOP-RTLN3BA\\$punk" Date: Thu, 16 Apr 2026 01:15:47 -0700 Subject: [PATCH] fix: llamacloud v2 impl --- surfsense_backend/app/config/__init__.py | 3 + .../app/etl_pipeline/parsers/llamacloud.py | 64 +++++++++++-------- 2 files changed, 42 insertions(+), 25 deletions(-) diff --git a/surfsense_backend/app/config/__init__.py b/surfsense_backend/app/config/__init__.py index 5bc1b48ce..a515e9044 100644 --- a/surfsense_backend/app/config/__init__.py +++ b/surfsense_backend/app/config/__init__.py @@ -13,6 +13,9 @@ BASE_DIR = Path(__file__).resolve().parent.parent.parent env_file = BASE_DIR / ".env" load_dotenv(env_file) +os.environ.setdefault("OR_APP_NAME", "SurfSense") +os.environ.setdefault("OR_SITE_URL", "https://surfsense.com") + def is_ffmpeg_installed(): """ diff --git a/surfsense_backend/app/etl_pipeline/parsers/llamacloud.py b/surfsense_backend/app/etl_pipeline/parsers/llamacloud.py index 138786b74..fbc098ec2 100644 --- a/surfsense_backend/app/etl_pipeline/parsers/llamacloud.py +++ b/surfsense_backend/app/etl_pipeline/parsers/llamacloud.py @@ -16,16 +16,37 @@ from app.etl_pipeline.constants import ( calculate_upload_timeout, ) -LLAMA_TIER_BY_MODE = { - "basic": "cost_effective", - "premium": "agentic_plus", +LLAMA_PARSE_MODE_MAP = { + "basic": "parse_page_with_llm", + "premium": "parse_page_with_agent", } +def _extract_content(result) -> str: + """Pull markdown text out of whatever object LlamaParse.aparse returns.""" + if hasattr(result, "get_markdown_documents"): + markdown_docs = result.get_markdown_documents(split_by_page=False) + if markdown_docs and hasattr(markdown_docs[0], "text"): + return markdown_docs[0].text + if hasattr(result, "pages") and result.pages: + return "\n\n".join(p.md for p in result.pages if hasattr(p, "md") and p.md) + + if isinstance(result, list): + if result and hasattr(result[0], "text"): + return result[0].text + return "\n\n".join( + doc.page_content if 
hasattr(doc, "page_content") else str(doc) + for doc in result + ) + + return str(result) + + async def parse_with_llamacloud( file_path: str, estimated_pages: int, processing_mode: str = "basic" ) -> str: from llama_cloud_services import LlamaParse + from llama_cloud_services.parse.base import JobFailedException from llama_cloud_services.parse.utils import ResultType file_size_bytes = os.path.getsize(file_path) @@ -41,12 +62,13 @@ async def parse_with_llamacloud( pool=120.0, ) - tier = LLAMA_TIER_BY_MODE.get(processing_mode, "cost_effective") + parse_mode = LLAMA_PARSE_MODE_MAP.get(processing_mode, "parse_page_with_llm") logging.info( f"LlamaCloud upload configured: file_size={file_size_mb:.1f}MB, " f"pages={estimated_pages}, upload_timeout={upload_timeout:.0f}s, " - f"job_timeout={job_timeout:.0f}s, tier={tier} (mode={processing_mode})" + f"job_timeout={job_timeout:.0f}s, parse_mode={parse_mode} " + f"(mode={processing_mode})" ) last_exception = None @@ -61,11 +83,12 @@ async def parse_with_llamacloud( verbose=True, language="en", result_type=ResultType.MD, + parse_mode=parse_mode, + ignore_errors=False, max_timeout=int(max(2000, job_timeout + upload_timeout)), job_timeout_in_seconds=job_timeout, job_timeout_extra_time_per_page_in_seconds=PER_PAGE_JOB_TIMEOUT, custom_client=custom_client, - tier=tier, ) result = await parser.aparse(file_path) @@ -75,27 +98,18 @@ async def parse_with_llamacloud( f"{len(attempt_errors)} failures" ) - if hasattr(result, "get_markdown_documents"): - markdown_docs = result.get_markdown_documents(split_by_page=False) - if markdown_docs and hasattr(markdown_docs[0], "text"): - return markdown_docs[0].text - if hasattr(result, "pages") and result.pages: - return "\n\n".join( - p.md for p in result.pages if hasattr(p, "md") and p.md - ) - return str(result) - - if isinstance(result, list): - if result and hasattr(result[0], "text"): - return result[0].text - return "\n\n".join( - doc.page_content if hasattr(doc, "page_content") else 
str(doc) - for doc in result + content = _extract_content(result) + if not content or not content.strip(): + raise RuntimeError( + "LlamaCloud returned empty/whitespace-only content" ) + return content - return str(result) - - except LLAMACLOUD_RETRYABLE_EXCEPTIONS as e: + except ( + *LLAMACLOUD_RETRYABLE_EXCEPTIONS, + RuntimeError, + JobFailedException, + ) as e: last_exception = e error_type = type(e).__name__ error_msg = str(e)[:200]