fix: llamacloud v2 impl

2026-04-26 01:06:23 +02:00 · 2026-04-16 01:15:47 -07:00 · 2026-04-16 01:15:47 -07:00 · 0e4285095c
commit 0e4285095c
parent 2f793e7a69
2 changed files with 42 additions and 25 deletions
--- a/surfsense_backend/app/config/init.py
+++ b/surfsense_backend/app/config/init.py
@ -13,6 +13,9 @@ BASE_DIR = Path(__file__).resolve().parent.parent.parent
 env_file = BASE_DIR / ".env"
 load_dotenv(env_file)
 os.environ.setdefault("OR_APP_NAME", "SurfSense")
 os.environ.setdefault("OR_SITE_URL", "https://surfsense.com")
 def is_ffmpeg_installed():
    """
--- a/surfsense_backend/app/etl_pipeline/parsers/llamacloud.py
+++ b/surfsense_backend/app/etl_pipeline/parsers/llamacloud.py
@ -16,16 +16,37 @@ from app.etl_pipeline.constants import (
    calculate_upload_timeout,
 )
-LLAMA_TIER_BY_MODE = {
+LLAMA_PARSE_MODE_MAP = {
-    "basic": "cost_effective",
+    "basic": "parse_page_with_llm",
-    "premium": "agentic_plus",
+    "premium": "parse_page_with_agent",
 }
 def _extract_content(result) -> str:
    """Pull markdown text out of whatever object LlamaParse.aparse returns."""
    if hasattr(result, "get_markdown_documents"):
        markdown_docs = result.get_markdown_documents(split_by_page=False)
        if markdown_docs and hasattr(markdown_docs[0], "text"):
            return markdown_docs[0].text
        if hasattr(result, "pages") and result.pages:
            return "\n\n".join(p.md for p in result.pages if hasattr(p, "md") and p.md)
    if isinstance(result, list):
        if result and hasattr(result[0], "text"):
            return result[0].text
        return "\n\n".join(
            doc.page_content if hasattr(doc, "page_content") else str(doc)
            for doc in result
        )
    return str(result)
 async def parse_with_llamacloud(
    file_path: str, estimated_pages: int, processing_mode: str = "basic"
 ) -> str:
    from llama_cloud_services import LlamaParse
    from llama_cloud_services.parse.base import JobFailedException
    from llama_cloud_services.parse.utils import ResultType
    file_size_bytes = os.path.getsize(file_path)
@ -41,12 +62,13 @@ async def parse_with_llamacloud(
        pool=120.0,
    )
-    tier = LLAMA_TIER_BY_MODE.get(processing_mode, "cost_effective")
+    parse_mode = LLAMA_PARSE_MODE_MAP.get(processing_mode, "parse_page_with_llm")
    logging.info(
        f"LlamaCloud upload configured: file_size={file_size_mb:.1f}MB, "
        f"pages={estimated_pages}, upload_timeout={upload_timeout:.0f}s, "
-        f"job_timeout={job_timeout:.0f}s, tier={tier} (mode={processing_mode})"
+        f"job_timeout={job_timeout:.0f}s, parse_mode={parse_mode} "
        f"(mode={processing_mode})"
    )
    last_exception = None
@ -61,11 +83,12 @@ async def parse_with_llamacloud(
                    verbose=True,
                    language="en",
                    result_type=ResultType.MD,
                    parse_mode=parse_mode,
                    ignore_errors=False,
                    max_timeout=int(max(2000, job_timeout + upload_timeout)),
                    job_timeout_in_seconds=job_timeout,
                    job_timeout_extra_time_per_page_in_seconds=PER_PAGE_JOB_TIMEOUT,
                    custom_client=custom_client,
                    tier=tier,
                )
                result = await parser.aparse(file_path)
@ -75,27 +98,18 @@ async def parse_with_llamacloud(
                        f"{len(attempt_errors)} failures"
                    )
-                if hasattr(result, "get_markdown_documents"):
+                content = _extract_content(result)
-                    markdown_docs = result.get_markdown_documents(split_by_page=False)
+                if not content or not content.strip():
-                    if markdown_docs and hasattr(markdown_docs[0], "text"):
+                    raise RuntimeError(
-                        return markdown_docs[0].text
+                        "LlamaCloud returned empty/whitespace-only content"
                    if hasattr(result, "pages") and result.pages:
                        return "\n\n".join(
                            p.md for p in result.pages if hasattr(p, "md") and p.md
                    )
-                    return str(result)
+                return content
-                if isinstance(result, list):
+        except (
-                    if result and hasattr(result[0], "text"):
+            *LLAMACLOUD_RETRYABLE_EXCEPTIONS,
-                        return result[0].text
+            RuntimeError,
-                    return "\n\n".join(
+            JobFailedException,
-                        doc.page_content if hasattr(doc, "page_content") else str(doc)
+        ) as e:
                        for doc in result
                    )
                return str(result)
        except LLAMACLOUD_RETRYABLE_EXCEPTIONS as e:
            last_exception = e
            error_type = type(e).__name__
            error_msg = str(e)[:200]