fix: llamacloud v2 impl

This commit is contained in:
DESKTOP-RTLN3BA\$punk 2026-04-16 01:15:47 -07:00
parent 2f793e7a69
commit 0e4285095c
2 changed files with 42 additions and 25 deletions

View file

@ -13,6 +13,9 @@ BASE_DIR = Path(__file__).resolve().parent.parent.parent
env_file = BASE_DIR / ".env" env_file = BASE_DIR / ".env"
load_dotenv(env_file) load_dotenv(env_file)
os.environ.setdefault("OR_APP_NAME", "SurfSense")
os.environ.setdefault("OR_SITE_URL", "https://surfsense.com")
def is_ffmpeg_installed(): def is_ffmpeg_installed():
""" """

View file

@ -16,16 +16,37 @@ from app.etl_pipeline.constants import (
calculate_upload_timeout, calculate_upload_timeout,
) )
LLAMA_TIER_BY_MODE = { LLAMA_PARSE_MODE_MAP = {
"basic": "cost_effective", "basic": "parse_page_with_llm",
"premium": "agentic_plus", "premium": "parse_page_with_agent",
} }
def _extract_content(result) -> str:
"""Pull markdown text out of whatever object LlamaParse.aparse returns."""
if hasattr(result, "get_markdown_documents"):
markdown_docs = result.get_markdown_documents(split_by_page=False)
if markdown_docs and hasattr(markdown_docs[0], "text"):
return markdown_docs[0].text
if hasattr(result, "pages") and result.pages:
return "\n\n".join(p.md for p in result.pages if hasattr(p, "md") and p.md)
if isinstance(result, list):
if result and hasattr(result[0], "text"):
return result[0].text
return "\n\n".join(
doc.page_content if hasattr(doc, "page_content") else str(doc)
for doc in result
)
return str(result)
async def parse_with_llamacloud( async def parse_with_llamacloud(
file_path: str, estimated_pages: int, processing_mode: str = "basic" file_path: str, estimated_pages: int, processing_mode: str = "basic"
) -> str: ) -> str:
from llama_cloud_services import LlamaParse from llama_cloud_services import LlamaParse
from llama_cloud_services.parse.base import JobFailedException
from llama_cloud_services.parse.utils import ResultType from llama_cloud_services.parse.utils import ResultType
file_size_bytes = os.path.getsize(file_path) file_size_bytes = os.path.getsize(file_path)
@ -41,12 +62,13 @@ async def parse_with_llamacloud(
pool=120.0, pool=120.0,
) )
tier = LLAMA_TIER_BY_MODE.get(processing_mode, "cost_effective") parse_mode = LLAMA_PARSE_MODE_MAP.get(processing_mode, "parse_page_with_llm")
logging.info( logging.info(
f"LlamaCloud upload configured: file_size={file_size_mb:.1f}MB, " f"LlamaCloud upload configured: file_size={file_size_mb:.1f}MB, "
f"pages={estimated_pages}, upload_timeout={upload_timeout:.0f}s, " f"pages={estimated_pages}, upload_timeout={upload_timeout:.0f}s, "
f"job_timeout={job_timeout:.0f}s, tier={tier} (mode={processing_mode})" f"job_timeout={job_timeout:.0f}s, parse_mode={parse_mode} "
f"(mode={processing_mode})"
) )
last_exception = None last_exception = None
@ -61,11 +83,12 @@ async def parse_with_llamacloud(
verbose=True, verbose=True,
language="en", language="en",
result_type=ResultType.MD, result_type=ResultType.MD,
parse_mode=parse_mode,
ignore_errors=False,
max_timeout=int(max(2000, job_timeout + upload_timeout)), max_timeout=int(max(2000, job_timeout + upload_timeout)),
job_timeout_in_seconds=job_timeout, job_timeout_in_seconds=job_timeout,
job_timeout_extra_time_per_page_in_seconds=PER_PAGE_JOB_TIMEOUT, job_timeout_extra_time_per_page_in_seconds=PER_PAGE_JOB_TIMEOUT,
custom_client=custom_client, custom_client=custom_client,
tier=tier,
) )
result = await parser.aparse(file_path) result = await parser.aparse(file_path)
@ -75,27 +98,18 @@ async def parse_with_llamacloud(
f"{len(attempt_errors)} failures" f"{len(attempt_errors)} failures"
) )
if hasattr(result, "get_markdown_documents"): content = _extract_content(result)
markdown_docs = result.get_markdown_documents(split_by_page=False) if not content or not content.strip():
if markdown_docs and hasattr(markdown_docs[0], "text"): raise RuntimeError(
return markdown_docs[0].text "LlamaCloud returned empty/whitespace-only content"
if hasattr(result, "pages") and result.pages:
return "\n\n".join(
p.md for p in result.pages if hasattr(p, "md") and p.md
) )
return str(result) return content
if isinstance(result, list): except (
if result and hasattr(result[0], "text"): *LLAMACLOUD_RETRYABLE_EXCEPTIONS,
return result[0].text RuntimeError,
return "\n\n".join( JobFailedException,
doc.page_content if hasattr(doc, "page_content") else str(doc) ) as e:
for doc in result
)
return str(result)
except LLAMACLOUD_RETRYABLE_EXCEPTIONS as e:
last_exception = e last_exception = e
error_type = type(e).__name__ error_type = type(e).__name__
error_msg = str(e)[:200] error_msg = str(e)[:200]