SurfSense/surfsense_evals/scripts/compute_adjusted_accuracy.py

"""Compute "intrinsic" accuracy by removing transient network errors.

A failure is *transient* if it's:
  * SSLError: SSL bad-record-mac (TLS hiccup)
  * Cloudflare 502 / 503 (provider-side load shedding)
  * empty_response with no error string and no other signal (likely
    connection reset mid-stream)
  * JSONDecodeError (parse error mid-stream)

A failure is *intrinsic* if it's a hard limit:
  * "exceeds .* limit" (size limits)
  * context_length errors
  * provider 400 with image / pdf decode failure
  * malformed-input failures

We re-compute accuracy per arm with two denominators (n = rows for that arm,
171 in this run):
  * raw acc       = correct / n  (what the headline reports)
  * adjusted acc  = correct / (n - transient_failures)  (intrinsic)

Outputs a table that we can drop straight into the blog.
"""

from __future__ import annotations

import json
from collections import defaultdict
from pathlib import Path

# Parent of the directory that contains this script.
REPO = Path(__file__).resolve().parents[1]
# Output folder of the one parser_compare run this script analyses.
RUN = REPO.joinpath(
    "data",
    "multimodal_doc",
    "runs",
    "2026-05-14T00-53-19Z",
    "parser_compare",
)
# JSONL file inside that run folder.
RAW = RUN.joinpath("raw.jsonl")


TRANSIENT_HINTS = (
    "sslv3_alert_bad_record_mac",
    "ssl_alert_bad_record_mac",
    "ssl: ssl",
    "cloudflare",
    "error 502",
    "error 503",
    "bad gateway",
    "service unavailable",
    "gateway timeout",
    "jsondecodeerror",
)
INTRINSIC_HINTS = (
    "exceeds",
    "context_length",
    "context window",
    "could not process pdf",
    "could not process image",
)


def classify(error: str | None, raw_text: str) -> str:
    err = (error or "").lower()
    if not err and not raw_text.strip():
        return "transient_empty"
    if any(h in err for h in TRANSIENT_HINTS):
        return "transient_ssl_or_5xx"
    if any(h in err for h in INTRINSIC_HINTS):
        return "intrinsic_limit"
    if err:
        return "other_error"
    return "ok"


def main() -> None:
    """Print a per-arm table of raw and transient-adjusted accuracy.

    Reads the JSONL file at ``RAW`` (blank lines are skipped) and, for each
    distinct value of a row's ``"arm"`` key, tallies:

    * ``n``       - number of rows,
    * ``correct`` - rows whose ``graded["correct"]`` is truthy,
    * one counter per non-``"ok"`` label returned by ``classify``, counted
      only for rows that were not graded correct.

    Only failed rows are classified. A row graded correct stays in both the
    numerator and the denominator, so removing transient failures from the
    denominator can never push the adjusted accuracy above 100%.

    Raises:
        FileNotFoundError: if ``RAW`` does not exist.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a row has no ``"arm"`` key.
    """
    by_arm: dict[str, dict] = defaultdict(lambda: {
        "n": 0, "correct": 0,
        "transient_ssl_or_5xx": 0, "transient_empty": 0,
        "intrinsic_limit": 0, "other_error": 0,
    })
    # Iterate the file instead of using str.splitlines(): splitlines() also
    # breaks on U+2028 / U+2029 / U+0085, which may appear unescaped inside a
    # JSON string and would cut a valid record in half.
    with RAW.open(encoding="utf-8") as fh:
        for line in fh:
            if not line.strip():
                continue
            row = json.loads(line)
            m = by_arm[row["arm"]]
            m["n"] += 1
            graded = row.get("graded") or {}
            if graded.get("correct"):
                m["correct"] += 1
                # A correct row is not a failure; do not classify it.
                continue
            kind = classify(row.get("error"), row.get("raw_text") or "")
            if kind != "ok":
                m[kind] += 1

    print(f"{'arm':<25} {'raw acc%':>8} {'transient':>10} {'intrinsic':>10} {'other':>6} {'adj acc% (no transient)':>22}")
    print("-" * 88)
    for arm in sorted(by_arm):
        m = by_arm[arm]
        # m["n"] is at least 1 for every arm present in by_arm.
        raw = m["correct"] / m["n"] * 100
        transient = m["transient_ssl_or_5xx"] + m["transient_empty"]
        intrinsic = m["intrinsic_limit"]
        other = m["other_error"]
        usable = m["n"] - transient
        # usable is 0 only when every row of the arm was a transient failure.
        adj = m["correct"] / usable * 100 if usable else 0
        print(
            f"{arm:<25} {raw:>7.1f}% {transient:>10} {intrinsic:>10} {other:>6} {adj:>21.1f}%"
        )

    print()
    print("transient   = SSLError / 502 / 503 / empty stream / mid-stream JSON decode (would")
    print("              succeed on retry; eval harness has no built-in retry today).")
    print("intrinsic   = hard limit (e.g. >30MB Anthropic request, model context overflow).")
    print("adj acc%    = correct / (n - transient) — what the arm scores when network noise")
    print("              is removed; closest thing we have to a like-for-like quality number.")


# Run the report only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
$DESKTOP-RTLN3BA\$punk$ feat(evals): publish multimodal_doc parser_compare benchmark + n=171 report Adds the full parser_compare experiment for the multimodal_doc suite: six arms compared on 30 PDFs / 171 questions from MMLongBench-Doc with anthropic/claude-sonnet-4.5 across the board. Source code: - core/parsers/{azure_di,llamacloud,pdf_pages}.py: direct parser SDK callers (Azure Document Intelligence prebuilt-read/layout, LlamaParse parse_page_with_llm/parse_page_with_agent) used by the LC arms, bypassing the SurfSense backend so each (basic/premium) extraction is a clean A/B independent of backend ETL routing. - suites/multimodal_doc/parser_compare/{ingest,runner,prompt}.py: six-arm benchmark (native_pdf, azure_basic_lc, azure_premium_lc, llamacloud_basic_lc, llamacloud_premium_lc, surfsense_agentic) with byte-identical prompts per question, deterministic grader, Wilson CIs, and the per-page preprocessing tariff cost overlay. Reproducibility: - pyproject.toml + uv.lock pin pypdf, azure-ai-documentintelligence, llama-cloud-services as new deps. - .env.example documents the AZURE_DI_* and LLAMA_CLOUD_API_KEY env vars now required for parser_compare. - 12 analysis scripts under scripts/: retry pass with exponential backoff, post-retry accuracy merge, McNemar / latency / per-PDF stats, context-overflow hypothesis test, etc. Each produces one number cited by the blog report. Citation surface: - reports/blog/multimodal_doc_parser_compare_n171_report.md: 1219-line technical writeup (16 sections) covering headline accuracy, per-format accuracy, McNemar pairwise significance, latency / token / per-PDF distributions, error analysis, retry experiment, post-retry final accuracy, cost amortization model with closed-form derivation, threats to validity, and reproducibility appendix. 
- data/multimodal_doc/runs/2026-05-14T00-53-19Z/parser_compare/{raw, raw_retries,raw_post_retry}.jsonl + run_artifact.json + retry summary whitelisted via data/.gitignore as the verifiable numbers source. Gitignore: - ignore logs_*.txt + retry_run.log; structured artifacts cover the citation surface, debug logs are noise. - data/.gitignore default-ignores everything, whitelists the n=171 run artifacts only (parser manifest left ignored to avoid leaking local Windows usernames in absolute paths; manifest is fully regenerable via 'ingest multimodal_doc parser_compare'). - reports/.gitignore now whitelists hand-curated reports/blog/. Also retires the abandoned CRAG Task 3 implementation (download script, streaming Task 3 ingest, CragTask3Benchmark + tests) and trims the runner / ingest module APIs to match. Co-authored-by: Cursor <cursoragent@cursor.com> 2026-05-14 19:54:41 -07:00			`"""Compute "intrinsic" accuracy by removing transient network errors.`

			`A failure is transient if it's:`
			`* SSLError: SSL bad-record-mac (TLS hiccup)`
			`* Cloudflare 502 / 503 (provider-side load shedding)`
			`* empty_response with no error string and no other signal (likely`
			`connection reset mid-stream)`
			`* JSONDecodeError (parse error mid-stream)`

			`A failure is intrinsic if it's a hard limit:`
			`* "exceeds .* limit" (size limits)`
			`* context_length errors`
			`* provider 400 with image / pdf decode failure`
			`* malformed-input failures`

			`We re-compute accuracy with two denominators:`
			`* raw acc = correct / 171 (what the headline reports)`
			`* adjusted acc = correct / (171 - transient_failures) (intrinsic)`

			`Outputs a table that we can drop straight into the blog.`
			`"""`

			`from __future__ import annotations`

			`import json`
			`from collections import defaultdict`
			`from pathlib import Path`

			`REPO = Path(__file__).resolve().parents[1]`
			`RUN = REPO / "data" / "multimodal_doc" / "runs" / "2026-05-14T00-53-19Z" / "parser_compare"`
			`RAW = RUN / "raw.jsonl"`


			`TRANSIENT_HINTS = (`
			`"sslv3_alert_bad_record_mac",`
			`"ssl_alert_bad_record_mac",`
			`"ssl: ssl",`
			`"cloudflare",`
			`"error 502",`
			`"error 503",`
			`"bad gateway",`
			`"service unavailable",`
			`"gateway timeout",`
			`"jsondecodeerror",`
			`)`
			`INTRINSIC_HINTS = (`
			`"exceeds",`
			`"context_length",`
			`"context window",`
			`"could not process pdf",`
			`"could not process image",`
			`)`


			`def classify(error: str \| None, raw_text: str) -> str:`
			`err = (error or "").lower()`
			`if not err and not raw_text.strip():`
			`return "transient_empty"`
			`if any(h in err for h in TRANSIENT_HINTS):`
			`return "transient_ssl_or_5xx"`
			`if any(h in err for h in INTRINSIC_HINTS):`
			`return "intrinsic_limit"`
			`if err:`
			`return "other_error"`
			`return "ok"`


			`def main() -> None:`
			`rows = [`
			`json.loads(line) for line in RAW.read_text(encoding="utf-8").splitlines()`
			`if line.strip()`
			`]`
			`by_arm: dict[str, dict] = defaultdict(lambda: {`
			`"n": 0, "correct": 0,`
			`"transient_ssl_or_5xx": 0, "transient_empty": 0,`
			`"intrinsic_limit": 0, "other_error": 0,`
			`})`
			`for row in rows:`
			`arm = row["arm"]`
			`m = by_arm[arm]`
			`m["n"] += 1`
			`graded = row.get("graded") or {}`
			`if graded.get("correct"):`
			`m["correct"] += 1`
			`kind = classify(row.get("error"), row.get("raw_text") or "")`
			`if kind != "ok":`
			`m[kind] += 1`

			`print(f"{'arm':<25} {'raw acc%':>8} {'transient':>10} {'intrinsic':>10} {'other':>6} {'adj acc% (no transient)':>22}")`
			`print("-" * 88)`
			`for arm in sorted(by_arm):`
			`m = by_arm[arm]`
			`raw = m["correct"] / m["n"] * 100`
			`transient = m["transient_ssl_or_5xx"] + m["transient_empty"]`
			`intrinsic = m["intrinsic_limit"]`
			`other = m["other_error"]`
			`usable = m["n"] - transient`
			`adj = m["correct"] / usable * 100 if usable else 0`
			`print(`
			`f"{arm:<25} {raw:>7.1f}% {transient:>10} {intrinsic:>10} {other:>6} {adj:>21.1f}%"`
			`)`

			`print()`
			`print("transient = SSLError / 502 / 503 / empty stream / mid-stream JSON decode (would")`
			`print(" succeed on retry; eval harness has no built-in retry today).")`
			`print("intrinsic = hard limit (e.g. >30MB Anthropic request, model context overflow).")`
			`print("adj acc% = correct / (n - transient) — what the arm scores when network noise")`
			`print(" is removed; closest thing we have to a like-for-like quality number.")`


			`if __name__ == "__main__":`
			`main()`