feat: added blog posts

2026-07-02 22:01:05 +02:00 · 2026-05-15 11:55:30 -07:00 · 2026-05-15 11:55:30 -07:00 · 52a64fb96c
commit 52a64fb96c
parent eea2d68098
19 changed files with 749 additions and 1347 deletions
--- a/surfsense_web/app/(home)/blog/blog-magazine.tsx
+++ b/surfsense_web/app/(home)/blog/blog-magazine.tsx
@ -35,9 +35,7 @@ function SearchIcon({ className }: { className?: string }) {
 }

 export function BlogWithSearchMagazine({ blogs }: { blogs: BlogEntry[] }) {
-	const featured = blogs[0];
-
-	if (!featured) {
+	if (blogs.length === 0) {
 		return (
 			<div className="relative overflow-hidden bg-neutral-50 px-4 md:px-8 dark:bg-neutral-950">
 				<Container className="relative pt-12 pb-24 md:pt-20">
@ -47,6 +45,17 @@ export function BlogWithSearchMagazine({ blogs }: { blogs: BlogEntry[] }) {
 		);
 	}

+	// `blogs` arrives pre-sorted from the server: explicitly featured posts
+	// first (ordered by `featured_order` asc, then date desc), then the rest
+	// by date desc. If nothing is explicitly featured, fall back to treating
+	// the newest post as the cover so the layout never feels empty up top.
+	// `MagazineSearchGrid` re-filters using `heroSlugs` so the hero/featured
+	// posts never duplicate into the archive grid.
+	const explicitlyFeatured = blogs.filter((b) => b.featured);
+	const heroBlogs = explicitlyFeatured.length > 0 ? explicitlyFeatured : blogs.slice(0, 1);
+	const heroSlugs = new Set(heroBlogs.map((b) => b.slug));
+	const [coverStory, ...secondaryFeatured] = heroBlogs;
+
 	return (
 		<div className="relative overflow-hidden bg-neutral-50 px-4 pt-20 md:px-8 dark:bg-neutral-950">
 			<div className="pointer-events-none absolute inset-0 bg-[radial-gradient(ellipse_80%_50%_at_50%_-20%,rgba(120,119,198,0.15),transparent)] dark:bg-[radial-gradient(ellipse_80%_50%_at_50%_-20%,rgba(120,119,198,0.12),transparent)]" />
@ -57,14 +66,38 @@ export function BlogWithSearchMagazine({ blogs }: { blogs: BlogEntry[] }) {
 					</h1>
 				</header>

-				<MagazineFeatured blog={featured} />
+				<MagazineFeatured blog={coverStory} />

-				<MagazineSearchGrid blogs={blogs} featuredSlug={featured.slug} />
+				{secondaryFeatured.length > 0 ? (
+					<MoreFeatured blogs={secondaryFeatured} />
+				) : null}
+
+				<MagazineSearchGrid blogs={blogs} excludedSlugs={heroSlugs} />
 			</Container>
 		</div>
 	);
 }

+function MoreFeatured({ blogs }: { blogs: BlogEntry[] }) {
+	return (
+		<section aria-labelledby="more-featured-heading" className="mb-14">
+			<h2
+				id="more-featured-heading"
+				className="mb-6 font-serif text-2xl font-medium text-neutral-900 dark:text-neutral-100"
+			>
+				More featured
+			</h2>
+			<ul className="grid gap-6 sm:grid-cols-2">
+				{blogs.map((blog) => (
+					<li key={blog.slug}>
+						<MagazineCard blog={blog} />
+					</li>
+				))}
+			</ul>
+		</section>
+	);
+}
+
 function MagazineFeatured({ blog }: { blog: BlogEntry }) {
 	return (
 		<Link
@ -112,10 +145,11 @@ function MagazineFeatured({ blog }: { blog: BlogEntry }) {

 function MagazineSearchGrid({
 	blogs: allBlogs,
-	featuredSlug,
+	excludedSlugs,
 }: {
 	blogs: BlogEntry[];
-	featuredSlug: string;
+	/** Slugs already shown above the archive (cover story + "More featured"). */
+	excludedSlugs: Set<string>;
 }) {
 	const [search, setSearch] = useState("");

@ -128,12 +162,15 @@ function MagazineSearchGrid({
 	);

 	const gridItems = useMemo(() => {
+		// When the reader is searching, surface every match (including
+		// featured posts they may be looking for); otherwise hide the posts
+		// that are already rendered as featured above the archive.
 		const results = search.trim() ? searcher.search(search) : allBlogs;
 		if (search.trim()) {
 			return results;
 		}
-		return results.filter((b) => b.slug !== featuredSlug);
-	}, [search, searcher, allBlogs, featuredSlug]);
+		return results.filter((b) => !excludedSlugs.has(b.slug));
+	}, [search, searcher, allBlogs, excludedSlugs]);

 	return (
 		<section aria-labelledby="archive-heading">
--- a/surfsense_web/app/(home)/blog/page.tsx
+++ b/surfsense_web/app/(home)/blog/page.tsx
@ -25,6 +25,8 @@ export interface BlogEntry {
 	image: string;
 	author: string;
 	authorAvatar: string;
+	featured: boolean;
+	featuredOrder?: number;
 }

 export default async function BlogPage() {
@ -38,6 +40,8 @@ export default async function BlogPage() {
 			image?: string;
 			author?: string;
 			authorAvatar?: string;
+			featured?: boolean;
+			featured_order?: number;
 		};
 	}>;

@ -51,8 +55,20 @@ export default async function BlogPage() {
 			image: page.data.image ?? "/og-image.png",
 			author: page.data.author ?? "SurfSense Team",
 			authorAvatar: page.data.authorAvatar ?? "/logo.png",
+			featured: page.data.featured ?? false,
+			featuredOrder: page.data.featured_order,
 		}))
-		.sort((a, b) => new Date(b.date).getTime() - new Date(a.date).getTime());
+		.sort((a, b) => {
+			// Featured first; then by `featured_order` asc within featured;
+			// then by `date` desc as the universal tie-breaker.
+			if (a.featured !== b.featured) return a.featured ? -1 : 1;
+			if (a.featured && b.featured) {
+				const aOrder = a.featuredOrder ?? Number.POSITIVE_INFINITY;
+				const bOrder = b.featuredOrder ?? Number.POSITIVE_INFINITY;
+				if (aOrder !== bOrder) return aOrder - bOrder;
+			}
+			return new Date(b.date).getTime() - new Date(a.date).getTime();
+		});

 	return <BlogWithSearchMagazine blogs={blogs} />;
 }
--- a/surfsense_web/blog/content/agentic-rag-vs-long-context-llms-benchmark.mdx
+++ b/surfsense_web/blog/content/agentic-rag-vs-long-context-llms-benchmark.mdx
@ -0,0 +1,387 @@
+---
+title: "Agentic RAG vs Long-Context LLMs: A 171-Question Benchmark on 30 Long PDFs"
+description: "We benchmarked agentic RAG against long-context LLMs and native PDF attachment on 171 real questions across 30 long, multimodal PDFs, using Claude Sonnet 4.5 on every arm. Accuracy, cost per query, failure modes, and a vision-LLM-vs-OCR finding the internet still expects to go the other way."
+date: "2026-05-15"
+image: "/images/blog/agentic-rag-vs-long-context-llms-benchmark/placeholder-01-hero-image.png"
+author: "SurfSense Team"
+authorAvatar: "/logo.png"
+tags:
+  - "Agentic RAG"
+  - "Long-Context LLM"
+  - "RAG vs Agentic"
+  - "Vision LLM vs OCR"
+  - "Benchmark"
+  - "Claude Sonnet 4.5"
+  - "MMLongBench-Doc"
+featured: true
+featured_order: 1
+---
+
+> **TL;DR for skimmers**
+>
+> We ran six different ways of answering questions over 30 long, image-heavy PDFs (a total of 171 real questions) using the *same* large language model, Claude Sonnet 4.5, and measured accuracy, cost per question, and how often each approach broke. The result:
+>
+> - **Full-context "long-context" approaches won on raw accuracy** (LlamaCloud premium 59.6%, Azure premium 58.5%).
+> - **Agentic RAG was nearly as accurate (53.2%) at less than half the cost ($0.0827 per question vs $0.18–$0.26)** and zero failed queries out of 171.
+> - **Most accuracy gaps were not statistically significant.** 12 of 15 head-to-head comparisons could be coin-flips (McNemar test, α = 0.05).
+> - **Vision LLMs did not beat traditional OCR.** Letting Claude read the PDF directly with its built-in vision (the `native_pdf` arm) finished 5th of 6, behind four of the five parser-based pipelines, with a stubborn 7% intrinsic failure rate that survived 5 retries with exponential backoff.
+>
+> Practical takeaway: if you are building a long-PDF Q&A product, **agentic RAG is the boring-but-correct default**. Reach for full-context only when the document fits, the budget allows, and the accuracy gain matters. Don't bet on vision LLMs replacing OCR pipelines yet.
+
+<img
+  src="/images/blog/agentic-rag-vs-long-context-llms-benchmark/placeholder-01-hero-image.png"
+  alt="Diagram comparing agentic RAG, long-context LLM, and native PDF pipelines for document question answering."
+  width={1920}
+  height={1080}
+  style={{ width: '100%', height: 'auto', borderRadius: '12px' }}
+/>
+
+## Why this matters: the agentic RAG vs long-context debate
+
+If you are shipping anything that lets a user ask questions about a PDF, whether that is a contract analyser, a research assistant, or an internal docs chatbot, you have hit one of the loudest arguments in AI engineering today.
+
+On one side: **long-context LLMs**. Modern models from Anthropic, OpenAI and Google now accept hundreds of thousands of tokens in a single prompt. Just stuff the whole document in and ask the question. Simple, fast to build.
+
+On the other side: **agentic RAG** (retrieval-augmented generation, where an agent dynamically pulls relevant chunks instead of dumping the whole document into the prompt). More complex, but classically considered cheaper and safer at scale.
+
+Layered on top of that is a quieter argument: **do you even need a document parser anymore?** Frontier models from Anthropic, OpenAI and Google now read PDFs natively using their vision stack. The story everyone wants to be true is that vision-capable LLMs make OCR pipelines obsolete. We tested that story too. Spoiler: not yet.
+
+The internet is full of opinions and very thin on data, especially for *long, multimodal* PDFs (the messy, image-heavy real-world kind). So we built a benchmark with the same model on every arm and measured what actually happens, on both questions at once.
+
+## What is agentic RAG?
+
+Quick definitions, in plain English, then we move to the data.
+
+**RAG (retrieval-augmented generation)** is the standard pattern for letting a language model answer questions about your private documents. You chunk the documents into pieces, store them in a vector database, and at query time you retrieve the chunks most likely to contain the answer and pass them to the model.
+
+**Agentic RAG** is RAG with an LLM agent in the driver's seat. Instead of one fixed retrieval step, the agent can:
+
+- ask itself sub-questions,
+- run multiple searches with different queries,
+- decide when it has enough evidence,
+- ignore irrelevant chunks,
+- and stop when the answer is complete.
+
+Think of vanilla RAG as handing a librarian one note that says *"find me the answer to X"*. Agentic RAG is handing the same librarian a research brief and a clipboard, and letting them walk back and forth between the shelves until the report writes itself.
+
+<img
+  src="/images/blog/agentic-rag-vs-long-context-llms-benchmark/placeholder-02-architecture-diagram.png"
+  alt="Agentic RAG architecture diagram showing an LLM agent iteratively retrieving document chunks before producing a final answer."
+  width={1920}
+  height={1080}
+  style={{ width: '100%', height: 'auto', borderRadius: '12px' }}
+/>
+
+For a 5-minute video walk-through, IBM Technology has the highest-ranked explainer on YouTube right now (325k views, watched by us, and accurate):
+
+<div style={{ position: 'relative', paddingBottom: '56.25%', height: 0, overflow: 'hidden', borderRadius: '12px', margin: '1.5rem 0' }}>
+  <iframe
+    src="https://www.youtube-nocookie.com/embed/0z9_MhcYvcY"
+    title="What is Agentic RAG? - IBM Technology"
+    allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share"
+    referrerPolicy="strict-origin-when-cross-origin"
+    allowFullScreen
+    style={{ position: 'absolute', top: 0, left: 0, width: '100%', height: '100%', border: 0 }}
+  />
+</div>
+
+## How we ran the benchmark
+
+To make the comparison fair, every arm answered the same questions and used the exact same large language model: **Claude Sonnet 4.5**, called through OpenRouter so the API path was identical.
+
+The dataset was [**MMLongBench-Doc**](https://mayubo2333.github.io/MMLongBench-Doc/) ([paper](https://arxiv.org/abs/2407.01523), [GitHub](https://github.com/mayubo2333/MMLongBench-Doc), [Hugging Face](https://huggingface.co/datasets/yubo2333/MMLongBench-Doc)), an open multimodal-document benchmark of long PDFs with vetted question-answer pairs. The full corpus is 135 PDFs averaging 47.5 pages each, with 1,091 expert-annotated questions across 7 domains (33% cross-page, 22.5% deliberately unanswerable to detect hallucinations). We used the first 30 documents (a mix of research papers, financial filings, product catalogues, and image-heavy reports) and all 171 of their answerable questions.
+
+### Why multimodal documents?
+
+Real-world PDFs are messy. They contain charts, scanned tables, photos, multi-column layouts, and footnotes inside footnotes. A clean text-only benchmark wouldn't tell us anything useful about whether these approaches survive contact with the documents people actually upload to AI products. MMLongBench-Doc was built to include exactly that messiness, which is the territory where parser quality and retrieval strategy actually start to matter. We wanted the benchmark to look like the real inbox of an AI app, not a sanitised research toy.
+
+### Why only 30 documents?
+
+The full MMLongBench-Doc corpus has 135 PDFs. Processing the entire dataset across all six arms would have taken significantly longer to complete on our machine, so we capped the run at 30 to keep iteration time reasonable. We're upfront about what that costs us statistically in the significance section below: a bigger sample would have tightened every confidence interval. The findings here should be read as strong directional evidence, not a final verdict.
+
+### The six arms
+
+| Arm | What it does | Preprocessing | What goes in the prompt |
+|---|---|---|---|
+| `native_pdf` | Sends the raw PDF file directly to the model | None | The PDF itself, every question |
+| `azure_basic_lc` | Parses the PDF with Azure Document Intelligence (cheap mode) | $1 per 1,000 pages | The whole markdown, every question |
+| `azure_premium_lc` | Same as above, premium parser (preserves layout) | $10 per 1,000 pages | The whole markdown, every question |
+| `llamacloud_basic_lc` | Parses the PDF with LlamaParse (cheap mode) | $1 per 1,000 pages | The whole markdown, every question |
+| `llamacloud_premium_lc` | LlamaParse premium with layout/table preservation | $10 per 1,000 pages | The whole markdown, every question |
+| `surfsense_agentic` | Full agentic RAG pipeline | $10 per 1,000 pages (one-time ingest) | Only the chunks the agent decides to retrieve |
+
+Arms 2-5 are what we call **"long-context" or "full-context" stuffing**: parse the PDF once, paste the entire result into every prompt. Arm 6 is the agentic RAG approach. Arm 1, the `native_pdf` "just attach the PDF" pattern, is doing double duty here. It is also the **"vision LLM replaces OCR" hypothesis**: instead of any markdown parser, the model reads the PDF directly using its built-in vision capabilities. If vision-capable LLMs are good enough to retire OCR pipelines, this arm should be at the top of the table. (It isn't.)
+
+If you want to read the implementations, every arm lives in [`surfsense_evals/src/surfsense_evals/core/arms/`](https://github.com/MODSetter/SurfSense/tree/main/surfsense_evals/src/surfsense_evals/core/arms) — the [`bare_llm.py`](https://github.com/MODSetter/SurfSense/blob/main/surfsense_evals/src/surfsense_evals/core/arms/bare_llm.py) arm handles full-context stuffing, [`native_pdf.py`](https://github.com/MODSetter/SurfSense/blob/main/surfsense_evals/src/surfsense_evals/core/arms/native_pdf.py) handles vision-LLM PDF attachment, and [`surfsense.py`](https://github.com/MODSetter/SurfSense/blob/main/surfsense_evals/src/surfsense_evals/core/arms/surfsense.py) drives the agentic retrieval against the SurfSense `/api/v1/new_chat` endpoint. The full benchmark suite (prompts, ingest pipeline, runner) lives in [`suites/multimodal_doc/parser_compare/`](https://github.com/MODSetter/SurfSense/tree/main/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/parser_compare).
+
+We graded answers with a [deterministic, format-aware grader](https://github.com/MODSetter/SurfSense/blob/main/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/mmlongbench/grader.py) (1% relative tolerance for floats, F1 over normalised tokens for lists). We logged input/output tokens, cost, latency, and any HTTP error per question.
+
+## Headline results: who wins on accuracy?
+
+After running all 171 questions through all 6 arms, then re-running the 37 failed queries with up to 5 attempts of exponential backoff, here is the scoreboard:
+
+<img
+  src="/images/blog/agentic-rag-vs-long-context-llms-benchmark/placeholder-03-accuracy-bar-chart-dark.png"
+  alt="Bar chart of post-retry accuracy on 171 long-PDF questions: LlamaCloud premium 59.6%, Azure premium 58.5%, Azure basic 54.4%, SurfSense agentic RAG 53.2%, Native PDF 52.0%, LlamaCloud basic 50.9%."
+  width={2200}
+  height={1240}
+  style={{ width: '100%', height: 'auto', borderRadius: '12px' }}
+/>
+
+The full table including F1, raw failures, and median latency:
+
+Bolded cell = winner of that column.
+
+| Rank | Arm | Accuracy | F1 | Median latency | Raw failures |
+|---:|---|---:|---:|---:|---:|
+| 1 | LlamaCloud premium, long-context | **59.6%** | **61.1%** | **6.8 s** | 4 |
+| 2 | Azure premium, long-context | 58.5% | 59.6% | 6.9 s | 3 |
+| 3 | Azure basic, long-context | 54.4% | 56.6% | 7.1 s | 1 |
+| 4 | SurfSense agentic RAG | 53.2% | 54.3% | 52.8 s | **0** |
+| 5 | Native PDF attachment | 52.0% | 50.4% | 29.5 s | 27 |
+| 6 | LlamaCloud basic, long-context | 50.9% | 53.2% | 7.1 s | 2 |
+
+A few things jump out:
+
+1. **The two long-context premium parsers win on raw accuracy**, but only by about 6 percentage points over agentic RAG.
+2. **Agentic RAG was the only arm with zero failures** out of 171 questions.
+3. **Native PDF attachment was one of the weakest performers** (5th of 6 on accuracy, last on F1, and by far the most raw failures) despite being the most "AI-native" approach. More on why in the failure-mode section.
+4. **Latency on agentic RAG is high (52.8 s)** because the agent does several retrieval rounds. For batch jobs it's fine; for chat UX you'd stream partial results.
+
+Now the part most blog posts skip.
+
+## Cost per query: where agentic RAG wins big
+
+Accuracy is only half the story. Every approach also has a price tag: the LLM call plus the document preprocessing.
+
+<img
+  src="/images/blog/agentic-rag-vs-long-context-llms-benchmark/placeholder-04-cost-vs-accuracy-dark.png"
+  alt="Scatter plot of accuracy versus cost per query for six document-QA approaches; SurfSense agentic RAG sits at the cheapest end with competitive accuracy."
+  width={2200}
+  height={1280}
+  style={{ width: '100%', height: 'auto', borderRadius: '12px' }}
+/>
+
+Bolded cell = winner of that column.
+
+| Arm | Total $/Q | Accuracy |
+|---|---:|---:|
+| SurfSense agentic RAG | **$0.0827** | 53.2% |
+| LlamaCloud basic | $0.1049 | 50.9% |
+| Azure basic | $0.1062 | 54.4% |
+| LlamaCloud premium | $0.1885 | **59.6%** |
+| Azure premium | $0.2051 | 58.5% |
+| Native PDF | $0.2552 | 52.0% |
+
+The headline number: **agentic RAG was the cheapest arm at $0.0827 per question, about 56% cheaper than the most accurate full-context arm and 67% cheaper than native PDF attachment.** The technique wins on cost regardless of which agentic-RAG framework you use; we just happened to measure it with ours.
+
+Why is agentic RAG so much cheaper? Because every full-context arm pays the LLM bill *for the entire document on every single question*. A 100-page PDF? You pay for 100 pages of input tokens 10 times if the user asks 10 questions. Agentic RAG pays the parser once at ingest time, just like the full-context arms, but then only sends the retrieved chunks (often 1–5% of the document) per question.
+
+There is a clean closed-form for this. If a document has *P* pages, a parser costs *c<sub>p</sub>* per page, the LLM costs *c<sub>L</sub>* per full-document call, and the user asks *Q* questions, then full-context cost-per-question is roughly:
+
+```
+Cost/Q ≈ (P × c_p) / Q + c_L
+```
+
+For agentic RAG it is:
+
+```
+Cost/Q ≈ (P × c_p) / Q + c_L × r
+```
+
+where `r` is the *retrieval ratio*, typically 0.02 to 0.10. So the more questions per document, the more agentic RAG dominates. For knowledge bases that get queried more than a couple of times, the gap widens by the day.
+
+## Failure modes: what 37 broken queries taught us
+
+We did not just count successes. We logged every error.
+
+Of 1,026 total `(arm, question)` cells, 37 returned no answer on the first pass. We then re-ran *only* those 37 with up to 5 attempts of exponential backoff (the [`retry_failed_questions.py`](https://github.com/MODSetter/SurfSense/blob/main/surfsense_evals/scripts/retry_failed_questions.py) script in the harness). The results separated **transient** (network/server) failures from **intrinsic** (the approach actually cannot do this) failures:
+
+Bolded cell = best result on that column (lowest failure rate, highest recovery rate).
+
+| Arm | First-pass failures | Recovered on retry | Intrinsic failures | Intrinsic failure rate |
+|---|---:|---:|---:|---:|
+| All 4 long-context arms (combined) | 10 | **10 (100%)** | **0** | **0%** |
+| Native PDF | 27 | 15 | 12 | 7.0% |
+| SurfSense agentic RAG | **0** | n/a | **0** | **0%** |
+
+Two findings worth highlighting:
+
+**1. Long-context "context overflow" was a myth.** We hypothesised that the long-context arms might be silently failing because the document didn't fit in the context window. We tested it: the failures clustered around HTTP/SSL errors (the request body was up to 30 MB, riding the public internet), not token limits. Once we retried, all 10 came back successfully. The Claude Sonnet 4.5 context window held up fine; the *transport layer* wobbled.
+
+**2. Native PDF has a stubborn 7% intrinsic failure rate.** Two specific PDFs broke it permanently:
+
+- a 27-page image-heavy PDF whose binary exceeded the provider's 30 MB request-body cap (6 questions broken);
+- a 166-page PDF whose response stream the provider could never reliably terminate (5 questions, repeated `empty stream` errors).
+
+Even with 5 attempts of exponential backoff, those 12 questions stayed broken. **For any production app that processes PDFs from arbitrary users, that is a 7% "this document cannot be answered today" rate**, which is unacceptable for most product flows.
+
+Agentic RAG sidesteps both problems because it never sends the raw PDF and never sends the entire document context in one giant request.
+
+### What this means for the vision-LLM-vs-OCR debate
+
+Bigger picture, the `native_pdf` numbers settle a question we wanted to answer: **on long, image-heavy PDFs, vision-capable LLMs reading the document directly did not outperform plain OCR plus markdown.** They came in 5th of 6 on accuracy (52.0% vs 50.9% to 59.6% for the OCR-based pipelines), were the most expensive arm at $0.2552 per question, and failed 7% of the time even after retries. Premium OCR with layout extraction held up better on the exact pages where you would expect vision to shine, the chart-and-table-heavy ones.
+
+The point is not that vision LLMs are bad. They are remarkable. The point is that the parser pipeline you already maintain is not yet obsolete, and the "skip the parser, attach the PDF" shortcut is not a free lunch.
+
+## Statistical significance: are these results actually different?
+
+This is the section most benchmarks omit, and it changes the conclusions.
+
+We ran McNemar's exact-binomial test on every pair of arms (15 pairs total) using [`compute_blog_extras.py`](https://github.com/MODSetter/SurfSense/blob/main/surfsense_evals/scripts/compute_blog_extras.py). McNemar is the right test for paired classifier comparisons: both arms answered the *same* questions, so we can ask: "of the questions where the two arms disagreed, did one really win more often than chance?"
+
+The result: **only 3 of 15 pairs are distinguishable at α = 0.05**.
+
+The three statistically-significant gaps are all between the *worst* arms (LlamaCloud basic, Native PDF) and the *best* arms (LlamaCloud premium, Azure premium). The most interesting comparison, **SurfSense agentic RAG vs the long-context premium arms**, does *not* clear the significance bar. The 6-point gap could plausibly be sample noise.
+
+In other words: on this dataset, the headline claim "long-context beats agentic RAG by 6 percentage points" is real on the scoreboard but **not statistically robust**. Run the same benchmark on a different sample of 30 PDFs and the order could shuffle. This is also the place where our 30-document scope bites us: a bigger run would have given more comparisons enough power to settle.
+
+## When to choose what: a decision framework
+
+Data without an action plan delivers only half the value. Here is how we would decide for a real product, using the same numbers.
+
+<img
+  src="/images/blog/agentic-rag-vs-long-context-llms-benchmark/placeholder-05-decision-tree.png"
+  alt="Decision tree showing when to choose agentic RAG, long-context full-context, or native PDF attachment for document question answering."
+  width={1920}
+  height={1080}
+  style={{ width: '100%', height: 'auto', borderRadius: '12px' }}
+/>
+
+### Use **agentic RAG** when
+
+- documents are long and mixed in size (some 5 pages, some 500),
+- the same documents will be queried more than once or twice,
+- cost per query matters (SaaS pricing, large user base),
+- you cannot afford intermittent failures on big PDFs,
+- you need to scale to corpora of thousands of documents.
+
+This is the default for most production AI products.
+
+### Use **long-context (full-context) LLMs** when
+
+- documents reliably fit in the context window after parsing (typically under ~150 pages of text),
+- the accuracy gain (6 percentage points in our benchmark, or zero, depending on which way the noise goes) actually justifies the roughly 130–150% extra cost of the premium full-context arms,
+- you have one or two questions per document, not dozens,
+- you can absorb occasional network failures on large request bodies.
+
+Premium parsing matters here. **Spending $10 per 1,000 pages on a layout-aware parser is worth it**: it gave us +4 to +9 accuracy points over basic parsers on the same questions.
+
+### Use **native PDF attachment** when
+
+- you are prototyping and want to ship in an afternoon,
+- documents are small and well-formatted,
+- you can tolerate a 7% failure rate (or you have validated the specific PDFs you care about don't trip the limits).
+
+Don't use it as the default for user-uploaded PDFs in production. The 30 MB request-body cap and unstable response streams will bite you, and exponential backoff will not save you.
+
+## Frequently asked questions
+
+<Accordion type="multiple" className="w-full not-prose">
+  <AccordionItem value="faq-1">
+    <AccordionTrigger>What is agentic RAG?</AccordionTrigger>
+    <AccordionContent className="flex flex-col gap-4 text-balance">
+      Agentic RAG is retrieval-augmented generation where an LLM agent (not a fixed pipeline) decides what to retrieve, when to stop, and how to combine evidence. Instead of one search and one answer, the agent can run multiple retrievals, refine its query, and iterate. It usually costs less than full-context prompting and handles arbitrarily large document collections.
+    </AccordionContent>
+  </AccordionItem>
+  <AccordionItem value="faq-2">
+    <AccordionTrigger>How is agentic RAG different from traditional RAG?</AccordionTrigger>
+    <AccordionContent className="flex flex-col gap-4 text-balance">
+      Traditional RAG runs a single, fixed retrieval step: take the user's question, find the top-k similar chunks, send them to the LLM. Agentic RAG lets the LLM plan, retrieve repeatedly, evaluate intermediate results, and decide when it has enough context. It is more flexible at the cost of more LLM calls, and it tends to outperform vanilla RAG on multi-hop or ambiguous queries.
+    </AccordionContent>
+  </AccordionItem>
+  <AccordionItem value="faq-3">
+    <AccordionTrigger>When should I use long-context LLMs instead of RAG?</AccordionTrigger>
+    <AccordionContent className="flex flex-col gap-4 text-balance">
+      When the document fits in the model's context window after parsing, you have a small number of queries per document, accuracy matters more than cost, and you can tolerate occasional transport-layer failures on multi-megabyte requests. In our benchmark, full-context premium parsers led on accuracy (about 58–60%) but cost 2–3× more per query than agentic RAG.
+    </AccordionContent>
+  </AccordionItem>
+  <AccordionItem value="faq-4">
+    <AccordionTrigger>What is a long context window?</AccordionTrigger>
+    <AccordionContent className="flex flex-col gap-4 text-balance">
+      A long context window is the maximum amount of text (measured in tokens) that an LLM can read in a single prompt. Modern frontier models support 200,000 tokens or more, which is roughly 150,000 words or 300+ printed pages. A long context window enables "just stuff the whole document in" approaches, but it does not eliminate the need for RAG when corpora exceed what one prompt can hold or when cost matters.
+    </AccordionContent>
+  </AccordionItem>
+  <AccordionItem value="faq-5">
+    <AccordionTrigger>How do you benchmark agentic RAG?</AccordionTrigger>
+    <AccordionContent className="flex flex-col gap-4 text-balance">
+      Run the same set of real-world questions through each approach using the *same* underlying LLM, log accuracy with a deterministic grader, log cost (LLM + preprocessing), log latency, and run pairwise McNemar tests for statistical significance. We used 171 questions across 30 long PDFs from MMLongBench-Doc.
+    </AccordionContent>
+  </AccordionItem>
+  <AccordionItem value="faq-6">
+    <AccordionTrigger>How much does agentic RAG cost per query?</AccordionTrigger>
+    <AccordionContent className="flex flex-col gap-4 text-balance">
+      In our benchmark, agentic RAG cost **$0.0827 per question** end-to-end (including a one-time premium parsing cost amortised across all questions for the document). The cheapest full-context arm cost $0.1049 (about 27% more); the most expensive cost $0.2552 (over 3× more). Cost per query for agentic RAG drops further as you ask more questions per document.
+    </AccordionContent>
+  </AccordionItem>
+  <AccordionItem value="faq-7">
+    <AccordionTrigger>Is RAG dead now that we have long context?</AccordionTrigger>
+    <AccordionContent className="flex flex-col gap-4 text-balance">
+      No, and this benchmark is part of the evidence. Long-context wins on raw accuracy by a small margin that is mostly within statistical noise, but RAG (especially agentic RAG) wins on cost per query, on robustness to large or malformed documents, and on horizontal scaling to large corpora. The right answer is "use the cheapest pattern that hits your accuracy target", which for most production apps is agentic RAG.
+    </AccordionContent>
+  </AccordionItem>
+  <AccordionItem value="faq-8">
+    <AccordionTrigger>Do vision LLMs outperform OCR for PDF question answering?</AccordionTrigger>
+    <AccordionContent className="flex flex-col gap-4 text-balance">
+      Not in our benchmark. The `native_pdf` arm, which lets Claude Sonnet 4.5 read each PDF directly using its native vision capabilities, finished 5th of 6 with 52.0% accuracy and a 7% intrinsic failure rate. Every OCR-based pipeline we tested (Azure Document Intelligence and LlamaParse, in both basic and premium tiers) cost less per question, and all but basic LlamaParse (50.9%) beat it on accuracy. Premium OCR with layout extraction held up especially well on chart-heavy and table-heavy pages, the exact territory where you would expect a vision model to dominate. Vision-capable LLMs may catch up as the models improve, but as of mid-2026, the safer default for long, multimodal PDFs is still parser plus markdown.
+    </AccordionContent>
+  </AccordionItem>
+</Accordion>
+
+## What this means for your AI app
+
+If you are choosing an architecture for a long-PDF Q&A product *today*:
+
+1. **Start with agentic RAG.** It is the cheapest, most robust default and gets you within statistical noise of full-context approaches.
+2. **Pay for premium parsing once.** Whether you choose RAG or full-context, layout-aware parsing buys you real accuracy points. The marginal cost is trivial against the LLM bill.
+3. **Avoid plain "attach the PDF" in production** unless you have validated every document path. The 7% intrinsic failure rate is real and not retry-able.
+4. **Don't trust accuracy gaps under 5–6 points** unless you have tested for significance. McNemar's test takes 30 seconds in Python and saves you from embarrassing benchmark posts.
+5. **Don't bet on vision LLMs replacing OCR yet.** On 30 long, image-heavy PDFs, the native PDF (vision LLM) path was matched or beaten on accuracy by every OCR-based pipeline and was the most expensive arm at $0.2552 per question. The OCR pipeline you already maintain is not obsolete.
+
+## Reproduce this benchmark
+
+Everything that produced these numbers is open source. The eval harness is its own package inside the SurfSense monorepo:
+
+- [`surfsense_evals/`](https://github.com/MODSetter/SurfSense/tree/main/surfsense_evals) — the harness root (extensible base classes, providers, cost ledger).
+- [`suites/multimodal_doc/parser_compare/`](https://github.com/MODSetter/SurfSense/tree/main/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/parser_compare) — the benchmark used in this post (prompts, ingest, runner).
+- [`core/arms/bare_llm.py`](https://github.com/MODSetter/SurfSense/blob/main/surfsense_evals/src/surfsense_evals/core/arms/bare_llm.py), [`native_pdf.py`](https://github.com/MODSetter/SurfSense/blob/main/surfsense_evals/src/surfsense_evals/core/arms/native_pdf.py), [`surfsense.py`](https://github.com/MODSetter/SurfSense/blob/main/surfsense_evals/src/surfsense_evals/core/arms/surfsense.py) — the three arm implementations.
+- [`mmlongbench/grader.py`](https://github.com/MODSetter/SurfSense/blob/main/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/mmlongbench/grader.py) — the deterministic format-aware grader.
+- [`scripts/retry_failed_questions.py`](https://github.com/MODSetter/SurfSense/blob/main/surfsense_evals/scripts/retry_failed_questions.py) — the failed-only retry pass with exponential backoff.
+- [`scripts/compute_blog_extras.py`](https://github.com/MODSetter/SurfSense/blob/main/surfsense_evals/scripts/compute_blog_extras.py) — the McNemar pairwise tests, latency/token percentiles, and per-PDF heterogeneity.
+- [`scripts/compute_post_retry_accuracy.py`](https://github.com/MODSetter/SurfSense/blob/main/surfsense_evals/scripts/compute_post_retry_accuracy.py) — merges retry survivors back into the run and recomputes the headline numbers.
+
+A minimal end-to-end run looks like:
+
+```bash
+# 1. Clone, install (uv recommended)
+git clone https://github.com/MODSetter/SurfSense
+cd SurfSense/surfsense_evals
+uv sync --extra dev
+
+# 2. Configure provider keys (Azure DI, LlamaCloud, OpenRouter, SurfSense)
+cp .env.example .env
+$EDITOR .env
+
+# 3. Ingest the first 30 PDFs from MMLongBench-Doc into all parsers
+uv run python -m surfsense_evals.cli setup multimodal_doc \
+  --vision-llm anthropic/claude-sonnet-4.5
+uv run python -m surfsense_evals.cli ingest multimodal_doc \
+  --suite parser_compare --max-docs 30
+
+# 4. Run all six arms × all 171 questions
+uv run python -m surfsense_evals.cli run multimodal_doc \
+  --suite parser_compare --sample-per-doc 20 --concurrency 2
+
+# 5. Retry failures + compute final stats
+uv run python scripts/retry_failed_questions.py
+uv run python scripts/compute_post_retry_accuracy.py
+uv run python scripts/compute_blog_extras.py
+```
+
+The dataset itself is on [Hugging Face](https://huggingface.co/datasets/yubo2333/MMLongBench-Doc) and the [original GitHub repo](https://github.com/mayubo2333/MMLongBench-Doc) (NeurIPS 2024 D&B Spotlight, [paper](https://arxiv.org/abs/2407.01523)). Bring your own LLM provider; swap `anthropic/claude-sonnet-4.5` for `openai/gpt-4o`, `google/gemini-2.5-pro`, or any OpenRouter slug to repeat the experiment with a different model.
+
+If you find that the rankings shuffle on your own document set, we want to hear about it. Open an issue on [the SurfSense repo](https://github.com/MODSetter/SurfSense/issues) with the run artifacts and we will link your results from this post.
+
+The eval harness is open source and runs against any OpenRouter model, so you can re-run the same questions on `openai/gpt-4o`, `google/gemini-2.5-pro`, or whichever model you are evaluating for production. Wire your own RAG framework into the [`Arm` base class](https://github.com/MODSetter/SurfSense/blob/main/surfsense_evals/src/surfsense_evals/core/arms/base.py) (LangChain, LlamaIndex, Haystack, your own stack) and you can drop it into the same comparison without changing the rest of the pipeline.
+
+If you want a hosted way to try agentic RAG on your own PDFs without writing the harness yourself, [SurfSense](/free) is one option (it is the same agentic stack that powered the `surfsense_agentic` arm above).
--- a/surfsense_web/blog/content/no-login-ai-privacy-reality-check.mdx
+++ b/surfsense_web/blog/content/no-login-ai-privacy-reality-check.mdx
@ -0,0 +1,289 @@
+---
+title: "How to Use Claude, ChatGPT, and Gemini Without Signing Up: A Plain-English 2026 Guide"
+description: "Where to use Claude, ChatGPT, Gemini, and other top AI models without making an account. Honest 2026 guide for casual users and developers, covering message caps, what each tool limits, and the privacy reality behind 'no login required'."
+date: "2026-05-15"
+image: "/images/blog/no-login-ai-privacy-reality-check/placeholder-01-no-login-vs-no-tracking-hero.png"
+author: "SurfSense Team"
+authorAvatar: "/logo.png"
+tags:
+  - "AI Without Login"
+  - "Free AI Chat"
+  - "ChatGPT Without Account"
+  - "Claude Without Login"
+  - "Gemini Without Login"
+  - "Claude Incognito"
+  - "Duck.ai"
+  - "Brave Leo"
+  - "Self-Hosted AI"
+featured: false
+---
+
+> **TL;DR for skimmers**
+>
+> You don't need an account to use the best AI models in 2026. Here's the brand-by-brand answer:
+>
+> - **Want ChatGPT?** Open `chatgpt.com` in a fresh tab. Guest mode works, no signup, but the message cap is tight.
+> - **Want Claude?** `claude.ai` itself still wants an account, though Anthropic [shipped incognito mode on April 9, 2026](https://support.claude.com/en/articles/12260368-using-incognito-chats) for chats that stay out of your history once you're signed in. For zero-account Claude, use [Duck.ai](https://duck.ai) (Claude Haiku 4.5, anonymized) or [Brave Leo](https://brave.com/leo/) inside the Brave browser.
+> - **Want Gemini?** Google requires a Google account on `gemini.google.com`. The closest no-signup path is an open-source aggregator like our own [SurfSense /free](/free), which lists Gemini among its model options without a Google sign-in.
+> - **Want all of them in one place?** Our open-source [SurfSense /free](/free) lets you pick from ChatGPT, Claude, Gemini, DeepSeek, Mistral, Llama, and a rotating list of other models with no account, and you get 500,000 free tokens to spend across any of them. [Duck.ai](https://duck.ai) and [Brave Leo](https://brave.com/leo/) are strong privacy-first alternatives.
+> - **Care about privacy too?** Most "no login" pages still log your IP and prompt content. The exceptions are Brave Leo, Duck.ai, and self-hosted models. Skip to [the privacy honest-talk](#the-privacy-honest-talk-no-login-is-not-the-same-as-anonymous) if that's the part you came for.
+
+<img
+  src="/images/blog/no-login-ai-privacy-reality-check/placeholder-01-no-login-vs-no-tracking-hero.png"
+  alt="Side-by-side illustration of a browser with a 'No login required' badge and the same browser with hidden IP, fingerprint, and prompt-log data being inspected, showing that no login is not the same as no tracking."
+  width={1920}
+  height={1080}
+  style={{ width: '100%', height: 'auto', borderRadius: '12px' }}
+/>
+
+The promise of the search query "AI without login" is simple: someone wants to chat with a top model without making an account. Maybe they're trying it out for the first time. Maybe they don't want another email in their inbox. Maybe they care about privacy. Maybe they're recommending an AI to a class, a parent, or a coworker who isn't going to deal with a signup wall.
+
+Whatever the reason, the answer in 2026 is mostly: **yes, you can do this**, but the options are scattered across product pages, browser features, and a long tail of wrapper sites that look identical. This guide is the cleanest map we could draw, organised so a casual user gets value in the first three minutes and a developer or privacy-conscious reader can keep going deeper.
+
+We're going to cite primary sources for everything that touches privacy or product behavior, so you can verify (and so the article ages well as the products change). The two cluster posts that go deeper into [Claude specifically](/blog/use-claude-without-login-2026) and into [a tested comparison of 12 free AI chats](/blog/tested-no-login-ai-chats-2026) will sit alongside this one when they're ready.
+
+## Where to use each major AI model without an account
+
+Let's go brand by brand. For each one, what works, what the cap looks like, and where to go.
+
+### ChatGPT, with no OpenAI account
+
+The shortest answer: open [chatgpt.com](https://chatgpt.com) in a private/incognito tab. ChatGPT's guest mode lets you send a few prompts and get GPT-5-class output without making an account. There's no specific page to go to; the site detects you're not logged in and gives you a guest experience automatically.
+
+**What it gives you:** the same headline GPT-5 model that paying users start with, for short queries. The interface is the standard ChatGPT UI without the sidebar or chat history panel.
+
+**What it doesn't give you:** file uploads, the Code Interpreter, conversation history (refresh the page and your chat is gone), or advanced features like custom GPTs or memory.
+
+**The catch:** the guest cap is around 10 messages per 5-hour rolling window on the headline GPT-5-class model ([per OpenAI's current behavior](https://www.smashingapps.com/how-to-use-chatgpt-without-creating-an-account/)). After that you're auto-downgraded to the lighter GPT-5 Mini variant, which has no hard limit, rather than being blocked by a hard "please sign up" wall. So you can keep going indefinitely on the smaller model; you just can't keep using the headline one.
+
+If GPT-5 quality on a no-signup page matters more than going through OpenAI specifically, [Bing Copilot](https://copilot.microsoft.com) at `copilot.microsoft.com` runs on a GPT-5-class backend, also works without a Microsoft account, and tends to have a more generous cap because Microsoft monetises through Bing search instead of subscriptions.
+
+You can also use **[SurfSense /free](/free)** (full disclosure: this is our own open-source aggregator), which lists ChatGPT among its no-signup models. You get 500,000 free tokens to spend across any model on the page, which is meaningfully more than what guest mode gives you on the headline model before the downgrade. The source code is on [GitHub](https://github.com/MODSetter/SurfSense) so the privacy and quota behavior is auditable, not just promised.
+
+### Claude, with as little friction as possible
+
+Anthropic does require an account on `claude.ai` itself, but the picture is much better than it was a year ago. There are now four legitimate paths to using Claude without going through the full signup-and-stay-logged-in experience.
+
+**Path 1: Anthropic's incognito mode (existing account required).** Launched April 9, 2026, available on every Claude plan from Free to Enterprise. Click the ghost icon in the upper-right when starting a new chat. The interface gets a black border and an "Incognito chat" label. The conversation is not saved to your chat history, not used by Claude's memory feature, and not used for training. Source: [Anthropic Help Center](https://support.claude.com/en/articles/12260368-using-incognito-chats). This is the right answer if you already have a Claude account and want a temporary, no-trace conversation.
+
+**Path 2: Duck.ai (no account at all).** [duck.ai](https://duck.ai) is DuckDuckGo's chat product. Pick "Claude 4.5 Haiku" from the model dropdown and start chatting. No signup, no email. DuckDuckGo proxies your request through their own servers, so Anthropic never sees your IP. We'll cover the full privacy mechanics [below](#the-privacy-honest-talk-no-login-is-not-the-same-as-anonymous). Per-session cap exists but there's no persistent quota.
+
+**Path 3: Brave Leo (no account, browser-side).** Install the [Brave browser](https://brave.com), open the sidebar, click the Leo icon, pick Claude Haiku from the model dropdown. No signup. Brave doesn't collect identifiers tied to you ([per their docs](https://brave.com/leo/)). The trade-off is that you have to use Brave as your browser, and you're limited to Haiku on the free tier (Sonnet and Opus require Brave Leo Premium at $14.99/month).
+
+**Path 4: Multi-model aggregator pages.** These wrap the Anthropic API and serve Claude responses without an account on Anthropic. The pick we'd recommend (with the obvious disclosure that we made it) is **[SurfSense /free](/free)**: it lists Claude alongside ChatGPT, Gemini, DeepSeek, Mistral, and Llama in one chat UI, the source code is open on [GitHub](https://github.com/MODSetter/SurfSense) so the privacy and quota behavior is verifiable, and the 500K free token quota is shared across any model you pick (so you can spend the budget on Claude if that's what you came for). The closed-source alternatives ([HIX.AI](https://hix.ai/claude), [EaseMate](https://www.easemate.ai/ai-chat/ask-claude), [Eye2.ai](https://eye2.ai), [NoteGPT](https://notegpt.io/ai-models/claude-sonnet-4-5)) work too, but quality and message limits vary widely; we [tested 12 of these](/blog/tested-no-login-ai-chats-2026) in a separate post.
+
+For the developer-specific paths (Claude Code with Bedrock or Vertex AI authentication, the Claude for Open Source program, the `/passes` Guest Pass system), see our [Claude-specific deep dive](/blog/use-claude-without-login-2026).
+
+### Gemini, the awkward one
+
+Google requires a Google account on `gemini.google.com` and there is no first-party guest mode. If you don't already use a Google account, this is the model with the most signup friction.
+
+The realistic options:
+
+- **Sign in with a throwaway Google account.** Imperfect but functional, and Google's free tier on Gemini is genuinely good (15 GB of Drive storage, deep web research, native voice).
+- **Use an aggregator that wraps Gemini.** **[SurfSense /free](/free)** (disclosure: ours, open source on [GitHub](https://github.com/MODSetter/SurfSense)) lists Gemini among its model options and forwards requests to Google's API behind the scenes, so the user-facing chat works with no Google sign-in and no Google identity tied to your prompts. Quality matches the underlying Gemini API tier we pay for. Other wrapper pages do the same thing but few publish their privacy or quota behavior; ours is auditable in source, and the 500K-token quota is shared across any model on the page (not just Gemini).
+- **Pick a different model.** If "I want long-context plus web search without signing in" is the actual need, Brave Leo with Claude Haiku, Bing Copilot, or Perplexity (more on that below) are no-signup substitutes for the most common Gemini use cases.
+
+### Multiple models in one tab
+
+If you want to compare answers from Claude, GPT, Gemini, and others without juggling four browser tabs and four signup walls, three products do this without an account:
+
+- **[SurfSense /free](/free)** (disclosure: ours, open source on [GitHub](https://github.com/MODSetter/SurfSense)) gives you a rotating list of models from OpenAI, Anthropic, Google, DeepSeek, Mistral, and Meta in one chat UI, no account, with **500,000 free tokens shared across any model** on the page (meaningfully more than the per-session caps you'll hit on Duck.ai or Brave Leo before they make you wait). The model lineup updates as new models ship, and the wrapper-layer code is on GitHub so the no-account session, the no-database-storage claim, and the quota behavior are all auditable. Trade-off: when you hit the 500K, you either sign up ($5 free credit, then $1 per $1 of model cost) or self-host.
+- **[Duck.ai](https://duck.ai)** has Claude 4.5 Haiku, Llama 4 Scout, Mistral Small 3 24B, GPT-4o mini, GPT-5 mini, and gpt-oss-120b on the free tier. The big win is genuine IP anonymisation; the trade-off is that everything is the cheaper tier of each model.
+- **[Brave Leo](https://brave.com/leo/)** has Claude Haiku, Llama 3.1 8B, Mixtral, and Qwen 3 14B in the Brave browser sidebar. Best privacy story of the three, but you have to use Brave as your browser.
+
+### Honourable mentions
+
+- **Perplexity** at [perplexity.ai](https://www.perplexity.ai) lets you do search-flavored AI lookups without an account. The interface is built around citations, which is great for fact-finding and not great for chat-style writing or code.
+- **DeepAI** at [deepai.org/chat](https://deepai.org/chat) and **HotBot AI** are older standalone chat products. They work, the model quality is below the frontier, and the privacy story is unremarkable.
+- **HIX.AI, EaseMate, NoteGPT, TalkAI, ChatBot Chat App** are wrapper sites that all do roughly the same thing: front a paid model API and serve it for free without an account. Use them if convenience is the only goal, with the caveat that none of them publicly commit to not retaining your prompts (more on that below).
+
+## What you give up by not making an account
+
+There's a real tradeoff between "skip the signup" and "have a good account experience". For most casual use it's worth it. For some workflows it isn't.
+
+Things you lose without an account:
+
+- **Saved chat history.** Most no-signup paths don't persist your conversation. Refresh the page and it's gone. (Brave Leo is an exception: it stores chat history locally on your device, so it survives between sessions but never leaves your machine. Duck.ai does the same with its "Recent Chats" feature.)
+- **File uploads, in most cases.** ChatGPT guest mode and Claude incognito do not let you attach a PDF or image. Duck.ai and Brave Leo are limited too. Aggregators vary.
+- **Generous message limits.** ChatGPT guest mode caps fast. Claude on `claude.ai` lets account holders send 30-50 messages per 5-hour rolling window; guest paths are usually tighter.
+- **Cross-device continuity.** No signup means no syncing your conversations from laptop to phone.
+- **Power features.** No memory, no custom instructions, no Code Interpreter, no Anthropic Projects, no Gemini Workspace integrations.
+
+Things you gain:
+
+- **No marketing emails.** You're not on a list. You won't get the "you haven't tried our new feature!" emails or the retargeting ads that follow you around the web.
+- **No persistent identity.** The provider sees a session, not a user. Your prompts aren't tied to your purchase history, your YouTube viewing habits, or any other product the provider runs.
+- **No risk of accidental cross-account leakage.** A coworker who borrows your laptop sees a fresh chat, not your private history.
+
+For a quick prompt or a one-off question, the gains usually win. For sustained work, expect to either tolerate the limits or eventually sign up.
+
+## The privacy honest-talk: "no login" is not the same as "anonymous"
+
+Here's the part that surprises most readers, and the one most listicles dodge: skipping the signup does not make your AI usage anonymous. It removes one identifier (your account) but leaves several others in place. Whether that matters depends entirely on what you're pasting into the chat box.
+
+The honest categorisation of no-signup AI looks like this.
+
+<img
+  src="/images/blog/no-login-ai-privacy-reality-check/placeholder-02-three-tier-pyramid.png"
+  alt="A 3-tier pyramid showing: bottom tier 'genuinely anonymized' with Brave Leo, Duck.ai, and self-hosted Ollama; middle tier 'no account but provider logs' with chatgpt.com guest, Claude incognito, and Bing Copilot; top tier 'wrapper sites that also log you' with HIX, EaseMate, NoteGPT examples."
+  width={1920}
+  height={1080}
+  style={{ width: '100%', height: 'auto', borderRadius: '12px' }}
+/>
+
+**Anonymised tools** (the smallest group). The provider commits to not logging your IP and not retaining your prompts. There are essentially three options today:
+
+- **Brave Leo.** Per [Brave's product page](https://brave.com/leo/): "We do not collect identifiers that can be linked to you (such as IP Address), and no personal data is retained." Translation: the model provider that powers Leo (Anthropic, Mistral, Meta) sees the prompt content but not your IP.
+- **Duck.ai.** Per [DuckDuckGo's documentation](https://duckduckgo.com/duckduckgo-help-pages/duckai/): DuckDuckGo "anonymizes" your request by stripping your IP and replacing it with their own before forwarding to the underlying model. They don't store prompts, don't train on your data, and your "Recent Chats" are stored locally on your device.
+- **Self-hosted open-weights models.** Llama 3.3, Qwen 2.5, Mistral Large, DeepSeek R1 running locally on your laptop via [Ollama](https://ollama.com) or [LM Studio](https://lmstudio.ai). The model never leaves your machine. The only entity that sees your data is you. The "How to" is in [the section below](#for-developers-and-the-privacy-serious-self-host-in-5-minutes).
+
+**Account-free first-party tools.** No signup required, but the provider running the model still sees your IP, your prompt text, and your session metadata. The standard examples:
+
+- **`chatgpt.com` guest mode** is OpenAI logging your prompts directly. Their privacy policy applies. PCMag's [ChatGPT Tracks More Than You Think](https://www.pcmag.com/explainers/chatgpt-tracks-more-than-you-think-how-to-lock-down-your-privacy) is a good summary of what this looks like.
+- **Claude incognito mode** is Anthropic logging your prompts, with the specific feature carve-out that incognito chats aren't saved to your visible history, aren't used by Claude's memory, and aren't used for training. **The important caveat**: per [Anthropic's docs](https://support.claude.com/en/articles/12260368-using-incognito-chats), incognito chats "are retained for either 30 days (default), or longer in accordance with your organization's custom data retention setting (available for Enterprise plans)." Not in your history, not actually deleted either.
+- **Bing Copilot** is Microsoft logging your prompts. Standard Microsoft privacy policy applies.
+
+**Wrapper sites** that don't require an account but ALSO log you on top of the underlying provider's logging. Most of the "free Claude!" and "free GPT!" pages from the search results are in this group. They serve a real model, but they're a server in the middle that has its own logs, and most of them don't publicly commit to not retaining your prompts. Convenient. Not private.
+
+When you're evaluating any "free [Brand] without login" page, the question to ask is: *does their privacy page explicitly say they don't store prompts and don't pass identifying information to the model provider?* If the answer is just "no signup required!" with nothing about logging, you're in the wrapper category.
+
+**Open-source wrappers** are a half-step better than the closed ones, and worth calling out as a separate category. Our own [SurfSense /free](/free) is in this bucket: the source code that handles your prompt is on [GitHub](https://github.com/MODSetter/SurfSense), so the claims about anonymous sessions, no persistent identity, and no prompt retention are auditable rather than promised. That doesn't make it equivalent to Brave Leo or Duck.ai (the model provider behind /free still receives the prompt content), but it does mean you can verify the wrapper layer doesn't add its own logging on top. If you're going to use a wrapper anyway, prefer one whose code you can read.
+
+### What every public AI provider logs (with citations)
+
+This is the reference table the wrapper sites don't include. Sources are linked in each cell that needs one.
+
+<img
+  src="/images/blog/no-login-ai-privacy-reality-check/placeholder-03-what-providers-log-conceptual.png"
+  alt="A comparison illustration showing two flows: a 'standard no-login AI' lane where data flows past IP, prompt, and session logging stages into a tall stack of provider logs; and an 'anonymized AI' lane where a shield strips the IP before the data reaches a tiny single-sheet record."
+  width={1920}
+  height={1080}
+  style={{ width: '100%', height: 'auto', borderRadius: '12px' }}
+/>
+
+| Service | IP logged | Prompt content logged | Used for training | Retention |
+|---|---|---|---|---|
+| ChatGPT (guest, no login) | Yes | Yes | OpenAI may use guest prompts for service improvement | Per OpenAI privacy policy |
+| Claude (logged-in, normal) | Yes | Yes | No on Free/Pro/Max per [Anthropic privacy center](https://privacy.claude.com); stricter on Team/Enterprise | Per account settings |
+| Claude (incognito) | Yes | Yes (but not in your history) | No, per [Anthropic docs](https://support.claude.com/en/articles/12260368-using-incognito-chats) | 30 days default, longer for Enterprise |
+| Gemini (Google account) | Yes | Yes | Per Google account settings | Per Google account settings |
+| Bing Copilot (no account) | Yes | Yes | Per Microsoft privacy policy | Per Microsoft privacy policy |
+| Brave Leo | **No** ("no identifiers linkable to you" per [brave.com/leo](https://brave.com/leo/)) | Forwarded to model, not retained by Brave | No, per Brave docs | None per Brave docs |
+| Duck.ai | **No** (DuckDuckGo strips IP before forwarding) | Forwarded to model, not retained by DuckDuckGo | No, per [DuckDuckGo docs](https://duckduckgo.com/duckduckgo-help-pages/duckai/) | Local-only chat history |
+| Self-hosted Ollama | n/a (local only) | n/a (local only) | n/a | None unless you save it |
+| Wrapper sites (HIX, EaseMate, NoteGPT, etc.) | Yes (by wrapper) + downstream provider | Yes (by both wrapper and provider) | Depends on wrapper TOS | Depends on wrapper TOS |
+
+A few takeaways from the table that the existing search results never make explicit:
+
+- **Brave Leo and Duck.ai are the only mainstream products that publicly commit to not logging your IP.** They achieve this with a server-side proxy (DuckDuckGo) or a privacy-first browser pipeline (Brave). The model provider never sees your real IP; it sees the proxy's.
+- **Claude's incognito mode is honest about its limits.** Incognito chats are still retained for 30 days. That's a thoughtful safety design, not an anonymity promise.
+- **Wrapper sites add a layer of logging on top of the model provider's logging.** That's strictly worse for privacy than going to the model provider's first-party guest mode, even if it removes the account requirement.
+
+## Pick the right tool for what you're actually doing
+
+The right tool depends on what you're pasting into the chat box. Three rough buckets cover almost every reader.
+
+<img
+  src="/images/blog/no-login-ai-privacy-reality-check/placeholder-04-decision-tree-by-use-case.png"
+  alt="A decision tree showing which no-login AI tool to pick based on what you are pasting: casual content can use any tool, sensitive code or customer data needs anonymized tools or self-hosting, and regulated data needs enterprise contracts or self-hosting only."
+  width={1920}
+  height={1080}
+  style={{ width: '100%', height: 'auto', borderRadius: '12px' }}
+/>
+
+### If you're a casual user with non-sensitive prompts
+
+Homework explanations, recipe ideas, brainstorming, drafts of emails, quick summaries of public articles, code from public GitHub repos. Your prompt isn't interesting to anyone with subpoena power. Privacy isn't really the constraint; convenience and model quality are.
+
+**What to use:** anything in this guide. Pick by which model you want.
+
+- For ChatGPT-like answers, `chatgpt.com` guest mode or Bing Copilot.
+- For Claude, [Duck.ai](https://duck.ai) (Haiku, anonymised) or an aggregator like [SurfSense /free](/free) which lists Claude among its model options.
+- For multi-model comparison in one tab, [SurfSense /free](/free) or [Duck.ai](https://duck.ai).
+
+### If you work with code, internal docs, or customer data
+
+Code that includes API keys, internal class names, or business logic; documents your employer hasn't published; conversations that involve customer details. The IP and the prompt content matter here.
+
+**What to use:** the anonymised category only. Brave Leo, Duck.ai, or a self-hosted model. If you must use a first-party guest mode, redact ruthlessly before pasting. Avoid wrapper sites for this kind of prompt.
+
+### If you're handling regulated or contractually sensitive data
+
+PHI under HIPAA, attorney-client privileged matter, financial data under SOX or GLBA, EU personal data under GDPR, anything covered by an NDA. The legal exposure is severe and the answer is not a free chat product.
+
+**What to use:** self-host an open-weights model on hardware you control, or use an enterprise contract with a BAA / DPA in place (Anthropic Enterprise, OpenAI Enterprise, Google Cloud Vertex AI). Public free chat is not an acceptable channel here, regardless of whether it asks for a login.
+
+## For developers and the privacy-serious: self-host in 5 minutes
+
+This section is for technical readers. If you're a casual user, you can skip it; the answers above are enough.
+
+Self-hosting an open-weights model is the only path where "private" means private in the strict sense. Your prompt content never leaves your machine. There is no provider, no logging, no retention, no training-on-your-data risk. And it's much easier than it used to be.
+
+1. Install [Ollama](https://ollama.com) (one-click installer for macOS, Windows, Linux).
+2. Open a terminal and run:
+   ```bash
+   ollama run llama3.1:8b
+   ```
+3. The first run downloads the model (about 5 GB). Subsequent runs start instantly.
+4. Type a prompt at the `>>>` prompt. You're chatting with a local model.
+
+Quality is genuinely competitive for most casual use. Llama 3.1 8B handles writing, summarisation, and general Q&A well. For better quality, swap to `qwen2.5:14b` or `mistral-small:24b` if you have 16+ GB of RAM. For coding-specific work, `deepseek-coder-v2` is the current open-weights leader.
+
+If you want a graphical interface instead of a terminal, install [LM Studio](https://lmstudio.ai). For a hosted-but-self-controlled experience, the open-source SurfSense stack on the [GitHub repo](https://github.com/MODSetter/SurfSense) gives you the same chat UI with the same model options, running on your own servers.
+
+For a deeper look at the accuracy and cost trade-offs on real document Q&A, our [agentic RAG vs long-context LLMs benchmark](/blog/agentic-rag-vs-long-context-llms-benchmark) has the numbers.
+
+## FAQ
+
+### Can I use ChatGPT without an account?
+
+Yes. Open `chatgpt.com` in a private tab and you'll get guest mode automatically. You get around 10 messages on the headline GPT-5-class model per 5-hour rolling window, after which you're auto-switched to a lighter GPT-5 Mini variant with no hard limit (not blocked by a hard signup wall). No file uploads, no chat history, no Code Interpreter, but for short queries the model quality is the same as the paid first-tier experience.
+
+### Can I use Claude without an account?
+
+Not on `claude.ai` itself, which still requires signup. The closest no-account paths are [Duck.ai](https://duck.ai) (Claude Haiku 4.5, free, anonymised), [Brave Leo](https://brave.com/leo/) (Claude Haiku in the Brave browser sidebar), and aggregator pages like our open-source [SurfSense /free](/free), which lists Claude among the models you can pick with no Anthropic account and a 500K free token budget shared across the whole page. For more, see our [Claude-specific guide](/blog/use-claude-without-login-2026).
+
+### Can I use Gemini without a Google account?
+
+Not on Google's own product pages. Aggregator sites like our open-source [SurfSense /free](/free) include Gemini among the models you can pick, and they forward your requests to the Gemini API behind the scenes, so the user-facing chat works without a Google sign-in. If you specifically want what Gemini is best at (long-context, web research, Workspace integration), there isn't a perfect Google-free substitute, though Brave Leo with Claude Haiku and Perplexity cover most use cases.
+
+### What is Claude incognito mode?
+
+A feature [Anthropic launched on April 9, 2026](https://support.claude.com/en/articles/12260368-using-incognito-chats), available on every Claude plan from Free to Enterprise. Click the ghost icon when starting a new chat. The conversation isn't saved to your history, isn't pulled into Claude's memory, and isn't used for training. It still requires an existing Claude account, and the conversation is retained for 30 days for safety. Useful if you have a Claude account and want a temporary one-off chat.
+
+### Is using AI without an account actually private?
+
+Not by itself. "No login" removes one identifier (your account), but the model provider still sees your IP and the content of your prompt. For actual anonymity, use Brave Leo, Duck.ai, or a self-hosted open-weights model. The privacy section above explains the categories in detail.
+
+### Does ChatGPT guest mode keep my data private?
+
+Less than logged-in mode, but not by much. OpenAI still logs your IP and prompt content. Logged-out users have fewer opt-out controls than logged-in free-tier users. If your prompt is something you'd be uncomfortable seeing on someone else's screen, treat ChatGPT guest mode as recorded.
+
+### What's the most private AI chatbot in 2026?
+
+A self-hosted open-weights model running locally via Ollama or LM Studio. Among hosted options, Brave Leo and Duck.ai are the two that publicly commit to not logging your IP and not retaining your prompts.
+
+### Are wrapper sites that say "free Claude" or "free GPT" safe?
+
+They're convenient, not private. Most "free [Brand] without login" pages are servers that wrap the underlying API and serve responses for free. They don't ask you to sign up, but they're a third party in the middle that keeps its own logs, on top of the model provider's logs. Use them for casual prompts you'd be fine with showing up in someone else's database.
+
+### What's the difference between Duck.ai and a regular ChatGPT wrapper?
+
+Duck.ai is the only mainstream chat product that publicly documents an end-to-end anonymisation model: DuckDuckGo proxies your request, strips your IP, doesn't retain prompts, and doesn't train on your data. Standard wrapper sites do none of these things. They're just "no signup form".
+
+### Is Brave Leo really free with no login?
+
+Yes. Per [Brave's documentation](https://brave.com/leo/), no account or signup is required for the free tier, and Brave doesn't collect identifiers tied to you. The free tier includes Claude Haiku, Llama 3.1 8B, Mixtral, and Qwen 3 14B. Premium ($14.99/month) adds Claude Sonnet 4 and DeepSeek R1.
+
+### How can a developer avoid the browser login flow entirely?
+
+For Anthropic specifically: configure Claude Code to authenticate via Amazon Bedrock, Google Vertex AI, or Microsoft Foundry per the [Claude Code authentication docs](https://code.claude.com/docs/en/authentication). No browser login required, IAM-only auth. For OpenAI and Google, the standard answer is API keys with cloud-provider IAM and IP allow-listing. For full local control, the [self-hosting section](#for-developers-and-the-privacy-serious-self-host-in-5-minutes) covers Ollama and LM Studio.
+
+## The bottom line
+
+The question "can I use a top AI model without an account" has a much better answer in 2026 than it did a year ago. Anthropic added incognito mode, Duck.ai added free Claude Haiku with real anonymisation, Brave Leo grew into a credible browser-side option, and the multi-model aggregators got cheaper to run.
+
+If you just want to chat: pick the brand you want, use the path from the relevant section above, and be done. If you care about privacy: stick to Brave Leo, Duck.ai, or a self-hosted model, and remember that "no signup" alone doesn't make a tool anonymous. If you're handling sensitive or regulated data: don't use a free chat product at all; use an enterprise contract or run the model yourself.
+
+And if you want a single no-account chat hub that lets you pick from ChatGPT, Claude, Gemini, DeepSeek, Mistral, Llama, and a rotating list of others under one URL with the wrapper-layer code open on [GitHub](https://github.com/MODSetter/SurfSense), that's exactly what we built **[SurfSense /free](/free)** for. The pitch: 500,000 free tokens shared across any model on the page, no account, anonymous sessions not stored in any database, and the model lineup updates whenever new models ship. It's not the right answer for every reader (if you need IP anonymisation specifically, Brave Leo or Duck.ai is still the better fit), but it is a genuine, honest pick, and we'd rather list it confidently than pretend we don't make it. Whichever you choose, the goal of this guide was to give you the honest map first.
--- a/surfsense_web/components/homepage/navbar.tsx
+++ b/surfsense_web/components/homepage/navbar.tsx
@ -37,7 +37,7 @@ export const Navbar = ({ scrolledBgClassName }: NavbarProps = {}) => {
 	const navItems = [
 		{ name: "Free\u00A0AI", link: "/free" },
 		{ name: "Pricing", link: "/pricing" },
-		// { name: "Blog", link: "/blog" },
+		{ name: "Blog", link: "/blog" },
 		{ name: "Changelog", link: "/changelog" },
 		{ name: "Docs", link: "/docs" },
 		{ name: "Contact\u00A0Us", link: "/contact" },
--- a/surfsense_web/pnpm-workspace.yaml
+++ b/surfsense_web/pnpm-workspace.yaml
@ -1,3 +1,6 @@
+packages:
+  - "."
+
 allowBuilds:
  "@parcel/watcher": true
  "@rocicorp/zero-sqlite3": true
--- a/surfsense_web/public/images/blog/agentic-rag-vs-long-context-llms-benchmark/placeholder-01-hero-image.png
+++ b/surfsense_web/public/images/blog/agentic-rag-vs-long-context-llms-benchmark/placeholder-01-hero-image.png
--- a/surfsense_web/public/images/blog/agentic-rag-vs-long-context-llms-benchmark/placeholder-02-architecture-diagram.png
+++ b/surfsense_web/public/images/blog/agentic-rag-vs-long-context-llms-benchmark/placeholder-02-architecture-diagram.png
--- a/surfsense_web/public/images/blog/agentic-rag-vs-long-context-llms-benchmark/placeholder-03-accuracy-bar-chart-dark.png
+++ b/surfsense_web/public/images/blog/agentic-rag-vs-long-context-llms-benchmark/placeholder-03-accuracy-bar-chart-dark.png
--- a/surfsense_web/public/images/blog/agentic-rag-vs-long-context-llms-benchmark/placeholder-03-accuracy-bar-chart-light.png
+++ b/surfsense_web/public/images/blog/agentic-rag-vs-long-context-llms-benchmark/placeholder-03-accuracy-bar-chart-light.png
--- a/surfsense_web/public/images/blog/agentic-rag-vs-long-context-llms-benchmark/placeholder-04-cost-vs-accuracy-dark.png
+++ b/surfsense_web/public/images/blog/agentic-rag-vs-long-context-llms-benchmark/placeholder-04-cost-vs-accuracy-dark.png
--- a/surfsense_web/public/images/blog/agentic-rag-vs-long-context-llms-benchmark/placeholder-04-cost-vs-accuracy-light.png
+++ b/surfsense_web/public/images/blog/agentic-rag-vs-long-context-llms-benchmark/placeholder-04-cost-vs-accuracy-light.png
--- a/surfsense_web/public/images/blog/agentic-rag-vs-long-context-llms-benchmark/placeholder-05-decision-tree.png
+++ b/surfsense_web/public/images/blog/agentic-rag-vs-long-context-llms-benchmark/placeholder-05-decision-tree.png
--- a/surfsense_web/public/images/blog/no-login-ai-privacy-reality-check/placeholder-01-no-login-vs-no-tracking-hero.png
+++ b/surfsense_web/public/images/blog/no-login-ai-privacy-reality-check/placeholder-01-no-login-vs-no-tracking-hero.png
--- a/surfsense_web/public/images/blog/no-login-ai-privacy-reality-check/placeholder-02-three-tier-pyramid.png
+++ b/surfsense_web/public/images/blog/no-login-ai-privacy-reality-check/placeholder-02-three-tier-pyramid.png
--- a/surfsense_web/public/images/blog/no-login-ai-privacy-reality-check/placeholder-03-what-providers-log-conceptual.png
+++ b/surfsense_web/public/images/blog/no-login-ai-privacy-reality-check/placeholder-03-what-providers-log-conceptual.png
--- a/surfsense_web/public/images/blog/no-login-ai-privacy-reality-check/placeholder-04-decision-tree-by-use-case.png
+++ b/surfsense_web/public/images/blog/no-login-ai-privacy-reality-check/placeholder-04-decision-tree-by-use-case.png
--- a/surfsense_web/source.config.ts
+++ b/surfsense_web/source.config.ts
@ -25,6 +25,12 @@ export const blog = defineDocs({
 			author: z.string().default("SurfSense Team"),
 			authorAvatar: z.string().optional(),
 			tags: z.array(z.string()).optional(),
+			// Pin this post into the featured section above the archive grid.
+			// Multiple posts can be featured at once; ordering within the
+			// featured section follows `featured_order` ascending and falls
+			// back to `date` descending.
+			featured: z.boolean().optional().default(false),
+			featured_order: z.number().optional(),
 		}),
 	},
 });