docs: improve setup guidance and env examples

This commit is contained in:
willchen96 2026-05-10 22:36:29 +08:00
parent dbbf19697e
commit a84c1cc113
5 changed files with 100 additions and 45 deletions

View file

@ -120,14 +120,11 @@ export async function ensureReviewAccess(
}
/**
* Filter a list of document IDs down to those the caller is actually
* authorised to read: owners pass, plus any document whose `project_id`
* the caller has access to (own project or `shared_with` member).
* Filter user-supplied document IDs down to documents the caller can read.
*
* The tabular-review routes accept user-supplied `document_ids` from
* request bodies; without this filter an attacker who has any review of
* their own can plant arbitrary doc UUIDs and have the server fetch + run
* an LLM extraction over their bytes (CWE-639).
* Tabular review routes accept document IDs from request bodies. Without this
* check, a caller with access to any review could attach arbitrary document
* UUIDs and later cause /generate or /regenerate-cell to extract the bytes of
* documents they are not authorised to read.
*/
export async function filterAccessibleDocumentIds(
documentIds: string[],
@ -146,18 +143,22 @@ export async function filterAccessibleDocumentIds(
project_id: string | null;
}[];
if (rows.length === 0) return [];
const accessibleProjectIds = new Set(
await listAccessibleProjectIds(userId, userEmail, db),
);
const out: string[] = [];
for (const d of rows) {
if (d.user_id === userId) {
out.push(d.id);
} else if (d.project_id && accessibleProjectIds.has(d.project_id)) {
out.push(d.id);
const allowed: string[] = [];
for (const doc of rows) {
if (doc.user_id === userId) {
allowed.push(doc.id);
} else if (
doc.project_id &&
accessibleProjectIds.has(doc.project_id)
) {
allowed.push(doc.id);
}
}
return out;
return allowed;
}
/**

View file

@ -193,10 +193,6 @@ tabularRouter.post("/", requireAuth, async (req, res) => {
if (!access.ok)
return void res.status(404).json({ detail: "Project not found" });
}
// Drop any document_ids the caller can't access. Without this filter a
// user can stuff foreign UUIDs into document_ids, then call /generate
// or /regenerate-cell to read those documents' bytes back through the
// LLM (CWE-639).
const allowedDocumentIds = Array.isArray(document_ids)
? await filterAccessibleDocumentIds(
document_ids,
@ -515,9 +511,6 @@ tabularRouter.patch("/:reviewId", requireAuth, async (req, res) => {
const existingDocIds = (existingCells ?? []).map(
(cell) => cell.document_id,
);
// Drop any newly-added doc_ids the caller can't read; preserve
// already-attached docs so a non-owner collaborator's PATCH
// doesn't accidentally orphan cells they can't directly access.
const existingDocIdSet = new Set(existingDocIds);
const newDocCandidates = requestedDocIds.filter(
(id) => !existingDocIdSet.has(id),
@ -687,9 +680,6 @@ tabularRouter.post(
if (!column)
return void res.status(400).json({ detail: "Column not found" });
// Defense-in-depth — refuse to extract bytes for a document the
// caller can't read, even if a stale tabular_cells row points at it
// from before the access filter was added (CWE-639).
const docAllowed = await filterAccessibleDocumentIds(
[document_id],
userId,
@ -804,21 +794,19 @@ tabularRouter.post("/:reviewId/generate", requireAuth, async (req, res) => {
cellMap.set(`${cell.document_id}:${cell.column_index}`, cell);
const docIds = [...new Set((cells ?? []).map((c) => c.document_id))];
// Same defense-in-depth as /regenerate-cell — filter to docs the caller
// can actually read, so legacy cells planted before the access check
// can't be coerced into running an LLM extraction (CWE-639).
const allowedDocIds = new Set(
await filterAccessibleDocumentIds(docIds, userId, userEmail, db),
);
let docs: Record<string, unknown>[] = [];
if (docIds.length > 0) {
const filteredIds = docIds.filter((id) => allowedDocIds.has(id));
const { data } = filteredIds.length > 0
? await db
.from("documents")
.select("id, filename, file_type, page_count")
.in("id", filteredIds)
: { data: [] as Record<string, unknown>[] };
const { data } =
filteredIds.length > 0
? await db
.from("documents")
.select("id, filename, file_type, page_count")
.in("id", filteredIds)
: { data: [] as Record<string, unknown>[] };
docs = data ?? [];
} else if (review.project_id) {
const { data } = await db