From fe567a6af1de29eaa07d4197dfceb44c10314521 Mon Sep 17 00:00:00 2001
From: Valerio <88933932+0xMassi@users.noreply.github.com>
Date: Tue, 19 May 2026 19:05:16 +0200
Subject: [PATCH] feat(core): endpoints module for API surface extraction from
HTML and JS (#47)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
* feat(core): endpoints module — extract API surface from HTML + JS bundles
* fix(docker): source CA bundle from distroless instead of apt (fixes arm64 release build)
* fix(test): serialize env-mutating CloudClient tests to stop flaky CI
* feat(core): filter endpoint-extractor noise (invalid hosts, schema domains, bare paths)
---
Dockerfile | 6 +-
Dockerfile.ci | 7 +-
crates/webclaw-core/src/endpoints.rs | 515 +++++++++++++++++++++++++++
crates/webclaw-core/src/lib.rs | 1 +
crates/webclaw-fetch/src/cloud.rs | 18 +-
5 files changed, 536 insertions(+), 11 deletions(-)
create mode 100644 crates/webclaw-core/src/endpoints.rs
diff --git a/Dockerfile b/Dockerfile
index 552aea7..fefb39b 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -59,9 +59,9 @@ RUN touch crates/*/src/*.rs \
# ---------------------------------------------------------------------------
FROM ubuntu:24.04
-RUN apt-get update && apt-get install -y --no-install-recommends \
- ca-certificates \
- && rm -rf /var/lib/apt/lists/*
+# Copy the CA bundle from the distroless image (multi-arch, hosted on gcr.io)
+# instead of apt-installing it from ports.ubuntu.com (unreachable for arm64 on CI runners).
+COPY --from=gcr.io/distroless/static-debian12 /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ca-certificates.crt
# Copy all three binaries
COPY --from=builder /build/target/release/webclaw /usr/local/bin/webclaw
diff --git a/Dockerfile.ci b/Dockerfile.ci
index ccd8a33..7b62718 100644
--- a/Dockerfile.ci
+++ b/Dockerfile.ci
@@ -5,9 +5,10 @@ ARG BINARY_DIR=binaries
FROM ubuntu:24.04
-RUN apt-get update && apt-get install -y --no-install-recommends \
- ca-certificates \
- && rm -rf /var/lib/apt/lists/*
+# CA bundle copied from a reliable multi-arch image instead of apt-installing
+# from ports.ubuntu.com — Canonical's arm64 ports mirror is unreachable from
+# CI runners and breaks the multi-arch release build. Avoids apt network access at build time.
+COPY --from=gcr.io/distroless/static-debian12 /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ca-certificates.crt
ARG BINARY_DIR
COPY ${BINARY_DIR}/webclaw /usr/local/bin/webclaw
diff --git a/crates/webclaw-core/src/endpoints.rs b/crates/webclaw-core/src/endpoints.rs
new file mode 100644
index 0000000..21c5280
--- /dev/null
+++ b/crates/webclaw-core/src/endpoints.rs
@@ -0,0 +1,515 @@
+//! API/endpoint surface discovery from HTML + JS bundle text.
+//!
+//! Pure and zero-network: callers fetch the page and its `
+
+
+
+