mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-05-01 11:06:24 +02:00
233 lines
7.8 KiB
Python
233 lines
7.8 KiB
Python
|
|
#!/usr/bin/env python3
"""
webclaw benchmark — webclaw vs trafilatura vs firecrawl.

Produces results/YYYY-MM-DD.json matching the schema in methodology.md.
Sites and facts come from ../sites.txt and ../facts.json.
Tokenizer: cl100k_base (GPT-4 / GPT-3.5 / text-embedding-3-*).

Usage:
    FIRECRAWL_API_KEY=fc-... python3 bench.py
    python3 bench.py                     # runs webclaw + trafilatura only

Optional env:
    WEBCLAW           path to webclaw release binary (default: ../../target/release/webclaw)
    RUNS              runs per site (default: 3)
    WEBCLAW_TIMEOUT   seconds (default: 30)
"""

from __future__ import annotations

import json
import os
import re
import statistics
import subprocess
import sys
import time
from pathlib import Path

# Directory layout: this file lives in benchmarks/<something>/; the repo
# root (containing target/release/) is two levels up.
HERE = Path(__file__).resolve().parent
ROOT = HERE.parent         # benchmarks/
REPO_ROOT = ROOT.parent    # core/

# Runtime knobs, all overridable from the environment.
WEBCLAW = os.environ.get("WEBCLAW", str(REPO_ROOT / "target" / "release" / "webclaw"))
RUNS = int(os.environ.get("RUNS", "3"))
WC_TIMEOUT = int(os.environ.get("WEBCLAW_TIMEOUT", "30"))

# Hard requirements — bail out early with an actionable message.
try:
    import tiktoken
    import trafilatura
except ImportError as e:
    sys.exit(f"missing dep: {e}. run: pip install tiktoken trafilatura firecrawl-py")

# Shared tokenizer used for every token count in this benchmark.
ENC = tiktoken.get_encoding("cl100k_base")

# Firecrawl is optional: enabled only when an API key is present AND the
# client library is installed; otherwise its column is zero-filled.
FC_KEY = os.environ.get("FIRECRAWL_API_KEY")
FC = None
if FC_KEY:
    try:
        from firecrawl import Firecrawl

        FC = Firecrawl(api_key=FC_KEY)
    except ImportError:
        print("firecrawl-py not installed; skipping firecrawl column", file=sys.stderr)
|
|
def load_sites() -> list[str]:
    """Read benchmark URLs from ../sites.txt.

    Strips `#` comments and surrounding whitespace; blank lines (and lines
    that are comment-only) are dropped.
    """
    lines = (ROOT / "sites.txt").read_text().splitlines()
    stripped = (ln.split("#", 1)[0].strip() for ln in lines)
    return [entry for entry in stripped if entry]
|
|||
|
|
|
|||
|
|
|
|||
|
|
def load_facts() -> dict[str, list[str]]:
    """Load the url -> list-of-facts mapping from ../facts.json."""
    data = json.loads((ROOT / "facts.json").read_text())
    return data["facts"]
|
|||
|
|
|
|||
|
|
|
|||
|
|
def run_webclaw_llm(url: str) -> tuple[str, float]:
    """Run webclaw in `llm` output format against *url*.

    Returns (stdout markdown, wall-clock seconds). The subprocess timeout
    is padded past webclaw's own `-t` so the tool times out first.
    """
    started = time.time()
    cmd = [WEBCLAW, url, "-f", "llm", "-t", str(WC_TIMEOUT)]
    proc = subprocess.run(cmd, capture_output=True, text=True, timeout=WC_TIMEOUT + 15)
    return proc.stdout or "", time.time() - started
|
|||
|
|
|
|||
|
|
|
|||
|
|
def run_webclaw_raw(url: str) -> str:
    """Fetch the raw HTML for *url* via webclaw's --raw-html mode.

    Used only to establish the baseline token count; timing is not needed.
    """
    cmd = [WEBCLAW, url, "--raw-html", "-t", str(WC_TIMEOUT)]
    proc = subprocess.run(cmd, capture_output=True, text=True, timeout=WC_TIMEOUT + 15)
    return proc.stdout or ""
|
|||
|
|
|
|||
|
|
|
|||
|
|
def run_trafilatura(url: str) -> tuple[str, float]:
    """Fetch and extract *url* with trafilatura.

    Returns (markdown, wall-clock seconds). Best-effort by design: any
    fetch/extract failure yields an empty string rather than aborting
    the benchmark run.
    """
    started = time.time()
    extracted = ""
    try:
        html = trafilatura.fetch_url(url)
        if html:
            extracted = trafilatura.extract(
                html,
                output_format="markdown",
                include_links=True,
                include_tables=True,
                favor_recall=True,
            ) or ""  # extract() may return None
    except Exception:
        extracted = ""
    return extracted, time.time() - started
|
|||
|
|
|
|||
|
|
|
|||
|
|
def run_firecrawl(url: str) -> tuple[str, float]:
    """Scrape *url* through the Firecrawl API.

    Returns ("", 0.0) when firecrawl is disabled (no key / no client).
    Best-effort otherwise: any API failure yields an empty string with
    the elapsed time still recorded.
    """
    if not FC:
        return "", 0.0
    started = time.time()
    try:
        # The success return stays inside the try so that a malformed
        # response object is also treated as a best-effort miss.
        resp = FC.scrape(url, formats=["markdown"])
        return (resp.markdown or ""), time.time() - started
    except Exception:
        return "", time.time() - started
|
|||
|
|
|
|||
|
|
|
|||
|
|
def tok(s: str) -> int:
    """Return the cl100k_base token count of *s* (0 for empty/None-ish)."""
    if not s:
        return 0
    # disallowed_special=() lets literal special-token text pass through
    # instead of raising.
    return len(ENC.encode(s, disallowed_special=()))
|
|||
|
|
|
|||
|
|
|
|||
|
|
# A single-token alphanumeric "word": a letter followed by letters/digits.
_WORD = re.compile(r"[A-Za-z][A-Za-z0-9]*")


def hit_count(text: str, facts: list[str]) -> int:
    """Count how many of *facts* appear in *text*.

    Case-insensitive. Word-boundary matching for single-token alphanumeric
    facts; plain substring matching for multi-word or non-alpha facts
    (like '99.999').

    BUG FIX: the original condition `" " in f or not f.isalpha()` sent
    single-token alphanumeric facts (e.g. "gpt4") down the substring path,
    contradicting the documented contract, and left the _WORD pattern
    unused. Selecting the branch with `_WORD.fullmatch(f)` restores the
    intended behavior.
    """
    if not text:
        return 0
    low = text.lower()
    count = 0
    for f in facts:
        f_low = f.lower()
        if _WORD.fullmatch(f):
            # Single alphanumeric token: require word boundaries so "the"
            # does not match inside "weather".
            if re.search(r"\b" + re.escape(f_low) + r"\b", low):
                count += 1
        else:
            # Multi-word or non-alpha fact: substring match.
            if f_low in low:
                count += 1
    return count
|
|||
|
|
|
|||
|
|
|
|||
|
|
def main() -> int:
    """Run the full benchmark and write results/YYYY-MM-DD.json.

    For each site with known facts: measure raw-HTML token count once,
    then run each tool RUNS times, take per-tool medians, and aggregate
    token reduction / fact fidelity / latency across sites.
    Returns 0 (process exit code).
    """
    sites = load_sites()
    facts_by_url = load_facts()
    print(f"running {len(sites)} sites × {3 if FC else 2} tools × {RUNS} runs")
    if not FC:
        print(" (no FIRECRAWL_API_KEY — skipping firecrawl column)")
    print()

    per_site = []
    for i, url in enumerate(sites, 1):
        facts = facts_by_url.get(url, [])
        # A site without ground-truth facts can't be scored — skip it.
        if not facts:
            print(f"[{i}/{len(sites)}] {url} SKIPPED — no facts in facts.json")
            continue
        print(f"[{i}/{len(sites)}] {url}")
        # Baseline: token count of the raw HTML (measured once per site).
        raw_t = tok(run_webclaw_raw(url))

        def run_one(fn):
            # One timed invocation of a tool; score its output immediately.
            out, seconds = fn(url)
            return {"tokens": tok(out), "facts": hit_count(out, facts), "seconds": seconds}

        runs = {"webclaw": [], "trafilatura": [], "firecrawl": []}
        for _ in range(RUNS):
            runs["webclaw"].append(run_one(run_webclaw_llm))
            runs["trafilatura"].append(run_one(run_trafilatura))
            if FC:
                runs["firecrawl"].append(run_one(run_firecrawl))
            else:
                # Keep the column shape stable even when firecrawl is off.
                runs["firecrawl"].append({"tokens": 0, "facts": 0, "seconds": 0.0})

        def med(tool, key):
            # Median across the RUNS repetitions for one tool/metric.
            return statistics.median(r[key] for r in runs[tool])

        def med_ints(tool):
            # Per-tool summary row with medians coerced to schema types.
            return {
                "tokens_med": int(med(tool, "tokens")),
                "facts_med": int(med(tool, "facts")),
                "seconds_med": round(med(tool, "seconds"), 2),
            }

        per_site.append({
            "url": url,
            "facts_count": len(facts),
            "raw_tokens": raw_t,
            "webclaw": med_ints("webclaw"),
            "trafilatura": med_ints("trafilatura"),
            "firecrawl": med_ints("firecrawl"),
        })
        last = per_site[-1]
        # Compact per-site progress line: tokens/facts per tool.
        print(f" raw={raw_t} wc={last['webclaw']['tokens_med']}/{last['webclaw']['facts_med']}"
              f" tr={last['trafilatura']['tokens_med']}/{last['trafilatura']['facts_med']}"
              f" fc={last['firecrawl']['tokens_med']}/{last['firecrawl']['facts_med']}")

    # aggregates
    total_facts = sum(r["facts_count"] for r in per_site)

    def agg(tool):
        # Token reduction vs raw HTML, in percent; sites where either the
        # baseline or the tool produced nothing are excluded from the mean.
        red_vals = [
            (r["raw_tokens"] - r[tool]["tokens_med"]) / r["raw_tokens"] * 100
            for r in per_site
            if r["raw_tokens"] > 0 and r[tool]["tokens_med"] > 0
        ]
        # NOTE(review): latency_mean raises StatisticsError when per_site
        # is empty (every site skipped) — assumes at least one scored site.
        return {
            "reduction_mean": round(statistics.mean(red_vals), 1) if red_vals else 0.0,
            "reduction_median": round(statistics.median(red_vals), 1) if red_vals else 0.0,
            "facts_preserved": sum(r[tool]["facts_med"] for r in per_site),
            "total_facts": total_facts,
            "fidelity_pct": round(sum(r[tool]["facts_med"] for r in per_site) / total_facts * 100, 1) if total_facts else 0,
            "latency_mean": round(statistics.mean(r[tool]["seconds_med"] for r in per_site), 2),
        }

    # Result document matching the schema in methodology.md.
    result = {
        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
        "webclaw_version": subprocess.check_output([WEBCLAW, "--version"], text=True).strip().split()[-1],
        "trafilatura_version": trafilatura.__version__,
        "firecrawl_enabled": FC is not None,
        "tokenizer": "cl100k_base",
        "runs_per_site": RUNS,
        "site_count": len(per_site),
        "total_facts": total_facts,
        "aggregates": {t: agg(t) for t in ["webclaw", "trafilatura", "firecrawl"]},
        "per_site": per_site,
    }

    # One results file per calendar day.
    out_path = ROOT / "results" / f"{time.strftime('%Y-%m-%d')}.json"
    out_path.parent.mkdir(exist_ok=True)
    out_path.write_text(json.dumps(result, indent=2))

    # Human-readable summary table on stdout.
    print()
    print("=" * 70)
    print(f"{len(per_site)} sites, {total_facts} facts, median of {RUNS} runs")
    print("=" * 70)
    for t in ["webclaw", "trafilatura", "firecrawl"]:
        a = result["aggregates"][t]
        print(f" {t:14s} reduction_mean={a['reduction_mean']:5.1f}%"
              f" fidelity={a['facts_preserved']}/{a['total_facts']} ({a['fidelity_pct']}%)"
              f" latency={a['latency_mean']}s")
    print()
    print(f" results → {out_path.relative_to(REPO_ROOT)}")
    return 0
|
|||
|
|
|
|||
|
|
|
|||
|
|
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit code.
    raise SystemExit(main())
|