webclaw/benchmarks/scripts/bench.py

233 lines
7.8 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
"""
webclaw benchmark: webclaw vs trafilatura vs firecrawl.
Produces results/YYYY-MM-DD.json matching the schema in methodology.md.
Sites and facts come from ../sites.txt and ../facts.json.
Tokenizer: cl100k_base (GPT-4 / GPT-3.5 / text-embedding-3-*).
Usage:
    FIRECRAWL_API_KEY=fc-... python3 bench.py
    python3 bench.py    # runs webclaw + trafilatura only

Optional env:
    WEBCLAW          path to webclaw release binary (default: ../../target/release/webclaw)
    RUNS             runs per site (default: 3)
    WEBCLAW_TIMEOUT  timeout in seconds passed to webclaw via -t (default: 30)
"""
from __future__ import annotations
import json, os, re, statistics, subprocess, sys, time
from pathlib import Path
# Directory layout, resolved from this script's own location.
HERE = Path(__file__).resolve().parent  # directory containing this script
ROOT = HERE.parent  # benchmarks/
REPO_ROOT = ROOT.parent  # parent of benchmarks/

# Runtime knobs; each one can be overridden through the environment.
WEBCLAW = os.getenv(
    "WEBCLAW",
    str(REPO_ROOT.joinpath("target", "release", "webclaw")),
)
RUNS = int(os.getenv("RUNS", "3"))
WC_TIMEOUT = int(os.getenv("WEBCLAW_TIMEOUT", "30"))
# Required third-party packages: exit with an install hint if either import fails.
try:
    import tiktoken
    import trafilatura
except ImportError as e:
    sys.exit(f"missing dep: {e}. run: pip install tiktoken trafilatura firecrawl-py")
# Tokenizer used by tok() for every token count in this script.
ENC = tiktoken.get_encoding("cl100k_base")
# Firecrawl is optional: FC stays None unless FIRECRAWL_API_KEY is set AND the
# firecrawl package can be imported. Other code tests `FC` for truthiness to
# decide whether the firecrawl column is measured.
FC_KEY = os.environ.get("FIRECRAWL_API_KEY")
FC = None
if FC_KEY:
    try:
        from firecrawl import Firecrawl
        FC = Firecrawl(api_key=FC_KEY)
    except ImportError:
        # Key present but client library missing: warn and continue without it.
        print("firecrawl-py not installed; skipping firecrawl column", file=sys.stderr)
def load_sites() -> list[str]:
    """Return the entries listed in ``ROOT / "sites.txt"``, one per line.

    Everything from the first ``#`` on a line is discarded, surrounding
    whitespace is trimmed, and lines that end up empty are dropped.
    """
    raw_lines = (ROOT / "sites.txt").read_text().splitlines()
    cleaned = (raw.partition("#")[0].strip() for raw in raw_lines)
    return [entry for entry in cleaned if entry]
def load_facts() -> dict[str, list[str]]:
    """Return the value stored under the top-level ``"facts"`` key of facts.json.

    The file is read from ``ROOT / "facts.json"``. It is decoded explicitly as
    UTF-8 (the encoding JSON interchange uses) rather than with the platform's
    locale encoding, so non-ASCII facts load identically on every machine.

    Raises:
        FileNotFoundError: if facts.json does not exist.
        json.JSONDecodeError: if the file is not valid JSON.
        KeyError: if the document has no ``"facts"`` key.
    """
    raw = (ROOT / "facts.json").read_text(encoding="utf-8")
    return json.loads(raw)["facts"]
def run_webclaw_llm(url: str) -> tuple[str, float]:
    """Run ``webclaw <url> -f llm`` once and return ``(stdout, elapsed_seconds)``.

    webclaw is given ``WC_TIMEOUT`` seconds via ``-t``; the subprocess itself
    is allowed 15 seconds more before Python kills it. If that outer limit
    is hit, the run is reported as empty output (with the time actually
    spent) instead of raising, so one hung site cannot abort the whole
    benchmark -- the same best-effort contract the other extractor runners
    follow. A missing or non-executable binary still raises ``OSError``.
    """
    t0 = time.time()
    try:
        r = subprocess.run(
            [WEBCLAW, url, "-f", "llm", "-t", str(WC_TIMEOUT)],
            capture_output=True, text=True, timeout=WC_TIMEOUT + 15,
        )
        out = r.stdout or ""
    except subprocess.TimeoutExpired:
        print(f"webclaw timed out after {WC_TIMEOUT + 15}s: {url}", file=sys.stderr)
        out = ""
    return out, time.time() - t0
def run_webclaw_raw(url: str) -> str:
    """Run ``webclaw <url> --raw-html`` and return its stdout.

    webclaw is given ``WC_TIMEOUT`` seconds via ``-t``; the subprocess itself
    is allowed 15 seconds more. If that outer limit is hit, an empty string
    is returned instead of raising, so a single hung site does not abort the
    whole benchmark. A missing or non-executable binary still raises
    ``OSError``.
    """
    try:
        r = subprocess.run(
            [WEBCLAW, url, "--raw-html", "-t", str(WC_TIMEOUT)],
            capture_output=True, text=True, timeout=WC_TIMEOUT + 15,
        )
    except subprocess.TimeoutExpired:
        print(f"webclaw --raw-html timed out after {WC_TIMEOUT + 15}s: {url}", file=sys.stderr)
        return ""
    return r.stdout or ""
def run_trafilatura(url: str) -> tuple[str, float]:
    """Fetch *url* with trafilatura and extract it as markdown.

    Returns ``(markdown, elapsed_seconds)``. The markdown is an empty string
    when the fetch yields nothing, when extraction yields nothing, or when
    either step raises.
    """
    started = time.time()
    markdown = ""
    try:
        page = trafilatura.fetch_url(url)
        if page:
            extracted = trafilatura.extract(
                page,
                output_format="markdown",
                include_links=True,
                include_tables=True,
                favor_recall=True,
            )
            markdown = extracted or ""
    except Exception:
        # Best effort: any failure counts as "no output" for this run.
        markdown = ""
    elapsed = time.time() - started
    return markdown, elapsed
def run_firecrawl(url: str) -> tuple[str, float]:
    """Scrape *url* through the Firecrawl client and return its markdown.

    Returns ``(markdown, elapsed_seconds)``. When no client is configured
    (``FC`` is falsy) the result is ``("", 0.0)`` without any request being
    made. If the scrape raises, the markdown is empty and the elapsed time
    up to the failure is still reported.
    """
    if not FC:
        return "", 0.0
    started = time.time()
    try:
        response = FC.scrape(url, formats=["markdown"])
        markdown = response.markdown or ""
    except Exception:
        # Best effort: any failure counts as "no output" for this run.
        markdown = ""
    return markdown, time.time() - started
def tok(s: str) -> int:
    """Return the number of tokens ``ENC`` produces for *s* (0 for empty input).

    ``disallowed_special=()`` is passed so that special-token text appearing
    in scraped content is encoded as ordinary text rather than rejected.
    """
    if not s:
        return 0
    encoded = ENC.encode(s, disallowed_special=())
    return len(encoded)
# Matches an ASCII letter followed by any number of ASCII letters or digits.
# NOTE(review): nothing in the visible part of this file references _WORD --
# confirm whether it is used elsewhere before removing it.
_WORD = re.compile(r"[A-Za-z][A-Za-z0-9]*")
def hit_count(text: str, facts: list[str]) -> int:
    """Count how many entries of *facts* occur in *text*, ignoring case.

    A fact consisting solely of alphabetic characters (``str.isalpha``) must
    appear as a whole word. Any other fact -- one containing a space, a
    digit or punctuation, such as '99.999' -- is matched as a plain
    substring. Each entry of *facts* contributes at most 1; empty *text*
    yields 0.
    """
    if not text:
        return 0
    haystack = text.lower()

    def found(fact: str) -> bool:
        needle = fact.lower()
        # isalpha() is False for anything containing a space, so this single
        # check separates "one alphabetic word" from every other kind of fact.
        if fact.isalpha():
            pattern = r"\b" + re.escape(needle) + r"\b"
            return re.search(pattern, haystack) is not None
        return needle in haystack

    return sum(1 for fact in facts if found(fact))
def _measure(fn, url: str, facts: list[str]) -> dict:
    """Run extractor *fn* once on *url* and score its output against *facts*."""
    out, seconds = fn(url)
    return {"tokens": tok(out), "facts": hit_count(out, facts), "seconds": seconds}


def _median_summary(samples: list[dict]) -> dict:
    """Collapse one tool's per-run samples for a site into median values."""
    def med(key):
        return statistics.median(s[key] for s in samples)

    return {
        "tokens_med": int(med("tokens")),
        "facts_med": int(med("facts")),
        "seconds_med": round(med("seconds"), 2),
    }


def _aggregate(per_site: list[dict], tool: str, total_facts: int) -> dict:
    """Summarise one tool across all measured sites.

    Token reduction only considers sites where both the raw page and the
    tool's output produced at least one token. Every statistic falls back to
    zero when there is nothing to average, so an empty *per_site* is safe.
    """
    red_vals = [
        (r["raw_tokens"] - r[tool]["tokens_med"]) / r["raw_tokens"] * 100
        for r in per_site
        if r["raw_tokens"] > 0 and r[tool]["tokens_med"] > 0
    ]
    preserved = sum(r[tool]["facts_med"] for r in per_site)
    latencies = [r[tool]["seconds_med"] for r in per_site]
    return {
        "reduction_mean": round(statistics.mean(red_vals), 1) if red_vals else 0.0,
        "reduction_median": round(statistics.median(red_vals), 1) if red_vals else 0.0,
        "facts_preserved": preserved,
        "total_facts": total_facts,
        "fidelity_pct": round(preserved / total_facts * 100, 1) if total_facts else 0,
        # statistics.mean raises on empty data (e.g. every site was skipped).
        "latency_mean": round(statistics.mean(latencies), 2) if latencies else 0.0,
    }


def main() -> int:
    """Benchmark every site, write results/YYYY-MM-DD.json, print a summary.

    For each site that has facts, the raw page is tokenised once and each
    tool is run ``RUNS`` times; per-site figures are the medians of those
    runs. Sites without facts are skipped. When Firecrawl is not enabled its
    column is filled with zeros.

    Returns:
        0 on success, 2 if ``RUNS`` is smaller than 1.
    """
    if RUNS < 1:
        # A median over zero runs is undefined; fail before doing any work.
        print(f"RUNS must be >= 1 (got {RUNS})", file=sys.stderr)
        return 2

    tools = ["webclaw", "trafilatura", "firecrawl"]
    sites = load_sites()
    facts_by_url = load_facts()
    print(f"running {len(sites)} sites × {3 if FC else 2} tools × {RUNS} runs")
    if not FC:
        if FC_KEY:
            # The key is set, so the client library is what is missing.
            print(" (firecrawl-py not installed — skipping firecrawl column)")
        else:
            print(" (no FIRECRAWL_API_KEY — skipping firecrawl column)")
    print()

    per_site = []
    for i, url in enumerate(sites, 1):
        facts = facts_by_url.get(url, [])
        if not facts:
            print(f"[{i}/{len(sites)}] {url} SKIPPED — no facts in facts.json")
            continue
        print(f"[{i}/{len(sites)}] {url}")
        raw_t = tok(run_webclaw_raw(url))

        runs = {"webclaw": [], "trafilatura": [], "firecrawl": []}
        for _ in range(RUNS):
            runs["webclaw"].append(_measure(run_webclaw_llm, url, facts))
            runs["trafilatura"].append(_measure(run_trafilatura, url, facts))
            if FC:
                runs["firecrawl"].append(_measure(run_firecrawl, url, facts))
            else:
                runs["firecrawl"].append({"tokens": 0, "facts": 0, "seconds": 0.0})

        per_site.append({
            "url": url,
            "facts_count": len(facts),
            "raw_tokens": raw_t,
            "webclaw": _median_summary(runs["webclaw"]),
            "trafilatura": _median_summary(runs["trafilatura"]),
            "firecrawl": _median_summary(runs["firecrawl"]),
        })
        last = per_site[-1]
        print(f" raw={raw_t} wc={last['webclaw']['tokens_med']}/{last['webclaw']['facts_med']}"
              f" tr={last['trafilatura']['tokens_med']}/{last['trafilatura']['facts_med']}"
              f" fc={last['firecrawl']['tokens_med']}/{last['firecrawl']['facts_med']}")

    # aggregates
    total_facts = sum(r["facts_count"] for r in per_site)
    # One clock reading for both the timestamp and the file name, so they
    # cannot disagree when a run finishes right at midnight.
    now = time.localtime()
    result = {
        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S", now),
        "webclaw_version": subprocess.check_output([WEBCLAW, "--version"], text=True).strip().split()[-1],
        "trafilatura_version": trafilatura.__version__,
        "firecrawl_enabled": FC is not None,
        "tokenizer": "cl100k_base",
        "runs_per_site": RUNS,
        "site_count": len(per_site),
        "total_facts": total_facts,
        "aggregates": {t: _aggregate(per_site, t, total_facts) for t in tools},
        "per_site": per_site,
    }
    out_path = ROOT / "results" / f"{time.strftime('%Y-%m-%d', now)}.json"
    out_path.parent.mkdir(exist_ok=True)
    out_path.write_text(json.dumps(result, indent=2), encoding="utf-8")

    print()
    print("=" * 70)
    print(f"{len(per_site)} sites, {total_facts} facts, median of {RUNS} runs")
    print("=" * 70)
    for t in tools:
        a = result["aggregates"][t]
        print(f" {t:14s} reduction_mean={a['reduction_mean']:5.1f}%"
              f" fidelity={a['facts_preserved']}/{a['total_facts']} ({a['fidelity_pct']}%)"
              f" latency={a['latency_mean']}s")
    print()
    print(f" results → {out_path.relative_to(REPO_ROOT)}")
    return 0
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())