sqlite-vec/examples/nbc-headlines/1_scrape.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# NBC News Headlines: Scraper\n",
    "\n",
    "This notebooks implements a scraper for [NBC News](https://www.nbcnews.com) headlines. It uses [this sitemap](https://www.nbcnews.com/archive/articles/2024/march), which provides a list of article headlines + URLs\n",
    "for every month for the past few years. \n",
    "\n",
    "This dataset is mostly to get a simple, real-world small text dataset for testing embeddings. \n",
    "They're small pieces of text (~dozen words), have a wide range of semantic meaning, and are more \"real-world\"\n",
    "them some other embeddings datasets out there.\n",
    "\n",
    "This notebook uses [Deno](https://deno.com/), [linkedom](https://github.com/WebReflection/linkedom), and a few \n",
    "SQLite extensions to scrape the headlines for a given date range. It creates a single SQL table, `articles`, \n",
    "with a few columns like `headline` and `url`. By default it will get all article headlines from January 2024 -> present\n",
    "and save them to a database called `headlines-2024.db`. Feel free to copy+paste this code into your own custom scraper. \n",
    "\n",
    "This notebook also just scrapes the data into a SQLite database, it does NOT do any embeddings + vector search. \n",
    "For those examples of those, see [`./2_build.ipynb`](./2_build.ipynb) and [`./3_search.ipynb`](./3_search.ipynb)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "import { Database, Statement } from \"jsr:@db/sqlite@0.11\";\n",
    "import { parseHTML } from \"npm:linkedom\";\n",
    "import * as d3 from \"npm:d3-time\";\n",
    "import * as sqlitePath from \"npm:sqlite-path\";\n",
    "import * as sqliteUrl from \"npm:sqlite-url\";\n",
    "import * as sqliteRegex from \"npm:sqlite-regex\";\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "const months = [\"january\", \"february\", \"march\", \"april\", \"may\", \"june\", \"july\", \"august\", \"september\", \"october\", \"november\", \"december\"]\n",
    "\n",
    "class Db {\n",
    "  db: Database;\n",
    "  #stmtInsertArticle: Statement;\n",
    "\n",
    "  constructor(path:string) {\n",
    "    this.db = new Database(path);\n",
    "    this.db.enableLoadExtension = true;\n",
    "    this.db.loadExtension(sqlitePath.getLoadablePath());\n",
    "    this.db.loadExtension(sqliteUrl.getLoadablePath());\n",
    "    this.db.loadExtension(sqliteRegex.getLoadablePath());\n",
    "    this.db.enableLoadExtension = false;\n",
    "\n",
    "    this.db.exec(`\n",
    "      CREATE TABLE IF NOT EXISTS articles(\n",
    "        id integer primary key autoincrement,\n",
    "        year integer,\n",
    "        month integer,\n",
    "        slug TEXT,\n",
    "        slug_id TEXT,\n",
    "        headline TEXT,\n",
    "        url TEXT,\n",
    "        category1 TEXT,\n",
    "        category2 TEXT\n",
    "      )\n",
    "    `);\n",
    "\n",
    "    this.#stmtInsertArticle = this.db.prepare(`\n",
    "      insert into articles(year, month, slug, slug_id, headline, url, category1, category2)\n",
    "      select\n",
    "        :year as year,\n",
    "        :month as month,\n",
    "         regex_capture(\n",
    "          '(?P<slug>.+)-(?P<id>[^-]+)$',\n",
    "          path_at(url_path(:url), -1),\n",
    "          'slug'\n",
    "        ) as slug,\n",
    "        regex_capture(\n",
    "          '(?P<slug>.+)-(?P<id>[^-]+)$',\n",
    "          path_at(url_path(:url), -1),\n",
    "          'id'\n",
    "        ) as slug_id,\n",
    "        :headline as headline,\n",
    "        :url as url,\n",
    "        path_at(url_path(:url), 0) as category1,\n",
    "        iif(\n",
    "          path_length(url_path(:url)) > 2,\n",
    "          path_at(url_path(:url), 1),\n",
    "          null\n",
    "        ) as category2\n",
    "    `);\n",
    "  }\n",
    "\n",
    "  insertArticles(year:number, month:text, articles:{url: string, year: number, month: number}[]) {\n",
    "    const tx = this.db.transaction((year, month, articles) => {\n",
    "      for(const article of articles) {\n",
    "        this.#stmtInsertArticle.run({...article, year, month})\n",
    "      }\n",
    "    });\n",
    "    tx(year, month, articles);\n",
    "  }\n",
    "}\n",
    "\n",
    "async function insertMonth(db: Db, year:number, month: text) {\n",
    "  let url = `https://www.nbcnews.com/archive/articles/${year}/${month}`;\n",
    "  while(true) {\n",
    "    const monthPage = await fetch(url).then(r=>r.text())\n",
    "    const {document:monthPageDoc} = parseHTML(monthPage);\n",
    "    const monthEntries = monthPageDoc\n",
    "      .querySelectorAll('.MonthPage a')\n",
    "      .map(a => ({headline: a.innerText, url: a.getAttribute('href')}));\n",
    "    db.insertArticles(year, months.findIndex(m => m === month)+1, monthEntries);\n",
    "    const next = monthPageDoc.querySelector('a.Pagination__next.Pagination__enable');\n",
    "    if(!next) {\n",
    "      break;\n",
    "    }\n",
    "    url = `https://www.nbcnews.com${next.getAttribute('href')}`;\n",
    "  }\n",
    "\n",
    "}\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "async function backfill(db, start: Date, end: Date) {\n",
    "  const targets = d3.timeMonths(start, end)\n",
    "    .map(date => ({year: date.getFullYear(), monthIndex: date.getMonth()}));\n",
    "  for(const target of targets) {\n",
    "    console.log(`${target.year} ${target.monthIndex}`)\n",
    "    await insertMonth(db, target.year, months[target.monthIndex]);\n",
    "  }\n",
    "}\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2024 0\n",
      "2024 1\n",
      "2024 2\n",
      "2024 3\n",
      "2024 4\n",
      "2024 5\n",
      "2024 6\n",
      "2024 7\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "\u001b[33m1\u001b[39m"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "const db = new Db(\":memory:\");\n",
    "await backfill(db, new Date('2024-01-01'), new Date())\n",
    "db.db.exec(\"vacuum into 'headlines-2024.db'\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Deno",
   "language": "typescript",
   "name": "deno"
  },
  "language_info": {
   "codemirror_mode": "typescript",
   "file_extension": ".ts",
   "mimetype": "text/x.typescript",
   "name": "typescript",
   "nbconvert_exporter": "script",
   "pygments_lexer": "typescript",
   "version": "5.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
more work on nbc headlines sample 2024-09-07 09:22:21 -07:00			`{`
			`"cells": [`
nbc headlines updates 2024-10-02 10:24:49 -07:00			`{`
			`"cell_type": "markdown",`
			`"metadata": {},`
			`"source": [`
			`"# NBC News Headlines: Scraper\n",`
			`"\n",`
			`"This notebooks implements a scraper for [NBC News](https://www.nbcnews.com) headlines. It uses [this sitemap](https://www.nbcnews.com/archive/articles/2024/march), which provides a list of article headlines + URLs\n",`
			`"for every month for the past few years. \n",`
			`"\n",`
			`"This dataset is mostly to get a simple, real-world small text dataset for testing embeddings. \n",`
			`"They're small pieces of text (~dozen words), have a wide range of semantic meaning, and are more \"real-world\"\n",`
			`"them some other embeddings datasets out there.\n",`
			`"\n",`
			`"This notebook uses [Deno](https://deno.com/), [linkedom](https://github.com/WebReflection/linkedom), and a few \n",`
			"SQLite extensions to scrape the headlines for a given date range. It creates a single SQL table, `articles`, \n",
			"with a few columns like `headline` and `url`. By default it will get all article headlines from January 2024 -> present\n",
			"and save them to a database called `headlines-2024.db`. Feel free to copy+paste this code into your own custom scraper. \n",
			`"\n",`
			`"This notebook also just scrapes the data into a SQLite database, it does NOT do any embeddings + vector search. \n",`
			"For those examples of those, see [`./2_build.ipynb`](./2_build.ipynb) and [`./3_search.ipynb`](./3_search.ipynb)."
			`]`
			`},`
more work on nbc headlines sample 2024-09-07 09:22:21 -07:00			`{`
			`"cell_type": "code",`
			`"execution_count": 43,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"import { Database, Statement } from \"jsr:@db/sqlite@0.11\";\n",`
			`"import { parseHTML } from \"npm:linkedom\";\n",`
			`"import * as d3 from \"npm:d3-time\";\n",`
			`"import * as sqlitePath from \"npm:sqlite-path\";\n",`
			`"import * as sqliteUrl from \"npm:sqlite-url\";\n",`
			`"import * as sqliteRegex from \"npm:sqlite-regex\";\n"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 47,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"const months = [\"january\", \"february\", \"march\", \"april\", \"may\", \"june\", \"july\", \"august\", \"september\", \"october\", \"november\", \"december\"]\n",`
			`"\n",`
			`"class Db {\n",`
			`" db: Database;\n",`
			`" #stmtInsertArticle: Statement;\n",`
			`"\n",`
			`" constructor(path:string) {\n",`
			`" this.db = new Database(path);\n",`
			`" this.db.enableLoadExtension = true;\n",`
			`" this.db.loadExtension(sqlitePath.getLoadablePath());\n",`
			`" this.db.loadExtension(sqliteUrl.getLoadablePath());\n",`
			`" this.db.loadExtension(sqliteRegex.getLoadablePath());\n",`
			`" this.db.enableLoadExtension = false;\n",`
			`"\n",`
			" this.db.exec(`\n",
			`" CREATE TABLE IF NOT EXISTS articles(\n",`
			`" id integer primary key autoincrement,\n",`
			`" year integer,\n",`
			`" month integer,\n",`
			`" slug TEXT,\n",`
			`" slug_id TEXT,\n",`
			`" headline TEXT,\n",`
			`" url TEXT,\n",`
			`" category1 TEXT,\n",`
			`" category2 TEXT\n",`
			`" )\n",`
			" `);\n",
			`"\n",`
			" this.#stmtInsertArticle = this.db.prepare(`\n",
			`" insert into articles(year, month, slug, slug_id, headline, url, category1, category2)\n",`
			`" select\n",`
			`" :year as year,\n",`
			`" :month as month,\n",`
			`" regex_capture(\n",`
			`" '(?P<slug>.+)-(?P<id>[^-]+)$',\n",`
			`" path_at(url_path(:url), -1),\n",`
			`" 'slug'\n",`
			`" ) as slug,\n",`
			`" regex_capture(\n",`
			`" '(?P<slug>.+)-(?P<id>[^-]+)$',\n",`
			`" path_at(url_path(:url), -1),\n",`
			`" 'id'\n",`
			`" ) as slug_id,\n",`
			`" :headline as headline,\n",`
			`" :url as url,\n",`
			`" path_at(url_path(:url), 0) as category1,\n",`
			`" iif(\n",`
			`" path_length(url_path(:url)) > 2,\n",`
			`" path_at(url_path(:url), 1),\n",`
			`" null\n",`
			`" ) as category2\n",`
			" `);\n",
			`" }\n",`
			`"\n",`
			`" insertArticles(year:number, month:text, articles:{url: string, year: number, month: number}[]) {\n",`
			`" const tx = this.db.transaction((year, month, articles) => {\n",`
			`" for(const article of articles) {\n",`
			`" this.#stmtInsertArticle.run({...article, year, month})\n",`
			`" }\n",`
			`" });\n",`
			`" tx(year, month, articles);\n",`
			`" }\n",`
			`"}\n",`
			`"\n",`
			`"async function insertMonth(db: Db, year:number, month: text) {\n",`
			" let url = `https://www.nbcnews.com/archive/articles/${year}/${month}`;\n",
			`" while(true) {\n",`
			`" const monthPage = await fetch(url).then(r=>r.text())\n",`
			`" const {document:monthPageDoc} = parseHTML(monthPage);\n",`
			`" const monthEntries = monthPageDoc\n",`
			`" .querySelectorAll('.MonthPage a')\n",`
			`" .map(a => ({headline: a.innerText, url: a.getAttribute('href')}));\n",`
			`" db.insertArticles(year, months.findIndex(m => m === month)+1, monthEntries);\n",`
			`" const next = monthPageDoc.querySelector('a.Pagination__next.Pagination__enable');\n",`
			`" if(!next) {\n",`
			`" break;\n",`
			`" }\n",`
			" url = `https://www.nbcnews.com${next.getAttribute('href')}`;\n",
			`" }\n",`
			`"\n",`
			`"}\n"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 48,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"\n",`
			`"async function backfill(db, start: Date, end: Date) {\n",`
			`" const targets = d3.timeMonths(start, end)\n",`
			`" .map(date => ({year: date.getFullYear(), monthIndex: date.getMonth()}));\n",`
			`" for(const target of targets) {\n",`
			" console.log(`${target.year} ${target.monthIndex}`)\n",
			`" await insertMonth(db, target.year, months[target.monthIndex]);\n",`
			`" }\n",`
			`"}\n"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 49,`
			`"metadata": {},`
			`"outputs": [`
			`{`
			`"name": "stdout",`
			`"output_type": "stream",`
			`"text": [`
			`"2024 0\n",`
			`"2024 1\n",`
			`"2024 2\n",`
			`"2024 3\n",`
			`"2024 4\n",`
			`"2024 5\n",`
			`"2024 6\n",`
			`"2024 7\n"`
			`]`
			`},`
			`{`
			`"data": {`
			`"text/plain": [`
			`"\u001b[33m1\u001b[39m"`
			`]`
			`},`
			`"execution_count": 49,`
			`"metadata": {},`
			`"output_type": "execute_result"`
			`}`
			`],`
			`"source": [`
			`"const db = new Db(\":memory:\");\n",`
			`"await backfill(db, new Date('2024-01-01'), new Date())\n",`
			`"db.db.exec(\"vacuum into 'headlines-2024.db'\")"`
			`]`
			`}`
			`],`
			`"metadata": {`
			`"kernelspec": {`
			`"display_name": "Deno",`
			`"language": "typescript",`
			`"name": "deno"`
			`},`
			`"language_info": {`
			`"codemirror_mode": "typescript",`
			`"file_extension": ".ts",`
			`"mimetype": "text/x.typescript",`
			`"name": "typescript",`
			`"nbconvert_exporter": "script",`
			`"pygments_lexer": "typescript",`
			`"version": "5.5.2"`
			`}`
			`},`
			`"nbformat": 4,`
			`"nbformat_minor": 2`
			`}`