diff --git a/examples/nbc-headlinesx/scrape.ipynb b/examples/nbc-headlinesx/scrape.ipynb new file mode 100644 index 0000000..399e88f --- /dev/null +++ b/examples/nbc-headlinesx/scrape.ipynb @@ -0,0 +1,144 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import { parseHTML } from \"npm:linkedom\";\n", + "import { Database, Statement } from \"jsr:@db/sqlite@0.11\";\n", + "import * as sqlitePath from \"npm:sqlite-path\";\n", + "import * as sqliteUrl from \"npm:sqlite-url\";\n", + "import * as sqliteRegex from \"npm:sqlite-regex\";" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "const months = [\"january\", \"february\", \"march\", \"april\", \"may\", \"june\", \"july\", \"august\", \"september\", \"october\", \"november\", \"december\"]\n", + "\n", + "const db = new Database(\":memory:\")\n", + "db.enableLoadExtension = true;\n", + "db.loadExtension(sqlitePath.getLoadablePath());\n", + "db.loadExtension(sqliteUrl.getLoadablePath());\n", + "db.loadExtension(sqliteRegex.getLoadablePath());\n", + "db.enableLoadExtension = false;\n", + "\n", + "db.exec(`\n", + " CREATE TABLE articles(\n", + " slug_id TEXT,\n", + " slug TEXT,\n", + " headline TEXT,\n", + " url TEXT,\n", + " year integer,\n", + " month integer,\n", + " category1 TEXT,\n", + " category2 TEXT\n", + " )\n", + "`)\n", + "\n", + "const stmt = db.prepare(`\n", + " insert into articles\n", + " select\n", + " regex_capture(\n", + " '(?P.+)-(?Prcna\\\\d+)',\n", + " path_at(url_path(:url), -1),\n", + " 'id'\n", + " ) as id,\n", + " regex_capture(\n", + " '(?P.+)-(?Prcna\\\\d+)',\n", + " path_at(url_path(:url), -1),\n", + " 'slug'\n", + " ) as slug,\n", + " :headline as headline,\n", + " :url as url,\n", + " :year as year,\n", + " :month as month,\n", + " path_at(url_path(:url), 0) as category1,\n", + " iif(\n", + " path_length(url_path(:url)) > 2,\n", + " path_at(url_path(:url), 1),\n", + " null\n", + " ) as category2\n", + "`);\n", + "\n", + "const insertArticles = db.transaction((year, month, articles) => {\n", + " for(const article of articles) {\n", + " stmt.run({...article, year, month})\n", + " }\n", + "})\n", + "\n", + "async function insertMonth(year:number, month: text) {\n", + " const monthPage = await fetch(`https://www.nbcnews.com/archive/articles/${year}/${month}`).then(r=>r.text())\n", + " const {document:monthPageDoc} = parseHTML(monthPage);\n", + " const monthEntries = monthPageDoc\n", + " .querySelectorAll('.MonthPage a')\n", + " .map(a => ({headline: a.innerText, url: a.getAttribute('href')}));\n", + " insertArticles(year, months.findIndex(m => m === month)+1, monthEntries)\n", + "}\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "for(let year = 2014; year <= 2023; year++) {\n", + " for(const month of months) {\n", + " console.log(year, month);\n", + " await insertMonth(year, month);\n", + " }\n", + "}\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "db.exec(\"vacuum into 'articles.db'\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "db.sql`select * from articles order by random() limit 10`" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Deno", + "language": "typescript", + "name": "deno" + }, + "language_info": { + "codemirror_mode": "typescript", + "file_extension": ".ts", + "mimetype": "text/x.typescript", + "name": "typescript", + "nbconvert_exporter": "script", + "pygments_lexer": "typescript", + "version": "5.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}