{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import { parseHTML } from \"npm:linkedom\";\n", "import { Database, Statement } from \"jsr:@db/sqlite@0.11\";\n", "import * as sqlitePath from \"npm:sqlite-path\";\n", "import * as sqliteUrl from \"npm:sqlite-url\";\n", "import * as sqliteRegex from \"npm:sqlite-regex\";" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "const months = [\"january\", \"february\", \"march\", \"april\", \"may\", \"june\", \"july\", \"august\", \"september\", \"october\", \"november\", \"december\"]\n", "\n", "const db = new Database(\":memory:\")\n", "db.enableLoadExtension = true;\n", "db.loadExtension(sqlitePath.getLoadablePath());\n", "db.loadExtension(sqliteUrl.getLoadablePath());\n", "db.loadExtension(sqliteRegex.getLoadablePath());\n", "db.enableLoadExtension = false;\n", "\n", "db.exec(`\n", " CREATE TABLE articles(\n", " slug_id TEXT,\n", " slug TEXT,\n", " headline TEXT,\n", " url TEXT,\n", " year integer,\n", " month integer,\n", " category1 TEXT,\n", " category2 TEXT\n", " )\n", "`)\n", "\n", "const stmt = db.prepare(`\n", " insert into articles\n", " select\n", " regex_capture(\n", " '(?P.+)-(?Prcna\\\\d+)',\n", " path_at(url_path(:url), -1),\n", " 'id'\n", " ) as id,\n", " regex_capture(\n", " '(?P.+)-(?Prcna\\\\d+)',\n", " path_at(url_path(:url), -1),\n", " 'slug'\n", " ) as slug,\n", " :headline as headline,\n", " :url as url,\n", " :year as year,\n", " :month as month,\n", " path_at(url_path(:url), 0) as category1,\n", " iif(\n", " path_length(url_path(:url)) > 2,\n", " path_at(url_path(:url), 1),\n", " null\n", " ) as category2\n", "`);\n", "\n", "const insertArticles = db.transaction((year, month, articles) => {\n", " for(const article of articles) {\n", " stmt.run({...article, year, month})\n", " }\n", "})\n", "\n", "async function insertMonth(year:number, month: text) {\n", " const monthPage = await fetch(`https://www.nbcnews.com/archive/articles/${year}/${month}`).then(r=>r.text())\n", " const {document:monthPageDoc} = parseHTML(monthPage);\n", " const monthEntries = monthPageDoc\n", " .querySelectorAll('.MonthPage a')\n", " .map(a => ({headline: a.innerText, url: a.getAttribute('href')}));\n", " insertArticles(year, months.findIndex(m => m === month)+1, monthEntries)\n", "}\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "\n", "for(let year = 2014; year <= 2023; year++) {\n", " for(const month of months) {\n", " console.log(year, month);\n", " await insertMonth(year, month);\n", " }\n", "}\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "db.exec(\"vacuum into 'articles.db'\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "db.sql`select * from articles order by random() limit 10`" ] } ], "metadata": { "kernelspec": { "display_name": "Deno", "language": "typescript", "name": "deno" }, "language_info": { "codemirror_mode": "typescript", "file_extension": ".ts", "mimetype": "text/x.typescript", "name": "typescript", "nbconvert_exporter": "script", "pygments_lexer": "typescript", "version": "5.5.2" } }, "nbformat": 4, "nbformat_minor": 2 }