sqlite-vec/examples/nbc-headlinesx/scrape.ipynb

145 lines
3.9 KiB
Text
Raw Normal View History

2024-08-12 15:39:54 -07:00
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import { parseHTML } from \"npm:linkedom\";\n",
"import { Database, Statement } from \"jsr:@db/sqlite@0.11\";\n",
"import * as sqlitePath from \"npm:sqlite-path\";\n",
"import * as sqliteUrl from \"npm:sqlite-url\";\n",
"import * as sqliteRegex from \"npm:sqlite-regex\";"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"const months = [\"january\", \"february\", \"march\", \"april\", \"may\", \"june\", \"july\", \"august\", \"september\", \"october\", \"november\", \"december\"]\n",
"\n",
"const db = new Database(\":memory:\")\n",
"db.enableLoadExtension = true;\n",
"db.loadExtension(sqlitePath.getLoadablePath());\n",
"db.loadExtension(sqliteUrl.getLoadablePath());\n",
"db.loadExtension(sqliteRegex.getLoadablePath());\n",
"db.enableLoadExtension = false;\n",
"\n",
"db.exec(`\n",
" CREATE TABLE articles(\n",
" slug_id TEXT,\n",
" slug TEXT,\n",
" headline TEXT,\n",
" url TEXT,\n",
" year integer,\n",
" month integer,\n",
" category1 TEXT,\n",
" category2 TEXT\n",
" )\n",
"`)\n",
"\n",
"const stmt = db.prepare(`\n",
" insert into articles\n",
" select\n",
" regex_capture(\n",
" '(?P<slug>.+)-(?P<id>rcna\\\\d+)',\n",
" path_at(url_path(:url), -1),\n",
" 'id'\n",
" ) as id,\n",
" regex_capture(\n",
" '(?P<slug>.+)-(?P<id>rcna\\\\d+)',\n",
" path_at(url_path(:url), -1),\n",
" 'slug'\n",
" ) as slug,\n",
" :headline as headline,\n",
" :url as url,\n",
" :year as year,\n",
" :month as month,\n",
" path_at(url_path(:url), 0) as category1,\n",
" iif(\n",
" path_length(url_path(:url)) > 2,\n",
" path_at(url_path(:url), 1),\n",
" null\n",
" ) as category2\n",
"`);\n",
"\n",
"const insertArticles = db.transaction((year, month, articles) => {\n",
" for(const article of articles) {\n",
" stmt.run({...article, year, month})\n",
" }\n",
"})\n",
"\n",
"async function insertMonth(year:number, month: text) {\n",
" const monthPage = await fetch(`https://www.nbcnews.com/archive/articles/${year}/${month}`).then(r=>r.text())\n",
" const {document:monthPageDoc} = parseHTML(monthPage);\n",
" const monthEntries = monthPageDoc\n",
" .querySelectorAll('.MonthPage a')\n",
" .map(a => ({headline: a.innerText, url: a.getAttribute('href')}));\n",
" insertArticles(year, months.findIndex(m => m === month)+1, monthEntries)\n",
"}\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"for(let year = 2014; year <= 2023; year++) {\n",
" for(const month of months) {\n",
" console.log(year, month);\n",
" await insertMonth(year, month);\n",
" }\n",
"}\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"db.exec(\"vacuum into 'articles.db'\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"db.sql`select * from articles order by random() limit 10`"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Deno",
"language": "typescript",
"name": "deno"
},
"language_info": {
"codemirror_mode": "typescript",
"file_extension": ".ts",
"mimetype": "text/x.typescript",
"name": "typescript",
"nbconvert_exporter": "script",
"pygments_lexer": "typescript",
"version": "5.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}