mirror of
https://github.com/asg017/sqlite-vec.git
synced 2026-04-25 16:56:27 +02:00
more work on nbc headlines sample
This commit is contained in:
parent
687a2821f2
commit
fa6edfe246
16 changed files with 3036 additions and 1169 deletions
178
examples/nbc-headlines/1_scrape.ipynb
Normal file
178
examples/nbc-headlines/1_scrape.ipynb
Normal file
|
|
@ -0,0 +1,178 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 43,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import { Database, Statement } from \"jsr:@db/sqlite@0.11\";\n",
|
||||
"import { parseHTML } from \"npm:linkedom\";\n",
|
||||
"import * as d3 from \"npm:d3-time\";\n",
|
||||
"import * as sqlitePath from \"npm:sqlite-path\";\n",
|
||||
"import * as sqliteUrl from \"npm:sqlite-url\";\n",
|
||||
"import * as sqliteRegex from \"npm:sqlite-regex\";\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 47,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"const months = [\"january\", \"february\", \"march\", \"april\", \"may\", \"june\", \"july\", \"august\", \"september\", \"october\", \"november\", \"december\"]\n",
|
||||
"\n",
|
||||
"class Db {\n",
|
||||
" db: Database;\n",
|
||||
" #stmtInsertArticle: Statement;\n",
|
||||
"\n",
|
||||
" constructor(path:string) {\n",
|
||||
" this.db = new Database(path);\n",
|
||||
" this.db.enableLoadExtension = true;\n",
|
||||
" this.db.loadExtension(sqlitePath.getLoadablePath());\n",
|
||||
" this.db.loadExtension(sqliteUrl.getLoadablePath());\n",
|
||||
" this.db.loadExtension(sqliteRegex.getLoadablePath());\n",
|
||||
" this.db.enableLoadExtension = false;\n",
|
||||
"\n",
|
||||
" this.db.exec(`\n",
|
||||
" CREATE TABLE IF NOT EXISTS articles(\n",
|
||||
" id integer primary key autoincrement,\n",
|
||||
" year integer,\n",
|
||||
" month integer,\n",
|
||||
" slug TEXT,\n",
|
||||
" slug_id TEXT,\n",
|
||||
" headline TEXT,\n",
|
||||
" url TEXT,\n",
|
||||
" category1 TEXT,\n",
|
||||
" category2 TEXT\n",
|
||||
" )\n",
|
||||
" `);\n",
|
||||
"\n",
|
||||
" this.#stmtInsertArticle = this.db.prepare(`\n",
|
||||
" insert into articles(year, month, slug, slug_id, headline, url, category1, category2)\n",
|
||||
" select\n",
|
||||
" :year as year,\n",
|
||||
" :month as month,\n",
|
||||
" regex_capture(\n",
|
||||
" '(?P<slug>.+)-(?P<id>[^-]+)$',\n",
|
||||
" path_at(url_path(:url), -1),\n",
|
||||
" 'slug'\n",
|
||||
" ) as slug,\n",
|
||||
" regex_capture(\n",
|
||||
" '(?P<slug>.+)-(?P<id>[^-]+)$',\n",
|
||||
" path_at(url_path(:url), -1),\n",
|
||||
" 'id'\n",
|
||||
" ) as slug_id,\n",
|
||||
" :headline as headline,\n",
|
||||
" :url as url,\n",
|
||||
" path_at(url_path(:url), 0) as category1,\n",
|
||||
" iif(\n",
|
||||
" path_length(url_path(:url)) > 2,\n",
|
||||
" path_at(url_path(:url), 1),\n",
|
||||
" null\n",
|
||||
" ) as category2\n",
|
||||
" `);\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" insertArticles(year:number, month:number, articles:{headline: string, url: string}[]) {\n",
|
||||
" const tx = this.db.transaction((year, month, articles) => {\n",
|
||||
" for(const article of articles) {\n",
|
||||
" this.#stmtInsertArticle.run({...article, year, month})\n",
|
||||
" }\n",
|
||||
" });\n",
|
||||
" tx(year, month, articles);\n",
|
||||
" }\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"async function insertMonth(db: Db, year:number, month: string) {\n",
|
||||
" let url = `https://www.nbcnews.com/archive/articles/${year}/${month}`;\n",
|
||||
" while(true) {\n",
|
||||
" const monthPage = await fetch(url).then(r=>r.text())\n",
|
||||
" const {document:monthPageDoc} = parseHTML(monthPage);\n",
|
||||
" const monthEntries = monthPageDoc\n",
|
||||
" .querySelectorAll('.MonthPage a')\n",
|
||||
" .map(a => ({headline: a.innerText, url: a.getAttribute('href')}));\n",
|
||||
" db.insertArticles(year, months.findIndex(m => m === month)+1, monthEntries);\n",
|
||||
" const next = monthPageDoc.querySelector('a.Pagination__next.Pagination__enable');\n",
|
||||
" if(!next) {\n",
|
||||
" break;\n",
|
||||
" }\n",
|
||||
" url = `https://www.nbcnews.com${next.getAttribute('href')}`;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
"}\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 48,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"\n",
|
||||
"async function backfill(db: Db, start: Date, end: Date) {\n",
|
||||
" const targets = d3.timeMonths(start, end)\n",
|
||||
" .map(date => ({year: date.getFullYear(), monthIndex: date.getMonth()}));\n",
|
||||
" for(const target of targets) {\n",
|
||||
" console.log(`${target.year} ${target.monthIndex}`)\n",
|
||||
" await insertMonth(db, target.year, months[target.monthIndex]);\n",
|
||||
" }\n",
|
||||
"}\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 49,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"2024 0\n",
|
||||
"2024 1\n",
|
||||
"2024 2\n",
|
||||
"2024 3\n",
|
||||
"2024 4\n",
|
||||
"2024 5\n",
|
||||
"2024 6\n",
|
||||
"2024 7\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"\u001b[33m1\u001b[39m"
|
||||
]
|
||||
},
|
||||
"execution_count": 49,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"const db = new Db(\":memory:\");\n",
|
||||
"await backfill(db, new Date('2024-01-01'), new Date())\n",
|
||||
"db.db.exec(\"vacuum into 'headlines-2024.db'\")"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Deno",
|
||||
"language": "typescript",
|
||||
"name": "deno"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": "typescript",
|
||||
"file_extension": ".ts",
|
||||
"mimetype": "text/x.typescript",
|
||||
"name": "typescript",
|
||||
"nbconvert_exporter": "script",
|
||||
"pygments_lexer": "typescript",
|
||||
"version": "5.5.2"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue