more work on nbc headlines sample

This commit is contained in:
Alex Garcia 2024-09-07 09:22:21 -07:00
parent 687a2821f2
commit fa6edfe246
16 changed files with 3036 additions and 1169 deletions

View file

@ -0,0 +1,178 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
"import { Database, Statement } from \"jsr:@db/sqlite@0.11\";\n",
"import { parseHTML } from \"npm:linkedom\";\n",
"import * as d3 from \"npm:d3-time\";\n",
"import * as sqlitePath from \"npm:sqlite-path\";\n",
"import * as sqliteUrl from \"npm:sqlite-url\";\n",
"import * as sqliteRegex from \"npm:sqlite-regex\";\n"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [],
"source": [
"const months = [\"january\", \"february\", \"march\", \"april\", \"may\", \"june\", \"july\", \"august\", \"september\", \"october\", \"november\", \"december\"]\n",
"\n",
"/**\n",
" * SQLite-backed store for NBC News archive headlines.\n",
" *\n",
" * On construction it loads the sqlite-path, sqlite-url and sqlite-regex\n",
" * extensions, creates the `articles` table when it is missing, and prepares\n",
" * the insert statement that derives the slug and category columns from each\n",
" * article URL inside SQL.\n",
" */\n",
"class Db {\n",
"  db: Database;\n",
"  #stmtInsertArticle: Statement;\n",
"\n",
"  /**\n",
"   * @param path Location handed to `new Database(path)`.\n",
"   */\n",
"  constructor(path: string) {\n",
"    this.db = new Database(path);\n",
"\n",
"    // Extension loading is switched on only while the three extensions load.\n",
"    this.db.enableLoadExtension = true;\n",
"    this.db.loadExtension(sqlitePath.getLoadablePath());\n",
"    this.db.loadExtension(sqliteUrl.getLoadablePath());\n",
"    this.db.loadExtension(sqliteRegex.getLoadablePath());\n",
"    this.db.enableLoadExtension = false;\n",
"\n",
"    this.db.exec(`\n",
"      CREATE TABLE IF NOT EXISTS articles(\n",
"        id integer primary key autoincrement,\n",
"        year integer,\n",
"        month integer,\n",
"        slug TEXT,\n",
"        slug_id TEXT,\n",
"        headline TEXT,\n",
"        url TEXT,\n",
"        category1 TEXT,\n",
"        category2 TEXT\n",
"      )\n",
"    `);\n",
"\n",
"    // slug / slug_id: the regex splits the path_at(..., -1) segment at its\n",
"    // final hyphen (text before it is the slug, text after it is the id).\n",
"    // category2 is only filled in when the URL path has more than two segments.\n",
"    this.#stmtInsertArticle = this.db.prepare(`\n",
"      insert into articles(year, month, slug, slug_id, headline, url, category1, category2)\n",
"      select\n",
"        :year as year,\n",
"        :month as month,\n",
"        regex_capture(\n",
"          '(?P<slug>.+)-(?P<id>[^-]+)$',\n",
"          path_at(url_path(:url), -1),\n",
"          'slug'\n",
"        ) as slug,\n",
"        regex_capture(\n",
"          '(?P<slug>.+)-(?P<id>[^-]+)$',\n",
"          path_at(url_path(:url), -1),\n",
"          'id'\n",
"        ) as slug_id,\n",
"        :headline as headline,\n",
"        :url as url,\n",
"        path_at(url_path(:url), 0) as category1,\n",
"        iif(\n",
"          path_length(url_path(:url)) > 2,\n",
"          path_at(url_path(:url), 1),\n",
"          null\n",
"        ) as category2\n",
"    `);\n",
"  }\n",
"\n",
"  /**\n",
"   * Inserts every entry of `articles` inside a single transaction.\n",
"   *\n",
"   * @param year Value stored in the `year` column of each row.\n",
"   * @param month Value stored in the `month` column of each row; the scraper\n",
"   *   in this notebook passes a 1-based month number.\n",
"   * @param articles Headline/URL pairs; each one is bound to the `:headline`\n",
"   *   and `:url` parameters of the prepared insert.\n",
"   */\n",
"  insertArticles(year: number, month: number, articles: { headline: string; url: string }[]) {\n",
"    const tx = this.db.transaction((rows: { headline: string; url: string }[]) => {\n",
"      for (const article of rows) {\n",
"        // year/month are spread last so they always win over keys on the article.\n",
"        this.#stmtInsertArticle.run({ ...article, year, month });\n",
"      }\n",
"    });\n",
"    tx(articles);\n",
"  }\n",
"}\n",
"\n",
"/**\n",
" * Scrapes every page of the NBC News archive listing for one month and stores\n",
" * the linked headlines through `db.insertArticles`.\n",
" *\n",
" * @param db Destination database wrapper.\n",
" * @param year Year used in the archive URL and stored with every row.\n",
" * @param month Month name as it appears in `months` (lowercase English); it is\n",
" *   used in the archive URL and converted to a 1-based number for storage.\n",
" * @throws Error when `month` is not an entry of `months`, or when an archive\n",
" *   page responds with a non-2xx status.\n",
" */\n",
"async function insertMonth(db: Db, year: number, month: string): Promise<void> {\n",
"  const monthNumber = months.indexOf(month) + 1;\n",
"  if (monthNumber === 0) {\n",
"    // Previously an unknown name was silently stored as month 0.\n",
"    throw new Error(`unknown month name: ${month}`);\n",
"  }\n",
"\n",
"  let url = `https://www.nbcnews.com/archive/articles/${year}/${month}`;\n",
"  while (true) {\n",
"    const response = await fetch(url);\n",
"    if (!response.ok) {\n",
"      // Fail loudly instead of parsing an error page as an empty month.\n",
"      throw new Error(`failed to fetch ${url}: HTTP ${response.status}`);\n",
"    }\n",
"    const { document: monthPageDoc } = parseHTML(await response.text());\n",
"\n",
"    // Anchors without an href carry no article URL, so they are skipped.\n",
"    const monthEntries = Array.from(monthPageDoc.querySelectorAll('.MonthPage a'))\n",
"      .flatMap((a) => {\n",
"        const href = a.getAttribute('href');\n",
"        return href === null ? [] : [{ headline: a.innerText, url: href }];\n",
"      });\n",
"    db.insertArticles(year, monthNumber, monthEntries);\n",
"\n",
"    // Follow the enabled 'next' pagination link until there is none left.\n",
"    const next = monthPageDoc.querySelector('a.Pagination__next.Pagination__enable');\n",
"    if (!next) {\n",
"      break;\n",
"    }\n",
"    url = `https://www.nbcnews.com${next.getAttribute('href')}`;\n",
"  }\n",
"}\n"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [],
"source": [
"\n",
"/**\n",
" * Scrapes and stores headlines for each month that `d3.timeMonths(start, end)`\n",
" * yields, one month at a time and in order.\n",
" *\n",
" * @param db Destination database wrapper, forwarded to `insertMonth`.\n",
" * @param start Beginning of the date range handed to `d3.timeMonths`.\n",
" * @param end End of the date range handed to `d3.timeMonths`.\n",
" */\n",
"async function backfill(db: Db, start: Date, end: Date): Promise<void> {\n",
"  // NOTE(review): year and month are read with the local-time getters, so a\n",
"  // `start` parsed as UTC may land in a neighbouring month in some time zones\n",
"  // -- confirm this is acceptable for the intended ranges.\n",
"  const targets = d3.timeMonths(start, end)\n",
"    .map((date) => ({ year: date.getFullYear(), monthIndex: date.getMonth() }));\n",
"  for (const target of targets) {\n",
"    // Progress log: year plus zero-based month index.\n",
"    console.log(`${target.year} ${target.monthIndex}`);\n",
"    await insertMonth(db, target.year, months[target.monthIndex]);\n",
"  }\n",
"}\n"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2024 0\n",
"2024 1\n",
"2024 2\n",
"2024 3\n",
"2024 4\n",
"2024 5\n",
"2024 6\n",
"2024 7\n"
]
},
{
"data": {
"text/plain": [
"\u001b[33m1\u001b[39m"
]
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"const db = new Db(\":memory:\");\n",
"await backfill(db, new Date('2024-01-01'), new Date())\n",
"db.db.exec(\"vacuum into 'headlines-2024.db'\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Deno",
"language": "typescript",
"name": "deno"
},
"language_info": {
"codemirror_mode": "typescript",
"file_extension": ".ts",
"mimetype": "text/x.typescript",
"name": "typescript",
"nbconvert_exporter": "script",
"pygments_lexer": "typescript",
"version": "5.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View file

@ -0,0 +1,559 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[no code]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
".open tmp3.db"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table>\n",
"<thead>\n",
"<tr style=\"text-align: center;\">\n",
"<th>\n",
"schema\n",
"</th>\n",
"<th>\n",
"name\n",
"</th>\n",
"<th>\n",
"type\n",
"</th>\n",
"<th>\n",
"ncol\n",
"</th>\n",
"<th>\n",
"wr\n",
"</th>\n",
"<th>\n",
"strict\n",
"</th>\n",
"</tr>\n",
"</thead>\n",
"<tbody>\n",
"<tr>\n",
"<td style=\"text-align: left;\">\n",
"main\n",
"</td>\n",
"<td style=\"text-align: left;\">\n",
"sqlite_sequence\n",
"</td>\n",
"<td style=\"text-align: left;\">\n",
"table\n",
"</td>\n",
"<td >\n",
"2\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"</tr>\n",
"<tr>\n",
"<td style=\"text-align: left;\">\n",
"main\n",
"</td>\n",
"<td style=\"text-align: left;\">\n",
"articles\n",
"</td>\n",
"<td style=\"text-align: left;\">\n",
"table\n",
"</td>\n",
"<td >\n",
"9\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"</tr>\n",
"<tr>\n",
"<td style=\"text-align: left;\">\n",
"main\n",
"</td>\n",
"<td style=\"text-align: left;\">\n",
"sqlite_schema\n",
"</td>\n",
"<td style=\"text-align: left;\">\n",
"table\n",
"</td>\n",
"<td >\n",
"5\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"</tr>\n",
"<tr>\n",
"<td style=\"text-align: left;\">\n",
"temp\n",
"</td>\n",
"<td style=\"text-align: left;\">\n",
"sqlite_temp_schema\n",
"</td>\n",
"<td style=\"text-align: left;\">\n",
"table\n",
"</td>\n",
"<td >\n",
"5\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"</tr>\n",
"</tbody>\n",
"</table>\n",
"<div style=\"text-align: right;\">\n",
"4 rows × 6 columns\n",
"</div>\n",
"</div>\n"
],
"text/plain": [
"\u001b[0m┌\u001b[0m\u001b[0m────────\u001b[0m\u001b[0m┬\u001b[0m\u001b[0m────────────────────\u001b[0m\u001b[0m┬\u001b[0m\u001b[0m───────\u001b[0m\u001b[0m┬\u001b[0m\u001b[0m──────\u001b[0m\u001b[0m┬\u001b[0m\u001b[0m────\u001b[0m\u001b[0m┬\u001b[0m\u001b[0m────────\u001b[0m\u001b[0m┐\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mschema\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mname \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mtype \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mncol\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mwr\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mstrict\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m├\u001b[0m\u001b[0m────────\u001b[0m\u001b[0m┼\u001b[0m\u001b[0m────────────────────\u001b[0m\u001b[0m┼\u001b[0m\u001b[0m───────\u001b[0m\u001b[0m┼\u001b[0m\u001b[0m──────\u001b[0m\u001b[0m┼\u001b[0m\u001b[0m────\u001b[0m\u001b[0m┼\u001b[0m\u001b[0m────────\u001b[0m\u001b[0m┤\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mmain \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0msqlite_sequence \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mtable\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 2\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mmain \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0marticles \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mtable\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 9\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mmain \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0msqlite_schema \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mtable\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 5\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mtemp \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0msqlite_temp_schema\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mtable\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 5\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m└\u001b[0m\u001b[0m────────\u001b[0m\u001b[0m┴\u001b[0m\u001b[0m────────────────────\u001b[0m\u001b[0m┴\u001b[0m\u001b[0m───────\u001b[0m\u001b[0m┴\u001b[0m\u001b[0m──────\u001b[0m\u001b[0m┴\u001b[0m\u001b[0m────\u001b[0m\u001b[0m┴\u001b[0m\u001b[0m────────\u001b[0m\u001b[0m┘\n",
"\u001b[0m\u001b[0m"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"select * from pragma_table_list;"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table>\n",
"<thead>\n",
"<tr style=\"text-align: center;\">\n",
"</tr>\n",
"</thead>\n",
"<tbody>\n",
"</tbody>\n",
"</table>\n",
"<div style=\"text-align: right;\">\n",
"0 row × 0 column\n",
"</div>\n",
"</div>\n"
],
"text/plain": [
"\u001b[0m┌\u001b[0m\u001b[0m\u001b[0m├\u001b[0m\u001b[0m"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"create virtual table fts_headlines using fts5(\n",
" headline,\n",
" content='articles', content_rowid='id'\n",
");\n",
"\n",
"insert into fts_headlines(rowid, headline)\n",
" select rowid, headline\n",
" from articles;\n",
"\n",
"insert into fts_headlines(fts_headlines) values('optimize');"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table>\n",
"<thead>\n",
"<tr style=\"text-align: center;\">\n",
"<th>\n",
"headline\n",
"</th>\n",
"</tr>\n",
"</thead>\n",
"<tbody>\n",
"<tr>\n",
"<td style=\"text-align: left;\">\n",
"Washington state faces first outbreak of a deadly fungal infection that&#39;s on the rise in the U.S.\n",
"</td>\n",
"</tr>\n",
"<tr>\n",
"<td style=\"text-align: left;\">\n",
"Israel-Hamas war live updates: U.S. readies weeks of retaliatory strikes against Iran-linked targets\n",
"</td>\n",
"</tr>\n",
"<tr>\n",
"<td style=\"text-align: left;\">\n",
"House to vote on an expanded child tax credit bill\n",
"</td>\n",
"</tr>\n",
"<tr>\n",
"<td style=\"text-align: left;\">\n",
"Travel costs, staff and ads added up before Ron DeSantis dropped out\n",
"</td>\n",
"</tr>\n",
"<tr>\n",
"<td style=\"text-align: left;\">\n",
"Victims of Hamas attack in Israel and their families blame Iran in new federal lawsuit\n",
"</td>\n",
"</tr>\n",
"<tr>\n",
"<td style=\"text-align: left;\">\n",
"Trump meets with Teamsters as he targets Biden support\n",
"</td>\n",
"</tr>\n",
"<tr>\n",
"<td style=\"text-align: left;\">\n",
"The bipartisan border deal would not allow 5,000 illegal crossings per day, despite what Trump says\n",
"</td>\n",
"</tr>\n",
"<tr>\n",
"<td style=\"text-align: left;\">\n",
"Machu Picchu tourism suffering after week of protests against new ticketing system\n",
"</td>\n",
"</tr>\n",
"<tr>\n",
"<td style=\"text-align: left;\">\n",
"FCC moves to criminalize most AI-generated robocalls\n",
"</td>\n",
"</tr>\n",
"<tr>\n",
"<td style=\"text-align: left;\">\n",
"Civil rights group says N.C. public schools are harming LGBTQ students, violating federal law\n",
"</td>\n",
"</tr>\n",
"</tbody>\n",
"</table>\n",
"<div style=\"text-align: right;\">\n",
"10 rows × 1 column\n",
"</div>\n",
"</div>\n"
],
"text/plain": [
"\u001b[0m┌\u001b[0m\u001b[0m──────────────────────────────────────────────────────────────────────────────────────────────────────\u001b[0m\u001b[0m┐\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mheadline \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m├\u001b[0m\u001b[0m──────────────────────────────────────────────────────────────────────────────────────────────────────\u001b[0m\u001b[0m┤\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mWashington state faces first outbreak of a deadly fungal infection that's on the rise in the U.S. \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mIsrael-Hamas war live updates: U.S. readies weeks of retaliatory strikes against Iran-linked targets\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mHouse to vote on an expanded child tax credit bill \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mTravel costs, staff and ads added up before Ron DeSantis dropped out \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mVictims of Hamas attack in Israel and their families blame Iran in new federal lawsuit \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mTrump meets with Teamsters as he targets Biden support \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mThe bipartisan border deal would not allow 5,000 illegal crossings per day, despite what Trump says \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mMachu Picchu tourism suffering after week of protests against new ticketing system \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mFCC moves to criminalize most AI-generated robocalls \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mCivil rights group says N.C. public schools are harming LGBTQ students, violating federal law \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m└\u001b[0m\u001b[0m──────────────────────────────────────────────────────────────────────────────────────────────────────\u001b[0m\u001b[0m┘\n",
"\u001b[0m\u001b[0m"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"select * from fts_headlines limit 10;"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table>\n",
"<thead>\n",
"<tr style=\"text-align: center;\">\n",
"</tr>\n",
"</thead>\n",
"<tbody>\n",
"</tbody>\n",
"</table>\n",
"<div style=\"text-align: right;\">\n",
"0 row × 0 column\n",
"</div>\n",
"</div>\n"
],
"text/plain": [
"\u001b[0m┌\u001b[0m\u001b[0m\u001b[0m├\u001b[0m\u001b[0m"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
".load ./lembed0\n",
".load ../../dist/vec0\n",
"\n",
"insert into lembed_models(name, model) values\n",
" ('default', lembed_model_from_file('all-MiniLM-L6-v2.e4ce9877.q8_0.gguf'));"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"create virtual table vec_headlines using vec0(\n",
" article_id integer primary key,\n",
" headline_embedding float[384]\n",
");\n",
"\n",
"insert into vec_headlines(article_id, headline_embedding)\n",
"select\n",
" rowid,\n",
" lembed(headline)\n",
"from articles;"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table>\n",
"<thead>\n",
"<tr style=\"text-align: center;\">\n",
"<th>\n",
"article_id\n",
"</th>\n",
"<th>\n",
"headline_embedding\n",
"</th>\n",
"<th>\n",
"vec_to_json(vec_slice(headline_embedding, 0, 8))\n",
"</th>\n",
"</tr>\n",
"</thead>\n",
"<tbody>\n",
"<tr>\n",
"<td >\n",
"1\n",
"</td>\n",
"<td style=\"color: blue\">\n",
"Blob&lt;1536&gt;\n",
"</td>\n",
"<td style=\"color: red\">\n",
"[0.055018,-0.021632,-0.012835,0.048403,0.039037,-0.012824,-0.043627,0.031868]\n",
"</td>\n",
"</tr>\n",
"<tr>\n",
"<td >\n",
"2\n",
"</td>\n",
"<td style=\"color: blue\">\n",
"Blob&lt;1536&gt;\n",
"</td>\n",
"<td style=\"color: red\">\n",
"[0.048287,0.023883,-0.004665,0.001806,0.030342,0.050691,0.050082,-0.127660]\n",
"</td>\n",
"</tr>\n",
"<tr>\n",
"<td >\n",
"3\n",
"</td>\n",
"<td style=\"color: blue\">\n",
"Blob&lt;1536&gt;\n",
"</td>\n",
"<td style=\"color: red\">\n",
"[-0.042424,-0.019893,0.022101,-0.030609,-0.016659,0.008453,-0.056492,0.093258]\n",
"</td>\n",
"</tr>\n",
"<tr>\n",
"<td >\n",
"4\n",
"</td>\n",
"<td style=\"color: blue\">\n",
"Blob&lt;1536&gt;\n",
"</td>\n",
"<td style=\"color: red\">\n",
"[0.076178,-0.080511,0.034440,0.027351,0.028441,0.038463,-0.023355,0.089898]\n",
"</td>\n",
"</tr>\n",
"<tr>\n",
"<td >\n",
"5\n",
"</td>\n",
"<td style=\"color: blue\">\n",
"Blob&lt;1536&gt;\n",
"</td>\n",
"<td style=\"color: red\">\n",
"[0.028183,0.091150,-0.043882,0.028064,0.010961,0.018683,0.011500,-0.015776]\n",
"</td>\n",
"</tr>\n",
"<tr>\n",
"<td >\n",
"6\n",
"</td>\n",
"<td style=\"color: blue\">\n",
"Blob&lt;1536&gt;\n",
"</td>\n",
"<td style=\"color: red\">\n",
"[-0.061114,-0.031104,0.060050,-0.037375,0.007963,-0.049056,-0.042365,-0.021792]\n",
"</td>\n",
"</tr>\n",
"<tr>\n",
"<td >\n",
"7\n",
"</td>\n",
"<td style=\"color: blue\">\n",
"Blob&lt;1536&gt;\n",
"</td>\n",
"<td style=\"color: red\">\n",
"[0.059814,0.026079,0.061488,0.011823,0.048770,-0.035152,0.031329,-0.015644]\n",
"</td>\n",
"</tr>\n",
"<tr>\n",
"<td >\n",
"8\n",
"</td>\n",
"<td style=\"color: blue\">\n",
"Blob&lt;1536&gt;\n",
"</td>\n",
"<td style=\"color: red\">\n",
"[0.095066,0.001522,-0.030417,0.091296,0.068129,-0.021405,0.008825,0.023469]\n",
"</td>\n",
"</tr>\n",
"<tr>\n",
"<td >\n",
"9\n",
"</td>\n",
"<td style=\"color: blue\">\n",
"Blob&lt;1536&gt;\n",
"</td>\n",
"<td style=\"color: red\">\n",
"[0.017708,-0.086306,0.002358,0.010318,0.008864,0.025368,0.094156,-0.006123]\n",
"</td>\n",
"</tr>\n",
"<tr>\n",
"<td >\n",
"10\n",
"</td>\n",
"<td style=\"color: blue\">\n",
"Blob&lt;1536&gt;\n",
"</td>\n",
"<td style=\"color: red\">\n",
"[0.034452,0.045083,-0.000227,0.102294,0.047915,-0.012732,-0.024640,-0.043112]\n",
"</td>\n",
"</tr>\n",
"</tbody>\n",
"</table>\n",
"<div style=\"text-align: right;\">\n",
"10 rows × 3 columns\n",
"</div>\n",
"</div>\n"
],
"text/plain": [
"\u001b[0m┌\u001b[0m\u001b[0m────────────\u001b[0m\u001b[0m┬\u001b[0m\u001b[0m────────────────────\u001b[0m\u001b[0m┬\u001b[0m\u001b[0m─────────────────────────────────────────────────────────────────────────────────\u001b[0m\u001b[0m┐\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0marticle_id\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mheadline_embedding\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mvec_to_json(vec_slice(headline_embedding, 0, 8)) \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m├\u001b[0m\u001b[0m────────────\u001b[0m\u001b[0m┼\u001b[0m\u001b[0m────────────────────\u001b[0m\u001b[0m┼\u001b[0m\u001b[0m─────────────────────────────────────────────────────────────────────────────────\u001b[0m\u001b[0m┤\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 1\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m Blob<1536> \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m[0.055018,-0.021632,-0.012835,0.048403,0.039037,-0.012824,-0.043627,0.031868] \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 2\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m Blob<1536> \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m[0.048287,0.023883,-0.004665,0.001806,0.030342,0.050691,0.050082,-0.127660] \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 3\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m Blob<1536> \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m[-0.042424,-0.019893,0.022101,-0.030609,-0.016659,0.008453,-0.056492,0.093258] \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 4\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m Blob<1536> \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m[0.076178,-0.080511,0.034440,0.027351,0.028441,0.038463,-0.023355,0.089898] \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 5\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m Blob<1536> \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m[0.028183,0.091150,-0.043882,0.028064,0.010961,0.018683,0.011500,-0.015776] \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 6\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m Blob<1536> \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m[-0.061114,-0.031104,0.060050,-0.037375,0.007963,-0.049056,-0.042365,-0.021792]\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 7\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m Blob<1536> \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m[0.059814,0.026079,0.061488,0.011823,0.048770,-0.035152,0.031329,-0.015644] \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 8\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m Blob<1536> \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m[0.095066,0.001522,-0.030417,0.091296,0.068129,-0.021405,0.008825,0.023469] \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 9\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m Blob<1536> \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m[0.017708,-0.086306,0.002358,0.010318,0.008864,0.025368,0.094156,-0.006123] \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 10\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m Blob<1536> \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m[0.034452,0.045083,-0.000227,0.102294,0.047915,-0.012732,-0.024640,-0.043112] \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m└\u001b[0m\u001b[0m────────────\u001b[0m\u001b[0m┴\u001b[0m\u001b[0m────────────────────\u001b[0m\u001b[0m┴\u001b[0m\u001b[0m─────────────────────────────────────────────────────────────────────────────────\u001b[0m\u001b[0m┘\n",
"\u001b[0m\u001b[0m"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"select\n",
" article_id,\n",
" headline_embedding,\n",
" vec_to_json(vec_slice(headline_embedding, 0, 8))\n",
"from vec_headlines\n",
"limit 10;"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Solite",
"language": "sql",
"name": "solite"
},
"language_info": {
"file_extension": ".sql",
"mimetype": "text/x.sqlite",
"name": "sql",
"nb_converter": "script",
"pygments_lexer": "sql",
"version": "TODO"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

File diff suppressed because it is too large Load diff

View file

@ -1,8 +1,2 @@
deps: all-MiniLM-L6-v2.e4ce9877.q8_0.gguf:
curl -L https://github.com/asg017/sqlite-rembed/releases/download/v0.0.1-alpha.9/install.sh | sh curl -L -o $@ https://huggingface.co/asg017/sqlite-lembed-model-examples/resolve/main/all-MiniLM-L6-v2/all-MiniLM-L6-v2.e4ce9877.q8_0.gguf
curl -L https://github.com/asg017/sqlite-vec/releases/download/v0.0.1-alpha.37/install.sh | sh
wget https://github.com/Mozilla-Ocho/llamafile/releases/download/0.8.11/llamafile-0.8.11
wget https://huggingface.co/leliuga/all-MiniLM-L6-v2-GGUF/resolve/main/all-MiniLM-L6-v2.F16.gguf
.PHONY: deps

View file

@ -1 +1,4 @@
https://www.nbcnews.com/archive/articles/last-seven-days - `headlines-2024.db`
- 14.5k rows
- 4.4MB

View file

@ -0,0 +1,26 @@
-- Timing comparison: embed the same 1000 headlines from `articles` once with
-- lembed() and once with rembed(); `.timer on` reports each statement's run time.
.open tmp.db
--.load ./vec0
.load ./lembed0
.timer on
-- Register the GGUF model file under the name 'default' for lembed().
insert into lembed_models(name, model)
values (
'default',
lembed_model_from_file('all-MiniLM-L6-v2.e4ce9877.q8_0.gguf')
);
-- Run 1: lembed() over the first 1000 headlines.
-- NOTE(review): sum() presumably exists only to force every row to be embedded
-- while printing a single value; the sum itself is not meaningful -- confirm.
with subset as (
select headline from articles limit 1000
)
select sum(lembed(headline)) from subset;
.load ./rembed0
-- Register a rembed client named 'default' configured with the 'llamafile' option.
insert into rembed_clients(name, options)
values ('default','llamafile');
-- Run 2: the same 1000 headlines through rembed() using the 'default' client.
with subset as (
select headline from articles limit 1000
)
select sum(rembed('default', headline)) from subset;

View file

@ -2,30 +2,44 @@
.header on .header on
.bail on .bail on
.load ./vec0 begin;
.load ./rembed0
insert into rembed_clients(name, options) create virtual table fts_headlines using fts5(
values ('llamafile', 'llamafile'); headline,
content='articles', content_rowid='id'
create table articles as
select
value ->> 'url' as url,
value ->> 'headline' as headline,
rembed('llamafile', value ->> 'headline') as headline_embedding
from json_each(
readfile('2024-07-26.json')
); );
select writefile( insert into fts_headlines(rowid, headline)
'articles.json', select rowid, headline
json_group_array( from articles;
json_object(
'id', rowid, INSERT INTO fts_headlines(fts_headlines) VALUES('optimize');
'url', url,
'headline', headline, .timer on
'headline_embedding', vec_to_json(headline_embedding)
) .load ../../dist/vec0
) .load ./lembed0
)
insert into lembed_models(name, model) values
('default', lembed_model_from_file('all-MiniLM-L6-v2.e4ce9877.q8_0.gguf'));
create virtual table vec_headlines using vec0(
article_id integer primary key,
headline_embedding float[384]
);
-- 1m23s
insert into vec_headlines(article_id, headline_embedding)
select
rowid,
lembed(headline)
from articles; from articles;
commit;
-- rembed vec0 INSERT: 10m17s
-- before: 4.37 MB
-- /w fts content: 5.35 MB (+0.98 MB)
-- with optimize 5.30 MB (-0.049 MB)
-- w/ fts: 6.67 MB (+2.30 MB)
-- sum(octet_length(headline)): 1.16 MB

View file

@ -1,34 +0,0 @@
import Database from "better-sqlite3";
import * as sqliteVec from "sqlite-vec";
import { pipeline } from "@xenova/transformers";
// Semantic-search demo: embed a fixed query string and print the 8 nearest
// headlines stored in the vec_articles table of articles.db.

// KNN query; the single `?` is bound to the query embedding.
const knnSql = `
select
article_id,
headline,
distance
from vec_articles
left join articles on articles.id = vec_articles.article_id
where headline_embedding match ?
and k = 8;
`;

const database = new Database("articles.db");
sqliteVec.load(database);

// Feature-extraction pipeline used to turn the query text into a vector.
const embed = await pipeline("feature-extraction", "Xenova/all-MiniLM-L6-v2");

const searchText = "sports";
const embedded = await embed([searchText], { pooling: "mean", normalize: true });

const knnStatement = database.prepare(knnSql);
const matches = knnStatement.all(embedded.data);
console.log(matches);

View file

@ -1,31 +0,0 @@
import sqlite3
import sqlite_vec
from sentence_transformers import SentenceTransformer
# Semantic-search demo: embed a fixed query string and print the 8 nearest
# headlines stored in the vec_articles table of articles.db.

# KNN query; the single `?` is bound to the query embedding.
KNN_SQL = """
select
article_id,
headline,
distance
from vec_articles
left join articles on articles.id = vec_articles.article_id
where headline_embedding match ?
and k = 8;
"""

connection = sqlite3.connect("articles.db")
# Extension loading is enabled only while sqlite-vec is being loaded.
connection.enable_load_extension(True)
sqlite_vec.load(connection)
connection.enable_load_extension(False)

encoder = SentenceTransformer("all-MiniLM-L6-v2")
search_text = "sports"
search_vector = encoder.encode(search_text)

cursor = connection.execute(KNN_SQL, [search_vector])
for row in cursor.fetchall():
    article_id, headline, distance = row
    print(article_id, headline, distance)

View file

@ -1,54 +0,0 @@
-- Demo: load headlines and their pre-computed embeddings from articles.json
-- into an `articles` table plus a vec0 virtual table, then run a KNN query for
-- a text query embedded with rembed().
.load vec0
.header on
.bail on
.timer on

-- Stage the JSON export: one row per array element.
create temp table raw_articles as
  select
    value ->> 'id' as id,
    value ->> 'url' as url,
    value ->> 'headline' as headline,
    value ->> 'headline_embedding' as headline_embedding
  from json_each(
    readfile('articles.json')
  );

create table articles(
  id integer primary key,
  headline text,
  url text
);

insert into articles(id, headline, url)
  select id, headline, url from temp.raw_articles;

select * from articles limit 5;

-- Vector index keyed by article id; embeddings have 384 float dimensions.
create virtual table vec_articles using vec0(
  article_id integer primary key,
  headline_embedding float[384]
);

insert into vec_articles(article_id, headline_embedding)
  select id, headline_embedding from temp.raw_articles;

select * from articles limit 5;

-- Preview the stored vectors. article_id and headline_embedding are columns of
-- vec_articles; `articles` only has id/headline/url, so selecting them from
-- `articles` failed and, with `.bail on`, aborted the rest of the script.
select article_id, vec_to_json(headline_embedding) from vec_articles limit 5;

.param set :query 'sports'

.load ./rembed0
insert into rembed_clients values ('all-MiniLM-L6-v2', 'llamafile');

.mode qbox -ww

-- 10 nearest headlines to the embedded :query text.
select
  article_id,
  --headline,
  distance
from vec_articles
--left join articles on articles.id = vec_articles.article_id
where headline_embedding match rembed('all-MiniLM-L6-v2', :query)
  and k = 10;

View file

@ -0,0 +1,897 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"vscode": {
"languageId": "sql"
}
},
"outputs": [
{
"data": {
"text/plain": [
"[no code]"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
".open tmp.db\n",
".load ../../dist/vec0\n",
".load ./rembed0\n",
"\n",
"insert into rembed_clients(name, options)\n",
" values ('snowflake-arctic-embed-m-v1.5', 'llamafile');\n",
"\n",
"--select vec_version(), rembed_debug();"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## FTS Search"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"vscode": {
"languageId": "sql"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table>\n",
"<thead>\n",
"<tr style=\"text-align: center;\">\n",
"<th>\n",
"rowid\n",
"</th>\n",
"<th>\n",
"headline_highlighted\n",
"</th>\n",
"<th>\n",
"rank\n",
"</th>\n",
"</tr>\n",
"</thead>\n",
"<tbody>\n",
"<tr>\n",
"<td >\n",
"4666\n",
"</td>\n",
"<td style=\"text-align: left;\">\n",
"Kamala Harris visits &lt;b&gt;Planned&lt;/b&gt; &lt;b&gt;Parenthood&lt;/b&gt; clinic\n",
"</td>\n",
"<td >\n",
"-18.9139950477264\n",
"</td>\n",
"</tr>\n",
"<tr>\n",
"<td >\n",
"6521\n",
"</td>\n",
"<td style=\"text-align: left;\">\n",
"Former Marine sentenced to 9 years in prison for firebombing &lt;b&gt;Planned&lt;/b&gt; &lt;b&gt;Parenthood&lt;/b&gt; clinic\n",
"</td>\n",
"<td >\n",
"-14.807022703838651\n",
"</td>\n",
"</tr>\n",
"</tbody>\n",
"</table>\n",
"<div style=\"text-align: right;\">\n",
"2 rows × 3 columns\n",
"</div>\n",
"</div>\n"
],
"text/plain": [
"\u001b[0m┌\u001b[0m\u001b[0m───────\u001b[0m\u001b[0m┬\u001b[0m\u001b[0m──────────────────────────────────────────────────────────────────────────────────────────────────────\u001b[0m\u001b[0m┬\u001b[0m\u001b[0m─────────────────────\u001b[0m\u001b[0m┐\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mrowid\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mheadline_highlighted \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mrank \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m├\u001b[0m\u001b[0m───────\u001b[0m\u001b[0m┼\u001b[0m\u001b[0m──────────────────────────────────────────────────────────────────────────────────────────────────────\u001b[0m\u001b[0m┼\u001b[0m\u001b[0m─────────────────────\u001b[0m\u001b[0m┤\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 4666\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mKamala Harris visits <b>Planned</b> <b>Parenthood</b> clinic \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m -18.9139950477264\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 6521\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mFormer Marine sentenced to 9 years in prison for firebombing <b>Planned</b> <b>Parenthood</b> clinic\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m-14.807022703838651\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m└\u001b[0m\u001b[0m───────\u001b[0m\u001b[0m┴\u001b[0m\u001b[0m──────────────────────────────────────────────────────────────────────────────────────────────────────\u001b[0m\u001b[0m┴\u001b[0m\u001b[0m─────────────────────\u001b[0m\u001b[0m┘\n",
"\u001b[0m\u001b[0m"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
".param set query planned parenthood\n",
"\n",
"select\n",
" rowid,\n",
" highlight(fts_headlines, 0, '<b>', '</b>') as headline_highlighted,\n",
" rank\n",
"from fts_headlines\n",
"where headline match :query\n",
"order by rank\n",
"limit 10;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Vector Search"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"vscode": {
"languageId": "sql"
}
},
"outputs": [
{
"ename": "SQL logic error",
"evalue": "Error sending HTTP request: http://localhost:8080/embedding: Connection Failed: Connect error: Connection refused (os error 61)",
"output_type": "error",
"traceback": []
}
],
"source": [
".param set query planned parenthood\n",
"\n",
"select\n",
" article_id,\n",
" articles.headline,\n",
" distance\n",
"from vec_headlines\n",
"left join articles on articles.rowid = vec_headlines.article_id\n",
"where headline_embedding match vec_normalize(vec_slice(rembed('snowflake-arctic-embed-m-v1.5', :query), 0, 256))\n",
" and k = 10;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## FTS + Vector search: RRF"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"vscode": {
"languageId": "sql"
}
},
"outputs": [
{
"ename": "SQL logic error",
"evalue": "SQL logic error (1) : Error sending HTTP request: http://localhost:8080/embedding: Connection Failed: Connect error: Connection refused (os error 61)",
"output_type": "error",
"traceback": []
}
],
"source": [
".param set query planned parenthood\n",
"\n",
"insert or replace into temp.sqlite_parameters\n",
" values ('query_embedding', rembed('snowflake-arctic-embed-m-v1.5', :query));\n",
"\n",
".param set weight_fts 1.0\n",
".param set weight_vec 1.0\n",
".param set rrf_k 60\n",
".param set k 10\n",
"\n",
"\n",
"-- Hybrid search: fuse the top :k vector matches and the top :k FTS matches\n",
"-- with Reciprocal Rank Fusion (RRF): score = sum(weight / (:rrf_k + rank)).\n",
"with vec_matches as (\n",
"  select\n",
"    article_id,\n",
"    row_number() over (order by distance) as rank_number,\n",
"    distance\n",
"  from vec_headlines\n",
"  where\n",
"    -- re-normalize after truncating to 256 dimensions, as the Vector Search cell does\n",
"    headline_embedding match vec_normalize(vec_slice(:query_embedding, 0, 256))\n",
"    and k = :k\n",
"  order by distance\n",
"),\n",
"fts_matches as (\n",
"  select\n",
"    rowid,\n",
"    row_number() over (order by rank) as rank_number,\n",
"    rank as score\n",
"  from fts_headlines\n",
"  where headline match :query\n",
"  -- sort before limiting so the best :k matches are the ones kept\n",
"  order by rank\n",
"  limit :k\n",
"),\n",
"final as (\n",
"  select\n",
"    articles.id,\n",
"    articles.headline,\n",
"    vec_matches.distance as vector_distance,\n",
"    fts_matches.score as fts_score,\n",
"    -- RRF is computed from each match's rank position, not its row id;\n",
"    -- a side with no match for the article contributes 0\n",
"    coalesce(1.0 / (:rrf_k + fts_matches.rank_number), 0.0) * :weight_fts +\n",
"    coalesce(1.0 / (:rrf_k + vec_matches.rank_number), 0.0) * :weight_vec\n",
"      as combined_score\n",
"  from fts_matches\n",
"  full outer join vec_matches on vec_matches.article_id = fts_matches.rowid\n",
"  join articles on articles.rowid = coalesce(fts_matches.rowid, vec_matches.article_id)\n",
"  order by combined_score desc\n",
")\n",
"select * from final;\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"vscode": {
"languageId": "sql"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table>\n",
"<thead>\n",
"<tr style=\"text-align: center;\">\n",
"<th>\n",
"json(&#39;[1,2,3,4]&#39;)\n",
"</th>\n",
"<th>\n",
"vec_f32(X&#39;AABBCCDD&#39;)\n",
"</th>\n",
"</tr>\n",
"</thead>\n",
"<tbody>\n",
"<tr>\n",
"<td style=\"color: red\">\n",
"[1,2,3,4]\n",
"</td>\n",
"<td style=\"color: blue\">\n",
"Blob&lt;4&gt;\n",
"</td>\n",
"</tr>\n",
"</tbody>\n",
"</table>\n",
"<div style=\"text-align: right;\">\n",
"1 row × 2 columns\n",
"</div>\n",
"</div>\n"
],
"text/plain": [
"\u001b[0m┌\u001b[0m\u001b[0m───────────────────\u001b[0m\u001b[0m┬\u001b[0m\u001b[0m──────────────────────\u001b[0m\u001b[0m┐\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mjson('[1,2,3,4]')\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mvec_f32(X'AABBCCDD')\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m├\u001b[0m\u001b[0m───────────────────\u001b[0m\u001b[0m┼\u001b[0m\u001b[0m──────────────────────\u001b[0m\u001b[0m┤\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m[1,2,3,4] \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m Blob<4> \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m└\u001b[0m\u001b[0m───────────────────\u001b[0m\u001b[0m┴\u001b[0m\u001b[0m──────────────────────\u001b[0m\u001b[0m┘\n",
"\u001b[0m\u001b[0m"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"select json('[1,2,3,4]'), vec_f32(X'AABBCCDD')"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"vscode": {
"languageId": "sql"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table>\n",
"<thead>\n",
"<tr style=\"text-align: center;\">\n",
"<th>\n",
"addr\n",
"</th>\n",
"<th>\n",
"opcode\n",
"</th>\n",
"<th>\n",
"p1\n",
"</th>\n",
"<th>\n",
"p2\n",
"</th>\n",
"<th>\n",
"p3\n",
"</th>\n",
"<th>\n",
"p4\n",
"</th>\n",
"<th>\n",
"p5\n",
"</th>\n",
"<th>\n",
"comment\n",
"</th>\n",
"<th>\n",
"subprog\n",
"</th>\n",
"<th>\n",
"nexec\n",
"</th>\n",
"<th>\n",
"ncycle\n",
"</th>\n",
"</tr>\n",
"</thead>\n",
"<tbody>\n",
"<tr>\n",
"<td >\n",
"0\n",
"</td>\n",
"<td style=\"text-align: left;\">\n",
"Init\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"<td >\n",
"12\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"<td >\n",
"\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"<td >\n",
"\n",
"</td>\n",
"<td >\n",
"\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"</tr>\n",
"<tr>\n",
"<td >\n",
"1\n",
"</td>\n",
"<td style=\"text-align: left;\">\n",
"Null\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"<td >\n",
"1\n",
"</td>\n",
"<td >\n",
"1\n",
"</td>\n",
"<td >\n",
"\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"<td >\n",
"\n",
"</td>\n",
"<td >\n",
"\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"</tr>\n",
"<tr>\n",
"<td >\n",
"2\n",
"</td>\n",
"<td style=\"text-align: left;\">\n",
"VOpen\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"<td style=\"text-align: left;\">\n",
"vtab:7FAC27505B30\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"<td >\n",
"\n",
"</td>\n",
"<td >\n",
"\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"</tr>\n",
"<tr>\n",
"<td >\n",
"3\n",
"</td>\n",
"<td style=\"text-align: left;\">\n",
"Integer\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"<td >\n",
"2\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"<td >\n",
"\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"<td >\n",
"\n",
"</td>\n",
"<td >\n",
"\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"</tr>\n",
"<tr>\n",
"<td >\n",
"4\n",
"</td>\n",
"<td style=\"text-align: left;\">\n",
"Integer\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"<td >\n",
"3\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"<td >\n",
"\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"<td >\n",
"\n",
"</td>\n",
"<td >\n",
"\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"</tr>\n",
"<tr>\n",
"<td >\n",
"5\n",
"</td>\n",
"<td style=\"text-align: left;\">\n",
"VFilter\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"<td >\n",
"8\n",
"</td>\n",
"<td >\n",
"2\n",
"</td>\n",
"<td >\n",
"\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"<td >\n",
"\n",
"</td>\n",
"<td >\n",
"\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"</tr>\n",
"<tr>\n",
"<td >\n",
"6\n",
"</td>\n",
"<td style=\"text-align: left;\">\n",
"AggStep\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"<td >\n",
"1\n",
"</td>\n",
"<td style=\"text-align: left;\">\n",
"count(0)\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"<td >\n",
"\n",
"</td>\n",
"<td >\n",
"\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"</tr>\n",
"<tr>\n",
"<td >\n",
"7\n",
"</td>\n",
"<td style=\"text-align: left;\">\n",
"VNext\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"<td >\n",
"6\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"<td >\n",
"\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"<td >\n",
"\n",
"</td>\n",
"<td >\n",
"\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"</tr>\n",
"<tr>\n",
"<td >\n",
"8\n",
"</td>\n",
"<td style=\"text-align: left;\">\n",
"AggFinal\n",
"</td>\n",
"<td >\n",
"1\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"<td style=\"text-align: left;\">\n",
"count(0)\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"<td >\n",
"\n",
"</td>\n",
"<td >\n",
"\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"</tr>\n",
"<tr>\n",
"<td >\n",
"9\n",
"</td>\n",
"<td style=\"text-align: left;\">\n",
"Copy\n",
"</td>\n",
"<td >\n",
"1\n",
"</td>\n",
"<td >\n",
"4\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"<td >\n",
"\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"<td >\n",
"\n",
"</td>\n",
"<td >\n",
"\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"</tr>\n",
"<tr>\n",
"<td >\n",
"10\n",
"</td>\n",
"<td style=\"text-align: left;\">\n",
"ResultRow\n",
"</td>\n",
"<td >\n",
"4\n",
"</td>\n",
"<td >\n",
"1\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"<td >\n",
"\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"<td >\n",
"\n",
"</td>\n",
"<td >\n",
"\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"</tr>\n",
"<tr>\n",
"<td >\n",
"11\n",
"</td>\n",
"<td style=\"text-align: left;\">\n",
"Halt\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"<td >\n",
"\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"<td >\n",
"\n",
"</td>\n",
"<td >\n",
"\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"</tr>\n",
"<tr>\n",
"<td >\n",
"12\n",
"</td>\n",
"<td style=\"text-align: left;\">\n",
"Transaction\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"<td >\n",
"11\n",
"</td>\n",
"<td style=\"text-align: left;\">\n",
"0\n",
"</td>\n",
"<td >\n",
"1\n",
"</td>\n",
"<td >\n",
"\n",
"</td>\n",
"<td >\n",
"\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"</tr>\n",
"<tr>\n",
"<td >\n",
"13\n",
"</td>\n",
"<td style=\"text-align: left;\">\n",
"Goto\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"<td >\n",
"1\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"<td >\n",
"\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"<td >\n",
"\n",
"</td>\n",
"<td >\n",
"\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"<td >\n",
"0\n",
"</td>\n",
"</tr>\n",
"</tbody>\n",
"</table>\n",
"<div style=\"text-align: right;\">\n",
"14 rows × 11 columns\n",
"</div>\n",
"</div>\n"
],
"text/plain": [
"\u001b[0m┌\u001b[0m\u001b[0m──────\u001b[0m\u001b[0m┬\u001b[0m\u001b[0m─────────────\u001b[0m\u001b[0m┬\u001b[0m\u001b[0m────\u001b[0m\u001b[0m┬\u001b[0m\u001b[0m────\u001b[0m\u001b[0m┬\u001b[0m\u001b[0m────\u001b[0m\u001b[0m┬\u001b[0m\u001b[0m───────────────────\u001b[0m\u001b[0m┬\u001b[0m\u001b[0m────\u001b[0m\u001b[0m┬\u001b[0m\u001b[0m─────────\u001b[0m\u001b[0m┬\u001b[0m\u001b[0m─────────\u001b[0m\u001b[0m┬\u001b[0m\u001b[0m───────\u001b[0m\u001b[0m┬\u001b[0m\u001b[0m────────\u001b[0m\u001b[0m┐\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0maddr\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mopcode \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mp1\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mp2\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mp3\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mp4 \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mp5\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mcomment\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0msubprog\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mnexec\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mncycle\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m├\u001b[0m\u001b[0m──────\u001b[0m\u001b[0m┼\u001b[0m\u001b[0m─────────────\u001b[0m\u001b[0m┼\u001b[0m\u001b[0m────\u001b[0m\u001b[0m┼\u001b[0m\u001b[0m────\u001b[0m\u001b[0m┼\u001b[0m\u001b[0m────\u001b[0m\u001b[0m┼\u001b[0m\u001b[0m───────────────────\u001b[0m\u001b[0m┼\u001b[0m\u001b[0m────\u001b[0m\u001b[0m┼\u001b[0m\u001b[0m─────────\u001b[0m\u001b[0m┼\u001b[0m\u001b[0m─────────\u001b[0m\u001b[0m┼\u001b[0m\u001b[0m───────\u001b[0m\u001b[0m┼\u001b[0m\u001b[0m────────\u001b[0m\u001b[0m┤\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mInit \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m12\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m\u001b[3m NULL \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m\u001b[3m NULL \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m\u001b[3m NULL \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 1\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mNull \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 1\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 1\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m\u001b[3m NULL \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m\u001b[3m NULL \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m\u001b[3m NULL \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 2\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mVOpen \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mvtab:7FAC27505B30\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m\u001b[3m NULL \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m\u001b[3m NULL \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 3\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mInteger \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 2\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m\u001b[3m NULL \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m\u001b[3m NULL \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m\u001b[3m NULL \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 4\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mInteger \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 3\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m\u001b[3m NULL \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m\u001b[3m NULL \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m\u001b[3m NULL \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 5\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mVFilter \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 8\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 2\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m\u001b[3m NULL \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m\u001b[3m NULL \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m\u001b[3m NULL \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 6\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mAggStep \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 1\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mcount(0) \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m\u001b[3m NULL \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m\u001b[3m NULL \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 7\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mVNext \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 6\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m\u001b[3m NULL \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m\u001b[3m NULL \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m\u001b[3m NULL \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 8\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mAggFinal \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 1\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mcount(0) \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m\u001b[3m NULL \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m\u001b[3m NULL \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 9\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mCopy \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 1\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 4\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m\u001b[3m NULL \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m\u001b[3m NULL \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m\u001b[3m NULL \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 10\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mResultRow \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 4\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 1\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m\u001b[3m NULL \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m\u001b[3m NULL \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m\u001b[3m NULL \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 11\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mHalt \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m\u001b[3m NULL \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m\u001b[3m NULL \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m\u001b[3m NULL \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 12\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mTransaction\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m11\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m0 \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 1\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m\u001b[3m NULL \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m\u001b[3m NULL \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 13\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mGoto \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 1\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m\u001b[3m NULL \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m\u001b[3m NULL \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m\u001b[3m NULL \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m└\u001b[0m\u001b[0m──────\u001b[0m\u001b[0m┴\u001b[0m\u001b[0m─────────────\u001b[0m\u001b[0m┴\u001b[0m\u001b[0m────\u001b[0m\u001b[0m┴\u001b[0m\u001b[0m────\u001b[0m\u001b[0m┴\u001b[0m\u001b[0m────\u001b[0m\u001b[0m┴\u001b[0m\u001b[0m───────────────────\u001b[0m\u001b[0m┴\u001b[0m\u001b[0m────\u001b[0m\u001b[0m┴\u001b[0m\u001b[0m─────────\u001b[0m\u001b[0m┴\u001b[0m\u001b[0m─────────\u001b[0m\u001b[0m┴\u001b[0m\u001b[0m───────\u001b[0m\u001b[0m┴\u001b[0m\u001b[0m────────\u001b[0m\u001b[0m┘\n",
"\u001b[0m\u001b[0m"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"select * from bytecode('select count(*) from pragma_table_list;')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Solite",
"language": "sql",
"name": "solite"
},
"language_info": {
"file_extension": ".sql",
"mimetype": "text/x.sqlite",
"name": "sqlite",
"nb_converter": "script",
"pygments_lexer": "sql",
"version": "TODO"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View file

@ -1,56 +0,0 @@
<html>
  <body>
    <h1>sqlite-vec articles.db demo</h1>
    <!--
      Downloads articles.db, opens it in the sqlite-vec WASM build, embeds a
      query string in the browser, and prints the 8 nearest headlines as JSON.
    -->
    <script type="module">
      import { default as init } from "https://cdn.jsdelivr.net/npm/sqlite-vec-wasm-demo@latest/sqlite3.mjs";
      import { pipeline } from "https://cdn.jsdelivr.net/npm/@xenova/transformers";

      const sqlite3 = await init();

      // Check the HTTP status before reading the body: otherwise an error
      // page (e.g. a 404) would be handed to SQLite as if it were a database.
      const response = await fetch("articles.db");
      if (!response.ok) {
        throw new Error(
          `Failed to fetch articles.db: ${response.status} ${response.statusText}`,
        );
      }
      const dbContents = await response.arrayBuffer();

      // Copy the downloaded bytes into WASM memory and attach them as the
      // "main" database of a fresh connection. FREEONCLOSE asks SQLite to
      // release that buffer when the connection is closed.
      const db = new sqlite3.oo1.DB();
      const p = sqlite3.wasm.allocFromTypedArray(dbContents);
      const rc = sqlite3.capi.sqlite3_deserialize(
        db.pointer,
        "main",
        p,
        dbContents.byteLength,
        dbContents.byteLength,
        sqlite3.capi.SQLITE_DESERIALIZE_FREEONCLOSE,
      );
      db.checkRc(rc);

      // Embed the query text with mean pooling and normalization.
      const extractor = await pipeline(
        "feature-extraction",
        "Xenova/all-MiniLM-L6-v2",
      );
      const query = "sports";
      const queryEmbedding = await extractor([query], {
        pooling: "mean",
        normalize: true,
      });

      // KNN query: the embedding's underlying buffer is bound to the `?`.
      const rows = db
        .selectObjects(
          `
          select
            article_id,
            headline,
            distance
          from vec_articles
          left join articles on articles.id = vec_articles.article_id
          where headline_embedding match ?
            and k = 8;
          `,
          queryEmbedding.data.buffer,
        );

      const output = document.body.appendChild(document.createElement('pre'));
      output.innerText = JSON.stringify(rows, null, 2);
    </script>
  </body>
</html>

View file

@ -1,801 +0,0 @@
{
"name": "nbc-headlines",
"version": "1.0.0",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "nbc-headlines",
"version": "1.0.0",
"license": "ISC",
"dependencies": {
"@xenova/transformers": "^2.17.2",
"better-sqlite3": "^11.1.2",
"sqlite-vec": "^0.0.1-alpha.37"
}
},
"node_modules/@huggingface/jinja": {
"version": "0.2.2",
"license": "MIT",
"engines": {
"node": ">=18"
}
},
"node_modules/@protobufjs/aspromise": {
"version": "1.1.2",
"resolved": "https://registry.npmjs.org/@protobufjs/aspromise/-/aspromise-1.1.2.tgz",
"integrity": "sha512-j+gKExEuLmKwvz3OgROXtrJ2UG2x8Ch2YZUxahh+s1F2HZ+wAceUNLkvy6zKCPVRkU++ZWQrdxsUeQXmcg4uoQ==",
"license": "BSD-3-Clause"
},
"node_modules/@protobufjs/base64": {
"version": "1.1.2",
"resolved": "https://registry.npmjs.org/@protobufjs/base64/-/base64-1.1.2.tgz",
"integrity": "sha512-AZkcAA5vnN/v4PDqKyMR5lx7hZttPDgClv83E//FMNhR2TMcLUhfRUBHCmSl0oi9zMgDDqRUJkSxO3wm85+XLg==",
"license": "BSD-3-Clause"
},
"node_modules/@protobufjs/codegen": {
"version": "2.0.4",
"resolved": "https://registry.npmjs.org/@protobufjs/codegen/-/codegen-2.0.4.tgz",
"integrity": "sha512-YyFaikqM5sH0ziFZCN3xDC7zeGaB/d0IUb9CATugHWbd1FRFwWwt4ld4OYMPWu5a3Xe01mGAULCdqhMlPl29Jg==",
"license": "BSD-3-Clause"
},
"node_modules/@protobufjs/eventemitter": {
"version": "1.1.0",
"resolved": "https://registry.npmjs.org/@protobufjs/eventemitter/-/eventemitter-1.1.0.tgz",
"integrity": "sha512-j9ednRT81vYJ9OfVuXG6ERSTdEL1xVsNgqpkxMsbIabzSo3goCjDIveeGv5d03om39ML71RdmrGNjG5SReBP/Q==",
"license": "BSD-3-Clause"
},
"node_modules/@protobufjs/fetch": {
"version": "1.1.0",
"resolved": "https://registry.npmjs.org/@protobufjs/fetch/-/fetch-1.1.0.tgz",
"integrity": "sha512-lljVXpqXebpsijW71PZaCYeIcE5on1w5DlQy5WH6GLbFryLUrBD4932W/E2BSpfRJWseIL4v/KPgBFxDOIdKpQ==",
"license": "BSD-3-Clause",
"dependencies": {
"@protobufjs/aspromise": "^1.1.1",
"@protobufjs/inquire": "^1.1.0"
}
},
"node_modules/@protobufjs/float": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/@protobufjs/float/-/float-1.0.2.tgz",
"integrity": "sha512-Ddb+kVXlXst9d+R9PfTIxh1EdNkgoRe5tOX6t01f1lYWOvJnSPDBlG241QLzcyPdoNTsblLUdujGSE4RzrTZGQ==",
"license": "BSD-3-Clause"
},
"node_modules/@protobufjs/inquire": {
"version": "1.1.0",
"resolved": "https://registry.npmjs.org/@protobufjs/inquire/-/inquire-1.1.0.tgz",
"integrity": "sha512-kdSefcPdruJiFMVSbn801t4vFK7KB/5gd2fYvrxhuJYg8ILrmn9SKSX2tZdV6V+ksulWqS7aXjBcRXl3wHoD9Q==",
"license": "BSD-3-Clause"
},
"node_modules/@protobufjs/path": {
"version": "1.1.2",
"resolved": "https://registry.npmjs.org/@protobufjs/path/-/path-1.1.2.tgz",
"integrity": "sha512-6JOcJ5Tm08dOHAbdR3GrvP+yUUfkjG5ePsHYczMFLq3ZmMkAD98cDgcT2iA1lJ9NVwFd4tH/iSSoe44YWkltEA==",
"license": "BSD-3-Clause"
},
"node_modules/@protobufjs/pool": {
"version": "1.1.0",
"resolved": "https://registry.npmjs.org/@protobufjs/pool/-/pool-1.1.0.tgz",
"integrity": "sha512-0kELaGSIDBKvcgS4zkjz1PeddatrjYcmMWOlAuAPwAeccUrPHdUqo/J6LiymHHEiJT5NrF1UVwxY14f+fy4WQw==",
"license": "BSD-3-Clause"
},
"node_modules/@protobufjs/utf8": {
"version": "1.1.0",
"resolved": "https://registry.npmjs.org/@protobufjs/utf8/-/utf8-1.1.0.tgz",
"integrity": "sha512-Vvn3zZrhQZkkBE8LSuW3em98c0FwgO4nxzv6OdSxPKJIEKY2bGbHn+mhGIPerzI4twdxaP8/0+06HBpwf345Lw==",
"license": "BSD-3-Clause"
},
"node_modules/@types/long": {
"version": "4.0.2",
"resolved": "https://registry.npmjs.org/@types/long/-/long-4.0.2.tgz",
"integrity": "sha512-MqTGEo5bj5t157U6fA/BiDynNkn0YknVdh48CMPkTSpFTVmvao5UQmm7uEF6xBEo7qIMAlY/JSleYaE6VOdpaA==",
"license": "MIT"
},
"node_modules/@types/node": {
"version": "20.14.12",
"resolved": "https://registry.npmjs.org/@types/node/-/node-20.14.12.tgz",
"integrity": "sha512-r7wNXakLeSsGT0H1AU863vS2wa5wBOK4bWMjZz2wj+8nBx+m5PeIn0k8AloSLpRuiwdRQZwarZqHE4FNArPuJQ==",
"license": "MIT",
"dependencies": {
"undici-types": "~5.26.4"
}
},
"node_modules/@xenova/transformers": {
"version": "2.17.2",
"resolved": "https://registry.npmjs.org/@xenova/transformers/-/transformers-2.17.2.tgz",
"integrity": "sha512-lZmHqzrVIkSvZdKZEx7IYY51TK0WDrC8eR0c5IMnBsO8di8are1zzw8BlLhyO2TklZKLN5UffNGs1IJwT6oOqQ==",
"license": "Apache-2.0",
"dependencies": {
"@huggingface/jinja": "^0.2.2",
"onnxruntime-web": "1.14.0",
"sharp": "^0.32.0"
},
"optionalDependencies": {
"onnxruntime-node": "1.14.0"
}
},
"node_modules/b4a": {
"version": "1.6.6",
"license": "Apache-2.0"
},
"node_modules/bare-events": {
"version": "2.4.2",
"license": "Apache-2.0",
"optional": true
},
"node_modules/bare-fs": {
"version": "2.3.1",
"license": "Apache-2.0",
"optional": true,
"dependencies": {
"bare-events": "^2.0.0",
"bare-path": "^2.0.0",
"bare-stream": "^2.0.0"
}
},
"node_modules/bare-os": {
"version": "2.4.0",
"license": "Apache-2.0",
"optional": true
},
"node_modules/bare-path": {
"version": "2.1.3",
"license": "Apache-2.0",
"optional": true,
"dependencies": {
"bare-os": "^2.1.0"
}
},
"node_modules/bare-stream": {
"version": "2.1.3",
"license": "Apache-2.0",
"optional": true,
"dependencies": {
"streamx": "^2.18.0"
}
},
"node_modules/base64-js": {
"version": "1.5.1",
"funding": [
{
"type": "github",
"url": "https://github.com/sponsors/feross"
},
{
"type": "patreon",
"url": "https://www.patreon.com/feross"
},
{
"type": "consulting",
"url": "https://feross.org/support"
}
],
"license": "MIT"
},
"node_modules/better-sqlite3": {
"version": "11.1.2",
"resolved": "https://registry.npmjs.org/better-sqlite3/-/better-sqlite3-11.1.2.tgz",
"integrity": "sha512-gujtFwavWU4MSPT+h9B+4pkvZdyOUkH54zgLdIrMmmmd4ZqiBIrRNBzNzYVFO417xo882uP5HBu4GjOfaSrIQw==",
"hasInstallScript": true,
"license": "MIT",
"dependencies": {
"bindings": "^1.5.0",
"prebuild-install": "^7.1.1"
}
},
"node_modules/bindings": {
"version": "1.5.0",
"license": "MIT",
"dependencies": {
"file-uri-to-path": "1.0.0"
}
},
"node_modules/bl": {
"version": "4.1.0",
"license": "MIT",
"dependencies": {
"buffer": "^5.5.0",
"inherits": "^2.0.4",
"readable-stream": "^3.4.0"
}
},
"node_modules/buffer": {
"version": "5.7.1",
"funding": [
{
"type": "github",
"url": "https://github.com/sponsors/feross"
},
{
"type": "patreon",
"url": "https://www.patreon.com/feross"
},
{
"type": "consulting",
"url": "https://feross.org/support"
}
],
"license": "MIT",
"dependencies": {
"base64-js": "^1.3.1",
"ieee754": "^1.1.13"
}
},
"node_modules/chownr": {
"version": "1.1.4",
"license": "ISC"
},
"node_modules/color": {
"version": "4.2.3",
"license": "MIT",
"dependencies": {
"color-convert": "^2.0.1",
"color-string": "^1.9.0"
},
"engines": {
"node": ">=12.5.0"
}
},
"node_modules/color-convert": {
"version": "2.0.1",
"license": "MIT",
"dependencies": {
"color-name": "~1.1.4"
},
"engines": {
"node": ">=7.0.0"
}
},
"node_modules/color-name": {
"version": "1.1.4",
"license": "MIT"
},
"node_modules/color-string": {
"version": "1.9.1",
"license": "MIT",
"dependencies": {
"color-name": "^1.0.0",
"simple-swizzle": "^0.2.2"
}
},
"node_modules/decompress-response": {
"version": "6.0.0",
"license": "MIT",
"dependencies": {
"mimic-response": "^3.1.0"
},
"engines": {
"node": ">=10"
},
"funding": {
"url": "https://github.com/sponsors/sindresorhus"
}
},
"node_modules/deep-extend": {
"version": "0.6.0",
"license": "MIT",
"engines": {
"node": ">=4.0.0"
}
},
"node_modules/detect-libc": {
"version": "2.0.3",
"license": "Apache-2.0",
"engines": {
"node": ">=8"
}
},
"node_modules/end-of-stream": {
"version": "1.4.4",
"license": "MIT",
"dependencies": {
"once": "^1.4.0"
}
},
"node_modules/expand-template": {
"version": "2.0.3",
"license": "(MIT OR WTFPL)",
"engines": {
"node": ">=6"
}
},
"node_modules/fast-fifo": {
"version": "1.3.2",
"license": "MIT"
},
"node_modules/file-uri-to-path": {
"version": "1.0.0",
"license": "MIT"
},
"node_modules/flatbuffers": {
"version": "1.12.0",
"license": "SEE LICENSE IN LICENSE.txt"
},
"node_modules/fs-constants": {
"version": "1.0.0",
"license": "MIT"
},
"node_modules/github-from-package": {
"version": "0.0.0",
"license": "MIT"
},
"node_modules/guid-typescript": {
"version": "1.0.9",
"license": "ISC"
},
"node_modules/ieee754": {
"version": "1.2.1",
"funding": [
{
"type": "github",
"url": "https://github.com/sponsors/feross"
},
{
"type": "patreon",
"url": "https://www.patreon.com/feross"
},
{
"type": "consulting",
"url": "https://feross.org/support"
}
],
"license": "BSD-3-Clause"
},
"node_modules/inherits": {
"version": "2.0.4",
"license": "ISC"
},
"node_modules/ini": {
"version": "1.3.8",
"license": "ISC"
},
"node_modules/is-arrayish": {
"version": "0.3.2",
"license": "MIT"
},
"node_modules/long": {
"version": "4.0.0",
"license": "Apache-2.0"
},
"node_modules/mimic-response": {
"version": "3.1.0",
"license": "MIT",
"engines": {
"node": ">=10"
},
"funding": {
"url": "https://github.com/sponsors/sindresorhus"
}
},
"node_modules/minimist": {
"version": "1.2.8",
"license": "MIT",
"funding": {
"url": "https://github.com/sponsors/ljharb"
}
},
"node_modules/mkdirp-classic": {
"version": "0.5.3",
"license": "MIT"
},
"node_modules/napi-build-utils": {
"version": "1.0.2",
"license": "MIT"
},
"node_modules/node-abi": {
"version": "3.65.0",
"license": "MIT",
"dependencies": {
"semver": "^7.3.5"
},
"engines": {
"node": ">=10"
}
},
"node_modules/node-addon-api": {
"version": "6.1.0",
"license": "MIT"
},
"node_modules/once": {
"version": "1.4.0",
"license": "ISC",
"dependencies": {
"wrappy": "1"
}
},
"node_modules/onnx-proto": {
"version": "4.0.4",
"license": "MIT",
"dependencies": {
"protobufjs": "^6.8.8"
}
},
"node_modules/onnx-proto/node_modules/protobufjs": {
"version": "6.11.4",
"resolved": "https://registry.npmjs.org/protobufjs/-/protobufjs-6.11.4.tgz",
"integrity": "sha512-5kQWPaJHi1WoCpjTGszzQ32PG2F4+wRY6BmAT4Vfw56Q2FZ4YZzK20xUYQH4YkfehY1e6QSICrJquM6xXZNcrw==",
"hasInstallScript": true,
"license": "BSD-3-Clause",
"dependencies": {
"@protobufjs/aspromise": "^1.1.2",
"@protobufjs/base64": "^1.1.2",
"@protobufjs/codegen": "^2.0.4",
"@protobufjs/eventemitter": "^1.1.0",
"@protobufjs/fetch": "^1.1.0",
"@protobufjs/float": "^1.0.2",
"@protobufjs/inquire": "^1.1.0",
"@protobufjs/path": "^1.1.2",
"@protobufjs/pool": "^1.1.0",
"@protobufjs/utf8": "^1.1.0",
"@types/long": "^4.0.1",
"@types/node": ">=13.7.0",
"long": "^4.0.0"
},
"bin": {
"pbjs": "bin/pbjs",
"pbts": "bin/pbts"
}
},
"node_modules/onnxruntime-common": {
"version": "1.14.0",
"license": "MIT"
},
"node_modules/onnxruntime-node": {
"version": "1.14.0",
"resolved": "https://registry.npmjs.org/onnxruntime-node/-/onnxruntime-node-1.14.0.tgz",
"integrity": "sha512-5ba7TWomIV/9b6NH/1x/8QEeowsb+jBEvFzU6z0T4mNsFwdPqXeFUM7uxC6QeSRkEbWu3qEB0VMjrvzN/0S9+w==",
"license": "MIT",
"optional": true,
"os": [
"win32",
"darwin",
"linux"
],
"dependencies": {
"onnxruntime-common": "~1.14.0"
}
},
"node_modules/onnxruntime-web": {
"version": "1.14.0",
"resolved": "https://registry.npmjs.org/onnxruntime-web/-/onnxruntime-web-1.14.0.tgz",
"integrity": "sha512-Kcqf43UMfW8mCydVGcX9OMXI2VN17c0p6XvR7IPSZzBf/6lteBzXHvcEVWDPmCKuGombl997HgLqj91F11DzXw==",
"license": "MIT",
"dependencies": {
"flatbuffers": "^1.12.0",
"guid-typescript": "^1.0.9",
"long": "^4.0.0",
"onnx-proto": "^4.0.4",
"onnxruntime-common": "~1.14.0",
"platform": "^1.3.6"
}
},
"node_modules/platform": {
"version": "1.3.6",
"license": "MIT"
},
"node_modules/prebuild-install": {
"version": "7.1.2",
"license": "MIT",
"dependencies": {
"detect-libc": "^2.0.0",
"expand-template": "^2.0.3",
"github-from-package": "0.0.0",
"minimist": "^1.2.3",
"mkdirp-classic": "^0.5.3",
"napi-build-utils": "^1.0.1",
"node-abi": "^3.3.0",
"pump": "^3.0.0",
"rc": "^1.2.7",
"simple-get": "^4.0.0",
"tar-fs": "^2.0.0",
"tunnel-agent": "^0.6.0"
},
"bin": {
"prebuild-install": "bin.js"
},
"engines": {
"node": ">=10"
}
},
"node_modules/pump": {
"version": "3.0.0",
"license": "MIT",
"dependencies": {
"end-of-stream": "^1.1.0",
"once": "^1.3.1"
}
},
"node_modules/queue-tick": {
"version": "1.0.1",
"license": "MIT"
},
"node_modules/rc": {
"version": "1.2.8",
"license": "(BSD-2-Clause OR MIT OR Apache-2.0)",
"dependencies": {
"deep-extend": "^0.6.0",
"ini": "~1.3.0",
"minimist": "^1.2.0",
"strip-json-comments": "~2.0.1"
},
"bin": {
"rc": "cli.js"
}
},
"node_modules/readable-stream": {
"version": "3.6.2",
"license": "MIT",
"dependencies": {
"inherits": "^2.0.3",
"string_decoder": "^1.1.1",
"util-deprecate": "^1.0.1"
},
"engines": {
"node": ">= 6"
}
},
"node_modules/safe-buffer": {
"version": "5.2.1",
"funding": [
{
"type": "github",
"url": "https://github.com/sponsors/feross"
},
{
"type": "patreon",
"url": "https://www.patreon.com/feross"
},
{
"type": "consulting",
"url": "https://feross.org/support"
}
],
"license": "MIT"
},
"node_modules/semver": {
"version": "7.6.3",
"license": "ISC",
"bin": {
"semver": "bin/semver.js"
},
"engines": {
"node": ">=10"
}
},
"node_modules/sharp": {
"version": "0.32.6",
"hasInstallScript": true,
"license": "Apache-2.0",
"dependencies": {
"color": "^4.2.3",
"detect-libc": "^2.0.2",
"node-addon-api": "^6.1.0",
"prebuild-install": "^7.1.1",
"semver": "^7.5.4",
"simple-get": "^4.0.1",
"tar-fs": "^3.0.4",
"tunnel-agent": "^0.6.0"
},
"engines": {
"node": ">=14.15.0"
},
"funding": {
"url": "https://opencollective.com/libvips"
}
},
"node_modules/sharp/node_modules/tar-fs": {
"version": "3.0.6",
"license": "MIT",
"dependencies": {
"pump": "^3.0.0",
"tar-stream": "^3.1.5"
},
"optionalDependencies": {
"bare-fs": "^2.1.1",
"bare-path": "^2.1.0"
}
},
"node_modules/sharp/node_modules/tar-stream": {
"version": "3.1.7",
"license": "MIT",
"dependencies": {
"b4a": "^1.6.4",
"fast-fifo": "^1.2.0",
"streamx": "^2.15.0"
}
},
"node_modules/simple-concat": {
"version": "1.0.1",
"funding": [
{
"type": "github",
"url": "https://github.com/sponsors/feross"
},
{
"type": "patreon",
"url": "https://www.patreon.com/feross"
},
{
"type": "consulting",
"url": "https://feross.org/support"
}
],
"license": "MIT"
},
"node_modules/simple-get": {
"version": "4.0.1",
"funding": [
{
"type": "github",
"url": "https://github.com/sponsors/feross"
},
{
"type": "patreon",
"url": "https://www.patreon.com/feross"
},
{
"type": "consulting",
"url": "https://feross.org/support"
}
],
"license": "MIT",
"dependencies": {
"decompress-response": "^6.0.0",
"once": "^1.3.1",
"simple-concat": "^1.0.0"
}
},
"node_modules/simple-swizzle": {
"version": "0.2.2",
"license": "MIT",
"dependencies": {
"is-arrayish": "^0.3.1"
}
},
"node_modules/sqlite-vec": {
"version": "0.0.1-alpha.37",
"resolved": "https://registry.npmjs.org/sqlite-vec/-/sqlite-vec-0.0.1-alpha.37.tgz",
"integrity": "sha512-ZbdIGPLXVr7cxy9pjwxT8MYBd342bLVbLdV3ZiZz0FpdT5yi0lXyYSh/BEtt0FNrRTwXSLDLL3saTKih6KYzaQ==",
"license": "MIT OR Apache",
"optionalDependencies": {
"sqlite-vec-darwin-arm64": "0.0.1-alpha.37",
"sqlite-vec-darwin-x64": "0.0.1-alpha.37",
"sqlite-vec-linux-x64": "0.0.1-alpha.37",
"sqlite-vec-windows-x64": "0.0.1-alpha.37"
}
},
"node_modules/sqlite-vec-darwin-arm64": {
"version": "0.0.1-alpha.37",
"resolved": "https://registry.npmjs.org/sqlite-vec-darwin-arm64/-/sqlite-vec-darwin-arm64-0.0.1-alpha.37.tgz",
"integrity": "sha512-3cPdW8JbNVZ08bwsCMsAVv0atZsvRY6Co3bgbiOVjwduBYNd3cTCC6Q+ICjz+H4hTEMZNqGdIWEeCTLgbvnEPw==",
"cpu": [
"arm64"
],
"license": "MIT OR Apache",
"optional": true,
"os": [
"darwin"
]
},
"node_modules/sqlite-vec-darwin-x64": {
"version": "0.0.1-alpha.37",
"resolved": "https://registry.npmjs.org/sqlite-vec-darwin-x64/-/sqlite-vec-darwin-x64-0.0.1-alpha.37.tgz",
"integrity": "sha512-/56xiUbONFw+g3x/UlzGP2ykMDkrQ10trZVxk2Mmshd1y1QOqCoJoWzzcFm3OdaHI/Gz9vOJdCA8Py1QAr7Xqg==",
"cpu": [
"x64"
],
"license": "MIT OR Apache",
"optional": true,
"os": [
"darwin"
]
},
"node_modules/sqlite-vec-linux-x64": {
"version": "0.0.1-alpha.37",
"resolved": "https://registry.npmjs.org/sqlite-vec-linux-x64/-/sqlite-vec-linux-x64-0.0.1-alpha.37.tgz",
"integrity": "sha512-BEyesm7Vo4EzE+ZQR1CY4M+lHNQufG26q1rkT5LKv6bIyPpebd+zs+hC0ndJmiNjLDNqqsbCsLQoTfLpGG2Urg==",
"cpu": [
"x64"
],
"license": "MIT OR Apache",
"optional": true,
"os": [
"linux"
]
},
"node_modules/sqlite-vec-windows-x64": {
"version": "0.0.1-alpha.37",
"resolved": "https://registry.npmjs.org/sqlite-vec-windows-x64/-/sqlite-vec-windows-x64-0.0.1-alpha.37.tgz",
"integrity": "sha512-jPSrUQNuFXy0Y6weSGS3ivU6pZp9nrvHHP3rseXFHuVj+pYK7gN6h1bC20J9Ch/BVz+9S2cVouoO/HvV90+E8w==",
"cpu": [
"x64"
],
"license": "MIT OR Apache",
"optional": true,
"os": [
"windows"
]
},
"node_modules/streamx": {
"version": "2.18.0",
"license": "MIT",
"dependencies": {
"fast-fifo": "^1.3.2",
"queue-tick": "^1.0.1",
"text-decoder": "^1.1.0"
},
"optionalDependencies": {
"bare-events": "^2.2.0"
}
},
"node_modules/string_decoder": {
"version": "1.3.0",
"license": "MIT",
"dependencies": {
"safe-buffer": "~5.2.0"
}
},
"node_modules/strip-json-comments": {
"version": "2.0.1",
"license": "MIT",
"engines": {
"node": ">=0.10.0"
}
},
"node_modules/tar-fs": {
"version": "2.1.1",
"license": "MIT",
"dependencies": {
"chownr": "^1.1.1",
"mkdirp-classic": "^0.5.2",
"pump": "^3.0.0",
"tar-stream": "^2.1.4"
}
},
"node_modules/tar-stream": {
"version": "2.2.0",
"license": "MIT",
"dependencies": {
"bl": "^4.0.3",
"end-of-stream": "^1.4.1",
"fs-constants": "^1.0.0",
"inherits": "^2.0.3",
"readable-stream": "^3.1.1"
},
"engines": {
"node": ">=6"
}
},
"node_modules/text-decoder": {
"version": "1.1.1",
"license": "Apache-2.0",
"dependencies": {
"b4a": "^1.6.4"
}
},
"node_modules/tunnel-agent": {
"version": "0.6.0",
"license": "Apache-2.0",
"dependencies": {
"safe-buffer": "^5.0.1"
},
"engines": {
"node": "*"
}
},
"node_modules/undici-types": {
"version": "5.26.5",
"resolved": "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz",
"integrity": "sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==",
"license": "MIT"
},
"node_modules/util-deprecate": {
"version": "1.0.2",
"license": "MIT"
},
"node_modules/wrappy": {
"version": "1.0.2",
"license": "ISC"
}
}
}

View file

@ -1,17 +0,0 @@
{
"name": "nbc-headlines",
"version": "1.0.0",
"main": "demo.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"keywords": [],
"author": "",
"license": "ISC",
"description": "",
"dependencies": {
"@xenova/transformers": "^2.17.2",
"better-sqlite3": "^11.1.2",
"sqlite-vec": "^0.0.1-alpha.37"
}
}

View file

@ -0,0 +1,57 @@
-- Setup for the hybrid (full-text + vector) headline search below.
-- Intended for the sqlite3 command-line shell: the dot-commands are shell
-- directives, not SQL.

-- Load the extensions used by the query. vec0 is loaded from the repository's
-- dist/ directory and rembed0 from the current directory.
-- NOTE(review): presumably these are sqlite-vec and sqlite-rembed - confirm.
.load ../../dist/vec0
.load ./rembed0
-- Register an embeddings client under the model name that rembed() is called
-- with further down. NOTE(review): the 'llamafile' option presumably selects a
-- locally running llamafile server - confirm against the rembed documentation.
insert into rembed_clients(name, options)
values ('snowflake-arctic-embed-m-v1.5', 'llamafile');
-- Shell behaviour: stop at the first error, draw results as a box table with
-- column headers, and print timing for each statement.
.bail on
.mode box
.header on
.timer on
-- Query parameters referenced by the statement below.
-- :query is the search text, used for the full-text match and as the input to
-- the embedding expression.
.param set :query 'death row'
-- Per-retriever weights applied to each reciprocal-rank term.
.param set :weight_fts 1.0
.param set :weight_vec 1.0
-- Constant added to the denominator of each reciprocal-rank term.
.param set :rrf_k 60
-- Expression producing the query vector from :query.
-- NOTE(review): presumably embeds the text, keeps the first 256 dimensions and
-- normalizes the result - confirm that the shell evaluates this expression
-- (including the nested :query reference) rather than storing it as text.
.param set :query_embedding "vec_normalize(vec_slice(rembed('snowflake-arctic-embed-m-v1.5', :query), 0, 256))"
-- Number of candidates requested from each retriever.
.param set :k 10
-- Banner row that labels the result printed next.
select 'Hybrid w/ RRF' as "";
-- Hybrid search over article headlines using Reciprocal Rank Fusion (RRF).
--
-- Two retrievers each return up to :k candidates together with their 1-based
-- position (rank_number) in that retriever's ordering:
--   vec_matches - nearest neighbours of :query_embedding in vec_headlines
--   fts_matches - full-text matches of :query in fts_headlines
-- The candidates are merged with a full outer join, so a headline found by
-- only one retriever is kept, and scored as
--   weight_fts / (rrf_k + fts_rank) + weight_vec / (rrf_k + vec_rank)
-- where a missing rank contributes 0.
with vec_matches as (
  select
    article_id,
    row_number() over (order by distance) as rank_number,
    distance
  from vec_headlines
  where
    headline_embedding match :query_embedding
    and k = :k
  order by distance
),
fts_matches as (
  select
    rowid,
    --highlight(fts_headlines, 0, '<b>', '</b>') as headline_highlighted,
    row_number() over (order by rank) as rank_number,
    rank as score
  from fts_headlines
  where headline match :query
  -- Order explicitly so that the limit keeps the best :k matches instead of
  -- whichever rows the scan happens to produce first.
  order by rank
  limit :k
),
final as (
  select
    articles.id,
    articles.headline,
    vec_matches.distance as vector_distance,
    fts_matches.score as fts_score,
    -- RRF is computed from each retriever's rank position, never from row
    -- identifiers. coalesce() turns the NULL produced by the outer join for
    -- a retriever that did not return the row into a zero contribution.
    coalesce(1.0 / (:rrf_k + fts_matches.rank_number), 0.0) * :weight_fts +
    coalesce(1.0 / (:rrf_k + vec_matches.rank_number), 0.0) * :weight_vec
      as combined_score
  from fts_matches
  full outer join vec_matches on vec_matches.article_id = fts_matches.rowid
  join articles on articles.rowid = coalesce(fts_matches.rowid, vec_matches.article_id)
  order by combined_score desc
)
select * from final;

View file

@ -1,144 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import { parseHTML } from \"npm:linkedom\";\n",
"import { Database, Statement } from \"jsr:@db/sqlite@0.11\";\n",
"import * as sqlitePath from \"npm:sqlite-path\";\n",
"import * as sqliteUrl from \"npm:sqlite-url\";\n",
"import * as sqliteRegex from \"npm:sqlite-regex\";"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"const months = [\"january\", \"february\", \"march\", \"april\", \"may\", \"june\", \"july\", \"august\", \"september\", \"october\", \"november\", \"december\"]\n",
"\n",
"const db = new Database(\":memory:\")\n",
"db.enableLoadExtension = true;\n",
"db.loadExtension(sqlitePath.getLoadablePath());\n",
"db.loadExtension(sqliteUrl.getLoadablePath());\n",
"db.loadExtension(sqliteRegex.getLoadablePath());\n",
"db.enableLoadExtension = false;\n",
"\n",
"db.exec(`\n",
" CREATE TABLE articles(\n",
" slug_id TEXT,\n",
" slug TEXT,\n",
" headline TEXT,\n",
" url TEXT,\n",
" year integer,\n",
" month integer,\n",
" category1 TEXT,\n",
" category2 TEXT\n",
" )\n",
"`)\n",
"\n",
"const stmt = db.prepare(`\n",
" insert into articles\n",
" select\n",
" regex_capture(\n",
" '(?P<slug>.+)-(?P<id>rcna\\\\d+)',\n",
" path_at(url_path(:url), -1),\n",
" 'id'\n",
" ) as id,\n",
" regex_capture(\n",
" '(?P<slug>.+)-(?P<id>rcna\\\\d+)',\n",
" path_at(url_path(:url), -1),\n",
" 'slug'\n",
" ) as slug,\n",
" :headline as headline,\n",
" :url as url,\n",
" :year as year,\n",
" :month as month,\n",
" path_at(url_path(:url), 0) as category1,\n",
" iif(\n",
" path_length(url_path(:url)) > 2,\n",
" path_at(url_path(:url), 1),\n",
" null\n",
" ) as category2\n",
"`);\n",
"\n",
"const insertArticles = db.transaction((year, month, articles) => {\n",
" for(const article of articles) {\n",
" stmt.run({...article, year, month})\n",
" }\n",
"})\n",
"\n",
"async function insertMonth(year:number, month: string) {\n",
" const monthPage = await fetch(`https://www.nbcnews.com/archive/articles/${year}/${month}`).then(r=>r.text())\n",
" const {document:monthPageDoc} = parseHTML(monthPage);\n",
" const monthEntries = monthPageDoc\n",
" .querySelectorAll('.MonthPage a')\n",
" .map(a => ({headline: a.innerText, url: a.getAttribute('href')}));\n",
" insertArticles(year, months.findIndex(m => m === month)+1, monthEntries)\n",
"}\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"for(let year = 2014; year <= 2023; year++) {\n",
" for(const month of months) {\n",
" console.log(year, month);\n",
" await insertMonth(year, month);\n",
" }\n",
"}\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"db.exec(\"vacuum into 'articles.db'\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"db.sql`select * from articles order by random() limit 10`"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Deno",
"language": "typescript",
"name": "deno"
},
"language_info": {
"codemirror_mode": "typescript",
"file_extension": ".ts",
"mimetype": "text/x.typescript",
"name": "typescript",
"nbconvert_exporter": "script",
"pygments_lexer": "typescript",
"version": "5.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}