sqlite-vec/tests/test-metadata.py
2024-11-14 11:32:48 -08:00

394 lines
14 KiB
Python

import sqlite3
from collections import OrderedDict
def test_constructor_limit(db, snapshot):
assert exec(
db,
f"""
create virtual table v using vec0(
{",".join([f"metadata{x} integer" for x in range(17)])}
v float[1]
)
""",
) == snapshot(name="max 16 metadata columns")
def test_normal(db, snapshot):
db.execute(
"create virtual table v using vec0(vector float[1], b boolean, n int, f float, t text, chunk_size=8)"
)
assert exec(
db, "select * from sqlite_master where type = 'table' order by name"
) == snapshot(name="sqlite_master")
assert vec0_shadow_table_contents(db, "v") == snapshot()
INSERT = "insert into v(vector, b, n, f, t) values (?, ?, ?, ?, ?)"
assert exec(db, INSERT, [b"\x11\x11\x11\x11", 1, 1, 1.1, "one"]) == snapshot()
assert exec(db, INSERT, [b"\x22\x22\x22\x22", 1, 2, 2.2, "two"]) == snapshot()
assert exec(db, INSERT, [b"\x33\x33\x33\x33", 1, 3, 3.3, "three"]) == snapshot()
assert exec(db, "select * from v") == snapshot()
assert vec0_shadow_table_contents(db, "v") == snapshot()
assert exec(db, "drop table v") == snapshot()
assert exec(db, "select * from sqlite_master") == snapshot()
#
# assert exec(db, "select * from v") == snapshot()
# assert vec0_shadow_table_contents(db, "v") == snapshot()
#
# db.execute("drop table v;")
# assert exec(db, "select * from sqlite_master order by name") == snapshot(
# name="sqlite_master post drop"
# )
def test_long_text(db, snapshot):
db.execute(
"create virtual table v using vec0(vector float[1], name text, chunk_size=8)"
)
assert vec0_shadow_table_contents(db, "v") == snapshot()
INSERT = "insert into v(vector, name) values (?, ?)"
assert exec(db, INSERT, [b"\x11\x11\x11\x11", "123456789a12"])
assert exec(db, INSERT, [b"\x11\x11\x11\x11", "123456789a123"])
assert exec(db, "select * from v") == snapshot()
assert vec0_shadow_table_contents(db, "v") == snapshot()
def test_types(db, snapshot):
db.execute(
"create virtual table v using vec0(vector float[1], b boolean, n int, f float, t text, chunk_size=8)"
)
INSERT = "insert into v(vector, b, n, f, t) values (?, ?, ?, ?, ?)"
assert exec(db, INSERT, [b"\x11\x11\x11\x11", 1, 1, 1.1, "test"]) == snapshot(
name="legal"
)
# fmt: off
assert exec(db, INSERT, [b"\x11\x11\x11\x11", 'illegal', 1, 1.1, 'test']) == snapshot(name="illegal-type-boolean")
assert exec(db, INSERT, [b"\x11\x11\x11\x11", 1, 'illegal', 1.1, 'test']) == snapshot(name="illegal-type-int")
assert exec(db, INSERT, [b"\x11\x11\x11\x11", 1, 1, 'illegal', 'test']) == snapshot(name="illegal-type-float")
assert exec(db, INSERT, [b"\x11\x11\x11\x11", 1, 1, 1.1, 420]) == snapshot(name="illegal-type-text")
# fmt: on
assert exec(db, INSERT, [b"\x11\x11\x11\x11", 44, 1, 1.1, "test"]) == snapshot(
name="illegal-boolean"
)
def test_updates(db, snapshot):
db.execute(
"create virtual table v using vec0(vector float[1], b boolean, n int, f float, t text, chunk_size=8)"
)
INSERT = "insert into v(rowid, vector, b, n, f, t) values (?, ?, ?, ?, ?, ?)"
exec(db, INSERT, [1, b"\x11\x11\x11\x11", 1, 1, 1.1, "test1"])
exec(db, INSERT, [2, b"\x22\x22\x22\x22", 1, 2, 2.2, "test2"])
exec(db, INSERT, [3, b"\x33\x33\x33\x33", 1, 3, 3.3, "1234567890123"])
assert exec(db, "select * from v") == snapshot(name="1-init")
assert vec0_shadow_table_contents(db, "v") == snapshot(name="1-init")
assert exec(
db, "UPDATE v SET b = 0, n = 11, f = 11.11, t = 'newtest1' where rowid = 1"
)
assert exec(db, "select * from v") == snapshot(name="general-update-contents")
assert vec0_shadow_table_contents(db, "v") == snapshot(
name="general-update-shaodnw"
)
# string update #1: long string updated to long string
exec(db, "UPDATE v SET t = '1234567890123-updated' where rowid = 3")
assert exec(db, "select * from v") == snapshot(name="string-update-1-contents")
assert vec0_shadow_table_contents(db, "v") == snapshot(
name="string-update-1-shadow"
)
# string update #2: short string updated to short string
exec(db, "UPDATE v SET t = 'test2-short' where rowid = 2")
assert exec(db, "select * from v") == snapshot(name="string-update-2-contents")
assert vec0_shadow_table_contents(db, "v") == snapshot(
name="string-update-2-shadow"
)
# string update #3: short string updated to long string
exec(db, "UPDATE v SET t = 'test2-long-long-long' where rowid = 2")
assert exec(db, "select * from v") == snapshot(name="string-update-3-contents")
assert vec0_shadow_table_contents(db, "v") == snapshot(
name="string-update-3-shadow"
)
# string update #4: long string updated to short string
exec(db, "UPDATE v SET t = 'test2-shortx' where rowid = 2")
assert exec(db, "select * from v") == snapshot(name="string-update-4-contents")
assert vec0_shadow_table_contents(db, "v") == snapshot(
name="string-update-4-shadow"
)
def test_deletes(db, snapshot):
db.execute(
"create virtual table v using vec0(vector float[1], b boolean, n int, f float, t text, chunk_size=8)"
)
INSERT = "insert into v(rowid, vector, b, n, f, t) values (?, ?, ?, ?, ?, ?)"
assert exec(db, INSERT, [1, b"\x11\x11\x11\x11", 1, 1, 1.1, "test1"]) == snapshot()
assert exec(db, INSERT, [2, b"\x22\x22\x22\x22", 1, 2, 2.2, "test2"]) == snapshot()
assert (
exec(db, INSERT, [3, b"\x33\x33\x33\x33", 1, 3, 3.3, "1234567890123"])
== snapshot()
)
assert exec(db, "select * from v") == snapshot()
assert vec0_shadow_table_contents(db, "v") == snapshot()
assert exec(db, "DELETE FROM v where rowid = 1") == snapshot()
assert exec(db, "select * from v") == snapshot()
assert vec0_shadow_table_contents(db, "v") == snapshot()
assert exec(db, "DELETE FROM v where rowid = 3") == snapshot()
assert exec(db, "select * from v") == snapshot()
assert vec0_shadow_table_contents(db, "v") == snapshot()
def test_knn(db, snapshot):
db.execute(
"create virtual table v using vec0(vector float[1], name text, chunk_size=8)"
)
assert exec(
db, "select * from sqlite_master where type = 'table' order by name"
) == snapshot(name="sqlite_master")
db.executemany(
"insert into v(vector, name) values (?, ?)",
[("[1]", "alex"), ("[2]", "brian"), ("[3]", "craig")],
)
# EVIDENCE-OF: V16511_00582 catches "illegal" constraints on metadata columns
assert (
exec(
db,
"select *, distance from v where vector match '[5]' and k = 3 and name like 'illegal'",
)
== snapshot()
)
def test_idxstr(db, snapshot):
db.execute(
"""
create virtual table vec_movies using vec0(
movie_id integer primary key,
synopsis_embedding float[1],
+title text,
is_favorited boolean,
genre text,
num_reviews int,
mean_rating float,
chunk_size=8
);
"""
)
assert (
eqp(
db,
"select * from vec_movies where synopsis_embedding match '' and k = 0 and is_favorited = true",
)
== snapshot()
)
ops = ["<", ">", "<=", ">=", "!="]
for op in ops:
assert eqp(
db,
f"select * from vec_movies where synopsis_embedding match '' and k = 0 and genre {op} NULL",
) == snapshot(name=f"knn-constraint-text {op}")
for op in ops:
assert eqp(
db,
f"select * from vec_movies where synopsis_embedding match '' and k = 0 and num_reviews {op} NULL",
) == snapshot(name=f"knn-constraint-int {op}")
for op in ops:
assert eqp(
db,
f"select * from vec_movies where synopsis_embedding match '' and k = 0 and mean_rating {op} NULL",
) == snapshot(name=f"knn-constraint-float {op}")
# for op in ops:
# assert eqp(
# db,
# f"select * from vec_movies where synopsis_embedding match '' and k = 0 and is_favorited {op} NULL",
# ) == snapshot(name=f"knn-constraint-boolean {op}")
def eqp(db, sql):
o = OrderedDict()
o["sql"] = sql
o["plan"] = [
dict(row) for row in db.execute(f"explain query plan {sql}").fetchall()
]
return o
def test_stress(db, snapshot):
db.execute(
"""
create virtual table vec_movies using vec0(
movie_id integer primary key,
synopsis_embedding float[1],
+title text,
is_favorited boolean,
genre text,
num_reviews int,
mean_rating float,
chunk_size=8
);
"""
)
db.execute(
"""
INSERT INTO vec_movies(movie_id, synopsis_embedding, is_favorited, genre, title, num_reviews, mean_rating)
VALUES
(1, '[1]', 0, 'horror', 'The Conjuring', 153, 4.6),
(2, '[2]', 0, 'comedy', 'Dumb and Dumber', 382, 2.6),
(3, '[3]', 0, 'scifi', 'Interstellar', 53, 5.0),
(4, '[4]', 0, 'fantasy', 'The Lord of the Rings: The Fellowship of the Ring', 210, 4.2),
(5, '[5]', 1, 'documentary', 'An Inconvenient Truth', 93, 3.4),
(6, '[6]', 1, 'horror', 'Hereditary', 167, 4.7),
(7, '[7]', 1, 'comedy', 'Anchorman: The Legend of Ron Burgundy', 482, 2.9),
(8, '[8]', 0, 'scifi', 'Blade Runner 2049', 301, 5.0),
(9, '[9]', 1, 'fantasy', 'Harry Potter and the Sorcerer''s Stone', 134, 4.1),
(10, '[10]', 0, 'documentary', 'Free Solo', 66, 3.2),
(11, '[11]', 1, 'horror', 'Get Out', 88, 4.9),
(12, '[12]', 0, 'comedy', 'The Hangover', 59, 2.8),
(13, '[13]', 1, 'scifi', 'The Matrix', 423, 4.5),
(14, '[14]', 0, 'fantasy', 'Pan''s Labyrinth', 275, 3.6),
(15, '[15]', 1, 'documentary', '13th', 191, 4.4),
(16, '[16]', 0, 'horror', 'It Follows', 314, 4.3),
(17, '[17]', 1, 'comedy', 'Step Brothers', 74, 3.0),
(18, '[18]', 1, 'scifi', 'Inception', 201, 5.0),
(19, '[19]', 1, 'fantasy', 'The Shape of Water', 399, 2.7),
(20, '[20]', 1, 'documentary', 'Won''t You Be My Neighbor?', 186, 4.8),
(21, '[21]', 1, 'scifi', 'Gravity', 342, 4.0),
(22, '[22]', 1, 'scifi', 'Dune', 451, 4.4),
(23, '[23]', 1, 'scifi', 'The Martian', 522, 4.6),
(24, '[24]', 1, 'horror', 'A Quiet Place', 271, 4.3),
(25, '[25]', 1, 'fantasy', 'The Chronicles of Narnia: The Lion, the Witch and the Wardrobe', 310, 3.9);
"""
)
assert vec0_shadow_table_contents(db, "vec_movies") == snapshot()
assert (
exec(
db,
"""
select
movie_id,
title,
genre,
num_reviews,
mean_rating,
is_favorited,
distance
from vec_movies
where synopsis_embedding match '[15.5]'
and genre = 'scifi'
and num_reviews between 100 and 500
and mean_rating > 3.5
and k = 5;
""",
)
== snapshot()
)
assert (
exec(
db,
"select movie_id, genre, distance from vec_movies where synopsis_embedding match '[100]' and k = 5 and genre = 'horror'",
)
== snapshot()
)
assert (
exec(
db,
"select movie_id, genre, distance from vec_movies where synopsis_embedding match '[100]' and k = 5 and genre = 'comedy'",
)
== snapshot()
)
assert (
exec(
db,
"select movie_id, num_reviews, distance from vec_movies where synopsis_embedding match '[100]' and k = 5 and num_reviews between 100 and 500",
)
== snapshot()
)
assert (
exec(
db,
"select movie_id, num_reviews, distance from vec_movies where synopsis_embedding match '[100]' and k = 5 and num_reviews >= 500",
)
== snapshot()
)
assert (
exec(
db,
"select movie_id, mean_rating, distance from vec_movies where synopsis_embedding match '[100]' and k = 5 and mean_rating < 3.0",
)
== snapshot()
)
assert (
exec(
db,
"select movie_id, mean_rating, distance from vec_movies where synopsis_embedding match '[100]' and k = 5 and mean_rating between 4.0 and 5.0",
)
== snapshot()
)
assert (
exec(
db,
"select movie_id, mean_rating, distance from vec_movies where synopsis_embedding match '[100]' and k = 5 and is_favorited = TRUE",
)
== snapshot()
)
def exec(db, sql, parameters=[]):
try:
rows = db.execute(sql, parameters).fetchall()
except (sqlite3.OperationalError, sqlite3.DatabaseError) as e:
return {
"error": e.__class__.__name__,
"message": str(e),
}
a = []
for row in rows:
o = OrderedDict()
for k in row.keys():
o[k] = row[k]
a.append(o)
result = OrderedDict()
result["sql"] = sql
result["rows"] = a
return result
def vec0_shadow_table_contents(db, v):
shadow_tables = [
row[0]
for row in db.execute(
"select name from sqlite_master where name like ? order by 1", [f"{v}_%"]
).fetchall()
]
o = {}
for shadow_table in shadow_tables:
o[shadow_table] = exec(db, f"select * from {shadow_table}")
return o