diff --git a/reference.yaml b/reference.yaml index b63021d..e0b9046 100644 --- a/reference.yaml +++ b/reference.yaml @@ -1,33 +1,45 @@ sections: - meta: - title: Meta - desc: TODO constructors: title: Constructors - desc: TODO + desc: | + SQL functions that "construct" vectors with different element types. + + Currently, only `float32`, `int8`, and `bit` vectors are supported. + op: title: Operations - desc: TODO + desc: | + Different operations and utilities for working with vectors. distance: title: Distance functions - desc: TODO + desc: Various algorithms to calculate distance between two vectors. quantization: title: Quantization - desc: TODO -functions: + desc: Various techniques to "compress" a vector by reducing precision and accuracy. + numpy: + title: "NumPy Utilities" + desc: Functions to read data from or work with [NumPy arrays](https://numpy.org/doc/stable/reference/generated/numpy.array.html). + meta: + title: Meta + desc: Helper functions to debug `sqlite-vec` installations. + entrypoints: + title: Entrypoints + desc: All the named entrypoints that load in different `sqlite-vec` functions and options. +# vec0: +# title: "vec0 Virtual Table" +# desc: TODO +meta: vec_version: params: [] - section: meta desc: Returns a version string of the current `sqlite-vec` installation. example: select vec_version(); vec_debug: params: [] - section: meta desc: Returns debugging information of the current `sqlite-vec` installation. example: select vec_debug(); +constructors: vec_f32: params: [vector] - section: constructors desc: | Creates a float vector from a BLOB or JSON text. If a BLOB is provided, the length must be divisible by 4, as a float takes up 4 bytes of space each. @@ -42,7 +54,6 @@ functions: - select vec_f32(X'AA'); vec_int8: params: [vector] - section: constructors desc: | Creates a 8-bit integer vector from a BLOB or JSON text. If a BLOB is provided, the length must be divisible by 4, as a float takes up 4 bytes of space each. 
@@ -60,19 +71,18 @@ functions: vec_bit: params: [vector] - section: constructors desc: | Creates a binary vector from a BLOB. - The returned value is a BLOB with 4 bytes per element, with a special [subtype](https://www.sqlite.org/c3ref/result_subtype.html) + The returned value is a BLOB with 1 byte per 8 elements, with a special [subtype](https://www.sqlite.org/c3ref/result_subtype.html) of `224`. example: - select vec_bit(X'F0'); - select subtype(vec_bit(X'F0')); - select vec_to_json(vec_bit(X'F0')); +op: vec_length: params: [vector] - section: op desc: | Returns the number of elements in the given vector. The vector can be `JSON`, `BLOB`, or the result of a [constructor function](#constructors). @@ -84,9 +94,20 @@ functions: - select vec_length(vec_int8(X'AABBCCDD')); - select vec_length(vec_bit(X'AABBCCDD')); - select vec_length(X'CCDD'); + vec_type: + params: [vector] + desc: | + Returns the name of the type of `vector` as text. One of `'float32'`, `'int8'`, or `'bit'`. + + This function will return an error if `vector` is invalid. + example: + - select vec_type('[.1, .2]'); + - select vec_type(X'AABBCCDD'); + - select vec_type(vec_int8(X'AABBCCDD')); + - select vec_type(vec_bit(X'AABBCCDD')); + - select vec_type(X'CCDD'); vec_add: params: [a, b] - section: op desc: | Adds every element in vector `a` with vector `b`, returning a new vector `c`. Both vectors must be of the same type and same length. Only `float32` and `int8` vectors are supported. @@ -119,7 +140,6 @@ functions: - select vec_add(vec_bit(X'AA'), vec_bit(X'BB')); vec_sub: params: [a, b] - section: op desc: | Subtracts every element in vector `a` with vector `b`, returning a new vector `c`. Both vectors must be of the same type and same length. Only `float32` and `int8` vectors are supported. @@ -152,7 +172,6 @@ functions: - select vec_sub(vec_bit(X'AA'), vec_bit(X'BB')); vec_normalize: params: [vector] - section: op desc: | Performs L2 normalization on the given vector. 
Only float32 vectors are currently supported. @@ -172,7 +191,6 @@ functions: ); vec_slice: params: [vector, start, end] - section: op desc: | Extract a subset of `vector` from the `start` element (inclusive) to the `end` element (exclusive). TODO check @@ -208,7 +226,6 @@ functions: ); vec_to_json: params: [vector] - section: op desc: | Represents a vector as JSON text. The input vector can be a vector BLOB or JSON text. @@ -219,10 +236,45 @@ functions: - select vec_to_json(vec_bit(X'AABBCCDD')); - select vec_to_json('[1,2,3,4]'); - select vec_to_json('invalid'); + vec_each: + params: [vector] + desc: | + A table function to iterate through every element in a vector. One row id returned per element in a vector. + ```sql + CREATE TABLE vec_each( + rowid int, -- The + vector HIDDEN -- input parameter: A well-formed vector value + ) + ``` + + Returns an error if `vector` is not a valid vector. + example: + - select rowid, value from vec_each('[1,2,3,4]'); + - select rowid, value from vec_each(X'AABBCCDD00112233'); + - select rowid, value from vec_each(vec_int8(X'AABBCCDD')); + - select rowid, value from vec_each(vec_bit(X'F0')); + +distance: + vec_distance_L2: + params: [a, b] + desc: | + Calculates the L2 euclidian distance between vectors `a` and `b`. Only valid for float32 or int8 vectors. + + Returns an error under the following conditions: + - `a` or `b` are invalid vectors + - `a` or `b` do not share the same vector element types (ex float32 or int8) + - `a` or `b` are bit vectors. Use [`vec_distance_hamming()`](#vec_distance_hamming) for distance calculations between two bitvectors. + - `a` or `b` do not have the same length. 
+ example: + - select vec_distance_L2('[1, 1]', '[2, 2]'); + - select vec_distance_L2('[1, 1]', '[-2, -2]'); + - select vec_distance_L2('[1.1, 2.2, 3.3]', '[4.4, 5.5, 6.6]'); + - select vec_distance_L2(X'AABBCCDD', X'00112233'); + - select vec_distance_L2('[1, 1]', vec_int8('[2, 2]')); + - select vec_distance_L2(vec_bit(X'AA'), vec_bit(X'BB')); vec_distance_cosine: params: [a, b] - section: distance desc: | Calculates the cosine distance between vectors `a` and `b`. Only valid for float32 or int8 vectors. @@ -236,9 +288,10 @@ functions: - select vec_distance_cosine('[1, 1]', '[-2, -2]'); - select vec_distance_cosine('[1.1, 2.2, 3.3]', '[4.4, 5.5, 6.6]'); - select vec_distance_cosine(X'AABBCCDD', X'00112233'); + - select vec_distance_cosine('[1, 1]', vec_int8('[2, 2]')); + - select vec_distance_cosine(vec_bit(X'AA'), vec_bit(X'BB')); vec_distance_hamming: params: [a, b] - section: distance desc: | Calculates the hamming distance between two bitvectors `a` and `b`. Only valid for bitvectors. @@ -250,34 +303,85 @@ functions: - select vec_distance_hamming(vec_bit(X'00'), vec_bit(X'FF')); - select vec_distance_hamming(vec_bit(X'FF'), vec_bit(X'FF')); - select vec_distance_hamming(vec_bit(X'F0'), vec_bit(X'44')); - - select vec_distance_hamming(X'F0', X'00'); - vec_distance_l2: - params: [a, b] - section: distance + - select vec_distance_hamming('[1, 1]', '[0, 0]'); + +quantization: + vec_quantize_binary: + params: [vector] + desc: | + Quantize a float32 or int8 vector into a bitvector. + For every element in the vector, a `1` is assigned to positive numbers and a `0` is assigned to negative numbers. + These values are then packed into a bit vector. + + Returns an error if `vector` is invalid, or if `vector` is not a float32 or int8 vector. 
+ example: + - select vec_quantize_binary('[1, 2, 3, 4, 5, 6, 7, 8]'); + - select vec_quantize_binary('[1, 2, 3, 4, -5, -6, -7, -8]'); + - select vec_quantize_binary('[-1, -2, -3, -4, -5, -6, -7, -8]'); + - select vec_quantize_binary('[-1, -2, -3, -4, -5, -6, -7, -8]'); + - select vec_quantize_binary(vec_int8(X'11223344')); + - select vec_quantize_binary(vec_bit(X'FF')); + vec_quantize_i8: + params: [vector, "[start]", "[end]"] desc: x example: select 'todo'; - vec_quantize_binary: +numpy: + vec_npy_each: params: [vector] - section: quantization - desc: x - example: select 'todo'; - vec_quantize_i8: - params: [vector, "[start]", "[end]"] - section: quantization - desc: x - example: select 'todo'; + desc: | + xxx + example: + - | + -- db.execute('select quote(?)', [to_npy(np.array([[1.0], [2.0], [3.0]], dtype=np.float32))]).fetchone() + select + rowid, + vector, + vec_type(vector), + vec_to_json(vector) + from vec_npy_each( + X'934E554D5059010076007B276465736372273A20273C6634272C2027666F727472616E5F6F72646572273A2046616C73652C20277368617065273A2028332C2031292C207D202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020200A0000803F0000004000004040' + ) + - | + -- db.execute('select quote(?)', [to_npy(np.array([[1.0], [2.0], [3.0]], dtype=np.float32))]).fetchone() + select + rowid, + vector, + vec_type(vector), + vec_to_json(vector) + from vec_npy_each( + X'934E554D5059010076007B276465736372273A20273C6634272C2027666F727472616E5F6F72646572273A2046616C73652C20277368617065273A2028332C2031292C207D202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020200A0000803F0000004000004040' + ) + +vec0: + vec0: + params: [] + desc: TODO + example: + - | + create virtual table vec_items using vec0( + contents_embedding float[4] + ); + - | + insert into vec_items(rowid, contents_embedding) + values (1, '[1, 1, 1, 1]'), + (2, '[2, 2, 2, 2]'), + (3, '[3, 3, 3, 3]'); + 
+entrypoints: + {} + #sqlite3_vec_init: + # desc: | + # asdf + #sqlite3_vec_fs_read_init: + # desc: | + # asdf #table_functions: # vec_each: # columns: [rowid, value] # inputs: ["vector"] # desc: # example: -# vec_npy_each: -# columns: [rowid, vector] -# inputs: ["input"] -# desc: -# example: #virtual_tables: # vec0: # desc: diff --git a/site/api-reference.md b/site/api-reference.md index 093b53f..9f7b5fe 100644 --- a/site/api-reference.md +++ b/site/api-reference.md @@ -1,45 +1,23 @@ +--- +outline: 2 +--- + # API Reference +A complete reference to all the SQL scalar functions, table functions, and virtual tables inside `sqlite-vec`. + ::: warning sqlite-vec is pre-v1, so expect breaking changes. ::: [[toc]] -## Meta {#meta} - -TODO - -### `vec_version()` {#vec_version} - -Returns a version string of the current `sqlite-vec` installation. - -```sql -select vec_version(); --- 'v0.0.1-alpha.36' - - -``` - -### `vec_debug()` {#vec_debug} - -Returns debugging information of the current `sqlite-vec` installation. - -```sql -select vec_debug(); -/* -'Version: v0.0.1-alpha.36 -Date: 2024-07-16T23:06:41Z-0700 -Commit: e507bc0230de6dc44c7ff3b4895785edd734f31d -Build flags: avx ' -*/ - - -``` - ## Constructors {#constructors} -TODO +SQL functions that "construct" vectors with different element types. + +Currently, only `float32`, `int8`, and `bit` vectors are supported. + ### `vec_f32(vector)` {#vec_f32} @@ -52,7 +30,7 @@ of `223`. ```sql select vec_f32('[.1, .2, .3, 4]'); --- X'CDCCCC3DCDCC4C3E9A99993E008040' +-- X'CDCCCC3DCDCC4C3E9A99993E00008040' select subtype(vec_f32('[.1, .2, .3, 4]')); -- 223 @@ -81,7 +59,7 @@ of `225`. ```sql select vec_int8('[1, 2, 3, 4]'); --- X'1234' +-- X'01020304' select subtype(vec_int8('[1, 2, 3, 4]')); -- 225 @@ -102,7 +80,7 @@ select vec_int8('[999]'); Creates a binary vector from a BLOB. 
-The returned value is a BLOB with 4 bytes per element, with a special [subtype](https://www.sqlite.org/c3ref/result_subtype.html) +The returned value is a BLOB with 1 byte per 8 elements, with a special [subtype](https://www.sqlite.org/c3ref/result_subtype.html) of `224`. @@ -121,7 +99,8 @@ select vec_to_json(vec_bit(X'F0')); ## Operations {#op} -TODO +Different operations and utilities for working with vectors. + ### `vec_length(vector)` {#vec_length} @@ -148,6 +127,32 @@ select vec_length(X'CCDD'); -- ❌ invalid float32 vector BLOB length. Must be divisible by 4, found 2 +``` + +### `vec_type(vector)` {#vec_type} + +Returns the name of the type of `vector` as text. One of `'float32'`, `'int8'`, or `'bit'`. + +This function will return an error if `vector` is invalid. + + +```sql +select vec_type('[.1, .2]'); +-- 'float32' + +select vec_type(X'AABBCCDD'); +-- 'float32' + +select vec_type(vec_int8(X'AABBCCDD')); +-- 'int8' + +select vec_type(vec_bit(X'AABBCCDD')); +-- 'bit' + +select vec_type(X'CCDD'); +-- ❌ invalid float32 vector BLOB length. Must be divisible by 4, found 2 + + ``` ### `vec_add(a, b)` {#vec_add} @@ -165,7 +170,7 @@ select vec_add( '[.1, .2, .3]', '[.4, .5, .6]' ); --- X'0003F3333333F6766663F' +-- X'0000003F3333333F6766663F' select vec_to_json( vec_add( @@ -243,7 +248,7 @@ Returns an error if the input is an invalid vector or not a float32 vector. ```sql select vec_normalize('[2, 3, 1, -4]'); --- X'BAF4BA3E8B37C3FBAF43A3EBAF43ABF' +-- X'BAF4BA3E8B370C3FBAF43A3EBAF43ABF' select vec_to_json( vec_normalize('[2, 3, 1, -4]') @@ -277,7 +282,7 @@ Returns an error in the following conditions: ```sql select vec_slice('[1, 2,3, 4]', 0, 2); --- X'00803F00040' +-- X'0000803F00000040' select vec_to_json( vec_slice('[1, 2,3, 4]', 0, 2) @@ -331,11 +336,134 @@ select vec_to_json('invalid'); -- ❌ JSON array parsing error: Input does not start with '[' +``` + +### `vec_each(vector)` {#vec_each} + +A table function to iterate through every element in a vector. 
One row id returned per element in a vector. + +```sql +CREATE TABLE vec_each( + rowid int, -- The + vector HIDDEN -- input parameter: A well-formed vector value +) +``` + +Returns an error if `vector` is not a valid vector. + + +```sql +select rowid, value from vec_each('[1,2,3,4]'); +/* +┌───────┬───────┐ +│ rowid │ value │ +├───────┼───────┤ +│ 0 │ 1 │ +├───────┼───────┤ +│ 1 │ 2 │ +├───────┼───────┤ +│ 2 │ 3 │ +├───────┼───────┤ +│ 3 │ 4 │ +└───────┴───────┘ + +*/ + + +select rowid, value from vec_each(X'AABBCCDD00112233'); +/* +┌───────┬──────────────────────┐ +│ rowid │ value │ +├───────┼──────────────────────┤ +│ 0 │ -1844071490169864200 │ +├───────┼──────────────────────┤ +│ 1 │ 3.773402568185702e-8 │ +└───────┴──────────────────────┘ + +*/ + + +select rowid, value from vec_each(vec_int8(X'AABBCCDD')); +/* +┌───────┬───────┐ +│ rowid │ value │ +├───────┼───────┤ +│ 0 │ -86 │ +├───────┼───────┤ +│ 1 │ -69 │ +├───────┼───────┤ +│ 2 │ -52 │ +├───────┼───────┤ +│ 3 │ -35 │ +└───────┴───────┘ + +*/ + + +select rowid, value from vec_each(vec_bit(X'F0')); +/* +┌───────┬───────┐ +│ rowid │ value │ +├───────┼───────┤ +│ 0 │ 1 │ +├───────┼───────┤ +│ 1 │ 1 │ +├───────┼───────┤ +│ 2 │ 1 │ +├───────┼───────┤ +│ 3 │ 1 │ +├───────┼───────┤ +│ 4 │ 0 │ +├───────┼───────┤ +│ 5 │ 0 │ +├───────┼───────┤ +│ 6 │ 0 │ +├───────┼───────┤ +│ 7 │ 0 │ +└───────┴───────┘ + +*/ + + + ``` ## Distance functions {#distance} -TODO +Various algorithms to calculate distance between two vectors. + +### `vec_distance_L2(a, b)` {#vec_distance_L2} + +Calculates the L2 euclidian distance between vectors `a` and `b`. Only valid for float32 or int8 vectors. + +Returns an error under the following conditions: +- `a` or `b` are invalid vectors +- `a` or `b` do not share the same vector element types (ex float32 or int8) +- `a` or `b` are bit vectors. Use [`vec_distance_hamming()`](#vec_distance_hamming) for distance calculations between two bitvectors. +- `a` or `b` do not have the same length. 
+ + +```sql +select vec_distance_L2('[1, 1]', '[2, 2]'); +-- 1.4142135381698608 + +select vec_distance_L2('[1, 1]', '[-2, -2]'); +-- 4.242640495300293 + +select vec_distance_L2('[1.1, 2.2, 3.3]', '[4.4, 5.5, 6.6]'); +-- 5.7157673835754395 + +select vec_distance_L2(X'AABBCCDD', X'00112233'); +-- 1844071490169864200 + +select vec_distance_L2('[1, 1]', vec_int8('[2, 2]')); +-- ❌ Vector type mistmatch. First vector has type float32, while the second has type int8. + +select vec_distance_L2(vec_bit(X'AA'), vec_bit(X'BB')); +-- ❌ Cannot calculate L2 distance between two bitvectors. + + +``` ### `vec_distance_cosine(a, b)` {#vec_distance_cosine} @@ -361,6 +489,12 @@ select vec_distance_cosine('[1.1, 2.2, 3.3]', '[4.4, 5.5, 6.6]'); select vec_distance_cosine(X'AABBCCDD', X'00112233'); -- 2 +select vec_distance_cosine('[1, 1]', vec_int8('[2, 2]')); +-- ❌ Vector type mistmatch. First vector has type float32, while the second has type int8. + +select vec_distance_cosine(vec_bit(X'AA'), vec_bit(X'BB')); +-- ❌ Cannot calculate cosine distance between two bitvectors. + ``` @@ -384,34 +518,43 @@ select vec_distance_hamming(vec_bit(X'FF'), vec_bit(X'FF')); select vec_distance_hamming(vec_bit(X'F0'), vec_bit(X'44')); -- 4 -select vec_distance_hamming(X'F0', X'00'); --- ❌ Error reading 1st vector: invalid float32 vector BLOB length. Must be divisible by 4, found 1 - - -``` - -### `vec_distance_l2(a, b)` {#vec_distance_l2} - -x - -```sql -select 'todo'; --- 'todo' +select vec_distance_hamming('[1, 1]', '[0, 0]'); +-- ❌ Cannot calculate hamming distance between two float32 vectors. ``` ## Quantization {#quantization} -TODO +Various techniques to "compress" a vector by reducing precision and accuracy. ### `vec_quantize_binary(vector)` {#vec_quantize_binary} -x +Quantize a float32 or int8 vector into a bitvector. +For every element in the vector, a `1` is assigned to positive numbers and a `0` is assigned to negative numbers. +These values are then packed into a bit vector. 
+ +Returns an error if `vector` is invalid, or if `vector` is not a float32 or int8 vector. + ```sql -select 'todo'; --- 'todo' +select vec_quantize_binary('[1, 2, 3, 4, 5, 6, 7, 8]'); +-- X'FF' + +select vec_quantize_binary('[1, 2, 3, 4, -5, -6, -7, -8]'); +-- X'0F' + +select vec_quantize_binary('[-1, -2, -3, -4, -5, -6, -7, -8]'); +-- X'00' + +select vec_quantize_binary('[-1, -2, -3, -4, -5, -6, -7, -8]'); +-- X'00' + +select vec_quantize_binary(vec_int8(X'11223344')); +-- ❌ Binary quantization requires vectors with a length divisible by 8 + +select vec_quantize_binary(vec_bit(X'FF')); +-- ❌ Can only binary quantize float or int8 vectors ``` @@ -427,3 +570,97 @@ select 'todo'; ``` +## NumPy Utilities {#numpy} + +Functions to read data from or work with [NumPy arrays](https://numpy.org/doc/stable/reference/generated/numpy.array.html). + +### `vec_npy_each(vector)` {#vec_npy_each} + +xxx + + +```sql +-- db.execute('select quote(?)', [to_npy(np.array([[1.0], [2.0], [3.0]], dtype=np.float32))]).fetchone() +select + rowid, + vector, + vec_type(vector), + vec_to_json(vector) +from vec_npy_each( + X'934E554D5059010076007B276465736372273A20273C6634272C2027666F727472616E5F6F72646572273A2046616C73652C20277368617065273A2028332C2031292C207D202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020200A0000803F0000004000004040' +) +/* +┌───────┬─────────────┬──────────────────┬─────────────────────┐ +│ rowid │ vector │ vec_type(vector) │ vec_to_json(vector) │ +├───────┼─────────────┼──────────────────┼─────────────────────┤ +│ 0 │ X'0000803F' │ 'float32' │ '[1.000000]' │ +├───────┼─────────────┼──────────────────┼─────────────────────┤ +│ 1 │ X'00000040' │ 'float32' │ '[2.000000]' │ +├───────┼─────────────┼──────────────────┼─────────────────────┤ +│ 2 │ X'00004040' │ 'float32' │ '[3.000000]' │ +└───────┴─────────────┴──────────────────┴─────────────────────┘ + +*/ + + +-- db.execute('select quote(?)', 
[to_npy(np.array([[1.0], [2.0], [3.0]], dtype=np.float32))]).fetchone() +select + rowid, + vector, + vec_type(vector), + vec_to_json(vector) +from vec_npy_each( + X'934E554D5059010076007B276465736372273A20273C6634272C2027666F727472616E5F6F72646572273A2046616C73652C20277368617065273A2028332C2031292C207D202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020200A0000803F0000004000004040' +) +/* +┌───────┬─────────────┬──────────────────┬─────────────────────┐ +│ rowid │ vector │ vec_type(vector) │ vec_to_json(vector) │ +├───────┼─────────────┼──────────────────┼─────────────────────┤ +│ 0 │ X'0000803F' │ 'float32' │ '[1.000000]' │ +├───────┼─────────────┼──────────────────┼─────────────────────┤ +│ 1 │ X'00000040' │ 'float32' │ '[2.000000]' │ +├───────┼─────────────┼──────────────────┼─────────────────────┤ +│ 2 │ X'00004040' │ 'float32' │ '[3.000000]' │ +└───────┴─────────────┴──────────────────┴─────────────────────┘ + +*/ + + + +``` + +## Meta {#meta} + +Helper functions to debug `sqlite-vec` installations. + +### `vec_version()` {#vec_version} + +Returns a version string of the current `sqlite-vec` installation. + +```sql +select vec_version(); +-- 'v0.0.1-alpha.36' + + +``` + +### `vec_debug()` {#vec_debug} + +Returns debugging information of the current `sqlite-vec` installation. + +```sql +select vec_debug(); +/* +'Version: v0.0.1-alpha.36 +Date: 2024-07-16T23:06:41Z-0700 +Commit: e507bc0230de6dc44c7ff3b4895785edd734f31d +Build flags: avx ' +*/ + + +``` + +## Entrypoints {#entrypoints} + +All the named entrypoints that load in different `sqlite-vec` functions and options. 
+ diff --git a/site/build-ref.mjs b/site/build-ref.mjs index 76a0704..32702d9 100644 --- a/site/build-ref.mjs +++ b/site/build-ref.mjs @@ -6,6 +6,22 @@ import { readFileSync, writeFileSync } from "node:fs"; import * as v from "valibot"; import { table } from "table"; +const HEADER = `--- +outline: 2 +--- + +# API Reference + +A complete reference to all the SQL scalar functions, table functions, and virtual tables inside \`sqlite-vec\`. + +::: warning +sqlite-vec is pre-v1, so expect breaking changes. +::: + +[[toc]] + +`; + const REF_PATH = resolve( dirname(fileURLToPath(import.meta.url)), "../reference.yaml" @@ -15,32 +31,25 @@ const EXT_PATH = resolve( "../dist/vec0" ); -const DocSchema = v.object({ - sections: v.record( - v.string(), - v.object({ - title: v.string(), - desc: v.string(), - }) - ), - functions: v.record( - v.string(), - v.object({ - params: v.array(v.string()), - desc: v.string(), - section: v.string(), - example: v.union([v.string(), v.array(v.string())]), - }) - ), - /*table_functions: v.record( +const DocSchema = v.objectWithRest( + { + sections: v.record( + v.string(), + v.object({ + title: v.string(), + desc: v.string(), + }) + ), + }, + v.record( v.string(), v.object({ params: v.array(v.string()), desc: v.string(), example: v.union([v.string(), v.array(v.string())]), }) - ),*/ -}); + ) +); const tableConfig = { border: { @@ -78,7 +87,7 @@ function formatSingleValue(value) { if (value instanceof Uint8Array) { let s = "X'"; for (const v of value) { - s += v.toString(16).toUpperCase(); + s += v.toString(16).toUpperCase().padStart(2, "0"); } s += "'"; return `-- ${s}`; @@ -87,12 +96,13 @@ function formatSingleValue(value) { return "-- " + JSON.stringify(value, null, 2); } function formatValue(value) { - if (typeof value === "string" || typeof value === "number") return value; + if (typeof value === "string") return `'${value}'`; + if (typeof value === "number") return value; if (value === null) return "NULL"; if (value instanceof Uint8Array) { 
let s = "X'"; for (const v of value) { - s += v.toString(16); + s += v.toString(16).toUpperCase().padStart(2, "0"); } s += "'"; return s; @@ -125,7 +135,11 @@ function renderExamples(db, name, example) { results = null; try { stmt = db.prepare(sql); - stmt.raw(true); + try { + stmt.raw(true); + } catch (err) { + 1; + } } catch (error) { console.error(`Error preparing statement for ${name}:`); console.error(error); @@ -157,37 +171,27 @@ function renderExamples(db, name, example) { return md; } -let md = `# API Reference - -::: warning -sqlite-vec is pre-v1, so expect breaking changes. -::: - -[[toc]] - -`; +let md = HEADER; const doc = v.parse(DocSchema, load(readFileSync(REF_PATH, "utf8"))); const db = new Database(); db.loadExtension(EXT_PATH); -let lastSection = null; -for (const [name, { params, desc, example, section }] of Object.entries( - doc.functions -)) { - const headerText = `\`${name}(${(params ?? []).join(", ")})\` {#${name}}`; +for (const section in doc.sections) { + md += `## ${doc.sections[section].title} {#${section}} \n\n`; + md += doc.sections[section].desc; + md += "\n\n"; - if (lastSection != section) { - md += `## ${doc.sections[section].title} {#${section}} \n\n`; - md += doc.sections[section].desc; - md += "\n\n"; - lastSection = section; + for (const [name, { params, desc, example }] of Object.entries( + doc[section] + )) { + const headerText = `\`${name}(${(params ?? 
[]).join(", ")})\` {#${name}}`; + + md += "### " + headerText + "\n\n"; + + md += desc + "\n\n"; + md += renderExamples(db, name, example); } - - md += "### " + headerText + "\n\n"; - - md += desc + "\n\n"; - md += renderExamples(db, name, example); } writeFileSync("api-reference.md", md, "utf8"); diff --git a/site/versioning.md b/site/versioning.md new file mode 100644 index 0000000..44d612b --- /dev/null +++ b/site/versioning.md @@ -0,0 +1,49 @@ +# Semantic Versioning for `sqlite-vec` + +`sqlite-vec` is pre-v1, so according to the rules of [Semantic Versioning](https://semver.org/), +a "minor" release like "0.2.0" or "0.3.0" may contain breaking changes. + +But what exactly counts as a "breaking change" in a SQLite extension? The line isn't so clear, unfortunately. +Here are all the surfaces that COULD count as a "breaking change": + +- SQL functions and columns on virtual tables +- The C API (extension entrypoints) +- "Bindings" like the official `pip` and `npm` packages +- Release assets like the pre-compiled extensions + +## What counts as a "breaking change"? + + +### Changes to SQL functions + +- Re-naming or removing an SQL function +- Changing the number of required SQL parameters + +### Changes to SQL virtual tables + +- The number of + +### Changes to the C API + +Currently there is no "official" C API for `sqlite-vec`. However, there are entrypoints defined in C that C developers or developers using FFI can call. Any + + +### Compile-time options + +The removal of any compile-time options + + +## When is `v1.0` coming? + +In a few months! The main problems I want to solve before `v1.0` include: + +- Metadata columns +- Metadata filtering +- ANN indexing +- Quantization + pre-transformations + +Once those items are complete, I will likely create a `v1.0` release, along with renaming the `vec0` virtual table module to `vec1`. And if future major releases are required, a `v2.0` major release will be made with new `vec2` virtual tables and so on. 
+ +Ideally, only a `v1` major release would be required. But who knows what the future has in store with vector search! + +In general, I will try my best to maximize stability and limit the number of breaking changes for future `sqlite-vec` versions. diff --git a/sqlite-vec.c b/sqlite-vec.c index 9b02872..84e6ad9 100644 --- a/sqlite-vec.c +++ b/sqlite-vec.c @@ -1082,8 +1082,105 @@ finish: return; } -static void vec_quantize_i8(sqlite3_context *context, int argc, +char * vec_type_name(enum VectorElementType elementType) { + switch(elementType) { + case SQLITE_VEC_ELEMENT_TYPE_FLOAT32: + return "float32"; + case SQLITE_VEC_ELEMENT_TYPE_INT8: + return "int8"; + case SQLITE_VEC_ELEMENT_TYPE_BIT: + return "bit"; + } +} + +static void vec_type(sqlite3_context *context, int argc, + sqlite3_value **argv) { + assert(argc == 1); + void *vector; + size_t dimensions; + vector_cleanup cleanup; + char *pzError; + enum VectorElementType elementType; + int rc = vector_from_value(argv[0], &vector, &dimensions, &elementType, + &cleanup, &pzError); + if (rc != SQLITE_OK) { + sqlite3_result_error(context, pzError, -1); + sqlite3_free(pzError); + return; + } + sqlite3_result_text(context, vec_type_name(elementType), -1, SQLITE_STATIC); + cleanup(vector); + +} +static void vec_quantize_binary(sqlite3_context *context, int argc, + sqlite3_value **argv) { + assert(argc == 1); + void *vector; + size_t dimensions; + vector_cleanup vectorCleanup; + char *pzError; + enum VectorElementType elementType; + int rc = vector_from_value(argv[0], &vector, &dimensions, &elementType, + &vectorCleanup, &pzError); + if (rc != SQLITE_OK) { + sqlite3_result_error(context, pzError, -1); + sqlite3_free(pzError); + return; + } + + if(dimensions <= 0) { + sqlite3_result_error(context, "Zero length vectors are not supported.", -1); + goto cleanup; + return; + } + if((dimensions % CHAR_BIT) != 0) { + sqlite3_result_error(context, "Binary quantization requires vectors with a length divisible by 8", -1); + goto 
cleanup; + return; + } + + int sz = dimensions / CHAR_BIT; + u8 *out = sqlite3_malloc(sz); + if (!out) { + sqlite3_result_error_code(context, SQLITE_NOMEM); + goto cleanup; + return; + } + memset(out, 0, sz); + + switch(elementType) { + case SQLITE_VEC_ELEMENT_TYPE_FLOAT32: { + + for (size_t i = 0; i < dimensions; i++) { + int res = ((f32 *)vector)[i] > 0.0; + out[i / 8] |= (res << (i % 8)); + } + break; + } + case SQLITE_VEC_ELEMENT_TYPE_INT8: { + for (size_t i = 0; i < dimensions; i++) { + int res = ((i8 *)vector)[i] > 0; + out[i / 8] |= (res << (i % 8)); + } + break; + } + case SQLITE_VEC_ELEMENT_TYPE_BIT: { + sqlite3_result_error(context, "Can only binary quantize float or int8 vectors", -1); + sqlite3_free(out); + goto cleanup; /* the parsed input vector must be released on this error path too */ + } + } + sqlite3_result_blob(context, out, sz, sqlite3_free); + sqlite3_result_subtype(context, SQLITE_VEC_ELEMENT_TYPE_BIT); + + + cleanup: + vectorCleanup(vector); +} + +static void vec_quantize_int8(sqlite3_context *context, int argc, sqlite3_value **argv) { + assert(argc == 2); f32 *srcVector; size_t dimensions; fvec_cleanup srcCleanup; @@ -1099,39 +1196,23 @@ static void vec_quantize_i8(sqlite3_context *context, int argc, int sz = dimensions * sizeof(i8); out = sqlite3_malloc(sz); if (!out) { - rc = SQLITE_NOMEM; + sqlite3_result_error_nomem(context); goto cleanup; } memset(out, 0, sz); - if (argc == 2) { - if ((sqlite3_value_type(argv[1]) != SQLITE_TEXT) || - (sqlite3_value_bytes(argv[1]) != strlen("unit")) || - (sqlite3_stricmp((const char *)sqlite3_value_text(argv[1]), "unit") != - 0)) { - sqlite3_result_error(context, - "2nd argument to vec_quantize_i8() must be 'unit', " - "or ranges must be provided.", - -1); - sqlite3_free(out); - goto cleanup; - } - f32 step = (1.0 - (-1.0)) / 255; - for (size_t i = 0; i < dimensions; i++) { - out[i] = ((srcVector[i] - (-1.0)) / step) - 128; - } - } else if (argc == 3) { - // f32 * minVector, maxVector; - // size_t d; - // fvec_cleanup minCleanup, maxCleanup; - // int rc = 
fvec_from_value(argv[1], ) - + if ((sqlite3_value_type(argv[1]) != SQLITE_TEXT) || + (sqlite3_value_bytes(argv[1]) != strlen("unit")) || + (sqlite3_stricmp((const char *)sqlite3_value_text(argv[1]), "unit") != + 0)) { + sqlite3_result_error(context, "2nd argument to vec_quantize_i8() must be 'unit'.", -1); sqlite3_free(out); - // TODO - sqlite3_result_error( - context, "ranges parameter not supported in vec_quantize_i8 yet.", -1); goto cleanup; } + f32 step = (1.0 - (-1.0)) / 255; + for (size_t i = 0; i < dimensions; i++) { + out[i] = ((srcVector[i] - (-1.0)) / step) - 128; + } sqlite3_result_blob(context, out, dimensions * sizeof(i8), sqlite3_free); sqlite3_result_subtype(context, SQLITE_VEC_ELEMENT_TYPE_INT8); @@ -1140,58 +1221,6 @@ cleanup: srcCleanup(srcVector); } -static void vec_quantize_binary(sqlite3_context *context, int argc, - sqlite3_value **argv) { - assert(argc == 1); - void *vector; - size_t dimensions; - vector_cleanup cleanup; - char *pzError; - enum VectorElementType elementType; - int rc = vector_from_value(argv[0], &vector, &dimensions, &elementType, - &cleanup, &pzError); - if (rc != SQLITE_OK) { - sqlite3_result_error(context, pzError, -1); - sqlite3_free(pzError); - return; - } - - if (elementType == SQLITE_VEC_ELEMENT_TYPE_FLOAT32) { - int sz = dimensions / CHAR_BIT; - u8 *out = sqlite3_malloc(sz); - if (!out) { - cleanup(vector); - sqlite3_result_error_code(context, SQLITE_NOMEM); - return; - } - memset(out, 0, sz); - for (size_t i = 0; i < dimensions; i++) { - int res = ((f32 *)vector)[i] > 0.0; - out[i / 8] |= (res << (i % 8)); - } - sqlite3_result_blob(context, out, dimensions / CHAR_BIT, sqlite3_free); - sqlite3_result_subtype(context, SQLITE_VEC_ELEMENT_TYPE_BIT); - } else if (elementType == SQLITE_VEC_ELEMENT_TYPE_INT8) { - int sz = dimensions / CHAR_BIT; - u8 *out = sqlite3_malloc(sz); - if (!out) { - cleanup(vector); - sqlite3_result_error_code(context, SQLITE_NOMEM); - return; - } - memset(out, 0, sz); - for (size_t i = 0; i < 
dimensions; i++) { - int res = ((i8 *)vector)[i] > 0; - out[i / 8] |= (res << (i % 8)); - } - sqlite3_result_blob(context, out, dimensions / CHAR_BIT, sqlite3_free); - sqlite3_result_subtype(context, SQLITE_VEC_ELEMENT_TYPE_BIT); - } else { - sqlite3_result_error(context, - "Can only binary quantize float or int8 vectors", -1); - return; - } -} static void vec_add(sqlite3_context *context, int argc, sqlite3_value **argv) { assert(argc == 2); @@ -2778,7 +2807,7 @@ static int vec_npy_eachColumnBuffer(vec_npy_each_cursor *pCur, } case SQLITE_VEC_ELEMENT_TYPE_INT8: case SQLITE_VEC_ELEMENT_TYPE_BIT: { - // TODO + // https://github.com/asg017/sqlite-vec/issues/42 sqlite3_result_error(context, "vec_npy_each only supports float32 vectors", -1); break; @@ -2806,7 +2835,7 @@ static int vec_npy_eachColumnFile(vec_npy_each_cursor *pCur, } case SQLITE_VEC_ELEMENT_TYPE_INT8: case SQLITE_VEC_ELEMENT_TYPE_BIT: { - // TODO + // https://github.com/asg017/sqlite-vec/issues/42 sqlite3_result_error(context, "vec_npy_each only supports float32 vectors", -1); break; @@ -5902,13 +5931,13 @@ static sqlite3_module vec0Module = { /* xCommit */ 0, /* xRollback */ 0, /* xFindFunction */ 0, - /* xRename */ 0, // TODO + /* xRename */ 0, // https://github.com/asg017/sqlite-vec/issues/43 /* xSavepoint */ 0, /* xRelease */ 0, /* xRollbackTo */ 0, /* xShadowName */ vec0ShadowName, #if SQLITE_VERSION_NUMBER >= 3044000 - /* xIntegrity */ 0, // TODO + /* xIntegrity */ 0, // https://github.com/asg017/sqlite-vec/issues/44 #endif }; #pragma endregion @@ -6661,6 +6690,7 @@ __declspec(dllexport) {"vec_distance_hamming",vec_distance_hamming, 2, DEFAULT_FLAGS | SQLITE_SUBTYPE, }, {"vec_distance_cosine", vec_distance_cosine, 2, DEFAULT_FLAGS | SQLITE_SUBTYPE, }, {"vec_length", vec_length, 1, DEFAULT_FLAGS | SQLITE_SUBTYPE, }, + {"vec_type", vec_type, 1, DEFAULT_FLAGS, }, {"vec_to_json", vec_to_json, 1, DEFAULT_FLAGS | SQLITE_SUBTYPE | SQLITE_RESULT_SUBTYPE, }, {"vec_add", vec_add, 2, DEFAULT_FLAGS | 
SQLITE_SUBTYPE | SQLITE_RESULT_SUBTYPE, }, {"vec_sub", vec_sub, 2, DEFAULT_FLAGS | SQLITE_SUBTYPE | SQLITE_RESULT_SUBTYPE, }, @@ -6669,8 +6699,7 @@ __declspec(dllexport) {"vec_f32", vec_f32, 1, DEFAULT_FLAGS | SQLITE_SUBTYPE | SQLITE_RESULT_SUBTYPE, }, {"vec_bit", vec_bit, 1, DEFAULT_FLAGS | SQLITE_SUBTYPE | SQLITE_RESULT_SUBTYPE, }, {"vec_int8", vec_int8, 1, DEFAULT_FLAGS | SQLITE_SUBTYPE | SQLITE_RESULT_SUBTYPE, }, - {"vec_quantize_i8", vec_quantize_i8, 2, DEFAULT_FLAGS | SQLITE_SUBTYPE | SQLITE_RESULT_SUBTYPE, }, - {"vec_quantize_i8", vec_quantize_i8, 3, DEFAULT_FLAGS | SQLITE_SUBTYPE | SQLITE_RESULT_SUBTYPE, }, + {"vec_quantize_int8", vec_quantize_int8, 2, DEFAULT_FLAGS | SQLITE_SUBTYPE | SQLITE_RESULT_SUBTYPE, }, {"vec_quantize_binary", vec_quantize_binary, 1, DEFAULT_FLAGS | SQLITE_SUBTYPE | SQLITE_RESULT_SUBTYPE, }, {"vec_static_blob_from_raw", vec_static_blob_from_raw, 4, DEFAULT_FLAGS | SQLITE_SUBTYPE | SQLITE_RESULT_SUBTYPE }, // clang-format on diff --git a/tests/test-loadable.py b/tests/test-loadable.py index 293c667..9eea556 100644 --- a/tests/test-loadable.py +++ b/tests/test-loadable.py @@ -110,12 +110,12 @@ FUNCTIONS = [ "vec_length", "vec_normalize", "vec_quantize_binary", - "vec_quantize_i8", - "vec_quantize_i8", + "vec_quantize_int8", "vec_slice", "vec_static_blob_from_raw", "vec_sub", "vec_to_json", + "vec_type", "vec_version", ] MODULES = [ @@ -448,6 +448,20 @@ def test_vec_slice(): vec_slice(b"\xab\xab\xab\xab", 0, 0) +def test_vec_type(): + vec_type = lambda *args, a="?": db.execute(f"select vec_type({a})", args).fetchone()[0] + assert vec_type('[1]') == "float32" + assert vec_type(b"\xaa\xbb\xcc\xdd") == "float32" + assert vec_type('[1]', a='vec_f32(?)') == "float32" + assert vec_type('[1]', a='vec_int8(?)') == "int8" + assert vec_type(b"\xaa", a='vec_bit(?)') == "bit" + + with _raises("invalid float32 vector"): + vec_type(b"\xaa") + with _raises("found NULL"): + vec_type(None) + + def test_vec_add(): vec_add = lambda *args, a="?", b="?": 
db.execute( f"select vec_add({a}, {b})", args @@ -517,11 +531,11 @@ def test_vec_to_json(): @pytest.mark.skip(reason="TODO") -def test_vec_quantize_i8(): - vec_quantize_i8 = lambda *args: db.execute( - "select vec_quantize_i8()", args +def test_vec_quantize_int8(): + vec_quantize_int8 = lambda *args: db.execute( + "select vec_quantize_int8()", args ).fetchone()[0] - assert vec_quantize_i8() == 111 + assert vec_quantize_int8() == 111 def test_vec_quantize_binary(): @@ -1020,9 +1034,9 @@ def test_vec0_updates(): db.execute( """ INSERT INTO t3 VALUES - (1, :x, vec_quantize_i8(:x, 'unit') ,vec_quantize_binary(:x)), - (2, :y, vec_quantize_i8(:y, 'unit') ,vec_quantize_binary(:y)), - (3, :z, vec_quantize_i8(:z, 'unit') ,vec_quantize_binary(:z)); + (1, :x, vec_quantize_int8(:x, 'unit') ,vec_quantize_binary(:x)), + (2, :y, vec_quantize_int8(:y, 'unit') ,vec_quantize_binary(:y)), + (3, :z, vec_quantize_int8(:z, 'unit') ,vec_quantize_binary(:z)); """, { "x": "[.1, .1, .1, .1, -.1, -.1, -.1, -.1]", @@ -1795,7 +1809,7 @@ def test_vec0_knn(): db.executemany( """ INSERT INTO v VALUES - (:id, :vector, vec_quantize_i8(:vector, 'unit') ,vec_quantize_binary(:vector)); + (:id, :vector, vec_quantize_int8(:vector, 'unit') ,vec_quantize_binary(:vector)); """, [ {