omnigraph/vendor/lance-table/protos/index.proto
aaltshuler b5c0c6238b fix(deps): vendor lance-table 7.0.0 + lance#7480 so merge-updated tables survive filtered reads after deletes
iss-merge-rowid-overlap-corrupts-filtered-reads / lance#7444: an
update-style merge_insert over a merge-written fragment legally reuses the
updated rows' stable row ids (row-id-lineage spec: updates preserve
_rowid) while the superseded fragment keeps its full sequence plus a
deletion vector. A later delete leaves the overlapping id range sparsely
tiled, and lance-table 7.0.0's RowIdIndex::new asserted dense tiling —
failing every filtered read that builds the id→address map ("Wrong range"
debug assert; "all columns in a record batch must have the same length"
or a silently-wrong batch in release).

The upstream fix (lance#7480, merged 2026-07-01) landed hours AFTER
v8.0.0 was cut, so no release ≤ 8.0.0 carries it. Consume it now as a
vendored pin: vendor/lance-table is the pristine published 7.0.0 source
plus ONLY the #7480 rowids/index.rs hunk (drop the false tiling assert;
hard-error on the true invariant — one live id claimed by two fragments)
and upstream's regression unit test, wired via [patch.crates-io]. The fix
is read-side only, so already-written graphs become readable as-is — no
data repair.

Removal condition (see vendor/lance-table/README.omnigraph.md): drop the
vendor dir + patch entry at the first Lance bump whose lance-table ships
lance#7480 (9.0.0, or a backported 8.0.1). The surface guard
filtered_scan_tolerates_merge_update_row_id_overlap keeps that honest in
both directions.

Turns the previous commit's red tests green. Full workspace gate passes
(cargo test --workspace --locked --no-fail-fast, 68 suites).
2026-07-02 23:23:39 +03:00

249 lines
No EOL
5.3 KiB
Protocol Buffer

// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors
syntax = "proto3";
package lance.index.pb;
import "google/protobuf/any.proto";
// Kind of index described by an `Index` message.
enum IndexType {
  // Vector index. As the zero value, this is also the proto3 default when
  // the field is unset.
  VECTOR = 0;
}
// Description of a single index on a dataset.
message Index {
  // Index name; unique within the dataset.
  string name = 1;

  // Columns used to build the index.
  repeated string columns = 2;

  // Version of the dataset this index was built from.
  uint64 dataset_version = 3;

  // The `IndexType` of this index.
  IndexType index_type = 4;

  // Implementation details of the index; at most one member is set.
  oneof implementation {
    VectorIndex vector_index = 5;
  }
}
// A tensor: a data type, a shape, and a raw data buffer.
message Tensor {
  // Data types selectable through `data_type`.
  enum DataType {
    BFLOAT16 = 0;
    FLOAT16 = 1;
    FLOAT32 = 2;
    FLOAT64 = 3;
    UINT8 = 4;
    UINT16 = 5;
    UINT32 = 6;
    UINT64 = 7;
  }

  // Data type of the tensor (presumably the element type of `data`).
  DataType data_type = 1;

  // Data shape: [dim1, dim2, ...].
  repeated uint32 shape = 2;

  // Data buffer.
  // NOTE(review): byte order and element layout are not specified in this
  // file -- confirm against the code that reads and writes tensors.
  bytes data = 3;
}
// Metadata for an IVF (inverted file) index.
message IVF {
  // Partition centroids: `dimension * num_partitions` float32 values.
  //
  // Deprecated; use `centroids_tensor` instead. Note that the
  // `[deprecated = true]` field option is only present as a trailing comment,
  // so it is not applied to the field.
  repeated float centroids = 1; // [deprecated = true];

  // File offset of each partition.
  repeated uint64 offsets = 2;

  // Number of records in each partition.
  // NOTE(review): presumably index-aligned with `offsets` -- confirm.
  repeated uint32 lengths = 3;

  // Centroids as a tensor: `num_partitions * dimension` float32 values.
  Tensor centroids_tensor = 4;

  // KMeans loss. Declared `optional`, so an unset value is distinguishable
  // from 0.0.
  optional double loss = 5;
}
// Product quantization (PQ) parameters and codebook.
message PQ {
  // Number of bits used to represent a centroid.
  uint32 num_bits = 1;

  // Number of sub-vectors.
  uint32 num_sub_vectors = 2;

  // Vector dimension.
  uint32 dimension = 3;

  // Codebook: `dimension * 2 ^ num_bits` float32 values.
  repeated float codebook = 4;

  // Codebook as a tensor: `2 ^ num_bits * dimension` floats.
  Tensor codebook_tensor = 5;
}
// Kind of transform carried by a `Transform` message.
enum TransformType {
  // OPQ transform. As the zero value, this is also the proto3 default when
  // the field is unset.
  OPQ = 0;
}
// A transform matrix applied to one or more vectors.
message Transform {
  // File offset at which the matrix is stored.
  uint64 position = 1;

  // Shape of the matrix: [rows, cols].
  repeated uint32 shape = 2;

  // Kind of transform.
  TransformType type = 3;
}
// Flat index. This message declares no fields; it appears as the `flat`
// member of the `VectorIndexStage.stage` oneof.
message Flat {}
// DiskANN index parameters.
message DiskAnn {
  // Version of the graph spec.
  uint32 spec = 1;

  // Graph file.
  string filename = 2;

  // The `r` parameter.
  uint32 r = 3;

  // The `alpha` parameter.
  float alpha = 4;

  // The `L` parameter.
  uint32 L = 5;

  // Entry points into the graph.
  repeated uint64 entries = 6;
}
// A single stage of the vector index pipeline.
message VectorIndexStage {
  // The concrete stage; at most one member is set.
  oneof stage {
    // Flat index.
    Flat flat = 1;

    // `IVF` -- inverted file.
    IVF ivf = 2;

    // Product quantization.
    PQ pq = 3;

    // Transform stage.
    Transform transform = 4;

    // DiskANN.
    DiskAnn diskann = 5;
  }
}
// Distance metric used by a vector index.
enum VectorMetricType {
  // L2 (Euclidean) distance. As the zero value, this is also the proto3
  // default when the field is unset.
  L2 = 0;

  // Cosine distance.
  Cosine = 1;

  // Dot product.
  Dot = 2;

  // Hamming distance.
  Hamming = 3;
}
// Metadata of a vector index.
message VectorIndex {
  // Version of the index specification.
  uint32 spec_version = 1;

  // Vector dimension.
  uint32 dimension = 2;

  // Stages that compose the vector index.
  //
  // For example, an `IVF_PQ` index type can be expressed as:
  //
  // ```text
  // let stages = vec![Ivf{}, PQ{num_bits: 8, num_sub_vectors: 16}]
  // ```
  repeated VectorIndexStage stages = 3;

  // Distance metric type of the index.
  VectorMetricType metric_type = 4;
}
// Details for vector indexes, stored in the manifest's `index_details` field.
message VectorIndexDetails {
  // Distance metric type of the index.
  VectorMetricType metric_type = 1;

  // Target number of vectors per partition; 0 means unset.
  uint64 target_partition_size = 2;

  // Optional HNSW index configuration. When set, the index has an HNSW layer.
  optional HnswParameters hnsw_index_config = 3;

  // Parameters for the `pq` member of `compression`.
  message ProductQuantization {
    uint32 num_bits = 1;
    uint32 num_sub_vectors = 2;
  }

  // Parameters for the `sq` member of `compression`.
  message ScalarQuantization {
    uint32 num_bits = 1;
  }

  // Parameters for the `rq` member of `compression`.
  message RabitQuantization {
    enum RotationType {
      FAST = 0;
      MATRIX = 1;
    }
    uint32 num_bits = 1;
    RotationType rotation_type = 2;
  }

  // No quantization; vectors are stored as-is.
  message FlatCompression {}

  // Compression applied to the vectors; at most one member is set.
  // NOTE(review): field number 7 is not used anywhere in this message as
  // written -- confirm whether it was retired and should be `reserved`.
  oneof compression {
    ProductQuantization pq = 4;
    ScalarQuantization sq = 5;
    RabitQuantization rq = 6;
    FlatCompression flat = 8;
  }

  // Runtime hints: optional build preferences that do not affect the index
  // structure. Keys use reverse-DNS namespacing (e.g. "lance.ivf.max_iters",
  // "lancedb.accelerator"). Unrecognized keys must be silently ignored by
  // all runtimes.
  map<string, string> runtime_hints = 9;
}
// Hierarchical Navigable Small World (HNSW) parameters, used as an optional
// configuration for IVF indexes.
message HnswParameters {
  // Maximum number of outgoing edges per node in the HNSW graph. Higher
  // values mean more connections and better recall, but more memory and
  // slower builds. Referred to as "M" in the HNSW literature.
  uint32 max_connections = 1;

  // "Construction exploration factor": size of the dynamic list used during
  // index construction.
  uint32 construction_ef = 2;

  // Maximum number of levels in the HNSW graph.
  uint32 max_level = 3;
}
// Details for a JSON index.
// NOTE(review): this file carries no comments for these fields; the
// descriptions below are inferred from names and types -- confirm against
// the code that reads and writes this message.
message JsonIndexDetails {
  // Presumably the path inside the JSON value that the index covers.
  string path = 1;

  // Presumably the details message of the index built on the value at
  // `path`, wrapped in `google.protobuf.Any`.
  google.protobuf.Any target_details = 2;
}
// Details for a bloom filter index. This message declares no fields.
message BloomFilterIndexDetails {}
// Details for an R-tree index. This message declares no fields.
message RTreeIndexDetails {}