Feature/pkgsplit (#83)

* Starting to spawn base package
* More package hacking
* Bedrock and VertexAI
* Parquet split
* Updated templates
* Utils
This commit is contained in:
cybermaggedon 2024-09-30 19:36:09 +01:00 committed by GitHub
parent 3fb75c617b
commit 9b91d5eee3
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
262 changed files with 630 additions and 420 deletions

View file

@ -0,0 +1,108 @@
from cassandra.cluster import Cluster
from cassandra.auth import PlainTextAuthProvider
class TrustGraph:
    """RDF-style triple store (subject, predicate, object) on Cassandra.

    Triples live in keyspace ``trustgraph``, table ``triples`` with
    PRIMARY KEY (s, p, o): ``s`` is the partition key, ``p`` and ``o``
    clustering columns.  Secondary indexes on ``p`` and ``o`` back the
    single-term lookup methods.
    """

    def __init__(self, hosts=None):
        """Connect to the cluster and ensure the schema exists.

        hosts: list of Cassandra contact points; defaults to ["localhost"].
        """
        if hosts is None:
            hosts = ["localhost"]
        self.cluster = Cluster(hosts)
        self.session = self.cluster.connect()
        self.init()

    def clear(self):
        """Drop the whole keyspace and recreate an empty schema."""
        self.session.execute("""
            drop keyspace if exists trustgraph;
        """)
        self.init()

    def init(self):
        """Create keyspace, table and indexes if they do not already exist."""
        self.session.execute("""
            create keyspace if not exists trustgraph
            with replication = {
                'class' : 'SimpleStrategy',
                'replication_factor' : 1
            };
        """)
        self.session.set_keyspace('trustgraph')
        self.session.execute("""
            create table if not exists triples (
                s text,
                p text,
                o text,
                PRIMARY KEY (s, p, o)
            );
        """)
        self.session.execute("""
            create index if not exists triples_p
            ON triples (p);
        """)
        self.session.execute("""
            create index if not exists triples_o
            ON triples (o);
        """)

    def insert(self, s, p, o):
        """Insert one (s, p, o) triple; re-inserting is an idempotent upsert."""
        self.session.execute(
            "insert into triples (s, p, o) values (%s, %s, %s)",
            (s, p, o)
        )

    def get_all(self, limit=50):
        """Return up to `limit` triples, unordered across partitions."""
        # int() guards the f-string interpolation: limit is not a CQL bind
        # parameter, so a non-integer value would otherwise be injected
        # verbatim into the statement.
        return self.session.execute(
            f"select s, p, o from triples limit {int(limit)}"
        )

    def get_s(self, s, limit=10):
        """All (p, o) pairs for subject `s` (partition-key lookup)."""
        return self.session.execute(
            f"select p, o from triples where s = %s limit {int(limit)}",
            (s,)
        )

    def get_p(self, p, limit=10):
        """All (s, o) pairs for predicate `p` (uses index triples_p)."""
        return self.session.execute(
            f"select s, o from triples where p = %s limit {int(limit)}",
            (p,)
        )

    def get_o(self, o, limit=10):
        """All (s, p) pairs for object `o` (uses index triples_o)."""
        return self.session.execute(
            f"select s, p from triples where o = %s limit {int(limit)}",
            (o,)
        )

    def get_sp(self, s, p, limit=10):
        """Objects for a (s, p) pair — a clustering-prefix query."""
        return self.session.execute(
            f"select o from triples where s = %s and p = %s limit {int(limit)}",
            (s, p)
        )

    def get_po(self, p, o, limit=10):
        """Subjects for a (p, o) pair.

        Fix: CQL requires ALLOW FILTERING to appear *after* the LIMIT
        clause; the original "allow filtering limit N" ordering is a
        CQL syntax error and the query could never execute.
        """
        return self.session.execute(
            f"select s from triples where p = %s and o = %s "
            f"limit {int(limit)} allow filtering",
            (p, o)
        )

    def get_os(self, o, s, limit=10):
        """Predicates for an (o, s) pair.

        NOTE(review): restricting o without p skips a clustering column;
        this presumably relies on the triples_o index plus the partition
        key — confirm against a live cluster.
        """
        return self.session.execute(
            f"select p from triples where o = %s and s = %s limit {int(limit)}",
            (o, s)
        )

    def get_spo(self, s, p, o, limit=10):
        """Existence check: returns `s` (aliased x) if the exact triple is present."""
        return self.session.execute(
            f"""select s as x from triples where s = %s and p = %s and o = %s limit {int(limit)}""",
            (s, p, o)
        )

View file

@ -0,0 +1,138 @@
from pymilvus import MilvusClient, CollectionSchema, FieldSchema, DataType
import time
class DocVectors:
    """Document embedding store backed by Milvus.

    One collection is created lazily per embedding dimension, named
    ``<prefix>_<dim>``, so the dimension never needs to be configured
    up front and creating more than one collection is harmless.
    """

    def __init__(self, uri="http://localhost:19530", prefix='doc'):
        self.client = MilvusClient(uri=uri)
        # Strategy is to create collections per dimension. Probably only
        # going to be using 1 anyway, but that means we don't need to
        # hard-code the dimension anywhere, and no big deal if more than
        # one are created.
        self.collections = {}
        self.prefix = prefix
        # Seconds a collection stays loaded before being released.
        self.reload_time = 90
        # Next time to unload - this forces a release at the next window.
        self.next_reload = time.time() + self.reload_time
        print("Reload at", self.next_reload)

    def init_collection(self, dimension):
        """Create (if absent) the collection and vector index for `dimension`."""
        collection_name = self.prefix + "_" + str(dimension)
        pkey_field = FieldSchema(
            name="id",
            dtype=DataType.INT64,
            is_primary=True,
            auto_id=True,
        )
        vec_field = FieldSchema(
            name="vector",
            dtype=DataType.FLOAT_VECTOR,
            dim=dimension,
        )
        doc_field = FieldSchema(
            name="doc",
            dtype=DataType.VARCHAR,
            max_length=65535,
        )
        schema = CollectionSchema(
            fields=[pkey_field, vec_field, doc_field],
            description="Document embedding schema",
        )
        self.client.create_collection(
            collection_name=collection_name,
            schema=schema,
            metric_type="COSINE",
        )
        index_params = MilvusClient.prepare_index_params()
        index_params.add_index(
            field_name="vector",
            metric_type="COSINE",
            index_type="IVF_SQ8",
            index_name="vector_index",
            params={"nlist": 128}
        )
        self.client.create_index(
            collection_name=collection_name,
            index_params=index_params
        )
        self.collections[dimension] = collection_name

    def insert(self, embeds, doc):
        """Store one embedding vector together with its document text."""
        dim = len(embeds)
        if dim not in self.collections:
            self.init_collection(dim)
        data = [
            {
                "vector": embeds,
                "doc": doc,
            }
        ]
        self.client.insert(
            collection_name=self.collections[dim],
            data=data
        )

    def search(self, embeds, fields=None, limit=10):
        """Range search for documents similar to `embeds`.

        fields: output fields to return; defaults to ["doc"].
        Returns the hit list for the single query vector.

        Fix: the original used a mutable default argument
        (fields=["doc"]); replaced with the None-sentinel idiom.
        """
        if fields is None:
            fields = ["doc"]
        dim = len(embeds)
        if dim not in self.collections:
            self.init_collection(dim)
        coll = self.collections[dim]
        search_params = {
            "metric_type": "COSINE",
            "params": {
                # NOTE(review): radius/range_filter bound the COSINE
                # similarity range — confirm the 0.1/0.8 window is intended.
                "radius": 0.1,
                "range_filter": 0.8
            }
        }
        print("Loading...")
        self.client.load_collection(
            collection_name=coll,
        )
        print("Searching...")
        res = self.client.search(
            collection_name=coll,
            data=[embeds],
            limit=limit,
            output_fields=fields,
            search_params=search_params,
        )[0]
        # If reload time has passed, unload collection to free memory.
        if time.time() > self.next_reload:
            print("Unloading, reload at", self.next_reload)
            self.client.release_collection(
                collection_name=coll,
            )
            self.next_reload = time.time() + self.reload_time
        return res

View file

@ -0,0 +1,138 @@
from pymilvus import MilvusClient, CollectionSchema, FieldSchema, DataType
import time
class EntityVectors:
    """Graph-entity embedding store backed by Milvus.

    A separate collection (named ``<prefix>_<dim>``) is created lazily
    per embedding dimension, so no dimension is hard-coded anywhere and
    extra collections are harmless.
    """

    def __init__(self, uri="http://localhost:19530", prefix='entity'):
        self.client = MilvusClient(uri=uri)
        # dimension -> collection name, populated on first use.
        self.collections = {}
        self.prefix = prefix
        # Seconds between unload windows.
        self.reload_time = 90
        # Schedule the first unload one full period from now.
        self.next_reload = time.time() + self.reload_time
        print("Reload at", self.next_reload)

    def init_collection(self, dimension):
        """Create the collection and its vector index for `dimension`."""
        name = f"{self.prefix}_{dimension}"
        field_list = [
            FieldSchema(
                name="id", dtype=DataType.INT64,
                is_primary=True, auto_id=True,
            ),
            FieldSchema(
                name="vector", dtype=DataType.FLOAT_VECTOR, dim=dimension,
            ),
            FieldSchema(
                name="entity", dtype=DataType.VARCHAR, max_length=65535,
            ),
        ]
        self.client.create_collection(
            collection_name=name,
            schema=CollectionSchema(
                fields=field_list,
                description="Graph embedding schema",
            ),
            metric_type="COSINE",
        )
        ix = MilvusClient.prepare_index_params()
        ix.add_index(
            field_name="vector",
            metric_type="COSINE",
            index_type="IVF_SQ8",
            index_name="vector_index",
            params={"nlist": 128},
        )
        self.client.create_index(collection_name=name, index_params=ix)
        self.collections[dimension] = name

    def insert(self, embeds, entity):
        """Store one (vector, entity-label) row."""
        dim = len(embeds)
        if dim not in self.collections:
            self.init_collection(dim)
        row = {"vector": embeds, "entity": entity}
        self.client.insert(
            collection_name=self.collections[dim],
            data=[row],
        )

    def search(self, embeds, fields=["entity"], limit=10):
        """Range search for entities similar to `embeds`; returns the hit
        list for the single query vector."""
        dim = len(embeds)
        if dim not in self.collections:
            self.init_collection(dim)
        coll = self.collections[dim]
        params = {
            "metric_type": "COSINE",
            "params": {"radius": 0.1, "range_filter": 0.8},
        }
        print("Loading...")
        self.client.load_collection(collection_name=coll)
        print("Searching...")
        hits = self.client.search(
            collection_name=coll,
            data=[embeds],
            limit=limit,
            output_fields=fields,
            search_params=params,
        )[0]
        # Past the unload window: release the collection and schedule
        # the next window.
        if time.time() > self.next_reload:
            print("Unloading, reload at", self.next_reload)
            self.client.release_collection(collection_name=coll)
            self.next_reload = time.time() + self.reload_time
        return hits

View file

@ -0,0 +1,154 @@
from pymilvus import MilvusClient, CollectionSchema, FieldSchema, DataType
import time
class ObjectVectors:
    """Named-object embedding store backed by Milvus.

    Collections are keyed by the (dimension, name) pair and named
    ``<prefix>_<name>_<dim>``, created lazily on first use so no
    dimension is hard-coded anywhere.
    """

    def __init__(self, uri="http://localhost:19530", prefix='obj'):
        self.client = MilvusClient(uri=uri)
        # Strategy is to create collections per dimension. Probably only
        # going to be using 1 anyway, but that means we don't need to
        # hard-code the dimension anywhere, and no big deal if more than
        # one are created.
        self.collections = {}
        self.prefix = prefix
        # Seconds a collection stays loaded before being released.
        self.reload_time = 90
        # Next time to unload - this forces a release at the next window.
        self.next_reload = time.time() + self.reload_time
        print("Reload at", self.next_reload)

    def init_collection(self, dimension, name):
        """Create (if absent) the collection + index for (dimension, name)."""
        collection_name = self.prefix + "_" + name + "_" + str(dimension)
        pkey_field = FieldSchema(
            name="id",
            dtype=DataType.INT64,
            is_primary=True,
            auto_id=True,
        )
        vec_field = FieldSchema(
            name="vector",
            dtype=DataType.FLOAT_VECTOR,
            dim=dimension,
        )
        name_field = FieldSchema(
            name="name",
            dtype=DataType.VARCHAR,
            max_length=65535,
        )
        key_name_field = FieldSchema(
            name="key_name",
            dtype=DataType.VARCHAR,
            max_length=65535,
        )
        key_field = FieldSchema(
            name="key",
            dtype=DataType.VARCHAR,
            max_length=65535,
        )
        schema = CollectionSchema(
            fields=[
                pkey_field, vec_field, name_field, key_name_field, key_field
            ],
            description="Object embedding schema",
        )
        self.client.create_collection(
            collection_name=collection_name,
            schema=schema,
            metric_type="COSINE",
        )
        index_params = MilvusClient.prepare_index_params()
        index_params.add_index(
            field_name="vector",
            metric_type="COSINE",
            index_type="IVF_SQ8",
            index_name="vector_index",
            params={"nlist": 128}
        )
        self.client.create_index(
            collection_name=collection_name,
            index_params=index_params
        )
        self.collections[(dimension, name)] = collection_name

    def insert(self, embeds, name, key_name, key):
        """Store one embedding with its object name and key metadata."""
        dim = len(embeds)
        if (dim, name) not in self.collections:
            self.init_collection(dim, name)
        data = [
            {
                "vector": embeds,
                "name": name,
                "key_name": key_name,
                "key": key,
            }
        ]
        self.client.insert(
            collection_name=self.collections[(dim, name)],
            data=data
        )

    def search(self, embeds, name, fields=None, limit=10):
        """Range search within the collection for (len(embeds), name).

        fields: output fields to return; defaults to ["key_name", "name"].
        Returns the hit list for the single query vector.

        Fixes: (1) `self.collections` is keyed by (dimension, name)
        tuples, but the original tested `dim not in self.collections`,
        which never matched — every search re-ran collection creation.
        (2) The original mutable default argument fields=["key_name",
        "name"] is replaced with the None-sentinel idiom.
        """
        if fields is None:
            fields = ["key_name", "name"]
        dim = len(embeds)
        if (dim, name) not in self.collections:
            self.init_collection(dim, name)
        coll = self.collections[(dim, name)]
        search_params = {
            "metric_type": "COSINE",
            "params": {
                "radius": 0.1,
                "range_filter": 0.8
            }
        }
        print("Loading...")
        self.client.load_collection(
            collection_name=coll,
        )
        print("Searching...")
        res = self.client.search(
            collection_name=coll,
            data=[embeds],
            limit=limit,
            output_fields=fields,
            search_params=search_params,
        )[0]
        # If reload time has passed, unload collection to free memory.
        if time.time() > self.next_reload:
            print("Unloading, reload at", self.next_reload)
            self.client.release_collection(
                collection_name=coll,
            )
            self.next_reload = time.time() + self.reload_time
        return res