Feature/pkgsplit (#83)

* Starting to spawn base package
* More package hacking
* Bedrock and VertexAI
* Parquet split
* Updated templates
* Utils
This commit is contained in:
cybermaggedon 2024-09-30 19:36:09 +01:00 committed by GitHub
parent 3fb75c617b
commit 9b91d5eee3
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
262 changed files with 630 additions and 420 deletions

View file

@ -0,0 +1,108 @@
from cassandra.cluster import Cluster
from cassandra.auth import PlainTextAuthProvider
class TrustGraph:
    """RDF-style triple store (subject, predicate, object) on Cassandra.

    Triples live in keyspace ``trustgraph``, table ``triples`` with
    PRIMARY KEY (s, p, o): ``s`` is the partition key, ``p`` and ``o``
    clustering columns.  Secondary indexes on ``p`` and ``o`` back the
    single-term lookup methods.
    """

    def __init__(self, hosts=None):
        """Connect to the cluster and ensure the schema exists.

        hosts: list of Cassandra contact points; defaults to ["localhost"].
        """
        if hosts is None:
            hosts = ["localhost"]
        self.cluster = Cluster(hosts)
        self.session = self.cluster.connect()
        self.init()

    def clear(self):
        """Drop the whole keyspace and recreate an empty schema."""
        self.session.execute("""
            drop keyspace if exists trustgraph;
        """)
        self.init()

    def init(self):
        """Create keyspace, table and indexes if they do not already exist."""
        self.session.execute("""
            create keyspace if not exists trustgraph
            with replication = {
                'class' : 'SimpleStrategy',
                'replication_factor' : 1
            };
        """)
        self.session.set_keyspace('trustgraph')
        self.session.execute("""
            create table if not exists triples (
                s text,
                p text,
                o text,
                PRIMARY KEY (s, p, o)
            );
        """)
        self.session.execute("""
            create index if not exists triples_p
            ON triples (p);
        """)
        self.session.execute("""
            create index if not exists triples_o
            ON triples (o);
        """)

    def insert(self, s, p, o):
        """Insert one (s, p, o) triple; re-inserting is an idempotent upsert."""
        self.session.execute(
            "insert into triples (s, p, o) values (%s, %s, %s)",
            (s, p, o)
        )

    def get_all(self, limit=50):
        """Return up to `limit` triples, unordered across partitions."""
        # int() guards the f-string interpolation: limit is not a CQL bind
        # parameter, so a non-integer value would otherwise be injected
        # verbatim into the statement.
        return self.session.execute(
            f"select s, p, o from triples limit {int(limit)}"
        )

    def get_s(self, s, limit=10):
        """All (p, o) pairs for subject `s` (partition-key lookup)."""
        return self.session.execute(
            f"select p, o from triples where s = %s limit {int(limit)}",
            (s,)
        )

    def get_p(self, p, limit=10):
        """All (s, o) pairs for predicate `p` (uses index triples_p)."""
        return self.session.execute(
            f"select s, o from triples where p = %s limit {int(limit)}",
            (p,)
        )

    def get_o(self, o, limit=10):
        """All (s, p) pairs for object `o` (uses index triples_o)."""
        return self.session.execute(
            f"select s, p from triples where o = %s limit {int(limit)}",
            (o,)
        )

    def get_sp(self, s, p, limit=10):
        """Objects for a (s, p) pair — a clustering-prefix query."""
        return self.session.execute(
            f"select o from triples where s = %s and p = %s limit {int(limit)}",
            (s, p)
        )

    def get_po(self, p, o, limit=10):
        """Subjects for a (p, o) pair.

        Fix: CQL requires ALLOW FILTERING to appear *after* the LIMIT
        clause; the original "allow filtering limit N" ordering is a
        CQL syntax error and the query could never execute.
        """
        return self.session.execute(
            f"select s from triples where p = %s and o = %s "
            f"limit {int(limit)} allow filtering",
            (p, o)
        )

    def get_os(self, o, s, limit=10):
        """Predicates for an (o, s) pair.

        NOTE(review): restricting o without p skips a clustering column;
        this presumably relies on the triples_o index plus the partition
        key — confirm against a live cluster.
        """
        return self.session.execute(
            f"select p from triples where o = %s and s = %s limit {int(limit)}",
            (o, s)
        )

    def get_spo(self, s, p, o, limit=10):
        """Existence check: returns `s` (aliased x) if the exact triple is present."""
        return self.session.execute(
            f"""select s as x from triples where s = %s and p = %s and o = %s limit {int(limit)}""",
            (s, p, o)
        )

View file

@ -0,0 +1,138 @@
from pymilvus import MilvusClient, CollectionSchema, FieldSchema, DataType
import time
class DocVectors:
    """Document embedding store backed by Milvus.

    One collection is created lazily per embedding dimension, named
    ``<prefix>_<dim>``, so the dimension never needs to be configured
    up front and creating more than one collection is harmless.
    """

    def __init__(self, uri="http://localhost:19530", prefix='doc'):
        self.client = MilvusClient(uri=uri)
        # Strategy is to create collections per dimension. Probably only
        # going to be using 1 anyway, but that means we don't need to
        # hard-code the dimension anywhere, and no big deal if more than
        # one are created.
        self.collections = {}
        self.prefix = prefix
        # Seconds a collection stays loaded before being released.
        self.reload_time = 90
        # Next time to unload - this forces a release at the next window.
        self.next_reload = time.time() + self.reload_time
        print("Reload at", self.next_reload)

    def init_collection(self, dimension):
        """Create (if absent) the collection and vector index for `dimension`."""
        collection_name = self.prefix + "_" + str(dimension)
        pkey_field = FieldSchema(
            name="id",
            dtype=DataType.INT64,
            is_primary=True,
            auto_id=True,
        )
        vec_field = FieldSchema(
            name="vector",
            dtype=DataType.FLOAT_VECTOR,
            dim=dimension,
        )
        doc_field = FieldSchema(
            name="doc",
            dtype=DataType.VARCHAR,
            max_length=65535,
        )
        schema = CollectionSchema(
            fields=[pkey_field, vec_field, doc_field],
            description="Document embedding schema",
        )
        self.client.create_collection(
            collection_name=collection_name,
            schema=schema,
            metric_type="COSINE",
        )
        index_params = MilvusClient.prepare_index_params()
        index_params.add_index(
            field_name="vector",
            metric_type="COSINE",
            index_type="IVF_SQ8",
            index_name="vector_index",
            params={"nlist": 128}
        )
        self.client.create_index(
            collection_name=collection_name,
            index_params=index_params
        )
        self.collections[dimension] = collection_name

    def insert(self, embeds, doc):
        """Store one embedding vector together with its document text."""
        dim = len(embeds)
        if dim not in self.collections:
            self.init_collection(dim)
        data = [
            {
                "vector": embeds,
                "doc": doc,
            }
        ]
        self.client.insert(
            collection_name=self.collections[dim],
            data=data
        )

    def search(self, embeds, fields=None, limit=10):
        """Range search for documents similar to `embeds`.

        fields: output fields to return; defaults to ["doc"].
        Returns the hit list for the single query vector.

        Fix: the original used a mutable default argument
        (fields=["doc"]); replaced with the None-sentinel idiom.
        """
        if fields is None:
            fields = ["doc"]
        dim = len(embeds)
        if dim not in self.collections:
            self.init_collection(dim)
        coll = self.collections[dim]
        search_params = {
            "metric_type": "COSINE",
            "params": {
                # NOTE(review): radius/range_filter bound the COSINE
                # similarity range — confirm the 0.1/0.8 window is intended.
                "radius": 0.1,
                "range_filter": 0.8
            }
        }
        print("Loading...")
        self.client.load_collection(
            collection_name=coll,
        )
        print("Searching...")
        res = self.client.search(
            collection_name=coll,
            data=[embeds],
            limit=limit,
            output_fields=fields,
            search_params=search_params,
        )[0]
        # If reload time has passed, unload collection to free memory.
        if time.time() > self.next_reload:
            print("Unloading, reload at", self.next_reload)
            self.client.release_collection(
                collection_name=coll,
            )
            self.next_reload = time.time() + self.reload_time
        return res

View file

@ -0,0 +1,138 @@
from pymilvus import MilvusClient, CollectionSchema, FieldSchema, DataType
import time
class EntityVectors:
    """Graph-entity embedding store backed by Milvus.

    A separate collection (named ``<prefix>_<dim>``) is created lazily
    per embedding dimension, so no dimension is hard-coded anywhere and
    extra collections are harmless.
    """

    def __init__(self, uri="http://localhost:19530", prefix='entity'):
        self.client = MilvusClient(uri=uri)
        # dimension -> collection name, populated on first use.
        self.collections = {}
        self.prefix = prefix
        # Seconds between unload windows.
        self.reload_time = 90
        # Schedule the first unload one full period from now.
        self.next_reload = time.time() + self.reload_time
        print("Reload at", self.next_reload)

    def init_collection(self, dimension):
        """Create the collection and its vector index for `dimension`."""
        name = f"{self.prefix}_{dimension}"
        field_list = [
            FieldSchema(
                name="id", dtype=DataType.INT64,
                is_primary=True, auto_id=True,
            ),
            FieldSchema(
                name="vector", dtype=DataType.FLOAT_VECTOR, dim=dimension,
            ),
            FieldSchema(
                name="entity", dtype=DataType.VARCHAR, max_length=65535,
            ),
        ]
        self.client.create_collection(
            collection_name=name,
            schema=CollectionSchema(
                fields=field_list,
                description="Graph embedding schema",
            ),
            metric_type="COSINE",
        )
        ix = MilvusClient.prepare_index_params()
        ix.add_index(
            field_name="vector",
            metric_type="COSINE",
            index_type="IVF_SQ8",
            index_name="vector_index",
            params={"nlist": 128},
        )
        self.client.create_index(collection_name=name, index_params=ix)
        self.collections[dimension] = name

    def insert(self, embeds, entity):
        """Store one (vector, entity-label) row."""
        dim = len(embeds)
        if dim not in self.collections:
            self.init_collection(dim)
        row = {"vector": embeds, "entity": entity}
        self.client.insert(
            collection_name=self.collections[dim],
            data=[row],
        )

    def search(self, embeds, fields=["entity"], limit=10):
        """Range search for entities similar to `embeds`; returns the hit
        list for the single query vector."""
        dim = len(embeds)
        if dim not in self.collections:
            self.init_collection(dim)
        coll = self.collections[dim]
        params = {
            "metric_type": "COSINE",
            "params": {"radius": 0.1, "range_filter": 0.8},
        }
        print("Loading...")
        self.client.load_collection(collection_name=coll)
        print("Searching...")
        hits = self.client.search(
            collection_name=coll,
            data=[embeds],
            limit=limit,
            output_fields=fields,
            search_params=params,
        )[0]
        # Past the unload window: release the collection and schedule
        # the next window.
        if time.time() > self.next_reload:
            print("Unloading, reload at", self.next_reload)
            self.client.release_collection(collection_name=coll)
            self.next_reload = time.time() + self.reload_time
        return hits

View file

@ -0,0 +1,154 @@
from pymilvus import MilvusClient, CollectionSchema, FieldSchema, DataType
import time
class ObjectVectors:
    """Named-object embedding store backed by Milvus.

    Collections are keyed by the (dimension, name) pair and named
    ``<prefix>_<name>_<dim>``, created lazily on first use so no
    dimension is hard-coded anywhere.
    """

    def __init__(self, uri="http://localhost:19530", prefix='obj'):
        self.client = MilvusClient(uri=uri)
        # Strategy is to create collections per dimension. Probably only
        # going to be using 1 anyway, but that means we don't need to
        # hard-code the dimension anywhere, and no big deal if more than
        # one are created.
        self.collections = {}
        self.prefix = prefix
        # Seconds a collection stays loaded before being released.
        self.reload_time = 90
        # Next time to unload - this forces a release at the next window.
        self.next_reload = time.time() + self.reload_time
        print("Reload at", self.next_reload)

    def init_collection(self, dimension, name):
        """Create (if absent) the collection + index for (dimension, name)."""
        collection_name = self.prefix + "_" + name + "_" + str(dimension)
        pkey_field = FieldSchema(
            name="id",
            dtype=DataType.INT64,
            is_primary=True,
            auto_id=True,
        )
        vec_field = FieldSchema(
            name="vector",
            dtype=DataType.FLOAT_VECTOR,
            dim=dimension,
        )
        name_field = FieldSchema(
            name="name",
            dtype=DataType.VARCHAR,
            max_length=65535,
        )
        key_name_field = FieldSchema(
            name="key_name",
            dtype=DataType.VARCHAR,
            max_length=65535,
        )
        key_field = FieldSchema(
            name="key",
            dtype=DataType.VARCHAR,
            max_length=65535,
        )
        schema = CollectionSchema(
            fields=[
                pkey_field, vec_field, name_field, key_name_field, key_field
            ],
            description="Object embedding schema",
        )
        self.client.create_collection(
            collection_name=collection_name,
            schema=schema,
            metric_type="COSINE",
        )
        index_params = MilvusClient.prepare_index_params()
        index_params.add_index(
            field_name="vector",
            metric_type="COSINE",
            index_type="IVF_SQ8",
            index_name="vector_index",
            params={"nlist": 128}
        )
        self.client.create_index(
            collection_name=collection_name,
            index_params=index_params
        )
        self.collections[(dimension, name)] = collection_name

    def insert(self, embeds, name, key_name, key):
        """Store one embedding with its object name and key metadata."""
        dim = len(embeds)
        if (dim, name) not in self.collections:
            self.init_collection(dim, name)
        data = [
            {
                "vector": embeds,
                "name": name,
                "key_name": key_name,
                "key": key,
            }
        ]
        self.client.insert(
            collection_name=self.collections[(dim, name)],
            data=data
        )

    def search(self, embeds, name, fields=None, limit=10):
        """Range search within the collection for (len(embeds), name).

        fields: output fields to return; defaults to ["key_name", "name"].
        Returns the hit list for the single query vector.

        Fixes: (1) `self.collections` is keyed by (dimension, name)
        tuples, but the original tested `dim not in self.collections`,
        which never matched — every search re-ran collection creation.
        (2) The original mutable default argument fields=["key_name",
        "name"] is replaced with the None-sentinel idiom.
        """
        if fields is None:
            fields = ["key_name", "name"]
        dim = len(embeds)
        if (dim, name) not in self.collections:
            self.init_collection(dim, name)
        coll = self.collections[(dim, name)]
        search_params = {
            "metric_type": "COSINE",
            "params": {
                "radius": 0.1,
                "range_filter": 0.8
            }
        }
        print("Loading...")
        self.client.load_collection(
            collection_name=coll,
        )
        print("Searching...")
        res = self.client.search(
            collection_name=coll,
            data=[embeds],
            limit=limit,
            output_fields=fields,
            search_params=search_params,
        )[0]
        # If reload time has passed, unload collection to free memory.
        if time.time() > self.next_reload:
            print("Unloading, reload at", self.next_reload)
            self.client.release_collection(
                collection_name=coll,
            )
            self.next_reload = time.time() + self.reload_time
        return res