Feature/pkgsplit (#83)

* Starting to spawn base package
* More package hacking
* Bedrock and VertexAI
* Parquet split
* Updated templates
* Utils
This commit is contained in:
cybermaggedon 2024-09-30 19:36:09 +01:00 committed by GitHub
parent 3fb75c617b
commit 9b91d5eee3
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
262 changed files with 630 additions and 420 deletions

View file

@ -0,0 +1,3 @@
from . write import *

View file

@ -0,0 +1,7 @@
#!/usr/bin/env python3
from . write import run
if __name__ == '__main__':
run()

View file

@ -0,0 +1,63 @@
"""
Accepts entity/vector pairs and writes them to a Milvus store.
"""
from .... schema import ChunkEmbeddings
from .... schema import chunk_embeddings_ingest_queue
from .... log_level import LogLevel
from .... direct.milvus_doc_embeddings import DocVectors
from .... base import Consumer
module = ".".join(__name__.split(".")[1:-1])
default_input_queue = chunk_embeddings_ingest_queue
default_subscriber = module
default_store_uri = 'http://localhost:19530'
class Processor(Consumer):
def __init__(self, **params):
input_queue = params.get("input_queue", default_input_queue)
subscriber = params.get("subscriber", default_subscriber)
store_uri = params.get("store_uri", default_store_uri)
super(Processor, self).__init__(
**params | {
"input_queue": input_queue,
"subscriber": subscriber,
"input_schema": ChunkEmbeddings,
"store_uri": store_uri,
}
)
self.vecstore = DocVectors(store_uri)
def handle(self, msg):
v = msg.value()
chunk = v.chunk.decode("utf-8")
if v.chunk != "" and v.chunk is not None:
for vec in v.vectors:
self.vecstore.insert(vec, chunk)
@staticmethod
def add_args(parser):
Consumer.add_args(
parser, default_input_queue, default_subscriber,
)
parser.add_argument(
'-t', '--store-uri',
default=default_store_uri,
help=f'Milvus store URI (default: {default_store_uri})'
)
def run():
Processor.start(module, __doc__)

View file

@ -0,0 +1,3 @@
from . write import *

View file

@ -0,0 +1,7 @@
#!/usr/bin/env python3
from . write import run
if __name__ == '__main__':
run()

View file

@ -0,0 +1,104 @@
"""
Accepts entity/vector pairs and writes them to a Qdrant store.
"""
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct
from qdrant_client.models import Distance, VectorParams
import uuid
from .... schema import ChunkEmbeddings
from .... schema import chunk_embeddings_ingest_queue
from .... log_level import LogLevel
from .... base import Consumer
module = ".".join(__name__.split(".")[1:-1])
default_input_queue = chunk_embeddings_ingest_queue
default_subscriber = module
default_store_uri = 'http://localhost:6333'
class Processor(Consumer):
def __init__(self, **params):
input_queue = params.get("input_queue", default_input_queue)
subscriber = params.get("subscriber", default_subscriber)
store_uri = params.get("store_uri", default_store_uri)
super(Processor, self).__init__(
**params | {
"input_queue": input_queue,
"subscriber": subscriber,
"input_schema": ChunkEmbeddings,
"store_uri": store_uri,
}
)
self.last_collection = None
self.last_dim = None
self.client = QdrantClient(url=store_uri)
def handle(self, msg):
v = msg.value()
chunk = v.chunk.decode("utf-8")
if chunk == "": return
for vec in v.vectors:
dim = len(vec)
collection = "doc_" + str(dim)
if dim != self.last_dim:
if not self.client.collection_exists(collection):
try:
self.client.create_collection(
collection_name=collection,
vectors_config=VectorParams(
size=dim, distance=Distance.DOT
),
)
except Exception as e:
print("Qdrant collection creation failed")
raise e
self.last_collection = collection
self.last_dim = dim
self.client.upsert(
collection_name=collection,
points=[
PointStruct(
id=str(uuid.uuid4()),
vector=vec,
payload={
"doc": chunk,
}
)
]
)
@staticmethod
def add_args(parser):
Consumer.add_args(
parser, default_input_queue, default_subscriber,
)
parser.add_argument(
'-t', '--store-uri',
default=default_store_uri,
help=f'Qdrant store URI (default: {default_store_uri})'
)
def run():
Processor.start(module, __doc__)

View file

@ -0,0 +1,3 @@
from . write import *

View file

@ -0,0 +1,7 @@
#!/usr/bin/env python3
from . write import run
if __name__ == '__main__':
run()

View file

@ -0,0 +1,61 @@
"""
Accepts entity/vector pairs and writes them to a Milvus store.
"""
from .... schema import GraphEmbeddings
from .... schema import graph_embeddings_store_queue
from .... log_level import LogLevel
from .... direct.milvus_graph_embeddings import EntityVectors
from .... base import Consumer
module = ".".join(__name__.split(".")[1:-1])
default_input_queue = graph_embeddings_store_queue
default_subscriber = module
default_store_uri = 'http://localhost:19530'
class Processor(Consumer):
def __init__(self, **params):
input_queue = params.get("input_queue", default_input_queue)
subscriber = params.get("subscriber", default_subscriber)
store_uri = params.get("store_uri", default_store_uri)
super(Processor, self).__init__(
**params | {
"input_queue": input_queue,
"subscriber": subscriber,
"input_schema": GraphEmbeddings,
"store_uri": store_uri,
}
)
self.vecstore = EntityVectors(store_uri)
def handle(self, msg):
v = msg.value()
if v.entity.value != "":
for vec in v.vectors:
self.vecstore.insert(vec, v.entity.value)
@staticmethod
def add_args(parser):
Consumer.add_args(
parser, default_input_queue, default_subscriber,
)
parser.add_argument(
'-t', '--store-uri',
default=default_store_uri,
help=f'Milvus store URI (default: {default_store_uri})'
)
def run():
Processor.start(module, __doc__)

View file

@ -0,0 +1,3 @@
from . write import *

View file

@ -0,0 +1,7 @@
#!/usr/bin/env python3
from . write import run
if __name__ == '__main__':
run()

View file

@ -0,0 +1,102 @@
"""
Accepts entity/vector pairs and writes them to a Qdrant store.
"""
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct
from qdrant_client.models import Distance, VectorParams
import uuid
from .... schema import GraphEmbeddings
from .... schema import graph_embeddings_store_queue
from .... log_level import LogLevel
from .... base import Consumer
module = ".".join(__name__.split(".")[1:-1])
default_input_queue = graph_embeddings_store_queue
default_subscriber = module
default_store_uri = 'http://localhost:6333'
class Processor(Consumer):
def __init__(self, **params):
input_queue = params.get("input_queue", default_input_queue)
subscriber = params.get("subscriber", default_subscriber)
store_uri = params.get("store_uri", default_store_uri)
super(Processor, self).__init__(
**params | {
"input_queue": input_queue,
"subscriber": subscriber,
"input_schema": GraphEmbeddings,
"store_uri": store_uri,
}
)
self.last_collection = None
self.last_dim = None
self.client = QdrantClient(url=store_uri)
def handle(self, msg):
v = msg.value()
if v.entity.value == "" or v.entity.value is None: return
for vec in v.vectors:
dim = len(vec)
collection = "triples_" + str(dim)
if dim != self.last_dim:
if not self.client.collection_exists(collection):
try:
self.client.create_collection(
collection_name=collection,
vectors_config=VectorParams(
size=dim, distance=Distance.DOT
),
)
except Exception as e:
print("Qdrant collection creation failed")
raise e
self.last_collection = collection
self.last_dim = dim
self.client.upsert(
collection_name=collection,
points=[
PointStruct(
id=str(uuid.uuid4()),
vector=vec,
payload={
"entity": v.entity.value,
}
)
]
)
@staticmethod
def add_args(parser):
Consumer.add_args(
parser, default_input_queue, default_subscriber,
)
parser.add_argument(
'-t', '--store-uri',
default=default_store_uri,
help=f'Qdrant store URI (default: {default_store_uri})'
)
def run():
Processor.start(module, __doc__)

View file

@ -0,0 +1,3 @@
from . write import *

View file

@ -0,0 +1,7 @@
#!/usr/bin/env python3
from . write import run
if __name__ == '__main__':
run()

View file

@ -0,0 +1,61 @@
"""
Accepts entity/vector pairs and writes them to a Milvus store.
"""
from .... schema import ObjectEmbeddings
from .... schema import object_embeddings_store_queue
from .... log_level import LogLevel
from .... direct.milvus_object_embeddings import ObjectVectors
from .... base import Consumer
module = ".".join(__name__.split(".")[1:-1])
default_input_queue = object_embeddings_store_queue
default_subscriber = module
default_store_uri = 'http://localhost:19530'
class Processor(Consumer):
def __init__(self, **params):
input_queue = params.get("input_queue", default_input_queue)
subscriber = params.get("subscriber", default_subscriber)
store_uri = params.get("store_uri", default_store_uri)
super(Processor, self).__init__(
**params | {
"input_queue": input_queue,
"subscriber": subscriber,
"input_schema": ObjectEmbeddings,
"store_uri": store_uri,
}
)
self.vecstore = ObjectVectors(store_uri)
def handle(self, msg):
v = msg.value()
if v.id != "" and v.id is not None:
for vec in v.vectors:
self.vecstore.insert(vec, v.name, v.key_name, v.id)
@staticmethod
def add_args(parser):
Consumer.add_args(
parser, default_input_queue, default_subscriber,
)
parser.add_argument(
'-t', '--store-uri',
default=default_store_uri,
help=f'Milvus store URI (default: {default_store_uri})'
)
def run():
Processor.start(module, __doc__)

View file

@ -0,0 +1,3 @@
from . write import *

View file

@ -0,0 +1,7 @@
#!/usr/bin/env python3
from . write import run
if __name__ == '__main__':
run()

View file

@ -0,0 +1,127 @@
"""
Graph writer. Input is graph edge. Writes edges to Cassandra graph.
"""
import pulsar
import base64
import os
import argparse
import time
from cassandra.cluster import Cluster
from cassandra.auth import PlainTextAuthProvider
from .... schema import Rows
from .... schema import rows_store_queue
from .... log_level import LogLevel
from .... base import Consumer
module = ".".join(__name__.split(".")[1:-1])
default_input_queue = rows_store_queue
default_subscriber = module
default_graph_host='localhost'
class Processor(Consumer):
def __init__(self, **params):
input_queue = params.get("input_queue", default_input_queue)
subscriber = params.get("subscriber", default_subscriber)
graph_host = params.get("graph_host", default_graph_host)
super(Processor, self).__init__(
**params | {
"input_queue": input_queue,
"subscriber": subscriber,
"input_schema": Rows,
"graph_host": graph_host,
}
)
self.cluster = Cluster(graph_host.split(","))
self.session = self.cluster.connect()
self.tables = set()
self.session.execute("""
create keyspace if not exists trustgraph
with replication = {
'class' : 'SimpleStrategy',
'replication_factor' : 1
};
""");
self.session.execute("use trustgraph");
def handle(self, msg):
try:
v = msg.value()
name = v.row_schema.name
if name not in self.tables:
# FIXME: SQL injection?
pkey = []
stmt = "create table if not exists " + name + " ( "
for field in v.row_schema.fields:
stmt += field.name + " text, "
if field.primary:
pkey.append(field.name)
stmt += "PRIMARY KEY (" + ", ".join(pkey) + "));"
self.session.execute(stmt)
self.tables.add(name);
for row in v.rows:
field_names = []
values = []
for field in v.row_schema.fields:
field_names.append(field.name)
values.append(row[field.name])
# FIXME: SQL injection?
stmt = (
"insert into " + name + " (" + ", ".join(field_names) +
") values (" + ",".join(["%s"] * len(values)) + ")"
)
self.session.execute(stmt, values)
except Exception as e:
print("Exception:", str(e), flush=True)
# If there's an error make sure to do table creation etc.
self.tables.remove(name)
raise e
@staticmethod
def add_args(parser):
Consumer.add_args(
parser, default_input_queue, default_subscriber,
)
parser.add_argument(
'-g', '--graph-host',
default="localhost",
help=f'Graph host (default: localhost)'
)
def run():
Processor.start(module, __doc__)

View file

@ -0,0 +1,3 @@
from . write import *

View file

@ -0,0 +1,7 @@
#!/usr/bin/env python3
from . write import run
if __name__ == '__main__':
run()

View file

@ -0,0 +1,69 @@
"""
Graph writer. Input is graph edge. Writes edges to Cassandra graph.
"""
import pulsar
import base64
import os
import argparse
import time
from .... direct.cassandra import TrustGraph
from .... schema import Triple
from .... schema import triples_store_queue
from .... log_level import LogLevel
from .... base import Consumer
module = ".".join(__name__.split(".")[1:-1])
default_input_queue = triples_store_queue
default_subscriber = module
default_graph_host='localhost'
class Processor(Consumer):
def __init__(self, **params):
input_queue = params.get("input_queue", default_input_queue)
subscriber = params.get("subscriber", default_subscriber)
graph_host = params.get("graph_host", default_graph_host)
super(Processor, self).__init__(
**params | {
"input_queue": input_queue,
"subscriber": subscriber,
"input_schema": Triple,
"graph_host": graph_host,
}
)
self.tg = TrustGraph([graph_host])
def handle(self, msg):
v = msg.value()
self.tg.insert(
v.s.value,
v.p.value,
v.o.value
)
@staticmethod
def add_args(parser):
Consumer.add_args(
parser, default_input_queue, default_subscriber,
)
parser.add_argument(
'-g', '--graph-host',
default="localhost",
help=f'Graph host (default: localhost)'
)
def run():
Processor.start(module, __doc__)

View file

@ -0,0 +1,3 @@
from . write import *

View file

@ -0,0 +1,7 @@
#!/usr/bin/env python3
from . write import run
if __name__ == '__main__':
run()

View file

@ -0,0 +1,156 @@
"""
Graph writer. Input is graph edge. Writes edges to Cassandra graph.
"""
import pulsar
import base64
import os
import argparse
import time
from neo4j import GraphDatabase
from .... schema import Triple
from .... schema import triples_store_queue
from .... log_level import LogLevel
from .... base import Consumer
module = ".".join(__name__.split(".")[1:-1])
default_input_queue = triples_store_queue
default_subscriber = module
default_graph_host = 'bolt://neo4j:7687'
default_username = 'neo4j'
default_password = 'password'
class Processor(Consumer):
def __init__(self, **params):
input_queue = params.get("input_queue", default_input_queue)
subscriber = params.get("subscriber", default_subscriber)
graph_host = params.get("graph_host", default_graph_host)
username = params.get("username", default_username)
password = params.get("passowrd", default_password)
super(Processor, self).__init__(
**params | {
"input_queue": input_queue,
"subscriber": subscriber,
"input_schema": Triple,
"graph_host": graph_host,
}
)
self.db = "neo4j"
self.io = GraphDatabase.driver(graph_host, auth=(username, password))
def create_node(self, uri):
print("Create node", uri)
summary = self.io.execute_query(
"MERGE (n:Node {uri: $uri})",
uri=uri,
database_=self.db,
).summary
print("Created {nodes_created} nodes in {time} ms.".format(
nodes_created=summary.counters.nodes_created,
time=summary.result_available_after
))
def create_literal(self, value):
print("Create literal", value)
summary = self.io.execute_query(
"MERGE (n:Literal {value: $value})",
value=value,
database_=self.db,
).summary
print("Created {nodes_created} nodes in {time} ms.".format(
nodes_created=summary.counters.nodes_created,
time=summary.result_available_after
))
def relate_node(self, src, uri, dest):
print("Create node rel", src, uri, dest)
summary = self.io.execute_query(
"MATCH (src:Node {uri: $src}) "
"MATCH (dest:Node {uri: $dest}) "
"MERGE (src)-[:Rel {uri: $uri}]->(dest)",
src=src, dest=dest, uri=uri,
database_=self.db,
).summary
print("Created {nodes_created} nodes in {time} ms.".format(
nodes_created=summary.counters.nodes_created,
time=summary.result_available_after
))
def relate_literal(self, src, uri, dest):
print("Create literal rel", src, uri, dest)
summary = self.io.execute_query(
"MATCH (src:Node {uri: $src}) "
"MATCH (dest:Literal {value: $dest}) "
"MERGE (src)-[:Rel {uri: $uri}]->(dest)",
src=src, dest=dest, uri=uri,
database_=self.db,
).summary
print("Created {nodes_created} nodes in {time} ms.".format(
nodes_created=summary.counters.nodes_created,
time=summary.result_available_after
))
def handle(self, msg):
v = msg.value()
self.create_node(v.s.value)
if v.o.is_uri:
self.create_node(v.o.value)
self.relate_node(v.s.value, v.p.value, v.o.value)
else:
self.create_literal(v.o.value)
self.relate_literal(v.s.value, v.p.value, v.o.value)
@staticmethod
def add_args(parser):
Consumer.add_args(
parser, default_input_queue, default_subscriber,
)
parser.add_argument(
'-g', '--graph_host',
default=default_graph_host,
help=f'Graph host (default: {default_graph_host})'
)
parser.add_argument(
'--username',
default=default_username,
help=f'Neo4j username (default: {default_username})'
)
parser.add_argument(
'--password',
default=default_password,
help=f'Neo4j password (default: {default_password})'
)
def run():
Processor.start(module, __doc__)