mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-05-29 17:25:15 +02:00
Dump GEs
This commit is contained in:
parent
9bcdee0f64
commit
7d864fe370
7 changed files with 198 additions and 0 deletions
6
scripts/ge-dump-parquet
Executable file
6
scripts/ge-dump-parquet
Executable file
|
|
@ -0,0 +1,6 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
from trustgraph.dump.graph_embeddings.parquet import run
|
||||||
|
|
||||||
|
run()
|
||||||
|
|
||||||
1
setup.py
1
setup.py
|
|
@ -69,5 +69,6 @@ setuptools.setup(
|
||||||
"scripts/triples-write-cassandra",
|
"scripts/triples-write-cassandra",
|
||||||
"scripts/dump-parquet",
|
"scripts/dump-parquet",
|
||||||
"scripts/triples-dump-parquet",
|
"scripts/triples-dump-parquet",
|
||||||
|
"scripts/ge-dump-parquet",
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
|
|
||||||
0
trustgraph/dump/graph_embeddings/__init__.py
Normal file
0
trustgraph/dump/graph_embeddings/__init__.py
Normal file
3
trustgraph/dump/graph_embeddings/parquet/__init__.py
Normal file
3
trustgraph/dump/graph_embeddings/parquet/__init__.py
Normal file
|
|
@ -0,0 +1,3 @@
|
||||||
|
|
||||||
|
from . processor import *
|
||||||
|
|
||||||
7
trustgraph/dump/graph_embeddings/parquet/__main__.py
Executable file
7
trustgraph/dump/graph_embeddings/parquet/__main__.py
Executable file
|
|
@ -0,0 +1,7 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
from . write import run
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
run()
|
||||||
|
|
||||||
87
trustgraph/dump/graph_embeddings/parquet/processor.py
Executable file
87
trustgraph/dump/graph_embeddings/parquet/processor.py
Executable file
|
|
@ -0,0 +1,87 @@
|
||||||
|
|
||||||
|
"""
|
||||||
|
Write graph embeddings to parquet files in a directory.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import pulsar
|
||||||
|
import base64
|
||||||
|
import os
|
||||||
|
import argparse
|
||||||
|
import time
|
||||||
|
|
||||||
|
from .... trustgraph import TrustGraph
|
||||||
|
from .... schema import GraphEmbeddings
|
||||||
|
from .... schema import graph_embeddings_store_queue
|
||||||
|
from .... log_level import LogLevel
|
||||||
|
from .... base import Consumer
|
||||||
|
|
||||||
|
from . writer import ParquetWriter
|
||||||
|
|
||||||
|
module = ".".join(__name__.split(".")[1:-1])
|
||||||
|
|
||||||
|
default_input_queue = graph_embeddings_store_queue
|
||||||
|
default_subscriber = module
|
||||||
|
default_graph_host='localhost'
|
||||||
|
default_directory = "."
|
||||||
|
default_file_template = "graph-embeds-{id}.parquet"
|
||||||
|
default_rotation_time = 60
|
||||||
|
|
||||||
|
class Processor(Consumer):
|
||||||
|
|
||||||
|
def __init__(self, **params):
|
||||||
|
|
||||||
|
input_queue = params.get("input_queue", default_input_queue)
|
||||||
|
subscriber = params.get("subscriber", default_subscriber)
|
||||||
|
directory = params.get("directory", default_directory)
|
||||||
|
file_template = params.get("file_template", default_file_template)
|
||||||
|
rotation_time = params.get("rotation_time", default_rotation_time)
|
||||||
|
|
||||||
|
super(Processor, self).__init__(
|
||||||
|
**params | {
|
||||||
|
"input_queue": input_queue,
|
||||||
|
"subscriber": subscriber,
|
||||||
|
"input_schema": GraphEmbeddings,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
self.writer = ParquetWriter(directory, file_template, rotation_time)
|
||||||
|
|
||||||
|
def __del__(self):
|
||||||
|
if hasattr(self, "writer"):
|
||||||
|
del self.writer
|
||||||
|
|
||||||
|
def handle(self, msg):
|
||||||
|
|
||||||
|
v = msg.value()
|
||||||
|
self.writer.write(v.vectors, v.entity.value)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def add_args(parser):
|
||||||
|
|
||||||
|
Consumer.add_args(
|
||||||
|
parser, default_input_queue, default_subscriber,
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
'-d', '--directory',
|
||||||
|
default=default_directory,
|
||||||
|
help=f'Directory to write to (default: {default_directory})'
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
'-f', '--file-template',
|
||||||
|
default=default_file_template,
|
||||||
|
help=f'Directory to write to (default: {default_file_template})'
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
'-t', '--rotation-time',
|
||||||
|
type=int,
|
||||||
|
default=default_rotation_time,
|
||||||
|
help=f'Rotation time / seconds (default: {default_rotation_time})'
|
||||||
|
)
|
||||||
|
|
||||||
|
def run():
|
||||||
|
|
||||||
|
Processor.start(module, __doc__)
|
||||||
|
|
||||||
94
trustgraph/dump/graph_embeddings/parquet/writer.py
Normal file
94
trustgraph/dump/graph_embeddings/parquet/writer.py
Normal file
|
|
@ -0,0 +1,94 @@
|
||||||
|
|
||||||
|
import threading
|
||||||
|
import queue
|
||||||
|
import time
|
||||||
|
import uuid
|
||||||
|
import pyarrow as pa
|
||||||
|
import pyarrow.parquet as pq
|
||||||
|
|
||||||
|
class ParquetWriter:
|
||||||
|
|
||||||
|
def __init__(self, directory, file_template, rotation_time):
|
||||||
|
self.directory = directory
|
||||||
|
self.file_template = file_template
|
||||||
|
self.rotation_time = rotation_time
|
||||||
|
|
||||||
|
self.q = queue.Queue()
|
||||||
|
|
||||||
|
self.running = True
|
||||||
|
|
||||||
|
self.thread = threading.Thread(target=(self.writer_thread))
|
||||||
|
self.thread.start()
|
||||||
|
|
||||||
|
def writer_thread(self):
|
||||||
|
|
||||||
|
items = []
|
||||||
|
|
||||||
|
timeout = None
|
||||||
|
|
||||||
|
while self.running:
|
||||||
|
|
||||||
|
try:
|
||||||
|
|
||||||
|
item = self.q.get(timeout=1)
|
||||||
|
|
||||||
|
if timeout == None:
|
||||||
|
timeout = time.time() + self.rotation_time
|
||||||
|
|
||||||
|
items.append(item)
|
||||||
|
|
||||||
|
except queue.Empty:
|
||||||
|
pass
|
||||||
|
|
||||||
|
if timeout:
|
||||||
|
if time.time() > timeout:
|
||||||
|
|
||||||
|
self.write_file(items)
|
||||||
|
timeout = None
|
||||||
|
items = []
|
||||||
|
|
||||||
|
def write_file(self, items):
|
||||||
|
|
||||||
|
try:
|
||||||
|
|
||||||
|
schema = pa.schema([
|
||||||
|
pa.field('embeddings', pa.list_(pa.list_(pa.float64()))),
|
||||||
|
pa.field('entity', pa.string()),
|
||||||
|
])
|
||||||
|
|
||||||
|
fname = self.file_template.format(id=str(uuid.uuid4()))
|
||||||
|
path = f"{self.directory}/{fname}"
|
||||||
|
|
||||||
|
writer = pq.ParquetWriter(path, schema)
|
||||||
|
|
||||||
|
batch = pa.record_batch(
|
||||||
|
[
|
||||||
|
[i[0] for i in items],
|
||||||
|
[i[1] for i in items],
|
||||||
|
],
|
||||||
|
names=['embeddings', 'entity']
|
||||||
|
)
|
||||||
|
|
||||||
|
writer.write_batch(batch)
|
||||||
|
|
||||||
|
writer.close()
|
||||||
|
|
||||||
|
print(f"Wrote {path}.")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
|
||||||
|
print("Parquet write:", e)
|
||||||
|
|
||||||
|
def write(self, embeds, ent):
|
||||||
|
self.q.put((embeds, ent))
|
||||||
|
|
||||||
|
def __del__(self):
|
||||||
|
|
||||||
|
self.running = False
|
||||||
|
|
||||||
|
if hasattr(self, "q"):
|
||||||
|
self.thread.join()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue