Feature/pkgsplit (#83)

* Starting to spawn base package
* More package hacking
* Bedrock and VertexAI
* Parquet split
* Updated templates
* Utils
This commit is contained in:
cybermaggedon 2024-09-30 19:36:09 +01:00 committed by GitHub
parent 3fb75c617b
commit 9b91d5eee3
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
262 changed files with 630 additions and 420 deletions

View file

@ -0,0 +1,85 @@
"""
Write graphs triples to parquet files in a directory.
"""
import pulsar
import base64
import os
import argparse
import time
from .... schema import Triple
from .... schema import triples_store_queue
from .... base import Consumer
from . writer import ParquetWriter
# Identifier for this processor, derived from the module's dotted import
# path with its first and last components dropped.
module = ".".join(__name__.split(".")[1:-1])
# Queue to consume from (name supplied by the shared schema package) and the
# subscriber name, which defaults to the identifier computed above.
default_input_queue = triples_store_queue
default_subscriber = module
# NOTE(review): not referenced anywhere in the visible part of this file --
# confirm whether anything else imports it before removing.
default_graph_host='localhost'
# Output defaults handed to ParquetWriter: target directory, file name
# template and rotation time (seconds, per the --rotation-time help text).
default_directory = "."
default_file_template = "triples-{id}.parquet"
default_rotation_time = 60
class Processor(Consumer):
    """Consumer that hands every received triple to a ``ParquetWriter``.

    Messages are read from the input queue using the ``Triple`` schema; the
    subject, predicate and object values of each message are passed to the
    writer, which is configured with an output directory, a file name
    template and a rotation time.
    """

    def __init__(self, **params):
        """Configure the queue subscription and create the writer.

        Keys read from ``params`` (each falls back to the matching
        module-level ``default_*`` value when absent): ``input_queue``,
        ``subscriber``, ``directory``, ``file_template`` and
        ``rotation_time``.  Everything in ``params`` is also forwarded to
        ``Consumer.__init__``, with ``input_queue``, ``subscriber`` and
        ``input_schema`` overridden by the values resolved here.
        """
        input_queue = params.get("input_queue", default_input_queue)
        subscriber = params.get("subscriber", default_subscriber)
        directory = params.get("directory", default_directory)
        file_template = params.get("file_template", default_file_template)
        rotation_time = params.get("rotation_time", default_rotation_time)

        super().__init__(
            **params | {
                "input_queue": input_queue,
                "subscriber": subscriber,
                "input_schema": Triple,
            }
        )

        self.writer = ParquetWriter(directory, file_template, rotation_time)

    def __del__(self):
        """Drop this processor's reference to the writer."""
        # __init__ can raise before the writer is assigned, so the attribute
        # may legitimately be missing when the object is finalised.
        if hasattr(self, "writer"):
            del self.writer

    def handle(self, msg):
        """Write the triple carried by ``msg`` via the writer.

        ``msg.value()`` is expected to expose ``s``, ``p`` and ``o``
        attributes, each with a ``value`` field.
        """
        v = msg.value()
        self.writer.write(v.s.value, v.p.value, v.o.value)

    @staticmethod
    def add_args(parser):
        """Register the command-line options for this processor on ``parser``.

        Adds the options defined by ``Consumer.add_args`` plus
        ``--directory``, ``--file-template`` and ``--rotation-time``.
        """
        Consumer.add_args(
            parser, default_input_queue, default_subscriber,
        )

        parser.add_argument(
            '-d', '--directory',
            default=default_directory,
            help=f'Directory to write to (default: {default_directory})'
        )

        # The help text here previously repeated the --directory description.
        parser.add_argument(
            '-f', '--file-template',
            default=default_file_template,
            help=f'File name template (default: {default_file_template})'
        )

        parser.add_argument(
            '-t', '--rotation-time',
            type=int,
            default=default_rotation_time,
            help=f'Rotation time / seconds (default: {default_rotation_time})'
        )
def run():
    """Entry point: start the processor under this module's identifier.

    The module docstring is passed along as the second argument to
    ``Processor.start``.
    """
    identifier, description = module, __doc__
    Processor.start(identifier, description)