Feature/subpackages (#80)

* Renaming what will become the core package

* Tweaking to get  package build working

* Fix metering merge

* Rename to core directory

* Bump version.  Use namespace searching for packaging trustgraph-core

* Change references to trustgraph-core

* Forming embeddings-hf package

* Reference modules in core package.

* Build both packages to one container, bump version

* Update YAMLs
This commit is contained in:
cybermaggedon 2024-09-30 14:00:29 +01:00 committed by GitHub
parent 14d79ef9f1
commit f081933217
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
303 changed files with 681 additions and 624 deletions

View file

@ -0,0 +1,6 @@
#!/usr/bin/env python3
from trustgraph.core.chunking.recursive import run
run()

View file

@ -0,0 +1,6 @@
#!/usr/bin/env python3
from trustgraph.core.chunking.token import run
run()

View file

@ -0,0 +1,45 @@
#!/usr/bin/env python3
"""
Concatenates multiple parquet files into a single parquet output
"""
import pyarrow as pa
import pyarrow.parquet as pq
import pandas as pd
import sys
import argparse
parser = argparse.ArgumentParser(
prog="combine-parquet",
description=__doc__
)
parser.add_argument(
'-i', '--input',
nargs='*',
help=f'Input files'
)
parser.add_argument(
'-o', '--output',
help=f'Output files'
)
args = parser.parse_args()
df = None
for file in args.input:
part = pq.read_table(file).to_pandas()
if df is None:
df = part
else:
df = pd.concat([df, part], ignore_index=True)
if df is not None:
table = pa.Table.from_pandas(df)
pq.write_table(table, args.output)

View file

@ -0,0 +1,6 @@
#!/usr/bin/env python3
from trustgraph.core.query.doc_embeddings.milvus import run
run()

View file

@ -0,0 +1,6 @@
#!/usr/bin/env python3
from trustgraph.core.query.doc_embeddings.qdrant import run
run()

View file

@ -0,0 +1,6 @@
#!/usr/bin/env python3
from trustgraph.core.storage.doc_embeddings.milvus import run
run()

View file

@ -0,0 +1,6 @@
#!/usr/bin/env python3
from trustgraph.core.storage.doc_embeddings.qdrant import run
run()

View file

@ -0,0 +1,6 @@
#!/usr/bin/env python3
from trustgraph.core.retrieval.document_rag import run
run()

View file

@ -0,0 +1,24 @@
#!/usr/bin/env python3
import pyarrow as pa
import pyarrow.csv as pc
import pyarrow.parquet as pq
import pandas as pd
import sys
df = None
for file in sys.argv[1:]:
part = pq.read_table(file).to_pandas()
if df is None:
df = part
else:
df = pd.concat([df, part], ignore_index=True)
if df is not None:
table = pa.Table.from_pandas(df)
pc.write_csv(table, sys.stdout.buffer)

View file

@ -0,0 +1,6 @@
#!/usr/bin/env python3
from trustgraph.core.embeddings.ollama import run
run()

View file

@ -0,0 +1,6 @@
#!/usr/bin/env python3
from trustgraph.core.embeddings.vectorize import run
run()

View file

@ -0,0 +1,6 @@
#!/usr/bin/env python3
from trustgraph.core.dump.graph_embeddings.parquet import run
run()

View file

@ -0,0 +1,6 @@
#!/usr/bin/env python3
from trustgraph.core.query.graph_embeddings.milvus import run
run()

View file

@ -0,0 +1,6 @@
#!/usr/bin/env python3
from trustgraph.core.query.graph_embeddings.qdrant import run
run()

View file

@ -0,0 +1,6 @@
#!/usr/bin/env python3
from trustgraph.core.storage.graph_embeddings.milvus import run
run()

View file

@ -0,0 +1,6 @@
#!/usr/bin/env python3
from trustgraph.core.storage.graph_embeddings.qdrant import run
run()

View file

@ -0,0 +1,6 @@
#!/usr/bin/env python3
from trustgraph.core.retrieval.graph_rag import run
run()

View file

@ -0,0 +1,46 @@
#!/usr/bin/env python3
"""
Connects to the graph query service and dumps all graph edges.
"""
import argparse
import os
from trustgraph.core.clients.triples_query_client import TriplesQueryClient
default_pulsar_host = os.getenv("PULSAR_HOST", 'pulsar://localhost:6650')
def show_graph(pulsar):
tq = TriplesQueryClient(pulsar_host=pulsar)
rows = tq.request(None, None, None, limit=10_000_000)
for row in rows:
print(row.s.value, row.p.value, row.o.value)
def main():
parser = argparse.ArgumentParser(
prog='graph-show',
description=__doc__,
)
parser.add_argument(
'-p', '--pulsar-host',
default=default_pulsar_host,
help=f'Pulsar host (default: {default_pulsar_host})',
)
args = parser.parse_args()
try:
show_graph(args.pulsar_host)
except Exception as e:
print("Exception:", e, flush=True)
main()

View file

@ -0,0 +1,74 @@
#!/usr/bin/env python3
"""
Connects to the graph query service and dumps all graph edges.
"""
import argparse
import os
from trustgraph.core.clients.triples_query_client import TriplesQueryClient
import rdflib
import io
import sys
default_pulsar_host = os.getenv("PULSAR_HOST", 'pulsar://localhost:6650')
def show_graph(pulsar):
tq = TriplesQueryClient(pulsar_host=pulsar)
rows = tq.request(None, None, None, limit=10_000_000)
g = rdflib.Graph()
for row in rows:
sv = rdflib.term.URIRef(row.s.value)
pv = rdflib.term.URIRef(row.p.value)
if row.o.is_uri:
# Skip malformed URLs with spaces in
if " " in row.o.value:
continue
ov = rdflib.term.URIRef(row.o.value)
else:
ov = rdflib.term.Literal(row.o.value)
g.add((sv, pv, ov))
g.serialize(destination="output.ttl", format="turtle")
buf = io.BytesIO()
g.serialize(destination=buf, format="turtle")
sys.stdout.write(buf.getvalue().decode("utf-8"))
def main():
parser = argparse.ArgumentParser(
prog='graph-show',
description=__doc__,
)
parser.add_argument(
'-p', '--pulsar-host',
default=default_pulsar_host,
help=f'Pulsar host (default: {default_pulsar_host})',
)
args = parser.parse_args()
try:
show_graph(args.pulsar_host)
except Exception as e:
print("Exception:", e, flush=True)
main()

View file

@ -0,0 +1,11 @@
#!/usr/bin/env bash
CSRF_TOKEN=$(curl http://localhost:7750/pulsar-manager/csrf-token)
curl \
-H "X-XSRF-TOKEN: $CSRF_TOKEN" \
-H "Cookie: XSRF-TOKEN=$CSRF_TOKEN;" \
-H 'Content-Type: application/json' \
-X PUT \
http://localhost:7750/pulsar-manager/users/superuser \
-d '{"name": "admin", "password": "apachepulsar", "description": "test", "email": "username@test.org"}'

View file

@ -0,0 +1,6 @@
#!/usr/bin/env python3
from trustgraph.core.extract.kg.definitions import run
run()

View file

@ -0,0 +1,6 @@
#!/usr/bin/env python3
from trustgraph.core.extract.kg.relationships import run
run()

View file

@ -0,0 +1,6 @@
#!/usr/bin/env python3
from trustgraph.core.extract.kg.topics import run
run()

View file

@ -0,0 +1,145 @@
#!/usr/bin/env python3
"""
Loads Graph embeddings into TrustGraph processing.
"""
import pulsar
from pulsar.schema import JsonSchema
from trustgraph.core.schema import GraphEmbeddings, Value
from trustgraph.core.schema import graph_embeddings_store_queue
import argparse
import os
import time
import pyarrow as pa
import pyarrow.parquet as pq
from trustgraph.core.log_level import LogLevel
class Loader:
def __init__(
self,
pulsar_host,
output_queue,
log_level,
file,
):
self.client = pulsar.Client(
pulsar_host,
logger=pulsar.ConsoleLogger(log_level.to_pulsar())
)
self.producer = self.client.create_producer(
topic=output_queue,
schema=JsonSchema(GraphEmbeddings),
chunking_enabled=True,
)
self.file = file
def run(self):
try:
path = self.file
print("Reading file...")
table = pq.read_table(path)
print("Loaded.")
names = set(table.column_names)
if "embeddings" not in names:
print("No 'embeddings' column")
if "entity" not in names:
print("No 'entity' column")
embc = table.column("embeddings")
entc = table.column("entity")
for emb, ent in zip(embc, entc):
b = emb.as_py()
n = ent.as_py()
r = GraphEmbeddings(
vectors=b,
entity=Value(
value=n,
is_uri=n.startswith("https:")
)
)
self.producer.send(r)
except Exception as e:
print(e, flush=True)
def __del__(self):
self.client.close()
def main():
parser = argparse.ArgumentParser(
prog='loader',
description=__doc__,
)
default_pulsar_host = os.getenv("PULSAR_HOST", 'pulsar://localhost:6650')
default_output_queue = graph_embeddings_store_queue
parser.add_argument(
'-p', '--pulsar-host',
default=default_pulsar_host,
help=f'Pulsar host (default: {default_pulsar_host})',
)
parser.add_argument(
'-o', '--output-queue',
default=default_output_queue,
help=f'Output queue (default: {default_output_queue})'
)
parser.add_argument(
'-l', '--log-level',
type=LogLevel,
default=LogLevel.ERROR,
choices=list(LogLevel),
help=f'Output queue (default: info)'
)
parser.add_argument(
'-f', '--file',
required=True,
help=f'File to load'
)
args = parser.parse_args()
while True:
try:
p = Loader(
pulsar_host=args.pulsar_host,
output_queue=args.output_queue,
log_level=args.log_level,
file=args.file,
)
p.run()
print("File loaded.")
break
except Exception as e:
print("Exception:", e, flush=True)
print("Will retry...", flush=True)
time.sleep(10)
main()

128
trustgraph-core/scripts/load-pdf Executable file
View file

@ -0,0 +1,128 @@
#!/usr/bin/env python3
"""
Loads a PDF document into TrustGraph processing.
"""
import pulsar
from pulsar.schema import JsonSchema
from trustgraph.core.schema import Document, Source, document_ingest_queue
import base64
import hashlib
import argparse
import os
import time
from trustgraph.core.log_level import LogLevel
class Loader:
def __init__(
self,
pulsar_host,
output_queue,
log_level,
file,
):
self.client = pulsar.Client(
pulsar_host,
logger=pulsar.ConsoleLogger(log_level.to_pulsar())
)
self.producer = self.client.create_producer(
topic=output_queue,
schema=JsonSchema(Document),
chunking_enabled=True,
)
self.file = file
def run(self):
try:
path = self.file
data = open(path, "rb").read()
id = hashlib.sha256(path.encode("utf-8")).hexdigest()[0:8]
r = Document(
source=Source(
source=path,
title=path,
id=id,
),
data=base64.b64encode(data),
)
self.producer.send(r)
except Exception as e:
print(e, flush=True)
def __del__(self):
self.client.close()
def main():
parser = argparse.ArgumentParser(
prog='loader',
description=__doc__,
)
default_pulsar_host = os.getenv("PULSAR_HOST", 'pulsar://localhost:6650')
default_output_queue = document_ingest_queue
parser.add_argument(
'-p', '--pulsar-host',
default=default_pulsar_host,
help=f'Pulsar host (default: {default_pulsar_host})',
)
parser.add_argument(
'-o', '--output-queue',
default=default_output_queue,
help=f'Output queue (default: {default_output_queue})'
)
parser.add_argument(
'-l', '--log-level',
type=LogLevel,
default=LogLevel.ERROR,
choices=list(LogLevel),
help=f'Output queue (default: info)'
)
parser.add_argument(
'-f', '--file',
required=True,
help=f'File to load'
)
args = parser.parse_args()
while True:
try:
p = Loader(
pulsar_host=args.pulsar_host,
output_queue=args.output_queue,
log_level=args.log_level,
file=args.file,
)
p.run()
print("File loaded.")
break
except Exception as e:
print("Exception:", e, flush=True)
print("Will retry...", flush=True)
time.sleep(10)
main()

128
trustgraph-core/scripts/load-text Executable file
View file

@ -0,0 +1,128 @@
#!/usr/bin/env python3
"""
Loads a text document into TrustGraph processing.
"""
import pulsar
from pulsar.schema import JsonSchema
from trustgraph.core.schema import TextDocument, Source, text_ingest_queue
import base64
import hashlib
import argparse
import os
import time
from trustgraph.core.log_level import LogLevel
class Loader:
def __init__(
self,
pulsar_host,
output_queue,
log_level,
file,
):
self.client = pulsar.Client(
pulsar_host,
logger=pulsar.ConsoleLogger(log_level.to_pulsar())
)
self.producer = self.client.create_producer(
topic=output_queue,
schema=JsonSchema(TextDocument),
chunking_enabled=True,
)
self.file = file
def run(self):
try:
path = self.file
data = open(path, "rb").read()
id = hashlib.sha256(path.encode("utf-8")).hexdigest()[0:8]
r = TextDocument(
source=Source(
source=path,
title=path,
id=id,
),
text=data,
)
self.producer.send(r)
except Exception as e:
print(e, flush=True)
def __del__(self):
self.client.close()
def main():
parser = argparse.ArgumentParser(
prog='loader',
description=__doc__,
)
default_pulsar_host = os.getenv("PULSAR_HOST", 'pulsar://localhost:6650')
default_output_queue = text_ingest_queue
parser.add_argument(
'-p', '--pulsar-host',
default=default_pulsar_host,
help=f'Pulsar host (default: {default_pulsar_host})',
)
parser.add_argument(
'-o', '--output-queue',
default=default_output_queue,
help=f'Output queue (default: {default_output_queue})'
)
parser.add_argument(
'-l', '--log-level',
type=LogLevel,
default=LogLevel.ERROR,
choices=list(LogLevel),
help=f'Output queue (default: info)'
)
parser.add_argument(
'-f', '--file',
required=True,
help=f'File to load'
)
args = parser.parse_args()
while True:
try:
p = Loader(
pulsar_host=args.pulsar_host,
output_queue=args.output_queue,
log_level=args.log_level,
file=args.file,
)
p.run()
print("File loaded.")
break
except Exception as e:
print("Exception:", e, flush=True)
print("Will retry...", flush=True)
time.sleep(10)
main()

View file

@ -0,0 +1,144 @@
#!/usr/bin/env python3
"""
Loads Graph embeddings into TrustGraph processing.
"""
import pulsar
from pulsar.schema import JsonSchema
from trustgraph.core.schema import Triple, Value
from trustgraph.core.schema import triples_store_queue
import argparse
import os
import time
import pyarrow as pa
import pyarrow.parquet as pq
from trustgraph.core.log_level import LogLevel
class Loader:
def __init__(
self,
pulsar_host,
output_queue,
log_level,
file,
):
self.client = pulsar.Client(
pulsar_host,
logger=pulsar.ConsoleLogger(log_level.to_pulsar())
)
self.producer = self.client.create_producer(
topic=output_queue,
schema=JsonSchema(Triple),
chunking_enabled=True,
)
self.file = file
def run(self):
try:
path = self.file
print("Reading file...")
table = pq.read_table(path)
print("Loaded.")
names = set(table.column_names)
if "s" not in names:
print("No 's' column")
if "p" not in names:
print("No 'p' column")
if "o" not in names:
print("No 'o' column")
sc = table.column("s")
pc = table.column("p")
oc = table.column("o")
for s, p, o in zip(sc, pc, oc):
r = Triple(
s=Value(value=s.as_py(), is_uri=True),
p=Value(value=p.as_py(), is_uri=True),
o=Value(value=o.as_py(), is_uri=o.as_py().startswith("https:"))
)
self.producer.send(r)
except Exception as e:
print(e, flush=True)
def __del__(self):
self.client.close()
def main():
parser = argparse.ArgumentParser(
prog='loader',
description=__doc__,
)
default_pulsar_host = os.getenv("PULSAR_HOST", 'pulsar://localhost:6650')
default_output_queue = triples_store_queue
parser.add_argument(
'-p', '--pulsar-host',
default=default_pulsar_host,
help=f'Pulsar host (default: {default_pulsar_host})',
)
parser.add_argument(
'-o', '--output-queue',
default=default_output_queue,
help=f'Output queue (default: {default_output_queue})'
)
parser.add_argument(
'-l', '--log-level',
type=LogLevel,
default=LogLevel.ERROR,
choices=list(LogLevel),
help=f'Output queue (default: info)'
)
parser.add_argument(
'-f', '--file',
required=True,
help=f'File to load'
)
args = parser.parse_args()
while True:
try:
p = Loader(
pulsar_host=args.pulsar_host,
output_queue=args.output_queue,
log_level=args.log_level,
file=args.file,
)
p.run()
print("File loaded.")
break
except Exception as e:
print("Exception:", e, flush=True)
print("Will retry...", flush=True)
time.sleep(10)
main()

View file

@ -0,0 +1,5 @@
#!/usr/bin/env python3
from trustgraph.core.metering import run
run()

View file

@ -0,0 +1,6 @@
#!/usr/bin/env python3
from trustgraph.core.extract.object.row import run
run()

View file

@ -0,0 +1,6 @@
#!/usr/bin/env python3
from trustgraph.core.storage.object_embeddings.milvus import run
run()

View file

@ -0,0 +1,6 @@
#!/usr/bin/env python3
from trustgraph.core.decoding.pdf import run
run()

View file

@ -0,0 +1,6 @@
#!/usr/bin/env python3
from trustgraph.core.model.prompt.generic import run
run()

View file

@ -0,0 +1,6 @@
#!/usr/bin/env python3
from trustgraph.core.model.prompt.template import run
run()

View file

@ -0,0 +1,49 @@
#!/usr/bin/env python3
"""
Uses the Document RAG service to answer a query
"""
import argparse
import os
from trustgraph.core.clients.document_rag_client import DocumentRagClient
default_pulsar_host = os.getenv("PULSAR_HOST", 'pulsar://localhost:6650')
def query(pulsar, query):
rag = DocumentRagClient(pulsar_host=pulsar)
resp = rag.request(query)
print(resp)
def main():
parser = argparse.ArgumentParser(
prog='graph-show',
description=__doc__,
)
parser.add_argument(
'-p', '--pulsar-host',
default=default_pulsar_host,
help=f'Pulsar host (default: {default_pulsar_host})',
)
parser.add_argument(
'-q', '--query',
required=True,
help=f'Query to execute',
)
args = parser.parse_args()
try:
query(args.pulsar_host, args.query)
except Exception as e:
print("Exception:", e, flush=True)
main()

View file

@ -0,0 +1,49 @@
#!/usr/bin/env python3
"""
Uses the GraphRAG service to answer a query
"""
import argparse
import os
from trustgraph.core.clients.graph_rag_client import GraphRagClient
default_pulsar_host = os.getenv("PULSAR_HOST", 'pulsar://localhost:6650')
def query(pulsar, query):
rag = GraphRagClient(pulsar_host=pulsar)
resp = rag.request(query)
print(resp)
def main():
parser = argparse.ArgumentParser(
prog='graph-show',
description=__doc__,
)
parser.add_argument(
'-p', '--pulsar-host',
default=default_pulsar_host,
help=f'Pulsar host (default: {default_pulsar_host})',
)
parser.add_argument(
'-q', '--query',
required=True,
help=f'Query to execute',
)
args = parser.parse_args()
try:
query(args.pulsar_host, args.query)
except Exception as e:
print("Exception:", e, flush=True)
main()

View file

@ -0,0 +1,6 @@
#!/usr/bin/env python3
from trustgraph.core.storage.rows.cassandra import run
run()

View file

@ -0,0 +1,6 @@
#!/usr/bin/env python3
from trustgraph.core.processing import run
run()

View file

@ -0,0 +1,6 @@
#!/usr/bin/env python3
from trustgraph.core.model.text_completion.azure import run
run()

View file

@ -0,0 +1,6 @@
#!/usr/bin/env python3
from trustgraph.core.model.text_completion.bedrock import run
run()

View file

@ -0,0 +1,6 @@
#!/usr/bin/env python3
from trustgraph.core.model.text_completion.claude import run
run()

View file

@ -0,0 +1,6 @@
#!/usr/bin/env python3
from trustgraph.core.model.text_completion.cohere import run
run()

View file

@ -0,0 +1,6 @@
#!/usr/bin/env python3
from trustgraph.core.model.text_completion.llamafile import run
run()

View file

@ -0,0 +1,6 @@
#!/usr/bin/env python3
from trustgraph.core.model.text_completion.ollama import run
run()

View file

@ -0,0 +1,6 @@
#!/usr/bin/env python3
from trustgraph.core.model.text_completion.openai import run
run()

View file

@ -0,0 +1,6 @@
#!/usr/bin/env python3
from trustgraph.core.model.text_completion.vertexai import run
run()

View file

@ -0,0 +1,119 @@
#!/usr/bin/env python3
"""
Initialises Pulsar with Trustgraph tenant / namespaces & policy
"""
import requests
import time
import argparse
default_pulsar_admin_url = "http://pulsar:8080"
def get_clusters(url):
print("Get clusters...", flush=True)
resp = requests.get(f"{url}/admin/v2/clusters")
if resp.status_code != 200: raise RuntimeError("Could not fetch clusters")
return resp.json()
def ensure_tenant(url, tenant, clusters):
resp = requests.get(f"{url}/admin/v2/tenants/{tenant}")
if resp.status_code == 200:
print(f"Tenant {tenant} already exists.", flush=True)
return
resp = requests.put(
f"{url}/admin/v2/tenants/{tenant}",
json={
"adminRoles": [],
"allowedClusters": clusters,
}
)
if resp.status_code != 204:
print(resp.text, flush=True)
raise RuntimeError("Tenant creation failed.")
print(f"Tenant {tenant} created.", flush=True)
def ensure_namespace(url, tenant, namespace, config):
resp = requests.get(f"{url}/admin/v2/namespaces/{tenant}/{namespace}")
if resp.status_code == 200:
print(f"Namespace {tenant}/{namespace} already exists.", flush=True)
return
resp = requests.put(
f"{url}/admin/v2/namespaces/{tenant}/{namespace}",
json=config,
)
if resp.status_code != 204:
print(resp.status_code, flush=True)
print(resp.text, flush=True)
raise RuntimeError(f"Namespace {tenant}/{namespace} creation failed.")
print(f"Namespace {tenant}/{namespace} created.", flush=True)
def init(url, tenant="tg"):
clusters = get_clusters(url)
ensure_tenant(url, tenant, clusters)
ensure_namespace(url, tenant, "flow", {})
ensure_namespace(url, tenant, "request", {})
ensure_namespace(url, tenant, "response", {
"retention_policies": {
"retentionSizeInMB": -1,
"retentionTimeInMinutes": 3,
}
})
def main():
parser = argparse.ArgumentParser(
prog='tg-init-pulsar',
description=__doc__,
)
parser.add_argument(
'-p', '--pulsar-admin-url',
default=default_pulsar_admin_url,
help=f'Pulsar admin URL (default: {default_pulsar_admin_url})',
)
args = parser.parse_args()
while True:
try:
print(flush=True)
print(
f"Initialising with Pulsar {args.pulsar_admin_url}...",
flush=True
)
init(args.pulsar_admin_url, "tg")
print("Initialisation complete.", flush=True)
break
except Exception as e:
print("Exception:", e, flush=True)
print("Sleeping...", flush=True)
time.sleep(2)
print("Will retry...", flush=True)
main()

View file

@ -0,0 +1,24 @@
#!/usr/bin/env python3
import requests
import tabulate
url = 'http://localhost:9090/api/v1/query?query=processor_state%7Bprocessor_state%3D%22running%22%7D'
resp = requests.get(url)
obj = resp.json()
tbl = [
[
m["metric"]["job"],
"running" if int(m["value"][1]) > 0 else "down"
]
for m in obj["data"]["result"]
]
print(tabulate.tabulate(
tbl, tablefmt="pretty", headers=["processor", "state"],
stralign="left"
))

View file

@ -0,0 +1,6 @@
#!/usr/bin/env python3
from trustgraph.core.dump.triples.parquet import run
run()

View file

@ -0,0 +1,6 @@
#!/usr/bin/env python3
from trustgraph.core.query.triples.cassandra import run
run()

View file

@ -0,0 +1,6 @@
#!/usr/bin/env python3
from trustgraph.core.query.triples.neo4j import run
run()

View file

@ -0,0 +1,6 @@
#!/usr/bin/env python3
from trustgraph.core.storage.triples.cassandra import run
run()

View file

@ -0,0 +1,6 @@
#!/usr/bin/env python3
from trustgraph.core.storage.triples.neo4j import run
run()