feat: complete knowledge core storage — named graphs, provenance, source material (#973)

Implements all three changes from the knowledge-core-completeness tech spec:

1. Named graph field preserved through Cassandra storage (7-element tuple),
   enabling provenance triples to retain their graph URIs on round-trip.

2. Provenance triples already arrive on triples-input — no routing change
   needed; Change 1 was sufficient.

3. Source material (library documents) streamed alongside triples and
   embeddings during core download/upload. The knowledge manager fetches
   the document hierarchy from the librarian on download and recreates it
   on upload, preserving the full provenance chain across instances.
This commit is contained in:
cybermaggedon 2026-06-03 10:46:52 +01:00 committed by GitHub
parent aa158e1ba3
commit 6df7471a55
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
14 changed files with 1347 additions and 15 deletions

View file

@ -47,6 +47,31 @@ def write_ge(f, data):
)
f.write(msgpack.packb(msg, use_bin_type=True))
def write_library_metadata(f, data):
    """Append one library-metadata record to *f* in msgpack form.

    The record is written as a two-element message: the tag ``"lm"``
    followed by a dict that maps single-letter keys to fields of *data*.

    Args:
        f: Writable binary file-like object; only ``f.write`` is used.
        data: Mapping describing the document.  ``"id"`` is required
            (a missing key raises ``KeyError``); every other field is
            optional and falls back to an empty string, or an empty
            list for ``"tags"``.
    """
    # "id" is looked up first so a missing identifier fails before any
    # optional field is read.
    record = {"i": data["id"]}

    # Optional string fields, in the order they appear in the record.
    for short_key, field_name in (
        ("k", "kind"),
        ("t", "title"),
        ("p", "parent-id"),
        ("d", "document-type"),
        ("c", "comments"),
    ):
        record[short_key] = data.get(field_name, "")

    record["g"] = data.get("tags", [])

    packed = msgpack.packb(("lm", record), use_bin_type=True)
    f.write(packed)
def write_library_blob(f, data):
    """Append one library-blob record to *f* in msgpack form.

    The record is written as a two-element message: the tag ``"lb"``
    followed by a dict holding the identifier (``"i"``) and the blob
    content (``"d"``).

    Args:
        f: Writable binary file-like object; only ``f.write`` is used.
        data: Mapping with a required ``"id"`` entry (a missing key
            raises ``KeyError``) and an optional ``"data"`` entry that
            defaults to empty bytes.
    """
    blob_id = data["id"]
    content = data.get("data", b"")

    record = ("lb", {"i": blob_id, "d": content})
    f.write(msgpack.packb(record, use_bin_type=True))
def fetch(url, workspace, id, output, token=None):
api = Api(url=url, token=token, workspace=workspace)
@ -55,6 +80,8 @@ def fetch(url, workspace, id, output, token=None):
try:
ge = 0
t = 0
lm = 0
lb = 0
with open(output, "wb") as f:
@ -68,7 +95,15 @@ def fetch(url, workspace, id, output, token=None):
ge += 1
write_ge(f, response["graph-embeddings"])
print(f"Got: {t} triple, {ge} GE messages.")
if "library-metadata" in response:
lm += 1
write_library_metadata(f, response["library-metadata"])
if "library-blob" in response:
lb += 1
write_library_blob(f, response["library-blob"])
print(f"Got: {t} triple, {ge} GE, {lm} library metadata, {lb} library blob messages.")
finally:
socket.close()

View file

@ -40,6 +40,23 @@ def read_message(unpacked, id):
},
"triples": msg["t"],
}
elif unpacked[0] == "lm":
msg = unpacked[1]
return "lm", {
"id": msg["i"],
"kind": msg.get("k", ""),
"title": msg.get("t", ""),
"parent-id": msg.get("p", ""),
"document-type": msg.get("d", ""),
"comments": msg.get("c", ""),
"tags": msg.get("g", []),
}
elif unpacked[0] == "lb":
msg = unpacked[1]
return "lb", {
"id": msg["i"],
"data": msg.get("d", b""),
}
else:
raise RuntimeError("Unpacked unexpected messsage type", unpacked[0])
@ -51,6 +68,8 @@ def put(url, workspace, id, input, token=None):
try:
ge = 0
t = 0
lm = 0
lb = 0
with open(input, "rb") as f:
@ -73,10 +92,18 @@ def put(url, workspace, id, input, token=None):
t += 1
socket.put_kg_core(id, triples=msg)
elif kind == "lm":
lm += 1
socket.put_kg_core(id, library_metadata=msg)
elif kind == "lb":
lb += 1
socket.put_kg_core(id, library_blob=msg)
else:
raise RuntimeError("Unexpected message kind", kind)
print(f"Put: {t} triple, {ge} GE messages.")
print(f"Put: {t} triple, {ge} GE, {lm} library metadata, {lb} library blob messages.")
finally:
socket.close()