The id field in pipeline Metadata was being overwritten at each processing stage (#686)

The id field in pipeline Metadata was being overwritten at each processing
stage (document → page → chunk). As a result, knowledge storage created a
separate core for every chunk instead of grouping chunks by document.

Add a root field that:
- Is set by librarian to the original document ID
- Is copied unchanged through PDF decoder, chunkers, and extractors
- Is used by knowledge storage for document_id grouping (with fallback to id)

Changes:
- Add root field to Metadata schema with empty string default
- Set root=document.id in librarian when initiating document processing
- Copy root through PDF decoder, recursive chunker, and all extractors
- Update knowledge storage to use root (or id as fallback) for grouping
- Add root handling to translators and gateway serialization
- Update test mock Metadata class to include root parameter
This commit is contained in:
cybermaggedon 2026-03-11 12:16:39 +00:00 committed by GitHub
parent aa4f5c6c00
commit 286f762369
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
15 changed files with 48 additions and 4 deletions

View file

@ -29,8 +29,9 @@ class Triple:
self.o = o self.o = o
class Metadata: class Metadata:
def __init__(self, id, user, collection): def __init__(self, id, user, collection, root=""):
self.id = id self.id = id
self.root = root
self.user = user self.user = user
self.collection = collection self.collection = collection

View file

@ -15,6 +15,7 @@ class DocumentTranslator(SendTranslator):
return Document( return Document(
metadata=Metadata( metadata=Metadata(
id=data.get("id"), id=data.get("id"),
root=data.get("root", ""),
user=data.get("user", "trustgraph"), user=data.get("user", "trustgraph"),
collection=data.get("collection", "default"), collection=data.get("collection", "default"),
), ),
@ -30,6 +31,8 @@ class DocumentTranslator(SendTranslator):
metadata_dict = {} metadata_dict = {}
if obj.metadata.id: if obj.metadata.id:
metadata_dict["id"] = obj.metadata.id metadata_dict["id"] = obj.metadata.id
if obj.metadata.root:
metadata_dict["root"] = obj.metadata.root
if obj.metadata.user: if obj.metadata.user:
metadata_dict["user"] = obj.metadata.user metadata_dict["user"] = obj.metadata.user
if obj.metadata.collection: if obj.metadata.collection:
@ -53,6 +56,7 @@ class TextDocumentTranslator(SendTranslator):
return TextDocument( return TextDocument(
metadata=Metadata( metadata=Metadata(
id=data.get("id"), id=data.get("id"),
root=data.get("root", ""),
user=data.get("user", "trustgraph"), user=data.get("user", "trustgraph"),
collection=data.get("collection", "default"), collection=data.get("collection", "default"),
), ),
@ -68,6 +72,8 @@ class TextDocumentTranslator(SendTranslator):
metadata_dict = {} metadata_dict = {}
if obj.metadata.id: if obj.metadata.id:
metadata_dict["id"] = obj.metadata.id metadata_dict["id"] = obj.metadata.id
if obj.metadata.root:
metadata_dict["root"] = obj.metadata.root
if obj.metadata.user: if obj.metadata.user:
metadata_dict["user"] = obj.metadata.user metadata_dict["user"] = obj.metadata.user
if obj.metadata.collection: if obj.metadata.collection:
@ -86,6 +92,7 @@ class ChunkTranslator(SendTranslator):
return Chunk( return Chunk(
metadata=Metadata( metadata=Metadata(
id=data.get("id"), id=data.get("id"),
root=data.get("root", ""),
user=data.get("user", "trustgraph"), user=data.get("user", "trustgraph"),
collection=data.get("collection", "default"), collection=data.get("collection", "default"),
), ),
@ -101,6 +108,8 @@ class ChunkTranslator(SendTranslator):
metadata_dict = {} metadata_dict = {}
if obj.metadata.id: if obj.metadata.id:
metadata_dict["id"] = obj.metadata.id metadata_dict["id"] = obj.metadata.id
if obj.metadata.root:
metadata_dict["root"] = obj.metadata.root
if obj.metadata.user: if obj.metadata.user:
metadata_dict["user"] = obj.metadata.user metadata_dict["user"] = obj.metadata.user
if obj.metadata.collection: if obj.metadata.collection:
@ -129,6 +138,7 @@ class DocumentEmbeddingsTranslator(SendTranslator):
return DocumentEmbeddings( return DocumentEmbeddings(
metadata=Metadata( metadata=Metadata(
id=metadata.get("id"), id=metadata.get("id"),
root=metadata.get("root", ""),
user=metadata.get("user", "trustgraph"), user=metadata.get("user", "trustgraph"),
collection=metadata.get("collection", "default"), collection=metadata.get("collection", "default"),
), ),
@ -150,6 +160,8 @@ class DocumentEmbeddingsTranslator(SendTranslator):
metadata_dict = {} metadata_dict = {}
if obj.metadata.id: if obj.metadata.id:
metadata_dict["id"] = obj.metadata.id metadata_dict["id"] = obj.metadata.id
if obj.metadata.root:
metadata_dict["root"] = obj.metadata.root
if obj.metadata.user: if obj.metadata.user:
metadata_dict["user"] = obj.metadata.user metadata_dict["user"] = obj.metadata.user
if obj.metadata.collection: if obj.metadata.collection:

View file

@ -20,6 +20,7 @@ class KnowledgeRequestTranslator(MessageTranslator):
triples = Triples( triples = Triples(
metadata=Metadata( metadata=Metadata(
id=data["triples"]["metadata"]["id"], id=data["triples"]["metadata"]["id"],
root=data["triples"]["metadata"].get("root", ""),
user=data["triples"]["metadata"]["user"], user=data["triples"]["metadata"]["user"],
collection=data["triples"]["metadata"]["collection"] collection=data["triples"]["metadata"]["collection"]
), ),
@ -31,6 +32,7 @@ class KnowledgeRequestTranslator(MessageTranslator):
graph_embeddings = GraphEmbeddings( graph_embeddings = GraphEmbeddings(
metadata=Metadata( metadata=Metadata(
id=data["graph-embeddings"]["metadata"]["id"], id=data["graph-embeddings"]["metadata"]["id"],
root=data["graph-embeddings"]["metadata"].get("root", ""),
user=data["graph-embeddings"]["metadata"]["user"], user=data["graph-embeddings"]["metadata"]["user"],
collection=data["graph-embeddings"]["metadata"]["collection"] collection=data["graph-embeddings"]["metadata"]["collection"]
), ),
@ -71,6 +73,7 @@ class KnowledgeRequestTranslator(MessageTranslator):
result["triples"] = { result["triples"] = {
"metadata": { "metadata": {
"id": obj.triples.metadata.id, "id": obj.triples.metadata.id,
"root": obj.triples.metadata.root,
"user": obj.triples.metadata.user, "user": obj.triples.metadata.user,
"collection": obj.triples.metadata.collection, "collection": obj.triples.metadata.collection,
}, },
@ -81,6 +84,7 @@ class KnowledgeRequestTranslator(MessageTranslator):
result["graph-embeddings"] = { result["graph-embeddings"] = {
"metadata": { "metadata": {
"id": obj.graph_embeddings.metadata.id, "id": obj.graph_embeddings.metadata.id,
"root": obj.graph_embeddings.metadata.root,
"user": obj.graph_embeddings.metadata.user, "user": obj.graph_embeddings.metadata.user,
"collection": obj.graph_embeddings.metadata.collection, "collection": obj.graph_embeddings.metadata.collection,
}, },
@ -117,6 +121,7 @@ class KnowledgeResponseTranslator(MessageTranslator):
"triples": { "triples": {
"metadata": { "metadata": {
"id": obj.triples.metadata.id, "id": obj.triples.metadata.id,
"root": obj.triples.metadata.root,
"user": obj.triples.metadata.user, "user": obj.triples.metadata.user,
"collection": obj.triples.metadata.collection, "collection": obj.triples.metadata.collection,
}, },
@ -130,6 +135,7 @@ class KnowledgeResponseTranslator(MessageTranslator):
"graph-embeddings": { "graph-embeddings": {
"metadata": { "metadata": {
"id": obj.graph_embeddings.metadata.id, "id": obj.graph_embeddings.metadata.id,
"root": obj.graph_embeddings.metadata.root,
"user": obj.graph_embeddings.metadata.user, "user": obj.graph_embeddings.metadata.user,
"collection": obj.graph_embeddings.metadata.collection, "collection": obj.graph_embeddings.metadata.collection,
}, },

View file

@ -5,6 +5,9 @@ class Metadata:
# Source identifier # Source identifier
id: str = "" id: str = ""
# Root document identifier (set by librarian, preserved through pipeline)
root: str = ""
# Collection management # Collection management
user: str = "" user: str = ""
collection: str = "" collection: str = ""

View file

@ -178,6 +178,7 @@ class Processor(ChunkingService):
await flow("triples").send(Triples( await flow("triples").send(Triples(
metadata=Metadata( metadata=Metadata(
id=chunk_uri, id=chunk_uri,
root=v.metadata.root,
user=v.metadata.user, user=v.metadata.user,
collection=v.metadata.collection, collection=v.metadata.collection,
), ),
@ -188,6 +189,7 @@ class Processor(ChunkingService):
r = Chunk( r = Chunk(
metadata=Metadata( metadata=Metadata(
id=chunk_uri, id=chunk_uri,
root=v.metadata.root,
user=v.metadata.user, user=v.metadata.user,
collection=v.metadata.collection, collection=v.metadata.collection,
), ),

View file

@ -302,6 +302,7 @@ class Processor(FlowProcessor):
await flow("triples").send(Triples( await flow("triples").send(Triples(
metadata=Metadata( metadata=Metadata(
id=pg_uri, id=pg_uri,
root=v.metadata.root,
user=v.metadata.user, user=v.metadata.user,
collection=v.metadata.collection, collection=v.metadata.collection,
), ),
@ -313,6 +314,7 @@ class Processor(FlowProcessor):
r = TextDocument( r = TextDocument(
metadata=Metadata( metadata=Metadata(
id=pg_uri, id=pg_uri,
root=v.metadata.root,
user=v.metadata.user, user=v.metadata.user,
collection=v.metadata.collection, collection=v.metadata.collection,
), ),

View file

@ -104,6 +104,7 @@ class Processor(FlowProcessor):
tpls = Triples( tpls = Triples(
metadata = Metadata( metadata = Metadata(
id = metadata.id, id = metadata.id,
root = metadata.root,
user = metadata.user, user = metadata.user,
collection = metadata.collection, collection = metadata.collection,
), ),
@ -116,6 +117,7 @@ class Processor(FlowProcessor):
ecs = EntityContexts( ecs = EntityContexts(
metadata = Metadata( metadata = Metadata(
id = metadata.id, id = metadata.id,
root = metadata.root,
user = metadata.user, user = metadata.user,
collection = metadata.collection, collection = metadata.collection,
), ),

View file

@ -218,6 +218,7 @@ class Processor(FlowProcessor):
flow("triples"), flow("triples"),
Metadata( Metadata(
id=v.metadata.id, id=v.metadata.id,
root=v.metadata.root,
user=v.metadata.user, user=v.metadata.user,
collection=v.metadata.collection, collection=v.metadata.collection,
), ),
@ -231,6 +232,7 @@ class Processor(FlowProcessor):
flow("entity-contexts"), flow("entity-contexts"),
Metadata( Metadata(
id=v.metadata.id, id=v.metadata.id,
root=v.metadata.root,
user=v.metadata.user, user=v.metadata.user,
collection=v.metadata.collection, collection=v.metadata.collection,
), ),

View file

@ -554,6 +554,7 @@ class Processor(FlowProcessor):
t = Triples( t = Triples(
metadata=Metadata( metadata=Metadata(
id=metadata.id, id=metadata.id,
root=metadata.root,
user=metadata.user, user=metadata.user,
collection=metadata.collection, collection=metadata.collection,
), ),
@ -566,6 +567,7 @@ class Processor(FlowProcessor):
ec = EntityContexts( ec = EntityContexts(
metadata=Metadata( metadata=Metadata(
id=metadata.id, id=metadata.id,
root=metadata.root,
user=metadata.user, user=metadata.user,
collection=metadata.collection, collection=metadata.collection,
), ),

View file

@ -219,6 +219,7 @@ class Processor(FlowProcessor):
flow("triples"), flow("triples"),
Metadata( Metadata(
id=v.metadata.id, id=v.metadata.id,
root=v.metadata.root,
user=v.metadata.user, user=v.metadata.user,
collection=v.metadata.collection, collection=v.metadata.collection,
), ),

View file

@ -272,6 +272,7 @@ class Processor(FlowProcessor):
extracted = ExtractedObject( extracted = ExtractedObject(
metadata=Metadata( metadata=Metadata(
id=f"{v.metadata.id}:{schema_name}", id=f"{v.metadata.id}:{schema_name}",
root=v.metadata.root,
user=v.metadata.user, user=v.metadata.user,
collection=v.metadata.collection, collection=v.metadata.collection,
), ),

View file

@ -37,6 +37,7 @@ def serialize_triples(message):
return { return {
"metadata": { "metadata": {
"id": message.metadata.id, "id": message.metadata.id,
"root": message.metadata.root,
"user": message.metadata.user, "user": message.metadata.user,
"collection": message.metadata.collection, "collection": message.metadata.collection,
}, },
@ -48,6 +49,7 @@ def serialize_graph_embeddings(message):
return { return {
"metadata": { "metadata": {
"id": message.metadata.id, "id": message.metadata.id,
"root": message.metadata.root,
"user": message.metadata.user, "user": message.metadata.user,
"collection": message.metadata.collection, "collection": message.metadata.collection,
}, },
@ -65,6 +67,7 @@ def serialize_entity_contexts(message):
return { return {
"metadata": { "metadata": {
"id": message.metadata.id, "id": message.metadata.id,
"root": message.metadata.root,
"user": message.metadata.user, "user": message.metadata.user,
"collection": message.metadata.collection, "collection": message.metadata.collection,
}, },
@ -82,6 +85,7 @@ def serialize_document_embeddings(message):
return { return {
"metadata": { "metadata": {
"id": message.metadata.id, "id": message.metadata.id,
"root": message.metadata.root,
"user": message.metadata.user, "user": message.metadata.user,
"collection": message.metadata.collection, "collection": message.metadata.collection,
}, },

View file

@ -48,6 +48,7 @@ class TriplesImport:
elt = Triples( elt = Triples(
metadata=Metadata( metadata=Metadata(
id=data["metadata"]["id"], id=data["metadata"]["id"],
root=data["metadata"].get("root", ""),
user=data["metadata"]["user"], user=data["metadata"]["user"],
collection=data["metadata"]["collection"], collection=data["metadata"]["collection"],
), ),

View file

@ -334,6 +334,7 @@ class Processor(AsyncProcessor):
triples_msg = Triples( triples_msg = Triples(
metadata=Metadata( metadata=Metadata(
id=doc_uri, id=doc_uri,
root=document.id,
user=processing.user, user=processing.user,
collection=processing.collection, collection=processing.collection,
), ),
@ -380,6 +381,7 @@ class Processor(AsyncProcessor):
doc = TextDocument( doc = TextDocument(
metadata = Metadata( metadata = Metadata(
id = document.id, id = document.id,
root = document.id,
user = processing.user, user = processing.user,
collection = processing.collection collection = processing.collection
), ),
@ -390,6 +392,7 @@ class Processor(AsyncProcessor):
doc = TextDocument( doc = TextDocument(
metadata = Metadata( metadata = Metadata(
id = document.id, id = document.id,
root = document.id,
user = processing.user, user = processing.user,
collection = processing.collection collection = processing.collection
), ),
@ -405,6 +408,7 @@ class Processor(AsyncProcessor):
doc = Document( doc = Document(
metadata = Metadata( metadata = Metadata(
id = document.id, id = document.id,
root = document.id,
user = processing.user, user = processing.user,
collection = processing.collection collection = processing.collection
), ),
@ -415,6 +419,7 @@ class Processor(AsyncProcessor):
doc = Document( doc = Document(
metadata = Metadata( metadata = Metadata(
id = document.id, id = document.id,
root = document.id,
user = processing.user, user = processing.user,
collection = processing.collection collection = processing.collection
), ),

View file

@ -233,7 +233,7 @@ class KnowledgeTableStore:
self.insert_triples_stmt, self.insert_triples_stmt,
( (
uuid.uuid4(), m.metadata.user, uuid.uuid4(), m.metadata.user,
m.metadata.id, when, m.metadata.root or m.metadata.id, when,
[], triples, [], triples,
) )
) )
@ -265,7 +265,7 @@ class KnowledgeTableStore:
self.insert_graph_embeddings_stmt, self.insert_graph_embeddings_stmt,
( (
uuid.uuid4(), m.metadata.user, uuid.uuid4(), m.metadata.user,
m.metadata.id, when, m.metadata.root or m.metadata.id, when,
[], entities, [], entities,
) )
) )
@ -297,7 +297,7 @@ class KnowledgeTableStore:
self.insert_document_embeddings_stmt, self.insert_document_embeddings_stmt,
( (
uuid.uuid4(), m.metadata.user, uuid.uuid4(), m.metadata.user,
m.metadata.id, when, m.metadata.root or m.metadata.id, when,
[], chunks, [], chunks,
) )
) )