mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-05-18 03:45:12 +02:00
The id field in pipeline Metadata was being overwritten at each processing stage (#686)
The id field in pipeline Metadata was being overwritten at each processing stage (document → page → chunk), causing knowledge storage to create separate cores per chunk instead of grouping by document.

Add a root field that:
- Is set by librarian to the original document ID
- Is copied unchanged through PDF decoder, chunkers, and extractors
- Is used by knowledge storage for document_id grouping (with fallback to id)

Changes:
- Add root field to Metadata schema with empty string default
- Set root=document.id in librarian when initiating document processing
- Copy root through PDF decoder, recursive chunker, and all extractors
- Update knowledge storage to use root (or id as fallback) for grouping
- Add root handling to translators and gateway serialization
- Update test mock Metadata class to include root parameter
This commit is contained in:
parent
aa4f5c6c00
commit
286f762369
15 changed files with 48 additions and 4 deletions
|
|
@ -29,8 +29,9 @@ class Triple:
|
||||||
self.o = o
|
self.o = o
|
||||||
|
|
||||||
class Metadata:
|
class Metadata:
|
||||||
def __init__(self, id, user, collection):
|
def __init__(self, id, user, collection, root=""):
|
||||||
self.id = id
|
self.id = id
|
||||||
|
self.root = root
|
||||||
self.user = user
|
self.user = user
|
||||||
self.collection = collection
|
self.collection = collection
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -15,6 +15,7 @@ class DocumentTranslator(SendTranslator):
|
||||||
return Document(
|
return Document(
|
||||||
metadata=Metadata(
|
metadata=Metadata(
|
||||||
id=data.get("id"),
|
id=data.get("id"),
|
||||||
|
root=data.get("root", ""),
|
||||||
user=data.get("user", "trustgraph"),
|
user=data.get("user", "trustgraph"),
|
||||||
collection=data.get("collection", "default"),
|
collection=data.get("collection", "default"),
|
||||||
),
|
),
|
||||||
|
|
@ -30,6 +31,8 @@ class DocumentTranslator(SendTranslator):
|
||||||
metadata_dict = {}
|
metadata_dict = {}
|
||||||
if obj.metadata.id:
|
if obj.metadata.id:
|
||||||
metadata_dict["id"] = obj.metadata.id
|
metadata_dict["id"] = obj.metadata.id
|
||||||
|
if obj.metadata.root:
|
||||||
|
metadata_dict["root"] = obj.metadata.root
|
||||||
if obj.metadata.user:
|
if obj.metadata.user:
|
||||||
metadata_dict["user"] = obj.metadata.user
|
metadata_dict["user"] = obj.metadata.user
|
||||||
if obj.metadata.collection:
|
if obj.metadata.collection:
|
||||||
|
|
@ -53,6 +56,7 @@ class TextDocumentTranslator(SendTranslator):
|
||||||
return TextDocument(
|
return TextDocument(
|
||||||
metadata=Metadata(
|
metadata=Metadata(
|
||||||
id=data.get("id"),
|
id=data.get("id"),
|
||||||
|
root=data.get("root", ""),
|
||||||
user=data.get("user", "trustgraph"),
|
user=data.get("user", "trustgraph"),
|
||||||
collection=data.get("collection", "default"),
|
collection=data.get("collection", "default"),
|
||||||
),
|
),
|
||||||
|
|
@ -68,6 +72,8 @@ class TextDocumentTranslator(SendTranslator):
|
||||||
metadata_dict = {}
|
metadata_dict = {}
|
||||||
if obj.metadata.id:
|
if obj.metadata.id:
|
||||||
metadata_dict["id"] = obj.metadata.id
|
metadata_dict["id"] = obj.metadata.id
|
||||||
|
if obj.metadata.root:
|
||||||
|
metadata_dict["root"] = obj.metadata.root
|
||||||
if obj.metadata.user:
|
if obj.metadata.user:
|
||||||
metadata_dict["user"] = obj.metadata.user
|
metadata_dict["user"] = obj.metadata.user
|
||||||
if obj.metadata.collection:
|
if obj.metadata.collection:
|
||||||
|
|
@ -86,6 +92,7 @@ class ChunkTranslator(SendTranslator):
|
||||||
return Chunk(
|
return Chunk(
|
||||||
metadata=Metadata(
|
metadata=Metadata(
|
||||||
id=data.get("id"),
|
id=data.get("id"),
|
||||||
|
root=data.get("root", ""),
|
||||||
user=data.get("user", "trustgraph"),
|
user=data.get("user", "trustgraph"),
|
||||||
collection=data.get("collection", "default"),
|
collection=data.get("collection", "default"),
|
||||||
),
|
),
|
||||||
|
|
@ -101,6 +108,8 @@ class ChunkTranslator(SendTranslator):
|
||||||
metadata_dict = {}
|
metadata_dict = {}
|
||||||
if obj.metadata.id:
|
if obj.metadata.id:
|
||||||
metadata_dict["id"] = obj.metadata.id
|
metadata_dict["id"] = obj.metadata.id
|
||||||
|
if obj.metadata.root:
|
||||||
|
metadata_dict["root"] = obj.metadata.root
|
||||||
if obj.metadata.user:
|
if obj.metadata.user:
|
||||||
metadata_dict["user"] = obj.metadata.user
|
metadata_dict["user"] = obj.metadata.user
|
||||||
if obj.metadata.collection:
|
if obj.metadata.collection:
|
||||||
|
|
@ -129,6 +138,7 @@ class DocumentEmbeddingsTranslator(SendTranslator):
|
||||||
return DocumentEmbeddings(
|
return DocumentEmbeddings(
|
||||||
metadata=Metadata(
|
metadata=Metadata(
|
||||||
id=metadata.get("id"),
|
id=metadata.get("id"),
|
||||||
|
root=metadata.get("root", ""),
|
||||||
user=metadata.get("user", "trustgraph"),
|
user=metadata.get("user", "trustgraph"),
|
||||||
collection=metadata.get("collection", "default"),
|
collection=metadata.get("collection", "default"),
|
||||||
),
|
),
|
||||||
|
|
@ -150,6 +160,8 @@ class DocumentEmbeddingsTranslator(SendTranslator):
|
||||||
metadata_dict = {}
|
metadata_dict = {}
|
||||||
if obj.metadata.id:
|
if obj.metadata.id:
|
||||||
metadata_dict["id"] = obj.metadata.id
|
metadata_dict["id"] = obj.metadata.id
|
||||||
|
if obj.metadata.root:
|
||||||
|
metadata_dict["root"] = obj.metadata.root
|
||||||
if obj.metadata.user:
|
if obj.metadata.user:
|
||||||
metadata_dict["user"] = obj.metadata.user
|
metadata_dict["user"] = obj.metadata.user
|
||||||
if obj.metadata.collection:
|
if obj.metadata.collection:
|
||||||
|
|
|
||||||
|
|
@ -20,6 +20,7 @@ class KnowledgeRequestTranslator(MessageTranslator):
|
||||||
triples = Triples(
|
triples = Triples(
|
||||||
metadata=Metadata(
|
metadata=Metadata(
|
||||||
id=data["triples"]["metadata"]["id"],
|
id=data["triples"]["metadata"]["id"],
|
||||||
|
root=data["triples"]["metadata"].get("root", ""),
|
||||||
user=data["triples"]["metadata"]["user"],
|
user=data["triples"]["metadata"]["user"],
|
||||||
collection=data["triples"]["metadata"]["collection"]
|
collection=data["triples"]["metadata"]["collection"]
|
||||||
),
|
),
|
||||||
|
|
@ -31,6 +32,7 @@ class KnowledgeRequestTranslator(MessageTranslator):
|
||||||
graph_embeddings = GraphEmbeddings(
|
graph_embeddings = GraphEmbeddings(
|
||||||
metadata=Metadata(
|
metadata=Metadata(
|
||||||
id=data["graph-embeddings"]["metadata"]["id"],
|
id=data["graph-embeddings"]["metadata"]["id"],
|
||||||
|
root=data["graph-embeddings"]["metadata"].get("root", ""),
|
||||||
user=data["graph-embeddings"]["metadata"]["user"],
|
user=data["graph-embeddings"]["metadata"]["user"],
|
||||||
collection=data["graph-embeddings"]["metadata"]["collection"]
|
collection=data["graph-embeddings"]["metadata"]["collection"]
|
||||||
),
|
),
|
||||||
|
|
@ -71,6 +73,7 @@ class KnowledgeRequestTranslator(MessageTranslator):
|
||||||
result["triples"] = {
|
result["triples"] = {
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"id": obj.triples.metadata.id,
|
"id": obj.triples.metadata.id,
|
||||||
|
"root": obj.triples.metadata.root,
|
||||||
"user": obj.triples.metadata.user,
|
"user": obj.triples.metadata.user,
|
||||||
"collection": obj.triples.metadata.collection,
|
"collection": obj.triples.metadata.collection,
|
||||||
},
|
},
|
||||||
|
|
@ -81,6 +84,7 @@ class KnowledgeRequestTranslator(MessageTranslator):
|
||||||
result["graph-embeddings"] = {
|
result["graph-embeddings"] = {
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"id": obj.graph_embeddings.metadata.id,
|
"id": obj.graph_embeddings.metadata.id,
|
||||||
|
"root": obj.graph_embeddings.metadata.root,
|
||||||
"user": obj.graph_embeddings.metadata.user,
|
"user": obj.graph_embeddings.metadata.user,
|
||||||
"collection": obj.graph_embeddings.metadata.collection,
|
"collection": obj.graph_embeddings.metadata.collection,
|
||||||
},
|
},
|
||||||
|
|
@ -117,6 +121,7 @@ class KnowledgeResponseTranslator(MessageTranslator):
|
||||||
"triples": {
|
"triples": {
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"id": obj.triples.metadata.id,
|
"id": obj.triples.metadata.id,
|
||||||
|
"root": obj.triples.metadata.root,
|
||||||
"user": obj.triples.metadata.user,
|
"user": obj.triples.metadata.user,
|
||||||
"collection": obj.triples.metadata.collection,
|
"collection": obj.triples.metadata.collection,
|
||||||
},
|
},
|
||||||
|
|
@ -130,6 +135,7 @@ class KnowledgeResponseTranslator(MessageTranslator):
|
||||||
"graph-embeddings": {
|
"graph-embeddings": {
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"id": obj.graph_embeddings.metadata.id,
|
"id": obj.graph_embeddings.metadata.id,
|
||||||
|
"root": obj.graph_embeddings.metadata.root,
|
||||||
"user": obj.graph_embeddings.metadata.user,
|
"user": obj.graph_embeddings.metadata.user,
|
||||||
"collection": obj.graph_embeddings.metadata.collection,
|
"collection": obj.graph_embeddings.metadata.collection,
|
||||||
},
|
},
|
||||||
|
|
|
||||||
|
|
@ -5,6 +5,9 @@ class Metadata:
|
||||||
# Source identifier
|
# Source identifier
|
||||||
id: str = ""
|
id: str = ""
|
||||||
|
|
||||||
|
# Root document identifier (set by librarian, preserved through pipeline)
|
||||||
|
root: str = ""
|
||||||
|
|
||||||
# Collection management
|
# Collection management
|
||||||
user: str = ""
|
user: str = ""
|
||||||
collection: str = ""
|
collection: str = ""
|
||||||
|
|
|
||||||
|
|
@ -178,6 +178,7 @@ class Processor(ChunkingService):
|
||||||
await flow("triples").send(Triples(
|
await flow("triples").send(Triples(
|
||||||
metadata=Metadata(
|
metadata=Metadata(
|
||||||
id=chunk_uri,
|
id=chunk_uri,
|
||||||
|
root=v.metadata.root,
|
||||||
user=v.metadata.user,
|
user=v.metadata.user,
|
||||||
collection=v.metadata.collection,
|
collection=v.metadata.collection,
|
||||||
),
|
),
|
||||||
|
|
@ -188,6 +189,7 @@ class Processor(ChunkingService):
|
||||||
r = Chunk(
|
r = Chunk(
|
||||||
metadata=Metadata(
|
metadata=Metadata(
|
||||||
id=chunk_uri,
|
id=chunk_uri,
|
||||||
|
root=v.metadata.root,
|
||||||
user=v.metadata.user,
|
user=v.metadata.user,
|
||||||
collection=v.metadata.collection,
|
collection=v.metadata.collection,
|
||||||
),
|
),
|
||||||
|
|
|
||||||
|
|
@ -302,6 +302,7 @@ class Processor(FlowProcessor):
|
||||||
await flow("triples").send(Triples(
|
await flow("triples").send(Triples(
|
||||||
metadata=Metadata(
|
metadata=Metadata(
|
||||||
id=pg_uri,
|
id=pg_uri,
|
||||||
|
root=v.metadata.root,
|
||||||
user=v.metadata.user,
|
user=v.metadata.user,
|
||||||
collection=v.metadata.collection,
|
collection=v.metadata.collection,
|
||||||
),
|
),
|
||||||
|
|
@ -313,6 +314,7 @@ class Processor(FlowProcessor):
|
||||||
r = TextDocument(
|
r = TextDocument(
|
||||||
metadata=Metadata(
|
metadata=Metadata(
|
||||||
id=pg_uri,
|
id=pg_uri,
|
||||||
|
root=v.metadata.root,
|
||||||
user=v.metadata.user,
|
user=v.metadata.user,
|
||||||
collection=v.metadata.collection,
|
collection=v.metadata.collection,
|
||||||
),
|
),
|
||||||
|
|
|
||||||
|
|
@ -104,6 +104,7 @@ class Processor(FlowProcessor):
|
||||||
tpls = Triples(
|
tpls = Triples(
|
||||||
metadata = Metadata(
|
metadata = Metadata(
|
||||||
id = metadata.id,
|
id = metadata.id,
|
||||||
|
root = metadata.root,
|
||||||
user = metadata.user,
|
user = metadata.user,
|
||||||
collection = metadata.collection,
|
collection = metadata.collection,
|
||||||
),
|
),
|
||||||
|
|
@ -116,6 +117,7 @@ class Processor(FlowProcessor):
|
||||||
ecs = EntityContexts(
|
ecs = EntityContexts(
|
||||||
metadata = Metadata(
|
metadata = Metadata(
|
||||||
id = metadata.id,
|
id = metadata.id,
|
||||||
|
root = metadata.root,
|
||||||
user = metadata.user,
|
user = metadata.user,
|
||||||
collection = metadata.collection,
|
collection = metadata.collection,
|
||||||
),
|
),
|
||||||
|
|
|
||||||
|
|
@ -218,6 +218,7 @@ class Processor(FlowProcessor):
|
||||||
flow("triples"),
|
flow("triples"),
|
||||||
Metadata(
|
Metadata(
|
||||||
id=v.metadata.id,
|
id=v.metadata.id,
|
||||||
|
root=v.metadata.root,
|
||||||
user=v.metadata.user,
|
user=v.metadata.user,
|
||||||
collection=v.metadata.collection,
|
collection=v.metadata.collection,
|
||||||
),
|
),
|
||||||
|
|
@ -231,6 +232,7 @@ class Processor(FlowProcessor):
|
||||||
flow("entity-contexts"),
|
flow("entity-contexts"),
|
||||||
Metadata(
|
Metadata(
|
||||||
id=v.metadata.id,
|
id=v.metadata.id,
|
||||||
|
root=v.metadata.root,
|
||||||
user=v.metadata.user,
|
user=v.metadata.user,
|
||||||
collection=v.metadata.collection,
|
collection=v.metadata.collection,
|
||||||
),
|
),
|
||||||
|
|
|
||||||
|
|
@ -554,6 +554,7 @@ class Processor(FlowProcessor):
|
||||||
t = Triples(
|
t = Triples(
|
||||||
metadata=Metadata(
|
metadata=Metadata(
|
||||||
id=metadata.id,
|
id=metadata.id,
|
||||||
|
root=metadata.root,
|
||||||
user=metadata.user,
|
user=metadata.user,
|
||||||
collection=metadata.collection,
|
collection=metadata.collection,
|
||||||
),
|
),
|
||||||
|
|
@ -566,6 +567,7 @@ class Processor(FlowProcessor):
|
||||||
ec = EntityContexts(
|
ec = EntityContexts(
|
||||||
metadata=Metadata(
|
metadata=Metadata(
|
||||||
id=metadata.id,
|
id=metadata.id,
|
||||||
|
root=metadata.root,
|
||||||
user=metadata.user,
|
user=metadata.user,
|
||||||
collection=metadata.collection,
|
collection=metadata.collection,
|
||||||
),
|
),
|
||||||
|
|
|
||||||
|
|
@ -219,6 +219,7 @@ class Processor(FlowProcessor):
|
||||||
flow("triples"),
|
flow("triples"),
|
||||||
Metadata(
|
Metadata(
|
||||||
id=v.metadata.id,
|
id=v.metadata.id,
|
||||||
|
root=v.metadata.root,
|
||||||
user=v.metadata.user,
|
user=v.metadata.user,
|
||||||
collection=v.metadata.collection,
|
collection=v.metadata.collection,
|
||||||
),
|
),
|
||||||
|
|
|
||||||
|
|
@ -272,6 +272,7 @@ class Processor(FlowProcessor):
|
||||||
extracted = ExtractedObject(
|
extracted = ExtractedObject(
|
||||||
metadata=Metadata(
|
metadata=Metadata(
|
||||||
id=f"{v.metadata.id}:{schema_name}",
|
id=f"{v.metadata.id}:{schema_name}",
|
||||||
|
root=v.metadata.root,
|
||||||
user=v.metadata.user,
|
user=v.metadata.user,
|
||||||
collection=v.metadata.collection,
|
collection=v.metadata.collection,
|
||||||
),
|
),
|
||||||
|
|
|
||||||
|
|
@ -37,6 +37,7 @@ def serialize_triples(message):
|
||||||
return {
|
return {
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"id": message.metadata.id,
|
"id": message.metadata.id,
|
||||||
|
"root": message.metadata.root,
|
||||||
"user": message.metadata.user,
|
"user": message.metadata.user,
|
||||||
"collection": message.metadata.collection,
|
"collection": message.metadata.collection,
|
||||||
},
|
},
|
||||||
|
|
@ -48,6 +49,7 @@ def serialize_graph_embeddings(message):
|
||||||
return {
|
return {
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"id": message.metadata.id,
|
"id": message.metadata.id,
|
||||||
|
"root": message.metadata.root,
|
||||||
"user": message.metadata.user,
|
"user": message.metadata.user,
|
||||||
"collection": message.metadata.collection,
|
"collection": message.metadata.collection,
|
||||||
},
|
},
|
||||||
|
|
@ -65,6 +67,7 @@ def serialize_entity_contexts(message):
|
||||||
return {
|
return {
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"id": message.metadata.id,
|
"id": message.metadata.id,
|
||||||
|
"root": message.metadata.root,
|
||||||
"user": message.metadata.user,
|
"user": message.metadata.user,
|
||||||
"collection": message.metadata.collection,
|
"collection": message.metadata.collection,
|
||||||
},
|
},
|
||||||
|
|
@ -82,6 +85,7 @@ def serialize_document_embeddings(message):
|
||||||
return {
|
return {
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"id": message.metadata.id,
|
"id": message.metadata.id,
|
||||||
|
"root": message.metadata.root,
|
||||||
"user": message.metadata.user,
|
"user": message.metadata.user,
|
||||||
"collection": message.metadata.collection,
|
"collection": message.metadata.collection,
|
||||||
},
|
},
|
||||||
|
|
|
||||||
|
|
@ -48,6 +48,7 @@ class TriplesImport:
|
||||||
elt = Triples(
|
elt = Triples(
|
||||||
metadata=Metadata(
|
metadata=Metadata(
|
||||||
id=data["metadata"]["id"],
|
id=data["metadata"]["id"],
|
||||||
|
root=data["metadata"].get("root", ""),
|
||||||
user=data["metadata"]["user"],
|
user=data["metadata"]["user"],
|
||||||
collection=data["metadata"]["collection"],
|
collection=data["metadata"]["collection"],
|
||||||
),
|
),
|
||||||
|
|
|
||||||
|
|
@ -334,6 +334,7 @@ class Processor(AsyncProcessor):
|
||||||
triples_msg = Triples(
|
triples_msg = Triples(
|
||||||
metadata=Metadata(
|
metadata=Metadata(
|
||||||
id=doc_uri,
|
id=doc_uri,
|
||||||
|
root=document.id,
|
||||||
user=processing.user,
|
user=processing.user,
|
||||||
collection=processing.collection,
|
collection=processing.collection,
|
||||||
),
|
),
|
||||||
|
|
@ -380,6 +381,7 @@ class Processor(AsyncProcessor):
|
||||||
doc = TextDocument(
|
doc = TextDocument(
|
||||||
metadata = Metadata(
|
metadata = Metadata(
|
||||||
id = document.id,
|
id = document.id,
|
||||||
|
root = document.id,
|
||||||
user = processing.user,
|
user = processing.user,
|
||||||
collection = processing.collection
|
collection = processing.collection
|
||||||
),
|
),
|
||||||
|
|
@ -390,6 +392,7 @@ class Processor(AsyncProcessor):
|
||||||
doc = TextDocument(
|
doc = TextDocument(
|
||||||
metadata = Metadata(
|
metadata = Metadata(
|
||||||
id = document.id,
|
id = document.id,
|
||||||
|
root = document.id,
|
||||||
user = processing.user,
|
user = processing.user,
|
||||||
collection = processing.collection
|
collection = processing.collection
|
||||||
),
|
),
|
||||||
|
|
@ -405,6 +408,7 @@ class Processor(AsyncProcessor):
|
||||||
doc = Document(
|
doc = Document(
|
||||||
metadata = Metadata(
|
metadata = Metadata(
|
||||||
id = document.id,
|
id = document.id,
|
||||||
|
root = document.id,
|
||||||
user = processing.user,
|
user = processing.user,
|
||||||
collection = processing.collection
|
collection = processing.collection
|
||||||
),
|
),
|
||||||
|
|
@ -415,6 +419,7 @@ class Processor(AsyncProcessor):
|
||||||
doc = Document(
|
doc = Document(
|
||||||
metadata = Metadata(
|
metadata = Metadata(
|
||||||
id = document.id,
|
id = document.id,
|
||||||
|
root = document.id,
|
||||||
user = processing.user,
|
user = processing.user,
|
||||||
collection = processing.collection
|
collection = processing.collection
|
||||||
),
|
),
|
||||||
|
|
|
||||||
|
|
@ -233,7 +233,7 @@ class KnowledgeTableStore:
|
||||||
self.insert_triples_stmt,
|
self.insert_triples_stmt,
|
||||||
(
|
(
|
||||||
uuid.uuid4(), m.metadata.user,
|
uuid.uuid4(), m.metadata.user,
|
||||||
m.metadata.id, when,
|
m.metadata.root or m.metadata.id, when,
|
||||||
[], triples,
|
[], triples,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
@ -265,7 +265,7 @@ class KnowledgeTableStore:
|
||||||
self.insert_graph_embeddings_stmt,
|
self.insert_graph_embeddings_stmt,
|
||||||
(
|
(
|
||||||
uuid.uuid4(), m.metadata.user,
|
uuid.uuid4(), m.metadata.user,
|
||||||
m.metadata.id, when,
|
m.metadata.root or m.metadata.id, when,
|
||||||
[], entities,
|
[], entities,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
@ -297,7 +297,7 @@ class KnowledgeTableStore:
|
||||||
self.insert_document_embeddings_stmt,
|
self.insert_document_embeddings_stmt,
|
||||||
(
|
(
|
||||||
uuid.uuid4(), m.metadata.user,
|
uuid.uuid4(), m.metadata.user,
|
||||||
m.metadata.id, when,
|
m.metadata.root or m.metadata.id, when,
|
||||||
[], chunks,
|
[], chunks,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue