mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-04-25 00:16:23 +02:00
The id field in pipeline Metadata was being overwritten at each processing stage (#686)
The id field in pipeline Metadata was being overwritten at each processing stage (document → page → chunk), causing knowledge storage to create separate cores per chunk instead of grouping by document.

Add a root field that:
- Is set by librarian to the original document ID
- Is copied unchanged through PDF decoder, chunkers, and extractors
- Is used by knowledge storage for document_id grouping (with fallback to id)

Changes:
- Add root field to Metadata schema with empty string default
- Set root=document.id in librarian when initiating document processing
- Copy root through PDF decoder, recursive chunker, and all extractors
- Update knowledge storage to use root (or id as fallback) for grouping
- Add root handling to translators and gateway serialization
- Update test mock Metadata class to include root parameter
This commit is contained in:
parent
aa4f5c6c00
commit
286f762369
15 changed files with 48 additions and 4 deletions
|
|
@ -15,6 +15,7 @@ class DocumentTranslator(SendTranslator):
|
|||
return Document(
|
||||
metadata=Metadata(
|
||||
id=data.get("id"),
|
||||
root=data.get("root", ""),
|
||||
user=data.get("user", "trustgraph"),
|
||||
collection=data.get("collection", "default"),
|
||||
),
|
||||
|
|
@ -30,6 +31,8 @@ class DocumentTranslator(SendTranslator):
|
|||
metadata_dict = {}
|
||||
if obj.metadata.id:
|
||||
metadata_dict["id"] = obj.metadata.id
|
||||
if obj.metadata.root:
|
||||
metadata_dict["root"] = obj.metadata.root
|
||||
if obj.metadata.user:
|
||||
metadata_dict["user"] = obj.metadata.user
|
||||
if obj.metadata.collection:
|
||||
|
|
@ -53,6 +56,7 @@ class TextDocumentTranslator(SendTranslator):
|
|||
return TextDocument(
|
||||
metadata=Metadata(
|
||||
id=data.get("id"),
|
||||
root=data.get("root", ""),
|
||||
user=data.get("user", "trustgraph"),
|
||||
collection=data.get("collection", "default"),
|
||||
),
|
||||
|
|
@ -68,6 +72,8 @@ class TextDocumentTranslator(SendTranslator):
|
|||
metadata_dict = {}
|
||||
if obj.metadata.id:
|
||||
metadata_dict["id"] = obj.metadata.id
|
||||
if obj.metadata.root:
|
||||
metadata_dict["root"] = obj.metadata.root
|
||||
if obj.metadata.user:
|
||||
metadata_dict["user"] = obj.metadata.user
|
||||
if obj.metadata.collection:
|
||||
|
|
@ -86,6 +92,7 @@ class ChunkTranslator(SendTranslator):
|
|||
return Chunk(
|
||||
metadata=Metadata(
|
||||
id=data.get("id"),
|
||||
root=data.get("root", ""),
|
||||
user=data.get("user", "trustgraph"),
|
||||
collection=data.get("collection", "default"),
|
||||
),
|
||||
|
|
@ -101,6 +108,8 @@ class ChunkTranslator(SendTranslator):
|
|||
metadata_dict = {}
|
||||
if obj.metadata.id:
|
||||
metadata_dict["id"] = obj.metadata.id
|
||||
if obj.metadata.root:
|
||||
metadata_dict["root"] = obj.metadata.root
|
||||
if obj.metadata.user:
|
||||
metadata_dict["user"] = obj.metadata.user
|
||||
if obj.metadata.collection:
|
||||
|
|
@ -129,6 +138,7 @@ class DocumentEmbeddingsTranslator(SendTranslator):
|
|||
return DocumentEmbeddings(
|
||||
metadata=Metadata(
|
||||
id=metadata.get("id"),
|
||||
root=metadata.get("root", ""),
|
||||
user=metadata.get("user", "trustgraph"),
|
||||
collection=metadata.get("collection", "default"),
|
||||
),
|
||||
|
|
@ -150,6 +160,8 @@ class DocumentEmbeddingsTranslator(SendTranslator):
|
|||
metadata_dict = {}
|
||||
if obj.metadata.id:
|
||||
metadata_dict["id"] = obj.metadata.id
|
||||
if obj.metadata.root:
|
||||
metadata_dict["root"] = obj.metadata.root
|
||||
if obj.metadata.user:
|
||||
metadata_dict["user"] = obj.metadata.user
|
||||
if obj.metadata.collection:
|
||||
|
|
|
|||
|
|
@ -20,6 +20,7 @@ class KnowledgeRequestTranslator(MessageTranslator):
|
|||
triples = Triples(
|
||||
metadata=Metadata(
|
||||
id=data["triples"]["metadata"]["id"],
|
||||
root=data["triples"]["metadata"].get("root", ""),
|
||||
user=data["triples"]["metadata"]["user"],
|
||||
collection=data["triples"]["metadata"]["collection"]
|
||||
),
|
||||
|
|
@ -31,6 +32,7 @@ class KnowledgeRequestTranslator(MessageTranslator):
|
|||
graph_embeddings = GraphEmbeddings(
|
||||
metadata=Metadata(
|
||||
id=data["graph-embeddings"]["metadata"]["id"],
|
||||
root=data["graph-embeddings"]["metadata"].get("root", ""),
|
||||
user=data["graph-embeddings"]["metadata"]["user"],
|
||||
collection=data["graph-embeddings"]["metadata"]["collection"]
|
||||
),
|
||||
|
|
@ -71,6 +73,7 @@ class KnowledgeRequestTranslator(MessageTranslator):
|
|||
result["triples"] = {
|
||||
"metadata": {
|
||||
"id": obj.triples.metadata.id,
|
||||
"root": obj.triples.metadata.root,
|
||||
"user": obj.triples.metadata.user,
|
||||
"collection": obj.triples.metadata.collection,
|
||||
},
|
||||
|
|
@ -81,6 +84,7 @@ class KnowledgeRequestTranslator(MessageTranslator):
|
|||
result["graph-embeddings"] = {
|
||||
"metadata": {
|
||||
"id": obj.graph_embeddings.metadata.id,
|
||||
"root": obj.graph_embeddings.metadata.root,
|
||||
"user": obj.graph_embeddings.metadata.user,
|
||||
"collection": obj.graph_embeddings.metadata.collection,
|
||||
},
|
||||
|
|
@ -117,6 +121,7 @@ class KnowledgeResponseTranslator(MessageTranslator):
|
|||
"triples": {
|
||||
"metadata": {
|
||||
"id": obj.triples.metadata.id,
|
||||
"root": obj.triples.metadata.root,
|
||||
"user": obj.triples.metadata.user,
|
||||
"collection": obj.triples.metadata.collection,
|
||||
},
|
||||
|
|
@ -130,6 +135,7 @@ class KnowledgeResponseTranslator(MessageTranslator):
|
|||
"graph-embeddings": {
|
||||
"metadata": {
|
||||
"id": obj.graph_embeddings.metadata.id,
|
||||
"root": obj.graph_embeddings.metadata.root,
|
||||
"user": obj.graph_embeddings.metadata.user,
|
||||
"collection": obj.graph_embeddings.metadata.collection,
|
||||
},
|
||||
|
|
|
|||
|
|
@ -5,6 +5,9 @@ class Metadata:
|
|||
# Source identifier
|
||||
id: str = ""
|
||||
|
||||
# Root document identifier (set by librarian, preserved through pipeline)
|
||||
root: str = ""
|
||||
|
||||
# Collection management
|
||||
user: str = ""
|
||||
collection: str = ""
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue