Remove redundant metadata (#685)

The metadata field (list of triples) in the pipeline Metadata class
was redundant. Document metadata triples already flow directly from
the librarian to the triple store via emit_document_provenance(); they
don't need to pass through the extraction pipeline.

Additionally, the chunker and PDF decoder were resetting metadata to
[] anyway, so any metadata passed through the pipeline was being
discarded.

Changes:
- Remove metadata field from Metadata dataclass
  (schema/core/metadata.py)
- Update all Metadata instantiations to drop the metadata=[]
  argument
- Remove metadata handling from translators (document_loading,
  knowledge)
- Remove metadata consumption from extractors (ontology, agent)
- Update gateway serializers and import handlers
- Update all unit, integration, and contract tests
This commit is contained in:
cybermaggedon 2026-03-11 10:51:39 +00:00 committed by GitHub
parent 1837d73f34
commit aa4f5c6c00
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
37 changed files with 106 additions and 343 deletions

View file

@ -2,38 +2,30 @@ import base64
from typing import Dict, Any
from ...schema import Document, TextDocument, Chunk, DocumentEmbeddings, ChunkEmbeddings
from .base import SendTranslator
from .metadata import DocumentMetadataTranslator
from .primitives import SubgraphTranslator
class DocumentTranslator(SendTranslator):
"""Translator for Document schema objects (PDF docs etc.)"""
def __init__(self):
self.subgraph_translator = SubgraphTranslator()
def to_pulsar(self, data: Dict[str, Any]) -> Document:
metadata = data.get("metadata", [])
# Handle base64 content validation
doc = base64.b64decode(data["data"])
from ...schema import Metadata
return Document(
metadata=Metadata(
id=data.get("id"),
metadata=self.subgraph_translator.to_pulsar(metadata) if metadata else [],
user=data.get("user", "trustgraph"),
collection=data.get("collection", "default"),
),
data=base64.b64encode(doc).decode("utf-8")
)
def from_pulsar(self, obj: Document) -> Dict[str, Any]:
result = {
"data": obj.data
}
if obj.metadata:
metadata_dict = {}
if obj.metadata.id:
@ -42,43 +34,36 @@ class DocumentTranslator(SendTranslator):
metadata_dict["user"] = obj.metadata.user
if obj.metadata.collection:
metadata_dict["collection"] = obj.metadata.collection
if obj.metadata.metadata:
metadata_dict["metadata"] = self.subgraph_translator.from_pulsar(obj.metadata.metadata)
result["metadata"] = metadata_dict
return result
class TextDocumentTranslator(SendTranslator):
"""Translator for TextDocument schema objects"""
def __init__(self):
self.subgraph_translator = SubgraphTranslator()
def to_pulsar(self, data: Dict[str, Any]) -> TextDocument:
metadata = data.get("metadata", [])
charset = data.get("charset", "utf-8")
# Text is base64 encoded in input
text = base64.b64decode(data["text"]).decode(charset)
from ...schema import Metadata
return TextDocument(
metadata=Metadata(
id=data.get("id"),
metadata=self.subgraph_translator.to_pulsar(metadata) if metadata else [],
user=data.get("user", "trustgraph"),
collection=data.get("collection", "default"),
),
text=text.encode("utf-8")
)
def from_pulsar(self, obj: TextDocument) -> Dict[str, Any]:
result = {
"text": obj.text.decode("utf-8") if isinstance(obj.text, bytes) else obj.text
}
if obj.metadata:
metadata_dict = {}
if obj.metadata.id:
@ -87,39 +72,31 @@ class TextDocumentTranslator(SendTranslator):
metadata_dict["user"] = obj.metadata.user
if obj.metadata.collection:
metadata_dict["collection"] = obj.metadata.collection
if obj.metadata.metadata:
metadata_dict["metadata"] = self.subgraph_translator.from_pulsar(obj.metadata.metadata)
result["metadata"] = metadata_dict
return result
class ChunkTranslator(SendTranslator):
"""Translator for Chunk schema objects"""
def __init__(self):
self.subgraph_translator = SubgraphTranslator()
def to_pulsar(self, data: Dict[str, Any]) -> Chunk:
metadata = data.get("metadata", [])
from ...schema import Metadata
return Chunk(
metadata=Metadata(
id=data.get("id"),
metadata=self.subgraph_translator.to_pulsar(metadata) if metadata else [],
user=data.get("user", "trustgraph"),
collection=data.get("collection", "default"),
),
chunk=data["chunk"].encode("utf-8") if isinstance(data["chunk"], str) else data["chunk"]
)
def from_pulsar(self, obj: Chunk) -> Dict[str, Any]:
result = {
"chunk": obj.chunk.decode("utf-8") if isinstance(obj.chunk, bytes) else obj.chunk
}
if obj.metadata:
metadata_dict = {}
if obj.metadata.id:
@ -128,20 +105,15 @@ class ChunkTranslator(SendTranslator):
metadata_dict["user"] = obj.metadata.user
if obj.metadata.collection:
metadata_dict["collection"] = obj.metadata.collection
if obj.metadata.metadata:
metadata_dict["metadata"] = self.subgraph_translator.from_pulsar(obj.metadata.metadata)
result["metadata"] = metadata_dict
return result
class DocumentEmbeddingsTranslator(SendTranslator):
"""Translator for DocumentEmbeddings schema objects"""
def __init__(self):
self.subgraph_translator = SubgraphTranslator()
def to_pulsar(self, data: Dict[str, Any]) -> DocumentEmbeddings:
metadata = data.get("metadata", {})
@ -157,13 +129,12 @@ class DocumentEmbeddingsTranslator(SendTranslator):
return DocumentEmbeddings(
metadata=Metadata(
id=metadata.get("id"),
metadata=self.subgraph_translator.to_pulsar(metadata.get("metadata", [])),
user=metadata.get("user", "trustgraph"),
collection=metadata.get("collection", "default"),
),
chunks=chunks
)
def from_pulsar(self, obj: DocumentEmbeddings) -> Dict[str, Any]:
result = {
"chunks": [
@ -174,7 +145,7 @@ class DocumentEmbeddingsTranslator(SendTranslator):
for chunk in obj.chunks
]
}
if obj.metadata:
metadata_dict = {}
if obj.metadata.id:
@ -183,9 +154,7 @@ class DocumentEmbeddingsTranslator(SendTranslator):
metadata_dict["user"] = obj.metadata.user
if obj.metadata.collection:
metadata_dict["collection"] = obj.metadata.collection
if obj.metadata.metadata:
metadata_dict["metadata"] = self.subgraph_translator.from_pulsar(obj.metadata.metadata)
result["metadata"] = metadata_dict
return result

View file

@ -1,43 +1,36 @@
from typing import Dict, Any, Tuple, Optional
from ...schema import (
KnowledgeRequest, KnowledgeResponse, Triples, GraphEmbeddings,
KnowledgeRequest, KnowledgeResponse, Triples, GraphEmbeddings,
Metadata, EntityEmbeddings
)
from .base import MessageTranslator
from .primitives import ValueTranslator, SubgraphTranslator
from .metadata import DocumentMetadataTranslator
class KnowledgeRequestTranslator(MessageTranslator):
"""Translator for KnowledgeRequest schema objects"""
def __init__(self):
self.value_translator = ValueTranslator()
self.subgraph_translator = SubgraphTranslator()
def to_pulsar(self, data: Dict[str, Any]) -> KnowledgeRequest:
triples = None
if "triples" in data:
triples = Triples(
metadata=Metadata(
id=data["triples"]["metadata"]["id"],
metadata=self.subgraph_translator.to_pulsar(
data["triples"]["metadata"]["metadata"]
),
user=data["triples"]["metadata"]["user"],
collection=data["triples"]["metadata"]["collection"]
),
triples=self.subgraph_translator.to_pulsar(data["triples"]["triples"]),
)
graph_embeddings = None
if "graph-embeddings" in data:
graph_embeddings = GraphEmbeddings(
metadata=Metadata(
id=data["graph-embeddings"]["metadata"]["id"],
metadata=self.subgraph_translator.to_pulsar(
data["graph-embeddings"]["metadata"]["metadata"]
),
user=data["graph-embeddings"]["metadata"]["user"],
collection=data["graph-embeddings"]["metadata"]["collection"]
),
@ -49,7 +42,7 @@ class KnowledgeRequestTranslator(MessageTranslator):
for ent in data["graph-embeddings"]["entities"]
]
)
return KnowledgeRequest(
operation=data.get("operation"),
user=data.get("user"),
@ -59,10 +52,10 @@ class KnowledgeRequestTranslator(MessageTranslator):
triples=triples,
graph_embeddings=graph_embeddings,
)
def from_pulsar(self, obj: KnowledgeRequest) -> Dict[str, Any]:
result = {}
if obj.operation:
result["operation"] = obj.operation
if obj.user:
@ -73,27 +66,21 @@ class KnowledgeRequestTranslator(MessageTranslator):
result["flow"] = obj.flow
if obj.collection:
result["collection"] = obj.collection
if obj.triples:
result["triples"] = {
"metadata": {
"id": obj.triples.metadata.id,
"metadata": self.subgraph_translator.from_pulsar(
obj.triples.metadata.metadata
),
"user": obj.triples.metadata.user,
"collection": obj.triples.metadata.collection,
},
"triples": self.subgraph_translator.from_pulsar(obj.triples.triples),
}
if obj.graph_embeddings:
result["graph-embeddings"] = {
"metadata": {
"id": obj.graph_embeddings.metadata.id,
"metadata": self.subgraph_translator.from_pulsar(
obj.graph_embeddings.metadata.metadata
),
"user": obj.graph_embeddings.metadata.user,
"collection": obj.graph_embeddings.metadata.collection,
},
@ -105,50 +92,44 @@ class KnowledgeRequestTranslator(MessageTranslator):
for entity in obj.graph_embeddings.entities
],
}
return result
class KnowledgeResponseTranslator(MessageTranslator):
"""Translator for KnowledgeResponse schema objects"""
def __init__(self):
self.value_translator = ValueTranslator()
self.subgraph_translator = SubgraphTranslator()
def to_pulsar(self, data: Dict[str, Any]) -> KnowledgeResponse:
raise NotImplementedError("Response translation to Pulsar not typically needed")
def from_pulsar(self, obj: KnowledgeResponse) -> Dict[str, Any]:
# Response to list operation
if obj.ids is not None:
return {"ids": obj.ids}
# Streaming triples response
if obj.triples:
return {
"triples": {
"metadata": {
"id": obj.triples.metadata.id,
"metadata": self.subgraph_translator.from_pulsar(
obj.triples.metadata.metadata
),
"user": obj.triples.metadata.user,
"collection": obj.triples.metadata.collection,
},
"triples": self.subgraph_translator.from_pulsar(obj.triples.triples),
}
}
# Streaming graph embeddings response
if obj.graph_embeddings:
return {
"graph-embeddings": {
"metadata": {
"id": obj.graph_embeddings.metadata.id,
"metadata": self.subgraph_translator.from_pulsar(
obj.graph_embeddings.metadata.metadata
),
"user": obj.graph_embeddings.metadata.user,
"collection": obj.graph_embeddings.metadata.collection,
},
@ -161,11 +142,11 @@ class KnowledgeResponseTranslator(MessageTranslator):
],
}
}
# End of stream marker
if obj.eos is True:
return {"eos": True}
# Empty response (successful delete)
return {}

View file

@ -1,14 +1,10 @@
from dataclasses import dataclass, field
from .primitives import Triple
from dataclasses import dataclass
@dataclass
class Metadata:
# Source identifier
id: str = ""
# Subgraph
metadata: list[Triple] = field(default_factory=list)
# Collection management
user: str = ""
collection: str = ""