mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-05-01 11:26:22 +02:00
Remove redundant metadata (#685)
The metadata field (a list of triples) in the pipeline Metadata class was redundant. Document metadata triples already flow directly from the librarian to the triple store via emit_document_provenance(), so they do not need to pass through the extraction pipeline. Additionally, the chunker and the PDF decoder were overwriting metadata with [] anyway, so any metadata passed through the pipeline was being discarded.

Changes:
- Remove the metadata field from the Metadata dataclass (schema/core/metadata.py)
- Update all Metadata instantiations to remove the metadata=[] parameter
- Remove metadata handling from the translators (document_loading, knowledge)
- Remove metadata consumption from the extractors (ontology, agent)
- Update the gateway serializers and import handlers
- Update all unit, integration, and contract tests
This commit is contained in:
parent
1837d73f34
commit
aa4f5c6c00
37 changed files with 106 additions and 343 deletions
|
|
@ -2,38 +2,30 @@ import base64
|
|||
from typing import Dict, Any
|
||||
from ...schema import Document, TextDocument, Chunk, DocumentEmbeddings, ChunkEmbeddings
|
||||
from .base import SendTranslator
|
||||
from .metadata import DocumentMetadataTranslator
|
||||
from .primitives import SubgraphTranslator
|
||||
|
||||
|
||||
class DocumentTranslator(SendTranslator):
|
||||
"""Translator for Document schema objects (PDF docs etc.)"""
|
||||
|
||||
def __init__(self):
|
||||
self.subgraph_translator = SubgraphTranslator()
|
||||
|
||||
|
||||
def to_pulsar(self, data: Dict[str, Any]) -> Document:
|
||||
metadata = data.get("metadata", [])
|
||||
|
||||
# Handle base64 content validation
|
||||
doc = base64.b64decode(data["data"])
|
||||
|
||||
|
||||
from ...schema import Metadata
|
||||
return Document(
|
||||
metadata=Metadata(
|
||||
id=data.get("id"),
|
||||
metadata=self.subgraph_translator.to_pulsar(metadata) if metadata else [],
|
||||
user=data.get("user", "trustgraph"),
|
||||
collection=data.get("collection", "default"),
|
||||
),
|
||||
data=base64.b64encode(doc).decode("utf-8")
|
||||
)
|
||||
|
||||
|
||||
def from_pulsar(self, obj: Document) -> Dict[str, Any]:
|
||||
result = {
|
||||
"data": obj.data
|
||||
}
|
||||
|
||||
|
||||
if obj.metadata:
|
||||
metadata_dict = {}
|
||||
if obj.metadata.id:
|
||||
|
|
@ -42,43 +34,36 @@ class DocumentTranslator(SendTranslator):
|
|||
metadata_dict["user"] = obj.metadata.user
|
||||
if obj.metadata.collection:
|
||||
metadata_dict["collection"] = obj.metadata.collection
|
||||
if obj.metadata.metadata:
|
||||
metadata_dict["metadata"] = self.subgraph_translator.from_pulsar(obj.metadata.metadata)
|
||||
|
||||
|
||||
result["metadata"] = metadata_dict
|
||||
|
||||
|
||||
return result
|
||||
|
||||
|
||||
class TextDocumentTranslator(SendTranslator):
|
||||
"""Translator for TextDocument schema objects"""
|
||||
|
||||
def __init__(self):
|
||||
self.subgraph_translator = SubgraphTranslator()
|
||||
|
||||
|
||||
def to_pulsar(self, data: Dict[str, Any]) -> TextDocument:
|
||||
metadata = data.get("metadata", [])
|
||||
charset = data.get("charset", "utf-8")
|
||||
|
||||
|
||||
# Text is base64 encoded in input
|
||||
text = base64.b64decode(data["text"]).decode(charset)
|
||||
|
||||
|
||||
from ...schema import Metadata
|
||||
return TextDocument(
|
||||
metadata=Metadata(
|
||||
id=data.get("id"),
|
||||
metadata=self.subgraph_translator.to_pulsar(metadata) if metadata else [],
|
||||
user=data.get("user", "trustgraph"),
|
||||
collection=data.get("collection", "default"),
|
||||
),
|
||||
text=text.encode("utf-8")
|
||||
)
|
||||
|
||||
|
||||
def from_pulsar(self, obj: TextDocument) -> Dict[str, Any]:
|
||||
result = {
|
||||
"text": obj.text.decode("utf-8") if isinstance(obj.text, bytes) else obj.text
|
||||
}
|
||||
|
||||
|
||||
if obj.metadata:
|
||||
metadata_dict = {}
|
||||
if obj.metadata.id:
|
||||
|
|
@ -87,39 +72,31 @@ class TextDocumentTranslator(SendTranslator):
|
|||
metadata_dict["user"] = obj.metadata.user
|
||||
if obj.metadata.collection:
|
||||
metadata_dict["collection"] = obj.metadata.collection
|
||||
if obj.metadata.metadata:
|
||||
metadata_dict["metadata"] = self.subgraph_translator.from_pulsar(obj.metadata.metadata)
|
||||
|
||||
|
||||
result["metadata"] = metadata_dict
|
||||
|
||||
|
||||
return result
|
||||
|
||||
|
||||
class ChunkTranslator(SendTranslator):
|
||||
"""Translator for Chunk schema objects"""
|
||||
|
||||
def __init__(self):
|
||||
self.subgraph_translator = SubgraphTranslator()
|
||||
|
||||
|
||||
def to_pulsar(self, data: Dict[str, Any]) -> Chunk:
|
||||
metadata = data.get("metadata", [])
|
||||
|
||||
from ...schema import Metadata
|
||||
return Chunk(
|
||||
metadata=Metadata(
|
||||
id=data.get("id"),
|
||||
metadata=self.subgraph_translator.to_pulsar(metadata) if metadata else [],
|
||||
user=data.get("user", "trustgraph"),
|
||||
collection=data.get("collection", "default"),
|
||||
),
|
||||
chunk=data["chunk"].encode("utf-8") if isinstance(data["chunk"], str) else data["chunk"]
|
||||
)
|
||||
|
||||
|
||||
def from_pulsar(self, obj: Chunk) -> Dict[str, Any]:
|
||||
result = {
|
||||
"chunk": obj.chunk.decode("utf-8") if isinstance(obj.chunk, bytes) else obj.chunk
|
||||
}
|
||||
|
||||
|
||||
if obj.metadata:
|
||||
metadata_dict = {}
|
||||
if obj.metadata.id:
|
||||
|
|
@ -128,20 +105,15 @@ class ChunkTranslator(SendTranslator):
|
|||
metadata_dict["user"] = obj.metadata.user
|
||||
if obj.metadata.collection:
|
||||
metadata_dict["collection"] = obj.metadata.collection
|
||||
if obj.metadata.metadata:
|
||||
metadata_dict["metadata"] = self.subgraph_translator.from_pulsar(obj.metadata.metadata)
|
||||
|
||||
|
||||
result["metadata"] = metadata_dict
|
||||
|
||||
|
||||
return result
|
||||
|
||||
|
||||
class DocumentEmbeddingsTranslator(SendTranslator):
|
||||
"""Translator for DocumentEmbeddings schema objects"""
|
||||
|
||||
def __init__(self):
|
||||
self.subgraph_translator = SubgraphTranslator()
|
||||
|
||||
|
||||
def to_pulsar(self, data: Dict[str, Any]) -> DocumentEmbeddings:
|
||||
metadata = data.get("metadata", {})
|
||||
|
||||
|
|
@ -157,13 +129,12 @@ class DocumentEmbeddingsTranslator(SendTranslator):
|
|||
return DocumentEmbeddings(
|
||||
metadata=Metadata(
|
||||
id=metadata.get("id"),
|
||||
metadata=self.subgraph_translator.to_pulsar(metadata.get("metadata", [])),
|
||||
user=metadata.get("user", "trustgraph"),
|
||||
collection=metadata.get("collection", "default"),
|
||||
),
|
||||
chunks=chunks
|
||||
)
|
||||
|
||||
|
||||
def from_pulsar(self, obj: DocumentEmbeddings) -> Dict[str, Any]:
|
||||
result = {
|
||||
"chunks": [
|
||||
|
|
@ -174,7 +145,7 @@ class DocumentEmbeddingsTranslator(SendTranslator):
|
|||
for chunk in obj.chunks
|
||||
]
|
||||
}
|
||||
|
||||
|
||||
if obj.metadata:
|
||||
metadata_dict = {}
|
||||
if obj.metadata.id:
|
||||
|
|
@ -183,9 +154,7 @@ class DocumentEmbeddingsTranslator(SendTranslator):
|
|||
metadata_dict["user"] = obj.metadata.user
|
||||
if obj.metadata.collection:
|
||||
metadata_dict["collection"] = obj.metadata.collection
|
||||
if obj.metadata.metadata:
|
||||
metadata_dict["metadata"] = self.subgraph_translator.from_pulsar(obj.metadata.metadata)
|
||||
|
||||
|
||||
result["metadata"] = metadata_dict
|
||||
|
||||
|
||||
return result
|
||||
|
|
@ -1,43 +1,36 @@
|
|||
from typing import Dict, Any, Tuple, Optional
|
||||
from ...schema import (
|
||||
KnowledgeRequest, KnowledgeResponse, Triples, GraphEmbeddings,
|
||||
KnowledgeRequest, KnowledgeResponse, Triples, GraphEmbeddings,
|
||||
Metadata, EntityEmbeddings
|
||||
)
|
||||
from .base import MessageTranslator
|
||||
from .primitives import ValueTranslator, SubgraphTranslator
|
||||
from .metadata import DocumentMetadataTranslator
|
||||
|
||||
|
||||
class KnowledgeRequestTranslator(MessageTranslator):
|
||||
"""Translator for KnowledgeRequest schema objects"""
|
||||
|
||||
|
||||
def __init__(self):
|
||||
self.value_translator = ValueTranslator()
|
||||
self.subgraph_translator = SubgraphTranslator()
|
||||
|
||||
|
||||
def to_pulsar(self, data: Dict[str, Any]) -> KnowledgeRequest:
|
||||
triples = None
|
||||
if "triples" in data:
|
||||
triples = Triples(
|
||||
metadata=Metadata(
|
||||
id=data["triples"]["metadata"]["id"],
|
||||
metadata=self.subgraph_translator.to_pulsar(
|
||||
data["triples"]["metadata"]["metadata"]
|
||||
),
|
||||
user=data["triples"]["metadata"]["user"],
|
||||
collection=data["triples"]["metadata"]["collection"]
|
||||
),
|
||||
triples=self.subgraph_translator.to_pulsar(data["triples"]["triples"]),
|
||||
)
|
||||
|
||||
|
||||
graph_embeddings = None
|
||||
if "graph-embeddings" in data:
|
||||
graph_embeddings = GraphEmbeddings(
|
||||
metadata=Metadata(
|
||||
id=data["graph-embeddings"]["metadata"]["id"],
|
||||
metadata=self.subgraph_translator.to_pulsar(
|
||||
data["graph-embeddings"]["metadata"]["metadata"]
|
||||
),
|
||||
user=data["graph-embeddings"]["metadata"]["user"],
|
||||
collection=data["graph-embeddings"]["metadata"]["collection"]
|
||||
),
|
||||
|
|
@ -49,7 +42,7 @@ class KnowledgeRequestTranslator(MessageTranslator):
|
|||
for ent in data["graph-embeddings"]["entities"]
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
return KnowledgeRequest(
|
||||
operation=data.get("operation"),
|
||||
user=data.get("user"),
|
||||
|
|
@ -59,10 +52,10 @@ class KnowledgeRequestTranslator(MessageTranslator):
|
|||
triples=triples,
|
||||
graph_embeddings=graph_embeddings,
|
||||
)
|
||||
|
||||
|
||||
def from_pulsar(self, obj: KnowledgeRequest) -> Dict[str, Any]:
|
||||
result = {}
|
||||
|
||||
|
||||
if obj.operation:
|
||||
result["operation"] = obj.operation
|
||||
if obj.user:
|
||||
|
|
@ -73,27 +66,21 @@ class KnowledgeRequestTranslator(MessageTranslator):
|
|||
result["flow"] = obj.flow
|
||||
if obj.collection:
|
||||
result["collection"] = obj.collection
|
||||
|
||||
|
||||
if obj.triples:
|
||||
result["triples"] = {
|
||||
"metadata": {
|
||||
"id": obj.triples.metadata.id,
|
||||
"metadata": self.subgraph_translator.from_pulsar(
|
||||
obj.triples.metadata.metadata
|
||||
),
|
||||
"user": obj.triples.metadata.user,
|
||||
"collection": obj.triples.metadata.collection,
|
||||
},
|
||||
"triples": self.subgraph_translator.from_pulsar(obj.triples.triples),
|
||||
}
|
||||
|
||||
|
||||
if obj.graph_embeddings:
|
||||
result["graph-embeddings"] = {
|
||||
"metadata": {
|
||||
"id": obj.graph_embeddings.metadata.id,
|
||||
"metadata": self.subgraph_translator.from_pulsar(
|
||||
obj.graph_embeddings.metadata.metadata
|
||||
),
|
||||
"user": obj.graph_embeddings.metadata.user,
|
||||
"collection": obj.graph_embeddings.metadata.collection,
|
||||
},
|
||||
|
|
@ -105,50 +92,44 @@ class KnowledgeRequestTranslator(MessageTranslator):
|
|||
for entity in obj.graph_embeddings.entities
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
return result
|
||||
|
||||
|
||||
class KnowledgeResponseTranslator(MessageTranslator):
|
||||
"""Translator for KnowledgeResponse schema objects"""
|
||||
|
||||
|
||||
def __init__(self):
|
||||
self.value_translator = ValueTranslator()
|
||||
self.subgraph_translator = SubgraphTranslator()
|
||||
|
||||
|
||||
def to_pulsar(self, data: Dict[str, Any]) -> KnowledgeResponse:
|
||||
raise NotImplementedError("Response translation to Pulsar not typically needed")
|
||||
|
||||
|
||||
def from_pulsar(self, obj: KnowledgeResponse) -> Dict[str, Any]:
|
||||
# Response to list operation
|
||||
if obj.ids is not None:
|
||||
return {"ids": obj.ids}
|
||||
|
||||
|
||||
# Streaming triples response
|
||||
if obj.triples:
|
||||
return {
|
||||
"triples": {
|
||||
"metadata": {
|
||||
"id": obj.triples.metadata.id,
|
||||
"metadata": self.subgraph_translator.from_pulsar(
|
||||
obj.triples.metadata.metadata
|
||||
),
|
||||
"user": obj.triples.metadata.user,
|
||||
"collection": obj.triples.metadata.collection,
|
||||
},
|
||||
"triples": self.subgraph_translator.from_pulsar(obj.triples.triples),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
# Streaming graph embeddings response
|
||||
if obj.graph_embeddings:
|
||||
return {
|
||||
"graph-embeddings": {
|
||||
"metadata": {
|
||||
"id": obj.graph_embeddings.metadata.id,
|
||||
"metadata": self.subgraph_translator.from_pulsar(
|
||||
obj.graph_embeddings.metadata.metadata
|
||||
),
|
||||
"user": obj.graph_embeddings.metadata.user,
|
||||
"collection": obj.graph_embeddings.metadata.collection,
|
||||
},
|
||||
|
|
@ -161,11 +142,11 @@ class KnowledgeResponseTranslator(MessageTranslator):
|
|||
],
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
# End of stream marker
|
||||
if obj.eos is True:
|
||||
return {"eos": True}
|
||||
|
||||
|
||||
# Empty response (successful delete)
|
||||
return {}
|
||||
|
||||
|
|
|
|||
|
|
@ -1,14 +1,10 @@
|
|||
from dataclasses import dataclass, field
|
||||
from .primitives import Triple
|
||||
from dataclasses import dataclass
|
||||
|
||||
@dataclass
|
||||
class Metadata:
|
||||
# Source identifier
|
||||
id: str = ""
|
||||
|
||||
# Subgraph
|
||||
metadata: list[Triple] = field(default_factory=list)
|
||||
|
||||
# Collection management
|
||||
user: str = ""
|
||||
collection: str = ""
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue