Remove redundant metadata (#685)

The metadata field (a list of triples) in the pipeline Metadata class
was redundant. Document metadata triples already flow directly from
the librarian to the triple store via emit_document_provenance(), so
they do not need to pass through the extraction pipeline.

Additionally, the chunker and the PDF decoder were already resetting
metadata to [], so any metadata passed through the pipeline was being
discarded anyway.

Changes:
- Remove metadata field from Metadata dataclass
  (schema/core/metadata.py)
- Update all Metadata instantiations to remove metadata=[]
  parameter
- Remove metadata handling from translators (document_loading,
  knowledge)
- Remove metadata consumption from extractors (ontology, agent)
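- Update gateway serializers and import handlers
- Stop reading and converting metadata in the knowledge table store
  (the metadata column is now always written as an empty list)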
- Update all unit, integration, and contract tests
This commit is contained in:
cybermaggedon 2026-03-11 10:51:39 +00:00 committed by GitHub
parent 1837d73f34
commit aa4f5c6c00
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
37 changed files with 106 additions and 343 deletions

View file

@ -178,7 +178,6 @@ class Processor(ChunkingService):
await flow("triples").send(Triples(
metadata=Metadata(
id=chunk_uri,
metadata=[],
user=v.metadata.user,
collection=v.metadata.collection,
),
@ -189,7 +188,6 @@ class Processor(ChunkingService):
r = Chunk(
metadata=Metadata(
id=chunk_uri,
metadata=[],
user=v.metadata.user,
collection=v.metadata.collection,
),

View file

@ -302,7 +302,6 @@ class Processor(FlowProcessor):
await flow("triples").send(Triples(
metadata=Metadata(
id=pg_uri,
metadata=[],
user=v.metadata.user,
collection=v.metadata.collection,
),
@ -314,7 +313,6 @@ class Processor(FlowProcessor):
r = TextDocument(
metadata=Metadata(
id=pg_uri,
metadata=[],
user=v.metadata.user,
collection=v.metadata.collection,
),

View file

@ -104,7 +104,6 @@ class Processor(FlowProcessor):
tpls = Triples(
metadata = Metadata(
id = metadata.id,
metadata = [],
user = metadata.user,
collection = metadata.collection,
),
@ -117,7 +116,6 @@ class Processor(FlowProcessor):
ecs = EntityContexts(
metadata = Metadata(
id = metadata.id,
metadata = [],
user = metadata.user,
collection = metadata.collection,
),
@ -216,10 +214,6 @@ class Processor(FlowProcessor):
extraction_data, v.metadata
)
# Put document metadata into triples
for t in v.metadata.metadata:
triples.append(t)
# Emit outputs
if triples:
await self.emit_triples(flow("triples"), v.metadata, triples)

View file

@ -218,7 +218,6 @@ class Processor(FlowProcessor):
flow("triples"),
Metadata(
id=v.metadata.id,
metadata=[],
user=v.metadata.user,
collection=v.metadata.collection,
),
@ -232,7 +231,6 @@ class Processor(FlowProcessor):
flow("entity-contexts"),
Metadata(
id=v.metadata.id,
metadata=[],
user=v.metadata.user,
collection=v.metadata.collection,
),

View file

@ -306,10 +306,6 @@ class Processor(FlowProcessor):
flow, chunk, ontology_subset, prompt_variables
)
# Add metadata triples
for t in v.metadata.metadata:
triples.append(t)
# Generate ontology definition triples
ontology_triples = self.build_ontology_triples(ontology_subset)
@ -558,7 +554,6 @@ class Processor(FlowProcessor):
t = Triples(
metadata=Metadata(
id=metadata.id,
metadata=[],
user=metadata.user,
collection=metadata.collection,
),
@ -571,7 +566,6 @@ class Processor(FlowProcessor):
ec = EntityContexts(
metadata=Metadata(
id=metadata.id,
metadata=[],
user=metadata.user,
collection=metadata.collection,
),

View file

@ -219,7 +219,6 @@ class Processor(FlowProcessor):
flow("triples"),
Metadata(
id=v.metadata.id,
metadata=[],
user=v.metadata.user,
collection=v.metadata.collection,
),

View file

@ -272,7 +272,6 @@ class Processor(FlowProcessor):
extracted = ExtractedObject(
metadata=Metadata(
id=f"{v.metadata.id}:{schema_name}",
metadata=[],
user=v.metadata.user,
collection=v.metadata.collection,
),

View file

@ -53,7 +53,6 @@ class RowsImport:
elt = ExtractedObject(
metadata=Metadata(
id=data["metadata"]["id"],
metadata=to_subgraph(data["metadata"].get("metadata", [])),
user=data["metadata"]["user"],
collection=data["metadata"]["collection"],
),

View file

@ -37,18 +37,17 @@ def serialize_triples(message):
return {
"metadata": {
"id": message.metadata.id,
"metadata": serialize_subgraph(message.metadata.metadata),
"user": message.metadata.user,
"collection": message.metadata.collection,
},
"triples": serialize_subgraph(message.triples),
}
def serialize_graph_embeddings(message):
return {
"metadata": {
"id": message.metadata.id,
"metadata": serialize_subgraph(message.metadata.metadata),
"user": message.metadata.user,
"collection": message.metadata.collection,
},
@ -61,11 +60,11 @@ def serialize_graph_embeddings(message):
],
}
def serialize_entity_contexts(message):
return {
"metadata": {
"id": message.metadata.id,
"metadata": serialize_subgraph(message.metadata.metadata),
"user": message.metadata.user,
"collection": message.metadata.collection,
},
@ -78,11 +77,11 @@ def serialize_entity_contexts(message):
],
}
def serialize_document_embeddings(message):
return {
"metadata": {
"id": message.metadata.id,
"metadata": serialize_subgraph(message.metadata.metadata),
"user": message.metadata.user,
"collection": message.metadata.collection,
},

View file

@ -48,7 +48,6 @@ class TriplesImport:
elt = Triples(
metadata=Metadata(
id=data["metadata"]["id"],
metadata=to_subgraph(data["metadata"]["metadata"]),
user=data["metadata"]["user"],
collection=data["metadata"]["collection"],
),

View file

@ -334,7 +334,6 @@ class Processor(AsyncProcessor):
triples_msg = Triples(
metadata=Metadata(
id=doc_uri,
metadata=[],
user=processing.user,
collection=processing.collection,
),
@ -381,7 +380,6 @@ class Processor(AsyncProcessor):
doc = TextDocument(
metadata = Metadata(
id = document.id,
metadata = document.metadata,
user = processing.user,
collection = processing.collection
),
@ -392,7 +390,6 @@ class Processor(AsyncProcessor):
doc = TextDocument(
metadata = Metadata(
id = document.id,
metadata = document.metadata,
user = processing.user,
collection = processing.collection
),
@ -408,7 +405,6 @@ class Processor(AsyncProcessor):
doc = Document(
metadata = Metadata(
id = document.id,
metadata = document.metadata,
user = processing.user,
collection = processing.collection
),
@ -419,7 +415,6 @@ class Processor(AsyncProcessor):
doc = Document(
metadata = Metadata(
id = document.id,
metadata = document.metadata,
user = processing.user,
collection = processing.collection
),

View file

@ -243,7 +243,6 @@ class Processor(FlowProcessor):
await flow("explainability").send(Triples(
metadata=Metadata(
id=explain_id,
metadata=[],
user=v.user,
collection=v.collection, # Store in user's collection, not separate explainability collection
),

View file

@ -218,16 +218,6 @@ class KnowledgeTableStore:
when = int(time.time() * 1000)
if m.metadata.metadata:
metadata = [
(
*term_to_tuple(v.s), *term_to_tuple(v.p), *term_to_tuple(v.o)
)
for v in m.metadata.metadata
]
else:
metadata = []
triples = [
(
*term_to_tuple(v.s), *term_to_tuple(v.p), *term_to_tuple(v.o)
@ -244,7 +234,7 @@ class KnowledgeTableStore:
(
uuid.uuid4(), m.metadata.user,
m.metadata.id, when,
metadata, triples,
[], triples,
)
)
@ -259,16 +249,6 @@ class KnowledgeTableStore:
when = int(time.time() * 1000)
if m.metadata.metadata:
metadata = [
(
*term_to_tuple(v.s), *term_to_tuple(v.p), *term_to_tuple(v.o)
)
for v in m.metadata.metadata
]
else:
metadata = []
entities = [
(
term_to_tuple(v.entity),
@ -286,7 +266,7 @@ class KnowledgeTableStore:
(
uuid.uuid4(), m.metadata.user,
m.metadata.id, when,
metadata, entities,
[], entities,
)
)
@ -301,16 +281,6 @@ class KnowledgeTableStore:
when = int(time.time() * 1000)
if m.metadata.metadata:
metadata = [
(
*term_to_tuple(v.s), *term_to_tuple(v.p), *term_to_tuple(v.o)
)
for v in m.metadata.metadata
]
else:
metadata = []
chunks = [
(
v.chunk_id,
@ -328,7 +298,7 @@ class KnowledgeTableStore:
(
uuid.uuid4(), m.metadata.user,
m.metadata.id, when,
metadata, chunks,
[], chunks,
)
)
@ -423,18 +393,6 @@ class KnowledgeTableStore:
for row in resp:
if row[2]:
metadata = [
Triple(
s = tuple_to_term(elt[0], elt[1]),
p = tuple_to_term(elt[2], elt[3]),
o = tuple_to_term(elt[4], elt[5]),
)
for elt in row[2]
]
else:
metadata = []
if row[3]:
triples = [
Triple(
@ -453,7 +411,6 @@ class KnowledgeTableStore:
id = document_id,
user = user,
collection = "default", # FIXME: What to put here?
metadata = metadata,
),
triples = triples
)
@ -482,18 +439,6 @@ class KnowledgeTableStore:
for row in resp:
if row[2]:
metadata = [
Triple(
s = tuple_to_term(elt[0], elt[1]),
p = tuple_to_term(elt[2], elt[3]),
o = tuple_to_term(elt[4], elt[5]),
)
for elt in row[2]
]
else:
metadata = []
if row[3]:
entities = [
EntityEmbeddings(
@ -511,7 +456,6 @@ class KnowledgeTableStore:
id = document_id,
user = user,
collection = "default", # FIXME: What to put here?
metadata = metadata,
),
entities = entities
)