Remove redundant metadata (#685)

The metadata field (a list of triples) in the pipeline Metadata class
was redundant. Document metadata triples already flow directly from the
librarian to the triple store via emit_document_provenance(), so they do
not need to pass through the extraction pipeline.

Additionally, the chunker and the PDF decoder were already overwriting
metadata with [], so any metadata passed through the pipeline was being
discarded.

Changes:
- Remove metadata field from Metadata dataclass
  (schema/core/metadata.py)
- Update all Metadata instantiations to remove metadata=[]
  parameter
- Remove metadata handling from translators (document_loading,
  knowledge)
- Remove metadata consumption from extractors (ontology, agent)
- Update gateway serializers and import handlers
- Update all unit, integration, and contract tests
This commit is contained in:
cybermaggedon 2026-03-11 10:51:39 +00:00 committed by GitHub
parent 1837d73f34
commit aa4f5c6c00
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
37 changed files with 106 additions and 343 deletions

View file

@ -76,13 +76,6 @@ class TestAgentKgExtractionIntegration:
chunk=text.encode('utf-8'),
metadata=Metadata(
id="doc123",
metadata=[
Triple(
s=Term(type=IRI, iri="doc123"),
p=Term(type=IRI, iri="http://example.org/type"),
o=Term(type=LITERAL, value="document")
)
]
)
)
@ -136,11 +129,7 @@ class TestAgentKgExtractionIntegration:
# Parse and process
extraction_data = extractor.parse_jsonl(agent_response)
triples, entity_contexts = extractor.process_extraction_data(extraction_data, v.metadata)
# Add metadata triples
for t in v.metadata.metadata:
triples.append(t)
# Emit outputs
if triples:
await extractor.emit_triples(flow("triples"), v.metadata, triples)
@ -242,9 +231,9 @@ class TestAgentKgExtractionIntegration:
# Act - JSONL parsing is lenient, invalid lines are skipped
await configured_agent_extractor.on_message(mock_message, mock_consumer, mock_flow_context)
# Assert - should emit triples (with just metadata) but no entity contexts
# Assert - with no valid extraction data, nothing is emitted
triples_publisher = mock_flow_context("triples")
triples_publisher.send.assert_called_once()
triples_publisher.send.assert_not_called()
entity_contexts_publisher = mock_flow_context("entity-contexts")
entity_contexts_publisher.send.assert_not_called()
@ -268,17 +257,12 @@ class TestAgentKgExtractionIntegration:
# Act
await configured_agent_extractor.on_message(mock_message, mock_consumer, mock_flow_context)
# Assert
# Should still emit outputs (even if empty) to maintain flow consistency
# Assert - with empty extraction results, nothing is emitted
triples_publisher = mock_flow_context("triples")
entity_contexts_publisher = mock_flow_context("entity-contexts")
# Triples should include metadata triples at minimum
triples_publisher.send.assert_called_once()
sent_triples = triples_publisher.send.call_args[0][0]
assert isinstance(sent_triples, Triples)
# Entity contexts should not be sent if empty
# No triples or entity contexts emitted for empty results
triples_publisher.send.assert_not_called()
entity_contexts_publisher.send.assert_not_called()
@pytest.mark.asyncio
@ -308,7 +292,7 @@ class TestAgentKgExtractionIntegration:
test_text = "Test text for prompt rendering"
chunk = Chunk(
chunk=test_text.encode('utf-8'),
metadata=Metadata(id="test-doc", metadata=[])
metadata=Metadata(id="test-doc")
)
agent_client = mock_flow_context("agent-request")
@ -340,7 +324,7 @@ class TestAgentKgExtractionIntegration:
text = f"Test document {i} content"
chunks.append(Chunk(
chunk=text.encode('utf-8'),
metadata=Metadata(id=f"doc{i}", metadata=[])
metadata=Metadata(id=f"doc{i}")
))
agent_client = mock_flow_context("agent-request")
@ -375,7 +359,7 @@ class TestAgentKgExtractionIntegration:
unicode_text = "Machine Learning (学习机器) は人工知能の一分野です。"
chunk = Chunk(
chunk=unicode_text.encode('utf-8'),
metadata=Metadata(id="unicode-doc", metadata=[])
metadata=Metadata(id="unicode-doc")
)
agent_client = mock_flow_context("agent-request")
@ -411,7 +395,7 @@ class TestAgentKgExtractionIntegration:
large_text = "Machine Learning is important. " * 1000 # Repeat to create large text
chunk = Chunk(
chunk=large_text.encode('utf-8'),
metadata=Metadata(id="large-doc", metadata=[])
metadata=Metadata(id="large-doc")
)
agent_client = mock_flow_context("agent-request")

View file

@ -171,7 +171,6 @@ async def test_export_no_message_loss_integration(mock_backend):
triples_obj = Triples(
metadata=Metadata(
id=f"export-msg-{i}",
metadata=to_subgraph(msg_data["metadata"]["metadata"]),
user=msg_data["metadata"]["user"],
collection=msg_data["metadata"]["collection"],
),

View file

@ -92,7 +92,6 @@ class TestKnowledgeGraphPipelineIntegration:
id="doc-123",
user="test_user",
collection="test_collection",
metadata=[]
),
chunk=b"Machine Learning is a subset of Artificial Intelligence. Neural Networks are used in Machine Learning to process complex patterns."
)
@ -243,13 +242,12 @@ class TestKnowledgeGraphPipelineIntegration:
id="test-doc",
user="test_user",
collection="test_collection",
metadata=[]
)
# Act
triples = []
entities = []
for defn in sample_definitions_response:
s = defn["entity"]
o = defn["definition"]
@ -302,12 +300,11 @@ class TestKnowledgeGraphPipelineIntegration:
id="test-doc",
user="test_user",
collection="test_collection",
metadata=[]
)
# Act
triples = []
for rel in sample_relationships_response:
s = rel["subject"]
p = rel["predicate"]
@ -373,7 +370,6 @@ class TestKnowledgeGraphPipelineIntegration:
id="test-doc",
user="test_user",
collection="test_collection",
metadata=[]
),
triples=[
Triple(
@ -406,7 +402,6 @@ class TestKnowledgeGraphPipelineIntegration:
id="test-doc",
user="test_user",
collection="test_collection",
metadata=[]
),
entities=[
EntityEmbeddings(
@ -542,7 +537,7 @@ class TestKnowledgeGraphPipelineIntegration:
]
sample_chunk = Chunk(
metadata=Metadata(id="test", user="user", collection="collection", metadata=[]),
metadata=Metadata(id="test", user="user", collection="collection"),
chunk=b"Test chunk"
)
@ -569,7 +564,7 @@ class TestKnowledgeGraphPipelineIntegration:
# Arrange
large_chunk_batch = [
Chunk(
metadata=Metadata(id=f"doc-{i}", user="user", collection="collection", metadata=[]),
metadata=Metadata(id=f"doc-{i}", user="user", collection="collection"),
chunk=f"Document {i} contains machine learning and AI content.".encode("utf-8")
)
for i in range(100) # Large batch
@ -608,15 +603,8 @@ class TestKnowledgeGraphPipelineIntegration:
id="test-doc-123",
user="test_user",
collection="test_collection",
metadata=[
Triple(
s=Term(type=IRI, iri="doc:test"),
p=Term(type=IRI, iri="dc:title"),
o=Term(type=LITERAL, value="Test Document")
)
]
)
sample_chunk = Chunk(
metadata=original_metadata,
chunk=b"Test content for metadata propagation"

View file

@ -231,7 +231,6 @@ class TestObjectExtractionServiceIntegration:
id="customer-doc-001",
user="integration_test",
collection="test_documents",
metadata=[]
)
chunk_text = """
@ -299,7 +298,6 @@ class TestObjectExtractionServiceIntegration:
id="product-doc-001",
user="integration_test",
collection="test_documents",
metadata=[]
)
chunk_text = """
@ -373,7 +371,6 @@ class TestObjectExtractionServiceIntegration:
id=chunk_id,
user="concurrent_test",
collection="test_collection",
metadata=[]
)
chunk = Chunk(metadata=metadata, chunk=text.encode('utf-8'))
chunks.append(chunk)
@ -470,7 +467,7 @@ class TestObjectExtractionServiceIntegration:
await processor.on_schema_config(integration_config, version=1)
# Create test chunk
metadata = Metadata(id="error-test", user="test", collection="test", metadata=[])
metadata = Metadata(id="error-test", user="test", collection="test")
chunk = Chunk(metadata=metadata, chunk=b"Some text that will fail to process")
mock_msg = MagicMock()
@ -507,7 +504,6 @@ class TestObjectExtractionServiceIntegration:
id="metadata-test-chunk",
user="test_user",
collection="test_collection",
metadata=[] # Could include source document metadata
)
chunk = Chunk(

View file

@ -120,7 +120,6 @@ class TestRowsCassandraIntegration:
id="doc-001",
user="test_user",
collection="import_2024",
metadata=[]
),
schema_name="customer_records",
values=[{
@ -201,7 +200,7 @@ class TestRowsCassandraIntegration:
# Process objects for different schemas
product_obj = ExtractedObject(
metadata=Metadata(id="p1", user="shop", collection="catalog", metadata=[]),
metadata=Metadata(id="p1", user="shop", collection="catalog"),
schema_name="products",
values=[{"product_id": "P001", "name": "Widget", "price": "19.99"}],
confidence=0.9,
@ -209,7 +208,7 @@ class TestRowsCassandraIntegration:
)
order_obj = ExtractedObject(
metadata=Metadata(id="o1", user="shop", collection="sales", metadata=[]),
metadata=Metadata(id="o1", user="shop", collection="sales"),
schema_name="orders",
values=[{"order_id": "O001", "customer_id": "C001", "total": "59.97"}],
confidence=0.85,
@ -254,7 +253,7 @@ class TestRowsCassandraIntegration:
)
test_obj = ExtractedObject(
metadata=Metadata(id="t1", user="test", collection="test", metadata=[]),
metadata=Metadata(id="t1", user="test", collection="test"),
schema_name="indexed_data",
values=[{
"id": "123",
@ -337,7 +336,6 @@ class TestRowsCassandraIntegration:
id="batch-001",
user="test_user",
collection="batch_import",
metadata=[]
),
schema_name="batch_customers",
values=[
@ -391,7 +389,7 @@ class TestRowsCassandraIntegration:
# Process empty batch object
empty_obj = ExtractedObject(
metadata=Metadata(id="empty-1", user="test", collection="empty", metadata=[]),
metadata=Metadata(id="empty-1", user="test", collection="empty"),
schema_name="empty_test",
values=[], # Empty batch
confidence=1.0,
@ -426,7 +424,7 @@ class TestRowsCassandraIntegration:
)
test_obj = ExtractedObject(
metadata=Metadata(id="t1", user="test", collection="test", metadata=[]),
metadata=Metadata(id="t1", user="test", collection="test"),
schema_name="map_test",
values=[{"id": "123", "name": "Test Item", "count": "42"}],
confidence=0.9,
@ -470,7 +468,7 @@ class TestRowsCassandraIntegration:
)
test_obj = ExtractedObject(
metadata=Metadata(id="t1", user="test", collection="my_collection", metadata=[]),
metadata=Metadata(id="t1", user="test", collection="my_collection"),
schema_name="partition_test",
values=[{"id": "123", "category": "test"}],
confidence=0.9,