mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-05-26 15:55:16 +02:00
Remove redundant metadata (#685)
The metadata field (a list of triples) in the pipeline Metadata class was redundant. Document metadata triples already flow directly from the librarian to the triple store via emit_document_provenance(), so they do not need to pass through the extraction pipeline. Additionally, the chunker and the PDF decoder were resetting metadata to [] anyway, so any metadata passed through the pipeline was being discarded.

Changes:
- Remove metadata field from Metadata dataclass (schema/core/metadata.py)
- Update all Metadata instantiations to remove metadata=[] parameter
- Remove metadata handling from translators (document_loading, knowledge)
- Remove metadata consumption from extractors (ontology, agent)
- Update gateway serializers and import handlers
- Update all unit, integration, and contract tests
This commit is contained in:
parent
1837d73f34
commit
aa4f5c6c00
37 changed files with 106 additions and 343 deletions
|
|
@ -76,13 +76,6 @@ class TestAgentKgExtractionIntegration:
|
|||
chunk=text.encode('utf-8'),
|
||||
metadata=Metadata(
|
||||
id="doc123",
|
||||
metadata=[
|
||||
Triple(
|
||||
s=Term(type=IRI, iri="doc123"),
|
||||
p=Term(type=IRI, iri="http://example.org/type"),
|
||||
o=Term(type=LITERAL, value="document")
|
||||
)
|
||||
]
|
||||
)
|
||||
)
|
||||
|
||||
|
|
@ -136,11 +129,7 @@ class TestAgentKgExtractionIntegration:
|
|||
# Parse and process
|
||||
extraction_data = extractor.parse_jsonl(agent_response)
|
||||
triples, entity_contexts = extractor.process_extraction_data(extraction_data, v.metadata)
|
||||
|
||||
# Add metadata triples
|
||||
for t in v.metadata.metadata:
|
||||
triples.append(t)
|
||||
|
||||
|
||||
# Emit outputs
|
||||
if triples:
|
||||
await extractor.emit_triples(flow("triples"), v.metadata, triples)
|
||||
|
|
@ -242,9 +231,9 @@ class TestAgentKgExtractionIntegration:
|
|||
# Act - JSONL parsing is lenient, invalid lines are skipped
|
||||
await configured_agent_extractor.on_message(mock_message, mock_consumer, mock_flow_context)
|
||||
|
||||
# Assert - should emit triples (with just metadata) but no entity contexts
|
||||
# Assert - with no valid extraction data, nothing is emitted
|
||||
triples_publisher = mock_flow_context("triples")
|
||||
triples_publisher.send.assert_called_once()
|
||||
triples_publisher.send.assert_not_called()
|
||||
|
||||
entity_contexts_publisher = mock_flow_context("entity-contexts")
|
||||
entity_contexts_publisher.send.assert_not_called()
|
||||
|
|
@ -268,17 +257,12 @@ class TestAgentKgExtractionIntegration:
|
|||
# Act
|
||||
await configured_agent_extractor.on_message(mock_message, mock_consumer, mock_flow_context)
|
||||
|
||||
# Assert
|
||||
# Should still emit outputs (even if empty) to maintain flow consistency
|
||||
# Assert - with empty extraction results, nothing is emitted
|
||||
triples_publisher = mock_flow_context("triples")
|
||||
entity_contexts_publisher = mock_flow_context("entity-contexts")
|
||||
|
||||
# Triples should include metadata triples at minimum
|
||||
triples_publisher.send.assert_called_once()
|
||||
sent_triples = triples_publisher.send.call_args[0][0]
|
||||
assert isinstance(sent_triples, Triples)
|
||||
|
||||
# Entity contexts should not be sent if empty
|
||||
|
||||
# No triples or entity contexts emitted for empty results
|
||||
triples_publisher.send.assert_not_called()
|
||||
entity_contexts_publisher.send.assert_not_called()
|
||||
|
||||
@pytest.mark.asyncio
|
||||
|
|
@ -308,7 +292,7 @@ class TestAgentKgExtractionIntegration:
|
|||
test_text = "Test text for prompt rendering"
|
||||
chunk = Chunk(
|
||||
chunk=test_text.encode('utf-8'),
|
||||
metadata=Metadata(id="test-doc", metadata=[])
|
||||
metadata=Metadata(id="test-doc")
|
||||
)
|
||||
|
||||
agent_client = mock_flow_context("agent-request")
|
||||
|
|
@ -340,7 +324,7 @@ class TestAgentKgExtractionIntegration:
|
|||
text = f"Test document {i} content"
|
||||
chunks.append(Chunk(
|
||||
chunk=text.encode('utf-8'),
|
||||
metadata=Metadata(id=f"doc{i}", metadata=[])
|
||||
metadata=Metadata(id=f"doc{i}")
|
||||
))
|
||||
|
||||
agent_client = mock_flow_context("agent-request")
|
||||
|
|
@ -375,7 +359,7 @@ class TestAgentKgExtractionIntegration:
|
|||
unicode_text = "Machine Learning (学习机器) は人工知能の一分野です。"
|
||||
chunk = Chunk(
|
||||
chunk=unicode_text.encode('utf-8'),
|
||||
metadata=Metadata(id="unicode-doc", metadata=[])
|
||||
metadata=Metadata(id="unicode-doc")
|
||||
)
|
||||
|
||||
agent_client = mock_flow_context("agent-request")
|
||||
|
|
@ -411,7 +395,7 @@ class TestAgentKgExtractionIntegration:
|
|||
large_text = "Machine Learning is important. " * 1000 # Repeat to create large text
|
||||
chunk = Chunk(
|
||||
chunk=large_text.encode('utf-8'),
|
||||
metadata=Metadata(id="large-doc", metadata=[])
|
||||
metadata=Metadata(id="large-doc")
|
||||
)
|
||||
|
||||
agent_client = mock_flow_context("agent-request")
|
||||
|
|
|
|||
|
|
@ -171,7 +171,6 @@ async def test_export_no_message_loss_integration(mock_backend):
|
|||
triples_obj = Triples(
|
||||
metadata=Metadata(
|
||||
id=f"export-msg-{i}",
|
||||
metadata=to_subgraph(msg_data["metadata"]["metadata"]),
|
||||
user=msg_data["metadata"]["user"],
|
||||
collection=msg_data["metadata"]["collection"],
|
||||
),
|
||||
|
|
|
|||
|
|
@ -92,7 +92,6 @@ class TestKnowledgeGraphPipelineIntegration:
|
|||
id="doc-123",
|
||||
user="test_user",
|
||||
collection="test_collection",
|
||||
metadata=[]
|
||||
),
|
||||
chunk=b"Machine Learning is a subset of Artificial Intelligence. Neural Networks are used in Machine Learning to process complex patterns."
|
||||
)
|
||||
|
|
@ -243,13 +242,12 @@ class TestKnowledgeGraphPipelineIntegration:
|
|||
id="test-doc",
|
||||
user="test_user",
|
||||
collection="test_collection",
|
||||
metadata=[]
|
||||
)
|
||||
|
||||
# Act
|
||||
triples = []
|
||||
entities = []
|
||||
|
||||
|
||||
for defn in sample_definitions_response:
|
||||
s = defn["entity"]
|
||||
o = defn["definition"]
|
||||
|
|
@ -302,12 +300,11 @@ class TestKnowledgeGraphPipelineIntegration:
|
|||
id="test-doc",
|
||||
user="test_user",
|
||||
collection="test_collection",
|
||||
metadata=[]
|
||||
)
|
||||
|
||||
# Act
|
||||
triples = []
|
||||
|
||||
|
||||
for rel in sample_relationships_response:
|
||||
s = rel["subject"]
|
||||
p = rel["predicate"]
|
||||
|
|
@ -373,7 +370,6 @@ class TestKnowledgeGraphPipelineIntegration:
|
|||
id="test-doc",
|
||||
user="test_user",
|
||||
collection="test_collection",
|
||||
metadata=[]
|
||||
),
|
||||
triples=[
|
||||
Triple(
|
||||
|
|
@ -406,7 +402,6 @@ class TestKnowledgeGraphPipelineIntegration:
|
|||
id="test-doc",
|
||||
user="test_user",
|
||||
collection="test_collection",
|
||||
metadata=[]
|
||||
),
|
||||
entities=[
|
||||
EntityEmbeddings(
|
||||
|
|
@ -542,7 +537,7 @@ class TestKnowledgeGraphPipelineIntegration:
|
|||
]
|
||||
|
||||
sample_chunk = Chunk(
|
||||
metadata=Metadata(id="test", user="user", collection="collection", metadata=[]),
|
||||
metadata=Metadata(id="test", user="user", collection="collection"),
|
||||
chunk=b"Test chunk"
|
||||
)
|
||||
|
||||
|
|
@ -569,7 +564,7 @@ class TestKnowledgeGraphPipelineIntegration:
|
|||
# Arrange
|
||||
large_chunk_batch = [
|
||||
Chunk(
|
||||
metadata=Metadata(id=f"doc-{i}", user="user", collection="collection", metadata=[]),
|
||||
metadata=Metadata(id=f"doc-{i}", user="user", collection="collection"),
|
||||
chunk=f"Document {i} contains machine learning and AI content.".encode("utf-8")
|
||||
)
|
||||
for i in range(100) # Large batch
|
||||
|
|
@ -608,15 +603,8 @@ class TestKnowledgeGraphPipelineIntegration:
|
|||
id="test-doc-123",
|
||||
user="test_user",
|
||||
collection="test_collection",
|
||||
metadata=[
|
||||
Triple(
|
||||
s=Term(type=IRI, iri="doc:test"),
|
||||
p=Term(type=IRI, iri="dc:title"),
|
||||
o=Term(type=LITERAL, value="Test Document")
|
||||
)
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
sample_chunk = Chunk(
|
||||
metadata=original_metadata,
|
||||
chunk=b"Test content for metadata propagation"
|
||||
|
|
|
|||
|
|
@ -231,7 +231,6 @@ class TestObjectExtractionServiceIntegration:
|
|||
id="customer-doc-001",
|
||||
user="integration_test",
|
||||
collection="test_documents",
|
||||
metadata=[]
|
||||
)
|
||||
|
||||
chunk_text = """
|
||||
|
|
@ -299,7 +298,6 @@ class TestObjectExtractionServiceIntegration:
|
|||
id="product-doc-001",
|
||||
user="integration_test",
|
||||
collection="test_documents",
|
||||
metadata=[]
|
||||
)
|
||||
|
||||
chunk_text = """
|
||||
|
|
@ -373,7 +371,6 @@ class TestObjectExtractionServiceIntegration:
|
|||
id=chunk_id,
|
||||
user="concurrent_test",
|
||||
collection="test_collection",
|
||||
metadata=[]
|
||||
)
|
||||
chunk = Chunk(metadata=metadata, chunk=text.encode('utf-8'))
|
||||
chunks.append(chunk)
|
||||
|
|
@ -470,7 +467,7 @@ class TestObjectExtractionServiceIntegration:
|
|||
await processor.on_schema_config(integration_config, version=1)
|
||||
|
||||
# Create test chunk
|
||||
metadata = Metadata(id="error-test", user="test", collection="test", metadata=[])
|
||||
metadata = Metadata(id="error-test", user="test", collection="test")
|
||||
chunk = Chunk(metadata=metadata, chunk=b"Some text that will fail to process")
|
||||
|
||||
mock_msg = MagicMock()
|
||||
|
|
@ -507,7 +504,6 @@ class TestObjectExtractionServiceIntegration:
|
|||
id="metadata-test-chunk",
|
||||
user="test_user",
|
||||
collection="test_collection",
|
||||
metadata=[] # Could include source document metadata
|
||||
)
|
||||
|
||||
chunk = Chunk(
|
||||
|
|
|
|||
|
|
@ -120,7 +120,6 @@ class TestRowsCassandraIntegration:
|
|||
id="doc-001",
|
||||
user="test_user",
|
||||
collection="import_2024",
|
||||
metadata=[]
|
||||
),
|
||||
schema_name="customer_records",
|
||||
values=[{
|
||||
|
|
@ -201,7 +200,7 @@ class TestRowsCassandraIntegration:
|
|||
|
||||
# Process objects for different schemas
|
||||
product_obj = ExtractedObject(
|
||||
metadata=Metadata(id="p1", user="shop", collection="catalog", metadata=[]),
|
||||
metadata=Metadata(id="p1", user="shop", collection="catalog"),
|
||||
schema_name="products",
|
||||
values=[{"product_id": "P001", "name": "Widget", "price": "19.99"}],
|
||||
confidence=0.9,
|
||||
|
|
@ -209,7 +208,7 @@ class TestRowsCassandraIntegration:
|
|||
)
|
||||
|
||||
order_obj = ExtractedObject(
|
||||
metadata=Metadata(id="o1", user="shop", collection="sales", metadata=[]),
|
||||
metadata=Metadata(id="o1", user="shop", collection="sales"),
|
||||
schema_name="orders",
|
||||
values=[{"order_id": "O001", "customer_id": "C001", "total": "59.97"}],
|
||||
confidence=0.85,
|
||||
|
|
@ -254,7 +253,7 @@ class TestRowsCassandraIntegration:
|
|||
)
|
||||
|
||||
test_obj = ExtractedObject(
|
||||
metadata=Metadata(id="t1", user="test", collection="test", metadata=[]),
|
||||
metadata=Metadata(id="t1", user="test", collection="test"),
|
||||
schema_name="indexed_data",
|
||||
values=[{
|
||||
"id": "123",
|
||||
|
|
@ -337,7 +336,6 @@ class TestRowsCassandraIntegration:
|
|||
id="batch-001",
|
||||
user="test_user",
|
||||
collection="batch_import",
|
||||
metadata=[]
|
||||
),
|
||||
schema_name="batch_customers",
|
||||
values=[
|
||||
|
|
@ -391,7 +389,7 @@ class TestRowsCassandraIntegration:
|
|||
|
||||
# Process empty batch object
|
||||
empty_obj = ExtractedObject(
|
||||
metadata=Metadata(id="empty-1", user="test", collection="empty", metadata=[]),
|
||||
metadata=Metadata(id="empty-1", user="test", collection="empty"),
|
||||
schema_name="empty_test",
|
||||
values=[], # Empty batch
|
||||
confidence=1.0,
|
||||
|
|
@ -426,7 +424,7 @@ class TestRowsCassandraIntegration:
|
|||
)
|
||||
|
||||
test_obj = ExtractedObject(
|
||||
metadata=Metadata(id="t1", user="test", collection="test", metadata=[]),
|
||||
metadata=Metadata(id="t1", user="test", collection="test"),
|
||||
schema_name="map_test",
|
||||
values=[{"id": "123", "name": "Test Item", "count": "42"}],
|
||||
confidence=0.9,
|
||||
|
|
@ -470,7 +468,7 @@ class TestRowsCassandraIntegration:
|
|||
)
|
||||
|
||||
test_obj = ExtractedObject(
|
||||
metadata=Metadata(id="t1", user="test", collection="my_collection", metadata=[]),
|
||||
metadata=Metadata(id="t1", user="test", collection="my_collection"),
|
||||
schema_name="partition_test",
|
||||
values=[{"id": "123", "category": "test"}],
|
||||
confidence=0.9,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue