mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-04-25 00:16:23 +02:00
Subgraph provenance (#694)
Replace per-triple provenance reification with subgraph model Extraction provenance previously created a full reification (statement URI, activity, agent) for every single extracted triple, producing ~13 provenance triples per knowledge triple. Since each chunk is processed by a single LLM call, this was both redundant and semantically inaccurate. Now one subgraph object is created per chunk extraction, with tg:contains linking to each extracted triple. For 20 extractions from a chunk this reduces provenance from ~260 triples to ~33. - Rename tg:reifies -> tg:contains, stmt_uri -> subgraph_uri - Replace triple_provenance_triples() with subgraph_provenance_triples() - Refactor kg-extract-definitions and kg-extract-relationships to generate provenance once per chunk instead of per triple - Add subgraph provenance to kg-extract-ontology and kg-extract-agent (previously had none) - Update CLI tools and tech specs to match Also rename tg-show-document-hierarchy to tg-show-extraction-provenance. Added extra typing for extraction provenance, fixed extraction prov CLI
This commit is contained in:
parent
35128ff019
commit
64e3f6bd0d
20 changed files with 463 additions and 193 deletions
|
|
@ -168,7 +168,7 @@ This is not JSON at all
|
|||
}
|
||||
]
|
||||
|
||||
triples, entity_contexts = agent_extractor.process_extraction_data(data, sample_metadata)
|
||||
triples, entity_contexts, _ = agent_extractor.process_extraction_data(data, sample_metadata)
|
||||
|
||||
# Check entity label triple
|
||||
label_triple = next((t for t in triples if t.p.iri == RDF_LABEL and t.o.value == "Machine Learning"), None)
|
||||
|
|
@ -206,7 +206,7 @@ This is not JSON at all
|
|||
}
|
||||
]
|
||||
|
||||
triples, entity_contexts = agent_extractor.process_extraction_data(data, sample_metadata)
|
||||
triples, entity_contexts, _ = agent_extractor.process_extraction_data(data, sample_metadata)
|
||||
|
||||
# Check that subject, predicate, and object labels are created
|
||||
subject_uri = f"{TRUSTGRAPH_ENTITIES}Machine%20Learning"
|
||||
|
|
@ -244,7 +244,7 @@ This is not JSON at all
|
|||
}
|
||||
]
|
||||
|
||||
triples, entity_contexts = agent_extractor.process_extraction_data(data, sample_metadata)
|
||||
triples, entity_contexts, _ = agent_extractor.process_extraction_data(data, sample_metadata)
|
||||
|
||||
# Check that object labels are not created for literal objects
|
||||
object_labels = [t for t in triples if t.p.iri == RDF_LABEL and t.o.value == "95%"]
|
||||
|
|
@ -253,7 +253,7 @@ This is not JSON at all
|
|||
|
||||
def test_process_extraction_data_combined(self, agent_extractor, sample_metadata, sample_extraction_data):
|
||||
"""Test processing of combined definitions and relationships"""
|
||||
triples, entity_contexts = agent_extractor.process_extraction_data(sample_extraction_data, sample_metadata)
|
||||
triples, entity_contexts, _ = agent_extractor.process_extraction_data(sample_extraction_data, sample_metadata)
|
||||
|
||||
# Check that we have both definition and relationship triples
|
||||
definition_triples = [t for t in triples if t.p.iri == DEFINITION]
|
||||
|
|
@ -272,7 +272,7 @@ This is not JSON at all
|
|||
{"type": "definition", "entity": "Test Entity", "definition": "Test definition"}
|
||||
]
|
||||
|
||||
triples, entity_contexts = agent_extractor.process_extraction_data(data, metadata)
|
||||
triples, entity_contexts, _ = agent_extractor.process_extraction_data(data, metadata)
|
||||
|
||||
# Should not create subject-of relationships when no metadata ID
|
||||
subject_of_triples = [t for t in triples if t.p.iri == SUBJECT_OF]
|
||||
|
|
@ -285,7 +285,7 @@ This is not JSON at all
|
|||
"""Test processing of empty extraction data"""
|
||||
data = []
|
||||
|
||||
triples, entity_contexts = agent_extractor.process_extraction_data(data, sample_metadata)
|
||||
triples, entity_contexts, _ = agent_extractor.process_extraction_data(data, sample_metadata)
|
||||
|
||||
# Should have no entity contexts
|
||||
assert len(entity_contexts) == 0
|
||||
|
|
@ -300,7 +300,7 @@ This is not JSON at all
|
|||
{"type": "relationship", "subject": "A", "predicate": "rel", "object": "B", "object-entity": True}
|
||||
]
|
||||
|
||||
triples, entity_contexts = agent_extractor.process_extraction_data(data, sample_metadata)
|
||||
triples, entity_contexts, _ = agent_extractor.process_extraction_data(data, sample_metadata)
|
||||
|
||||
# Should process valid items and ignore unknown types
|
||||
assert len(entity_contexts) == 1 # Only the definition creates entity context
|
||||
|
|
|
|||
|
|
@ -168,7 +168,7 @@ class TestAgentKgExtractionEdgeCases:
|
|||
"""Test processing with empty or minimal metadata"""
|
||||
# Test with None metadata - may not raise AttributeError depending on implementation
|
||||
try:
|
||||
triples, contexts = agent_extractor.process_extraction_data([], None)
|
||||
triples, contexts, _ = agent_extractor.process_extraction_data([], None)
|
||||
# If it doesn't raise, check the results
|
||||
assert len(triples) == 0
|
||||
assert len(contexts) == 0
|
||||
|
|
@ -178,14 +178,14 @@ class TestAgentKgExtractionEdgeCases:
|
|||
|
||||
# Test with metadata without ID
|
||||
metadata = Metadata(id=None)
|
||||
triples, contexts = agent_extractor.process_extraction_data([], metadata)
|
||||
triples, contexts, _ = agent_extractor.process_extraction_data([], metadata)
|
||||
assert len(triples) == 0
|
||||
assert len(contexts) == 0
|
||||
|
||||
# Test with metadata with empty string ID
|
||||
metadata = Metadata(id="")
|
||||
data = [{"type": "definition", "entity": "Test", "definition": "Test def"}]
|
||||
triples, contexts = agent_extractor.process_extraction_data(data, metadata)
|
||||
triples, contexts, _ = agent_extractor.process_extraction_data(data, metadata)
|
||||
|
||||
# Should not create subject-of triples when ID is empty string
|
||||
subject_of_triples = [t for t in triples if t.p.iri == SUBJECT_OF]
|
||||
|
|
@ -213,7 +213,7 @@ class TestAgentKgExtractionEdgeCases:
|
|||
for entity in special_entities
|
||||
]
|
||||
|
||||
triples, contexts = agent_extractor.process_extraction_data(data, metadata)
|
||||
triples, contexts, _ = agent_extractor.process_extraction_data(data, metadata)
|
||||
|
||||
# Verify all entities were processed
|
||||
assert len(contexts) == len(special_entities)
|
||||
|
|
@ -234,7 +234,7 @@ class TestAgentKgExtractionEdgeCases:
|
|||
{"type": "definition", "entity": "Test Entity", "definition": long_definition}
|
||||
]
|
||||
|
||||
triples, contexts = agent_extractor.process_extraction_data(data, metadata)
|
||||
triples, contexts, _ = agent_extractor.process_extraction_data(data, metadata)
|
||||
|
||||
# Should handle long definitions without issues
|
||||
assert len(contexts) == 1
|
||||
|
|
@ -256,7 +256,7 @@ class TestAgentKgExtractionEdgeCases:
|
|||
{"type": "definition", "entity": "AI", "definition": "Another AI definition"}, # Duplicate
|
||||
]
|
||||
|
||||
triples, contexts = agent_extractor.process_extraction_data(data, metadata)
|
||||
triples, contexts, _ = agent_extractor.process_extraction_data(data, metadata)
|
||||
|
||||
# Should process all entries (including duplicates)
|
||||
assert len(contexts) == 4
|
||||
|
|
@ -280,7 +280,7 @@ class TestAgentKgExtractionEdgeCases:
|
|||
{"type": "relationship", "subject": "test", "predicate": "test", "object": "", "object-entity": True},
|
||||
]
|
||||
|
||||
triples, contexts = agent_extractor.process_extraction_data(data, metadata)
|
||||
triples, contexts, _ = agent_extractor.process_extraction_data(data, metadata)
|
||||
|
||||
# Should handle empty strings by creating URIs (even if empty)
|
||||
assert len(contexts) == 3
|
||||
|
|
@ -306,7 +306,7 @@ class TestAgentKgExtractionEdgeCases:
|
|||
}
|
||||
]
|
||||
|
||||
triples, contexts = agent_extractor.process_extraction_data(data, metadata)
|
||||
triples, contexts, _ = agent_extractor.process_extraction_data(data, metadata)
|
||||
|
||||
# Should handle JSON strings in definitions without parsing them
|
||||
assert len(contexts) == 2
|
||||
|
|
@ -334,7 +334,7 @@ class TestAgentKgExtractionEdgeCases:
|
|||
{"type": "relationship", "subject": "A", "predicate": "rel7", "object": "F", "object-entity": 1},
|
||||
]
|
||||
|
||||
triples, contexts = agent_extractor.process_extraction_data(data, metadata)
|
||||
triples, contexts, _ = agent_extractor.process_extraction_data(data, metadata)
|
||||
|
||||
# Should process all relationships
|
||||
# Note: The current implementation has some logic issues that these tests document
|
||||
|
|
@ -416,7 +416,7 @@ class TestAgentKgExtractionEdgeCases:
|
|||
import time
|
||||
start_time = time.time()
|
||||
|
||||
triples, contexts = agent_extractor.process_extraction_data(large_data, metadata)
|
||||
triples, contexts, _ = agent_extractor.process_extraction_data(large_data, metadata)
|
||||
|
||||
end_time = time.time()
|
||||
processing_time = end_time - start_time
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue