From 64e3f6bd0d30c64fcde1497de50dffaa6687e12d Mon Sep 17 00:00:00 2001 From: cybermaggedon Date: Fri, 13 Mar 2026 11:37:59 +0000 Subject: [PATCH] Subgraph provenance (#694) Replace per-triple provenance reification with subgraph model Extraction provenance previously created a full reification (statement URI, activity, agent) for every single extracted triple, producing ~13 provenance triples per knowledge triple. Since each chunk is processed by a single LLM call, this was both redundant and semantically inaccurate. Now one subgraph object is created per chunk extraction, with tg:contains linking to each extracted triple. For 20 extractions from a chunk this reduces provenance from ~260 triples to ~33. - Rename tg:reifies -> tg:contains, stmt_uri -> subgraph_uri - Replace triple_provenance_triples() with subgraph_provenance_triples() - Refactor kg-extract-definitions and kg-extract-relationships to generate provenance once per chunk instead of per triple - Add subgraph provenance to kg-extract-ontology and kg-extract-agent (previously had none) - Update CLI tools and tech specs to match Also rename tg-show-document-hierarchy to tg-show-extraction-provenance. 
Added extra typing for extraction provenance, fixed extraction prov CLI --- .../extraction-provenance-subgraph.md | 205 ++++++++++++++++++ docs/tech-specs/extraction-time-provenance.md | 32 +-- docs/tech-specs/query-time-explainability.md | 4 +- .../test_agent_kg_extraction_integration.py | 2 +- .../test_agent_extraction.py | 14 +- .../test_agent_extraction_edge_cases.py | 20 +- trustgraph-base/trustgraph/knowledge/defs.py | 2 +- .../trustgraph/provenance/__init__.py | 20 +- .../trustgraph/provenance/namespaces.py | 8 +- .../trustgraph/provenance/triples.py | 73 ++++--- trustgraph-base/trustgraph/provenance/uris.py | 12 +- .../trustgraph/provenance/vocabulary.py | 14 +- trustgraph-cli/pyproject.toml | 2 +- .../trustgraph/cli/invoke_graph_rag.py | 8 +- .../trustgraph/cli/show_explain_trace.py | 6 +- ...archy.py => show_extraction_provenance.py} | 108 ++++----- .../trustgraph/extract/kg/agent/extract.py | 43 +++- .../extract/kg/definitions/extract.py | 32 +-- .../trustgraph/extract/kg/ontology/extract.py | 19 +- .../extract/kg/relationships/extract.py | 32 +-- 20 files changed, 463 insertions(+), 193 deletions(-) create mode 100644 docs/tech-specs/extraction-provenance-subgraph.md rename trustgraph-cli/trustgraph/cli/{show_document_hierarchy.py => show_extraction_provenance.py} (81%) diff --git a/docs/tech-specs/extraction-provenance-subgraph.md b/docs/tech-specs/extraction-provenance-subgraph.md new file mode 100644 index 00000000..ba0d3e50 --- /dev/null +++ b/docs/tech-specs/extraction-provenance-subgraph.md @@ -0,0 +1,205 @@ +# Extraction Provenance: Subgraph Model + +## Problem + +Extraction-time provenance currently generates a full reification per +extracted triple: a unique `stmt_uri`, `activity_uri`, and associated +PROV-O metadata for every single knowledge fact. Processing one chunk +that yields 20 relationships produces ~260 provenance triples on top of +the ~20 knowledge triples — a roughly 13:1 overhead. 
+ +This is both expensive (storage, indexing, transmission) and semantically +inaccurate. Each chunk is processed by a single LLM call that produces +all its triples in one transaction. The current per-triple model +obscures that by creating the illusion of 20 independent extraction +events. + +Additionally, two of the four extraction processors (kg-extract-ontology, +kg-extract-agent) have no provenance at all, leaving gaps in the audit +trail. + +## Solution + +Replace per-triple reification with a **subgraph model**: one provenance +record per chunk extraction, shared across all triples produced from that +chunk. + +### Terminology Change + +| Old | New | +|-----|-----| +| `stmt_uri` (`https://trustgraph.ai/stmt/{uuid}`) | `subgraph_uri` (`https://trustgraph.ai/subgraph/{uuid}`) | +| `statement_uri()` | `subgraph_uri()` | +| `tg:reifies` (1:1, identity) | `tg:contains` (1:many, containment) | + +### Target Structure + +All provenance triples go in the `urn:graph:source` named graph. + +``` +# Subgraph contains each extracted triple (RDF-star quoted triples) + tg:contains <> . + tg:contains <> . + tg:contains <> . + +# Derivation from source chunk + prov:wasDerivedFrom . + prov:wasGeneratedBy . + +# Activity: one per chunk extraction + rdf:type prov:Activity . + rdfs:label "{component_name} extraction" . + prov:used . + prov:wasAssociatedWith . + prov:startedAtTime "2026-03-13T10:00:00Z" . + tg:componentVersion "0.25.0" . + tg:llmModel "gpt-4" . # if available + tg:ontology . # if available + +# Agent: stable per component + rdf:type prov:Agent . + rdfs:label "{component_name}" . 
+``` + +### Volume Comparison + +For a chunk producing N extracted triples: + +| | Old (per-triple) | New (subgraph) | +|---|---|---| +| `tg:contains` / `tg:reifies` | N | N | +| Activity triples | ~9 x N | ~9 | +| Agent triples | 2 x N | 2 | +| Statement/subgraph metadata | 2 x N | 2 | +| **Total provenance triples** | **~13N** | **N + 13** | +| **Example (N=20)** | **~260** | **33** | + +## Scope + +### Processors to Update (existing provenance, per-triple) + +**kg-extract-definitions** +(`trustgraph-flow/trustgraph/extract/kg/definitions/extract.py`) + +Currently calls `statement_uri()` + `triple_provenance_triples()` inside +the per-definition loop. + +Changes: +- Move `subgraph_uri()` and `activity_uri()` creation before the loop +- Collect `tg:contains` triples inside the loop +- Emit shared activity/agent/derivation block once after the loop + +**kg-extract-relationships** +(`trustgraph-flow/trustgraph/extract/kg/relationships/extract.py`) + +Same pattern as definitions. Same changes. + +### Processors to Add Provenance (currently missing) + +**kg-extract-ontology** +(`trustgraph-flow/trustgraph/extract/kg/ontology/extract.py`) + +Currently emits triples with no provenance. Add subgraph provenance +using the same pattern: one subgraph per chunk, `tg:contains` for each +extracted triple. + +**kg-extract-agent** +(`trustgraph-flow/trustgraph/extract/kg/agent/extract.py`) + +Currently emits triples with no provenance. Add subgraph provenance +using the same pattern. 
+ +### Shared Provenance Library Changes + +**`trustgraph-base/trustgraph/provenance/triples.py`** + +- Replace `triple_provenance_triples()` with `subgraph_provenance_triples()` +- New function accepts a list of extracted triples instead of a single one +- Generates one `tg:contains` per triple, shared activity/agent block +- Remove old `triple_provenance_triples()` + +**`trustgraph-base/trustgraph/provenance/uris.py`** + +- Replace `statement_uri()` with `subgraph_uri()` + +**`trustgraph-base/trustgraph/provenance/namespaces.py`** + +- Replace `TG_REIFIES` with `TG_CONTAINS` + +### Not in Scope + +- **kg-extract-topics**: older-style processor, not currently used in + standard flows +- **kg-extract-rows**: produces rows not triples, different provenance + model +- **Query-time provenance** (`urn:graph:retrieval`): separate concern, + already uses a different pattern (question/exploration/focus/synthesis) +- **Document/page/chunk provenance** (PDF decoder, chunker): already uses + `derived_entity_triples()` which is per-entity, not per-triple — no + redundancy issue + +## Implementation Notes + +### Processor Loop Restructure + +Before (per-triple, in relationships): +```python +for rel in rels: + # ... build relationship_triple ... + stmt_uri = statement_uri() + prov_triples = triple_provenance_triples( + stmt_uri=stmt_uri, + extracted_triple=relationship_triple, + ... + ) + triples.extend(set_graph(prov_triples, GRAPH_SOURCE)) +``` + +After (subgraph): +```python +sg_uri = subgraph_uri() + +for rel in rels: + # ... build relationship_triple ... 
+ extracted_triples.append(relationship_triple) + +prov_triples = subgraph_provenance_triples( + subgraph_uri=sg_uri, + extracted_triples=extracted_triples, + chunk_uri=chunk_uri, + component_name=default_ident, + component_version=COMPONENT_VERSION, + llm_model=llm_model, + ontology_uri=ontology_uri, +) +triples.extend(set_graph(prov_triples, GRAPH_SOURCE)) +``` + +### New Helper Signature + +```python +def subgraph_provenance_triples( + subgraph_uri: str, + extracted_triples: List[Triple], + chunk_uri: str, + component_name: str, + component_version: str, + llm_model: Optional[str] = None, + ontology_uri: Optional[str] = None, + timestamp: Optional[str] = None, +) -> List[Triple]: + """ + Build provenance triples for a subgraph of extracted knowledge. + + Creates: + - tg:contains link for each extracted triple (RDF-star quoted) + - One prov:wasDerivedFrom link to source chunk + - One activity with agent metadata + """ +``` + +### Breaking Change + +This is a breaking change to the provenance model. Provenance has not +been released, so no migration is needed. The old `tg:reifies` / +`statement_uri` code can be removed outright. diff --git a/docs/tech-specs/extraction-time-provenance.md b/docs/tech-specs/extraction-time-provenance.md index 72c1c971..d70c8c11 100644 --- a/docs/tech-specs/extraction-time-provenance.md +++ b/docs/tech-specs/extraction-time-provenance.md @@ -311,10 +311,10 @@ activity:chunk-789 tg:chunkOverlap 200 . # The extracted triple (edge) entity:JohnSmith rel:worksAt entity:AcmeCorp . -# Statement object pointing at the edge (RDF 1.2 reification) -stmt:001 tg:reifies <> . -stmt:001 prov:wasDerivedFrom chunk:123-1-1 . -stmt:001 prov:wasGeneratedBy activity:extract-999 . +# Subgraph containing the extracted triples +subgraph:001 tg:contains <> . +subgraph:001 prov:wasDerivedFrom chunk:123-1-1 . +subgraph:001 prov:wasGeneratedBy activity:extract-999 . activity:extract-999 a prov:Activity . activity:extract-999 prov:used chunk:123-1-1 . 
@@ -344,7 +344,7 @@ Custom predicates under the `tg:` namespace for extraction-specific metadata: | Predicate | Domain | Description | |-----------|--------|-------------| -| `tg:reifies` | Statement | Points at the triple this statement object represents | +| `tg:contains` | Subgraph | Points at a triple contained in this extraction subgraph | | `tg:pageCount` | Document | Total number of pages in source document | | `tg:mimeType` | Document | MIME type of source document | | `tg:pageNumber` | Page | Page number in source document | @@ -383,7 +383,7 @@ prov:startedAtTime rdfs:label "started at" . **TrustGraph Predicates:** ``` -tg:reifies rdfs:label "reifies" . +tg:contains rdfs:label "contains" . tg:pageCount rdfs:label "page count" . tg:mimeType rdfs:label "MIME type" . tg:pageNumber rdfs:label "page number" . @@ -416,20 +416,20 @@ For finer-grained provenance, it would be valuable to record exactly where withi # The extracted triple entity:JohnSmith rel:worksAt entity:AcmeCorp . -# Statement with sub-chunk provenance -stmt:001 tg:reifies <> . -stmt:001 prov:wasDerivedFrom chunk:123-1-1 . -stmt:001 tg:sourceText "John Smith has worked at Acme Corp since 2019" . -stmt:001 tg:sourceCharOffset 1547 . -stmt:001 tg:sourceCharLength 46 . +# Subgraph with sub-chunk provenance +subgraph:001 tg:contains <> . +subgraph:001 prov:wasDerivedFrom chunk:123-1-1 . +subgraph:001 tg:sourceText "John Smith has worked at Acme Corp since 2019" . +subgraph:001 tg:sourceCharOffset 1547 . +subgraph:001 tg:sourceCharLength 46 . ``` **Example with text range (alternative):** ``` -stmt:001 tg:reifies <> . -stmt:001 prov:wasDerivedFrom chunk:123-1-1 . -stmt:001 tg:sourceRange "1547-1593" . -stmt:001 tg:sourceText "John Smith has worked at Acme Corp since 2019" . +subgraph:001 tg:contains <> . +subgraph:001 prov:wasDerivedFrom chunk:123-1-1 . +subgraph:001 tg:sourceRange "1547-1593" . +subgraph:001 tg:sourceText "John Smith has worked at Acme Corp since 2019" . 
``` **Implementation considerations:** diff --git a/docs/tech-specs/query-time-explainability.md b/docs/tech-specs/query-time-explainability.md index 3385efc9..e696745c 100644 --- a/docs/tech-specs/query-time-explainability.md +++ b/docs/tech-specs/query-time-explainability.md @@ -193,7 +193,7 @@ When storing explainability data, URIs from `uri_map` are used. Selected edges can be traced back to source documents: -1. Query for reifying statement: `?stmt tg:reifies <>` +1. Query for containing subgraph: `?subgraph tg:contains <>` 2. Follow `prov:wasDerivedFrom` chain to root document 3. Each step in chain: chunk → page → document @@ -209,7 +209,7 @@ elif term.type == TRIPLE: This enables queries like: ``` -?stmt tg:reifies <> +?subgraph tg:contains <> ``` ## CLI Usage diff --git a/tests/integration/test_agent_kg_extraction_integration.py b/tests/integration/test_agent_kg_extraction_integration.py index c97bd529..a2576274 100644 --- a/tests/integration/test_agent_kg_extraction_integration.py +++ b/tests/integration/test_agent_kg_extraction_integration.py @@ -128,7 +128,7 @@ class TestAgentKgExtractionIntegration: # Parse and process extraction_data = extractor.parse_jsonl(agent_response) - triples, entity_contexts = extractor.process_extraction_data(extraction_data, v.metadata) + triples, entity_contexts, extracted_triples = extractor.process_extraction_data(extraction_data, v.metadata) # Emit outputs if triples: diff --git a/tests/unit/test_knowledge_graph/test_agent_extraction.py b/tests/unit/test_knowledge_graph/test_agent_extraction.py index e4c723db..d2824c0c 100644 --- a/tests/unit/test_knowledge_graph/test_agent_extraction.py +++ b/tests/unit/test_knowledge_graph/test_agent_extraction.py @@ -168,7 +168,7 @@ This is not JSON at all } ] - triples, entity_contexts = agent_extractor.process_extraction_data(data, sample_metadata) + triples, entity_contexts, _ = agent_extractor.process_extraction_data(data, sample_metadata) # Check entity label triple label_triple = 
next((t for t in triples if t.p.iri == RDF_LABEL and t.o.value == "Machine Learning"), None) @@ -206,7 +206,7 @@ This is not JSON at all } ] - triples, entity_contexts = agent_extractor.process_extraction_data(data, sample_metadata) + triples, entity_contexts, _ = agent_extractor.process_extraction_data(data, sample_metadata) # Check that subject, predicate, and object labels are created subject_uri = f"{TRUSTGRAPH_ENTITIES}Machine%20Learning" @@ -244,7 +244,7 @@ This is not JSON at all } ] - triples, entity_contexts = agent_extractor.process_extraction_data(data, sample_metadata) + triples, entity_contexts, _ = agent_extractor.process_extraction_data(data, sample_metadata) # Check that object labels are not created for literal objects object_labels = [t for t in triples if t.p.iri == RDF_LABEL and t.o.value == "95%"] @@ -253,7 +253,7 @@ This is not JSON at all def test_process_extraction_data_combined(self, agent_extractor, sample_metadata, sample_extraction_data): """Test processing of combined definitions and relationships""" - triples, entity_contexts = agent_extractor.process_extraction_data(sample_extraction_data, sample_metadata) + triples, entity_contexts, _ = agent_extractor.process_extraction_data(sample_extraction_data, sample_metadata) # Check that we have both definition and relationship triples definition_triples = [t for t in triples if t.p.iri == DEFINITION] @@ -272,7 +272,7 @@ This is not JSON at all {"type": "definition", "entity": "Test Entity", "definition": "Test definition"} ] - triples, entity_contexts = agent_extractor.process_extraction_data(data, metadata) + triples, entity_contexts, _ = agent_extractor.process_extraction_data(data, metadata) # Should not create subject-of relationships when no metadata ID subject_of_triples = [t for t in triples if t.p.iri == SUBJECT_OF] @@ -285,7 +285,7 @@ This is not JSON at all """Test processing of empty extraction data""" data = [] - triples, entity_contexts = 
agent_extractor.process_extraction_data(data, sample_metadata) + triples, entity_contexts, _ = agent_extractor.process_extraction_data(data, sample_metadata) # Should have no entity contexts assert len(entity_contexts) == 0 @@ -300,7 +300,7 @@ This is not JSON at all {"type": "relationship", "subject": "A", "predicate": "rel", "object": "B", "object-entity": True} ] - triples, entity_contexts = agent_extractor.process_extraction_data(data, sample_metadata) + triples, entity_contexts, _ = agent_extractor.process_extraction_data(data, sample_metadata) # Should process valid items and ignore unknown types assert len(entity_contexts) == 1 # Only the definition creates entity context diff --git a/tests/unit/test_knowledge_graph/test_agent_extraction_edge_cases.py b/tests/unit/test_knowledge_graph/test_agent_extraction_edge_cases.py index 2f0174e5..ac20fe11 100644 --- a/tests/unit/test_knowledge_graph/test_agent_extraction_edge_cases.py +++ b/tests/unit/test_knowledge_graph/test_agent_extraction_edge_cases.py @@ -168,7 +168,7 @@ class TestAgentKgExtractionEdgeCases: """Test processing with empty or minimal metadata""" # Test with None metadata - may not raise AttributeError depending on implementation try: - triples, contexts = agent_extractor.process_extraction_data([], None) + triples, contexts, _ = agent_extractor.process_extraction_data([], None) # If it doesn't raise, check the results assert len(triples) == 0 assert len(contexts) == 0 @@ -178,14 +178,14 @@ class TestAgentKgExtractionEdgeCases: # Test with metadata without ID metadata = Metadata(id=None) - triples, contexts = agent_extractor.process_extraction_data([], metadata) + triples, contexts, _ = agent_extractor.process_extraction_data([], metadata) assert len(triples) == 0 assert len(contexts) == 0 # Test with metadata with empty string ID metadata = Metadata(id="") data = [{"type": "definition", "entity": "Test", "definition": "Test def"}] - triples, contexts = agent_extractor.process_extraction_data(data, 
metadata) + triples, contexts, _ = agent_extractor.process_extraction_data(data, metadata) # Should not create subject-of triples when ID is empty string subject_of_triples = [t for t in triples if t.p.iri == SUBJECT_OF] @@ -213,7 +213,7 @@ class TestAgentKgExtractionEdgeCases: for entity in special_entities ] - triples, contexts = agent_extractor.process_extraction_data(data, metadata) + triples, contexts, _ = agent_extractor.process_extraction_data(data, metadata) # Verify all entities were processed assert len(contexts) == len(special_entities) @@ -234,7 +234,7 @@ class TestAgentKgExtractionEdgeCases: {"type": "definition", "entity": "Test Entity", "definition": long_definition} ] - triples, contexts = agent_extractor.process_extraction_data(data, metadata) + triples, contexts, _ = agent_extractor.process_extraction_data(data, metadata) # Should handle long definitions without issues assert len(contexts) == 1 @@ -256,7 +256,7 @@ class TestAgentKgExtractionEdgeCases: {"type": "definition", "entity": "AI", "definition": "Another AI definition"}, # Duplicate ] - triples, contexts = agent_extractor.process_extraction_data(data, metadata) + triples, contexts, _ = agent_extractor.process_extraction_data(data, metadata) # Should process all entries (including duplicates) assert len(contexts) == 4 @@ -280,7 +280,7 @@ class TestAgentKgExtractionEdgeCases: {"type": "relationship", "subject": "test", "predicate": "test", "object": "", "object-entity": True}, ] - triples, contexts = agent_extractor.process_extraction_data(data, metadata) + triples, contexts, _ = agent_extractor.process_extraction_data(data, metadata) # Should handle empty strings by creating URIs (even if empty) assert len(contexts) == 3 @@ -306,7 +306,7 @@ class TestAgentKgExtractionEdgeCases: } ] - triples, contexts = agent_extractor.process_extraction_data(data, metadata) + triples, contexts, _ = agent_extractor.process_extraction_data(data, metadata) # Should handle JSON strings in definitions without 
parsing them assert len(contexts) == 2 @@ -334,7 +334,7 @@ class TestAgentKgExtractionEdgeCases: {"type": "relationship", "subject": "A", "predicate": "rel7", "object": "F", "object-entity": 1}, ] - triples, contexts = agent_extractor.process_extraction_data(data, metadata) + triples, contexts, _ = agent_extractor.process_extraction_data(data, metadata) # Should process all relationships # Note: The current implementation has some logic issues that these tests document @@ -416,7 +416,7 @@ class TestAgentKgExtractionEdgeCases: import time start_time = time.time() - triples, contexts = agent_extractor.process_extraction_data(large_data, metadata) + triples, contexts, _ = agent_extractor.process_extraction_data(large_data, metadata) end_time = time.time() processing_time = end_time - start_time diff --git a/trustgraph-base/trustgraph/knowledge/defs.py b/trustgraph-base/trustgraph/knowledge/defs.py index e8157586..4c7eb41a 100644 --- a/trustgraph-base/trustgraph/knowledge/defs.py +++ b/trustgraph-base/trustgraph/knowledge/defs.py @@ -41,7 +41,7 @@ class QuotedTriple: enabling statements about statements. Example: - # stmt:123 tg:reifies <<:Hope skos:definition "A feeling...">> + # subgraph:123 tg:contains <<:Hope skos:definition "A feeling...">> qt = QuotedTriple( s=Uri("https://example.org/Hope"), p=Uri("http://www.w3.org/2004/02/skos/core#definition"), diff --git a/trustgraph-base/trustgraph/provenance/__init__.py b/trustgraph-base/trustgraph/provenance/__init__.py index b22f44d8..df3c2034 100644 --- a/trustgraph-base/trustgraph/provenance/__init__.py +++ b/trustgraph-base/trustgraph/provenance/__init__.py @@ -2,7 +2,7 @@ Provenance module for extraction-time provenance support. Provides helpers for: -- URI generation for documents, pages, chunks, activities, statements +- URI generation for documents, pages, chunks, activities, subgraphs - PROV-O triple building for provenance metadata - Vocabulary bootstrap for per-collection initialization @@ -38,7 +38,7 @@ from . 
uris import ( chunk_uri_from_page, chunk_uri_from_doc, activity_uri, - statement_uri, + subgraph_uri, agent_uri, # Query-time provenance URIs (GraphRAG) question_uri, @@ -66,11 +66,13 @@ from . namespaces import ( # RDF/RDFS RDF, RDF_TYPE, RDFS, RDFS_LABEL, # TrustGraph - TG, TG_REIFIES, TG_PAGE_COUNT, TG_MIME_TYPE, TG_PAGE_NUMBER, + TG, TG_CONTAINS, TG_PAGE_COUNT, TG_MIME_TYPE, TG_PAGE_NUMBER, TG_CHUNK_INDEX, TG_CHAR_OFFSET, TG_CHAR_LENGTH, TG_CHUNK_SIZE, TG_CHUNK_OVERLAP, TG_COMPONENT_VERSION, TG_LLM_MODEL, TG_ONTOLOGY, TG_EMBEDDING_MODEL, TG_SOURCE_TEXT, TG_SOURCE_CHAR_OFFSET, TG_SOURCE_CHAR_LENGTH, + # Extraction provenance entity types + TG_DOCUMENT_TYPE, TG_PAGE_TYPE, TG_CHUNK_TYPE, TG_SUBGRAPH_TYPE, # Query-time provenance predicates (GraphRAG) TG_QUERY, TG_EDGE_COUNT, TG_SELECTED_EDGE, TG_REASONING, TG_CONTENT, # Query-time provenance predicates (DocumentRAG) @@ -94,7 +96,7 @@ from . namespaces import ( from . triples import ( document_triples, derived_entity_triples, - triple_provenance_triples, + subgraph_provenance_triples, # Query-time provenance triple builders (GraphRAG) question_triples, exploration_triples, @@ -121,6 +123,7 @@ from . 
vocabulary import ( PROV_CLASS_LABELS, PROV_PREDICATE_LABELS, DC_PREDICATE_LABELS, + TG_CLASS_LABELS, TG_PREDICATE_LABELS, ) @@ -132,7 +135,7 @@ __all__ = [ "chunk_uri_from_page", "chunk_uri_from_doc", "activity_uri", - "statement_uri", + "subgraph_uri", "agent_uri", # Query-time provenance URIs "question_uri", @@ -153,11 +156,13 @@ __all__ = [ "PROV_USED", "PROV_WAS_ASSOCIATED_WITH", "PROV_STARTED_AT_TIME", "DC", "DC_TITLE", "DC_SOURCE", "DC_DATE", "DC_CREATOR", "RDF", "RDF_TYPE", "RDFS", "RDFS_LABEL", - "TG", "TG_REIFIES", "TG_PAGE_COUNT", "TG_MIME_TYPE", "TG_PAGE_NUMBER", + "TG", "TG_CONTAINS", "TG_PAGE_COUNT", "TG_MIME_TYPE", "TG_PAGE_NUMBER", "TG_CHUNK_INDEX", "TG_CHAR_OFFSET", "TG_CHAR_LENGTH", "TG_CHUNK_SIZE", "TG_CHUNK_OVERLAP", "TG_COMPONENT_VERSION", "TG_LLM_MODEL", "TG_ONTOLOGY", "TG_EMBEDDING_MODEL", "TG_SOURCE_TEXT", "TG_SOURCE_CHAR_OFFSET", "TG_SOURCE_CHAR_LENGTH", + # Extraction provenance entity types + "TG_DOCUMENT_TYPE", "TG_PAGE_TYPE", "TG_CHUNK_TYPE", "TG_SUBGRAPH_TYPE", # Query-time provenance predicates (GraphRAG) "TG_QUERY", "TG_EDGE_COUNT", "TG_SELECTED_EDGE", "TG_REASONING", "TG_CONTENT", # Query-time provenance predicates (DocumentRAG) @@ -178,7 +183,7 @@ __all__ = [ # Triple builders "document_triples", "derived_entity_triples", - "triple_provenance_triples", + "subgraph_provenance_triples", # Query-time provenance triple builders (GraphRAG) "question_triples", "exploration_triples", @@ -199,5 +204,6 @@ __all__ = [ "PROV_CLASS_LABELS", "PROV_PREDICATE_LABELS", "DC_PREDICATE_LABELS", + "TG_CLASS_LABELS", "TG_PREDICATE_LABELS", ] diff --git a/trustgraph-base/trustgraph/provenance/namespaces.py b/trustgraph-base/trustgraph/provenance/namespaces.py index 91e82eac..e60dee16 100644 --- a/trustgraph-base/trustgraph/provenance/namespaces.py +++ b/trustgraph-base/trustgraph/provenance/namespaces.py @@ -42,7 +42,7 @@ SKOS_DEFINITION = SKOS + "definition" # TrustGraph namespace for custom predicates TG = "https://trustgraph.ai/ns/" -TG_REIFIES = TG 
+ "reifies" +TG_CONTAINS = TG + "contains" TG_PAGE_COUNT = TG + "pageCount" TG_MIME_TYPE = TG + "mimeType" TG_PAGE_NUMBER = TG + "pageNumber" @@ -72,6 +72,12 @@ TG_DOCUMENT = TG + "document" # Reference to document in librarian TG_CHUNK_COUNT = TG + "chunkCount" TG_SELECTED_CHUNK = TG + "selectedChunk" +# Extraction provenance entity types +TG_DOCUMENT_TYPE = TG + "Document" +TG_PAGE_TYPE = TG + "Page" +TG_CHUNK_TYPE = TG + "Chunk" +TG_SUBGRAPH_TYPE = TG + "Subgraph" + # Explainability entity types (shared) TG_QUESTION = TG + "Question" TG_EXPLORATION = TG + "Exploration" diff --git a/trustgraph-base/trustgraph/provenance/triples.py b/trustgraph-base/trustgraph/provenance/triples.py index 14581c6f..459783d1 100644 --- a/trustgraph-base/trustgraph/provenance/triples.py +++ b/trustgraph-base/trustgraph/provenance/triples.py @@ -16,7 +16,9 @@ from . namespaces import ( TG_PAGE_COUNT, TG_MIME_TYPE, TG_PAGE_NUMBER, TG_CHUNK_INDEX, TG_CHAR_OFFSET, TG_CHAR_LENGTH, TG_CHUNK_SIZE, TG_CHUNK_OVERLAP, TG_COMPONENT_VERSION, - TG_LLM_MODEL, TG_ONTOLOGY, TG_REIFIES, + TG_LLM_MODEL, TG_ONTOLOGY, TG_CONTAINS, + # Extraction provenance entity types + TG_DOCUMENT_TYPE, TG_PAGE_TYPE, TG_CHUNK_TYPE, TG_SUBGRAPH_TYPE, # Query-time provenance predicates (GraphRAG) TG_QUERY, TG_EDGE_COUNT, TG_SELECTED_EDGE, TG_EDGE, TG_REASONING, TG_CONTENT, TG_DOCUMENT, @@ -28,7 +30,7 @@ from . namespaces import ( TG_GRAPH_RAG_QUESTION, TG_DOC_RAG_QUESTION, ) -from . uris import activity_uri, agent_uri, edge_selection_uri +from . 
uris import activity_uri, agent_uri, subgraph_uri, edge_selection_uri def set_graph(triples: List[Triple], graph: str) -> List[Triple]: @@ -92,6 +94,7 @@ def document_triples( """ triples = [ _triple(doc_uri, RDF_TYPE, _iri(PROV_ENTITY)), + _triple(doc_uri, RDF_TYPE, _iri(TG_DOCUMENT_TYPE)), ] if title: @@ -162,10 +165,23 @@ def derived_entity_triples( act_uri = activity_uri() agt_uri = agent_uri(component_name) + # Determine specific type from parameters + if page_number is not None: + specific_type = TG_PAGE_TYPE + elif chunk_index is not None: + specific_type = TG_CHUNK_TYPE + else: + specific_type = None + triples = [ # Entity declaration _triple(entity_uri, RDF_TYPE, _iri(PROV_ENTITY)), + ] + if specific_type: + triples.append(_triple(entity_uri, RDF_TYPE, _iri(specific_type))) + + triples.extend([ # Derivation from parent _triple(entity_uri, PROV_WAS_DERIVED_FROM, _iri(parent_uri)), @@ -183,7 +199,7 @@ def derived_entity_triples( # Agent declaration _triple(agt_uri, RDF_TYPE, _iri(PROV_AGENT)), _triple(agt_uri, RDFS_LABEL, _literal(component_name)), - ] + ]) if label: triples.append(_triple(entity_uri, RDFS_LABEL, _literal(label))) @@ -209,9 +225,9 @@ def derived_entity_triples( return triples -def triple_provenance_triples( - stmt_uri: str, - extracted_triple: Triple, +def subgraph_provenance_triples( + subgraph_uri: str, + extracted_triples: List[Triple], chunk_uri: str, component_name: str, component_version: str, @@ -220,16 +236,20 @@ def triple_provenance_triples( timestamp: Optional[str] = None, ) -> List[Triple]: """ - Build provenance triples for an extracted knowledge triple using reification. + Build provenance triples for a subgraph of extracted knowledge. + + One subgraph per chunk extraction, shared across all triples produced + from that chunk. This replaces per-triple reification with a + containment model. 
Creates: - - Reification triple: stmt_uri tg:reifies <> - - wasDerivedFrom link to source chunk - - Activity and agent metadata + - tg:contains link for each extracted triple (RDF-star quoted) + - One prov:wasDerivedFrom link to source chunk + - One activity with agent metadata Args: - stmt_uri: URI for the reified statement - extracted_triple: The extracted Triple to reify + subgraph_uri: URI for the extraction subgraph + extracted_triples: The extracted Triple objects to include chunk_uri: URI of source chunk component_name: Name of extractor component component_version: Version of the component @@ -238,7 +258,7 @@ def triple_provenance_triples( timestamp: ISO timestamp Returns: - List of Triple objects for the provenance (including reification) + List of Triple objects for the provenance """ if timestamp is None: timestamp = datetime.utcnow().isoformat() + "Z" @@ -246,20 +266,23 @@ def triple_provenance_triples( act_uri = activity_uri() agt_uri = agent_uri(component_name) - # Create the quoted triple term (RDF-star reification) - triple_term = Term(type=TRIPLE, triple=extracted_triple) + triples = [] - triples = [ - # Reification: stmt_uri tg:reifies <> - Triple( - s=_iri(stmt_uri), - p=_iri(TG_REIFIES), + # Containment: subgraph tg:contains <> for each extracted triple + for extracted_triple in extracted_triples: + triple_term = Term(type=TRIPLE, triple=extracted_triple) + triples.append(Triple( + s=_iri(subgraph_uri), + p=_iri(TG_CONTAINS), o=triple_term - ), + )) - # Statement provenance - _triple(stmt_uri, PROV_WAS_DERIVED_FROM, _iri(chunk_uri)), - _triple(stmt_uri, PROV_WAS_GENERATED_BY, _iri(act_uri)), + # Subgraph provenance + triples.extend([ + _triple(subgraph_uri, RDF_TYPE, _iri(PROV_ENTITY)), + _triple(subgraph_uri, RDF_TYPE, _iri(TG_SUBGRAPH_TYPE)), + _triple(subgraph_uri, PROV_WAS_DERIVED_FROM, _iri(chunk_uri)), + _triple(subgraph_uri, PROV_WAS_GENERATED_BY, _iri(act_uri)), # Activity _triple(act_uri, RDF_TYPE, _iri(PROV_ACTIVITY)), @@ -272,7 +295,7 
@@ def triple_provenance_triples( # Agent _triple(agt_uri, RDF_TYPE, _iri(PROV_AGENT)), _triple(agt_uri, RDFS_LABEL, _literal(component_name)), - ] + ]) if llm_model: triples.append(_triple(act_uri, TG_LLM_MODEL, _literal(llm_model))) diff --git a/trustgraph-base/trustgraph/provenance/uris.py b/trustgraph-base/trustgraph/provenance/uris.py index b14abe76..0f8d3136 100644 --- a/trustgraph-base/trustgraph/provenance/uris.py +++ b/trustgraph-base/trustgraph/provenance/uris.py @@ -8,7 +8,7 @@ Child entities (pages, chunks) append path segments to the parent IRI: - Chunk: {page_iri}/c{chunk_index} (from page) {doc_iri}/c{chunk_index} (from text doc) - Activity: https://trustgraph.ai/activity/{uuid} -- Statement: https://trustgraph.ai/stmt/{uuid} +- Subgraph: https://trustgraph.ai/subgraph/{uuid} """ import uuid @@ -50,11 +50,11 @@ def activity_uri(activity_id: str = None) -> str: return f"{TRUSTGRAPH_BASE}/activity/{_encode_id(activity_id)}" -def statement_uri(stmt_id: str = None) -> str: - """Generate URI for a reified statement. Auto-generates UUID if not provided.""" - if stmt_id is None: - stmt_id = str(uuid.uuid4()) - return f"{TRUSTGRAPH_BASE}/stmt/{_encode_id(stmt_id)}" +def subgraph_uri(subgraph_id: str = None) -> str: + """Generate URI for an extraction subgraph. Auto-generates UUID if not provided.""" + if subgraph_id is None: + subgraph_id = str(uuid.uuid4()) + return f"{TRUSTGRAPH_BASE}/subgraph/{_encode_id(subgraph_id)}" def agent_uri(component_name: str) -> str: diff --git a/trustgraph-base/trustgraph/provenance/vocabulary.py b/trustgraph-base/trustgraph/provenance/vocabulary.py index 47164da8..f9ae1abd 100644 --- a/trustgraph-base/trustgraph/provenance/vocabulary.py +++ b/trustgraph-base/trustgraph/provenance/vocabulary.py @@ -19,11 +19,12 @@ from . 
namespaces import ( SCHEMA_SUBJECT_OF, SCHEMA_DIGITAL_DOCUMENT, SCHEMA_DESCRIPTION, SCHEMA_KEYWORDS, SCHEMA_NAME, SKOS_DEFINITION, - TG_REIFIES, TG_PAGE_COUNT, TG_MIME_TYPE, TG_PAGE_NUMBER, + TG_CONTAINS, TG_PAGE_COUNT, TG_MIME_TYPE, TG_PAGE_NUMBER, TG_CHUNK_INDEX, TG_CHAR_OFFSET, TG_CHAR_LENGTH, TG_CHUNK_SIZE, TG_CHUNK_OVERLAP, TG_COMPONENT_VERSION, TG_LLM_MODEL, TG_ONTOLOGY, TG_EMBEDDING_MODEL, TG_SOURCE_TEXT, TG_SOURCE_CHAR_OFFSET, TG_SOURCE_CHAR_LENGTH, + TG_DOCUMENT_TYPE, TG_PAGE_TYPE, TG_CHUNK_TYPE, TG_SUBGRAPH_TYPE, ) @@ -74,9 +75,17 @@ SKOS_LABELS = [ _label_triple(SKOS_DEFINITION, "definition"), ] +# TrustGraph class labels (extraction provenance) +TG_CLASS_LABELS = [ + _label_triple(TG_DOCUMENT_TYPE, "Document"), + _label_triple(TG_PAGE_TYPE, "Page"), + _label_triple(TG_CHUNK_TYPE, "Chunk"), + _label_triple(TG_SUBGRAPH_TYPE, "Subgraph"), +] + # TrustGraph predicate labels TG_PREDICATE_LABELS = [ - _label_triple(TG_REIFIES, "reifies"), + _label_triple(TG_CONTAINS, "contains"), _label_triple(TG_PAGE_COUNT, "page count"), _label_triple(TG_MIME_TYPE, "MIME type"), _label_triple(TG_PAGE_NUMBER, "page number"), @@ -116,5 +125,6 @@ def get_vocabulary_triples() -> List[Triple]: DC_PREDICATE_LABELS + SCHEMA_LABELS + SKOS_LABELS + + TG_CLASS_LABELS + TG_PREDICATE_LABELS ) diff --git a/trustgraph-cli/pyproject.toml b/trustgraph-cli/pyproject.toml index f2ad2ac8..fb3402c9 100644 --- a/trustgraph-cli/pyproject.toml +++ b/trustgraph-cli/pyproject.toml @@ -96,7 +96,7 @@ tg-delete-config-item = "trustgraph.cli.delete_config_item:main" tg-list-collections = "trustgraph.cli.list_collections:main" tg-set-collection = "trustgraph.cli.set_collection:main" tg-delete-collection = "trustgraph.cli.delete_collection:main" -tg-show-document-hierarchy = "trustgraph.cli.show_document_hierarchy:main" +tg-show-extraction-provenance = "trustgraph.cli.show_extraction_provenance:main" tg-list-explain-traces = "trustgraph.cli.list_explain_traces:main" tg-show-explain-trace = 
"trustgraph.cli.show_explain_trace:main" diff --git a/trustgraph-cli/trustgraph/cli/invoke_graph_rag.py b/trustgraph-cli/trustgraph/cli/invoke_graph_rag.py index 27c6854d..295df0b9 100644 --- a/trustgraph-cli/trustgraph/cli/invoke_graph_rag.py +++ b/trustgraph-cli/trustgraph/cli/invoke_graph_rag.py @@ -36,7 +36,7 @@ TG_SELECTED_EDGE = TG + "selectedEdge" TG_EDGE = TG + "edge" TG_REASONING = TG + "reasoning" TG_CONTENT = TG + "content" -TG_REIFIES = TG + "reifies" +TG_CONTAINS = TG + "contains" PROV = "http://www.w3.org/ns/prov#" PROV_STARTED_AT_TIME = PROV + "startedAtTime" PROV_WAS_DERIVED_FROM = PROV + "wasDerivedFrom" @@ -185,18 +185,18 @@ async def _query_edge_provenance(ws_url, flow_id, edge_s, edge_p, edge_o, user, """ Query for provenance of an edge (s, p, o) in the knowledge graph. - Finds statements that reify the edge via tg:reifies, then follows + Finds subgraphs that contain the edge via tg:contains, then follows prov:wasDerivedFrom to find source documents. Returns list of source URIs (chunks, pages, documents). 
""" - # Query for statements that reify this edge: ?stmt tg:reifies <> + # Query for subgraphs that contain this edge: ?subgraph tg:contains <> request = { "id": "edge-prov-request", "service": "triples", "flow": flow_id, "request": { - "p": {"t": "i", "i": TG_REIFIES}, + "p": {"t": "i", "i": TG_CONTAINS}, "o": { "t": "t", # Quoted triple type "tr": { diff --git a/trustgraph-cli/trustgraph/cli/show_explain_trace.py b/trustgraph-cli/trustgraph/cli/show_explain_trace.py index b3fc7058..d7392b99 100644 --- a/trustgraph-cli/trustgraph/cli/show_explain_trace.py +++ b/trustgraph-cli/trustgraph/cli/show_explain_trace.py @@ -40,7 +40,7 @@ SOURCE_GRAPH = "urn:graph:source" # Provenance predicates for edge tracing TG = "https://trustgraph.ai/ns/" -TG_REIFIES = TG + "reifies" +TG_CONTAINS = TG + "contains" PROV = "http://www.w3.org/ns/prov#" PROV_WAS_DERIVED_FROM = PROV + "wasDerivedFrom" @@ -79,10 +79,10 @@ def trace_edge_provenance(flow, user, collection, edge, label_cache, explain_cli } } - # Query: ?stmt tg:reifies <> + # Query: ?subgraph tg:contains <> try: results = flow.triples_query( - p=TG_REIFIES, + p=TG_CONTAINS, o=quoted_triple, g=SOURCE_GRAPH, user=user, diff --git a/trustgraph-cli/trustgraph/cli/show_document_hierarchy.py b/trustgraph-cli/trustgraph/cli/show_extraction_provenance.py similarity index 81% rename from trustgraph-cli/trustgraph/cli/show_document_hierarchy.py rename to trustgraph-cli/trustgraph/cli/show_extraction_provenance.py index be83efd5..4f87712c 100644 --- a/trustgraph-cli/trustgraph/cli/show_document_hierarchy.py +++ b/trustgraph-cli/trustgraph/cli/show_extraction_provenance.py @@ -1,12 +1,12 @@ """ -Show document hierarchy: Document -> Pages -> Chunks -> Edges. +Show extraction provenance: Document -> Pages -> Chunks -> Edges. Given a document ID, traverses and displays all derived entities (pages, chunks, extracted edges) using prov:wasDerivedFrom relationships. 
Examples: - tg-show-document-hierarchy -U trustgraph -C default "urn:trustgraph:doc:abc123" - tg-show-document-hierarchy --show-content --max-content 500 "urn:trustgraph:doc:abc123" + tg-show-extraction-provenance -U trustgraph -C default "urn:trustgraph:doc:abc123" + tg-show-extraction-provenance --show-content --max-content 500 "urn:trustgraph:doc:abc123" """ import argparse @@ -25,10 +25,22 @@ PROV_WAS_DERIVED_FROM = "http://www.w3.org/ns/prov#wasDerivedFrom" RDFS_LABEL = "http://www.w3.org/2000/01/rdf-schema#label" RDF_TYPE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type" TG = "https://trustgraph.ai/ns/" -TG_REIFIES = TG + "reifies" +TG_CONTAINS = TG + "contains" +TG_DOCUMENT_TYPE = TG + "Document" +TG_PAGE_TYPE = TG + "Page" +TG_CHUNK_TYPE = TG + "Chunk" +TG_SUBGRAPH_TYPE = TG + "Subgraph" DC_TITLE = "http://purl.org/dc/terms/title" DC_FORMAT = "http://purl.org/dc/terms/format" +# Map TrustGraph type URIs to display names +TYPE_MAP = { + TG_DOCUMENT_TYPE: "document", + TG_PAGE_TYPE: "page", + TG_CHUNK_TYPE: "chunk", + TG_SUBGRAPH_TYPE: "subgraph", +} + # Source graph SOURCE_GRAPH = "urn:graph:source" @@ -109,15 +121,15 @@ def extract_value(term): def get_node_metadata(socket, flow_id, user, collection, node_uri): - """Get metadata for a node (label, type, title, format).""" + """Get metadata for a node (label, types, title, format).""" triples = query_triples(socket, flow_id, user, collection, s=node_uri, g=SOURCE_GRAPH) - metadata = {"uri": node_uri} + metadata = {"uri": node_uri, "types": []} for s, p, o in triples: if p == RDFS_LABEL: metadata["label"] = o elif p == RDF_TYPE: - metadata["type"] = o + metadata["types"].append(o) elif p == DC_TITLE: metadata["title"] = o elif p == DC_FORMAT: @@ -126,6 +138,14 @@ def get_node_metadata(socket, flow_id, user, collection, node_uri): return metadata +def classify_node(metadata): + """Classify a node based on its rdf:type values.""" + for type_uri in metadata.get("types", []): + if type_uri in TYPE_MAP: + return 
TYPE_MAP[type_uri] + return "unknown" + + def get_children(socket, flow_id, user, collection, parent_uri): """Get children of a node via prov:wasDerivedFrom.""" triples = query_triples( @@ -135,29 +155,6 @@ def get_children(socket, flow_id, user, collection, parent_uri): return [s for s, p, o in triples] -def get_edges_from_chunk(socket, flow_id, user, collection, chunk_uri): - """Get edges that were derived from a chunk (via tg:reifies).""" - # Query for triples where: ?stmt prov:wasDerivedFrom chunk_uri - # Then get the tg:reifies value - derived_triples = query_triples( - socket, flow_id, user, collection, - p=PROV_WAS_DERIVED_FROM, o=chunk_uri, g=SOURCE_GRAPH - ) - - edges = [] - for stmt_uri, _, _ in derived_triples: - # Get what this statement reifies - reifies_triples = query_triples( - socket, flow_id, user, collection, - s=stmt_uri, p=TG_REIFIES, g=SOURCE_GRAPH - ) - for _, _, edge in reifies_triples: - if isinstance(edge, dict): - edges.append(edge) - - return edges - - def get_document_content(api, user, doc_id, max_content): """Fetch document content from librarian API.""" try: @@ -176,32 +173,6 @@ def get_document_content(api, user, doc_id, max_content): return f"[Error fetching content: {e}]" -def classify_uri(uri): - """Classify a URI as document, page, or chunk based on patterns.""" - if not isinstance(uri, str): - return "unknown" - - # Common patterns in trustgraph URIs - if "/c" in uri and uri.split("/c")[-1].isdigit(): - return "chunk" - if "/p" in uri and any(uri.split("/p")[-1].replace("/", "").isdigit() for _ in [1]): - # Check for page pattern like /p1 or /p1/ - parts = uri.split("/p") - if len(parts) > 1: - remainder = parts[-1].split("/")[0] - if remainder.isdigit(): - return "page" - - if "chunk" in uri.lower(): - return "chunk" - if "page" in uri.lower(): - return "page" - if "doc" in uri.lower(): - return "document" - - return "unknown" - - def build_hierarchy(socket, flow_id, user, collection, root_uri, api=None, show_content=False, 
max_content=200, visited=None): """Build document hierarchy tree recursively.""" if visited is None: @@ -212,7 +183,7 @@ def build_hierarchy(socket, flow_id, user, collection, root_uri, api=None, show_ visited.add(root_uri) metadata = get_node_metadata(socket, flow_id, user, collection, root_uri) - node_type = classify_uri(root_uri) + node_type = classify_node(metadata) node = { "uri": root_uri, @@ -232,10 +203,20 @@ def build_hierarchy(socket, flow_id, user, collection, root_uri, api=None, show_ children_uris = get_children(socket, flow_id, user, collection, root_uri) for child_uri in children_uris: - child_type = classify_uri(child_uri) + child_metadata = get_node_metadata(socket, flow_id, user, collection, child_uri) + child_type = classify_node(child_metadata) - # Recursively build hierarchy for pages and chunks - if child_type in ("page", "chunk", "unknown"): + if child_type == "subgraph": + # Subgraphs contain extracted edges — inline them + contains_triples = query_triples( + socket, flow_id, user, collection, + s=child_uri, p=TG_CONTAINS, g=SOURCE_GRAPH + ) + for _, _, edge in contains_triples: + if isinstance(edge, dict): + node["edges"].append(edge) + else: + # Recurse into pages, chunks, etc. 
child_node = build_hierarchy( socket, flow_id, user, collection, child_uri, api=api, show_content=show_content, max_content=max_content, @@ -244,11 +225,6 @@ def build_hierarchy(socket, flow_id, user, collection, root_uri, api=None, show_ if child_node: node["children"].append(child_node) - # Get edges for chunks - if node_type == "chunk": - edges = get_edges_from_chunk(socket, flow_id, user, collection, root_uri) - node["edges"] = edges - # Sort children by URI for consistent output node["children"].sort(key=lambda x: x.get("uri", "")) @@ -332,7 +308,7 @@ def print_json(node): def main(): parser = argparse.ArgumentParser( - prog='tg-show-document-hierarchy', + prog='tg-show-extraction-provenance', description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter, ) diff --git a/trustgraph-flow/trustgraph/extract/kg/agent/extract.py b/trustgraph-flow/trustgraph/extract/kg/agent/extract.py index a00944d6..28dba11a 100644 --- a/trustgraph-flow/trustgraph/extract/kg/agent/extract.py +++ b/trustgraph-flow/trustgraph/extract/kg/agent/extract.py @@ -11,6 +11,8 @@ from ....rdf import TRUSTGRAPH_ENTITIES, RDF_LABEL, SUBJECT_OF, DEFINITION from ....base import FlowProcessor, ConsumerSpec, ProducerSpec from ....base import AgentClientSpec +from ....provenance import subgraph_uri, subgraph_provenance_triples, set_graph, GRAPH_SOURCE +from ....flow_version import __version__ as COMPONENT_VERSION from ....template import PromptManager # Module logger @@ -196,9 +198,21 @@ class Processor(FlowProcessor): return # Process extraction data - triples, entity_contexts = self.process_extraction_data( - extraction_data, v.metadata - ) + triples, entity_contexts, extracted_triples = \ + self.process_extraction_data(extraction_data, v.metadata) + + # Generate subgraph provenance for extracted triples + if extracted_triples: + chunk_uri = v.metadata.id + sg_uri = subgraph_uri() + prov_triples = subgraph_provenance_triples( + subgraph_uri=sg_uri, + 
extracted_triples=extracted_triples, + chunk_uri=chunk_uri, + component_name=default_ident, + component_version=COMPONENT_VERSION, + ) + triples.extend(set_graph(prov_triples, GRAPH_SOURCE)) # Emit outputs if triples: @@ -221,8 +235,13 @@ class Processor(FlowProcessor): Data is a flat list of objects with 'type' discriminator field: - {"type": "definition", "entity": "...", "definition": "..."} - {"type": "relationship", "subject": "...", "predicate": "...", "object": "...", "object-entity": bool} + + Returns: + Tuple of (all_triples, entity_contexts, extracted_triples) where + extracted_triples contains only the core knowledge facts (for provenance). """ triples = [] + extracted_triples = [] entity_contexts = [] # Categorize items by type @@ -242,11 +261,13 @@ class Processor(FlowProcessor): )) # Add definition - triples.append(Triple( + definition_triple = Triple( s = Term(type=IRI, iri=entity_uri), p = Term(type=IRI, iri=DEFINITION), o = Term(type=LITERAL, value=defn["definition"]), - )) + ) + triples.append(definition_triple) + extracted_triples.append(definition_triple) # Add subject-of relationship to document if metadata.id: @@ -261,7 +282,7 @@ class Processor(FlowProcessor): entity=Term(type=IRI, iri=entity_uri), context=defn["definition"] )) - + # Process relationships for rel in relationships: @@ -298,11 +319,13 @@ class Processor(FlowProcessor): )) # Add the main relationship triple - triples.append(Triple( + relationship_triple = Triple( s = subject_value, p = predicate_value, o = object_value - )) + ) + triples.append(relationship_triple) + extracted_triples.append(relationship_triple) # Add subject-of relationships to document if metadata.id: @@ -324,8 +347,8 @@ class Processor(FlowProcessor): p = Term(type=IRI, iri=SUBJECT_OF), o = Term(type=IRI, iri=metadata.id), )) - - return triples, entity_contexts + + return triples, entity_contexts, extracted_triples @staticmethod def add_args(parser): diff --git 
a/trustgraph-flow/trustgraph/extract/kg/definitions/extract.py b/trustgraph-flow/trustgraph/extract/kg/definitions/extract.py index 277157b9..44ea778d 100755 --- a/trustgraph-flow/trustgraph/extract/kg/definitions/extract.py +++ b/trustgraph-flow/trustgraph/extract/kg/definitions/extract.py @@ -20,7 +20,7 @@ from .... rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL, SUBJECT_OF from .... base import FlowProcessor, ConsumerSpec, ProducerSpec from .... base import PromptClientSpec, ParameterSpec -from .... provenance import statement_uri, triple_provenance_triples, set_graph, GRAPH_SOURCE +from .... provenance import subgraph_uri, subgraph_provenance_triples, set_graph, GRAPH_SOURCE from .... flow_version import __version__ as COMPONENT_VERSION DEFINITION_VALUE = Term(type=IRI, iri=DEFINITION) @@ -133,6 +133,7 @@ class Processor(FlowProcessor): raise e triples = [] + extracted_triples = [] entities = [] # Get chunk document ID for provenance linking @@ -173,20 +174,7 @@ class Processor(FlowProcessor): s=s_value, p=DEFINITION_VALUE, o=o_value ) triples.append(definition_triple) - - # Generate provenance for the definition triple (reification) - # Provenance triples go in the source graph for separation from core knowledge - stmt_uri = statement_uri() - prov_triples = triple_provenance_triples( - stmt_uri=stmt_uri, - extracted_triple=definition_triple, - chunk_uri=chunk_uri, - component_name=default_ident, - component_version=COMPONENT_VERSION, - llm_model=llm_model, - ontology_uri=ontology_uri, - ) - triples.extend(set_graph(prov_triples, GRAPH_SOURCE)) + extracted_triples.append(definition_triple) # Link entity to chunk (not top-level document) triples.append(Triple( @@ -211,6 +199,20 @@ class Processor(FlowProcessor): chunk_id=chunk_doc_id, )) + # Generate subgraph provenance once for all extracted triples + if extracted_triples: + sg_uri = subgraph_uri() + prov_triples = subgraph_provenance_triples( + subgraph_uri=sg_uri, + extracted_triples=extracted_triples, 
+ chunk_uri=chunk_uri, + component_name=default_ident, + component_version=COMPONENT_VERSION, + llm_model=llm_model, + ontology_uri=ontology_uri, + ) + triples.extend(set_graph(prov_triples, GRAPH_SOURCE)) + # Send triples in batches for i in range(0, len(triples), self.triples_batch_size): batch = triples[i:i + self.triples_batch_size] diff --git a/trustgraph-flow/trustgraph/extract/kg/ontology/extract.py b/trustgraph-flow/trustgraph/extract/kg/ontology/extract.py index 16383752..5078d817 100644 --- a/trustgraph-flow/trustgraph/extract/kg/ontology/extract.py +++ b/trustgraph-flow/trustgraph/extract/kg/ontology/extract.py @@ -23,6 +23,9 @@ from .ontology_selector import OntologySelector, OntologySubset from .simplified_parser import parse_extraction_response from .triple_converter import TripleConverter +from .... provenance import subgraph_uri, subgraph_provenance_triples, set_graph, GRAPH_SOURCE +from .... flow_version import __version__ as COMPONENT_VERSION + logger = logging.getLogger(__name__) default_ident = "kg-extract-ontology" @@ -306,11 +309,25 @@ class Processor(FlowProcessor): flow, chunk, ontology_subset, prompt_variables ) + # Generate subgraph provenance for extracted triples + if triples: + chunk_uri = v.metadata.id + sg_uri = subgraph_uri() + prov_triples = subgraph_provenance_triples( + subgraph_uri=sg_uri, + extracted_triples=triples, + chunk_uri=chunk_uri, + component_name=default_ident, + component_version=COMPONENT_VERSION, + ) + # Generate ontology definition triples ontology_triples = self.build_ontology_triples(ontology_subset) - # Combine extracted triples with ontology triples + # Combine extracted triples with ontology triples and provenance all_triples = triples + ontology_triples + if triples: + all_triples.extend(set_graph(prov_triples, GRAPH_SOURCE)) # Build entity contexts from all triples (including ontology elements) entity_contexts = self.build_entity_contexts(all_triples) diff --git 
a/trustgraph-flow/trustgraph/extract/kg/relationships/extract.py b/trustgraph-flow/trustgraph/extract/kg/relationships/extract.py index a136acc6..604de1df 100755 --- a/trustgraph-flow/trustgraph/extract/kg/relationships/extract.py +++ b/trustgraph-flow/trustgraph/extract/kg/relationships/extract.py @@ -20,7 +20,7 @@ from .... rdf import RDF_LABEL, TRUSTGRAPH_ENTITIES, SUBJECT_OF from .... base import FlowProcessor, ConsumerSpec, ProducerSpec from .... base import PromptClientSpec, ParameterSpec -from .... provenance import statement_uri, triple_provenance_triples, set_graph, GRAPH_SOURCE +from .... provenance import subgraph_uri, subgraph_provenance_triples, set_graph, GRAPH_SOURCE from .... flow_version import __version__ as COMPONENT_VERSION RDF_LABEL_VALUE = Term(type=IRI, iri=RDF_LABEL) @@ -115,6 +115,7 @@ class Processor(FlowProcessor): raise e triples = [] + extracted_triples = [] # Get chunk document ID for provenance linking chunk_doc_id = v.document_id if v.document_id else v.metadata.id @@ -160,20 +161,7 @@ class Processor(FlowProcessor): o=o_value ) triples.append(relationship_triple) - - # Generate provenance for the relationship triple (reification) - # Provenance triples go in the source graph for separation from core knowledge - stmt_uri = statement_uri() - prov_triples = triple_provenance_triples( - stmt_uri=stmt_uri, - extracted_triple=relationship_triple, - chunk_uri=chunk_uri, - component_name=default_ident, - component_version=COMPONENT_VERSION, - llm_model=llm_model, - ontology_uri=ontology_uri, - ) - triples.extend(set_graph(prov_triples, GRAPH_SOURCE)) + extracted_triples.append(relationship_triple) # Label for s triples.append(Triple( @@ -212,6 +200,20 @@ class Processor(FlowProcessor): o=Term(type=IRI, iri=chunk_uri) )) + # Generate subgraph provenance once for all extracted triples + if extracted_triples: + sg_uri = subgraph_uri() + prov_triples = subgraph_provenance_triples( + subgraph_uri=sg_uri, + extracted_triples=extracted_triples, + 
chunk_uri=chunk_uri, + component_name=default_ident, + component_version=COMPONENT_VERSION, + llm_model=llm_model, + ontology_uri=ontology_uri, + ) + triples.extend(set_graph(prov_triples, GRAPH_SOURCE)) + # Send triples in batches for i in range(0, len(triples), self.triples_batch_size): batch = triples[i:i + self.triples_batch_size]