mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-06-07 13:55:14 +02:00
Subgraph provenance (#694)
Replace per-triple provenance reification with subgraph model.

Extraction provenance previously created a full reification (statement URI, activity, agent) for every single extracted triple, producing ~13 provenance triples per knowledge triple. Since each chunk is processed by a single LLM call, this was both redundant and semantically inaccurate.

Now one subgraph object is created per chunk extraction, with tg:contains linking to each extracted triple. For 20 extractions from a chunk, this reduces provenance from ~260 triples to ~33.

- Rename tg:reifies -> tg:contains, stmt_uri -> subgraph_uri
- Replace triple_provenance_triples() with subgraph_provenance_triples()
- Refactor kg-extract-definitions and kg-extract-relationships to generate provenance once per chunk instead of per triple
- Add subgraph provenance to kg-extract-ontology and kg-extract-agent (previously had none)
- Update CLI tools and tech specs to match

Also rename tg-show-document-hierarchy to tg-show-extraction-provenance. Added extra typing for extraction provenance and fixed the extraction provenance CLI.
This commit is contained in:
parent
35128ff019
commit
64e3f6bd0d
20 changed files with 463 additions and 193 deletions
|
|
@ -8,7 +8,7 @@ Child entities (pages, chunks) append path segments to the parent IRI:
|
|||
- Chunk: {page_iri}/c{chunk_index} (from page)
|
||||
{doc_iri}/c{chunk_index} (from text doc)
|
||||
- Activity: https://trustgraph.ai/activity/{uuid}
|
||||
- Statement: https://trustgraph.ai/stmt/{uuid}
|
||||
- Subgraph: https://trustgraph.ai/subgraph/{uuid}
|
||||
"""
|
||||
|
||||
import uuid
|
||||
|
|
@ -50,11 +50,11 @@ def activity_uri(activity_id: str = None) -> str:
|
|||
return f"{TRUSTGRAPH_BASE}/activity/{_encode_id(activity_id)}"
|
||||
|
||||
|
||||
def statement_uri(stmt_id: str = None) -> str:
    """Build the URI that identifies a reified statement.

    Args:
        stmt_id: Identifier to embed in the URI. When omitted (None), a
            random UUID4 string is generated and used instead.

    Returns:
        The statement URI under ``TRUSTGRAPH_BASE``, with the identifier
        passed through ``_encode_id`` before being placed in the path.
    """
    identifier = str(uuid.uuid4()) if stmt_id is None else stmt_id
    encoded = _encode_id(identifier)
    return f"{TRUSTGRAPH_BASE}/stmt/{encoded}"
|
||||
def subgraph_uri(subgraph_id: str = None) -> str:
    """Build the URI that identifies an extraction subgraph.

    Args:
        subgraph_id: Identifier to embed in the URI. When omitted (None), a
            random UUID4 string is generated and used instead.

    Returns:
        The subgraph URI under ``TRUSTGRAPH_BASE``, with the identifier
        passed through ``_encode_id`` before being placed in the path.
    """
    identifier = str(uuid.uuid4()) if subgraph_id is None else subgraph_id
    encoded = _encode_id(identifier)
    return f"{TRUSTGRAPH_BASE}/subgraph/{encoded}"
|
||||
|
||||
|
||||
def agent_uri(component_name: str) -> str:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue