mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-04-26 08:56:21 +02:00
Extract-time provenance (#661)
1. Shared Provenance Module - URI generators, namespace constants,
triple builders, vocabulary bootstrap
2. Librarian - Emits document metadata to graph on processing
initiation (vocabulary bootstrap + PROV-O triples)
3. PDF Extractor - Saves pages as child documents, emits parent-child
provenance edges, forwards page IDs
4. Chunker - Saves chunks as child documents, emits provenance edges,
forwards chunk ID + content
5. Knowledge Extractors (both definitions and relationships):
- Link entities to chunks via SUBJECT_OF (not top-level document)
- Remove duplicate metadata emission (now handled by the librarian)
- Get chunk_doc_id and chunk_uri from incoming Chunk message
6. Embedding Provenance:
- EntityContext schema has chunk_id field
- EntityEmbeddings schema has chunk_id field
- Definitions extractor sets chunk_id when creating EntityContext
- Graph embeddings processor passes chunk_id through to
EntityEmbeddings
Provenance Flow:
Document → Page (PDF) → Chunk → Extracted Facts/Embeddings
    ↓          ↓           ↓              ↓
librarian  librarian   librarian   (chunk_id reference)
+ graph    + graph     + graph
Each artifact is stored in librarian with parent-child linking, and PROV-O
edges are emitted to the knowledge graph for full traceability from any
extracted fact back to its source document.
Also updates the tests.
This commit is contained in:
parent
d8f0a576af
commit
cd5580be59
20 changed files with 1601 additions and 59 deletions
48
trustgraph-base/trustgraph/provenance/namespaces.py
Normal file
48
trustgraph-base/trustgraph/provenance/namespaces.py
Normal file
|
|
@ -0,0 +1,48 @@
|
|||
"""
|
||||
RDF namespace constants for provenance.
|
||||
|
||||
Includes PROV-O, Dublin Core, and TrustGraph namespace URIs.
|
||||
"""
|
||||
|
||||
# PROV-O namespace (W3C Provenance Ontology)
|
||||
PROV = "http://www.w3.org/ns/prov#"
|
||||
PROV_ENTITY = PROV + "Entity"
|
||||
PROV_ACTIVITY = PROV + "Activity"
|
||||
PROV_AGENT = PROV + "Agent"
|
||||
PROV_WAS_DERIVED_FROM = PROV + "wasDerivedFrom"
|
||||
PROV_WAS_GENERATED_BY = PROV + "wasGeneratedBy"
|
||||
PROV_USED = PROV + "used"
|
||||
PROV_WAS_ASSOCIATED_WITH = PROV + "wasAssociatedWith"
|
||||
PROV_STARTED_AT_TIME = PROV + "startedAtTime"
|
||||
|
||||
# Dublin Core namespace
|
||||
DC = "http://purl.org/dc/elements/1.1/"
|
||||
DC_TITLE = DC + "title"
|
||||
DC_SOURCE = DC + "source"
|
||||
DC_DATE = DC + "date"
|
||||
DC_CREATOR = DC + "creator"
|
||||
|
||||
# RDF and RDFS namespaces (also in rdf.py, but included here for completeness)
|
||||
RDF = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
|
||||
RDF_TYPE = RDF + "type"
|
||||
RDFS = "http://www.w3.org/2000/01/rdf-schema#"
|
||||
RDFS_LABEL = RDFS + "label"
|
||||
|
||||
# TrustGraph namespace for custom predicates
|
||||
TG = "https://trustgraph.ai/ns/"
|
||||
TG_REIFIES = TG + "reifies"
|
||||
TG_PAGE_COUNT = TG + "pageCount"
|
||||
TG_MIME_TYPE = TG + "mimeType"
|
||||
TG_PAGE_NUMBER = TG + "pageNumber"
|
||||
TG_CHUNK_INDEX = TG + "chunkIndex"
|
||||
TG_CHAR_OFFSET = TG + "charOffset"
|
||||
TG_CHAR_LENGTH = TG + "charLength"
|
||||
TG_CHUNK_SIZE = TG + "chunkSize"
|
||||
TG_CHUNK_OVERLAP = TG + "chunkOverlap"
|
||||
TG_COMPONENT_VERSION = TG + "componentVersion"
|
||||
TG_LLM_MODEL = TG + "llmModel"
|
||||
TG_ONTOLOGY = TG + "ontology"
|
||||
TG_EMBEDDING_MODEL = TG + "embeddingModel"
|
||||
TG_SOURCE_TEXT = TG + "sourceText"
|
||||
TG_SOURCE_CHAR_OFFSET = TG + "sourceCharOffset"
|
||||
TG_SOURCE_CHAR_LENGTH = TG + "sourceCharLength"
|
||||
Loading…
Add table
Add a link
Reference in a new issue