mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-05-04 12:52:36 +02:00
Extract-time provenance (#661)
1. Shared Provenance Module - URI generators, namespace constants,
triple builders, vocabulary bootstrap
2. Librarian - Emits document metadata to graph on processing
initiation (vocabulary bootstrap + PROV-O triples)
3. PDF Extractor - Saves pages as child documents, emits parent-child
provenance edges, forwards page IDs
4. Chunker - Saves chunks as child documents, emits provenance edges,
forwards chunk ID + content
5. Knowledge Extractors (both definitions and relationships):
- Link entities to chunks via SUBJECT_OF (not top-level document)
- Removed duplicate metadata emission (now handled by librarian)
- Get chunk_doc_id and chunk_uri from incoming Chunk message
6. Embedding Provenance:
- EntityContext schema has chunk_id field
- EntityEmbeddings schema has chunk_id field
- Definitions extractor sets chunk_id when creating EntityContext
- Graph embeddings processor passes chunk_id through to
EntityEmbeddings
Provenance Flow:
Document → Page (PDF) → Chunk → Extracted Facts/Embeddings
↓ ↓ ↓ ↓
librarian librarian librarian (chunk_id reference)
+ graph + graph + graph
Each artifact is stored in librarian with parent-child linking, and PROV-O
edges are emitted to the knowledge graph for full traceability from any
extracted fact back to its source document.
This change also updates the affected tests.
This commit is contained in:
parent
d8f0a576af
commit
cd5580be59
20 changed files with 1601 additions and 59 deletions
|
|
@ -23,9 +23,14 @@ from .. schema import config_request_queue, config_response_queue
|
|||
|
||||
from .. schema import Document, Metadata
|
||||
from .. schema import TextDocument, Metadata
|
||||
from .. schema import Triples
|
||||
|
||||
from .. exceptions import RequestError
|
||||
|
||||
from .. provenance import (
|
||||
document_uri, document_triples, get_vocabulary_triples,
|
||||
)
|
||||
|
||||
from . librarian import Librarian
|
||||
from . collection_manager import CollectionManager
|
||||
|
||||
|
|
@ -281,6 +286,67 @@ class Processor(AsyncProcessor):
|
|||
# Threshold for sending document_id instead of inline content (2MB).
# Documents at or above this size are referenced by ID for streaming
# retrieval rather than embedded inline in the message.
STREAMING_THRESHOLD = 2 * 1024 * 1024
|
||||
|
||||
async def emit_document_provenance(self, document, processing, triples_queue):
    """
    Emit document provenance metadata to the knowledge graph.

    This emits:
    1. Vocabulary bootstrap triples (idempotent, safe to re-emit)
    2. Document metadata as PROV-O triples

    :param document: document object providing ``id``, ``kind``, ``title``
        and optional ``metadata`` (a list of existing metadata triples)
    :param processing: processing context providing ``user`` and
        ``collection`` for the outgoing message metadata
    :param triples_queue: queue name to publish the Triples message on
    """
    logger.debug("Emitting document provenance for %s", document.id)

    # Build the canonical URI for this document; all provenance triples
    # hang off this subject.
    doc_uri = document_uri(document.id)

    # NOTE(review): page count for PDFs is not known at this point — it is
    # determined later, during extraction. If it ever becomes available in
    # the document metadata triples, it could be passed to document_triples
    # here.

    # Build document metadata triples (title/mime-type provenance)
    prov_triples = document_triples(
        doc_uri=doc_uri,
        title=document.title if document.title else None,
        mime_type=document.kind,
    )

    # Include any existing metadata triples carried on the document itself
    if document.metadata:
        prov_triples.extend(document.metadata)

    # Vocabulary bootstrap triples are idempotent, so re-emitting them on
    # every document is safe.
    vocab_triples = get_vocabulary_triples()

    # Combine all triples into one payload
    all_triples = vocab_triples + prov_triples

    # Create a short-lived publisher for the triples queue
    triples_pub = Publisher(
        self.pubsub, triples_queue, schema=Triples
    )

    try:
        await triples_pub.start()

        triples_msg = Triples(
            metadata=Metadata(
                id=doc_uri,
                metadata=[],
                user=processing.user,
                collection=processing.collection,
            ),
            triples=all_triples,
        )

        await triples_pub.send(None, triples_msg)
        logger.debug(
            "Emitted %d provenance triples for %s",
            len(all_triples), document.id,
        )

    finally:
        # Always release the publisher, even if start/send raised.
        await triples_pub.stop()
|
||||
|
||||
async def load_document(self, document, processing, content):
|
||||
|
||||
logger.debug("Ready for document processing...")
|
||||
|
|
@ -301,6 +367,12 @@ class Processor(AsyncProcessor):
|
|||
|
||||
q = flow["interfaces"][kind]
|
||||
|
||||
# Emit document provenance to knowledge graph
|
||||
if "triples-store" in flow["interfaces"]:
|
||||
await self.emit_document_provenance(
|
||||
document, processing, flow["interfaces"]["triples-store"]
|
||||
)
|
||||
|
||||
if kind == "text-load":
|
||||
# For large text documents, send document_id for streaming retrieval
|
||||
if len(content) >= self.STREAMING_THRESHOLD:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue