mirror of https://github.com/trustgraph-ai/trustgraph.git
Extract-time provenance (#661)
1. Shared Provenance Module - URI generators, namespace constants,
triple builders, vocabulary bootstrap (URI scheme sketched after this list)
2. Librarian - Emits document metadata to graph on processing
initiation (vocabulary bootstrap + PROV-O triples)
3. PDF Extractor - Saves pages as child documents, emits parent-child
provenance edges, forwards page IDs
4. Chunker - Saves chunks as child documents, emits provenance edges,
forwards chunk ID + content
5. Knowledge Extractors (both definitions and relationships):
- Link entities to chunks via SUBJECT_OF (not top-level document)
- Removed duplicate metadata emission (now handled by librarian)
- Get chunk_doc_id and chunk_uri from incoming Chunk message
6. Embedding Provenance:
- EntityContext schema has chunk_id field
- EntityEmbeddings schema has chunk_id field
- Definitions extractor sets chunk_id when creating EntityContext
- Graph embeddings processor passes chunk_id through to
EntityEmbeddings
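
For orientation, a minimal sketch of the ID/URI scheme implied by the
diff below, reconstructed from the IDs it generates ("doc123",
"doc123/p3", "doc123/p3/c2"). The function names match the real
provenance-module imports, but the bodies and the DOC_NS prefix are
placeholder assumptions, not the module's actual implementation:

# Sketch only: the real generators live in the shared provenance
# module; DOC_NS is a hypothetical placeholder namespace prefix.
DOC_NS = "http://example.org/doc/"

def document_uri(doc_id):
    return f"{DOC_NS}{doc_id}"

def page_uri(doc_id, page_num):
    # Pages are 1-indexed, e.g. "doc123/p3"
    return f"{DOC_NS}{doc_id}/p{page_num}"

def chunk_uri_from_page(doc_id, page_num, chunk_index):
    # Chunk of a PDF page, e.g. "doc123/p3/c2"
    return f"{DOC_NS}{doc_id}/p{page_num}/c{chunk_index}"

def chunk_uri_from_doc(doc_id, chunk_index):
    # Chunk of a plain-text document, e.g. "doc123/c2"
    return f"{DOC_NS}{doc_id}/c{chunk_index}"
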
Provenance Flow:

Document → Page (PDF) → Chunk → Extracted Facts/Embeddings
    ↓           ↓          ↓              ↓
librarian   librarian  librarian   (chunk_id reference)
 + graph     + graph    + graph
Each artifact is stored in the librarian with parent-child linking, and
PROV-O edges are emitted to the knowledge graph for full traceability from
any extracted fact back to its source document.
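
Concretely, the edges emitted for one chunk of one PDF page might look
like the sketch below. prov:wasDerivedFrom is the standard PROV-O term;
the subject-of IRI is an assumption standing in for the extractors'
SUBJECT_OF constant, and the bare IDs stand in for full URIs:

# Illustrative triples only - real predicate and entity URIs come from
# the provenance module's namespace constants.
PROV_WAS_DERIVED_FROM = "http://www.w3.org/ns/prov#wasDerivedFrom"
SUBJECT_OF = "http://example.org/subject-of"  # hypothetical IRI

chain = [
    ("doc123/p3",    PROV_WAS_DERIVED_FROM, "doc123"),        # page from document
    ("doc123/p3/c2", PROV_WAS_DERIVED_FROM, "doc123/p3"),     # chunk from page
    ("Some Entity",  SUBJECT_OF,            "doc123/p3/c2"),  # fact linked to chunk
]
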
Also updates tests.
parent d8f0a576af
commit cd5580be59
20 changed files with 1601 additions and 59 deletions
@@ -8,9 +8,18 @@ import logging
 from langchain_text_splitters import RecursiveCharacterTextSplitter
 from prometheus_client import Histogram
 
-from ... schema import TextDocument, Chunk
+from ... schema import TextDocument, Chunk, Metadata, Triples
 from ... base import ChunkingService, ConsumerSpec, ProducerSpec
 
+from ... provenance import (
+    page_uri, chunk_uri_from_page, chunk_uri_from_doc,
+    derived_entity_triples, document_uri,
+)
+
+# Component identification for provenance
+COMPONENT_NAME = "chunker"
+COMPONENT_VERSION = "1.0.0"
+
 # Module logger
 logger = logging.getLogger(__name__)
 
@@ -63,6 +72,13 @@ class Processor(ChunkingService):
             )
         )
 
+        self.register_specification(
+            ProducerSpec(
+                name = "triples",
+                schema = Triples,
+            )
+        )
+
         logger.info("Recursive chunker initialized")
 
     async def on_message(self, msg, consumer, flow):
@@ -96,21 +112,99 @@ class Processor(ChunkingService):
 
         texts = text_splitter.create_documents([text])
 
+        # Get parent document ID for provenance linking
+        parent_doc_id = v.document_id or v.metadata.id
+
+        # Determine if parent is a page (from PDF) or source document (text)
+        # Check if parent_doc_id contains "/p" which indicates a page
+        is_from_page = "/p" in parent_doc_id
+
+        # Extract the root document ID for chunk URI generation
+        if is_from_page:
+            # Parent is a page like "doc123/p3", extract page number
+            parts = parent_doc_id.rsplit("/p", 1)
+            root_doc_id = parts[0]
+            page_num = int(parts[1]) if len(parts) > 1 else 1
+        else:
+            root_doc_id = parent_doc_id
+            page_num = None
+
+        # Track character offset for provenance
+        char_offset = 0
+
         for ix, chunk in enumerate(texts):
+            chunk_index = ix + 1  # 1-indexed
 
             logger.debug(f"Created chunk of size {len(chunk.page_content)}")
 
+            # Generate chunk document ID
+            if is_from_page:
+                chunk_doc_id = f"{root_doc_id}/p{page_num}/c{chunk_index}"
+                chunk_uri = chunk_uri_from_page(root_doc_id, page_num, chunk_index)
+                parent_uri = page_uri(root_doc_id, page_num)
+            else:
+                chunk_doc_id = f"{root_doc_id}/c{chunk_index}"
+                chunk_uri = chunk_uri_from_doc(root_doc_id, chunk_index)
+                parent_uri = document_uri(root_doc_id)
+
+            chunk_content = chunk.page_content.encode("utf-8")
+            chunk_length = len(chunk.page_content)
+
+            # Save chunk to librarian as child document
+            await self.save_child_document(
+                doc_id=chunk_doc_id,
+                parent_id=parent_doc_id,
+                user=v.metadata.user,
+                content=chunk_content,
+                document_type="chunk",
+                title=f"Chunk {chunk_index}",
+            )
+
+            # Emit provenance triples
+            prov_triples = derived_entity_triples(
+                entity_uri=chunk_uri,
+                parent_uri=parent_uri,
+                component_name=COMPONENT_NAME,
+                component_version=COMPONENT_VERSION,
+                label=f"Chunk {chunk_index}",
+                chunk_index=chunk_index,
+                char_offset=char_offset,
+                char_length=chunk_length,
+                chunk_size=chunk_size,
+                chunk_overlap=chunk_overlap,
+            )
+
+            await flow("triples").send(Triples(
+                metadata=Metadata(
+                    id=chunk_uri,
+                    metadata=[],
+                    user=v.metadata.user,
+                    collection=v.metadata.collection,
+                ),
+                triples=prov_triples,
+            ))
+
+            # Forward chunk ID + content (post-chunker optimization)
             r = Chunk(
-                metadata=v.metadata,
-                chunk=chunk.page_content.encode("utf-8"),
+                metadata=Metadata(
+                    id=chunk_uri,
+                    metadata=[],
+                    user=v.metadata.user,
+                    collection=v.metadata.collection,
+                ),
+                chunk=chunk_content,
+                document_id=chunk_doc_id,
             )
 
             __class__.chunk_metric.labels(
                 id=consumer.id, flow=consumer.flow
-            ).observe(len(chunk.page_content))
+            ).observe(chunk_length)
 
             await flow("output").send(r)
 
+            # Update character offset (approximate, doesn't account for overlap)
+            char_offset += chunk_length - chunk_overlap
+
         logger.debug("Document chunking complete")
 
     @staticmethod
@@ -16,11 +16,20 @@ import uuid
 from langchain_community.document_loaders import PyPDFLoader
 
 from ... schema import Document, TextDocument, Metadata
-from ... schema import LibrarianRequest, LibrarianResponse
+from ... schema import LibrarianRequest, LibrarianResponse, DocumentMetadata
 from ... schema import librarian_request_queue, librarian_response_queue
+from ... schema import Triples
 from ... base import FlowProcessor, ConsumerSpec, ProducerSpec
 from ... base import Consumer, Producer, ConsumerMetrics, ProducerMetrics
 
+from ... provenance import (
+    document_uri, page_uri, derived_entity_triples,
+)
+
+# Component identification for provenance
+COMPONENT_NAME = "pdf-decoder"
+COMPONENT_VERSION = "1.0.0"
+
 # Module logger
 logger = logging.getLogger(__name__)
 
@@ -57,6 +66,13 @@ class Processor(FlowProcessor):
             )
         )
 
+        self.register_specification(
+            ProducerSpec(
+                name = "triples",
+                schema = Triples,
+            )
+        )
+
         # Librarian client for fetching document content
         librarian_request_q = params.get(
             "librarian_request_queue", default_librarian_request_queue
@@ -148,6 +164,66 @@ class Processor(FlowProcessor):
             self.pending_requests.pop(request_id, None)
             raise RuntimeError(f"Timeout fetching document {document_id}")
 
+    async def save_child_document(self, doc_id, parent_id, user, content,
+                                  document_type="page", title=None, timeout=120):
+        """
+        Save a child document to the librarian.
+
+        Args:
+            doc_id: ID for the new child document
+            parent_id: ID of the parent document
+            user: User ID
+            content: Document content (bytes)
+            document_type: Type of document ("page", "chunk", etc.)
+            title: Optional title
+            timeout: Request timeout in seconds
+
+        Returns:
+            The document ID on success
+        """
+        import base64
+
+        request_id = str(uuid.uuid4())
+
+        doc_metadata = DocumentMetadata(
+            id=doc_id,
+            user=user,
+            kind="text/plain",
+            title=title or doc_id,
+            parent_id=parent_id,
+            document_type=document_type,
+        )
+
+        request = LibrarianRequest(
+            operation="add-child-document",
+            document_metadata=doc_metadata,
+            content=base64.b64encode(content).decode("utf-8"),
+        )
+
+        # Create future for response
+        future = asyncio.get_event_loop().create_future()
+        self.pending_requests[request_id] = future
+
+        try:
+            # Send request
+            await self.librarian_request_producer.send(
+                request, properties={"id": request_id}
+            )
+
+            # Wait for response
+            response = await asyncio.wait_for(future, timeout=timeout)
+
+            if response.error:
+                raise RuntimeError(
+                    f"Librarian error saving child document: {response.error.type}: {response.error.message}"
+                )
+
+            return doc_id
+
+        except asyncio.TimeoutError:
+            self.pending_requests.pop(request_id, None)
+            raise RuntimeError(f"Timeout saving child document {doc_id}")
+
     async def on_message(self, msg, consumer, flow):
 
         logger.debug("PDF message received")
@@ -187,13 +263,62 @@ class Processor(FlowProcessor):
         loader = PyPDFLoader(temp_path)
         pages = loader.load()
 
+        # Get the source document ID
+        source_doc_id = v.document_id or v.metadata.id
+
         for ix, page in enumerate(pages):
+            page_num = ix + 1  # 1-indexed page numbers
 
-            logger.debug(f"Processing page {ix}")
+            logger.debug(f"Processing page {page_num}")
+
+            # Generate page document ID
+            page_doc_id = f"{source_doc_id}/p{page_num}"
+            page_content = page.page_content.encode("utf-8")
+
+            # Save page as child document in librarian
+            await self.save_child_document(
+                doc_id=page_doc_id,
+                parent_id=source_doc_id,
+                user=v.metadata.user,
+                content=page_content,
+                document_type="page",
+                title=f"Page {page_num}",
+            )
+
+            # Emit provenance triples
+            doc_uri = document_uri(source_doc_id)
+            pg_uri = page_uri(source_doc_id, page_num)
+
+            prov_triples = derived_entity_triples(
+                entity_uri=pg_uri,
+                parent_uri=doc_uri,
+                component_name=COMPONENT_NAME,
+                component_version=COMPONENT_VERSION,
+                label=f"Page {page_num}",
+                page_number=page_num,
+            )
+
+            await flow("triples").send(Triples(
+                metadata=Metadata(
+                    id=pg_uri,
+                    metadata=[],
+                    user=v.metadata.user,
+                    collection=v.metadata.collection,
+                ),
+                triples=prov_triples,
+            ))
 
+            # Forward page document ID to chunker
+            # Chunker will fetch content from librarian
             r = TextDocument(
-                metadata=v.metadata,
-                text=page.page_content.encode("utf-8"),
+                metadata=Metadata(
+                    id=pg_uri,
+                    metadata=[],
+                    user=v.metadata.user,
+                    collection=v.metadata.collection,
+                ),
+                document_id=page_doc_id,
+                text=b"",  # Empty, chunker will fetch from librarian
             )
 
             await flow("output").send(r)
@@ -71,7 +71,8 @@ class Processor(FlowProcessor):
                 entities.append(
                     EntityEmbeddings(
                         entity=entity.entity,
-                        vectors=vectors
+                        vectors=vectors,
+                        chunk_id=entity.chunk_id,  # Provenance: source chunk
                     )
                 )
 
@@ -128,10 +128,12 @@ class Processor(FlowProcessor):
         triples = []
         entities = []
 
-        # FIXME: Putting metadata into triples store is duplicated in
-        # relationships extractor too
-        for t in v.metadata.metadata:
-            triples.append(t)
+        # Get chunk document ID for provenance linking
+        chunk_doc_id = v.document_id if v.document_id else v.metadata.id
+        chunk_uri = v.metadata.id  # The URI form for the chunk
+
+        # Note: Document metadata is now emitted once by librarian at processing
+        # initiation, so we don't need to duplicate it here.
 
         for defn in defs:
 
@@ -159,22 +161,27 @@ class Processor(FlowProcessor):
                 s=s_value, p=DEFINITION_VALUE, o=o_value
             ))
 
+            # Link entity to chunk (not top-level document)
             triples.append(Triple(
                 s=s_value,
                 p=SUBJECT_OF_VALUE,
-                o=Term(type=IRI, iri=v.metadata.id)
+                o=Term(type=IRI, iri=chunk_uri)
             ))
 
             # Output entity name as context for direct name matching
+            # Include chunk_id for embedding provenance
             entities.append(EntityContext(
                 entity=s_value,
                 context=s,
+                chunk_id=chunk_doc_id,
             ))
 
             # Output definition as context for semantic matching
+            # Include chunk_id for embedding provenance
             entities.append(EntityContext(
                 entity=s_value,
                 context=defn["definition"],
+                chunk_id=chunk_doc_id,
             ))
 
         # Send triples in batches
@@ -109,10 +109,12 @@ class Processor(FlowProcessor):
 
         triples = []
 
-        # FIXME: Putting metadata into triples store is duplicated in
-        # relationships extractor too
-        for t in v.metadata.metadata:
-            triples.append(t)
+        # Get chunk document ID for provenance linking
+        chunk_doc_id = v.document_id if v.document_id else v.metadata.id
+        chunk_uri = v.metadata.id  # The URI form for the chunk
+
+        # Note: Document metadata is now emitted once by librarian at processing
+        # initiation, so we don't need to duplicate it here.
 
         for rel in rels:
@@ -168,19 +170,19 @@ class Processor(FlowProcessor):
                 o=Term(type=LITERAL, value=str(o))
             ))
 
-            # 'Subject of' for s
+            # Link entity to chunk (not top-level document)
             triples.append(Triple(
                 s=s_value,
                 p=SUBJECT_OF_VALUE,
-                o=Term(type=IRI, iri=v.metadata.id)
+                o=Term(type=IRI, iri=chunk_uri)
             ))
 
             if rel["object-entity"]:
-                # 'Subject of' for o
+                # Link object entity to chunk
                 triples.append(Triple(
                     s=o_value,
                     p=SUBJECT_OF_VALUE,
-                    o=Term(type=IRI, iri=v.metadata.id)
+                    o=Term(type=IRI, iri=chunk_uri)
                 ))
 
         # Send triples in batches
@@ -609,8 +609,10 @@ class Librarian:
         ):
             raise RequestError("Document already exists")
 
-        # Ensure document_type is set to "extracted"
-        request.document_metadata.document_type = "extracted"
+        # Set document_type if not specified by caller
+        # Valid types: "page", "chunk", or "extracted" (legacy)
+        if not request.document_metadata.document_type or request.document_metadata.document_type == "source":
+            request.document_metadata.document_type = "extracted"
 
         # Create object ID for blob
         object_id = uuid.uuid4()
@@ -23,9 +23,14 @@ from .. schema import config_request_queue, config_response_queue
 
-from .. schema import Document, Metadata
+from .. schema import TextDocument, Metadata
+from .. schema import Triples
 
 from .. exceptions import RequestError
 
+from .. provenance import (
+    document_uri, document_triples, get_vocabulary_triples,
+)
+
 from . librarian import Librarian
 from . collection_manager import CollectionManager
 
@@ -281,6 +286,67 @@ class Processor(AsyncProcessor):
     # Threshold for sending document_id instead of inline content (2MB)
     STREAMING_THRESHOLD = 2 * 1024 * 1024
 
+    async def emit_document_provenance(self, document, processing, triples_queue):
+        """
+        Emit document provenance metadata to the knowledge graph.
+
+        This emits:
+        1. Vocabulary bootstrap triples (idempotent, safe to re-emit)
+        2. Document metadata as PROV-O triples
+        """
+        logger.debug(f"Emitting document provenance for {document.id}")
+
+        # Build document URI and provenance triples
+        doc_uri = document_uri(document.id)
+
+        # Get page count for PDFs (if available from document metadata)
+        page_count = None
+        if document.kind == "application/pdf":
+            # Page count might be in document metadata triples
+            # For now, we don't have it at this point - it gets determined during extraction
+            pass
+
+        # Build document metadata triples
+        prov_triples = document_triples(
+            doc_uri=doc_uri,
+            title=document.title if document.title else None,
+            mime_type=document.kind,
+        )
+
+        # Include any existing metadata triples from the document
+        if document.metadata:
+            prov_triples.extend(document.metadata)
+
+        # Get vocabulary bootstrap triples (idempotent)
+        vocab_triples = get_vocabulary_triples()
+
+        # Combine all triples
+        all_triples = vocab_triples + prov_triples
+
+        # Create publisher and emit
+        triples_pub = Publisher(
+            self.pubsub, triples_queue, schema=Triples
+        )
+
+        try:
+            await triples_pub.start()
+
+            triples_msg = Triples(
+                metadata=Metadata(
+                    id=doc_uri,
+                    metadata=[],
+                    user=processing.user,
+                    collection=processing.collection,
+                ),
+                triples=all_triples,
+            )
+
+            await triples_pub.send(None, triples_msg)
+            logger.debug(f"Emitted {len(all_triples)} provenance triples for {document.id}")
+
+        finally:
+            await triples_pub.stop()
+
     async def load_document(self, document, processing, content):
 
         logger.debug("Ready for document processing...")
@@ -301,6 +367,12 @@ class Processor(AsyncProcessor):
 
         q = flow["interfaces"][kind]
 
+        # Emit document provenance to knowledge graph
+        if "triples-store" in flow["interfaces"]:
+            await self.emit_document_provenance(
+                document, processing, flow["interfaces"]["triples-store"]
+            )
+
         if kind == "text-load":
             # For large text documents, send document_id for streaming retrieval
             if len(content) >= self.STREAMING_THRESHOLD:
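
Putting the pieces together: once these edges are in the graph, a
consumer can walk from any extracted fact back to its source document.
A minimal sketch of that walk over in-memory (s, p, o) tuples, reusing
the illustrative predicate from the example near the top of this page
(the real code queries the triples store rather than a Python list):

# Sketch: follow prov:wasDerivedFrom edges upward until no parent remains.
PROV_WAS_DERIVED_FROM = "http://www.w3.org/ns/prov#wasDerivedFrom"

def trace_to_source(start_uri, triples):
    """Return the derivation chain [start, ..., root document]."""
    chain = [start_uri]
    current = start_uri
    while True:
        parents = [o for (s, p, o) in triples
                   if s == current and p == PROV_WAS_DERIVED_FROM]
        if not parents:
            return chain
        current = parents[0]
        chain.append(current)

# e.g. trace_to_source("doc123/p3/c2", edges)
#   -> ["doc123/p3/c2", "doc123/p3", "doc123"]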