Extract-time provenance (#661)

1. Shared Provenance Module - URI generators, namespace constants,
   triple builders, vocabulary bootstrap (ID/URI scheme sketched
   after this list)
2. Librarian - Emits document metadata to graph on processing
   initiation (vocabulary bootstrap + PROV-O triples)
3. PDF Extractor - Saves pages as child documents, emits parent-child
   provenance edges, forwards page IDs
4. Chunker - Saves chunks as child documents, emits provenance edges,
   forwards chunk ID + content
5. Knowledge Extractors (both definitions and relationships):
   - Link entities to chunks via SUBJECT_OF (not top-level document)
   - Removed duplicate metadata emission (now handled by librarian)
   - Get chunk_doc_id and chunk_uri from incoming Chunk message
6. Embedding Provenance:
   - EntityContext schema has chunk_id field
   - EntityEmbeddings schema has chunk_id field
   - Definitions extractor sets chunk_id when creating EntityContext
   - Graph embeddings processor passes chunk_id through to
     EntityEmbeddings
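
For orientation, a minimal sketch of the document/page/chunk ID and URI
scheme implied above. The "/p{n}" and "/c{m}" ID patterns come straight
from the diffs below; the namespace prefix and the helper bodies are
assumed placeholders, not the shared provenance module's actual code.

# Illustrative sketch only -- PROV_NS is an assumed placeholder namespace,
# and these helper bodies are guesses at what the provenance module does.
PROV_NS = "http://example.org/prov/"

def document_uri(doc_id: str) -> str:
    return f"{PROV_NS}{doc_id}"

def page_uri(doc_id: str, page_num: int) -> str:
    # Pages of a PDF use the "{doc}/p{n}" pattern
    return f"{PROV_NS}{doc_id}/p{page_num}"

def chunk_uri_from_page(doc_id: str, page_num: int, chunk_index: int) -> str:
    # Chunk of a PDF page: "{doc}/p{n}/c{m}"
    return f"{PROV_NS}{doc_id}/p{page_num}/c{chunk_index}"

def chunk_uri_from_doc(doc_id: str, chunk_index: int) -> str:
    # Chunk of a plain text document: "{doc}/c{m}"
    return f"{PROV_NS}{doc_id}/c{chunk_index}"

# Example: chunk 2 of page 3 of document "doc123"
assert chunk_uri_from_page("doc123", 3, 2).endswith("doc123/p3/c2")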

Provenance Flow:
Document → Page (PDF) → Chunk → Extracted Facts/Embeddings
    ↓           ↓          ↓              ↓
  librarian  librarian  librarian    (chunk_id reference)
  + graph    + graph    + graph

Each artifact is stored in librarian with parent-child linking, and PROV-O
edges are emitted to the knowledge graph for full traceability from any
extracted fact back to its source document.
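
To make the traceability claim concrete, a hedged sketch of the chain a
consumer can walk: extracted entity → chunk → page → source document. The
edge names below are assumed shorthand for the SUBJECT_OF and PROV-O
predicates the pipeline actually emits, and the IDs are invented examples.

# Hedged sketch: walk provenance edges from an extracted entity back to
# its source document. Predicate names and IDs are illustrative only.
SUBJECT_OF = "subject-of"          # entity -> chunk (emitted by the extractors)
DERIVED_FROM = "was-derived-from"  # chunk -> page -> document (PROV-O style)

edges = {
    ("ent:Acme", SUBJECT_OF): "doc123/p3/c2",
    ("doc123/p3/c2", DERIVED_FROM): "doc123/p3",
    ("doc123/p3", DERIVED_FROM): "doc123",
}

def trace_to_source(entity):
    """Follow the subject-of edge, then derived-from edges, to the root."""
    path = [entity]
    node = edges.get((entity, SUBJECT_OF))
    while node is not None:
        path.append(node)
        node = edges.get((node, DERIVED_FROM))
    return path

print(trace_to_source("ent:Acme"))
# ['ent:Acme', 'doc123/p3/c2', 'doc123/p3', 'doc123']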

Also updates tests.
cybermaggedon committed 2026-03-05 18:36:10 +00:00 (via GitHub)
commit cd5580be59, parent d8f0a576af
20 changed files with 1601 additions and 59 deletions


@@ -8,9 +8,18 @@ import logging

 from langchain_text_splitters import RecursiveCharacterTextSplitter
 from prometheus_client import Histogram

-from ... schema import TextDocument, Chunk
+from ... schema import TextDocument, Chunk, Metadata, Triples
 from ... base import ChunkingService, ConsumerSpec, ProducerSpec
+from ... provenance import (
+    page_uri, chunk_uri_from_page, chunk_uri_from_doc,
+    derived_entity_triples, document_uri,
+)
+
+# Component identification for provenance
+COMPONENT_NAME = "chunker"
+COMPONENT_VERSION = "1.0.0"

 # Module logger
 logger = logging.getLogger(__name__)
@@ -63,6 +72,13 @@ class Processor(ChunkingService):
             )
         )

+        self.register_specification(
+            ProducerSpec(
+                name = "triples",
+                schema = Triples,
+            )
+        )
+
         logger.info("Recursive chunker initialized")

     async def on_message(self, msg, consumer, flow):
@@ -96,21 +112,99 @@ class Processor(ChunkingService):

         texts = text_splitter.create_documents([text])

+        # Get parent document ID for provenance linking
+        parent_doc_id = v.document_id or v.metadata.id
+
+        # Determine if parent is a page (from PDF) or source document (text)
+        # Check if parent_doc_id contains "/p" which indicates a page
+        is_from_page = "/p" in parent_doc_id
+
+        # Extract the root document ID for chunk URI generation
+        if is_from_page:
+            # Parent is a page like "doc123/p3", extract page number
+            parts = parent_doc_id.rsplit("/p", 1)
+            root_doc_id = parts[0]
+            page_num = int(parts[1]) if len(parts) > 1 else 1
+        else:
+            root_doc_id = parent_doc_id
+            page_num = None
+
+        # Track character offset for provenance
+        char_offset = 0
+
         for ix, chunk in enumerate(texts):

+            chunk_index = ix + 1  # 1-indexed
+
             logger.debug(f"Created chunk of size {len(chunk.page_content)}")

+            # Generate chunk document ID
+            if is_from_page:
+                chunk_doc_id = f"{root_doc_id}/p{page_num}/c{chunk_index}"
+                chunk_uri = chunk_uri_from_page(root_doc_id, page_num, chunk_index)
+                parent_uri = page_uri(root_doc_id, page_num)
+            else:
+                chunk_doc_id = f"{root_doc_id}/c{chunk_index}"
+                chunk_uri = chunk_uri_from_doc(root_doc_id, chunk_index)
+                parent_uri = document_uri(root_doc_id)
+
+            chunk_content = chunk.page_content.encode("utf-8")
+            chunk_length = len(chunk.page_content)
+
+            # Save chunk to librarian as child document
+            await self.save_child_document(
+                doc_id=chunk_doc_id,
+                parent_id=parent_doc_id,
+                user=v.metadata.user,
+                content=chunk_content,
+                document_type="chunk",
+                title=f"Chunk {chunk_index}",
+            )
+
+            # Emit provenance triples
+            prov_triples = derived_entity_triples(
+                entity_uri=chunk_uri,
+                parent_uri=parent_uri,
+                component_name=COMPONENT_NAME,
+                component_version=COMPONENT_VERSION,
+                label=f"Chunk {chunk_index}",
+                chunk_index=chunk_index,
+                char_offset=char_offset,
+                char_length=chunk_length,
+                chunk_size=chunk_size,
+                chunk_overlap=chunk_overlap,
+            )
+
+            await flow("triples").send(Triples(
+                metadata=Metadata(
+                    id=chunk_uri,
+                    metadata=[],
+                    user=v.metadata.user,
+                    collection=v.metadata.collection,
+                ),
+                triples=prov_triples,
+            ))
+
+            # Forward chunk ID + content (post-chunker optimization)
             r = Chunk(
-                metadata=v.metadata,
-                chunk=chunk.page_content.encode("utf-8"),
+                metadata=Metadata(
+                    id=chunk_uri,
+                    metadata=[],
+                    user=v.metadata.user,
+                    collection=v.metadata.collection,
+                ),
+                chunk=chunk_content,
+                document_id=chunk_doc_id,
             )

             __class__.chunk_metric.labels(
                 id=consumer.id, flow=consumer.flow
-            ).observe(len(chunk.page_content))
+            ).observe(chunk_length)

             await flow("output").send(r)

+            # Update character offset (approximate, doesn't account for overlap)
+            char_offset += chunk_length - chunk_overlap
+
         logger.debug("Document chunking complete")

     @staticmethod


@@ -16,11 +16,20 @@ import uuid

 from langchain_community.document_loaders import PyPDFLoader

 from ... schema import Document, TextDocument, Metadata
-from ... schema import LibrarianRequest, LibrarianResponse
+from ... schema import LibrarianRequest, LibrarianResponse, DocumentMetadata
 from ... schema import librarian_request_queue, librarian_response_queue
+from ... schema import Triples
 from ... base import FlowProcessor, ConsumerSpec, ProducerSpec
 from ... base import Consumer, Producer, ConsumerMetrics, ProducerMetrics
+from ... provenance import (
+    document_uri, page_uri, derived_entity_triples,
+)
+
+# Component identification for provenance
+COMPONENT_NAME = "pdf-decoder"
+COMPONENT_VERSION = "1.0.0"

 # Module logger
 logger = logging.getLogger(__name__)
@@ -57,6 +66,13 @@ class Processor(FlowProcessor):
             )
         )

+        self.register_specification(
+            ProducerSpec(
+                name = "triples",
+                schema = Triples,
+            )
+        )
+
         # Librarian client for fetching document content
         librarian_request_q = params.get(
             "librarian_request_queue", default_librarian_request_queue
@@ -148,6 +164,66 @@ class Processor(FlowProcessor):
             self.pending_requests.pop(request_id, None)
             raise RuntimeError(f"Timeout fetching document {document_id}")

+    async def save_child_document(self, doc_id, parent_id, user, content,
+                                  document_type="page", title=None, timeout=120):
+        """
+        Save a child document to the librarian.
+
+        Args:
+            doc_id: ID for the new child document
+            parent_id: ID of the parent document
+            user: User ID
+            content: Document content (bytes)
+            document_type: Type of document ("page", "chunk", etc.)
+            title: Optional title
+            timeout: Request timeout in seconds
+
+        Returns:
+            The document ID on success
+        """
+
+        import base64
+
+        request_id = str(uuid.uuid4())
+
+        doc_metadata = DocumentMetadata(
+            id=doc_id,
+            user=user,
+            kind="text/plain",
+            title=title or doc_id,
+            parent_id=parent_id,
+            document_type=document_type,
+        )
+
+        request = LibrarianRequest(
+            operation="add-child-document",
+            document_metadata=doc_metadata,
+            content=base64.b64encode(content).decode("utf-8"),
+        )
+
+        # Create future for response
+        future = asyncio.get_event_loop().create_future()
+        self.pending_requests[request_id] = future
+
+        try:
+            # Send request
+            await self.librarian_request_producer.send(
+                request, properties={"id": request_id}
+            )
+
+            # Wait for response
+            response = await asyncio.wait_for(future, timeout=timeout)
+
+            if response.error:
+                raise RuntimeError(
+                    f"Librarian error saving child document: {response.error.type}: {response.error.message}"
+                )
+
+            return doc_id
+
+        except asyncio.TimeoutError:
+            self.pending_requests.pop(request_id, None)
+            raise RuntimeError(f"Timeout saving child document {doc_id}")
+
     async def on_message(self, msg, consumer, flow):

         logger.debug("PDF message received")
@@ -187,13 +263,62 @@ class Processor(FlowProcessor):

             loader = PyPDFLoader(temp_path)
             pages = loader.load()

+            # Get the source document ID
+            source_doc_id = v.document_id or v.metadata.id
+
             for ix, page in enumerate(pages):

-                logger.debug(f"Processing page {ix}")
+                page_num = ix + 1  # 1-indexed page numbers
+
+                logger.debug(f"Processing page {page_num}")
+
+                # Generate page document ID
+                page_doc_id = f"{source_doc_id}/p{page_num}"
+                page_content = page.page_content.encode("utf-8")
+
+                # Save page as child document in librarian
+                await self.save_child_document(
+                    doc_id=page_doc_id,
+                    parent_id=source_doc_id,
+                    user=v.metadata.user,
+                    content=page_content,
+                    document_type="page",
+                    title=f"Page {page_num}",
+                )
+
+                # Emit provenance triples
+                doc_uri = document_uri(source_doc_id)
+                pg_uri = page_uri(source_doc_id, page_num)
+
+                prov_triples = derived_entity_triples(
+                    entity_uri=pg_uri,
+                    parent_uri=doc_uri,
+                    component_name=COMPONENT_NAME,
+                    component_version=COMPONENT_VERSION,
+                    label=f"Page {page_num}",
+                    page_number=page_num,
+                )
+
+                await flow("triples").send(Triples(
+                    metadata=Metadata(
+                        id=pg_uri,
+                        metadata=[],
+                        user=v.metadata.user,
+                        collection=v.metadata.collection,
+                    ),
+                    triples=prov_triples,
+                ))
+
+                # Forward page document ID to chunker
+                # Chunker will fetch content from librarian
                 r = TextDocument(
-                    metadata=v.metadata,
-                    text=page.page_content.encode("utf-8"),
+                    metadata=Metadata(
+                        id=pg_uri,
+                        metadata=[],
+                        user=v.metadata.user,
+                        collection=v.metadata.collection,
+                    ),
+                    document_id=page_doc_id,
+                    text=b"",  # Empty, chunker will fetch from librarian
                 )

                 await flow("output").send(r)


@@ -71,7 +71,8 @@ class Processor(FlowProcessor):

             entities.append(
                 EntityEmbeddings(
                     entity=entity.entity,
-                    vectors=vectors
+                    vectors=vectors,
+                    chunk_id=entity.chunk_id,  # Provenance: source chunk
                 )
             )


@@ -128,10 +128,12 @@ class Processor(FlowProcessor):

         triples = []
         entities = []

-        # FIXME: Putting metadata into triples store is duplicated in
-        # relationships extractor too
-        for t in v.metadata.metadata:
-            triples.append(t)
+        # Get chunk document ID for provenance linking
+        chunk_doc_id = v.document_id if v.document_id else v.metadata.id
+        chunk_uri = v.metadata.id  # The URI form for the chunk
+
+        # Note: Document metadata is now emitted once by librarian at processing
+        # initiation, so we don't need to duplicate it here.

         for defn in defs:
@@ -159,22 +161,27 @@ class Processor(FlowProcessor):
                 s=s_value, p=DEFINITION_VALUE, o=o_value
             ))

+            # Link entity to chunk (not top-level document)
             triples.append(Triple(
                 s=s_value,
                 p=SUBJECT_OF_VALUE,
-                o=Term(type=IRI, iri=v.metadata.id)
+                o=Term(type=IRI, iri=chunk_uri)
             ))

             # Output entity name as context for direct name matching
+            # Include chunk_id for embedding provenance
             entities.append(EntityContext(
                 entity=s_value,
                 context=s,
+                chunk_id=chunk_doc_id,
             ))

             # Output definition as context for semantic matching
+            # Include chunk_id for embedding provenance
             entities.append(EntityContext(
                 entity=s_value,
                 context=defn["definition"],
+                chunk_id=chunk_doc_id,
             ))

         # Send triples in batches


@@ -109,10 +109,12 @@ class Processor(FlowProcessor):

         triples = []

-        # FIXME: Putting metadata into triples store is duplicated in
-        # relationships extractor too
-        for t in v.metadata.metadata:
-            triples.append(t)
+        # Get chunk document ID for provenance linking
+        chunk_doc_id = v.document_id if v.document_id else v.metadata.id
+        chunk_uri = v.metadata.id  # The URI form for the chunk
+
+        # Note: Document metadata is now emitted once by librarian at processing
+        # initiation, so we don't need to duplicate it here.

         for rel in rels:
@@ -168,19 +170,19 @@ class Processor(FlowProcessor):
                 o=Term(type=LITERAL, value=str(o))
             ))

-            # 'Subject of' for s
+            # Link entity to chunk (not top-level document)
             triples.append(Triple(
                 s=s_value,
                 p=SUBJECT_OF_VALUE,
-                o=Term(type=IRI, iri=v.metadata.id)
+                o=Term(type=IRI, iri=chunk_uri)
             ))

             if rel["object-entity"]:

-                # 'Subject of' for o
+                # Link object entity to chunk
                 triples.append(Triple(
                     s=o_value,
                     p=SUBJECT_OF_VALUE,
-                    o=Term(type=IRI, iri=v.metadata.id)
+                    o=Term(type=IRI, iri=chunk_uri)
                 ))

         # Send triples in batches


@@ -609,8 +609,10 @@ class Librarian:
         ):
             raise RequestError("Document already exists")

-        # Ensure document_type is set to "extracted"
-        request.document_metadata.document_type = "extracted"
+        # Set document_type if not specified by caller
+        # Valid types: "page", "chunk", or "extracted" (legacy)
+        if not request.document_metadata.document_type or request.document_metadata.document_type == "source":
+            request.document_metadata.document_type = "extracted"

         # Create object ID for blob
         object_id = uuid.uuid4()


@@ -23,9 +23,14 @@ from .. schema import config_request_queue, config_response_queue

 from .. schema import Document, Metadata
 from .. schema import TextDocument, Metadata
 from .. schema import Triples
 from .. exceptions import RequestError
+from .. provenance import (
+    document_uri, document_triples, get_vocabulary_triples,
+)

 from . librarian import Librarian
 from . collection_manager import CollectionManager
@@ -281,6 +286,67 @@ class Processor(AsyncProcessor):

     # Threshold for sending document_id instead of inline content (2MB)
     STREAMING_THRESHOLD = 2 * 1024 * 1024

+    async def emit_document_provenance(self, document, processing, triples_queue):
+        """
+        Emit document provenance metadata to the knowledge graph.
+
+        This emits:
+        1. Vocabulary bootstrap triples (idempotent, safe to re-emit)
+        2. Document metadata as PROV-O triples
+        """
+
+        logger.debug(f"Emitting document provenance for {document.id}")
+
+        # Build document URI and provenance triples
+        doc_uri = document_uri(document.id)
+
+        # Get page count for PDFs (if available from document metadata)
+        page_count = None
+        if document.kind == "application/pdf":
+            # Page count might be in document metadata triples
+            # For now, we don't have it at this point - it gets determined during extraction
+            pass
+
+        # Build document metadata triples
+        prov_triples = document_triples(
+            doc_uri=doc_uri,
+            title=document.title if document.title else None,
+            mime_type=document.kind,
+        )
+
+        # Include any existing metadata triples from the document
+        if document.metadata:
+            prov_triples.extend(document.metadata)
+
+        # Get vocabulary bootstrap triples (idempotent)
+        vocab_triples = get_vocabulary_triples()
+
+        # Combine all triples
+        all_triples = vocab_triples + prov_triples
+
+        # Create publisher and emit
+        triples_pub = Publisher(
+            self.pubsub, triples_queue, schema=Triples
+        )
+
+        try:
+            await triples_pub.start()
+
+            triples_msg = Triples(
+                metadata=Metadata(
+                    id=doc_uri,
+                    metadata=[],
+                    user=processing.user,
+                    collection=processing.collection,
+                ),
+                triples=all_triples,
+            )
+
+            await triples_pub.send(None, triples_msg)
+
+            logger.debug(f"Emitted {len(all_triples)} provenance triples for {document.id}")
+
+        finally:
+            await triples_pub.stop()
+
     async def load_document(self, document, processing, content):

         logger.debug("Ready for document processing...")
@@ -301,6 +367,12 @@ class Processor(AsyncProcessor):

             q = flow["interfaces"][kind]

+            # Emit document provenance to knowledge graph
+            if "triples-store" in flow["interfaces"]:
+                await self.emit_document_provenance(
+                    document, processing, flow["interfaces"]["triples-store"]
+                )
+
             if kind == "text-load":

                 # For large text documents, send document_id for streaming retrieval
                 if len(content) >= self.STREAMING_THRESHOLD: