Extract-time provenance (#661)

1. Shared Provenance Module - URI generators, namespace constants,
   triple builders, vocabulary bootstrap
2. Librarian - Emits document metadata to graph on processing
   initiation (vocabulary bootstrap + PROV-O triples)
3. PDF Extractor - Saves pages as child documents, emits parent-child
   provenance edges, forwards page IDs
4. Chunker - Saves chunks as child documents, emits provenance edges,
   forwards chunk ID + content
5. Knowledge Extractors (both definitions and relationships):
   - Link entities to chunks via SUBJECT_OF (not top-level document)
   - Removed duplicate metadata emission (now handled by librarian)
   - Get chunk_doc_id and chunk_uri from incoming Chunk message
6. Embedding Provenance:
   - EntityContext schema has chunk_id field
   - EntityEmbeddings schema has chunk_id field
   - Definitions extractor sets chunk_id when creating EntityContext
   - Graph embeddings processor passes chunk_id through to
     EntityEmbeddings

Provenance Flow:
Document → Page (PDF) → Chunk → Extracted Facts/Embeddings
    ↓           ↓          ↓              ↓
  librarian  librarian  librarian    (chunk_id reference)
  + graph    + graph    + graph

Each artifact is stored in librarian with parent-child linking, and PROV-O
edges are emitted to the knowledge graph for full traceability from any
extracted fact back to its source document.

Also updates the affected tests.
This commit is contained in:
cybermaggedon 2026-03-05 18:36:10 +00:00 committed by GitHub
parent d8f0a576af
commit cd5580be59
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
20 changed files with 1601 additions and 59 deletions

View file

@ -15,7 +15,7 @@ from .consumer import Consumer
from .producer import Producer
from .metrics import ConsumerMetrics, ProducerMetrics
from ..schema import LibrarianRequest, LibrarianResponse
from ..schema import LibrarianRequest, LibrarianResponse, DocumentMetadata
from ..schema import librarian_request_queue, librarian_response_queue
# Module logger
@ -135,6 +135,67 @@ class ChunkingService(FlowProcessor):
self.pending_requests.pop(request_id, None)
raise RuntimeError(f"Timeout fetching document {document_id}")
async def save_child_document(self, doc_id, parent_id, user, content,
                              document_type="chunk", title=None, timeout=120):
    """
    Save a child document (chunk) to the librarian.

    Sends an "add-child-document" request and waits up to `timeout`
    seconds for the matching response, correlated by a per-request UUID
    carried in the message properties.

    Args:
        doc_id: ID for the new child document
        parent_id: ID of the parent document
        user: User ID
        content: Document content (bytes or str; str is UTF-8 encoded)
        document_type: Type of document ("chunk", etc.)
        title: Optional title (defaults to doc_id)
        timeout: Request timeout in seconds

    Returns:
        The document ID on success

    Raises:
        RuntimeError: if the librarian reports an error or the request
            times out.
    """
    request_id = str(uuid.uuid4())

    if isinstance(content, str):
        content = content.encode("utf-8")

    doc_metadata = DocumentMetadata(
        id=doc_id,
        user=user,
        kind="text/plain",
        title=title or doc_id,
        parent_id=parent_id,
        document_type=document_type,
    )

    request = LibrarianRequest(
        operation="add-child-document",
        document_metadata=doc_metadata,
        content=base64.b64encode(content).decode("utf-8"),
    )

    # Create future for response; presumably the response consumer
    # resolves it by matching the "id" property on the reply.
    # get_running_loop() is the non-deprecated call inside a coroutine.
    future = asyncio.get_running_loop().create_future()
    self.pending_requests[request_id] = future

    try:
        # Send request
        await self.librarian_request_producer.send(
            request, properties={"id": request_id}
        )

        # Wait for response
        response = await asyncio.wait_for(future, timeout=timeout)

        if response.error:
            raise RuntimeError(
                f"Librarian error saving chunk: {response.error.type}: {response.error.message}"
            )

        return doc_id

    except asyncio.TimeoutError:
        raise RuntimeError(f"Timeout saving chunk {doc_id}")

    finally:
        # Always drop the pending entry so failed sends / librarian error
        # responses do not leak futures; pop(..., None) is a no-op if the
        # response handler already removed it.
        self.pending_requests.pop(request_id, None)
async def get_document_text(self, doc):
"""
Get text content from a TextDocument, fetching from librarian if needed.

View file

@ -0,0 +1,110 @@
"""
Provenance module for extraction-time provenance support.
Provides helpers for:
- URI generation for documents, pages, chunks, activities, statements
- PROV-O triple building for provenance metadata
- Vocabulary bootstrap for per-collection initialization
Usage example:
from trustgraph.provenance import (
document_uri, page_uri, chunk_uri_from_page,
document_triples, derived_entity_triples,
get_vocabulary_triples,
)
# Generate URIs
doc_uri = document_uri("my-doc-123")
page_uri = page_uri("my-doc-123", page_number=1)
# Build provenance triples
triples = document_triples(
doc_uri,
title="My Document",
mime_type="application/pdf",
page_count=10,
)
# Get vocabulary bootstrap triples (once per collection)
vocab_triples = get_vocabulary_triples()
"""
# URI generation
from . uris import (
TRUSTGRAPH_BASE,
document_uri,
page_uri,
chunk_uri_from_page,
chunk_uri_from_doc,
activity_uri,
statement_uri,
agent_uri,
)
# Namespace constants
from . namespaces import (
# PROV-O
PROV, PROV_ENTITY, PROV_ACTIVITY, PROV_AGENT,
PROV_WAS_DERIVED_FROM, PROV_WAS_GENERATED_BY,
PROV_USED, PROV_WAS_ASSOCIATED_WITH, PROV_STARTED_AT_TIME,
# Dublin Core
DC, DC_TITLE, DC_SOURCE, DC_DATE, DC_CREATOR,
# RDF/RDFS
RDF, RDF_TYPE, RDFS, RDFS_LABEL,
# TrustGraph
TG, TG_REIFIES, TG_PAGE_COUNT, TG_MIME_TYPE, TG_PAGE_NUMBER,
TG_CHUNK_INDEX, TG_CHAR_OFFSET, TG_CHAR_LENGTH,
TG_CHUNK_SIZE, TG_CHUNK_OVERLAP, TG_COMPONENT_VERSION,
TG_LLM_MODEL, TG_ONTOLOGY, TG_EMBEDDING_MODEL,
TG_SOURCE_TEXT, TG_SOURCE_CHAR_OFFSET, TG_SOURCE_CHAR_LENGTH,
)
# Triple builders
from . triples import (
document_triples,
derived_entity_triples,
triple_provenance_triples,
)
# Vocabulary bootstrap
from . vocabulary import (
get_vocabulary_triples,
PROV_CLASS_LABELS,
PROV_PREDICATE_LABELS,
DC_PREDICATE_LABELS,
TG_PREDICATE_LABELS,
)
__all__ = [
# URIs
"TRUSTGRAPH_BASE",
"document_uri",
"page_uri",
"chunk_uri_from_page",
"chunk_uri_from_doc",
"activity_uri",
"statement_uri",
"agent_uri",
# Namespaces
"PROV", "PROV_ENTITY", "PROV_ACTIVITY", "PROV_AGENT",
"PROV_WAS_DERIVED_FROM", "PROV_WAS_GENERATED_BY",
"PROV_USED", "PROV_WAS_ASSOCIATED_WITH", "PROV_STARTED_AT_TIME",
"DC", "DC_TITLE", "DC_SOURCE", "DC_DATE", "DC_CREATOR",
"RDF", "RDF_TYPE", "RDFS", "RDFS_LABEL",
"TG", "TG_REIFIES", "TG_PAGE_COUNT", "TG_MIME_TYPE", "TG_PAGE_NUMBER",
"TG_CHUNK_INDEX", "TG_CHAR_OFFSET", "TG_CHAR_LENGTH",
"TG_CHUNK_SIZE", "TG_CHUNK_OVERLAP", "TG_COMPONENT_VERSION",
"TG_LLM_MODEL", "TG_ONTOLOGY", "TG_EMBEDDING_MODEL",
"TG_SOURCE_TEXT", "TG_SOURCE_CHAR_OFFSET", "TG_SOURCE_CHAR_LENGTH",
# Triple builders
"document_triples",
"derived_entity_triples",
"triple_provenance_triples",
# Vocabulary
"get_vocabulary_triples",
"PROV_CLASS_LABELS",
"PROV_PREDICATE_LABELS",
"DC_PREDICATE_LABELS",
"TG_PREDICATE_LABELS",
]

View file

@ -0,0 +1,48 @@
"""
RDF namespace constants for provenance.
Includes PROV-O, Dublin Core, and TrustGraph namespace URIs.
"""
# PROV-O namespace (W3C Provenance Ontology)
PROV = "http://www.w3.org/ns/prov#"
PROV_ENTITY = PROV + "Entity"
PROV_ACTIVITY = PROV + "Activity"
PROV_AGENT = PROV + "Agent"
PROV_WAS_DERIVED_FROM = PROV + "wasDerivedFrom"
PROV_WAS_GENERATED_BY = PROV + "wasGeneratedBy"
PROV_USED = PROV + "used"
PROV_WAS_ASSOCIATED_WITH = PROV + "wasAssociatedWith"
PROV_STARTED_AT_TIME = PROV + "startedAtTime"
# Dublin Core namespace
DC = "http://purl.org/dc/elements/1.1/"
DC_TITLE = DC + "title"
DC_SOURCE = DC + "source"
DC_DATE = DC + "date"
DC_CREATOR = DC + "creator"
# RDF/RDFS namespace (also in rdf.py, but included here for completeness)
RDF = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
RDF_TYPE = RDF + "type"
RDFS = "http://www.w3.org/2000/01/rdf-schema#"
RDFS_LABEL = RDFS + "label"
# TrustGraph namespace for custom predicates
TG = "https://trustgraph.ai/ns/"
TG_REIFIES = TG + "reifies"
TG_PAGE_COUNT = TG + "pageCount"
TG_MIME_TYPE = TG + "mimeType"
TG_PAGE_NUMBER = TG + "pageNumber"
TG_CHUNK_INDEX = TG + "chunkIndex"
TG_CHAR_OFFSET = TG + "charOffset"
TG_CHAR_LENGTH = TG + "charLength"
TG_CHUNK_SIZE = TG + "chunkSize"
TG_CHUNK_OVERLAP = TG + "chunkOverlap"
TG_COMPONENT_VERSION = TG + "componentVersion"
TG_LLM_MODEL = TG + "llmModel"
TG_ONTOLOGY = TG + "ontology"
TG_EMBEDDING_MODEL = TG + "embeddingModel"
TG_SOURCE_TEXT = TG + "sourceText"
TG_SOURCE_CHAR_OFFSET = TG + "sourceCharOffset"
TG_SOURCE_CHAR_LENGTH = TG + "sourceCharLength"

View file

@ -0,0 +1,251 @@
"""
Helper functions to build PROV-O triples for extraction-time provenance.
"""
from datetime import datetime, timezone
from typing import List, Optional

from ..schema import Triple, Term, IRI, LITERAL

from .namespaces import (
    RDF_TYPE, RDFS_LABEL,
    PROV_ENTITY, PROV_ACTIVITY, PROV_AGENT,
    PROV_WAS_DERIVED_FROM, PROV_WAS_GENERATED_BY,
    PROV_USED, PROV_WAS_ASSOCIATED_WITH, PROV_STARTED_AT_TIME,
    DC_TITLE, DC_SOURCE, DC_DATE, DC_CREATOR,
    TG_PAGE_COUNT, TG_MIME_TYPE, TG_PAGE_NUMBER,
    TG_CHUNK_INDEX, TG_CHAR_OFFSET, TG_CHAR_LENGTH,
    TG_CHUNK_SIZE, TG_CHUNK_OVERLAP, TG_COMPONENT_VERSION,
    TG_LLM_MODEL, TG_ONTOLOGY, TG_REIFIES,
)
from .uris import activity_uri, agent_uri
def _iri(uri: str) -> Term:
    """Wrap *uri* as an IRI Term."""
    return Term(type=IRI, iri=uri)
def _literal(value) -> Term:
    """Wrap *value* as a literal Term; non-string values are coerced with str()."""
    return Term(type=LITERAL, value=str(value))
def _triple(s: str, p: str, o_term: Term) -> Triple:
    """Create a triple with IRI subject and predicate and a pre-built object term."""
    return Triple(s=_iri(s), p=_iri(p), o=o_term)
def document_triples(
    doc_uri: str,
    title: Optional[str] = None,
    source: Optional[str] = None,
    date: Optional[str] = None,
    creator: Optional[str] = None,
    page_count: Optional[int] = None,
    mime_type: Optional[str] = None,
) -> List[Triple]:
    """
    Build triples describing a source document entity.

    The document is always typed as prov:Entity; every other triple is
    emitted only when the corresponding argument is supplied.

    Args:
        doc_uri: The document URI (from uris.document_uri)
        title: Document title (also emitted as rdfs:label)
        source: Source URL/path (emitted as an IRI object)
        date: Document date
        creator: Author/creator
        page_count: Number of pages (for PDFs)
        mime_type: MIME type

    Returns:
        List of Triple objects
    """
    out = [_triple(doc_uri, RDF_TYPE, _iri(PROV_ENTITY))]

    # The title doubles as the graph display label.
    if title:
        out.append(_triple(doc_uri, DC_TITLE, _literal(title)))
        out.append(_triple(doc_uri, RDFS_LABEL, _literal(title)))

    # dc:source points at another resource, so it is an IRI, not a literal.
    if source:
        out.append(_triple(doc_uri, DC_SOURCE, _iri(source)))

    for predicate, value in ((DC_DATE, date), (DC_CREATOR, creator)):
        if value:
            out.append(_triple(doc_uri, predicate, _literal(value)))

    # page_count may legitimately be 0, so compare against None explicitly.
    if page_count is not None:
        out.append(_triple(doc_uri, TG_PAGE_COUNT, _literal(page_count)))

    if mime_type:
        out.append(_triple(doc_uri, TG_MIME_TYPE, _literal(mime_type)))

    return out
def derived_entity_triples(
    entity_uri: str,
    parent_uri: str,
    component_name: str,
    component_version: str,
    label: Optional[str] = None,
    page_number: Optional[int] = None,
    chunk_index: Optional[int] = None,
    char_offset: Optional[int] = None,
    char_length: Optional[int] = None,
    chunk_size: Optional[int] = None,
    chunk_overlap: Optional[int] = None,
    timestamp: Optional[str] = None,
) -> List[Triple]:
    """
    Build triples for a derived entity (page or chunk) with full PROV-O provenance.

    Creates:
    - Entity declaration
    - wasDerivedFrom relationship to parent
    - Activity for the extraction
    - Agent for the component

    Positional metadata (page number, chunk index, offsets) attaches to the
    entity; configuration metadata (chunk size/overlap, component version)
    attaches to the activity.

    Args:
        entity_uri: URI of the derived entity (page or chunk)
        parent_uri: URI of the parent entity
        component_name: Name of TG component (e.g., "pdf-extractor", "chunker")
        component_version: Version of the component
        label: Human-readable label
        page_number: Page number (for pages)
        chunk_index: Chunk index (for chunks)
        char_offset: Character offset in parent (for chunks)
        char_length: Character length (for chunks)
        chunk_size: Configured chunk size (for chunking activity)
        chunk_overlap: Configured chunk overlap (for chunking activity)
        timestamp: ISO timestamp (defaults to now, UTC)

    Returns:
        List of Triple objects
    """
    if timestamp is None:
        # datetime.utcnow() is deprecated (Python 3.12); build the identical
        # naive-UTC ISO-8601 "...Z" string from an aware datetime instead.
        timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"

    act_uri = activity_uri()            # fresh UUID-based activity per call
    agt_uri = agent_uri(component_name)

    triples = [
        # Entity declaration
        _triple(entity_uri, RDF_TYPE, _iri(PROV_ENTITY)),
        # Derivation from parent
        _triple(entity_uri, PROV_WAS_DERIVED_FROM, _iri(parent_uri)),
        # Generation by activity
        _triple(entity_uri, PROV_WAS_GENERATED_BY, _iri(act_uri)),
        # Activity declaration
        _triple(act_uri, RDF_TYPE, _iri(PROV_ACTIVITY)),
        _triple(act_uri, PROV_USED, _iri(parent_uri)),
        _triple(act_uri, PROV_WAS_ASSOCIATED_WITH, _iri(agt_uri)),
        _triple(act_uri, PROV_STARTED_AT_TIME, _literal(timestamp)),
        _triple(act_uri, TG_COMPONENT_VERSION, _literal(component_version)),
        # Agent declaration
        _triple(agt_uri, RDF_TYPE, _iri(PROV_AGENT)),
        _triple(agt_uri, RDFS_LABEL, _literal(component_name)),
    ]

    if label:
        triples.append(_triple(entity_uri, RDFS_LABEL, _literal(label)))

    # Numeric fields use "is not None" so 0 is a valid value.
    if page_number is not None:
        triples.append(_triple(entity_uri, TG_PAGE_NUMBER, _literal(page_number)))
    if chunk_index is not None:
        triples.append(_triple(entity_uri, TG_CHUNK_INDEX, _literal(chunk_index)))
    if char_offset is not None:
        triples.append(_triple(entity_uri, TG_CHAR_OFFSET, _literal(char_offset)))
    if char_length is not None:
        triples.append(_triple(entity_uri, TG_CHAR_LENGTH, _literal(char_length)))

    # Chunking configuration describes the activity, not the entity.
    if chunk_size is not None:
        triples.append(_triple(act_uri, TG_CHUNK_SIZE, _literal(chunk_size)))
    if chunk_overlap is not None:
        triples.append(_triple(act_uri, TG_CHUNK_OVERLAP, _literal(chunk_overlap)))

    return triples
def triple_provenance_triples(
    stmt_uri: str,
    subject_uri: str,
    predicate_uri: str,
    object_term: Term,
    chunk_uri: str,
    component_name: str,
    component_version: str,
    llm_model: Optional[str] = None,
    ontology_uri: Optional[str] = None,
    timestamp: Optional[str] = None,
) -> List[Triple]:
    """
    Build provenance triples for an extracted knowledge triple using reification.

    Creates:
    - Statement object that reifies the triple
    - wasDerivedFrom link to source chunk
    - Activity and agent metadata

    Note: subject_uri, predicate_uri, and object_term are accepted so the
    reified statement can eventually point at the triple itself, but they
    are intentionally unused here — the actual reification link
    (tg:reifies pointing at the edge) requires RDF 1.2 triple-term support
    and must be handled by the knowledge extractor using the graph store's
    reification API.

    Args:
        stmt_uri: URI for the reified statement
        subject_uri: Subject of the extracted triple (currently unused)
        predicate_uri: Predicate of the extracted triple (currently unused)
        object_term: Object of the extracted triple (Term, currently unused)
        chunk_uri: URI of source chunk
        component_name: Name of extractor component
        component_version: Version of the component
        llm_model: LLM model used for extraction
        ontology_uri: Ontology URI used for extraction
        timestamp: ISO timestamp (defaults to now, UTC)

    Returns:
        List of Triple objects for the provenance (not the triple itself)
    """
    if timestamp is None:
        # datetime.utcnow() is deprecated (Python 3.12); build the identical
        # naive-UTC ISO-8601 "...Z" string from an aware datetime instead.
        timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"

    act_uri = activity_uri()
    agt_uri = agent_uri(component_name)

    triples = [
        # Statement provenance
        _triple(stmt_uri, PROV_WAS_DERIVED_FROM, _iri(chunk_uri)),
        _triple(stmt_uri, PROV_WAS_GENERATED_BY, _iri(act_uri)),
        # Activity
        _triple(act_uri, RDF_TYPE, _iri(PROV_ACTIVITY)),
        _triple(act_uri, PROV_USED, _iri(chunk_uri)),
        _triple(act_uri, PROV_WAS_ASSOCIATED_WITH, _iri(agt_uri)),
        _triple(act_uri, PROV_STARTED_AT_TIME, _literal(timestamp)),
        _triple(act_uri, TG_COMPONENT_VERSION, _literal(component_version)),
        # Agent
        _triple(agt_uri, RDF_TYPE, _iri(PROV_AGENT)),
        _triple(agt_uri, RDFS_LABEL, _literal(component_name)),
    ]

    if llm_model:
        triples.append(_triple(act_uri, TG_LLM_MODEL, _literal(llm_model)))
    if ontology_uri:
        triples.append(_triple(act_uri, TG_ONTOLOGY, _iri(ontology_uri)))

    return triples

View file

@ -0,0 +1,61 @@
"""
URI generation for provenance entities.
URI patterns:
- Document: https://trustgraph.ai/doc/{doc_id}
- Page: https://trustgraph.ai/page/{doc_id}/p{page_number}
- Chunk: https://trustgraph.ai/chunk/{doc_id}/p{page}/c{chunk} (from page)
https://trustgraph.ai/chunk/{doc_id}/c{chunk} (from text doc)
- Activity: https://trustgraph.ai/activity/{uuid}
- Statement: https://trustgraph.ai/stmt/{uuid}
"""
import uuid
import urllib.parse
# Base URI prefix
TRUSTGRAPH_BASE = "https://trustgraph.ai"
def _encode_id(id_str: str) -> str:
"""URL-encode an ID component for safe inclusion in URIs."""
return urllib.parse.quote(str(id_str), safe='')
def document_uri(doc_id: str) -> str:
    """Return the canonical URI identifying a source document."""
    encoded = _encode_id(doc_id)
    return f"{TRUSTGRAPH_BASE}/doc/{encoded}"
def page_uri(doc_id: str, page_number: int) -> str:
    """Return the URI of one page extracted from a source document."""
    # Path shape: {base}/page/{doc}/p{n}
    return "/".join([TRUSTGRAPH_BASE, "page", _encode_id(doc_id), f"p{page_number}"])
def chunk_uri_from_page(doc_id: str, page_number: int, chunk_index: int) -> str:
    """Return the URI of a chunk carved out of a specific page."""
    prefix = f"{TRUSTGRAPH_BASE}/chunk/{_encode_id(doc_id)}"
    return f"{prefix}/p{page_number}/c{chunk_index}"
def chunk_uri_from_doc(doc_id: str, chunk_index: int) -> str:
    """Return the URI of a chunk taken directly from a (non-paged) text document."""
    doc_part = _encode_id(doc_id)
    return f"{TRUSTGRAPH_BASE}/chunk/{doc_part}/c{chunk_index}"
def activity_uri(activity_id: str | None = None) -> str:
    """
    Generate URI for a PROV-O activity.

    Args:
        activity_id: Optional explicit ID; a random UUID4 is generated
            when omitted.

    Returns:
        Activity URI string.
    """
    # Annotation fixed: the parameter defaults to None, so it is
    # `str | None`, not `str`.
    if activity_id is None:
        activity_id = str(uuid.uuid4())
    return f"{TRUSTGRAPH_BASE}/activity/{_encode_id(activity_id)}"
def statement_uri(stmt_id: str | None = None) -> str:
    """
    Generate URI for a reified statement.

    Args:
        stmt_id: Optional explicit ID; a random UUID4 is generated
            when omitted.

    Returns:
        Statement URI string.
    """
    # Annotation fixed: the parameter defaults to None, so it is
    # `str | None`, not `str`.
    if stmt_id is None:
        stmt_id = str(uuid.uuid4())
    return f"{TRUSTGRAPH_BASE}/stmt/{_encode_id(stmt_id)}"
def agent_uri(component_name: str) -> str:
    """Return the URI identifying a TrustGraph component as a PROV agent."""
    name = _encode_id(component_name)
    return f"{TRUSTGRAPH_BASE}/agent/{name}"

View file

@ -0,0 +1,101 @@
"""
Vocabulary bootstrap for provenance.
The knowledge graph is ontology-neutral and initializes empty. When writing
PROV-O provenance data to a collection for the first time, the vocabulary
must be bootstrapped with RDF labels for all classes and predicates.
"""
from typing import List
from .. schema import Triple, Term, IRI, LITERAL
from . namespaces import (
RDFS_LABEL,
PROV_ENTITY, PROV_ACTIVITY, PROV_AGENT,
PROV_WAS_DERIVED_FROM, PROV_WAS_GENERATED_BY,
PROV_USED, PROV_WAS_ASSOCIATED_WITH, PROV_STARTED_AT_TIME,
DC_TITLE, DC_SOURCE, DC_DATE, DC_CREATOR,
TG_REIFIES, TG_PAGE_COUNT, TG_MIME_TYPE, TG_PAGE_NUMBER,
TG_CHUNK_INDEX, TG_CHAR_OFFSET, TG_CHAR_LENGTH,
TG_CHUNK_SIZE, TG_CHUNK_OVERLAP, TG_COMPONENT_VERSION,
TG_LLM_MODEL, TG_ONTOLOGY, TG_EMBEDDING_MODEL,
TG_SOURCE_TEXT, TG_SOURCE_CHAR_OFFSET, TG_SOURCE_CHAR_LENGTH,
)
def _label_triple(uri: str, label: str) -> Triple:
    """Create an rdfs:label triple giving *uri* the human-readable *label*."""
    return Triple(
        s=Term(type=IRI, iri=uri),
        p=Term(type=IRI, iri=RDFS_LABEL),
        o=Term(type=LITERAL, value=label),
    )
# The four label lists below are concatenated by get_vocabulary_triples()
# and emitted once per collection; re-emitting them is idempotent.

# PROV-O class labels
PROV_CLASS_LABELS = [
    _label_triple(PROV_ENTITY, "Entity"),
    _label_triple(PROV_ACTIVITY, "Activity"),
    _label_triple(PROV_AGENT, "Agent"),
]

# PROV-O predicate labels
PROV_PREDICATE_LABELS = [
    _label_triple(PROV_WAS_DERIVED_FROM, "was derived from"),
    _label_triple(PROV_WAS_GENERATED_BY, "was generated by"),
    _label_triple(PROV_USED, "used"),
    _label_triple(PROV_WAS_ASSOCIATED_WITH, "was associated with"),
    _label_triple(PROV_STARTED_AT_TIME, "started at"),
]

# Dublin Core predicate labels
DC_PREDICATE_LABELS = [
    _label_triple(DC_TITLE, "title"),
    _label_triple(DC_SOURCE, "source"),
    _label_triple(DC_DATE, "date"),
    _label_triple(DC_CREATOR, "creator"),
]

# TrustGraph predicate labels
TG_PREDICATE_LABELS = [
    _label_triple(TG_REIFIES, "reifies"),
    _label_triple(TG_PAGE_COUNT, "page count"),
    _label_triple(TG_MIME_TYPE, "MIME type"),
    _label_triple(TG_PAGE_NUMBER, "page number"),
    _label_triple(TG_CHUNK_INDEX, "chunk index"),
    _label_triple(TG_CHAR_OFFSET, "character offset"),
    _label_triple(TG_CHAR_LENGTH, "character length"),
    _label_triple(TG_CHUNK_SIZE, "chunk size"),
    _label_triple(TG_CHUNK_OVERLAP, "chunk overlap"),
    _label_triple(TG_COMPONENT_VERSION, "component version"),
    _label_triple(TG_LLM_MODEL, "LLM model"),
    _label_triple(TG_ONTOLOGY, "ontology"),
    _label_triple(TG_EMBEDDING_MODEL, "embedding model"),
    _label_triple(TG_SOURCE_TEXT, "source text"),
    _label_triple(TG_SOURCE_CHAR_OFFSET, "source character offset"),
    _label_triple(TG_SOURCE_CHAR_LENGTH, "source character length"),
]
def get_vocabulary_triples() -> List[Triple]:
    """
    Return every vocabulary bootstrap triple as a single flat list.

    Covers rdfs:label definitions for all PROV-O classes, PROV-O
    predicates, Dublin Core predicates, and TrustGraph predicates used in
    extraction-time provenance.

    Emit this to the knowledge graph once per collection before writing
    any provenance data; re-emitting the same triples is harmless
    (idempotent).

    Returns:
        List of Triple objects defining vocabulary labels
    """
    # Unpack the four label groups, in the same order as before, into a
    # fresh list so callers may mutate the result safely.
    return [
        *PROV_CLASS_LABELS,
        *PROV_PREDICATE_LABELS,
        *DC_PREDICATE_LABELS,
        *TG_PREDICATE_LABELS,
    ]

View file

@ -34,5 +34,9 @@ class TextDocument:
class Chunk:
metadata: Metadata | None = None
chunk: bytes = b""
# For provenance: document_id of this chunk in librarian
# Post-chunker optimization: both document_id AND chunk content are included
# so downstream processors have the ID for provenance and content to work with
document_id: str = ""
############################################################################

View file

@ -12,6 +12,8 @@ from ..core.topic import topic
class EntityEmbeddings:
entity: Term | None = None
vectors: list[list[float]] = field(default_factory=list)
# Provenance: which chunk this embedding was derived from
chunk_id: str = ""
# This is a 'batching' mechanism for the above data
@dataclass

View file

@ -12,6 +12,8 @@ from ..core.topic import topic
class EntityContext:
entity: Term | None = None
context: str = ""
# Provenance: which chunk this entity context was derived from
chunk_id: str = ""
# This is a 'batching' mechanism for the above data
@dataclass

View file

@ -91,7 +91,12 @@ class DocumentMetadata:
tags: list[str] = field(default_factory=list)
# Child document support
parent_id: str = "" # Empty for top-level docs, set for children
document_type: str = "source" # "source" or "extracted"
# Document type vocabulary:
# "source" - original uploaded document
# "page" - page extracted from source (e.g., PDF page)
# "chunk" - text chunk derived from page or source
# "extracted" - legacy value, kept for backwards compatibility
document_type: str = "source"
@dataclass
class ProcessingMetadata: