mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-04-25 16:36:21 +02:00
Extract-time provenance (#661)
1. Shared Provenance Module - URI generators, namespace constants,
triple builders, vocabulary bootstrap
2. Librarian - Emits document metadata to graph on processing
initiation (vocabulary bootstrap + PROV-O triples)
3. PDF Extractor - Saves pages as child documents, emits parent-child
provenance edges, forwards page IDs
4. Chunker - Saves chunks as child documents, emits provenance edges,
forwards chunk ID + content
5. Knowledge Extractors (both definitions and relationships):
- Link entities to chunks via SUBJECT_OF (not top-level document)
- Removed duplicate metadata emission (now handled by librarian)
- Get chunk_doc_id and chunk_uri from incoming Chunk message
6. Embedding Provenance:
- EntityContext schema has chunk_id field
- EntityEmbeddings schema has chunk_id field
- Definitions extractor sets chunk_id when creating EntityContext
- Graph embeddings processor passes chunk_id through to
EntityEmbeddings
Provenance Flow:
Document → Page (PDF) → Chunk → Extracted Facts/Embeddings
↓ ↓ ↓ ↓
librarian librarian librarian (chunk_id reference)
+ graph + graph + graph
Each artifact is stored in librarian with parent-child linking, and PROV-O
edges are emitted to the knowledge graph for full traceability from any
extracted fact back to its source document.
Also, updating tests
This commit is contained in:
parent
d8f0a576af
commit
cd5580be59
20 changed files with 1601 additions and 59 deletions
|
|
@ -15,7 +15,7 @@ from .consumer import Consumer
|
|||
from .producer import Producer
|
||||
from .metrics import ConsumerMetrics, ProducerMetrics
|
||||
|
||||
from ..schema import LibrarianRequest, LibrarianResponse
|
||||
from ..schema import LibrarianRequest, LibrarianResponse, DocumentMetadata
|
||||
from ..schema import librarian_request_queue, librarian_response_queue
|
||||
|
||||
# Module logger
|
||||
|
|
@ -135,6 +135,67 @@ class ChunkingService(FlowProcessor):
|
|||
self.pending_requests.pop(request_id, None)
|
||||
raise RuntimeError(f"Timeout fetching document {document_id}")
|
||||
|
||||
async def save_child_document(self, doc_id, parent_id, user, content,
|
||||
document_type="chunk", title=None, timeout=120):
|
||||
"""
|
||||
Save a child document (chunk) to the librarian.
|
||||
|
||||
Args:
|
||||
doc_id: ID for the new child document
|
||||
parent_id: ID of the parent document
|
||||
user: User ID
|
||||
content: Document content (bytes or str)
|
||||
document_type: Type of document ("chunk", etc.)
|
||||
title: Optional title
|
||||
timeout: Request timeout in seconds
|
||||
|
||||
Returns:
|
||||
The document ID on success
|
||||
"""
|
||||
request_id = str(uuid.uuid4())
|
||||
|
||||
if isinstance(content, str):
|
||||
content = content.encode("utf-8")
|
||||
|
||||
doc_metadata = DocumentMetadata(
|
||||
id=doc_id,
|
||||
user=user,
|
||||
kind="text/plain",
|
||||
title=title or doc_id,
|
||||
parent_id=parent_id,
|
||||
document_type=document_type,
|
||||
)
|
||||
|
||||
request = LibrarianRequest(
|
||||
operation="add-child-document",
|
||||
document_metadata=doc_metadata,
|
||||
content=base64.b64encode(content).decode("utf-8"),
|
||||
)
|
||||
|
||||
# Create future for response
|
||||
future = asyncio.get_event_loop().create_future()
|
||||
self.pending_requests[request_id] = future
|
||||
|
||||
try:
|
||||
# Send request
|
||||
await self.librarian_request_producer.send(
|
||||
request, properties={"id": request_id}
|
||||
)
|
||||
|
||||
# Wait for response
|
||||
response = await asyncio.wait_for(future, timeout=timeout)
|
||||
|
||||
if response.error:
|
||||
raise RuntimeError(
|
||||
f"Librarian error saving chunk: {response.error.type}: {response.error.message}"
|
||||
)
|
||||
|
||||
return doc_id
|
||||
|
||||
except asyncio.TimeoutError:
|
||||
self.pending_requests.pop(request_id, None)
|
||||
raise RuntimeError(f"Timeout saving chunk {doc_id}")
|
||||
|
||||
async def get_document_text(self, doc):
|
||||
"""
|
||||
Get text content from a TextDocument, fetching from librarian if needed.
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue