trustgraph/trustgraph-flow/trustgraph/extract/kg/relationships/extract.py
cybermaggedon cd5580be59
Extract-time provenance (#661)
1. Shared Provenance Module - URI generators, namespace constants,
   triple builders, vocabulary bootstrap
2. Librarian - Emits document metadata to graph on processing
   initiation (vocabulary bootstrap + PROV-O triples)
3. PDF Extractor - Saves pages as child documents, emits parent-child
   provenance edges, forwards page IDs
4. Chunker - Saves chunks as child documents, emits provenance edges,
   forwards chunk ID + content
5. Knowledge Extractors (both definitions and relationships):
   - Link entities to chunks via SUBJECT_OF (not top-level document)
   - Removed duplicate metadata emission (now handled by librarian)
   - Get chunk_doc_id and chunk_uri from incoming Chunk message
6. Embedding Provenance:
   - EntityContext schema has chunk_id field
   - EntityEmbeddings schema has chunk_id field
   - Definitions extractor sets chunk_id when creating EntityContext
   - Graph embeddings processor passes chunk_id through to
     EntityEmbeddings

Provenance Flow:
Document → Page (PDF) → Chunk → Extracted Facts/Embeddings
    ↓           ↓          ↓              ↓
  librarian  librarian  librarian    (chunk_id reference)
  + graph    + graph    + graph

Each artifact is stored in librarian with parent-child linking, and PROV-O
edges are emitted to the knowledge graph for full traceability from any
extracted fact back to its source document.

Also updates the tests.
2026-03-05 18:36:10 +00:00

229 lines
6.6 KiB
Python
Executable file

"""
Simple decoder, accepts text chunks input, applies entity
relationship analysis to get entity relationship edges which are output as
graph edges.
"""
import json
import logging
import urllib.parse
# Module logger
logger = logging.getLogger(__name__)
from .... schema import Chunk, Triple, Triples
from .... schema import Metadata, Term, IRI, LITERAL
from .... schema import PromptRequest, PromptResponse
from .... rdf import RDF_LABEL, TRUSTGRAPH_ENTITIES, SUBJECT_OF
from .... base import FlowProcessor, ConsumerSpec, ProducerSpec
from .... base import PromptClientSpec
# Predicate term used for the label triples attached to each extracted
# subject, predicate and (entity) object.
RDF_LABEL_VALUE = Term(type=IRI, iri=RDF_LABEL)
# Predicate term used to link an extracted entity to the chunk it came from.
SUBJECT_OF_VALUE = Term(type=IRI, iri=SUBJECT_OF)
# Identifier passed to Processor.launch() by run().
default_ident = "kg-extract-relationships"
# Default for the --concurrency command-line argument.
default_concurrency = 1
# Default maximum number of triples packed into one output Triples message.
default_triples_batch_size = 50
class Processor(FlowProcessor):
    """
    Flow processor which extracts entity relationships from text chunks.

    Each incoming Chunk is passed to the prompt client's
    extract_relationships operation.  Every relationship returned is turned
    into graph triples (the edge itself, labels, and links from the entities
    back to the source chunk) which are published on the "triples" output
    in batches of at most ``triples_batch_size`` triples.
    """

    def __init__(self, **params):
        """
        Configure the processor and register its flow specifications.

        Recognised params (everything is also forwarded to FlowProcessor):
            id: processor identifier, forwarded to the base class.
            concurrency: concurrency setting forwarded to the base class
                and to the input consumer (default: default_concurrency).
            triples_batch_size: maximum number of triples per output
                message (default: default_triples_batch_size).  Values
                which are not a positive integer fall back to the default.
        """

        ident = params.get("id")
        concurrency = params.get("concurrency", default_concurrency)

        batch_size = params.get(
            "triples_batch_size", default_triples_batch_size
        )

        # A zero batch size would make range() raise in on_message and a
        # negative one would silently emit nothing, so reject both here.
        if not isinstance(batch_size, int) or batch_size < 1:
            logger.warning(
                "Invalid triples_batch_size %r, using default %d",
                batch_size, default_triples_batch_size,
            )
            batch_size = default_triples_batch_size

        self.triples_batch_size = batch_size

        super(Processor, self).__init__(
            **params | {
                "id": ident,
                "concurrency": concurrency,
            }
        )

        self.register_specification(
            ConsumerSpec(
                name = "input",
                schema = Chunk,
                handler = self.on_message,
                concurrency = concurrency,
            )
        )

        self.register_specification(
            PromptClientSpec(
                request_name = "prompt-request",
                response_name = "prompt-response",
            )
        )

        self.register_specification(
            ProducerSpec(
                name = "triples",
                schema = Triples
            )
        )

    def to_uri(self, text):
        """
        Build an entity URI from free text.

        Spaces become hyphens, the text is lower-cased, UTF-8 encoded and
        percent-quoted, then appended to TRUSTGRAPH_ENTITIES.  Non-string
        input is converted with str() first.
        """

        part = str(text).replace(" ", "-").lower().encode("utf-8")
        quoted = urllib.parse.quote(part)

        return TRUSTGRAPH_ENTITIES + quoted

    async def emit_triples(self, pub, metadata, triples):
        """Wrap the triples and metadata in a Triples message and send it on pub."""

        t = Triples(
            metadata=metadata,
            triples=triples,
        )

        await pub.send(t)

    def _relationship_triples(self, rel, chunk_value):
        """
        Convert one relationship from the prompt response into triples.

        rel is expected to be a dict with "subject", "predicate", "object"
        and "object-entity" keys.  A relationship which is not a dict, or
        whose subject, predicate or object is missing, None or the empty
        string, yields an empty list.  A missing "object-entity" flag is
        treated as false, i.e. the object is emitted as a literal.

        chunk_value is the IRI term of the source chunk which the entities
        are linked to.  Returns the list of triples for this relationship.
        """

        if not isinstance(rel, dict):
            logger.warning(f"Skipping malformed relationship: {rel!r}")
            return []

        s = rel.get("subject")
        p = rel.get("predicate")
        o = rel.get("object")

        if any(field is None or field == "" for field in (s, p, o)):
            return []

        object_is_entity = bool(rel.get("object-entity"))

        s_value = Term(type=IRI, iri=str(self.to_uri(s)))
        p_value = Term(type=IRI, iri=str(self.to_uri(p)))

        if object_is_entity:
            o_value = Term(type=IRI, iri=str(self.to_uri(o)))
        else:
            o_value = Term(type=LITERAL, value=str(o))

        triples = [
            # The relationship edge itself
            Triple(s=s_value, p=p_value, o=o_value),
            # Label for s
            Triple(
                s=s_value,
                p=RDF_LABEL_VALUE,
                o=Term(type=LITERAL, value=str(s))
            ),
            # Label for p
            Triple(
                s=p_value,
                p=RDF_LABEL_VALUE,
                o=Term(type=LITERAL, value=str(p))
            ),
        ]

        if object_is_entity:
            # Label for o
            triples.append(Triple(
                s=o_value,
                p=RDF_LABEL_VALUE,
                o=Term(type=LITERAL, value=str(o))
            ))

        # Link entity to chunk (not top-level document)
        triples.append(Triple(
            s=s_value,
            p=SUBJECT_OF_VALUE,
            o=chunk_value
        ))

        if object_is_entity:
            # Link object entity to chunk
            triples.append(Triple(
                s=o_value,
                p=SUBJECT_OF_VALUE,
                o=chunk_value
            ))

        return triples

    async def on_message(self, msg, consumer, flow):
        """
        Handle one incoming Chunk message.

        Decodes the chunk text, asks the prompt client for relationships,
        converts them to triples and sends them on the "triples" output in
        batches.  Failures in the prompt call or while emitting are logged
        and not propagated.
        """

        v = msg.value()
        logger.info(f"Extracting relationships from {v.metadata.id}...")

        chunk = v.chunk.decode("utf-8")

        logger.debug(f"Processing chunk: {chunk[:100]}..." if len(chunk) > 100 else f"Processing chunk: {chunk}")

        try:

            try:

                rels = await flow("prompt-request").extract_relationships(
                    text = chunk
                )

                logger.debug(f"Prompt response: {rels}")

                if not isinstance(rels, list):
                    raise RuntimeError("Expecting array in prompt response")

            except Exception as e:
                logger.error(f"Prompt exception: {e}", exc_info=True)
                raise

            # The chunk's metadata ID is its URI; entities are linked to the
            # chunk rather than to the top-level document.
            # Note: Document metadata is now emitted once by librarian at
            # processing initiation, so we don't need to duplicate it here.
            chunk_value = Term(type=IRI, iri=v.metadata.id)

            triples = []

            for rel in rels:
                triples.extend(self._relationship_triples(rel, chunk_value))

            # The metadata is the same for every batch, so build it once
            metadata = Metadata(
                id=v.metadata.id,
                metadata=[],
                user=v.metadata.user,
                collection=v.metadata.collection,
            )

            batch_size = self.triples_batch_size

            # Send triples in batches
            for i in range(0, len(triples), batch_size):
                await self.emit_triples(
                    flow("triples"),
                    metadata,
                    triples[i:i + batch_size]
                )

        except Exception as e:
            logger.error(f"Relationship extraction exception: {e}", exc_info=True)

        logger.debug("Relationship extraction complete")

    @staticmethod
    def add_args(parser):
        """Add this processor's command-line arguments, then FlowProcessor's, to parser."""

        parser.add_argument(
            '-c', '--concurrency',
            type=int,
            default=default_concurrency,
            help=f'Concurrent processing threads (default: {default_concurrency})'
        )

        parser.add_argument(
            '--triples-batch-size',
            type=int,
            default=default_triples_batch_size,
            help=f'Maximum triples per output message (default: {default_triples_batch_size})'
        )

        FlowProcessor.add_args(parser)

def run():
    """Launch the relationship extraction processor using the module docstring as its description."""
    description = __doc__
    Processor.launch(default_ident, description)