Fix/extraction prov (#662)

Quoted triple fixes, including:
1. Updated triple_provenance_triples() in triples.py:
   - Now accepts a Triple object directly
   - Creates the reification triple using the TRIPLE term type: stmt_uri tg:reifies <<extracted_triple>>
   - Includes it in the returned provenance triples
2. Updated definitions extractor:
   - Added imports for provenance functions and component version
   - Added ParameterSpec for optional llm-model and ontology flow parameters
   - For each definition triple, generates provenance with reification
3. Updated relationships extractor:
   - Same changes as definitions extractor
2026-07-02 22:41:01 +02:00 · 2026-03-06 12:23:58 +00:00 · 2026-03-06 12:23:58 +00:00 · 2b9232917c
commit 2b9232917c
parent cd5580be59
19 changed files with 361 additions and 72 deletions
--- a/trustgraph-base/trustgraph/api/flow.py
+++ b/trustgraph-base/trustgraph/api/flow.py
@ -9,26 +9,45 @@ including LLM operations, RAG queries, knowledge graph management, and more.
 import json
 import base64
-from .. knowledge import hash, Uri, Literal
+from .. knowledge import hash, Uri, Literal, QuotedTriple
-from .. schema import IRI, LITERAL
+from .. schema import IRI, LITERAL, TRIPLE
 from . types import Triple
 from . exceptions import ProtocolException
 def to_value(x):
-    """Convert wire format to Uri or Literal."""
+    """Convert wire format to Uri, Literal, or QuotedTriple."""
    if x.get("t") == IRI:
        return Uri(x.get("i", ""))
    elif x.get("t") == LITERAL:
        return Literal(x.get("v", ""))
    elif x.get("t") == TRIPLE:
        # Wire format uses "tr" key for nested triple dict
        triple_data = x.get("tr")
        if triple_data:
            return QuotedTriple(
                s=to_value(triple_data.get("s", {})),
                p=to_value(triple_data.get("p", {})),
                o=to_value(triple_data.get("o", {})),
            )
        return Literal("")
    # Fallback for any other type
    return Literal(x.get("v", x.get("i", "")))
 def from_value(v):
-    """Convert Uri or Literal to wire format."""
+    """Convert Uri, Literal, or QuotedTriple to wire format."""
    if isinstance(v, Uri):
        return {"t": IRI, "i": str(v)}
    elif isinstance(v, QuotedTriple):
        return {
            "t": TRIPLE,
            "tr": {
                "s": from_value(v.s),
                "p": from_value(v.p),
                "o": from_value(v.o),
            }
        }
    else:
        return {"t": LITERAL, "v": str(v)}
--- a/trustgraph-base/trustgraph/api/knowledge.py
+++ b/trustgraph-base/trustgraph/api/knowledge.py
@ -9,17 +9,27 @@ into flows for use in queries and RAG operations.
 import json
 import base64
-from .. knowledge import hash, Uri, Literal
+from .. knowledge import hash, Uri, Literal, QuotedTriple
-from .. schema import IRI, LITERAL
+from .. schema import IRI, LITERAL, TRIPLE
 from . types import Triple
 def to_value(x):
-    """Convert wire format to Uri or Literal."""
+    """Convert wire format to Uri, Literal, or QuotedTriple."""
    if x.get("t") == IRI:
        return Uri(x.get("i", ""))
    elif x.get("t") == LITERAL:
        return Literal(x.get("v", ""))
    elif x.get("t") == TRIPLE:
        # Wire format uses "tr" key for nested triple dict
        triple_data = x.get("tr")
        if triple_data:
            return QuotedTriple(
                s=to_value(triple_data.get("s", {})),
                p=to_value(triple_data.get("p", {})),
                o=to_value(triple_data.get("o", {})),
            )
        return Literal("")
    # Fallback for any other type
    return Literal(x.get("v", x.get("i", "")))
--- a/trustgraph-base/trustgraph/api/library.py
+++ b/trustgraph-base/trustgraph/api/library.py
@ -12,8 +12,8 @@ import base64
 import logging
 from . types import DocumentMetadata, ProcessingMetadata, Triple
-from .. knowledge import hash, Uri, Literal
+from .. knowledge import hash, Uri, Literal, QuotedTriple
-from .. schema import IRI, LITERAL
+from .. schema import IRI, LITERAL, TRIPLE
 from . exceptions import *
 logger = logging.getLogger(__name__)
@ -27,19 +27,38 @@ DEFAULT_CHUNK_SIZE = 5 * 1024 * 1024
 def to_value(x):
-    """Convert wire format to Uri or Literal."""
+    """Convert wire format to Uri, Literal, or QuotedTriple."""
    if x.get("t") == IRI:
        return Uri(x.get("i", ""))
    elif x.get("t") == LITERAL:
        return Literal(x.get("v", ""))
    elif x.get("t") == TRIPLE:
        # Wire format uses "tr" key for nested triple dict
        triple_data = x.get("tr")
        if triple_data:
            return QuotedTriple(
                s=to_value(triple_data.get("s", {})),
                p=to_value(triple_data.get("p", {})),
                o=to_value(triple_data.get("o", {})),
            )
        return Literal("")
    # Fallback for any other type
    return Literal(x.get("v", x.get("i", "")))
 def from_value(v):
-    """Convert Uri or Literal to wire format."""
+    """Convert Uri, Literal, or QuotedTriple to wire format."""
    if isinstance(v, Uri):
        return {"t": IRI, "i": str(v)}
    elif isinstance(v, QuotedTriple):
        return {
            "t": TRIPLE,
            "tr": {
                "s": from_value(v.s),
                "p": from_value(v.p),
                "o": from_value(v.o),
            }
        }
    else:
        return {"t": LITERAL, "v": str(v)}
--- a/trustgraph-base/trustgraph/base/pulsar_backend.py
+++ b/trustgraph-base/trustgraph/base/pulsar_backend.py
@ -12,7 +12,7 @@ import logging
 import base64
 import types
 from dataclasses import asdict, is_dataclass
-from typing import Any
+from typing import Any, get_type_hints
 from .backend import PubSubBackend, BackendProducer, BackendConsumer, Message
@ -58,6 +58,7 @@ def dict_to_dataclass(data: dict, cls: type) -> Any:
    Convert a dictionary back to a dataclass instance.
    Handles nested dataclasses and missing fields.
    Uses get_type_hints() to resolve forward references (string annotations).
    """
    if data is None:
        return None
@ -65,8 +66,13 @@ def dict_to_dataclass(data: dict, cls: type) -> Any:
    if not is_dataclass(cls):
        return data
-    # Get field types from the dataclass
+    # Get field types from the dataclass, resolving forward references
-    field_types = {f.name: f.type for f in cls.__dataclass_fields__.values()}
+    # get_type_hints() evaluates string annotations like "Triple | None"
    try:
        field_types = get_type_hints(cls)
    except Exception:
        # Fallback if get_type_hints fails (shouldn't happen normally)
        field_types = {f.name: f.type for f in cls.__dataclass_fields__.values()}
    kwargs = {}
    for key, value in data.items():
--- a/trustgraph-base/trustgraph/knowledge/defs.py
+++ b/trustgraph-base/trustgraph/knowledge/defs.py
@ -26,8 +26,40 @@ KEYWORD = 'https://schema.org/keywords'
 class Uri(str):
    """String subclass that tags a value as a URI."""
    def is_uri(self):
        return True
    def is_literal(self):
        return False
    def is_triple(self):
        return False
 class Literal(str):
    """String subclass that tags a value as a literal."""
    def is_uri(self):
        return False
    def is_literal(self):
        return True
    def is_triple(self):
        return False
 class QuotedTriple:
    """
    RDF-star quoted triple (reification).
    Represents a triple that can be used as the object of another triple,
    enabling statements about statements.
    Instances compare and hash by their (s, p, o) components, so two quoted
    triples built from equal parts are equal, in the same way that the
    str-based Uri and Literal values compare by content.
    Example:
        # stmt:123 tg:reifies <<:Hope skos:definition "A feeling...">>
        qt = QuotedTriple(
            s=Uri("https://example.org/Hope"),
            p=Uri("http://www.w3.org/2004/02/skos/core#definition"),
            o=Literal("A feeling of expectation")
        )
    """
    def __init__(self, s, p, o):
        self.s = s  # Uri, Literal, or QuotedTriple
        self.p = p  # Uri
        self.o = o  # Uri, Literal, or QuotedTriple
    def is_uri(self): return False
    def is_literal(self): return False
    def is_triple(self): return True
    def _key(self):
        # Single source of truth for equality and hashing
        return (self.s, self.p, self.o)
    def __eq__(self, other):
        if not isinstance(other, QuotedTriple):
            # Let Python try the reflected comparison / fall back to !=
            return NotImplemented
        return self._key() == other._key()
    def __hash__(self):
        # Must agree with __eq__; components are expected to be hashable
        return hash(self._key())
    def __repr__(self):
        return f"<<{self.s} {self.p} {self.o}>>"
    # str() and repr() intentionally produce the same <<s p o>> text
    __str__ = __repr__
--- a/trustgraph-base/trustgraph/messaging/translators/primitives.py
+++ b/trustgraph-base/trustgraph/messaging/translators/primitives.py
@ -82,6 +82,7 @@ def _triple_translator_to_pulsar(data: Dict[str, Any]) -> Triple:
 def _triple_translator_from_pulsar(obj: Triple) -> Dict[str, Any]:
    """Convert Triple object to wire format dict."""
    term_translator = TermTranslator()
    result: Dict[str, Any] = {}
--- a/trustgraph-base/trustgraph/provenance/namespaces.py
+++ b/trustgraph-base/trustgraph/provenance/namespaces.py
@ -28,6 +28,18 @@ RDF_TYPE = RDF + "type"
 RDFS = "http://www.w3.org/2000/01/rdf-schema#"
 RDFS_LABEL = RDFS + "label"
 # Schema.org namespace
 SCHEMA = "https://schema.org/"
 SCHEMA_SUBJECT_OF = SCHEMA + "subjectOf"
 SCHEMA_DIGITAL_DOCUMENT = SCHEMA + "DigitalDocument"
 SCHEMA_DESCRIPTION = SCHEMA + "description"
 SCHEMA_KEYWORDS = SCHEMA + "keywords"
 SCHEMA_NAME = SCHEMA + "name"
 # SKOS namespace
 SKOS = "http://www.w3.org/2004/02/skos/core#"
 SKOS_DEFINITION = SKOS + "definition"
 # TrustGraph namespace for custom predicates
 TG = "https://trustgraph.ai/ns/"
 TG_REIFIES = TG + "reifies"
--- a/trustgraph-base/trustgraph/provenance/triples.py
+++ b/trustgraph-base/trustgraph/provenance/triples.py
@ -5,7 +5,7 @@ Helper functions to build PROV-O triples for extraction-time provenance.
 from datetime import datetime
 from typing import List, Optional
-from .. schema import Triple, Term, IRI, LITERAL
+from .. schema import Triple, Term, IRI, LITERAL, TRIPLE
 from . namespaces import (
    RDF_TYPE, RDFS_LABEL,
@ -145,6 +145,7 @@ def derived_entity_triples(
        # Activity declaration
        _triple(act_uri, RDF_TYPE, _iri(PROV_ACTIVITY)),
        _triple(act_uri, RDFS_LABEL, _literal(f"{component_name} extraction")),
        _triple(act_uri, PROV_USED, _iri(parent_uri)),
        _triple(act_uri, PROV_WAS_ASSOCIATED_WITH, _iri(agt_uri)),
        _triple(act_uri, PROV_STARTED_AT_TIME, _literal(timestamp)),
@ -181,9 +182,7 @@ def derived_entity_triples(
 def triple_provenance_triples(
    stmt_uri: str,
-    subject_uri: str,
+    extracted_triple: Triple,
    predicate_uri: str,
    object_term: Term,
    chunk_uri: str,
    component_name: str,
    component_version: str,
@ -195,15 +194,13 @@ def triple_provenance_triples(
    Build provenance triples for an extracted knowledge triple using reification.
    Creates:
-    - Statement object that reifies the triple
+    - Reification triple: stmt_uri tg:reifies <<extracted_triple>>
    - wasDerivedFrom link to source chunk
    - Activity and agent metadata
    Args:
        stmt_uri: URI for the reified statement
-        subject_uri: Subject of the extracted triple
+        extracted_triple: The extracted Triple to reify
        predicate_uri: Predicate of the extracted triple
        object_term: Object of the extracted triple (Term)
        chunk_uri: URI of source chunk
        component_name: Name of extractor component
        component_version: Version of the component
@ -212,7 +209,7 @@ def triple_provenance_triples(
        timestamp: ISO timestamp
    Returns:
-        List of Triple objects for the provenance (not the triple itself)
+        List of Triple objects for the provenance (including reification)
    """
    if timestamp is None:
        timestamp = datetime.utcnow().isoformat() + "Z"
@ -220,18 +217,24 @@ def triple_provenance_triples(
    act_uri = activity_uri()
    agt_uri = agent_uri(component_name)
-    # Note: The actual reification (tg:reifies pointing at the edge) requires
+    # Create the quoted triple term (RDF-star reification)
-    # RDF 1.2 triple term support. This builds the surrounding provenance.
+    triple_term = Term(type=TRIPLE, triple=extracted_triple)
    # The actual reification link must be handled by the knowledge extractor
    # using the graph store's reification API.
    triples = [
        # Reification: stmt_uri tg:reifies <<s p o>>
        Triple(
            s=_iri(stmt_uri),
            p=_iri(TG_REIFIES),
            o=triple_term
        ),
        # Statement provenance
        _triple(stmt_uri, PROV_WAS_DERIVED_FROM, _iri(chunk_uri)),
        _triple(stmt_uri, PROV_WAS_GENERATED_BY, _iri(act_uri)),
        # Activity
        _triple(act_uri, RDF_TYPE, _iri(PROV_ACTIVITY)),
        _triple(act_uri, RDFS_LABEL, _literal(f"{component_name} extraction")),
        _triple(act_uri, PROV_USED, _iri(chunk_uri)),
        _triple(act_uri, PROV_WAS_ASSOCIATED_WITH, _iri(agt_uri)),
        _triple(act_uri, PROV_STARTED_AT_TIME, _literal(timestamp)),
--- a/trustgraph-base/trustgraph/provenance/uris.py
+++ b/trustgraph-base/trustgraph/provenance/uris.py
@ -1,11 +1,12 @@
 """
 URI generation for provenance entities.
-URI patterns:
+Document IDs are already IRIs (e.g., https://trustgraph.ai/doc/abc123).
- Document:  https://trustgraph.ai/doc/{doc_id}
+Child entities (pages, chunks) append path segments to the parent IRI:
- Page:      https://trustgraph.ai/page/{doc_id}/p{page_number}
+- Document:  {doc_iri} (as provided)
- Chunk:     https://trustgraph.ai/chunk/{doc_id}/p{page}/c{chunk} (from page)
+- Page:      {doc_iri}/p{page_number}
-             https://trustgraph.ai/chunk/{doc_id}/c{chunk} (from text doc)
+- Chunk:     {page_iri}/c{chunk_index} (from page)
             {doc_iri}/c{chunk_index} (from text doc)
 - Activity:  https://trustgraph.ai/activity/{uuid}
 - Statement: https://trustgraph.ai/stmt/{uuid}
 """
@ -13,7 +14,7 @@ URI patterns:
 import uuid
 import urllib.parse
-# Base URI prefix
+# Base URI prefix for generated URIs (activities, statements, agents)
 TRUSTGRAPH_BASE = "https://trustgraph.ai"
@ -22,24 +23,24 @@ def _encode_id(id_str: str) -> str:
    return urllib.parse.quote(str(id_str), safe='')
-def document_uri(doc_id: str) -> str:
+def document_uri(doc_iri: str) -> str:
-    """Generate URI for a source document."""
+    """Return the document IRI as-is (already a full URI)."""
-    return f"{TRUSTGRAPH_BASE}/doc/{_encode_id(doc_id)}"
+    return doc_iri
-def page_uri(doc_id: str, page_number: int) -> str:
+def page_uri(doc_iri: str, page_number: int) -> str:
-    """Generate URI for a page extracted from a document."""
+    """Generate URI for a page by appending to document IRI."""
-    return f"{TRUSTGRAPH_BASE}/page/{_encode_id(doc_id)}/p{page_number}"
+    return f"{doc_iri}/p{page_number}"
-def chunk_uri_from_page(doc_id: str, page_number: int, chunk_index: int) -> str:
+def chunk_uri_from_page(doc_iri: str, page_number: int, chunk_index: int) -> str:
    """Generate URI for a chunk extracted from a page."""
-    return f"{TRUSTGRAPH_BASE}/chunk/{_encode_id(doc_id)}/p{page_number}/c{chunk_index}"
+    return f"{doc_iri}/p{page_number}/c{chunk_index}"
-def chunk_uri_from_doc(doc_id: str, chunk_index: int) -> str:
+def chunk_uri_from_doc(doc_iri: str, chunk_index: int) -> str:
    """Generate URI for a chunk extracted directly from a text document."""
-    return f"{TRUSTGRAPH_BASE}/chunk/{_encode_id(doc_id)}/c{chunk_index}"
+    return f"{doc_iri}/c{chunk_index}"
 def activity_uri(activity_id: str = None) -> str:
--- a/trustgraph-base/trustgraph/provenance/vocabulary.py
+++ b/trustgraph-base/trustgraph/provenance/vocabulary.py
@ -16,6 +16,9 @@ from . namespaces import (
    PROV_WAS_DERIVED_FROM, PROV_WAS_GENERATED_BY,
    PROV_USED, PROV_WAS_ASSOCIATED_WITH, PROV_STARTED_AT_TIME,
    DC_TITLE, DC_SOURCE, DC_DATE, DC_CREATOR,
    SCHEMA_SUBJECT_OF, SCHEMA_DIGITAL_DOCUMENT, SCHEMA_DESCRIPTION,
    SCHEMA_KEYWORDS, SCHEMA_NAME,
    SKOS_DEFINITION,
    TG_REIFIES, TG_PAGE_COUNT, TG_MIME_TYPE, TG_PAGE_NUMBER,
    TG_CHUNK_INDEX, TG_CHAR_OFFSET, TG_CHAR_LENGTH,
    TG_CHUNK_SIZE, TG_CHUNK_OVERLAP, TG_COMPONENT_VERSION,
@ -57,6 +60,20 @@ DC_PREDICATE_LABELS = [
    _label_triple(DC_CREATOR, "creator"),
 ]
 # Schema.org labels
 SCHEMA_LABELS = [
    _label_triple(SCHEMA_SUBJECT_OF, "subject of"),
    _label_triple(SCHEMA_DIGITAL_DOCUMENT, "Digital Document"),
    _label_triple(SCHEMA_DESCRIPTION, "description"),
    _label_triple(SCHEMA_KEYWORDS, "keywords"),
    _label_triple(SCHEMA_NAME, "name"),
 ]
 # SKOS labels
 SKOS_LABELS = [
    _label_triple(SKOS_DEFINITION, "definition"),
 ]
 # TrustGraph predicate labels
 TG_PREDICATE_LABELS = [
    _label_triple(TG_REIFIES, "reifies"),
@ -97,5 +114,7 @@ def get_vocabulary_triples() -> List[Triple]:
        PROV_CLASS_LABELS +
        PROV_PREDICATE_LABELS +
        DC_PREDICATE_LABELS +
        SCHEMA_LABELS +
        SKOS_LABELS +
        TG_PREDICATE_LABELS
    )
--- a/trustgraph-cli/trustgraph/cli/graph_to_turtle.py
+++ b/trustgraph-cli/trustgraph/cli/graph_to_turtle.py
@ -1,6 +1,6 @@
 """
 Connects to the graph query service and dumps all graph edges in Turtle
-format.
+format with RDF-star support for quoted triples.
 """
 import rdflib
@ -10,11 +10,37 @@ import argparse
 import os
 from trustgraph.api import Api, Uri
 from trustgraph.knowledge import QuotedTriple
 default_url = os.getenv("TRUSTGRAPH_URL", 'http://localhost:8088/')
 default_user = 'trustgraph'
 default_collection = 'default'
 def value_to_rdflib(val):
    """
    Map a TrustGraph value onto an rdflib term.
    Returns a URIRef for a Uri, a quoted-triple term (or a string Literal
    fallback) for a QuotedTriple, and a Literal for anything else.
    Returns None when a Uri contains a space, or when any component of a
    QuotedTriple itself converts to None.
    """
    if isinstance(val, Uri):
        # A URI containing a space is treated as malformed and dropped
        return None if " " in val else rdflib.term.URIRef(val)
    if not isinstance(val, QuotedTriple):
        return rdflib.term.Literal(str(val))
    # RDF-star quoted triple: convert each component recursively
    parts = (
        value_to_rdflib(val.s),
        value_to_rdflib(val.p),
        value_to_rdflib(val.o),
    )
    if any(part is None for part in parts):
        return None
    try:
        # Use rdflib's Triple term when the installed rdflib exposes one
        return rdflib.term.Triple(parts)
    except AttributeError:
        # No Triple term available - represent the quoted triple as a string
        return rdflib.term.Literal(f"<<{val.s} {val.p} {val.o}>>")
 def show_graph(url, flow_id, user, collection):
    api = Api(url).flow().id(flow_id)
@ -30,18 +56,10 @@ def show_graph(url, flow_id, user, collection):
        sv = rdflib.term.URIRef(row.s)
        pv = rdflib.term.URIRef(row.p)
        ov = value_to_rdflib(row.o)
-        if isinstance(row.o, Uri):
+        if ov is None:
-
+            continue
            # Skip malformed URLs with spaces in
            if " " in row.o:
                continue
            ov = rdflib.term.URIRef(row.o)
        else:
            ov = rdflib.term.Literal(row.o)
        g.add((sv, pv, ov))
--- a/trustgraph-flow/trustgraph/direct/milvus_graph_embeddings.py
+++ b/trustgraph-flow/trustgraph/direct/milvus_graph_embeddings.py
@ -90,8 +90,14 @@ class EntityVectors:
            max_length=65535,
        )
        chunk_id_field = FieldSchema(
            name="chunk_id",
            dtype=DataType.VARCHAR,
            max_length=65535,
        )
        schema = CollectionSchema(
-            fields = [pkey_field, vec_field, entity_field],
+            fields = [pkey_field, vec_field, entity_field, chunk_id_field],
            description = "Graph embedding schema",
        )
@ -119,7 +125,7 @@ class EntityVectors:
        self.collections[(dimension, user, collection)] = collection_name
        logger.info(f"Created Milvus collection {collection_name} with dimension {dimension}")
-    def insert(self, embeds, entity, user, collection):
+    def insert(self, embeds, entity, user, collection, chunk_id=""):
        dim = len(embeds)
@ -130,6 +136,7 @@ class EntityVectors:
            {
                "vector": embeds,
                "entity": entity,
                "chunk_id": chunk_id,
            }
        ]
--- a/trustgraph-flow/trustgraph/extract/kg/definitions/extract.py
+++ b/trustgraph-flow/trustgraph/extract/kg/definitions/extract.py
@ -18,7 +18,10 @@ from .... schema import PromptRequest, PromptResponse
 from .... rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL, SUBJECT_OF
 from .... base import FlowProcessor, ConsumerSpec,  ProducerSpec
-from .... base import PromptClientSpec
+from .... base import PromptClientSpec, ParameterSpec
 from .... provenance import statement_uri, triple_provenance_triples
 from .... flow_version import __version__ as COMPONENT_VERSION
 DEFINITION_VALUE = Term(type=IRI, iri=DEFINITION)
 RDF_LABEL_VALUE = Term(type=IRI, iri=RDF_LABEL)
@ -75,6 +78,10 @@ class Processor(FlowProcessor):
            )
        )
        # Optional flow parameters for provenance
        self.register_specification(ParameterSpec("llm-model"))
        self.register_specification(ParameterSpec("ontology"))
    def to_uri(self, text):
        part = text.replace(" ", "-").lower().encode("utf-8")
@ -132,6 +139,10 @@ class Processor(FlowProcessor):
            chunk_doc_id = v.document_id if v.document_id else v.metadata.id
            chunk_uri = v.metadata.id  # The URI form for the chunk
            # Get optional provenance parameters
            llm_model = flow("llm-model")
            ontology_uri = flow("ontology")
            # Note: Document metadata is now emitted once by librarian at processing
            # initiation, so we don't need to duplicate it here.
@ -157,9 +168,24 @@ class Processor(FlowProcessor):
                    o=Term(type=LITERAL, value=s),
                ))
-                triples.append(Triple(
+                # The definition triple - this is the main extracted fact
                definition_triple = Triple(
                    s=s_value, p=DEFINITION_VALUE, o=o_value
-                ))
+                )
                triples.append(definition_triple)
                # Generate provenance for the definition triple (reification)
                stmt_uri = statement_uri()
                prov_triples = triple_provenance_triples(
                    stmt_uri=stmt_uri,
                    extracted_triple=definition_triple,
                    chunk_uri=chunk_uri,
                    component_name=default_ident,
                    component_version=COMPONENT_VERSION,
                    llm_model=llm_model,
                    ontology_uri=ontology_uri,
                )
                triples.extend(prov_triples)
                # Link entity to chunk (not top-level document)
                triples.append(Triple(
--- a/trustgraph-flow/trustgraph/extract/kg/relationships/extract.py
+++ b/trustgraph-flow/trustgraph/extract/kg/relationships/extract.py
@ -18,7 +18,10 @@ from .... schema import PromptRequest, PromptResponse
 from .... rdf import RDF_LABEL, TRUSTGRAPH_ENTITIES, SUBJECT_OF
 from .... base import FlowProcessor, ConsumerSpec,  ProducerSpec
-from .... base import PromptClientSpec
+from .... base import PromptClientSpec, ParameterSpec
 from .... provenance import statement_uri, triple_provenance_triples
 from .... flow_version import __version__ as COMPONENT_VERSION
 RDF_LABEL_VALUE = Term(type=IRI, iri=RDF_LABEL)
 SUBJECT_OF_VALUE = Term(type=IRI, iri=SUBJECT_OF)
@ -65,6 +68,10 @@ class Processor(FlowProcessor):
            )
        )
        # Optional flow parameters for provenance
        self.register_specification(ParameterSpec("llm-model"))
        self.register_specification(ParameterSpec("ontology"))
    def to_uri(self, text):
        part = text.replace(" ", "-").lower().encode("utf-8")
@ -113,6 +120,10 @@ class Processor(FlowProcessor):
            chunk_doc_id = v.document_id if v.document_id else v.metadata.id
            chunk_uri = v.metadata.id  # The URI form for the chunk
            # Get optional provenance parameters
            llm_model = flow("llm-model")
            ontology_uri = flow("ontology")
            # Note: Document metadata is now emitted once by librarian at processing
            # initiation, so we don't need to duplicate it here.
@ -142,11 +153,26 @@ class Processor(FlowProcessor):
                else:
                    o_value = Term(type=LITERAL, value=str(o))
-                triples.append(Triple(
+                # The relationship triple - this is the main extracted fact
                relationship_triple = Triple(
                    s=s_value,
                    p=p_value,
                    o=o_value
-                ))
+                )
                triples.append(relationship_triple)
                # Generate provenance for the relationship triple (reification)
                stmt_uri = statement_uri()
                prov_triples = triple_provenance_triples(
                    stmt_uri=stmt_uri,
                    extracted_triple=relationship_triple,
                    chunk_uri=chunk_uri,
                    component_name=default_ident,
                    component_version=COMPONENT_VERSION,
                    llm_model=llm_model,
                    ontology_uri=ontology_uri,
                )
                triples.extend(prov_triples)
                # Label for s
                triples.append(Triple(
--- a/trustgraph-flow/trustgraph/query/triples/cassandra/service.py
+++ b/trustgraph-flow/trustgraph/query/triples/cassandra/service.py
@ -6,11 +6,13 @@ null.  Output is a list of quads.
 import logging
 import json
 from .... direct.cassandra_kg import (
    EntityCentricKnowledgeGraph, GRAPH_WILDCARD, DEFAULT_GRAPH
 )
 from .... schema import TriplesQueryRequest, TriplesQueryResponse, Error
-from .... schema import Term, Triple, IRI, LITERAL
+from .... schema import Term, Triple, IRI, LITERAL, TRIPLE
 from .... base import TriplesQueryService
 from .... base.cassandra_config import add_cassandra_args, resolve_cassandra_config
@ -33,6 +35,36 @@ def get_term_value(term):
        return term.id or term.value
 def deserialize_term(term_dict):
    """
    Deserialize a term from its JSON structure.
    Args:
        term_dict: Decoded JSON for one term: a dict with a "type" key plus
            "iri", or "value"/"datatype"/"language", or "triple".  None is
            passed through.
    Returns:
        A Term, or None when term_dict is None.  Input that is not a
        recognised IRI, literal or triple structure is returned as a
        literal Term holding str(term_dict).
    """
    if term_dict is None:
        return None
    if not isinstance(term_dict, dict):
        # Not a term structure at all; keep the value rather than crash
        return Term(type=LITERAL, value=str(term_dict))
    term_type = term_dict.get("type", "")
    if term_type == IRI:
        return Term(type=IRI, iri=term_dict.get("iri", ""))
    if term_type == LITERAL:
        return Term(
            type=LITERAL,
            value=term_dict.get("value", ""),
            datatype=term_dict.get("datatype", ""),
            language=term_dict.get("language", "")
        )
    if term_type == TRIPLE:
        nested = term_dict.get("triple")
        # A nested triple may arrive either as a dict or as a JSON-encoded
        # string (a serialized triple embedded inside the outer JSON).
        # Accept both forms so deeper quoted triples survive the round trip.
        if isinstance(nested, str):
            try:
                nested = json.loads(nested)
            except json.JSONDecodeError:
                nested = None
        if isinstance(nested, dict) and nested:
            # Recursive for nested triples
            return Term(
                type=TRIPLE,
                triple=Triple(
                    s=deserialize_term(nested.get("s")),
                    p=deserialize_term(nested.get("p")),
                    o=deserialize_term(nested.get("o")),
                )
            )
    # Fallback: unknown type, or a triple term without usable content
    return Term(type=LITERAL, value=str(term_dict))
 def create_term(value, otype=None, dtype=None, lang=None):
    """
    Create a Term from a string value, optionally using type metadata.
@ -57,8 +89,22 @@ def create_term(value, otype=None, dtype=None, lang=None):
                language=lang or ""
            )
        elif otype == 't':
-            # Triple/reification - treat as IRI for now
+            # Triple/reification - parse JSON and create nested Triple
-            return Term(type=IRI, iri=value)
+            try:
                triple_data = json.loads(value) if isinstance(value, str) else value
                if isinstance(triple_data, dict):
                    return Term(
                        type=TRIPLE,
                        triple=Triple(
                            s=deserialize_term(triple_data.get("s")),
                            p=deserialize_term(triple_data.get("p")),
                            o=deserialize_term(triple_data.get("o")),
                        )
                    )
            except (json.JSONDecodeError, TypeError) as e:
                logger.warning(f"Failed to parse triple JSON: {e}")
            # Fallback if parsing fails
            return Term(type=LITERAL, value=str(value))
        else:
            # Unknown otype, fall back to heuristic
            pass
--- a/trustgraph-flow/trustgraph/storage/graph_embeddings/milvus/write.py
+++ b/trustgraph-flow/trustgraph/storage/graph_embeddings/milvus/write.py
@ -57,7 +57,8 @@ class Processor(CollectionConfigHandler, GraphEmbeddingsStoreService):
                    self.vecstore.insert(
                        vec, entity_value,
                        message.metadata.user,
-                        message.metadata.collection
+                        message.metadata.collection,
                        chunk_id=entity.chunk_id or "",
                    )
    @staticmethod
--- a/trustgraph-flow/trustgraph/storage/graph_embeddings/pinecone/write.py
+++ b/trustgraph-flow/trustgraph/storage/graph_embeddings/pinecone/write.py
@ -137,11 +137,15 @@ class Processor(CollectionConfigHandler, GraphEmbeddingsStoreService):
                # Generate unique ID for each vector
                vector_id = str(uuid.uuid4())
                metadata = {"entity": entity_value}
                if entity.chunk_id:
                    metadata["chunk_id"] = entity.chunk_id
                records = [
                    {
                        "id": vector_id,
                        "values": vec,
-                        "metadata": { "entity": entity_value },
+                        "metadata": metadata,
                    }
                ]
--- a/trustgraph-flow/trustgraph/storage/graph_embeddings/qdrant/write.py
+++ b/trustgraph-flow/trustgraph/storage/graph_embeddings/qdrant/write.py
@ -90,15 +90,19 @@ class Processor(CollectionConfigHandler, GraphEmbeddingsStoreService):
                        )
                    )
                payload = {
                    "entity": entity_value,
                }
                if entity.chunk_id:
                    payload["chunk_id"] = entity.chunk_id
                self.qdrant.upsert(
                    collection_name=collection,
                    points=[
                        PointStruct(
                            id=str(uuid.uuid4()),
                            vector=vec,
-                            payload={
+                            payload=payload,
                                "entity": entity_value,
                            }
                        )
                    ]
                )
--- a/trustgraph-flow/trustgraph/storage/triples/cassandra/write.py
+++ b/trustgraph-flow/trustgraph/storage/triples/cassandra/write.py
@ -9,6 +9,7 @@ import os
 import argparse
 import time
 import logging
 import json
 from .... direct.cassandra_kg import (
    EntityCentricKnowledgeGraph, DEFAULT_GRAPH
@ -25,6 +26,37 @@ logger = logging.getLogger(__name__)
 default_ident = "triples-write"
 def serialize_triple(triple):
    """
    Serialize a Triple object to a JSON string for storage.
    Quoted triples nested inside a term are emitted as nested JSON objects
    under the "triple" key, so the whole structure is encoded exactly once
    and can be decoded with a single json.loads().
    Args:
        triple: Object with s, p and o term attributes, or None.
    Returns:
        JSON string with "s", "p" and "o" keys, or None if triple is None.
    """
    if triple is None:
        return None
    def term_to_dict(term):
        # Convert one term to a plain dict; None stays None
        if term is None:
            return None
        result = {"type": term.type}
        if term.type == IRI:
            result["iri"] = term.iri
        elif term.type == LITERAL:
            result["value"] = term.value
            # Optional literal attributes are only written when set
            if term.datatype:
                result["datatype"] = term.datatype
            if term.language:
                result["language"] = term.language
        elif term.type == BLANK:
            result["id"] = term.id
        elif term.type == TRIPLE:
            # Keep the nested triple as a dict; embedding a JSON string
            # here would double-encode it
            result["triple"] = triple_to_dict(term.triple)
        return result
    def triple_to_dict(inner):
        # Convert a triple to a plain dict of its three terms
        if inner is None:
            return None
        return {
            "s": term_to_dict(inner.s),
            "p": term_to_dict(inner.p),
            "o": term_to_dict(inner.o),
        }
    return json.dumps(triple_to_dict(triple))
 def get_term_value(term):
    """Extract the string value from a Term"""
    if term is None:
@ -33,6 +65,9 @@ def get_term_value(term):
        return term.iri
    elif term.type == LITERAL:
        return term.value
    elif term.type == TRIPLE:
        # Serialize nested triple as JSON
        return serialize_triple(term.triple)
    else:
        # For blank nodes or other types, use id or value
        return term.id or term.value