mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-04-27 17:36:23 +02:00
Fix/extraction prov (#662)
Quoted triple fixes, including:
1. Updated triple_provenance_triples() in triples.py:
- Now accepts a Triple object directly
- Creates the reification triple using the TRIPLE term type: stmt_uri tg:reifies <<extracted_triple>>
- Includes it in the returned provenance triples
2. Updated definitions extractor:
- Added imports for provenance functions and component version
- Registered ParameterSpec entries for the optional llm-model and ontology flow parameters
- For each definition triple, generates provenance with reification
3. Updated relationships extractor:
- Same changes as the definitions extractor (provenance imports, ParameterSpec registrations, and per-triple provenance with reification)
This commit is contained in:
parent
cd5580be59
commit
2b9232917c
19 changed files with 361 additions and 72 deletions
|
|
@ -9,26 +9,45 @@ including LLM operations, RAG queries, knowledge graph management, and more.
|
||||||
import json
|
import json
|
||||||
import base64
|
import base64
|
||||||
|
|
||||||
from .. knowledge import hash, Uri, Literal
|
from .. knowledge import hash, Uri, Literal, QuotedTriple
|
||||||
from .. schema import IRI, LITERAL
|
from .. schema import IRI, LITERAL, TRIPLE
|
||||||
from . types import Triple
|
from . types import Triple
|
||||||
from . exceptions import ProtocolException
|
from . exceptions import ProtocolException
|
||||||
|
|
||||||
|
|
||||||
def to_value(x):
|
def to_value(x):
|
||||||
"""Convert wire format to Uri or Literal."""
|
"""Convert wire format to Uri, Literal, or QuotedTriple."""
|
||||||
if x.get("t") == IRI:
|
if x.get("t") == IRI:
|
||||||
return Uri(x.get("i", ""))
|
return Uri(x.get("i", ""))
|
||||||
elif x.get("t") == LITERAL:
|
elif x.get("t") == LITERAL:
|
||||||
return Literal(x.get("v", ""))
|
return Literal(x.get("v", ""))
|
||||||
|
elif x.get("t") == TRIPLE:
|
||||||
|
# Wire format uses "tr" key for nested triple dict
|
||||||
|
triple_data = x.get("tr")
|
||||||
|
if triple_data:
|
||||||
|
return QuotedTriple(
|
||||||
|
s=to_value(triple_data.get("s", {})),
|
||||||
|
p=to_value(triple_data.get("p", {})),
|
||||||
|
o=to_value(triple_data.get("o", {})),
|
||||||
|
)
|
||||||
|
return Literal("")
|
||||||
# Fallback for any other type
|
# Fallback for any other type
|
||||||
return Literal(x.get("v", x.get("i", "")))
|
return Literal(x.get("v", x.get("i", "")))
|
||||||
|
|
||||||
|
|
||||||
def from_value(v):
|
def from_value(v):
|
||||||
"""Convert Uri or Literal to wire format."""
|
"""Convert Uri, Literal, or QuotedTriple to wire format."""
|
||||||
if isinstance(v, Uri):
|
if isinstance(v, Uri):
|
||||||
return {"t": IRI, "i": str(v)}
|
return {"t": IRI, "i": str(v)}
|
||||||
|
elif isinstance(v, QuotedTriple):
|
||||||
|
return {
|
||||||
|
"t": TRIPLE,
|
||||||
|
"tr": {
|
||||||
|
"s": from_value(v.s),
|
||||||
|
"p": from_value(v.p),
|
||||||
|
"o": from_value(v.o),
|
||||||
|
}
|
||||||
|
}
|
||||||
else:
|
else:
|
||||||
return {"t": LITERAL, "v": str(v)}
|
return {"t": LITERAL, "v": str(v)}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -9,17 +9,27 @@ into flows for use in queries and RAG operations.
|
||||||
import json
|
import json
|
||||||
import base64
|
import base64
|
||||||
|
|
||||||
from .. knowledge import hash, Uri, Literal
|
from .. knowledge import hash, Uri, Literal, QuotedTriple
|
||||||
from .. schema import IRI, LITERAL
|
from .. schema import IRI, LITERAL, TRIPLE
|
||||||
from . types import Triple
|
from . types import Triple
|
||||||
|
|
||||||
|
|
||||||
def to_value(x):
|
def to_value(x):
|
||||||
"""Convert wire format to Uri or Literal."""
|
"""Convert wire format to Uri, Literal, or QuotedTriple."""
|
||||||
if x.get("t") == IRI:
|
if x.get("t") == IRI:
|
||||||
return Uri(x.get("i", ""))
|
return Uri(x.get("i", ""))
|
||||||
elif x.get("t") == LITERAL:
|
elif x.get("t") == LITERAL:
|
||||||
return Literal(x.get("v", ""))
|
return Literal(x.get("v", ""))
|
||||||
|
elif x.get("t") == TRIPLE:
|
||||||
|
# Wire format uses "tr" key for nested triple dict
|
||||||
|
triple_data = x.get("tr")
|
||||||
|
if triple_data:
|
||||||
|
return QuotedTriple(
|
||||||
|
s=to_value(triple_data.get("s", {})),
|
||||||
|
p=to_value(triple_data.get("p", {})),
|
||||||
|
o=to_value(triple_data.get("o", {})),
|
||||||
|
)
|
||||||
|
return Literal("")
|
||||||
# Fallback for any other type
|
# Fallback for any other type
|
||||||
return Literal(x.get("v", x.get("i", "")))
|
return Literal(x.get("v", x.get("i", "")))
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -12,8 +12,8 @@ import base64
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
from . types import DocumentMetadata, ProcessingMetadata, Triple
|
from . types import DocumentMetadata, ProcessingMetadata, Triple
|
||||||
from .. knowledge import hash, Uri, Literal
|
from .. knowledge import hash, Uri, Literal, QuotedTriple
|
||||||
from .. schema import IRI, LITERAL
|
from .. schema import IRI, LITERAL, TRIPLE
|
||||||
from . exceptions import *
|
from . exceptions import *
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
@ -27,19 +27,38 @@ DEFAULT_CHUNK_SIZE = 5 * 1024 * 1024
|
||||||
|
|
||||||
|
|
||||||
def to_value(x):
|
def to_value(x):
|
||||||
"""Convert wire format to Uri or Literal."""
|
"""Convert wire format to Uri, Literal, or QuotedTriple."""
|
||||||
if x.get("t") == IRI:
|
if x.get("t") == IRI:
|
||||||
return Uri(x.get("i", ""))
|
return Uri(x.get("i", ""))
|
||||||
elif x.get("t") == LITERAL:
|
elif x.get("t") == LITERAL:
|
||||||
return Literal(x.get("v", ""))
|
return Literal(x.get("v", ""))
|
||||||
|
elif x.get("t") == TRIPLE:
|
||||||
|
# Wire format uses "tr" key for nested triple dict
|
||||||
|
triple_data = x.get("tr")
|
||||||
|
if triple_data:
|
||||||
|
return QuotedTriple(
|
||||||
|
s=to_value(triple_data.get("s", {})),
|
||||||
|
p=to_value(triple_data.get("p", {})),
|
||||||
|
o=to_value(triple_data.get("o", {})),
|
||||||
|
)
|
||||||
|
return Literal("")
|
||||||
# Fallback for any other type
|
# Fallback for any other type
|
||||||
return Literal(x.get("v", x.get("i", "")))
|
return Literal(x.get("v", x.get("i", "")))
|
||||||
|
|
||||||
|
|
||||||
def from_value(v):
|
def from_value(v):
|
||||||
"""Convert Uri or Literal to wire format."""
|
"""Convert Uri, Literal, or QuotedTriple to wire format."""
|
||||||
if isinstance(v, Uri):
|
if isinstance(v, Uri):
|
||||||
return {"t": IRI, "i": str(v)}
|
return {"t": IRI, "i": str(v)}
|
||||||
|
elif isinstance(v, QuotedTriple):
|
||||||
|
return {
|
||||||
|
"t": TRIPLE,
|
||||||
|
"tr": {
|
||||||
|
"s": from_value(v.s),
|
||||||
|
"p": from_value(v.p),
|
||||||
|
"o": from_value(v.o),
|
||||||
|
}
|
||||||
|
}
|
||||||
else:
|
else:
|
||||||
return {"t": LITERAL, "v": str(v)}
|
return {"t": LITERAL, "v": str(v)}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -12,7 +12,7 @@ import logging
|
||||||
import base64
|
import base64
|
||||||
import types
|
import types
|
||||||
from dataclasses import asdict, is_dataclass
|
from dataclasses import asdict, is_dataclass
|
||||||
from typing import Any
|
from typing import Any, get_type_hints
|
||||||
|
|
||||||
from .backend import PubSubBackend, BackendProducer, BackendConsumer, Message
|
from .backend import PubSubBackend, BackendProducer, BackendConsumer, Message
|
||||||
|
|
||||||
|
|
@ -58,6 +58,7 @@ def dict_to_dataclass(data: dict, cls: type) -> Any:
|
||||||
Convert a dictionary back to a dataclass instance.
|
Convert a dictionary back to a dataclass instance.
|
||||||
|
|
||||||
Handles nested dataclasses and missing fields.
|
Handles nested dataclasses and missing fields.
|
||||||
|
Uses get_type_hints() to resolve forward references (string annotations).
|
||||||
"""
|
"""
|
||||||
if data is None:
|
if data is None:
|
||||||
return None
|
return None
|
||||||
|
|
@ -65,8 +66,13 @@ def dict_to_dataclass(data: dict, cls: type) -> Any:
|
||||||
if not is_dataclass(cls):
|
if not is_dataclass(cls):
|
||||||
return data
|
return data
|
||||||
|
|
||||||
# Get field types from the dataclass
|
# Get field types from the dataclass, resolving forward references
|
||||||
field_types = {f.name: f.type for f in cls.__dataclass_fields__.values()}
|
# get_type_hints() evaluates string annotations like "Triple | None"
|
||||||
|
try:
|
||||||
|
field_types = get_type_hints(cls)
|
||||||
|
except Exception:
|
||||||
|
# Fallback if get_type_hints fails (shouldn't happen normally)
|
||||||
|
field_types = {f.name: f.type for f in cls.__dataclass_fields__.values()}
|
||||||
kwargs = {}
|
kwargs = {}
|
||||||
|
|
||||||
for key, value in data.items():
|
for key, value in data.items():
|
||||||
|
|
|
||||||
|
|
@ -26,8 +26,40 @@ KEYWORD = 'https://schema.org/keywords'
|
||||||
class Uri(str):
|
class Uri(str):
|
||||||
def is_uri(self): return True
|
def is_uri(self): return True
|
||||||
def is_literal(self): return False
|
def is_literal(self): return False
|
||||||
|
def is_triple(self): return False
|
||||||
|
|
||||||
class Literal(str):
|
class Literal(str):
|
||||||
def is_uri(self): return False
|
def is_uri(self): return False
|
||||||
def is_literal(self): return True
|
def is_literal(self): return True
|
||||||
|
def is_triple(self): return False
|
||||||
|
|
||||||
|
class QuotedTriple:
|
||||||
|
"""
|
||||||
|
RDF-star quoted triple (reification).
|
||||||
|
|
||||||
|
Represents a triple that can be used as the object of another triple,
|
||||||
|
enabling statements about statements.
|
||||||
|
|
||||||
|
Example:
|
||||||
|
# stmt:123 tg:reifies <<:Hope skos:definition "A feeling...">>
|
||||||
|
qt = QuotedTriple(
|
||||||
|
s=Uri("https://example.org/Hope"),
|
||||||
|
p=Uri("http://www.w3.org/2004/02/skos/core#definition"),
|
||||||
|
o=Literal("A feeling of expectation")
|
||||||
|
)
|
||||||
|
"""
|
||||||
|
def __init__(self, s, p, o):
|
||||||
|
self.s = s # Uri, Literal, or QuotedTriple
|
||||||
|
self.p = p # Uri
|
||||||
|
self.o = o # Uri, Literal, or QuotedTriple
|
||||||
|
|
||||||
|
def is_uri(self): return False
|
||||||
|
def is_literal(self): return False
|
||||||
|
def is_triple(self): return True
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
return f"<<{self.s} {self.p} {self.o}>>"
|
||||||
|
|
||||||
|
def __str__(self):
|
||||||
|
return f"<<{self.s} {self.p} {self.o}>>"
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -82,6 +82,7 @@ def _triple_translator_to_pulsar(data: Dict[str, Any]) -> Triple:
|
||||||
|
|
||||||
|
|
||||||
def _triple_translator_from_pulsar(obj: Triple) -> Dict[str, Any]:
|
def _triple_translator_from_pulsar(obj: Triple) -> Dict[str, Any]:
|
||||||
|
"""Convert Triple object to wire format dict."""
|
||||||
term_translator = TermTranslator()
|
term_translator = TermTranslator()
|
||||||
result: Dict[str, Any] = {}
|
result: Dict[str, Any] = {}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -28,6 +28,18 @@ RDF_TYPE = RDF + "type"
|
||||||
RDFS = "http://www.w3.org/2000/01/rdf-schema#"
|
RDFS = "http://www.w3.org/2000/01/rdf-schema#"
|
||||||
RDFS_LABEL = RDFS + "label"
|
RDFS_LABEL = RDFS + "label"
|
||||||
|
|
||||||
|
# Schema.org namespace
|
||||||
|
SCHEMA = "https://schema.org/"
|
||||||
|
SCHEMA_SUBJECT_OF = SCHEMA + "subjectOf"
|
||||||
|
SCHEMA_DIGITAL_DOCUMENT = SCHEMA + "DigitalDocument"
|
||||||
|
SCHEMA_DESCRIPTION = SCHEMA + "description"
|
||||||
|
SCHEMA_KEYWORDS = SCHEMA + "keywords"
|
||||||
|
SCHEMA_NAME = SCHEMA + "name"
|
||||||
|
|
||||||
|
# SKOS namespace
|
||||||
|
SKOS = "http://www.w3.org/2004/02/skos/core#"
|
||||||
|
SKOS_DEFINITION = SKOS + "definition"
|
||||||
|
|
||||||
# TrustGraph namespace for custom predicates
|
# TrustGraph namespace for custom predicates
|
||||||
TG = "https://trustgraph.ai/ns/"
|
TG = "https://trustgraph.ai/ns/"
|
||||||
TG_REIFIES = TG + "reifies"
|
TG_REIFIES = TG + "reifies"
|
||||||
|
|
|
||||||
|
|
@ -5,7 +5,7 @@ Helper functions to build PROV-O triples for extraction-time provenance.
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from typing import List, Optional
|
from typing import List, Optional
|
||||||
|
|
||||||
from .. schema import Triple, Term, IRI, LITERAL
|
from .. schema import Triple, Term, IRI, LITERAL, TRIPLE
|
||||||
|
|
||||||
from . namespaces import (
|
from . namespaces import (
|
||||||
RDF_TYPE, RDFS_LABEL,
|
RDF_TYPE, RDFS_LABEL,
|
||||||
|
|
@ -145,6 +145,7 @@ def derived_entity_triples(
|
||||||
|
|
||||||
# Activity declaration
|
# Activity declaration
|
||||||
_triple(act_uri, RDF_TYPE, _iri(PROV_ACTIVITY)),
|
_triple(act_uri, RDF_TYPE, _iri(PROV_ACTIVITY)),
|
||||||
|
_triple(act_uri, RDFS_LABEL, _literal(f"{component_name} extraction")),
|
||||||
_triple(act_uri, PROV_USED, _iri(parent_uri)),
|
_triple(act_uri, PROV_USED, _iri(parent_uri)),
|
||||||
_triple(act_uri, PROV_WAS_ASSOCIATED_WITH, _iri(agt_uri)),
|
_triple(act_uri, PROV_WAS_ASSOCIATED_WITH, _iri(agt_uri)),
|
||||||
_triple(act_uri, PROV_STARTED_AT_TIME, _literal(timestamp)),
|
_triple(act_uri, PROV_STARTED_AT_TIME, _literal(timestamp)),
|
||||||
|
|
@ -181,9 +182,7 @@ def derived_entity_triples(
|
||||||
|
|
||||||
def triple_provenance_triples(
|
def triple_provenance_triples(
|
||||||
stmt_uri: str,
|
stmt_uri: str,
|
||||||
subject_uri: str,
|
extracted_triple: Triple,
|
||||||
predicate_uri: str,
|
|
||||||
object_term: Term,
|
|
||||||
chunk_uri: str,
|
chunk_uri: str,
|
||||||
component_name: str,
|
component_name: str,
|
||||||
component_version: str,
|
component_version: str,
|
||||||
|
|
@ -195,15 +194,13 @@ def triple_provenance_triples(
|
||||||
Build provenance triples for an extracted knowledge triple using reification.
|
Build provenance triples for an extracted knowledge triple using reification.
|
||||||
|
|
||||||
Creates:
|
Creates:
|
||||||
- Statement object that reifies the triple
|
- Reification triple: stmt_uri tg:reifies <<extracted_triple>>
|
||||||
- wasDerivedFrom link to source chunk
|
- wasDerivedFrom link to source chunk
|
||||||
- Activity and agent metadata
|
- Activity and agent metadata
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
stmt_uri: URI for the reified statement
|
stmt_uri: URI for the reified statement
|
||||||
subject_uri: Subject of the extracted triple
|
extracted_triple: The extracted Triple to reify
|
||||||
predicate_uri: Predicate of the extracted triple
|
|
||||||
object_term: Object of the extracted triple (Term)
|
|
||||||
chunk_uri: URI of source chunk
|
chunk_uri: URI of source chunk
|
||||||
component_name: Name of extractor component
|
component_name: Name of extractor component
|
||||||
component_version: Version of the component
|
component_version: Version of the component
|
||||||
|
|
@ -212,7 +209,7 @@ def triple_provenance_triples(
|
||||||
timestamp: ISO timestamp
|
timestamp: ISO timestamp
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
List of Triple objects for the provenance (not the triple itself)
|
List of Triple objects for the provenance (including reification)
|
||||||
"""
|
"""
|
||||||
if timestamp is None:
|
if timestamp is None:
|
||||||
timestamp = datetime.utcnow().isoformat() + "Z"
|
timestamp = datetime.utcnow().isoformat() + "Z"
|
||||||
|
|
@ -220,18 +217,24 @@ def triple_provenance_triples(
|
||||||
act_uri = activity_uri()
|
act_uri = activity_uri()
|
||||||
agt_uri = agent_uri(component_name)
|
agt_uri = agent_uri(component_name)
|
||||||
|
|
||||||
# Note: The actual reification (tg:reifies pointing at the edge) requires
|
# Create the quoted triple term (RDF-star reification)
|
||||||
# RDF 1.2 triple term support. This builds the surrounding provenance.
|
triple_term = Term(type=TRIPLE, triple=extracted_triple)
|
||||||
# The actual reification link must be handled by the knowledge extractor
|
|
||||||
# using the graph store's reification API.
|
|
||||||
|
|
||||||
triples = [
|
triples = [
|
||||||
|
# Reification: stmt_uri tg:reifies <<s p o>>
|
||||||
|
Triple(
|
||||||
|
s=_iri(stmt_uri),
|
||||||
|
p=_iri(TG_REIFIES),
|
||||||
|
o=triple_term
|
||||||
|
),
|
||||||
|
|
||||||
# Statement provenance
|
# Statement provenance
|
||||||
_triple(stmt_uri, PROV_WAS_DERIVED_FROM, _iri(chunk_uri)),
|
_triple(stmt_uri, PROV_WAS_DERIVED_FROM, _iri(chunk_uri)),
|
||||||
_triple(stmt_uri, PROV_WAS_GENERATED_BY, _iri(act_uri)),
|
_triple(stmt_uri, PROV_WAS_GENERATED_BY, _iri(act_uri)),
|
||||||
|
|
||||||
# Activity
|
# Activity
|
||||||
_triple(act_uri, RDF_TYPE, _iri(PROV_ACTIVITY)),
|
_triple(act_uri, RDF_TYPE, _iri(PROV_ACTIVITY)),
|
||||||
|
_triple(act_uri, RDFS_LABEL, _literal(f"{component_name} extraction")),
|
||||||
_triple(act_uri, PROV_USED, _iri(chunk_uri)),
|
_triple(act_uri, PROV_USED, _iri(chunk_uri)),
|
||||||
_triple(act_uri, PROV_WAS_ASSOCIATED_WITH, _iri(agt_uri)),
|
_triple(act_uri, PROV_WAS_ASSOCIATED_WITH, _iri(agt_uri)),
|
||||||
_triple(act_uri, PROV_STARTED_AT_TIME, _literal(timestamp)),
|
_triple(act_uri, PROV_STARTED_AT_TIME, _literal(timestamp)),
|
||||||
|
|
|
||||||
|
|
@ -1,11 +1,12 @@
|
||||||
"""
|
"""
|
||||||
URI generation for provenance entities.
|
URI generation for provenance entities.
|
||||||
|
|
||||||
URI patterns:
|
Document IDs are already IRIs (e.g., https://trustgraph.ai/doc/abc123).
|
||||||
- Document: https://trustgraph.ai/doc/{doc_id}
|
Child entities (pages, chunks) append path segments to the parent IRI:
|
||||||
- Page: https://trustgraph.ai/page/{doc_id}/p{page_number}
|
- Document: {doc_iri} (as provided)
|
||||||
- Chunk: https://trustgraph.ai/chunk/{doc_id}/p{page}/c{chunk} (from page)
|
- Page: {doc_iri}/p{page_number}
|
||||||
https://trustgraph.ai/chunk/{doc_id}/c{chunk} (from text doc)
|
- Chunk: {page_iri}/c{chunk_index} (from page)
|
||||||
|
{doc_iri}/c{chunk_index} (from text doc)
|
||||||
- Activity: https://trustgraph.ai/activity/{uuid}
|
- Activity: https://trustgraph.ai/activity/{uuid}
|
||||||
- Statement: https://trustgraph.ai/stmt/{uuid}
|
- Statement: https://trustgraph.ai/stmt/{uuid}
|
||||||
"""
|
"""
|
||||||
|
|
@ -13,7 +14,7 @@ URI patterns:
|
||||||
import uuid
|
import uuid
|
||||||
import urllib.parse
|
import urllib.parse
|
||||||
|
|
||||||
# Base URI prefix
|
# Base URI prefix for generated URIs (activities, statements, agents)
|
||||||
TRUSTGRAPH_BASE = "https://trustgraph.ai"
|
TRUSTGRAPH_BASE = "https://trustgraph.ai"
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -22,24 +23,24 @@ def _encode_id(id_str: str) -> str:
|
||||||
return urllib.parse.quote(str(id_str), safe='')
|
return urllib.parse.quote(str(id_str), safe='')
|
||||||
|
|
||||||
|
|
||||||
def document_uri(doc_id: str) -> str:
|
def document_uri(doc_iri: str) -> str:
|
||||||
"""Generate URI for a source document."""
|
"""Return the document IRI as-is (already a full URI)."""
|
||||||
return f"{TRUSTGRAPH_BASE}/doc/{_encode_id(doc_id)}"
|
return doc_iri
|
||||||
|
|
||||||
|
|
||||||
def page_uri(doc_id: str, page_number: int) -> str:
|
def page_uri(doc_iri: str, page_number: int) -> str:
|
||||||
"""Generate URI for a page extracted from a document."""
|
"""Generate URI for a page by appending to document IRI."""
|
||||||
return f"{TRUSTGRAPH_BASE}/page/{_encode_id(doc_id)}/p{page_number}"
|
return f"{doc_iri}/p{page_number}"
|
||||||
|
|
||||||
|
|
||||||
def chunk_uri_from_page(doc_id: str, page_number: int, chunk_index: int) -> str:
|
def chunk_uri_from_page(doc_iri: str, page_number: int, chunk_index: int) -> str:
|
||||||
"""Generate URI for a chunk extracted from a page."""
|
"""Generate URI for a chunk extracted from a page."""
|
||||||
return f"{TRUSTGRAPH_BASE}/chunk/{_encode_id(doc_id)}/p{page_number}/c{chunk_index}"
|
return f"{doc_iri}/p{page_number}/c{chunk_index}"
|
||||||
|
|
||||||
|
|
||||||
def chunk_uri_from_doc(doc_id: str, chunk_index: int) -> str:
|
def chunk_uri_from_doc(doc_iri: str, chunk_index: int) -> str:
|
||||||
"""Generate URI for a chunk extracted directly from a text document."""
|
"""Generate URI for a chunk extracted directly from a text document."""
|
||||||
return f"{TRUSTGRAPH_BASE}/chunk/{_encode_id(doc_id)}/c{chunk_index}"
|
return f"{doc_iri}/c{chunk_index}"
|
||||||
|
|
||||||
|
|
||||||
def activity_uri(activity_id: str = None) -> str:
|
def activity_uri(activity_id: str = None) -> str:
|
||||||
|
|
|
||||||
|
|
@ -16,6 +16,9 @@ from . namespaces import (
|
||||||
PROV_WAS_DERIVED_FROM, PROV_WAS_GENERATED_BY,
|
PROV_WAS_DERIVED_FROM, PROV_WAS_GENERATED_BY,
|
||||||
PROV_USED, PROV_WAS_ASSOCIATED_WITH, PROV_STARTED_AT_TIME,
|
PROV_USED, PROV_WAS_ASSOCIATED_WITH, PROV_STARTED_AT_TIME,
|
||||||
DC_TITLE, DC_SOURCE, DC_DATE, DC_CREATOR,
|
DC_TITLE, DC_SOURCE, DC_DATE, DC_CREATOR,
|
||||||
|
SCHEMA_SUBJECT_OF, SCHEMA_DIGITAL_DOCUMENT, SCHEMA_DESCRIPTION,
|
||||||
|
SCHEMA_KEYWORDS, SCHEMA_NAME,
|
||||||
|
SKOS_DEFINITION,
|
||||||
TG_REIFIES, TG_PAGE_COUNT, TG_MIME_TYPE, TG_PAGE_NUMBER,
|
TG_REIFIES, TG_PAGE_COUNT, TG_MIME_TYPE, TG_PAGE_NUMBER,
|
||||||
TG_CHUNK_INDEX, TG_CHAR_OFFSET, TG_CHAR_LENGTH,
|
TG_CHUNK_INDEX, TG_CHAR_OFFSET, TG_CHAR_LENGTH,
|
||||||
TG_CHUNK_SIZE, TG_CHUNK_OVERLAP, TG_COMPONENT_VERSION,
|
TG_CHUNK_SIZE, TG_CHUNK_OVERLAP, TG_COMPONENT_VERSION,
|
||||||
|
|
@ -57,6 +60,20 @@ DC_PREDICATE_LABELS = [
|
||||||
_label_triple(DC_CREATOR, "creator"),
|
_label_triple(DC_CREATOR, "creator"),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# Schema.org labels
|
||||||
|
SCHEMA_LABELS = [
|
||||||
|
_label_triple(SCHEMA_SUBJECT_OF, "subject of"),
|
||||||
|
_label_triple(SCHEMA_DIGITAL_DOCUMENT, "Digital Document"),
|
||||||
|
_label_triple(SCHEMA_DESCRIPTION, "description"),
|
||||||
|
_label_triple(SCHEMA_KEYWORDS, "keywords"),
|
||||||
|
_label_triple(SCHEMA_NAME, "name"),
|
||||||
|
]
|
||||||
|
|
||||||
|
# SKOS labels
|
||||||
|
SKOS_LABELS = [
|
||||||
|
_label_triple(SKOS_DEFINITION, "definition"),
|
||||||
|
]
|
||||||
|
|
||||||
# TrustGraph predicate labels
|
# TrustGraph predicate labels
|
||||||
TG_PREDICATE_LABELS = [
|
TG_PREDICATE_LABELS = [
|
||||||
_label_triple(TG_REIFIES, "reifies"),
|
_label_triple(TG_REIFIES, "reifies"),
|
||||||
|
|
@ -97,5 +114,7 @@ def get_vocabulary_triples() -> List[Triple]:
|
||||||
PROV_CLASS_LABELS +
|
PROV_CLASS_LABELS +
|
||||||
PROV_PREDICATE_LABELS +
|
PROV_PREDICATE_LABELS +
|
||||||
DC_PREDICATE_LABELS +
|
DC_PREDICATE_LABELS +
|
||||||
|
SCHEMA_LABELS +
|
||||||
|
SKOS_LABELS +
|
||||||
TG_PREDICATE_LABELS
|
TG_PREDICATE_LABELS
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
"""
|
"""
|
||||||
Connects to the graph query service and dumps all graph edges in Turtle
|
Connects to the graph query service and dumps all graph edges in Turtle
|
||||||
format.
|
format with RDF-star support for quoted triples.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import rdflib
|
import rdflib
|
||||||
|
|
@ -10,11 +10,37 @@ import argparse
|
||||||
import os
|
import os
|
||||||
|
|
||||||
from trustgraph.api import Api, Uri
|
from trustgraph.api import Api, Uri
|
||||||
|
from trustgraph.knowledge import QuotedTriple
|
||||||
|
|
||||||
default_url = os.getenv("TRUSTGRAPH_URL", 'http://localhost:8088/')
|
default_url = os.getenv("TRUSTGRAPH_URL", 'http://localhost:8088/')
|
||||||
default_user = 'trustgraph'
|
default_user = 'trustgraph'
|
||||||
default_collection = 'default'
|
default_collection = 'default'
|
||||||
|
|
||||||
|
|
||||||
|
def value_to_rdflib(val):
|
||||||
|
"""Convert a TrustGraph value to an rdflib term."""
|
||||||
|
if isinstance(val, Uri):
|
||||||
|
# Skip malformed URLs with spaces
|
||||||
|
if " " in val:
|
||||||
|
return None
|
||||||
|
return rdflib.term.URIRef(val)
|
||||||
|
elif isinstance(val, QuotedTriple):
|
||||||
|
# RDF-star quoted triple
|
||||||
|
s_term = value_to_rdflib(val.s)
|
||||||
|
p_term = value_to_rdflib(val.p)
|
||||||
|
o_term = value_to_rdflib(val.o)
|
||||||
|
if s_term is None or p_term is None or o_term is None:
|
||||||
|
return None
|
||||||
|
# rdflib 6.x+ supports Triple as a term type
|
||||||
|
try:
|
||||||
|
return rdflib.term.Triple((s_term, p_term, o_term))
|
||||||
|
except AttributeError:
|
||||||
|
# Fallback for older rdflib versions - represent as string
|
||||||
|
return rdflib.term.Literal(f"<<{val.s} {val.p} {val.o}>>")
|
||||||
|
else:
|
||||||
|
return rdflib.term.Literal(str(val))
|
||||||
|
|
||||||
|
|
||||||
def show_graph(url, flow_id, user, collection):
|
def show_graph(url, flow_id, user, collection):
|
||||||
|
|
||||||
api = Api(url).flow().id(flow_id)
|
api = Api(url).flow().id(flow_id)
|
||||||
|
|
@ -30,18 +56,10 @@ def show_graph(url, flow_id, user, collection):
|
||||||
|
|
||||||
sv = rdflib.term.URIRef(row.s)
|
sv = rdflib.term.URIRef(row.s)
|
||||||
pv = rdflib.term.URIRef(row.p)
|
pv = rdflib.term.URIRef(row.p)
|
||||||
|
ov = value_to_rdflib(row.o)
|
||||||
|
|
||||||
if isinstance(row.o, Uri):
|
if ov is None:
|
||||||
|
continue
|
||||||
# Skip malformed URLs with spaces in
|
|
||||||
if " " in row.o:
|
|
||||||
continue
|
|
||||||
|
|
||||||
ov = rdflib.term.URIRef(row.o)
|
|
||||||
|
|
||||||
else:
|
|
||||||
|
|
||||||
ov = rdflib.term.Literal(row.o)
|
|
||||||
|
|
||||||
g.add((sv, pv, ov))
|
g.add((sv, pv, ov))
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -90,8 +90,14 @@ class EntityVectors:
|
||||||
max_length=65535,
|
max_length=65535,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
chunk_id_field = FieldSchema(
|
||||||
|
name="chunk_id",
|
||||||
|
dtype=DataType.VARCHAR,
|
||||||
|
max_length=65535,
|
||||||
|
)
|
||||||
|
|
||||||
schema = CollectionSchema(
|
schema = CollectionSchema(
|
||||||
fields = [pkey_field, vec_field, entity_field],
|
fields = [pkey_field, vec_field, entity_field, chunk_id_field],
|
||||||
description = "Graph embedding schema",
|
description = "Graph embedding schema",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -119,7 +125,7 @@ class EntityVectors:
|
||||||
self.collections[(dimension, user, collection)] = collection_name
|
self.collections[(dimension, user, collection)] = collection_name
|
||||||
logger.info(f"Created Milvus collection {collection_name} with dimension {dimension}")
|
logger.info(f"Created Milvus collection {collection_name} with dimension {dimension}")
|
||||||
|
|
||||||
def insert(self, embeds, entity, user, collection):
|
def insert(self, embeds, entity, user, collection, chunk_id=""):
|
||||||
|
|
||||||
dim = len(embeds)
|
dim = len(embeds)
|
||||||
|
|
||||||
|
|
@ -130,6 +136,7 @@ class EntityVectors:
|
||||||
{
|
{
|
||||||
"vector": embeds,
|
"vector": embeds,
|
||||||
"entity": entity,
|
"entity": entity,
|
||||||
|
"chunk_id": chunk_id,
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -18,7 +18,10 @@ from .... schema import PromptRequest, PromptResponse
|
||||||
from .... rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL, SUBJECT_OF
|
from .... rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL, SUBJECT_OF
|
||||||
|
|
||||||
from .... base import FlowProcessor, ConsumerSpec, ProducerSpec
|
from .... base import FlowProcessor, ConsumerSpec, ProducerSpec
|
||||||
from .... base import PromptClientSpec
|
from .... base import PromptClientSpec, ParameterSpec
|
||||||
|
|
||||||
|
from .... provenance import statement_uri, triple_provenance_triples
|
||||||
|
from .... flow_version import __version__ as COMPONENT_VERSION
|
||||||
|
|
||||||
DEFINITION_VALUE = Term(type=IRI, iri=DEFINITION)
|
DEFINITION_VALUE = Term(type=IRI, iri=DEFINITION)
|
||||||
RDF_LABEL_VALUE = Term(type=IRI, iri=RDF_LABEL)
|
RDF_LABEL_VALUE = Term(type=IRI, iri=RDF_LABEL)
|
||||||
|
|
@ -75,6 +78,10 @@ class Processor(FlowProcessor):
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Optional flow parameters for provenance
|
||||||
|
self.register_specification(ParameterSpec("llm-model"))
|
||||||
|
self.register_specification(ParameterSpec("ontology"))
|
||||||
|
|
||||||
def to_uri(self, text):
|
def to_uri(self, text):
|
||||||
|
|
||||||
part = text.replace(" ", "-").lower().encode("utf-8")
|
part = text.replace(" ", "-").lower().encode("utf-8")
|
||||||
|
|
@ -132,6 +139,10 @@ class Processor(FlowProcessor):
|
||||||
chunk_doc_id = v.document_id if v.document_id else v.metadata.id
|
chunk_doc_id = v.document_id if v.document_id else v.metadata.id
|
||||||
chunk_uri = v.metadata.id # The URI form for the chunk
|
chunk_uri = v.metadata.id # The URI form for the chunk
|
||||||
|
|
||||||
|
# Get optional provenance parameters
|
||||||
|
llm_model = flow("llm-model")
|
||||||
|
ontology_uri = flow("ontology")
|
||||||
|
|
||||||
# Note: Document metadata is now emitted once by librarian at processing
|
# Note: Document metadata is now emitted once by librarian at processing
|
||||||
# initiation, so we don't need to duplicate it here.
|
# initiation, so we don't need to duplicate it here.
|
||||||
|
|
||||||
|
|
@ -157,9 +168,24 @@ class Processor(FlowProcessor):
|
||||||
o=Term(type=LITERAL, value=s),
|
o=Term(type=LITERAL, value=s),
|
||||||
))
|
))
|
||||||
|
|
||||||
triples.append(Triple(
|
# The definition triple - this is the main extracted fact
|
||||||
|
definition_triple = Triple(
|
||||||
s=s_value, p=DEFINITION_VALUE, o=o_value
|
s=s_value, p=DEFINITION_VALUE, o=o_value
|
||||||
))
|
)
|
||||||
|
triples.append(definition_triple)
|
||||||
|
|
||||||
|
# Generate provenance for the definition triple (reification)
|
||||||
|
stmt_uri = statement_uri()
|
||||||
|
prov_triples = triple_provenance_triples(
|
||||||
|
stmt_uri=stmt_uri,
|
||||||
|
extracted_triple=definition_triple,
|
||||||
|
chunk_uri=chunk_uri,
|
||||||
|
component_name=default_ident,
|
||||||
|
component_version=COMPONENT_VERSION,
|
||||||
|
llm_model=llm_model,
|
||||||
|
ontology_uri=ontology_uri,
|
||||||
|
)
|
||||||
|
triples.extend(prov_triples)
|
||||||
|
|
||||||
# Link entity to chunk (not top-level document)
|
# Link entity to chunk (not top-level document)
|
||||||
triples.append(Triple(
|
triples.append(Triple(
|
||||||
|
|
|
||||||
|
|
@ -18,7 +18,10 @@ from .... schema import PromptRequest, PromptResponse
|
||||||
from .... rdf import RDF_LABEL, TRUSTGRAPH_ENTITIES, SUBJECT_OF
|
from .... rdf import RDF_LABEL, TRUSTGRAPH_ENTITIES, SUBJECT_OF
|
||||||
|
|
||||||
from .... base import FlowProcessor, ConsumerSpec, ProducerSpec
|
from .... base import FlowProcessor, ConsumerSpec, ProducerSpec
|
||||||
from .... base import PromptClientSpec
|
from .... base import PromptClientSpec, ParameterSpec
|
||||||
|
|
||||||
|
from .... provenance import statement_uri, triple_provenance_triples
|
||||||
|
from .... flow_version import __version__ as COMPONENT_VERSION
|
||||||
|
|
||||||
RDF_LABEL_VALUE = Term(type=IRI, iri=RDF_LABEL)
|
RDF_LABEL_VALUE = Term(type=IRI, iri=RDF_LABEL)
|
||||||
SUBJECT_OF_VALUE = Term(type=IRI, iri=SUBJECT_OF)
|
SUBJECT_OF_VALUE = Term(type=IRI, iri=SUBJECT_OF)
|
||||||
|
|
@ -65,6 +68,10 @@ class Processor(FlowProcessor):
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Optional flow parameters for provenance
|
||||||
|
self.register_specification(ParameterSpec("llm-model"))
|
||||||
|
self.register_specification(ParameterSpec("ontology"))
|
||||||
|
|
||||||
def to_uri(self, text):
|
def to_uri(self, text):
|
||||||
|
|
||||||
part = text.replace(" ", "-").lower().encode("utf-8")
|
part = text.replace(" ", "-").lower().encode("utf-8")
|
||||||
|
|
@ -113,6 +120,10 @@ class Processor(FlowProcessor):
|
||||||
chunk_doc_id = v.document_id if v.document_id else v.metadata.id
|
chunk_doc_id = v.document_id if v.document_id else v.metadata.id
|
||||||
chunk_uri = v.metadata.id # The URI form for the chunk
|
chunk_uri = v.metadata.id # The URI form for the chunk
|
||||||
|
|
||||||
|
# Get optional provenance parameters
|
||||||
|
llm_model = flow("llm-model")
|
||||||
|
ontology_uri = flow("ontology")
|
||||||
|
|
||||||
# Note: Document metadata is now emitted once by librarian at processing
|
# Note: Document metadata is now emitted once by librarian at processing
|
||||||
# initiation, so we don't need to duplicate it here.
|
# initiation, so we don't need to duplicate it here.
|
||||||
|
|
||||||
|
|
@ -142,11 +153,26 @@ class Processor(FlowProcessor):
|
||||||
else:
|
else:
|
||||||
o_value = Term(type=LITERAL, value=str(o))
|
o_value = Term(type=LITERAL, value=str(o))
|
||||||
|
|
||||||
triples.append(Triple(
|
# The relationship triple - this is the main extracted fact
|
||||||
|
relationship_triple = Triple(
|
||||||
s=s_value,
|
s=s_value,
|
||||||
p=p_value,
|
p=p_value,
|
||||||
o=o_value
|
o=o_value
|
||||||
))
|
)
|
||||||
|
triples.append(relationship_triple)
|
||||||
|
|
||||||
|
# Generate provenance for the relationship triple (reification)
|
||||||
|
stmt_uri = statement_uri()
|
||||||
|
prov_triples = triple_provenance_triples(
|
||||||
|
stmt_uri=stmt_uri,
|
||||||
|
extracted_triple=relationship_triple,
|
||||||
|
chunk_uri=chunk_uri,
|
||||||
|
component_name=default_ident,
|
||||||
|
component_version=COMPONENT_VERSION,
|
||||||
|
llm_model=llm_model,
|
||||||
|
ontology_uri=ontology_uri,
|
||||||
|
)
|
||||||
|
triples.extend(prov_triples)
|
||||||
|
|
||||||
# Label for s
|
# Label for s
|
||||||
triples.append(Triple(
|
triples.append(Triple(
|
||||||
|
|
|
||||||
|
|
@ -6,11 +6,13 @@ null. Output is a list of quads.
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
|
import json
|
||||||
|
|
||||||
from .... direct.cassandra_kg import (
|
from .... direct.cassandra_kg import (
|
||||||
EntityCentricKnowledgeGraph, GRAPH_WILDCARD, DEFAULT_GRAPH
|
EntityCentricKnowledgeGraph, GRAPH_WILDCARD, DEFAULT_GRAPH
|
||||||
)
|
)
|
||||||
from .... schema import TriplesQueryRequest, TriplesQueryResponse, Error
|
from .... schema import TriplesQueryRequest, TriplesQueryResponse, Error
|
||||||
from .... schema import Term, Triple, IRI, LITERAL
|
from .... schema import Term, Triple, IRI, LITERAL, TRIPLE
|
||||||
from .... base import TriplesQueryService
|
from .... base import TriplesQueryService
|
||||||
from .... base.cassandra_config import add_cassandra_args, resolve_cassandra_config
|
from .... base.cassandra_config import add_cassandra_args, resolve_cassandra_config
|
||||||
|
|
||||||
|
|
@ -33,6 +35,36 @@ def get_term_value(term):
|
||||||
return term.id or term.value
|
return term.id or term.value
|
||||||
|
|
||||||
|
|
||||||
|
def deserialize_term(term_dict):
    """Deserialize a term from its JSON (dict) structure.

    Args:
        term_dict: Mapping with a "type" key and, depending on the type,
            "iri" (IRI), "value"/"datatype"/"language" (LITERAL) or
            "triple" (TRIPLE). ``None`` is passed through.

    Returns:
        A ``Term`` of the matching type, or ``None`` when ``term_dict`` is
        ``None``. Anything that cannot be interpreted is returned as a
        LITERAL term holding ``str(term_dict)``.
    """
    if term_dict is None:
        return None

    term_type = term_dict.get("type", "")

    if term_type == IRI:
        return Term(type=IRI, iri=term_dict.get("iri", ""))

    if term_type == LITERAL:
        return Term(
            type=LITERAL,
            value=term_dict.get("value", ""),
            datatype=term_dict.get("datatype", ""),
            language=term_dict.get("language", ""),
        )

    if term_type == TRIPLE:
        nested = term_dict.get("triple")

        # A nested quoted triple may have been stored as an embedded JSON
        # string rather than as an object; decode it so that the recursion
        # below does not fail with AttributeError on ``str.get``.
        if isinstance(nested, str):
            try:
                nested = json.loads(nested)
            except json.JSONDecodeError:
                nested = None

        # Recursive for nested triples
        if isinstance(nested, dict) and nested:
            return Term(
                type=TRIPLE,
                triple=Triple(
                    s=deserialize_term(nested.get("s")),
                    p=deserialize_term(nested.get("p")),
                    o=deserialize_term(nested.get("o")),
                ),
            )

    # Fallback: unknown type, or a TRIPLE term without a usable payload
    return Term(type=LITERAL, value=str(term_dict))
|
||||||
|
|
||||||
|
|
||||||
def create_term(value, otype=None, dtype=None, lang=None):
|
def create_term(value, otype=None, dtype=None, lang=None):
|
||||||
"""
|
"""
|
||||||
Create a Term from a string value, optionally using type metadata.
|
Create a Term from a string value, optionally using type metadata.
|
||||||
|
|
@ -57,8 +89,22 @@ def create_term(value, otype=None, dtype=None, lang=None):
|
||||||
language=lang or ""
|
language=lang or ""
|
||||||
)
|
)
|
||||||
elif otype == 't':
|
elif otype == 't':
|
||||||
# Triple/reification - treat as IRI for now
|
# Triple/reification - parse JSON and create nested Triple
|
||||||
return Term(type=IRI, iri=value)
|
try:
|
||||||
|
triple_data = json.loads(value) if isinstance(value, str) else value
|
||||||
|
if isinstance(triple_data, dict):
|
||||||
|
return Term(
|
||||||
|
type=TRIPLE,
|
||||||
|
triple=Triple(
|
||||||
|
s=deserialize_term(triple_data.get("s")),
|
||||||
|
p=deserialize_term(triple_data.get("p")),
|
||||||
|
o=deserialize_term(triple_data.get("o")),
|
||||||
|
)
|
||||||
|
)
|
||||||
|
except (json.JSONDecodeError, TypeError) as e:
|
||||||
|
logger.warning(f"Failed to parse triple JSON: {e}")
|
||||||
|
# Fallback if parsing fails
|
||||||
|
return Term(type=LITERAL, value=str(value))
|
||||||
else:
|
else:
|
||||||
# Unknown otype, fall back to heuristic
|
# Unknown otype, fall back to heuristic
|
||||||
pass
|
pass
|
||||||
|
|
|
||||||
|
|
@ -57,7 +57,8 @@ class Processor(CollectionConfigHandler, GraphEmbeddingsStoreService):
|
||||||
self.vecstore.insert(
|
self.vecstore.insert(
|
||||||
vec, entity_value,
|
vec, entity_value,
|
||||||
message.metadata.user,
|
message.metadata.user,
|
||||||
message.metadata.collection
|
message.metadata.collection,
|
||||||
|
chunk_id=entity.chunk_id or "",
|
||||||
)
|
)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
|
|
||||||
|
|
@ -137,11 +137,15 @@ class Processor(CollectionConfigHandler, GraphEmbeddingsStoreService):
|
||||||
# Generate unique ID for each vector
|
# Generate unique ID for each vector
|
||||||
vector_id = str(uuid.uuid4())
|
vector_id = str(uuid.uuid4())
|
||||||
|
|
||||||
|
metadata = {"entity": entity_value}
|
||||||
|
if entity.chunk_id:
|
||||||
|
metadata["chunk_id"] = entity.chunk_id
|
||||||
|
|
||||||
records = [
|
records = [
|
||||||
{
|
{
|
||||||
"id": vector_id,
|
"id": vector_id,
|
||||||
"values": vec,
|
"values": vec,
|
||||||
"metadata": { "entity": entity_value },
|
"metadata": metadata,
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -90,15 +90,19 @@ class Processor(CollectionConfigHandler, GraphEmbeddingsStoreService):
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
payload = {
|
||||||
|
"entity": entity_value,
|
||||||
|
}
|
||||||
|
if entity.chunk_id:
|
||||||
|
payload["chunk_id"] = entity.chunk_id
|
||||||
|
|
||||||
self.qdrant.upsert(
|
self.qdrant.upsert(
|
||||||
collection_name=collection,
|
collection_name=collection,
|
||||||
points=[
|
points=[
|
||||||
PointStruct(
|
PointStruct(
|
||||||
id=str(uuid.uuid4()),
|
id=str(uuid.uuid4()),
|
||||||
vector=vec,
|
vector=vec,
|
||||||
payload={
|
payload=payload,
|
||||||
"entity": entity_value,
|
|
||||||
}
|
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -9,6 +9,7 @@ import os
|
||||||
import argparse
|
import argparse
|
||||||
import time
|
import time
|
||||||
import logging
|
import logging
|
||||||
|
import json
|
||||||
|
|
||||||
from .... direct.cassandra_kg import (
|
from .... direct.cassandra_kg import (
|
||||||
EntityCentricKnowledgeGraph, DEFAULT_GRAPH
|
EntityCentricKnowledgeGraph, DEFAULT_GRAPH
|
||||||
|
|
@ -25,6 +26,37 @@ logger = logging.getLogger(__name__)
|
||||||
default_ident = "triples-write"
|
default_ident = "triples-write"
|
||||||
|
|
||||||
|
|
||||||
|
def serialize_triple(triple):
    """Serialize a Triple object to a JSON string for storage.

    The result is a JSON object with "s", "p" and "o" keys, each holding a
    term object (or null). A term of type TRIPLE carries its quoted triple
    under the "triple" key as a nested object of the same shape, so the
    whole structure is encoded exactly once.

    Args:
        triple: Object exposing ``s``, ``p`` and ``o`` terms, or ``None``.

    Returns:
        The JSON string, or ``None`` when ``triple`` is ``None``.
    """
    if triple is None:
        return None

    def term_to_dict(term):
        # Convert a single term into a JSON-compatible dict.
        if term is None:
            return None

        result = {"type": term.type}
        if term.type == IRI:
            result["iri"] = term.iri
        elif term.type == LITERAL:
            result["value"] = term.value
            # Optional literal attributes are only emitted when set
            if term.datatype:
                result["datatype"] = term.datatype
            if term.language:
                result["language"] = term.language
        elif term.type == BLANK:
            result["id"] = term.id
        elif term.type == TRIPLE:
            # Embed the nested triple as an object, not as a pre-encoded
            # JSON string, to avoid double encoding.
            result["triple"] = triple_to_dict(term.triple)
        return result

    def triple_to_dict(inner):
        # Convert a triple into a JSON-compatible dict of its three terms.
        if inner is None:
            return None
        return {
            "s": term_to_dict(inner.s),
            "p": term_to_dict(inner.p),
            "o": term_to_dict(inner.o),
        }

    return json.dumps(triple_to_dict(triple))
|
||||||
|
|
||||||
|
|
||||||
def get_term_value(term):
|
def get_term_value(term):
|
||||||
"""Extract the string value from a Term"""
|
"""Extract the string value from a Term"""
|
||||||
if term is None:
|
if term is None:
|
||||||
|
|
@ -33,6 +65,9 @@ def get_term_value(term):
|
||||||
return term.iri
|
return term.iri
|
||||||
elif term.type == LITERAL:
|
elif term.type == LITERAL:
|
||||||
return term.value
|
return term.value
|
||||||
|
elif term.type == TRIPLE:
|
||||||
|
# Serialize nested triple as JSON
|
||||||
|
return serialize_triple(term.triple)
|
||||||
else:
|
else:
|
||||||
# For blank nodes or other types, use id or value
|
# For blank nodes or other types, use id or value
|
||||||
return term.id or term.value
|
return term.id or term.value
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue