trustgraph/trustgraph-flow/trustgraph/extract/kg/ontology/extract.py
Cyber MacGeddon 56d700f301 Expose LLM token usage (in_token, out_token, model) across all
service layers

Propagate token counts from LLM services through the prompt,
text-completion, graph-RAG, document-RAG, and agent orchestrator
pipelines to the API gateway and Python SDK. All fields are Optional
— None means "not available", distinguishing from a real zero count.

Key changes:

- Schema: Add in_token/out_token/model to TextCompletionResponse,
  PromptResponse, GraphRagResponse, DocumentRagResponse,
  AgentResponse

- TextCompletionClient: New TextCompletionResult return type. Split
  into text_completion() (non-streaming) and
  text_completion_stream() (streaming with per-chunk handler
  callback)

- PromptClient: New PromptResult with response_type
  (text/json/jsonl), typed fields (text/object/objects), and token
  usage. All callers updated.

- RAG services: Accumulate token usage across all prompt calls
  (extract-concepts, edge-scoring, edge-reasoning,
  synthesis). Non-streaming path sends single combined response
  instead of chunk + end_of_session.

- Agent orchestrator: UsageTracker accumulates tokens across
  meta-router, pattern prompt calls, and react reasoning. Attached
  to end_of_dialog.

- Translators: Encode token fields only when they are not None (an
  explicit "is not None" check rather than truthiness, so a real zero
  count is still encoded; see the sketch after this list)

- Python SDK: RAG and text-completion methods return
  TextCompletionResult (non-streaming) or RAGChunk/AgentAnswer with
  token fields (streaming)

- CLI: --show-usage flag on tg-invoke-llm, tg-invoke-prompt,
  tg-invoke-graph-rag, tg-invoke-document-rag, tg-invoke-agent
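
A minimal sketch of the translator rule (illustrative names only; "resp"
and "obj" are hypothetical stand-ins, the real code lives in the gateway
translator modules):

    # Hypothetical names, for illustration of the "is not None" rule:
    if resp.in_token is not None:
        obj["in_token"] = resp.in_token
    if resp.out_token is not None:
        obj["out_token"] = resp.out_token
    if resp.model is not None:
        obj["model"] = resp.model

A truthiness check ("if resp.in_token:") would drop a genuine zero count,
which is exactly the case the Optional fields are meant to preserve.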
2026-04-13 14:34:02 +01:00


"""
OntoRAG: Ontology-based knowledge extraction service.
Extracts ontology-conformant triples from text chunks.
"""
import json
import logging
import asyncio
from typing import List, Dict, Any, Optional
from .... schema import Chunk, Triple, Triples, Metadata, Term, IRI, LITERAL
from .... schema import EntityContext, EntityContexts
from .... schema import PromptRequest, PromptResponse
from .... rdf import TRUSTGRAPH_ENTITIES, RDF_TYPE, RDF_LABEL, DEFINITION
from .... base import FlowProcessor, ConsumerSpec, ProducerSpec
from .... base import PromptClientSpec, EmbeddingsClientSpec
from .ontology_loader import OntologyLoader
from .ontology_embedder import OntologyEmbedder
from .vector_store import InMemoryVectorStore
from .text_processor import TextProcessor
from .ontology_selector import OntologySelector, OntologySubset
from .simplified_parser import parse_extraction_response
from .triple_converter import TripleConverter
from .... provenance import subgraph_uri, subgraph_provenance_triples, set_graph, GRAPH_SOURCE
from .... flow_version import __version__ as COMPONENT_VERSION
logger = logging.getLogger(__name__)
default_ident = "kg-extract-ontology"
default_concurrency = 1
default_triples_batch_size = 50
default_entity_batch_size = 5
# URI prefix mappings for common namespaces
URI_PREFIXES = {
    "rdf:": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    "rdfs:": "http://www.w3.org/2000/01/rdf-schema#",
    "owl:": "http://www.w3.org/2002/07/owl#",
    "skos:": "http://www.w3.org/2004/02/skos/core#",
    "schema:": "https://schema.org/",
    "xsd:": "http://www.w3.org/2001/XMLSchema#",
}
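
# For example, expand_uri() below uses this table to map "skos:prefLabel"
# to "http://www.w3.org/2004/02/skos/core#prefLabel".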

def make_term(v, is_uri):
    """Helper to create Term from value and is_uri flag."""
    if is_uri:
        return Term(type=IRI, iri=v)
    else:
        return Term(type=LITERAL, value=v)
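
# e.g. make_term("https://schema.org/Recipe", is_uri=True) yields an IRI
# term, while make_term("Cornish pasty", is_uri=False) yields a literal.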

class Processor(FlowProcessor):
    """Main OntoRAG extraction processor."""

    def __init__(self, **params):

        id = params.get("id", default_ident)
        concurrency = params.get("concurrency", default_concurrency)
        self.triples_batch_size = params.get("triples_batch_size", default_triples_batch_size)
        self.entity_batch_size = params.get("entity_batch_size", default_entity_batch_size)

        super(Processor, self).__init__(
            **params | {
                "id": id,
                "concurrency": concurrency,
            }
        )

        # Register specifications
        self.register_specification(
            ConsumerSpec(
                name="input",
                schema=Chunk,
                handler=self.on_message,
                concurrency=concurrency,
            )
        )

        self.register_specification(
            PromptClientSpec(
                request_name="prompt-request",
                response_name="prompt-response",
            )
        )

        self.register_specification(
            EmbeddingsClientSpec(
                request_name="embeddings-request",
                response_name="embeddings-response"
            )
        )

        self.register_specification(
            ProducerSpec(
                name="triples",
                schema=Triples
            )
        )

        self.register_specification(
            ProducerSpec(
                name="entity-contexts",
                schema=EntityContexts
            )
        )

        # Register config handler for ontology updates
        self.register_config_handler(self.on_ontology_config, types=["ontology"])

        # Shared components (not flow-specific)
        self.ontology_loader = OntologyLoader()
        self.text_processor = TextProcessor()

        # Per-flow components (each flow gets its own embedder/vector store/selector)
        self.flow_components = {}  # flow_id -> {embedder, vector_store, selector}

        # Configuration
        self.top_k = params.get("top_k", 10)
        self.similarity_threshold = params.get("similarity_threshold", 0.3)

        # Track loaded ontology version
        self.current_ontology_version = None
        self.loaded_ontology_ids = set()

    async def initialize_flow_components(self, flow):
        """Initialize per-flow OntoRAG components.

        Each flow gets its own vector store and embedder to support
        different embedding models across flows. The vector store dimension
        is auto-detected from the embeddings service.

        Args:
            flow: Flow object for this processing context

        Returns:
            flow_id: Identifier for this flow's components
        """

        # Use flow object as identifier
        flow_id = id(flow)

        if flow_id in self.flow_components:
            return flow_id  # Already initialized for this flow

        try:

            logger.info(f"Initializing components for flow {flow_id}")

            # Use embeddings client directly (no wrapper needed)
            embeddings_client = flow("embeddings-request")

            # Detect embedding dimension by embedding a test string
            logger.info("Detecting embedding dimension from embeddings service...")
            test_embedding_response = await embeddings_client.embed(["test"])
            test_embedding = test_embedding_response[0]  # Extract first vector
            dimension = len(test_embedding)
            logger.info(f"Detected embedding dimension: {dimension}")

            # Initialize vector store with detected dimension
            vector_store = InMemoryVectorStore(
                dimension=dimension,
                index_type='flat'
            )

            ontology_embedder = OntologyEmbedder(
                embedding_service=embeddings_client,
                vector_store=vector_store
            )

            # Embed all loaded ontologies for this flow
            if self.ontology_loader.get_all_ontologies():
                logger.info(f"Embedding ontologies for flow {flow_id}")
                for ont_id, ontology in self.ontology_loader.get_all_ontologies().items():
                    await ontology_embedder.embed_ontology(ontology)
                logger.info(f"Embedded {ontology_embedder.get_embedded_count()} ontology elements for flow {flow_id}")

            # Initialize ontology selector
            ontology_selector = OntologySelector(
                ontology_embedder=ontology_embedder,
                ontology_loader=self.ontology_loader,
                top_k=self.top_k,
                similarity_threshold=self.similarity_threshold
            )

            # Store flow-specific components
            self.flow_components[flow_id] = {
                'embedder': ontology_embedder,
                'vector_store': vector_store,
                'selector': ontology_selector,
                'dimension': dimension
            }

            logger.info(f"Flow {flow_id} components initialized successfully (dimension={dimension})")

            return flow_id

        except Exception as e:
            logger.error(f"Failed to initialize flow {flow_id} components: {e}", exc_info=True)
            raise

    async def on_ontology_config(self, config, version):
        """
        Handle ontology configuration updates from ConfigPush queue.

        Parses and stores ontologies. Embedding happens per-flow on first message.

        Called automatically when:
        - Processor starts (gets full config history via start_of_messages=True)
        - Config service pushes updates (immediate event-driven notification)

        Args:
            config: Full configuration map - config[type][key] = value
            version: Config version number (monotonically increasing)
        """

        try:

            logger.info(f"Received ontology config update, version={version}")

            # Skip if we've already processed this version
            if version == self.current_ontology_version:
                logger.debug(f"Already at version {version}, skipping")
                return

            # Extract ontology configurations
            if "ontology" not in config:
                logger.warning("No 'ontology' section in config")
                return

            ontology_configs = config["ontology"]

            # Parse ontology definitions
            ontologies = {}
            for ont_id, ont_json in ontology_configs.items():
                try:
                    ontologies[ont_id] = json.loads(ont_json)
                except json.JSONDecodeError as e:
                    logger.error(f"Failed to parse ontology '{ont_id}': {e}")
                    continue

            logger.info(f"Loaded {len(ontologies)} ontology definitions")

            # Determine what changed (for incremental updates)
            new_ids = set(ontologies.keys())
            added_ids = new_ids - self.loaded_ontology_ids
            removed_ids = self.loaded_ontology_ids - new_ids
            updated_ids = new_ids & self.loaded_ontology_ids  # May have changed content

            if added_ids:
                logger.info(f"New ontologies: {added_ids}")
            if removed_ids:
                logger.info(f"Removed ontologies: {removed_ids}")
            if updated_ids:
                logger.info(f"Updated ontologies: {updated_ids}")

            # Update ontology loader's internal state
            self.ontology_loader.update_ontologies(ontologies)

            # Clear all flow components to force re-embedding with new ontologies
            if added_ids or removed_ids or updated_ids:
                logger.info("Clearing flow components to trigger re-embedding")
                self.flow_components.clear()

            # Update tracking
            self.current_ontology_version = version
            self.loaded_ontology_ids = new_ids

            logger.info(f"Ontology config update complete, version={version}")

        except Exception as e:
            logger.error(f"Failed to process ontology config: {e}", exc_info=True)

    async def on_message(self, msg, consumer, flow):
        """Process incoming chunk message."""

        v = msg.value()
        logger.info(f"Extracting ontology-based triples from {v.metadata.id}...")

        # Initialize flow-specific components if needed
        flow_id = await self.initialize_flow_components(flow)
        components = self.flow_components[flow_id]

        chunk = v.chunk.decode("utf-8")
        logger.debug(f"Processing chunk: {chunk[:200]}...")

        try:

            # Process text into segments
            segments = self.text_processor.process_chunk(chunk, extract_phrases=True)
            logger.debug(f"Split chunk into {len(segments)} segments")

            # Select relevant ontology subset (using flow-specific selector)
            ontology_subsets = await components['selector'].select_ontology_subset(segments)

            if not ontology_subsets:
                logger.warning("No relevant ontology elements found for chunk")
                return

            # Merge subsets if multiple ontologies matched
            if len(ontology_subsets) > 1:
                ontology_subset = components['selector'].merge_subsets(ontology_subsets)
            else:
                ontology_subset = ontology_subsets[0]

            logger.debug(f"Selected ontology subset with {len(ontology_subset.classes)} classes, "
                         f"{len(ontology_subset.object_properties)} object properties, "
                         f"{len(ontology_subset.datatype_properties)} datatype properties")

            # Build extraction prompt variables
            prompt_variables = self.build_extraction_variables(chunk, ontology_subset)

            # Extract using simplified entity-relationship-attribute format
            triples = await self.extract_with_simplified_format(
                flow, chunk, ontology_subset, prompt_variables
            )

            # Generate subgraph provenance for extracted triples
            if triples:
                chunk_uri = v.metadata.id
                sg_uri = subgraph_uri()
                prov_triples = subgraph_provenance_triples(
                    subgraph_uri=sg_uri,
                    extracted_triples=triples,
                    chunk_uri=chunk_uri,
                    component_name=default_ident,
                    component_version=COMPONENT_VERSION,
                )

            # Generate ontology definition triples
            ontology_triples = self.build_ontology_triples(ontology_subset)

            # Combine extracted triples with ontology triples and provenance
            all_triples = triples + ontology_triples
            if triples:
                all_triples.extend(set_graph(prov_triples, GRAPH_SOURCE))

            # Build entity contexts from all triples (including ontology elements)
            entity_contexts = self.build_entity_contexts(all_triples)

            # Emit triples in batches
            for i in range(0, len(all_triples), self.triples_batch_size):
                batch = all_triples[i:i + self.triples_batch_size]
                await self.emit_triples(
                    flow("triples"),
                    v.metadata,
                    batch
                )

            # Emit entity contexts in batches
            for i in range(0, len(entity_contexts), self.entity_batch_size):
                batch = entity_contexts[i:i + self.entity_batch_size]
                await self.emit_entity_contexts(
                    flow("entity-contexts"),
                    v.metadata,
                    batch
                )

            logger.info(f"Extracted {len(triples)} content triples + {len(ontology_triples)} ontology triples "
                        f"= {len(all_triples)} total triples and {len(entity_contexts)} entity contexts")

        except Exception as e:
            logger.error(f"OntoRAG extraction exception: {e}", exc_info=True)

    async def extract_with_simplified_format(
            self,
            flow,
            chunk: str,
            ontology_subset: OntologySubset,
            prompt_variables: Dict[str, Any]
    ) -> List[Triple]:
        """Extract triples using simplified entity-relationship-attribute format.

        Args:
            flow: Flow object for accessing services
            chunk: Text chunk to extract from
            ontology_subset: Selected ontology subset
            prompt_variables: Variables for prompt template

        Returns:
            List of Triple objects
        """

        try:

            # Call prompt service with simplified format prompt
            result = await flow("prompt-request").prompt(
                id="extract-with-ontologies",
                variables=prompt_variables
            )
            extraction_response = result.object

            logger.debug(f"Simplified extraction response: {extraction_response}")

            # Parse response into structured format
            extraction_result = parse_extraction_response(extraction_response)

            if not extraction_result:
                logger.warning("Failed to parse extraction response")
                return []

            logger.info(f"Parsed {len(extraction_result.entities)} entities, "
                        f"{len(extraction_result.relationships)} relationships, "
                        f"{len(extraction_result.attributes)} attributes")

            # Convert to RDF triples
            converter = TripleConverter(ontology_subset, ontology_subset.ontology_id)
            triples = converter.convert_all(extraction_result)

            logger.info(f"Generated {len(triples)} RDF triples from simplified extraction")

            return triples

        except Exception as e:
            logger.error(f"Simplified extraction error: {e}", exc_info=True)
            return []
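
    # Illustrative shape of the parsed result consumed above (the exact
    # response format is fixed by the "extract-with-ontologies" prompt
    # template and parse_extraction_response): it exposes three lists,
    # extraction_result.entities, extraction_result.relationships and
    # extraction_result.attributes, which TripleConverter.convert_all()
    # turns into Triple objects.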

    def build_extraction_variables(self, chunk: str, ontology_subset: OntologySubset) -> Dict[str, Any]:
        """Build variables for ontology-based extraction prompt template.

        Args:
            chunk: Text chunk to extract from
            ontology_subset: Relevant ontology elements

        Returns:
            Dict with template variables: text, classes, object_properties, datatype_properties
        """

        return {
            "text": chunk,
            "classes": ontology_subset.classes,
            "object_properties": ontology_subset.object_properties,
            "datatype_properties": ontology_subset.datatype_properties
        }

    def parse_and_validate_triples(self, triples_response: List[Any],
                                   ontology_subset: OntologySubset) -> List[Triple]:
        """Parse and validate extracted triples against ontology."""

        validated_triples = []
        ontology_id = ontology_subset.ontology_id

        for triple_data in triples_response:
            try:
                if isinstance(triple_data, dict):
                    subject = triple_data.get('subject', '')
                    predicate = triple_data.get('predicate', '')
                    object_val = triple_data.get('object', '')

                    if not subject or not predicate or not object_val:
                        continue

                    # Validate against ontology
                    if self.is_valid_triple(subject, predicate, object_val, ontology_subset):

                        # Expand URIs before creating Value objects
                        subject_uri = self.expand_uri(subject, ontology_subset, ontology_id)
                        predicate_uri = self.expand_uri(predicate, ontology_subset, ontology_id)

                        # Object might be URI or literal - check before expanding
                        if self.is_uri(object_val) or self.should_expand_as_uri(object_val, ontology_subset):
                            object_uri = self.expand_uri(object_val, ontology_subset, ontology_id)
                            is_object_uri = True
                        else:
                            object_uri = object_val
                            is_object_uri = False

                        # Create Triple object with expanded URIs
                        s_value = make_term(subject_uri, is_uri=True)
                        p_value = make_term(predicate_uri, is_uri=True)
                        o_value = make_term(object_uri, is_uri=is_object_uri)

                        validated_triples.append(Triple(
                            s=s_value,
                            p=p_value,
                            o=o_value
                        ))
                    else:
                        logger.debug(f"Invalid triple: ({subject}, {predicate}, {object_val})")

            except Exception as e:
                logger.error(f"Error parsing triple: {e}")

        return validated_triples

    def should_expand_as_uri(self, value: str, ontology_subset: OntologySubset) -> bool:
        """Check if a value should be treated as URI (not literal).

        Returns True if value is a class name, property name, or entity reference.
        """

        # Check if it's a class or property from ontology
        if value in ontology_subset.classes:
            return True
        if value in ontology_subset.object_properties:
            return True
        if value in ontology_subset.datatype_properties:
            return True

        # Check if it starts with a known prefix
        for prefix in URI_PREFIXES.keys():
            if value.startswith(prefix):
                return True

        # Check if it looks like an entity reference (e.g., "recipe:cornish-pasty")
        if ":" in value and not value.startswith("http"):
            return True

        return False
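
    # e.g. "Recipe" (a class in the subset) -> True,
    # "recipe:cornish-pasty" -> True (entity reference),
    # "300 grams of flour" -> False (kept as a literal).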

    def is_valid_triple(self, subject: str, predicate: str, object_val: str,
                        ontology_subset: OntologySubset) -> bool:
        """Validate triple against ontology constraints."""

        # Special case for rdf:type
        if predicate == "rdf:type" or predicate == str(RDF_TYPE):
            # Check if object is a valid class
            return object_val in ontology_subset.classes

        # Special case for rdfs:label
        if predicate == "rdfs:label" or predicate == str(RDF_LABEL):
            return True  # Labels are always valid

        # Check if predicate is a valid property
        is_obj_prop = predicate in ontology_subset.object_properties
        is_dt_prop = predicate in ontology_subset.datatype_properties

        if not is_obj_prop and not is_dt_prop:
            return False  # Unknown property

        # TODO: Add more sophisticated validation (domain/range checking)
        return True
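
    # e.g. ("pasty-1", "rdf:type", "Recipe") is valid only if "Recipe" is one
    # of the subset's classes; any other predicate must be a known object or
    # datatype property, or the triple is rejected.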

    def expand_uri(self, value: str, ontology_subset: OntologySubset, ontology_id: str = "unknown") -> str:
        """Expand prefix notation or short names to full URIs.

        Args:
            value: Value to expand (e.g., "rdf:type", "Recipe", "has_ingredient")
            ontology_subset: Ontology subset for class/property lookup
            ontology_id: ID of the ontology for constructing instance URIs

        Returns:
            Full URI string
        """

        # Already a full URI
        if value.startswith("http://") or value.startswith("https://"):
            return value

        # Check standard prefixes (rdf:, rdfs:, etc.)
        for prefix, namespace in URI_PREFIXES.items():
            if value.startswith(prefix):
                return namespace + value[len(prefix):]

        # Check if it's an ontology class
        if value in ontology_subset.classes:
            class_def = ontology_subset.classes[value]
            # class_def is a dict (from cls.__dict__ in ontology_selector)
            if isinstance(class_def, dict) and 'uri' in class_def and class_def['uri']:
                return class_def['uri']
            # Fallback: construct URI
            return f"https://trustgraph.ai/ontology/{ontology_id}#{value}"

        # Check if it's an ontology property
        if value in ontology_subset.object_properties:
            prop_def = ontology_subset.object_properties[value]
            # prop_def is a dict (from prop.__dict__ in ontology_selector)
            if isinstance(prop_def, dict) and 'uri' in prop_def and prop_def['uri']:
                return prop_def['uri']
            return f"https://trustgraph.ai/ontology/{ontology_id}#{value}"

        if value in ontology_subset.datatype_properties:
            prop_def = ontology_subset.datatype_properties[value]
            # prop_def is a dict (from prop.__dict__ in ontology_selector)
            if isinstance(prop_def, dict) and 'uri' in prop_def and prop_def['uri']:
                return prop_def['uri']
            return f"https://trustgraph.ai/ontology/{ontology_id}#{value}"

        # Otherwise, treat as entity instance - construct unique URI
        # Normalize the value for URI (lowercase, replace spaces with hyphens)
        normalized = value.replace(" ", "-").lower()
        return f"https://trustgraph.ai/{ontology_id}/{normalized}"

    def is_uri(self, value: str) -> bool:
        """Check if value is already a full URI."""
        return value.startswith("http://") or value.startswith("https://")

    async def emit_triples(self, pub, metadata: Metadata, triples: List[Triple]):
        """Emit triples to output."""

        t = Triples(
            metadata=Metadata(
                id=metadata.id,
                root=metadata.root,
                user=metadata.user,
                collection=metadata.collection,
            ),
            triples=triples,
        )

        await pub.send(t)

    async def emit_entity_contexts(self, pub, metadata: Metadata, entities: List[EntityContext]):
        """Emit entity contexts to output."""

        ec = EntityContexts(
            metadata=Metadata(
                id=metadata.id,
                root=metadata.root,
                user=metadata.user,
                collection=metadata.collection,
            ),
            entities=entities,
        )

        await pub.send(ec)

    def build_ontology_triples(self, ontology_subset: OntologySubset) -> List[Triple]:
        """Build triples describing the ontology elements themselves.

        Generates triples for classes and properties so they exist in the knowledge graph.

        Args:
            ontology_subset: The ontology subset used for extraction

        Returns:
            List of Triple objects describing ontology elements
        """

        ontology_triples = []

        # Generate triples for classes
        for class_id, class_def in ontology_subset.classes.items():

            # Get URI for class
            if isinstance(class_def, dict) and 'uri' in class_def and class_def['uri']:
                class_uri = class_def['uri']
            else:
                # Fallback to constructed URI
                class_uri = f"https://trustgraph.ai/ontology/{ontology_subset.ontology_id}#{class_id}"

            # rdf:type owl:Class
            ontology_triples.append(Triple(
                s=make_term(class_uri, is_uri=True),
                p=make_term("http://www.w3.org/1999/02/22-rdf-syntax-ns#type", is_uri=True),
                o=make_term("http://www.w3.org/2002/07/owl#Class", is_uri=True)
            ))

            # rdfs:label (stored as 'labels' in OntologyClass.__dict__)
            if isinstance(class_def, dict) and 'labels' in class_def:
                labels = class_def['labels']
                if isinstance(labels, list) and labels:
                    label_val = labels[0].get('value', class_id) if isinstance(labels[0], dict) else str(labels[0])
                    ontology_triples.append(Triple(
                        s=make_term(class_uri, is_uri=True),
                        p=make_term(RDF_LABEL, is_uri=True),
                        o=make_term(label_val, is_uri=False)
                    ))

            # rdfs:comment (stored as 'comment' in OntologyClass.__dict__)
            if isinstance(class_def, dict) and 'comment' in class_def and class_def['comment']:
                comment = class_def['comment']
                ontology_triples.append(Triple(
                    s=make_term(class_uri, is_uri=True),
                    p=make_term("http://www.w3.org/2000/01/rdf-schema#comment", is_uri=True),
                    o=make_term(comment, is_uri=False)
                ))

            # rdfs:subClassOf (stored as 'subclass_of' in OntologyClass.__dict__)
            if isinstance(class_def, dict) and 'subclass_of' in class_def and class_def['subclass_of']:
                parent = class_def['subclass_of']
                # Get parent URI
                if parent in ontology_subset.classes:
                    parent_class_def = ontology_subset.classes[parent]
                    if isinstance(parent_class_def, dict) and 'uri' in parent_class_def and parent_class_def['uri']:
                        parent_uri = parent_class_def['uri']
                    else:
                        parent_uri = f"https://trustgraph.ai/ontology/{ontology_subset.ontology_id}#{parent}"
                else:
                    parent_uri = f"https://trustgraph.ai/ontology/{ontology_subset.ontology_id}#{parent}"
                ontology_triples.append(Triple(
                    s=make_term(class_uri, is_uri=True),
                    p=make_term("http://www.w3.org/2000/01/rdf-schema#subClassOf", is_uri=True),
                    o=make_term(parent_uri, is_uri=True)
                ))

        # Generate triples for object properties
        for prop_id, prop_def in ontology_subset.object_properties.items():

            # Get URI for property
            if isinstance(prop_def, dict) and 'uri' in prop_def and prop_def['uri']:
                prop_uri = prop_def['uri']
            else:
                prop_uri = f"https://trustgraph.ai/ontology/{ontology_subset.ontology_id}#{prop_id}"

            # rdf:type owl:ObjectProperty
            ontology_triples.append(Triple(
                s=make_term(prop_uri, is_uri=True),
                p=make_term("http://www.w3.org/1999/02/22-rdf-syntax-ns#type", is_uri=True),
                o=make_term("http://www.w3.org/2002/07/owl#ObjectProperty", is_uri=True)
            ))

            # rdfs:label (stored as 'labels' in OntologyProperty.__dict__)
            if isinstance(prop_def, dict) and 'labels' in prop_def:
                labels = prop_def['labels']
                if isinstance(labels, list) and labels:
                    label_val = labels[0].get('value', prop_id) if isinstance(labels[0], dict) else str(labels[0])
                    ontology_triples.append(Triple(
                        s=make_term(prop_uri, is_uri=True),
                        p=make_term(RDF_LABEL, is_uri=True),
                        o=make_term(label_val, is_uri=False)
                    ))

            # rdfs:comment (stored as 'comment' in OntologyProperty.__dict__)
            if isinstance(prop_def, dict) and 'comment' in prop_def and prop_def['comment']:
                comment = prop_def['comment']
                ontology_triples.append(Triple(
                    s=make_term(prop_uri, is_uri=True),
                    p=make_term("http://www.w3.org/2000/01/rdf-schema#comment", is_uri=True),
                    o=make_term(comment, is_uri=False)
                ))

            # rdfs:domain (stored as 'domain' in OntologyProperty.__dict__)
            if isinstance(prop_def, dict) and 'domain' in prop_def and prop_def['domain']:
                domain = prop_def['domain']
                # Get domain class URI
                if domain in ontology_subset.classes:
                    domain_class_def = ontology_subset.classes[domain]
                    if isinstance(domain_class_def, dict) and 'uri' in domain_class_def and domain_class_def['uri']:
                        domain_uri = domain_class_def['uri']
                    else:
                        domain_uri = f"https://trustgraph.ai/ontology/{ontology_subset.ontology_id}#{domain}"
                else:
                    domain_uri = f"https://trustgraph.ai/ontology/{ontology_subset.ontology_id}#{domain}"
                ontology_triples.append(Triple(
                    s=make_term(prop_uri, is_uri=True),
                    p=make_term("http://www.w3.org/2000/01/rdf-schema#domain", is_uri=True),
                    o=make_term(domain_uri, is_uri=True)
                ))

            # rdfs:range (stored as 'range' in OntologyProperty.__dict__)
            if isinstance(prop_def, dict) and 'range' in prop_def and prop_def['range']:
                range_val = prop_def['range']
                # Get range class URI
                if range_val in ontology_subset.classes:
                    range_class_def = ontology_subset.classes[range_val]
                    if isinstance(range_class_def, dict) and 'uri' in range_class_def and range_class_def['uri']:
                        range_uri = range_class_def['uri']
                    else:
                        range_uri = f"https://trustgraph.ai/ontology/{ontology_subset.ontology_id}#{range_val}"
                else:
                    range_uri = f"https://trustgraph.ai/ontology/{ontology_subset.ontology_id}#{range_val}"
                ontology_triples.append(Triple(
                    s=make_term(prop_uri, is_uri=True),
                    p=make_term("http://www.w3.org/2000/01/rdf-schema#range", is_uri=True),
                    o=make_term(range_uri, is_uri=True)
                ))

        # Generate triples for datatype properties
        for prop_id, prop_def in ontology_subset.datatype_properties.items():

            # Get URI for property
            if isinstance(prop_def, dict) and 'uri' in prop_def and prop_def['uri']:
                prop_uri = prop_def['uri']
            else:
                prop_uri = f"https://trustgraph.ai/ontology/{ontology_subset.ontology_id}#{prop_id}"

            # rdf:type owl:DatatypeProperty
            ontology_triples.append(Triple(
                s=make_term(prop_uri, is_uri=True),
                p=make_term("http://www.w3.org/1999/02/22-rdf-syntax-ns#type", is_uri=True),
                o=make_term("http://www.w3.org/2002/07/owl#DatatypeProperty", is_uri=True)
            ))

            # rdfs:label (stored as 'labels' in OntologyProperty.__dict__)
            if isinstance(prop_def, dict) and 'labels' in prop_def:
                labels = prop_def['labels']
                if isinstance(labels, list) and labels:
                    label_val = labels[0].get('value', prop_id) if isinstance(labels[0], dict) else str(labels[0])
                    ontology_triples.append(Triple(
                        s=make_term(prop_uri, is_uri=True),
                        p=make_term(RDF_LABEL, is_uri=True),
                        o=make_term(label_val, is_uri=False)
                    ))

            # rdfs:comment (stored as 'comment' in OntologyProperty.__dict__)
            if isinstance(prop_def, dict) and 'comment' in prop_def and prop_def['comment']:
                comment = prop_def['comment']
                ontology_triples.append(Triple(
                    s=make_term(prop_uri, is_uri=True),
                    p=make_term("http://www.w3.org/2000/01/rdf-schema#comment", is_uri=True),
                    o=make_term(comment, is_uri=False)
                ))

            # rdfs:domain (stored as 'domain' in OntologyProperty.__dict__)
            if isinstance(prop_def, dict) and 'domain' in prop_def and prop_def['domain']:
                domain = prop_def['domain']
                # Get domain class URI
                if domain in ontology_subset.classes:
                    domain_class_def = ontology_subset.classes[domain]
                    if isinstance(domain_class_def, dict) and 'uri' in domain_class_def and domain_class_def['uri']:
                        domain_uri = domain_class_def['uri']
                    else:
                        domain_uri = f"https://trustgraph.ai/ontology/{ontology_subset.ontology_id}#{domain}"
                else:
                    domain_uri = f"https://trustgraph.ai/ontology/{ontology_subset.ontology_id}#{domain}"
                ontology_triples.append(Triple(
                    s=make_term(prop_uri, is_uri=True),
                    p=make_term("http://www.w3.org/2000/01/rdf-schema#domain", is_uri=True),
                    o=make_term(domain_uri, is_uri=True)
                ))

            # rdfs:range (datatype)
            if isinstance(prop_def, dict) and 'rdfs:range' in prop_def and prop_def['rdfs:range']:
                range_val = prop_def['rdfs:range']
                # Range for datatype properties is usually xsd:string, xsd:int, etc.
                if range_val.startswith('xsd:'):
                    range_uri = f"http://www.w3.org/2001/XMLSchema#{range_val[4:]}"
                else:
                    range_uri = range_val
                ontology_triples.append(Triple(
                    s=make_term(prop_uri, is_uri=True),
                    p=make_term("http://www.w3.org/2000/01/rdf-schema#range", is_uri=True),
                    o=make_term(range_uri, is_uri=True)
                ))

        logger.info(f"Generated {len(ontology_triples)} triples describing ontology elements")

        return ontology_triples
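
    # e.g. a class "Recipe" contributes (<Recipe> rdf:type owl:Class) plus
    # rdfs:label, rdfs:comment and rdfs:subClassOf triples for whichever of
    # those fields its definition carries.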

    def build_entity_contexts(self, triples: List[Triple]) -> List[EntityContext]:
        """Build entity contexts from extracted triples.

        Collects rdfs:label and definition properties for each entity to create
        contextual descriptions for embedding.

        Args:
            triples: List of extracted triples

        Returns:
            List of EntityContext objects
        """

        # Group triples by subject to collect entity information
        entity_data = {}  # subject_uri -> {labels: [], definitions: []}

        for triple in triples:
            subject_uri = triple.s.iri if triple.s.type == IRI else triple.s.value
            predicate_uri = triple.p.iri if triple.p.type == IRI else triple.p.value
            object_val = triple.o.value if triple.o.type == LITERAL else triple.o.iri

            # Initialize entity data if not exists
            if subject_uri not in entity_data:
                entity_data[subject_uri] = {'labels': [], 'definitions': []}

            # Collect labels (rdfs:label)
            if predicate_uri == RDF_LABEL:
                if triple.o.type == LITERAL:  # Labels are literals
                    entity_data[subject_uri]['labels'].append(object_val)

            # Collect definitions (skos:definition, schema:description)
            elif predicate_uri == DEFINITION or predicate_uri == "https://schema.org/description":
                if triple.o.type == LITERAL:
                    entity_data[subject_uri]['definitions'].append(object_val)

        # Build EntityContext objects
        entity_contexts = []

        for subject_uri, data in entity_data.items():

            # Build context text from labels and definitions
            context_parts = []

            if data['labels']:
                context_parts.append(f"Label: {data['labels'][0]}")

            if data['definitions']:
                context_parts.extend(data['definitions'])

            # Only create EntityContext if we have meaningful context
            if context_parts:
                context_text = ". ".join(context_parts)
                entity_contexts.append(EntityContext(
                    entity=make_term(subject_uri, is_uri=True),
                    context=context_text
                ))

        logger.debug(f"Built {len(entity_contexts)} entity contexts from {len(triples)} triples")

        return entity_contexts
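
    # e.g. an entity with rdfs:label "Cornish pasty" and a definition
    # "A baked pastry filled with beef and vegetables" yields the context
    # "Label: Cornish pasty. A baked pastry filled with beef and vegetables".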

    @staticmethod
    def add_args(parser):
        """Add command-line arguments."""

        parser.add_argument(
            '-c', '--concurrency',
            type=int,
            default=default_concurrency,
            help=f'Concurrent processing threads (default: {default_concurrency})'
        )

        parser.add_argument(
            '--top-k',
            type=int,
            default=10,
            help='Number of top ontology elements to retrieve (default: 10)'
        )

        parser.add_argument(
            '--similarity-threshold',
            type=float,
            default=0.3,
            help='Similarity threshold for ontology matching (default: 0.3, range: 0.0-1.0)'
        )

        parser.add_argument(
            '--triples-batch-size',
            type=int,
            default=default_triples_batch_size,
            help=f'Maximum triples per output message (default: {default_triples_batch_size})'
        )

        parser.add_argument(
            '--entity-batch-size',
            type=int,
            default=default_entity_batch_size,
            help=f'Maximum entity contexts per output message (default: {default_entity_batch_size})'
        )

        FlowProcessor.add_args(parser)


def run():
    """Launch the OntoRAG extraction service."""
    Processor.launch(default_ident, __doc__)