mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-05-01 11:26:22 +02:00
service layers Propagate token counts from LLM services through the prompt, text-completion, graph-RAG, document-RAG, and agent orchestrator pipelines to the API gateway and Python SDK. All fields are Optional — None means "not available", distinguishing from a real zero count. Key changes: - Schema: Add in_token/out_token/model to TextCompletionResponse, PromptResponse, GraphRagResponse, DocumentRagResponse, AgentResponse - TextCompletionClient: New TextCompletionResult return type. Split into text_completion() (non-streaming) and text_completion_stream() (streaming with per-chunk handler callback) - PromptClient: New PromptResult with response_type (text/json/jsonl), typed fields (text/object/objects), and token usage. All callers updated. - RAG services: Accumulate token usage across all prompt calls (extract-concepts, edge-scoring, edge-reasoning, synthesis). Non-streaming path sends single combined response instead of chunk + end_of_session. - Agent orchestrator: UsageTracker accumulates tokens across meta-router, pattern prompt calls, and react reasoning. Attached to end_of_dialog. - Translators: Encode token fields when not None (is not None, not truthy) - Python SDK: RAG and text-completion methods return TextCompletionResult (non-streaming) or RAGChunk/AgentAnswer with token fields (streaming) - CLI: --show-usage flag on tg-invoke-llm, tg-invoke-prompt, tg-invoke-graph-rag, tg-invoke-document-rag, tg-invoke-agent
904 lines
No EOL
38 KiB
Python
904 lines
No EOL
38 KiB
Python
"""
|
|
OntoRAG: Ontology-based knowledge extraction service.
|
|
Extracts ontology-conformant triples from text chunks.
|
|
"""
|
|
|
|
import json
|
|
import logging
|
|
import asyncio
|
|
from typing import List, Dict, Any, Optional
|
|
|
|
from .... schema import Chunk, Triple, Triples, Metadata, Term, IRI, LITERAL
|
|
from .... schema import EntityContext, EntityContexts
|
|
from .... schema import PromptRequest, PromptResponse
|
|
from .... rdf import TRUSTGRAPH_ENTITIES, RDF_TYPE, RDF_LABEL, DEFINITION
|
|
from .... base import FlowProcessor, ConsumerSpec, ProducerSpec
|
|
from .... base import PromptClientSpec, EmbeddingsClientSpec
|
|
|
|
from .ontology_loader import OntologyLoader
|
|
from .ontology_embedder import OntologyEmbedder
|
|
from .vector_store import InMemoryVectorStore
|
|
from .text_processor import TextProcessor
|
|
from .ontology_selector import OntologySelector, OntologySubset
|
|
from .simplified_parser import parse_extraction_response
|
|
from .triple_converter import TripleConverter
|
|
|
|
from .... provenance import subgraph_uri, subgraph_provenance_triples, set_graph, GRAPH_SOURCE
|
|
from .... flow_version import __version__ as COMPONENT_VERSION
|
|
|
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Service defaults, used when the corresponding parameter is not supplied.
default_ident = "kg-extract-ontology"
default_concurrency = 1
default_triples_batch_size = 50  # max triples per emitted Triples message
default_entity_batch_size = 5  # max entity contexts per emitted message

# Compact prefix -> namespace IRI, used to expand prefixed names such as
# "rdf:type" into full URIs.
URI_PREFIXES = {
    "rdf:": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    "rdfs:": "http://www.w3.org/2000/01/rdf-schema#",
    "owl:": "http://www.w3.org/2002/07/owl#",
    "skos:": "http://www.w3.org/2004/02/skos/core#",
    "schema:": "https://schema.org/",
    "xsd:": "http://www.w3.org/2001/XMLSchema#",
}
|
|
|
|
|
|
def make_term(v, is_uri):
    """Wrap a raw value in a schema ``Term``.

    Args:
        v: The IRI string or literal value to wrap.
        is_uri: When truthy, ``v`` is stored as an IRI term; otherwise it is
            stored as a literal term.

    Returns:
        ``Term(type=IRI, iri=v)`` or ``Term(type=LITERAL, value=v)``.
    """
    return Term(type=IRI, iri=v) if is_uri else Term(type=LITERAL, value=v)
|
|
|
|
|
|
class Processor(FlowProcessor):
    """Main OntoRAG extraction processor.

    Consumes ``Chunk`` messages from the "input" queue, selects the part of
    the loaded ontologies relevant to each chunk, asks the prompt service to
    extract data against it, and publishes the results on the "triples" and
    "entity-contexts" outputs.
    """
|
|
|
|
def __init__(self, **params):
    """Configure the processor, declare its flow endpoints and shared state.

    Args:
        **params: Processor parameters. The keys read here are ``id``,
            ``concurrency``, ``triples_batch_size``, ``entity_batch_size``,
            ``top_k`` and ``similarity_threshold``; the full mapping (with
            ``id`` and ``concurrency`` defaulted) is forwarded to the
            ``FlowProcessor`` constructor.
    """
    ident = params.get("id", default_ident)
    workers = params.get("concurrency", default_concurrency)

    self.triples_batch_size = params.get(
        "triples_batch_size", default_triples_batch_size
    )
    self.entity_batch_size = params.get(
        "entity_batch_size", default_entity_batch_size
    )

    base_params = params | {"id": ident, "concurrency": workers}
    super(Processor, self).__init__(**base_params)

    # Flow endpoints, registered in this order: chunk input, prompt and
    # embeddings clients, then the two outputs.
    specifications = (
        ConsumerSpec(
            name="input",
            schema=Chunk,
            handler=self.on_message,
            concurrency=workers,
        ),
        PromptClientSpec(
            request_name="prompt-request",
            response_name="prompt-response",
        ),
        EmbeddingsClientSpec(
            request_name="embeddings-request",
            response_name="embeddings-response",
        ),
        ProducerSpec(name="triples", schema=Triples),
        ProducerSpec(name="entity-contexts", schema=EntityContexts),
    )
    for specification in specifications:
        self.register_specification(specification)

    # Ontology definitions arrive through config pushes of type "ontology"
    self.register_config_handler(self.on_ontology_config, types=["ontology"])

    # Components shared by every flow
    self.ontology_loader = OntologyLoader()
    self.text_processor = TextProcessor()

    # Per-flow components: flow_id -> {embedder, vector_store, selector}
    self.flow_components = {}

    # Ontology selection tuning
    self.top_k = params.get("top_k", 10)
    self.similarity_threshold = params.get("similarity_threshold", 0.3)

    # Bookkeeping for the most recently applied ontology config
    self.current_ontology_version = None
    self.loaded_ontology_ids = set()
|
|
|
|
async def initialize_flow_components(self, flow):
    """Create (once) the embedder, vector store and selector for a flow.

    Components are cached in ``self.flow_components`` under ``id(flow)``.
    The vector store dimension is taken from the length of the vector the
    embeddings client returns for a probe string, and every ontology
    currently held by the loader is embedded into that store.

    Args:
        flow: Flow object; called with "embeddings-request" to obtain the
            embeddings client.

    Returns:
        The cache key (``id(flow)``) for this flow's components.

    Raises:
        Exception: Any failure during setup is logged and re-raised.
    """
    flow_id = id(flow)

    # Nothing to do when this flow was set up earlier
    if flow_id in self.flow_components:
        return flow_id

    try:
        logger.info(f"Initializing components for flow {flow_id}")

        embeddings_client = flow("embeddings-request")

        # Probe the service with a throwaway string to learn the vector size
        logger.info("Detecting embedding dimension from embeddings service...")
        probe_vectors = await embeddings_client.embed(["test"])
        dimension = len(probe_vectors[0])
        logger.info(f"Detected embedding dimension: {dimension}")

        vector_store = InMemoryVectorStore(dimension=dimension, index_type='flat')
        ontology_embedder = OntologyEmbedder(
            embedding_service=embeddings_client,
            vector_store=vector_store,
        )

        # Embed whatever ontologies are loaded right now
        if self.ontology_loader.get_all_ontologies():
            logger.info(f"Embedding ontologies for flow {flow_id}")
            for _ont_id, ontology in self.ontology_loader.get_all_ontologies().items():
                await ontology_embedder.embed_ontology(ontology)
            logger.info(f"Embedded {ontology_embedder.get_embedded_count()} ontology elements for flow {flow_id}")

        ontology_selector = OntologySelector(
            ontology_embedder=ontology_embedder,
            ontology_loader=self.ontology_loader,
            top_k=self.top_k,
            similarity_threshold=self.similarity_threshold,
        )

        self.flow_components[flow_id] = {
            'embedder': ontology_embedder,
            'vector_store': vector_store,
            'selector': ontology_selector,
            'dimension': dimension,
        }

        logger.info(f"Flow {flow_id} components initialized successfully (dimension={dimension})")
        return flow_id

    except Exception as e:
        logger.error(f"Failed to initialize flow {flow_id} components: {e}", exc_info=True)
        raise
|
|
|
|
async def on_ontology_config(self, config, version):
    """Apply an ontology configuration push.

    Parses every entry of ``config["ontology"]`` as JSON, hands the parsed
    set to the ontology loader and, if anything was added, removed or
    possibly updated, drops all cached per-flow components so they are
    rebuilt (and re-embedded) on the next message.

    Args:
        config: Full configuration map - config[type][key] = value
        version: Config version; a push whose version equals the one
            already applied is ignored.

    Errors are logged and not propagated.
    """
    try:
        logger.info(f"Received ontology config update, version={version}")

        if version == self.current_ontology_version:
            logger.debug(f"Already at version {version}, skipping")
            return

        if "ontology" not in config:
            logger.warning("No 'ontology' section in config")
            return

        # Entries that fail to parse are logged and left out
        parsed = {}
        for ont_id, ont_json in config["ontology"].items():
            try:
                parsed[ont_id] = json.loads(ont_json)
            except json.JSONDecodeError as e:
                logger.error(f"Failed to parse ontology '{ont_id}': {e}")

        logger.info(f"Loaded {len(parsed)} ontology definitions")

        # Diff against the previously loaded ids; an id present in both
        # sets may have changed content, so it counts as updated.
        incoming_ids = set(parsed.keys())
        changes = (
            ("New", incoming_ids - self.loaded_ontology_ids),
            ("Removed", self.loaded_ontology_ids - incoming_ids),
            ("Updated", incoming_ids & self.loaded_ontology_ids),
        )
        for label, ids in changes:
            if ids:
                logger.info(f"{label} ontologies: {ids}")

        self.ontology_loader.update_ontologies(parsed)

        if any(ids for _label, ids in changes):
            logger.info("Clearing flow components to trigger re-embedding")
            self.flow_components.clear()

        self.current_ontology_version = version
        self.loaded_ontology_ids = incoming_ids

        logger.info(f"Ontology config update complete, version={version}")

    except Exception as e:
        logger.error(f"Failed to process ontology config: {e}", exc_info=True)
|
|
|
|
async def on_message(self, msg, consumer, flow):
    """Handle one incoming chunk: extract, enrich and publish.

    Steps: ensure the flow's components exist, segment the chunk text,
    select (and if needed merge) the matching ontology subset, run the
    prompt-based extraction, add provenance and ontology-definition
    triples, derive entity contexts, then publish both in batches.

    Args:
        msg: Message whose ``value()`` carries ``metadata`` and a UTF-8
            encoded ``chunk``.
        consumer: Unused here.
        flow: Flow object used to reach clients and outputs.

    Failures after component initialisation are logged, not raised.
    """
    v = msg.value()
    logger.info(f"Extracting ontology-based triples from {v.metadata.id}...")

    # Component setup happens outside the try block, so its errors propagate
    flow_id = await self.initialize_flow_components(flow)
    components = self.flow_components[flow_id]

    chunk = v.chunk.decode("utf-8")
    logger.debug(f"Processing chunk: {chunk[:200]}...")

    def batches(items, size):
        """Yield consecutive slices of ``items`` holding at most ``size`` elements."""
        return (items[start:start + size] for start in range(0, len(items), size))

    try:
        segments = self.text_processor.process_chunk(chunk, extract_phrases=True)
        logger.debug(f"Split chunk into {len(segments)} segments")

        selector = components['selector']
        ontology_subsets = await selector.select_ontology_subset(segments)

        if not ontology_subsets:
            logger.warning("No relevant ontology elements found for chunk")
            return

        # Several ontologies may match; collapse them into a single subset
        if len(ontology_subsets) > 1:
            ontology_subset = selector.merge_subsets(ontology_subsets)
        else:
            ontology_subset = ontology_subsets[0]

        logger.debug(f"Selected ontology subset with {len(ontology_subset.classes)} classes, "
                     f"{len(ontology_subset.object_properties)} object properties, "
                     f"{len(ontology_subset.datatype_properties)} datatype properties")

        prompt_variables = self.build_extraction_variables(chunk, ontology_subset)

        triples = await self.extract_with_simplified_format(
            flow, chunk, ontology_subset, prompt_variables
        )

        # Provenance is only produced when something was extracted
        prov_triples = None
        if triples:
            prov_triples = subgraph_provenance_triples(
                subgraph_uri=subgraph_uri(),
                extracted_triples=triples,
                chunk_uri=v.metadata.id,
                component_name=default_ident,
                component_version=COMPONENT_VERSION,
            )

        ontology_triples = self.build_ontology_triples(ontology_subset)

        # Order: extracted content, ontology definitions, provenance
        all_triples = triples + ontology_triples
        if triples:
            all_triples.extend(set_graph(prov_triples, GRAPH_SOURCE))

        entity_contexts = self.build_entity_contexts(all_triples)

        for batch in batches(all_triples, self.triples_batch_size):
            await self.emit_triples(flow("triples"), v.metadata, batch)

        for batch in batches(entity_contexts, self.entity_batch_size):
            await self.emit_entity_contexts(flow("entity-contexts"), v.metadata, batch)

        logger.info(f"Extracted {len(triples)} content triples + {len(ontology_triples)} ontology triples "
                    f"= {len(all_triples)} total triples and {len(entity_contexts)} entity contexts")

    except Exception as e:
        logger.error(f"OntoRAG extraction exception: {e}", exc_info=True)
|
|
|
|
async def extract_with_simplified_format(
        self,
        flow,
        chunk: str,
        ontology_subset: OntologySubset,
        prompt_variables: Dict[str, Any]
) -> List[Triple]:
    """Run the "extract-with-ontologies" prompt and convert its output to triples.

    Args:
        flow: Flow object; called with "prompt-request" to get the prompt client
        chunk: Text chunk being processed (not read directly by this method)
        ontology_subset: Ontology subset handed to the triple converter
        prompt_variables: Variables passed to the prompt template

    Returns:
        The converted triples, or an empty list when the response cannot
        be parsed or any error occurs (errors are logged, not raised).
    """
    try:
        prompt_client = flow("prompt-request")
        result = await prompt_client.prompt(
            id="extract-with-ontologies",
            variables=prompt_variables,
        )
        extraction_response = result.object
        logger.debug(f"Simplified extraction response: {extraction_response}")

        extraction_result = parse_extraction_response(extraction_response)

        if extraction_result:
            logger.info(f"Parsed {len(extraction_result.entities)} entities, "
                        f"{len(extraction_result.relationships)} relationships, "
                        f"{len(extraction_result.attributes)} attributes")

            converter = TripleConverter(ontology_subset, ontology_subset.ontology_id)
            triples = converter.convert_all(extraction_result)

            logger.info(f"Generated {len(triples)} RDF triples from simplified extraction")
            return triples

        logger.warning("Failed to parse extraction response")
        return []

    except Exception as e:
        logger.error(f"Simplified extraction error: {e}", exc_info=True)
        return []
|
|
|
|
def build_extraction_variables(self, chunk: str, ontology_subset: OntologySubset) -> Dict[str, Any]:
    """Assemble the template variables for the ontology extraction prompt.

    Args:
        chunk: Text chunk to extract from
        ontology_subset: Ontology elements selected for this chunk

    Returns:
        Dict with keys ``text``, ``classes``, ``object_properties`` and
        ``datatype_properties`` (the last three taken from the subset).
    """
    variables: Dict[str, Any] = {"text": chunk}
    for field_name in ("classes", "object_properties", "datatype_properties"):
        variables[field_name] = getattr(ontology_subset, field_name)
    return variables
|
|
|
|
def parse_and_validate_triples(self, triples_response: List[Any],
                               ontology_subset: OntologySubset) -> List[Triple]:
    """Turn raw subject/predicate/object dicts into validated ``Triple`` objects.

    Entries that are not dicts, lack any of the three fields, or fail
    ``is_valid_triple`` are skipped. Subject and predicate are always
    expanded to URIs; the object is expanded only when it is already a
    URI or ``should_expand_as_uri`` says so, otherwise it stays a literal.

    Args:
        triples_response: Items expected to be dicts with ``subject``,
            ``predicate`` and ``object`` keys
        ontology_subset: Ontology subset to validate and expand against

    Returns:
        List of validated triples; per-item errors are logged and skipped.
    """
    ontology_id = ontology_subset.ontology_id
    validated_triples = []

    for triple_data in triples_response:
        try:
            if not isinstance(triple_data, dict):
                continue

            subject = triple_data.get('subject', '')
            predicate = triple_data.get('predicate', '')
            object_val = triple_data.get('object', '')

            # All three parts are required
            if not (subject and predicate and object_val):
                continue

            if not self.is_valid_triple(subject, predicate, object_val, ontology_subset):
                logger.debug(f"Invalid triple: ({subject}, {predicate}, {object_val})")
                continue

            subject_uri = self.expand_uri(subject, ontology_subset, ontology_id)
            predicate_uri = self.expand_uri(predicate, ontology_subset, ontology_id)

            # The object may be a URI or a plain literal - decide before expanding
            is_object_uri = bool(
                self.is_uri(object_val)
                or self.should_expand_as_uri(object_val, ontology_subset)
            )
            if is_object_uri:
                object_uri = self.expand_uri(object_val, ontology_subset, ontology_id)
            else:
                object_uri = object_val

            validated_triples.append(Triple(
                s=make_term(subject_uri, is_uri=True),
                p=make_term(predicate_uri, is_uri=True),
                o=make_term(object_uri, is_uri=is_object_uri),
            ))

        except Exception as e:
            logger.error(f"Error parsing triple: {e}")

    return validated_triples
|
|
|
|
def should_expand_as_uri(self, value: str, ontology_subset: OntologySubset) -> bool:
    """Decide whether a value denotes a resource rather than a literal.

    Returns True when the value names a class or property of the ontology
    subset, starts with one of the ``URI_PREFIXES`` keys, or contains a
    colon without starting with "http" (e.g. "recipe:cornish-pasty").
    """
    ontology_sections = ("classes", "object_properties", "datatype_properties")
    if any(value in getattr(ontology_subset, section) for section in ontology_sections):
        return True

    # Known namespace prefix such as "rdf:" or "schema:"
    if any(value.startswith(prefix) for prefix in URI_PREFIXES):
        return True

    # Anything else shaped like "scheme:local" that is not an http(s) URL
    return ":" in value and not value.startswith("http")
|
|
|
|
def is_valid_triple(self, subject: str, predicate: str, object_val: str,
                    ontology_subset: OntologySubset) -> bool:
    """Check a candidate triple against the ontology subset.

    Rules, in order: an rdf:type triple is valid only if its object is a
    class of the subset; an rdfs:label triple is always valid; any other
    predicate must be an object or datatype property of the subset.
    The subject is not inspected.
    """
    if predicate == "rdf:type" or predicate == str(RDF_TYPE):
        return object_val in ontology_subset.classes

    if predicate == "rdfs:label" or predicate == str(RDF_LABEL):
        return True

    # TODO: Add more sophisticated validation (domain/range checking)
    return (
        predicate in ontology_subset.object_properties
        or predicate in ontology_subset.datatype_properties
    )
|
|
|
|
def expand_uri(self, value: str, ontology_subset: OntologySubset, ontology_id: str = "unknown") -> str:
    """Resolve a prefixed or short name to a full URI.

    Resolution order: values already starting with http:// or https://
    are returned unchanged; known prefixes from ``URI_PREFIXES`` are
    replaced by their namespace; names of ontology classes, object
    properties or datatype properties resolve to the definition's ``uri``
    (or a constructed ontology URI when none is set); anything else is
    treated as an entity instance.

    Args:
        value: Value to expand (e.g., "rdf:type", "Recipe", "has_ingredient")
        ontology_subset: Ontology subset for class/property lookup
        ontology_id: ID of the ontology used when constructing URIs

    Returns:
        Full URI string
    """
    if value.startswith(("http://", "https://")):
        return value

    for prefix, namespace in URI_PREFIXES.items():
        if value.startswith(prefix):
            return namespace + value[len(prefix):]

    # Classes first, then object properties, then datatype properties
    for section in ("classes", "object_properties", "datatype_properties"):
        definitions = getattr(ontology_subset, section)
        if value not in definitions:
            continue
        definition = definitions[value]
        # Definitions are expected to be dicts that may carry an explicit uri
        if isinstance(definition, dict) and definition.get('uri'):
            return definition['uri']
        return f"https://trustgraph.ai/ontology/{ontology_id}#{value}"

    # Entity instance: lowercase, spaces replaced with hyphens
    normalized = value.replace(" ", "-").lower()
    return f"https://trustgraph.ai/{ontology_id}/{normalized}"
|
|
|
|
def is_uri(self, value: str) -> bool:
    """Return True when ``value`` starts with "http://" or "https://"."""
    return value.startswith(("http://", "https://"))
|
|
|
|
async def emit_triples(self, pub, metadata: Metadata, triples: List[Triple]):
    """Send one ``Triples`` message through ``pub``.

    A fresh ``Metadata`` is built from the id, root, user and collection
    of the given metadata.
    """
    outgoing_metadata = Metadata(
        id=metadata.id,
        root=metadata.root,
        user=metadata.user,
        collection=metadata.collection,
    )
    await pub.send(Triples(metadata=outgoing_metadata, triples=triples))
|
|
|
|
async def emit_entity_contexts(self, pub, metadata: Metadata, entities: List[EntityContext]):
    """Send one ``EntityContexts`` message through ``pub``.

    A fresh ``Metadata`` is built from the id, root, user and collection
    of the given metadata.
    """
    outgoing_metadata = Metadata(
        id=metadata.id,
        root=metadata.root,
        user=metadata.user,
        collection=metadata.collection,
    )
    await pub.send(EntityContexts(metadata=outgoing_metadata, entities=entities))
|
|
|
|
def build_ontology_triples(self, ontology_subset: OntologySubset) -> List[Triple]:
    """Build triples describing the ontology elements themselves.

    For every class, object property and datatype property in the subset
    this emits, in order: rdf:type (owl:Class / owl:ObjectProperty /
    owl:DatatypeProperty), rdfs:label (first label only), rdfs:comment,
    then rdfs:subClassOf for classes or rdfs:domain and rdfs:range for
    properties. Element URIs come from the definition's ``uri`` field,
    falling back to ``https://trustgraph.ai/ontology/<ontology_id>#<id>``.

    The datatype-property range is read from the ``range`` field, like
    every other field here; the legacy ``rdfs:range`` key is still
    honoured as a fallback. Previously only ``rdfs:range`` was consulted,
    so definitions that store the range under ``range`` never produced a
    range triple.

    Args:
        ontology_subset: The ontology subset used for extraction

    Returns:
        List of Triple objects describing ontology elements
    """
    rdf_type = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"
    rdfs_ns = "http://www.w3.org/2000/01/rdf-schema#"
    owl_ns = "http://www.w3.org/2002/07/owl#"
    xsd_ns = "http://www.w3.org/2001/XMLSchema#"
    fallback_ns = f"https://trustgraph.ai/ontology/{ontology_subset.ontology_id}#"

    ontology_triples = []

    def field(definition, key):
        """Return definition[key] when definition is a dict holding it, else None."""
        return definition.get(key) if isinstance(definition, dict) else None

    def element_uri(element_id, definition):
        """Return the definition's explicit uri, or the constructed fallback."""
        return field(definition, 'uri') or f"{fallback_ns}{element_id}"

    def class_uri(class_id):
        """Resolve a referenced class id (parent/domain/range) to its URI."""
        return element_uri(class_id, ontology_subset.classes.get(class_id))

    def add(subject, predicate, obj, obj_is_uri):
        """Append one triple with IRI subject/predicate and the given object."""
        ontology_triples.append(Triple(
            s=make_term(subject, is_uri=True),
            p=make_term(predicate, is_uri=True),
            o=make_term(obj, is_uri=obj_is_uri)
        ))

    def describe(element_id, definition, uri, owl_type):
        """Emit the rdf:type, rdfs:label and rdfs:comment triples of an element."""
        add(uri, rdf_type, owl_ns + owl_type, True)

        labels = field(definition, 'labels')
        if isinstance(labels, list) and labels:
            first = labels[0]
            label_val = first.get('value', element_id) if isinstance(first, dict) else str(first)
            add(uri, RDF_LABEL, label_val, False)

        comment = field(definition, 'comment')
        if comment:
            add(uri, rdfs_ns + "comment", comment, False)

    def add_domain(uri, definition):
        """Emit rdfs:domain when the property definition names a domain class."""
        domain = field(definition, 'domain')
        if domain:
            add(uri, rdfs_ns + "domain", class_uri(domain), True)

    # Classes
    for class_id, class_def in ontology_subset.classes.items():
        uri = element_uri(class_id, class_def)
        describe(class_id, class_def, uri, "Class")

        parent = field(class_def, 'subclass_of')
        if parent:
            add(uri, rdfs_ns + "subClassOf", class_uri(parent), True)

    # Object properties: range is a class
    for prop_id, prop_def in ontology_subset.object_properties.items():
        uri = element_uri(prop_id, prop_def)
        describe(prop_id, prop_def, uri, "ObjectProperty")
        add_domain(uri, prop_def)

        range_val = field(prop_def, 'range')
        if range_val:
            add(uri, rdfs_ns + "range", class_uri(range_val), True)

    # Datatype properties: range is a datatype such as xsd:string
    for prop_id, prop_def in ontology_subset.datatype_properties.items():
        uri = element_uri(prop_id, prop_def)
        describe(prop_id, prop_def, uri, "DatatypeProperty")
        add_domain(uri, prop_def)

        # 'range' matches the other fields; 'rdfs:range' kept for
        # definitions that still use the prefixed key.
        range_val = field(prop_def, 'range') or field(prop_def, 'rdfs:range')
        if range_val:
            if range_val.startswith('xsd:'):
                range_uri = f"{xsd_ns}{range_val[4:]}"
            else:
                range_uri = range_val
            add(uri, rdfs_ns + "range", range_uri, True)

    logger.info(f"Generated {len(ontology_triples)} triples describing ontology elements")
    return ontology_triples
|
|
|
|
def build_entity_contexts(self, triples: List[Triple]) -> List[EntityContext]:
    """Derive a textual context per entity from label/definition triples.

    Literal objects of rdfs:label triples are collected as labels, and
    literal objects of ``DEFINITION`` or schema.org description triples
    as definitions, grouped by subject. A subject with at least one of
    these yields one ``EntityContext`` whose text is "Label: <first
    label>" followed by its definitions, joined with ". ".

    Args:
        triples: List of extracted triples

    Returns:
        List of EntityContext objects
    """
    # subject -> {'labels': [...], 'definitions': [...]}, in first-seen order
    entity_data = {}

    for triple in triples:
        subj, pred, obj = triple.s, triple.p, triple.o
        subject_uri = subj.iri if subj.type == IRI else subj.value
        predicate_uri = pred.iri if pred.type == IRI else pred.value
        object_val = obj.value if obj.type == LITERAL else obj.iri

        collected = entity_data.setdefault(
            subject_uri, {'labels': [], 'definitions': []}
        )

        # Only literal objects contribute text
        if obj.type != LITERAL:
            continue

        if predicate_uri == RDF_LABEL:
            collected['labels'].append(object_val)
        elif predicate_uri == DEFINITION or predicate_uri == "https://schema.org/description":
            collected['definitions'].append(object_val)

    entity_contexts = []
    for subject_uri, data in entity_data.items():
        # At most one label line, then every definition
        context_parts = [f"Label: {label}" for label in data['labels'][:1]]
        context_parts.extend(data['definitions'])

        # Subjects without any label or definition produce no context
        if not context_parts:
            continue

        entity_contexts.append(EntityContext(
            entity=make_term(subject_uri, is_uri=True),
            context=". ".join(context_parts)
        ))

    logger.debug(f"Built {len(entity_contexts)} entity contexts from {len(triples)} triples")
    return entity_contexts
|
|
|
|
@staticmethod
def add_args(parser):
    """Register this service's command-line options on ``parser``.

    Adds the concurrency, top-k, similarity-threshold and batch-size
    options, then delegates to ``FlowProcessor.add_args``.
    """
    option_specs = (
        (('-c', '--concurrency'), {
            'type': int,
            'default': default_concurrency,
            'help': f'Concurrent processing threads (default: {default_concurrency})',
        }),
        (('--top-k',), {
            'type': int,
            'default': 10,
            'help': 'Number of top ontology elements to retrieve (default: 10)',
        }),
        (('--similarity-threshold',), {
            'type': float,
            'default': 0.3,
            'help': 'Similarity threshold for ontology matching (default: 0.3, range: 0.0-1.0)',
        }),
        (('--triples-batch-size',), {
            'type': int,
            'default': default_triples_batch_size,
            'help': f'Maximum triples per output message (default: {default_triples_batch_size})',
        }),
        (('--entity-batch-size',), {
            'type': int,
            'default': default_entity_batch_size,
            'help': f'Maximum entity contexts per output message (default: {default_entity_batch_size})',
        }),
    )
    for flags, options in option_specs:
        parser.add_argument(*flags, **options)

    FlowProcessor.add_args(parser)
|
|
|
|
|
|
def run():
    """Start the OntoRAG extraction service.

    Hands the default service identifier and this module's docstring to
    ``Processor.launch``.
    """
    Processor.launch(default_ident, __doc__)