diff --git a/trustgraph-flow/trustgraph/extract/kg/ontology/extract.py b/trustgraph-flow/trustgraph/extract/kg/ontology/extract.py index 10247464..403f27db 100644 --- a/trustgraph-flow/trustgraph/extract/kg/ontology/extract.py +++ b/trustgraph-flow/trustgraph/extract/kg/ontology/extract.py @@ -88,6 +88,7 @@ class Processor(FlowProcessor): # Track loaded ontology version self.current_ontology_version = None self.loaded_ontology_ids = set() + self.pending_config = None # Store config until components initialized async def initialize_components(self, flow): """Initialize OntoRAG components.""" @@ -124,8 +125,12 @@ class Processor(FlowProcessor): self.initialized = True logger.info("OntoRAG components initialized successfully") - # NOTE: Ontologies will be loaded via on_ontology_config() handler - # when ConfigPush messages arrive (including initial config on startup) + # Process pending config if available + if self.pending_config: + logger.info("Processing pending config from startup") + config, version = self.pending_config + self.pending_config = None + await self.on_ontology_config(config, version) except Exception as e: logger.error(f"Failed to initialize OntoRAG components: {e}", exc_info=True) @@ -156,6 +161,12 @@ class Processor(FlowProcessor): logger.warning("No 'ontology' section in config") return + # Check if components are initialized + if not self.ontology_loader: + logger.debug("Components not yet initialized, storing config for later processing") + self.pending_config = (config, version) + return + ontology_configs = config["ontology"] # Parse ontology definitions diff --git a/trustgraph-flow/trustgraph/extract/kg/ontology/text_processor.py b/trustgraph-flow/trustgraph/extract/kg/ontology/text_processor.py index e6c92f98..b0231e3d 100644 --- a/trustgraph-flow/trustgraph/extract/kg/ontology/text_processor.py +++ b/trustgraph-flow/trustgraph/extract/kg/ontology/text_processor.py @@ -14,9 +14,16 @@ logger = logging.getLogger(__name__) # Ensure required 
NLTK data is downloaded try: - nltk.data.find('tokenizers/punkt') + nltk.data.find('tokenizers/punkt_tab') except LookupError: - nltk.download('punkt', quiet=True) + try: + nltk.download('punkt_tab', quiet=True) + except Exception: + # Fallback to older punkt if punkt_tab not available + try: + nltk.download('punkt', quiet=True) + except Exception: + pass try: nltk.data.find('taggers/averaged_perceptron_tagger') @@ -44,8 +51,14 @@ class SentenceSplitter: def __init__(self): """Initialize sentence splitter.""" - self.sent_detector = nltk.data.load('tokenizers/punkt/english.pickle') - logger.info("Using NLTK sentence tokenizer") + try: + # Try newer punkt_tab first (NLTK >= 3.8.2); punkt_tab is not a pickle, so use PunktTokenizer + self.sent_detector = nltk.tokenize.PunktTokenizer('english') + logger.info("Using NLTK sentence tokenizer (punkt_tab)") + except Exception: + # Fallback to older punkt + self.sent_detector = nltk.data.load('tokenizers/punkt/english.pickle') + logger.info("Using NLTK sentence tokenizer (punkt)") def split(self, text: str) -> List[str]: """Split text into sentences.