Fix crash

Cyber MacGeddon 2025-11-12 16:15:22 +00:00
parent dfd7ad3a56
commit d624247dbd
2 changed files with 30 additions and 6 deletions


@@ -88,6 +88,7 @@ class Processor(FlowProcessor):
        # Track loaded ontology version
        self.current_ontology_version = None
        self.loaded_ontology_ids = set()
        self.pending_config = None  # Store config until components initialized

    async def initialize_components(self, flow):
        """Initialize OntoRAG components."""
@@ -124,8 +125,12 @@ class Processor(FlowProcessor):
            self.initialized = True
            logger.info("OntoRAG components initialized successfully")

            # NOTE: Ontologies will be loaded via on_ontology_config() handler
            # when ConfigPush messages arrive (including initial config on startup)
            # Process pending config if available
            if self.pending_config:
                logger.info("Processing pending config from startup")
                config, version = self.pending_config
                self.pending_config = None
                await self.on_ontology_config(config, version)

        except Exception as e:
            logger.error(f"Failed to initialize OntoRAG components: {e}", exc_info=True)
@@ -156,6 +161,12 @@ class Processor(FlowProcessor):
            logger.warning("No 'ontology' section in config")
            return

        # Check if components are initialized
        if not self.ontology_loader:
            logger.debug("Components not yet initialized, storing config for later processing")
            self.pending_config = (config, version)
            return

        ontology_configs = config["ontology"]

        # Parse ontology definitions
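
The change in this first file defers ontology config handling until the components exist: on_ontology_config() buffers the (config, version) pair in pending_config while ontology_loader is still None, and initialize_components() replays it once setup succeeds. Below is a minimal standalone sketch of that buffering pattern; the stripped-down Processor body and the object() stand-in for the loader are illustrative assumptions, not the full implementation from this repository.

import asyncio
import logging

logger = logging.getLogger(__name__)


class Processor:
    """Stripped-down sketch of the deferred-config pattern (hypothetical)."""

    def __init__(self):
        self.ontology_loader = None   # created later by initialize_components()
        self.pending_config = None    # holds (config, version) that arrived too early

    async def initialize_components(self):
        self.ontology_loader = object()   # stand-in for the real loader component
        # Replay any config that was pushed before the components existed.
        if self.pending_config:
            config, version = self.pending_config
            self.pending_config = None
            await self.on_ontology_config(config, version)

    async def on_ontology_config(self, config, version):
        if not self.ontology_loader:
            # Components not ready yet: buffer instead of dereferencing a None loader.
            self.pending_config = (config, version)
            return
        logger.info("Applying ontology config version %s", version)
        # ... parse config["ontology"] and load ontologies here ...


async def main():
    p = Processor()
    await p.on_ontology_config({"ontology": {}}, version=1)  # arrives before init
    await p.initialize_components()                          # pending config is replayed

asyncio.run(main())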


@@ -14,9 +14,16 @@ logger = logging.getLogger(__name__)

# Ensure required NLTK data is downloaded
try:
    nltk.data.find('tokenizers/punkt')
    nltk.data.find('tokenizers/punkt_tab')
except LookupError:
    nltk.download('punkt', quiet=True)
    try:
        nltk.download('punkt_tab', quiet=True)
    except:
        # Fallback to older punkt if punkt_tab not available
        try:
            nltk.download('punkt', quiet=True)
        except:
            pass

try:
    nltk.data.find('taggers/averaged_perceptron_tagger')
@@ -44,8 +51,14 @@ class SentenceSplitter:

    def __init__(self):
        """Initialize sentence splitter."""
        self.sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
        logger.info("Using NLTK sentence tokenizer")
        try:
            # Try newer punkt_tab first
            self.sent_detector = nltk.data.load('tokenizers/punkt_tab/english/')
            logger.info("Using NLTK sentence tokenizer (punkt_tab)")
        except:
            # Fallback to older punkt
            self.sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
            logger.info("Using NLTK sentence tokenizer (punkt)")

    def split(self, text: str) -> List[str]:
        """Split text into sentences.