Fix crash: defer ontology config until components are initialized; prefer NLTK punkt_tab with punkt fallback

2026-07-13 23:32:11 +02:00 · 2025-11-12 16:15:22 +00:00 · 2025-11-12 16:15:22 +00:00 · d624247dbd
commit d624247dbd
parent dfd7ad3a56
2 changed files with 30 additions and 6 deletions
--- a/trustgraph-flow/trustgraph/extract/kg/ontology/extract.py
+++ b/trustgraph-flow/trustgraph/extract/kg/ontology/extract.py
@@ -88,6 +88,7 @@ class Processor(FlowProcessor):
        # Track loaded ontology version
        self.current_ontology_version = None
        self.loaded_ontology_ids = set()
+        self.pending_config = None  # Store config until components initialized
    async def initialize_components(self, flow):
        """Initialize OntoRAG components."""
@@ -124,8 +125,12 @@ class Processor(FlowProcessor):
            self.initialized = True
            logger.info("OntoRAG components initialized successfully")
-            # NOTE: Ontologies will be loaded via on_ontology_config() handler
+            # Process pending config if available
-            # when ConfigPush messages arrive (including initial config on startup)
+            if self.pending_config:
+                logger.info("Processing pending config from startup")
+                config, version = self.pending_config
+                self.pending_config = None
+                await self.on_ontology_config(config, version)
        except Exception as e:
            logger.error(f"Failed to initialize OntoRAG components: {e}", exc_info=True)
@@ -156,6 +161,12 @@ class Processor(FlowProcessor):
                logger.warning("No 'ontology' section in config")
                return
+            # Check if components are initialized
+            if not self.ontology_loader:
+                logger.debug("Components not yet initialized, storing config for later processing")
+                self.pending_config = (config, version)
+                return
            ontology_configs = config["ontology"]
            # Parse ontology definitions
--- a/trustgraph-flow/trustgraph/extract/kg/ontology/text_processor.py
+++ b/trustgraph-flow/trustgraph/extract/kg/ontology/text_processor.py
@@ -14,9 +14,16 @@ logger = logging.getLogger(__name__)
 # Ensure required NLTK data is downloaded
 try:
-    nltk.data.find('tokenizers/punkt')
+    nltk.data.find('tokenizers/punkt_tab')
 except LookupError:
-    nltk.download('punkt', quiet=True)
+    try:
+        nltk.download('punkt_tab', quiet=True)
+    except:
+        # Fallback to older punkt if punkt_tab not available
+        try:
+            nltk.download('punkt', quiet=True)
+        except:
+            pass
 try:
    nltk.data.find('taggers/averaged_perceptron_tagger')
@@ -44,8 +51,14 @@ class SentenceSplitter:
    def __init__(self):
        """Initialize sentence splitter."""
-        self.sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
+        try:
-        logger.info("Using NLTK sentence tokenizer")
+            # Try newer punkt_tab first
+            self.sent_detector = nltk.data.load('tokenizers/punkt_tab/english/')
+            logger.info("Using NLTK sentence tokenizer (punkt_tab)")
+        except:
+            # Fallback to older punkt
+            self.sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
+            logger.info("Using NLTK sentence tokenizer (punkt)")
    def split(self, text: str) -> List[str]:
        """Split text into sentences.