Fix flow lifetimes

2026-05-01 03:16:23 +02:00 · 2025-11-12 16:48:49 +00:00 · 2025-11-12 16:48:49 +00:00 · 0410e7dc9d
commit 0410e7dc9d
parent d624247dbd
1 changed files with 54 additions and 51 deletions
--- a/trustgraph-flow/trustgraph/extract/kg/ontology/extract.py
+++ b/trustgraph-flow/trustgraph/extract/kg/ontology/extract.py
@ -74,12 +74,12 @@ class Processor(FlowProcessor):
        # Register config handler for ontology updates
        self.register_config_handler(self.on_ontology_config)
-        # Initialize components
+        # Shared components (not flow-specific)
-        self.ontology_loader = None
+        self.ontology_loader = OntologyLoader()
        self.ontology_embedder = None
        self.text_processor = TextProcessor()
-        self.ontology_selector = None
+
-        self.initialized = False
+        # Per-flow components (each flow gets its own embedder/vector store/selector)
        self.flow_components = {}  # flow_id -> {embedder, vector_store, selector}
        # Configuration
        self.top_k = params.get("top_k", 10)
@ -88,17 +88,27 @@ class Processor(FlowProcessor):
        # Track loaded ontology version
        self.current_ontology_version = None
        self.loaded_ontology_ids = set()
        self.pending_config = None  # Store config until components initialized
-    async def initialize_components(self, flow):
+    async def initialize_flow_components(self, flow):
-        """Initialize OntoRAG components."""
+        """Initialize per-flow OntoRAG components.
-        if self.initialized:
+
-            return
+        Each flow gets its own vector store and embedder to support
        different embedding models across flows.
        Args:
            flow: Flow object for this processing context
        Returns:
            flow_id: Identifier for this flow's components
        """
        # Use flow object as identifier
        flow_id = id(flow)
        if flow_id in self.flow_components:
            return flow_id  # Already initialized for this flow
        try:
-            # Initialize ontology loader (no ConfigTableStore needed)
+            logger.info(f"Initializing components for flow {flow_id}")
            self.ontology_loader = OntologyLoader()
            logger.info("Ontology loader initialized")
            # Initialize vector store (FAISS only, no fallback)
            vector_store = InMemoryVectorStore(
@ -109,37 +119,46 @@ class Processor(FlowProcessor):
            # Use embeddings client directly (no wrapper needed)
            embeddings_client = flow("embeddings-request")
-            self.ontology_embedder = OntologyEmbedder(
+            ontology_embedder = OntologyEmbedder(
                embedding_service=embeddings_client,
                vector_store=vector_store
            )
            # Embed all loaded ontologies for this flow
            if self.ontology_loader.get_all_ontologies():
                logger.info(f"Embedding ontologies for flow {flow_id}")
                for ont_id, ontology in self.ontology_loader.get_all_ontologies().items():
                    await ontology_embedder.embed_ontology(ontology)
                logger.info(f"Embedded {ontology_embedder.get_embedded_count()} ontology elements for flow {flow_id}")
            # Initialize ontology selector
-            self.ontology_selector = OntologySelector(
+            ontology_selector = OntologySelector(
-                ontology_embedder=self.ontology_embedder,
+                ontology_embedder=ontology_embedder,
                ontology_loader=self.ontology_loader,
                top_k=self.top_k,
                similarity_threshold=self.similarity_threshold
            )
-            self.initialized = True
+            # Store flow-specific components
-            logger.info("OntoRAG components initialized successfully")
+            self.flow_components[flow_id] = {
                'embedder': ontology_embedder,
                'vector_store': vector_store,
                'selector': ontology_selector
            }
-            # Process pending config if available
+            logger.info(f"Flow {flow_id} components initialized successfully")
-            if self.pending_config:
+            return flow_id
                logger.info("Processing pending config from startup")
                config, version = self.pending_config
                self.pending_config = None
                await self.on_ontology_config(config, version)
        except Exception as e:
-            logger.error(f"Failed to initialize OntoRAG components: {e}", exc_info=True)
+            logger.error(f"Failed to initialize flow {flow_id} components: {e}", exc_info=True)
            raise
    async def on_ontology_config(self, config, version):
        """
        Handle ontology configuration updates from ConfigPush queue.
        Parses and stores ontologies. Embedding happens per-flow on first message.
        Called automatically when:
        - Processor starts (gets full config history via start_of_messages=True)
        - Config service pushes updates (immediate event-driven notification)
@ -161,12 +180,6 @@ class Processor(FlowProcessor):
                logger.warning("No 'ontology' section in config")
                return
            # Check if components are initialized
            if not self.ontology_loader:
                logger.debug("Components not yet initialized, storing config for later processing")
                self.pending_config = (config, version)
                return
            ontology_configs = config["ontology"]
            # Parse ontology definitions
@ -196,20 +209,10 @@ class Processor(FlowProcessor):
            # Update ontology loader's internal state
            self.ontology_loader.update_ontologies(ontologies)
-            # Re-embed changed ontologies
+            # Clear all flow components to force re-embedding with new ontologies
-            if self.ontology_embedder:
+            if added_ids or removed_ids or updated_ids:
-                # Remove embeddings for deleted ontologies
+                logger.info("Clearing flow components to trigger re-embedding")
-                for ont_id in removed_ids:
+                self.flow_components.clear()
                    self.ontology_embedder.remove_ontology(ont_id)
                # Embed new and updated ontologies
                for ont_id in added_ids | updated_ids:
                    if ont_id in self.ontology_loader.get_all_ontologies():
                        await self.ontology_embedder.embed_ontology(
                            self.ontology_loader.get_ontology(ont_id)
                        )
                logger.info(f"Re-embedded ontologies, total elements: {self.ontology_embedder.get_embedded_count()}")
            # Update tracking
            self.current_ontology_version = version
@ -225,9 +228,9 @@ class Processor(FlowProcessor):
        v = msg.value()
        logger.info(f"Extracting ontology-based triples from {v.metadata.id}...")
-        # Initialize components if needed
+        # Initialize flow-specific components if needed
-        if not self.initialized:
+        flow_id = await self.initialize_flow_components(flow)
-            await self.initialize_components(flow)
+        components = self.flow_components[flow_id]
        chunk = v.chunk.decode("utf-8")
        logger.debug(f"Processing chunk: {chunk[:200]}...")
@ -237,8 +240,8 @@ class Processor(FlowProcessor):
            segments = self.text_processor.process_chunk(chunk, extract_phrases=True)
            logger.debug(f"Split chunk into {len(segments)} segments")
-            # Select relevant ontology subset
+            # Select relevant ontology subset (using flow-specific selector)
-            ontology_subsets = await self.ontology_selector.select_ontology_subset(segments)
+            ontology_subsets = await components['selector'].select_ontology_subset(segments)
            if not ontology_subsets:
                logger.warning("No relevant ontology elements found for chunk")
@ -252,7 +255,7 @@ class Processor(FlowProcessor):
            # Merge subsets if multiple ontologies matched
            if len(ontology_subsets) > 1:
-                ontology_subset = self.ontology_selector.merge_subsets(ontology_subsets)
+                ontology_subset = components['selector'].merge_subsets(ontology_subsets)
            else:
                ontology_subset = ontology_subsets[0]