diff --git a/trustgraph-flow/trustgraph/extract/kg/ontology/extract.py b/trustgraph-flow/trustgraph/extract/kg/ontology/extract.py index 764600dc..7e474855 100644 --- a/trustgraph-flow/trustgraph/extract/kg/ontology/extract.py +++ b/trustgraph-flow/trustgraph/extract/kg/ontology/extract.py @@ -93,7 +93,8 @@ class Processor(FlowProcessor): """Initialize per-flow OntoRAG components. Each flow gets its own vector store and embedder to support - different embedding models across flows. + different embedding models across flows. The vector store dimension + is auto-detected from the embeddings service. Args: flow: Flow object for this processing context @@ -110,15 +111,22 @@ class Processor(FlowProcessor): try: logger.info(f"Initializing components for flow {flow_id}") - # Initialize vector store (FAISS only, no fallback) - vector_store = InMemoryVectorStore( - dimension=1536, # text-embedding-3-small - index_type='flat' - ) - # Use embeddings client directly (no wrapper needed) embeddings_client = flow("embeddings-request") + # Detect embedding dimension by embedding a test string + logger.info("Detecting embedding dimension from embeddings service...") + test_embedding_response = await embeddings_client.embed("test") + test_embedding = test_embedding_response[0] # Extract from [[vector]] + dimension = len(test_embedding) + logger.info(f"Detected embedding dimension: {dimension}") + + # Initialize vector store with detected dimension + vector_store = InMemoryVectorStore( + dimension=dimension, + index_type='flat' + ) + ontology_embedder = OntologyEmbedder( embedding_service=embeddings_client, vector_store=vector_store @@ -143,10 +151,11 @@ class Processor(FlowProcessor): self.flow_components[flow_id] = { 'embedder': ontology_embedder, 'vector_store': vector_store, - 'selector': ontology_selector + 'selector': ontology_selector, + 'dimension': dimension } - logger.info(f"Flow {flow_id} components initialized successfully") + logger.info(f"Flow {flow_id} components initialized successfully (dimension={dimension})") return flow_id except Exception as e: diff --git a/trustgraph-flow/trustgraph/extract/kg/ontology/ontology_embedder.py b/trustgraph-flow/trustgraph/extract/kg/ontology/ontology_embedder.py index 6430a0a7..8eee76b4 100644 --- a/trustgraph-flow/trustgraph/extract/kg/ontology/ontology_embedder.py +++ b/trustgraph-flow/trustgraph/extract/kg/ontology/ontology_embedder.py @@ -164,6 +164,9 @@ class OntologyEmbedder: # Convert to numpy array embeddings = np.array(embeddings_list) + # Log embedding shape for debugging + logger.debug(f"Embeddings shape: {embeddings.shape}, expected: ({len(batch)}, {self.vector_store.dimension})") + # Store in vector store ids = [elem['id'] for elem in batch] metadata_list = [elem['metadata'] for elem in batch] @@ -174,7 +177,7 @@ class OntologyEmbedder: logger.debug(f"Embedded batch of {len(batch)} elements from ontology {ontology.id}") except Exception as e: - logger.error(f"Failed to embed batch for ontology {ontology.id}: {e}") + logger.error(f"Failed to embed batch for ontology {ontology.id}: {e}", exc_info=True) self.embedded_ontologies.add(ontology.id) logger.info(f"Embedded {embedded_count} elements from ontology {ontology.id}")