From 4fb602242de2939b1e55125d2db4ab0c588745cd Mon Sep 17 00:00:00 2001 From: Cyber MacGeddon Date: Wed, 12 Nov 2025 16:50:47 +0000 Subject: [PATCH] Fix nltk stuff --- .../extract/kg/ontology/ontology_embedder.py | 15 +++++++++++---- .../extract/kg/ontology/text_processor.py | 11 +++++++++-- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/trustgraph-flow/trustgraph/extract/kg/ontology/ontology_embedder.py b/trustgraph-flow/trustgraph/extract/kg/ontology/ontology_embedder.py index cbb0f0bf..4fdaaf7f 100644 --- a/trustgraph-flow/trustgraph/extract/kg/ontology/ontology_embedder.py +++ b/trustgraph-flow/trustgraph/extract/kg/ontology/ontology_embedder.py @@ -3,6 +3,7 @@ Ontology embedder component for OntoRAG system. Generates and stores embeddings for ontology elements. """ +import asyncio import logging import numpy as np from typing import Dict, List, Any, Optional @@ -152,8 +153,12 @@ class OntologyEmbedder: # Get embeddings for batch texts = [elem['text'] for elem in batch] try: - # Call embedding service (async) - embeddings = await self.embedding_service.embed_batch(texts) + # Call embedding service for each text (EmbeddingsClient.embed() is single-text) + embedding_tasks = [self.embedding_service.embed(text) for text in texts] + embeddings_list = await asyncio.gather(*embedding_tasks) + + # Convert to numpy array + embeddings = np.array(embeddings_list) # Store in vector store ids = [elem['id'] for elem in batch] @@ -226,8 +231,10 @@ class OntologyEmbedder: return None try: - embeddings = await self.embedding_service.embed_batch(texts) - return embeddings + # EmbeddingsClient.embed() is single-text, so call in parallel + embedding_tasks = [self.embedding_service.embed(text) for text in texts] + embeddings_list = await asyncio.gather(*embedding_tasks) + return np.array(embeddings_list) except Exception as e: logger.error(f"Failed to embed texts: {e}") return None diff --git a/trustgraph-flow/trustgraph/extract/kg/ontology/text_processor.py b/trustgraph-flow/trustgraph/extract/kg/ontology/text_processor.py index b0231e3d..685699d1 100644 --- a/trustgraph-flow/trustgraph/extract/kg/ontology/text_processor.py +++ b/trustgraph-flow/trustgraph/extract/kg/ontology/text_processor.py @@ -26,9 +26,16 @@ except LookupError: pass try: - nltk.data.find('taggers/averaged_perceptron_tagger') + nltk.data.find('taggers/averaged_perceptron_tagger_eng') except LookupError: - nltk.download('averaged_perceptron_tagger', quiet=True) + try: + nltk.download('averaged_perceptron_tagger_eng', quiet=True) + except: + # Fallback to older name + try: + nltk.download('averaged_perceptron_tagger', quiet=True) + except: + pass try: nltk.data.find('corpora/stopwords')