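Fix NLTK tagger resource name (averaged_perceptron_tagger_eng, with fallback to the older name) and switch the ontology embedder from embed_batch() to parallel single-text embed() calls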

This commit is contained in:
Cyber MacGeddon 2025-11-12 16:50:47 +00:00
parent 0410e7dc9d
commit 4fb602242d
2 changed files with 20 additions and 6 deletions

View file

@@ -3,6 +3,7 @@ Ontology embedder component for OntoRAG system.
Generates and stores embeddings for ontology elements. Generates and stores embeddings for ontology elements.
""" """
import asyncio
import logging import logging
import numpy as np import numpy as np
from typing import Dict, List, Any, Optional from typing import Dict, List, Any, Optional
@@ -152,8 +153,12 @@ class OntologyEmbedder:
# Get embeddings for batch # Get embeddings for batch
texts = [elem['text'] for elem in batch] texts = [elem['text'] for elem in batch]
try: try:
# Call embedding service (async) # Call embedding service for each text (EmbeddingsClient.embed() is single-text)
embeddings = await self.embedding_service.embed_batch(texts) embedding_tasks = [self.embedding_service.embed(text) for text in texts]
embeddings_list = await asyncio.gather(*embedding_tasks)
# Convert to numpy array
embeddings = np.array(embeddings_list)
# Store in vector store # Store in vector store
ids = [elem['id'] for elem in batch] ids = [elem['id'] for elem in batch]
@@ -226,8 +231,10 @@ class OntologyEmbedder:
return None return None
try: try:
embeddings = await self.embedding_service.embed_batch(texts) # EmbeddingsClient.embed() is single-text, so call in parallel
return embeddings embedding_tasks = [self.embedding_service.embed(text) for text in texts]
embeddings_list = await asyncio.gather(*embedding_tasks)
return np.array(embeddings_list)
except Exception as e: except Exception as e:
logger.error(f"Failed to embed texts: {e}") logger.error(f"Failed to embed texts: {e}")
return None return None

View file

@@ -26,9 +26,16 @@ except LookupError:
pass pass
try: try:
nltk.data.find('taggers/averaged_perceptron_tagger') nltk.data.find('taggers/averaged_perceptron_tagger_eng')
except LookupError: except LookupError:
nltk.download('averaged_perceptron_tagger', quiet=True) try:
nltk.download('averaged_perceptron_tagger_eng', quiet=True)
except:
# Fallback to older name
try:
nltk.download('averaged_perceptron_tagger', quiet=True)
except:
pass
try: try:
nltk.data.find('corpora/stopwords') nltk.data.find('corpora/stopwords')