diff --git a/trustgraph-flow/trustgraph/extract/kg/ontology/extract.py b/trustgraph-flow/trustgraph/extract/kg/ontology/extract.py index 7e474855..1f389e25 100644 --- a/trustgraph-flow/trustgraph/extract/kg/ontology/extract.py +++ b/trustgraph-flow/trustgraph/extract/kg/ontology/extract.py @@ -83,7 +83,7 @@ class Processor(FlowProcessor): # Configuration self.top_k = params.get("top_k", 10) - self.similarity_threshold = params.get("similarity_threshold", 0.7) + self.similarity_threshold = params.get("similarity_threshold", 0.3) # Track loaded ontology version self.current_ontology_version = None @@ -272,13 +272,15 @@ class Processor(FlowProcessor): f"{len(ontology_subset.object_properties)} object properties, " f"{len(ontology_subset.datatype_properties)} datatype properties") - # Build extraction prompt - prompt = self.build_extraction_prompt(chunk, ontology_subset) + # Build extraction prompt variables + prompt_variables = self.build_extraction_variables(chunk, ontology_subset) # Call prompt service for extraction try: - triples_response = await flow("prompt-request").extract_ontology_triples( - prompt=prompt + # Use prompt() method with extract-with-ontologies prompt ID + triples_response = await flow("prompt-request").prompt( + id="extract-with-ontologies", + variables=prompt_variables ) logger.debug(f"Extraction response: {triples_response}") @@ -477,8 +479,8 @@ TRIPLES (JSON array):""" parser.add_argument( '--similarity-threshold', type=float, - default=0.7, - help='Similarity threshold for ontology matching (default: 0.7)' + default=0.3, + help='Similarity threshold for ontology matching (default: 0.3, range: 0.0-1.0)' ) FlowProcessor.add_args(parser) diff --git a/trustgraph-flow/trustgraph/extract/kg/ontology/ontology_selector.py b/trustgraph-flow/trustgraph/extract/kg/ontology/ontology_selector.py index d76f585e..6b493ea2 100644 --- a/trustgraph-flow/trustgraph/extract/kg/ontology/ontology_selector.py +++ b/trustgraph-flow/trustgraph/extract/kg/ontology/ontology_selector.py @@ -84,22 +84,20 @@ class OntologySelector: # Check if vector store has any elements vector_store = self.embedder.get_vector_store() store_size = vector_store.size() - logger.debug(f"Vector store size: {store_size} elements") + logger.info(f"Vector store size: {store_size} elements") if store_size == 0: logger.warning("Vector store is empty - no ontology elements embedded") return relevant_elements - # Process each segment - for segment in segments: + # Process each segment (log first few for debugging) + for i, segment in enumerate(segments): # Get embedding for segment embedding = await self.embedder.embed_text(segment.text) if embedding is None: logger.warning(f"Failed to embed segment: {segment.text[:50]}...") continue - logger.debug(f"Searching for segment: {segment.text[:100]}... (embedding shape: {embedding.shape})") - # Search vector store with no threshold to see all scores all_results = vector_store.search( embedding=embedding, @@ -107,16 +105,17 @@ class OntologySelector: threshold=0.0 # Get all results to see scores ) - # Log top scores for debugging - if all_results: + # Log top scores for first 3 segments to debug + if i < 3 and all_results: top_scores = [r.score for r in all_results[:3]] - logger.debug(f"Top 3 scores for segment: {top_scores}, threshold={self.similarity_threshold}") + top_elements = [r.metadata['element'] for r in all_results[:3]] + logger.info(f"Segment {i}: '{segment.text[:60]}...'") + logger.info(f" Top 3 scores: {top_scores} (threshold={self.similarity_threshold})") + logger.info(f" Top 3 elements: {top_elements}") # Filter by threshold results = [r for r in all_results if r.score >= self.similarity_threshold] - logger.debug(f"Found {len(results)} results above threshold (out of {len(all_results)} total)") - # Process results for result in results: metadata = result.metadata