1. dynamically add insight

2. insight from scratch in real time
2026-05-08 15:22:38 +02:00 · 2024-10-11 18:37:35 +08:00 · 2024-10-11 18:37:35 +08:00 · eda9322361
commit eda9322361
parent f7374c03af
6 changed files with 115 additions and 18 deletions
--- a/expo/insights/instruction_generator.py
+++ b/expo/insights/instruction_generator.py
@ -1,6 +1,7 @@
 import json
 import os
 import random
+from difflib import SequenceMatcher

 from expo.insights.solution_designer import SolutionDesigner
 from expo.utils import clean_json_from_rsp, load_data_config, mcts_logger
@ -33,11 +34,21 @@ DATA_CONFIG = load_data_config()
 class InstructionGenerator:
    data_config = DATA_CONFIG

-    def __init__(self, file_path, use_fixed_insights=False):
-        self.file_path = file_path
+    def __init__(self, state, use_fixed_insights, from_scratch):
+        self.state = state
+        self.file_path = state["exp_pool_path"]
+        self.dataset_info_path = f"{self.data_config['datasets_dir']}/{state['task']}/dataset_info.json"
+        with open(self.dataset_info_path, "r") as file:
+            self.dataset_info = json.load(file)
        self.use_fixed_insights = use_fixed_insights
-        self.analysis_pool = self.load_insight_pool(file_path, use_fixed_insights)
        self.proposer = SolutionDesigner()
+        self.from_scratch = from_scratch
+
+    async def initialize(self):
+        if self.from_scratch:
+            self.insight_pool = await self.generate_solutions_from_scratch(self.dataset_info, self.state["task"])
+        else:
+            self.insight_pool = self.load_insight_pool(self.file_path, self.use_fixed_insights)

    @staticmethod
    def load_json_data(json_dir):
@ -84,14 +95,14 @@ class InstructionGenerator:
            data.extend(fixed_insights)
        for item in data:
            if "task_id" not in item:
-                raise ValueError("task_id is not found in the analysis pool")
+                raise ValueError("task_id is not found in the insight_pool")

        if task_id:
            data = [item for item in data if int(item["task_id"]) == int(task_id)]
        return data

    async def generate_new_instructions(self, task_id, original_instruction, max_num, ext_info=None):
-        data = self.analysis_pool
+        data = self.insight_pool
        new_instructions = []
        if len(data) == 0:
            mcts_logger.log("MCTS", f"No insights available for task {task_id}")
@ -108,6 +119,34 @@ class InstructionGenerator:
            new_instructions.append(new_instruction)
        return new_instructions

+    async def propose_new_insights(self, solution, score):
+        new_insights = await self.proposer.propose_insights(solution, score)
+        added_insights = self.add_insight(new_insights)
+        return added_insights
+
+    async def generate_solutions_from_scratch(self, dataset_info, dataset_name):
+        insight_pool = await self.proposer.generate_solutions(dataset_info, dataset_name, save_analysis_pool=False)
+        return insight_pool
+
+    def add_insight(self, new_insights):
+        added_insights = []
+        for new_insight in new_insights:
+            if not self.is_similar_to_existing(new_insight):
+                added_insights.append(new_insight)
+                self.insight_pool.append(new_insight)
+        return added_insights
+
+    def is_similar_to_existing(self, new_insight, similarity_threshold=0.8):
+        for existing_insight in self.insight_pool:
+            similarity = self.calculate_similarity(new_insight["Analysis"], existing_insight["Analysis"])
+            if similarity > similarity_threshold:
+                return True
+        return False
+
+    @staticmethod
+    def calculate_similarity(text1, text2):
+        return SequenceMatcher(None, text1, text2).ratio()
+
    @staticmethod
    async def generate_new_instruction(original_instruction, insights, ext_info):
        prompt = CHANGE_INSTRUCTION.format(instruction=original_instruction, insights=insights)
--- a/expo/insights/solution_designer.py
+++ b/expo/insights/solution_designer.py
@ -70,6 +70,45 @@ Your model choices should be advanced enough to be helpful.
 ```
 """

+
+INSIGHT_PROPOSAL_PROMPT = """
+You are an AI assistant tasked with analyzing a machine learning solution and proposing new insights to improve its performance. Given the current solution code and development score, suggest innovative approaches to enhance the model.
+
+Current Solution Code:
+{solution_code}
+
+Development Score: {dev_score}
+
+Based on this information, propose 3-5 new insights across different aspects of the machine learning pipeline (Data Preprocessing, Feature Engineering, and Model Training). Your insights should be specific, actionable, and have the potential to improve the model's performance.
+
+Please format your response as a JSON array with the following structure:
+[
+
+    {{
+        "task_type": "Data Preprocessing",
+        "insights": [
+            "insight1",
+            "insight2"
+        ]
+    }},
+    {{
+        "task_type": "Feature Engineering",
+        "insights": [
+            "insight1",
+            "insight2"
+        ]
+    }},
+    {{
+        "task_type": "Model Training",
+        "insights": [
+            "insight1",
+            "insight2"
+        ]
+    }}
+]
+"""
+
+
 KEY_DATASET_FEATURES = [
    "NumberOfClasses",
    "NumberOfFeatures",
@ -86,7 +125,7 @@ TASK_TO_ID = {"EDA": 1, "Data Preprocessing": 2, "Feature Engineering": 3, "Mode
 class SolutionDesigner:
    data_dir: str = DATA_CONFIG["datasets_dir"]

-    async def generate_solutions(self, dataset_info, dataset_name):
+    async def generate_solutions(self, dataset_info, dataset_name, save_analysis_pool=True):
        llm = LLM()
        context = DATASET_INSIGHT_PROMPT.format(
            dataset=dataset_info["description"],
@ -96,8 +135,18 @@ class SolutionDesigner:
        rsp = await llm.aask(context)
        rsp = clean_json_from_rsp(rsp)
        analysis_pool = self.process_analysis_pool(json.loads(rsp))
-        dataset_path = f"{self.data_dir}/{dataset_name}"
-        self.save_analysis_pool(dataset_path, analysis_pool)
+        if save_analysis_pool:
+            dataset_path = f"{self.data_dir}/{dataset_name}"
+            self.save_analysis_pool(dataset_path, analysis_pool)
+        return analysis_pool
+
+    async def propose_new_insights(self, solution, score):
+        llm = LLM()
+        context = INSIGHT_PROPOSAL_PROMPT.format(solution_code=solution, dev_score=score)
+        rsp = await llm.aask(context)
+        rsp = clean_json_from_rsp(rsp)
+        new_insights = self.process_analysis_pool(json.loads(rsp))
+        return new_insights

    def process_analysis_pool(self, insights_rsp):
        analysis_pool = []