rename expo folder to sela

2026-07-20 16:51:07 +02:00 · 2024-10-22 21:33:31 +08:00 · 2024-10-22 21:33:31 +08:00 · 7c5b29de63
commit 7c5b29de63
parent 4bed19b931
33 changed files with 53 additions and 53 deletions
--- a/sela/insights/fixed_insights.json
+++ b/sela/insights/fixed_insights.json
@ -0,0 +1,22 @@
+[
+{
+    "Analysis": "Use early stopping, hyperparameter tuning, and cross-validation to avoid overfitting and improve robustness of the model.",
+    "Category": "Model Training",
+    "task_id": 4
+},
+{
+    "Analysis": "use k-fold bagging and early stopping",
+    "Category": "Model Training",
+    "task_id": 4
+},
+{
+    "Analysis": "To avoid overfitting, train a weighted ensemble model such as StackingClassifier or StackingRegressor; you could do some quick model prototyping to see which models work best and then use them in the ensemble.",
+    "Category": "Model Training",
+    "task_id": 4
+},
+{
+    "Analysis": "Please use autogluon for model training with presets='medium_quality', time_limit=None, give the dev dataset to tuning_data, and use the right eval_metric.",
+    "Category": "Model Training",
+    "task_id": 4
+}
+]
--- a/sela/insights/instruction_generator.py
+++ b/sela/insights/instruction_generator.py
@ -0,0 +1,169 @@
+import json
+import os
+import random
+from difflib import SequenceMatcher
+
+from sela.insights.solution_designer import SolutionDesigner
+from sela.utils import clean_json_from_rsp, load_data_config, mcts_logger
+from metagpt.llm import LLM
+from metagpt.schema import Message
+
+# System message sent to the LLM alongside the instruction-rewriting prompt.
+REFLECTION_SYSTEM_MSG = "As a Kaggle Grandmaster competing in a challenge, your task is to suggest potential evolutionary improvements that could enhance the performance of the baseline code."
+
+# Prompt template filled with str.format(instruction=..., insights=...).
+# The doubled braces are escapes that render as literal braces in the JSON example;
+# the reply is parsed as JSON and its "New Instruction" field is extracted.
+CHANGE_INSTRUCTION = """
+# Original instruction
+{instruction}
+
+# Insights
+{insights}
+
+Rewrite the original instruction according to the insights 
+(If the original instruction involves splitting the data, ensure that your insights are integrated with the data split instructions, 
+rather than replacing them.)
+
+# Expected Output Hard Format
+```json
+{{
+    "Original Instruction": "original instruction",
+    "New Instruction": "new instruction"
+}}
+```
+"""
+
+# Data configuration, loaded once when this module is imported.
+DATA_CONFIG = load_data_config()
+
+
+class InstructionGenerator:
+    data_config = DATA_CONFIG
+
+    def __init__(self, state, use_fixed_insights, from_scratch):
+        self.state = state
+        self.file_path = state["exp_pool_path"]
+        if state["custom_dataset_dir"]:
+            with open(f"{state['custom_dataset_dir']}/description.md", "r", encoding="utf-8") as file:
+                self.dataset_info = file.read()
+        else:
+            dataset_info_path = (
+                f"{self.data_config['datasets_dir']}/{state['dataset_config']['dataset']}/dataset_info.json"
+            )
+            with open(dataset_info_path, "r") as file:
+                self.dataset_info = json.load(file)
+        self.use_fixed_insights = use_fixed_insights
+        self.proposer = SolutionDesigner()
+        if self.file_path is None:
+            self.from_scratch = True
+        else:
+            self.from_scratch = from_scratch
+
+    async def initialize(self):
+        if self.from_scratch:
+            self.insight_pool = await self.generate_solutions_from_scratch(self.dataset_info, self.state["task"])
+        else:
+            self.insight_pool = self.load_insight_pool(self.file_path, self.use_fixed_insights)
+
+    @staticmethod
+    def load_json_data(json_dir):
+        with open(json_dir, "r") as file:
+            json_data = json.load(file)
+            return json_data
+
+    @staticmethod
+    def _random_sample(analysis, num_samples):
+        return random.sample(analysis, num_samples)
+
+    @staticmethod
+    def sample_instruction_set(data):
+        data_dict = {}
+        for item in data:
+            task_id = item["task_id"]
+            if task_id not in data_dict:
+                data_dict[task_id] = []
+            data_dict[task_id].append(item)
+        instruction_set = []
+        for task_id in sorted(data_dict.keys()):
+            instruction_set.append(random.choice(data_dict[task_id]))
+        return instruction_set
+
+    @staticmethod
+    def format_output(rsp):
+        rsp_list = []
+        new_data = []
+        rsp_list.append(rsp)
+        for item in rsp_list:
+            item_dict = json.loads(item)
+            data = {
+                "Insights": item_dict,
+            }
+            new_data.append(data)
+        return new_data
+
+    @staticmethod
+    def load_insight_pool(file_path, use_fixed_insights, task_id=None):
+        data = InstructionGenerator.load_json_data(file_path)
+        if use_fixed_insights:
+            current_directory = os.path.dirname(__file__)
+            fixed_insights = InstructionGenerator.load_json_data(f"{current_directory}/fixed_insights.json")
+            data.extend(fixed_insights)
+        for item in data:
+            if "task_id" not in item:
+                raise ValueError("task_id is not found in the insight_pool")
+
+        if task_id:
+            data = [item for item in data if int(item["task_id"]) == int(task_id)]
+        return data
+
+    async def generate_new_instructions(self, task_id, original_instruction, max_num, ext_info=None):
+        data = self.insight_pool
+        new_instructions = []
+        if len(data) == 0:
+            mcts_logger.log("MCTS", f"No insights available for task {task_id}")
+            # return [original_instruction]  # Return the original instruction if no insights are available
+        for i in range(max_num):
+            if len(data) == 0:
+                insights = "No insights available"
+            else:
+                item = data[i]
+                insights = item["Analysis"]
+            new_instruction = await InstructionGenerator.generate_new_instruction(
+                original_instruction, insights, ext_info
+            )
+            new_instructions.append(new_instruction)
+        return new_instructions
+
+    async def propose_new_insights(self, solution, score):
+        new_insights = await self.proposer.propose_insights(solution, score)
+        added_insights = self.add_insight(new_insights)
+        return added_insights
+
+    async def generate_solutions_from_scratch(self, dataset_info, dataset_name):
+        insight_pool = await self.proposer.generate_solutions(dataset_info, dataset_name, save_analysis_pool=False)
+        return insight_pool
+
+    def add_insight(self, new_insights):
+        added_insights = []
+        for new_insight in new_insights:
+            if not self.is_similar_to_existing(new_insight):
+                added_insights.append(new_insight)
+                self.insight_pool.append(new_insight)
+        return added_insights
+
+    def is_similar_to_existing(self, new_insight, similarity_threshold=0.8):
+        for existing_insight in self.insight_pool:
+            similarity = self.calculate_similarity(new_insight["Analysis"], existing_insight["Analysis"])
+            if similarity > similarity_threshold:
+                return True
+        return False
+
+    @staticmethod
+    def calculate_similarity(text1, text2):
+        return SequenceMatcher(None, text1, text2).ratio()
+
+    @staticmethod
+    async def generate_new_instruction(original_instruction, insights, ext_info):
+        prompt = CHANGE_INSTRUCTION.format(instruction=original_instruction, insights=insights)
+        llm = LLM()
+        context = llm.format_msg([Message(content=prompt, role="user")])
+        llm_response = await llm.aask(context, system_msgs=[REFLECTION_SYSTEM_MSG])
+        rsp = clean_json_from_rsp(llm_response)
+        new_instruction = json.loads(rsp)["New Instruction"]
+        return new_instruction
--- a/sela/insights/solution_designer.py
+++ b/sela/insights/solution_designer.py
@ -0,0 +1,183 @@
+import json
+
+from sela.utils import clean_json_from_rsp, load_data_config
+from metagpt.llm import LLM
+
+# Data configuration, loaded once when this module is imported.
+DATA_CONFIG = load_data_config()
+
+
+# Description section used when the dataset info is a dict; filled with its
+# "description", a JSON dump of selected metadata, and its "df_head".
+DATASET_DESCRIPTION_SELA_PROMPT = """
+# Dataset Description
+{dataset}
+
+# Dataset Metadata
+{metadata}
+
+# Dataset Head
+{head}
+"""
+
+# Description section used when the dataset info is not a dict (it is inserted as-is).
+DATASET_DESCRIPTION_CUSTOM_PROMPT = """
+# Dataset Description
+{dataset_description}
+"""
+
+# Prompt asking the LLM for insights grouped by task type; {description} is one
+# of the dataset description sections. Doubled braces are str.format escapes that
+# render as literal braces in the JSON example.
+DATASET_INSIGHT_PROMPT = """
+{description}
+
+# Instruction
+Propose insights to help improve the performance of the model on this dataset.
+The insights should be proposed based on the dataset description with different task types.
+Each task type should have at least 5 insights.
+Make sure each method is diverse enough and can be implemented separately.
+Be specific about models' choices, ensemble and tuning techniques, and preprocessing & feature engineering techniques.
+Your model choices should be advanced enough to be helpful.
+
+# Format
+```json
+[
+    {{
+        "task_type": "EDA",
+        "insights": [
+            "insight1",
+            "insight2",
+            "insight3",
+            ...
+            "insightN"
+        ]   
+    }},
+    {{
+        "task_type": "Data Preprocessing",
+        "insights": [
+            "insight1",
+            "insight2",
+            "insight3",
+            ...
+            "insightN"
+        ]   
+    }},
+    {{
+        "task_type": "Feature Engineering",
+        "insights": [
+            "insight1",
+            "insight2",
+            "insight3",
+            ...
+            "insightN"
+        ]   
+    }},
+    {{
+        "task_type": "Model Training",
+        "insights": [
+            "insight1",
+            "insight2",
+            "insight3",
+            ...
+            "insightN"
+        ]   
+    }}
+]
+```
+"""
+
+
+# Prompt asking the LLM for new insights given an existing solution; filled with
+# str.format(solution_code=..., dev_score=...). Doubled braces are format escapes.
+INSIGHT_PROPOSAL_PROMPT = """
+You are an AI assistant tasked with analyzing a machine learning solution and proposing new insights to improve its performance. Given the current solution code and development score, suggest innovative approaches to enhance the model.
+
+Current Solution Code:
+{solution_code}
+
+Development Score: {dev_score}
+
+Based on this information, propose 3-5 new insights across different aspects of the machine learning pipeline (Data Preprocessing, Feature Engineering, and Model Training). Your insights should be specific, actionable, and have the potential to improve the model's performance.
+
+Please format your response as a JSON array with the following structure:
+[
+
+    {{
+        "task_type": "Data Preprocessing",
+        "insights": [
+            "insight1",
+            "insight2"
+        ]
+    }},
+    {{
+        "task_type": "Feature Engineering",
+        "insights": [
+            "insight1",
+            "insight2"
+        ]
+    }},
+    {{
+        "task_type": "Model Training",
+        "insights": [
+            "insight1",
+            "insight2"
+        ]
+    }}
+]
+"""
+
+
+# Metadata keys copied into the prompt by SolutionDesigner.metadata_builder;
+# keys missing from the dataset metadata are reported as "N/A".
+KEY_DATASET_FEATURES = [
+    "NumberOfClasses",
+    "NumberOfFeatures",
+    "NumberOfInstances",
+    "NumberOfInstancesWithMissingValues",
+    "NumberOfMissingValues",
+    "NumberOfNumericFeatures",
+    "NumberOfSymbolicFeatures",
+]
+
+# Maps the "task_type" returned by the LLM to the numeric "task_id" stored with each insight.
+TASK_TO_ID = {"EDA": 1, "Data Preprocessing": 2, "Feature Engineering": 3, "Model Training": 4, "Model Evaluation": 5}
+
+
+class SolutionDesigner:
+    data_dir: str = DATA_CONFIG["datasets_dir"]
+
+    async def generate_solutions(self, dataset_info, dataset_name, save_analysis_pool=True):
+        llm = LLM()
+        if type(dataset_info) == dict:
+            description_prompt = DATASET_DESCRIPTION_SELA_PROMPT.format(
+                dataset=dataset_info["description"],
+                metadata=self.metadata_builder(dataset_info["metadata"]),
+                head=dataset_info["df_head"],
+            )
+        else:
+            description_prompt = DATASET_DESCRIPTION_CUSTOM_PROMPT.format(dataset_description=dataset_info)
+        context = DATASET_INSIGHT_PROMPT.format(description=description_prompt)
+        rsp = await llm.aask(context)
+        rsp = clean_json_from_rsp(rsp)
+        analysis_pool = self.process_analysis_pool(json.loads(rsp))
+        if save_analysis_pool:
+            dataset_path = f"{self.data_dir}/{dataset_name}"
+            self.save_analysis_pool(dataset_path, analysis_pool)
+        return analysis_pool
+
+    async def propose_new_insights(self, solution, score):
+        llm = LLM()
+        context = INSIGHT_PROPOSAL_PROMPT.format(solution_code=solution, dev_score=score)
+        rsp = await llm.aask(context)
+        rsp = clean_json_from_rsp(rsp)
+        new_insights = self.process_analysis_pool(json.loads(rsp))
+        return new_insights
+
+    def process_analysis_pool(self, insights_rsp):
+        analysis_pool = []
+        for task_type_insights in insights_rsp:
+            task_type = task_type_insights["task_type"]
+            for insight in task_type_insights["insights"]:
+                analysis_pool.append({"Analysis": insight, "Category": task_type, "task_id": TASK_TO_ID[task_type]})
+        return analysis_pool
+
+    def metadata_builder(self, qualities):
+        metadata = {}
+        for key in KEY_DATASET_FEATURES:
+            metadata[key] = qualities.get(key, "N/A")
+        metadata_text = json.dumps(metadata, indent=4)
+        return metadata_text
+
+    def save_analysis_pool(self, dataset_path, analysis_pool):
+        fpath = f"{dataset_path}/ds_analysis_pool.json"
+        with open(fpath, "w") as file:
+            json.dump(analysis_pool, file, indent=4)