rename expo folder to sela

This commit is contained in:
Cyzus Chi 2024-10-22 21:33:31 +08:00
parent 4bed19b931
commit 7c5b29de63
33 changed files with 53 additions and 53 deletions

View file

@ -0,0 +1,22 @@
[
{
"Analysis": "Use early stopping, hyperparameter tuning, and cross-validation to avoid overfitting and improve robustness of the model.",
"Category": "Model Training",
"task_id": 4
},
{
"Analysis": "use k-fold bagging and early stopping",
"Category": "Model Training",
"task_id": 4
},
{
"Analysis": "To avoid overfitting, train a weighted ensemble model such as StackingClassifier or StackingRegressor; You could do some quick model prototyping to see which models work best and then use them in the ensemble.",
"Category": "Model Training",
"task_id": 4
},
{
"Analysis": "Please use autogluon for model training with presets='medium_quality', time_limit=None, give dev dataset to tuning_data, and use right eval_metric.",
"Category": "Model Training",
"task_id": 4
}
]

View file

@ -0,0 +1,169 @@
import json
import os
import random
from difflib import SequenceMatcher
from sela.insights.solution_designer import SolutionDesigner
from sela.utils import clean_json_from_rsp, load_data_config, mcts_logger
from metagpt.llm import LLM
from metagpt.schema import Message
# System message sent alongside the rewrite prompt when asking the LLM for a new instruction.
REFLECTION_SYSTEM_MSG = "As a Kaggle Grandmaster competing in a challenge, your task is to suggest potential evolutionary improvements that could enhance the performance of the baseline code."
# Prompt template for rewriting an instruction. It is filled via str.format with the
# `instruction` and `insights` fields; the doubled braces render as literal JSON braces.
# The LLM reply is expected to contain a JSON object with a "New Instruction" key.
CHANGE_INSTRUCTION = """
# Original instruction
{instruction}
# Insights
{insights}
Rewrite the original instruction according to the insights
(If the original instruction involves splitting the data, ensure that your insights are integrated with the data split instructions,
rather than replacing them.)
# Expected Output Hard Format
```json
{{
"Original Instruction": "original instruction",
"New Instruction": "new instruction"
}}
```
"""
# Data configuration, loaded once when this module is imported.
DATA_CONFIG = load_data_config()
class InstructionGenerator:
    """Rewrite task instructions with insights drawn from an insight pool.

    The pool is a list of insight dicts; the code reads their ``"Analysis"``
    and ``"task_id"`` keys. It is populated by :meth:`initialize`, either from
    the JSON file at ``state["exp_pool_path"]`` or by asking the
    ``SolutionDesigner`` to generate one from the dataset description.
    """

    data_config = DATA_CONFIG

    def __init__(self, state, use_fixed_insights, from_scratch):
        """Read the dataset description and remember how the pool must be built.

        Args:
            state: Mapping read for the keys ``"exp_pool_path"``,
                ``"custom_dataset_dir"``, ``"dataset_config"`` (which must hold
                ``"dataset"`` when no custom directory is given) and, later in
                :meth:`initialize`, ``"task"``.
            use_fixed_insights: Whether ``fixed_insights.json`` next to this
                module is appended to a pool loaded from file.
            from_scratch: Whether the pool is generated by the LLM instead of
                loaded. Forced to ``True`` when ``state["exp_pool_path"]`` is
                ``None``.
        """
        self.state = state
        self.file_path = state["exp_pool_path"]
        if state["custom_dataset_dir"]:
            # Custom datasets ship a free-text markdown description.
            with open(f"{state['custom_dataset_dir']}/description.md", "r", encoding="utf-8") as file:
                self.dataset_info = file.read()
        else:
            dataset_info_path = (
                f"{self.data_config['datasets_dir']}/{state['dataset_config']['dataset']}/dataset_info.json"
            )
            with open(dataset_info_path, "r", encoding="utf-8") as file:
                self.dataset_info = json.load(file)
        self.use_fixed_insights = use_fixed_insights
        self.proposer = SolutionDesigner()
        # Without a pool file there is nothing to load, so generation is the only option.
        self.from_scratch = True if self.file_path is None else from_scratch
        # Start with an empty pool so the pool-reading methods do not raise
        # AttributeError when called before initialize().
        self.insight_pool = []

    async def initialize(self):
        """Populate ``self.insight_pool`` by generation or by loading from file."""
        if self.from_scratch:
            self.insight_pool = await self.generate_solutions_from_scratch(self.dataset_info, self.state["task"])
        else:
            self.insight_pool = self.load_insight_pool(self.file_path, self.use_fixed_insights)

    @staticmethod
    def load_json_data(json_dir):
        """Return the parsed content of the JSON file at ``json_dir``."""
        with open(json_dir, "r", encoding="utf-8") as file:
            return json.load(file)

    @staticmethod
    def _random_sample(analysis, num_samples):
        """Return ``num_samples`` distinct random elements of ``analysis``.

        Raises:
            ValueError: If ``num_samples`` exceeds ``len(analysis)`` (raised by
                ``random.sample``).
        """
        return random.sample(analysis, num_samples)

    @staticmethod
    def sample_instruction_set(data):
        """Pick one random insight per ``task_id``, ordered by ascending task id.

        Args:
            data: Iterable of insight dicts, each with a ``"task_id"`` key.

        Returns:
            A list with one randomly chosen insight for every distinct task id.
        """
        by_task = {}
        for item in data:
            by_task.setdefault(item["task_id"], []).append(item)
        return [random.choice(by_task[task_id]) for task_id in sorted(by_task)]

    @staticmethod
    def format_output(rsp):
        """Parse the JSON string ``rsp`` and wrap it as ``[{"Insights": parsed}]``."""
        return [{"Insights": json.loads(rsp)}]

    @staticmethod
    def load_insight_pool(file_path, use_fixed_insights, task_id=None):
        """Load the insight pool from ``file_path``.

        Args:
            file_path: Path of a JSON file holding a list of insight dicts.
            use_fixed_insights: When true, the entries of ``fixed_insights.json``
                located next to this module are appended to the pool.
            task_id: When not ``None``, only insights whose ``"task_id"``
                equals it (compared as integers) are returned.

        Returns:
            The list of insight dicts.

        Raises:
            ValueError: If any insight lacks a ``"task_id"`` key.
        """
        data = InstructionGenerator.load_json_data(file_path)
        if use_fixed_insights:
            # os.path.join copes with an empty dirname, where the f-string form
            # would have produced the absolute path "/fixed_insights.json".
            fixed_path = os.path.join(os.path.dirname(__file__), "fixed_insights.json")
            data.extend(InstructionGenerator.load_json_data(fixed_path))
        if any("task_id" not in item for item in data):
            raise ValueError("task_id is not found in the insight_pool")
        # Compare against None so that a task id of 0 is still filtered on.
        if task_id is not None:
            data = [item for item in data if int(item["task_id"]) == int(task_id)]
        return data

    async def generate_new_instructions(self, task_id, original_instruction, max_num, ext_info=None):
        """Produce up to ``max_num`` rewritten instructions from the pool.

        The pool is consumed in order, one insight per new instruction. With an
        empty pool, ``max_num`` instructions are still generated using the
        placeholder insight ``"No insights available"``. With a non-empty pool
        smaller than ``max_num``, only ``len(pool)`` instructions are returned
        instead of raising ``IndexError``.

        Args:
            task_id: Task identifier; only used in log messages here.
            original_instruction: Instruction text to rewrite.
            max_num: Upper bound on the number of instructions to generate.
            ext_info: Forwarded to :meth:`generate_new_instruction`.

        Returns:
            A list of rewritten instruction strings.
        """
        pool = self.insight_pool
        wanted = max(max_num, 0)
        if not pool:
            mcts_logger.log("MCTS", f"No insights available for task {task_id}")
            insight_texts = ["No insights available"] * wanted
        else:
            if wanted > len(pool):
                mcts_logger.log(
                    "MCTS", f"Only {len(pool)} insights available for task {task_id}, requested {max_num}"
                )
            insight_texts = [item["Analysis"] for item in pool[:wanted]]

        new_instructions = []
        # The LLM calls are awaited one after another, as in the original loop.
        for insights in insight_texts:
            new_instruction = await InstructionGenerator.generate_new_instruction(
                original_instruction, insights, ext_info
            )
            new_instructions.append(new_instruction)
        return new_instructions

    async def propose_new_insights(self, solution, score):
        """Ask the proposer for insights on ``solution`` and add the novel ones.

        Args:
            solution: Solution code handed to the proposer.
            score: Development score handed to the proposer.

        Returns:
            The list of proposed insights that were actually added to the pool.
        """
        # SolutionDesigner exposes propose_new_insights; it has no propose_insights method.
        new_insights = await self.proposer.propose_new_insights(solution, score)
        return self.add_insight(new_insights)

    async def generate_solutions_from_scratch(self, dataset_info, dataset_name):
        """Have the proposer generate an insight pool without saving it to disk."""
        return await self.proposer.generate_solutions(dataset_info, dataset_name, save_analysis_pool=False)

    def add_insight(self, new_insights):
        """Append every insight that is not similar to an existing pool entry.

        Each accepted insight is appended immediately, so later candidates in
        ``new_insights`` are also compared against earlier accepted ones.

        Returns:
            The list of insights that were appended.
        """
        added_insights = []
        for candidate in new_insights:
            if self.is_similar_to_existing(candidate):
                continue
            added_insights.append(candidate)
            self.insight_pool.append(candidate)
        return added_insights

    def is_similar_to_existing(self, new_insight, similarity_threshold=0.8):
        """Return True if ``new_insight`` resembles any pooled insight.

        Two insights are similar when the similarity ratio of their
        ``"Analysis"`` texts is strictly greater than ``similarity_threshold``.
        """
        return any(
            self.calculate_similarity(new_insight["Analysis"], existing["Analysis"]) > similarity_threshold
            for existing in self.insight_pool
        )

    @staticmethod
    def calculate_similarity(text1, text2):
        """Return the ``difflib.SequenceMatcher`` ratio of the two texts."""
        return SequenceMatcher(None, text1, text2).ratio()

    @staticmethod
    async def generate_new_instruction(original_instruction, insights, ext_info):
        """Ask the LLM to rewrite ``original_instruction`` using ``insights``.

        Args:
            original_instruction: Instruction text inserted into the prompt.
            insights: Insight text inserted into the prompt.
            ext_info: Currently unused.

        Returns:
            The value of the ``"New Instruction"`` key in the LLM's JSON reply.
        """
        prompt = CHANGE_INSTRUCTION.format(instruction=original_instruction, insights=insights)
        llm = LLM()
        context = llm.format_msg([Message(content=prompt, role="user")])
        llm_response = await llm.aask(context, system_msgs=[REFLECTION_SYSTEM_MSG])
        # clean_json_from_rsp presumably strips the markdown fence around the JSON - confirm in sela.utils.
        rsp = clean_json_from_rsp(llm_response)
        return json.loads(rsp)["New Instruction"]

View file

@ -0,0 +1,183 @@
import json
from sela.utils import clean_json_from_rsp, load_data_config
from metagpt.llm import LLM
# Data configuration, loaded once when this module is imported.
DATA_CONFIG = load_data_config()
# Description template used when the dataset info is a dict; filled with the
# `dataset`, `metadata` and `head` fields.
DATASET_DESCRIPTION_SELA_PROMPT = """
# Dataset Description
{dataset}
# Dataset Metadata
{metadata}
# Dataset Head
{head}
"""
# Description template used when the dataset info is not a dict (e.g. free text);
# filled with the `dataset_description` field.
DATASET_DESCRIPTION_CUSTOM_PROMPT = """
# Dataset Description
{dataset_description}
"""
# Prompt asking the LLM for per-task-type insights on a dataset. Filled via
# str.format with the `description` field; doubled braces render as literal JSON
# braces. The requested reply is a JSON list of {"task_type", "insights"} objects.
DATASET_INSIGHT_PROMPT = """
{description}
# Instruction
Propose insights to help improve the performance of the model on this dataset.
The insights should be proposed based on the dataset description with different task types.
Each task type should have at least 5 insights.
Make sure each method is diverse enough and can be implemented separately.
Be specific about models' choices, ensemble and tuning techniques, and preprocessing & feature engineering techniques.
Your model choices should be advanced enough to be helpful.
# Format
```json
[
{{
"task_type": "EDA",
"insights": [
"insight1",
"insight2",
"insight3",
...
"insightN"
]
}},
{{
"task_type": "Data Preprocessing",
"insights": [
"insight1",
"insight2",
"insight3",
...
"insightN"
]
}},
{{
"task_type": "Feature Engineering",
"insights": [
"insight1",
"insight2",
"insight3",
...
"insightN"
]
}},
{{
"task_type": "Model Training",
"insights": [
"insight1",
"insight2",
"insight3",
...
"insightN"
]
}}
]
```
"""
# Prompt asking the LLM for new insights given an existing solution. Filled via
# str.format with the `solution_code` and `dev_score` fields; doubled braces render
# as literal JSON braces. The requested reply uses the same
# {"task_type", "insights"} list shape as DATASET_INSIGHT_PROMPT.
INSIGHT_PROPOSAL_PROMPT = """
You are an AI assistant tasked with analyzing a machine learning solution and proposing new insights to improve its performance. Given the current solution code and development score, suggest innovative approaches to enhance the model.
Current Solution Code:
{solution_code}
Development Score: {dev_score}
Based on this information, propose 3-5 new insights across different aspects of the machine learning pipeline (Data Preprocessing, Feature Engineering, and Model Training). Your insights should be specific, actionable, and have the potential to improve the model's performance.
Please format your response as a JSON array with the following structure:
[
{{
"task_type": "Data Preprocessing",
"insights": [
"insight1",
"insight2"
]
}},
{{
"task_type": "Feature Engineering",
"insights": [
"insight1",
"insight2"
]
}},
{{
"task_type": "Model Training",
"insights": [
"insight1",
"insight2"
]
}}
]
"""
# Metadata keys copied into the dataset description; keys absent from the
# dataset's metadata are reported as "N/A" by SolutionDesigner.metadata_builder.
KEY_DATASET_FEATURES = [
"NumberOfClasses",
"NumberOfFeatures",
"NumberOfInstances",
"NumberOfInstancesWithMissingValues",
"NumberOfMissingValues",
"NumberOfNumericFeatures",
"NumberOfSymbolicFeatures",
]
# Maps a task type name returned by the LLM to the numeric task_id stored with each insight.
TASK_TO_ID = {"EDA": 1, "Data Preprocessing": 2, "Feature Engineering": 3, "Model Training": 4, "Model Evaluation": 5}
class SolutionDesigner:
    """Ask an LLM for modelling insights and flatten them into an analysis pool.

    An analysis pool is a list of dicts with the keys ``"Analysis"`` (insight
    text), ``"Category"`` (task type name) and ``"task_id"`` (numeric id from
    ``TASK_TO_ID``).
    """

    # Root directory under which per-dataset analysis pools are saved.
    data_dir: str = DATA_CONFIG["datasets_dir"]

    async def generate_solutions(self, dataset_info, dataset_name, save_analysis_pool=True):
        """Generate an analysis pool for a dataset from its description.

        Args:
            dataset_info: Either a dict with the keys ``"description"``,
                ``"metadata"`` and ``"df_head"``, or any other object (typically
                free text) that is inserted into the prompt as the description.
            dataset_name: Dataset directory name under ``data_dir``; only used
                when the pool is saved.
            save_analysis_pool: When true, the pool is written to
                ``<data_dir>/<dataset_name>/ds_analysis_pool.json``.

        Returns:
            The generated analysis pool.
        """
        llm = LLM()
        # isinstance also accepts dict subclasses, which an exact type comparison rejected.
        if isinstance(dataset_info, dict):
            description_prompt = DATASET_DESCRIPTION_SELA_PROMPT.format(
                dataset=dataset_info["description"],
                metadata=self.metadata_builder(dataset_info["metadata"]),
                head=dataset_info["df_head"],
            )
        else:
            description_prompt = DATASET_DESCRIPTION_CUSTOM_PROMPT.format(dataset_description=dataset_info)
        context = DATASET_INSIGHT_PROMPT.format(description=description_prompt)
        rsp = await llm.aask(context)
        # clean_json_from_rsp presumably strips the markdown fence around the JSON - confirm in sela.utils.
        rsp = clean_json_from_rsp(rsp)
        analysis_pool = self.process_analysis_pool(json.loads(rsp))
        if save_analysis_pool:
            dataset_path = f"{self.data_dir}/{dataset_name}"
            self.save_analysis_pool(dataset_path, analysis_pool)
        return analysis_pool

    async def propose_new_insights(self, solution, score):
        """Ask the LLM for new insights given a solution and its dev score.

        Args:
            solution: Solution code inserted into the prompt.
            score: Development score inserted into the prompt.

        Returns:
            The proposed insights as an analysis pool.
        """
        llm = LLM()
        context = INSIGHT_PROPOSAL_PROMPT.format(solution_code=solution, dev_score=score)
        rsp = await llm.aask(context)
        rsp = clean_json_from_rsp(rsp)
        return self.process_analysis_pool(json.loads(rsp))

    def process_analysis_pool(self, insights_rsp):
        """Flatten the LLM's grouped insights into a list of pool entries.

        Args:
            insights_rsp: Iterable of dicts with a ``"task_type"`` name and an
                ``"insights"`` list.

        Returns:
            One ``{"Analysis", "Category", "task_id"}`` dict per insight, in
            input order.

        Raises:
            KeyError: If a task type is not present in ``TASK_TO_ID`` or a
                required key is missing.
        """
        return [
            {
                "Analysis": insight,
                "Category": group["task_type"],
                "task_id": TASK_TO_ID[group["task_type"]],
            }
            for group in insights_rsp
            for insight in group["insights"]
        ]

    def metadata_builder(self, qualities):
        """Render the key dataset features of ``qualities`` as indented JSON text.

        Features missing from ``qualities`` are reported as ``"N/A"``.
        """
        metadata = {key: qualities.get(key, "N/A") for key in KEY_DATASET_FEATURES}
        return json.dumps(metadata, indent=4)

    def save_analysis_pool(self, dataset_path, analysis_pool):
        """Write ``analysis_pool`` to ``<dataset_path>/ds_analysis_pool.json``."""
        fpath = f"{dataset_path}/ds_analysis_pool.json"
        with open(fpath, "w", encoding="utf-8") as file:
            json.dump(analysis_pool, file, indent=4)