diff --git a/expo/README.md b/expo/README.md index eab3298dc..0d47ab3e4 100644 --- a/expo/README.md +++ b/expo/README.md @@ -57,7 +57,151 @@ ### DS Agent ### AIDE -提供github链接,并说明使用的命令以及参数设置 + +#### Setup + +``` +git clone https://github.com/WecoAI/aideml.git +``` + +修改 `aideml/aide/utils/config.yaml` 内容如下 + +```yaml +# path to the task data directory +data_dir: null + +# either provide a path to a plaintext file describing the task +desc_file: null +# or provide the task goal (and optionally evaluation information) as arguments +goal: null +eval: null + +log_dir: logs +workspace_dir: workspaces + +# whether to unzip any archives in the data directory +preprocess_data: True +# whether to copy the data to the workspace directory (otherwise it will be symlinked) +# copying is recommended to prevent the agent from accidentally modifying the original data +copy_data: True + +exp_name: null # a random experiment name will be generated if not provided + +# settings for code execution +exec: + timeout: 3600 + agent_file_name: runfile.py + format_tb_ipython: False + +# agent hyperparams +agent: + # how many improvement iterations to run + steps: 10 + # whether to instruct the agent to use CV (set to 1 to disable) + k_fold_validation: 1 + # whether to instruct the agent to generate a prediction function + expose_prediction: False + # whether to provide the agent with a preview of the data + data_preview: True + + # LLM settings for coding + code: + model: deepseek-coder + temp: 0.5 + + # LLM settings for evaluating program output / tracebacks + feedback: + model: deepseek-coder + temp: 0.5 + + # hyperparameters for the tree search + search: + max_debug_depth: 3 + debug_prob: 0.5 + num_drafts: 5 +``` + +由于 deepseek 完全兼容 OpenAI 的 API,修改`base_url`为`自己的url`,`api_key`为`自己的key`即可 + +``` +export OPENAI_API_KEY="自己的key" +export OPENAI_BASE_URL="自己的url" +``` + +修改`aideml/aide/backend/__init__.py` 30 行内容如下: + +```python +model_kwargs = model_kwargs | { + "model": model, + 
"temperature": temperature, + "max_tokens": max_tokens, + } + if "claude-" in model: + query_func = backend_anthropic.query + else: + query_func = backend_openai.query +``` + +由于 deepseekV2.5 不再支持 system message 使用 function call,修改 `aideml/aide/agent.py` 312 行内容如下: + +```python +response = cast( + dict, + query( + system_message=None, + user_message=prompt, + func_spec=review_func_spec, + model=self.acfg.feedback.model, + temperature=self.acfg.feedback.temp, + ), + ) +``` + +修改完后 + +``` +cd aideml +pip install -e . +``` + +#### Run + +运行下面脚本获取运行结果,在当前目录下将生成一个 `logs` 文件夹以及 `workspaces` 文件夹 +`logs` 文件夹中将包含实验使用配置以及生成方案记录,`workspaces` 文件夹下将保存 aide 最后生成的结果文件 + +```python +import aide +import os +import time + +os.environ["OPENAI_API_KEY"] = "sk-xxx" +os.environ["OPENAI_BASE_URL"] = "your url" +start_time = time.time() +data_dir = "xxx/data/titanic" +goal = f""" +# User requirement +({data_dir}, 'This is a 04_titanic dataset. Your goal is to predict the target column `Survived`.\nPerform data analysis, data preprocessing, feature engineering, and modeling to predict the target. \nReport f1 on the eval data. Do not plot or make any visualizations.\n') + +# Data dir +training (with labels): train.csv +testing (without labels): test.csv +dataset description: dataset_info.json (You can use this file to get additional information about the dataset)""" + +exp = aide.Experiment( + data_dir=data_dir, # replace this with your own directory + goal=goal, + eval="f1", # replace with your own evaluation metric +) + +best_solution = exp.run(steps=10) + +print(f"Best solution has validation metric: {best_solution.valid_metric}") +print(f"Best solution code: {best_solution.code}") +end_time = time.time() +execution_time = end_time - start_time + +print(f"run time : {execution_time} seconds") +``` ### Autogluon #### Setup