From 89081d942c077190d9aa89b0b88cbcc03162da2c Mon Sep 17 00:00:00 2001
From: douglas
Date: Mon, 17 Apr 2023 17:22:31 -0400
Subject: [PATCH] First commit for AutoGPT Benchmarks

---
 .gitmodules                                   |  3 +
 README.md                                     | 38 ++++++++
 auto_gpt_benchmarking/Auto-GPT                |  1 +
 auto_gpt_benchmarking/AutoGPTAgent.py         | 88 +++++++++++++++++++
 .../AutoGPTData/ai_settings.yaml              |  8 ++
 auto_gpt_benchmarking/CompletionFn.py         | 27 ++++++
 auto_gpt_benchmarking/LangChainCompletions.py | 34 +++++++
 auto_gpt_benchmarking/__init__.py             |  0
 .../auto_gpt_completion_fn.yaml               |  2 +
 auto_gpt_benchmarking/main.py                 |  4 +
 requirements.txt                              |  1 +
 11 files changed, 206 insertions(+)
 create mode 100644 .gitmodules
 create mode 160000 auto_gpt_benchmarking/Auto-GPT
 create mode 100644 auto_gpt_benchmarking/AutoGPTAgent.py
 create mode 100644 auto_gpt_benchmarking/AutoGPTData/ai_settings.yaml
 create mode 100644 auto_gpt_benchmarking/CompletionFn.py
 create mode 100644 auto_gpt_benchmarking/LangChainCompletions.py
 create mode 100644 auto_gpt_benchmarking/__init__.py
 create mode 100644 auto_gpt_benchmarking/completion_fns/auto_gpt_completion_fn.yaml
 create mode 100644 auto_gpt_benchmarking/main.py
 create mode 100644 requirements.txt

diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 000000000000..d293ba9c4bd8
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "Auto-GPT"]
+	path = auto_gpt_benchmarking/Auto-GPT
+	url = https://github.com/Significant-Gravitas/Auto-GPT.git
diff --git a/README.md b/README.md
index 0120d4fca6ec..75db145a2478 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,40 @@
 # Auto-GPT-Benchmarks
 A set of standardised benchmarks to assess the performance of Auto-GPTs.
+
+# What is next?
+
+- [ ] Build longer-form tasks (e.g. a code fix backed by tests)
+- [ ] Explicitly note the common failure modes in the test harness and fix them. Most of these appear to be failure modes in the core AutoGPT project
+- [ ] Switch to an Ubuntu container so it can do more things (git, bash, etc.)
+- [ ] Lower priority, but put this behind a webserver backend so we have a good API
+- [ ] Get token-counting data from the model. Add scores to result files based on the pricing of the tokens and models used
+- [ ] Think about how this can be applied to other projects besides AutoGPT so we can be THE agent evaluation framework.
+
+
+## Understanding OpenAI Evals
+
+The Evals docs are here and very good: https://github.com/openai/evals/tree/main/docs
+
+The basic idea is this:
+1. Use a completion function to point to the model you want to test: a language model, or in our case AutoGPT.
+2. Register that completion function with the evals framework via a yaml file in a `completion_fns` dir.
+3. Run the evals against the completion function.
+
+You can then define additional yaml-based evals and run them against the completion function as needed.
+
+### Completion Functions
+
+See our yaml file in the `completion_fns` dir for the registration of the completion function.
+See the completion function itself in `CompletionFn.py`.
+It points to the AutoGPT model we want to test, which is spun up dynamically in a docker container by `AutoGPTAgent.py`.
+
+
+# Setup notes
+
+You must add the `auto_gpt_benchmarking` dir to the python path so that OpenAI evals can import it.
+Do this with a path file in your venv.
+
+I added a file at `venv/lib/python3.9/site-packages/benchmarking.pth` with the contents:
+`/home/douglas/AGI/Auto-GPT-Benchmarks-fork`
+
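To make the three README steps above concrete, here is a minimal sketch (not part of this commit) of exercising the registered completion function directly from Python. The prompt text is invented, and actually running it assumes the Auto-GPT submodule, docker image, and `.env` file described in `AutoGPTAgent.py` below are already set up:

```python
# Minimal sketch: call the registered completion function on one prompt.
# This will spin up the Auto-GPT docker container, so the setup steps
# described in the README and AutoGPTAgent.py must already be done;
# the example question is made up.
from auto_gpt_benchmarking.CompletionFn import AutoGPTCompletionFn

completion_fn = AutoGPTCompletionFn()
result = completion_fn("What is the capital of France? Answer with only the city name.")
print(result.get_completions())  # e.g. ["Paris"]
```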
diff --git a/auto_gpt_benchmarking/Auto-GPT b/auto_gpt_benchmarking/Auto-GPT
new file mode 160000
index 000000000000..97d62cc16bf4
--- /dev/null
+++ b/auto_gpt_benchmarking/Auto-GPT
@@ -0,0 +1 @@
+Subproject commit 97d62cc16bf45fcd406efeb33d042ebd58c24670
diff --git a/auto_gpt_benchmarking/AutoGPTAgent.py b/auto_gpt_benchmarking/AutoGPTAgent.py
new file mode 100644
index 000000000000..f24b150b44b1
--- /dev/null
+++ b/auto_gpt_benchmarking/AutoGPTAgent.py
@@ -0,0 +1,88 @@
+"""
+This instantiates an AutoGPT agent that is capable of handling any task.
+It is designed to pass benchmarks as effectively as possible.
+
+Loads the ai_settings.yaml file to get the AI's name, role, and goals.
+Sets the AI to continuous mode, but kills it if it uses more than 50,000 tokens on any particular evaluation.
+
+The model is instantiated with a prompt from the AutoGPT completion function.
+
+Eventually we will also save and log all of the associated output and thinking from the model as well.
+"""
+from pathlib import Path
+import os
+
+
+class AutoGPTAgent:
+    """
+    A class object that contains the configuration information for the AI.
+    The init function takes an evaluation prompt.
+    It copies the ai_settings.yaml file in AutoGPTData to the Auto-GPT repo.
+    It then copies the given prompt into a text file called prompt.txt in Auto-GPT/auto_gpt_workspace.
+    It then polls the token usage of the model and watches for a file called output.txt in the Auto-GPT/auto_gpt_workspace folder.
+    If the model has used more than 50,000 tokens, it kills the model.
+    If the model has used fewer than 50,000 tokens, it returns the contents of output.txt.
+    """
+    def _clean_up_workspace(self):
+        """
+        Cleans up the workspace by deleting the prompt.txt and output.txt files.
+        :return:
+        """
+        # check if the files are there and delete them if they are
+        if self.prompt_file.exists():
+            self.prompt_file.unlink()
+        if self.output_file.exists():
+            self.output_file.unlink()
+
+    def _copy_ai_settings(self):
+        self.ai_settings_dest.write_text(self.ai_settings_file.read_text())
+
+    def _copy_prompt(self):
+        self.prompt_file.write_text(self.prompt)
+
+    def _start_agent(self):
+        """
+        This starts the agent in the docker container.
+        It assumes you have built the docker image with `docker build -t autogpt .`
+        using the Dockerfile in the Auto-GPT repo.
+        You must also set up the .env file in the Auto-GPT repo.
+        :return:
+        """
+        env_file = self.auto_gpt_path / ".env"
+        # run it in continuous mode and skip re-prompts
+        os.system(f"docker run -it --env-file={env_file} -v {self.auto_workspace}:/home/appuser/auto_gpt_workspace -v {self.auto_gpt_path}/autogpt:/home/appuser/autogpt autogpt --continuous -C '/home/appuser/auto_gpt_workspace/ai_settings.yaml'")
+
+    def _poll_for_output(self):
+        """
+        This polls the output file to see if the model has finished.
+        :return:
+        """
+        while True:
+            if self.output_file.exists():
+                return self.output_file.read_text()
+
+    def __init__(self, prompt):
+        self.auto_gpt_path = Path(__file__).parent / "Auto-GPT"
+        self.auto_workspace = self.auto_gpt_path / "auto_gpt_workspace"
+        self.prompt_file = self.auto_workspace / "prompt.txt"
+        self.output_file = self.auto_workspace / "output.txt"
+        self.ai_settings_file = Path(__file__).parent / "AutoGPTData" / "ai_settings.yaml"
+        self.ai_settings_dest = self.auto_workspace / "ai_settings.yaml"
+        self.prompt = prompt
+        self._clean_up_workspace()
+        self._copy_ai_settings()
+        self._copy_prompt()
+
+    def start(self):
+        self._start_agent()
+        answer = self._poll_for_output()
+        print('about to do clean up')
+        print(answer)
+        self._clean_up_workspace()
+        print('did clean up')
+        return answer
+
+
+
diff --git a/auto_gpt_benchmarking/AutoGPTData/ai_settings.yaml b/auto_gpt_benchmarking/AutoGPTData/ai_settings.yaml
new file mode 100644
index 000000000000..b7cc573d5cf7
--- /dev/null
+++ b/auto_gpt_benchmarking/AutoGPTData/ai_settings.yaml
@@ -0,0 +1,8 @@
+ai_goals:
+- Evaluate the prompt in `prompt.txt`
+- Use all of the tools at your disposal to evaluate the question and find the best answer in the format provided.
+- Get the correct answer to the question in the fewest number of steps possible. You are scored first on whether you get the correct answer, and second on how many tokens you take to get it, so keep your thinking and tool usage as minimal as possible while still ensuring the answer is correct.
+- Save your work in the `output.txt` file; the second you do this, exit the program.
+- Exit the program when you are done.
+ai_name: EvaluationAgent
+ai_role: an AI that is tested on how efficiently it can evaluate questions and answer them correctly while using as few resources as possible
diff --git a/auto_gpt_benchmarking/CompletionFn.py b/auto_gpt_benchmarking/CompletionFn.py
new file mode 100644
index 000000000000..9bb4bb32b052
--- /dev/null
+++ b/auto_gpt_benchmarking/CompletionFn.py
@@ -0,0 +1,27 @@
+import importlib
+from typing import Optional
+from evals.api import CompletionFn, CompletionResult
+
+from evals.prompt.base import CompletionPrompt
+from evals.record import record_sampling
+from auto_gpt_benchmarking.AutoGPTAgent import AutoGPTAgent
+
+
+class AutoGPTCompletionResult(CompletionResult):
+    def __init__(self, response) -> None:
+        self.response = response
+
+    def get_completions(self) -> list[str]:
+        return [self.response.strip()]
+
+
+class AutoGPTCompletionFn(CompletionFn):
+    def __init__(self, **kwargs) -> None:
+        pass
+
+    def __call__(self, prompt, **kwargs) -> AutoGPTCompletionResult:
+        prompt = CompletionPrompt(prompt).to_formatted_prompt()
+        agent = AutoGPTAgent(prompt)
+        response = agent.start()
+        record_sampling(prompt=prompt, sampled=response)
+        return AutoGPTCompletionResult(response)
\ No newline at end of file
diff --git a/auto_gpt_benchmarking/LangChainCompletions.py b/auto_gpt_benchmarking/LangChainCompletions.py
new file mode 100644
index 000000000000..17f52bfa124b
--- /dev/null
+++ b/auto_gpt_benchmarking/LangChainCompletions.py
@@ -0,0 +1,34 @@
+import importlib
+from typing import Optional
+from evals.api import CompletionFn, CompletionResult
+
+from langchain.llms import BaseLLM
+
+from evals.prompt.base import CompletionPrompt
+from evals.record import record_sampling
+
+
+class LangChainLLMCompletionResult(CompletionResult):
+    def __init__(self, response) -> None:
+        self.response = response
+
+    def get_completions(self) -> list[str]:
+        return [self.response.strip()]
+
+
+class LangChainLLMCompletionFn(CompletionFn):
+    def __init__(self, llm: str, llm_kwargs: Optional[dict] = {}, **kwargs) -> None:
+        # Resolve the llm argument (a class name from langchain.llms) to an instance, assuming it is always a subclass of BaseLLM
+        module = importlib.import_module("langchain.llms")
+        LLMClass = getattr(module, llm)
+
+        if issubclass(LLMClass, BaseLLM):
+            self.llm = LLMClass(**llm_kwargs)
+        else:
+            raise ValueError(f"{llm} is not a subclass of BaseLLM")
+
+    def __call__(self, prompt, **kwargs) -> LangChainLLMCompletionResult:
+        prompt = CompletionPrompt(prompt).to_formatted_prompt()
+        response = self.llm(prompt)
+        record_sampling(prompt=prompt, sampled=response)
+        return LangChainLLMCompletionResult(response)
diff --git a/auto_gpt_benchmarking/__init__.py b/auto_gpt_benchmarking/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/auto_gpt_benchmarking/completion_fns/auto_gpt_completion_fn.yaml b/auto_gpt_benchmarking/completion_fns/auto_gpt_completion_fn.yaml
new file mode 100644
index 000000000000..d6a55a29bf1e
--- /dev/null
+++ b/auto_gpt_benchmarking/completion_fns/auto_gpt_completion_fn.yaml
@@ -0,0 +1,2 @@
+auto_gpt_completion_fn:
+  class: auto_gpt_benchmarking.CompletionFn:AutoGPTCompletionFn
\ No newline at end of file
diff --git a/auto_gpt_benchmarking/main.py b/auto_gpt_benchmarking/main.py
new file mode 100644
index 000000000000..f0303f1e73d7
--- /dev/null
+++ b/auto_gpt_benchmarking/main.py
@@ -0,0 +1,4 @@
+"""
+To run auto-gpt we need to run the following command:
+
+"""
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 000000000000..a59bcbdd382d
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1 @@
+evals
\ No newline at end of file
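`main.py` above is still an empty placeholder. As a rough, hypothetical sketch of where it could go, the snippet below wraps the `oaieval` CLI that the `evals` dependency installs; the eval name `test-match`, the `--registry_path` flag, and the idea of using the package directory as the registry path are assumptions, not something this commit defines:

```python
# Hypothetical main.py sketch: run an eval against the AutoGPT completion
# function registered in completion_fns/auto_gpt_completion_fn.yaml.
import subprocess
from pathlib import Path


def run_benchmark(eval_name: str = "test-match") -> int:
    # completion_fns/ sits next to this file, so the package directory
    # itself is assumed to serve as the evals registry path.
    registry = Path(__file__).parent
    cmd = [
        "oaieval",
        "auto_gpt_completion_fn",  # name registered in the yaml above
        eval_name,
        "--registry_path", str(registry),
    ]
    return subprocess.run(cmd).returncode


if __name__ == "__main__":
    raise SystemExit(run_benchmark())
```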