First commit for AutoGPT Benchmarks
dschonholtz committed Apr 17, 2023
1 parent 0b899eb commit 89081d9
Showing 11 changed files with 206 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .gitmodules
@@ -0,0 +1,3 @@
[submodule "Auto-GPT"]
path = auto_gpt_benchmarking/Auto-GPT
url = https://github.com/Significant-Gravitas/Auto-GPT.git
38 changes: 38 additions & 0 deletions README.md
@@ -1,2 +1,40 @@
# Auto-GPT-Benchmarks
A set of standardised benchmarks to assess the performance of Auto-GPTs.

# What is next?

- [ ] Build longer-form tasks (e.g. a code fix backed by tests)
- [ ] Explicitly note the common failure modes in the test harness and fix them. Most of these appear to be failure modes in the core AutoGPT project.
- [ ] Switch to an Ubuntu container so the agent can do more (git, bash, etc.)
- [ ] Lower priority: put this behind a webserver backend so we have a good API
- [ ] Get token-count data from the model and add scores to the result files based on the pricing of the tokens and models used
- [ ] Think about how this can be applied to other projects besides AutoGPT so we can be THE agent evaluation framework.


## Understanding OpenAI Evals

The Evals docs are here and very good: https://github.com/openai/evals/tree/main/docs

The basic idea, though, is this:
1. Use a completion function to point to the model you want to test: a language model or, in our case, AutoGPT.
2. Register that completion function with the evals framework via a YAML file in a `completion_fns` dir.
3. Run the evals against the completion function.

You can then define additional YAML-based evals and run them against the completion function as needed.
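Once the completion function is registered, an eval is typically launched with the `oaieval` CLI, e.g. something like `oaieval auto_gpt_completion_fn test-match` (pointing `--registry_path` at this repo's registry if needed); the exact eval names and flags depend on the version of evals you have installed.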

### Completion Functions

See the YAML file in the `completion_fns` dir for the registration of the completion function.
The completion function itself lives in `CompletionFn.py`.
It points to the AutoGPT agent we want to test, which is spun up dynamically in a Docker container by `AutoGPTAgent.py`.
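
For a quick smoke test you can also drive the agent directly, bypassing evals entirely. A minimal sketch, assuming the `autogpt` Docker image is built and the Auto-GPT `.env` file is configured; the prompt is only an illustration:

```python
from auto_gpt_benchmarking.AutoGPTAgent import AutoGPTAgent

# The agent copies the prompt into the Auto-GPT workspace, starts the Docker
# container, and blocks until the agent writes output.txt.
agent = AutoGPTAgent("What is the capital of France? Write only the answer to output.txt.")
answer = agent.start()
print(answer)
```

This is the same path the completion function takes internally, minus the evals bookkeeping.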


# RANDOM SHIT

You must add the `auto_gpt_benchmarking` dir to the Python path.
Do this with a path file in your venv; OpenAI Evals needs to be able to import it.

I added a file at `venv/lib/python3.9/site-packages/benchmarking.pth` with the contents:
`/home/douglas/AGI/Auto-GPT-Benchmarks-fork`
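
The same step can be scripted; a minimal sketch, assuming it is run with the venv's own Python interpreter and that the repo path is replaced with your own checkout:

```python
import sysconfig
from pathlib import Path

# Write a .pth file into the active environment's site-packages so that
# `auto_gpt_benchmarking` is importable by OpenAI Evals.
repo_path = "/home/douglas/AGI/Auto-GPT-Benchmarks-fork"  # replace with your checkout
site_packages = Path(sysconfig.get_path("purelib"))
(site_packages / "benchmarking.pth").write_text(repo_path + "\n")
```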


1 change: 1 addition & 0 deletions auto_gpt_benchmarking/Auto-GPT
Submodule Auto-GPT added at 97d62c
88 changes: 88 additions & 0 deletions auto_gpt_benchmarking/AutoGPTAgent.py
@@ -0,0 +1,88 @@
"""
This instantiates an AutoGPT agent that is capable of handling any task.
It is designed to pass benchmarks as effectively as possible.
Loads the ai_settings.yaml file to get the AI's name, role, and goals.
Sets the AI to continuous mode, but kills it if it takes more than 50,000 tokens on any particular evaluation.
The model is instantiated with a prompt from the AutoGPT completion function.
Eventually we will also save and log all of the associated output and thinking for the model as well.
"""
from pathlib import Path
import os
import time


class AutoGPTAgent:
    """
    A class that holds the configuration information for the AI.
    The init function takes an evaluation prompt.
    It copies the ai_settings.yaml file in AutoGPTData to the Auto-GPT repo.
    It then writes the given prompt to a text file called prompt.txt in Auto-GPT/auto_gpt_workspace.
    It then polls the model's token usage and watches for a file called output.txt in the Auto-GPT/auto_gpt_workspace folder.
    If the model has used more than 50,000 tokens, it kills the model.
    If the model has used fewer than 50,000 tokens, it returns the output.txt file.
    """
    def _clean_up_workspace(self):
        """
        Cleans up the workspace by deleting the prompt.txt and output.txt files.
        :return:
        """
        # Check if the files are there and delete them if they are.
        if self.prompt_file.exists():
            self.prompt_file.unlink()
        if self.output_file.exists():
            self.output_file.unlink()

    def _copy_ai_settings(self):
        self.ai_settings_dest.write_text(self.ai_settings_file.read_text())

    def _copy_prompt(self):
        self.prompt_file.write_text(self.prompt)

    def _start_agent(self):
        """
        This starts the agent in the docker container.
        It assumes you have built the docker image with:
        docker build -t autogpt .
        using the Dockerfile in the Auto-GPT repo.
        You also must set up the .env file in the Auto-GPT repo.
        :return:
        """
        env_file = self.auto_gpt_path / ".env"
        # Run it in continuous mode and skip re-prompts.
        os.system(
            f"docker run -it --env-file={env_file} "
            f"-v {self.auto_workspace}:/home/appuser/auto_gpt_workspace "
            f"-v {self.auto_gpt_path}/autogpt:/home/appuser/autogpt "
            f"autogpt --continuous -C '/home/appuser/auto_gpt_workspace/ai_settings.yaml'"
        )

    def _poll_for_output(self):
        """
        This polls the output file to see if the model has finished.
        :return:
        """
        while True:
            if self.output_file.exists():
                return self.output_file.read_text()
            # Avoid a busy-wait while the agent is still working.
            time.sleep(1)

    def __init__(self, prompt):
        self.auto_gpt_path = Path(__file__).parent / "Auto-GPT"
        self.auto_workspace = self.auto_gpt_path / "auto_gpt_workspace"
        self.prompt_file = self.auto_workspace / "prompt.txt"
        self.output_file = self.auto_workspace / "output.txt"
        self.ai_settings_file = Path(__file__).parent / "AutoGPTData" / "ai_settings.yaml"
        self.ai_settings_dest = self.auto_workspace / "ai_settings.yaml"
        self.prompt = prompt
        self._clean_up_workspace()
        self._copy_ai_settings()
        self._copy_prompt()

    def start(self):
        self._start_agent()
        answer = self._poll_for_output()
        print('about to do clean up')
        print(answer)
        self._clean_up_workspace()
        print('did clean up')
        return answer




8 changes: 8 additions & 0 deletions auto_gpt_benchmarking/AutoGPTData/ai_settings.yaml
@@ -0,0 +1,8 @@
ai_goals:
- Evaluate the prompt in `prompt.txt`
- Use all of the tools at your disposal to evaluate the question and find the best answer in the format provided.
- Get the correct answer to the question in the fewest number of steps possible. You are scored first on whether you get the correct answer, and second on how many tokens you take to get the right answer, so keep your thinking and tool usage as minimal as possible while still ensuring you get the correct answer.
- Save your work in the `output.txt` file; the second you do this, exit the program.
- Exit the program when you are done.
ai_name: EvaluationAgent
ai_role: an ai that is tested on how effectively it can efficiently evaluate questions and answer them correctly while using as few resources as possible
27 changes: 27 additions & 0 deletions auto_gpt_benchmarking/CompletionFn.py
@@ -0,0 +1,27 @@
from evals.api import CompletionFn, CompletionResult
from evals.prompt.base import CompletionPrompt
from evals.record import record_sampling

from auto_gpt_benchmarking.AutoGPTAgent import AutoGPTAgent


class AutoGPTCompletionResult(CompletionResult):
    def __init__(self, response) -> None:
        self.response = response

    def get_completions(self) -> list[str]:
        return [self.response.strip()]


class AutoGPTCompletionFn(CompletionFn):
    def __init__(self, **kwargs) -> None:
        pass

    def __call__(self, prompt, **kwargs) -> AutoGPTCompletionResult:
        prompt = CompletionPrompt(prompt).to_formatted_prompt()
        agent = AutoGPTAgent(prompt)
        response = agent.start()
        record_sampling(prompt=prompt, sampled=response)
        return AutoGPTCompletionResult(response)
34 changes: 34 additions & 0 deletions auto_gpt_benchmarking/LangChainCompletions.py
@@ -0,0 +1,34 @@
import importlib
from typing import Optional

from evals.api import CompletionFn, CompletionResult
from evals.prompt.base import CompletionPrompt
from evals.record import record_sampling

from langchain.llms import BaseLLM


class LangChainLLMCompletionResult(CompletionResult):
    def __init__(self, response) -> None:
        self.response = response

    def get_completions(self) -> list[str]:
        return [self.response.strip()]


class LangChainLLMCompletionFn(CompletionFn):
    def __init__(self, llm: str, llm_kwargs: Optional[dict] = None, **kwargs) -> None:
        # Resolve the `llm` argument (a class name) to an instance, assuming it is always a subclass of BaseLLM.
        module = importlib.import_module("langchain.llms")
        LLMClass = getattr(module, llm)

        if issubclass(LLMClass, BaseLLM):
            self.llm = LLMClass(**(llm_kwargs or {}))
        else:
            raise ValueError(f"{llm} is not a subclass of BaseLLM")

    def __call__(self, prompt, **kwargs) -> LangChainLLMCompletionResult:
        prompt = CompletionPrompt(prompt).to_formatted_prompt()
        response = self.llm(prompt)
        record_sampling(prompt=prompt, sampled=response)
        return LangChainLLMCompletionResult(response)
Empty file.
@@ -0,0 +1,2 @@
auto_gpt_completion_fn:
  class: auto_gpt_benchmarking.CompletionFn:AutoGPTCompletionFn
4 changes: 4 additions & 0 deletions auto_gpt_benchmarking/main.py
@@ -0,0 +1,4 @@
"""
To run auto-gpt we need to run the following command:
"""
1 change: 1 addition & 0 deletions requirements.txt
@@ -0,0 +1 @@
evals
