Commit 341188c
Parent(s): f995b88

PPO playing Acrobot-v1 from https://github.com/sgoodfriend/rl-algo-impls/tree/5598ebc4b03054f16eebe76792486ba7bcacfc5c

This view is limited to 50 files because it contains too many changes. See raw diff.
- .gitignore +147 -0
- LICENSE +21 -0
- README.md +127 -0
- benchmarks/colab_atari1.sh +5 -0
- benchmarks/colab_atari2.sh +5 -0
- benchmarks/colab_basic.sh +5 -0
- benchmarks/colab_benchmark.ipynb +210 -0
- benchmarks/colab_carracing.sh +5 -0
- benchmarks/colab_pybullet.sh +5 -0
- benchmarks/train_loop.sh +17 -0
- colab_enjoy.ipynb +213 -0
- colab_requirements.txt +7 -0
- colab_train.ipynb +215 -0
- dqn/dqn.py +182 -0
- dqn/policy.py +37 -0
- dqn/q_net.py +29 -0
- enjoy.py +105 -0
- environment.yml +17 -0
- hyperparams/dqn.yml +117 -0
- hyperparams/ppo.yml +202 -0
- hyperparams/vpg.yml +157 -0
- lambda_labs/benchmark.sh +33 -0
- lambda_labs/lambda_requirements.txt +9 -0
- lambda_labs/setup.sh +10 -0
- poetry.lock +0 -0
- ppo/policy.py +36 -0
- ppo/ppo.py +367 -0
- pyproject.toml +27 -0
- replay.meta.json +1 -0
- replay.mp4 +0 -0
- runner/config.py +130 -0
- runner/env.py +134 -0
- runner/running_utils.py +188 -0
- runner/train.py +126 -0
- saved_models/ppo-Acrobot-v1-S4-best/model.pth +3 -0
- saved_models/ppo-Acrobot-v1-S4-best/vecnormalize.pkl +3 -0
- shared/algorithm.py +35 -0
- shared/callbacks/callback.py +12 -0
- shared/callbacks/eval_callback.py +174 -0
- shared/module.py +121 -0
- shared/policy/actor.py +304 -0
- shared/policy/critic.py +27 -0
- shared/policy/on_policy.py +177 -0
- shared/policy/policy.py +60 -0
- shared/schedule.py +19 -0
- shared/stats.py +173 -0
- shared/trajectory.py +30 -0
- shared/utils.py +8 -0
- train.py +81 -0
- vpg/policy.py +119 -0
    	
.gitignore ADDED
@@ -0,0 +1,147 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# Logging into tensorboard and wandb
+runs/*
+wandb
+
+# macOS
+.DS_STORE
+
+# Local scratch work
+scratch/*
+
+# vscode
+.vscode/
+
+# Don't bother tracking saved_models or videos
+saved_models/*
+downloaded_models/*
+videos/*
    	
LICENSE ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2023 Scott Goodfriend
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
    	
README.md ADDED
@@ -0,0 +1,127 @@
+---
+library_name: rl-algo-impls
+tags:
+- Acrobot-v1
+- ppo
+- deep-reinforcement-learning
+- reinforcement-learning
+model-index:
+- name: ppo
+  results:
+  - metrics:
+    - type: mean_reward
+      value: -72.5 +/- 7.68
+      name: mean_reward
+    task:
+      type: reinforcement-learning
+      name: reinforcement-learning
+    dataset:
+      name: Acrobot-v1
+      type: Acrobot-v1
+---
+# **PPO** Agent playing **Acrobot-v1**
+
+This is a trained model of a **PPO** agent playing **Acrobot-v1** using the [/sgoodfriend/rl-algo-impls](https://github.com/sgoodfriend/rl-algo-impls) repo.
+
+All models trained at this commit can be found at https://api.wandb.ai/links/sgoodfriend/6p2sjqtn.
+
+## Training Results
+
+This model was trained from 3 trainings of **PPO** agents using different initial seeds. These agents were trained by checking out [5598ebc](https://github.com/sgoodfriend/rl-algo-impls/tree/5598ebc4b03054f16eebe76792486ba7bcacfc5c). The best and last models were kept from each training. This submission has loaded the best models from each training, reevaluates them, and selects the best model from these latest evaluations (mean - std).
+
+| algo | env        | seed | reward_mean | reward_std | eval_episodes | best | wandb_url                                                                    |
+|:-----|:-----------|-----:|------------:|-----------:|--------------:|:-----|:-----------------------------------------------------------------------------|
+| ppo  | Acrobot-v1 |    4 |     -72.5   |    7.68115 |            16 | *    | [wandb](https://wandb.ai/sgoodfriend/rl-algo-impls-benchmarks/runs/bzab0jtv) |
+| ppo  | Acrobot-v1 |    5 |     -71.875 |    9.55167 |            16 |      | [wandb](https://wandb.ai/sgoodfriend/rl-algo-impls-benchmarks/runs/zqord0fg) |
+| ppo  | Acrobot-v1 |    6 |     -74.375 |   14.5081  |            16 |      | [wandb](https://wandb.ai/sgoodfriend/rl-algo-impls-benchmarks/runs/y1w2hqhu) |
+
+
+### Prerequisites: Weights & Biases (WandB)
+Training and benchmarking assumes you have a Weights & Biases project to upload runs to.
+By default training goes to a rl-algo-impls project while benchmarks go to
+rl-algo-impls-benchmarks. During training and benchmarking runs, videos of the best
+models and the model weights are uploaded to WandB.
+
+Before doing any of the runs below, you'll need to create a wandb account and run `wandb
+login`.
+
+
+
+## Usage
+/sgoodfriend/rl-algo-impls: https://github.com/sgoodfriend/rl-algo-impls
+
+Note: While the model state dictionary and hyperaparameters are saved, the
+implementation could be sufficiently different to not be able to reproduce similar
+results. You might need to checkout the commit the agent was trained on:
+[5598ebc](https://github.com/sgoodfriend/rl-algo-impls/tree/5598ebc4b03054f16eebe76792486ba7bcacfc5c).
+```
+# Downloads the model, sets hyperparameters, and runs agent for 3 episodes
+python enjoy.py --wandb-run-path=sgoodfriend/rl-algo-impls-benchmarks/bzab0jtv
+```
+
+Setup hasn't been completely worked out yet, so you might be best served by using Google
+Colab starting from the
+[colab_enjoy.ipynb](https://github.com/sgoodfriend/rl-algo-impls/blob/main/colab_enjoy.ipynb)
+notebook.
+
+
+
+## Training
+If you want the highest chance to reproduce these results, you'll want to checkout the
+commit the agent was trained on: [5598ebc](https://github.com/sgoodfriend/rl-algo-impls/tree/5598ebc4b03054f16eebe76792486ba7bcacfc5c).
+
+```
+python train.py --algo ppo --env Acrobot-v1 --seed 4
+```
+
+Setup hasn't been completely worked out yet, so you might be best served by using Google
+Colab starting from the
+[colab_train.ipynb](https://github.com/sgoodfriend/rl-algo-impls/blob/main/colab_train.ipynb)
+notebook.
+
+
+
+## Benchmarking (with Lambda Labs instance)
+This and other models from https://api.wandb.ai/links/sgoodfriend/6p2sjqtn were generated by running a script on a Lambda
+Labs instance. In a Lambda Labs instance terminal:
+```
+git clone git@github.com:sgoodfriend/rl-algo-impls.git
+cd rl-algo-impls
+bash ./lambda_labs/setup.sh
+wandb login
+bash ./lambda_labs/benchmark.sh
+```
+
+### Alternative: Google Colab Pro+
+As an alternative,
+[colab_benchmark.ipynb](https://github.com/sgoodfriend/rl-algo-impls/tree/main/benchmarks#:~:text=colab_benchmark.ipynb),
+can be used. However, this requires a Google Colab Pro+ subscription and running across
+4 separate instances because otherwise running all jobs will exceed the 24-hour limit.
+
+
+
+## Hyperparameters
+This isn't exactly the format of hyperparams in hyperparams/ppo.yml, but instead the Wandb Run Config. However, it's very
+close and has some additional data:
+```
+algo: ppo
+algo_hyperparams:
+  ent_coef: 0
+  gae_lambda: 0.94
+  gamma: 0.99
+  n_epochs: 4
+  n_steps: 256
+env: Acrobot-v1
+env_hyperparams:
+  n_envs: 16
+  normalize: true
+n_timesteps: 1000000
+seed: 4
+use_deterministic_algorithms: true
+wandb_entity: null
+wandb_project_name: rl-algo-impls-benchmarks
+wandb_tags:
+- benchmark_5598ebc
+- host_192-9-145-26
+
+```
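For reference, the "(mean - std)" selection rule the README describes under Training Results is simple enough to restate in code. Below is a minimal sketch, not code from the repo, that scores each re-evaluated run by its mean reward minus the standard deviation of its rewards, using only the summary numbers from the table above; the run labels are just the wandb run ids paired with their seeds:

```python
# Editorial sketch of the "(mean - std)" best-model selection described above.
# Keys are wandb run ids (with seeds); values are (reward_mean, reward_std)
# taken from the Training Results table.
runs = {
    "bzab0jtv (seed 4)": (-72.5, 7.68115),
    "zqord0fg (seed 5)": (-71.875, 9.55167),
    "y1w2hqhu (seed 6)": (-74.375, 14.5081),
}

def score(reward_mean: float, reward_std: float) -> float:
    # "mean - std": prefer runs that are good on average *and* consistent.
    return reward_mean - reward_std

best = max(runs, key=lambda run_id: score(*runs[run_id]))
print(best)  # -> "bzab0jtv (seed 4)", the row marked best (*) in the table
```

Note that seed 5 has the higher raw mean (-71.875), but its larger spread drops its mean-minus-std score below seed 4's, which is why the seed 4 run is the one whose model files appear in this commit.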
    	
benchmarks/colab_atari1.sh ADDED
@@ -0,0 +1,5 @@
+source benchmarks/train_loop.sh
+ALGOS="ppo"
+ENVS="PongNoFrameskip-v4 BreakoutNoFrameskip-v4"
+BENCHMARK_MAX_PROCS="${BENCHMARK_MAX_PROCS:-3}"
+train_loop $ALGOS "$ENVS" | xargs -I CMD -P $BENCHMARK_MAX_PROCS bash -c CMD
    	
benchmarks/colab_atari2.sh ADDED
@@ -0,0 +1,5 @@
+source benchmarks/train_loop.sh
+ALGOS="ppo"
+ENVS="SpaceInvadersNoFrameskip-v4 QbertNoFrameskip-v4"
+BENCHMARK_MAX_PROCS="${BENCHMARK_MAX_PROCS:-3}"
+train_loop $ALGOS "$ENVS" | xargs -I CMD -P $BENCHMARK_MAX_PROCS bash -c CMD
    	
benchmarks/colab_basic.sh ADDED
@@ -0,0 +1,5 @@
+source benchmarks/train_loop.sh
+ALGOS="ppo"
+ENVS="CartPole-v1 MountainCar-v0 MountainCarContinuous-v0 Acrobot-v1 LunarLander-v2"
+BENCHMARK_MAX_PROCS="${BENCHMARK_MAX_PROCS:-3}"
+train_loop $ALGOS "$ENVS" | xargs -I CMD -P $BENCHMARK_MAX_PROCS bash -c CMD
    	
benchmarks/colab_benchmark.ipynb ADDED
@@ -0,0 +1,210 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": [],
+      "machine_shape": "hm",
+      "authorship_tag": "ABX9TyMJFprw7XNl/BqbKAHd/483",
+      "include_colab_link": true
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    },
+    "gpuClass": "standard",
+    "accelerator": "GPU"
+  },
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "view-in-github",
+        "colab_type": "text"
+      },
+      "source": [
+        "<a href=\"https://colab.research.google.com/github/sgoodfriend/rl-algo-impls/blob/main/benchmarks/colab_benchmark.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "# [sgoodfriend/rl-algo-impls](https://github.com/sgoodfriend/rl-algo-impls) in Google Colaboratory\n",
+        "## Parameters\n",
+        "\n",
+        "\n",
+        "1.   Wandb\n",
+        "\n"
+      ],
+      "metadata": {
+        "id": "S-tXDWP8WTLc"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "from getpass import getpass\n",
+        "import os\n",
+        "os.environ[\"WANDB_API_KEY\"] = getpass(\"Wandb API key to upload metrics, videos, and models: \")"
+      ],
+      "metadata": {
+        "id": "1ZtdYgxWNGwZ"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "## Setup\n",
+        "Clone [sgoodfriend/rl-algo-impls](https://github.com/sgoodfriend/rl-algo-impls) "
+      ],
+      "metadata": {
+        "id": "bsG35Io0hmKG"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "%%capture\n",
+        "!mkdir -p ~/.ssh\n",
+        "\n",
+        "with open(\"/root/.ssh/id_ed25519\", mode=\"w\") as f:\n",
+        "    f.write(\"\"\"-----BEGIN OPENSSH PRIVATE KEY-----\n",
+        "b3BlbnNzaC1rZXktdjEAAAAABG5vbmUAAAAEbm9uZQAAAAAAAAABAAAAMwAAAAtzc2gtZW\n",
+        "QyNTUxOQAAACAkIepH6T90umhyp8+bkmSplqth1/+yxnu/Dax61KlSVAAAAKA4W3D3OFtw\n",
+        "9wAAAAtzc2gtZWQyNTUxOQAAACAkIepH6T90umhyp8+bkmSplqth1/+yxnu/Dax61KlSVA\n",
+        "AAAEA4SPGDm0/gofiOYXPTAi1Oxmw4mTppG2GdNgdMwMiDaSQh6kfpP3S6aHKnz5uSZKmW\n",
+        "q2HX/7LGe78NrHrUqVJUAAAAGmdvb2RmcmllbmQuc2NvdHRAZ21haWwuY29tAQID\n",
+        "-----END OPENSSH PRIVATE KEY-----\n",
+        "\"\"\"\n",
+        ")\n",
+        "\n",
+        "!ssh-keyscan -t ed25519 github.com >> ~/.ssh/known_hosts\n",
+        "!chmod go-rwx /root/.ssh/id_ed25519\n",
+        "!git clone git@github.com:sgoodfriend/rl-algo-impls.git"
+      ],
+      "metadata": {
+        "id": "k5ynTV25hdAf"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Installing the correct packages:\n",
+        "\n",
+        "While conda and poetry are generally used for package management, the mismatch in Python versions (3.10 in the project file vs 3.8 in Colab) makes using the package yml files difficult to use. For now, instead I'm going to specify the list of requirements manually below:"
+      ],
+      "metadata": {
+        "id": "jKxGok-ElYQ7"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "%%capture\n",
+        "!apt install python-opengl\n",
+        "!apt install ffmpeg\n",
+        "!apt install xvfb\n",
+        "!apt install swig"
+      ],
+      "metadata": {
+        "id": "nn6EETTc2Ewf"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "%%capture\n",
+        "%cd /content/rl-algo-impls\n",
+        "!pip install -r colab_requirements.txt"
+      ],
+      "metadata": {
+        "id": "AfZh9rH3yQii"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "## Run Once Per Runtime"
+      ],
+      "metadata": {
+        "id": "4o5HOLjc4wq7"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "import wandb\n",
+        "wandb.login()"
+      ],
+      "metadata": {
+        "id": "PCXa5tdS2qFX"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "## Restart Session beteween runs"
+      ],
+      "metadata": {
+        "id": "AZBZfSUV43JQ"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "%%capture\n",
+        "from pyvirtualdisplay import Display\n",
+        "\n",
+        "virtual_display = Display(visible=0, size=(1400, 900))\n",
+        "virtual_display.start()"
+      ],
+      "metadata": {
+        "id": "VzemeQJP2NO9"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "The below 5 bash scripts train agents on environments with 3 seeds each:\n",
+        "- colab_basic.sh and colab_pybullet.sh test on a set of basic gym environments and 4 PyBullet environments. Running both together will likely take about 18 hours. This is likely to run into runtime limits for free Colab and Colab Pro, but is fine for Colab Pro+.\n",
+        "- colab_carracing.sh only trains 3 seeds on CarRacing-v0, which takes almost 22 hours on Colab Pro+ on high-RAM, standard GPU.\n",
+        "- colab_atari1.sh and colab_atari2.sh likely need to be run separately because each takes about 19 hours on high-RAM, standard GPU."
+      ],
+      "metadata": {
+        "id": "nSHfna0hLlO1"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "%cd /content/rl-algo-impls\n",
+        "os.environ[\"BENCHMARK_MAX_PROCS\"] = str(1) # Can't reliably raise this to 2+, but would make it faster.\n",
+        "!./benchmarks/colab_basic.sh\n",
+        "!./benchmarks/colab_pybullet.sh\n",
+        "# !./benchmarks/colab_carracing.sh\n",
+        "# !./benchmarks/colab_atari1.sh\n",
+        "# !./benchmarks/colab_atari2.sh"
+      ],
+      "metadata": {
+        "id": "07aHYFH1zfXa"
+      },
+      "execution_count": null,
+      "outputs": []
+    }
+  ]
+}
    	
benchmarks/colab_carracing.sh ADDED
@@ -0,0 +1,5 @@
+source benchmarks/train_loop.sh
+ALGOS="ppo"
+ENVS="CarRacing-v0"
+BENCHMARK_MAX_PROCS="${BENCHMARK_MAX_PROCS:-3}"
+train_loop $ALGOS "$ENVS" | xargs -I CMD -P $BENCHMARK_MAX_PROCS bash -c CMD
    	
benchmarks/colab_pybullet.sh ADDED
@@ -0,0 +1,5 @@
+source benchmarks/train_loop.sh
+ALGOS="ppo"
+ENVS="HalfCheetahBulletEnv-v0 AntBulletEnv-v0 Walker2DBulletEnv-v0 HopperBulletEnv-v0"
+BENCHMARK_MAX_PROCS="${BENCHMARK_MAX_PROCS:-3}"
+train_loop $ALGOS "$ENVS" | xargs -I CMD -P $BENCHMARK_MAX_PROCS bash -c CMD
    	
benchmarks/train_loop.sh ADDED
@@ -0,0 +1,17 @@
+train_loop () {
+    local WANDB_TAGS="benchmark_$(git rev-parse --short HEAD) host_$(hostname)"
+    local algo
+    local env
+    local seed
+    local WANDB_PROJECT_NAME="${WANDB_PROJECT_NAME:-rl-algo-impls-benchmarks}"
+    local args=()
+    (( VIRTUAL_DISPLAY == 1)) && args+=("--virtual-display")
+    local SEEDS="${SEEDS:-1 2 3}"
+    for algo in $(echo $1); do
+        for env in $(echo $2); do
+            for seed in $SEEDS; do
+                echo python train.py --algo $algo --env $env --seed $seed --pool-size 1 --wandb-tags $WANDB_TAGS --wandb-project-name $WANDB_PROJECT_NAME ${args[@]}
+            done
+        done
+    done
+}
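Note that train_loop only prints training commands; the `| xargs -I CMD -P $BENCHMARK_MAX_PROCS bash -c CMD` pipeline in the colab_*.sh scripts above is what actually runs them, up to BENCHMARK_MAX_PROCS at a time. As an illustration (not captured output; the real tags come from `git rev-parse --short HEAD` and `hostname`), calling `train_loop "ppo" "Acrobot-v1"` with the default seeds would echo one line per algo/env/seed combination, along the lines of:

```
python train.py --algo ppo --env Acrobot-v1 --seed 1 --pool-size 1 --wandb-tags benchmark_5598ebc host_<hostname> --wandb-project-name rl-algo-impls-benchmarks
python train.py --algo ppo --env Acrobot-v1 --seed 2 --pool-size 1 --wandb-tags benchmark_5598ebc host_<hostname> --wandb-project-name rl-algo-impls-benchmarks
python train.py --algo ppo --env Acrobot-v1 --seed 3 --pool-size 1 --wandb-tags benchmark_5598ebc host_<hostname> --wandb-project-name rl-algo-impls-benchmarks
```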
    	
colab_enjoy.ipynb ADDED
@@ -0,0 +1,213 @@
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "nbformat": 4,
         | 
| 3 | 
            +
              "nbformat_minor": 0,
         | 
| 4 | 
            +
              "metadata": {
         | 
| 5 | 
            +
                "colab": {
         | 
| 6 | 
            +
                  "provenance": [],
         | 
| 7 | 
            +
                  "machine_shape": "hm",
         | 
| 8 | 
            +
                  "authorship_tag": "ABX9TyM1iRYRLhijbxWxPLk9Ba7f",
         | 
| 9 | 
            +
                  "include_colab_link": true
         | 
| 10 | 
            +
                },
         | 
| 11 | 
            +
                "kernelspec": {
         | 
| 12 | 
            +
                  "name": "python3",
         | 
| 13 | 
            +
                  "display_name": "Python 3"
         | 
| 14 | 
            +
                },
         | 
| 15 | 
            +
                "language_info": {
         | 
| 16 | 
            +
                  "name": "python"
         | 
| 17 | 
            +
                },
         | 
| 18 | 
            +
                "gpuClass": "standard",
         | 
| 19 | 
            +
                "accelerator": "GPU"
         | 
| 20 | 
            +
              },
         | 
| 21 | 
            +
              "cells": [
         | 
| 22 | 
            +
                {
         | 
| 23 | 
            +
                  "cell_type": "markdown",
         | 
| 24 | 
            +
                  "metadata": {
         | 
| 25 | 
            +
                    "id": "view-in-github",
         | 
| 26 | 
            +
                    "colab_type": "text"
         | 
| 27 | 
            +
                  },
         | 
| 28 | 
            +
                  "source": [
         | 
| 29 | 
            +
                    "<a href=\"https://colab.research.google.com/github/sgoodfriend/rl-algo-impls/blob/main/colab_enjoy.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
         | 
| 30 | 
            +
                  ]
         | 
| 31 | 
            +
                },
         | 
| 32 | 
            +
                {
         | 
| 33 | 
            +
                  "cell_type": "markdown",
         | 
| 34 | 
            +
                  "source": [
         | 
| 35 | 
            +
                    "# [sgoodfriend/rl-algo-impls](https://github.com/sgoodfriend/rl-algo-impls) in Google Colaboratory\n",
         | 
| 36 | 
            +
                    "## Parameters\n",
         | 
| 37 | 
            +
                    "\n",
         | 
| 38 | 
            +
                    "\n",
         | 
| 39 | 
            +
                    "1.   Wandb\n",
         | 
| 40 | 
            +
                    "\n"
         | 
| 41 | 
            +
                  ],
         | 
| 42 | 
            +
                  "metadata": {
         | 
| 43 | 
            +
                    "id": "S-tXDWP8WTLc"
         | 
| 44 | 
            +
                  }
         | 
| 45 | 
            +
                },
         | 
| 46 | 
            +
                {
         | 
| 47 | 
            +
                  "cell_type": "code",
         | 
| 48 | 
            +
                  "source": [
         | 
| 49 | 
            +
                    "from getpass import getpass\n",
         | 
| 50 | 
            +
                    "import os\n",
         | 
| 51 | 
            +
                    "os.environ[\"WANDB_API_KEY\"] = getpass(\"Wandb API key to upload metrics, videos, and models: \")"
         | 
| 52 | 
            +
                  ],
         | 
| 53 | 
            +
                  "metadata": {
         | 
| 54 | 
            +
                    "id": "1ZtdYgxWNGwZ"
         | 
| 55 | 
            +
                  },
         | 
| 56 | 
            +
                  "execution_count": null,
         | 
| 57 | 
            +
                  "outputs": []
         | 
| 58 | 
            +
                },
         | 
| 59 | 
            +
                {
         | 
| 60 | 
            +
                  "cell_type": "markdown",
         | 
| 61 | 
            +
                  "source": [
         | 
| 62 | 
            +
                    "2. enjoy.py parameters"
         | 
| 63 | 
            +
                  ],
         | 
| 64 | 
            +
                  "metadata": {
         | 
| 65 | 
            +
                    "id": "ao0nAh3MOdN7"
         | 
| 66 | 
            +
                  }
         | 
| 67 | 
            +
                },
         | 
| 68 | 
            +
                {
         | 
| 69 | 
            +
                  "cell_type": "code",
         | 
| 70 | 
            +
                  "source": [
         | 
| 71 | 
            +
                    "WANDB_RUN_PATH=\"sgoodfriend/rl-algo-impls-benchmarks/rd0lisee\""
         | 
| 72 | 
            +
                  ],
         | 
| 73 | 
            +
                  "metadata": {
         | 
| 74 | 
            +
                    "id": "jKL_NFhVOjSc"
         | 
| 75 | 
            +
                  },
         | 
| 76 | 
            +
                  "execution_count": 2,
         | 
| 77 | 
            +
                  "outputs": []
         | 
| 78 | 
            +
                },
         | 
| 79 | 
            +
                {
         | 
| 80 | 
            +
                  "cell_type": "markdown",
         | 
| 81 | 
            +
                  "source": [
         | 
| 82 | 
            +
                    "## Setup\n",
         | 
| 83 | 
            +
                    "Clone [sgoodfriend/rl-algo-impls](https://github.com/sgoodfriend/rl-algo-impls) "
         | 
| 84 | 
            +
                  ],
         | 
| 85 | 
            +
                  "metadata": {
         | 
| 86 | 
            +
                    "id": "bsG35Io0hmKG"
         | 
| 87 | 
            +
                  }
         | 
| 88 | 
            +
                },
         | 
| 89 | 
            +
                {
         | 
| 90 | 
            +
                  "cell_type": "code",
         | 
| 91 | 
            +
                  "source": [
         | 
| 92 | 
            +
                    "%%capture\n",
         | 
| 93 | 
            +
                    "!mkdir -p ~/.ssh\n",
         | 
| 94 | 
            +
                    "\n",
         | 
| 95 | 
            +
                    "with open(\"/root/.ssh/id_ed25519\", mode=\"w\") as f:\n",
         | 
| 96 | 
            +
                    "    f.write(\"\"\"-----BEGIN OPENSSH PRIVATE KEY-----\n",
         | 
| 97 | 
            +
                    "b3BlbnNzaC1rZXktdjEAAAAABG5vbmUAAAAEbm9uZQAAAAAAAAABAAAAMwAAAAtzc2gtZW\n",
         | 
| 98 | 
            +
                    "QyNTUxOQAAACAkIepH6T90umhyp8+bkmSplqth1/+yxnu/Dax61KlSVAAAAKA4W3D3OFtw\n",
         | 
| 99 | 
            +
                    "9wAAAAtzc2gtZWQyNTUxOQAAACAkIepH6T90umhyp8+bkmSplqth1/+yxnu/Dax61KlSVA\n",
         | 
| 100 | 
            +
                    "AAAEA4SPGDm0/gofiOYXPTAi1Oxmw4mTppG2GdNgdMwMiDaSQh6kfpP3S6aHKnz5uSZKmW\n",
         | 
| 101 | 
            +
                    "q2HX/7LGe78NrHrUqVJUAAAAGmdvb2RmcmllbmQuc2NvdHRAZ21haWwuY29tAQID\n",
         | 
| 102 | 
            +
                    "-----END OPENSSH PRIVATE KEY-----\n",
         | 
| 103 | 
            +
                    "\"\"\"\n",
         | 
| 104 | 
            +
                    ")\n",
         | 
| 105 | 
            +
                    "\n",
         | 
| 106 | 
            +
                    "!ssh-keyscan -t ed25519 github.com >> ~/.ssh/known_hosts\n",
         | 
| 107 | 
            +
                    "!chmod go-rwx /root/.ssh/id_ed25519\n",
         | 
| 108 | 
            +
                    "!git clone git@github.com:sgoodfriend/rl-algo-impls.git"
         | 
| 109 | 
            +
                  ],
         | 
| 110 | 
            +
                  "metadata": {
         | 
| 111 | 
            +
                    "id": "k5ynTV25hdAf"
         | 
| 112 | 
            +
                  },
         | 
| 113 | 
            +
                  "execution_count": 3,
         | 
| 114 | 
            +
                  "outputs": []
         | 
| 115 | 
            +
                },
         | 
| 116 | 
            +
                {
         | 
| 117 | 
            +
                  "cell_type": "markdown",
         | 
| 118 | 
            +
                  "source": [
         | 
| 119 | 
            +
                    "Installing the correct packages:\n",
         | 
| 120 | 
            +
                    "\n",
         | 
| 121 | 
            +
                    "While conda and poetry are generally used for package management, the mismatch in Python versions (3.10 in the project file vs 3.8 in Colab) makes using the package yml files difficult to use. For now, instead I'm going to specify the list of requirements manually below:"
         | 
| 122 | 
            +
                  ],
         | 
| 123 | 
            +
                  "metadata": {
         | 
| 124 | 
            +
                    "id": "jKxGok-ElYQ7"
         | 
| 125 | 
            +
                  }
         | 
| 126 | 
            +
                },
         | 
| 127 | 
            +
                {
         | 
| 128 | 
            +
                  "cell_type": "code",
         | 
| 129 | 
            +
                  "source": [
         | 
| 130 | 
            +
                    "%%capture\n",
         | 
| 131 | 
            +
                    "!apt install python-opengl\n",
         | 
| 132 | 
            +
                    "!apt install ffmpeg\n",
         | 
| 133 | 
            +
                    "!apt install xvfb\n",
         | 
| 134 | 
            +
                    "!apt install swig"
         | 
| 135 | 
            +
                  ],
         | 
| 136 | 
            +
                  "metadata": {
         | 
| 137 | 
            +
                    "id": "nn6EETTc2Ewf"
         | 
| 138 | 
            +
                  },
         | 
| 139 | 
            +
                  "execution_count": 4,
         | 
| 140 | 
            +
                  "outputs": []
         | 
| 141 | 
            +
                },
         | 
| 142 | 
            +
                {
         | 
| 143 | 
            +
                  "cell_type": "code",
         | 
| 144 | 
            +
                  "source": [
         | 
| 145 | 
            +
                    "%%capture\n",
         | 
| 146 | 
            +
                    "%cd /content/rl-algo-impls\n",
         | 
| 147 | 
            +
                    "!pip install -r colab_requirements.txt"
         | 
| 148 | 
            +
                  ],
         | 
| 149 | 
            +
                  "metadata": {
         | 
| 150 | 
            +
                    "id": "AfZh9rH3yQii"
         | 
| 151 | 
            +
                  },
         | 
| 152 | 
            +
                  "execution_count": 5,
         | 
| 153 | 
            +
                  "outputs": []
         | 
| 154 | 
            +
                },
         | 
| 155 | 
            +
                {
         | 
| 156 | 
            +
                  "cell_type": "markdown",
         | 
| 157 | 
            +
                  "source": [
         | 
| 158 | 
            +
                    "## Run Once Per Runtime"
         | 
| 159 | 
            +
                  ],
         | 
| 160 | 
            +
                  "metadata": {
         | 
| 161 | 
            +
                    "id": "4o5HOLjc4wq7"
         | 
| 162 | 
            +
                  }
         | 
| 163 | 
            +
                },
         | 
| 164 | 
            +
                {
         | 
| 165 | 
            +
                  "cell_type": "code",
         | 
| 166 | 
            +
                  "source": [
         | 
| 167 | 
            +
                    "import wandb\n",
         | 
| 168 | 
            +
                    "wandb.login()"
         | 
| 169 | 
            +
                  ],
         | 
| 170 | 
            +
                  "metadata": {
         | 
| 171 | 
            +
                    "id": "PCXa5tdS2qFX"
         | 
| 172 | 
            +
                  },
         | 
| 173 | 
            +
                  "execution_count": null,
         | 
| 174 | 
            +
                  "outputs": []
         | 
| 175 | 
            +
                },
         | 
| 176 | 
            +
                {
         | 
| 177 | 
            +
                  "cell_type": "markdown",
         | 
| 178 | 
            +
                  "source": [
         | 
| 179 | 
            +
                    "## Restart Session beteween runs"
         | 
| 180 | 
            +
                  ],
         | 
| 181 | 
            +
                  "metadata": {
         | 
| 182 | 
            +
                    "id": "AZBZfSUV43JQ"
         | 
| 183 | 
            +
                  }
         | 
| 184 | 
            +
                },
         | 
| 185 | 
            +
                {
         | 
| 186 | 
            +
                  "cell_type": "code",
         | 
| 187 | 
            +
                  "source": [
         | 
| 188 | 
            +
                    "%%capture\n",
         | 
| 189 | 
            +
                    "from pyvirtualdisplay import Display\n",
         | 
| 190 | 
            +
                    "\n",
         | 
| 191 | 
            +
                    "virtual_display = Display(visible=0, size=(1400, 900))\n",
         | 
| 192 | 
            +
                    "virtual_display.start()"
         | 
| 193 | 
            +
                  ],
         | 
| 194 | 
            +
                  "metadata": {
         | 
| 195 | 
            +
                    "id": "VzemeQJP2NO9"
         | 
| 196 | 
            +
                  },
         | 
| 197 | 
            +
                  "execution_count": 7,
         | 
| 198 | 
            +
                  "outputs": []
         | 
| 199 | 
            +
                },
         | 
| 200 | 
            +
                {
         | 
| 201 | 
            +
                  "cell_type": "code",
         | 
| 202 | 
            +
                  "source": [
         | 
| 203 | 
            +
                    "%cd /content/rl-algo-impls\n",
         | 
| 204 | 
            +
                    "!python enjoy.py --wandb-run-path={WANDB_RUN_PATH}"
         | 
| 205 | 
            +
                  ],
         | 
| 206 | 
            +
                  "metadata": {
         | 
| 207 | 
            +
                    "id": "07aHYFH1zfXa"
         | 
| 208 | 
            +
                  },
         | 
| 209 | 
            +
                  "execution_count": null,
         | 
| 210 | 
            +
                  "outputs": []
         | 
| 211 | 
            +
                }
         | 
| 212 | 
            +
              ]
         | 
| 213 | 
            +
            }
         | 
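The WANDB_RUN_PATH set near the top of this notebook is wandb's "entity/project/run_id" triple; enjoy.py (added later in this commit) passes it to wandb.Api().run(...) to recover the run's algo, env, and model files. A minimal sketch of that lookup, assuming the run is accessible from your wandb account:

    import wandb

    # Same run path the notebook uses; enjoy.py reads algo/env out of run.config.
    run = wandb.Api().run("sgoodfriend/rl-algo-impls-benchmarks/rd0lisee")
    print(run.config["algo"], run.config["env"])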
    	
        colab_requirements.txt
    ADDED
    
    | @@ -0,0 +1,7 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            AutoROM.accept-rom-license >= 0.4.2, < 0.5
         | 
| 2 | 
            +
            stable-baselines3[extra] >= 1.7.0, < 1.8
         | 
| 3 | 
            +
            gym[box2d] >= 0.21.0, < 0.22
         | 
| 4 | 
            +
            pyglet == 1.5.27
         | 
| 5 | 
            +
            wandb >= 0.13.9, < 0.14
         | 
| 6 | 
            +
            pyvirtualdisplay == 3.0
         | 
| 7 | 
            +
            pybullet >= 3.2.5, < 3.3
         | 
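After the notebooks' pip install cell runs, a quick sanity check that the pinned ranges above actually resolved (a sketch, not part of the commit):

    import gym, stable_baselines3, wandb

    # Expect gym 0.21.x, stable-baselines3 1.7.x, wandb 0.13.x per the pins above.
    print(gym.__version__, stable_baselines3.__version__, wandb.__version__)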
    	
        colab_train.ipynb
    ADDED
    
    | @@ -0,0 +1,215 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "nbformat": 4,
         | 
| 3 | 
            +
              "nbformat_minor": 0,
         | 
| 4 | 
            +
              "metadata": {
         | 
| 5 | 
            +
                "colab": {
         | 
| 6 | 
            +
                  "provenance": [],
         | 
| 7 | 
            +
                  "machine_shape": "hm",
         | 
| 8 | 
            +
                  "authorship_tag": "ABX9TyNGs5TudweZiYKySQxg6H+K",
         | 
| 9 | 
            +
                  "include_colab_link": true
         | 
| 10 | 
            +
                },
         | 
| 11 | 
            +
                "kernelspec": {
         | 
| 12 | 
            +
                  "name": "python3",
         | 
| 13 | 
            +
                  "display_name": "Python 3"
         | 
| 14 | 
            +
                },
         | 
| 15 | 
            +
                "language_info": {
         | 
| 16 | 
            +
                  "name": "python"
         | 
| 17 | 
            +
                },
         | 
| 18 | 
            +
                "gpuClass": "standard",
         | 
| 19 | 
            +
                "accelerator": "GPU"
         | 
| 20 | 
            +
              },
         | 
| 21 | 
            +
              "cells": [
         | 
| 22 | 
            +
                {
         | 
| 23 | 
            +
                  "cell_type": "markdown",
         | 
| 24 | 
            +
                  "metadata": {
         | 
| 25 | 
            +
                    "id": "view-in-github",
         | 
| 26 | 
            +
                    "colab_type": "text"
         | 
| 27 | 
            +
                  },
         | 
| 28 | 
            +
                  "source": [
         | 
| 29 | 
            +
                    "<a href=\"https://colab.research.google.com/github/sgoodfriend/rl-algo-impls/blob/main/colab_train.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
         | 
| 30 | 
            +
                  ]
         | 
| 31 | 
            +
                },
         | 
| 32 | 
            +
                {
         | 
| 33 | 
            +
                  "cell_type": "markdown",
         | 
| 34 | 
            +
                  "source": [
         | 
| 35 | 
            +
                    "# [sgoodfriend/rl-algo-impls](https://github.com/sgoodfriend/rl-algo-impls) in Google Colaboratory\n",
         | 
| 36 | 
            +
                    "## Parameters\n",
         | 
| 37 | 
            +
                    "\n",
         | 
| 38 | 
            +
                    "\n",
         | 
| 39 | 
            +
                    "1.   Wandb\n",
         | 
| 40 | 
            +
                    "\n"
         | 
| 41 | 
            +
                  ],
         | 
| 42 | 
            +
                  "metadata": {
         | 
| 43 | 
            +
                    "id": "S-tXDWP8WTLc"
         | 
| 44 | 
            +
                  }
         | 
| 45 | 
            +
                },
         | 
| 46 | 
            +
                {
         | 
| 47 | 
            +
                  "cell_type": "code",
         | 
| 48 | 
            +
                  "source": [
         | 
| 49 | 
            +
                    "from getpass import getpass\n",
         | 
| 50 | 
            +
                    "import os\n",
         | 
| 51 | 
            +
                    "os.environ[\"WANDB_API_KEY\"] = getpass(\"Wandb API key to upload metrics, videos, and models: \")"
         | 
| 52 | 
            +
                  ],
         | 
| 53 | 
            +
                  "metadata": {
         | 
| 54 | 
            +
                    "id": "1ZtdYgxWNGwZ"
         | 
| 55 | 
            +
                  },
         | 
| 56 | 
            +
                  "execution_count": null,
         | 
| 57 | 
            +
                  "outputs": []
         | 
| 58 | 
            +
                },
         | 
| 59 | 
            +
                {
         | 
| 60 | 
            +
                  "cell_type": "markdown",
         | 
| 61 | 
            +
                  "source": [
         | 
| 62 | 
            +
                    "2. train run parameters"
         | 
| 63 | 
            +
                  ],
         | 
| 64 | 
            +
                  "metadata": {
         | 
| 65 | 
            +
                    "id": "ao0nAh3MOdN7"
         | 
| 66 | 
            +
                  }
         | 
| 67 | 
            +
                },
         | 
| 68 | 
            +
                {
         | 
| 69 | 
            +
                  "cell_type": "code",
         | 
| 70 | 
            +
                  "source": [
         | 
| 71 | 
            +
                    "ALGO = \"ppo\"\n",
         | 
| 72 | 
            +
                    "ENV = \"CartPole-v1\"\n",
         | 
| 73 | 
            +
                    "SEED = 1"
         | 
| 74 | 
            +
                  ],
         | 
| 75 | 
            +
                  "metadata": {
         | 
| 76 | 
            +
                    "id": "jKL_NFhVOjSc"
         | 
| 77 | 
            +
                  },
         | 
| 78 | 
            +
                  "execution_count": 2,
         | 
| 79 | 
            +
                  "outputs": []
         | 
| 80 | 
            +
                },
         | 
| 81 | 
            +
                {
         | 
| 82 | 
            +
                  "cell_type": "markdown",
         | 
| 83 | 
            +
                  "source": [
         | 
| 84 | 
            +
                    "## Setup\n",
         | 
| 85 | 
            +
                    "Clone [sgoodfriend/rl-algo-impls](https://github.com/sgoodfriend/rl-algo-impls) "
         | 
| 86 | 
            +
                  ],
         | 
| 87 | 
            +
                  "metadata": {
         | 
| 88 | 
            +
                    "id": "bsG35Io0hmKG"
         | 
| 89 | 
            +
                  }
         | 
| 90 | 
            +
                },
         | 
| 91 | 
            +
                {
         | 
| 92 | 
            +
                  "cell_type": "code",
         | 
| 93 | 
            +
                  "source": [
         | 
| 94 | 
            +
                    "%%capture\n",
         | 
| 95 | 
            +
                    "!mkdir -p ~/.ssh\n",
         | 
| 96 | 
            +
                    "\n",
         | 
| 97 | 
            +
                    "with open(\"/root/.ssh/id_ed25519\", mode=\"w\") as f:\n",
         | 
| 98 | 
            +
                    "    f.write(\"\"\"-----BEGIN OPENSSH PRIVATE KEY-----\n",
         | 
| 99 | 
            +
                    "b3BlbnNzaC1rZXktdjEAAAAABG5vbmUAAAAEbm9uZQAAAAAAAAABAAAAMwAAAAtzc2gtZW\n",
         | 
| 100 | 
            +
                    "QyNTUxOQAAACAkIepH6T90umhyp8+bkmSplqth1/+yxnu/Dax61KlSVAAAAKA4W3D3OFtw\n",
         | 
| 101 | 
            +
                    "9wAAAAtzc2gtZWQyNTUxOQAAACAkIepH6T90umhyp8+bkmSplqth1/+yxnu/Dax61KlSVA\n",
         | 
| 102 | 
            +
                    "AAAEA4SPGDm0/gofiOYXPTAi1Oxmw4mTppG2GdNgdMwMiDaSQh6kfpP3S6aHKnz5uSZKmW\n",
         | 
| 103 | 
            +
                    "q2HX/7LGe78NrHrUqVJUAAAAGmdvb2RmcmllbmQuc2NvdHRAZ21haWwuY29tAQID\n",
         | 
| 104 | 
            +
                    "-----END OPENSSH PRIVATE KEY-----\n",
         | 
| 105 | 
            +
                    "\"\"\"\n",
         | 
| 106 | 
            +
                    ")\n",
         | 
| 107 | 
            +
                    "\n",
         | 
| 108 | 
            +
                    "!ssh-keyscan -t ed25519 github.com >> ~/.ssh/known_hosts\n",
         | 
| 109 | 
            +
                    "!chmod go-rwx /root/.ssh/id_ed25519\n",
         | 
| 110 | 
            +
                    "!git clone git@github.com:sgoodfriend/rl-algo-impls.git"
         | 
| 111 | 
            +
                  ],
         | 
| 112 | 
            +
                  "metadata": {
         | 
| 113 | 
            +
                    "id": "k5ynTV25hdAf"
         | 
| 114 | 
            +
                  },
         | 
| 115 | 
            +
                  "execution_count": 3,
         | 
| 116 | 
            +
                  "outputs": []
         | 
| 117 | 
            +
                },
         | 
| 118 | 
            +
                {
         | 
| 119 | 
            +
                  "cell_type": "markdown",
         | 
| 120 | 
            +
                  "source": [
         | 
| 121 | 
            +
                    "Installing the correct packages:\n",
         | 
| 122 | 
            +
                    "\n",
         | 
| 123 | 
            +
                    "While conda and poetry are generally used for package management, the mismatch in Python versions (3.10 in the project file vs 3.8 in Colab) makes using the package yml files difficult to use. For now, instead I'm going to specify the list of requirements manually below:"
         | 
| 124 | 
            +
                  ],
         | 
| 125 | 
            +
                  "metadata": {
         | 
| 126 | 
            +
                    "id": "jKxGok-ElYQ7"
         | 
| 127 | 
            +
                  }
         | 
| 128 | 
            +
                },
         | 
| 129 | 
            +
                {
         | 
| 130 | 
            +
                  "cell_type": "code",
         | 
| 131 | 
            +
                  "source": [
         | 
| 132 | 
            +
                    "%%capture\n",
         | 
| 133 | 
            +
                    "!apt install python-opengl\n",
         | 
| 134 | 
            +
                    "!apt install ffmpeg\n",
         | 
| 135 | 
            +
                    "!apt install xvfb\n",
         | 
| 136 | 
            +
                    "!apt install swig"
         | 
| 137 | 
            +
                  ],
         | 
| 138 | 
            +
                  "metadata": {
         | 
| 139 | 
            +
                    "id": "nn6EETTc2Ewf"
         | 
| 140 | 
            +
                  },
         | 
| 141 | 
            +
                  "execution_count": 4,
         | 
| 142 | 
            +
                  "outputs": []
         | 
| 143 | 
            +
                },
         | 
| 144 | 
            +
                {
         | 
| 145 | 
            +
                  "cell_type": "code",
         | 
| 146 | 
            +
                  "source": [
         | 
| 147 | 
            +
                    "%%capture\n",
         | 
| 148 | 
            +
                    "%cd /content/rl-algo-impls\n",
         | 
| 149 | 
            +
                    "!pip install -r colab_requirements.txt"
         | 
| 150 | 
            +
                  ],
         | 
| 151 | 
            +
                  "metadata": {
         | 
| 152 | 
            +
                    "id": "AfZh9rH3yQii"
         | 
| 153 | 
            +
                  },
         | 
| 154 | 
            +
                  "execution_count": 5,
         | 
| 155 | 
            +
                  "outputs": []
         | 
| 156 | 
            +
                },
         | 
| 157 | 
            +
                {
         | 
| 158 | 
            +
                  "cell_type": "markdown",
         | 
| 159 | 
            +
                  "source": [
         | 
| 160 | 
            +
                    "## Run Once Per Runtime"
         | 
| 161 | 
            +
                  ],
         | 
| 162 | 
            +
                  "metadata": {
         | 
| 163 | 
            +
                    "id": "4o5HOLjc4wq7"
         | 
| 164 | 
            +
                  }
         | 
| 165 | 
            +
                },
         | 
| 166 | 
            +
                {
         | 
| 167 | 
            +
                  "cell_type": "code",
         | 
| 168 | 
            +
                  "source": [
         | 
| 169 | 
            +
                    "import wandb\n",
         | 
| 170 | 
            +
                    "wandb.login()"
         | 
| 171 | 
            +
                  ],
         | 
| 172 | 
            +
                  "metadata": {
         | 
| 173 | 
            +
                    "id": "PCXa5tdS2qFX"
         | 
| 174 | 
            +
                  },
         | 
| 175 | 
            +
                  "execution_count": null,
         | 
| 176 | 
            +
                  "outputs": []
         | 
| 177 | 
            +
                },
         | 
| 178 | 
            +
                {
         | 
| 179 | 
            +
                  "cell_type": "markdown",
         | 
| 180 | 
            +
                  "source": [
         | 
| 181 | 
            +
                    "## Restart Session beteween runs"
         | 
| 182 | 
            +
                  ],
         | 
| 183 | 
            +
                  "metadata": {
         | 
| 184 | 
            +
                    "id": "AZBZfSUV43JQ"
         | 
| 185 | 
            +
                  }
         | 
| 186 | 
            +
                },
         | 
| 187 | 
            +
                {
         | 
| 188 | 
            +
                  "cell_type": "code",
         | 
| 189 | 
            +
                  "source": [
         | 
| 190 | 
            +
                    "%%capture\n",
         | 
| 191 | 
            +
                    "from pyvirtualdisplay import Display\n",
         | 
| 192 | 
            +
                    "\n",
         | 
| 193 | 
            +
                    "virtual_display = Display(visible=0, size=(1400, 900))\n",
         | 
| 194 | 
            +
                    "virtual_display.start()"
         | 
| 195 | 
            +
                  ],
         | 
| 196 | 
            +
                  "metadata": {
         | 
| 197 | 
            +
                    "id": "VzemeQJP2NO9"
         | 
| 198 | 
            +
                  },
         | 
| 199 | 
            +
                  "execution_count": 7,
         | 
| 200 | 
            +
                  "outputs": []
         | 
| 201 | 
            +
                },
         | 
| 202 | 
            +
                {
         | 
| 203 | 
            +
                  "cell_type": "code",
         | 
| 204 | 
            +
                  "source": [
         | 
| 205 | 
            +
                    "%cd /content/rl-algo-impls\n",
         | 
| 206 | 
            +
                    "!python train.py --algo {ALGO} --env {ENV} --seed {SEED}"
         | 
| 207 | 
            +
                  ],
         | 
| 208 | 
            +
                  "metadata": {
         | 
| 209 | 
            +
                    "id": "07aHYFH1zfXa"
         | 
| 210 | 
            +
                  },
         | 
| 211 | 
            +
                  "execution_count": null,
         | 
| 212 | 
            +
                  "outputs": []
         | 
| 213 | 
            +
                }
         | 
| 214 | 
            +
              ]
         | 
| 215 | 
            +
            }
         | 
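The ALGO/ENV/SEED cell near the top of this notebook selects which block of the hyperparams yml files train.py loads. As a sketch, any algorithm/environment pair defined in those files should work; for example, switching to the DQN MountainCar settings that appear later in hyperparams/dqn.yml:

    ALGO = "dqn"
    ENV = "MountainCar-v0"  # defined in hyperparams/dqn.yml
    SEED = 1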
    	
        dqn/dqn.py
    ADDED
    
    | @@ -0,0 +1,182 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            import copy
         | 
| 2 | 
            +
            import numpy as np
         | 
| 3 | 
            +
            import random
         | 
| 4 | 
            +
            import torch
         | 
| 5 | 
            +
            import torch.nn as nn
         | 
| 6 | 
            +
            import torch.nn.functional as F
         | 
| 7 | 
            +
             | 
| 8 | 
            +
            from collections import deque
         | 
| 9 | 
            +
            from torch.optim import Adam
         | 
| 10 | 
            +
            from stable_baselines3.common.vec_env.base_vec_env import VecEnv, VecEnvObs
         | 
| 11 | 
            +
            from torch.utils.tensorboard.writer import SummaryWriter
         | 
| 12 | 
            +
            from typing import List, NamedTuple, Optional, TypeVar
         | 
| 13 | 
            +
             | 
| 14 | 
            +
            from dqn.policy import DQNPolicy
         | 
| 15 | 
            +
            from shared.algorithm import Algorithm
         | 
| 16 | 
            +
            from shared.callbacks.callback import Callback
         | 
| 17 | 
            +
            from shared.schedule import linear_schedule
         | 
| 18 | 
            +
             | 
| 19 | 
            +
             | 
| 20 | 
            +
            class Transition(NamedTuple):
         | 
| 21 | 
            +
                obs: np.ndarray
         | 
| 22 | 
            +
                action: np.ndarray
         | 
| 23 | 
            +
                reward: float
         | 
| 24 | 
            +
                done: bool
         | 
| 25 | 
            +
                next_obs: np.ndarray
         | 
| 26 | 
            +
             | 
| 27 | 
            +
             | 
| 28 | 
            +
            class Batch(NamedTuple):
         | 
| 29 | 
            +
                obs: np.ndarray
         | 
| 30 | 
            +
                actions: np.ndarray
         | 
| 31 | 
            +
                rewards: np.ndarray
         | 
| 32 | 
            +
                dones: np.ndarray
         | 
| 33 | 
            +
                next_obs: np.ndarray
         | 
| 34 | 
            +
             | 
| 35 | 
            +
             | 
| 36 | 
            +
            class ReplayBuffer:
         | 
| 37 | 
            +
                def __init__(self, num_envs: int, maxlen: int) -> None:
         | 
| 38 | 
            +
                    self.num_envs = num_envs
         | 
| 39 | 
            +
                    self.buffer = deque(maxlen=maxlen)
         | 
| 40 | 
            +
             | 
| 41 | 
            +
                def add(
         | 
| 42 | 
            +
                    self,
         | 
| 43 | 
            +
                    obs: VecEnvObs,
         | 
| 44 | 
            +
                    action: np.ndarray,
         | 
| 45 | 
            +
                    reward: np.ndarray,
         | 
| 46 | 
            +
                    done: np.ndarray,
         | 
| 47 | 
            +
                    next_obs: VecEnvObs,
         | 
| 48 | 
            +
                ) -> None:
         | 
| 49 | 
            +
                    assert isinstance(obs, np.ndarray)
         | 
| 50 | 
            +
                    assert isinstance(next_obs, np.ndarray)
         | 
| 51 | 
            +
                    for i in range(self.num_envs):
         | 
| 52 | 
            +
                        self.buffer.append(
         | 
| 53 | 
            +
                            Transition(obs[i], action[i], reward[i], done[i], next_obs[i])
         | 
| 54 | 
            +
                        )
         | 
| 55 | 
            +
             | 
| 56 | 
            +
                def sample(self, batch_size: int) -> Batch:
         | 
| 57 | 
            +
                    ts = random.sample(self.buffer, batch_size)
         | 
| 58 | 
            +
                    return Batch(
         | 
| 59 | 
            +
                        obs=np.array([t.obs for t in ts]),
         | 
| 60 | 
            +
                        actions=np.array([t.action for t in ts]),
         | 
| 61 | 
            +
                        rewards=np.array([t.reward for t in ts]),
         | 
| 62 | 
            +
                        dones=np.array([t.done for t in ts]),
         | 
| 63 | 
            +
                        next_obs=np.array([t.next_obs for t in ts]),
         | 
| 64 | 
            +
                    )
         | 
| 65 | 
            +
             | 
| 66 | 
            +
                def __len__(self) -> int:
         | 
| 67 | 
            +
                    return len(self.buffer)
         | 
| 68 | 
            +
             | 
| 69 | 
            +
             | 
| 70 | 
            +
            DQNSelf = TypeVar("DQNSelf", bound="DQN")
         | 
| 71 | 
            +
             | 
| 72 | 
            +
             | 
| 73 | 
            +
            class DQN(Algorithm):
         | 
| 74 | 
            +
                def __init__(
         | 
| 75 | 
            +
                    self,
         | 
| 76 | 
            +
                    policy: DQNPolicy,
         | 
| 77 | 
            +
                    env: VecEnv,
         | 
| 78 | 
            +
                    device: torch.device,
         | 
| 79 | 
            +
                    tb_writer: SummaryWriter,
         | 
| 80 | 
            +
                    learning_rate: float = 1e-4,
         | 
| 81 | 
            +
                    buffer_size: int = 1_000_000,
         | 
| 82 | 
            +
                    learning_starts: int = 50_000,
         | 
| 83 | 
            +
                    batch_size: int = 32,
         | 
| 84 | 
            +
                    tau: float = 1.0,
         | 
| 85 | 
            +
                    gamma: float = 0.99,
         | 
| 86 | 
            +
                    train_freq: int = 4,
         | 
| 87 | 
            +
                    gradient_steps: int = 1,
         | 
| 88 | 
            +
                    target_update_interval: int = 10_000,
         | 
| 89 | 
            +
                    exploration_fraction: float = 0.1,
         | 
| 90 | 
            +
                    exploration_initial_eps: float = 1.0,
         | 
| 91 | 
            +
                    exploration_final_eps: float = 0.05,
         | 
| 92 | 
            +
                    max_grad_norm: float = 10.0,
         | 
| 93 | 
            +
                ) -> None:
         | 
| 94 | 
            +
                    super().__init__(policy, env, device, tb_writer)
         | 
| 95 | 
            +
                    self.policy = policy
         | 
| 96 | 
            +
             | 
| 97 | 
            +
                    self.optimizer = Adam(self.policy.q_net.parameters(), lr=learning_rate)
         | 
| 98 | 
            +
             | 
| 99 | 
            +
                    self.target_q_net = copy.deepcopy(self.policy.q_net).to(self.device)
         | 
| 100 | 
            +
                    self.target_q_net.train(False)
         | 
| 101 | 
            +
                    self.tau = tau
         | 
| 102 | 
            +
                    self.target_update_interval = target_update_interval
         | 
| 103 | 
            +
             | 
| 104 | 
            +
                    self.replay_buffer = ReplayBuffer(self.env.num_envs, buffer_size)
         | 
| 105 | 
            +
                    self.batch_size = batch_size
         | 
| 106 | 
            +
             | 
| 107 | 
            +
                    self.learning_starts = learning_starts
         | 
| 108 | 
            +
                    self.train_freq = train_freq
         | 
| 109 | 
            +
                    self.gradient_steps = gradient_steps
         | 
| 110 | 
            +
             | 
| 111 | 
            +
                    self.gamma = gamma
         | 
| 112 | 
            +
                    self.exploration_eps_schedule = linear_schedule(
         | 
| 113 | 
            +
                        exploration_initial_eps,
         | 
| 114 | 
            +
                        exploration_final_eps,
         | 
| 115 | 
            +
                        end_fraction=exploration_fraction,
         | 
| 116 | 
            +
                    )
         | 
| 117 | 
            +
             | 
| 118 | 
            +
                    self.max_grad_norm = max_grad_norm
         | 
| 119 | 
            +
             | 
| 120 | 
            +
                def learn(
         | 
| 121 | 
            +
                    self: DQNSelf, total_timesteps: int, callback: Optional[Callback] = None
         | 
| 122 | 
            +
                ) -> DQNSelf:
         | 
| 123 | 
            +
                    self.policy.train(True)
         | 
| 124 | 
            +
                    obs = self.env.reset()
         | 
| 125 | 
            +
                    obs = self._collect_rollout(self.learning_starts, obs, 1)
         | 
| 126 | 
            +
                    learning_steps = total_timesteps - self.learning_starts
         | 
| 127 | 
            +
                    timesteps_elapsed = 0
         | 
| 128 | 
            +
                    steps_since_target_update = 0
         | 
| 129 | 
            +
                    while timesteps_elapsed < learning_steps:
         | 
| 130 | 
            +
                        progress = timesteps_elapsed / learning_steps
         | 
| 131 | 
            +
                        eps = self.exploration_eps_schedule(progress)
         | 
| 132 | 
            +
                        obs = self._collect_rollout(self.train_freq, obs, eps)
         | 
| 133 | 
            +
                        rollout_steps = self.train_freq
         | 
| 134 | 
            +
                        timesteps_elapsed += rollout_steps
         | 
| 135 | 
            +
                        for _ in range(
         | 
| 136 | 
            +
                            self.gradient_steps if self.gradient_steps > 0 else self.train_freq
         | 
| 137 | 
            +
                        ):
         | 
| 138 | 
            +
                            self.train()
         | 
| 139 | 
            +
                        steps_since_target_update += rollout_steps
         | 
| 140 | 
            +
                        if steps_since_target_update >= self.target_update_interval:
         | 
| 141 | 
            +
                            self._update_target()
         | 
| 142 | 
            +
                            steps_since_target_update = 0
         | 
| 143 | 
            +
                        if callback:
         | 
| 144 | 
            +
                            callback.on_step(timesteps_elapsed=rollout_steps)
         | 
| 145 | 
            +
                    return self
         | 
| 146 | 
            +
             | 
| 147 | 
            +
                def train(self) -> None:
         | 
| 148 | 
            +
                    if len(self.replay_buffer) < self.batch_size:
         | 
| 149 | 
            +
                        return
         | 
| 150 | 
            +
                    o, a, r, d, next_o = self.replay_buffer.sample(self.batch_size)
         | 
| 151 | 
            +
                    o = torch.as_tensor(o, device=self.device)
         | 
| 152 | 
            +
                    a = torch.as_tensor(a, device=self.device).unsqueeze(1)
         | 
| 153 | 
            +
                    r = torch.as_tensor(r, dtype=torch.float32, device=self.device)
         | 
| 154 | 
            +
                    d = torch.as_tensor(d, dtype=torch.long, device=self.device)
         | 
| 155 | 
            +
                    next_o = torch.as_tensor(next_o, device=self.device)
         | 
| 156 | 
            +
             | 
| 157 | 
            +
                    with torch.no_grad():
         | 
| 158 | 
            +
                        target = r + (1 - d) * self.gamma * self.target_q_net(next_o).max(1).values
         | 
| 159 | 
            +
                    current = self.policy.q_net(o).gather(dim=1, index=a).squeeze(1)
         | 
| 160 | 
            +
                    loss = F.smooth_l1_loss(current, target)
         | 
| 161 | 
            +
             | 
| 162 | 
            +
                    self.optimizer.zero_grad()
         | 
| 163 | 
            +
                    loss.backward()
         | 
| 164 | 
            +
                    if self.max_grad_norm:
         | 
| 165 | 
            +
                        nn.utils.clip_grad_norm_(self.policy.q_net.parameters(), self.max_grad_norm)
         | 
| 166 | 
            +
                    self.optimizer.step()
         | 
| 167 | 
            +
             | 
| 168 | 
            +
                def _collect_rollout(self, timesteps: int, obs: VecEnvObs, eps: float) -> VecEnvObs:
         | 
| 169 | 
            +
                    for _ in range(0, timesteps, self.env.num_envs):
         | 
| 170 | 
            +
                        action = self.policy.act(obs, eps, deterministic=False)
         | 
| 171 | 
            +
                        next_obs, reward, done, _ = self.env.step(action)
         | 
| 172 | 
            +
                        self.replay_buffer.add(obs, action, reward, done, next_obs)
         | 
| 173 | 
            +
                        obs = next_obs
         | 
| 174 | 
            +
                    return obs
         | 
| 175 | 
            +
             | 
| 176 | 
            +
                def _update_target(self) -> None:
         | 
| 177 | 
            +
                    for target_param, param in zip(
         | 
| 178 | 
            +
                        self.target_q_net.parameters(), self.policy.q_net.parameters()
         | 
| 179 | 
            +
                    ):
         | 
| 180 | 
            +
                        target_param.data.copy_(
         | 
| 181 | 
            +
                            self.tau * param.data + (1 - self.tau) * target_param.data
         | 
| 182 | 
            +
                        )
         | 
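DQN.train above computes the one-step Bellman target r + (1 - done) * gamma * max_a Q_target(s', a) and regresses the online Q-network toward it with a smooth L1 loss. A minimal sketch of that target computation with invented values:

    import torch

    gamma = 0.99
    rewards = torch.tensor([1.0, 0.0, 1.0])  # hypothetical batch of 3 transitions
    dones = torch.tensor([0.0, 1.0, 0.0])    # the second transition ended its episode
    next_q = torch.tensor([[0.5, 1.5],       # Q_target(s', a) for 2 discrete actions
                           [2.0, 0.1],
                           [0.3, 0.2]])

    # Same expression as in DQN.train: terminal transitions contribute only the reward.
    target = rewards + (1 - dones) * gamma * next_q.max(1).values
    # -> tensor([2.4850, 0.0000, 1.2970])

With the default tau of 1.0, _update_target is a hard copy of the online weights into the target network; values below 1.0 turn it into a Polyak-averaged soft update.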
    	
        dqn/policy.py
    ADDED
    
    | @@ -0,0 +1,37 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            import numpy as np
         | 
| 2 | 
            +
            import os
         | 
| 3 | 
            +
            import torch
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            from stable_baselines3.common.vec_env.base_vec_env import VecEnv, VecEnvObs
         | 
| 6 | 
            +
            from typing import Sequence, TypeVar
         | 
| 7 | 
            +
             | 
| 8 | 
            +
            from dqn.q_net import QNetwork
         | 
| 9 | 
            +
            from shared.policy.policy import Policy
         | 
| 10 | 
            +
             | 
| 11 | 
            +
            DQNPolicySelf = TypeVar("DQNPolicySelf", bound="DQNPolicy")
         | 
| 12 | 
            +
             | 
| 13 | 
            +
             | 
| 14 | 
            +
            class DQNPolicy(Policy):
         | 
| 15 | 
            +
                def __init__(
         | 
| 16 | 
            +
                    self,
         | 
| 17 | 
            +
                    env: VecEnv,
         | 
| 18 | 
            +
                    hidden_sizes: Sequence[int],
         | 
| 19 | 
            +
                    **kwargs,
         | 
| 20 | 
            +
                ) -> None:
         | 
| 21 | 
            +
                    super().__init__(env, **kwargs)
         | 
| 22 | 
            +
                    self.q_net = QNetwork(env.observation_space, env.action_space, hidden_sizes)
         | 
| 23 | 
            +
             | 
| 24 | 
            +
                def act(
         | 
| 25 | 
            +
                    self, obs: VecEnvObs, eps: float = 0, deterministic: bool = True
         | 
| 26 | 
            +
                ) -> np.ndarray:
         | 
| 27 | 
            +
                    assert eps == 0 if deterministic else eps >= 0
         | 
| 28 | 
            +
                    if not deterministic and np.random.random() < eps:
         | 
| 29 | 
            +
                        return np.array(
         | 
| 30 | 
            +
                            [self.env.action_space.sample() for _ in range(self.env.num_envs)]
         | 
| 31 | 
            +
                        )
         | 
| 32 | 
            +
                    else:
         | 
| 33 | 
            +
                        with torch.no_grad():
         | 
| 34 | 
            +
                            obs_th = torch.as_tensor(np.array(obs))
         | 
| 35 | 
            +
                            if self.device:
         | 
| 36 | 
            +
                                obs_th = obs_th.to(self.device)
         | 
| 37 | 
            +
                            return self.q_net(obs_th).argmax(axis=1).cpu().numpy()
         | 
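DQNPolicy.act is an epsilon-greedy rule: with probability eps it samples a random action per environment, otherwise it takes the argmax over the Q-network's outputs. A self-contained numpy sketch of the same selection logic (values illustrative):

    import numpy as np

    rng = np.random.default_rng(0)
    eps = 0.1
    q_values = np.array([[0.2, 1.3, -0.5]])  # one env, three discrete actions

    if rng.random() < eps:
        # Exploration branch: uniform random action for each env.
        action = rng.integers(q_values.shape[1], size=q_values.shape[0])
    else:
        # Exploitation branch, mirroring q_net(obs).argmax(axis=1).
        action = q_values.argmax(axis=1)
    # action == array([1]) unless the 10% exploration branch fires.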
    	
        dqn/q_net.py
    ADDED
    
    | @@ -0,0 +1,29 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            import gym
         | 
| 2 | 
            +
            import torch as th
         | 
| 3 | 
            +
            import torch.nn as nn
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            from gym.spaces import Discrete
         | 
| 6 | 
            +
            from typing import Sequence, Type
         | 
| 7 | 
            +
             | 
| 8 | 
            +
            from shared.module import FeatureExtractor, mlp
         | 
| 9 | 
            +
             | 
| 10 | 
            +
             | 
| 11 | 
            +
            class QNetwork(nn.Module):
         | 
| 12 | 
            +
                def __init__(
         | 
| 13 | 
            +
                    self,
         | 
| 14 | 
            +
                    observation_space: gym.Space,
         | 
| 15 | 
            +
                    action_space: gym.Space,
         | 
| 16 | 
            +
                    hidden_sizes: Sequence[int],
         | 
| 17 | 
            +
                    activation: Type[nn.Module] = nn.ReLU,  # Used by stable-baselines3
         | 
| 18 | 
            +
                ) -> None:
         | 
| 19 | 
            +
                    super().__init__()
         | 
| 20 | 
            +
                    assert isinstance(action_space, Discrete)
         | 
| 21 | 
            +
                    self._feature_extractor = FeatureExtractor(observation_space, activation)
         | 
| 22 | 
            +
                    layer_sizes = (
         | 
| 23 | 
            +
                        (self._feature_extractor.out_dim,) + tuple(hidden_sizes) + (action_space.n,)
         | 
| 24 | 
            +
                    )
         | 
| 25 | 
            +
                    self._fc = mlp(layer_sizes, activation)
         | 
| 26 | 
            +
             | 
| 27 | 
            +
                def forward(self, obs: th.Tensor) -> th.Tensor:
         | 
| 28 | 
            +
                    x = self._feature_extractor(obs)
         | 
| 29 | 
            +
                    return self._fc(x)
         | 
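QNetwork delegates to FeatureExtractor and mlp from shared/module.py, which are outside this section. As an assumption-labeled sketch, an mlp helper with this call signature typically stacks Linear layers with the activation between hidden layers and a bare linear output:

    import torch.nn as nn
    from typing import Sequence, Type

    def mlp(layer_sizes: Sequence[int], activation: Type[nn.Module]) -> nn.Sequential:
        # Hypothetical reconstruction; the actual helper lives in shared/module.py.
        layers = []
        for in_dim, out_dim in zip(layer_sizes[:-1], layer_sizes[1:]):
            layers.append(nn.Linear(in_dim, out_dim))
            layers.append(activation())
        layers.pop()  # no activation after the output layer (raw Q-values)
        return nn.Sequential(*layers)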
    	
        enjoy.py
    ADDED
    
    | @@ -0,0 +1,105 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            # Support for PyTorch mps mode (https://pytorch.org/docs/stable/notes/mps.html)
         | 
| 2 | 
            +
            import os
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
         | 
| 5 | 
            +
             | 
| 6 | 
            +
            import shutil
         | 
| 7 | 
            +
            import yaml
         | 
| 8 | 
            +
             | 
| 9 | 
            +
            from dataclasses import dataclass
         | 
| 10 | 
            +
            from typing import Optional
         | 
| 11 | 
            +
             | 
| 12 | 
            +
            from runner.env import make_eval_env
         | 
| 13 | 
            +
            from runner.config import Config, RunArgs
         | 
| 14 | 
            +
            from runner.running_utils import (
         | 
| 15 | 
            +
                base_parser,
         | 
| 16 | 
            +
                load_hyperparams,
         | 
| 17 | 
            +
                set_seeds,
         | 
| 18 | 
            +
                get_device,
         | 
| 19 | 
            +
                make_policy,
         | 
| 20 | 
            +
            )
         | 
| 21 | 
            +
            from shared.callbacks.eval_callback import evaluate
         | 
| 22 | 
            +
             | 
| 23 | 
            +
             | 
| 24 | 
            +
            @dataclass
         | 
| 25 | 
            +
            class EvalArgs(RunArgs):
         | 
| 26 | 
            +
                render: bool = True
         | 
| 27 | 
            +
                best: bool = True
         | 
| 28 | 
            +
                n_envs: int = 1
         | 
| 29 | 
            +
                n_episodes: int = 3
         | 
| 30 | 
            +
                deterministic: Optional[bool] = None
         | 
| 31 | 
            +
                wandb_run_path: Optional[str] = None
         | 
| 32 | 
            +
             | 
| 33 | 
            +
             | 
| 34 | 
            +
            if __name__ == "__main__":
         | 
| 35 | 
            +
                parser = base_parser()
         | 
| 36 | 
            +
                parser.add_argument("--render", default=True, type=bool)
         | 
| 37 | 
            +
                parser.add_argument("--best", default=True, type=bool)
         | 
| 38 | 
            +
                parser.add_argument("--n_envs", default=1, type=int)
         | 
| 39 | 
            +
                parser.add_argument("--n_episodes", default=3, type=int)
         | 
| 40 | 
            +
                parser.add_argument("--deterministic", default=None, type=bool)
         | 
| 41 | 
            +
                parser.add_argument("--wandb-run-path", default=None, type=str)
         | 
| 42 | 
            +
                parser.set_defaults(
         | 
| 43 | 
            +
                    wandb_run_path="sgoodfriend/rl-algo-impls/sfi78a3t",
         | 
| 44 | 
            +
                )
         | 
| 45 | 
            +
                args = EvalArgs(**vars(parser.parse_args()))
         | 
| 46 | 
            +
             | 
| 47 | 
            +
                if args.wandb_run_path:
         | 
| 48 | 
            +
                    import wandb
         | 
| 49 | 
            +
             | 
| 50 | 
            +
                    api = wandb.Api()
         | 
| 51 | 
            +
                    run = api.run(args.wandb_run_path)
         | 
| 52 | 
            +
                    hyperparams = run.config
         | 
| 53 | 
            +
             | 
| 54 | 
            +
                    args.algo = hyperparams["algo"]
         | 
| 55 | 
            +
                    args.env = hyperparams["env"]
         | 
| 56 | 
            +
                    args.use_deterministic_algorithms = hyperparams.get(
         | 
| 57 | 
            +
                        "use_deterministic_algorithms", True
         | 
| 58 | 
            +
                    )
         | 
| 59 | 
            +
             | 
| 60 | 
            +
                    config = Config(args, hyperparams, os.path.dirname(__file__))
         | 
| 61 | 
            +
                    model_path = config.model_dir_path(best=args.best, downloaded=True)
         | 
| 62 | 
            +
             | 
| 63 | 
            +
                    model_archive_name = config.model_dir_name(best=args.best, extension=".zip")
         | 
| 64 | 
            +
                    run.file(model_archive_name).download()
         | 
| 65 | 
            +
                    if os.path.isdir(model_path):
         | 
| 66 | 
            +
                        shutil.rmtree(model_path)
         | 
| 67 | 
            +
                    shutil.unpack_archive(model_archive_name, model_path)
         | 
| 68 | 
            +
                    os.remove(model_archive_name)
         | 
| 69 | 
            +
                else:
         | 
| 70 | 
            +
                    hyperparams = load_hyperparams(args.algo, args.env, os.path.dirname(__file__))
         | 
| 71 | 
            +
             | 
| 72 | 
            +
                    config = Config(args, hyperparams, os.path.dirname(__file__))
         | 
| 73 | 
            +
                    model_path = config.model_dir_path(best=args.best)
         | 
| 74 | 
            +
             | 
| 75 | 
            +
                print(args)
         | 
| 76 | 
            +
             | 
| 77 | 
            +
                set_seeds(args.seed, args.use_deterministic_algorithms)
         | 
| 78 | 
            +
             | 
| 79 | 
            +
                env = make_eval_env(
         | 
| 80 | 
            +
                    config,
         | 
| 81 | 
            +
                    override_n_envs=args.n_envs,
         | 
| 82 | 
            +
                    render=args.render,
         | 
| 83 | 
            +
                    normalize_load_path=model_path,
         | 
| 84 | 
            +
                    **config.env_hyperparams,
         | 
| 85 | 
            +
                )
         | 
| 86 | 
            +
                device = get_device(config.device, env)
         | 
| 87 | 
            +
                policy = make_policy(
         | 
| 88 | 
            +
                    args.algo,
         | 
| 89 | 
            +
                    env,
         | 
| 90 | 
            +
                    device,
         | 
| 91 | 
            +
                    load_path=model_path,
         | 
| 92 | 
            +
                    **config.policy_hyperparams,
         | 
| 93 | 
            +
                ).eval()
         | 
| 94 | 
            +
             | 
| 95 | 
            +
                if args.deterministic is None:
         | 
| 96 | 
            +
                    deterministic = config.eval_params.get("deterministic", True)
         | 
| 97 | 
            +
                else:
         | 
| 98 | 
            +
                    deterministic = args.deterministic
         | 
| 99 | 
            +
                evaluate(
         | 
| 100 | 
            +
                    env,
         | 
| 101 | 
            +
                    policy,
         | 
| 102 | 
            +
                    args.n_episodes,
         | 
| 103 | 
            +
                    render=args.render,
         | 
| 104 | 
            +
                    deterministic=deterministic,
         | 
| 105 | 
            +
                )
         | 
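One caveat with the boolean flags above: argparse's type=bool applies bool() to the raw argument string, so any non-empty value (including "False") parses as True, and the flags effectively keep their defaults. A common workaround, sketched here rather than taken from the repo, is a small string-to-bool converter:

    def str2bool(value: str) -> bool:
        # Hypothetical helper, not part of this commit.
        return value.lower() in ("true", "1", "yes")

    # e.g. parser.add_argument("--render", default=True, type=str2bool)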
    	
        environment.yml
    ADDED
    
    | @@ -0,0 +1,17 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            name: rl_algo_impls
         | 
| 2 | 
            +
            channels:
         | 
| 3 | 
            +
              - pytorch
         | 
| 4 | 
            +
              - conda-forge
         | 
| 5 | 
            +
              - nodefaults
         | 
| 6 | 
            +
            dependencies:
         | 
| 7 | 
            +
              - python=3.10.*
         | 
| 8 | 
            +
              - mamba
         | 
| 9 | 
            +
              - pip
         | 
| 10 | 
            +
              - poetry
         | 
| 11 | 
            +
              - pytorch
         | 
| 12 | 
            +
              - torchvision
         | 
| 13 | 
            +
              - torchaudio
         | 
| 14 | 
            +
              - cmake
         | 
| 15 | 
            +
              - swig
         | 
| 16 | 
            +
              - ipywidgets
         | 
| 17 | 
            +
              - black
         | 
    	

hyperparams/dqn.yml ADDED
@@ -0,0 +1,117 @@
CartPole-v1: &cartpole-defaults
  n_timesteps: !!float 5e4
  env_hyperparams:
    n_envs: 1
    rolling_length: 50
  policy_hyperparams:
    hidden_sizes: [256, 256]
  algo_hyperparams:
    learning_rate: !!float 2.3e-3
    batch_size: 64
    buffer_size: 100000
    learning_starts: 1000
    gamma: 0.99
    target_update_interval: 10
    train_freq: 256
    gradient_steps: 128
    exploration_fraction: 0.16
    exploration_final_eps: 0.04
  eval_params:
    step_freq: !!float 1e4
    n_episodes: 10
    save_best: true

CartPole-v0:
  <<: *cartpole-defaults
  n_timesteps: !!float 4e4

MountainCar-v0:
  n_timesteps: !!float 1.2e5
  env_hyperparams:
    rolling_length: 50
  policy_hyperparams:
    hidden_sizes: [256, 256]
  algo_hyperparams:
    learning_rate: !!float 4e-3
    batch_size: 128
    buffer_size: 10000
    learning_starts: 1000
    gamma: 0.98
    target_update_interval: 600
    train_freq: 16
    gradient_steps: 8
    exploration_fraction: 0.2
    exploration_final_eps: 0.07

Acrobot-v1:
  n_timesteps: !!float 1e5
  env_hyperparams:
    rolling_length: 10
  policy_hyperparams:
    hidden_sizes: [256, 256]
  algo_hyperparams:
    learning_rate: !!float 6.3e-4
    batch_size: 128
    buffer_size: 50000
    learning_starts: 0
    gamma: 0.99
    target_update_interval: 250
    train_freq: 4
    gradient_steps: -1
    exploration_fraction: 0.12
    exploration_final_eps: 0.1

LunarLander-v2:
  n_timesteps: !!float 5e5
  env_hyperparams:
    rolling_length: 10
  policy_hyperparams:
    hidden_sizes: [256, 256]
  algo_hyperparams:
    learning_rate: !!float 1e-4
    batch_size: 256
    buffer_size: 100000
    learning_starts: 10000
    gamma: 0.99
    target_update_interval: 250
    train_freq: 8
    gradient_steps: -1
    exploration_fraction: 0.12
    exploration_final_eps: 0.1
    max_grad_norm: 0.5
  eval_params:
    step_freq: 25_000
    n_episodes: 10
    save_best: true

SpaceInvadersNoFrameskip-v4: &atari-defaults
  n_timesteps: !!float 1e7
  env_hyperparams:
    frame_stack: 4
    no_reward_timeout_steps: 1_000
    n_envs: 8
    vec_env_class: "subproc"
    rolling_length: 20
  policy_hyperparams:
    hidden_sizes: [512]
  algo_hyperparams:
    buffer_size: 100000
    learning_rate: !!float 1e-4
    batch_size: 32
    learning_starts: 100000
    target_update_interval: 1000
    train_freq: 8
    gradient_steps: 2
    exploration_fraction: 0.1
    exploration_final_eps: 0.01
  eval_params:
    step_freq: 100_000
    n_episodes: 10
    save_best: true

BreakoutNoFrameskip-v4:
  <<: *atari-defaults

PongNoFrameskip-v4:
  <<: *atari-defaults
  n_timesteps: !!float 2.5e6
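
Note (not part of the commit): the hyperparameter files in this commit lean on YAML anchors and merge keys. For example, CartPole-v0 above pulls in everything from the &cartpole-defaults anchor on CartPole-v1 and only overrides n_timesteps. A minimal sketch of how that resolves, assuming the file is read with PyYAML; the repo's own config loader is not shown in this excerpt:

# Illustration only (not part of the commit): how the YAML merge key resolves.
# Assumes PyYAML's safe_load; the repo's actual loading code is not in this diff.
import yaml

with open("hyperparams/dqn.yml") as f:
    hyperparams = yaml.safe_load(f)

cartpole_v0 = hyperparams["CartPole-v0"]
# algo_hyperparams is inherited wholesale from the &cartpole-defaults anchor ...
assert cartpole_v0["algo_hyperparams"]["batch_size"] == 64
# ... while the explicit key on CartPole-v0 overrides the merged n_timesteps.
assert cartpole_v0["n_timesteps"] == 4e4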

hyperparams/ppo.yml ADDED
@@ -0,0 +1,202 @@
CartPole-v1: &cartpole-defaults
  n_timesteps: !!float 1e5
  env_hyperparams:
    n_envs: 8
  algo_hyperparams:
    n_steps: 32
    batch_size: 256
    n_epochs: 20
    gae_lambda: 0.8
    gamma: 0.98
    ent_coef: 0.0
    learning_rate: 0.001
    learning_rate_decay: linear
    clip_range: 0.2
    clip_range_decay: linear
  eval_params:
    step_freq: !!float 2.5e4
    n_episodes: 10
    save_best: true

CartPole-v0:
  <<: *cartpole-defaults
  n_timesteps: !!float 5e4

MountainCar-v0:
  n_timesteps: !!float 1e6
  env_hyperparams:
    normalize: true
    n_envs: 16
  algo_hyperparams:
    n_steps: 16
    n_epochs: 4
    gae_lambda: 0.98
    gamma: 0.99
    ent_coef: 0.0

MountainCarContinuous-v0:
  n_timesteps: !!float 1e5
  env_hyperparams:
    normalize: true
    n_envs: 4
  policy_hyperparams:
    init_layers_orthogonal: false
    # log_std_init: -3.29
  algo_hyperparams:
    n_steps: 512
    batch_size: 256
    n_epochs: 10
    learning_rate: !!float 7.77e-5
    ent_coef: 0.01 # 0.00429
    ent_coef_decay: linear
    clip_range: 0.1
    gae_lambda: 0.9
    max_grad_norm: 5
    vf_coef: 0.19
    # use_sde: true
  eval_params:
    step_freq: 5000
    n_episodes: 10
    save_best: true

Acrobot-v1:
  n_timesteps: !!float 1e6
  env_hyperparams:
    n_envs: 16
    normalize: true
  algo_hyperparams:
    n_steps: 256
    n_epochs: 4
    gae_lambda: 0.94
    gamma: 0.99
    ent_coef: 0.0

LunarLander-v2:
  n_timesteps: !!float 1e6
  env_hyperparams:
    n_envs: 16
  algo_hyperparams:
    n_steps: 1024
    batch_size: 64
    n_epochs: 4
    gae_lambda: 0.98
    gamma: 0.999
    ent_coef: 0.01
    ent_coef_decay: linear
    normalize_advantage: false
  eval_params:
    step_freq: !!float 5e4
    n_episodes: 10
    save_best: true

CarRacing-v0:
  n_timesteps: !!float 4e6
  env_hyperparams:
    n_envs: 8
    frame_stack: 4
  policy_hyperparams:
    use_sde: true
    log_std_init: -2
    init_layers_orthogonal: false
    activation_fn: relu
    share_features_extractor: false
    cnn_feature_dim: 256
  algo_hyperparams:
    n_steps: 512
    batch_size: 128
    n_epochs: 10
    learning_rate: !!float 1e-4
    learning_rate_decay: linear
    gamma: 0.99
    gae_lambda: 0.95
    ent_coef: 0.0
    sde_sample_freq: 4
    max_grad_norm: 0.5
    vf_coef: 0.5
    clip_range: 0.2

# BreakoutNoFrameskip-v4
# PongNoFrameskip-v4
# SpaceInvadersNoFrameskip-v4
# QbertNoFrameskip-v4
atari: &atari-defaults
  n_timesteps: !!float 1e7
  policy_hyperparams:
    activation_fn: relu
  env_hyperparams: &atari-env-defaults
    n_envs: 8
    frame_stack: 4
    no_reward_timeout_steps: 1000
    no_reward_fire_steps: 500
    vec_env_class: subproc
  algo_hyperparams:
    n_steps: 128
    batch_size: 256
    n_epochs: 4
    learning_rate: !!float 2.5e-4
    learning_rate_decay: linear
    clip_range: 0.1
    clip_range_decay: linear
    vf_coef: 0.5
    ent_coef: 0.01
  eval_params:
    deterministic: false

HalfCheetahBulletEnv-v0: &pybullet-defaults
  n_timesteps: !!float 2e6
  env_hyperparams: &pybullet-env-defaults
    n_envs: 16
    normalize: true
  policy_hyperparams: &pybullet-policy-defaults
    pi_hidden_sizes: [256, 256]
    v_hidden_sizes: [256, 256]
    activation_fn: relu
  algo_hyperparams: &pybullet-algo-defaults
    n_steps: 512
    batch_size: 128
    n_epochs: 20
    gamma: 0.99
    gae_lambda: 0.9
    ent_coef: 0.0
    sde_sample_freq: 4
    max_grad_norm: 0.5
    vf_coef: 0.5
    learning_rate: !!float 3e-5
    clip_range: 0.4

AntBulletEnv-v0:
  <<: *pybullet-defaults
  policy_hyperparams:
    <<: *pybullet-policy-defaults
  algo_hyperparams:
    <<: *pybullet-algo-defaults

Walker2DBulletEnv-v0:
  <<: *pybullet-defaults
  algo_hyperparams:
    <<: *pybullet-algo-defaults
    clip_range_decay: linear

HopperBulletEnv-v0:
  <<: *pybullet-defaults
  algo_hyperparams:
    <<: *pybullet-algo-defaults
    clip_range_decay: linear

HumanoidBulletEnv-v0:
  <<: *pybullet-defaults
  n_timesteps: !!float 1e7
  env_hyperparams:
    <<: *pybullet-env-defaults
    n_envs: 8
  policy_hyperparams:
    <<: *pybullet-policy-defaults
    # log_std_init: -1
  algo_hyperparams:
    <<: *pybullet-algo-defaults
    n_steps: 2048
    batch_size: 64
    n_epochs: 10
    gae_lambda: 0.95
    learning_rate: !!float 2.5e-4
    clip_range: 0.2
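
Note (not part of the commit): several PPO entries above set learning_rate_decay, clip_range_decay, or ent_coef_decay to "linear". The schedule helpers live in shared/schedule.py, which this excerpt does not include, so the sketch below is an assumption about their behavior, consistent with how ppo/ppo.py (later in this diff) builds them via linear_schedule(start, 0) and constant_schedule(value) and presumably evaluates them at a training-progress fraction:

# Sketch only: assumed semantics for shared/schedule.py, which is not shown in this diff.
from typing import Callable

Schedule = Callable[[float], float]


def constant_schedule(value: float) -> Schedule:
    # Same value regardless of training progress.
    return lambda progress_fraction: value


def linear_schedule(start_val: float, end_val: float) -> Schedule:
    # Interpolate from start_val (progress 0.0) to end_val (progress 1.0).
    return lambda progress_fraction: start_val + progress_fraction * (end_val - start_val)


# Example: clip_range 0.2 with clip_range_decay: linear is 0.1 halfway through training.
assert abs(linear_schedule(0.2, 0.0)(0.5) - 0.1) < 1e-12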

hyperparams/vpg.yml ADDED
@@ -0,0 +1,157 @@
CartPole-v1: &cartpole-defaults
  n_timesteps: !!float 4e5
  policy_hyperparams:
    hidden_sizes: [32]
  algo_hyperparams:
    steps_per_epoch: 4096
    pi_lr: 0.01
    gamma: 0.99
    lam: 1
    val_lr: 0.01
    train_v_iters: 80
  eval_params:
    step_freq: !!float 2.5e4
    n_episodes: 10
    save_best: true

CartPole-v0:
  <<: *cartpole-defaults
  n_timesteps: !!float 1e5
  algo_hyperparams:
    steps_per_epoch: 1024
    pi_lr: 0.01
    gamma: 0.99
    lam: 1
    val_lr: 0.01
    train_v_iters: 80

Acrobot-v1:
  n_timesteps: !!float 2e5
  policy_hyperparams:
    hidden_sizes: [32, 32]
  algo_hyperparams:
    steps_per_epoch: 2048
    pi_lr: 0.005
    gamma: 0.99
    lam: 0.97
    val_lr: 0.01
    train_v_iters: 80
    max_grad_norm: 0.5
  eval_params:
    step_freq: !!float 4e4
    n_episodes: 10
    save_best: true

LunarLander-v2:
  n_timesteps: !!float 4e6
  policy_hyperparams:
    hidden_sizes: [256, 256]
  algo_hyperparams:
    steps_per_epoch: 2048
    pi_lr: 0.0001
    gamma: 0.999
    lam: 0.97
    val_lr: 0.0001
    train_v_iters: 80
    max_grad_norm: 0.5
  eval_params:
    step_freq: !!float 5e4
    n_episodes: 10
    save_best: true

CarRacing-v0:
  n_timesteps: !!float 4e6
  env_hyperparams:
    frame_stack: 4
    n_envs: 4
    vec_env_class: "dummy"
  policy_hyperparams:
    hidden_sizes: [256, 256]
  algo_hyperparams:
    steps_per_epoch: 4000
    pi_lr: !!float 7e-5
    gamma: 0.99
    lam: 0.95
    val_lr: !!float 1e-4
    train_v_iters: 40
    max_grad_norm: 0.5
  eval_params:
    step_freq: !!float 5e4
    n_episodes: 10
    save_best: true

HalfCheetahBulletEnv-v0: &pybullet-defaults
  n_timesteps: !!float 2e6
  policy_hyperparams:
    hidden_sizes: [64, 64]
    init_layers_orthogonal: true
  algo_hyperparams:
    steps_per_epoch: 4000
    pi_lr: !!float 3e-4
    gamma: 0.99
    lam: 0.97
    val_lr: !!float 1e-3
    train_v_iters: 80
    max_grad_norm: 0.5
  eval_params:
    step_freq: !!float 1e5
    n_episodes: 10
    save_best: true

HopperBulletEnv-v0:
  <<: *pybullet-defaults

AntBulletEnv-v0:
  <<: *pybullet-defaults
  policy_hyperparams:
    hidden_sizes: [400, 300]
  algo_hyperparams:
    pi_lr: !!float 7e-4
    gamma: 0.99
    lam: 0.97
    val_lr: !!float 7e-3
    train_v_iters: 80
    max_grad_norm: 0.5

FrozenLake-v1:
  n_timesteps: !!float 8e5
  env_params:
    make_kwargs:
      map_name: 8x8
      is_slippery: true
  policy_hyperparams:
    hidden_sizes: [64]
  algo_hyperparams:
    steps_per_epoch: 2048
    pi_lr: 0.01
    gamma: 0.99
    lam: 0.98
    val_lr: 0.01
    train_v_iters: 80
    max_grad_norm: 0.5
  eval_params:
    step_freq: !!float 5e4
    n_episodes: 10
    save_best: true

SpaceInvadersNoFrameskip-v4: &atari-defaults
  n_timesteps: !!float 1e7
  env_hyperparams:
    frame_stack: 4
    no_reward_timeout_steps: 1_000
    n_envs: 8
    vec_env_class: "subproc"
  policy_hyperparams:
    hidden_sizes: [256, 256]
  algo_hyperparams:
    steps_per_epoch: 4096
    pi_lr: !!float 1e-4
    gamma: 0.99
    lam: 0.95
    val_lr: !!float 2e-4
    train_v_iters: 80
    max_grad_norm: 0.5
  eval_params:
    step_freq: !!float 1e5
    n_episodes: 10
    save_best: true

lambda_labs/benchmark.sh ADDED
@@ -0,0 +1,33 @@
source benchmarks/train_loop.sh

# export WANDB_PROJECT_NAME="rl-algo-impls"
export VIRTUAL_DISPLAY=1

BENCHMARK_MAX_PROCS="${BENCHMARK_MAX_PROCS:-6}"

ALGOS=(
    # "vpg"
    # "dqn"
    "ppo"
)
ENVS=(
    # Basic
    "CartPole-v1"
    "MountainCar-v0"
    "MountainCarContinuous-v0"
    "Acrobot-v1"
    "LunarLander-v2"
    # PyBullet
    "HalfCheetahBulletEnv-v0"
    "AntBulletEnv-v0"
    "Walker2DBulletEnv-v0"
    "HopperBulletEnv-v0"
    # CarRacing
    "CarRacing-v0"
    # Atari
    "PongNoFrameskip-v4"
    "BreakoutNoFrameskip-v4"
    "SpaceInvadersNoFrameskip-v4"
    "QbertNoFrameskip-v4"
)
train_loop "${ALGOS[*]}" "${ENVS[*]}" | xargs -I CMD -P $BENCHMARK_MAX_PROCS bash -c CMD
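
Note (not part of the commit): the final line above pipes the training commands emitted by train_loop (defined in benchmarks/train_loop.sh, not shown here) into xargs -P, which keeps at most BENCHMARK_MAX_PROCS of them running at once. A rough Python analogue of that fan-out, for readers unfamiliar with the xargs idiom; the command strings below are hypothetical stand-ins, not the exact output of train_loop:

# Illustration only: a Python analogue of `... | xargs -I CMD -P $BENCHMARK_MAX_PROCS bash -c CMD`.
import subprocess
from concurrent.futures import ThreadPoolExecutor

BENCHMARK_MAX_PROCS = 6

# Hypothetical stand-ins for the command lines train_loop would emit.
commands = [
    "python train.py --algo ppo --env CartPole-v1",
    "python train.py --algo ppo --env MountainCar-v0",
]


def run(cmd: str) -> int:
    # shell=True mirrors `bash -c CMD`: each command gets its own shell.
    return subprocess.run(cmd, shell=True).returncode


with ThreadPoolExecutor(max_workers=BENCHMARK_MAX_PROCS) as pool:
    return_codes = list(pool.map(run, commands))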

lambda_labs/lambda_requirements.txt ADDED
@@ -0,0 +1,9 @@
scipy >= 1.10.0, < 1.11
tensorboard >= 2.11.0, < 2.12
AutoROM.accept-rom-license >= 0.4.2, < 0.5
stable-baselines3[extra] >= 1.7.0, < 1.8
gym[box2d] >= 0.21.0, < 0.22
pyglet == 1.5.27
wandb >= 0.13.9, < 0.14
pyvirtualdisplay == 3.0
pybullet >= 3.2.5, < 3.3

lambda_labs/setup.sh ADDED
@@ -0,0 +1,10 @@
sudo apt update
sudo apt install -y python-opengl
sudo apt install -y ffmpeg
sudo apt install -y xvfb
sudo apt install -y swig

python3 -m pip install --upgrade pip
pip install --upgrade torch torchvision torchaudio

pip install --upgrade -r ~/rl-algo-impls/lambda_labs/lambda_requirements.txt

poetry.lock ADDED
The diff for this file is too large to render. See raw diff

ppo/policy.py ADDED
@@ -0,0 +1,36 @@
from stable_baselines3.common.vec_env.base_vec_env import VecEnv
from typing import Optional, Sequence

from gym.spaces import Box, Discrete
from shared.policy.on_policy import ActorCritic


class PPOActorCritic(ActorCritic):
    def __init__(
        self,
        env: VecEnv,
        pi_hidden_sizes: Optional[Sequence[int]] = None,
        v_hidden_sizes: Optional[Sequence[int]] = None,
        **kwargs,
    ) -> None:
        obs_space = env.observation_space
        if isinstance(obs_space, Box):
            if len(obs_space.shape) == 3:
                pi_hidden_sizes = pi_hidden_sizes or []
                v_hidden_sizes = v_hidden_sizes or []
            elif len(obs_space.shape) == 1:
                pi_hidden_sizes = pi_hidden_sizes or [64, 64]
                v_hidden_sizes = v_hidden_sizes or [64, 64]
            else:
                raise ValueError(f"Unsupported observation space: {obs_space}")
        elif isinstance(obs_space, Discrete):
            pi_hidden_sizes = pi_hidden_sizes or [64]
            v_hidden_sizes = v_hidden_sizes or [64]
        else:
            raise ValueError(f"Unsupported observation space: {obs_space}")
        super().__init__(
            env,
            pi_hidden_sizes,
            v_hidden_sizes,
            **kwargs,
        )
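
Note (not part of the commit): PPOActorCritic only fills in network widths when the caller does not supply them: no extra MLP layers for 3-D (image) Box observations, [64, 64] for flat Box observations, and [64] for Discrete observations. The standalone restatement below is for illustration; the helper name default_hidden_sizes is invented here and does not exist in the repo:

# Illustration only: restates PPOActorCritic's default-selection logic outside the class.
from typing import List

from gym.spaces import Box, Discrete, Space


def default_hidden_sizes(obs_space: Space) -> List[int]:
    if isinstance(obs_space, Box):
        if len(obs_space.shape) == 3:
            return []  # image observations: no extra MLP layers by default
        if len(obs_space.shape) == 1:
            return [64, 64]  # flat observations: two 64-unit hidden layers
        raise ValueError(f"Unsupported observation space: {obs_space}")
    if isinstance(obs_space, Discrete):
        return [64]
    raise ValueError(f"Unsupported observation space: {obs_space}")


# A CartPole-style 4-dimensional observation space gets the [64, 64] default.
assert default_hidden_sizes(Box(low=-1.0, high=1.0, shape=(4,))) == [64, 64]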

ppo/ppo.py ADDED
@@ -0,0 +1,367 @@
import numpy as np
import torch
import torch.nn as nn

from dataclasses import asdict, dataclass
from torch.optim import Adam
from stable_baselines3.common.vec_env.base_vec_env import VecEnv, VecEnvObs
from torch.utils.tensorboard.writer import SummaryWriter
from typing import List, Optional, Sequence, NamedTuple, TypeVar

from shared.algorithm import Algorithm
from shared.callbacks.callback import Callback
from shared.policy.on_policy import ActorCritic
from shared.schedule import constant_schedule, linear_schedule
from shared.trajectory import Trajectory as BaseTrajectory
from shared.utils import discounted_cumsum


@dataclass
class PPOTrajectory(BaseTrajectory):
    logp_a: List[float]
    next_obs: Optional[np.ndarray]

    def __init__(self) -> None:
        super().__init__()
        self.logp_a = []
        self.next_obs = None

    def add(
        self,
        obs: np.ndarray,
        act: np.ndarray,
        next_obs: np.ndarray,
        rew: float,
        terminated: bool,
        v: float,
        logp_a: float,
    ):
        super().add(obs, act, rew, v)
        self.next_obs = next_obs if not terminated else None
        self.terminated = terminated
        self.logp_a.append(logp_a)


class TrajectoryAccumulator:
    def __init__(self, num_envs: int) -> None:
        self.num_envs = num_envs

        self.trajectories_ = []
        self.current_trajectories_ = [PPOTrajectory() for _ in range(num_envs)]

    def step(
        self,
        obs: VecEnvObs,
        action: np.ndarray,
        next_obs: VecEnvObs,
        reward: np.ndarray,
        done: np.ndarray,
        val: np.ndarray,
        logp_a: np.ndarray,
    ) -> None:
        assert isinstance(obs, np.ndarray)
        assert isinstance(next_obs, np.ndarray)
        for i, trajectory in enumerate(self.current_trajectories_):
            # TODO: Eventually take advantage of terminated/truncated differentiation in
            # later versions of gym.
            trajectory.add(
                obs[i], action[i], next_obs[i], reward[i], done[i], val[i], logp_a[i]
            )
            if done[i]:
                self.trajectories_.append(trajectory)
                self.current_trajectories_[i] = PPOTrajectory()

    @property
    def all_trajectories(self) -> List[PPOTrajectory]:
        return self.trajectories_ + list(
            filter(lambda t: len(t), self.current_trajectories_)
        )


class RtgAdvantage(NamedTuple):
    rewards_to_go: torch.Tensor
    advantage: torch.Tensor


class TrainStepStats(NamedTuple):
    loss: float
    pi_loss: float
    v_loss: float
    entropy_loss: float
    approx_kl: float
    clipped_frac: float


@dataclass
class TrainStats:
    loss: float
    pi_loss: float
    v_loss: float
    entropy_loss: float
    approx_kl: float
    clipped_frac: float

    def __init__(self, step_stats: List[TrainStepStats]) -> None:
        self.loss = np.mean([s.loss for s in step_stats]).item()
        self.pi_loss = np.mean([s.pi_loss for s in step_stats]).item()
        self.v_loss = np.mean([s.v_loss for s in step_stats]).item()
        self.entropy_loss = np.mean([s.entropy_loss for s in step_stats]).item()
        self.approx_kl = np.mean([s.approx_kl for s in step_stats]).item()
        self.clipped_frac = np.mean([s.clipped_frac for s in step_stats]).item()

    def write_to_tensorboard(self, tb_writer: SummaryWriter, global_step: int) -> None:
        tb_writer.add_scalars("losses", asdict(self), global_step=global_step)

    def __repr__(self) -> str:
        return " | ".join(
            [
                f"Loss: {round(self.loss, 2)}",
                f"Pi L: {round(self.pi_loss, 2)}",
                f"V L: {round(self.v_loss, 2)}",
                f"E L: {round(self.entropy_loss, 2)}",
                f"Apx KL Div: {round(self.approx_kl, 2)}",
                f"Clip Frac: {round(self.clipped_frac, 2)}",
            ]
        )


PPOSelf = TypeVar("PPOSelf", bound="PPO")


class PPO(Algorithm):
    def __init__(
        self,
        policy: ActorCritic,
        env: VecEnv,
        device: torch.device,
        tb_writer: SummaryWriter,
        learning_rate: float = 3e-4,
        learning_rate_decay: str = "none",
        n_steps: int = 2048,
        batch_size: int = 64,
        n_epochs: int = 10,
        gamma: float = 0.99,
        gae_lambda: float = 0.95,
        clip_range: float = 0.2,
        clip_range_decay: str = "none",
        clip_range_vf: Optional[float] = None,
        clip_range_vf_decay: str = "none",
        normalize_advantage: bool = True,
        ent_coef: float = 0.0,
        ent_coef_decay: str = "none",
        vf_coef: float = 0.5,
        max_grad_norm: float = 0.5,
        update_rtg_between_epochs: bool = False,
        sde_sample_freq: int = -1,
    ) -> None:
        super().__init__(policy, env, device, tb_writer)
        self.policy = policy

        self.gamma = gamma
        self.gae_lambda = gae_lambda
        self.optimizer = Adam(self.policy.parameters(), lr=learning_rate)
        self.lr_schedule = (
            linear_schedule(learning_rate, 0)
            if learning_rate_decay == "linear"
            else constant_schedule(learning_rate)
        )
        self.max_grad_norm = max_grad_norm
        self.clip_range_schedule = (
            linear_schedule(clip_range, 0)
            if clip_range_decay == "linear"
            else constant_schedule(clip_range)
        )
        self.clip_range_vf_schedule = None
        if clip_range_vf:
            self.clip_range_vf_schedule = (
                linear_schedule(clip_range_vf, 0)
                if clip_range_vf_decay == "linear"
                else constant_schedule(clip_range_vf)
            )
        self.normalize_advantage = normalize_advantage
        self.ent_coef_schedule = (
            linear_schedule(ent_coef, 0)
            if ent_coef_decay == "linear"
            else constant_schedule(ent_coef)
        )
        self.vf_coef = vf_coef

        self.n_steps = n_steps
        self.batch_size = batch_size
        self.n_epochs = n_epochs
        self.sde_sample_freq = sde_sample_freq

        self.update_rtg_between_epochs = update_rtg_between_epochs

    def learn(
        self: PPOSelf,
        total_timesteps: int,
        callback: Optional[Callback] = None,
    ) -> PPOSelf:
        obs = self.env.reset()
        ts_elapsed = 0
         | 
| 203 | 
            +
                    while ts_elapsed < total_timesteps:
         | 
| 204 | 
            +
                        accumulator = self._collect_trajectories(obs)
         | 
| 205 | 
            +
                        progress = ts_elapsed / total_timesteps
         | 
| 206 | 
            +
                        train_stats = self.train(accumulator.all_trajectories, progress)
         | 
| 207 | 
            +
                        rollout_steps = self.n_steps * self.env.num_envs
         | 
| 208 | 
            +
                        ts_elapsed += rollout_steps
         | 
| 209 | 
            +
                        train_stats.write_to_tensorboard(self.tb_writer, ts_elapsed)
         | 
| 210 | 
            +
                        if callback:
         | 
| 211 | 
            +
                            callback.on_step(timesteps_elapsed=rollout_steps)
         | 
| 212 | 
            +
             | 
| 213 | 
            +
                    return self
         | 
| 214 | 
            +
             | 
| 215 | 
            +
                def _collect_trajectories(self, obs: VecEnvObs) -> TrajectoryAccumulator:
         | 
| 216 | 
            +
                    self.policy.eval()
         | 
| 217 | 
            +
                    accumulator = TrajectoryAccumulator(self.env.num_envs)
         | 
| 218 | 
            +
                    self.policy.reset_noise()
         | 
| 219 | 
            +
                    for i in range(self.n_steps):
         | 
| 220 | 
            +
                        if self.sde_sample_freq > 0 and i > 0 and i % self.sde_sample_freq == 0:
         | 
| 221 | 
            +
                            self.policy.reset_noise()
         | 
| 222 | 
            +
                        action, value, logp_a, clamped_action = self.policy.step(obs)
         | 
| 223 | 
            +
                        next_obs, reward, done, _ = self.env.step(clamped_action)
         | 
| 224 | 
            +
                        accumulator.step(obs, action, next_obs, reward, done, value, logp_a)
         | 
| 225 | 
            +
                        obs = next_obs
         | 
| 226 | 
            +
                    return accumulator
         | 
| 227 | 
            +
             | 
| 228 | 
            +
                def train(self, trajectories: List[PPOTrajectory], progress: float) -> TrainStats:
         | 
| 229 | 
            +
                    self.policy.train()
         | 
| 230 | 
            +
                    learning_rate = self.lr_schedule(progress)
         | 
| 231 | 
            +
                    self.optimizer.param_groups[0]["lr"] = learning_rate
         | 
| 232 | 
            +
             | 
| 233 | 
            +
                    pi_clip = self.clip_range_schedule(progress)
         | 
| 234 | 
            +
                    v_clip = (
         | 
| 235 | 
            +
                        self.clip_range_vf_schedule(progress)
         | 
| 236 | 
            +
                        if self.clip_range_vf_schedule
         | 
| 237 | 
            +
                        else None
         | 
| 238 | 
            +
                    )
         | 
| 239 | 
            +
                    ent_coef = self.ent_coef_schedule(progress)
         | 
| 240 | 
            +
             | 
| 241 | 
            +
                    obs = torch.as_tensor(
         | 
| 242 | 
            +
                        np.concatenate([np.array(t.obs) for t in trajectories]), device=self.device
         | 
| 243 | 
            +
                    )
         | 
| 244 | 
            +
                    act = torch.as_tensor(
         | 
| 245 | 
            +
                        np.concatenate([np.array(t.act) for t in trajectories]), device=self.device
         | 
| 246 | 
            +
                    )
         | 
| 247 | 
            +
                    rtg, adv = self._compute_rtg_and_advantage(trajectories)
         | 
| 248 | 
            +
                    orig_v = torch.as_tensor(
         | 
| 249 | 
            +
                        np.concatenate([np.array(t.v) for t in trajectories]), device=self.device
         | 
| 250 | 
            +
                    )
         | 
| 251 | 
            +
                    orig_logp_a = torch.as_tensor(
         | 
| 252 | 
            +
                        np.concatenate([np.array(t.logp_a) for t in trajectories]),
         | 
| 253 | 
            +
                        device=self.device,
         | 
| 254 | 
            +
                    )
         | 
| 255 | 
            +
             | 
| 256 | 
            +
                    step_stats = []
         | 
| 257 | 
            +
                    for _ in range(self.n_epochs):
         | 
| 258 | 
            +
                        if self.update_rtg_between_epochs:
         | 
| 259 | 
            +
                            rtg, adv = self._compute_rtg_and_advantage(trajectories)
         | 
| 260 | 
            +
                        else:
         | 
| 261 | 
            +
                            adv = self._compute_advantage(trajectories)
         | 
| 262 | 
            +
                        idxs = torch.randperm(len(obs))
         | 
| 263 | 
            +
                        for i in range(0, len(obs), self.batch_size):
         | 
| 264 | 
            +
                            mb_idxs = idxs[i : i + self.batch_size]
         | 
| 265 | 
            +
                            mb_adv = adv[mb_idxs]
         | 
| 266 | 
            +
                            if self.normalize_advantage:
         | 
| 267 | 
            +
                                mb_adv = (mb_adv - mb_adv.mean(-1)) / (mb_adv.std(-1) + 1e-8)
         | 
| 268 | 
            +
                            step_stats.append(
         | 
| 269 | 
            +
                                self._train_step(
         | 
| 270 | 
            +
                                    pi_clip,
         | 
| 271 | 
            +
                                    v_clip,
         | 
| 272 | 
            +
                                    ent_coef,
         | 
| 273 | 
            +
                                    obs[mb_idxs],
         | 
| 274 | 
            +
                                    act[mb_idxs],
         | 
| 275 | 
            +
                                    rtg[mb_idxs],
         | 
| 276 | 
            +
                                    mb_adv,
         | 
| 277 | 
            +
                                    orig_v[mb_idxs],
         | 
| 278 | 
            +
                                    orig_logp_a[mb_idxs],
         | 
| 279 | 
            +
                                )
         | 
| 280 | 
            +
                            )
         | 
| 281 | 
            +
             | 
| 282 | 
            +
                    return TrainStats(step_stats)
         | 
| 283 | 
            +
             | 
| 284 | 
            +
                def _train_step(
         | 
| 285 | 
            +
                    self,
         | 
| 286 | 
            +
                    pi_clip: float,
         | 
| 287 | 
            +
                    v_clip: Optional[float],
         | 
| 288 | 
            +
                    ent_coef: float,
         | 
| 289 | 
            +
                    obs: torch.Tensor,
         | 
| 290 | 
            +
                    act: torch.Tensor,
         | 
| 291 | 
            +
                    rtg: torch.Tensor,
         | 
| 292 | 
            +
                    adv: torch.Tensor,
         | 
| 293 | 
            +
                    orig_v: torch.Tensor,
         | 
| 294 | 
            +
                    orig_logp_a: torch.Tensor,
         | 
| 295 | 
            +
                ) -> TrainStepStats:
         | 
| 296 | 
            +
                    logp_a, entropy, v = self.policy(obs, act)
         | 
| 297 | 
            +
                    logratio = logp_a - orig_logp_a
         | 
| 298 | 
            +
                    ratio = torch.exp(logratio)
         | 
| 299 | 
            +
                    clip_ratio = torch.clamp(ratio, min=1 - pi_clip, max=1 + pi_clip)
         | 
| 300 | 
            +
                    pi_loss = torch.maximum(-ratio * adv, -clip_ratio * adv).mean()
         | 
| 301 | 
            +
             | 
| 302 | 
            +
                    v_loss = (v - rtg).pow(2)
         | 
| 303 | 
            +
                    if v_clip:
         | 
| 304 | 
            +
                        v_clipped = (torch.clamp(v, orig_v - v_clip, orig_v + v_clip) - rtg).pow(2)
         | 
| 305 | 
            +
                        v_loss = torch.maximum(v_loss, v_clipped)
         | 
| 306 | 
            +
                    v_loss = v_loss.mean()
         | 
| 307 | 
            +
             | 
| 308 | 
            +
                    entropy_loss = entropy.mean()
         | 
| 309 | 
            +
             | 
| 310 | 
            +
                    loss = pi_loss - ent_coef * entropy_loss + self.vf_coef * v_loss
         | 
| 311 | 
            +
             | 
| 312 | 
            +
                    self.optimizer.zero_grad()
         | 
| 313 | 
            +
                    loss.backward()
         | 
| 314 | 
            +
                    nn.utils.clip_grad_norm_(self.policy.parameters(), self.max_grad_norm)
         | 
| 315 | 
            +
                    self.optimizer.step()
         | 
| 316 | 
            +
             | 
| 317 | 
            +
                    with torch.no_grad():
         | 
| 318 | 
            +
                        approx_kl = ((ratio - 1) - logratio).mean().cpu().numpy().item()
         | 
| 319 | 
            +
                        clipped_frac = (
         | 
| 320 | 
            +
                            ((ratio - 1).abs() > pi_clip).float().mean().cpu().numpy().item()
         | 
| 321 | 
            +
                        )
         | 
| 322 | 
            +
                    return TrainStepStats(
         | 
| 323 | 
            +
                        loss.item(),
         | 
| 324 | 
            +
                        pi_loss.item(),
         | 
| 325 | 
            +
                        v_loss.item(),
         | 
| 326 | 
            +
                        entropy_loss.item(),
         | 
| 327 | 
            +
                        approx_kl,
         | 
| 328 | 
            +
                        clipped_frac,
         | 
| 329 | 
            +
                    )
         | 
| 330 | 
            +
             | 
| 331 | 
            +
                def _compute_advantage(self, trajectories: Sequence[PPOTrajectory]) -> torch.Tensor:
         | 
| 332 | 
            +
                    advantage = []
         | 
| 333 | 
            +
                    for traj in trajectories:
         | 
| 334 | 
            +
                        last_val = 0
         | 
| 335 | 
            +
                        if not traj.terminated and traj.next_obs is not None:
         | 
| 336 | 
            +
                            last_val = self.policy.value(np.array(traj.next_obs))
         | 
| 337 | 
            +
                        rew = np.append(np.array(traj.rew), last_val)
         | 
| 338 | 
            +
                        v = np.append(np.array(traj.v), last_val)
         | 
| 339 | 
            +
                        deltas = rew[:-1] + self.gamma * v[1:] - v[:-1]
         | 
| 340 | 
            +
                        advantage.append(discounted_cumsum(deltas, self.gamma * self.gae_lambda))
         | 
| 341 | 
            +
                    return torch.as_tensor(
         | 
| 342 | 
            +
                        np.concatenate(advantage), dtype=torch.float32, device=self.device
         | 
| 343 | 
            +
                    )
         | 
| 344 | 
            +
             | 
| 345 | 
            +
                def _compute_rtg_and_advantage(
         | 
| 346 | 
            +
                    self, trajectories: Sequence[PPOTrajectory]
         | 
| 347 | 
            +
                ) -> RtgAdvantage:
         | 
| 348 | 
            +
                    rewards_to_go = []
         | 
| 349 | 
            +
                    advantages = []
         | 
| 350 | 
            +
                    for traj in trajectories:
         | 
| 351 | 
            +
                        last_val = 0
         | 
| 352 | 
            +
                        if not traj.terminated and traj.next_obs is not None:
         | 
| 353 | 
            +
                            last_val = self.policy.value(np.array(traj.next_obs))
         | 
| 354 | 
            +
                        rew = np.append(np.array(traj.rew), last_val)
         | 
| 355 | 
            +
                        v = np.append(np.array(traj.v), last_val)
         | 
| 356 | 
            +
                        deltas = rew[:-1] + self.gamma * v[1:] - v[:-1]
         | 
| 357 | 
            +
                        adv = discounted_cumsum(deltas, self.gamma * self.gae_lambda)
         | 
| 358 | 
            +
                        advantages.append(adv)
         | 
| 359 | 
            +
                        rewards_to_go.append(v[:-1] + adv)
         | 
| 360 | 
            +
                    return RtgAdvantage(
         | 
| 361 | 
            +
                        torch.as_tensor(
         | 
| 362 | 
            +
                            np.concatenate(rewards_to_go), dtype=torch.float32, device=self.device
         | 
| 363 | 
            +
                        ),
         | 
| 364 | 
            +
                        torch.as_tensor(
         | 
| 365 | 
            +
                            np.concatenate(advantages), dtype=torch.float32, device=self.device
         | 
| 366 | 
            +
                        ),
         | 
| 367 | 
            +
                    )
         | 
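As a rough usage sketch of how these pieces compose (not part of this commit): the constructor call `PPOActorCritic(env)` is an assumption inferred from `make_policy` in runner/running_utils.py, and the env and writer path here are placeholders. Each `learn` iteration collects `n_steps * num_envs` transitions, runs `n_epochs` of minibatch clipped-surrogate updates, and writes the resulting `TrainStats` to TensorBoard.

# Hypothetical usage sketch -- not part of this commit.
import gym
import torch
from stable_baselines3.common.vec_env.dummy_vec_env import DummyVecEnv
from torch.utils.tensorboard.writer import SummaryWriter

from ppo.policy import PPOActorCritic  # constructor signature assumed: (env, **kwargs)
from ppo.ppo import PPO

env = DummyVecEnv([lambda: gym.make("Acrobot-v1")])  # placeholder env choice
policy = PPOActorCritic(env)
algo = PPO(policy, env, torch.device("cpu"), SummaryWriter("runs/ppo-sketch"))
algo.learn(total_timesteps=100_000)  # rollout, train, and log losses each iteration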
    	
pyproject.toml ADDED
@@ -0,0 +1,27 @@
[tool.poetry]
name = "rl-algo-impls"
version = "0.1.0"
description = "Implementations of reinforcement learning algorithms"
authors = ["Scott Goodfriend <goodfriend.scott@gmail.com>"]
license = "MIT License"
readme = "README.md"
packages = [{include = "rl_algo_impls"}]

[tool.poetry.dependencies]
python = "~3.10"
"AutoROM.accept-rom-license" = "^0.4.2"
stable-baselines3 = {extras = ["extra"], version = "^1.7.0"}
scipy = "^1.10.0"
gym = {extras = ["box2d"], version = "^0.21.0"}
pyglet = "1.5.27"
PyYAML = "^6.0"
tensorboard = "^2.11.0"
pybullet = "^3.2.5"
wandb = "^0.13.9"
conda-lock = "^1.3.0"
torch-tb-profiler = "^0.4.1"
jupyter = "^1.0.0"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
    	
replay.meta.json ADDED
@@ -0,0 +1 @@
{"content_type": "video/mp4", "encoder_version": {"backend": "ffmpeg", "version": "b'ffmpeg version 5.1.2 Copyright (c) 2000-2022 the FFmpeg developers\\nbuilt with clang version 14.0.6\\nconfiguration: --prefix=/Users/runner/miniforge3/conda-bld/ffmpeg_1671040513231/_h_env_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_pl --cc=arm64-apple-darwin20.0.0-clang --cxx=arm64-apple-darwin20.0.0-clang++ --nm=arm64-apple-darwin20.0.0-nm --ar=arm64-apple-darwin20.0.0-ar --disable-doc --disable-openssl --enable-demuxer=dash --enable-hardcoded-tables --enable-libfreetype --enable-libfontconfig --enable-libopenh264 --enable-cross-compile --arch=arm64 --target-os=darwin --cross-prefix=arm64-apple-darwin20.0.0- --host-cc=/Users/runner/miniforge3/conda-bld/ffmpeg_1671040513231/_build_env/bin/x86_64-apple-darwin13.4.0-clang --enable-neon --enable-gnutls --enable-libmp3lame --enable-libvpx --enable-pthreads --enable-gpl --enable-libx264 --enable-libx265 --enable-libaom --enable-libsvtav1 --enable-libxml2 --enable-pic --enable-shared --disable-static --enable-version3 --enable-zlib --pkg-config=/Users/runner/miniforge3/conda-bld/ffmpeg_1671040513231/_build_env/bin/pkg-config\\nlibavutil      57. 28.100 / 57. 28.100\\nlibavcodec     59. 37.100 / 59. 37.100\\nlibavformat    59. 27.100 / 59. 27.100\\nlibavdevice    59.  7.100 / 59.  7.100\\nlibavfilter     8. 44.100 /  8. 44.100\\nlibswscale      6.  7.100 /  6.  7.100\\nlibswresample   4.  7.100 /  4.  7.100\\nlibpostproc    56.  6.100 / 56.  6.100\\n'", "cmdline": ["ffmpeg", "-nostats", "-loglevel", "error", "-y", "-f", "rawvideo", "-s:v", "500x500", "-pix_fmt", "rgb24", "-framerate", "30", "-i", "-", "-vf", "scale=trunc(iw/2)*2:trunc(ih/2)*2", "-vcodec", "libx264", "-pix_fmt", "yuv420p", "-r", "30", "/var/folders/9g/my5557_91xddp6lx00nkzly80000gn/T/tmp7t2v9jcd/ppo-Acrobot-v1/replay.mp4"]}, "episode": {"r": -73.0, "l": 74, "t": 1.272925}}
    	
replay.mp4 ADDED
Binary file (62.5 kB)
    	
runner/config.py ADDED
@@ -0,0 +1,130 @@
import os

from datetime import datetime
from dataclasses import dataclass
from typing import Any, Dict, Optional, TypedDict, Union


@dataclass
class RunArgs:
    algo: str
    env: str
    seed: Optional[int] = None
    use_deterministic_algorithms: bool = True


class Hyperparams(TypedDict, total=False):
    device: str
    n_timesteps: Union[int, float]
    env_hyperparams: Dict[str, Any]
    policy_hyperparams: Dict[str, Any]
    algo_hyperparams: Dict[str, Any]
    eval_params: Dict[str, Any]


@dataclass
class Config:
    args: RunArgs
    hyperparams: Hyperparams
    root_dir: str
    run_id: str = datetime.now().isoformat()

    def seed(self, training: bool = True) -> Optional[int]:
        seed = self.args.seed
        if training or seed is None:
            return seed
        return seed + self.env_hyperparams.get("n_envs", 1)

    @property
    def device(self) -> str:
        return self.hyperparams.get("device", "auto")

    @property
    def n_timesteps(self) -> int:
        return int(self.hyperparams.get("n_timesteps", 100_000))

    @property
    def env_hyperparams(self) -> Dict[str, Any]:
        return self.hyperparams.get("env_hyperparams", {})

    @property
    def policy_hyperparams(self) -> Dict[str, Any]:
        return self.hyperparams.get("policy_hyperparams", {})

    @property
    def algo_hyperparams(self) -> Dict[str, Any]:
        return self.hyperparams.get("algo_hyperparams", {})

    @property
    def eval_params(self) -> Dict[str, Any]:
        return self.hyperparams.get("eval_params", {})

    @property
    def env_id(self) -> str:
        return self.args.env

    @property
    def model_name(self) -> str:
        parts = [self.args.algo, self.env_id]
        if self.args.seed is not None:
            parts.append(f"S{self.args.seed}")
        make_kwargs = self.env_hyperparams.get("make_kwargs", {})
        if make_kwargs:
            for k, v in make_kwargs.items():
                if type(v) == bool and v:
                    parts.append(k)
                elif type(v) == int and v:
                    parts.append(f"{k}{v}")
                else:
                    parts.append(str(v))
        return "-".join(parts)

    @property
    def run_name(self) -> str:
        parts = [self.model_name, self.run_id]
        return "-".join(parts)

    @property
    def saved_models_dir(self) -> str:
        return os.path.join(self.root_dir, "saved_models")

    @property
    def downloaded_models_dir(self) -> str:
        return os.path.join(self.root_dir, "downloaded_models")

    def model_dir_name(
        self,
        best: bool = False,
        extension: str = "",
    ) -> str:
        return self.model_name + ("-best" if best else "") + extension

    def model_dir_path(self, best: bool = False, downloaded: bool = False) -> str:
        return os.path.join(
            self.saved_models_dir if not downloaded else self.downloaded_models_dir,
            self.model_dir_name(best=best),
        )

    @property
    def runs_dir(self) -> str:
        return os.path.join(self.root_dir, "runs")

    @property
    def tensorboard_summary_path(self) -> str:
        return os.path.join(self.runs_dir, self.run_name)

    @property
    def logs_path(self) -> str:
        return os.path.join(self.runs_dir, f"log.yml")

    @property
    def videos_dir(self) -> str:
        return os.path.join(self.root_dir, "videos")

    @property
    def video_prefix(self) -> str:
        return os.path.join(self.videos_dir, self.model_name)

    @property
    def best_videos_dir(self) -> str:
        return os.path.join(self.videos_dir, f"{self.model_name}-best")
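To illustrate how Config derives names and paths from RunArgs and the loaded hyperparameters, here is a small sketch (not part of the commit; the hyperparameter values are placeholders):

# Illustrative sketch -- not part of this commit.
from runner.config import Config, RunArgs

args = RunArgs(algo="ppo", env="Acrobot-v1", seed=4)
hyperparams = {"n_timesteps": 100_000, "env_hyperparams": {"n_envs": 4}}
config = Config(args, hyperparams, root_dir=".")

config.model_name                 # "ppo-Acrobot-v1-S4"
config.model_dir_path(best=True)  # "./saved_models/ppo-Acrobot-v1-S4-best"
config.seed(training=False)       # 8: eval seed is offset by n_envs
config.tensorboard_summary_path   # "./runs/ppo-Acrobot-v1-S4-<run_id>"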
    	
runner/env.py ADDED
@@ -0,0 +1,134 @@
import gym
import os

from gym.wrappers.resize_observation import ResizeObservation
from gym.wrappers.gray_scale_observation import GrayScaleObservation
from gym.wrappers.frame_stack import FrameStack
from stable_baselines3.common.atari_wrappers import (
    MaxAndSkipEnv,
    NoopResetEnv,
)
from stable_baselines3.common.vec_env.base_vec_env import VecEnv
from stable_baselines3.common.vec_env.dummy_vec_env import DummyVecEnv
from stable_baselines3.common.vec_env.subproc_vec_env import SubprocVecEnv
from stable_baselines3.common.vec_env.vec_normalize import VecNormalize
from torch.utils.tensorboard.writer import SummaryWriter
from typing import Any, Callable, Dict, Optional, Union

from runner.config import Config
from shared.policy.policy import VEC_NORMALIZE_FILENAME
from wrappers.atari_wrappers import EpisodicLifeEnv, FireOnLifeStarttEnv, ClipRewardEnv
from wrappers.episode_record_video import EpisodeRecordVideo
from wrappers.episode_stats_writer import EpisodeStatsWriter
from wrappers.initial_step_truncate_wrapper import InitialStepTruncateWrapper
from wrappers.video_compat_wrapper import VideoCompatWrapper


def make_env(
    config: Config,
    training: bool = True,
    render: bool = False,
    normalize_load_path: Optional[str] = None,
    n_envs: int = 1,
    frame_stack: int = 1,
    make_kwargs: Optional[Dict[str, Any]] = None,
    no_reward_timeout_steps: Optional[int] = None,
    no_reward_fire_steps: Optional[int] = None,
    vec_env_class: str = "dummy",
    normalize: bool = False,
    normalize_kwargs: Optional[Dict[str, Any]] = None,
    tb_writer: Optional[SummaryWriter] = None,
    rolling_length: int = 100,
    train_record_video: bool = False,
    video_step_interval: Union[int, float] = 1_000_000,
    initial_steps_to_truncate: Optional[int] = None,
) -> VecEnv:
    if "BulletEnv" in config.env_id:
        import pybullet_envs

    make_kwargs = make_kwargs if make_kwargs is not None else {}
    if "BulletEnv" in config.env_id and render:
        make_kwargs["render"] = True
    if "CarRacing" in config.env_id:
        make_kwargs["verbose"] = 0

    spec = gym.spec(config.env_id)

    def make(idx: int) -> Callable[[], gym.Env]:
        def _make() -> gym.Env:
            env = gym.make(config.env_id, **make_kwargs)
            env = gym.wrappers.RecordEpisodeStatistics(env)
            env = VideoCompatWrapper(env)
            if training and train_record_video and idx == 0:
                env = EpisodeRecordVideo(
                    env,
                    config.video_prefix,
                    step_increment=n_envs,
                    video_step_interval=int(video_step_interval),
                )
            if training and initial_steps_to_truncate:
                env = InitialStepTruncateWrapper(
                    env, idx * initial_steps_to_truncate // n_envs
                )
            if "AtariEnv" in spec.entry_point:  # type: ignore
                env = NoopResetEnv(env, noop_max=30)
                env = MaxAndSkipEnv(env, skip=4)
                env = EpisodicLifeEnv(env, training=training)
                action_meanings = env.unwrapped.get_action_meanings()
                if "FIRE" in action_meanings:  # type: ignore
                    env = FireOnLifeStarttEnv(env, action_meanings.index("FIRE"))
                env = ClipRewardEnv(env, training=training)
                env = ResizeObservation(env, (84, 84))
                env = GrayScaleObservation(env, keep_dim=False)
                env = FrameStack(env, frame_stack)
            elif "CarRacing" in config.env_id:
                env = ResizeObservation(env, (64, 64))
                env = GrayScaleObservation(env, keep_dim=False)
                env = FrameStack(env, frame_stack)

            if no_reward_timeout_steps:
                from wrappers.no_reward_timeout import NoRewardTimeout

                env = NoRewardTimeout(
                    env, no_reward_timeout_steps, n_fire_steps=no_reward_fire_steps
                )

            seed = config.seed(training=training)
            if seed is not None:
                env.seed(seed + idx)
                env.action_space.seed(seed + idx)
                env.observation_space.seed(seed + idx)

            return env

        return _make

    VecEnvClass = {"dummy": DummyVecEnv, "subproc": SubprocVecEnv}[vec_env_class]
    venv = VecEnvClass([make(i) for i in range(n_envs)])
    if training:
        assert tb_writer
        venv = EpisodeStatsWriter(
            venv, tb_writer, training=training, rolling_length=rolling_length
        )
    if normalize:
        if normalize_load_path:
            venv = VecNormalize.load(
                os.path.join(normalize_load_path, VEC_NORMALIZE_FILENAME), venv
            )
        else:
            venv = VecNormalize(venv, training=training, **(normalize_kwargs or {}))
        if not training:
            venv.norm_reward = False
    return venv


def make_eval_env(
    config: Config, override_n_envs: Optional[int] = None, **kwargs
) -> VecEnv:
    kwargs = kwargs.copy()
    kwargs["training"] = False
    if override_n_envs is not None:
        kwargs["n_envs"] = override_n_envs
        if override_n_envs == 1:
            kwargs["vec_env_class"] = "dummy"
    return make_env(config, **kwargs)
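A minimal sketch of how make_env and make_eval_env might be called (not part of this commit; the env choice and writer path are placeholders). Note that training=True requires a SummaryWriter because the vectorized env is wrapped in EpisodeStatsWriter:

# Illustrative sketch -- not part of this commit.
from torch.utils.tensorboard.writer import SummaryWriter

from runner.config import Config, RunArgs
from runner.env import make_env, make_eval_env

config = Config(RunArgs(algo="ppo", env="Acrobot-v1", seed=4), {}, root_dir=".")
tb_writer = SummaryWriter(config.tensorboard_summary_path)

# Training env: 4 copies in a DummyVecEnv, episode stats logged to TensorBoard.
train_env = make_env(config, training=True, n_envs=4, tb_writer=tb_writer)
# Eval env: single env, training=False, so no stats writer is required.
eval_env = make_eval_env(config, override_n_envs=1)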
    	
        runner/running_utils.py
    ADDED
    
    | @@ -0,0 +1,188 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            import argparse
         | 
| 2 | 
            +
            import gym
         | 
| 3 | 
            +
            import json
         | 
| 4 | 
            +
            import matplotlib.pyplot as plt
         | 
| 5 | 
            +
            import numpy as np
         | 
| 6 | 
            +
            import os
         | 
| 7 | 
            +
            import random
         | 
| 8 | 
            +
            import torch
         | 
| 9 | 
            +
            import torch.backends.cudnn
         | 
| 10 | 
            +
            import yaml
         | 
| 11 | 
            +
             | 
| 12 | 
            +
            from gym.spaces import Box, Discrete
         | 
| 13 | 
            +
            from stable_baselines3.common.vec_env.base_vec_env import VecEnv
         | 
| 14 | 
            +
            from torch.utils.tensorboard.writer import SummaryWriter
         | 
| 15 | 
            +
            from typing import Dict, Optional, Type, Union
         | 
| 16 | 
            +
             | 
| 17 | 
            +
            from runner.config import Hyperparams
         | 
| 18 | 
            +
            from shared.algorithm import Algorithm
         | 
| 19 | 
            +
            from shared.callbacks.eval_callback import EvalCallback
         | 
| 20 | 
            +
            from shared.policy.policy import Policy
         | 
| 21 | 
            +
             | 
| 22 | 
            +
            from dqn.dqn import DQN
         | 
| 23 | 
            +
            from dqn.policy import DQNPolicy
         | 
| 24 | 
            +
            from vpg.vpg import VanillaPolicyGradient
         | 
| 25 | 
            +
            from vpg.policy import VPGActorCritic
         | 
| 26 | 
            +
            from ppo.ppo import PPO
         | 
| 27 | 
            +
            from ppo.policy import PPOActorCritic
         | 
| 28 | 
            +
             | 
| 29 | 
            +
            ALGOS: Dict[str, Type[Algorithm]] = {
         | 
| 30 | 
            +
                "dqn": DQN,
         | 
| 31 | 
            +
                "vpg": VanillaPolicyGradient,
         | 
| 32 | 
            +
                "ppo": PPO,
         | 
| 33 | 
            +
            }
         | 
| 34 | 
            +
            POLICIES: Dict[str, Type[Policy]] = {
         | 
| 35 | 
            +
                "dqn": DQNPolicy,
         | 
| 36 | 
            +
                "vpg": VPGActorCritic,
         | 
| 37 | 
            +
                "ppo": PPOActorCritic,
         | 
| 38 | 
            +
            }
         | 
| 39 | 
            +
             | 
| 40 | 
            +
            HYPERPARAMS_PATH = "hyperparams"
         | 
| 41 | 
            +
             | 
| 42 | 
            +
             | 
| 43 | 
            +
            def base_parser() -> argparse.ArgumentParser:
         | 
| 44 | 
            +
                parser = argparse.ArgumentParser()
         | 
| 45 | 
            +
                parser.add_argument(
         | 
| 46 | 
            +
                    "--algo",
         | 
| 47 | 
            +
                    default="dqn",
         | 
| 48 | 
            +
                    type=str,
         | 
| 49 | 
            +
                    choices=list(ALGOS.keys()),
         | 
| 50 | 
            +
                    nargs="+",
         | 
| 51 | 
            +
                    help="Abbreviation(s) of algorithm(s)",
         | 
| 52 | 
            +
                )
         | 
| 53 | 
            +
                parser.add_argument(
         | 
| 54 | 
            +
                    "--env",
         | 
| 55 | 
            +
                    default="CartPole-v1",
         | 
| 56 | 
            +
                    type=str,
         | 
| 57 | 
            +
                    nargs="+",
         | 
| 58 | 
            +
                    help="Name of environment(s) in gym",
         | 
| 59 | 
            +
                )
         | 
| 60 | 
            +
                parser.add_argument(
         | 
| 61 | 
            +
                    "--seed",
         | 
| 62 | 
            +
                    default=1,
         | 
| 63 | 
            +
                    type=int,
         | 
| 64 | 
            +
                    nargs="*",
         | 
| 65 | 
            +
                    help="Seeds to run experiment. Unset will do one run with no set seed",
         | 
| 66 | 
            +
                )
         | 
| 67 | 
            +
                parser.add_argument(
         | 
| 68 | 
            +
                    "--use-deterministic-algorithms",
         | 
| 69 | 
            +
                    default=True,
         | 
| 70 | 
            +
                    type=bool,
         | 
| 71 | 
            +
                    help="If seed set, set torch.use_deterministic_algorithms",
         | 
| 72 | 
            +
                )
         | 
| 73 | 
            +
                return parser
         | 
| 74 | 
            +
             | 
| 75 | 
            +
             | 
| 76 | 
            +
            def load_hyperparams(algo: str, env_id: str, root_path: str) -> Hyperparams:
         | 
| 77 | 
            +
                hyperparams_path = os.path.join(root_path, HYPERPARAMS_PATH, f"{algo}.yml")
         | 
| 78 | 
            +
                with open(hyperparams_path, "r") as f:
         | 
| 79 | 
            +
                    hyperparams_dict = yaml.safe_load(f)
         | 
| 80 | 
            +
                if "BulletEnv" in env_id:
         | 
| 81 | 
            +
                    import pybullet_envs
         | 
| 82 | 
            +
                spec = gym.spec(env_id)
         | 
| 83 | 
            +
                if env_id in hyperparams_dict:
         | 
| 84 | 
            +
                    return hyperparams_dict[env_id]
         | 
| 85 | 
            +
                elif "AtariEnv" in str(spec.entry_point) and "atari" in hyperparams_dict:
         | 
| 86 | 
            +
                    return hyperparams_dict["atari"]
         | 
| 87 | 
            +
                else:
         | 
| 88 | 
            +
                    raise ValueError(f"{env_id} not specified in {algo} hyperparameters file")
         | 
| 89 | 
            +
             | 
| 90 | 
            +
             | 
| 91 | 
            +
            def get_device(device: str, env: VecEnv) -> torch.device:
         | 
| 92 | 
            +
                # cuda by default
         | 
| 93 | 
            +
                if device == "auto":
         | 
| 94 | 
            +
                    device = "cuda"
         | 
| 95 | 
            +
                # Apple MPS is a second choice (sometimes)
         | 
| 96 | 
            +
                if device == "cuda" and not torch.cuda.is_available():
         | 
| 97 | 
            +
                    device = "mps"
         | 
| 98 | 
            +
                # If no MPS, fallback to cpu
         | 
| 99 | 
            +
                if device == "mps" and not torch.backends.mps.is_available():
         | 
| 100 | 
            +
                    device = "cpu"
         | 
| 101 | 
            +
                # Simple environments like Discreet and 1-D Boxes might also be better
         | 
| 102 | 
            +
                # served with the CPU.
         | 
| 103 | 
            +
                if device == "mps":
         | 
| 104 | 
            +
                    obs_space = env.observation_space
         | 
| 105 | 
            +
                    if isinstance(obs_space, Discrete):
         | 
| 106 | 
            +
                        device = "cpu"
         | 
| 107 | 
            +
                    elif isinstance(obs_space, Box) and len(obs_space.shape) == 1:
         | 
| 108 | 
            +
                        device = "cpu"
         | 
| 109 | 
            +
                print(f"Device: {device}")
         | 
| 110 | 
            +
                return torch.device(device)
         | 
| 111 | 
            +
             | 
| 112 | 
            +
             | 
| 113 | 
            +
            def set_seeds(seed: Optional[int], use_deterministic_algorithms: bool) -> None:
         | 
| 114 | 
            +
                if seed is None:
         | 
| 115 | 
            +
                    return
         | 
| 116 | 
            +
                random.seed(seed)
         | 
| 117 | 
            +
                np.random.seed(seed)
         | 
| 118 | 
            +
                torch.manual_seed(seed)
         | 
| 119 | 
            +
                torch.backends.cudnn.benchmark = False
         | 
| 120 | 
            +
                torch.use_deterministic_algorithms(use_deterministic_algorithms)
         | 
| 121 | 
            +
                os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
         | 
| 122 | 
            +
             | 
| 123 | 
            +
             | 
| 124 | 
            +
            def make_policy(
         | 
| 125 | 
            +
                algo: str,
         | 
| 126 | 
            +
                env: VecEnv,
         | 
| 127 | 
            +
                device: torch.device,
         | 
| 128 | 
            +
                load_path: Optional[str] = None,
         | 
| 129 | 
            +
                **kwargs,
         | 
| 130 | 
            +
            ) -> Policy:
         | 
| 131 | 
            +
                policy = POLICIES[algo](env, **kwargs).to(device)
         | 
| 132 | 
            +
                if load_path:
         | 
| 133 | 
            +
                    policy.load(load_path)
         | 
| 134 | 
            +
                return policy
         | 
| 135 | 
            +
             | 
| 136 | 
            +
             | 
| 137 | 
            +
            def plot_eval_callback(callback: EvalCallback, tb_writer: SummaryWriter, run_name: str):
         | 
| 138 | 
            +
                figure = plt.figure()
         | 
| 139 | 
            +
                cumulative_steps = [
         | 
| 140 | 
            +
                    (idx + 1) * callback.step_freq for idx in range(len(callback.stats))
         | 
| 141 | 
            +
                ]
         | 
| 142 | 
            +
                plt.plot(
         | 
| 143 | 
            +
                    cumulative_steps,
         | 
| 144 | 
            +
                    [s.score.mean for s in callback.stats],
         | 
| 145 | 
            +
                    "b-",
         | 
| 146 | 
            +
                    label="mean",
         | 
| 147 | 
            +
                )
         | 
| 148 | 
            +
                plt.plot(
         | 
| 149 | 
            +
                    cumulative_steps,
         | 
| 150 | 
            +
                    [s.score.mean - s.score.std for s in callback.stats],
         | 
| 151 | 
            +
                    "g--",
         | 
| 152 | 
            +
                    label="mean-std",
         | 
| 153 | 
            +
                )
         | 
| 154 | 
            +
                plt.fill_between(
         | 
| 155 | 
            +
                    cumulative_steps,
         | 
| 156 | 
            +
                    [s.score.min for s in callback.stats],  # type: ignore
         | 
| 157 | 
            +
                    [s.score.max for s in callback.stats],  # type: ignore
         | 
| 158 | 
            +
                    facecolor="cyan",
         | 
| 159 | 
            +
                    label="range",
         | 
| 160 | 
            +
                )
         | 
| 161 | 
            +
                plt.xlabel("Steps")
         | 
| 162 | 
            +
                plt.ylabel("Score")
         | 
| 163 | 
            +
                plt.legend()
         | 
| 164 | 
            +
                plt.title(f"Eval {run_name}")
         | 
| 165 | 
            +
                tb_writer.add_figure("eval", figure)
         | 
| 166 | 
            +
             | 
| 167 | 
            +
             | 
| 168 | 
            +
            Scalar = Union[bool, str, float, int, None]
         | 
| 169 | 
            +
             | 
| 170 | 
            +
             | 
| 171 | 
            +
            def flatten_hyperparameters(
         | 
| 172 | 
            +
                hyperparams: Hyperparams, args: Dict[str, Union[Scalar, list]]
         | 
| 173 | 
            +
            ) -> Dict[str, Scalar]:
         | 
| 174 | 
            +
                flattened = args.copy()
         | 
| 175 | 
            +
                for k, v in flattened.items():
         | 
| 176 | 
            +
                    if isinstance(v, list):
         | 
| 177 | 
            +
                        flattened[k] = json.dumps(v)
         | 
| 178 | 
            +
                for k, v in hyperparams.items():
         | 
| 179 | 
            +
                    if isinstance(v, dict):
         | 
| 180 | 
            +
                        for sk, sv in v.items():
         | 
| 181 | 
            +
                            key = f"{k}/{sk}"
         | 
| 182 | 
            +
                            if isinstance(sv, dict) or isinstance(sv, list):
         | 
| 183 | 
            +
                                flattened[key] = str(sv)
         | 
| 184 | 
            +
                            else:
         | 
| 185 | 
            +
                                flattened[key] = sv
         | 
| 186 | 
            +
                    else:
         | 
| 187 | 
            +
                        flattened[k] = v  # type: ignore
         | 
| 188 | 
            +
                return flattened  # type: ignore
         | 
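A quick sketch of what flatten_hyperparameters produces for TensorBoard's hparams table; the dict keys and values below are illustrative only (not taken from the hyperparams files), and plain dicts stand in for the Hyperparams type since only .items() is used:

example_hyperparams = {
    "n_timesteps": 100_000,
    "algo_hyperparams": {"learning_rate": 3e-4, "n_steps": 2048},
}
example_args = {"algo": "ppo", "env": "Acrobot-v1", "seed": 4, "wandb_tags": ["benchmark"]}
flatten_hyperparameters(example_hyperparams, example_args)
# -> {"algo": "ppo", "env": "Acrobot-v1", "seed": 4, "wandb_tags": '["benchmark"]',
#     "n_timesteps": 100000, "algo_hyperparams/learning_rate": 0.0003,
#     "algo_hyperparams/n_steps": 2048}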
    	
        runner/train.py
    ADDED
    
@@ -0,0 +1,126 @@
# Support for PyTorch mps mode (https://pytorch.org/docs/stable/notes/mps.html)
import os

os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"

import dataclasses
import shutil
import wandb
import yaml

from dataclasses import dataclass
from torch.utils.tensorboard.writer import SummaryWriter
from typing import Any, Dict, Optional, Sequence

from shared.callbacks.eval_callback import EvalCallback
from runner.env import make_env, make_eval_env
from runner.config import Config, RunArgs
from runner.running_utils import (
    ALGOS,
    load_hyperparams,
    set_seeds,
    get_device,
    make_policy,
    plot_eval_callback,
    flatten_hyperparameters,
)
from shared.stats import EpisodesStats


@dataclass
class TrainArgs(RunArgs):
    wandb_project_name: Optional[str] = None
    wandb_entity: Optional[str] = None
    wandb_tags: Sequence[str] = dataclasses.field(default_factory=list)


def train(args: TrainArgs):
    print(args)
    hyperparams = load_hyperparams(args.algo, args.env, os.getcwd())
    print(hyperparams)
    config = Config(args, hyperparams, os.getcwd())

    wandb_enabled = args.wandb_project_name
    if wandb_enabled:
        wandb.tensorboard.patch(
            root_logdir=config.tensorboard_summary_path, pytorch=True
        )
        wandb.init(
            project=args.wandb_project_name,
            entity=args.wandb_entity,
            config=hyperparams,  # type: ignore
            name=config.run_name,
            monitor_gym=True,
            save_code=True,
            tags=args.wandb_tags,
        )
        wandb.config.update(args)

    tb_writer = SummaryWriter(config.tensorboard_summary_path)

    set_seeds(args.seed, args.use_deterministic_algorithms)

    env = make_env(config, tb_writer=tb_writer, **config.env_hyperparams)
    device = get_device(config.device, env)
    policy = make_policy(args.algo, env, device, **config.policy_hyperparams)
    algo = ALGOS[args.algo](policy, env, device, tb_writer, **config.algo_hyperparams)

    eval_env = make_eval_env(config, **config.env_hyperparams)
    record_best_videos = config.eval_params.get("record_best_videos", True)
    callback = EvalCallback(
        policy,
        eval_env,
        tb_writer,
        best_model_path=config.model_dir_path(best=True),
        **config.eval_params,
        video_env=make_eval_env(config, override_n_envs=1, **config.env_hyperparams)
        if record_best_videos
        else None,
        best_video_dir=config.best_videos_dir,
    )
    algo.learn(config.n_timesteps, callback=callback)

    policy.save(config.model_dir_path(best=False))

    eval_stats = callback.evaluate(n_episodes=10, print_returns=True)

    plot_eval_callback(callback, tb_writer, config.run_name)

    log_dict: Dict[str, Any] = {
        "eval": eval_stats._asdict(),
    }
    if callback.best:
        log_dict["best_eval"] = callback.best._asdict()
    log_dict.update(hyperparams)
    log_dict.update(vars(args))
    with open(config.logs_path, "a") as f:
        yaml.dump({config.run_name: log_dict}, f)

    best_eval_stats: EpisodesStats = callback.best  # type: ignore
    tb_writer.add_hparams(
        flatten_hyperparameters(hyperparams, vars(args)),
        {
            "hparam/best_mean": best_eval_stats.score.mean,
            "hparam/best_result": best_eval_stats.score.mean
            - best_eval_stats.score.std,
            "hparam/last_mean": eval_stats.score.mean,
            "hparam/last_result": eval_stats.score.mean - eval_stats.score.std,
        },
        None,
        config.run_name,
    )

    tb_writer.close()

    if wandb_enabled:
        shutil.make_archive(
            os.path.join(wandb.run.dir, config.model_dir_name()),
            "zip",
            config.model_dir_path(),
        )
        shutil.make_archive(
            os.path.join(wandb.run.dir, config.model_dir_name(best=True)),
            "zip",
            config.model_dir_path(best=True),
        )
        wandb.finish()
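A hedged sketch of invoking train() programmatically instead of going through the CLI entry point; the RunArgs field names used here (algo, env, seed, use_deterministic_algorithms) are inferred from their use in train() above and are assumptions about RunArgs' constructor:

from runner.train import TrainArgs, train

args = TrainArgs(
    algo="ppo",
    env="Acrobot-v1",
    seed=4,
    use_deterministic_algorithms=True,
    wandb_project_name=None,  # None keeps the run local (no Weights & Biases logging)
)
train(args)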
    	
        saved_models/ppo-Acrobot-v1-S4-best/model.pth
    ADDED
    
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f409782c624f44a81d24dc84be10b3ea2d373dbfccfe86f782a1a9109a9880de
size 41509
    	
        saved_models/ppo-Acrobot-v1-S4-best/vecnormalize.pkl
    ADDED
    
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:fb89bec8a5fe259e3d4705482ece78d999ac400dcd5b35ddffdd8cea0a284cf5
size 7013
    	
        shared/algorithm.py
    ADDED
    
@@ -0,0 +1,35 @@
import gym
import torch

from abc import ABC, abstractmethod
from stable_baselines3.common.vec_env.base_vec_env import VecEnv
from torch.utils.tensorboard.writer import SummaryWriter
from typing import List, Optional, TypeVar

from shared.callbacks.callback import Callback
from shared.policy.policy import Policy
from shared.stats import EpisodesStats

AlgorithmSelf = TypeVar("AlgorithmSelf", bound="Algorithm")


class Algorithm(ABC):
    @abstractmethod
    def __init__(
        self,
        policy: Policy,
        env: VecEnv,
        device: torch.device,
        tb_writer: SummaryWriter,
        **kwargs,
    ) -> None:
        super().__init__()
        self.policy = policy
        self.env = env
        self.device = device
        self.tb_writer = tb_writer

    @abstractmethod
    def learn(
        self: AlgorithmSelf, total_timesteps: int, callback: Optional[Callback] = None
    ) -> AlgorithmSelf:
        ...
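A sketch of the contract a concrete subclass fulfills: take ownership of policy/env/writer in __init__ and drive the environment in learn(), reporting progress through the callback. The random-action body below is purely illustrative and not part of the commit:

import numpy as np
from typing import Optional

class RandomAlgorithm(Algorithm):
    def __init__(self, policy, env, device, tb_writer, **kwargs) -> None:
        super().__init__(policy, env, device, tb_writer)

    def learn(
        self, total_timesteps: int, callback: Optional[Callback] = None
    ) -> "RandomAlgorithm":
        self.env.reset()
        steps = 0
        while steps < total_timesteps:
            # Sample one random action per vectorized sub-environment
            acts = np.array(
                [self.env.action_space.sample() for _ in range(self.env.num_envs)]
            )
            self.env.step(acts)
            steps += self.env.num_envs
            if callback:
                callback.on_step(timesteps_elapsed=self.env.num_envs)
        return self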
    	
        shared/callbacks/callback.py
    ADDED
    
@@ -0,0 +1,12 @@
from abc import ABC, abstractmethod


class Callback(ABC):
    def __init__(self) -> None:
        super().__init__()
        self.timesteps_elapsed = 0

    def on_step(self, timesteps_elapsed: int = 1) -> bool:
        self.timesteps_elapsed += timesteps_elapsed
        return True
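A minimal sketch of a custom callback, assuming (as the bool return value suggests) that an algorithm's training loop stops when on_step returns False; the class name and cutoff are hypothetical:

class StopAfterCallback(Callback):
    """Illustrative early-stopping callback; not part of the commit."""

    def __init__(self, max_timesteps: int) -> None:
        super().__init__()
        self.max_timesteps = max_timesteps

    def on_step(self, timesteps_elapsed: int = 1) -> bool:
        super().on_step(timesteps_elapsed)
        # Keep training only while under the timestep budget
        return self.timesteps_elapsed < self.max_timesteps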
    	
        shared/callbacks/eval_callback.py
    ADDED
    
@@ -0,0 +1,174 @@
import itertools
import numpy as np
import os

from copy import deepcopy
from stable_baselines3.common.vec_env.base_vec_env import VecEnv, VecEnvWrapper
from stable_baselines3.common.vec_env.vec_normalize import VecNormalize
from torch.utils.tensorboard.writer import SummaryWriter
from typing import List, Optional, Union

from shared.callbacks.callback import Callback
from shared.policy.policy import Policy
from shared.stats import Episode, EpisodeAccumulator, EpisodesStats
from wrappers.vec_episode_recorder import VecEpisodeRecorder


class EvaluateAccumulator(EpisodeAccumulator):
    def __init__(self, num_envs: int, goal_episodes: int, print_returns: bool = True):
        super().__init__(num_envs)
        self.completed_episodes_by_env_idx = [[] for _ in range(num_envs)]
        self.goal_episodes_per_env = int(np.ceil(goal_episodes / num_envs))
        self.print_returns = print_returns

    def on_done(self, ep_idx: int, episode: Episode) -> None:
        if len(self.completed_episodes_by_env_idx[ep_idx]) >= self.goal_episodes_per_env:
            return
        self.completed_episodes_by_env_idx[ep_idx].append(episode)
        if self.print_returns:
            print(
                f"Episode {len(self)} | "
                f"Score {episode.score} | "
                f"Length {episode.length}"
            )

    def __len__(self) -> int:
        return sum(len(ce) for ce in self.completed_episodes_by_env_idx)

    @property
    def episodes(self) -> List[Episode]:
        return list(itertools.chain(*self.completed_episodes_by_env_idx))

    def is_done(self) -> bool:
        return all(
            len(ce) == self.goal_episodes_per_env
            for ce in self.completed_episodes_by_env_idx
        )


def evaluate(
    env: VecEnv,
    policy: Policy,
    n_episodes: int,
    render: bool = False,
    deterministic: bool = True,
    print_returns: bool = True,
) -> EpisodesStats:
    policy.eval()
    episodes = EvaluateAccumulator(env.num_envs, n_episodes, print_returns)

    obs = env.reset()
    while not episodes.is_done():
        act = policy.act(obs, deterministic=deterministic)
        obs, rew, done, _ = env.step(act)
        episodes.step(rew, done)
        if render:
            env.render()
    stats = EpisodesStats(episodes.episodes)
    if print_returns:
        print(stats)
    return stats


class EvalCallback(Callback):
    def __init__(
        self,
        policy: Policy,
        env: VecEnv,
        tb_writer: SummaryWriter,
        best_model_path: Optional[str] = None,
        step_freq: Union[int, float] = 50_000,
        n_episodes: int = 10,
        save_best: bool = True,
        deterministic: bool = True,
        record_best_videos: bool = True,
        video_env: Optional[VecEnv] = None,
        best_video_dir: Optional[str] = None,
        max_video_length: int = 3600,
    ) -> None:
        super().__init__()
        self.policy = policy
        self.env = env
        self.tb_writer = tb_writer
        self.best_model_path = best_model_path
        self.step_freq = int(step_freq)
        self.n_episodes = n_episodes
        self.save_best = save_best
        self.deterministic = deterministic
        self.stats: List[EpisodesStats] = []
        self.best = None

        self.record_best_videos = record_best_videos
        assert video_env or not record_best_videos
        self.video_env = video_env
        assert best_video_dir or not record_best_videos
        self.best_video_dir = best_video_dir
        if best_video_dir:
            os.makedirs(best_video_dir, exist_ok=True)
        self.max_video_length = max_video_length
        self.best_video_base_path = None

    def on_step(self, timesteps_elapsed: int = 1) -> bool:
        super().on_step(timesteps_elapsed)
        if self.timesteps_elapsed // self.step_freq >= len(self.stats):
            self.sync_vec_normalize(self.env)
            self.evaluate()
        return True

    def evaluate(
        self, n_episodes: Optional[int] = None, print_returns: Optional[bool] = None
    ) -> EpisodesStats:
        eval_stat = evaluate(
            self.env,
            self.policy,
            n_episodes or self.n_episodes,
            deterministic=self.deterministic,
            print_returns=print_returns or False,
        )
        self.policy.train(True)
        print(f"Eval Timesteps: {self.timesteps_elapsed} | {eval_stat}")

        self.stats.append(eval_stat)

        if not self.best or eval_stat >= self.best:
            strictly_better = not self.best or eval_stat > self.best
            self.best = eval_stat
            if self.save_best:
                assert self.best_model_path
                self.policy.save(self.best_model_path)
                print("Saved best model")
            self.best.write_to_tensorboard(
                self.tb_writer, "best_eval", self.timesteps_elapsed
            )
            if strictly_better and self.record_best_videos:
                assert self.video_env and self.best_video_dir
                self.sync_vec_normalize(self.video_env)
                self.best_video_base_path = os.path.join(
                    self.best_video_dir, str(self.timesteps_elapsed)
                )
                video_wrapped = VecEpisodeRecorder(
                    self.video_env,
                    self.best_video_base_path,
                    max_video_length=self.max_video_length,
                )
                video_stats = evaluate(
                    video_wrapped,
                    self.policy,
                    1,
                    deterministic=self.deterministic,
                    print_returns=False,
                )
                print(f"Saved best video: {video_stats}")

        eval_stat.write_to_tensorboard(self.tb_writer, "eval", self.timesteps_elapsed)

        return eval_stat

    def sync_vec_normalize(self, destination_env: VecEnv) -> None:
        if self.policy.vec_normalize is not None:
            eval_env_wrapper = destination_env
            while isinstance(eval_env_wrapper, VecEnvWrapper):
                if isinstance(eval_env_wrapper, VecNormalize):
                    if hasattr(self.policy.vec_normalize, "obs_rms"):
                        eval_env_wrapper.obs_rms = deepcopy(
                            self.policy.vec_normalize.obs_rms
                        )
                    eval_env_wrapper.ret_rms = deepcopy(
                        self.policy.vec_normalize.ret_rms
                    )
                eval_env_wrapper = eval_env_wrapper.venv
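A minimal sketch of calling the module-level evaluate() outside of training, assuming an evaluation VecEnv and a loaded policy built with the runner helpers (variable names here are illustrative):

stats = evaluate(eval_env, policy, n_episodes=10, deterministic=True, print_returns=False)
print(stats.score.mean, stats.score.std)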
    	
        shared/module.py
    ADDED
    
@@ -0,0 +1,121 @@
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from gym.spaces import Box, Discrete
from stable_baselines3.common.preprocessing import get_flattened_obs_dim
from typing import Sequence, Type


class FeatureExtractor(nn.Module):
    def __init__(
        self,
        obs_space: gym.Space,
        activation: Type[nn.Module],
        init_layers_orthogonal: bool = False,
        cnn_feature_dim: int = 512,
    ) -> None:
        super().__init__()
        if isinstance(obs_space, Box):
            # Conv2D: (channels, height, width)
            if len(obs_space.shape) == 3:
                # CNN from DQN Nature paper: Mnih, Volodymyr, et al.
                # "Human-level control through deep reinforcement learning."
                # Nature 518.7540 (2015): 529-533.
                cnn = nn.Sequential(
                    layer_init(
                        nn.Conv2d(obs_space.shape[0], 32, kernel_size=8, stride=4),
                        init_layers_orthogonal,
                    ),
                    activation(),
                    layer_init(
                        nn.Conv2d(32, 64, kernel_size=4, stride=2),
                        init_layers_orthogonal,
                    ),
                    activation(),
                    layer_init(
                        nn.Conv2d(64, 64, kernel_size=3, stride=1),
                        init_layers_orthogonal,
                    ),
                    activation(),
                    nn.Flatten(),
                )

                def preprocess(obs: torch.Tensor) -> torch.Tensor:
                    if len(obs.shape) == 3:
                        obs = obs.unsqueeze(0)
                    return obs.float() / 255.0

                with torch.no_grad():
                    cnn_out = cnn(preprocess(torch.as_tensor(obs_space.sample())))
                self.preprocess = preprocess
                self.feature_extractor = nn.Sequential(
                    cnn,
                    layer_init(
                        nn.Linear(cnn_out.shape[1], cnn_feature_dim),
                        init_layers_orthogonal,
                    ),
                    activation(),
                )
                self.out_dim = cnn_feature_dim
            elif len(obs_space.shape) == 1:

                def preprocess(obs: torch.Tensor) -> torch.Tensor:
                    if len(obs.shape) == 1:
                        obs = obs.unsqueeze(0)
                    return obs.float()

                self.preprocess = preprocess
                self.feature_extractor = nn.Flatten()
                self.out_dim = get_flattened_obs_dim(obs_space)
            else:
                raise ValueError(f"Unsupported observation space: {obs_space}")
        elif isinstance(obs_space, Discrete):
            self.preprocess = lambda x: F.one_hot(x, obs_space.n).float()
            self.feature_extractor = nn.Flatten()
            self.out_dim = obs_space.n
        else:
            raise NotImplementedError

    def forward(self, obs: torch.Tensor) -> torch.Tensor:
        if self.preprocess:
            obs = self.preprocess(obs)
        return self.feature_extractor(obs)


def mlp(
    layer_sizes: Sequence[int],
    activation: Type[nn.Module],
    output_activation: Type[nn.Module] = nn.Identity,
    init_layers_orthogonal: bool = False,
    final_layer_gain: float = np.sqrt(2),
) -> nn.Module:
    layers = []
    for i in range(len(layer_sizes) - 2):
        layers.append(
            layer_init(
                nn.Linear(layer_sizes[i], layer_sizes[i + 1]), init_layers_orthogonal
            )
        )
        layers.append(activation())
    layers.append(
        layer_init(
            nn.Linear(layer_sizes[-2], layer_sizes[-1]),
            init_layers_orthogonal,
            std=final_layer_gain,
        )
    )
    layers.append(output_activation())
    return nn.Sequential(*layers)


def layer_init(
    layer: nn.Module, init_layers_orthogonal: bool, std: float = np.sqrt(2)
) -> nn.Module:
    if not init_layers_orthogonal:
        return layer
    nn.init.orthogonal_(layer.weight, std)  # type: ignore
    nn.init.constant_(layer.bias, 0.0)  # type: ignore
    return layer
    	
        shared/policy/actor.py
    ADDED
    
@@ -0,0 +1,304 @@
import gym
import torch
import torch.nn as nn

from abc import ABC, abstractmethod
from gym.spaces import Box, Discrete
from torch.distributions import Categorical, Distribution, Normal
from typing import NamedTuple, Optional, Sequence, Type, TypeVar, Union

from shared.module import FeatureExtractor, mlp


class PiForward(NamedTuple):
    pi: Distribution
    logp_a: Optional[torch.Tensor]
    entropy: Optional[torch.Tensor]


class Actor(nn.Module, ABC):
    @abstractmethod
    def forward(self, obs: torch.Tensor, a: Optional[torch.Tensor] = None) -> PiForward:
        ...


class CategoricalActorHead(Actor):
    def __init__(
        self,
        act_dim: int,
        hidden_sizes: Sequence[int] = (32,),
        activation: Type[nn.Module] = nn.Tanh,
        init_layers_orthogonal: bool = True,
    ) -> None:
        super().__init__()
        layer_sizes = tuple(hidden_sizes) + (act_dim,)
        self._fc = mlp(
            layer_sizes,
            activation,
            init_layers_orthogonal=init_layers_orthogonal,
            final_layer_gain=0.01,
        )

    def forward(self, obs: torch.Tensor, a: Optional[torch.Tensor] = None) -> PiForward:
        logits = self._fc(obs)
        pi = Categorical(logits=logits)
        logp_a = None
        entropy = None
        if a is not None:
            logp_a = pi.log_prob(a)
            entropy = pi.entropy()
        return PiForward(pi, logp_a, entropy)


class GaussianDistribution(Normal):
    def log_prob(self, a: torch.Tensor) -> torch.Tensor:
        return super().log_prob(a).sum(axis=-1)

    def sample(self) -> torch.Tensor:
        return self.rsample()


class GaussianActorHead(Actor):
    def __init__(
        self,
        act_dim: int,
        hidden_sizes: Sequence[int] = (32,),
        activation: Type[nn.Module] = nn.Tanh,
        init_layers_orthogonal: bool = True,
        log_std_init: float = -0.5,
    ) -> None:
        super().__init__()
        layer_sizes = tuple(hidden_sizes) + (act_dim,)
        self.mu_net = mlp(
            layer_sizes,
            activation,
            init_layers_orthogonal=init_layers_orthogonal,
            final_layer_gain=0.01,
        )
        self.log_std = nn.Parameter(
            torch.ones(act_dim, dtype=torch.float32) * log_std_init
        )

    def _distribution(self, obs: torch.Tensor) -> Distribution:
        mu = self.mu_net(obs)
        std = torch.exp(self.log_std)
        return GaussianDistribution(mu, std)

    def forward(self, obs: torch.Tensor, a: Optional[torch.Tensor] = None) -> PiForward:
        pi = self._distribution(obs)
        logp_a = None
        entropy = None
        if a is not None:
            logp_a = pi.log_prob(a)
            entropy = pi.entropy()
        return PiForward(pi, logp_a, entropy)


class TanhBijector:
    def __init__(self, epsilon: float = 1e-6) -> None:
        self.epsilon = epsilon

    @staticmethod
    def forward(x: torch.Tensor) -> torch.Tensor:
        return torch.tanh(x)

    @staticmethod
    def inverse(y: torch.Tensor) -> torch.Tensor:
        eps = torch.finfo(y.dtype).eps
        clamped_y = y.clamp(min=-1.0 + eps, max=1.0 - eps)
        return torch.atanh(clamped_y)

    def log_prob_correction(self, x: torch.Tensor) -> torch.Tensor:
        return torch.log(1.0 - torch.tanh(x) ** 2 + self.epsilon)


class StateDependentNoiseDistribution(Normal):
    def __init__(
        self,
        loc,
        scale,
        latent_sde: torch.Tensor,
        exploration_mat: torch.Tensor,
        exploration_matrices: torch.Tensor,
        bijector: Optional[TanhBijector] = None,
        validate_args=None,
    ):
        super().__init__(loc, scale, validate_args)
        self.latent_sde = latent_sde
        self.exploration_mat = exploration_mat
        self.exploration_matrices = exploration_matrices
        self.bijector = bijector

    def log_prob(self, a: torch.Tensor) -> torch.Tensor:
        gaussian_a = self.bijector.inverse(a) if self.bijector else a
        log_prob = super().log_prob(gaussian_a).sum(axis=-1)
        if self.bijector:
            log_prob -= torch.sum(self.bijector.log_prob_correction(gaussian_a), dim=1)
        return log_prob

    def sample(self) -> torch.Tensor:
        noise = self._get_noise()
        actions = self.mean + noise
        return self.bijector.forward(actions) if self.bijector else actions

    def _get_noise(self) -> torch.Tensor:
        if len(self.latent_sde) == 1 or len(self.latent_sde) != len(
            self.exploration_matrices
        ):
            return torch.mm(self.latent_sde, self.exploration_mat)
        # (batch_size, n_features) -> (batch_size, 1, n_features)
        latent_sde = self.latent_sde.unsqueeze(dim=1)
        # (batch_size, 1, n_actions)
        noise = torch.bmm(latent_sde, self.exploration_matrices)
        return noise.squeeze(dim=1)

    @property
    def mode(self) -> torch.Tensor:
        mean = super().mode
        return self.bijector.forward(mean) if self.bijector else mean


StateDependentNoiseActorHeadSelf = TypeVar(
    "StateDependentNoiseActorHeadSelf", bound="StateDependentNoiseActorHead"
)


class StateDependentNoiseActorHead(Actor):
    def __init__(
        self,
        act_dim: int,
        hidden_sizes: Sequence[int] = (32,),
        activation: Type[nn.Module] = nn.Tanh,
        init_layers_orthogonal: bool = True,
        log_std_init: float = -0.5,
        full_std: bool = True,
        squash_output: bool = False,
        learn_std: bool = False,
         | 
| 177 | 
            +
                ) -> None:
         | 
| 178 | 
            +
                    super().__init__()
         | 
| 179 | 
            +
                    self.act_dim = act_dim
         | 
| 180 | 
            +
                    layer_sizes = tuple(hidden_sizes) + (self.act_dim,)
         | 
| 181 | 
            +
                    if len(layer_sizes) == 2:
         | 
| 182 | 
            +
                        self.latent_net = nn.Identity()
         | 
| 183 | 
            +
                    elif len(layer_sizes) > 2:
         | 
| 184 | 
            +
                        self.latent_net = mlp(
         | 
| 185 | 
            +
                            layer_sizes[:-1],
         | 
| 186 | 
            +
                            activation,
         | 
| 187 | 
            +
                            output_activation=activation,
         | 
| 188 | 
            +
                            init_layers_orthogonal=init_layers_orthogonal,
         | 
| 189 | 
            +
                        )
         | 
| 190 | 
            +
                    else:
         | 
| 191 | 
            +
                        raise ValueError("hidden_sizes must be of at least length 1")
         | 
| 192 | 
            +
                    self.mu_net = mlp(
         | 
| 193 | 
            +
                        layer_sizes[-2:],
         | 
| 194 | 
            +
                        activation,
         | 
| 195 | 
            +
                        init_layers_orthogonal=init_layers_orthogonal,
         | 
| 196 | 
            +
                        final_layer_gain=0.01,
         | 
| 197 | 
            +
                    )
         | 
| 198 | 
            +
                    self.full_std = full_std
         | 
| 199 | 
            +
                    std_dim = (hidden_sizes[-1], act_dim if self.full_std else 1)
         | 
| 200 | 
            +
                    self.log_std = nn.Parameter(
         | 
| 201 | 
            +
                        torch.ones(std_dim, dtype=torch.float32) * log_std_init
         | 
| 202 | 
            +
                    )
         | 
| 203 | 
            +
                    self.bijector = TanhBijector() if squash_output else None
         | 
| 204 | 
            +
                    self.learn_std = learn_std
         | 
| 205 | 
            +
                    self.device = None
         | 
| 206 | 
            +
             | 
| 207 | 
            +
                    self.exploration_mat = None
         | 
| 208 | 
            +
                    self.exploration_matrices = None
         | 
| 209 | 
            +
                    self.sample_weights()
         | 
| 210 | 
            +
             | 
| 211 | 
            +
                def to(
         | 
| 212 | 
            +
                    self: StateDependentNoiseActorHeadSelf,
         | 
| 213 | 
            +
                    device: Optional[torch.device] = None,
         | 
| 214 | 
            +
                    dtype: Optional[Union[torch.dtype, str]] = None,
         | 
| 215 | 
            +
                    non_blocking: bool = False,
         | 
| 216 | 
            +
                ) -> StateDependentNoiseActorHeadSelf:
         | 
| 217 | 
            +
                    super().to(device, dtype, non_blocking)
         | 
| 218 | 
            +
                    self.device = device
         | 
| 219 | 
            +
                    return self
         | 
| 220 | 
            +
             | 
| 221 | 
            +
                def _distribution(self, obs: torch.Tensor) -> Distribution:
         | 
| 222 | 
            +
                    latent = self.latent_net(obs)
         | 
| 223 | 
            +
                    mu = self.mu_net(latent)
         | 
| 224 | 
            +
                    latent_sde = latent if self.learn_std else latent.detach()
         | 
| 225 | 
            +
                    variance = torch.mm(latent_sde**2, self._get_std() ** 2)
         | 
| 226 | 
            +
                    assert self.exploration_mat is not None
         | 
| 227 | 
            +
                    assert self.exploration_matrices is not None
         | 
| 228 | 
            +
                    return StateDependentNoiseDistribution(
         | 
| 229 | 
            +
                        mu,
         | 
| 230 | 
            +
                        torch.sqrt(variance + 1e-6),
         | 
| 231 | 
            +
                        latent_sde,
         | 
| 232 | 
            +
                        self.exploration_mat,
         | 
| 233 | 
            +
                        self.exploration_matrices,
         | 
| 234 | 
            +
                        self.bijector,
         | 
| 235 | 
            +
                    )
         | 
| 236 | 
            +
             | 
| 237 | 
            +
                def _get_std(self) -> torch.Tensor:
         | 
| 238 | 
            +
                    std = torch.exp(self.log_std)
         | 
| 239 | 
            +
                    if self.full_std:
         | 
| 240 | 
            +
                        return std
         | 
| 241 | 
            +
                    ones = torch.ones(self.log_std.shape[0], self.act_dim)
         | 
| 242 | 
            +
                    if self.device:
         | 
| 243 | 
            +
                        ones = ones.to(self.device)
         | 
| 244 | 
            +
                    return ones * std
         | 
| 245 | 
            +
             | 
| 246 | 
            +
                def forward(self, obs: torch.Tensor, a: Optional[torch.Tensor] = None) -> PiForward:
         | 
| 247 | 
            +
                    pi = self._distribution(obs)
         | 
| 248 | 
            +
                    logp_a = None
         | 
| 249 | 
            +
                    entropy = None
         | 
| 250 | 
            +
                    if a is not None:
         | 
| 251 | 
            +
                        logp_a = pi.log_prob(a)
         | 
| 252 | 
            +
                        entropy = -logp_a
         | 
| 253 | 
            +
                    return PiForward(pi, logp_a, entropy)
         | 
| 254 | 
            +
             | 
| 255 | 
            +
                def sample_weights(self, batch_size: int = 1) -> None:
         | 
| 256 | 
            +
                    std = self._get_std()
         | 
| 257 | 
            +
                    weights_dist = Normal(torch.zeros_like(std), std)
         | 
| 258 | 
            +
                    # Reparametrization trick to pass gradients
         | 
| 259 | 
            +
                    self.exploration_mat = weights_dist.rsample()
         | 
| 260 | 
            +
                    self.exploration_matrices = weights_dist.rsample(torch.Size((batch_size,)))
         | 
| 261 | 
            +
             | 
| 262 | 
            +
             | 
| 263 | 
            +
            def actor_head(
         | 
| 264 | 
            +
                action_space: gym.Space,
         | 
| 265 | 
            +
                hidden_sizes: Sequence[int],
         | 
| 266 | 
            +
                init_layers_orthogonal: bool,
         | 
| 267 | 
            +
                activation: Type[nn.Module],
         | 
| 268 | 
            +
                log_std_init: float = -0.5,
         | 
| 269 | 
            +
                use_sde: bool = False,
         | 
| 270 | 
            +
                full_std: bool = True,
         | 
| 271 | 
            +
                squash_output: bool = False,
         | 
| 272 | 
            +
            ) -> Actor:
         | 
| 273 | 
            +
                assert not use_sde or isinstance(
         | 
| 274 | 
            +
                    action_space, Box
         | 
| 275 | 
            +
                ), "use_sde only valid if Box action_space"
         | 
| 276 | 
            +
                assert not squash_output or use_sde, "squash_output only valid if use_sde"
         | 
| 277 | 
            +
                if isinstance(action_space, Discrete):
         | 
| 278 | 
            +
                    return CategoricalActorHead(
         | 
| 279 | 
            +
                        action_space.n,
         | 
| 280 | 
            +
                        hidden_sizes=hidden_sizes,
         | 
| 281 | 
            +
                        activation=activation,
         | 
| 282 | 
            +
                        init_layers_orthogonal=init_layers_orthogonal,
         | 
| 283 | 
            +
                    )
         | 
| 284 | 
            +
                elif isinstance(action_space, Box):
         | 
| 285 | 
            +
                    if use_sde:
         | 
| 286 | 
            +
                        return StateDependentNoiseActorHead(
         | 
| 287 | 
            +
                            action_space.shape[0],
         | 
| 288 | 
            +
                            hidden_sizes=hidden_sizes,
         | 
| 289 | 
            +
                            activation=activation,
         | 
| 290 | 
            +
                            init_layers_orthogonal=init_layers_orthogonal,
         | 
| 291 | 
            +
                            log_std_init=log_std_init,
         | 
| 292 | 
            +
                            full_std=full_std,
         | 
| 293 | 
            +
                            squash_output=squash_output,
         | 
| 294 | 
            +
                        )
         | 
| 295 | 
            +
                    else:
         | 
| 296 | 
            +
                        return GaussianActorHead(
         | 
| 297 | 
            +
                            action_space.shape[0],
         | 
| 298 | 
            +
                            hidden_sizes=hidden_sizes,
         | 
| 299 | 
            +
                            activation=activation,
         | 
| 300 | 
            +
                            init_layers_orthogonal=init_layers_orthogonal,
         | 
| 301 | 
            +
                            log_std_init=log_std_init,
         | 
| 302 | 
            +
                        )
         | 
| 303 | 
            +
                else:
         | 
| 304 | 
            +
                    raise ValueError(f"Unsupported action space: {action_space}")
         | 
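A minimal usage sketch (not part of this commit): actor_head dispatches on the action space, so Acrobot-v1's Discrete(3) space yields a CategoricalActorHead. It assumes CategoricalActorHead (defined earlier in this file) follows the same forward(obs, a) -> PiForward signature shown above, and that, as in ActorCritic, the first entry of hidden_sizes is the incoming feature dimension (6 for Acrobot-v1).

# Hypothetical sketch; the hidden_sizes convention and the CategoricalActorHead
# forward signature are assumed from how the surrounding code uses them.
import gym
import torch
import torch.nn as nn

from shared.policy.actor import actor_head

env = gym.make("Acrobot-v1")
head = actor_head(
    env.action_space,  # Discrete(3) -> CategoricalActorHead
    hidden_sizes=(6, 64),
    init_layers_orthogonal=True,
    activation=nn.Tanh,
)
obs = torch.as_tensor(env.observation_space.sample(), dtype=torch.float32).unsqueeze(0)
pi, logp_a, entropy = head(obs, a=torch.tensor([0]))
print(logp_a, entropy)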
    	
        shared/policy/critic.py
    ADDED
    
@@ -0,0 +1,27 @@
import gym
import torch
import torch.nn as nn

from typing import Sequence, Type
from shared.module import FeatureExtractor, mlp


class CriticHead(nn.Module):
    def __init__(
        self,
        hidden_sizes: Sequence[int] = (32,),
        activation: Type[nn.Module] = nn.Tanh,
        init_layers_orthogonal: bool = True,
    ) -> None:
        super().__init__()
        layer_sizes = tuple(hidden_sizes) + (1,)
        self._fc = mlp(
            layer_sizes,
            activation,
            init_layers_orthogonal=init_layers_orthogonal,
            final_layer_gain=1.0,
        )

    def forward(self, obs: torch.Tensor) -> torch.Tensor:
        v = self._fc(obs)
        return v.squeeze(-1)
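A minimal sketch (not part of this commit) of what CriticHead computes: the trailing squeeze(-1) drops the singleton output dimension, so a batch of feature vectors maps to one value per batch element. As in ActorCritic below, the first entry of hidden_sizes is treated as the input feature dimension.

import torch

from shared.policy.critic import CriticHead

critic = CriticHead(hidden_sizes=(6, 64))  # 6-dim features -> 64 hidden -> scalar value
values = critic(torch.randn(8, 6))
print(values.shape)  # torch.Size([8]), not (8, 1)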
    	
        shared/policy/on_policy.py
    ADDED
    
@@ -0,0 +1,177 @@
import gym
import numpy as np
import torch

from gym.spaces import Box
from pathlib import Path
from stable_baselines3.common.vec_env.base_vec_env import VecEnv, VecEnvObs
from typing import NamedTuple, Optional, Sequence, Tuple, TypeVar

from shared.module import FeatureExtractor
from shared.policy.actor import PiForward, StateDependentNoiseActorHead, actor_head
from shared.policy.critic import CriticHead
from shared.policy.policy import ACTIVATION, Policy


class Step(NamedTuple):
    a: np.ndarray
    v: np.ndarray
    logp_a: np.ndarray
    clamped_a: np.ndarray


class ACForward(NamedTuple):
    logp_a: torch.Tensor
    entropy: torch.Tensor
    v: torch.Tensor


FEAT_EXT_FILE_NAME = "feat_ext.pt"
V_FEAT_EXT_FILE_NAME = "v_feat_ext.pt"
PI_FILE_NAME = "pi.pt"
V_FILE_NAME = "v.pt"
ActorCriticSelf = TypeVar("ActorCriticSelf", bound="ActorCritic")


def clamp_actions(
    actions: np.ndarray, action_space: gym.Space, squash_output: bool
) -> np.ndarray:
    if isinstance(action_space, Box):
        low, high = action_space.low, action_space.high  # type: ignore
        if squash_output:
            # Squashed output is already between -1 and 1. Rescale if the actual
            # output needs to be something other than -1 and 1
            return low + 0.5 * (actions + 1) * (high - low)
        else:
            return np.clip(actions, low, high)
    return actions


class ActorCritic(Policy):
    def __init__(
        self,
        env: VecEnv,
        pi_hidden_sizes: Sequence[int],
        v_hidden_sizes: Sequence[int],
        init_layers_orthogonal: bool = True,
        activation_fn: str = "tanh",
        log_std_init: float = -0.5,
        use_sde: bool = False,
        full_std: bool = True,
        squash_output: bool = False,
        share_features_extractor: bool = True,
        cnn_feature_dim: int = 512,
        **kwargs,
    ) -> None:
        super().__init__(env, **kwargs)
        activation = ACTIVATION[activation_fn]
        observation_space = env.observation_space
        self.action_space = env.action_space
        self.squash_output = squash_output
        self.share_features_extractor = share_features_extractor
        self._feature_extractor = FeatureExtractor(
            observation_space,
            activation,
            init_layers_orthogonal=init_layers_orthogonal,
            cnn_feature_dim=cnn_feature_dim,
        )
        self._pi = actor_head(
            self.action_space,
            (self._feature_extractor.out_dim,) + tuple(pi_hidden_sizes),
            init_layers_orthogonal,
            activation,
            log_std_init=log_std_init,
            use_sde=use_sde,
            full_std=full_std,
            squash_output=squash_output,
        )

        if not share_features_extractor:
            self._v_feature_extractor = FeatureExtractor(
                observation_space,
                activation,
                init_layers_orthogonal=init_layers_orthogonal,
                cnn_feature_dim=cnn_feature_dim,
            )
            v_hidden_sizes = (self._v_feature_extractor.out_dim,) + tuple(
                v_hidden_sizes
            )
        else:
            self._v_feature_extractor = None
            v_hidden_sizes = (self._feature_extractor.out_dim,) + tuple(v_hidden_sizes)
        self._v = CriticHead(
            hidden_sizes=v_hidden_sizes,
            activation=activation,
            init_layers_orthogonal=init_layers_orthogonal,
        )

    def _pi_forward(
        self, obs: torch.Tensor, action: Optional[torch.Tensor] = None
    ) -> Tuple[PiForward, torch.Tensor]:
        p_fe = self._feature_extractor(obs)
        pi_forward = self._pi(p_fe, action)

        return pi_forward, p_fe

    def _v_forward(self, obs: torch.Tensor, p_fc: torch.Tensor) -> torch.Tensor:
        v_fe = self._v_feature_extractor(obs) if self._v_feature_extractor else p_fc
        return self._v(v_fe)

    def forward(self, obs: torch.Tensor, action: torch.Tensor) -> ACForward:
        (_, logp_a, entropy), p_fc = self._pi_forward(obs, action)
        v = self._v_forward(obs, p_fc)

        assert logp_a is not None
        assert entropy is not None
        return ACForward(logp_a, entropy, v)

    def _as_tensor(self, obs: VecEnvObs) -> torch.Tensor:
        assert isinstance(obs, np.ndarray)
        o = torch.as_tensor(obs)
        if self.device is not None:
            o = o.to(self.device)
        return o

    def value(self, obs: VecEnvObs) -> np.ndarray:
        o = self._as_tensor(obs)
        with torch.no_grad():
            fe = (
                self._v_feature_extractor(o)
                if self._v_feature_extractor
                else self._feature_extractor(o)
            )
            v = self._v(fe)
        return v.cpu().numpy()

    def step(self, obs: VecEnvObs) -> Step:
        o = self._as_tensor(obs)
        with torch.no_grad():
            (pi, _, _), p_fc = self._pi_forward(o)
            a = pi.sample()
            logp_a = pi.log_prob(a)

            v = self._v_forward(o, p_fc)

        a_np = a.cpu().numpy()
        clamped_a_np = clamp_actions(a_np, self.action_space, self.squash_output)
        return Step(a_np, v.cpu().numpy(), logp_a.cpu().numpy(), clamped_a_np)

    def act(self, obs: np.ndarray, deterministic: bool = True) -> np.ndarray:
        if not deterministic:
            return self.step(obs).clamped_a
        else:
            o = self._as_tensor(obs)
            with torch.no_grad():
                (pi, _, _), _ = self._pi_forward(o)
                a = pi.mode
            return clamp_actions(a.cpu().numpy(), self.action_space, self.squash_output)

    def load(self, path: str) -> None:
        super().load(path)
        self.reset_noise()

    def reset_noise(self, batch_size: Optional[int] = None) -> None:
        if isinstance(self._pi, StateDependentNoiseActorHead):
            self._pi.sample_weights(
                batch_size=batch_size if batch_size else self.env.num_envs
            )
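A small worked sketch (not part of this commit) of clamp_actions on a Box space: a squashed (tanh) action already in [-1, 1] is rescaled into [low, high], while an unsquashed action is simply clipped.

import numpy as np

from gym.spaces import Box
from shared.policy.on_policy import clamp_actions

space = Box(low=-2.0, high=2.0, shape=(1,), dtype=np.float32)
print(clamp_actions(np.array([[0.5]]), space, squash_output=True))   # [[1.]]  rescaled
print(clamp_actions(np.array([[3.0]]), space, squash_output=False))  # [[2.]]  clipped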
    	
        shared/policy/policy.py
    ADDED
    
@@ -0,0 +1,60 @@
import numpy as np
import os
import torch
import torch.nn as nn

from abc import ABC, abstractmethod
from stable_baselines3.common.vec_env import unwrap_vec_normalize
from stable_baselines3.common.vec_env.base_vec_env import VecEnv, VecEnvObs
from typing import Dict, Optional, Type, TypeVar, Union

ACTIVATION: Dict[str, Type[nn.Module]] = {
    "tanh": nn.Tanh,
    "relu": nn.ReLU,
}

VEC_NORMALIZE_FILENAME = "vecnormalize.pkl"
MODEL_FILENAME = "model.pth"

PolicySelf = TypeVar("PolicySelf", bound="Policy")


class Policy(nn.Module, ABC):
    @abstractmethod
    def __init__(self, env: VecEnv, **kwargs) -> None:
        super().__init__()
        self.env = env
        self.vec_normalize = unwrap_vec_normalize(env)
        self.device = None

    def to(
        self: PolicySelf,
        device: Optional[torch.device] = None,
        dtype: Optional[Union[torch.dtype, str]] = None,
        non_blocking: bool = False,
    ) -> PolicySelf:
        super().to(device, dtype, non_blocking)
        self.device = device
        return self

    @abstractmethod
    def act(self, obs: VecEnvObs, deterministic: bool = True) -> np.ndarray:
        ...

    def save(self, path: str) -> None:
        os.makedirs(path, exist_ok=True)

        if self.vec_normalize:
            self.vec_normalize.save(os.path.join(path, VEC_NORMALIZE_FILENAME))
        torch.save(
            self.state_dict(),
            os.path.join(path, MODEL_FILENAME),
        )

    @abstractmethod
    def load(self, path: str) -> None:
        # VecNormalize load occurs in env.py
        self.load_state_dict(torch.load(os.path.join(path, MODEL_FILENAME)))

    def reset_noise(self) -> None:
        pass
    	
        shared/schedule.py
    ADDED
    
@@ -0,0 +1,19 @@
from typing import Callable

Schedule = Callable[[float], float]


def linear_schedule(
    start_val: float, end_val: float, end_fraction: float = 1.0
) -> Schedule:
    def func(progress_fraction: float) -> float:
        if progress_fraction >= end_fraction:
            return end_val
        else:
            return start_val + (end_val - start_val) * progress_fraction / end_fraction

    return func


def constant_schedule(val: float) -> Schedule:
    return lambda f: val
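A minimal sketch (not part of this commit): a Schedule is called with the training progress fraction, so linear_schedule(3e-4, 0.0, end_fraction=0.8) decays a learning rate linearly to 0 over the first 80% of training and keeps it at 0 afterwards.

from shared.schedule import linear_schedule

lr = linear_schedule(3e-4, 0.0, end_fraction=0.8)
print(lr(0.0))  # 0.0003 at the start of training
print(lr(0.4))  # 0.00015 halfway through the decay window
print(lr(0.9))  # 0.0 once progress_fraction >= end_fraction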
    	
        shared/stats.py
    ADDED
    
@@ -0,0 +1,173 @@
import numpy as np

from dataclasses import dataclass
from torch.utils.tensorboard.writer import SummaryWriter
from typing import Dict, List, Optional, Sequence, TypeVar


@dataclass
class Episode:
    score: float = 0
    length: int = 0


StatisticSelf = TypeVar("StatisticSelf", bound="Statistic")


@dataclass
class Statistic:
    values: np.ndarray
    round_digits: int = 2

    @property
    def mean(self) -> float:
        return np.mean(self.values).item()

    @property
    def std(self) -> float:
        return np.std(self.values).item()

    @property
    def min(self) -> float:
        return np.min(self.values).item()

    @property
    def max(self) -> float:
        return np.max(self.values).item()

    def sum(self) -> float:
        return np.sum(self.values).item()

    def __len__(self) -> int:
        return len(self.values)

    def _diff(self: StatisticSelf, o: StatisticSelf) -> float:
        return (self.mean - self.std) - (o.mean - o.std)

    def __gt__(self: StatisticSelf, o: StatisticSelf) -> bool:
        return self._diff(o) > 0

    def __ge__(self: StatisticSelf, o: StatisticSelf) -> bool:
        return self._diff(o) >= 0

    def __repr__(self) -> str:
        mean = round(self.mean, self.round_digits)
        std = round(self.std, self.round_digits)
        if self.round_digits == 0:
            mean = int(mean)
            std = int(std)
        return f"{mean} +/- {std}"

    def to_dict(self) -> Dict[str, float]:
        return {
            "mean": self.mean,
            "std": self.std,
            "min": self.min,
            "max": self.max,
        }


EpisodesStatsSelf = TypeVar("EpisodesStatsSelf", bound="EpisodesStats")


class EpisodesStats:
    episodes: Sequence[Episode]
    simple: bool
    score: Statistic
    length: Statistic

    def __init__(self, episodes: Sequence[Episode], simple: bool = False) -> None:
        self.episodes = episodes
        self.simple = simple
        self.score = Statistic(np.array([e.score for e in episodes]))
        self.length = Statistic(np.array([e.length for e in episodes]), round_digits=0)

    def __gt__(self: EpisodesStatsSelf, o: EpisodesStatsSelf) -> bool:
        return self.score > o.score

    def __ge__(self: EpisodesStatsSelf, o: EpisodesStatsSelf) -> bool:
        return self.score >= o.score

    def __repr__(self) -> str:
        return (
            f"Score: {self.score} ({round(self.score.mean - self.score.std, 2)}) | "
            f"Length: {self.length}"
        )

    def _asdict(self) -> dict:
        return {
            "n_episodes": len(self.episodes),
            "score": self.score.to_dict(),
            "length": self.length.to_dict(),
        }

    def write_to_tensorboard(
        self, tb_writer: SummaryWriter, main_tag: str, global_step: Optional[int] = None
    ) -> None:
        stats = {"mean": self.score.mean}
        if not self.simple:
            stats.update(
                {
                    "min": self.score.min,
                    "max": self.score.max,
                    "result": self.score.mean - self.score.std,
                    "n_episodes": len(self.episodes),
                }
            )
        tb_writer.add_scalars(
            main_tag,
            stats,
            global_step=global_step,
        )


class EpisodeAccumulator:
    def __init__(self, num_envs: int):
        self._episodes = []
        self.current_episodes = [Episode() for _ in range(num_envs)]

    @property
    def episodes(self) -> List[Episode]:
        return self._episodes

    def step(self, reward: np.ndarray, done: np.ndarray) -> None:
        for idx, current in enumerate(self.current_episodes):
            current.score += reward[idx]
            current.length += 1
            if done[idx]:
                self._episodes.append(current)
                self.on_done(idx, current)
                self.current_episodes[idx] = Episode()

    def __len__(self) -> int:
        return len(self.episodes)

    def on_done(self, ep_idx: int, episode: Episode) -> None:
        pass

    def stats(self) -> EpisodesStats:
        return EpisodesStats(self.episodes)


class RolloutStats(EpisodeAccumulator):
    def __init__(self, num_envs: int, print_n_episodes: int, tb_writer: SummaryWriter):
        super().__init__(num_envs)
        self.print_n_episodes = print_n_episodes
        self.epochs: List[EpisodesStats] = []
        self.tb_writer = tb_writer

    def on_done(self, ep_idx: int, episode: Episode) -> None:
        if (
            self.print_n_episodes >= 0
            and len(self.episodes) % self.print_n_episodes == 0
        ):
            sample = self.episodes[-self.print_n_episodes :]
            epoch = EpisodesStats(sample)
            self.epochs.append(epoch)
            total_steps = np.sum([e.length for e in self.episodes])
            print(
                f"Episode: {len(self.episodes)} | "
                f"{epoch} | "
                f"Total Steps: {total_steps}"
            )
            epoch.write_to_tensorboard(self.tb_writer, "train", global_step=total_steps)
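A minimal sketch (not part of this commit) of the comparison convention above: Statistic, and therefore EpisodesStats, orders runs by mean - std of the scores, so a steadier run can beat one with a higher mean but much higher variance.

from shared.stats import Episode, EpisodesStats

steady = EpisodesStats([Episode(score=s, length=100) for s in (90, 95, 100)])
erratic = EpisodesStats([Episode(score=s, length=100) for s in (40, 100, 160)])
print(steady.score.mean, erratic.score.mean)  # 95.0 100.0
print(steady > erratic)  # True: 95 - 4.08 > 100 - 48.99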
    	
        shared/trajectory.py
    ADDED
    
@@ -0,0 +1,30 @@
import numpy as np
import torch

from dataclasses import dataclass
from typing import List


@dataclass
class Trajectory:
    obs: List[np.ndarray]
    act: List[np.ndarray]
    rew: List[float]
    v: List[float]
    terminated: bool

    def __init__(self) -> None:
        self.obs = []
        self.act = []
        self.rew = []
        self.v = []
        self.terminated = False

    def add(self, obs: np.ndarray, act: np.ndarray, rew: float, v: float):
        self.obs.append(obs)
        self.act.append(act)
        self.rew.append(rew)
        self.v.append(v)

    def __len__(self) -> int:
        return len(self.obs)
    	
        shared/utils.py
    ADDED
    
@@ -0,0 +1,8 @@
import numpy as np


def discounted_cumsum(x: np.ndarray, gamma: float) -> np.ndarray:
    dc = x.copy()
    for i in reversed(range(len(x) - 1)):
        dc[i] += gamma * dc[i + 1]
    return dc
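A worked sketch (not part of this commit): discounted_cumsum accumulates returns-to-go from right to left, so with gamma = 0.5 the rewards [1, 2, 4] become [1 + 0.5*(2 + 0.5*4), 2 + 0.5*4, 4].

import numpy as np

from shared.utils import discounted_cumsum

print(discounted_cumsum(np.array([1.0, 2.0, 4.0]), gamma=0.5))  # [3. 4. 4.]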
    	
train.py
ADDED
@@ -0,0 +1,81 @@
+# Support for PyTorch mps mode (https://pytorch.org/docs/stable/notes/mps.html)
+import os
+
+os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
+
+import itertools
+
+from argparse import Namespace
+from multiprocessing import Pool
+from typing import Any, Dict
+
+from runner.running_utils import base_parser
+from runner.train import train, TrainArgs
+
+
+def args_dict(algo: str, env: str, seed: int, args: Namespace) -> Dict[str, Any]:
+    d = vars(args).copy()
+    d.update(
+        {
+            "algo": algo,
+            "env": env,
+            "seed": seed,
+        }
+    )
+    return d
+
+
+if __name__ == "__main__":
+    parser = base_parser()
+    parser.add_argument(
+        "--wandb-project-name",
+        type=str,
+        default="rl-algo-impls",
+        help="WandB project name to upload training data to. If none, won't upload.",
+    )
+    parser.add_argument(
+        "--wandb-entity",
+        type=str,
+        default=None,
+        help="WandB team of project. None uses default entity",
+    )
+    parser.add_argument(
+        "--wandb-tags", type=str, nargs="*", help="WandB tags to add to run"
+    )
+    parser.add_argument(
+        "--pool-size", type=int, default=1, help="Simultaneous training jobs to run"
+    )
+    parser.add_argument(
+        "--virtual-display",
+        action="store_true",
+        help="Whether to create a virtual display for video rendering",
+    )
+    parser.set_defaults(algo="ppo", env="CartPole-v1", seed=1)
+    args = parser.parse_args()
+    print(args)
+
+    if args.virtual_display:
+        from pyvirtualdisplay import Display
+
+        virtual_display = Display(visible=0, size=(1400, 900))
+        virtual_display.start()
+    delattr(args, "virtual_display")
+
+    # pool_size isn't a TrainArg so must be removed from args
+    pool_size = args.pool_size
+    delattr(args, "pool_size")
+
+    algos = args.algo if isinstance(args.algo, list) else [args.algo]
+    envs = args.env if isinstance(args.env, list) else [args.env]
+    seeds = args.seed if isinstance(args.seed, list) else [args.seed]
+    if all(len(arg) == 1 for arg in [algos, envs, seeds]):
+        train(TrainArgs(**args_dict(algos[0], envs[0], seeds[0], args)))
+    else:
+        # Force a new process for each job to get around wandb not allowing more than one
+        # wandb.tensorboard.patch call per process.
+        with Pool(pool_size, maxtasksperchild=1) as p:
+            train_args = [
+                TrainArgs(**args_dict(algo, env, seed, args))
+                for algo, env, seed in itertools.product(algos, envs, seeds)
+            ]
+            p.map(train, train_args)
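train.py is a thin launcher: it expands --algo, --env, and --seed into their cross product, trains a single combination in-process, and otherwise fans the jobs out to a multiprocessing Pool with maxtasksperchild=1 so every job gets a fresh process (wandb.tensorboard.patch can only be called once per process, as the inline comment notes). Assuming base_parser declares those three flags with nargs so multiple values can be passed, an invocation along the lines of python train.py --algo vpg --env Acrobot-v1 --seed 1 2 3 --pool-size 3 would queue three seeds of the same configuration.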
    	
vpg/policy.py
ADDED
@@ -0,0 +1,119 @@
+import numpy as np
+import torch
+import torch.nn as nn
+
+from gym.spaces import Box
+from pathlib import Path
+from stable_baselines3.common.vec_env.base_vec_env import VecEnv, VecEnvObs
+from typing import NamedTuple, Optional, Sequence, TypeVar
+
+from shared.module import FeatureExtractor
+from shared.policy.actor import (
+    PiForward,
+    Actor,
+    StateDependentNoiseActorHead,
+    actor_head,
+)
+from shared.policy.critic import CriticHead
+from shared.policy.on_policy import Step, clamp_actions
+from shared.policy.policy import ACTIVATION, Policy
+
+PI_FILE_NAME = "pi.pt"
+V_FILE_NAME = "v.pt"
+
+
+class VPGActor(Actor):
+    def __init__(self, feature_extractor: FeatureExtractor, head: Actor) -> None:
+        super().__init__()
+        self.feature_extractor = feature_extractor
+        self.head = head
+
+    def forward(self, obs: torch.Tensor, a: Optional[torch.Tensor] = None) -> PiForward:
+        fe = self.feature_extractor(obs)
+        return self.head(fe, a)
+
+
+class VPGActorCritic(Policy):
+    def __init__(
+        self,
+        env: VecEnv,
+        hidden_sizes: Sequence[int],
+        init_layers_orthogonal: bool = True,
+        activation_fn: str = "tanh",
+        log_std_init: float = -0.5,
+        use_sde: bool = False,
+        full_std: bool = True,
+        squash_output: bool = False,
+        **kwargs,
+    ) -> None:
+        super().__init__(env, **kwargs)
+        activation = ACTIVATION[activation_fn]
+        obs_space = env.observation_space
+        self.action_space = env.action_space
+        self.use_sde = use_sde
+        self.squash_output = squash_output
+
+        pi_feature_extractor = FeatureExtractor(
+            obs_space, activation, init_layers_orthogonal=init_layers_orthogonal
+        )
+        pi_head = actor_head(
+            self.action_space,
+            (pi_feature_extractor.out_dim,) + tuple(hidden_sizes),
+            init_layers_orthogonal,
+            activation,
+            log_std_init=log_std_init,
+            use_sde=use_sde,
+            full_std=full_std,
+            squash_output=squash_output,
+        )
+        self.pi = VPGActor(pi_feature_extractor, pi_head)
+
+        v_feature_extractor = FeatureExtractor(
+            obs_space, activation, init_layers_orthogonal=init_layers_orthogonal
+        )
+        v_head = CriticHead(
+            (v_feature_extractor.out_dim,) + tuple(hidden_sizes),
+            activation=activation,
+            init_layers_orthogonal=init_layers_orthogonal,
+        )
+        self.v = nn.Sequential(v_feature_extractor, v_head)
+
+    def _as_tensor(self, obs: VecEnvObs) -> torch.Tensor:
+        assert isinstance(obs, np.ndarray)
+        o = torch.as_tensor(obs)
+        if self.device is not None:
+            o = o.to(self.device)
+        return o
+
+    def step(self, obs: VecEnvObs) -> Step:
+        o = self._as_tensor(obs)
+        with torch.no_grad():
+            pi, _, _ = self.pi(o)
+            a = pi.sample()
+            logp_a = pi.log_prob(a)
+
+            v = self.v(o)
+
+        a_np = a.cpu().numpy()
+        clamped_a_np = clamp_actions(a_np, self.action_space, self.squash_output)
+        return Step(a_np, v.cpu().numpy(), logp_a.cpu().numpy(), clamped_a_np)
+
+    def act(self, obs: np.ndarray, deterministic: bool = True) -> np.ndarray:
+        if not deterministic:
+            return self.step(obs).clamped_a
+        else:
+            o = self._as_tensor(obs)
+            with torch.no_grad():
+                pi, _, _ = self.pi(o)
+                a = pi.mode
+            return clamp_actions(a.cpu().numpy(), self.action_space, self.squash_output)
+
+    def load(self, path: str) -> None:
+        super().load(path)
+        self.reset_noise()
+
+    def reset_noise(self, batch_size: Optional[int] = None) -> None:
+        if isinstance(self.pi.head, StateDependentNoiseActorHead):
+            self.pi.head.sample_weights(
+                batch_size=batch_size if batch_size else self.env.num_envs
+            )
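A design note on VPGActorCritic: the policy (pi) and value (v) networks get separate FeatureExtractor instances rather than a shared torso, so the value regression cannot disturb the policy features. step() is the rollout-time path, sampling an action, its log-probability, and a value estimate in one no_grad pass and returning both raw and clamped actions, while act() defaults to the distribution's mode for deterministic evaluation. A rough usage sketch, assuming env is an already-constructed stable-baselines3 VecEnv and that the base Policy accepts it with default keyword arguments:

    # Hypothetical rollout sketch; env construction and extra Policy kwargs are assumed.
    policy = VPGActorCritic(env, hidden_sizes=[64, 64])
    obs = env.reset()
    step = policy.step(obs)                       # sampled actions, values, log-probs, clamped actions
    obs, rewards, dones, infos = env.step(step.clamped_a)
    eval_actions = policy.act(obs)                # deterministic (mode) actions for evaluation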