
GAIL: Generative Adversarial Imitation Learning

# Installation and Imports

## Installation

In [None]:

!pip install easypip

In [None]:
from easypip import easyimport
import functools
import time

# Bring the third-party dependencies into scope through easypip
# (presumably easyimport installs a package when it is missing — confirm in the easypip docs)
easyimport("importlib_metadata==4.13.0")
OmegaConf = easyimport("omegaconf").OmegaConf
bbrl_gym = easyimport("bbrl_gym")
bbrl = easyimport("bbrl>=0.1.6")
# Path of the pickle file holding the expert trajectories (opened further down)
expert_path = 'LunarLander-v2.expert.pkl'


### Imports

Below, we import standard python packages, pytorch packages and gym environments.

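OmegaConf turns a plain `params = {...}` dictionary into a configuration object. Thanks to it, it is enough to define a training function that takes a `cfg` argument (here `run_behavioral_cloning(cfg)` and `run_gail(cfg)`) and then to execute the long `params = {...}` cell near the bottom of this colab: the code is run with these parameters without calling an explicit main.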

More precisely, the code is run by calling

`config=OmegaConf.create(params)`

`run_behavioral_cloning(config)` or `run_gail(config)`

at the very bottom of the colab, after starting tensorboard.

[OpenAI gym](https://gym.openai.com/) is a collection of benchmark environments to evaluate RL algorithms.

In [None]:
import os
import copy
import time
import pickle
from pathlib import Path

import torch
import torch.nn as nn
import torch.nn.functional as F
import gym

from bbrl.agents.agent import Agent
from bbrl import get_arguments, get_class, instantiate_class
from bbrl.utils.functionalb import gae

# The workspace is the main class in BBRL, this is where all data is collected and stored
from bbrl.workspace import Workspace

# Agents(agent1,agent2,agent3,...) executes the different agents the one after the other
# TemporalAgent(agent) executes an agent over multiple timesteps in the workspace, 
# or until a given condition is reached
from bbrl.agents import Agents, TemporalAgent

# AutoResetGymAgent is an agent able to execute a batch of gym environments
# with auto-resetting. These agents produce multiple variables in the workspace: 
# ’env/env_obs’, ’env/reward’, ’env/timestep’, ’env/done’, ’env/initial_state’, ’env/cumulated_reward’, 
# ... When called at timestep t=0, then the environments are automatically reset. 
# At timestep t>0, these agents will read the ’action’ variable in the workspace at time t − 1
from bbrl.agents.gymb import AutoResetGymAgent, NoAutoResetGymAgent

# Allow to display the behavior of an agent
from bbrl.visu.play import load_agent, play


We first load the expert trajectories for LunarLander-v2

In [None]:

# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load expert files that come from a trusted source.
with open(expert_path, 'rb') as handle:
    # expert_data is a dictionary:
    #   states => states
    #   actions => long tensor
    expert_data = pickle.load(handle)


## Definition of agents
### Functions to build networks
We use the same utility functions to build neural networks as before.

In [None]:
def build_backbone(sizes, activation):
    """Return a flat list alternating ``nn.Linear`` layers and ``activation``.

    One linear layer is created for every consecutive pair of entries in
    ``sizes``; each of them (including the last) is followed by the very same
    ``activation`` object.
    """
    stack = []
    for n_in, n_out in zip(sizes[:-1], sizes[1:]):
        stack.extend((nn.Linear(n_in, n_out), activation))
    return stack


def build_mlp(sizes, activation, output_activation=nn.Identity()):
    """Build a multi-layer perceptron as an ``nn.Sequential``.

    A linear layer is created for every consecutive pair of entries in
    ``sizes``. Hidden layers are followed by ``activation``; the final layer is
    followed by ``output_activation`` (identity by default).
    """
    last_index = len(sizes) - 2
    modules = []
    for index, (n_in, n_out) in enumerate(zip(sizes[:-1], sizes[1:])):
        modules.append(nn.Linear(n_in, n_out))
        modules.append(output_activation if index >= last_index else activation)
    return nn.Sequential(*modules)

We also implement a base agent for PPO actors

In [None]:
class BaseActor(Agent):
    """Base class for actors, adding a helper to clone another agent's parameters."""

    def copy_parameters(self, other):
        """Overwrite this agent's parameters in place with those of ``other``.

        Parameters are paired in the order yielded by ``parameters()``.
        """
        for target, source in zip(self.parameters(), other.parameters()):
            target.data.copy_(source)

### The DiscreteActor

The DiscreteActor was already used in A2C to deal with discrete actions. Here, the ```forward()``` function takes an additional ```predict_proba``` argument: when it is set, no new action is chosen and the actor only computes the log-probability of the action already stored in the workspace (written to ```logprob_predict```). The code is as follows.

In [None]:

class DiscreteActor(BaseActor):
    """Categorical policy over a discrete action set, backed by an MLP.

    The MLP maps an observation to one score (logit) per action; action
    probabilities are the softmax of these scores over the last dimension.
    """

    def __init__(self, state_dim, hidden_size, n_actions):
        super().__init__()
        self.model = build_mlp(
            [state_dim] + list(hidden_size) + [n_actions], activation=nn.Tanh()
        )

    def logits(self, obs):
        """Return the raw action scores produced by the network for ``obs``."""
        return self.model(obs)

    def dist(self, obs):
        """Return the ``Categorical`` action distribution for ``obs``."""
        scores = self.model(obs)
        probs = torch.softmax(scores, dim=-1)
        return torch.distributions.Categorical(probs)

    def forward(self, t, *, stochastic=True, predict_proba=False, compute_entropy=False, **kwargs):
        """
        Compute the action given either a time step (looking into the workspace)
        or an observation (in kwargs)

        With ``predict_proba=True`` no action is chosen: the action already
        stored at ``("action", t)`` is read and its log-probability is written
        to ``("logprob_predict", t)``. Otherwise an action is sampled
        (``stochastic=True``) or taken greedily, and ``("action", t)`` and
        ``("action_logprobs", t)`` are written. With ``compute_entropy=True``
        the entropy of the distribution is written to ``("entropy", t)``.
        """
        if "observation" in kwargs:
            observation = kwargs["observation"]
        else:
            observation = self.get(("env/env_obs", t))
        scores = self.model(observation)
        probs = torch.softmax(scores, dim=-1)

        if predict_proba:
            action = self.get(("action", t))
            log_prob = probs[torch.arange(probs.size()[0]), action].log()
            self.set(("logprob_predict", t), log_prob)
        else:
            if stochastic:
                action = torch.distributions.Categorical(probs).sample()
            else:
                # Greedy action: reduce over the action (last) dimension
                action = scores.argmax(-1)

            log_probs = probs[torch.arange(probs.size()[0]), action].log()

            self.set(("action", t), action)
            self.set(("action_logprobs", t), log_probs)

        if compute_entropy:
            entropy = torch.distributions.Categorical(probs).entropy()
            self.set(("entropy", t), entropy)

    def predict_action(self, obs, stochastic):
        """Return an action for ``obs`` without touching the workspace.

        The action is sampled from the policy when ``stochastic`` is true and
        is the highest-scoring action otherwise.
        """
        scores = self.model(obs)

        if stochastic:
            probs = torch.softmax(scores, dim=-1)
            action = torch.distributions.Categorical(probs).sample()
        else:
            # Reduce over the action (last) dimension, as softmax does above.
            # argmax(0) only gave the right answer for an unbatched observation;
            # on a batch it reduced over the batch axis instead.
            action = scores.argmax(-1)
        return action


### Choosing a specific gym environment
First, we need to make our gym environment. As usual, this is implemented with the simple function below.

In [None]:
def make_gym_env(env_name):
    """Create the gym environment registered under ``env_name``."""
    env = gym.make(env_name)
    return env


## Training/Testing environment and agents

The code below corresponds to the PPO one: it defines the different classes and functions needed to run
PPO and to set up the train/test environments.

In [None]:
def get_env_agents(cfg):
    """Build the training and evaluation environment agents described by ``cfg``.

    Returns a pair ``(train_env_agent, eval_env_agent)``: an auto-resetting
    agent over ``cfg.algorithm.n_envs`` environments and a non-resetting agent
    over ``cfg.algorithm.nb_evals`` environments, both using
    ``cfg.algorithm.seed``.
    """
    algo = cfg.algorithm
    train_env_agent = AutoResetGymAgent(
        get_class(cfg.gym_env), get_arguments(cfg.gym_env), algo.n_envs, algo.seed
    )
    eval_env_agent = NoAutoResetGymAgent(
        get_class(cfg.gym_env), get_arguments(cfg.gym_env), algo.nb_evals, algo.seed
    )
    return train_env_agent, eval_env_agent


class DAgent(Agent):
    """Discriminator agent: scores (observation, action) pairs with an MLP."""

    def __init__(self, state_dim, act_dim, hidden_layers):
        super().__init__()
        self.is_q_function = False
        self.act_dim = act_dim
        # Input is the observation concatenated with a one-hot action
        layer_sizes = [state_dim + act_dim] + list(hidden_layers) + [1]
        self.model = build_mlp(layer_sizes, activation=nn.ReLU())

    def forward(self, t, **kwargs):
        """Write the discriminator output for time step ``t`` to ``("disc", t)``."""
        obs = self.get(("env/env_obs", t))
        act = self.get(("action", t))
        act_one_hot = nn.functional.one_hot(act, num_classes=self.act_dim)
        features = torch.cat((obs, act_one_hot), dim=1)
        self.set(("disc", t), self.model(features).squeeze(-1))

class VAgent(Agent):
    """State-value critic: maps an observation to a single scalar with an MLP."""

    def __init__(self, state_dim, hidden_layers):
        super().__init__()
        self.is_q_function = False
        layer_sizes = [state_dim] + list(hidden_layers) + [1]
        self.model = build_mlp(layer_sizes, activation=nn.ReLU())

    def forward(self, t, **kwargs):
        """Write the value of the observation at step ``t`` to ``("v_value", t)``."""
        self.set(("v_value", t), self.values(self.get(("env/env_obs", t))))

    def values(self, observation):
        """Return the network output for ``observation`` with the last axis squeezed."""
        return torch.squeeze(self.model(observation), -1)

class KLAgent(Agent):
    """Agent computing the KL divergence between the action distributions of two models."""

    def __init__(self, model_1, model_2):
        super().__init__()
        self.model_1 = model_1
        self.model_2 = model_2

    def forward(self, t, **kwargs):
        """Write ``KL(model_1 || model_2)`` at the observation of step ``t`` to ``("kl", t)``."""
        observation = self.get(("env/env_obs", t))
        divergence = torch.distributions.kl.kl_divergence(
            self.model_1.dist(observation), self.model_2.dist(observation)
        )
        self.set(("kl", t), divergence)

# Create the PPO Agent
def create_ppo_agent(cfg, train_env_agent, eval_env_agent, needs_kl=None):
    """Create every agent needed by the PPO-based training loop.

    Returns, in this order: the policy, the temporal training agent, the
    temporal evaluation agent, the temporal critic, a deep copy of the policy,
    a deep copy of the critic, the temporal KL agent (``None`` unless
    ``needs_kl`` is truthy) and the discriminator agent.
    """
    arch = cfg.algorithm.architecture
    obs_size, act_size = train_env_agent.get_obs_and_actions_sizes()

    # Networks are built in the order policy -> critic -> discriminator
    action_agent = DiscreteActor(obs_size, arch.actor_hidden_size, act_size)
    critic_agent = TemporalAgent(VAgent(obs_size, arch.critic_hidden_size))
    # The agent for discriminating
    disc_agent = DAgent(obs_size, act_size, arch.critic_hidden_size)

    # The same policy instance acts in both the training and evaluation environments
    train_agent = TemporalAgent(Agents(train_env_agent, action_agent))
    eval_agent = TemporalAgent(Agents(eval_env_agent, action_agent))
    train_agent.seed(cfg.algorithm.seed)

    # Snapshots of the current policy and critic
    old_policy = copy.deepcopy(action_agent)
    old_critic_agent = copy.deepcopy(critic_agent)

    kl_agent = TemporalAgent(KLAgent(old_policy, action_agent)) if needs_kl else None

    return (
        action_agent,
        train_agent,
        eval_agent,
        critic_agent,
        old_policy,
        old_critic_agent,
        kl_agent,
        disc_agent,
    )

class Logger():
    """Thin wrapper around the logger class configured in ``cfg.logger``."""

    def __init__(self, cfg, variant, env_name):
        settings = dict(cfg.logger)
        # Each run gets its own sub-directory, made unique with the current timestamp
        run_stamp = str(time.time())
        settings["log_dir"] = f'{settings["log_dir"]}/{env_name}/{variant}-{run_stamp}'
        self.logger = instantiate_class(settings)

    def add_log(self, log_string, loss, epoch):
        """Record the scalar ``loss.item()`` under ``log_string`` at step ``epoch``."""
        scalar = loss.item()
        self.logger.add_scalar(log_string, scalar, epoch)

    # Log losses
    def log_losses(self, epoch, critic_loss, entropy_loss, actor_loss):
        """Record the critic, entropy and actor losses (in that order) at step ``epoch``."""
        named_losses = (
            ("critic_loss", critic_loss),
            ("entropy_loss", entropy_loss),
            ("actor_loss", actor_loss),
        )
        for tag, value in named_losses:
            self.add_log(tag, value, epoch)

def setup_optimizer(cfg, *agents):
    """Instantiate the optimizer described by ``cfg.optimizer`` over the parameters of ``agents``."""
    optimizer_kwargs = get_arguments(cfg.optimizer)
    # Wrapping the agents in a Sequential is a simple way to chain their parameters
    trainable = nn.Sequential(*agents).parameters()
    optimizer_cls = get_class(cfg.optimizer)
    return optimizer_cls(trainable, **optimizer_kwargs)



def compute_advantage_loss(cfg, reward, must_bootstrap, v_value):
    """Return ``(critic_loss, advantage)``.

    The advantage comes from ``gae`` called with ``cfg.algorithm.discount_factor``
    and ``cfg.algorithm.gae``; the critic loss is the mean of the squared advantage.
    """
    algo = cfg.algorithm
    # Compute temporal difference with GAE
    advantage = gae(v_value, reward, must_bootstrap, algo.discount_factor, algo.gae)
    # Compute critic loss
    critic_loss = advantage.pow(2).mean()
    return critic_loss, advantage

def compute_clip_agent_loss(cfg, advantage, ratio, kl):
    """Computes the PPO CLIP loss

    Returns the mean over elements of
    ``min(advantage * ratio, advantage * clip(ratio, 1 - eps, 1 + eps))``
    with ``eps = cfg.clip_range``. The ``kl`` argument is accepted but not used.
    """
    eps = cfg.clip_range
    clipped_ratio = ratio.clamp(1 - eps, 1 + eps)
    unclipped_term = advantage * ratio
    clipped_term = advantage * clipped_ratio
    return torch.minimum(unclipped_term, clipped_term).mean()


## Behavioral Cloning

In [None]:

def run_behavioral_cloning(cfg):
    """Train a ``DiscreteActor`` by behavioral cloning and evaluate it periodically.

    The policy is evaluated on ``cfg.algorithm.nb_evals`` environments every time
    more than ``cfg.algorithm.eval_interval`` steps have been counted. When
    ``cfg.save_best`` is set, each new best policy is saved to
    ``./bc_agent/<env_name>/bc_<mean reward>.agt``.

    The training step itself is an exercise placeholder and must be completed
    before this function can run.
    """
    # 1)  Build the  logger
    logger = Logger(cfg, "bc", cfg.gym_env.env_name)
    best_reward = -float('inf')

    # 2) Create the environment agent
    eval_env_agent = NoAutoResetGymAgent(
        get_class(cfg.gym_env),
        get_arguments(cfg.gym_env),
        cfg.algorithm.nb_evals,
        cfg.algorithm.seed,
    )

    obs_size, act_size = eval_env_agent.get_obs_and_actions_sizes()

    # Creates our policy
    policy = DiscreteActor(
        obs_size, cfg.algorithm.architecture.actor_hidden_size, act_size
    )

    eval_agent = TemporalAgent(Agents(eval_env_agent, policy))

    # Configure the optimizer
    optimizer_args = get_arguments(cfg.optimizer)
    optimizer = get_class(cfg.optimizer)(policy.parameters(), **optimizer_args)

    nb_steps = 0
    tmp_steps = 0

    ce_loss = nn.CrossEntropyLoss()

    # Training loop
    for epoch in range(cfg.algorithm.max_epochs):
        # To be completed...
        assert False, 'Code non implémenté'

        # One epoch counts as one pass over the expert states
        nb_steps += len(expert_data["states"])

        # Evaluate if enough steps have been performed
        if nb_steps - tmp_steps > cfg.algorithm.eval_interval:
            tmp_steps = nb_steps
            eval_workspace = Workspace()  # Used for evaluation
            eval_agent(
                eval_workspace,
                t=0,
                stop_variable="env/done",
                stochastic=True,
                predict_proba=False,
            )
            rewards = eval_workspace["env/cumulated_reward"][-1]
            mean = rewards.mean()
            logger.add_log("reward/mean", mean, nb_steps)
            logger.add_log("reward/max", rewards.max(), nb_steps)
            logger.add_log("reward/min", rewards.min(), nb_steps)
            logger.add_log("reward/std", rewards.std(), nb_steps)
            print(f"nb_steps: {nb_steps}, reward: {mean}")
            if cfg.save_best and mean > best_reward:
                best_reward = mean
                directory = f"./bc_agent/{cfg.gym_env.env_name}"
                os.makedirs(directory, exist_ok=True)
                # Join with a path separator so the file is created INSIDE the
                # directory (plain concatenation glued "bc_..." onto its name)
                filename = os.path.join(directory, "bc_" + str(mean.item()) + ".agt")
                policy.save_model(filename)

## Definition of the parameters

The logger is defined as `bbrl.utils.logger.TFLogger` so as to use a tensorboard visualisation.

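Behavioral cloning only uses the logger, evaluation, architecture and optimizer parameters defined below. The two parameters that are specific to PPO, "clip_range" and "clip_range_vf", which are used to clip the actor loss and the critic loss respectively, only appear in the GAIL parameters further down.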

In [None]:

# Configuration for the behavioral-cloning run (converted to an OmegaConf config below)
params={
  # Save the policy whenever the mean evaluation reward improves
  "save_best": True,
  # NOTE(review): not read by the code visible in this notebook
  "plot_policy": True,

  # Arguments used to instantiate the logger class named by "classname"
  "logger":{
    "classname": "bbrl.utils.logger.TFLogger",
    "log_dir": f"{os.getcwd()}/tblogs/gail",
    "cache_size": 10000,
    "every_n_seconds": 10,
    "verbose": False,    
    },

  "algorithm":{
    # Seed given to the environment agent and to torch.manual_seed
    "seed": 4,
    # Number of environments handed to the evaluation agent
    "nb_evals": 10,
    # An evaluation runs once more than this many steps were counted since the last one
    "eval_interval": 10000,
    # Number of iterations of the training loop
    "max_epochs": 5000,
    "architecture":{
      # Hidden layer sizes of the policy network
      "actor_hidden_size": [64, 32],
    },
  },
  # Environment factory (classname) and its arguments
  "gym_env":{
    "classname": "__main__.make_gym_env",
    "env_name": "LunarLander-v2",
    },
  # Optimizer class and its keyword arguments
  "optimizer":{
    "classname": "torch.optim.Adam",
    "lr": 0.001,
  }
}

### Launching tensorboard to visualize the results

In [None]:
%load_ext tensorboard
%tensorboard --logdir ./tmp

In [None]:
# Turn the plain dict into an OmegaConf config (fields are then read as attributes)
config=OmegaConf.create(params)
# Seed torch's random number generator with the configured seed
torch.manual_seed(config.algorithm.seed)

In [None]:
# Launch behavioral cloning with the configuration built above
run_behavioral_cloning(config)

Now we can watch our agent...

In [None]:
# Load a saved agent from bc_agent/<env_name> using the "bc_" file prefix
# (presumably the best-scoring file is picked — confirm against bbrl.visu.play.load_agent)
agent = load_agent(Path("bc_agent") / config.gym_env.env_name, "bc_")
# Run the loaded agent in a fresh environment
play(make_gym_env(config.gym_env.env_name), agent)



## Main training loop

In [None]:
def run_gail(cfg, variant="gail", needs_kl=False):
    """Run the GAIL training loop: discriminator training followed by a PPO update.

    At each epoch the policy is run in the training environments, the
    discriminator is trained (section left to complete), then the policy and
    critic are updated with the PPO clipped loss. The policy is evaluated every
    time more than ``cfg.algorithm.eval_interval`` steps have been collected and,
    when ``cfg.save_best`` is set, each new best policy is saved under
    ``./gail_agent/<env_name>/<variant>/``.

    Args:
        cfg: configuration object (see the ``params`` dictionary of the notebook)
        variant: name used for the log sub-directory and the save directory
        needs_kl: when true, a KL agent is created and run on each sampled batch

    NOTE(review): several sections are exercise placeholders (``assert False``);
    the function cannot run until they are completed.
    """
    # 1)  Build the  logger
    logger = Logger(cfg, variant, cfg.gym_env.env_name)
    best_reward = -10e9

    # 2) Create the environment agent
    train_env_agent = AutoResetGymAgent(
        get_class(cfg.gym_env),
        get_arguments(cfg.gym_env),
        cfg.algorithm.n_envs,
        cfg.algorithm.seed,
    )
    
    eval_env_agent = NoAutoResetGymAgent(
        get_class(cfg.gym_env),
        get_arguments(cfg.gym_env),
        cfg.algorithm.nb_evals,
        cfg.algorithm.seed,
    )

    (
        policy,
        train_agent,
        eval_agent,
        critic_agent,
        old_policy,
        old_critic_agent,
        kl_agent,
        disc_agent
    ) = create_ppo_agent(cfg, train_env_agent, eval_env_agent, needs_kl=needs_kl)
    
    # Temporal wrappers used to re-run the current / old policy on stored data
    action_agent = TemporalAgent(policy)
    old_train_agent = TemporalAgent(old_policy)
    train_workspace = Workspace()

    # Configure the optimizer
    # (one optimizer for the discriminator, one shared by the policy and the critic)
    disc_optimizer = setup_optimizer(cfg, disc_agent)
    optimizer = setup_optimizer(cfg, train_agent, critic_agent)
    nb_steps = 0
    tmp_steps = 0

    # To be completed...
    assert False, 'Code non implémenté'

    # Training loop
    for epoch in range(cfg.algorithm.max_epochs):
        # Execute the agent in the workspace
        
        # Handles continuation
        # (after the first epoch, the last step of the previous rollout is kept as step 0)
        delta_t = 0
        if epoch > 0:
            train_workspace.zero_grad()
            delta_t = 1
            train_workspace.copy_n_last_steps(delta_t)

        # Run the train/old_train agents
        train_agent(
            train_workspace,
            t=delta_t,
            n_steps=cfg.algorithm.n_steps - delta_t,
            stochastic=True,
            predict_proba=False,
            compute_entropy=False
        )
        old_train_agent(
            train_workspace,
            t=delta_t,
            n_steps=cfg.algorithm.n_steps - delta_t,
            # Just computes the probability
            predict_proba=True,
        )

        # Compute the critic value over the whole workspace
        critic_agent(train_workspace, n_steps=cfg.algorithm.n_steps)

        transition_workspace = train_workspace.get_transitions()

        # We ignore the rewards from the environment
        done, truncated, action, action_logp, v_value = transition_workspace[
            "env/done",
            "env/truncated",
            "action",
            "action_logprobs",
            "v_value",
        ]
        # Count the transitions collected at this epoch
        nb_steps += action[0].shape[0]

        # ---- Discriminator training

        # To be completed...
        assert False, 'Code non implémenté'

        # To be completed...
        assert False, 'Code non implémenté'

        # ---- PPO training     

        # Bootstrap the value unless the episode really ended (done and not truncated)
        must_bootstrap = torch.logical_or(~done[1], truncated[1])

        with torch.no_grad():
            old_critic_agent(train_workspace, n_steps=cfg.algorithm.n_steps)
            
        old_action_logp = transition_workspace["logprob_predict"].detach()
        old_v_value = transition_workspace["v_value"]
        if cfg.algorithm.clip_range_vf > 0:
            # Clip the difference between old and new values
            # NOTE: this depends on the reward scaling
            v_value = old_v_value + torch.clamp(
                v_value - old_v_value,
                -cfg.algorithm.clip_range_vf,
                cfg.algorithm.clip_range_vf,
            )
            
        # NOTE(review): `reward` is not defined anywhere in the visible code;
        # presumably it must be produced by the discriminator section to complete above.
        critic_loss, advantage = compute_advantage_loss(
            cfg, reward, must_bootstrap, v_value
        )
        
        # We store the advantage into the transition_workspace
        advantage = advantage.detach().squeeze(0)
        transition_workspace.set("advantage", 0, advantage)
        transition_workspace.set("advantage", 1, torch.zeros_like(advantage))
        transition_workspace.set_full("old_action_logprobs", transition_workspace["logprob_predict"].detach())
        transition_workspace.clear("logprob_predict")
    
        for opt_epoch in range(cfg.algorithm.opt_epochs):
            # Optimize on a sampled minibatch, or on all transitions when minibatch_size <= 0
            if cfg.algorithm.minibatch_size > 0:
                sample_workspace = transition_workspace.sample_subworkspace(1, cfg.algorithm.minibatch_size, 2)
            else:
                sample_workspace = transition_workspace
                                 
            if opt_epoch > 0:
                critic_loss = 0. # We don't want to optimize the critic after the first mini-epoch

            # Recompute, with the current policy, the log-probability of the stored actions
            action_agent(sample_workspace, t=0, n_steps=1, compute_entropy=True, predict_proba=True)

            advantage, action_logp, old_action_logp, entropy = sample_workspace[
                "advantage",
                "logprob_predict",
                "old_action_logprobs",
                "entropy"
            ]
            advantage = advantage[0]
            # Probability ratio between the current and the old policy
            act_diff = action_logp[0] - old_action_logp[0]
            ratios = act_diff.exp()

            kl = None
            if kl_agent:
                kl_agent(sample_workspace, t=0, n_steps=1)
                kl = sample_workspace["kl"][0]

            actor_loss = compute_clip_agent_loss(
                cfg.algorithm, advantage, ratios, kl
            )
                            
            # Entropy loss favor exploration
            entropy_loss = torch.mean(entropy[0])

            # Store the losses for tensorboard display
            if opt_epoch == 0:
                # Just for the first epoch
                logger.log_losses(nb_steps, critic_loss, entropy_loss, actor_loss)

            # The actor and entropy terms are maximized, hence the minus signs
            loss = (
                cfg.algorithm.critic_coef * critic_loss
                - cfg.algorithm.actor_coef * actor_loss
                - cfg.algorithm.entropy_coef * entropy_loss
            )
            

            # Snapshot the current policy and critic before the gradient step
            old_policy.copy_parameters(policy)
            old_critic_agent = copy.deepcopy(critic_agent)


            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(
                critic_agent.parameters(), cfg.algorithm.max_grad_norm
            )
            torch.nn.utils.clip_grad_norm_(
                train_agent.parameters(), cfg.algorithm.max_grad_norm
            )
            optimizer.step() 

        # Evaluate if enough steps have been performed
        if nb_steps - tmp_steps > cfg.algorithm.eval_interval:
            tmp_steps = nb_steps
            eval_workspace = Workspace()  # Used for evaluation
            eval_agent(
                eval_workspace,
                t=0,
                stop_variable="env/done",
                stochastic=True,
                predict_proba=False,
            )
            rewards = eval_workspace["env/cumulated_reward"][-1]
            mean = rewards.mean()
            logger.add_log("reward/mean", mean, nb_steps)
            logger.add_log("reward/max", rewards.max(), nb_steps)
            logger.add_log("reward/min", rewards.min(), nb_steps)
            logger.add_log("reward/std", rewards.std(), nb_steps)
            print(f"nb_steps: {nb_steps}, reward: {mean}")
            # Keep a copy of the policy each time the mean evaluation reward improves
            if cfg.save_best and mean > best_reward:
                best_reward = mean
                directory = f"./gail_agent/{cfg.gym_env.env_name}/{variant}/"
                if not os.path.exists(directory):
                    os.makedirs(directory)
                filename = directory + "gail_" + str(mean.item()) + ".agt"
                policy.save_model(filename)

## Definition of the parameters

The logger is defined as `bbrl.utils.logger.TFLogger` so as to use a tensorboard visualisation.

The two parameters that are specific to PPO are "clip_range" and "clip_range_vf", which are used to clip the actor loss and the critic loss respectively.

In [None]:

# Configuration for the GAIL run (converted to an OmegaConf config below)
params={
  # Save the policy whenever the mean evaluation reward improves
  "save_best": True,
  # NOTE(review): not read by the code visible in this notebook
  "plot_policy": True,

  # Arguments used to instantiate the logger class named by "classname"
  "logger":{
    "classname": "bbrl.utils.logger.TFLogger",
    "log_dir": f"{os.getcwd()}/tblogs/ppo",
    "cache_size": 10000,
    "every_n_seconds": 10,
    "verbose": False,    
    },

  "algorithm":{
    # Seed given to the environment agents, the training agent and torch.manual_seed
    "seed": 4,
    # Number of environments handed to the training agent
    "n_envs": 8,
    # Maximum gradient norm used when clipping critic and policy gradients
    "max_grad_norm": 0.5,
    # Number of environments handed to the evaluation agent
    "nb_evals":10,
    # Number of time steps held in the training workspace at each epoch
    "n_steps": 20,
    # An evaluation runs once more than this many steps were collected since the last one
    "eval_interval": 1000,
    # Number of iterations of the training loop
    "max_epochs": 3000,
    # Discount factor passed to gae()
    "discount_factor": 0.95,
    # Weight of the entropy term in the total loss
    "entropy_coef": 2.55e-5,
    # NOTE(review): not read by the code visible in this notebook
    "beta_kl": 1,
    # Weight of the critic loss in the total loss
    "critic_coef": 0.6,
    # Weight of the actor loss in the total loss
    "actor_coef": 1.0,
    # GAE coefficient passed to gae()
    "gae": 0.9,
    # Clipping range of the probability ratio in the PPO actor loss
    "clip_range": 0.2,
    # Clipping range of the value update; clipping is skipped unless > 0
    "clip_range_vf": 0,
    # Number of optimization passes per collected rollout
    "opt_epochs": 1,
    # Minibatch size; when <= 0 all transitions are used at once
    "minibatch_size": 0,
    "architecture":{
      # Hidden layer sizes of the policy network
      "actor_hidden_size": [25, 36],
      # Hidden layer sizes of the critic and of the discriminator
      "critic_hidden_size": [24, 36],
    },
  },
  # Environment factory (classname) and its arguments
  "gym_env":{
    "classname": "__main__.make_gym_env",
    "env_name": "LunarLander-v2",
    },
  # Optimizer class and its keyword arguments
  "optimizer":{
    "classname": "torch.optim.Adam",
    "lr": 0.001,
  }
}

### Launching tensorboard to visualize the results

In [None]:
%load_ext tensorboard
%tensorboard --logdir ./tmp

In [None]:
# Turn the plain dict into an OmegaConf config (fields are then read as attributes)
config=OmegaConf.create(params)
# Seed torch's random number generator with the configured seed
torch.manual_seed(config.algorithm.seed)

In [None]:
# Launch GAIL training with the configuration built above (default variant "gail")
run_gail(config)

Now we can watch our agent...

In [None]:
# Load a saved agent from gail_agent/<env_name> using the "gail_" file prefix.
# NOTE(review): run_gail saves under gail_agent/<env_name>/<variant>/ — confirm that
# load_agent also looks into sub-directories, otherwise add the variant to this path.
agent = load_agent(Path("gail_agent") / config.gym_env.env_name, "gail_")
# Run the loaded agent in a fresh environment
play(make_gym_env(config.gym_env.env_name), agent)