# ppo TF2: Proximal Policy Optimization (PPO) on CartPole-v1 with TensorFlow 2.
import tensorflow as tf
import gym
import numpy as np
import scipy.signal
def mlp(ob_space, hidden_sizes=(32,), activation=tf.tanh, output_activation=None):
    model = tf.keras.Sequential()
    for h in hidden_sizes[:-1]:
        model.add(tf.keras.layers.Dense(units=h, activation=activation))
    model.add(tf.keras.layers.Dense(units=hidden_sizes[-1], activation=output_activation))
    model.build(input_shape=(None,) + ob_space.shape)
    return model
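# Note: for CartPole-v1 (used below), observations have shape (4,) and there are
# 2 discrete actions, so the actor built with hidden_sizes=[64, 64, 2] maps a
# (batch, 4) input to (batch, 2) logits, and the critic built with
# hidden_sizes=[64, 64, 1] maps the same input to a (batch, 1) value estimate.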
class MlpCategoricalActorCritic(tf.keras.Model):
    def __init__(self, ob_space, ac_space, hidden_sizes=(64, 64), activation=tf.keras.activations.tanh, output_activation=None):
        super(MlpCategoricalActorCritic, self).__init__()
        self.act_dim = ac_space.n
        with tf.name_scope('pi'):
            self.actor_mlp = mlp(ob_space=ob_space, hidden_sizes=list(hidden_sizes)+[self.act_dim], activation=activation)
        with tf.name_scope('v'):
            self.critic_mlp = mlp(ob_space=ob_space, hidden_sizes=list(hidden_sizes)+[1], activation=activation)

    @tf.function
    def get_pi_logpi_vf(self, observations):
        logits = self.actor_mlp(observations)
        logp_all = tf.nn.log_softmax(logits)
        pi = tf.squeeze(tf.random.categorical(logits, num_samples=1, seed=0), axis=1)
        logp_pi = tf.reduce_sum(tf.one_hot(pi, depth=self.act_dim) * logp_all, axis=1)
        vf = self.critic_mlp(observations)
        return pi, logp_pi, vf

    @tf.function
    def get_logp(self, observations, actions):
        logits = self.actor_mlp(observations)
        logp_all = tf.nn.log_softmax(logits)
        return tf.reduce_sum(tf.one_hot(actions, depth=self.act_dim) * logp_all, axis=1)

    @tf.function
    def get_v(self, observations):
        return tf.squeeze(self.critic_mlp(observations), axis=1)
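# Minimal usage sketch (mirrors how ppo() below drives the model):
#   env = gym.make('CartPole-v1')
#   ac = MlpCategoricalActorCritic(env.observation_space, env.action_space)
#   obs = tf.constant(env.reset().reshape(1, -1))
#   action, logp, value = ac.get_pi_logpi_vf(obs)  # sampled action, its log-prob, V(s)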
def discount_cumsum(x, discount):
    return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1]
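# discount_cumsum returns, for each index t, sum_{k >= t} discount**(k - t) * x[k].
# Worked example (easy to verify by hand):
#   discount_cumsum(np.array([1., 1., 1.]), 0.5) -> [1.75, 1.5, 1.0]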
def combined_shape(length, shape=None):
    if shape is None:
        return (length,)
    return (length, shape) if np.isscalar(shape) else (length, *shape)
class PPOBuffer:
    def __init__(self, ob_space, ac_space, size, gamma=0.99, lam=0.95):
        self.obs_buf = np.zeros(combined_shape(size, ob_space.shape), dtype=ob_space.dtype)
        self.act_buf = np.zeros(combined_shape(size, ac_space.shape), dtype=ac_space.dtype)
        self.adv_buf = np.zeros(size, dtype=np.float32)
        self.rew_buf = np.zeros(size, dtype=np.float32)
        self.ret_buf = np.zeros(size, dtype=np.float32)
        self.val_buf = np.zeros(size, dtype=np.float32)
        self.logp_buf = np.zeros(size, dtype=np.float32)
        self.gamma, self.lam = gamma, lam
        self.ptr, self.path_start_idx, self.max_size = 0, 0, size

    def store(self, obs, act, rew, val, logp):
        assert self.ptr < self.max_size  # buffer has to have room so you can store
        self.obs_buf[self.ptr] = obs
        self.act_buf[self.ptr] = act
        self.rew_buf[self.ptr] = rew
        self.val_buf[self.ptr] = val
        self.logp_buf[self.ptr] = logp
        self.ptr += 1

    def finish_path(self, last_val=0):
        path_slice = slice(self.path_start_idx, self.ptr)
        rews = np.append(self.rew_buf[path_slice], last_val)
        vals = np.append(self.val_buf[path_slice], last_val)
        # the next two lines implement GAE-Lambda advantage calculation
        deltas = rews[:-1] + self.gamma * vals[1:] - vals[:-1]
        self.adv_buf[path_slice] = discount_cumsum(deltas, self.gamma * self.lam)
        # the next line computes rewards-to-go, to be targets for the value function
        self.ret_buf[path_slice] = discount_cumsum(rews, self.gamma)[:-1]
        self.path_start_idx = self.ptr

    def get(self):
        assert self.ptr == self.max_size  # buffer has to be full before you can get
        self.ptr, self.path_start_idx = 0, 0
        # the next lines implement the advantage normalization trick
        adv_mean = np.mean(self.adv_buf)
        adv_std = np.std(self.adv_buf)
        self.adv_buf = (self.adv_buf - adv_mean) / adv_std
        return [self.obs_buf, self.act_buf, self.adv_buf,
                self.ret_buf, self.logp_buf]
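# The buffer's advantage estimates follow GAE(gamma, lambda):
#   delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
#   A_t     = sum_{l >= 0} (gamma * lambda)**l * delta_{t+l}
# and ret_buf holds the rewards-to-go R_t = sum_{l >= 0} gamma**l * r_{t+l},
# which serve as regression targets for the value function.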
def ppo(seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4,
        vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01):
    tf.random.set_seed(seed)
    np.random.seed(seed)
    env = gym.make('CartPole-v1')
    ob_space = env.observation_space
    ac_space = env.action_space
    obs_dim = ob_space.shape
    act_dim = ac_space.shape
    model = MlpCategoricalActorCritic(ob_space, ac_space)
    # Optimizers
    opt_pi = tf.keras.optimizers.Adam(learning_rate=pi_lr)
    opt_v = tf.keras.optimizers.Adam(learning_rate=vf_lr)
    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch)
    buf = PPOBuffer(ob_space, ac_space, local_steps_per_epoch, gamma, lam)
    # Trainable weights for the actor and critic
    actor_weights = model.actor_mlp.trainable_weights
    critic_weights = model.critic_mlp.trainable_weights
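    # The update below minimizes the PPO clipped surrogate objective,
    #   L_pi = -E[ min(ratio * A, clip(ratio, 1 - eps, 1 + eps) * A) ],  ratio = pi(a|s) / pi_old(a|s)
    # (the tf.where form of min_adv is equivalent inside the min), stops early once the
    # approximate KL divergence E[logp_old - logp] exceeds 1.5 * target_kl, and fits the
    # value function by mean-squared-error regression on the rewards-to-go.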
    @tf.function
    def update(obs, acs, advs, rets, logp_olds):
        stopIter = tf.constant(train_pi_iters)
        pi_loss = 0.
        for i in tf.range(train_pi_iters):
            with tf.GradientTape() as tape:
                logp = model.get_logp(obs, acs)
                ratio = tf.exp(logp - logp_olds)
                min_adv = tf.where(advs > 0, (1+clip_ratio)*advs, (1-clip_ratio)*advs)
                pi_loss = -tf.reduce_mean(tf.minimum(ratio * advs, min_adv))
            grads = tape.gradient(pi_loss, actor_weights)
            opt_pi.apply_gradients(zip(grads, actor_weights))
            kl = tf.reduce_mean(logp_olds - logp)
            if kl > 1.5 * target_kl:
                stopIter = i
                break

        v_loss = 0.
        for i in tf.range(train_v_iters):
            with tf.GradientTape() as tape:
                v = model.get_v(obs)
                v_loss = tf.reduce_mean((rets - v)**2)
            grads = tape.gradient(v_loss, critic_weights)
            opt_v.apply_gradients(zip(grads, critic_weights))
        return pi_loss, v_loss, stopIter
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    # Main loop: collect experience in env and update/log each epoch
    Ep_Ret = []
    for epoch in range(epochs):
        Ep_Ret = []
        for t in range(local_steps_per_epoch):
            expand_o = tf.constant(o.reshape(1, -1))
            a, logp_t, v_t = model.get_pi_logpi_vf(expand_o)
            a = a.numpy()[0]
            logp_t = logp_t.numpy()[0]
            v_t = v_t.numpy()[0][0]
            buf.store(o, a, r, v_t, logp_t)
            o, r, d, _ = env.step(a)
            ep_ret += r
            ep_len += 1
            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                if not terminal:
                    print('Warning: trajectory cut off by epoch at %d steps.' % ep_len)
                last_val = r if d else model.get_v(tf.constant(o.reshape(1, -1))).numpy()[0]
                buf.finish_path(last_val)
                if terminal:
                    Ep_Ret.append(ep_ret)
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
        obs, acs, advs, rets, logp_olds = buf.get()
        pi_loss, v_loss, stopIter = update(obs, acs, advs, rets, logp_olds)
        print('---------------------------------')
        print('epoch {}'.format(epoch))
        print('pi loss {}'.format(pi_loss.numpy()))
        print('vf loss {}'.format(v_loss.numpy()))
        print('stop iter {}'.format(stopIter.numpy()))
        print('Ep Ret {}'.format(np.mean(Ep_Ret)))
    return model, env
if __name__ == '__main__':
    model, env = ppo()
    # Roll out the trained policy with rendering until interrupted.
    obs = env.reset()
    reward = 0
    while True:
        action, _, _ = model.get_pi_logpi_vf(obs.reshape(1, -1))
        obs, r, d, _ = env.step(action.numpy()[0])
        reward += r
        env.render()
        if d:
            print('episode reward {}'.format(reward))
            reward = 0
            obs = env.reset()