DDPG: trains on 64*50 sampled transitions every 50 steps, with 1/f^2 exploration noise added
from __future__ import print_function

# Deep Deterministic Policy Gradient Method
# David Silver et al.
# implemented in plain Keras, by Qin Yongliang
# 2017 01 13
# heavily optimized for speed, lots of numpy flowed into tensorflow
# 2017 01 14

'''
summary

0. s for state, a for action, r for reward,
   q for 'action quality', i.e. the expected sum of discounted future rewards.

1. you have 2 networks, Mr. actor and Mr. critic
   - Mr. actor generates actions: a = actor(s)
   - Mr. critic scores (state,action) pairs: q = critic(s,a)
   > in the literature, Mr. actor is the function mu(s), Mr. critic is the function Q(s,a)

2. you improve Mr. critic by using the Bellman equation, or what they call TD-learning
   - Q(s1,a1) := r1 + gamma * Q(s2,a2) where a2 = actor(s2)
   - train Mr. critic to predict the calculated Q(s1,a1) given s1 and a1, using gradient descent and MSE loss.

3. after that, improve Mr. actor by gradient ascent w.r.t. Q(s,a)
   - a1_maybe = actor(s1), q1_maybe = critic(s1,a1_maybe)
   - therefore q1_maybe = critic(s1,actor(s1)). we want to increase q1_maybe!!
   - so figure out the gradient of q1_maybe w.r.t. the actor's parameters,
     using tf.gradients() or by compositing Keras Models (as I did, to keep things clean)
   - then do gradient ascent to increase the q-value of Mr. actor's actions

4. to stabilize the whole learning process:
   - randomly sample training examples from a replay memory
   - use 'target' networks that are copies of the actor and critic;
     their weights gradually shift towards the weights of the real actor and critic,
     to reduce self-correlation/oscillation (well, if you know control theory)
   - add noise to the actor's output at the beginning of learning, to turn deterministic actions into probabilistic ones
   - that's basically it

5. now go master the game of Gym

(a toy scalar version of the critic target from step 2 and the soft target
update from step 4 is sketched right below these docstrings)
'''

'''
personal tricks:
check the Residual Dense Unit, it works!
'''
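# Illustration only (not part of the agent below): the two update rules from
# the summary, written out with toy scalar values. The real code flows these
# through Keras/tensorflow instead of plain python.
def _ddpg_rules_demo():
    gamma, tau = 0.99, 0.001
    # critic target (Bellman / TD): q1 = r1 + (1 - isdone) * gamma * q2
    r1, isdone, q2 = 1.0, 0, 5.0
    q1_target = r1 + (1 - isdone) * gamma * q2   # -> 5.95
    # soft target-network update: theta_targ <- tau*theta + (1-tau)*theta_targ
    theta = [1.0, 1.0]        # toy "online" weights
    theta_targ = [0.0, 0.0]   # toy "target" weights
    theta_targ = [tau*w + (1-tau)*wt for w, wt in zip(theta, theta_targ)]
    return q1_target, theta_targ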
# gym boilerplate
import numpy as np
import gym
from gym import wrappers
from gym.spaces import Discrete, Box

# keras boilerplate: the simplest way to neural networking
from keras.models import *
from keras.layers import *
from keras.optimizers import *
import keras
from math import *
import random
import keras.backend as K
import time

# from winfrey import wavegraph

from rpm import rpm  # replay memory implementation
from noise import one_fsq_noise
def bn(i):
    # batch normalization is disabled here; pass the input through unchanged
    return i
    # return BatchNormalization(mode=1)(i)

def relu(i):
    return Activation('relu')(i)

# residual dense unit
def resdense(idim,odim):
    def unit(i):
        mdim = max(4,int(idim/4),int(odim/4))
        if idim==odim:
            ident = i
            i = bn(i)
            i = relu(i)
            i = Dense(mdim)(i)
            i = bn(i)
            i = relu(i)
            i = Dense(odim)(i)
        else:
            i = bn(i)
            i = relu(i)
            ident = i
            i = Dense(mdim)(i)
            i = bn(i)
            i = relu(i)
            i = Dense(odim)(i)
            ident = Dense(odim)(ident)
        out = merge([ident,i],mode='sum')
        return out
    return unit

def softmax(x):
    """Compute softmax values for each set of scores in x."""
    ex = np.exp(x)
    return ex / np.sum(ex, axis=0)
class nnagent(object):
    def __init__(self,
                 observation_space,
                 action_space,
                 stack_factor=1,
                 discount_factor=.99,  # gamma
                 optimizer=RMSprop(),
                 train_skip_every=1,
                 ):
        self.rpm = rpm(1000000)  # 1M history
        self.render = True
        self.noise_source = one_fsq_noise()
        self.train_counter = 0
        self.train_skip_every = train_skip_every
        self.observation_stack_factor = stack_factor

        self.inputdims = observation_space.shape[0] * self.observation_stack_factor
        # assume observation_space is continuous

        self.is_continuous = True if isinstance(action_space,Box) else False

        if self.is_continuous:  # if action space is continuous
            low = action_space.low
            high = action_space.high
            num_of_actions = action_space.shape[0]

            self.action_bias = high/2. + low/2.
            self.action_multiplier = high - self.action_bias
            # say high,low -> [2,7], then bias -> 4.5
            # mult = 2.5. then [-1,1] multiplies 2.5 + bias 4.5 -> [2,7]

            def clamper(actions):
                return np.clip(actions,a_max=action_space.high,a_min=action_space.low)
            self.clamper = clamper
        else:
            num_of_actions = action_space.n

            self.action_bias = .5
            self.action_multiplier = .5  # map (-1,1) into (0,1)

            def clamper(actions):
                return np.clip(actions,a_max=1.,a_min=0.)
            self.clamper = clamper

        self.outputdims = num_of_actions
        self.discount_factor = discount_factor
        self.optimizer = optimizer

        ids,ods = self.inputdims,self.outputdims
        self.actor, self.frozen_actor = self.create_actor_network(ids,ods)
        self.critic, self.frozen_critic = self.create_critic_network(ids,ods)

        print('inputdims:{}, outputdims:{}'.format(ids,ods))
        print('actor network:')
        self.actor.summary()
        print('critic network:')
        self.critic.summary()

        # target networks: identical copies of actor and critic
        self.actor_target, self.frozen_actor_target = self.create_actor_network(ids,ods)
        self.critic_target, self.frozen_critic_target = self.create_critic_network(ids,ods)

        self.replace_weights(tau=1.)

        self.create_q1_target_model()
        self.create_actor_trainer()
        self.create_critic_trainer()
    def create_actor_trainer(self):
        # now the dirty part: the actor trainer --------------------------------
        # an explanation of this part is written in the train() method
        s_given = Input(shape=(self.inputdims,))
        a1_maybe = self.actor(s_given)
        q1_maybe = self.frozen_critic([s_given,a1_maybe])
        # frozen-weight version of the critic, so we train only the actor

        actor_trainer = Model(input=s_given,output=q1_maybe)

        # use the negative of q1_maybe as the loss (so we maximize q by minimizing the loss)
        def neg_q1(y_true,y_pred):
            return - y_pred  # neat!

        actor_trainer.compile(optimizer=self.optimizer,loss=neg_q1)
        self.actor_trainer = actor_trainer
        # dirty part ended -----------------------------------------------------

    # (gradually) replace target network weights with online network weights
    def _replace_weights(self,tau=0.001):
        theta_a,theta_c = self.actor.get_weights(),self.critic.get_weights()
        theta_a_targ,theta_c_targ = self.actor_target.get_weights(),self.critic_target.get_weights()

        # mixing factor tau: we gradually shift the weights...
        theta_a_targ = [theta_a[i]*tau + theta_a_targ[i]*(1-tau) for i in range(len(theta_a))]
        theta_c_targ = [theta_c[i]*tau + theta_c_targ[i]*(1-tau) for i in range(len(theta_c))]

        self.actor_target.set_weights(theta_a_targ)
        self.critic_target.set_weights(theta_c_targ)

    # the method above uses numpy; how can we flow it in tensorflow?
    def replace_weights(self,tau=0.001):
        if not hasattr(self,'wflow'):
            self.wflow = self.weights_flow()
        flow = self.wflow
        tau = np.array([tau],dtype='float32')
        flow([tau,0])

    def weights_flow(self):
        # define the weight-replacing op
        theta_a,theta_c = self.actor.weights,self.critic.weights
        theta_a_targ,theta_c_targ = self.actor_target.weights,self.critic_target.weights

        tau_place = K.placeholder(shape=(1,))
        ops = []
        for i,w in enumerate(theta_a_targ):
            ops += [theta_a_targ[i].assign(theta_a[i]*tau_place + theta_a_targ[i]*(1-tau_place))]
        for i,w in enumerate(theta_c_targ):
            ops += [theta_c_targ[i].assign(theta_c[i]*tau_place + theta_c_targ[i]*(1-tau_place))]

        flow = K.function([tau_place],ops)
        return flow
    # a = actor(s): predict actions given state
    def create_actor_network(self,inputdims,outputdims):
        inp = Input(shape=(inputdims,))
        i = inp
        i = Dense(128)(i)
        i = resdense(128,128)(i)
        # i = resdense(128,128)(i)
        i = relu(bn(i))
        i = Dense(outputdims)(i)
        if self.is_continuous:
            # map into (-1,1)
            i = Activation('tanh')(i)
            # map into action_space
            i = Lambda(lambda x: x * self.action_multiplier + self.action_bias)(i)
        else:
            # map into (0,1)
            i = Activation('softmax')(i)
        out = i
        model = Model(input=inp,output=out)

        # now we create a frozen_model
        # that uses the same layers, with weights frozen when trained.
        frozen_model = Model(input=inp,output=out)
        frozen_model.trainable = False
        return model,frozen_model

    # q = critic(s,a): predict q given state and action
    def create_critic_network(self,inputdims,actiondims):
        inp = Input(shape=(inputdims,))
        act = Input(shape=(actiondims,))
        i = merge([inp,act],mode='concat')
        i = Dense(128)(i)
        i = resdense(128,128)(i)
        # i = resdense(128,128)(i)
        i = relu(bn(i))
        i = Dense(1)(i)
        out = i
        model = Model(input=[inp,act],output=out)

        # now we create a frozen_model
        # that uses the same layers, with weights frozen when trained.
        frozen_model = Model(input=[inp,act],output=out)
        frozen_model.trainable = False
        return model,frozen_model
    def create_q1_target_model(self):
        # this part is a performance optimization;
        # for an explanation, please check train()
        s2i = Input(shape=(self.inputdims,))
        a2i = self.frozen_actor_target(s2i)
        q2i = self.frozen_critic_target([s2i,a2i])
        r1i = Input(shape=(1,))
        isdonei = Input(shape=(1,))

        def calc_q1_target(x):
            [r1i,isdonei,q2i] = x
            return r1i + (1-isdonei) * self.discount_factor * q2i
        def calc_output_shape(input_shapes):
            return input_shapes[0]

        q1_target = merge([r1i,isdonei,q2i],mode=calc_q1_target,output_shape=calc_output_shape)
        q1_target_model = Model(input=[s2i,r1i,isdonei],output=q1_target)
        self.q1_target_model = q1_target_model

    def create_critic_trainer(self):
        # this part is also a performance optimization...
        qtm = self.q1_target_model
        qtm.trainable = False

        s1i = Input(shape=(self.inputdims,))
        s2i = Input(shape=(self.inputdims,))
        a1i = Input(shape=(self.outputdims,))
        r1i = Input(shape=(1,))
        isdonei = Input(shape=(1,))

        q1t = qtm([s2i,r1i,isdonei])
        crit = self.critic([s1i,a1i])

        def mse(x):
            return (x[0]-x[1])**2
        def calc_output_shape(input_shapes):
            return input_shapes[0]  # shape of r1i

        loss = merge([q1t,crit],mode=mse,output_shape=calc_output_shape)

        def thru(y_true,y_pred):
            return y_pred

        model = Model(input=[s1i,a1i,r1i,isdonei,s2i],output=loss)
        model.compile(loss=thru,optimizer=self.optimizer)
        self.critic_trainer = model
    def train(self,verbose=1):
        memory = self.rpm
        critic,frozen_critic = self.critic,self.frozen_critic
        actor = self.actor
        batch_size = 64
        total_size = batch_size * self.train_skip_every
        epochs = 1

        self.train_counter += 1
        self.train_counter %= self.train_skip_every
        if self.train_counter != 0:  # only train every few steps
            return

        if memory.size() > total_size:
            # if there are enough samples in memory,
            # randomly sample a minibatch from memory
            [s1,a1,r1,isdone,s2] = memory.sample_batch(total_size)
            # print(s1.shape,a1.shape,r1.shape,isdone.shape,s2.shape)

            if False:  # the following is optimized away but kept for clarity.
                # a2_targ = actor_targ(s2): what will you do in s2, Mr. old actor?
                a2 = self.actor_target.predict(s2)

                # q2_targ = critic_targ(s2,a2): how good is action a2, Mr. old critic?
                q2 = self.critic_target.predict([s2,a2])

                # what if we combine the 2 above to improve performance?
                s2i = Input(shape=(self.inputdims,))
                a2i = self.actor_target(s2i)
                q2i = self.critic_target([s2i,a2i])

                # if a2 is q2-good, then what should q1 be?
                # use the Bellman Equation! (recursive definition of q-values)
                # if not the last step of the episode:
                #   q1 = r1 + gamma * q2
                # else:
                #   q1 = r1
                q1_target = r1 + (1-isdone) * self.discount_factor * q2

                # but what if we combine all of the above to improve performance?
                r1i = Input(shape=(1,))
                isdonei = Input(shape=(1,))
                def calc_q1_target(x):
                    [r1i,isdonei,q2i] = x
                    return r1i + (1-isdonei) * self.discount_factor * q2i
                def calc_output_shape(input_shapes):
                    return input_shapes[0]
                q1_target = merge([r1i,isdonei,q2i],mode=calc_q1_target,output_shape=calc_output_shape)
                q1_target_model = Model(input=[s2i,r1i,isdonei],output=q1_target)
            else:
                # the q1_target_model is already built in create_q1_target_model()
                # q1_target = self.q1_target_model.predict([s2,r1,isdone])
                # all of the above was optimized away...
                critic_trainer = self.critic_trainer

            # critic.fit([s1,a1],
            #     q1_target,
            #     batch_size=batch_size,
            #     nb_epoch=epochs,
            #     verbose=verbose,
            #     shuffle=False
            #     )
            critic_trainer.fit([s1,a1,r1,isdone,s2],
                np.zeros((total_size,1)),  # dummy target label
                batch_size=batch_size,
                nb_epoch=epochs,
                verbose=verbose,
                shuffle=False
                )

            # now the critic can predict a more accurate q given s and a,
            # thanks to the Bellman equation, and David Silver.
            # with a better critic, we can now improve our actor!

            if False:  # the following part is optimized away; left here for explanation purposes
                # a1_pred = actor(s1): what will you do in s1, Mr. actor?
                a1_maybe = actor.predict(s1)

                # this action may not be optimal. now let's ask the critic.
                # what do you think of Mr. actor's action on s1, Mr. better critic?
                q1_maybe = critic.predict([s1,a1_maybe])

                # what should we do to the actor, to increase q1_maybe?
                # well, calculate the gradient of q1_maybe w.r.t. the actor's
                # parameters, then do gradient ascent.

                # so let's build a model that trains the actor to output actions with higher q1_maybe
                s_given = Input(shape=(self.inputdims,))
                a1_maybe = actor(s_given)
                q1_maybe = frozen_critic([s_given,a1_maybe])
                # frozen-weight version of the critic, so we only train the actor

                actor_trainer = Model(input=s_given,output=q1_maybe)

                # use the negative of q1_maybe as the loss (so we maximize q by minimizing the loss)
                def neg_q1(y_true,y_pred):
                    return - y_pred  # neat!
                actor_trainer.compile(optimizer=self.optimizer,loss=neg_q1)
            else:  # the actor_trainer was already built in create_actor_trainer()
                actor_trainer = self.actor_trainer

            actor_trainer.fit(s1,
                np.zeros((total_size,1)),  # dummy target label
                batch_size=batch_size,
                nb_epoch=epochs,
                verbose=verbose,
                shuffle=False
                )

            # now both the actor and the critic have improved.
            self.replace_weights(tau=0.001 * self.train_skip_every)
        else:
            pass
            # print('# not enough samples, not training')
    def feed_one(self,tup):
        self.rpm.add(tup)

    # gymnastics
    def play(self,env,max_steps=-1,realtime=False,noise_level=0.):  # play 1 episode
        timer = time.time()
        max_steps = max_steps if max_steps > 0 else 50000
        steps = 0
        total_reward = 0
        render = self.render

        # stack a little history to ensure the Markov property
        # an LSTM will definitely be used here in the future...
        # global que  # python 2 quirk
        self.que = np.zeros((self.inputdims,),dtype='float32')  # stack of recent observations

        def quein(observation):
            # global que  # python 2 quirk
            length = len(observation)
            self.que[0:-length] = self.que[length:]  # left shift
            self.que[-length:] = np.array(observation)

        def quecopy():
            return self.que.copy()

        # what the agent sees as state is a stack of history observations.
        observation = env.reset()
        quein(observation)  # quein o1

        while steps <= max_steps:
            steps += 1

            thisque = quecopy()  # s1
            action = self.act(thisque)  # a1

            if self.is_continuous:
                # add noise to our actions, since our policy is deterministic by nature
                exploration_noise = self.noise_source.one((self.outputdims,),noise_level)
                exploration_noise *= self.action_multiplier
                # print(exploration_noise,exploration_noise.shape)
                action += exploration_noise
                action = self.clamper(action)
                action_out = action
            else:
                # discretize our actions
                probabilities = action
                csprob = np.cumsum(probabilities)
                action_index = (csprob > np.random.rand()).argmax()
                action_out = action_index

            # o2, r1
            observation, reward, done, _info = env.step(action_out)

            # d1
            isdone = 1 if done else 0
            total_reward += reward

            quein(observation)  # quein o2
            nextque = quecopy()  # s2

            # feed into replay memory
            self.feed_one((thisque,action,reward,isdone,nextque))  # s1,a1,r1,isdone,s2

            if render and (steps%10==0 or realtime==True): env.render()
            if done:
                break

            verbose = 2 if steps==1 else 0
            self.train(verbose=verbose)

        # print('episode done in',steps,'steps',time.time()-timer,'second total reward',total_reward)
        totaltime = time.time()-timer
        print('episode done in {} steps in {:.2f} sec, {:.4f} sec/step, got reward :{:.2f}'.format(
            steps,totaltime,totaltime/steps,total_reward
        ))
        return
    # one step of action, given observation
    def act(self,observation):
        actor,critic = self.actor,self.critic
        obs = np.reshape(observation,(1,len(observation)))
        actions = actor.predict(obs)
        q = critic.predict([obs,actions])[0]

        # the scaling/offsets below are only for display via loggraph()
        disp_actions = (actions[0]-self.action_bias) / self.action_multiplier
        disp_actions = disp_actions * 5 + np.arange(self.outputdims) * 12.0 + 30

        noise = self.noise_source.ask() * 5 - np.arange(self.outputdims) * 12.0 - 30

        self.loggraph(np.hstack([disp_actions,noise,q]))
        return actions[0]

    def loggraph(self,waves):
        pass
        # if not hasattr(self,'wavegraph'):
        #     def rn():
        #         r = np.random.uniform()
        #         return 0.2+r*0.4
        #     colors = []
        #     for i in range(len(waves)-1):
        #         color = [rn(),rn(),rn()]
        #         colors.append(color)
        #     colors.append([0.2,0.5,0.9])
        #     self.wavegraph = wavegraph(len(waves),'actions/noises/Q',np.array(colors))
        # wg = self.wavegraph
        # wg.one(waves.reshape((-1,)))
class playground(object):
    def __init__(self,envname):
        self.envname = envname
        env = gym.make(envname)
        self.env = env
        self.monpath = './experiment-'+self.envname

    def wrap(self):
        from gym import wrappers
        self.env = wrappers.Monitor(self.env,self.monpath,force=True)

    def up(self):
        self.env.close()
        gym.upload(self.monpath, api_key='sk_ge0PoVXsS6C5ojZ9amTkSA')

# p = playground('LunarLanderContinuous-v2')
p = playground('Pendulum-v0')
# p = playground('MountainCar-v0')
# p = playground('BipedalWalker-v2')

e = p.env
agent = nnagent(
    e.observation_space,
    e.action_space,
    discount_factor=.99,
    stack_factor=1,
    optimizer=RMSprop(lr=1e-4),
    train_skip_every=50,
)

def r(ep):
    agent.render = False
    e = p.env
    noise_level = 1.
    for i in range(ep):
        noise_level *= .95
        noise_level = max(1e-8,noise_level - 1e-4)
        print('ep',i,'/',ep,'noise_level',noise_level)
        agent.play(e,max_steps=-1,noise_level=noise_level)

def test():
    e = p.env
    agent.render = True
    agent.play(e,realtime=True,max_steps=-1,noise_level=1e-11)
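
# Usage sketch (assumption: r() and test() above are meant to be driven from
# an interactive session; this guard and the episode count are illustrative
# only, not part of the original flow).
if __name__ == '__main__':
    r(100)    # train for 100 episodes with decaying exploration noise
    test()    # then render one episode with (almost) no exploration noise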
# noise.py (imported above as `from noise import one_fsq_noise`)
import numpy as np

class one_fsq_noise(object):
    def __init__(self):
        self.buffer = np.array([0.])

    def one(self,size,noise_level=1.):
        # draw one gaussian sample
        g = np.random.normal(loc=0.,scale=noise_level,size=size)
        if self.buffer.shape != size:
            self.buffer = np.zeros(size,dtype='float32')
        # integrate the gaussian noise into a running buffer
        self.buffer += g
        # high pass a little, so the buffer doesn't drift away
        self.buffer *= .98
        return self.buffer.copy()

    def ask(self):
        return self.buffer.copy()

# 1/f^2 noise: http://hal.in2p3.fr/in2p3-00024797/document
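
# Why this is roughly 1/f^2 noise: the buffer integrates white (flat-spectrum)
# gaussian noise, and integration scales the power spectrum by 1/f^2; the 0.98
# leak just keeps the resulting random walk from wandering off forever.
# Quick usage sketch (illustration only):
if __name__ == '__main__':
    n = one_fsq_noise()
    trace = np.array([n.one((2,), noise_level=1.) for _ in range(5)])
    print(trace)      # 5 consecutive, strongly correlated 2-d samples
    print(n.ask())    # peek at the current buffer without advancing it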
# rpm.py (imported above as `from rpm import rpm`)
from collections import deque
import numpy as np
import random

# replay buffer per http://pemami4911.github.io/blog/2016/08/21/ddpg-rl.html
class rpm(object):
    # replay memory
    def __init__(self,buffer_size):
        self.buffer_size = buffer_size
        self.count = 0
        self.buffer = deque()

    def add(self, tup):
        experience = tup
        if self.count < self.buffer_size:
            self.buffer.append(experience)
            self.count += 1
        else:
            self.buffer.popleft()
            self.buffer.append(experience)

    def size(self):
        return self.count

    def sample_batch(self, batch_size):
        '''
        batch_size specifies the number of experiences to sample for the
        batch. If the replay buffer has fewer than batch_size elements,
        simply return all of the elements within the buffer. Generally,
        you'll want to wait until the buffer has at least batch_size
        elements before beginning to sample from it.
        '''
        batch = []

        if self.count < batch_size:
            batch = random.sample(self.buffer, self.count)
        else:
            batch = random.sample(self.buffer, batch_size)

        item_count = len(batch[0])
        res = []
        for i in range(item_count):
            k = np.array([item[i] for item in batch])
            if len(k.shape) == 1:
                k = k.reshape(k.shape+(1,))
            res.append(k)
        return res
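
# Usage sketch (illustration only): store (s1, a1, r1, isdone, s2) tuples and
# sample them back as one stacked numpy array per field, which is the layout
# nnagent.train() expects from sample_batch().
if __name__ == '__main__':
    memory = rpm(buffer_size=100)
    for _ in range(10):
        s1, s2 = np.random.randn(3), np.random.randn(3)
        a1 = np.random.randn(1)
        memory.add((s1, a1, 0.0, 0, s2))
    s1b, a1b, r1b, db, s2b = memory.sample_batch(4)
    print(s1b.shape, a1b.shape, r1b.shape, db.shape, s2b.shape)
    # -> (4, 3) (4, 1) (4, 1) (4, 1) (4, 3)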