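# Deep Q-Network (DQN) agent for Atari games, played through the Arcade Learning
# Environment (ALE) with a Keras convolutional Q-network, experience replay,
# a periodically synced target network and an epsilon-greedy policy.
# Originally shared as a gist by @vishaljain3991 (August 2017).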
import numpy as np
import random
import argparse
# from PIL import Image
import skimage as skimage
from skimage import transform, color, exposure
# from collections import deque
# from copy import deepcopy
#
# import matplotlib
# matplotlib.use('TkAgg')
# import matplotlib.pyplot as plt
# from skimage.viewer import ImageViewer
from keras.models import Sequential, model_from_config
from keras.layers import Dense, Activation, Flatten, Conv2D, Permute, BatchNormalization
from keras.optimizers import Adam
import sys
# import gym
from ale_python_interface import ALEInterface
parser = argparse.ArgumentParser(description='Description of your program')
parser.add_argument('-m','--mode', help='train / run', required=True)
parser.add_argument('-w','--weights', help='model.h5', required=False)
parser.add_argument('-e','--env', help='environment name', required=True)
parser.add_argument('-d','--monitor', help='monitor directory', required=True)
args = vars(parser.parse_args())
# env = gym.make(args['env'])
ale = ALEInterface()
ale.setInt(b'random_seed', 123)
# The --env argument is used as the path to the Atari ROM file.
rom_file = str.encode(args['env'])
ale.loadROM(rom_file)
num_actions = len(ale.getLegalActionSet())
# print ()
# from gym import wrappers
# env = wrappers.Monitor(env, args['monitor'] + args['env'] + '/', force=True)
#
def clone_model(model):
    # Requires Keras 1.0.7 since get_config has breaking changes.
    config = {
        'class_name': model.__class__.__name__,
        'config': model.get_config(),
    }
    clone = model_from_config(config)
    clone.set_weights(model.get_weights())
    return clone
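# Replay memory (a fixed-size circular buffer) and training hyperparameters:
# epsilon is annealed linearly from eps_max to eps_min over anneal_steps,
# gamma is the discount factor, and the target network is refreshed every
# target_model_update environment steps.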
maxlen = 10**6
counter = -1
replay_memory = [None for _ in range(maxlen)]
# warmup_steps = 10000
warmup_steps = 20000
target_model_update = 10000
num_steps = 0
eps_max = 0.3
eps_min = 0.1
anneal_steps = 10**6
learning_rate = 0.00025
gamma = 0.99
# max_steps = 1750000
max_episodes = 12000
model_save_steps = 10000
num_frames_per_action = 4
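# Online Q-network: the 84x84x4 frame stack passes through three convolutional
# layers (each followed by batch normalization and ReLU) and a 512-unit dense
# layer; the output layer produces one linear Q-value per legal action.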
model = Sequential()
model.add(Conv2D(32, (8, 8), strides=4, input_shape=(84, 84, 4)))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Conv2D(64, (4, 4), strides=2))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Conv2D(64, (3, 3)))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Flatten())
model.add(Dense(512))
model.add(Activation('relu'))
model.add(Dense(num_actions))
model.add(Activation('linear'))
model.compile(optimizer=Adam(lr=learning_rate),
              loss='mean_squared_error')
# print (model.__class__.__name__)
# print model.metrics_names
print(model.summary())
if args['weights'] is not None:
    print("Now we load weights")
    model.load_weights(args['weights'])
    adam = Adam(lr=learning_rate)
    model.compile(loss='mse', optimizer=adam)
# sys.exit()
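# The target network starts as an exact copy of the online network and is only
# refreshed every target_model_update steps, which stabilizes the bootstrapped targets.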
target_model = clone_model(model)
# replay_memory = deque()
def epsilon_greedy(epsilon, state):
    """Epsilon-greedy policy: greedy action with probability 1 - epsilon, random otherwise."""
    if np.random.uniform(0, 1) < 1 - epsilon:
        # Choose the action with the maximum predicted Q-value.
        if state.ndim == 3:
            state = np.expand_dims(state, axis=0)
        epsilon_action = np.argmax(model.predict_on_batch(state))
        return epsilon_action
    else:
        a = np.random.randint(num_actions)
        return a

def get_epsilon(eps):
    # Linearly anneal epsilon from eps_max towards eps_min over anneal_steps.
    eps = eps - (eps_max - eps_min) / anneal_steps
    return max(eps, eps_min)
def image_process(observation):
    # Convert the raw ALE grayscale frame to an 84x84x1 array scaled to [0, 255].
    # observation = skimage.color.rgb2gray(observation)
    observation = observation.squeeze()
    observation = skimage.transform.resize(observation, (84, 84))
    observation = skimage.exposure.rescale_intensity(observation, out_range=(0, 255))
    observation = observation.reshape(observation.shape[0], observation.shape[1], 1)
    return observation
def batch_unpack(minibatch):
    # Split a sampled minibatch of transitions into separate numpy arrays.
    observation_batch, action_batch, reward_batch, terminal_batch, next_observation_batch \
        = [], [], [], [], []
    for experience in minibatch:
        observation, action, reward, next_observation, terminal = experience
        observation_batch.append(observation)
        action_batch.append(action)
        next_observation_batch.append(next_observation)
        reward_batch.append(reward)
        terminal_batch.append(terminal)
    observation_batch = np.array(observation_batch).squeeze()
    action_batch = np.array(action_batch)
    reward_batch = np.array(reward_batch)
    terminal_batch = np.array(terminal_batch, dtype=int)
    next_observation_batch = np.array(next_observation_batch).squeeze()
    return observation_batch, action_batch, reward_batch, terminal_batch, next_observation_batch
epsilon = eps_max
episode_num = 0
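# Training mode: play episodes with an epsilon-greedy policy, store transitions
# in the replay memory, and after the warmup period train the online network on
# random minibatches of 32 transitions.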
if args['mode'] == 'train':
    for i in range(max_episodes):
        episode_num += 1
        ale.reset_game()
        # observation = ale.getScreenRGB()
        observation = ale.getScreenGrayscale()
        observation = image_process(observation)
        # Initial state: the first frame repeated four times, with a leading batch dimension.
        observation_stack = np.stack((observation, observation, observation, observation), axis=2)
        observation_stack = observation_stack.reshape(1, 84, 84, 4)
        episode_steps = 0
        cumulative_reward = 0
        while True:
            # Pick a new action every num_frames_per_action frames and repeat it in between.
            if episode_steps % num_frames_per_action == 0:
                action = epsilon_greedy(epsilon, observation_stack)
                epsilon = get_epsilon(epsilon)
            reward = ale.act(action)
            next_observation = ale.getScreenGrayscale()
            next_observation = image_process(next_observation)
            next_observation = next_observation.reshape(1, next_observation.shape[0], next_observation.shape[1], 1)
            done = ale.game_over()
            # Push the newest frame onto the stack and drop the oldest one.
            next_observation_stack = np.append(next_observation, observation_stack[:, :, :, :3], axis=3)
            replay_memory[num_steps % maxlen] = (observation_stack, action, reward, next_observation_stack, done)
            observation_stack = next_observation_stack
            num_steps += 1
            episode_steps += 1
            cumulative_reward += reward
            batch_loss = 0
            if num_steps > warmup_steps:
                minibatch = random.sample(replay_memory[:min(num_steps, maxlen)], 32)
                observation_batch, action_batch, reward_batch, terminal_batch, next_observation_batch \
                    = batch_unpack(minibatch)
                # Double DQN target: the online network selects the next action,
                # the target network evaluates it; terminal transitions get no bootstrap term.
                next_action_values_batch = model.predict_on_batch(next_observation_batch)
                next_actions = next_action_values_batch.argmax(axis=1)
                next_q_batch = target_model.predict_on_batch(next_observation_batch)
                max_next_q = next_q_batch[np.arange(len(next_actions)), next_actions]
                targets = model.predict_on_batch(observation_batch)
                targets[np.arange(len(action_batch)), action_batch] = \
                    reward_batch + gamma * max_next_q * (1 - terminal_batch)
                batch_loss = model.train_on_batch(observation_batch, targets)
            if num_steps > 1 and num_steps % target_model_update == 0:
                target_model = clone_model(model)
            if num_steps > 1 and num_steps % model_save_steps == 0:
                print("Saving model.....")
                model.save_weights("model" + str(num_steps) + ".h5", overwrite=True)
            # print("num_steps:", num_steps, " episode_num:", episode_num, " episode_steps:",
            #       episode_steps, " batch_loss:", batch_loss, " reward:", reward, " action:", action)
            if done:
                print("************Episode finished****************")
                print("num_steps:", num_steps, " episode_num:", episode_num,
                      " episode_steps:", episode_steps,
                      " cumulative_reward:", cumulative_reward)
                break
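# Evaluation mode: run the greedy policy (epsilon = 0) for 1000 episodes and
# report statistics of the per-episode cumulative rewards.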
else:
    epsilon = 0
    cumulative_rewards = []
    for i in range(1000):
        cumulative_reward = 0
        episode_num += 1
        ale.reset_game()
        observation = ale.getScreenGrayscale()
        observation = image_process(observation)
        observation_stack = np.stack((observation, observation, observation, observation), axis=2)
        observation_stack = observation_stack.reshape(1, 84, 84, 4)
        episode_steps = 0
        while True:
            # To inspect a frame, squeeze the observation and display it,
            # e.g. with matplotlib or skimage.viewer.
            action = epsilon_greedy(epsilon, observation_stack)
            reward = ale.act(action)
            next_observation = ale.getScreenGrayscale()
            next_observation = image_process(next_observation)
            next_observation = next_observation.reshape(1, next_observation.shape[0], next_observation.shape[1], 1)
            done = ale.game_over()
            next_observation_stack = np.append(next_observation, observation_stack[:, :, :, :3], axis=3)
            observation_stack = next_observation_stack
            num_steps += 1
            episode_steps += 1
            cumulative_reward += reward
            if done:
                print("************Episode finished****************", cumulative_reward)
                break
        cumulative_rewards.append(cumulative_reward)
    cumulative_rewards = np.array(cumulative_rewards)
    print("*************Finished. Here are the stats ******************")
    print("Mean:", np.mean(cumulative_rewards))
    print("Median:", np.median(cumulative_rewards))
    print("Standard Deviation:", np.std(cumulative_rewards))
    hist = np.histogram(cumulative_rewards, bins=50)
    print(hist)
    # plt.hist(cumulative_rewards, bins='auto')
    # plt.show()