import numpy as np
import random
import argparse
# from PIL import Image
import skimage as skimage
from skimage import transform, color, exposure
# from collections import deque
# from copy import deepcopy
#
# import matplotlib
# matplotlib.use('TkAgg')
# import matplotlib.pyplot as plt
# from skimage.viewer import ImageViewer
from keras.models import Sequential, model_from_config
from keras.layers import Dense, Activation, Flatten, Conv2D, Permute, BatchNormalization
from keras.optimizers import Adam
import sys
# import gym
from ale_python_interface import ALEInterface
parser = argparse.ArgumentParser(description='Description of your program')
parser.add_argument('-m', '--mode', help='train / run', required=True)
parser.add_argument('-w', '--weights', help='model.h5', required=False)
parser.add_argument('-e', '--env', help='environment name', required=True)
parser.add_argument('-d', '--monitor', help='monitor directory', required=True)
args = vars(parser.parse_args())
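# Example invocations (the script and ROM filenames below are illustrative;
# pass the path of any Atari ROM supported by ALE as --env):
#   python dqn_ale.py -m train -e breakout.bin -d monitor/
#   python dqn_ale.py -m run -e breakout.bin -d monitor/ -w model100000.h5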
# env = gym.make(args['env'])
ale = ALEInterface()
ale.setInt(b'random_seed', 123)
rom_file = str.encode(args['env'])
ale.loadROM(rom_file)
num_actions = len(ale.getLegalActionSet())
# print ()
# from gym import wrappers
# env = wrappers.Monitor(env, args['monitor'] + args['env'] + '/', force=True)
#
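# clone_model builds a separate copy of the online network to serve as the
# DQN target network; its weights are re-synced from the online model every
# target_model_update steps in the training loop below.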
def clone_model(model):
    # Requires Keras 1.0.7 since get_config has breaking changes.
    config = {
        'class_name': model.__class__.__name__,
        'config': model.get_config(),
    }
    clone = model_from_config(config)
    clone.set_weights(model.get_weights())
    return clone
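# Replay buffer and training hyperparameters. The buffer is a fixed-size list
# used as a ring buffer (indexed by num_steps % maxlen); learning only starts
# after warmup_steps transitions have been collected, and epsilon is annealed
# linearly from eps_max down to eps_min.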
maxlen = 10**6
counter = -1
replay_memory = [None for _ in range(maxlen)]
# warmup_steps = 10000
warmup_steps = 20000
target_model_update = 10000
num_steps = 0
eps_max = 0.3
eps_min = 0.1
anneal_steps = 10**6
learning_rate = 0.00025
gamma = 0.99
# max_steps = 1750000
max_episodes = 12000
model_save_steps = 10000
num_frames_per_action = 4
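# Q-network: the standard DQN convolutional architecture (three conv layers
# followed by a 512-unit dense layer and a linear output per action), with
# BatchNormalization added after each convolution. The input is a stack of
# the four most recent 84x84 grayscale frames.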
model = Sequential()
model.add(Conv2D(32, (8, 8), strides=4, input_shape=(84, 84, 4)))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Conv2D(64, (4, 4), strides=2))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Conv2D(64, (3, 3)))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Flatten())
model.add(Dense(512))
model.add(Activation('relu'))
model.add(Dense(num_actions))
model.add(Activation('linear'))
model.compile(optimizer=Adam(lr=learning_rate),
              loss='mean_squared_error')
# print (model.__class__.__name__)
# print model.metrics_names
print(model.summary())
if args['weights'] is not None:
    print("Now we load weights")
    model.load_weights(args['weights'])
    adam = Adam(lr=learning_rate)
    model.compile(loss='mse', optimizer=adam)
# sys.exit()
target_model = clone_model(model)
# replay_memory = deque()
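# Behaviour policy: with probability 1 - epsilon take the action with the
# highest predicted Q-value, otherwise act uniformly at random.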
def epsilon_greedy(epsilon, state):
    "Epsilon greedy policy"
    if np.random.uniform(0, 1) < 1 - epsilon:
        # choose maximum q action; `state` already carries a leading batch axis
        epsilon_action = np.argmax(model.predict_on_batch(state))
        # print epsilon_action
        return epsilon_action
    else:
        a = np.random.randint(num_actions)
        return a

def get_epsilon(eps):
    eps = eps - (eps_max - eps_min) / anneal_steps
    return max(eps, eps_min)
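# Preprocessing: squeeze the grayscale screen, resize to 84x84, rescale pixel
# intensities to [0, 255] and add a trailing channel axis so that frames can
# be stacked along the channel dimension.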
def image_process(observation):
    # observation = skimage.color.rgb2gray(observation)
    observation = observation.squeeze()
    observation = skimage.transform.resize(observation, (84, 84))
    observation = skimage.exposure.rescale_intensity(observation, out_range=(0, 255))
    observation = observation.reshape(observation.shape[0], observation.shape[1], 1)
    return observation
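# Unpack a minibatch of (s, a, r, s', done) tuples into separate numpy arrays
# with shape (batch, 84, 84, 4) for the observation stacks and (batch,) for
# actions, rewards and terminal flags.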
def batch_unpack(minibatch):
    observation_batch, action_batch, reward_batch, terminal_batch, next_observation_batch \
        = [], [], [], [], []
    for experience in minibatch:
        observation, action, reward, next_observation, terminal = experience
        observation_batch.append(observation)
        action_batch.append(action)
        next_observation_batch.append(next_observation)
        reward_batch.append(reward)
        terminal_batch.append(terminal)
    observation_batch = np.array(observation_batch).squeeze()
    action_batch = np.array(action_batch)
    reward_batch = np.array(reward_batch)
    terminal_batch = np.array(terminal_batch, dtype=int)
    next_observation_batch = np.array(next_observation_batch).squeeze()
    # print terminal_batch.shape
    return observation_batch, action_batch, reward_batch, terminal_batch, next_observation_batch
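# Training loop: each episode starts from a reset emulator, the initial frame
# is repeated four times to seed the frame stack, and every transition is
# written into the replay buffer before a gradient step is taken on a random
# minibatch (once the warm-up period is over).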
epsilon = eps_max
episode_num = 0
if args['mode'] == 'train':
    for i in range(max_episodes):
        episode_num += 1
        ale.reset_game()
        # observation = ale.getScreenRGB()
        observation = ale.getScreenGrayscale()
        observation = image_process(observation)
        observation_stack = np.stack((observation, observation, observation, observation), axis=2)
        observation_stack = observation_stack.reshape(1, observation_stack.shape[0], observation_stack.shape[1], observation_stack.shape[2])
        episode_steps = 0
        cumulative_reward = 0
        while True:
            if episode_steps % num_frames_per_action == 0:
                action = epsilon_greedy(epsilon, observation_stack)
                epsilon = get_epsilon(epsilon)
            reward = ale.act(action)
            next_observation = ale.getScreenGrayscale()
            next_observation = image_process(next_observation)
            next_observation = next_observation.reshape(1, next_observation.shape[0], next_observation.shape[1], 1)
            done = ale.game_over()
            next_observation_stack = np.append(next_observation, observation_stack[:, :, :, :3], axis=3)
            replay_memory[num_steps % maxlen] = (observation_stack, action, reward, next_observation_stack, done)
            observation_stack = next_observation_stack
            num_steps += 1
            episode_steps += 1
            cumulative_reward += reward
            batch_loss = 0
            if num_steps > warmup_steps:
                minibatch = random.sample(replay_memory[:min(num_steps, maxlen)], 32)
                observation_batch, action_batch, reward_batch, terminal_batch, next_observation_batch \
                    = batch_unpack(minibatch)
                # Double-DQN style target: the online network picks the next action,
                # the target network evaluates it.
                next_q_online = model.predict_on_batch(next_observation_batch)
                next_actions = next_q_online.argmax(axis=1)
                next_q_target = target_model.predict_on_batch(next_observation_batch)
                next_q = next_q_target[np.arange(len(next_actions)), next_actions]
                # Regress only the Q-value of the action actually taken; the bootstrap
                # term is dropped on terminal transitions.
                target_q = model.predict_on_batch(observation_batch)
                target_q[np.arange(len(action_batch)), action_batch] = \
                    reward_batch + gamma * next_q * (1 - terminal_batch)
                batch_loss = model.train_on_batch(observation_batch, target_q)
            if num_steps > 1 and num_steps % target_model_update == 0:
                target_model = clone_model(model)
            if num_steps > 1 and num_steps % model_save_steps == 0:
                print("Saving model.....")
                model.save_weights("model" + str(num_steps) + ".h5", overwrite=True)
            # print "num_steps:", num_steps, " episode_num:", episode_num, " episode_steps:", \
            #     episode_steps, " batch_loss:", batch_loss, \
            #     " reward:", reward, " action:", action
            if done:
                print("************Episode finished****************")
                print("num_steps:", num_steps, " episode_num:", episode_num,
                      " episode_steps:", episode_steps,
                      " cumulative_reward:", cumulative_reward)
                break
            # observation = next_observation
else:
    # Evaluation mode: run the greedy policy with the loaded weights.
    # print ("Now we load weights")
    # model.load_weights(args['weights'])
    # adam = Adam(lr=learning_rate)
    # model.compile(loss='mse', optimizer=adam)
    epsilon = 0
    # avg_reward = 0
    cumulative_rewards = []
    for i in range(1000):
        cumulative_reward = 0
        episode_num += 1
        ale.reset_game()
        observation = ale.getScreenGrayscale()
        observation = image_process(observation)
        observation_stack = np.stack((observation, observation, observation, observation), axis=2)
        observation_stack = observation_stack.reshape(1, observation_stack.shape[0], observation_stack.shape[1], observation_stack.shape[2])
        episode_steps = 0
        while True:
            # To watch the agent play, enable the SDL display with
            # ale.setBool(b'display_screen', True) before loadROM.
            action = epsilon_greedy(epsilon, observation_stack)
            reward = ale.act(action)
            next_observation = ale.getScreenGrayscale()
            next_observation = image_process(next_observation)
            next_observation = next_observation.reshape(1, next_observation.shape[0], next_observation.shape[1], 1)
            done = ale.game_over()
            next_observation_stack = np.append(next_observation, observation_stack[:, :, :, :3], axis=3)
            num_steps += 1
            episode_steps += 1
            cumulative_reward += reward
            if done:
                print("************Episode finished****************", cumulative_reward)
                break
            observation_stack = next_observation_stack
        # avg_reward += cumulative_reward/100
        # print ("************Episode Average finished****************", avg_reward)
        cumulative_rewards.append(cumulative_reward)
    cumulative_rewards = np.array(cumulative_rewards)
    print("*************Finished. Here are the stats ******************")
    print("Mean:", np.mean(cumulative_rewards))
    print("Median:", np.median(cumulative_rewards))
    print("Standard Deviation:", np.std(cumulative_rewards))
    hist = np.histogram(cumulative_rewards, bins=50)
    print(hist)
    # plt.hist(hist, bins='auto')
    # plt.show()
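# A minimal sketch for visualising the per-episode reward distribution,
# assuming matplotlib is installed (its import is commented out above):
#
#   import matplotlib.pyplot as plt
#   plt.hist(cumulative_rewards, bins=50)
#   plt.xlabel('cumulative reward per episode')
#   plt.ylabel('episodes')
#   plt.show()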