@tehZevo
Last active October 17, 2024 15:09
Fix bugs in GeeksforGeeks A2C example
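
For reference, the losses the script below optimizes are the standard one-step advantage actor-critic objectives (restated here from the code, in LaTeX notation; not part of the original gist):

    A_t = r_t + \gamma V(s_{t+1}) - V(s_t)
    L_{actor} = -\log \pi(a_t \mid s_t) \cdot A_t
    L_{critic} = A_t^2
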
import numpy as np
import tensorflow as tf
import gymnasium as gym
# Create the CartPole Environment
env = gym.make('CartPole-v1')
# Define the actor and critic networks
actor = tf.keras.Sequential([
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(env.action_space.n, activation='softmax')
])

critic = tf.keras.Sequential([
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1)
])
# Define optimizer and loss functions
actor_optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
critic_optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
# Main training loop
num_episodes = 1000
gamma = 0.99
n_episodes_rewards = []
for episode in range(num_episodes):
    state, reset_info = env.reset()
    episode_reward = 0

    for t in range(1, 10000):  # Limit the number of time steps
        with tf.GradientTape(persistent=True) as tape:
            # Choose an action using the actor
            action_probs = actor(np.array([state]))
            action = np.random.choice(env.action_space.n, p=action_probs.numpy()[0])

            # Take the chosen action and observe the next state and reward
            # (gymnasium's step() returns terminated and truncated separately)
            next_state, reward, terminated, truncated, _ = env.step(action)

            # Compute the advantage (one-step TD error)
            state_value = critic(np.array([state]))[0, 0]
            next_state_value = critic(np.array([next_state]))[0, 0]
            advantage = reward + gamma * next_state_value - state_value

            # Compute actor and critic losses
            actor_loss = -tf.math.log(action_probs[0, action]) * advantage
            critic_loss = tf.square(advantage)

        episode_reward += reward
        state = next_state

        # Update actor and critic
        actor_gradients = tape.gradient(actor_loss, actor.trainable_variables)
        critic_gradients = tape.gradient(critic_loss, critic.trainable_variables)
        actor_optimizer.apply_gradients(zip(actor_gradients, actor.trainable_variables))
        critic_optimizer.apply_gradients(zip(critic_gradients, critic.trainable_variables))

        if terminated or truncated:
            break

    n_episodes_rewards.append(episode_reward)

    if episode % 10 == 0:
        print(f"Episode {episode}, Average episode reward: {np.mean(n_episodes_rewards)}")
        n_episodes_rewards = []

env.close()
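
Not part of the original gist: a minimal sketch of how the trained actor could be watched afterwards, assuming the variables above are still in scope (actions are taken greedily via argmax rather than sampled):

eval_env = gym.make('CartPole-v1', render_mode='human')
for _ in range(3):
    state, _ = eval_env.reset()
    finished = False
    total_reward = 0
    while not finished:
        action_probs = actor(np.array([state]))
        action = int(np.argmax(action_probs.numpy()[0]))  # greedy action from the trained policy
        state, reward, terminated, truncated, _ = eval_env.step(action)
        total_reward += reward
        finished = terminated or truncated
    print(f"Evaluation episode reward: {total_reward}")
eval_env.close()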

tehZevo commented Oct 17, 2024

Program output:

Episode 0, Average reward: 21.0
Episode 10, Average reward: 18.4
Episode 20, Average reward: 24.1
Episode 30, Average reward: 24.5
Episode 40, Average reward: 43.5
Episode 50, Average reward: 46.0
Episode 60, Average reward: 51.7
Episode 70, Average reward: 66.8
Episode 80, Average reward: 71.4
Episode 90, Average reward: 76.4
Episode 100, Average reward: 71.6
Episode 110, Average reward: 84.2
Episode 120, Average reward: 100.7
Episode 130, Average reward: 83.1
Episode 140, Average reward: 67.4
Episode 150, Average reward: 71.5
Episode 160, Average reward: 80.7
Episode 170, Average reward: 75.7
Episode 180, Average reward: 60.7
Episode 190, Average reward: 104.3
Episode 200, Average reward: 102.5
...
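
To eyeball the trend, here is a small hypothetical plotting snippet (not from the gist; it assumes matplotlib is installed, and the values are just the averages printed above):

import matplotlib.pyplot as plt

# 10-episode average rewards transcribed from the log above
averages = [21.0, 18.4, 24.1, 24.5, 43.5, 46.0, 51.7, 66.8, 71.4, 76.4, 71.6,
            84.2, 100.7, 83.1, 67.4, 71.5, 80.7, 75.7, 60.7, 104.3, 102.5]
episodes = range(0, len(averages) * 10, 10)

plt.plot(episodes, averages)
plt.xlabel('Episode')
plt.ylabel('Average reward over the last 10 episodes')
plt.show()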
