import gymnasium as gym
import torch as th
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from torch import nn


class CustomCNN(BaseFeaturesExtractor):
    """Features extractor for dict observations with "board" and "numeric" keys.

    The "board" entry is pushed through a stack of
    ``Conv2d(kernel_size=3, stride=1, padding=1) -> BatchNorm2d -> ReLU``
    blocks and flattened. The result is concatenated with the "numeric"
    entry and projected to ``features_dim`` by a ``Linear -> ReLU`` head.

    :param observation_space: Dict space holding a "board" space whose first
        axis is used as the number of input channels, and a "numeric" space
        whose first axis is used as the number of numeric features.
    :param cnn_arch: Iterable of ints; the number of filters of each
        convolutional block, in order.
    :param features_dim: Number of features produced by the extractor.
    """

    def __init__(
        self,
        observation_space: gym.spaces.Dict,
        cnn_arch,
        features_dim: int = 256,
    ):
        super().__init__(observation_space, features_dim)
        board_shape = observation_space["board"].shape

        layers = []
        in_channels = board_shape[0]
        for out_channels in cnn_arch:
            layers.extend(
                [
                    nn.Conv2d(
                        in_channels, out_channels, kernel_size=3, stride=1, padding=1
                    ),
                    nn.BatchNorm2d(out_channels),
                    nn.ReLU(),
                ]
            )
            in_channels = out_channels
        layers.append(nn.Flatten())
        self.cnn = nn.Sequential(*layers)

        n_flatten = self._flattened_size(board_shape)
        n_numeric_features = observation_space["numeric"].shape[0]
        self.linear = nn.Sequential(
            nn.Linear(n_flatten + n_numeric_features, features_dim), nn.ReLU()
        )

    def _flattened_size(self, board_shape) -> int:
        """Return the width of the flattened CNN output for one board."""
        # Probe in eval mode: a training-mode pass would fold the dummy input
        # into the BatchNorm running statistics, and BatchNorm rejects
        # training batches that have a single value per channel.
        was_training = self.cnn.training
        self.cnn.eval()
        try:
            with th.no_grad():
                probe = th.zeros((1, *board_shape), dtype=th.float32)
                return self.cnn(probe).shape[1]
        finally:
            self.cnn.train(was_training)

    def forward(self, observations: dict) -> th.Tensor:
        """Extract features from a batch of observations.

        :param observations: Mapping with "board" and "numeric" tensors; both
            are expected to carry the batch on axis 0, since they are
            concatenated along axis 1 after the board has been flattened.
        :return: Tensor produced by the ``Linear -> ReLU`` head.
        """
        board_features = self.cnn(observations["board"])
        merged = th.cat([board_features, observations["numeric"]], dim=1)
        return self.linear(merged)
import os
from typing import Iterable

import numpy as np
from sb3_contrib import MaskablePPO

from catanatron.game import Game
from catanatron.models.actions import Action
from catanatron.models.player import Player
from catanatron_experimental.machine_learning.custom_cnn import CustomCNN
from catanatron_gym.board_tensor_features import (
    create_board_tensor,
    is_graph_feature,
)
from catanatron_gym.envs.catanatron_env import from_action_space, to_action_space
from catanatron_gym.features import create_sample, get_feature_ordering


class PPOPlayer(Player):
    """
    Proximal Policy Optimization (PPO) reinforcement learning agent.

    Wraps a saved ``MaskablePPO`` model: every decision builds a
    ``{"board", "numeric"}`` observation, masks the model's action space down
    to the currently playable actions and asks the model for a deterministic
    prediction.
    """

    def __init__(self, color, model_path=None):
        """
        Args:
            color: Color handed to the ``Player`` base class.
            model_path: Path of the saved model. When ``None``, defaults to
                ``model.zip`` in the parent directory of this file. A falsy
                non-``None`` value (e.g. ``""``) skips loading; ``load()``
                must then be called before ``decide()``.
        """
        super().__init__(color)
        self.model = None
        self.features = None
        self.numeric_features = None
        if model_path is None:
            script_dir = os.path.dirname(os.path.abspath(__file__))
            model_path = os.path.join(script_dir, "..", "model.zip")
        if model_path:
            self.load(model_path)

    def decide(self, game: Game, playable_actions: Iterable[Action]):
        """Pick one of ``playable_actions`` using the loaded model.

        Falls back to the first playable action when no action can be mapped
        into the model's action space or when the prediction cannot be mapped
        back to a playable action.

        Raises:
            ValueError: If no model has been loaded.
        """
        if self.model is None:
            raise ValueError("Model not loaded. Call load() first.")

        # The actions are walked several times below, so materialize them once;
        # a one-shot iterator would otherwise be exhausted after the mask.
        actions = list(playable_actions)

        obs = self.generate_observation(game)
        action_mask = self.generate_action_mask(actions)
        if not action_mask.any():
            # Nothing could be mapped into the action space; do not query the
            # model with an empty mask.
            return actions[0]

        prediction, _ = self.model.predict(
            obs, action_masks=action_mask, deterministic=True
        )

        # Best effort: a mapping failure must not abort the game.
        try:
            # predict() returns a NumPy value; use a plain int as the index.
            action_index = int(np.asarray(prediction).reshape(-1)[0])
            return self.action_index_to_action(action_index, actions)
        except Exception as e:
            print(f"Error mapping action index to Action: {e}")
            # Default to the first playable action
            return actions[0]

    def generate_observation(self, game: Game):
        """Build the ``{"board", "numeric"}`` observation for ``game``.

        The feature ordering is resolved lazily from the number of players of
        the first game seen and cached on the instance.
        """
        if self.numeric_features is None:
            num_players = len(game.state.players)
            self.features = get_feature_ordering(num_players)
            self.numeric_features = [
                f for f in self.features if not is_graph_feature(f)
            ]

        sample = create_sample(game, self.color)
        board_tensor = create_board_tensor(
            game, self.color, channels_first=True
        ).astype(np.float32)
        numeric = np.array(
            [float(sample[name]) for name in self.numeric_features],
            dtype=np.float32,
        )
        return {"board": board_tensor, "numeric": numeric}

    def generate_action_mask(self, playable_actions: Iterable[Action]):
        """Return a boolean mask over the model's action space.

        Actions that cannot be converted to an index, or whose index falls
        outside the action space, are reported (if they raised) and skipped.
        """
        n_actions = self.model.action_space.n
        action_mask = np.zeros(n_actions, dtype=bool)
        for action in playable_actions:
            try:
                action_index = self.action_to_action_index(action)
            except Exception as e:
                print(f"Error in action_to_action_index: {e}")
                continue
            if action_index is not None and 0 <= action_index < n_actions:
                action_mask[action_index] = True
        return action_mask

    def action_to_action_index(self, action: Action):
        """Convert ``action`` to its index via ``to_action_space``."""
        return to_action_space(action)

    def action_index_to_action(
        self, action_index: int, playable_actions: Iterable[Action]
    ):
        """Convert ``action_index`` back to one of ``playable_actions``.

        Raises:
            ValueError: If the resolved action is not in ``playable_actions``.
        """
        action = from_action_space(action_index, playable_actions)
        if action not in playable_actions:
            raise ValueError(f"Action {action} not in playable actions.")
        return action

    def load(self, path):
        """Load the ``MaskablePPO`` model stored at ``path``."""
        self.model = MaskablePPO.load(
            path,
            custom_objects={
                "features_extractor_class": CustomCNN,
            },
        )
import numpy as np


def partial_rewards(game, p0_color, vps_to_win):
    """
    Calculate the partial rewards for the game.

    While the game has no winner the reward is 0. Once it has one, the reward
    is +0.20 for a win or -0.20 for a loss, plus 0.80 times the gap between
    the player's victory points and the opponents' average, scaled by
    ``vps_to_win - 1``.

    Args:
        game: The game instance.
        p0_color: The color representing the player's position.
        vps_to_win: The victory points required to win the game.

    Returns:
        A float representing the partial reward.

    Raises:
        ValueError: If the game is over and ``vps_to_win`` is not greater
            than 1 (the victory point gap could not be scaled).
    """
    winning_color = game.winning_color()
    if winning_color is None:
        return 0.0

    if vps_to_win <= 1:
        raise ValueError(f"vps_to_win must be greater than 1, got {vps_to_win}")

    # Imported lazily so this module (and mask_fn) can be used without the
    # game engine until a finished game actually has to be scored.
    from catanatron.state_functions import get_actual_victory_points

    total = 0.20 if p0_color == winning_color else -0.20

    enemy_vps = [
        get_actual_victory_points(game.state, color)
        for color in game.state.colors
        if color != p0_color
    ]
    # No opponents: compare against 0 instead of dividing by zero.
    enemy_avg_vp = sum(enemy_vps) / len(enemy_vps) if enemy_vps else 0.0
    my_vps = get_actual_victory_points(game.state, p0_color)
    vp_diff = (my_vps - enemy_avg_vp) / (vps_to_win - 1)

    total += 0.80 * vp_diff
    print(f"my_vps = {my_vps} enemy_avg_vp = {enemy_avg_vp} partial_rewards = {total}")
    return total


def mask_fn(env) -> np.ndarray:
    """
    Generates a boolean mask of valid actions for the environment.

    Args:
        env: The environment instance. ``env.unwrapped.get_valid_actions()``
            must yield integer action indices, and ``env.action_space.n``
            gives the size of the mask.

    Returns:
        A numpy array of booleans indicating valid actions.
    """
    valid_actions = env.unwrapped.get_valid_actions()
    mask = np.zeros(env.action_space.n, dtype=bool)
    # Normalize to an integer index array: indexing with a raw tuple would be
    # read as a multi-dimensional index, and sets or generators are not valid
    # indices at all.
    indices = np.asarray(list(valid_actions), dtype=np.intp)
    mask[indices] = True
    return mask
"""Train a MaskablePPO agent on the Catanatron gym environment.

Run as a script from the directory that contains ``reward_functions.py``.
The third-party stack is pinned in ``research-requirements.txt``
(sb3_contrib==2.3.0, stable_baselines3==2.3.2, torch==2.4.1, wandb==0.18.1).
"""
import atexit
import math
import multiprocessing
import os
import random
import time
from functools import partial
from typing import Any

# Kept ahead of the wandb import below; presumably wandb reads it at
# import/init time - verify before moving.
os.environ["WANDB_DISABLE_SYMLINKS"] = "True"

LOAD = False


def learning_rate_schedule(initial_lr, final_lr):
    """Return a linear schedule from ``initial_lr`` down to ``final_lr``.

    The returned callable maps ``progress_remaining`` (1 at the start of
    training, 0 at the end) to
    ``final_lr + (initial_lr - final_lr) * progress_remaining``.
    """

    def lr_schedule(progress_remaining):
        return final_lr + (initial_lr - final_lr) * progress_remaining

    return lr_schedule


def main():
    """Build the vectorized environment, then train and save the agent.

    An existing ``model`` checkpoint next to this file is resumed with the
    hyper-parameters below; if it cannot be loaded a new model is created.
    The environment workers and the wandb run are always closed, even when
    training fails.
    """
    # The training stack is imported here so that importing this module
    # (e.g. to reuse learning_rate_schedule) does not require it.
    import gymnasium as gym
    import numpy as np
    import torch as th
    import wandb
    from sb3_contrib.common.maskable.policies import MaskableActorCriticPolicy
    from sb3_contrib.common.wrappers import ActionMasker
    from sb3_contrib.ppo_mask import MaskablePPO
    from stable_baselines3.common.callbacks import (
        CallbackList,
        CheckpointCallback,
    )
    from stable_baselines3.common.vec_env import SubprocVecEnv, VecMonitor
    from wandb.integration.sb3 import WandbCallback

    from catanatron import Color
    from catanatron.models.player import RandomPlayer  # noqa: F401
    from catanatron_experimental.machine_learning.custom_cnn import CustomCNN
    from catanatron_experimental.machine_learning.players.value import (
        ValueFunctionPlayer,
    )
    from reward_functions import mask_fn, partial_rewards

    # ===== Params:
    # With 100,000,000 timesteps, training took 4.97 days
    total_timesteps = 100_000_000
    cnn_arch = [64, 128, 256, 512]
    net_arch = [
        dict(
            vf=[4096, 4096, 2048, 2048, 1024, 1024, 512, 512, 256],
            pi=[4096, 4096, 2048, 2048, 1024, 1024, 512, 512, 256],
        )
    ]
    activation_fn = th.nn.LeakyReLU
    initial_lr = 1e-4
    final_lr = 1e-6
    ent_coef = 0.01
    vps_to_win = 10
    env_name = "catanatron_gym:catanatron-v1"
    map_type = "BASE"
    enemies = [ValueFunctionPlayer(Color.RED)]
    reward_function = partial(partial_rewards, vps_to_win=vps_to_win)
    reward_function.__name__ = partial_rewards.__name__
    representation = "mixed"
    gamma = 0.99
    # Single source of truth for the env config, the experiment name and the
    # wandb config. The environment has always been created normalized.
    normalized = True
    selfplay = False
    seed = 42

    n_envs = 8
    n_steps = 256
    batch_size = n_envs * n_steps
    n_epochs = 10

    if (n_envs * n_steps) % batch_size != 0:
        raise ValueError("batch_size must divide n_envs * n_steps")

    start_time = time.time()

    # Set random seeds for reproducibility
    random.seed(seed)
    np.random.seed(seed)
    th.manual_seed(seed)
    th.cuda.manual_seed_all(seed)
    th.backends.cudnn.deterministic = True
    th.backends.cudnn.benchmark = False

    lr_schedule = learning_rate_schedule(initial_lr, final_lr)

    # Build Experiment Name
    iters = round(math.log10(total_timesteps))
    arch_str = (
        activation_fn.__name__
        + "x".join(str(i) for i in net_arch[:-1])
        + "+"
        + "vf="
        + "x".join(str(i) for i in net_arch[-1]["vf"])
        + "+"
        + "pi="
        + "x".join(str(i) for i in net_arch[-1]["pi"])
    )
    if representation == "mixed":
        arch_str = "Cnn" + "x".join(str(i) for i in cnn_arch) + "+" + arch_str
    enemy_desc = "".join(e.__class__.__name__ for e in enemies)
    experiment_name = f"ppo-{selfplay}-{normalized}-{iters}-{batch_size}-{gamma}-{enemy_desc}-{reward_function.__name__}-{representation}-{arch_str}-{initial_lr}lr-{vps_to_win}vp-{map_type}map"
    print(experiment_name)

    # WandB config
    config = {
        "initial_learning_rate": initial_lr,
        "final_learning_rate": final_lr,
        "total_timesteps": total_timesteps,
        "net_arch": net_arch,
        "activation_fn": activation_fn.__name__,
        "vps_to_win": vps_to_win,
        "map_type": map_type,
        "enemies": [str(enemy) for enemy in enemies],
        "reward_function": reward_function.__name__,
        "representation": representation,
        "batch_size": batch_size,
        "gamma": gamma,
        "normalized": normalized,
        "cnn_arch": cnn_arch,
        "selfplay": selfplay,
        "experiment_name": experiment_name,
        "n_envs": n_envs,
        "n_steps": n_steps,
        "n_epochs": n_epochs,
        "seed": seed,
    }
    run = wandb.init(
        project="catanatron",
        config=config,
        sync_tensorboard=True,
    )

    def print_name():
        print(experiment_name)

    atexit.register(print_name)

    def make_env(rank, seed=0):
        """Return a factory that builds one action-masked environment.

        ``rank`` and ``seed`` are accepted for the usual vec-env factory
        signature but are not used by the factory itself.
        """

        def _init():
            env = gym.make(
                env_name,
                config={
                    "map_type": map_type,
                    "vps_to_win": vps_to_win,
                    "enemies": enemies,
                    "reward_function": reward_function,
                    "representation": representation,
                    "normalized": normalized,
                },
            )
            return ActionMasker(env, mask_fn)

        return _init

    env = None
    try:
        # Create the vectorized environment
        env = SubprocVecEnv([make_env(i, seed) for i in range(n_envs)])
        env = VecMonitor(env)

        # Print the observation space to verify its type
        print("Observation Space:", env.observation_space)
        device = th.device("cuda" if th.cuda.is_available() else "cpu")
        print(f"Using device: {device}")

        script_dir = os.path.dirname(os.path.abspath(__file__))
        model_path = os.path.join(script_dir, "model")
        try:
            model = MaskablePPO.load(model_path, env, device=device)
            # Override the training configuration from the previously trained model
            model.gamma = gamma
            model.ent_coef = ent_coef
            model.learning_rate = lr_schedule
            model.batch_size = batch_size
            model._setup_lr_schedule()
            print("Loaded", "model")
        except Exception as e:
            # Deliberate best effort: any load failure starts a fresh model.
            print(f"Failed to load the model from {model_path}: {e}")
            print("Creating a new model.")
            policy_kwargs: Any = dict(
                activation_fn=activation_fn, net_arch=net_arch[0]
            )
            if representation == "mixed":
                policy_kwargs["features_extractor_class"] = CustomCNN
                policy_kwargs["features_extractor_kwargs"] = dict(
                    cnn_arch=cnn_arch, features_dim=512  # Adjust as needed
                )
            model = MaskablePPO(
                MaskableActorCriticPolicy,
                env,
                gamma=gamma,
                n_steps=n_steps,
                batch_size=batch_size,
                n_epochs=n_epochs,
                policy_kwargs=policy_kwargs,
                learning_rate=lr_schedule,
                ent_coef=ent_coef,
                verbose=1,
                tensorboard_log="./logs/mppo_tensorboard/" + experiment_name,
                device=device,
                seed=seed,
            )

        # NOTE(review): wandb_callback is built but not registered in the
        # callback list below - confirm whether that is intended.
        wandb_callback = WandbCallback(
            model_save_path=f"models/{run.id}",
        )
        # save_freq counts callback steps, and each one advances every env,
        # so a checkpoint is written every 100_000 * n_envs timesteps.
        checkpoint_callback = CheckpointCallback(
            save_freq=100_000, save_path="./logs/", name_prefix=experiment_name
        )
        callback = CallbackList([checkpoint_callback])

        if selfplay:
            selfplay_iterations = 10
            for _ in range(selfplay_iterations):
                model.learn(
                    total_timesteps=int(total_timesteps / selfplay_iterations),
                    callback=callback,
                )
                model.save(model_path)
        else:
            model.learn(total_timesteps=total_timesteps, callback=callback)
            model.save(model_path)

        elapsed_time = time.time() - start_time
        timesteps_per_second = total_timesteps / elapsed_time
        print(f"Training completed in {elapsed_time:.2f} seconds.")
        print(f"Timesteps per second = {timesteps_per_second:.2f}")
    finally:
        # Always stop the worker processes and close the wandb run.
        if env is not None:
            env.close()
        run.finish()


if __name__ == "__main__":
    multiprocessing.set_start_method("spawn")
    main()