diff --git a/.gitignore b/.gitignore
index efaa3648f..a5dc1270b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -137,4 +137,7 @@ dmypy.json
 # Pyre type checker
 .pyre/
 
-/db-data
\ No newline at end of file
+/db-data
+
+# wandb
+/wandb/
diff --git a/catanatron_experimental/catanatron_experimental/cli/cli_players.py b/catanatron_experimental/catanatron_experimental/cli/cli_players.py
index 850841978..695d13b08 100644
--- a/catanatron_experimental/catanatron_experimental/cli/cli_players.py
+++ b/catanatron_experimental/catanatron_experimental/cli/cli_players.py
@@ -22,6 +22,7 @@
 )
 from catanatron.players.search import VictoryPointPlayer
 from catanatron_experimental.machine_learning.players.mcts import MCTSPlayer
+from catanatron_experimental.machine_learning.players.ppo import PPOPlayer
 from catanatron_experimental.machine_learning.players.playouts import (
     GreedyPlayoutsPlayer,
 )
@@ -95,6 +96,12 @@
         "AlphaBeta but searches only within turn",
         SameTurnAlphaBetaPlayer,
     ),
+    CliPlayer(
+        "PPO",
+        "PPOPlayer",
+        "Proximal Policy Optimization reinforcement learning agent.",
+        PPOPlayer,
+    ),
 ]
 
 
diff --git a/catanatron_experimental/catanatron_experimental/machine_learning/custom_cnn.py b/catanatron_experimental/catanatron_experimental/machine_learning/custom_cnn.py
new file mode 100644
index 000000000..6e3219059
--- /dev/null
+++ b/catanatron_experimental/catanatron_experimental/machine_learning/custom_cnn.py
@@ -0,0 +1,64 @@
+import torch as th
+from torch import nn
+import gymnasium as gym
+from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
+
+
+class CustomCNN(BaseFeaturesExtractor):
+    """
+    Features extractor for "mixed" observations: a board tensor plus a flat
+    vector of numeric features.
+
+    The board goes through a stack of Conv2d -> BatchNorm2d -> ReLU layers and
+    is flattened; the result is concatenated with the numeric vector and
+    projected to ``features_dim`` by a Linear + ReLU layer.
+
+    :param observation_space: (gym.spaces.Dict) Needs a channels-first "board"
+        entry and a 1-D "numeric" entry.
+    :param cnn_arch: List of integers specifying the number of filters in each Conv layer.
+    :param features_dim: (int) Number of features extracted.
+    """
+
+    def __init__(
+        self,
+        observation_space: gym.spaces.Dict,
+        cnn_arch,
+        features_dim: int = 256,
+    ):
+        super().__init__(observation_space, features_dim)
+        n_input_channels = observation_space["board"].shape[0]
+
+        layers = []
+        in_channels = n_input_channels
+        for out_channels in cnn_arch:
+            # 3x3 kernel with stride 1 and padding 1 keeps the spatial size.
+            layers.append(
+                nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
+            )
+            layers.append(nn.BatchNorm2d(out_channels))
+            layers.append(nn.ReLU())
+            in_channels = out_channels
+        layers.append(nn.Flatten())
+        self.cnn = nn.Sequential(*layers)
+
+        # Compute the number of features after CNN with one dummy forward pass.
+        # Do it in eval mode: in training mode BatchNorm2d would fold the random
+        # sample into its running statistics, even under no_grad().
+        self.cnn.eval()
+        with th.no_grad():
+            sample_board = th.as_tensor(
+                observation_space.sample()["board"][None]
+            ).float()
+            n_flatten = self.cnn(sample_board).shape[1]
+        self.cnn.train()
+
+        n_numeric_features = observation_space["numeric"].shape[0]
+        self.linear = nn.Sequential(
+            nn.Linear(n_flatten + n_numeric_features, features_dim), nn.ReLU()
+        )
+
+    def forward(self, observations: dict) -> th.Tensor:
+        """Embed a batch of {"board", "numeric"} observations into ``features_dim`` features."""
+        board_features = self.cnn(observations["board"])
+        concatenated_tensor = th.cat([board_features, observations["numeric"]], dim=1)
+        return self.linear(concatenated_tensor)
diff --git a/catanatron_experimental/catanatron_experimental/machine_learning/players/ppo.py b/catanatron_experimental/catanatron_experimental/machine_learning/players/ppo.py
new file mode 100644
index 000000000..791edd5db
--- /dev/null
+++ b/catanatron_experimental/catanatron_experimental/machine_learning/players/ppo.py
@@ -0,0 +1,161 @@
+from typing import Iterable
+import numpy as np
+import os
+from sb3_contrib import MaskablePPO
+
+from catanatron.game import Game
+from catanatron.models.actions import Action
+from catanatron.models.player import Player
+from catanatron_gym.envs.catanatron_env import from_action_space, to_action_space
+from catanatron_gym.features import create_sample, get_feature_ordering
+from catanatron_gym.board_tensor_features import (
+    create_board_tensor,
+    is_graph_feature,
+)
+from catanatron_experimental.machine_learning.custom_cnn import CustomCNN
+
+
+class PPOPlayer(Player):
+    """
+    Proximal Policy Optimization (PPO) reinforcement learning agent.
+
+    Wraps a saved ``MaskablePPO`` model: every decision builds a
+    {"board", "numeric"} observation and an action mask from the game, asks
+    the model for an action index and maps that index back to an ``Action``.
+    """
+
+    def __init__(self, color, model_path=None):
+        """
+        :param color: Color this player plays as.
+        :param model_path: Path of the saved model. ``None`` loads ``model.zip``
+            from the parent directory of this file; an empty string skips
+            loading, in which case ``load()`` must be called before playing.
+        """
+        super().__init__(color)
+        self.model = None
+        # Both are filled in by the first decide() call, from the number of
+        # players in that game.
+        self.features = None
+        self.numeric_features = None
+        if model_path is None:
+            script_dir = os.path.dirname(os.path.abspath(__file__))
+            model_path = os.path.join(script_dir, "..", "model.zip")
+        if model_path:
+            self.load(model_path)
+
+    def decide(self, game: Game, playable_actions: Iterable[Action]):
+        """
+        Pick one of ``playable_actions`` using the loaded model.
+
+        Falls back to the first playable action when the predicted index
+        cannot be mapped back to a playable action.
+
+        :raises ValueError: if no model has been loaded.
+        """
+        if self.model is None:
+            raise ValueError("Model not loaded. Call load() first.")
+
+        # The actions are walked several times below (mask, membership test,
+        # fallback); materialize them so a one-shot iterable is not exhausted
+        # after the first pass.
+        playable_actions = list(playable_actions)
+
+        # Initialize numeric_features based on the current game
+        if self.numeric_features is None:
+            num_players = len(game.state.players)
+            self.features = get_feature_ordering(num_players)
+            self.numeric_features = [
+                f for f in self.features if not is_graph_feature(f)
+            ]
+
+        # Generate observation from the game state
+        obs = self.generate_observation(game)
+
+        # Generate action mask from playable actions
+        action_mask = self.generate_action_mask(playable_actions)
+
+        # Predict the action index
+        action_index, _ = self.model.predict(
+            obs, action_masks=action_mask, deterministic=True
+        )
+
+        # Map the action index to the actual Action
+        try:
+            selected_action = self.action_index_to_action(
+                action_index, playable_actions
+            )
+            return selected_action
+        except Exception as e:
+            print(f"Error mapping action index to Action: {e}")
+            # Default to the first playable action
+            return playable_actions[0]
+
+    def generate_observation(self, game: Game):
+        """
+        Build the {"board", "numeric"} observation (float32 arrays) for
+        ``game`` as seen by this player's color.
+
+        Reads ``self.numeric_features``, which ``decide()`` sets.
+        """
+        # Create the sample
+        sample = create_sample(game, self.color)
+
+        # Generate board tensor
+        board_tensor = create_board_tensor(
+            game, self.color, channels_first=True
+        ).astype(np.float32)
+
+        # Extract numeric features
+        numeric = np.array(
+            [float(sample[i]) for i in self.numeric_features], dtype=np.float32
+        )
+
+        # Create the observation
+        obs = {"board": board_tensor, "numeric": numeric}
+        return obs
+
+    def generate_action_mask(self, playable_actions: Iterable[Action]):
+        """
+        Return a boolean array over the model's action space, True at the
+        index of every playable action.
+
+        Actions that fail to convert are reported on stdout; indices outside
+        the action space are skipped silently.
+        """
+        n_actions = self.model.action_space.n
+        action_mask = np.zeros(n_actions, dtype=bool)
+        for action in playable_actions:
+            try:
+                action_index = self.action_to_action_index(action)
+                if action_index is not None and 0 <= action_index < n_actions:
+                    action_mask[action_index] = True
+            except Exception as e:
+                print(f"Error in action_to_action_index: {e}")
+        return action_mask
+
+    def action_to_action_index(self, action: Action):
+        """Return the action-space index of ``action``."""
+        return to_action_space(action)
+
+    def action_index_to_action(
+        self, action_index: int, playable_actions: Iterable[Action]
+    ):
+        """
+        Map ``action_index`` back to an ``Action``.
+
+        :raises ValueError: if the mapped action is not in ``playable_actions``.
+        """
+        action = from_action_space(action_index, playable_actions)
+        if action in playable_actions:
+            return action
+        else:
+            raise ValueError(f"Action {action} not in playable actions.")
+
+    def load(self, path):
+        """Load the ``MaskablePPO`` model stored at ``path`` into ``self.model``."""
+        self.model = MaskablePPO.load(
+            path,
+            custom_objects={
+                "features_extractor_class": CustomCNN,
+            },
+        )
diff --git a/catanatron_experimental/catanatron_experimental/machine_learning/reward_functions.py b/catanatron_experimental/catanatron_experimental/machine_learning/reward_functions.py
new file mode 100644
index 000000000..8458ead75
--- /dev/null
+++ b/catanatron_experimental/catanatron_experimental/machine_learning/reward_functions.py
@@ -0,0 +1,67 @@
+# reward_functions.py
+
+import logging
+
+import numpy as np
+from catanatron.state_functions import get_actual_victory_points
+
+logger = logging.getLogger(__name__)
+
+
+def partial_rewards(game, p0_color, vps_to_win):
+    """
+    Calculate the end-of-game reward for ``p0_color``.
+
+    While the game has no winner the reward is 0. Once it has one, the reward
+    is +/-0.20 for winning/losing plus 0.80 times the gap between the player's
+    victory points and the average of the other colors', divided by
+    ``vps_to_win - 1``.
+
+    Args:
+        game: The game instance.
+        p0_color: The color representing the player's position.
+        vps_to_win: The victory points required to win the game.
+
+    Returns:
+        A float representing the partial reward.
+    """
+    winning_color = game.winning_color()
+    if winning_color is None:
+        return 0.0
+
+    total = 0.20 if p0_color == winning_color else -0.20
+    enemy_vps = [
+        get_actual_victory_points(game.state, color)
+        for color in game.state.colors
+        if color != p0_color
+    ]
+    enemy_avg_vp = sum(enemy_vps) / len(enemy_vps)
+    my_vps = get_actual_victory_points(game.state, p0_color)
+    vp_diff = (my_vps - enemy_avg_vp) / (vps_to_win - 1)
+
+    total += 0.80 * vp_diff
+    # Logged at DEBUG instead of printed, so training output is not flooded
+    # with one line each time a finished game is scored.
+    logger.debug(
+        "my_vps = %s enemy_avg_vp = %s partial_rewards = %s",
+        my_vps,
+        enemy_avg_vp,
+        total,
+    )
+    return total
+
+
+def mask_fn(env) -> np.ndarray:
+    """
+    Generates a boolean mask of valid actions for the environment.
+
+    Args:
+        env: The environment instance.
+
+    Returns:
+        A numpy array of booleans indicating valid actions.
+    """
+    valid_actions = env.unwrapped.get_valid_actions()
+    mask = np.zeros(env.action_space.n, dtype=bool)
+    mask[valid_actions] = True
+    return mask
diff --git a/catanatron_experimental/catanatron_experimental/machine_learning/train_ppo_agent.py b/catanatron_experimental/catanatron_experimental/machine_learning/train_ppo_agent.py
new file mode 100644
index 000000000..ee6e29e13
--- /dev/null
+++ b/catanatron_experimental/catanatron_experimental/machine_learning/train_ppo_agent.py
@@ -0,0 +1,268 @@
+import math
+import os
+import random
+
+# Set before wandb is imported further down.
+os.environ["WANDB_DISABLE_SYMLINKS"] = "True"
+from typing import Any
+import atexit
+import time
+import multiprocessing
+
+from functools import partial
+import gymnasium as gym
+import torch as th
+import numpy as np
+from stable_baselines3.common.callbacks import (
+    CheckpointCallback,
+    CallbackList,
+)
+from stable_baselines3.common.vec_env import SubprocVecEnv
+from stable_baselines3.common.vec_env import VecMonitor
+from sb3_contrib.common.maskable.policies import MaskableActorCriticPolicy
+from sb3_contrib.common.wrappers import ActionMasker
+from sb3_contrib.ppo_mask import MaskablePPO
+import wandb
+from wandb.integration.sb3 import WandbCallback
+
+from catanatron import Color
+from catanatron_experimental.machine_learning.custom_cnn import CustomCNN
+from catanatron_experimental.machine_learning.players.value import (
+    ValueFunctionPlayer,
+)
+from catanatron.models.player import RandomPlayer
+from catanatron_experimental.machine_learning.reward_functions import (
+    partial_rewards,
+    mask_fn,
+)
+
+LOAD = False
+
+
+def learning_rate_schedule(initial_lr, final_lr):
+    """
+    Build a linear learning-rate schedule.
+
+    The returned callable maps ``progress_remaining`` to
+    ``final_lr + (initial_lr - final_lr) * progress_remaining``, so it yields
+    ``initial_lr`` at 1.0 and ``final_lr`` at 0.0.
+    """
+
+    def lr_schedule(progress_remaining):
+        return final_lr + (initial_lr - final_lr) * progress_remaining
+
+    return lr_schedule
+
+
+def main():
+    """
+    Train a MaskablePPO agent on the Catanatron gym environment.
+
+    Resumes from the model saved next to this script when it can be loaded,
+    otherwise starts a new one, and saves the result back to that path.
+    """
+    # ===== Params:
+    # With 100,000,000 timesteps, training took 4.97 days
+    total_timesteps = 100_000_000
+    cnn_arch = [64, 128, 256, 512]
+    net_arch = [
+        dict(
+            vf=[4096, 4096, 2048, 2048, 1024, 1024, 512, 512, 256],
+            pi=[4096, 4096, 2048, 2048, 1024, 1024, 512, 512, 256],
+        )
+    ]
+    activation_fn = th.nn.LeakyReLU
+    initial_lr = 1e-4
+    final_lr = 1e-6
+    ent_coef = 0.01
+    vps_to_win = 10
+    env_name = "catanatron_gym:catanatron-v1"
+    map_type = "BASE"
+    enemies = [ValueFunctionPlayer(Color.RED)]
+    reward_function = partial(partial_rewards, vps_to_win=vps_to_win)
+    reward_function.__name__ = partial_rewards.__name__
+    representation = "mixed"
+    # batch_size = 64
+    gamma = 0.99
+    normalized = False
+    selfplay = False
+    seed = 42
+
+    n_envs = 8
+    n_steps = 256
+    batch_size = n_envs * n_steps
+    n_epochs = 10
+
+    assert (
+        n_envs * n_steps
+    ) % batch_size == 0, "batch_size must divide n_envs * n_steps"
+
+    start_time = time.time()
+
+    # Set random seeds for reproducibility
+    random.seed(seed)
+    np.random.seed(seed)
+    th.manual_seed(seed)
+    th.cuda.manual_seed_all(seed)
+    th.backends.cudnn.deterministic = True
+    th.backends.cudnn.benchmark = False
+
+    # Create learning rate schedule
+    lr_schedule = learning_rate_schedule(initial_lr, final_lr)
+
+    # Build Experiment Name
+    iters = round(math.log(total_timesteps, 10))
+    arch_str = (
+        activation_fn.__name__
+        + "x".join([str(i) for i in net_arch[:-1]])
+        + "+"
+        + "vf="
+        + "x".join([str(i) for i in net_arch[-1]["vf"]])
+        + "+"
+        + "pi="
+        + "x".join([str(i) for i in net_arch[-1]["pi"]])
+    )
+    if representation == "mixed":
+        arch_str = "Cnn" + "x".join([str(i) for i in cnn_arch]) + "+" + arch_str
+    enemy_desc = "".join(e.__class__.__name__ for e in enemies)
+    experiment_name = f"ppo-{selfplay}-{normalized}-{iters}-{batch_size}-{gamma}-{enemy_desc}-{reward_function.__name__}-{representation}-{arch_str}-{initial_lr}lr-{vps_to_win}vp-{map_type}map"
+    print(experiment_name)
+
+    # WandB config
+    config = {
+        "initial_learning_rate": initial_lr,
+        "final_learning_rate": final_lr,
+        "total_timesteps": total_timesteps,
+        "net_arch": net_arch,
+        "activation_fn": activation_fn.__name__,
+        "vps_to_win": vps_to_win,
+        "map_type": map_type,
+        "enemies": [str(enemy) for enemy in enemies],
+        "reward_function": reward_function.__name__,
+        "representation": representation,
+        "batch_size": batch_size,
+        "gamma": gamma,
+        "normalized": normalized,
+        "cnn_arch": cnn_arch,
+        "selfplay": selfplay,
+        "experiment_name": experiment_name,
+        "n_envs": n_envs,
+        "n_steps": n_steps,
+        "n_epochs": n_epochs,
+        "seed": seed,
+    }
+    run = wandb.init(
+        project="catanatron",
+        config=config,
+        sync_tensorboard=True,
+    )
+
+    def print_name():
+        print(experiment_name)
+
+    atexit.register(print_name)
+
+    # Define the environment creation function
+    # NOTE(review): rank and seed are accepted but not used by _init.
+    def make_env(rank, seed=0):
+        def _init():
+            env = gym.make(
+                env_name,
+                config={
+                    "map_type": map_type,
+                    "vps_to_win": vps_to_win,
+                    "enemies": enemies,
+                    "reward_function": reward_function,
+                    "representation": representation,
+                    "normalized": normalized,
+                },
+            )
+            env = ActionMasker(env, mask_fn)
+            return env
+
+        return _init
+
+    # Create the vectorized environment
+    env = SubprocVecEnv([make_env(i, seed) for i in range(n_envs)])
+    env = VecMonitor(env)
+
+    # Print the observation space to verify its type
+    print("Observation Space:", env.observation_space)
+    device = th.device("cuda" if th.cuda.is_available() else "cpu")
+    print(f"Using device: {device}")
+
+    script_dir = os.path.dirname(os.path.abspath(__file__))
+    model_path = os.path.join(script_dir, "model")
+    try:
+        model = MaskablePPO.load(model_path, env, device=device)
+        # Override the training configuration from the previously trained model
+        model.gamma = gamma
+        model.ent_coef = ent_coef
+        model.learning_rate = lr_schedule
+        model.batch_size = batch_size
+        model._setup_lr_schedule()
+        print("Loaded", "model")
+    except Exception as e:
+        print(f"Failed to load the model from {model_path}: {e}")
+        print("Creating a new model.")
+        policy_kwargs: Any = dict(activation_fn=activation_fn, net_arch=net_arch[0])
+        if representation == "mixed":
+            policy_kwargs["features_extractor_class"] = CustomCNN
+            policy_kwargs["features_extractor_kwargs"] = dict(
+                cnn_arch=cnn_arch, features_dim=512  # Adjust as needed
+            )
+        model = MaskablePPO(
+            MaskableActorCriticPolicy,
+            env,
+            gamma=gamma,
+            n_steps=n_steps,
+            batch_size=batch_size,
+            n_epochs=n_epochs,
+            policy_kwargs=policy_kwargs,
+            learning_rate=lr_schedule,
+            ent_coef=ent_coef,
+            verbose=1,
+            tensorboard_log="./logs/mppo_tensorboard/" + experiment_name,
+            device=device,
+            seed=seed,
+        )
+
+    # NOTE(review): wandb_callback is built but not added to the CallbackList
+    # below.
+    wandb_callback = WandbCallback(
+        model_save_path=f"models/{run.id}",
+    )
+    # Save a checkpoint every 100,000 timesteps. save_freq counts callback
+    # calls, and every call advances all n_envs environments by one step.
+    checkpoint_callback = CheckpointCallback(
+        save_freq=max(100_000 // n_envs, 1),
+        save_path="./logs/",
+        name_prefix=experiment_name,
+    )
+    callback = CallbackList([checkpoint_callback])
+
+    if selfplay:
+        selfplay_iterations = 10
+        for i in range(selfplay_iterations):
+            model.learn(
+                total_timesteps=int(total_timesteps / selfplay_iterations),
+                callback=callback,
+            )
+            model.save(model_path)
+    else:
+        model.learn(total_timesteps=total_timesteps, callback=callback)
+        model.save(model_path)
+
+        elapsed_time = time.time() - start_time
+        timesteps_per_second = total_timesteps / elapsed_time
+        print(f"Training completed in {elapsed_time:.2f} seconds.")
+        print(f"Timesteps per second = {timesteps_per_second:.2f}")
+
+    model.save(model_path)
+    env.close()
+    run.finish()
+
+
+if __name__ == "__main__":
+    multiprocessing.set_start_method("spawn")
+    main()
diff --git a/research-requirements.txt b/research-requirements.txt
new file mode 100644
index 000000000..50c1766ae
--- /dev/null
+++ b/research-requirements.txt
@@ -0,0 +1,17 @@
+# Only required to train/run the Players using PyTorch (i.e. PPOPlayer)
+keras==2.13.1
+sb3_contrib==2.3.0
+stable_baselines3==2.3.2
+sympy==1.13.1
+tensorboard==2.13.0
+tensorboard-data-server==0.7.2
+tensorflow==2.13.0
+tensorflow-estimator==2.13.0
+tensorflow-intel==2.13.0; platform_system == "Windows"
+tensorflow-io-gcs-filesystem==0.31.0
+termcolor==2.4.0
+tomli==2.0.1
+torch==2.4.1
+torchaudio==2.4.1
+torchvision==0.19.1
+wandb==0.18.1