
Commit ac4ff65

Merge pull request #2 from Tesla2000/feature/training-loop
Feature/training loop
2 parents 7654d78 + 632fc3c commit ac4ff65

20 files changed

Lines changed: 276 additions & 139 deletions

.gitignore

Lines changed: 3 additions & 0 deletions

@@ -0,0 +1,3 @@
+/sandbox.py
+/data/
+/models/

Config.py

Lines changed: 32 additions & 5 deletions

@@ -1,15 +1,42 @@
 import random
+from pathlib import Path

 import numpy as np
 import torch

-random.seed(42)
-np.random.seed(42)
-torch.random.manual_seed(42)

+class _ConfigPaths:
+    root = Path(__file__).parent
+    data_path = root / 'data'
+    data_path.mkdir(exist_ok=True)
+    model_path = root / 'models'
+    model_path.mkdir(exist_ok=True)

-class Config:
+
+class _ConfigAgent:
+    # hidden_sizes = (256, 128, 64, 32)
+    hidden_sizes = (256,)
+    # hidden_sizes = tuple()
+    c = .1
+    learning_rate = 1e-3
+    debug = False
+    pretrain = True
+
+
+class Config(_ConfigPaths, _ConfigAgent):
+    max_results_held = 100
+    minimal_relative_agent_improvement = 1.1
+    min_games_to_replace_agents = 20
+    train_batch_size = 64
+    training_buffer_len = 1000
     min_n_points_to_finish = 15
     n_simulations = 100
-    n_games = 1
+    n_games = None
     n_players = 2
+    n_actions = 46
+
+
+if Config.debug:
+    random.seed(42)
+    np.random.seed(42)
+    torch.random.manual_seed(42)
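
A minimal sketch of how the merged Config resolves (assuming the file above is importable from the project root; attribute lookup simply follows Python's MRO through _ConfigPaths and _ConfigAgent). Note that with debug = False the seeds are no longer fixed, so runs are non-deterministic by default.

from Config import Config

print(Config.hidden_sizes)  # (256,) -- inherited from _ConfigAgent
print(Config.data_path)     # <project root>/data -- inherited from _ConfigPaths
print(Config.n_games)       # None, which main.py treats as "play indefinitely"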

agent/Agent.py

Lines changed: 10 additions & 12 deletions

@@ -3,42 +3,40 @@
 import numpy as np
 from torch import nn, Tensor

+from Config import Config
+

 class Agent(nn.Module):
     _input_size_dictionary = {
-        2: 215,
+        2: 205,
     }

     def __init__(
         self,
         n_players: int,
-        hidden_sizes: tuple = (256, 128, 64, 32),
+        hidden_sizes: tuple = Config.hidden_sizes,
         n_moves: int = 46,
     ):
         super().__init__()
-        self.relu = nn.ReLU()
         self.tanh = nn.Tanh()
         self.softmax = nn.Softmax(dim=1)
         first_size = self._get_size(n_players)
         sizes = first_size, *hidden_sizes
-        self.layers = tuple(starmap(nn.Linear, pairwise(sizes)))
-        for index, layer in enumerate(self.layers):
-            setattr(self, f"layer_{index}", layer)
-        self.fc_v = nn.Linear(hidden_sizes[-1], 1)
-        self.fc_p = nn.Linear(hidden_sizes[-1], n_moves)
+        self.layers = nn.ModuleList(starmap(nn.Linear, pairwise(sizes)))
+        self.trained = True
+        self.fc_v = nn.Linear(sizes[-1], 1)
+        self.fc_p = nn.Linear(sizes[-1], n_moves)
         self._n_moves = n_moves
-        self._trained = False

     def _get_size(self, n_players: int) -> int:
         return self._input_size_dictionary[n_players]

     def forward(self, state: Tensor):
-        if not self.training and not self._trained:
+        if not self.training and not self.trained:
             return self.softmax(Tensor(np.random.random((1, self._n_moves)))), Tensor(
                 np.random.uniform(-1, 1, (1, 1))
             )
-        self._trained = True
+        self.trained = True
         for layer in self.layers:
             state = layer(state)
-            state = self.relu(state)
         return self.softmax(self.fc_p(state)), self.tanh(self.fc_v(state))
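
A quick shape check of the two-headed network, as a sketch (assumes the repository root is on sys.path, the 2-player input size of 205 from the dictionary above, and the default hidden_sizes = (256,) from Config). With the ReLU dropped in this commit, the hidden stack composes to an affine map, so only the softmax and tanh heads are non-linear.

import torch
from agent.Agent import Agent

agent = Agent(n_players=2)  # pairwise sizes build a single Linear(205, 256)
state = torch.rand(1, 205)  # dummy batch of one encoded 2-player state
p, v = agent(state)         # module starts in training mode, so no random branch
print(p.shape)              # torch.Size([1, 46]) -- softmax over the 46 moves
print(v.shape)              # torch.Size([1, 1])  -- tanh value in (-1, 1)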

agent/RLDataset.py

Lines changed: 16 additions & 0 deletions

@@ -0,0 +1,16 @@
+from collections import deque
+
+import numpy as np
+from torch.utils.data import Dataset
+
+
+class RLDataset(Dataset):
+    def __init__(self, examples: deque[tuple[tuple, np.array, int]]):
+        self.examples = examples
+
+    def __len__(self):
+        return len(self.examples)
+
+    def __getitem__(self, index) -> tuple[np.array, ...]:
+        return np.array(self.examples[index][0]), np.array(self.examples[index][1]), np.array(
+            [self.examples[index][2] * 2 - 1])
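
The third element of each example is a 0/1 win flag, and the * 2 - 1 rescales it to the {-1, +1} range of the agent's tanh value head. A minimal usage sketch with hypothetical toy examples:

from collections import deque

import numpy as np

from agent.RLDataset import RLDataset

toy = deque([((0.0, 1.0), np.ones(46) / 46, 1),
             ((1.0, 0.0), np.ones(46) / 46, 0)])
dataset = RLDataset(toy)
print(dataset[0][2])  # [1]  -- a win maps to +1
print(dataset[1][2])  # [-1] -- a loss maps to -1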

agent/policy.py

Lines changed: 25 additions & 0 deletions

@@ -0,0 +1,25 @@
+from collections import defaultdict
+
+import numpy as np
+from torch import nn
+
+from agent.search import search
+from src.Game import Game
+
+
+def policy(
+    game: Game,
+    agent: nn.Module,
+    c: float,
+    n_simulations: int,
+):
+    N = defaultdict(lambda: defaultdict(int))
+    visited = set()
+    P = defaultdict(dict)
+    Q = defaultdict(dict)
+    initial_state = game.get_state()
+    all_moves = game.get_possible_actions()
+    for _ in range(n_simulations):
+        search(game.copy(), agent, c, N, visited, P, Q)
+    pi = [N[initial_state][a] for a in all_moves]
+    return pi, all_moves[np.argmax(pi)]
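
pi here is the vector of raw root visit counts and the returned action is its argmax; nothing downstream requires a distribution, since self_play.py one-hot encodes the chosen action anyway. If a probability target were wanted instead (the usual AlphaZero formulation), normalizing is a one-liner, sketched with made-up counts:

import numpy as np

pi = np.array([30, 50, 20])  # hypothetical root visit counts
pi_prob = pi / pi.sum()      # array([0.3, 0.5, 0.2])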

agent/search.py

Lines changed: 43 additions & 0 deletions

@@ -0,0 +1,43 @@
+from collections import defaultdict
+from math import sqrt
+
+from torch import nn, Tensor
+
+from src.Game import Game
+
+
+def search(
+    game: Game,
+    agent: nn.Module,
+    c: float,
+    N: defaultdict,
+    visited: set,
+    P: defaultdict,
+    Q: defaultdict,
+):
+    if game.is_terminal():
+        return -game.get_results()[game.current_player.id]
+    state = game.get_state()
+    if state not in visited:
+        visited.add(state)
+        move_scores, v = agent(Tensor([state]))
+        tuple(
+            P[state].__setitem__(move, move_scores[0, index])
+            for index, move in enumerate(game.all_moves)
+        )
+        return -v
+
+    action = max(
+        game.get_possible_actions(),
+        key=lambda action: Q[state].get(action, 1)
+        + c * P[state][action] * sqrt(sum(N[state].values())) / (1 + N[state][action]),
+    )
+
+    next_game_state = game.perform(action)
+    v = search(next_game_state, agent, c, N, visited, P, Q)
+
+    Q[state][action] = (N[state][action] * Q[state].get(action, 1) + v) / (
+        N[state][action] + 1
+    )
+    N[state][action] += 1
+    return -v
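
The key function in the max call is the PUCT selection rule familiar from AlphaZero-style MCTS; in the notation of the code it picks

    a* = argmax_a [ Q(s, a) + c * P(s, a) * sqrt(sum_b N(s, b)) / (1 + N(s, a)) ]

with the twist that unvisited actions default to Q(s, a) = 1 (an optimistic initialization) rather than 0. The value returned from each recursive call is negated because it is scored from the perspective of the player to move at that node.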

agent/self_play.py

Lines changed: 30 additions & 0 deletions

@@ -0,0 +1,30 @@
+from collections import deque
+from itertools import count
+
+import numpy as np
+from tqdm import tqdm
+
+from Config import Config
+from .Agent import Agent
+from .policy import policy
+from src.Game import Game
+
+
+def self_play(agents: deque[Agent]) -> tuple[list[tuple[np.array, np.array, int]], Agent]:
+    states = []
+    game = Game(n_players=Config.n_players)
+    id_to_agent = dict((player.id, agent) for agent, player in zip(agents, game.players))
+    for agent in agents:
+        agent.eval()
+    for _ in tqdm(count()):
+        agent = id_to_agent[game.current_player.id]
+        pi, action = policy(game, agent, Config.c, Config.n_simulations)
+        action_index = game.all_moves.index(action)
+        onehot_encoded_action = np.zeros(Config.n_actions)
+        onehot_encoded_action[action_index] = 1
+        states.append((game, onehot_encoded_action, 0))
+        game = game.perform(action)
+        if game.is_terminal():
+            result = game.get_results()
+            return (list((state[0].get_state(), state[1], int(result[state[0].current_player.id] == 1)) for state in states),
+                    id_to_agent[next(player.id for player in game.players if result[player.id])])
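
A minimal driver sketch (assumes a working src.Game implementation and the Config above; self_play yields one training triple per move played, plus the agent mapped to the winning player):

from collections import deque

from Config import Config
from agent.Agent import Agent
from agent.self_play import self_play

agents = deque((Agent(Config.n_players) for _ in range(Config.n_players)),
               maxlen=Config.n_players)
buffer, winner = self_play(agents)
# buffer: [(state tuple, one-hot action over 46 moves, 0/1 win flag), ...]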

agent/train_agent.py

Lines changed: 23 additions & 82 deletions

@@ -1,87 +1,28 @@
-from collections import defaultdict
-from dataclasses import astuple
-from math import sqrt
+from collections import deque

 import numpy as np
-from torch import nn, Tensor
-from tqdm import tqdm
+from torch import nn, optim
+from torch.utils.data import DataLoader

 from Config import Config
-from src.Game import Game
 from .Agent import Agent
-
-
-def train_agent():
-    agent = Agent(Config.n_players)
-    agent.eval()
-    examples = []
-    examples_per_game = []
-    for i in range(Config.n_games):
-        game = Game(n_players=Config.n_players)
-        while True:
-            pi, action = policy(game, agent, 1, Config.n_simulations)
-            examples_per_game.append((game, pi, 0))
-            game = game.perform(action)
-            print(len(game.players[1].cards), game.players[1].points)
-            if game.is_terminal():
-                for example in examples_per_game:
-                    example[2] = game.get_state()
-                break
-        examples += examples_per_game
-        break
-    return examples
-
-
-def search(
-    game: Game,
-    agent: nn.Module,
-    c: float,
-    N: defaultdict,
-    visited: set,
-    P: defaultdict,
-    Q: defaultdict,
-):
-    state = game.get_state()
-    if game.is_terminal():
-        return game.get_results()[game.current_player]
-    if state not in visited:
-        visited.add(state)
-        move_scores, v = agent(Tensor([state]))
-        tuple(
-            P[state].__setitem__(move, move_scores[0, index])
-            for index, move in enumerate(game.all_moves)
-        )
-        return -v
-
-    action = max(
-        game.get_possible_actions(),
-        key=lambda action: Q[state].get(action, 1)
-        + c * P[state][action] * sqrt(sum(N[state].values())) / (1 + N[state][action]),
-    )
-
-    next_game_state = game.perform(action)
-    v = search(next_game_state, agent, c, N, visited, P, Q)
-
-    Q[state][action] = (N[state][action] * Q[state].get(action, 1) + v) / (
-        N[state][action] + 1
-    )
-    N[state][action] += 1
-    return -v
-
-
-def policy(
-    game: Game,
-    agent: nn.Module,
-    c: float,
-    n_simulations: int,
-):
-    N = defaultdict(lambda: defaultdict(int))
-    visited = set()
-    P = defaultdict(dict)
-    Q = defaultdict(dict)
-    initial_state = game.get_state()
-    all_moves = game.get_possible_actions()
-    for _ in tqdm(range(n_simulations)):
-        search(game, agent, c, N, visited, P, Q)
-    pi = [N[initial_state][a] for a in all_moves]
-    return pi, all_moves[np.argmax(pi)]
+from .RLDataset import RLDataset
+
+
+def train_agent(agent: Agent, train_data: deque[tuple[tuple, np.array, int]]):
+    agent.train()
+    categorical_cross_entropy = nn.CrossEntropyLoss()
+    mse = nn.MSELoss()
+    optimizer = optim.Adam(agent.parameters(), lr=Config.learning_rate)
+    dataset = RLDataset(train_data)
+    loader = DataLoader(dataset, batch_size=Config.train_batch_size)
+    for batch in loader:
+        state, policy, win_probability = batch
+        state, policy, win_probability = state.float(), policy.float(), win_probability.float()
+        optimizer.zero_grad()
+        output_policy, output_v = agent(state)
+        bce = mse(output_v, win_probability)
+        cce = categorical_cross_entropy(output_policy, policy)
+        bce.backward(retain_graph=True)
+        cce.backward()
+        optimizer.step()
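
Despite the local name bce, the value loss is plain MSE. Because PyTorch accumulates gradients across backward() calls, the pair of backward passes (retain_graph=True on the first keeps the shared activations alive for the second) is equivalent to a single backward on the summed loss; a sketch of the equivalent loop body:

optimizer.zero_grad()
output_policy, output_v = agent(state)
loss = mse(output_v, win_probability) + categorical_cross_entropy(output_policy, policy)
loss.backward()   # one pass, same accumulated gradients as the two calls above
optimizer.step()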

main.py

Lines changed: 43 additions & 1 deletion

@@ -1,4 +1,46 @@
+import re
+from collections import deque
+from copy import deepcopy
+from itertools import count, islice
+from pathlib import Path
+
+import torch
+
+from Config import Config
+from agent.Agent import Agent
+from agent.self_play import self_play
 from agent.train_agent import train_agent

+
+def main():
+    training_buffer = deque(maxlen=Config.training_buffer_len)
+    agents = deque((Agent(Config.n_players) for _ in range(Config.n_players)), maxlen=Config.n_players)
+    if Config.pretrain:
+        for agent, checkpoint_index in zip(islice(reversed(agents), 1, None), sorted((int(path.name.split('.')[0]) for path in Config.model_path.iterdir()), reverse=True)):
+            agent.load_state_dict(torch.load(Config.model_path.joinpath(f'{checkpoint_index}.pth')))
+        agents[-1].load_state_dict(torch.load(Config.model_path.joinpath(f"{max(int(path.name.split('.')[0]) for path in Config.model_path.iterdir())}.pth")))
+        training_buffer += list(map(eval, map(Path.read_text, sorted(Config.data_path.iterdir(), key=lambda path: int(path.name), reverse=True)[:Config.training_buffer_len])))
+        train_agent(agents[-1], training_buffer)
+    scores = deque(maxlen=Config.max_results_held)
+    for _ in (count() if Config.n_games is None else range(Config.n_games)):
+        buffer, winner = self_play(agents)
+        start_index = max((*tuple(int(path.name) for path in Config.data_path.iterdir()), -1)) + 1
+        for start_index, sample in enumerate(buffer, start_index + 1):
+            Config.data_path.joinpath(str(start_index)).write_text(str((list(sample[0]), list(sample[1]), sample[2])))
+        scores.append(agents[-1] is winner)
+        if (len(scores) < Config.min_games_to_replace_agents and sum(scores) >= Config.minimal_relative_agent_improvement * Config.min_games_to_replace_agents / len(agents)) or (len(scores) >= Config.min_games_to_replace_agents and sum(scores) >= Config.minimal_relative_agent_improvement * len(scores) / len(agents)):
+            torch.save(agents[-1].state_dict(), Config.model_path.joinpath(str(max(map(int, (*re.findall(r'\d+', ''.join(map(str, Config.model_path.iterdir()))), -1))) + 1) + ".pth"))
+            agents.append(Agent(Config.n_players))
+            agents[-1].load_state_dict(deepcopy(agents[-1].state_dict()))
+            agents[-1].training = True
+            scores = deque(maxlen=Config.max_results_held)
+        elif len(scores) >= Config.min_games_to_replace_agents:
+            print(f'{len(scores)} {sum(scores) / len(scores):.2f}')
+        else:
+            print(f'{len(scores)} {sum(scores)}/{len(scores)}')
+        training_buffer += buffer
+        train_agent(agents[-1], training_buffer)
+
+
 if __name__ == "__main__":
-    train_agent()
+    main()
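
A worked check of the replacement criterion under the Config above (n_players = 2, so len(agents) = 2; min_games_to_replace_agents = 20; minimal_relative_agent_improvement = 1.1):

# wins needed once 20 results are held:
threshold = 1.1 * 20 / 2   # 11.0 wins out of 20, i.e. a 55% win rate,
# versus the 50% expected of an agent no better than its frozen opponent;
# as the score window grows toward max_results_held = 100, the bar becomes
# 1.1 * 100 / 2 = 55 wins, the same sustained 55% rate.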
