
Commit bcd46e1

Added pretraining with saved data
1 parent 04c4c39 commit bcd46e1

6 files changed: 26 additions & 15 deletions


Config.py

Lines changed: 13 additions & 6 deletions
@@ -4,10 +4,6 @@
 import numpy as np
 import torch
 
-random.seed(42)
-np.random.seed(42)
-torch.random.manual_seed(42)
-
 
 class _ConfigPaths:
     root = Path(__file__).parent
@@ -17,12 +13,17 @@ class _ConfigPaths:
     model_path.mkdir(exist_ok=True)
 
 
-class Config(_ConfigPaths):
+class _ConfigAgent:
     # hidden_sizes = (256, 128, 64, 32)
-    c = .1
     hidden_sizes = (256,)
     # hidden_sizes = tuple()
+    c = .1
     learning_rate = 1e-3
+    debug = False
+    pretrain = True
+
+
+class Config(_ConfigPaths, _ConfigAgent):
     max_results_held = 100
     minimal_relative_agent_improvement = 1.1
     min_games_to_replace_agents = 20
@@ -33,3 +34,9 @@ class Config(_ConfigPaths):
     n_games = None
     n_players = 2
     n_actions = 46
+
+
+if Config.debug:
+    random.seed(42)
+    np.random.seed(42)
+    torch.random.manual_seed(42)
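This diff splits the agent hyperparameters into a _ConfigAgent mixin and moves the global seeding behind the new debug flag, so ordinary self-play runs stay stochastic. A minimal sketch of how the composed Config resolves attributes through both bases (the data_dir stand-in is illustrative, not from the repo):

import random

import numpy as np
import torch


class _ConfigPaths:
    data_dir = "data"  # illustrative stand-in for the real path attributes


class _ConfigAgent:
    c = .1
    debug = False
    pretrain = True


class Config(_ConfigPaths, _ConfigAgent):
    n_players = 2


# Attribute lookup walks the MRO, so Config sees both mixins' fields.
assert Config.c == .1 and Config.data_dir == "data"

# Seeding is now opt-in: flip debug to True for reproducible runs.
if Config.debug:
    random.seed(42)
    np.random.seed(42)
    torch.random.manual_seed(42)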

agent/RLDataset.py

Lines changed: 1 addition & 1 deletion
@@ -12,4 +12,4 @@ def __len__(self):
         return len(self.examples)
 
     def __getitem__(self, index) -> tuple[np.array, ...]:
-        return np.array(self.examples[index][0]), self.examples[index][1], np.array([self.examples[index][2]])
+        return np.array(self.examples[index][0]), np.array(self.examples[index][1]), np.array([self.examples[index][2]])
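The extra np.array around the policy targets matters once the dataset goes through a DataLoader: the default collate function stacks per-sample numpy arrays into one batch tensor, whereas a plain Python list is transposed into a list of per-element tensors. A self-contained sketch, assuming each example is a (state, pi, result) tuple as in this repo:

import numpy as np
from torch.utils.data import DataLoader, Dataset


class ToyDataset(Dataset):
    def __init__(self, examples):
        self.examples = examples

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, index):
        state, pi, result = self.examples[index]
        return np.array(state), np.array(pi), np.array([result])


examples = [([0, 1], [0.3, 0.7], 1), ([1, 0], [0.5, 0.5], -1)]
states, pis, results = next(iter(DataLoader(ToyDataset(examples), batch_size=2)))
print(pis.shape)  # torch.Size([2, 2]) -- one stacked tensor, not a list of tensors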

agent/policy.py

Lines changed: 2 additions & 2 deletions
@@ -9,7 +9,7 @@
 
 def policy(
     game: Game,
-    agent: nn.Module,
+    agents: dict[int, nn.Module],
     c: float,
     n_simulations: int,
 ):
@@ -20,6 +20,6 @@ def policy(
     initial_state = game.get_state()
     all_moves = game.get_possible_actions()
     for _ in range(n_simulations):
-        search(game.copy(), agent, c, N, visited, P, Q)
+        search(game.copy(), agents, c, N, visited, P, Q)
     pi = [N[initial_state][a] for a in all_moves]
     return pi, all_moves[np.argmax(pi)]
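After this change, callers hand policy the whole id-to-network mapping (as self_play now does) and search picks the right model per simulated move. The returned pi is the raw visit-count vector over the root's legal moves, and the chosen action is simply the most-visited one; a toy illustration of that final step, with made-up moves and counts:

import numpy as np

# Toy visit counts for three hypothetical legal moves.
all_moves = ["draw", "pass", "play"]
pi = [3, 10, 1]
assert all_moves[np.argmax(pi)] == "pass"  # the most-simulated action wins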

agent/search.py

Lines changed: 3 additions & 2 deletions
@@ -8,7 +8,7 @@
 
 def search(
     game: Game,
-    agent: nn.Module,
+    agents: dict[int, nn.Module],
     c: float,
     N: defaultdict,
     visited: set,
@@ -18,6 +18,7 @@ def search(
     if game.is_terminal():
         return game.get_results()[game.current_player.id]
     state = game.get_state()
+    agent = agents[game.current_player.id]
     if state not in visited:
         visited.add(state)
         move_scores, v = agent(Tensor([state]))
@@ -34,7 +35,7 @@ def search(
     )
 
     next_game_state = game.perform(action)
-    v = search(next_game_state, agent, c, N, visited, P, Q)
+    v = search(next_game_state, agents, c, N, visited, P, Q)
 
     Q[state][action] = (N[state][action] * Q[state].get(action, 1) + v) / (
         N[state][action] + 1
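The lookup agents[game.current_player.id] is the heart of this change: each simulated position is evaluated by the network belonging to the player to move. The Q update beneath it is an incremental running mean, which is also why the Q[state].get(action, 1) default never leaks into the value: on the first update the visit count is zero, so the default is multiplied away. A standalone check of the identity:

# (n * q + v) / (n + 1) folds a new value v into a mean q over n samples;
# applied repeatedly it reproduces the plain average of everything seen.
values = [0.5, -1.0, 1.0]
q, n = 1, 0  # q starts at the .get(action, 1) default
for v in values:
    q = (n * q + v) / (n + 1)
    n += 1
assert abs(q - sum(values) / len(values)) < 1e-12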

agent/self_play.py

Lines changed: 2 additions & 3 deletions
@@ -1,5 +1,5 @@
 from collections import deque
-from itertools import cycle, count
+from itertools import count
 
 import numpy as np
 from tqdm import tqdm
@@ -17,8 +17,7 @@ def self_play(agents: deque[Agent]) -> tuple[list[tuple[np.array, np.array, int]
     for agent in agents:
         agent.eval()
     for _ in tqdm(count()):
-        agent = id_to_agent[game.current_player.id]
-        pi, action = policy(game, agent, Config.c, Config.n_simulations)
+        pi, action = policy(game, id_to_agent, Config.c, Config.n_simulations)
         action_index = game.all_moves.index(action)
         onehot_encoded_action = np.zeros(Config.n_actions)
         onehot_encoded_action[action_index] = 1
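With the per-turn agent lookup moved into the search, the loop body only builds the training target: a one-hot vector over the fixed global move list, indexed by where the chosen action sits in game.all_moves. A minimal sketch with a made-up five-move game (the real list has Config.n_actions == 46 entries):

import numpy as np

all_moves = ["draw", "pass", "play_0", "play_1", "play_2"]
action = "play_1"
onehot_encoded_action = np.zeros(len(all_moves))
onehot_encoded_action[all_moves.index(action)] = 1
print(onehot_encoded_action)  # [0. 0. 0. 1. 0.]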

main.py

Lines changed: 5 additions & 1 deletion
@@ -2,6 +2,7 @@
 from collections import deque
 from copy import deepcopy
 from itertools import count
+from pathlib import Path
 
 import torch
 
@@ -14,10 +15,13 @@
 def main():
     training_buffer = deque(maxlen=Config.training_buffer_len)
     agents = deque((Agent(Config.n_players) for _ in range(Config.n_players)), maxlen=Config.n_players)
+    if Config.pretrain:
+        training_buffer += list(map(eval, map(Path.read_text, sorted(Config.data_path.iterdir(), key=lambda path: int(path.name), reverse=True)[:Config.training_buffer_len])))
+        train_agent(agents[-1], training_buffer)
     scores = deque(maxlen=Config.max_results_held)
     for _ in (count() if Config.n_games is None else range(Config.n_games)):
         buffer, winner = self_play(agents)
-        Config.data_path.joinpath(str(max((*tuple(map(int, map(str, Config.data_path.iterdir()))), -1)) + 1)).write_text(str((list(buffer[0][0]), list(buffer[0][1]), buffer[0][2])))
+        Config.data_path.joinpath(str(max((*tuple(int(path.name) for path in Config.data_path.iterdir()), -1)) + 1)).write_text(str((list(buffer[0][0]), list(buffer[0][1]), buffer[0][2])))
         scores.append(agents[-1] is winner)
         if len(scores) >= Config.min_games_to_replace_agents and sum(scores) > Config.minimal_relative_agent_improvement * len(scores) / len(agents):
             torch.save(agents[-1].state_dict(), Config.model_path.joinpath(str(max(map(int, (*re.findall(r'\d+', ''.join(map(str, Config.model_path.iterdir()))), -1))) + 1) + ".pth"))
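The new pretraining block reads back the most recent saved examples by numeric filename and round-trips them through str() and eval(). Since each file holds a single (list, list, int) literal, ast.literal_eval parses the same format while refusing arbitrary expressions; a sketch of that safer round-trip, using a hypothetical file path rather than Config.data_path:

from ast import literal_eval
from pathlib import Path

path = Path("data/0")  # hypothetical saved-example file
path.parent.mkdir(exist_ok=True)
path.write_text(str(([0.0, 1.0], [0.25, 0.75], 1)))

state, pi, result = literal_eval(path.read_text())
print(state, pi, result)  # [0.0, 1.0] [0.25, 0.75] 1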
