Skip to content

Commit 04c4c39

Browse files
committed
Added data saving
1 parent e8cd14d commit 04c4c39

6 files changed

Lines changed: 11 additions & 7 deletions

File tree

Config.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ class _ConfigPaths:
1919

2020
class Config(_ConfigPaths):
2121
# hidden_sizes = (256, 128, 64, 32)
22+
c = .1
2223
hidden_sizes = (256,)
2324
# hidden_sizes = tuple()
2425
learning_rate = 1e-3

agent/Agent.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ def __init__(
2323
first_size = self._get_size(n_players)
2424
sizes = first_size, *hidden_sizes
2525
self.layers = nn.ModuleList(starmap(nn.Linear, pairwise(sizes)))
26-
self.trained = False
26+
self.trained = True
2727
self.fc_v = nn.Linear(sizes[-1], 1)
2828
self.fc_p = nn.Linear(sizes[-1], n_moves)
2929
self._n_moves = n_moves

agent/self_play.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from collections import deque
2-
from itertools import cycle
2+
from itertools import cycle, count
33

44
import numpy as np
55
from tqdm import tqdm
@@ -16,8 +16,9 @@ def self_play(agents: deque[Agent]) -> tuple[list[tuple[np.array, np.array, int]
1616
id_to_agent = dict((player.id, agent) for agent, player in zip(agents, game.players))
1717
for agent in agents:
1818
agent.eval()
19-
for agent in tqdm(cycle(agents)):
20-
pi, action = policy(game, agent, 1, Config.n_simulations)
19+
for _ in tqdm(count()):
20+
agent = id_to_agent[game.current_player.id]
21+
pi, action = policy(game, agent, Config.c, Config.n_simulations)
2122
action_index = game.all_moves.index(action)
2223
onehot_encoded_action = np.zeros(Config.n_actions)
2324
onehot_encoded_action[action_index] = 1

main.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import re
2-
from collections import deque, defaultdict
2+
from collections import deque
33
from copy import deepcopy
44
from itertools import count
55

@@ -17,9 +17,10 @@ def main():
1717
scores = deque(maxlen=Config.max_results_held)
1818
for _ in (count() if Config.n_games is None else range(Config.n_games)):
1919
buffer, winner = self_play(agents)
20+
Config.data_path.joinpath(str(max((*tuple(map(int, map(str, Config.data_path.iterdir()))), -1)) + 1)).write_text(str((list(buffer[0][0]), list(buffer[0][1]), buffer[0][2])))
2021
scores.append(agents[-1] is winner)
2122
if len(scores) >= Config.min_games_to_replace_agents and sum(scores) > Config.minimal_relative_agent_improvement * len(scores) / len(agents):
22-
torch.save(agents[-1].state_dict(), Config.model_path.joinpath(str(max(map(int, (*re.findall(r'\d+', ''.join(Config.model_path.iterdir())), -1))) + 1) + ".pth"))
23+
torch.save(agents[-1].state_dict(), Config.model_path.joinpath(str(max(map(int, (*re.findall(r'\d+', ''.join(map(str, Config.model_path.iterdir()))), -1))) + 1) + ".pth"))
2324
agents.append(Agent(Config.n_players).load_state_dict(deepcopy(agents[-1].state_dict())))
2425
agents[-1].training = True
2526
scores = deque(maxlen=Config.max_results_held)

src/Game.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,7 @@ def copy(self) -> Self:
130130
),
131131
),
132132
n_players=self.n_players,
133+
_last_turn=self._last_turn,
133134
)
134135
game.current_player = game.players[0]
135136
for player in game.players:

src/moves/Move.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
class Move(ABC):
1111
@abstractmethod
1212
def perform(self, game: "Game") -> "Game":
13-
# game = game.copy()
13+
game = game.copy()
1414
game.is_blocked[game.current_player] = False
1515
return game
1616

0 commit comments

Comments (0)