
Commit bf9b923

Made games more fair by replaying with switched positions
1 parent ac4ff65

3 files changed: 36 additions & 16 deletions


Config.py

Lines changed: 6 additions & 4 deletions
```diff
@@ -7,8 +7,10 @@
 
 class _ConfigPaths:
     root = Path(__file__).parent
-    data_path = root / 'data'
-    data_path.mkdir(exist_ok=True)
+    training_data_path = root / 'training_data'
+    training_data_path.mkdir(exist_ok=True)
+    evaluation_data_path = root / 'evaluation_data'
+    evaluation_data_path.mkdir(exist_ok=True)
     model_path = root / 'models'
     model_path.mkdir(exist_ok=True)
 
@@ -26,9 +28,9 @@ class _ConfigAgent:
 class Config(_ConfigPaths, _ConfigAgent):
     max_results_held = 100
     minimal_relative_agent_improvement = 1.1
-    min_games_to_replace_agents = 20
+    min_games_to_replace_agents = 40
     train_batch_size = 64
-    training_buffer_len = 1000
+    training_buffer_len = 100_000
     min_n_points_to_finish = 15
     n_simulations = 100
     n_games = None
```
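Two of the bumped values track the behavioural change below: each call to `self_play` now plays one game per seating order, so more results arrive per iteration, which presumably motivates doubling `min_games_to_replace_agents` and enlarging the buffer. Since the `mkdir` calls run in the class body, importing the module bootstraps the directory layout; a quick hypothetical check (assuming the module imports as `Config`):

```python
# Hypothetical REPL check: importing Config runs the class bodies above,
# which create the data directories as a side effect of mkdir().
from Config import Config

print(Config.training_data_path.exists())    # True -- created at import time
print(Config.evaluation_data_path.exists())  # True
print(Config.training_buffer_len)            # 100000
```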

agent/self_play.py

Lines changed: 20 additions & 6 deletions
```diff
@@ -1,5 +1,6 @@
 from collections import deque
-from itertools import count
+from itertools import count, cycle, islice
+from more_itertools import windowed
 
 import numpy as np
 from tqdm import tqdm
@@ -10,12 +11,23 @@
 from src.Game import Game
 
 
-def self_play(agents: deque[Agent]) -> tuple[list[tuple[np.array, np.array, int]], Agent]:
+def self_play(agents: deque[Agent]) -> tuple[list[tuple[np.array, np.array, int]], list[Agent]]:
     states = []
-    game = Game(n_players=Config.n_players)
-    id_to_agent = dict((player.id, agent) for agent, player in zip(agents, game.players))
+    winners = []
+    initial_state = Game(n_players=Config.n_players)
     for agent in agents:
         agent.eval()
+    for agents_in_order in islice(windowed(cycle(agents), Config.n_players), Config.n_players):
+        game = initial_state.copy()
+        id_to_agent = dict((player.id, agent) for agent, player in zip(agents_in_order, game.players))
+        results, winner = _perform_game(game, [], id_to_agent)
+        states += results
+        winners.append(winner)
+    return states, winners
+
+
+def _perform_game(game: Game, states: list, id_to_agent: dict[int, Agent]) -> tuple[
+    list[tuple[np.array, np.array, int]], Agent]:
     for _ in tqdm(count()):
         agent = id_to_agent[game.current_player.id]
         pi, action = policy(game, agent, Config.c, Config.n_simulations)
@@ -26,5 +38,7 @@ def self_play(agents: deque[Agent]) -> tuple[list[tuple[np.array, np.array, int]
         game = game.perform(action)
         if game.is_terminal():
             result = game.get_results()
-            return (list((state[0].get_state(), state[1], int(result[state[0].current_player.id] == 1)) for state in states),
-                    id_to_agent[next(player.id for player in game.players if result[player.id])])
+            return (
+                list(
+                    (state[0].get_state(), state[1], int(result[state[0].current_player.id] == 1)) for state in states),
+                id_to_agent[next(player.id for player in game.players if result[player.id])])
```
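The seating rotation is the heart of the fairness fix: `windowed(cycle(agents), Config.n_players)` slides a window of size `n_players` over the endlessly repeated agent sequence, and `islice(..., Config.n_players)` keeps the first `n_players` windows. A standalone sketch (string stand-ins for `Agent` objects, assuming the deque holds exactly `Config.n_players` agents):

```python
# Minimal demonstration of the rotation produced by the new loop header.
from itertools import cycle, islice
from more_itertools import windowed

agents = ['A', 'B', 'C']  # stand-ins for Agent instances
n_players = len(agents)

for agents_in_order in islice(windowed(cycle(agents), n_players), n_players):
    print(agents_in_order)
# ('A', 'B', 'C')
# ('B', 'C', 'A')
# ('C', 'A', 'B')
```

Every agent thus plays the same copied initial position from every seat exactly once, so positional advantage cancels out across the replays.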

main.py

Lines changed: 10 additions & 6 deletions
```diff
@@ -18,16 +18,20 @@ def main():
     if Config.pretrain:
         for agent, checkpoint_index in zip(islice(reversed(agents), 1, None), sorted((int(path.name.split('.')[0]) for path in Config.model_path.iterdir()), reverse=True)):
             agent.load_state_dict(torch.load(Config.model_path.joinpath(f'{checkpoint_index}.pth')))
-        agents[-1].load_state_dict(torch.load(Config.model_path.joinpath(f"{max(int(path.name.split('.')[0]) for path in Config.model_path.iterdir())}.pth")))
-        training_buffer += list(map(eval, map(Path.read_text, sorted(Config.data_path.iterdir(), key=lambda path: int(path.name), reverse=True)[:Config.training_buffer_len])))
+        newest = Config.model_path.joinpath(
+            f"{max((*tuple(int(path.name.split('.')[0]) for path in Config.model_path.iterdir()), 0))}.pth")
+        if newest.exists():
+            agents[-1].load_state_dict(torch.load(newest))
+        training_buffer += list(map(eval, map(Path.read_text, sorted(Config.training_data_path.iterdir(), key=lambda path: int(path.name), reverse=True)[:Config.training_buffer_len])))
     train_agent(agents[-1], training_buffer)
     scores = deque(maxlen=Config.max_results_held)
     for _ in (count() if Config.n_games is None else range(Config.n_games)):
-        buffer, winner = self_play(agents)
-        start_index = max((*tuple(int(path.name) for path in Config.data_path.iterdir()), -1)) + 1
+        buffer, winners = self_play(agents)
+        start_index = max((*tuple(int(path.name) for path in Config.training_data_path.iterdir()), -1)) + 1
         for start_index, sample in enumerate(buffer, start_index + 1):
-            Config.data_path.joinpath(str(start_index)).write_text(str((list(sample[0]), list(sample[1]), sample[2])))
-        scores.append(agents[-1] is winner)
+            Config.training_data_path.joinpath(str(start_index)).write_text(str((list(sample[0]), list(sample[1]), sample[2])))
+        for winner in winners:
+            scores.append(agents[-1] is winner)
         if (len(scores) < Config.min_games_to_replace_agents and sum(scores) >= Config.minimal_relative_agent_improvement * Config.min_games_to_replace_agents / len(agents)) or (len(scores) >= Config.min_games_to_replace_agents and sum(scores) >= Config.minimal_relative_agent_improvement * len(scores) / len(agents)):
             torch.save(agents[-1].state_dict(), Config.model_path.joinpath(str(max(map(int, (*re.findall(r'\d+', ''.join(map(str, Config.model_path.iterdir()))), -1))) + 1) + ".pth"))
             agents.append(Agent(Config.n_players))
```
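The promotion condition is dense, so here is the same test restated with a hypothetical helper (not in the repo) and the new config values; the newest agent must beat the equal-share baseline `1/len(agents)` by the 1.1 improvement factor:

```python
# Restatement of the replacement test in main.py; scores is an iterable
# of 0/1 outcomes recording whether the newest agent won each tracked game.
def should_promote(scores, n_agents, min_games=40, improvement=1.1):
    wins, played = sum(scores), len(scores)
    if played < min_games:
        # early promotion: enough wins even before min_games games are tracked
        return wins >= improvement * min_games / n_agents
    return wins >= improvement * played / n_agents

# With 2 agents the equal-share baseline is 1/2, so a 55% win rate is needed:
print(should_promote([1] * 22, n_agents=2))             # True  (22 >= 22.0)
print(should_promote([1] * 54 + [0] * 46, n_agents=2))  # False (54 < 55.0)
```

With two agents this means a 55% win rate over the tracked games, or 22 early wins before 40 games have been recorded.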
