Speed up complete

Tesla2000 · Tesla2000 · commit f2408eb7ab2e · 2024-04-14T15:30:15.000+02:00
diff --git a/Config.py b/Config.py
@@ -39,7 +39,7 @@ class Config(_ConfigPaths, _ConfigAgent):
     train_batch_size = 128
     training_buffer_len = 100_000
     min_n_points_to_finish = 15
-    n_simulations = 100
+    n_simulations = 1000
     n_games = None
     n_players = 2
     n_actions = 45
diff --git a/agent/policy.py b/agent/policy.py
@@ -13,12 +13,12 @@ def policy(
     c: float,
     n_simulations: int,
 ):
-    N = defaultdict(list)
+    N = defaultdict(lambda: defaultdict(int))
     visited = set()
-    P = defaultdict(list)
-    Q = defaultdict(list)
+    P = defaultdict(dict)
+    Q = defaultdict(dict)
     initial_state = game.get_state()
-    all_moves = game.get_possible_actions()
+    all_moves = game.all_moves
     for _ in range(n_simulations):
         search(game.copy(), agent, c, N, visited, P, Q)
     pi = np.array([N[initial_state][a] for a in all_moves])
diff --git a/agent/search.py b/agent/search.py
@@ -1,6 +1,7 @@
-from collections import defaultdict
 from math import sqrt
+from collections import defaultdict
 
+import torch
 from torch import nn, Tensor
 
 from src.Game import Game
@@ -10,42 +11,45 @@ def search(
     game: Game,
     agent: nn.Module,
     c: float,
-    N: defaultdict[list[int]],
+    N: defaultdict,
     visited: set,
-    P: defaultdict[list],
-    Q: defaultdict[list],
+    P: defaultdict,
+    Q: defaultdict,
 ):
     if game.is_terminal():
         return -game.get_results()[game.current_player.id]
     state = game.get_state()
     if state not in visited:
         visited.add(state)
-        move_scores, v = agent(Tensor([state]))
+        with torch.no_grad():
+            move_scores, v = agent(Tensor([state]))
         tuple(
-            P[state].__setitem__(move, move_scores[0, index])
+            P[state].__setitem__(move, move_scores[0, index].item())
             for index, move in enumerate(game.all_moves)
         )
-        return -v
+        return -v.item()
     q_state = Q[state]
     p_state = P[state]
     n_state = N[state]
     sqrt_value = sqrt(sum(n_state.values()))
-    def _get_action(game: Game):
-        return max(
-            game.get_possible_actions(),
-            key=lambda action: q_state.get(action, 1) + c * p_state[action] * sqrt_value / (1 + n_state[action]),
-        )
+
     # def _get_action(game: Game):
-    #     actions = sorted(
-    #         game.all_moves,
-    #         key=lambda action: q_state.get(action, 1)
-    #         + c * p_state[action] * sqrt_value / (1 + n_state[action]),
-    #         reverse=True,
+    #     return max(
+    #         game.get_possible_actions(),
+    #         key=lambda action: q_state.get(action, 1) + c * p_state[action] * sqrt_value / (1 + n_state[action]),
     #     )
-    #     for action in actions:
-    #         if action.is_valid(game):
-    #             return action
-    action = _get_action(game)
+    # def _get_action(game: Game):
+    #     best_action = None
+    #     best_value = -float('inf')
+    #     for action in game.all_moves:
+    #         value = q_state.get(action, 1) + c * p_state[action] * sqrt_value / (1 + n_state[action])
+    #         if value > best_value and action.is_valid(game):
+    #             best_value, best_action = value, action
+    #     return best_action
+    action = max(
+        game.get_possible_actions(),
+        key=lambda action: q_state.get(action, 1) + c * p_state[action] * sqrt_value / (1 + n_state[action]),
+    )
     next_game_state = game.perform(action)
     v = search(next_game_state, agent, c, N, visited, P, Q)
 
diff --git a/agent/self_play.py b/agent/self_play.py
@@ -32,7 +32,7 @@ def self_play(
 def _perform_game(
     game: Game, states: list, id_to_agent: dict[int, Agent]
 ) -> tuple[list[tuple[np.array, np.array, int]], Agent]:
-    for turn in tqdm(count()):
+    for _ in tqdm(count()):
         agent = id_to_agent[game.current_player.id]
         pi, action = policy(game, agent, Config.c, Config.n_simulations)
         states.append((game, pi / pi.sum(), 0))
@@ -47,7 +47,6 @@ def _perform_game(
                         int(result[state[0].current_player.id] == 1),
                     )
                     for state in states
-                    if state[1] != game.null_move
                 ),
                 id_to_agent[
                     next(player.id for player in game.players if result[player.id])