1#!/usr/bin/env python 2# -*- coding: utf-8 -*- 3""" 4Defines an environment for Tic Tac Toe. 5""" 6 7from __future__ import division 8from __future__ import print_function 9from __future__ import unicode_literals 10 11import os 12import random 13import sys 14 15# Insert the package's parent directory into the system search path, so that this package can be 16# imported when the aixi.py script is run directly from a release archive. 17PROJECT_ROOT = os.path.realpath(os.path.join(os.pardir, os.pardir)) 18sys.path.insert(0, PROJECT_ROOT) 19 20# Ensure xrange is defined on Python 3. 21from six.moves import xrange 22 23from pyaixi import environment, util 24 25# Define a enumeration to represent environment observations: either a square 26# is empty, filled with the agent's piece, or the environment's piece. 27tictactoe_observation_enum = util.enum('oEmpty', 'oAgent', 'oEnv') 28 29# Define a enumeration to represent rewards as a result of actions: invalid actions, 30# losses, null, draws, and wins. 31# NOTE: since the enumeration values need to be positive, these values are defined relative to 3. 32# So -3 points is 0, -2 points is 1, and 2 points is 5. 33tictactoe_reward_enum = util.enum(rInvalid = 0, rLoss = 1, rNull = 3, rDraw = 4, rWin = 5) 34 35# Define some shorthand notation for ease of reference. 36oEmpty = tictactoe_observation_enum.oEmpty 37oAgent = tictactoe_observation_enum.oAgent 38oEnv = tictactoe_observation_enum.oEnv 39 40rInvalid = tictactoe_reward_enum.rInvalid 41rLoss = tictactoe_reward_enum.rLoss 42rNull = tictactoe_reward_enum.rNull 43rDraw = tictactoe_reward_enum.rDraw 44rWin = tictactoe_reward_enum.rWin 45 46class Tic_Tac_Toe(environment.Environment): 47 """ In this domain, the agent plays repeated games of TicTacToe against an 48 opponent who moves randomly. If the agent wins the game, it receives a 49 reward of 2. If there is a draw, the agent receives a reward of 1. A loss 50 penalizes the agent by -2. If the agent makes an illegal move, by moving on 51 top of an already filled square, then it receives a reward of -3. A legal 52 move that does not end the game earns no reward. An illegal reward causes 53 the game to restart. 54 55 Domain characteristics: 56 - environment: "tictactoe" 57 - maximum action: 8 (4 bits) 58 - maximum observation: 174672 (18 bits) 59 - 174672 (decimal) = 101010101010101010 (binary) 60 - maximum reward: 5 (3 bits) 61 """ 62 63 # Instance methods. 64 65 def __init__(self, options = {}): 66 """ Construct the Tic Tac Toe environment from the given options. 67 68 - `options` is a dictionary of named options and their values. 69 """ 70 71 # Set up the base environment. 72 environment.Environment.__init__(self, options = options) 73 74 # Define the acceptable action values. 75 self.valid_actions = xrange(0, 9) 76 77 # Define the acceptable observation values. 78 self.valid_observations = xrange(0, 174672 + 1) 79 80 # Define the acceptable reward values. 81 self.valid_rewards = list(tictactoe_reward_enum.keys()) 82 83 # Set the initial reward. 84 self.reward = 0 85 86 # Set up the game. 87 self.reset() 88 # end def 89 90 def check_win(self): 91 """ Check if either player has won the game. 92 Returns True if so, False otherwise. 93 94 (Called `checkWin` in the C++ version.) 95 """ 96 97 # Check if we've got a row of three matching symbols. 98 for r in xrange(0, 3): 99 # Is this row all matching, non-empty symbols? 100 if (self.board[r][0] != oEmpty and \ 101 self.board[r][0] == self.board[r][1] and \ 102 self.board[r][1] == self.board[r][2]): 103 # Yes. Someone has won. 104 return True 105 # end if 106 # end for 107 108 # Check if we've got any columns of three matching symbols. 109 for c in xrange(0, 3): 110 # Is this column all matching, non-empty symbols? 111 if (self.board[0][c] != oEmpty and \ 112 self.board[0][c] == self.board[1][c] and \ 113 self.board[1][c] == self.board[2][c]): 114 # Yes. Someone has won. 115 return True 116 # end if 117 # end for 118 119 # Check the diagonals. 120 if (self.board[1][1] != oEmpty and \ 121 self.board[0][0] == self.board[1][1] and \ 122 self.board[1][1] == self.board[2][2]): 123 return True 124 # end if 125 126 if (self.board[1][1] != oEmpty and \ 127 self.board[0][2] == self.board[1][1] and \ 128 self.board[1][1] == self.board[2][0]): 129 return True 130 # end if 131 132 # If we're here, there's no winner yet. 133 return False 134 # end def 135 136 def compute_observation(self): 137 """ Encodes the state of each square into an overall observation and saves 138 the result in self.observation. 139 Each cell corresponds to two bits. 140 141 (Called `computeObservation` in the C++ version.) 142 """ 143 144 self.observation = 0 145 for r in xrange(0, 3): 146 for c in xrange(0, 3): 147 # Shift the existing observation up by 2 bits, and add the current observation to 148 # that. 149 self.observation = self.board[r][c] + (4 * self.observation) 150 # end for 151 # end for 152 # end def 153 154 def perform_action(self, action): 155 """ Receives the agent's action and calculates the new environment percept. 156 (Called `performAction` in the C++ version.) 157 """ 158 159 assert self.is_valid_action(action) 160 161 # Save the action. 162 self.action = action 163 164 # Increment the actions-since-reset counter. 165 self.actions_since_reset += 1 166 167 # Decode the action into the desired row and column to update. 168 r = int(action / 3) 169 c = action % 3 170 171 # If agent makes an invalid move, give the appropriate (lack of) reward and clear the board. 172 if (self.board[r][c] != oEmpty): 173 self.reward = rInvalid 174 self.reset() 175 return 176 # end def 177 178 # The agent makes their move. 179 self.board[r][c] = oAgent 180 181 # If the agent wins or draws, give an appropriate reward and clear the board. 182 if (self.check_win()): 183 self.reward = rWin 184 self.reset() 185 return 186 elif self.actions_since_reset == 5: 187 # If it's been 5 actions since the reset, then this must be a draw, as there's 188 # no way to win from this point. 189 self.reward = rDraw 190 self.reset() 191 return 192 # end def 193 194 # The environment makes a random play. 195 while (self.board[r][c] != oEmpty): 196 # Keep picking board positions at random until we find an unoccupied spot. 197 r = random.randrange(0, 3) 198 c = random.randrange(0, 3) 199 # end while 200 201 # If we're here, we've got an unoccupied spot. 202 self.board[r][c] = oEnv 203 204 # If the environment has won, give an appropriate reward and clear the board. 205 if (self.check_win()): 206 self.reward = rLoss 207 self.reset() 208 return 209 # end def 210 211 # The game continues. 212 self.reward = rNull 213 self.compute_observation() 214 215 return (self.observation, self.reward) 216 # end def 217 218 def print(self): 219 """ Returns a string indicating the status of the environment. 220 """ 221 222 # Show what just happened, correcting the reward for being defined relative to 3. 223 message = "action = %s, observation = %s, reward = %s (%d), board:" % \ 224 (self.action, self.observation, self.reward, (self.reward - 3)) + os.linesep 225 226 # Display the current state of the board. 227 for r in xrange(0, 3): 228 for c in xrange(0, 3): 229 b = self.board[r][c] 230 message += "." if b == oEmpty else ("A" if b == oAgent else "O") 231 # end for 232 message += os.linesep 233 # end for 234 message += os.linesep 235 236 return message 237 # end def 238 239 def reset(self): 240 """ Begin a new game. 241 """ 242 243 # Set up the board. 244 self.board = {} 245 246 for r in xrange(0, 3): 247 for c in xrange(0, 3): 248 # Ensure the row exists. 249 if r not in self.board: 250 self.board[r] = {} 251 # end if 252 253 # Set this element to be empty. 254 self.board[r][c] = oEmpty 255 # end for 256 # end for 257 258 # Set an initial observation. 259 self.compute_observation() 260 261 # Set the actions-since-reset marker. 262 self.actions_since_reset = 0 263 # end def 264# end class