#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Defines an environment for Tic Tac Toe.
"""

from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import os
import random
import sys

# Insert the package's parent directory into the system search path, so that this package can be
# imported when the aixi.py script is run directly from a release archive.
PROJECT_ROOT = os.path.realpath(os.path.join(os.pardir, os.pardir))
sys.path.insert(0, PROJECT_ROOT)

# Ensure xrange is defined on Python 3.
from six.moves import xrange

from pyaixi import environment, util

# Define an enumeration to represent environment observations: a square is either
# empty, filled with the agent's piece, or filled with the environment's piece.
tictactoe_observation_enum = util.enum('oEmpty', 'oAgent', 'oEnv')

# Define an enumeration to represent rewards as a result of actions: invalid actions,
# losses, null, draws, and wins.
# NOTE: since the enumeration values need to be non-negative, these values are defined
#       relative to 3. So -3 points is 0, -2 points is 1, and 2 points is 5.
tictactoe_reward_enum = util.enum(rInvalid = 0, rLoss = 1, rNull = 3, rDraw = 4, rWin = 5)

# Define some shorthand notation for ease of reference.
oEmpty  = tictactoe_observation_enum.oEmpty
oAgent  = tictactoe_observation_enum.oAgent
oEnv    = tictactoe_observation_enum.oEnv

rInvalid = tictactoe_reward_enum.rInvalid
rLoss    = tictactoe_reward_enum.rLoss
rNull    = tictactoe_reward_enum.rNull
rDraw    = tictactoe_reward_enum.rDraw
rWin     = tictactoe_reward_enum.rWin
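
# Putting the enumeration and the NOTE above together, the encoded values correspond to
# true rewards of: rInvalid -> -3, rLoss -> -2, rNull -> 0, rDraw -> 1, rWin -> 2.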

class Tic_Tac_Toe(environment.Environment):
    """ In this domain, the agent plays repeated games of TicTacToe against an
        opponent who moves randomly. If the agent wins the game, it receives a
        reward of 2. If there is a draw, the agent receives a reward of 1. A loss
        penalizes the agent by -2. If the agent makes an illegal move, by moving on
        top of an already filled square, then it receives a reward of -3. A legal
        move that does not end the game earns no reward. An illegal move causes
        the game to restart.

        Domain characteristics:
        - environment: "tictactoe"
        - maximum action: 8 (4 bits)
        - maximum observation: 174762 (18 bits)
          - 174762 (decimal) = 101010101010101010 (binary)
        - maximum reward: 5 (3 bits)
    """

    # Instance methods.

    def __init__(self, options = {}):
        """ Construct the Tic Tac Toe environment from the given options.

             - `options` is a dictionary of named options and their values.
        """

        # Set up the base environment.
        environment.Environment.__init__(self, options = options)

        # Define the acceptable action values.
        self.valid_actions = xrange(0, 9)

        # Define the acceptable observation values.
        self.valid_observations = xrange(0, 174762 + 1)

        # Define the acceptable reward values.
        self.valid_rewards = list(tictactoe_reward_enum.keys())

        # Set the initial reward.
        self.reward = 0

        # Set up the game.
        self.reset()
    # end def

    def check_win(self):
        """ Check if either player has won the game.
            Returns True if so, False otherwise.

            (Called `checkWin` in the C++ version.)
        """

        # Check if we've got a row of three matching symbols.
        for r in xrange(0, 3):
            # Is this row all matching, non-empty symbols?
            if (self.board[r][0] != oEmpty and \
                self.board[r][0] == self.board[r][1] and \
                self.board[r][1] == self.board[r][2]):
                # Yes. Someone has won.
                return True
            # end if
        # end for

        # Check if we've got any columns of three matching symbols.
        for c in xrange(0, 3):
            # Is this column all matching, non-empty symbols?
            if (self.board[0][c] != oEmpty and \
                self.board[0][c] == self.board[1][c] and \
                self.board[1][c] == self.board[2][c]):
                # Yes. Someone has won.
                return True
            # end if
        # end for

        # Check the diagonals.
        if (self.board[1][1] != oEmpty and \
            self.board[0][0] == self.board[1][1] and \
            self.board[1][1] == self.board[2][2]):
            return True
        # end if

        if (self.board[1][1] != oEmpty and \
            self.board[0][2] == self.board[1][1] and \
            self.board[1][1] == self.board[2][0]):
            return True
        # end if

        # If we're here, there's no winner yet.
        return False
    # end def

    def compute_observation(self):
        """ Encodes the state of each square into an overall observation and saves
            the result in self.observation.
            Each cell corresponds to two bits.

            (Called `computeObservation` in the C++ version.)
        """

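        # A worked example of this encoding (illustrative only): with the agent's piece in
        # the top-left square, the environment's piece in the centre, and every other square
        # empty, the loop below produces 1 * 4**8 + 2 * 4**4 = 66048.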
        self.observation = 0
        for r in xrange(0, 3):
            for c in xrange(0, 3):
                # Shift the existing observation up by 2 bits, and add the value of the
                # current cell to it.
                self.observation = self.board[r][c] + (4 * self.observation)
            # end for
        # end for
    # end def

    def perform_action(self, action):
        """ Receives the agent's action and calculates the new environment percept.
            (Called `performAction` in the C++ version.)
        """

        assert self.is_valid_action(action)

        # Save the action.
        self.action = action

        # Increment the actions-since-reset counter.
        self.actions_since_reset += 1

        # Decode the action into the desired row and column to update.
        r = action // 3
        c = action % 3
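        # (For example, action 4 decodes to r = 1, c = 1, the centre square.)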

        # If the agent makes an invalid move, give the appropriate penalty and clear the board.
        if (self.board[r][c] != oEmpty):
            self.reward = rInvalid
            self.reset()
            return (self.observation, self.reward)
        # end if

        # The agent makes their move.
        self.board[r][c] = oAgent

        # If the agent wins or draws, give an appropriate reward and clear the board.
        if (self.check_win()):
            self.reward = rWin
            self.reset()
            return (self.observation, self.reward)
        elif self.actions_since_reset == 5:
            # The agent has now made 5 moves and the opponent 4, so the board is full
            # and the game is a draw.
            self.reward = rDraw
            self.reset()
            return (self.observation, self.reward)
        # end if

        # The environment makes a random play.
        while (self.board[r][c] != oEmpty):
            # Keep picking board positions at random until we find an unoccupied spot.
            r = random.randrange(0, 3)
            c = random.randrange(0, 3)
        # end while

        # If we're here, we've got an unoccupied spot.
        self.board[r][c] = oEnv

        # If the environment has won, give an appropriate reward and clear the board.
        if (self.check_win()):
            self.reward = rLoss
            self.reset()
            return (self.observation, self.reward)
        # end if

        # The game continues.
        self.reward = rNull
        self.compute_observation()

        return (self.observation, self.reward)
    # end def

    def print(self):
        """ Returns a string indicating the status of the environment.
        """

        # Show what just happened, correcting the reward for being defined relative to 3.
        message = "action = %s, observation = %s, reward = %s (%d), board:" % \
                  (self.action, self.observation, self.reward, (self.reward - 3)) + os.linesep

        # Display the current state of the board.
        for r in xrange(0, 3):
            for c in xrange(0, 3):
                b = self.board[r][c]
                message += "." if b == oEmpty else ("A" if b == oAgent else "O")
            # end for
            message += os.linesep
        # end for
        message += os.linesep

        return message
    # end def

    def reset(self):
        """ Begin a new game.
        """

        # Set up an empty board.
        self.board = {}
        for r in xrange(0, 3):
            # Create this row.
            self.board[r] = {}
            for c in xrange(0, 3):
                # Set this element to be empty.
                self.board[r][c] = oEmpty
            # end for
        # end for

        # Set an initial observation.
        self.compute_observation()

        # Reset the actions-since-reset counter.
        self.actions_since_reset = 0
    # end def
# end class
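
# A minimal usage sketch: when this module is run directly, play a handful of randomly
# chosen actions against the random opponent and print each resulting percept. Invalid
# or game-ending actions simply reset the board, as described in the class docstring.
if __name__ == '__main__':
    env = Tic_Tac_Toe()
    for _ in xrange(5):
        action = random.choice(list(env.valid_actions))
        env.perform_action(action)
        print(env.print())
    # end for
# end if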