__author__ = 'Thomas Rueckstiess, ruecksti@in.tum.de'

from pybrain.rl.learners.valuebased.valuebased import ValueBasedLearner


class Q(ValueBasedLearner):
    """Tabular Q-learning (Watkins).

    Performs the classic off-policy temporal-difference update

        Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))

    on discrete (state, action) pairs stored in ``self.dataset``.  Because
    Q-learning is off-policy, whole batches of (possibly multi-episode)
    experience can be replayed (``batchMode = True``).
    """

    # Q-learning learns about the greedy policy regardless of the
    # behavior policy that generated the data.
    offPolicy = True
    batchMode = True

    def __init__(self, alpha=0.5, gamma=0.99):
        """
        :param alpha: learning rate / step size in [0, 1].
        :param gamma: discount factor for future rewards in [0, 1].
        """
        ValueBasedLearner.__init__(self)

        self.alpha = alpha
        self.gamma = gamma

        # Previous transition, carried between successive learning steps.
        # Initialized here (including lastreward) so the attributes always
        # exist, even before the first call to learn().
        self.laststate = None
        self.lastaction = None
        self.lastreward = None

    def learn(self):
        """ Learn on the current dataset, either for many timesteps and
            even episodes (batchMode = True) or for a single timestep
            (batchMode = False). Batch mode is possible, because Q-Learning
            is an off-policy method.

            In batchMode, the algorithm goes through all the samples in the
            history and performs an update on each of them. if batchMode is
            False, only the last data sample is considered. The user himself
            has to make sure to keep the dataset consistent with the agent's
            history.
        """
        if self.batchMode:
            samples = self.dataset
        else:
            # Wrap the single latest sample so the nested loops below
            # see it as one sequence containing one transition.
            samples = [[self.dataset.getSample()]]

        for seq in samples:
            # information from the previous episode (sequence)
            # should not influence the training on this episode
            self.laststate = None
            self.lastaction = None
            self.lastreward = None

            for state, action, reward in seq:

                # tabular learner: states and actions are discrete indices
                state = int(state)
                action = int(action)

                # first learning call has no last state: skip
                if self.laststate is None:
                    self.lastaction = action
                    self.laststate = state
                    self.lastreward = reward
                    continue

                qvalue = self.module.getValue(self.laststate, self.lastaction)
                # bootstrap from the greedy (max) action in the new state
                maxnext = self.module.getValue(state, self.module.getMaxAction(state))
                self.module.updateValue(self.laststate, self.lastaction,
                    qvalue + self.alpha * (self.lastreward + self.gamma * maxnext - qvalue))

                # move state to oldstate
                self.laststate = state
                self.lastaction = action
                self.lastreward = reward