__author__ = 'Thomas Rueckstiess, ruecksti@in.tum.de'

from pybrain.rl.learners.valuebased.valuebased import ValueBasedLearner


class SARSA(ValueBasedLearner):
    """State-Action-Reward-State-Action (SARSA) algorithm.

    In batchMode, the algorithm goes through all the samples in the
    history and performs an update on each of them. If batchMode is
    False, only the sample returned by ``dataset.getSample()`` is
    considered. The user has to make sure to keep the dataset consistent
    with the agent's history.
    """

    offPolicy = False
    batchMode = True

    def __init__(self, alpha=0.5, gamma=0.99):
        """Create the learner.

        :param alpha: step size applied to the temporal-difference error
            in every value update.
        :param gamma: factor by which the value of the following
            state-action pair is weighted in the update target.
        """
        ValueBasedLearner.__init__(self)

        self.alpha = alpha
        self.gamma = gamma

        # Previous transition; (re)set at the start of every sequence
        # processed by learn().
        self.laststate = None
        self.lastaction = None
        self.lastreward = None

    def learn(self):
        """Run SARSA updates over the samples selected by ``batchMode``.

        For each consecutive pair of samples (s, a, r), (s', a') within a
        sequence, the value stored in ``self.module`` for (s, a) is
        replaced by ``Q(s, a) + alpha * (r + gamma * Q(s', a') - Q(s, a))``.

        NOTE(review): with ``batchMode = False`` the sample list holds a
        single one-element sequence, so the loop below only primes the
        ``last*`` attributes and performs no update -- confirm whether
        this is intended.
        """
        if self.batchMode:
            samples = self.dataset
        else:
            samples = [[self.dataset.getSample()]]

        for seq in samples:
            # information from the previous episode (sequence)
            # should not influence the training on this episode
            self.laststate = None
            self.lastaction = None
            self.lastreward = None

            for state, action, reward in seq:
                state = int(state)
                action = int(action)

                # The first sample of a sequence has no predecessor: just
                # remember it and wait for the next one.
                if self.laststate is None:
                    self.laststate = state
                    self.lastaction = action
                    self.lastreward = reward
                    continue

                qvalue = self.module.getValue(self.laststate, self.lastaction)
                qnext = self.module.getValue(state, action)

                # On-policy target: uses the action that was actually
                # taken in the following state.
                td_error = self.lastreward + self.gamma * qnext - qvalue
                self.module.updateValue(
                    self.laststate,
                    self.lastaction,
                    qvalue + self.alpha * td_error,
                )

                # The current sample becomes the predecessor of the next.
                self.laststate = state
                self.lastaction = action
                self.lastreward = reward