1__author__ = 'Thomas Rueckstiess, ruecksti@in.tum.de'
2
3from pybrain.rl.learners.valuebased.valuebased import ValueBasedLearner
4
5
class SARSA(ValueBasedLearner):
    """State-Action-Reward-State-Action (SARSA) algorithm.

    On-policy temporal-difference control: each update uses the action
    actually taken in the next state (not the greedy one, as Q-learning
    does), hence ``offPolicy = False``.

    In batchMode, the algorithm goes through all the samples in the
    history and performs an update on each of them. If batchMode is
    False, only the last data sample is considered. The user has to
    make sure to keep the dataset consistent with the agent's history.
    """

    # SARSA learns the value of the policy being followed.
    offPolicy = False
    batchMode = True

    def __init__(self, alpha=0.5, gamma=0.99):
        """Create a SARSA learner.

        :param alpha: learning rate (step size of the value update).
        :param gamma: discount factor for future rewards.
        """
        ValueBasedLearner.__init__(self)

        self.alpha = alpha
        self.gamma = gamma

        # Previous (state, action, reward) of the current episode;
        # None marks "no sample seen yet".  Initialized here so the
        # attributes exist before the first learn() call.
        self.laststate = None
        self.lastaction = None
        self.lastreward = None

    def learn(self):
        """Run the SARSA update over the stored samples.

        In batchMode every sequence (episode) in ``self.dataset`` is
        replayed; otherwise only the single most recent sample is used.
        """
        if self.batchMode:
            samples = self.dataset
        else:
            # Wrap the single sample so the nested loops below still apply.
            samples = [[self.dataset.getSample()]]

        for seq in samples:
            # Information from the previous episode (sequence)
            # should not influence the training on this episode.
            self.laststate = None
            self.lastaction = None
            self.lastreward = None
            for state, action, reward in seq:

                # Tabular representation: states/actions are discrete indices.
                state = int(state)
                action = int(action)

                # First learning call has no last state: remember and skip.
                # ('is None' — identity test, robust even if states override __eq__.)
                if self.laststate is None:
                    self.lastaction = action
                    self.laststate = state
                    self.lastreward = reward
                    continue

                # TD(0) update toward r + gamma * Q(s', a') where a' is the
                # action actually taken (on-policy).
                qvalue = self.module.getValue(self.laststate, self.lastaction)
                qnext = self.module.getValue(state, action)
                self.module.updateValue(self.laststate, self.lastaction, qvalue + self.alpha * (self.lastreward + self.gamma * qnext - qvalue))

                # Move current sample to the "last" slots for the next step.
                self.laststate = state
                self.lastaction = action
                self.lastreward = reward
59
60