1__author__ = 'Thomas Rueckstiess, ruecksti@in.tum.de'
2
3from pybrain.rl.learners.valuebased.valuebased import ValueBasedLearner
4
5
class Q(ValueBasedLearner):
    """ Q-learning learner operating on integer-cast states and actions.

        For every pair of consecutive samples in a sequence, the value of the
        earlier (state, action) pair stored in ``self.module`` is moved
        towards ``reward + gamma * max_a Q(next_state, a)`` with step size
        ``alpha``.

        ``self.module`` and ``self.dataset`` are not set here; presumably they
        are provided through ValueBasedLearner -- verify against the base
        class.
    """

    # class-level flags; `batchMode` selects which samples learn() processes
    offPolicy = True
    batchMode = True

    def __init__(self, alpha=0.5, gamma=0.99):
        """ Create the learner.

            :key alpha: step size applied to the temporal-difference error
            :key gamma: discount factor applied to the best next-state value
        """
        ValueBasedLearner.__init__(self)

        self.alpha = alpha
        self.gamma = gamma

        # memory of the previous sample; (re)filled while learning
        self.laststate = None
        self.lastaction = None
        self.lastreward = None

    def learn(self):
        """ Learn on the current dataset, either for many timesteps and
            even episodes (batchMode = True) or for a single timestep
            (batchMode = False). Batch mode is possible, because Q-Learning
            is an off-policy method.

            In batchMode, the algorithm goes through all the samples in the
            history and performs an update on each of them. If batchMode is
            False, only the last data sample is considered. The user has to
            make sure to keep the dataset consistent with the agent's
            history.

            NOTE(review): with batchMode = False the single fetched sample
            forms a one-element sequence and the memory below is reset right
            before it, so the sample is only recorded and no value update is
            performed in that call -- confirm whether this is intended.
        """
        if self.batchMode:
            samples = self.dataset
        else:
            samples = [[self.dataset.getSample()]]

        for seq in samples:
            # information from the previous episode (sequence)
            # should not influence the training on this episode
            self.laststate = None
            self.lastaction = None
            self.lastreward = None

            for state, action, reward in seq:
                state = int(state)
                action = int(action)

                # the first sample of a sequence has no predecessor, so there
                # is nothing to update yet; it is only remembered below
                if self.laststate is not None:
                    qvalue = self.module.getValue(self.laststate, self.lastaction)
                    maxnext = self.module.getValue(state, self.module.getMaxAction(state))

                    # target: previous reward plus discounted best next value
                    target = self.lastreward + self.gamma * maxnext
                    self.module.updateValue(
                        self.laststate,
                        self.lastaction,
                        qvalue + self.alpha * (target - qvalue),
                    )

                # the current sample becomes the predecessor of the next one
                self.laststate = state
                self.lastaction = action
                self.lastreward = reward
64
65