// -*- Mode: c++ -*-
// $Id: policy.cpp,v 1.7.2.1 2008/12/31 03:53:55 berniw Exp $

// copyright (c) 2004 by Christos Dimitrakakis <dimitrak@idiap.ch>
/***************************************************************************
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify *
 *   it under the terms of the GNU General Public License as published by *
 *   the Free Software Foundation; either version 2 of the License, or    *
 *   (at your option) any later version.                                  *
 *                                                                         *
 ***************************************************************************/

#include <cstring>
#include <learning/learn_debug.h>
#include <learning/policy.h>
#include <learning/MathFunctions.h>
#ifdef WIN32
#include <float.h>
// MSVC lacks std::isnan; a macro cannot contain "::", so provide a
// minimal replacement based on _isnan() instead.
namespace std { inline bool isnan(double x) { return _isnan(x) != 0; } }
#endif // WIN32

#undef POLICY_LOG

#ifndef POLICY_LOG
#undef logmsg
#define logmsg empty_log
#endif

void empty_log(const char* s, ...)
{
}
/// \brief Create a new discrete policy.
/// \arg n_states Number of states for the agent.
/// \arg n_actions Number of actions.
/// \arg alpha Learning rate.
/// \arg gamma Discount parameter.
/// \arg lambda Eligibility trace decay.
/// \arg softmax Use softmax if true (can be overridden later).
/// \arg randomness Amount of randomness.
/// \arg init_eval Initial evaluation of actions.
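///
/// Illustrative usage (hypothetical parameter values, not taken from
/// this file):
/// \code
/// // 100 states, 5 actions, alpha=0.1, gamma=0.95, lambda=0.8,
/// // softmax selection with temperature 0.1, optimistic initial values
/// DiscretePolicy policy(100, 5, 0.1f, 0.95f, 0.8f, true, 0.1f, 1.0f);
/// \endcode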
DiscretePolicy::DiscretePolicy (int n_states, int n_actions, real alpha,
                                real gamma, real lambda, bool softmax,
                                real randomness, real init_eval)
{
    if (lambda<0.0f) lambda = 0.0f;
    if (lambda>0.99f) lambda = 0.99f;

    if (gamma<0.0f) gamma = 0.0f;
    if (gamma>0.99f) gamma = 0.99f;

    if (alpha<0.0f) alpha = 0.0f;
    if (alpha>1.0f) alpha = 1.0f;

    this->n_states = n_states;
    this->n_actions = n_actions;
    this->gamma = gamma;
    this->lambda = lambda;
    this->alpha = alpha;
    smax = softmax;
    temp = randomness;
    //logmsg ("RR:%f", temp);
    if (smax) {
        if (temp<0.1f)
            temp = 0.1f;
    } else {
        if (temp<0.0f) {
            temp = 0.0f;
        }
        if (temp>1.0f) {
            temp = 1.0f;
        }
    }
    learning_method = Sarsa;

    logmsg ("#Making Sarsa(lambda) ");
    if (smax) {
        logmsg ("#softmax");
    } else {
        logmsg ("#e-greedy");
    }
    logmsg (" policy with Q:[%d x %d] -> R, a:%f g:%f, l:%f, t:%f\n",
            this->n_states, this->n_actions, this->alpha, this->gamma, this->lambda, this->temp);

    P = new real* [n_states];
    Q = new real* [n_states];
    e = new real* [n_states];
    vQ = new real* [n_states];
    for (int s=0; s<n_states; s++) {
        P[s] = new real [n_actions];
        Q[s] = new real [n_actions];
        e[s] = new real [n_actions];
        vQ[s] = new real [n_actions];
        for (int a=0; a<n_actions; a++) {
            P[s][a] = 1.0/((real) n_actions);
            Q[s][a] = init_eval;
            e[s][a] = 0.0;
            vQ[s][a] = 1.0;
        }
    }
    pQ = 0.0;
    ps = -1;
    pa = -1;
    min_el_state = 0;
    max_el_state = n_states - 1;
    eval = new real[n_actions];
    sample = new real[n_actions];
    for (int a=0; a<n_actions; a++) {
        eval[a] = 0.0;
        sample[a] = 0.0;
    }
    forced_learning = false;
    confidence = false;
    confidence_uses_gibbs = true;
    confidence_distribution = SINGULAR;
    zeta = 0.01f;
    tdError = 0.0f;
    expected_r = 0.0f;
    expected_V = 0.0f;
    n_samples = 0;
    replacing_traces = false;
}

/// \brief Save the current evaluations in text format to a file.
/// The values are saved as triplets (\c Q, \c P, \c vQ). The columns
/// are ordered by actions and the rows by state number.
void DiscretePolicy::saveState(FILE* f)
{
    if (!f)
        return;
    for (int s=0; s<n_states; s++) {
        int a;
        for (a=0; a<n_actions; a++) {
            fprintf (f, "%f ", Q[s][a]);
        }
        for (a=0; a<n_actions; a++) {
            fprintf (f, "%f ", P[s][a]);
        }
        for (a=0; a<n_actions; a++) {
            fprintf (f, "%f ", vQ[s][a]);
        }
        fprintf (f, "\n"); // one row per state, as documented above
    }
}

/// Delete policy.
DiscretePolicy::~DiscretePolicy()
{
    real sum = 0.0;
    FILE* f = fopen ("/tmp/discrete","wb");

    int s;
    for (s=0; s<n_states; s++) {
        sum += Q[s][argMax(Q[s])];
        if (f) {
            int a;
            for (a=0; a<n_actions; a++) {
                fprintf (f, "%f ", Q[s][a]);
            }
            for (a=0; a<n_actions; a++) {
                fprintf (f, "%f ", P[s][a]);
            }
            for (a=0; a<n_actions; a++) {
                fprintf (f, "%f ", vQ[s][a]);
            }
            fprintf (f, "\n");
        }
    }

    if (f) {
        fclose (f);
    }

    logmsg ("#Expected return of greedy policy over random distribution of states: %f\n", sum/((real) n_states));

    for (s=0; s<n_states; s++) {
        delete [] P[s];
        delete [] Q[s];
        delete [] e[s];
        delete [] vQ[s];
    }
    delete [] P;
    delete [] Q;
    delete [] vQ;
    delete [] e;
    delete [] eval;
    delete [] sample;
}

/** Select an action a, given state s and reward from previous action.

    Optional argument a forces an action if setForcedLearning() has
    been called with true.

    Two algorithms are implemented, both of which converge. One of them
    calculates the value of the current policy, while the other
    calculates that of the optimal policy.

    Sarsa(\f$\lambda\f$) algorithmic description:

    1. Take action \f$a\f$, observe \f$r, s'\f$

    2. Choose \f$a'\f$ from \f$s'\f$ using some policy derived from \f$Q\f$

    3. \f$\delta = r + \gamma Q(s',a') - Q(s,a)\f$

    4. \f$e(s,a) = e(s,a) + 1\f$, depending on trace settings

    5. for all \f$s,a\f$:
    \f[
    Q_{t}(s,a) = Q_{t-1}(s,a) + \alpha \delta e_{t}(s,a),
    \f]
    where \f$e_{t}(s,a) = \gamma \lambda e_{t-1}(s,a)\f$

    end

    6. \f$a = a'\f$ (we will take this action at the next step)

    7. \f$s = s'\f$
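    A minimal C++ sketch of this tabular Sarsa(\f$\lambda\f$) update
    (illustrative only: it mirrors the member names used below, but is
    not the exact implementation):

    \code
    // after taking action a, observing r and s2, and choosing a2:
    real delta = r + gamma * Q[s2][a2] - Q[s][a];
    e[s][a] += 1.0; // accumulating traces; replacing traces would set it to 1.0
    for (int i=0; i<n_states; i++) {
        for (int j=0; j<n_actions; j++) {
            Q[i][j] += alpha * delta * e[i][j];
            e[i][j] *= gamma * lambda;
        }
    }
    s = s2; a = a2;
    \endcode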

    Watkins Q(\f$\lambda\f$) algorithmic description:

    1. Take action \f$a\f$, observe \f$r\f$, \f$s'\f$

    2. Choose \f$a'\f$ from \f$s'\f$ using some policy derived from \f$Q\f$

    3. \f$a^* = \arg \max_b Q(s',b)\f$

    4. \f$\delta = r + \gamma Q(s',a^*) - Q(s,a)\f$

    5. \f$e(s,a) = e(s,a) + 1\f$, depending on eligibility traces

    6. for all \f$s,a\f$:
    \f[
    Q(s,a) = Q(s,a) + \alpha \delta e(s,a)
    \f]
    if \f$(a'=a^*)\f$ then \f$e(s,a) = \gamma \lambda e(s,a)\f$
    else \f$e(s,a) = 0\f$
    end

    7. \f$a = a'\f$ (we will take this action at the next step)

    8. \f$s = s'\f$
    The most general algorithm is E-learning, currently under
    development, which is defined as follows:

    1. Take action \f$a\f$, observe \f$r\f$, \f$s'\f$

    2. Choose \f$a'\f$ from \f$s'\f$ using some policy derived from \f$Q\f$

    3. \f$\delta = r + \gamma E\{Q(s',a') \mid \pi\} - Q(s,a)\f$

    4. \f$e(s,a) = e(s,a) + 1\f$, depending on eligibility traces

    5. for all \f$s,a\f$:
    \f[
    Q(s,a) = Q(s,a) + \alpha \delta e(s,a)
    \f]
    \f$e(s,a) = \gamma \lambda e(s,a) P(a|s,\pi)\f$

    6. \f$a = a'\f$ (we will take this action at the next step)

    7. \f$s = s'\f$

    Note that we also cut off eligibility traces that have fallen below 0.01.

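    An illustrative episode loop for this interface (the helpers
    \c getState(), \c getReward() and \c episodeEnded() are hypothetical
    names, not part of this library):

    \code
    DiscretePolicy policy(n_states, n_actions, 0.1f, 0.95f, 0.8f, true, 0.1f, 0.0f);
    while (!episodeEnded()) {
        int s = getState();
        real r = getReward();
        int a = policy.SelectAction(s, r);
        // apply action a to the environment here
    }
    policy.Reset(); // clear eligibility traces after the absorbing state
    \endcode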

*/
int DiscretePolicy::SelectAction (int s, real r, int forced_a)
{
    if ((s<0)||(s>=n_states)) {
        return 0;
    }

    if ((ps>=0)&&(pa>=0)) {
        expected_r += r;
        expected_V += Q[ps][pa];
        n_samples++;

        if (s==0) {
            real max_estimate = 0.0;
            real max_estimate_k = 0.0;
            for (int i=0; i<n_states; i++) {
                max_estimate += Q[i][argMax (Q[i])];
                max_estimate_k += 1.0;
            }

#if 0
            logmsg ("%f %f %f %f#rTVV\n",
                    expected_r/((real) n_samples),
                    temp,
                    expected_V/((real) n_samples),
                    max_estimate/max_estimate_k);
#endif
            expected_r = 0.0;
            expected_V = 0.0;
            n_samples = 0;
        }
    }
    int a, amax;
    int argmax = argMax (Q[s]);

    // Move the action probability estimates P[s][.] towards the greedy action.
    P[s][argmax] += zeta*(1.0-P[s][argmax]);
    for (int j=0; j<n_actions; j++) {
        if (j!=argmax) {
            P[s][j] += zeta*(0.0-P[s][j]);
        }
    }

    if (forced_learning) {
        a = forced_a;
    } else if (pursuit) {
        // Sample an action from the pursuit probabilities P[s][.]
        real sum = 0.0;
        a = -1;
        int j;
        for (j=0; j<n_actions; j++) {
            sum += P[s][j];
        }
        real X = urandom()*sum;
        real dsum = 0.0;
        for (j=0; j<n_actions; j++) {
            dsum += P[s][j];
            if (X<=dsum) {
                a = j;
                break;
            }
        }
        if (a==-1) {
            fprintf (stderr, "No action selected with pursuit!\n");
        }
    } else if (confidence) {
        if (confidence_uses_gibbs && (confidence_distribution == SINGULAR)) {
            a = confMax (Q[s],vQ[s]);
        } else {
            a = confSample (Q[s], vQ[s]);
            if (confidence_uses_gibbs) { // and not SINGULAR distribution
                a = softMax(sample); // use softmax on the sampled values
            }
        }
    } else if (reliability_estimate) {
        // Use the mean standard deviation of the value estimates as temperature.
        temp = sqrt(Sum(vQ[s], n_actions)/((real) n_actions));
        //temp = 0.1;
        a = softMax(Q[s]);
        //printf ("%f\n", temp);
    } else if (smax) {
        a = softMax (Q[s]);
        //printf ("Q[%d][%d]=%f\n", s, a, Q[s][a]);
    } else {
        a = eGreedy (Q[s]);
    }

    if (a<0 || a>=n_actions) {
        fprintf (stderr, "Action %d out of bounds.. ", a);
        a = (int) floor (urandom()*((real) n_actions));
        fprintf (stderr, "mapping to %d\n", a);
    }

    real EQ_s = 0.0;
    int i;

    switch (learning_method) {
    case Sarsa:
        amax = a;
        EQ_s = Q[s][amax];
        break;
    case QLearning:
        amax = argmax;
        EQ_s = Q[s][amax];
        break;
    case ELearning:
        amax = a; //? correct ?
        Normalise(eval, eval, n_actions);
        EQ_s = 0.0;
        for (i=0; i<n_actions; i++) {
            EQ_s += eval[i] * Q[s][i];
        }
        break;
    default:
        amax = a;
        EQ_s = Q[s][amax];
        fprintf (stderr, "Unknown learning method\n");
    }
    if ((ps>=0)&&(pa>=0)) { // do not update at start of episode
        real delta = r + gamma*EQ_s - Q[ps][pa];
        tdError = delta;
        if (replacing_traces) {
            e[ps][pa] = 1.0;
        } else {
            e[ps][pa] += 1.0;
        }
        real ad = alpha*delta;
        real gl = gamma * lambda;
        real variance_threshold = 0.0001f;
        if (confidence_eligibility == false) {
            vQ[ps][pa] = (1.0 - zeta)*vQ[ps][pa] + zeta*(ad*ad);
            if (vQ[ps][pa]<variance_threshold) {
                vQ[ps][pa] = variance_threshold;
            }
        }
        if (ps<min_el_state) min_el_state = ps;
        if (ps>max_el_state) max_el_state = ps;

        for (i=0; i<n_states; i++) {
        //for (int i=min_el_state; i<=max_el_state; i++) {
            bool el = true;
            for (int j=0; j<n_actions; j++) {
                if (e[i][j]>0.01) {
                    Q[i][j] += ad * e[i][j];
                    if (confidence_eligibility == true) {
                        real zeta_el = zeta * e[i][j];
                        vQ[i][j] = (1.0 - zeta_el)*vQ[i][j] + zeta_el*(ad*ad);
                        if (vQ[i][j]<variance_threshold) {
                            vQ[i][j] = variance_threshold;
                        }
                    }
                    //this is the same as setting e[ps][pa] += (1-P[ps][pa])
                    // if P[][] remains unchanged between updates.
                    // -- removed because it doesn't work! --
                    //P[i][j] += 0.01*delta * e[i][j] * (1.-P[i][j]);
                    if ((fabs (Q[i][j])>1000.0)||(std::isnan(Q[i][j]))) {
                        printf ("u: %d %d %f %f\n", i,j,Q[i][j], ad * e[i][j]);
                    }

                    // This trace cut-off is only needed for Q-learning;
                    // Sarsa is unaffected since amax==a always holds there.
                    if (amax==a) {
                        e[i][j] *= gl;
                    } else {
                        e[i][j] = 0.0;
                    }
                } else {
                    e[i][j] = 0.0;
                    el = false;
                }
            }
            if (el==false) {
                if (min_el_state==i)
                    min_el_state++;
            } else {
                max_el_state = i;
            }
        }
    }

    //printf ("%d %d #STATE\n", min_el_state, max_el_state);
    //printf ("Q[%d,%d]=%f r=%f e=%f ad=%f gl=%f #QV\n",
    //        ps, pa, Q[ps][pa], r, e[ps][pa], ad, gl);
    ps = s;
    pa = a;

    return a;
}

/// Use at the end of every episode, after the agent has entered the
/// absorbing state.
void DiscretePolicy::Reset ()
{
    for (int s=0; s<n_states; s++) {
        for (int a=0; a<n_actions; a++) {
            e[s][a] = 0.0;
        }
    }
}

/// Load policy from a file.
void DiscretePolicy::loadFile (char* f)
{
    FILE* fh = NULL;
    fh = fopen (f, "rb");
    if (fh==NULL) {
        fprintf (stderr, "Failed to read file %s\n", f);
        return;
    }
    char rtag[256];
    const char* start_tag="QSA";
    const char* close_tag="END";
    int n_read_states, n_read_actions;

    fread((void *) rtag, sizeof (char), strlen (start_tag)+1, fh);
    if (strcmp (rtag, start_tag)) {
        fprintf (stderr, "Could not find starting tag\n");
        fclose (fh);
        return;
    }
    fread((void *) &n_read_states, sizeof(int), 1, fh);
    fread((void *) &n_read_actions, sizeof(int), 1, fh);

    if ((n_read_states!=n_states)||(n_read_actions!=n_actions)) {
        fprintf (stderr, "File has %dx%d space! Aborting read.\n", n_read_states, n_read_actions);
        fclose (fh);
        return;
    }

    int i, j;
    for (i=0; i<n_states; i++) {
        fread((void *) Q[i], sizeof(real), n_actions, fh);
        for (j=0; j<n_actions; j++) {
            if ((fabs (Q[i][j])>100.0)||(std::isnan(Q[i][j]))) {
                printf ("l: %d %d %f\n", i,j,Q[i][j]);
                Q[i][j] = 0.0;
            }
        }
    }
    for (i=0; i<n_states; i++) {
        for (j=0; j<n_actions; j++) {
            P[i][j] = 1.0/((real) n_actions);
        }
        int argmax = argMax (Q[i]);
        P[i][argmax] += 0.001*(1.0-P[i][argmax]);
        for (j=0; j<n_actions; j++) {
            if (j!=argmax) {
                P[i][j] += 0.001*(0.0-P[i][j]);
            }
        }
    }

    fread((void *) rtag, sizeof (char), strlen (close_tag)+1, fh);
    if (strcmp (rtag, close_tag)) {
        fprintf (stderr, "Could not find ending tag\n");
        fclose (fh);
        return;
    }

    fclose (fh);
}

/// Save policy to a file.
void DiscretePolicy::saveFile (char* f) {
    FILE* fh = NULL;
    fh = fopen (f, "wb");
    if (fh==NULL) {
        fprintf (stderr, "Failed to write to file %s\n", f);
        return;
    }

    const char* start_tag="QSA";
    const char* close_tag="END";

    fwrite((void *) start_tag, sizeof (char), strlen (start_tag)+1, fh);
    fwrite((void *) &n_states, sizeof(int), 1, fh);
    fwrite((void *) &n_actions, sizeof(int), 1, fh);
    for (int i=0; i<n_states; i++) {
        fwrite((void *) Q[i], sizeof(real), n_actions, fh);
        for (int j=0; j<n_actions; j++) {
            if ((fabs (Q[i][j])>100.0)||(std::isnan(Q[i][j]))) {
                printf ("s: %d %d %f\n", i,j,Q[i][j]);
            }
        }
    }
    fwrite((void *) close_tag, sizeof (char), strlen (close_tag)+1, fh);
    fclose (fh);
}

/// \brief Set to use confidence estimates for action selection, with
/// variance smoothing zeta.
/// Variance smoothing currently uses a very simple method to estimate
/// the variance.
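///
/// Illustrative call (hypothetical values; \c zeta is the variance
/// smoothing rate):
/// \code
/// policy.useConfidenceEstimates(true, 0.01f, false);
/// \endcode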
bool DiscretePolicy::useConfidenceEstimates (bool confidence, real zeta, bool confidence_eligibility) {
    this->confidence = confidence;
    this->zeta = zeta;
    this->confidence_eligibility = confidence_eligibility;

    if (confidence_eligibility) {
        logmsg ("#+[ELIG_VAR]");
    }
    if (confidence) {
        logmsg ("#+[CONFIDENCE]");
    } else {
        logmsg ("#-[CONFIDENCE]\n");
    }

    return confidence;
}

/// Set the algorithm to QLearning mode
void DiscretePolicy::setQLearning() {
    learning_method = QLearning;
    logmsg ("#[Q-learning]\n");
}

/// Set the algorithm to ELearning mode
void DiscretePolicy::setELearning() {
    learning_method = ELearning;
    logmsg ("#[E-learning]\n");
}

/// Set the algorithm to Sarsa mode.
void DiscretePolicy::setSarsa()
{
    learning_method = Sarsa;
    logmsg ("#[Sarsa]\n");
}

/// Use Pursuit for action selection.
void DiscretePolicy::setPursuit(bool pursuit)
{
    this->pursuit = pursuit;
    if (pursuit) {
        logmsg ("#+[PURSUIT]\n");
    } else {
        logmsg ("#-[PURSUIT]\n");
    }
}

/// Use replacing instead of accumulating eligibility traces.
void DiscretePolicy::setReplacingTraces (bool replacing)
{
    this->replacing_traces = replacing;
    if (replacing) {
        logmsg ("#[REPLACING TRACES]\n");
    } else {
        logmsg ("#[ACCUMULATING TRACES]\n");
    }
}

/// Set forced learning (force-feed actions)
void DiscretePolicy::setForcedLearning(bool forced)
{
    forced_learning = forced;
}

/// Set randomness for action selection. Does not affect confidence mode.
void DiscretePolicy::setRandomness (real epsilon)
{
    temp = epsilon;
    if (smax) {
        if (temp<0.01) {
            smax = false;
        }
    }
}

/// Set the discount factor gamma of the return to be maximised.
void DiscretePolicy::setGamma (real gamma)
{
    this->gamma = gamma;
}

/// Set action selection to softmax.
void DiscretePolicy::useSoftmax (bool softmax)
{
    smax = softmax;
    if (smax) {
        logmsg ("#+[SMAX]\n");
    } else {
        logmsg ("#-[SMAX]\n");
    }
}

/// Use the reliability estimate method for action selection.
void DiscretePolicy::useReliabilityEstimate (bool ri)
{
    reliability_estimate = ri;
    if (ri) {
        logmsg("#+[RI]\n");
    } else {
        logmsg("#-[RI]\n");
    }
}

/// Set the distribution for direct action sampling.
void DiscretePolicy::setConfidenceDistribution (enum ConfidenceDistribution cd)
{
    switch (cd) {
    case SINGULAR:
        logmsg("#[SINGULAR CONFIDENCE]\n"); break;
    case BOUNDED:
        logmsg("#[BOUNDED CONFIDENCE]\n"); break;
    case GAUSSIAN:
        logmsg("#[GAUSSIAN CONFIDENCE]\n"); break;
    case LAPLACIAN:
        logmsg("#[LAPLACIAN CONFIDENCE]\n"); break;
    default:
        Serror ("Unknown type %d\n", cd);
    }
    confidence_distribution = cd;
}

/// \brief Add Gibbs sampling for confidences.
/// This can be used in conjunction with any confidence distribution,
/// however it mostly makes sense for SINGULAR.
void DiscretePolicy::useGibbsConfidence (bool gibbs)
{
    if (gibbs) {
        logmsg ("#+[GIBBS CONFIDENCE]\n");
    } else {
        logmsg ("#-[GIBBS CONFIDENCE]\n");
    }
    this->confidence_uses_gibbs = gibbs;
}

// ---------- action selection helpers -------------
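/// \brief Confidence-based stochastic action selection.
/// Each action receives weight 1/(1 + sum_{j!=a} exp((Qs[j]-Qs[a])/sqrt(vQs[j]))),
/// so actions whose value estimate dominates the others with high
/// certainty receive most of the probability mass; an action is then
/// drawn from these weights. The exponent \c p is only used by the
/// disabled (#if 0) variant.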
int DiscretePolicy::confMax(real* Qs, real* vQs, real p) {
    real sum=0.0;
    int a;
#if 0
    for (a=0; a<n_actions; a++) {
        eval[a] = exp(pow(Qs[a]/sqrt(vQs[a]), p));
        sum += eval[a];
    }
#else
    for (a=0; a<n_actions; a++) {
        real Q = Qs[a];
        real cum = 1.0;
        //real v = sqrt(vQs[a]);
        for (int j=0; j<n_actions; j++) {
            if (j!=a) {
                cum += exp ((Qs[j]-Q)/sqrt(vQs[j]));
            }
        }
        eval[a] = 1.0/cum; //exp(Qs[a]/sqrt(vQs[a]));
        sum += eval[a];
    }
#endif
    real X = urandom()*sum;
    real dsum = 0.0;
    for (a=0; a<n_actions; a++) {
        dsum += eval[a];
        if (X<=dsum)
            return a;
    }
    fprintf (stderr, "ConfMax: No action selected! %f %f %f\n",X,dsum,sum);
    return -1;
}

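/// \brief Sample one value per action from the selected confidence
/// distribution, centred at Qs[a] with spread vQs[a] (for SINGULAR the
/// sample is just Qs[a]), and return the argmax of the samples
/// (Thompson-sampling style selection).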
int DiscretePolicy::confSample(real* Qs, real* vQs) {
    static NormalDistribution gaussian;
    static LaplacianDistribution laplacian;
    static UniformDistribution uniform;

    for (int a=0; a<n_actions; a++) {
        //eval[a] = Qs[a] + urandom(-1.0,1.0)*vQs[a];
        switch(confidence_distribution) {
        case SINGULAR:
            sample[a] = Qs[a];
            break;
        case BOUNDED:
            uniform.setMean(Qs[a]);
            uniform.setVariance(vQs[a]);
            sample[a] = uniform.generate();
            break;
        case GAUSSIAN:
            gaussian.setMean(Qs[a]);
            gaussian.setVariance(vQs[a]);
            sample[a] = gaussian.generate();
            break;
        case LAPLACIAN:
            laplacian.setMean(Qs[a]);
            laplacian.setVariance(vQs[a]);
            sample[a] = Qs[a] + laplacian.generate(); // note: Qs[a] is added on top of the mean set above
            break;
        default:
            Serror ("Unknown distribution ID:%d\n", confidence_distribution);
            break;
        }
    }
    return argMax(sample);
}

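/// \brief Softmax (Boltzmann) action selection.
/// Draws action \c a with probability proportional to exp(Qs[a]/temp),
/// where \c temp is the temperature: high temperatures approach the
/// uniform distribution, low temperatures approach the greedy choice.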
int DiscretePolicy::softMax(real* Qs) {
    real sum=0.0f;
    real beta = 1.0f/temp;
    int a;
    for (a=0; a<n_actions; a++) {
        eval[a] = exp(beta * Qs[a]);
        sum += eval[a];
    }
    real X = urandom()*sum;
    real dsum = 0.0;
    for (a=0; a<n_actions; a++) {
        dsum += eval[a];
        if (X<=dsum)
            return a;
    }
    fprintf (stderr, "softMax: No action selected! %f %f %f\nT:%f\n",X,dsum,sum,temp);
    return -1;
}
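
/// \brief Epsilon-greedy action selection.
/// With probability \c temp a uniformly random action is returned,
/// otherwise the greedy one; \c eval is filled with the corresponding
/// action probabilities as a side effect.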
int DiscretePolicy::eGreedy(real* Qs) {
    real X = urandom();
    int amax = argMax(Qs);
    real base_prob = temp/((real) n_actions);
    for (int a=0; a<n_actions; a++) {
        eval[a] = base_prob;
    }
    eval[amax] += 1.0-temp;
    if (X<temp) {
        return rand()%n_actions;
    }
    return amax;
}

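/// Return the index of a maximal element of Qs (the first one on ties).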
int DiscretePolicy::argMax(real* Qs) {
    real max = Qs[0];
    int arg_max = 0;
    for (int a=1; a<n_actions; a++) {
        if (max<Qs[a]) {
            max = Qs[a];
            arg_max = a;
        }
    }
    return arg_max;
}