// -*- Mode: c++ -*-
// $Id: policy.cpp,v 1.7.2.1 2008/12/31 03:53:55 berniw Exp $

// copyright (c) 2004 by Christos Dimitrakakis <dimitrak@idiap.ch>
/***************************************************************************
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU General Public License as published by  *
 *   the Free Software Foundation; either version 2 of the License, or     *
 *   (at your option) any later version.                                   *
 *                                                                         *
 ***************************************************************************/

#include <cstring>
#include <cmath>
#include <learning/learn_debug.h>
#include <learning/policy.h>
#include <learning/MathFunctions.h>
#ifdef WIN32
#include <float.h>
#endif // WIN32

#undef POLICY_LOG

#ifndef POLICY_LOG
#undef logmsg
#define logmsg empty_log
#endif

/// Discard log messages when POLICY_LOG is not defined.
void empty_log(const char* s, ...)
{
}
/// \brief Create a new discrete policy.
/// \arg n_states Number of states for the agent.
/// \arg n_actions Number of actions.
/// \arg alpha Learning rate.
/// \arg gamma Discount parameter.
/// \arg lambda Eligibility trace decay.
/// \arg softmax Use softmax if true (can be overridden later).
/// \arg randomness Amount of randomness.
/// \arg init_eval Initial evaluation of actions.
DiscretePolicy::DiscretePolicy (int n_states, int n_actions, real alpha,
				real gamma, real lambda, bool softmax,
				real randomness, real init_eval)
{
	if (lambda<0.0f) lambda = 0.0f;
	if (lambda>0.99f) lambda = 0.99f;

	if (gamma<0.0f) gamma = 0.0f;
	if (gamma>0.99f) gamma = 0.99f;

	if (alpha<0.0f) alpha = 0.0f;
	if (alpha>1.0f) alpha = 1.0f;

	this->n_states = n_states;
	this->n_actions = n_actions;
	this->gamma = gamma;
	this->lambda = lambda;
	this->alpha = alpha;
	smax = softmax;
	temp = randomness;
	//logmsg ("RR:%f", temp);
	if (smax) {
		if (temp<0.1f)
			temp = 0.1f;
	} else {
		if (temp<0.0f) {
			temp = 0.0f;
		}
		if (temp>1.0f) {
			temp = 1.0f;
		}
	}
	learning_method = Sarsa;

	logmsg ("#Making Sarsa(lambda) ");
	if (smax) {
		logmsg ("#softmax");
	} else {
		logmsg ("#e-greedy");
	}
	logmsg (" policy with Q:[%d x %d] -> R, a:%f g:%f, l:%f, t:%f\n",
			this->n_states, this->n_actions, this->alpha, this->gamma, this->lambda, this->temp);

	P = new real* [n_states];
	Q = new real* [n_states];
	e = new real* [n_states];
	vQ = new real* [n_states];
	for (int s=0; s<n_states; s++) {
		P[s] = new real [n_actions];
		Q[s] = new real [n_actions];
		e[s] = new real [n_actions];
		vQ[s] = new real [n_actions];
		for (int a=0; a<n_actions; a++) {
			P[s][a] = 1.0/((real) n_actions);
			Q[s][a] = init_eval;
			e[s][a] = 0.0;
			vQ[s][a] = 1.0;
		}
	}
	pQ = 0.0;
	ps = -1;
	pa = -1;
	min_el_state = 0;
	max_el_state = n_states - 1;
	eval = new real[n_actions];
	sample = new real[n_actions];
	for (int a=0; a<n_actions; a++) {
		eval[a] = 0.0;
		sample[a] = 0.0;
	}
	forced_learning = false;
	confidence = false;
	confidence_uses_gibbs = true;
	confidence_distribution = SINGULAR;
	zeta = 0.01f;
	tdError = 0.0f;
	expected_r = 0.0f;
	expected_V = 0.0f;
	n_samples = 0;
	replacing_traces = false;
}
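
/* Example of constructing a policy (a minimal sketch; the state/action counts
   and parameter values below are illustrative, not taken from this file):

   \code
   // 10 states, 4 actions, alpha = 0.1, gamma = 0.95, lambda = 0.8,
   // e-greedy exploration with 10% randomness, optimistic initial values of 1.0.
   DiscretePolicy* policy = new DiscretePolicy (10, 4, 0.1f, 0.95f, 0.8f,
                                                false, 0.1f, 1.0f);
   \endcode
*/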

/// \brief Save the current evaluations in text format to a file.
/// The values are saved as triplets (\c Q, \c P, \c vQ). The columns
/// are ordered by actions and the rows by state number.
void DiscretePolicy::saveState(FILE* f)
{
	if (!f)
		return;
	for (int s=0; s<n_states; s++) {

		//softMax(Q[s]);
		real sum2=0.0;
		int a;
		for (a=0; a<n_actions; a++) {
			sum2 += eval[a];
		}
		for (a=0; a<n_actions; a++) {
			fprintf (f, "%f ", Q[s][a]);
		}
		for (a=0; a<n_actions; a++) {
			fprintf (f, "%f ", P[s][a]);
		}
		for (a=0; a<n_actions; a++) {
			fprintf (f, "%f ", vQ[s][a]);
		}
	}

	fprintf (f, "\n");
}

/// Delete policy.
DiscretePolicy::~DiscretePolicy()
{
	real sum = 0.0;
	FILE* f = fopen ("/tmp/discrete","wb");

	int s;
	for (s=0; s<n_states; s++) {
		sum += Q[s][argMax(Q[s])];
		if (f) {
			//softMax(Q[s]);
			real sum2=0.0;
			int a;
			for (a=0; a<n_actions; a++) {
				sum2 += eval[a];
			}
			for (a=0; a<n_actions; a++) {
				fprintf (f, "%f ", Q[s][a]);
			}
			for (a=0; a<n_actions; a++) {
				fprintf (f, "%f ", P[s][a]);
			}
			for (a=0; a<n_actions; a++) {
				fprintf (f, "%f ", vQ[s][a]);
			}
			fprintf (f, "\n");
		}
	}

	if (f) {
		fclose (f);
	}

	logmsg ("#Expected return of greedy policy over random distribution of states: %f\n", sum/((real) n_states));

	for (s=0; s<n_states; s++) {
		delete [] P[s];
		delete [] Q[s];
		delete [] e[s];
		delete [] vQ[s];
	}
	delete [] P;
	delete [] Q;
	delete [] vQ;
	delete [] e;
	delete [] eval;
	delete [] sample;
}

/** Select an action a, given state s and the reward from the previous action.

   The optional argument forced_a forces an action if setForcedLearning()
   has been called with true.

   Two algorithms are implemented, both of which converge. One of them
   estimates the value of the current policy (Sarsa), while the other
   estimates that of the optimal policy (Q-learning).

   Sarsa(\f$\lambda\f$) algorithmic description:

   1. Take action \f$a\f$, observe \f$r, s'\f$

   2. Choose \f$a'\f$ from \f$s'\f$ using some policy derived from \f$Q\f$

   3. \f$\delta = r + \gamma Q(s',a') - Q(s,a)\f$

   4. \f$e(s,a) = e(s,a) + 1\f$, depending on trace settings

   5. for all \f$s,a\f$ :
   \f[
   Q_{t}(s,a) = Q_{t-1}(s,a) + \alpha \delta e_{t}(s,a),
   \f]
   where \f$e_{t}(s,a) = \gamma \lambda e_{t-1}(s,a)\f$

	  end

   6. \f$a = a'\f$ (we will take this action at the next step)

   7. \f$s = s'\f$

   Watkins Q(\f$\lambda\f$) algorithmic description:

   1. Take action \f$a\f$, observe \f$r\f$, \f$s'\f$

   2. Choose \f$a'\f$ from \f$s'\f$ using some policy derived from \f$Q\f$

   3. \f$a^* = \arg \max_b Q(s',b)\f$

   4. \f$\delta = r + \gamma Q(s',a^*) - Q(s,a)\f$

   5. \f$e(s,a) = e(s,a) + 1\f$, depending on eligibility traces

   6. for all \f$s,a\f$ :
\f[
        Q(s,a) = Q(s,a) + \alpha \delta e(s,a)
\f]
		if \f$(a'=a^*)\f$ then \f$e(s,a) = \gamma \lambda e(s,a)\f$
		           else \f$e(s,a) = 0\f$
	  end

   7. \f$a = a'\f$ (we will take this action at the next step)

   8. \f$s = s'\f$

   The most general algorithm is E-learning, currently under
   development, which is defined as follows:

   1. Take action \f$a\f$, observe \f$r\f$, \f$s'\f$

   2. Choose \f$a'\f$ from \f$s'\f$ using some policy derived from \f$Q\f$

   3. \f$\delta = r + \gamma E\{Q(s',a^*)|\pi\} - Q(s,a)\f$

   4. \f$e(s,a) = e(s,a) + 1\f$, depending on eligibility traces

   5. for all \f$s,a\f$ :
\f[
        Q(s,a) = Q(s,a) + \alpha \delta e(s,a)
\f]
		\f$e(s,a) = \gamma \lambda e(s,a) P(a|s,\pi)\f$

   6. \f$a = a'\f$ (we will take this action at the next step)

   7. \f$s = s'\f$

   Note that eligibility traces that fall below 0.01 are cut off to zero.


*/
int DiscretePolicy::SelectAction (int s, real r, int forced_a)
{
	if ((s<0)||(s>=n_states)) {
		return 0;
	}

	if ((ps>=0)&&(pa>=0)) {
		expected_r += r;
		expected_V += Q[ps][pa];
		n_samples++;

		if (s==0) {
			real max_estimate = 0.0;
			real max_estimate_k = 0.0;
			for (int i=0; i<n_states; i++) {
				max_estimate += Q[i][argMax (Q[i])];
				max_estimate_k += 1.0;
			}

#if 0
			logmsg ("%f %f %f %f#rTVV\n",
					expected_r/((real) n_samples),
					temp,
					expected_V/((real) n_samples),
					max_estimate/max_estimate_k);
#endif
			expected_r = 0.0;
			expected_V = 0.0;
			n_samples = 0;
		}
	}
	int a, amax;
	int argmax = argMax (Q[s]);

	P[s][argmax] += zeta*(1.0-P[s][argmax]);
	for (int j=0; j<n_actions; j++) {
		if (j!=argmax) {
			P[s][j] += zeta*(0.0-P[s][j]);
		}
	}



	if (forced_learning) {
		a = forced_a;
	} else if (pursuit) {
		real sum = 0.0;
		a = -1;
		int j;
		for (j=0; j<n_actions; j++) {
			sum += P[s][j];
		}
		real X = urandom()*sum;
		real dsum = 0.0;
		for (j=0; j<n_actions; j++) {
			dsum += P[s][j];
			if (X<=dsum) {
				a = j;
				break;
			}
		}
		if (a==-1) {
			fprintf (stderr, "No action selected with pursuit!\n");
		}
	} else if (confidence) {
		if (confidence_uses_gibbs && (confidence_distribution == SINGULAR)) {
			a = confMax (Q[s], vQ[s]);
		} else {
			a = confSample (Q[s], vQ[s]);
			if (confidence_uses_gibbs) { // and not SINGULAR distribution
				a = softMax(sample); // use softmax on the sampled values
			}
		}
	} else if (reliability_estimate) {
		temp = sqrt(Sum(vQ[s], n_actions)/((real) n_actions));
		//temp = 0.1;
		a = softMax(Q[s]);
		//printf ("%f\n", temp);
	} else if (smax) {
		a = softMax (Q[s]);
		//printf ("Q[%d][%d]=%f\n", s, a, Q[s][a]);
	} else {
		a = eGreedy (Q[s]);
	}

	if (a<0 || a>=n_actions) {
		fprintf (stderr, "Action %d out of bounds.. ", a);
		a = (int) floor (urandom()*((real) n_actions));
		fprintf (stderr, "mapping to %d\n", a);
	}

	real EQ_s = 0.0;
	int i;

	switch (learning_method) {

	case Sarsa:
		amax = a;
		EQ_s = Q[s][amax];
		break;
	case QLearning:
		amax = argmax;
		EQ_s = Q[s][amax];
		break;
	case ELearning:
		amax = a; //? correct ?
		Normalise(eval, eval, n_actions);
		EQ_s = 0.0;
		for (i=0; i<n_actions; i++) {
			EQ_s += eval[i] * Q[s][i];
		}
		break;
	default:
		amax = a;
		EQ_s = Q[s][amax];
		fprintf (stderr, "Unknown learning method\n");
	}
	if ((ps>=0)&&(pa>=0)) { // do not update at start of episode
		real delta = r + gamma*EQ_s - Q[ps][pa];
		tdError = delta;
		if (replacing_traces) {
			e[ps][pa] = 1.0;
		} else {
			e[ps][pa] += 1.0;
		}
		real ad = alpha*delta;
		real gl = gamma * lambda;
		real variance_threshold = 0.0001f;
		if (confidence_eligibility == false) {
			vQ[ps][pa] = (1.0 - zeta)*vQ[ps][pa] + zeta*(ad*ad);
			if (vQ[ps][pa]<variance_threshold) {
				vQ[ps][pa] = variance_threshold;
			}
		}
		if (ps<min_el_state) min_el_state = ps;
		if (ps>max_el_state) max_el_state = ps;


		for (i=0; i<n_states; i++) {
			//for (int i=min_el_state; i<=max_el_state; i++) {
			bool el = true;
			for (int j=0; j<n_actions; j++) {
				if (e[i][j]>0.01) {
					Q[i][j] += ad * e[i][j];
					if (confidence_eligibility == true) {
						real zeta_el = zeta * e[i][j];
						vQ[i][j] = (1.0 - zeta_el)*vQ[i][j] + zeta_el*(ad*ad);
						if (vQ[i][j]<variance_threshold) {
							vQ[i][j] = variance_threshold;
						}
					}
					// This is the same as setting e[ps][pa] += (1-P[ps][pa])
					// if P[][] remains unchanged between updates.
					// -- removed because it doesn't work! --
					//P[i][j] += 0.01*delta * e[i][j] * (1.-P[i][j]);
					if ((fabs (Q[i][j])>1000.0)||(std::isnan(Q[i][j]))) {
						printf ("u: %d %d %f %f\n", i, j, Q[i][j], ad * e[i][j]);
					}

					// The reset branch is only needed for Q-learning;
					// Sarsa is unaffected since amax==a always.
					if (amax==a) {
						e[i][j] *= gl;
					} else {
						e[i][j] = 0.0;
					}
				} else {
					e[i][j] = 0.0;
					el = false;
				}
			}
			if (el==false) {
				if (min_el_state==i)
					min_el_state++;
			} else {
				max_el_state = i;
			}
		}
	}

	//printf ("%d %d #STATE\n", min_el_state, max_el_state);
	//	printf ("Q[%d,%d]=%f r=%f e=%f ad=%f gl=%f #QV\n",
	//			ps, pa, Q[ps][pa], r, e[ps][pa], ad, gl);
	ps = s;
	pa = a;

	return a;
}
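
/* A minimal, self-contained sketch of one tabular Sarsa(lambda) step, matching
   the algorithmic description above SelectAction(). The names (NS, NA, q, el)
   and parameter values are illustrative and not part of this class.

   \code
   const int NS = 4, NA = 2;
   double q[NS][NA] = {{0}};   // action values Q(s,a)
   double el[NS][NA] = {{0}};  // eligibility traces e(s,a)
   double alpha = 0.1, gamma = 0.95, lambda = 0.8;

   // In state s we took action a, received reward r and moved to state s2,
   // where the policy has already picked the next action a2.
   void sarsa_step(int s, int a, double r, int s2, int a2)
   {
       double delta = r + gamma * q[s2][a2] - q[s][a]; // TD error
       el[s][a] += 1.0;                                // accumulating trace
       for (int i = 0; i < NS; i++) {
           for (int j = 0; j < NA; j++) {
               q[i][j] += alpha * delta * el[i][j];
               el[i][j] *= gamma * lambda;             // decay all traces
           }
       }
   }
   \endcode
*/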

/// Use at the end of every episode, after the agent has entered the
/// absorbing state.
void DiscretePolicy::Reset ()
{
	for (int s=0; s<n_states; s++) {
		for (int a=0; a<n_actions; a++) {
			e[s][a] = 0.0;
		}
	}
}

/// Load policy from a file.
void DiscretePolicy::loadFile (char* f)
{
	FILE* fh = NULL;
	fh = fopen (f, "rb");
	if (fh==NULL) {
		fprintf (stderr, "Failed to read file %s\n", f);
		return;
	}
	char rtag[256];
	const char* start_tag="QSA";
	const char* close_tag="END";
	int n_read_states, n_read_actions;

	fread((void *) rtag, sizeof (char), strlen (start_tag)+1, fh);
	if (strcmp (rtag, start_tag)) {
		fprintf (stderr, "Could not find starting tag\n");
		fclose(fh);
		return;
	}
	fread((void *) &n_read_states, sizeof(int), 1, fh);
	fread((void *) &n_read_actions, sizeof(int), 1, fh);

	if ((n_read_states!=n_states)||(n_read_actions!=n_actions)) {
		fprintf (stderr, "File has %dx%d space! Aborting read.\n", n_read_states, n_read_actions);
		fclose(fh);
		return;
	}

	int i, j;
	for (i=0; i<n_states; i++) {
		fread((void *) Q[i], sizeof(real), n_actions, fh);
		for (j=0; j<n_actions; j++) {
			if ((fabs (Q[i][j])>100.0)||(std::isnan(Q[i][j]))) {
				printf ("l: %d %d %f\n", i, j, Q[i][j]);
				Q[i][j] = 0.0;
			}
		}
	}
	for (i=0; i<n_states; i++) {
		for (j=0; j<n_actions; j++) {
			P[i][j] = 1.0/((real) n_actions);
		}
		int argmax = argMax (Q[i]);
		P[i][argmax] += 0.001*(1.0-P[i][argmax]);
		for (j=0; j<n_actions; j++) {
			if (j!=argmax) {
				P[i][j] += 0.001*(0.0-P[i][j]);
			}
		}
	}

	fread((void *) rtag, sizeof (char), strlen (close_tag)+1, fh);
	if (strcmp (rtag, close_tag)) {
		fprintf (stderr, "Could not find ending tag\n");
		fclose (fh);
		return;
	}

	fclose (fh);
}

/// Save policy to a file.
void DiscretePolicy::saveFile (char* f) {
	FILE* fh = NULL;
	fh = fopen (f, "wb");
	if (fh==NULL) {
		fprintf (stderr, "Failed to write to file %s\n", f);
		return;
	}

	const char* start_tag="QSA";
	const char* close_tag="END";

	fwrite((void *) start_tag, sizeof (char), strlen (start_tag)+1, fh);
	fwrite((void *) &n_states, sizeof(int), 1, fh);
	fwrite((void *) &n_actions, sizeof(int), 1, fh);
	for (int i=0; i<n_states; i++) {
		fwrite((void *) Q[i], sizeof(real), n_actions, fh);
		for (int j=0; j<n_actions; j++) {
			if ((fabs (Q[i][j])>100.0)||(std::isnan(Q[i][j]))) {
				printf ("s: %d %d %f\n", i, j, Q[i][j]);
			}
		}
	}
	fwrite((void *) close_tag, sizeof (char), strlen (close_tag)+1, fh);
	fclose (fh);
}
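
/* On-disk layout produced by saveFile() and expected by loadFile(), as
   implied by the read/write calls above:

       "QSA\0"                       start tag (strlen("QSA")+1 bytes)
       int   n_states                number of states
       int   n_actions               number of actions
       real  Q[s][0..n_actions-1]    one row of Q-values per state s
       "END\0"                       closing tag
*/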

/// \brief Set to use confidence estimates for action selection, with
/// variance smoothing zeta.
/// Variance smoothing currently uses a very simple method to estimate
/// the variance.
bool DiscretePolicy::useConfidenceEstimates (bool confidence, real zeta, bool confidence_eligibility) {
	this->confidence = confidence;
	this->zeta = zeta;
	this->confidence_eligibility = confidence_eligibility;

	if (confidence_eligibility) {
		logmsg ("#+[ELIG_VAR]");
	}
	if (confidence) {
		logmsg ("#+[CONFIDENCE]");
	} else {
		logmsg ("#-[CONFIDENCE]\n");
	}

	return confidence;
}

/// Set the algorithm to QLearning mode.
void DiscretePolicy::setQLearning() {
	learning_method = QLearning;
	logmsg ("#[Q-learning]\n");
}

/// Set the algorithm to ELearning mode.
void DiscretePolicy::setELearning() {
	learning_method = ELearning;
	logmsg ("#[E-learning]\n");
}

/// \brief Set the algorithm to SARSA mode.
/// A unified framework for action selection.
void DiscretePolicy::setSarsa()
{
	learning_method = Sarsa;
	logmsg ("#[Sarsa]\n");
}

/// Use Pursuit for action selection.
void DiscretePolicy::setPursuit(bool pursuit)
{
	this->pursuit = pursuit;
	if (pursuit) {
		logmsg ("#+[PURSUIT]\n");
	} else {
		logmsg ("#-[PURSUIT]\n");
	}
}

/// Use replacing rather than accumulating eligibility traces.
void DiscretePolicy::setReplacingTraces (bool replacing)
{
	this->replacing_traces = replacing;
	if (replacing) {
		logmsg ("#[REPLACING TRACES]\n");
	} else {
		logmsg ("#[ACCUMULATING TRACES]\n");
	}
}

/// Set forced learning (force-feed actions).
void DiscretePolicy::setForcedLearning(bool forced)
{
	forced_learning = forced;
}

/// Set randomness for action selection. Does not affect confidence mode.
void DiscretePolicy::setRandomness (real epsilon)
{
	temp = epsilon;
	if (smax) {
		if (temp<0.01) {
			smax = false;
		}
	}
}

/// Set the discount factor gamma of the return to be maximised.
void DiscretePolicy::setGamma (real gamma)
{
	this->gamma = gamma;
}

/// Set action selection to softmax.
void DiscretePolicy::useSoftmax (bool softmax)
{
	smax = softmax;
	if (smax) {
		logmsg ("#+[SMAX]\n");
	} else {
		logmsg ("#-[SMAX]\n");
	}
}

/// Use the reliability estimate method for action selection.
void DiscretePolicy::useReliabilityEstimate (bool ri)
{
	reliability_estimate = ri;
	if (ri) {
		logmsg("#+[RI]\n");
	} else {
		logmsg("#-[RI]\n");
	}
}

/// Set the distribution for direct action sampling.
void DiscretePolicy::setConfidenceDistribution (enum ConfidenceDistribution cd)
{
	switch (cd) {
	case SINGULAR:
		logmsg("#[SINGULAR CONFIDENCE]\n"); break;
	case BOUNDED:
		logmsg("#[BOUNDED CONFIDENCE]\n"); break;
	case GAUSSIAN:
		logmsg("#[GAUSSIAN CONFIDENCE]\n"); break;
	case LAPLACIAN:
		logmsg("#[LAPLACIAN CONFIDENCE]\n"); break;
	default:
		Serror ("Unknown type %d\n", cd);
	}
	confidence_distribution = cd;
}

/// \brief Add Gibbs sampling for confidences.
/// This can be used in conjunction with any confidence distribution,
/// however it mostly makes sense for SINGULAR.
void DiscretePolicy::useGibbsConfidence (bool gibbs)
{
	if (gibbs) {
		logmsg ("#+[GIBBS CONFIDENCE]\n");
	} else {
		logmsg ("#-[GIBBS CONFIDENCE]\n");
	}
	this->confidence_uses_gibbs = gibbs;
}

// ---------- action selection helpers -------------
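/// \brief Confidence-weighted action selection helper.
/// A sketch of the weighting computed below (read off the code, stated here
/// for clarity): each action \f$a\f$ receives weight
/// \f[
///   w(a) = \left(1 + \sum_{j \neq a} \exp\left(\frac{Q(j)-Q(a)}{\sqrt{vQ(j)}}\right)\right)^{-1},
/// \f]
/// and an action is then drawn with probability proportional to \f$w(a)\f$.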
int DiscretePolicy::confMax(real* Qs, real* vQs, real p) {
	real sum=0.0;
	int a;
#if 0
	for (a=0; a<n_actions; a++) {
		eval[a] = exp(pow(Qs[a]/sqrt(vQs[a]), p));
		sum += eval[a];
	}
#else
	for (a=0; a<n_actions; a++) {
		real Q = Qs[a];
		real cum = 1.0;
		//real v = sqrt(vQs[a]);
		for (int j=0; j<n_actions; j++) {
			if (j!=a) {
				cum += exp ((Qs[j]-Q)/sqrt(vQs[j]));
			}
		}
		eval[a] = 1.0/cum; //exp(Qs[a]/sqrt(vQs[a]));
		sum += eval[a];
	}
#endif
	real X = urandom()*sum;
	real dsum = 0.0;
	for (a=0; a<n_actions; a++) {
		dsum += eval[a];
		if (X<=dsum)
			return a;

	}
	fprintf (stderr, "ConfMax: No action selected! %f %f %f\n", X, dsum, sum);
	return -1;
}

int DiscretePolicy::confSample(real* Qs, real* vQs) {
	static NormalDistribution gaussian;
	static LaplacianDistribution laplacian;
	static UniformDistribution uniform;

	for (int a=0; a<n_actions; a++) {
		//eval[a] = Qs[a] + urandom(-1.0,1.0)*vQs[a];
		switch(confidence_distribution) {
		case SINGULAR:
			sample[a] = Qs[a];
			break;
		case BOUNDED:
			uniform.setMean(Qs[a]);
			uniform.setVariance(vQs[a]);
			sample[a] = uniform.generate();
			break;
		case GAUSSIAN:
			gaussian.setMean(Qs[a]);
			gaussian.setVariance(vQs[a]);
			sample[a] = gaussian.generate();
			break;
		case LAPLACIAN:
			laplacian.setMean(Qs[a]);
			laplacian.setVariance(vQs[a]);
			sample[a] = Qs[a] + laplacian.generate();
			break;
		default:
			Serror ("Unknown distribution ID:%d\n", confidence_distribution);
			break;
		}
	}
	return argMax(sample);
}

int DiscretePolicy::softMax(real* Qs) {
	real sum = 0.0f;
	real beta = 1.0f/temp;
	int a;
	for (a=0; a<n_actions; a++) {
		eval[a] = exp(beta * Qs[a]);
		sum += eval[a];
	}
	real X = urandom()*sum;
	real dsum = 0.0;
	for (a=0; a<n_actions; a++) {
		dsum += eval[a];
		if (X<=dsum)
			return a;

	}
	fprintf (stderr, "softMax: No action selected! %f %f %f\nT:%f\n", X, dsum, sum, temp);
	return -1;
}
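
/* Worked example (illustrative numbers): with temp = 0.5 the inverse
   temperature is beta = 2, so for Q-values {1.0, 0.0} the weights are
   {e^2, e^0} = {7.39, 1.0} and the first action is selected with probability
   7.39/8.39, roughly 0.88. */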

int DiscretePolicy::eGreedy(real* Qs) {
	real X = urandom();
	int amax = argMax(Qs);
	real base_prob = temp/((real) n_actions);
	for (int a=0; a<n_actions; a++) {
		eval[a] = base_prob;
	}
	eval[amax] += 1.0 - temp;
	if (X<temp) {
		return rand()%n_actions;
	}
	return argMax(Qs);
}
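
/* Worked example (illustrative numbers): with temp = 0.1 and 4 actions, a
   uniformly random action is taken with probability 0.1, so the greedy action
   is selected with probability 0.9 + 0.1/4 = 0.925 and every other action
   with probability 0.025; this matches the eval[] probabilities set above. */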

int DiscretePolicy::argMax(real* Qs) {
	real max = Qs[0];
	int arg_max = 0;
	for (int a=1; a<n_actions; a++) {
		if (max<Qs[a]) {
			max = Qs[a];
			arg_max = a;
		}
	}
	return arg_max;
}