1 #include <math.h>
2 #include <stdio.h>
3 #include <stdlib.h>
4 #include <string.h>
5 #include <stdarg.h>
6 #include <locale.h>
7 #include "linear.h"
8 #include "newton.h"
9 int liblinear_version = LIBLINEAR_VERSION;
10 typedef signed char schar;
11 template <class T> static inline void swap(T& x, T& y) { T t=x; x=y; y=t; }
12 #ifndef min
13 template <class T> static inline T min(T x,T y) { return (x<y)?x:y; }
14 #endif
15 #ifndef max
16 template <class T> static inline T max(T x,T y) { return (x>y)?x:y; }
17 #endif
18 template <class S, class T> static inline void clone(T*& dst, S* src, int n)
19 {
20 	dst = new T[n];
21 	memcpy((void *)dst,(void *)src,sizeof(T)*n);
22 }
23 #define INF HUGE_VAL
24 #define Malloc(type,n) (type *)malloc((n)*sizeof(type))
25 
26 static void print_string_stdout(const char *s)
27 {
28 	fputs(s,stdout);
29 	fflush(stdout);
30 }
31 static void print_null(const char *s) {}
32 
33 static void (*liblinear_print_string) (const char *) = &print_string_stdout;
34 
35 #if 1
36 static void info(const char *fmt,...)
37 {
38 	char buf[BUFSIZ];
39 	va_list ap;
40 	va_start(ap,fmt);
41 	vsprintf(buf,fmt,ap);
42 	va_end(ap);
43 	(*liblinear_print_string)(buf);
44 }
45 #else
46 static void info(const char *fmt,...) {}
47 #endif
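// Helper routines over sparse feature vectors. By LIBLINEAR's convention a
// feature_node array is terminated by a node with index == -1, and feature
// indices are 1-based, so s[x->index-1] maps a node to its dense-array entry.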
48 class sparse_operator
49 {
50 public:
51 	static double nrm2_sq(const feature_node *x)
52 	{
53 		double ret = 0;
54 		while(x->index != -1)
55 		{
56 			ret += x->value*x->value;
57 			x++;
58 		}
59 		return ret;
60 	}
61 
62 	static double dot(const double *s, const feature_node *x)
63 	{
64 		double ret = 0;
65 		while(x->index != -1)
66 		{
67 			ret += s[x->index-1]*x->value;
68 			x++;
69 		}
70 		return ret;
71 	}
72 
73 	static double sparse_dot(const feature_node *x1, const feature_node *x2)
74 	{
75 		double ret = 0;
76 		while(x1->index != -1 && x2->index != -1)
77 		{
78 			if(x1->index == x2->index)
79 			{
80 				ret += x1->value * x2->value;
81 				++x1;
82 				++x2;
83 			}
84 			else
85 			{
86 				if(x1->index > x2->index)
87 					++x2;
88 				else
89 					++x1;
90 			}
91 		}
92 		return ret;
93 	}
94 
95 	static void axpy(const double a, const feature_node *x, double *y)
96 	{
97 		while(x->index != -1)
98 		{
99 			y[x->index-1] += a*x->value;
100 			x++;
101 		}
102 	}
103 };
104 
105 // L2-regularized empirical risk minimization
106 // min_w w^Tw/2 + \sum C_i \xi(w^Tx_i), where \xi() is the loss
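// C_times_loss() is supplied by the derived classes below: logistic loss in
// l2r_lr_fun, squared hinge loss in l2r_l2_svc_fun, and epsilon-insensitive
// squared loss in l2r_l2_svr_fun.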
107 
108 class l2r_erm_fun: public function
109 {
110 public:
111 	l2r_erm_fun(const problem *prob, const parameter *param, double *C);
112 	~l2r_erm_fun();
113 
114 	double fun(double *w);
115 	double linesearch_and_update(double *w, double *d, double *f, double *g, double alpha);
116 	int get_nr_variable(void);
117 
118 protected:
119 	virtual double C_times_loss(int i, double wx_i) = 0;
120 	void Xv(double *v, double *Xv);
121 	void XTv(double *v, double *XTv);
122 
123 	double *C;
124 	const problem *prob;
125 	double *wx;
126 	double *tmp; // a working array
127 	double wTw;
128 	int regularize_bias;
129 };
130 
131 l2r_erm_fun::l2r_erm_fun(const problem *prob, const parameter *param, double *C)
132 {
133 	int l=prob->l;
134 
135 	this->prob = prob;
136 
137 	wx = new double[l];
138 	tmp = new double[l];
139 	this->C = C;
140 	this->regularize_bias = param->regularize_bias;
141 }
142 
143 l2r_erm_fun::~l2r_erm_fun()
144 {
145 	delete[] wx;
146 	delete[] tmp;
147 }
148 
149 double l2r_erm_fun::fun(double *w)
150 {
151 	int i;
152 	double f=0;
153 	int l=prob->l;
154 	int w_size=get_nr_variable();
155 
156 	wTw = 0;
157 	Xv(w, wx);
158 
159 	for(i=0;i<w_size;i++)
160 		wTw += w[i]*w[i];
161 	if(regularize_bias == 0)
162 		wTw -= w[w_size-1]*w[w_size-1];
163 	for(i=0;i<l;i++)
164 		f += C_times_loss(i, wx[i]);
165 	f = f + 0.5 * wTw;
166 
167 	return f;
168 }
169 
170 int l2r_erm_fun::get_nr_variable(void)
171 {
172 	return prob->n;
173 }
174 
175 // On entry *f must be the function value of w
176 // On exit w is updated and *f is the new function value
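// This is a backtracking (Armijo) line search along direction s: starting from
// the given alpha, the step is halved until
//   f(w + alpha*s) - f(w) <= eta * alpha * g^T s   (eta = 0.01)
// holds. Returns the accepted alpha, or 0 (and restores *f) if
// max_num_linesearch halvings fail.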
177 double l2r_erm_fun::linesearch_and_update(double *w, double *s, double *f, double *g, double alpha)
178 {
179 	int i;
180 	int l = prob->l;
181 	double sTs = 0;
182 	double wTs = 0;
183 	double gTs = 0;
184 	double eta = 0.01;
185 	int w_size = get_nr_variable();
186 	int max_num_linesearch = 20;
187 	double fold = *f;
188 	Xv(s, tmp);
189 
190 	for (i=0;i<w_size;i++)
191 	{
192 		sTs += s[i] * s[i];
193 		wTs += s[i] * w[i];
194 		gTs += s[i] * g[i];
195 	}
196 	if(regularize_bias == 0)
197 	{
198 		// bias not used in calculating (w + \alpha s)^T (w + \alpha s)
199 		sTs -= s[w_size-1] * s[w_size-1];
200 		wTs -= s[w_size-1] * w[w_size-1];
201 	}
202 
203 	int num_linesearch = 0;
204 	for(num_linesearch=0; num_linesearch < max_num_linesearch; num_linesearch++)
205 	{
206 		double loss = 0;
207 		for(i=0;i<l;i++)
208 		{
209 			double inner_product = tmp[i] * alpha + wx[i];
210 			loss += C_times_loss(i, inner_product);
211 		}
212 		*f = loss + (alpha * alpha * sTs + wTw) / 2.0 + alpha * wTs;
213 		if (*f - fold <= eta * alpha * gTs)
214 		{
215 			for (i=0;i<l;i++)
216 				wx[i] += alpha * tmp[i];
217 			break;
218 		}
219 		else
220 			alpha *= 0.5;
221 	}
222 
223 	if (num_linesearch >= max_num_linesearch)
224 	{
225 		*f = fold;
226 		return 0;
227 	}
228 	else
229 		for (i=0;i<w_size;i++)
230 			w[i] += alpha * s[i];
231 
232 	wTw += alpha * alpha * sTs + 2* alpha * wTs;
233 	return alpha;
234 }
235 
236 void l2r_erm_fun::Xv(double *v, double *Xv)
237 {
238 	int i;
239 	int l=prob->l;
240 	feature_node **x=prob->x;
241 
242 	for(i=0;i<l;i++)
243 		Xv[i]=sparse_operator::dot(v, x[i]);
244 }
245 
246 void l2r_erm_fun::XTv(double *v, double *XTv)
247 {
248 	int i;
249 	int l=prob->l;
250 	int w_size=get_nr_variable();
251 	feature_node **x=prob->x;
252 
253 	for(i=0;i<w_size;i++)
254 		XTv[i]=0;
255 	for(i=0;i<l;i++)
256 		sparse_operator::axpy(v[i], x[i], XTv);
257 }
258 
259 class l2r_lr_fun: public l2r_erm_fun
260 {
261 public:
262 	l2r_lr_fun(const problem *prob, const parameter *param, double *C);
263 	~l2r_lr_fun();
264 
265 	void grad(double *w, double *g);
266 	void Hv(double *s, double *Hs);
267 
268 	void get_diag_preconditioner(double *M);
269 
270 private:
271 	double *D;
272 	double C_times_loss(int i, double wx_i);
273 };
274 
275 l2r_lr_fun::l2r_lr_fun(const problem *prob, const parameter *param, double *C):
276 	l2r_erm_fun(prob, param, C)
277 {
278 	int l=prob->l;
279 	D = new double[l];
280 }
281 
282 l2r_lr_fun::~l2r_lr_fun()
283 {
284 	delete[] D;
285 }
286 
287 double l2r_lr_fun::C_times_loss(int i, double wx_i)
288 {
289 	double ywx_i = wx_i * prob->y[i];
290 	if (ywx_i >= 0)
291 		return C[i]*log(1 + exp(-ywx_i));
292 	else
293 		return C[i]*(-ywx_i + log(1 + exp(ywx_i)));
294 }
295 
296 void l2r_lr_fun::grad(double *w, double *g)
297 {
298 	int i;
299 	double *y=prob->y;
300 	int l=prob->l;
301 	int w_size=get_nr_variable();
302 
303 	for(i=0;i<l;i++)
304 	{
305 		tmp[i] = 1/(1 + exp(-y[i]*wx[i]));
306 		D[i] = tmp[i]*(1-tmp[i]);
307 		tmp[i] = C[i]*(tmp[i]-1)*y[i];
308 	}
309 	XTv(tmp, g);
310 
311 	for(i=0;i<w_size;i++)
312 		g[i] = w[i] + g[i];
313 	if(regularize_bias == 0)
314 		g[w_size-1] -= w[w_size-1];
315 }
316 
317 void l2r_lr_fun::get_diag_preconditioner(double *M)
318 {
319 	int i;
320 	int l = prob->l;
321 	int w_size=get_nr_variable();
322 	feature_node **x = prob->x;
323 
324 	for (i=0; i<w_size; i++)
325 		M[i] = 1;
326 	if(regularize_bias == 0)
327 		M[w_size-1] = 0;
328 
329 	for (i=0; i<l; i++)
330 	{
331 		feature_node *xi = x[i];
332 		while (xi->index!=-1)
333 		{
334 			M[xi->index-1] += xi->value*xi->value*C[i]*D[i];
335 			xi++;
336 		}
337 	}
338 }
339 
340 void l2r_lr_fun::Hv(double *s, double *Hs)
341 {
342 	int i;
343 	int l=prob->l;
344 	int w_size=get_nr_variable();
345 	feature_node **x=prob->x;
346 
347 	for(i=0;i<w_size;i++)
348 		Hs[i] = 0;
349 	for(i=0;i<l;i++)
350 	{
351 		feature_node * const xi=x[i];
352 		double xTs = sparse_operator::dot(s, xi);
353 
354 		xTs = C[i]*D[i]*xTs;
355 
356 		sparse_operator::axpy(xTs, xi, Hs);
357 	}
358 	for(i=0;i<w_size;i++)
359 		Hs[i] = s[i] + Hs[i];
360 	if(regularize_bias == 0)
361 		Hs[w_size-1] -= s[w_size-1];
362 }
363 
364 class l2r_l2_svc_fun: public l2r_erm_fun
365 {
366 public:
367 	l2r_l2_svc_fun(const problem *prob, const parameter *param, double *C);
368 	~l2r_l2_svc_fun();
369 
370 	void grad(double *w, double *g);
371 	void Hv(double *s, double *Hs);
372 
373 	void get_diag_preconditioner(double *M);
374 
375 protected:
376 	void subXTv(double *v, double *XTv);
377 
378 	int *I;
379 	int sizeI;
380 
381 private:
382 	double C_times_loss(int i, double wx_i);
383 };
384 
385 l2r_l2_svc_fun::l2r_l2_svc_fun(const problem *prob, const parameter *param, double *C):
386 	l2r_erm_fun(prob, param, C)
387 {
388 	I = new int[prob->l];
389 }
390 
391 l2r_l2_svc_fun::~l2r_l2_svc_fun()
392 {
393 	delete[] I;
394 }
395 
396 double l2r_l2_svc_fun::C_times_loss(int i, double wx_i)
397 {
398 	double d = 1 - prob->y[i] * wx_i;
399 	if (d > 0)
400 		return C[i] * d * d;
401 	else
402 		return 0;
403 }
404 
405 void l2r_l2_svc_fun::grad(double *w, double *g)
406 {
407 	int i;
408 	double *y=prob->y;
409 	int l=prob->l;
410 	int w_size=get_nr_variable();
411 
412 	sizeI = 0;
413 	for (i=0;i<l;i++)
414 	{
415 		tmp[i] = wx[i] * y[i];
416 		if (tmp[i] < 1)
417 		{
418 			tmp[sizeI] = C[i]*y[i]*(tmp[i]-1);
419 			I[sizeI] = i;
420 			sizeI++;
421 		}
422 	}
423 	subXTv(tmp, g);
424 
425 	for(i=0;i<w_size;i++)
426 		g[i] = w[i] + 2*g[i];
427 	if(regularize_bias == 0)
428 		g[w_size-1] -= w[w_size-1];
429 }
430 
431 void l2r_l2_svc_fun::get_diag_preconditioner(double *M)
432 {
433 	int i;
434 	int w_size=get_nr_variable();
435 	feature_node **x = prob->x;
436 
437 	for (i=0; i<w_size; i++)
438 		M[i] = 1;
439 	if(regularize_bias == 0)
440 		M[w_size-1] = 0;
441 
442 	for (i=0; i<sizeI; i++)
443 	{
444 		int idx = I[i];
445 		feature_node *xi = x[idx];
446 		while (xi->index!=-1)
447 		{
448 			M[xi->index-1] += xi->value*xi->value*C[idx]*2;
449 			xi++;
450 		}
451 	}
452 }
453 
454 void l2r_l2_svc_fun::Hv(double *s, double *Hs)
455 {
456 	int i;
457 	int w_size=get_nr_variable();
458 	feature_node **x=prob->x;
459 
460 	for(i=0;i<w_size;i++)
461 		Hs[i]=0;
462 	for(i=0;i<sizeI;i++)
463 	{
464 		feature_node * const xi=x[I[i]];
465 		double xTs = sparse_operator::dot(s, xi);
466 
467 		xTs = C[I[i]]*xTs;
468 
469 		sparse_operator::axpy(xTs, xi, Hs);
470 	}
471 	for(i=0;i<w_size;i++)
472 		Hs[i] = s[i] + 2*Hs[i];
473 	if(regularize_bias == 0)
474 		Hs[w_size-1] -= s[w_size-1];
475 }
476 
477 void l2r_l2_svc_fun::subXTv(double *v, double *XTv)
478 {
479 	int i;
480 	int w_size=get_nr_variable();
481 	feature_node **x=prob->x;
482 
483 	for(i=0;i<w_size;i++)
484 		XTv[i]=0;
485 	for(i=0;i<sizeI;i++)
486 		sparse_operator::axpy(v[i], x[I[i]], XTv);
487 }
488 
489 class l2r_l2_svr_fun: public l2r_l2_svc_fun
490 {
491 public:
492 	l2r_l2_svr_fun(const problem *prob, const parameter *param, double *C);
493 
494 	void grad(double *w, double *g);
495 
496 private:
497 	double C_times_loss(int i, double wx_i);
498 	double p;
499 };
500 
501 l2r_l2_svr_fun::l2r_l2_svr_fun(const problem *prob, const parameter *param, double *C):
502 	l2r_l2_svc_fun(prob, param, C)
503 {
504 	this->p = param->p;
505 	this->regularize_bias = param->regularize_bias;
506 }
507 
508 double l2r_l2_svr_fun::C_times_loss(int i, double wx_i)
509 {
510 	double d = wx_i - prob->y[i];
511 	if(d < -p)
512 		return C[i]*(d+p)*(d+p);
513 	else if(d > p)
514 		return C[i]*(d-p)*(d-p);
515 	return 0;
516 }
517 
518 void l2r_l2_svr_fun::grad(double *w, double *g)
519 {
520 	int i;
521 	double *y=prob->y;
522 	int l=prob->l;
523 	int w_size=get_nr_variable();
524 	double d;
525 
526 	sizeI = 0;
527 	for(i=0;i<l;i++)
528 	{
529 		d = wx[i] - y[i];
530 
531 		// generate index set I
532 		if(d < -p)
533 		{
534 			tmp[sizeI] = C[i]*(d+p);
535 			I[sizeI] = i;
536 			sizeI++;
537 		}
538 		else if(d > p)
539 		{
540 			tmp[sizeI] = C[i]*(d-p);
541 			I[sizeI] = i;
542 			sizeI++;
543 		}
544 
545 	}
546 	subXTv(tmp, g);
547 
548 	for(i=0;i<w_size;i++)
549 		g[i] = w[i] + 2*g[i];
550 	if(regularize_bias == 0)
551 		g[w_size-1] -= w[w_size-1];
552 }
553 
554 // A coordinate descent algorithm for
555 // multi-class support vector machines by Crammer and Singer
556 //
557 //  min_{\alpha}  0.5 \sum_m ||w_m(\alpha)||^2 + \sum_i \sum_m e^m_i alpha^m_i
558 //    s.t.     \alpha^m_i <= C^m_i \forall m,i , \sum_m \alpha^m_i=0 \forall i
559 //
560 //  where e^m_i = 0 if y_i  = m,
561 //        e^m_i = 1 if y_i != m,
562 //  C^m_i = C if m  = y_i,
563 //  C^m_i = 0 if m != y_i,
564 //  and w_m(\alpha) = \sum_i \alpha^m_i x_i
565 //
566 // Given:
567 // x, y, C
568 // eps is the stopping tolerance
569 //
570 // solution will be put in w
571 //
572 // See Appendix of LIBLINEAR paper, Fan et al. (2008)
573 
574 #define GETI(i) ((int) prob->y[i])
575 // To support weights for instances, use GETI(i) (i)
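// Here GETI(i) is the integer class label of instance i, so C[GETI(i)] looks up
// the per-class penalty in weighted_C; redefining GETI(i) as (i) would instead
// index a per-instance cost array.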
576 
577 class Solver_MCSVM_CS
578 {
579 	public:
580 		Solver_MCSVM_CS(const problem *prob, int nr_class, double *C, double eps=0.1, int max_iter=100000);
581 		~Solver_MCSVM_CS();
582 		void Solve(double *w);
583 	private:
584 		void solve_sub_problem(double A_i, int yi, double C_yi, int active_i, double *alpha_new);
585 		bool be_shrunk(int i, int m, int yi, double alpha_i, double minG);
586 		double *B, *C, *G;
587 		int w_size, l;
588 		int nr_class;
589 		int max_iter;
590 		double eps;
591 		const problem *prob;
592 };
593 
594 Solver_MCSVM_CS::Solver_MCSVM_CS(const problem *prob, int nr_class, double *weighted_C, double eps, int max_iter)
595 {
596 	this->w_size = prob->n;
597 	this->l = prob->l;
598 	this->nr_class = nr_class;
599 	this->eps = eps;
600 	this->max_iter = max_iter;
601 	this->prob = prob;
602 	this->B = new double[nr_class];
603 	this->G = new double[nr_class];
604 	this->C = weighted_C;
605 }
606 
607 Solver_MCSVM_CS::~Solver_MCSVM_CS()
608 {
609 	delete[] B;
610 	delete[] G;
611 }
612 
613 int compare_double(const void *a, const void *b)
614 {
615 	if(*(double *)a > *(double *)b)
616 		return -1;
617 	if(*(double *)a < *(double *)b)
618 		return 1;
619 	return 0;
620 }
621 
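// Solves the per-instance sub-problem of the Crammer-Singer dual (see the
// Appendix of the LIBLINEAR paper cited above): with A_i = x_i^T x_i fixed,
// the optimal alpha over the active_i classes is obtained by sorting a copy of
// B (stored in D) in decreasing order and computing the threshold beta.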
622 void Solver_MCSVM_CS::solve_sub_problem(double A_i, int yi, double C_yi, int active_i, double *alpha_new)
623 {
624 	int r;
625 	double *D;
626 
627 	clone(D, B, active_i);
628 	if(yi < active_i)
629 		D[yi] += A_i*C_yi;
630 	qsort(D, active_i, sizeof(double), compare_double);
631 
632 	double beta = D[0] - A_i*C_yi;
633 	for(r=1;r<active_i && beta<r*D[r];r++)
634 		beta += D[r];
635 	beta /= r;
636 
637 	for(r=0;r<active_i;r++)
638 	{
639 		if(r == yi)
640 			alpha_new[r] = min(C_yi, (beta-B[r])/A_i);
641 		else
642 			alpha_new[r] = min((double)0, (beta - B[r])/A_i);
643 	}
644 	delete[] D;
645 }
646 
647 bool Solver_MCSVM_CS::be_shrunk(int i, int m, int yi, double alpha_i, double minG)
648 {
649 	double bound = 0;
650 	if(m == yi)
651 		bound = C[GETI(i)];
652 	if(alpha_i == bound && G[m] < minG)
653 		return true;
654 	return false;
655 }
656 
657 void Solver_MCSVM_CS::Solve(double *w)
658 {
659 	int i, m, s;
660 	int iter = 0;
661 	double *alpha =  new double[l*nr_class];
662 	double *alpha_new = new double[nr_class];
663 	int *index = new int[l];
664 	double *QD = new double[l];
665 	int *d_ind = new int[nr_class];
666 	double *d_val = new double[nr_class];
667 	int *alpha_index = new int[nr_class*l];
668 	int *y_index = new int[l];
669 	int active_size = l;
670 	int *active_size_i = new int[l];
671 	double eps_shrink = max(10.0*eps, 1.0); // stopping tolerance for shrinking
672 	bool start_from_all = true;
673 
674 	// Initial alpha can be set here. Note that
675 // sum_m alpha[i*nr_class+m] = 0, for all i = 0,...,l-1
676 	// alpha[i*nr_class+m] <= C[GETI(i)] if prob->y[i] == m
677 	// alpha[i*nr_class+m] <= 0 if prob->y[i] != m
678 	// If initial alpha isn't zero, uncomment the for loop below to initialize w
679 	for(i=0;i<l*nr_class;i++)
680 		alpha[i] = 0;
681 
682 	for(i=0;i<w_size*nr_class;i++)
683 		w[i] = 0;
684 	for(i=0;i<l;i++)
685 	{
686 		for(m=0;m<nr_class;m++)
687 			alpha_index[i*nr_class+m] = m;
688 		feature_node *xi = prob->x[i];
689 		QD[i] = 0;
690 		while(xi->index != -1)
691 		{
692 			double val = xi->value;
693 			QD[i] += val*val;
694 
695 			// Uncomment the for loop if initial alpha isn't zero
696 			// for(m=0; m<nr_class; m++)
697 			//	w[(xi->index-1)*nr_class+m] += alpha[i*nr_class+m]*val;
698 			xi++;
699 		}
700 		active_size_i[i] = nr_class;
701 		y_index[i] = (int)prob->y[i];
702 		index[i] = i;
703 	}
704 
705 	while(iter < max_iter)
706 	{
707 		double stopping = -INF;
708 		for(i=0;i<active_size;i++)
709 		{
710 			int j = i+rand()%(active_size-i);
711 			swap(index[i], index[j]);
712 		}
713 		for(s=0;s<active_size;s++)
714 		{
715 			i = index[s];
716 			double Ai = QD[i];
717 			double *alpha_i = &alpha[i*nr_class];
718 			int *alpha_index_i = &alpha_index[i*nr_class];
719 
720 			if(Ai > 0)
721 			{
722 				for(m=0;m<active_size_i[i];m++)
723 					G[m] = 1;
724 				if(y_index[i] < active_size_i[i])
725 					G[y_index[i]] = 0;
726 
727 				feature_node *xi = prob->x[i];
728 				while(xi->index!= -1)
729 				{
730 					double *w_i = &w[(xi->index-1)*nr_class];
731 					for(m=0;m<active_size_i[i];m++)
732 						G[m] += w_i[alpha_index_i[m]]*(xi->value);
733 					xi++;
734 				}
735 
736 				double minG = INF;
737 				double maxG = -INF;
738 				for(m=0;m<active_size_i[i];m++)
739 				{
740 					if(alpha_i[alpha_index_i[m]] < 0 && G[m] < minG)
741 						minG = G[m];
742 					if(G[m] > maxG)
743 						maxG = G[m];
744 				}
745 				if(y_index[i] < active_size_i[i])
746 					if(alpha_i[(int) prob->y[i]] < C[GETI(i)] && G[y_index[i]] < minG)
747 						minG = G[y_index[i]];
748 
749 				for(m=0;m<active_size_i[i];m++)
750 				{
751 					if(be_shrunk(i, m, y_index[i], alpha_i[alpha_index_i[m]], minG))
752 					{
753 						active_size_i[i]--;
754 						while(active_size_i[i]>m)
755 						{
756 							if(!be_shrunk(i, active_size_i[i], y_index[i],
757 											alpha_i[alpha_index_i[active_size_i[i]]], minG))
758 							{
759 								swap(alpha_index_i[m], alpha_index_i[active_size_i[i]]);
760 								swap(G[m], G[active_size_i[i]]);
761 								if(y_index[i] == active_size_i[i])
762 									y_index[i] = m;
763 								else if(y_index[i] == m)
764 									y_index[i] = active_size_i[i];
765 								break;
766 							}
767 							active_size_i[i]--;
768 						}
769 					}
770 				}
771 
772 				if(active_size_i[i] <= 1)
773 				{
774 					active_size--;
775 					swap(index[s], index[active_size]);
776 					s--;
777 					continue;
778 				}
779 
780 				if(maxG-minG <= 1e-12)
781 					continue;
782 				else
783 					stopping = max(maxG - minG, stopping);
784 
785 				for(m=0;m<active_size_i[i];m++)
786 					B[m] = G[m] - Ai*alpha_i[alpha_index_i[m]] ;
787 
788 				solve_sub_problem(Ai, y_index[i], C[GETI(i)], active_size_i[i], alpha_new);
789 				int nz_d = 0;
790 				for(m=0;m<active_size_i[i];m++)
791 				{
792 					double d = alpha_new[m] - alpha_i[alpha_index_i[m]];
793 					alpha_i[alpha_index_i[m]] = alpha_new[m];
794 					if(fabs(d) >= 1e-12)
795 					{
796 						d_ind[nz_d] = alpha_index_i[m];
797 						d_val[nz_d] = d;
798 						nz_d++;
799 					}
800 				}
801 
802 				xi = prob->x[i];
803 				while(xi->index != -1)
804 				{
805 					double *w_i = &w[(xi->index-1)*nr_class];
806 					for(m=0;m<nz_d;m++)
807 						w_i[d_ind[m]] += d_val[m]*xi->value;
808 					xi++;
809 				}
810 			}
811 		}
812 
813 		iter++;
814 		if(iter % 10 == 0)
815 		{
816 			info(".");
817 		}
818 
819 		if(stopping < eps_shrink)
820 		{
821 			if(stopping < eps && start_from_all == true)
822 				break;
823 			else
824 			{
825 				active_size = l;
826 				for(i=0;i<l;i++)
827 					active_size_i[i] = nr_class;
828 				info("*");
829 				eps_shrink = max(eps_shrink/2, eps);
830 				start_from_all = true;
831 			}
832 		}
833 		else
834 			start_from_all = false;
835 	}
836 
837 	info("\noptimization finished, #iter = %d\n",iter);
838 	if (iter >= max_iter)
839 		info("\nWARNING: reaching max number of iterations\n");
840 
841 	// calculate objective value
842 	double v = 0;
843 	int nSV = 0;
844 	for(i=0;i<w_size*nr_class;i++)
845 		v += w[i]*w[i];
846 	v = 0.5*v;
847 	for(i=0;i<l*nr_class;i++)
848 	{
849 		v += alpha[i];
850 		if(fabs(alpha[i]) > 0)
851 			nSV++;
852 	}
853 	for(i=0;i<l;i++)
854 		v -= alpha[i*nr_class+(int)prob->y[i]];
855 	info("Objective value = %lf\n",v);
856 	info("nSV = %d\n",nSV);
857 
858 	delete [] alpha;
859 	delete [] alpha_new;
860 	delete [] index;
861 	delete [] QD;
862 	delete [] d_ind;
863 	delete [] d_val;
864 	delete [] alpha_index;
865 	delete [] y_index;
866 	delete [] active_size_i;
867 }
868 
869 // A coordinate descent algorithm for
870 // L1-loss and L2-loss SVM dual problems
871 //
872 //  min_\alpha  0.5(\alpha^T (Q + D)\alpha) - e^T \alpha,
873 //    s.t.      0 <= \alpha_i <= upper_bound_i,
874 //
875 //  where Qij = yi yj xi^T xj and
876 //  D is a diagonal matrix
877 //
878 // In L1-SVM case:
879 //              upper_bound_i = Cp if y_i = 1
880 //              upper_bound_i = Cn if y_i = -1
881 //              D_ii = 0
882 // In L2-SVM case:
883 //              upper_bound_i = INF
884 //              D_ii = 1/(2*Cp) if y_i = 1
885 //              D_ii = 1/(2*Cn) if y_i = -1
886 //
887 // Given:
888 // x, y, Cp, Cn
889 // eps is the stopping tolerance
890 //
891 // solution will be put in w
892 //
893 // this function returns the number of iterations
894 //
895 // See Algorithm 3 of Hsieh et al., ICML 2008
896 
897 #undef GETI
898 #define GETI(i) (y[i]+1)
899 // To support weights for instances, use GETI(i) (i)
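// y[i] is -1 or +1, so GETI(i) = y[i]+1 is 0 or 2 and indexes the 3-element
// diag[] and upper_bound[] arrays below (entry 1 is unused padding).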
900 
901 static int solve_l2r_l1l2_svc(const problem *prob, const parameter *param, double *w, double Cp, double Cn, int max_iter=300)
902 {
903 	int l = prob->l;
904 	int w_size = prob->n;
905 	double eps = param->eps;
906 	int solver_type = param->solver_type;
907 	int i, s, iter = 0;
908 	double C, d, G;
909 	double *QD = new double[l];
910 	int *index = new int[l];
911 	double *alpha = new double[l];
912 	schar *y = new schar[l];
913 	int active_size = l;
914 
915 	// PG: projected gradient, for shrinking and stopping
916 	double PG;
917 	double PGmax_old = INF;
918 	double PGmin_old = -INF;
919 	double PGmax_new, PGmin_new;
920 
921 	// default solver_type: L2R_L2LOSS_SVC_DUAL
922 	double diag[3] = {0.5/Cn, 0, 0.5/Cp};
923 	double upper_bound[3] = {INF, 0, INF};
924 	if(solver_type == L2R_L1LOSS_SVC_DUAL)
925 	{
926 		diag[0] = 0;
927 		diag[2] = 0;
928 		upper_bound[0] = Cn;
929 		upper_bound[2] = Cp;
930 	}
931 
932 	for(i=0; i<l; i++)
933 	{
934 		if(prob->y[i] > 0)
935 		{
936 			y[i] = +1;
937 		}
938 		else
939 		{
940 			y[i] = -1;
941 		}
942 	}
943 
944 	// Initial alpha can be set here. Note that
945 	// 0 <= alpha[i] <= upper_bound[GETI(i)]
946 	for(i=0; i<l; i++)
947 		alpha[i] = 0;
948 
949 	for(i=0; i<w_size; i++)
950 		w[i] = 0;
951 	for(i=0; i<l; i++)
952 	{
953 		QD[i] = diag[GETI(i)];
954 
955 		feature_node * const xi = prob->x[i];
956 		QD[i] += sparse_operator::nrm2_sq(xi);
957 		sparse_operator::axpy(y[i]*alpha[i], xi, w);
958 
959 		index[i] = i;
960 	}
961 
962 	while (iter < max_iter)
963 	{
964 		PGmax_new = -INF;
965 		PGmin_new = INF;
966 
967 		for (i=0; i<active_size; i++)
968 		{
969 			int j = i+rand()%(active_size-i);
970 			swap(index[i], index[j]);
971 		}
972 
973 		for (s=0; s<active_size; s++)
974 		{
975 			i = index[s];
976 			const schar yi = y[i];
977 			feature_node * const xi = prob->x[i];
978 
979 			G = yi*sparse_operator::dot(w, xi)-1;
980 
981 			C = upper_bound[GETI(i)];
982 			G += alpha[i]*diag[GETI(i)];
983 
984 			PG = 0;
985 			if (alpha[i] == 0)
986 			{
987 				if (G > PGmax_old)
988 				{
989 					active_size--;
990 					swap(index[s], index[active_size]);
991 					s--;
992 					continue;
993 				}
994 				else if (G < 0)
995 					PG = G;
996 			}
997 			else if (alpha[i] == C)
998 			{
999 				if (G < PGmin_old)
1000 				{
1001 					active_size--;
1002 					swap(index[s], index[active_size]);
1003 					s--;
1004 					continue;
1005 				}
1006 				else if (G > 0)
1007 					PG = G;
1008 			}
1009 			else
1010 				PG = G;
1011 
1012 			PGmax_new = max(PGmax_new, PG);
1013 			PGmin_new = min(PGmin_new, PG);
1014 
1015 			if(fabs(PG) > 1.0e-12)
1016 			{
1017 				double alpha_old = alpha[i];
1018 				alpha[i] = min(max(alpha[i] - G/QD[i], 0.0), C);
1019 				d = (alpha[i] - alpha_old)*yi;
1020 				sparse_operator::axpy(d, xi, w);
1021 			}
1022 		}
1023 
1024 		iter++;
1025 		if(iter % 10 == 0)
1026 			info(".");
1027 
1028 		if(PGmax_new - PGmin_new <= eps &&
1029 			fabs(PGmax_new) <= eps && fabs(PGmin_new) <= eps)
1030 		{
1031 			if(active_size == l)
1032 				break;
1033 			else
1034 			{
1035 				active_size = l;
1036 				info("*");
1037 				PGmax_old = INF;
1038 				PGmin_old = -INF;
1039 				continue;
1040 			}
1041 		}
1042 		PGmax_old = PGmax_new;
1043 		PGmin_old = PGmin_new;
1044 		if (PGmax_old <= 0)
1045 			PGmax_old = INF;
1046 		if (PGmin_old >= 0)
1047 			PGmin_old = -INF;
1048 	}
1049 
1050 	info("\noptimization finished, #iter = %d\n",iter);
1051 
1052 	// calculate objective value
1053 
1054 	double v = 0;
1055 	int nSV = 0;
1056 	for(i=0; i<w_size; i++)
1057 		v += w[i]*w[i];
1058 	for(i=0; i<l; i++)
1059 	{
1060 		v += alpha[i]*(alpha[i]*diag[GETI(i)] - 2);
1061 		if(alpha[i] > 0)
1062 			++nSV;
1063 	}
1064 	info("Objective value = %lf\n",v/2);
1065 	info("nSV = %d\n",nSV);
1066 
1067 	delete [] QD;
1068 	delete [] alpha;
1069 	delete [] y;
1070 	delete [] index;
1071 
1072 	return iter;
1073 }
1074 
1075 
1076 // A coordinate descent algorithm for
1077 // L1-loss and L2-loss epsilon-SVR dual problem
1078 //
1079 //  min_\beta  0.5\beta^T (Q + diag(lambda)) \beta + p \sum_{i=1}^l|\beta_i| - \sum_{i=1}^l y_i\beta_i,
1080 //    s.t.      -upper_bound_i <= \beta_i <= upper_bound_i,
1081 //
1082 //  where Qij = xi^T xj and
1083 //  diag(lambda) is a diagonal matrix with entries lambda_i
1084 //
1085 // In L1-loss SVR case:
1086 //              upper_bound_i = C
1087 //              lambda_i = 0
1088 // In L2-loss SVR case:
1089 //              upper_bound_i = INF
1090 //              lambda_i = 1/(2*C)
1091 //
1092 // Given:
1093 // x, y, p, C
1094 // eps is the stopping tolerance
1095 //
1096 // solution will be put in w
1097 //
1098 // this function returns the number of iterations
1099 //
1100 // See Algorithm 4 of Ho and Lin, 2012
1101 
1102 #undef GETI
1103 #define GETI(i) (0)
1104 // To support weights for instances, use GETI(i) (i)
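// GETI(i) is 0 here because epsilon-SVR uses one C shared by all instances,
// so lambda[] and upper_bound[] below have a single entry.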
1105 
1106 static int solve_l2r_l1l2_svr(const problem *prob, const parameter *param, double *w, int max_iter=300)
1107 {
1108 	const int solver_type = param->solver_type;
1109 	int l = prob->l;
1110 	double C = param->C;
1111 	double p = param->p;
1112 	int w_size = prob->n;
1113 	double eps = param->eps;
1114 	int i, s, iter = 0;
1115 	int active_size = l;
1116 	int *index = new int[l];
1117 
1118 	double d, G, H;
1119 	double Gmax_old = INF;
1120 	double Gmax_new, Gnorm1_new;
1121 	double Gnorm1_init = -1.0; // Gnorm1_init is initialized at the first iteration
1122 	double *beta = new double[l];
1123 	double *QD = new double[l];
1124 	double *y = prob->y;
1125 
1126 	// L2R_L2LOSS_SVR_DUAL
1127 	double lambda[1], upper_bound[1];
1128 	lambda[0] = 0.5/C;
1129 	upper_bound[0] = INF;
1130 
1131 	if(solver_type == L2R_L1LOSS_SVR_DUAL)
1132 	{
1133 		lambda[0] = 0;
1134 		upper_bound[0] = C;
1135 	}
1136 
1137 	// Initial beta can be set here. Note that
1138 	// -upper_bound <= beta[i] <= upper_bound
1139 	for(i=0; i<l; i++)
1140 		beta[i] = 0;
1141 
1142 	for(i=0; i<w_size; i++)
1143 		w[i] = 0;
1144 	for(i=0; i<l; i++)
1145 	{
1146 		feature_node * const xi = prob->x[i];
1147 		QD[i] = sparse_operator::nrm2_sq(xi);
1148 		sparse_operator::axpy(beta[i], xi, w);
1149 
1150 		index[i] = i;
1151 	}
1152 
1153 
1154 	while(iter < max_iter)
1155 	{
1156 		Gmax_new = 0;
1157 		Gnorm1_new = 0;
1158 
1159 		for(i=0; i<active_size; i++)
1160 		{
1161 			int j = i+rand()%(active_size-i);
1162 			swap(index[i], index[j]);
1163 		}
1164 
1165 		for(s=0; s<active_size; s++)
1166 		{
1167 			i = index[s];
1168 			G = -y[i] + lambda[GETI(i)]*beta[i];
1169 			H = QD[i] + lambda[GETI(i)];
1170 
1171 			feature_node * const xi = prob->x[i];
1172 			G += sparse_operator::dot(w, xi);
1173 
1174 			double Gp = G+p;
1175 			double Gn = G-p;
1176 			double violation = 0;
1177 			if(beta[i] == 0)
1178 			{
1179 				if(Gp < 0)
1180 					violation = -Gp;
1181 				else if(Gn > 0)
1182 					violation = Gn;
1183 				else if(Gp>Gmax_old && Gn<-Gmax_old)
1184 				{
1185 					active_size--;
1186 					swap(index[s], index[active_size]);
1187 					s--;
1188 					continue;
1189 				}
1190 			}
1191 			else if(beta[i] >= upper_bound[GETI(i)])
1192 			{
1193 				if(Gp > 0)
1194 					violation = Gp;
1195 				else if(Gp < -Gmax_old)
1196 				{
1197 					active_size--;
1198 					swap(index[s], index[active_size]);
1199 					s--;
1200 					continue;
1201 				}
1202 			}
1203 			else if(beta[i] <= -upper_bound[GETI(i)])
1204 			{
1205 				if(Gn < 0)
1206 					violation = -Gn;
1207 				else if(Gn > Gmax_old)
1208 				{
1209 					active_size--;
1210 					swap(index[s], index[active_size]);
1211 					s--;
1212 					continue;
1213 				}
1214 			}
1215 			else if(beta[i] > 0)
1216 				violation = fabs(Gp);
1217 			else
1218 				violation = fabs(Gn);
1219 
1220 			Gmax_new = max(Gmax_new, violation);
1221 			Gnorm1_new += violation;
1222 
1223 			// obtain Newton direction d
1224 			if(Gp < H*beta[i])
1225 				d = -Gp/H;
1226 			else if(Gn > H*beta[i])
1227 				d = -Gn/H;
1228 			else
1229 				d = -beta[i];
1230 
1231 			if(fabs(d) < 1.0e-12)
1232 				continue;
1233 
1234 			double beta_old = beta[i];
1235 			beta[i] = min(max(beta[i]+d, -upper_bound[GETI(i)]), upper_bound[GETI(i)]);
1236 			d = beta[i]-beta_old;
1237 
1238 			if(d != 0)
1239 				sparse_operator::axpy(d, xi, w);
1240 		}
1241 
1242 		if(iter == 0)
1243 			Gnorm1_init = Gnorm1_new;
1244 		iter++;
1245 		if(iter % 10 == 0)
1246 			info(".");
1247 
1248 		if(Gnorm1_new <= eps*Gnorm1_init)
1249 		{
1250 			if(active_size == l)
1251 				break;
1252 			else
1253 			{
1254 				active_size = l;
1255 				info("*");
1256 				Gmax_old = INF;
1257 				continue;
1258 			}
1259 		}
1260 
1261 		Gmax_old = Gmax_new;
1262 	}
1263 
1264 	info("\noptimization finished, #iter = %d\n", iter);
1265 
1266 	// calculate objective value
1267 	double v = 0;
1268 	int nSV = 0;
1269 	for(i=0; i<w_size; i++)
1270 		v += w[i]*w[i];
1271 	v = 0.5*v;
1272 	for(i=0; i<l; i++)
1273 	{
1274 		v += p*fabs(beta[i]) - y[i]*beta[i] + 0.5*lambda[GETI(i)]*beta[i]*beta[i];
1275 		if(beta[i] != 0)
1276 			nSV++;
1277 	}
1278 
1279 	info("Objective value = %lf\n", v);
1280 	info("nSV = %d\n",nSV);
1281 
1282 	delete [] beta;
1283 	delete [] QD;
1284 	delete [] index;
1285 
1286 	return iter;
1287 }
1288 
1289 
1290 // A coordinate descent algorithm for
1291 // the dual of L2-regularized logistic regression problems
1292 //
1293 //  min_\alpha  0.5(\alpha^T Q \alpha) + \sum \alpha_i log (\alpha_i) + (upper_bound_i - \alpha_i) log (upper_bound_i - \alpha_i),
1294 //    s.t.      0 <= \alpha_i <= upper_bound_i,
1295 //
1296 //  where Qij = yi yj xi^T xj and
1297 //  upper_bound_i = Cp if y_i = 1
1298 //  upper_bound_i = Cn if y_i = -1
1299 //
1300 // Given:
1301 // x, y, Cp, Cn
1302 // eps is the stopping tolerance
1303 //
1304 // solution will be put in w
1305 //
1306 // this function returns the number of iterations
1307 //
1308 // See Algorithm 5 of Yu et al., MLJ 2010
1309 
1310 #undef GETI
1311 #define GETI(i) (y[i]+1)
1312 // To support weights for instances, use GETI(i) (i)
1313 
1314 static int solve_l2r_lr_dual(const problem *prob, const parameter *param, double *w, double Cp, double Cn, int max_iter=300)
1315 {
1316 	int l = prob->l;
1317 	int w_size = prob->n;
1318 	double eps = param->eps;
1319 	int i, s, iter = 0;
1320 	double *xTx = new double[l];
1321 	int *index = new int[l];
1322 	double *alpha = new double[2*l]; // store alpha and C - alpha
1323 	schar *y = new schar[l];
1324 	int max_inner_iter = 100; // for inner Newton
1325 	double innereps = 1e-2;
1326 	double innereps_min = min(1e-8, eps);
1327 	double upper_bound[3] = {Cn, 0, Cp};
1328 
1329 	for(i=0; i<l; i++)
1330 	{
1331 		if(prob->y[i] > 0)
1332 		{
1333 			y[i] = +1;
1334 		}
1335 		else
1336 		{
1337 			y[i] = -1;
1338 		}
1339 	}
1340 
1341 	// Initial alpha can be set here. Note that
1342 	// 0 < alpha[i] < upper_bound[GETI(i)]
1343 	// alpha[2*i] + alpha[2*i+1] = upper_bound[GETI(i)]
1344 	for(i=0; i<l; i++)
1345 	{
1346 		alpha[2*i] = min(0.001*upper_bound[GETI(i)], 1e-8);
1347 		alpha[2*i+1] = upper_bound[GETI(i)] - alpha[2*i];
1348 	}
1349 
1350 	for(i=0; i<w_size; i++)
1351 		w[i] = 0;
1352 	for(i=0; i<l; i++)
1353 	{
1354 		feature_node * const xi = prob->x[i];
1355 		xTx[i] = sparse_operator::nrm2_sq(xi);
1356 		sparse_operator::axpy(y[i]*alpha[2*i], xi, w);
1357 		index[i] = i;
1358 	}
1359 
1360 	while (iter < max_iter)
1361 	{
1362 		for (i=0; i<l; i++)
1363 		{
1364 			int j = i+rand()%(l-i);
1365 			swap(index[i], index[j]);
1366 		}
1367 		int newton_iter = 0;
1368 		double Gmax = 0;
1369 		for (s=0; s<l; s++)
1370 		{
1371 			i = index[s];
1372 			const schar yi = y[i];
1373 			double C = upper_bound[GETI(i)];
1374 			double ywTx = 0, xisq = xTx[i];
1375 			feature_node * const xi = prob->x[i];
1376 			ywTx = yi*sparse_operator::dot(w, xi);
1377 			double a = xisq, b = ywTx;
1378 
1379 			// Decide to minimize g_1(z) or g_2(z)
1380 			int ind1 = 2*i, ind2 = 2*i+1, sign = 1;
1381 			if(0.5*a*(alpha[ind2]-alpha[ind1])+b < 0)
1382 			{
1383 				ind1 = 2*i+1;
1384 				ind2 = 2*i;
1385 				sign = -1;
1386 			}
1387 
1388 			//  g_t(z) = z*log(z) + (C-z)*log(C-z) + 0.5a(z-alpha_old)^2 + sign*b(z-alpha_old)
1389 			double alpha_old = alpha[ind1];
1390 			double z = alpha_old;
1391 			if(C - z < 0.5 * C)
1392 				z = 0.1*z;
1393 			double gp = a*(z-alpha_old)+sign*b+log(z/(C-z));
1394 			Gmax = max(Gmax, fabs(gp));
1395 
1396 			// Newton method on the sub-problem
1397 			const double eta = 0.1; // xi in the paper
1398 			int inner_iter = 0;
1399 			while (inner_iter <= max_inner_iter)
1400 			{
1401 				if(fabs(gp) < innereps)
1402 					break;
1403 				double gpp = a + C/(C-z)/z;
1404 				double tmpz = z - gp/gpp;
1405 				if(tmpz <= 0)
1406 					z *= eta;
1407 				else // tmpz in (0, C)
1408 					z = tmpz;
1409 				gp = a*(z-alpha_old)+sign*b+log(z/(C-z));
1410 				newton_iter++;
1411 				inner_iter++;
1412 			}
1413 
1414 			if(inner_iter > 0) // update w
1415 			{
1416 				alpha[ind1] = z;
1417 				alpha[ind2] = C-z;
1418 				sparse_operator::axpy(sign*(z-alpha_old)*yi, xi, w);
1419 			}
1420 		}
1421 
1422 		iter++;
1423 		if(iter % 10 == 0)
1424 			info(".");
1425 
1426 		if(Gmax < eps)
1427 			break;
1428 
1429 		if(newton_iter <= l/10)
1430 			innereps = max(innereps_min, 0.1*innereps);
1431 
1432 	}
1433 
1434 	info("\noptimization finished, #iter = %d\n",iter);
1435 
1436 	// calculate objective value
1437 
1438 	double v = 0;
1439 	for(i=0; i<w_size; i++)
1440 		v += w[i] * w[i];
1441 	v *= 0.5;
1442 	for(i=0; i<l; i++)
1443 		v += alpha[2*i] * log(alpha[2*i]) + alpha[2*i+1] * log(alpha[2*i+1])
1444 			- upper_bound[GETI(i)] * log(upper_bound[GETI(i)]);
1445 	info("Objective value = %lf\n", v);
1446 
1447 	delete [] xTx;
1448 	delete [] alpha;
1449 	delete [] y;
1450 	delete [] index;
1451 
1452 	return iter;
1453 }
1454 
1455 // A coordinate descent algorithm for
1456 // L1-regularized L2-loss support vector classification
1457 //
1458 //  min_w \sum |wj| + C \sum max(0, 1-yi w^T xi)^2,
1459 //
1460 // Given:
1461 // x, y, Cp, Cn
1462 // eps is the stopping tolerance
1463 //
1464 // solution will be put in w
1465 //
1466 // this function returns the number of iterations
1467 //
1468 // See Yuan et al. (2010) and appendix of LIBLINEAR paper, Fan et al. (2008)
1469 //
1470 // To not regularize the bias (i.e., regularize_bias = 0), a constant feature = 1
1471 // must have been added to the original data. (see -B and -R option)
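// prob_col stores the data in column (feature-major) format: prob_col->x[j]
// lists the instances containing feature j, which is what makes coordinate
// descent over the weights w_j efficient here.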
1472 
1473 #undef GETI
1474 #define GETI(i) (y[i]+1)
1475 // To support weights for instances, use GETI(i) (i)
1476 
1477 static int solve_l1r_l2_svc(const problem *prob_col, const parameter* param, double *w, double Cp, double Cn, double eps)
1478 {
1479 	int l = prob_col->l;
1480 	int w_size = prob_col->n;
1481 	int regularize_bias = param->regularize_bias;
1482 	int j, s, iter = 0;
1483 	int max_iter = 1000;
1484 	int active_size = w_size;
1485 	int max_num_linesearch = 20;
1486 
1487 	double sigma = 0.01;
1488 	double d, G_loss, G, H;
1489 	double Gmax_old = INF;
1490 	double Gmax_new, Gnorm1_new;
1491 	double Gnorm1_init = -1.0; // Gnorm1_init is initialized at the first iteration
1492 	double d_old, d_diff;
1493 	double loss_old = 0, loss_new;
1494 	double appxcond, cond;
1495 
1496 	int *index = new int[w_size];
1497 	schar *y = new schar[l];
1498 	double *b = new double[l]; // b = 1-ywTx
1499 	double *xj_sq = new double[w_size];
1500 	feature_node *x;
1501 
1502 	double C[3] = {Cn,0,Cp};
1503 
1504 	// Initial w can be set here.
1505 	for(j=0; j<w_size; j++)
1506 		w[j] = 0;
1507 
1508 	for(j=0; j<l; j++)
1509 	{
1510 		b[j] = 1;
1511 		if(prob_col->y[j] > 0)
1512 			y[j] = 1;
1513 		else
1514 			y[j] = -1;
1515 	}
1516 	for(j=0; j<w_size; j++)
1517 	{
1518 		index[j] = j;
1519 		xj_sq[j] = 0;
1520 		x = prob_col->x[j];
1521 		while(x->index != -1)
1522 		{
1523 			int ind = x->index-1;
1524 			x->value *= y[ind]; // x->value stores yi*xij
1525 			double val = x->value;
1526 			b[ind] -= w[j]*val;
1527 			xj_sq[j] += C[GETI(ind)]*val*val;
1528 			x++;
1529 		}
1530 	}
1531 
1532 	while(iter < max_iter)
1533 	{
1534 		Gmax_new = 0;
1535 		Gnorm1_new = 0;
1536 
1537 		for(j=0; j<active_size; j++)
1538 		{
1539 			int i = j+rand()%(active_size-j);
1540 			swap(index[i], index[j]);
1541 		}
1542 
1543 		for(s=0; s<active_size; s++)
1544 		{
1545 			j = index[s];
1546 			G_loss = 0;
1547 			H = 0;
1548 
1549 			x = prob_col->x[j];
1550 			while(x->index != -1)
1551 			{
1552 				int ind = x->index-1;
1553 				if(b[ind] > 0)
1554 				{
1555 					double val = x->value;
1556 					double tmp = C[GETI(ind)]*val;
1557 					G_loss -= tmp*b[ind];
1558 					H += tmp*val;
1559 				}
1560 				x++;
1561 			}
1562 			G_loss *= 2;
1563 
1564 			G = G_loss;
1565 			H *= 2;
1566 			H = max(H, 1e-12);
1567 
1568 			double violation = 0;
1569 			double Gp = 0, Gn = 0;
1570 			if(j == w_size-1 && regularize_bias == 0)
1571 				violation = fabs(G);
1572 			else
1573 			{
1574 				Gp = G+1;
1575 				Gn = G-1;
1576 				if(w[j] == 0)
1577 				{
1578 					if(Gp < 0)
1579 						violation = -Gp;
1580 					else if(Gn > 0)
1581 						violation = Gn;
1582 					else if(Gp>Gmax_old/l && Gn<-Gmax_old/l)
1583 					{
1584 						active_size--;
1585 						swap(index[s], index[active_size]);
1586 						s--;
1587 						continue;
1588 					}
1589 				}
1590 				else if(w[j] > 0)
1591 					violation = fabs(Gp);
1592 				else
1593 					violation = fabs(Gn);
1594 			}
1595 			Gmax_new = max(Gmax_new, violation);
1596 			Gnorm1_new += violation;
1597 
1598 			// obtain Newton direction d
1599 			if(j == w_size-1 && regularize_bias == 0)
1600 				d = -G/H;
1601 			else
1602 			{
1603 				if(Gp < H*w[j])
1604 					d = -Gp/H;
1605 				else if(Gn > H*w[j])
1606 					d = -Gn/H;
1607 				else
1608 					d = -w[j];
1609 			}
1610 
1611 			if(fabs(d) < 1.0e-12)
1612 				continue;
1613 
1614 			double delta;
1615 			if(j == w_size-1 && regularize_bias == 0)
1616 				delta = G*d;
1617 			else
1618 				delta = fabs(w[j]+d)-fabs(w[j]) + G*d;
1619 			d_old = 0;
1620 			int num_linesearch;
1621 			for(num_linesearch=0; num_linesearch < max_num_linesearch; num_linesearch++)
1622 			{
1623 				d_diff = d_old - d;
1624 				if(j == w_size-1 && regularize_bias == 0)
1625 					cond = -sigma*delta;
1626 				else
1627 					cond = fabs(w[j]+d)-fabs(w[j]) - sigma*delta;
1628 
1629 				appxcond = xj_sq[j]*d*d + G_loss*d + cond;
1630 				if(appxcond <= 0)
1631 				{
1632 					x = prob_col->x[j];
1633 					sparse_operator::axpy(d_diff, x, b);
1634 					break;
1635 				}
1636 
1637 				if(num_linesearch == 0)
1638 				{
1639 					loss_old = 0;
1640 					loss_new = 0;
1641 					x = prob_col->x[j];
1642 					while(x->index != -1)
1643 					{
1644 						int ind = x->index-1;
1645 						if(b[ind] > 0)
1646 							loss_old += C[GETI(ind)]*b[ind]*b[ind];
1647 						double b_new = b[ind] + d_diff*x->value;
1648 						b[ind] = b_new;
1649 						if(b_new > 0)
1650 							loss_new += C[GETI(ind)]*b_new*b_new;
1651 						x++;
1652 					}
1653 				}
1654 				else
1655 				{
1656 					loss_new = 0;
1657 					x = prob_col->x[j];
1658 					while(x->index != -1)
1659 					{
1660 						int ind = x->index-1;
1661 						double b_new = b[ind] + d_diff*x->value;
1662 						b[ind] = b_new;
1663 						if(b_new > 0)
1664 							loss_new += C[GETI(ind)]*b_new*b_new;
1665 						x++;
1666 					}
1667 				}
1668 
1669 				cond = cond + loss_new - loss_old;
1670 				if(cond <= 0)
1671 					break;
1672 				else
1673 				{
1674 					d_old = d;
1675 					d *= 0.5;
1676 					delta *= 0.5;
1677 				}
1678 			}
1679 
1680 			w[j] += d;
1681 
1682 			// recompute b[] if line search takes too many steps
1683 			if(num_linesearch >= max_num_linesearch)
1684 			{
1685 				info("#");
1686 				for(int i=0; i<l; i++)
1687 					b[i] = 1;
1688 
1689 				for(int i=0; i<w_size; i++)
1690 				{
1691 					if(w[i]==0) continue;
1692 					x = prob_col->x[i];
1693 					sparse_operator::axpy(-w[i], x, b);
1694 				}
1695 			}
1696 		}
1697 
1698 		if(iter == 0)
1699 			Gnorm1_init = Gnorm1_new;
1700 		iter++;
1701 		if(iter % 10 == 0)
1702 			info(".");
1703 
1704 		if(Gnorm1_new <= eps*Gnorm1_init)
1705 		{
1706 			if(active_size == w_size)
1707 				break;
1708 			else
1709 			{
1710 				active_size = w_size;
1711 				info("*");
1712 				Gmax_old = INF;
1713 				continue;
1714 			}
1715 		}
1716 
1717 		Gmax_old = Gmax_new;
1718 	}
1719 
1720 	info("\noptimization finished, #iter = %d\n", iter);
1721 	if(iter >= max_iter)
1722 		info("\nWARNING: reaching max number of iterations\n");
1723 
1724 	// calculate objective value
1725 
1726 	double v = 0;
1727 	int nnz = 0;
1728 	for(j=0; j<w_size; j++)
1729 	{
1730 		x = prob_col->x[j];
1731 		while(x->index != -1)
1732 		{
1733 			x->value *= prob_col->y[x->index-1]; // restore x->value
1734 			x++;
1735 		}
1736 		if(w[j] != 0)
1737 		{
1738 			v += fabs(w[j]);
1739 			nnz++;
1740 		}
1741 	}
1742 	if (regularize_bias == 0)
1743 		v -= fabs(w[w_size-1]);
1744 	for(j=0; j<l; j++)
1745 		if(b[j] > 0)
1746 			v += C[GETI(j)]*b[j]*b[j];
1747 
1748 	info("Objective value = %lf\n", v);
1749 	info("#nonzeros/#features = %d/%d\n", nnz, w_size);
1750 
1751 	delete [] index;
1752 	delete [] y;
1753 	delete [] b;
1754 	delete [] xj_sq;
1755 
1756 	return iter;
1757 }
1758 
1759 // A coordinate descent algorithm for
1760 // L1-regularized logistic regression problems
1761 //
1762 //  min_w \sum |wj| + C \sum log(1+exp(-yi w^T xi)),
1763 //
1764 // Given:
1765 // x, y, Cp, Cn
1766 // eps is the stopping tolerance
1767 //
1768 // solution will be put in w
1769 //
1770 // this function returns the number of iterations
1771 //
1772 // See Yuan et al. (2011) and appendix of LIBLINEAR paper, Fan et al. (2008)
1773 //
1774 // To not regularize the bias (i.e., regularize_bias = 0), a constant feature = 1
1775 // must have been added to the original data. (see -B and -R option)
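// Two-level structure (Yuan et al., 2011): each outer Newton iteration builds a
// quadratic approximation of the logistic loss (Hdiag, Grad), an inner
// coordinate descent loop solves the resulting L1-regularized QP over wpd, and
// a line search on the true objective decides how much of the step to accept.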
1776 
1777 #undef GETI
1778 #define GETI(i) (y[i]+1)
1779 // To support weights for instances, use GETI(i) (i)
1780 
1781 static int solve_l1r_lr(const problem *prob_col, const parameter *param, double *w, double Cp, double Cn, double eps)
1782 {
1783 	int l = prob_col->l;
1784 	int w_size = prob_col->n;
1785 	int regularize_bias = param->regularize_bias;
1786 	int j, s, newton_iter=0, iter=0;
1787 	int max_newton_iter = 100;
1788 	int max_iter = 1000;
1789 	int max_num_linesearch = 20;
1790 	int active_size;
1791 	int QP_active_size;
1792 
1793 	double nu = 1e-12;
1794 	double inner_eps = 1;
1795 	double sigma = 0.01;
1796 	double w_norm, w_norm_new;
1797 	double z, G, H;
1798 	double Gnorm1_init = -1.0; // Gnorm1_init is initialized at the first iteration
1799 	double Gmax_old = INF;
1800 	double Gmax_new, Gnorm1_new;
1801 	double QP_Gmax_old = INF;
1802 	double QP_Gmax_new, QP_Gnorm1_new;
1803 	double delta, negsum_xTd, cond;
1804 
1805 	int *index = new int[w_size];
1806 	schar *y = new schar[l];
1807 	double *Hdiag = new double[w_size];
1808 	double *Grad = new double[w_size];
1809 	double *wpd = new double[w_size];
1810 	double *xjneg_sum = new double[w_size];
1811 	double *xTd = new double[l];
1812 	double *exp_wTx = new double[l];
1813 	double *exp_wTx_new = new double[l];
1814 	double *tau = new double[l];
1815 	double *D = new double[l];
1816 	feature_node *x;
1817 
1818 	double C[3] = {Cn,0,Cp};
1819 
1820 	// Initial w can be set here.
1821 	for(j=0; j<w_size; j++)
1822 		w[j] = 0;
1823 
1824 	for(j=0; j<l; j++)
1825 	{
1826 		if(prob_col->y[j] > 0)
1827 			y[j] = 1;
1828 		else
1829 			y[j] = -1;
1830 
1831 		exp_wTx[j] = 0;
1832 	}
1833 
1834 	w_norm = 0;
1835 	for(j=0; j<w_size; j++)
1836 	{
1837 		w_norm += fabs(w[j]);
1838 		wpd[j] = w[j];
1839 		index[j] = j;
1840 		xjneg_sum[j] = 0;
1841 		x = prob_col->x[j];
1842 		while(x->index != -1)
1843 		{
1844 			int ind = x->index-1;
1845 			double val = x->value;
1846 			exp_wTx[ind] += w[j]*val;
1847 			if(y[ind] == -1)
1848 				xjneg_sum[j] += C[GETI(ind)]*val;
1849 			x++;
1850 		}
1851 	}
1852 	if (regularize_bias == 0)
1853 		w_norm -= fabs(w[w_size-1]);
1854 
1855 	for(j=0; j<l; j++)
1856 	{
1857 		exp_wTx[j] = exp(exp_wTx[j]);
1858 		double tau_tmp = 1/(1+exp_wTx[j]);
1859 		tau[j] = C[GETI(j)]*tau_tmp;
1860 		D[j] = C[GETI(j)]*exp_wTx[j]*tau_tmp*tau_tmp;
1861 	}
1862 
1863 	while(newton_iter < max_newton_iter)
1864 	{
1865 		Gmax_new = 0;
1866 		Gnorm1_new = 0;
1867 		active_size = w_size;
1868 
1869 		for(s=0; s<active_size; s++)
1870 		{
1871 			j = index[s];
1872 			Hdiag[j] = nu;
1873 			Grad[j] = 0;
1874 
1875 			double tmp = 0;
1876 			x = prob_col->x[j];
1877 			while(x->index != -1)
1878 			{
1879 				int ind = x->index-1;
1880 				Hdiag[j] += x->value*x->value*D[ind];
1881 				tmp += x->value*tau[ind];
1882 				x++;
1883 			}
1884 			Grad[j] = -tmp + xjneg_sum[j];
1885 
1886 			double violation = 0;
1887 			if (j == w_size-1 && regularize_bias == 0)
1888 				violation = fabs(Grad[j]);
1889 			else
1890 			{
1891 				double Gp = Grad[j]+1;
1892 				double Gn = Grad[j]-1;
1893 				if(w[j] == 0)
1894 				{
1895 					if(Gp < 0)
1896 						violation = -Gp;
1897 					else if(Gn > 0)
1898 						violation = Gn;
1899 					//outer-level shrinking
1900 					else if(Gp>Gmax_old/l && Gn<-Gmax_old/l)
1901 					{
1902 						active_size--;
1903 						swap(index[s], index[active_size]);
1904 						s--;
1905 						continue;
1906 					}
1907 				}
1908 				else if(w[j] > 0)
1909 					violation = fabs(Gp);
1910 				else
1911 					violation = fabs(Gn);
1912 			}
1913 			Gmax_new = max(Gmax_new, violation);
1914 			Gnorm1_new += violation;
1915 		}
1916 
1917 		if(newton_iter == 0)
1918 			Gnorm1_init = Gnorm1_new;
1919 
1920 		if(Gnorm1_new <= eps*Gnorm1_init)
1921 			break;
1922 
1923 		iter = 0;
1924 		QP_Gmax_old = INF;
1925 		QP_active_size = active_size;
1926 
1927 		for(int i=0; i<l; i++)
1928 			xTd[i] = 0;
1929 
1930 		// optimize QP over wpd
1931 		while(iter < max_iter)
1932 		{
1933 			QP_Gmax_new = 0;
1934 			QP_Gnorm1_new = 0;
1935 
1936 			for(j=0; j<QP_active_size; j++)
1937 			{
1938 				int i = j+rand()%(QP_active_size-j);
1939 				swap(index[i], index[j]);
1940 			}
1941 
1942 			for(s=0; s<QP_active_size; s++)
1943 			{
1944 				j = index[s];
1945 				H = Hdiag[j];
1946 
1947 				x = prob_col->x[j];
1948 				G = Grad[j] + (wpd[j]-w[j])*nu;
1949 				while(x->index != -1)
1950 				{
1951 					int ind = x->index-1;
1952 					G += x->value*D[ind]*xTd[ind];
1953 					x++;
1954 				}
1955 
1956 				double violation = 0;
1957 				if (j == w_size-1 && regularize_bias == 0)
1958 				{
1959 					// bias term not shrunken
1960 					violation = fabs(G);
1961 					z = -G/H;
1962 				}
1963 				else
1964 				{
1965 					double Gp = G+1;
1966 					double Gn = G-1;
1967 					if(wpd[j] == 0)
1968 					{
1969 						if(Gp < 0)
1970 							violation = -Gp;
1971 						else if(Gn > 0)
1972 							violation = Gn;
1973 						//inner-level shrinking
1974 						else if(Gp>QP_Gmax_old/l && Gn<-QP_Gmax_old/l)
1975 						{
1976 							QP_active_size--;
1977 							swap(index[s], index[QP_active_size]);
1978 							s--;
1979 							continue;
1980 						}
1981 					}
1982 					else if(wpd[j] > 0)
1983 						violation = fabs(Gp);
1984 					else
1985 						violation = fabs(Gn);
1986 
1987 					// obtain solution of one-variable problem
1988 					if(Gp < H*wpd[j])
1989 						z = -Gp/H;
1990 					else if(Gn > H*wpd[j])
1991 						z = -Gn/H;
1992 					else
1993 						z = -wpd[j];
1994 				}
1995 				QP_Gmax_new = max(QP_Gmax_new, violation);
1996 				QP_Gnorm1_new += violation;
1997 
1998 				if(fabs(z) < 1.0e-12)
1999 					continue;
2000 				z = min(max(z,-10.0),10.0);
2001 
2002 				wpd[j] += z;
2003 
2004 				x = prob_col->x[j];
2005 				sparse_operator::axpy(z, x, xTd);
2006 			}
2007 
2008 			iter++;
2009 
2010 			if(QP_Gnorm1_new <= inner_eps*Gnorm1_init)
2011 			{
2012 				//inner stopping
2013 				if(QP_active_size == active_size)
2014 					break;
2015 				//active set reactivation
2016 				else
2017 				{
2018 					QP_active_size = active_size;
2019 					QP_Gmax_old = INF;
2020 					continue;
2021 				}
2022 			}
2023 
2024 			QP_Gmax_old = QP_Gmax_new;
2025 		}
2026 
2027 		if(iter >= max_iter)
2028 			info("WARNING: reaching max number of inner iterations\n");
2029 
2030 		delta = 0;
2031 		w_norm_new = 0;
2032 		for(j=0; j<w_size; j++)
2033 		{
2034 			delta += Grad[j]*(wpd[j]-w[j]);
2035 			if(wpd[j] != 0)
2036 				w_norm_new += fabs(wpd[j]);
2037 		}
2038 		if (regularize_bias == 0)
2039 			w_norm_new -= fabs(wpd[w_size-1]);
2040 		delta += (w_norm_new-w_norm);
2041 
2042 		negsum_xTd = 0;
2043 		for(int i=0; i<l; i++)
2044 			if(y[i] == -1)
2045 				negsum_xTd += C[GETI(i)]*xTd[i];
2046 
2047 		int num_linesearch;
2048 		for(num_linesearch=0; num_linesearch < max_num_linesearch; num_linesearch++)
2049 		{
2050 			cond = w_norm_new - w_norm + negsum_xTd - sigma*delta;
2051 
2052 			for(int i=0; i<l; i++)
2053 			{
2054 				double exp_xTd = exp(xTd[i]);
2055 				exp_wTx_new[i] = exp_wTx[i]*exp_xTd;
2056 				cond += C[GETI(i)]*log((1+exp_wTx_new[i])/(exp_xTd+exp_wTx_new[i]));
2057 			}
2058 
2059 			if(cond <= 0)
2060 			{
2061 				w_norm = w_norm_new;
2062 				for(j=0; j<w_size; j++)
2063 					w[j] = wpd[j];
2064 				for(int i=0; i<l; i++)
2065 				{
2066 					exp_wTx[i] = exp_wTx_new[i];
2067 					double tau_tmp = 1/(1+exp_wTx[i]);
2068 					tau[i] = C[GETI(i)]*tau_tmp;
2069 					D[i] = C[GETI(i)]*exp_wTx[i]*tau_tmp*tau_tmp;
2070 				}
2071 				break;
2072 			}
2073 			else
2074 			{
2075 				w_norm_new = 0;
2076 				for(j=0; j<w_size; j++)
2077 				{
2078 					wpd[j] = (w[j]+wpd[j])*0.5;
2079 					if(wpd[j] != 0)
2080 						w_norm_new += fabs(wpd[j]);
2081 				}
2082 				if (regularize_bias == 0)
2083 					w_norm_new -= fabs(wpd[w_size-1]);
2084 				delta *= 0.5;
2085 				negsum_xTd *= 0.5;
2086 				for(int i=0; i<l; i++)
2087 					xTd[i] *= 0.5;
2088 			}
2089 		}
2090 
2091 		// Recompute some info due to too many line search steps
2092 		if(num_linesearch >= max_num_linesearch)
2093 		{
2094 			for(int i=0; i<l; i++)
2095 				exp_wTx[i] = 0;
2096 
2097 			for(int i=0; i<w_size; i++)
2098 			{
2099 				if(w[i]==0) continue;
2100 				x = prob_col->x[i];
2101 				sparse_operator::axpy(w[i], x, exp_wTx);
2102 			}
2103 
2104 			for(int i=0; i<l; i++)
2105 				exp_wTx[i] = exp(exp_wTx[i]);
2106 		}
2107 
2108 		if(iter == 1)
2109 			inner_eps *= 0.25;
2110 
2111 		newton_iter++;
2112 		Gmax_old = Gmax_new;
2113 
2114 		info("iter %3d  #CD cycles %d\n", newton_iter, iter);
2115 	}
2116 
2117 	info("=========================\n");
2118 	info("optimization finished, #iter = %d\n", newton_iter);
2119 	if(newton_iter >= max_newton_iter)
2120 		info("WARNING: reaching max number of iterations\n");
2121 
2122 	// calculate objective value
2123 
2124 	double v = 0;
2125 	int nnz = 0;
2126 	for(j=0; j<w_size; j++)
2127 		if(w[j] != 0)
2128 		{
2129 			v += fabs(w[j]);
2130 			nnz++;
2131 		}
2132 	if (regularize_bias == 0)
2133 		v -= fabs(w[w_size-1]);
2134 	for(j=0; j<l; j++)
2135 		if(y[j] == 1)
2136 			v += C[GETI(j)]*log(1+1/exp_wTx[j]);
2137 		else
2138 			v += C[GETI(j)]*log(1+exp_wTx[j]);
2139 
2140 	info("Objective value = %lf\n", v);
2141 	info("#nonzeros/#features = %d/%d\n", nnz, w_size);
2142 
2143 	delete [] index;
2144 	delete [] y;
2145 	delete [] Hdiag;
2146 	delete [] Grad;
2147 	delete [] wpd;
2148 	delete [] xjneg_sum;
2149 	delete [] xTd;
2150 	delete [] exp_wTx;
2151 	delete [] exp_wTx_new;
2152 	delete [] tau;
2153 	delete [] D;
2154 
2155 	return newton_iter;
2156 }
2157 
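// A fixed-capacity binary heap of feature_node ordered by value (MIN keeps the
// smallest value on top, MAX the largest). solve_oneclass_svm below uses one of
// each, with index = instance id and value = -G[i], to keep the most violating
// candidates on each side of the working-set selection.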
2158 struct heap {
2159 	enum HEAP_TYPE { MIN, MAX };
2160 	int _size;
2161 	HEAP_TYPE _type;
2162 	feature_node* a;
2163 
2164 	heap(int max_size, HEAP_TYPE type)
2165 	{
2166 		_size = 0;
2167 		a = new feature_node[max_size];
2168 		_type = type;
2169 	}
2170 	~heap()
2171 	{
2172 		delete [] a;
2173 	}
2174 	bool cmp(const feature_node& left, const feature_node& right)
2175 	{
2176 		if(_type == MIN)
2177 			return left.value > right.value;
2178 		else
2179 			return left.value < right.value;
2180 	}
2181 	int size()
2182 	{
2183 		return _size;
2184 	}
2185 	void push(feature_node node)
2186 	{
2187 		a[_size] = node;
2188 		_size++;
2189 		int i = _size-1;
2190 		while(i)
2191 		{
2192 			int p = (i-1)/2;
2193 			if(cmp(a[p], a[i]))
2194 			{
2195 				swap(a[i], a[p]);
2196 				i = p;
2197 			}
2198 			else
2199 				break;
2200 		}
2201 	}
2202 	void pop()
2203 	{
2204 		_size--;
2205 		a[0] = a[_size];
2206 		int i = 0;
2207 		while(i*2+1 < _size)
2208 		{
2209 			int l = i*2+1;
2210 			int r = i*2+2;
2211 			if(r < _size && cmp(a[l], a[r]))
2212 				l = r;
2213 			if(cmp(a[i], a[l]))
2214 			{
2215 				swap(a[i], a[l]);
2216 				i = l;
2217 			}
2218 			else
2219 				break;
2220 		}
2221 	}
2222 	feature_node top()
2223 	{
2224 		return a[0];
2225 	}
2226 };
2227 
2228 // A two-level coordinate descent algorithm for
2229 // a scaled one-class SVM dual problem
2230 //
2231 //  min_\alpha  0.5(\alpha^T Q \alpha),
2232 //    s.t.      0 <= \alpha_i <= 1 and
2233 //              e^T \alpha = \nu l
2234 //
2235 //  where Qij = xi^T xj
2236 //
2237 // Given:
2238 // x, nu
2239 // eps is the stopping tolerance
2240 //
2241 // solution will be put in w and rho
2242 //
2243 // this function returns the number of iterations
2244 //
2245 // See Algorithm 7 in supplementary materials of Chou et al., SDM 2020.
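//
// Outline of the implementation below (descriptive note, not from the paper):
// - the outer loop computes G_i = w^T x_i over the active set, checks the
//   stopping condition max{-G_i : alpha_i < 1} - min{-G_i : alpha_i > 0} < eps,
//   and shrinks variables that sit at a bound and cannot improve;
// - two bounded heaps then pick up to active_size/10 most violating indices
//   i (alpha_i < 1 with the largest -G_i) and j (alpha_j > 0 with the
//   smallest -G_j);
// - each selected pair gets the two-variable update
//       delta = (G_i - G_j) / (Q_ii + Q_jj - 2 Q_ij),
//   alpha_i -= delta, alpha_j += delta, clipped so both stay in [0,1] while
//   alpha_i + alpha_j is preserved, followed by a rank-two update of w.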
2246 
2247 static int solve_oneclass_svm(const problem *prob, const parameter *param, double *w, double *rho)
2248 {
2249 	int l = prob->l;
2250 	int w_size = prob->n;
2251 	double eps = param->eps;
2252 	double nu = param->nu;
2253 	int i, j, s, iter = 0;
2254 	double Gi, Gj;
2255 	double Qij, quad_coef, delta, sum;
2256 	double old_alpha_i;
2257 	double *QD = new double[l];
2258 	double *G = new double[l];
2259 	int *index = new int[l];
2260 	double *alpha = new double[l];
2261 	int max_inner_iter;
2262 	int max_iter = 1000;
2263 	int active_size = l;
2264 
2265 	double negGmax;                 // max { -grad(f)_i | alpha_i < 1 }
2266 	double negGmin;                 // min { -grad(f)_i | alpha_i > 0 }
2267 
2268 	int *most_violating_i = new int[l];
2269 	int *most_violating_j = new int[l];
2270 
2271 	int n = (int)(nu*l);            // # of alpha's at upper bound
2272 	for(i=0; i<n; i++)
2273 		alpha[i] = 1;
2274 	if (n<l)
2275 		alpha[i] = nu*l-n;
2276 	for(i=n+1; i<l; i++)
2277 		alpha[i] = 0;
2278 
2279 	for(i=0; i<w_size; i++)
2280 		w[i] = 0;
2281 	for(i=0; i<l; i++)
2282 	{
2283 		feature_node * const xi = prob->x[i];
2284 		QD[i] = sparse_operator::nrm2_sq(xi);
2285 		sparse_operator::axpy(alpha[i], xi, w);
2286 
2287 		index[i] = i;
2288 	}
2289 
2290 	while (iter < max_iter)
2291 	{
2292 		negGmax = -INF;
2293 		negGmin = INF;
2294 
2295 		for (s=0; s<active_size; s++)
2296 		{
2297 			i = index[s];
2298 			feature_node * const xi = prob->x[i];
2299 			G[i] = sparse_operator::dot(w, xi);
2300 			if (alpha[i] < 1)
2301 				negGmax = max(negGmax, -G[i]);
2302 			if (alpha[i] > 0)
2303 				negGmin = min(negGmin, -G[i]);
2304 		}
2305 
2306 		if (negGmax - negGmin < eps)
2307 		{
2308 			if (active_size == l)
2309 				break;
2310 			else
2311 			{
2312 				active_size = l;
2313 				info("*");
2314 				continue;
2315 			}
2316 		}
2317 
2318 		for(s=0; s<active_size; s++)
2319 		{
2320 			i = index[s];
2321 			if ((alpha[i] == 1 && -G[i] > negGmax) ||
2322 			    (alpha[i] == 0 && -G[i] < negGmin))
2323 			{
2324 				active_size--;
2325 				swap(index[s], index[active_size]);
2326 				s--;
2327 			}
2328 		}
2329 
2330 		max_inner_iter = max(active_size/10, 1);
2331 		struct heap min_heap = heap(max_inner_iter, heap::MIN);
2332 		struct heap max_heap = heap(max_inner_iter, heap::MAX);
2333 		struct feature_node node;
2334 		for(s=0; s<active_size; s++)
2335 		{
2336 			i = index[s];
2337 			node.index = i;
2338 			node.value = -G[i];
2339 
2340 			if (alpha[i] < 1)
2341 			{
2342 				if (min_heap.size() < max_inner_iter)
2343 					min_heap.push(node);
2344 				else if (min_heap.top().value < node.value)
2345 				{
2346 					min_heap.pop();
2347 					min_heap.push(node);
2348 				}
2349 			}
2350 
2351 			if (alpha[i] > 0)
2352 			{
2353 				if (max_heap.size() < max_inner_iter)
2354 					max_heap.push(node);
2355 				else if (max_heap.top().value > node.value)
2356 				{
2357 					max_heap.pop();
2358 					max_heap.push(node);
2359 				}
2360 			}
2361 		}
2362 		max_inner_iter = min(min_heap.size(), max_heap.size());
2363 		while (max_heap.size() > max_inner_iter)
2364 			max_heap.pop();
2365 		while (min_heap.size() > max_inner_iter)
2366 			min_heap.pop();
2367 
2368 		for (s=max_inner_iter-1; s>=0; s--)
2369 		{
2370 			most_violating_i[s] = min_heap.top().index;
2371 			most_violating_j[s] = max_heap.top().index;
2372 			min_heap.pop();
2373 			max_heap.pop();
2374 		}
2375 
2376 		for (s=0; s<max_inner_iter; s++)
2377 		{
2378 			i = most_violating_i[s];
2379 			j = most_violating_j[s];
2380 
2381 			if ((alpha[i] == 0 && alpha[j] == 0) ||
2382 			    (alpha[i] == 1 && alpha[j] == 1))
2383 				continue;
2384 
2385 			feature_node const * xi = prob->x[i];
2386 			feature_node const * xj = prob->x[j];
2387 
2388 			Gi = sparse_operator::dot(w, xi);
2389 			Gj = sparse_operator::dot(w, xj);
2390 
2391 			int violating_pair = 0;
2392 			if (alpha[i] < 1 && alpha[j] > 0 && -Gj + 1e-12 < -Gi)
2393 				violating_pair = 1;
2394 			else
2395 				if (alpha[i] > 0 && alpha[j] < 1 && -Gi + 1e-12 < -Gj)
2396 					violating_pair = 1;
2397 			if (violating_pair == 0)
2398 				continue;
2399 
2400 			Qij = sparse_operator::sparse_dot(xi, xj);
2401 			quad_coef = QD[i] + QD[j] - 2*Qij;
2402 			if(quad_coef <= 0)
2403 				quad_coef = 1e-12;
2404 			delta = (Gi - Gj) / quad_coef;
2405 			old_alpha_i = alpha[i];
2406 			sum = alpha[i] + alpha[j];
2407 			alpha[i] = alpha[i] - delta;
2408 			alpha[j] = alpha[j] + delta;
2409 			if (sum > 1)
2410 			{
2411 				if (alpha[i] > 1)
2412 				{
2413 					alpha[i] = 1;
2414 					alpha[j] = sum - 1;
2415 				}
2416 			}
2417 			else
2418 			{
2419 				if (alpha[j] < 0)
2420 				{
2421 					alpha[j] = 0;
2422 					alpha[i] = sum;
2423 				}
2424 			}
2425 			if (sum > 1)
2426 			{
2427 				if (alpha[j] > 1)
2428 				{
2429 					alpha[j] = 1;
2430 					alpha[i] = sum - 1;
2431 				}
2432 			}
2433 			else
2434 			{
2435 				if (alpha[i] < 0)
2436 				{
2437 					alpha[i] = 0;
2438 					alpha[j] = sum;
2439 				}
2440 			}
2441 			delta = alpha[i] - old_alpha_i;
2442 			sparse_operator::axpy(delta, xi, w);
2443 			sparse_operator::axpy(-delta, xj, w);
2444 		}
2445 		iter++;
2446 		if (iter % 10 == 0)
2447 			info(".");
2448 	}
2449 	info("\noptimization finished, #iter = %d\n",iter);
2450 	if (iter >= max_iter)
2451 		info("\nWARNING: reaching max number of iterations\n\n");
2452 
2453 	// calculate objective value (v/2 = 0.5 w^T w = 0.5 alpha^T Q alpha for the scaled one-class SVM dual)
2454 	double v = 0;
2455 	for(i=0; i<w_size; i++)
2456 		v += w[i]*w[i];
2457 	int nSV = 0;
2458 	for(i=0; i<l; i++)
2459 	{
2460 		if (alpha[i] > 0)
2461 			++nSV;
2462 	}
2463 	info("Objective value = %lf\n", v/2);
2464 	info("nSV = %d\n", nSV);
2465 
2466 	// calculate rho
2467 	double nr_free = 0;
2468 	double ub = INF, lb = -INF, sum_free = 0;
2469 	for(i=0; i<l; i++)
2470 	{
2471 		double G = sparse_operator::dot(w, prob->x[i]);
2472 		if (alpha[i] == 1)
2473 			lb = max(lb, G);
2474 		else if (alpha[i] == 0)
2475 			ub = min(ub, G);
2476 		else
2477 		{
2478 			++nr_free;
2479 			sum_free += G;
2480 		}
2481 	}
2482 
2483 	if (nr_free > 0)
2484 		*rho = sum_free/nr_free;
2485 	else
2486 		*rho = (ub + lb)/2;
2487 
2488 	info("rho = %lf\n", *rho);
2489 
2490 	delete [] QD;
2491 	delete [] G;
2492 	delete [] index;
2493 	delete [] alpha;
2494 	delete [] most_violating_i;
2495 	delete [] most_violating_j;
2496 
2497 	return iter;
2498 }
2499 
2500 // transpose matrix X from row format to column format
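// (Format note: each output column in prob_col is terminated by a sentinel
// node with index == -1, which is why x_space holds nnz+n nodes and the
// col_ptr accumulation below adds 1 per column; the stored indices are the
// 1-based row numbers of the original data.)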
2501 static void transpose(const problem *prob, feature_node **x_space_ret, problem *prob_col)
2502 {
2503 	int i;
2504 	int l = prob->l;
2505 	int n = prob->n;
2506 	size_t nnz = 0;
2507 	size_t *col_ptr = new size_t [n+1];
2508 	feature_node *x_space;
2509 	prob_col->l = l;
2510 	prob_col->n = n;
2511 	prob_col->y = new double[l];
2512 	prob_col->x = new feature_node*[n];
2513 
2514 	for(i=0; i<l; i++)
2515 		prob_col->y[i] = prob->y[i];
2516 
2517 	for(i=0; i<n+1; i++)
2518 		col_ptr[i] = 0;
2519 	for(i=0; i<l; i++)
2520 	{
2521 		feature_node *x = prob->x[i];
2522 		while(x->index != -1)
2523 		{
2524 			nnz++;
2525 			col_ptr[x->index]++;
2526 			x++;
2527 		}
2528 	}
2529 	for(i=1; i<n+1; i++)
2530 		col_ptr[i] += col_ptr[i-1] + 1;
2531 
2532 	x_space = new feature_node[nnz+n];
2533 	for(i=0; i<n; i++)
2534 		prob_col->x[i] = &x_space[col_ptr[i]];
2535 
2536 	for(i=0; i<l; i++)
2537 	{
2538 		feature_node *x = prob->x[i];
2539 		while(x->index != -1)
2540 		{
2541 			int ind = x->index-1;
2542 			x_space[col_ptr[ind]].index = i+1; // starts from 1
2543 			x_space[col_ptr[ind]].value = x->value;
2544 			col_ptr[ind]++;
2545 			x++;
2546 		}
2547 	}
2548 	for(i=0; i<n; i++)
2549 		x_space[col_ptr[i]].index = -1;
2550 
2551 	*x_space_ret = x_space;
2552 
2553 	delete [] col_ptr;
2554 }
2555 
2556 // label: label values, start: starting index of each class, count: #data in each class, perm: indices to the original data
2557 // perm, length l, must be allocated before calling this subroutine
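// A small illustration (not from the original source): for y = {2, 1, 2, 2, 1}
// this produces nr_class = 2, label = {2, 1}, count = {3, 2}, start = {0, 3},
// and perm = {0, 2, 3, 1, 4}, i.e., the indices of class 2 first, then class 1.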
2558 static void group_classes(const problem *prob, int *nr_class_ret, int **label_ret, int **start_ret, int **count_ret, int *perm)
2559 {
2560 	int l = prob->l;
2561 	int max_nr_class = 16;
2562 	int nr_class = 0;
2563 	int *label = Malloc(int,max_nr_class);
2564 	int *count = Malloc(int,max_nr_class);
2565 	int *data_label = Malloc(int,l);
2566 	int i;
2567 
2568 	for(i=0;i<l;i++)
2569 	{
2570 		int this_label = (int)prob->y[i];
2571 		int j;
2572 		for(j=0;j<nr_class;j++)
2573 		{
2574 			if(this_label == label[j])
2575 			{
2576 				++count[j];
2577 				break;
2578 			}
2579 		}
2580 		data_label[i] = j;
2581 		if(j == nr_class)
2582 		{
2583 			if(nr_class == max_nr_class)
2584 			{
2585 				max_nr_class *= 2;
2586 				label = (int *)realloc(label,max_nr_class*sizeof(int));
2587 				count = (int *)realloc(count,max_nr_class*sizeof(int));
2588 			}
2589 			label[nr_class] = this_label;
2590 			count[nr_class] = 1;
2591 			++nr_class;
2592 		}
2593 	}
2594 
2595 	//
2596 	// Labels are ordered by their first occurrence in the training set.
2597 	// However, for a two-class set with -1/+1 labels where -1 appears first,
2598 	// we swap the labels so that internally the binary SVM treats the +1 instances as the positive class.
2599 	//
2600 	if (nr_class == 2 && label[0] == -1 && label[1] == 1)
2601 	{
2602 		swap(label[0],label[1]);
2603 		swap(count[0],count[1]);
2604 		for(i=0;i<l;i++)
2605 		{
2606 			if(data_label[i] == 0)
2607 				data_label[i] = 1;
2608 			else
2609 				data_label[i] = 0;
2610 		}
2611 	}
2612 
2613 	int *start = Malloc(int,nr_class);
2614 	start[0] = 0;
2615 	for(i=1;i<nr_class;i++)
2616 		start[i] = start[i-1]+count[i-1];
2617 	for(i=0;i<l;i++)
2618 	{
2619 		perm[start[data_label[i]]] = i;
2620 		++start[data_label[i]];
2621 	}
2622 	start[0] = 0;
2623 	for(i=1;i<nr_class;i++)
2624 		start[i] = start[i-1]+count[i-1];
2625 
2626 	*nr_class_ret = nr_class;
2627 	*label_ret = label;
2628 	*start_ret = start;
2629 	*count_ret = count;
2630 	free(data_label);
2631 }
2632 
2633 static void train_one(const problem *prob, const parameter *param, double *w, double Cp, double Cn)
2634 {
2635 	int solver_type = param->solver_type;
2636 	int dual_solver_max_iter = 300;
2637 	int iter;
2638 
2639 	bool is_regression = (solver_type==L2R_L2LOSS_SVR ||
2640 				solver_type==L2R_L1LOSS_SVR_DUAL ||
2641 				solver_type==L2R_L2LOSS_SVR_DUAL);
2642 
2643 	// Some solvers take Cp,Cn directly rather than the per-instance C array; extending them to use C is possible but not planned for now
2644 	double *C = new double[prob->l];
2645 	double primal_solver_tol = param->eps;
2646 	if(is_regression)
2647 	{
2648 		for(int i=0;i<prob->l;i++)
2649 			C[i] = param->C;
2650 	}
2651 	else
2652 	{
2653 		int pos = 0;
2654 		for(int i=0;i<prob->l;i++)
2655 		{
2656 			if(prob->y[i] > 0)
2657 			{
2658 				pos++;
2659 				C[i] = Cp;
2660 			}
2661 			else
2662 				C[i] = Cn;
2663 		}
2664 		int neg = prob->l - pos;
2665 		primal_solver_tol = param->eps*max(min(pos,neg), 1)/prob->l;
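		// (This scaling appears to match the documented primal stopping
		// condition |f'(w)| <= eps * min(#pos,#neg)/l * |f'(w0)| for classification.)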
2666 	}
2667 
2668 	switch(solver_type)
2669 	{
2670 		case L2R_LR:
2671 		{
2672 			l2r_lr_fun fun_obj(prob, param, C);
2673 			NEWTON newton_obj(&fun_obj, primal_solver_tol);
2674 			newton_obj.set_print_string(liblinear_print_string);
2675 			newton_obj.newton(w);
2676 			break;
2677 		}
2678 		case L2R_L2LOSS_SVC:
2679 		{
2680 			l2r_l2_svc_fun fun_obj(prob, param, C);
2681 			NEWTON newton_obj(&fun_obj, primal_solver_tol);
2682 			newton_obj.set_print_string(liblinear_print_string);
2683 			newton_obj.newton(w);
2684 			break;
2685 		}
2686 		case L2R_L2LOSS_SVC_DUAL:
2687 		{
2688 			iter = solve_l2r_l1l2_svc(prob, param, w, Cp, Cn, dual_solver_max_iter);
2689 			if(iter >= dual_solver_max_iter)
2690 			{
2691 				info("\nWARNING: reaching max number of iterations\nSwitching to use -s 2\n\n");
2692 				// primal_solver_tol obtained from eps for dual may be too loose
2693 				primal_solver_tol *= 0.1;
2694 				l2r_l2_svc_fun fun_obj(prob, param, C);
2695 				NEWTON newton_obj(&fun_obj, primal_solver_tol);
2696 				newton_obj.set_print_string(liblinear_print_string);
2697 				newton_obj.newton(w);
2698 			}
2699 			break;
2700 		}
2701 		case L2R_L1LOSS_SVC_DUAL:
2702 		{
2703 			iter = solve_l2r_l1l2_svc(prob, param, w, Cp, Cn, dual_solver_max_iter);
2704 			if(iter >= dual_solver_max_iter)
2705 				info("\nWARNING: reaching max number of iterations\nUsing -s 2 may be faster (also see FAQ)\n\n");
2706 			break;
2707 		}
2708 		case L1R_L2LOSS_SVC:
2709 		{
2710 			problem prob_col;
2711 			feature_node *x_space = NULL;
2712 			transpose(prob, &x_space ,&prob_col);
2713 			solve_l1r_l2_svc(&prob_col, param, w, Cp, Cn, primal_solver_tol);
2714 			delete [] prob_col.y;
2715 			delete [] prob_col.x;
2716 			delete [] x_space;
2717 			break;
2718 		}
2719 		case L1R_LR:
2720 		{
2721 			problem prob_col;
2722 			feature_node *x_space = NULL;
2723 			transpose(prob, &x_space ,&prob_col);
2724 			solve_l1r_lr(&prob_col, param, w, Cp, Cn, primal_solver_tol);
2725 			delete [] prob_col.y;
2726 			delete [] prob_col.x;
2727 			delete [] x_space;
2728 			break;
2729 		}
2730 		case L2R_LR_DUAL:
2731 		{
2732 			iter = solve_l2r_lr_dual(prob, param, w, Cp, Cn, dual_solver_max_iter);
2733 			if(iter >= dual_solver_max_iter)
2734 			{
2735 				info("\nWARNING: reaching max number of iterations\nSwitching to use -s 0\n\n");
2736 				// primal_solver_tol obtained from eps for dual may be too loose
2737 				primal_solver_tol *= 0.1;
2738 				l2r_lr_fun fun_obj(prob, param, C);
2739 				NEWTON newton_obj(&fun_obj, primal_solver_tol);
2740 				newton_obj.set_print_string(liblinear_print_string);
2741 				newton_obj.newton(w);
2742 			}
2743 			break;
2744 		}
2745 		case L2R_L2LOSS_SVR:
2746 		{
2747 			l2r_l2_svr_fun fun_obj(prob, param, C);
2748 			NEWTON newton_obj(&fun_obj, primal_solver_tol);
2749 			newton_obj.set_print_string(liblinear_print_string);
2750 			newton_obj.newton(w);
2751 			break;
2752 		}
2753 		case L2R_L1LOSS_SVR_DUAL:
2754 		{
2755 			iter = solve_l2r_l1l2_svr(prob, param, w, dual_solver_max_iter);
2756 			if(iter >= dual_solver_max_iter)
2757 				info("\nWARNING: reaching max number of iterations\nUsing -s 11 may be faster (also see FAQ)\n\n");
2758 
2759 			break;
2760 		}
2761 		case L2R_L2LOSS_SVR_DUAL:
2762 		{
2763 			iter = solve_l2r_l1l2_svr(prob, param, w, dual_solver_max_iter);
2764 			if(iter >= dual_solver_max_iter)
2765 			{
2766 				info("\nWARNING: reaching max number of iterations\nSwitching to use -s 11\n\n");
2767 				// primal_solver_tol obtained from eps for dual may be too loose
2768 				primal_solver_tol *= 0.001;
2769 				l2r_l2_svr_fun fun_obj(prob, param, C);
2770 				NEWTON newton_obj(&fun_obj, primal_solver_tol);
2771 				newton_obj.set_print_string(liblinear_print_string);
2772 				newton_obj.newton(w);
2773 			}
2774 			break;
2775 		}
2776 		default:
2777 			fprintf(stderr, "ERROR: unknown solver_type\n");
2778 			break;
2779 	}
2780 
2781 	delete[] C;
2782 }
2783 
2784 // Calculate the initial C for parameter selection
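// (Heuristic note: min_C below is a rough threshold under which the solution is
// dominated by the regularization term and cross-validation results change very
// little, so the C search can start from the largest power of two not exceeding it.)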
2785 static double calc_start_C(const problem *prob, const parameter *param)
2786 {
2787 	int i;
2788 	double xTx, max_xTx;
2789 	max_xTx = 0;
2790 	for(i=0; i<prob->l; i++)
2791 	{
2792 		xTx = 0;
2793 		feature_node *xi=prob->x[i];
2794 		while(xi->index != -1)
2795 		{
2796 			double val = xi->value;
2797 			xTx += val*val;
2798 			xi++;
2799 		}
2800 		if(xTx > max_xTx)
2801 			max_xTx = xTx;
2802 	}
2803 
2804 	double min_C = 1.0;
2805 	if(param->solver_type == L2R_LR)
2806 		min_C = 1.0 / (prob->l * max_xTx);
2807 	else if(param->solver_type == L2R_L2LOSS_SVC)
2808 		min_C = 1.0 / (2 * prob->l * max_xTx);
2809 	else if(param->solver_type == L2R_L2LOSS_SVR)
2810 	{
2811 		double sum_y, loss, y_abs;
2812 		double delta2 = 0.1;
2813 		sum_y = 0, loss = 0;
2814 		for(i=0; i<prob->l; i++)
2815 		{
2816 			y_abs = fabs(prob->y[i]);
2817 			sum_y += y_abs;
2818 			loss += max(y_abs - param->p, 0.0) * max(y_abs - param->p, 0.0);
2819 		}
2820 		if(loss > 0)
2821 			min_C = delta2 * delta2 * loss / (8 * sum_y * sum_y * max_xTx);
2822 		else
2823 			min_C = INF;
2824 	}
2825 
2826 	return pow( 2, floor(log(min_C) / log(2.0)) );
2827 }
2828 
2829 static double calc_max_p(const problem *prob)
2830 {
2831 	int i;
2832 	double max_p = 0.0;
2833 	for(i = 0; i < prob->l; i++)
2834 		max_p = max(max_p, fabs(prob->y[i]));
2835 
2836 	return max_p;
2837 }
2838 
2839 static void find_parameter_C(const problem *prob, parameter *param_tmp, double start_C, double max_C, double *best_C, double *best_score, const int *fold_start, const int *perm, const problem *subprob, int nr_fold)
2840 {
2841 	// variables for CV
2842 	int i;
2843 	double *target = Malloc(double, prob->l);
2844 
2845 	// variables for warm start
2846 	double ratio = 2;
2847 	double **prev_w = Malloc(double*, nr_fold);
2848 	for(i = 0; i < nr_fold; i++)
2849 		prev_w[i] = NULL;
2850 	int num_unchanged_w = 0;
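	// (Warm-start note: prev_w[i] stores fold i's solution and is fed back via
	// param_tmp->init_sol when C is doubled; num_unchanged_w counts consecutive
	// C values for which no fold's w moved by more than 1e-15 in norm, and the
	// search below stops early once this count reaches 5.)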
2851 	void (*default_print_string) (const char *) = liblinear_print_string;
2852 
2853 	if(param_tmp->solver_type == L2R_LR || param_tmp->solver_type == L2R_L2LOSS_SVC)
2854 		*best_score = 0.0;
2855 	else if(param_tmp->solver_type == L2R_L2LOSS_SVR)
2856 		*best_score = INF;
2857 	*best_C = start_C;
2858 
2859 	param_tmp->C = start_C;
2860 	while(param_tmp->C <= max_C)
2861 	{
2862 		// Output is disabled while running CV at a particular C
2863 		set_print_string_function(&print_null);
2864 
2865 		for(i=0; i<nr_fold; i++)
2866 		{
2867 			int j;
2868 			int begin = fold_start[i];
2869 			int end = fold_start[i+1];
2870 
2871 			param_tmp->init_sol = prev_w[i];
2872 			struct model *submodel = train(&subprob[i],param_tmp);
2873 
2874 			int total_w_size;
2875 			if(submodel->nr_class == 2)
2876 				total_w_size = subprob[i].n;
2877 			else
2878 				total_w_size = subprob[i].n * submodel->nr_class;
2879 
2880 			if(prev_w[i] == NULL)
2881 			{
2882 				prev_w[i] = Malloc(double, total_w_size);
2883 				for(j=0; j<total_w_size; j++)
2884 					prev_w[i][j] = submodel->w[j];
2885 			}
2886 			else if(num_unchanged_w >= 0)
2887 			{
2888 				double norm_w_diff = 0;
2889 				for(j=0; j<total_w_size; j++)
2890 				{
2891 					norm_w_diff += (submodel->w[j] - prev_w[i][j])*(submodel->w[j] - prev_w[i][j]);
2892 					prev_w[i][j] = submodel->w[j];
2893 				}
2894 				norm_w_diff = sqrt(norm_w_diff);
2895 
2896 				if(norm_w_diff > 1e-15)
2897 					num_unchanged_w = -1;
2898 			}
2899 			else
2900 			{
2901 				for(j=0; j<total_w_size; j++)
2902 					prev_w[i][j] = submodel->w[j];
2903 			}
2904 
2905 			for(j=begin; j<end; j++)
2906 				target[perm[j]] = predict(submodel,prob->x[perm[j]]);
2907 
2908 			free_and_destroy_model(&submodel);
2909 		}
2910 		set_print_string_function(default_print_string);
2911 
2912 		if(param_tmp->solver_type == L2R_LR || param_tmp->solver_type == L2R_L2LOSS_SVC)
2913 		{
2914 			int total_correct = 0;
2915 			for(i=0; i<prob->l; i++)
2916 				if(target[i] == prob->y[i])
2917 					++total_correct;
2918 			double current_rate = (double)total_correct/prob->l;
2919 			if(current_rate > *best_score)
2920 			{
2921 				*best_C = param_tmp->C;
2922 				*best_score = current_rate;
2923 			}
2924 
2925 			info("log2c=%7.2f\trate=%g\n",log(param_tmp->C)/log(2.0),100.0*current_rate);
2926 		}
2927 		else if(param_tmp->solver_type == L2R_L2LOSS_SVR)
2928 		{
2929 			double total_error = 0.0;
2930 			for(i=0; i<prob->l; i++)
2931 			{
2932 				double y = prob->y[i];
2933 				double v = target[i];
2934 				total_error += (v-y)*(v-y);
2935 			}
2936 			double current_error = total_error/prob->l;
2937 			if(current_error < *best_score)
2938 			{
2939 				*best_C = param_tmp->C;
2940 				*best_score = current_error;
2941 			}
2942 
2943 			info("log2c=%7.2f\tp=%7.2f\tMean squared error=%g\n",log(param_tmp->C)/log(2.0),param_tmp->p,current_error);
2944 		}
2945 
2946 		num_unchanged_w++;
2947 		if(num_unchanged_w == 5)
2948 			break;
2949 		param_tmp->C = param_tmp->C*ratio;
2950 	}
2951 
2952 	if(param_tmp->C > max_C)
2953 		info("WARNING: maximum C reached.\n");
2954 	free(target);
2955 	for(i=0; i<nr_fold; i++)
2956 		free(prev_w[i]);
2957 	free(prev_w);
2958 }
2959 
2960 
2961 //
2962 // Interface functions
2963 //
2964 model* train(const problem *prob, const parameter *param)
2965 {
2966 	int i,j;
2967 	int l = prob->l;
2968 	int n = prob->n;
2969 	int w_size = prob->n;
2970 	model *model_ = Malloc(model,1);
2971 
2972 	if(prob->bias>=0)
2973 		model_->nr_feature=n-1;
2974 	else
2975 		model_->nr_feature=n;
2976 	model_->param = *param;
2977 	model_->bias = prob->bias;
2978 
2979 	if(check_regression_model(model_))
2980 	{
2981 		model_->w = Malloc(double, w_size);
2982 
2983 		if(param->init_sol != NULL)
2984 			for(i=0;i<w_size;i++)
2985 				model_->w[i] = param->init_sol[i];
2986 		else
2987 			for(i=0;i<w_size;i++)
2988 				model_->w[i] = 0;
2989 
2990 		model_->nr_class = 2;
2991 		model_->label = NULL;
2992 		train_one(prob, param, model_->w, 0, 0);
2993 	}
2994 	else if(check_oneclass_model(model_))
2995 	{
2996 		model_->w = Malloc(double, w_size);
2997 		model_->nr_class = 2;
2998 		model_->label = NULL;
2999 		solve_oneclass_svm(prob, param, model_->w, &(model_->rho));
3000 	}
3001 	else
3002 	{
3003 		int nr_class;
3004 		int *label = NULL;
3005 		int *start = NULL;
3006 		int *count = NULL;
3007 		int *perm = Malloc(int,l);
3008 
3009 		// group training data of the same class
3010 		group_classes(prob,&nr_class,&label,&start,&count,perm);
3011 
3012 		model_->nr_class=nr_class;
3013 		model_->label = Malloc(int,nr_class);
3014 		for(i=0;i<nr_class;i++)
3015 			model_->label[i] = label[i];
3016 
3017 		// calculate weighted C
3018 		double *weighted_C = Malloc(double, nr_class);
3019 		for(i=0;i<nr_class;i++)
3020 			weighted_C[i] = param->C;
3021 		for(i=0;i<param->nr_weight;i++)
3022 		{
3023 			for(j=0;j<nr_class;j++)
3024 				if(param->weight_label[i] == label[j])
3025 					break;
3026 			if(j == nr_class)
3027 				fprintf(stderr,"WARNING: class label %d specified in weight is not found\n", param->weight_label[i]);
3028 			else
3029 				weighted_C[j] *= param->weight[i];
3030 		}
3031 
3032 		// constructing the subproblem
3033 		feature_node **x = Malloc(feature_node *,l);
3034 		for(i=0;i<l;i++)
3035 			x[i] = prob->x[perm[i]];
3036 
3037 		int k;
3038 		problem sub_prob;
3039 		sub_prob.l = l;
3040 		sub_prob.n = n;
3041 		sub_prob.x = Malloc(feature_node *,sub_prob.l);
3042 		sub_prob.y = Malloc(double,sub_prob.l);
3043 
3044 		for(k=0; k<sub_prob.l; k++)
3045 			sub_prob.x[k] = x[k];
3046 
3047 		// multi-class svm by Crammer and Singer
3048 		if(param->solver_type == MCSVM_CS)
3049 		{
3050 			model_->w=Malloc(double, n*nr_class);
3051 			for(i=0;i<nr_class;i++)
3052 				for(j=start[i];j<start[i]+count[i];j++)
3053 					sub_prob.y[j] = i;
3054 			Solver_MCSVM_CS Solver(&sub_prob, nr_class, weighted_C, param->eps);
3055 			Solver.Solve(model_->w);
3056 		}
3057 		else
3058 		{
3059 			if(nr_class == 2)
3060 			{
3061 				model_->w=Malloc(double, w_size);
3062 
3063 				int e0 = start[0]+count[0];
3064 				k=0;
3065 				for(; k<e0; k++)
3066 					sub_prob.y[k] = +1;
3067 				for(; k<sub_prob.l; k++)
3068 					sub_prob.y[k] = -1;
3069 
3070 				if(param->init_sol != NULL)
3071 					for(i=0;i<w_size;i++)
3072 						model_->w[i] = param->init_sol[i];
3073 				else
3074 					for(i=0;i<w_size;i++)
3075 						model_->w[i] = 0;
3076 
3077 				train_one(&sub_prob, param, model_->w, weighted_C[0], weighted_C[1]);
3078 			}
3079 			else
3080 			{
3081 				model_->w=Malloc(double, w_size*nr_class);
3082 				double *w=Malloc(double, w_size);
3083 				for(i=0;i<nr_class;i++)
3084 				{
3085 					int si = start[i];
3086 					int ei = si+count[i];
3087 
3088 					k=0;
3089 					for(; k<si; k++)
3090 						sub_prob.y[k] = -1;
3091 					for(; k<ei; k++)
3092 						sub_prob.y[k] = +1;
3093 					for(; k<sub_prob.l; k++)
3094 						sub_prob.y[k] = -1;
3095 
3096 					if(param->init_sol != NULL)
3097 						for(j=0;j<w_size;j++)
3098 							w[j] = param->init_sol[j*nr_class+i];
3099 					else
3100 						for(j=0;j<w_size;j++)
3101 							w[j] = 0;
3102 
3103 					train_one(&sub_prob, param, w, weighted_C[i], param->C);
3104 
3105 					for(j=0;j<w_size;j++)
3106 						model_->w[j*nr_class+i] = w[j];
3107 				}
3108 				free(w);
3109 			}
3110 
3111 		}
3112 
3113 		free(x);
3114 		free(label);
3115 		free(start);
3116 		free(count);
3117 		free(perm);
3118 		free(sub_prob.x);
3119 		free(sub_prob.y);
3120 		free(weighted_C);
3121 	}
3122 	return model_;
3123 }
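/*
 * A minimal usage sketch of the training interface (illustration only, not
 * part of the library).  It assumes the caller has filled prob.x / prob.y
 * with feature_node rows terminated by index == -1, as the solvers above
 * expect; the parameter values below are just examples.
 *
 *   struct parameter param;
 *   param.solver_type = L2R_LR;  param.C = 1;  param.eps = 0.01;
 *   param.p = 0.1;  param.nu = 0.5;  param.regularize_bias = 1;
 *   param.nr_weight = 0;  param.weight_label = NULL;  param.weight = NULL;
 *   param.init_sol = NULL;
 *   const char *err = check_parameter(&prob, &param);
 *   if (err == NULL)
 *   {
 *       struct model *m = train(&prob, &param);
 *       double y_hat = predict(m, prob.x[0]);
 *       save_model("model.txt", m);
 *       free_and_destroy_model(&m);
 *   }
 */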
3124 
3125 void cross_validation(const problem *prob, const parameter *param, int nr_fold, double *target)
3126 {
3127 	int i;
3128 	int *fold_start;
3129 	int l = prob->l;
3130 	int *perm = Malloc(int,l);
3131 	if (nr_fold > l)
3132 	{
3133 		nr_fold = l;
3134 		fprintf(stderr,"WARNING: # folds > # data. Will use # folds = # data instead (i.e., leave-one-out cross validation)\n");
3135 	}
3136 	fold_start = Malloc(int,nr_fold+1);
3137 	for(i=0;i<l;i++) perm[i]=i;
3138 	for(i=0;i<l;i++)
3139 	{
3140 		int j = i+rand()%(l-i);
3141 		swap(perm[i],perm[j]);
3142 	}
3143 	for(i=0;i<=nr_fold;i++)
3144 		fold_start[i]=i*l/nr_fold;
3145 
3146 	for(i=0;i<nr_fold;i++)
3147 	{
3148 		int begin = fold_start[i];
3149 		int end = fold_start[i+1];
3150 		int j,k;
3151 		struct problem subprob;
3152 
3153 		subprob.bias = prob->bias;
3154 		subprob.n = prob->n;
3155 		subprob.l = l-(end-begin);
3156 		subprob.x = Malloc(struct feature_node*,subprob.l);
3157 		subprob.y = Malloc(double,subprob.l);
3158 
3159 		k=0;
3160 		for(j=0;j<begin;j++)
3161 		{
3162 			subprob.x[k] = prob->x[perm[j]];
3163 			subprob.y[k] = prob->y[perm[j]];
3164 			++k;
3165 		}
3166 		for(j=end;j<l;j++)
3167 		{
3168 			subprob.x[k] = prob->x[perm[j]];
3169 			subprob.y[k] = prob->y[perm[j]];
3170 			++k;
3171 		}
3172 		struct model *submodel = train(&subprob,param);
3173 		for(j=begin;j<end;j++)
3174 			target[perm[j]] = predict(submodel,prob->x[perm[j]]);
3175 		free_and_destroy_model(&submodel);
3176 		free(subprob.x);
3177 		free(subprob.y);
3178 	}
3179 	free(fold_start);
3180 	free(perm);
3181 }
3182 
3183 
3184 void find_parameters(const problem *prob, const parameter *param, int nr_fold, double start_C, double start_p, double *best_C, double *best_p, double *best_score)
3185 {
3186 	// prepare CV folds
3187 
3188 	int i;
3189 	int *fold_start;
3190 	int l = prob->l;
3191 	int *perm = Malloc(int, l);
3192 	struct problem *subprob = Malloc(problem,nr_fold);
3193 
3194 	if (nr_fold > l)
3195 	{
3196 		nr_fold = l;
3197 		fprintf(stderr,"WARNING: # folds > # data. Will use # folds = # data instead (i.e., leave-one-out cross validation)\n");
3198 	}
3199 	fold_start = Malloc(int,nr_fold+1);
3200 	for(i=0;i<l;i++) perm[i]=i;
3201 	for(i=0;i<l;i++)
3202 	{
3203 		int j = i+rand()%(l-i);
3204 		swap(perm[i],perm[j]);
3205 	}
3206 	for(i=0;i<=nr_fold;i++)
3207 		fold_start[i]=i*l/nr_fold;
3208 
3209 	for(i=0;i<nr_fold;i++)
3210 	{
3211 		int begin = fold_start[i];
3212 		int end = fold_start[i+1];
3213 		int j,k;
3214 
3215 		subprob[i].bias = prob->bias;
3216 		subprob[i].n = prob->n;
3217 		subprob[i].l = l-(end-begin);
3218 		subprob[i].x = Malloc(struct feature_node*,subprob[i].l);
3219 		subprob[i].y = Malloc(double,subprob[i].l);
3220 
3221 		k=0;
3222 		for(j=0;j<begin;j++)
3223 		{
3224 			subprob[i].x[k] = prob->x[perm[j]];
3225 			subprob[i].y[k] = prob->y[perm[j]];
3226 			++k;
3227 		}
3228 		for(j=end;j<l;j++)
3229 		{
3230 			subprob[i].x[k] = prob->x[perm[j]];
3231 			subprob[i].y[k] = prob->y[perm[j]];
3232 			++k;
3233 		}
3234 	}
3235 
3236 	struct parameter param_tmp = *param;
3237 	*best_p = -1;
3238 	if(param->solver_type == L2R_LR || param->solver_type == L2R_L2LOSS_SVC)
3239 	{
3240 		if(start_C <= 0)
3241 			start_C = calc_start_C(prob, &param_tmp);
3242 		double max_C = 1024;
3243 		start_C = min(start_C, max_C);
3244 		double best_C_tmp, best_score_tmp;
3245 
3246 		find_parameter_C(prob, &param_tmp, start_C, max_C, &best_C_tmp, &best_score_tmp, fold_start, perm, subprob, nr_fold);
3247 
3248 		*best_C = best_C_tmp;
3249 		*best_score = best_score_tmp;
3250 	}
3251 	else if(param->solver_type == L2R_L2LOSS_SVR)
3252 	{
3253 		double max_p = calc_max_p(prob);
3254 		int num_p_steps = 20;
3255 		double max_C = 1048576;
3256 		*best_score = INF;
3257 
3258 		i = num_p_steps-1;
3259 		if(start_p > 0)
3260 			i = min((int)(start_p/(max_p/num_p_steps)), i);
3261 		for(; i >= 0; i--)
3262 		{
3263 			param_tmp.p = i*max_p/num_p_steps;
3264 			double start_C_tmp;
3265 			if(start_C <= 0)
3266 				start_C_tmp = calc_start_C(prob, &param_tmp);
3267 			else
3268 				start_C_tmp = start_C;
3269 			start_C_tmp = min(start_C_tmp, max_C);
3270 			double best_C_tmp, best_score_tmp;
3271 
3272 			find_parameter_C(prob, &param_tmp, start_C_tmp, max_C, &best_C_tmp, &best_score_tmp, fold_start, perm, subprob, nr_fold);
3273 
3274 			if(best_score_tmp < *best_score)
3275 			{
3276 				*best_p = param_tmp.p;
3277 				*best_C = best_C_tmp;
3278 				*best_score = best_score_tmp;
3279 			}
3280 		}
3281 	}
3282 
3283 	free(fold_start);
3284 	free(perm);
3285 	for(i=0; i<nr_fold; i++)
3286 	{
3287 		free(subprob[i].x);
3288 		free(subprob[i].y);
3289 	}
3290 	free(subprob);
3291 }
3292 
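// Decision rule implemented by predict_values below (descriptive note):
// for binary classification (and regression / one-class), a single weight
// vector is stored and dec_values[0] = w^T x (minus rho for one-class SVM);
// the sign picks label[0] vs label[1].  For multi-class (or MCSVM_CS),
// nr_class weight vectors are stored interleaved as w[(idx-1)*nr_w + i],
// one decision value per class is accumulated, and the argmax wins.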
3293 double predict_values(const struct model *model_, const struct feature_node *x, double *dec_values)
3294 {
3295 	int idx;
3296 	int n;
3297 	if(model_->bias>=0)
3298 		n=model_->nr_feature+1;
3299 	else
3300 		n=model_->nr_feature;
3301 	double *w=model_->w;
3302 	int nr_class=model_->nr_class;
3303 	int i;
3304 	int nr_w;
3305 	if(nr_class==2 && model_->param.solver_type != MCSVM_CS)
3306 		nr_w = 1;
3307 	else
3308 		nr_w = nr_class;
3309 
3310 	const feature_node *lx=x;
3311 	for(i=0;i<nr_w;i++)
3312 		dec_values[i] = 0;
3313 	for(; (idx=lx->index)!=-1; lx++)
3314 	{
3315 		// the dimension of testing data may exceed that of training
3316 		if(idx<=n)
3317 			for(i=0;i<nr_w;i++)
3318 				dec_values[i] += w[(idx-1)*nr_w+i]*lx->value;
3319 	}
3320 	if(check_oneclass_model(model_))
3321 		dec_values[0] -= model_->rho;
3322 
3323 	if(nr_class==2)
3324 	{
3325 		if(check_regression_model(model_))
3326 			return dec_values[0];
3327 		else if(check_oneclass_model(model_))
3328 			return (dec_values[0]>0)?1:-1;
3329 		else
3330 			return (dec_values[0]>0)?model_->label[0]:model_->label[1];
3331 	}
3332 	else
3333 	{
3334 		int dec_max_idx = 0;
3335 		for(i=1;i<nr_class;i++)
3336 		{
3337 			if(dec_values[i] > dec_values[dec_max_idx])
3338 				dec_max_idx = i;
3339 		}
3340 		return model_->label[dec_max_idx];
3341 	}
3342 }
3343 
3344 double predict(const model *model_, const feature_node *x)
3345 {
3346 	double *dec_values = Malloc(double, model_->nr_class);
3347 	double label=predict_values(model_, x, dec_values);
3348 	free(dec_values);
3349 	return label;
3350 }
3351 
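// Note on predict_probability below: probability outputs are produced only
// for logistic-regression solvers (see check_probability_model); each decision
// value is mapped through the sigmoid 1/(1+exp(-z)), and for more than two
// classes the per-class estimates are simply normalized to sum to 1 (a
// one-vs-rest heuristic rather than pairwise coupling).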
3352 double predict_probability(const struct model *model_, const struct feature_node *x, double* prob_estimates)
3353 {
3354 	if(check_probability_model(model_))
3355 	{
3356 		int i;
3357 		int nr_class=model_->nr_class;
3358 		int nr_w;
3359 		if(nr_class==2)
3360 			nr_w = 1;
3361 		else
3362 			nr_w = nr_class;
3363 
3364 		double label=predict_values(model_, x, prob_estimates);
3365 		for(i=0;i<nr_w;i++)
3366 			prob_estimates[i]=1/(1+exp(-prob_estimates[i]));
3367 
3368 		if(nr_class==2) // for binary classification
3369 			prob_estimates[1]=1.-prob_estimates[0];
3370 		else
3371 		{
3372 			double sum=0;
3373 			for(i=0; i<nr_class; i++)
3374 				sum+=prob_estimates[i];
3375 
3376 			for(i=0; i<nr_class; i++)
3377 				prob_estimates[i]=prob_estimates[i]/sum;
3378 		}
3379 
3380 		return label;
3381 	}
3382 	else
3383 		return 0;
3384 }
3385 
3386 static const char *solver_type_table[]=
3387 {
3388 	"L2R_LR", "L2R_L2LOSS_SVC_DUAL", "L2R_L2LOSS_SVC", "L2R_L1LOSS_SVC_DUAL", "MCSVM_CS",
3389 	"L1R_L2LOSS_SVC", "L1R_LR", "L2R_LR_DUAL",
3390 	"", "", "",
3391 	"L2R_L2LOSS_SVR", "L2R_L2LOSS_SVR_DUAL", "L2R_L1LOSS_SVR_DUAL",
3392 	"", "", "", "", "", "", "",
3393 	"ONECLASS_SVM", NULL
3394 };
3395 
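// The model file written by save_model is plain text.  A small illustration
// (the numbers are made up) for a binary L2R_LR model with 3 features and no
// bias term:
//
//   solver_type L2R_LR
//   nr_class 2
//   label 1 -1
//   nr_feature 3
//   bias -1
//   w
//   0.25
//   -0.5
//   0
//
// Each row under "w" holds nr_w values for one feature (nr_w is 1 here since
// nr_class == 2 and the solver is not MCSVM_CS); a "rho" line is added for
// one-class SVM models.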
3396 int save_model(const char *model_file_name, const struct model *model_)
3397 {
3398 	int i;
3399 	int nr_feature=model_->nr_feature;
3400 	int n;
3401 	const parameter& param = model_->param;
3402 
3403 	if(model_->bias>=0)
3404 		n=nr_feature+1;
3405 	else
3406 		n=nr_feature;
3407 	int w_size = n;
3408 	FILE *fp = fopen(model_file_name,"w");
3409 	if(fp==NULL) return -1;
3410 
3411 	char *old_locale = setlocale(LC_ALL, NULL);
3412 	if (old_locale)
3413 	{
3414 		old_locale = strdup(old_locale);
3415 	}
3416 	setlocale(LC_ALL, "C");
3417 
3418 	int nr_w;
3419 	if(model_->nr_class==2 && model_->param.solver_type != MCSVM_CS)
3420 		nr_w=1;
3421 	else
3422 		nr_w=model_->nr_class;
3423 
3424 	fprintf(fp, "solver_type %s\n", solver_type_table[param.solver_type]);
3425 	fprintf(fp, "nr_class %d\n", model_->nr_class);
3426 
3427 	if(model_->label)
3428 	{
3429 		fprintf(fp, "label");
3430 		for(i=0; i<model_->nr_class; i++)
3431 			fprintf(fp, " %d", model_->label[i]);
3432 		fprintf(fp, "\n");
3433 	}
3434 
3435 	fprintf(fp, "nr_feature %d\n", nr_feature);
3436 
3437 	fprintf(fp, "bias %.17g\n", model_->bias);
3438 
3439 	if(check_oneclass_model(model_))
3440 		fprintf(fp, "rho %.17g\n", model_->rho);
3441 
3442 	fprintf(fp, "w\n");
3443 	for(i=0; i<w_size; i++)
3444 	{
3445 		int j;
3446 		for(j=0; j<nr_w; j++)
3447 			fprintf(fp, "%.17g ", model_->w[i*nr_w+j]);
3448 		fprintf(fp, "\n");
3449 	}
3450 
3451 	setlocale(LC_ALL, old_locale);
3452 	free(old_locale);
3453 
3454 	if (ferror(fp) != 0 || fclose(fp) != 0) return -1;
3455 	else return 0;
3456 }
3457 
3458 //
3459 // FSCANF helps to handle fscanf failures.
3460 // Its do-while block avoids the ambiguity when
3461 // if (...)
3462 //    FSCANF();
3463 // is used
3464 //
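// For example, without the do-while(0) wrapper,
//   if (...)
//      FSCANF(...);
//   else ...
// would not compile as intended, because the macro would expand to a block
// followed by a stray semicolon before the else.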
3465 #define FSCANF(_stream, _format, _var)do\
3466 {\
3467 	if (fscanf(_stream, _format, _var) != 1)\
3468 	{\
3469 		fprintf(stderr, "ERROR: fscanf failed to read the model\n");\
3470 		EXIT_LOAD_MODEL()\
3471 	}\
3472 }while(0)
3473 // EXIT_LOAD_MODEL should NOT end with a semicolon.
3474 #define EXIT_LOAD_MODEL()\
3475 {\
3476 	setlocale(LC_ALL, old_locale);\
3477 	free(model_->label);\
3478 	free(model_);\
3479 	free(old_locale);\
3480 	return NULL;\
3481 }
3482 struct model *load_model(const char *model_file_name)
3483 {
3484 	FILE *fp = fopen(model_file_name,"r");
3485 	if(fp==NULL) return NULL;
3486 
3487 	int i;
3488 	int nr_feature;
3489 	int n;
3490 	int nr_class;
3491 	double bias;
3492 	double rho;
3493 	model *model_ = Malloc(model,1);
3494 	parameter& param = model_->param;
3495 	// parameters used only for training are not read back from the file; pointer members are set to NULL for safety
3496 	param.nr_weight = 0;
3497 	param.weight_label = NULL;
3498 	param.weight = NULL;
3499 	param.init_sol = NULL;
3500 
3501 	model_->label = NULL;
3502 
3503 	char *old_locale = setlocale(LC_ALL, NULL);
3504 	if (old_locale)
3505 	{
3506 		old_locale = strdup(old_locale);
3507 	}
3508 	setlocale(LC_ALL, "C");
3509 
3510 	char cmd[81];
3511 	while(1)
3512 	{
3513 		FSCANF(fp,"%80s",cmd);
3514 		if(strcmp(cmd,"solver_type")==0)
3515 		{
3516 			FSCANF(fp,"%80s",cmd);
3517 			int i;
3518 			for(i=0;solver_type_table[i];i++)
3519 			{
3520 				if(strcmp(solver_type_table[i],cmd)==0)
3521 				{
3522 					param.solver_type=i;
3523 					break;
3524 				}
3525 			}
3526 			if(solver_type_table[i] == NULL)
3527 			{
3528 				fprintf(stderr,"unknown solver type.\n");
3529 				EXIT_LOAD_MODEL()
3530 			}
3531 		}
3532 		else if(strcmp(cmd,"nr_class")==0)
3533 		{
3534 			FSCANF(fp,"%d",&nr_class);
3535 			model_->nr_class=nr_class;
3536 		}
3537 		else if(strcmp(cmd,"nr_feature")==0)
3538 		{
3539 			FSCANF(fp,"%d",&nr_feature);
3540 			model_->nr_feature=nr_feature;
3541 		}
3542 		else if(strcmp(cmd,"bias")==0)
3543 		{
3544 			FSCANF(fp,"%lf",&bias);
3545 			model_->bias=bias;
3546 		}
3547 		else if(strcmp(cmd,"rho")==0)
3548 		{
3549 			FSCANF(fp,"%lf",&rho);
3550 			model_->rho=rho;
3551 		}
3552 		else if(strcmp(cmd,"w")==0)
3553 		{
3554 			break;
3555 		}
3556 		else if(strcmp(cmd,"label")==0)
3557 		{
3558 			int nr_class = model_->nr_class;
3559 			model_->label = Malloc(int,nr_class);
3560 			for(int i=0;i<nr_class;i++)
3561 				FSCANF(fp,"%d",&model_->label[i]);
3562 		}
3563 		else
3564 		{
3565 			fprintf(stderr,"unknown text in model file: [%s]\n",cmd);
3566 			EXIT_LOAD_MODEL()
3567 		}
3568 	}
3569 
3570 	nr_feature=model_->nr_feature;
3571 	if(model_->bias>=0)
3572 		n=nr_feature+1;
3573 	else
3574 		n=nr_feature;
3575 	int w_size = n;
3576 	int nr_w;
3577 	if(nr_class==2 && param.solver_type != MCSVM_CS)
3578 		nr_w = 1;
3579 	else
3580 		nr_w = nr_class;
3581 
3582 	model_->w=Malloc(double, w_size*nr_w);
3583 	for(i=0; i<w_size; i++)
3584 	{
3585 		int j;
3586 		for(j=0; j<nr_w; j++)
3587 			FSCANF(fp, "%lf ", &model_->w[i*nr_w+j]);
3588 	}
3589 
3590 	setlocale(LC_ALL, old_locale);
3591 	free(old_locale);
3592 
3593 	if (ferror(fp) != 0 || fclose(fp) != 0) return NULL;
3594 
3595 	return model_;
3596 }
3597 
3598 int get_nr_feature(const model *model_)
3599 {
3600 	return model_->nr_feature;
3601 }
3602 
3603 int get_nr_class(const model *model_)
3604 {
3605 	return model_->nr_class;
3606 }
3607 
3608 void get_labels(const model *model_, int* label)
3609 {
3610 	if (model_->label != NULL)
3611 		for(int i=0;i<model_->nr_class;i++)
3612 			label[i] = model_->label[i];
3613 }
3614 
3615 // use inline here for better performance (around 20% faster than the non-inline one)
3616 static inline double get_w_value(const struct model *model_, int idx, int label_idx)
3617 {
3618 	int nr_class = model_->nr_class;
3619 	int solver_type = model_->param.solver_type;
3620 	const double *w = model_->w;
3621 
3622 	if(idx < 0 || idx > model_->nr_feature)
3623 		return 0;
3624 	if(check_regression_model(model_) || check_oneclass_model(model_))
3625 		return w[idx];
3626 	else
3627 	{
3628 		if(label_idx < 0 || label_idx >= nr_class)
3629 			return 0;
3630 		if(nr_class == 2 && solver_type != MCSVM_CS)
3631 		{
3632 			if(label_idx == 0)
3633 				return w[idx];
3634 			else
3635 				return -w[idx];
3636 		}
3637 		else
3638 			return w[idx*nr_class+label_idx];
3639 	}
3640 }
3641 
3642 // feat_idx: starting from 1 to nr_feature
3643 // label_idx: starting from 0 to nr_class-1 for classification models;
3644 //            for regression and one-class SVM models, label_idx is
3645 //            ignored.
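// For example, for a binary classifier only one weight vector is stored, so
// get_decfun_coef(model_, j, 0) returns w[j-1] and get_decfun_coef(model_, j, 1)
// returns -w[j-1] (see get_w_value above).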
3646 double get_decfun_coef(const struct model *model_, int feat_idx, int label_idx)
3647 {
3648 	if(feat_idx > model_->nr_feature)
3649 		return 0;
3650 	return get_w_value(model_, feat_idx-1, label_idx);
3651 }
3652 
3653 double get_decfun_bias(const struct model *model_, int label_idx)
3654 {
3655 	if(check_oneclass_model(model_))
3656 	{
3657 		fprintf(stderr, "ERROR: get_decfun_bias can not be called for a one-class SVM model\n");
3658 		return 0;
3659 	}
3660 	int bias_idx = model_->nr_feature;
3661 	double bias = model_->bias;
3662 	if(bias <= 0)
3663 		return 0;
3664 	else
3665 		return bias*get_w_value(model_, bias_idx, label_idx);
3666 }
3667 
3668 double get_decfun_rho(const struct model *model_)
3669 {
3670 	if(check_oneclass_model(model_))
3671 		return model_->rho;
3672 	else
3673 	{
3674 		fprintf(stderr, "ERROR: get_decfun_rho can be called only for a one-class SVM model\n");
3675 		return 0;
3676 	}
3677 }
3678 
3679 void free_model_content(struct model *model_ptr)
3680 {
3681 	if(model_ptr->w != NULL)
3682 		free(model_ptr->w);
3683 	if(model_ptr->label != NULL)
3684 		free(model_ptr->label);
3685 }
3686 
3687 void free_and_destroy_model(struct model **model_ptr_ptr)
3688 {
3689 	struct model *model_ptr = *model_ptr_ptr;
3690 	if(model_ptr != NULL)
3691 	{
3692 		free_model_content(model_ptr);
3693 		free(model_ptr);
3694 	}
3695 }
3696 
3697 void destroy_param(parameter* param)
3698 {
3699 	if(param->weight_label != NULL)
3700 		free(param->weight_label);
3701 	if(param->weight != NULL)
3702 		free(param->weight);
3703 	if(param->init_sol != NULL)
3704 		free(param->init_sol);
3705 }
3706 
3707 const char *check_parameter(const problem *prob, const parameter *param)
3708 {
3709 	if(param->eps <= 0)
3710 		return "eps <= 0";
3711 
3712 	if(param->C <= 0)
3713 		return "C <= 0";
3714 
3715 	if(param->p < 0)
3716 		return "p < 0";
3717 
3718 	if(prob->bias >= 0 && param->solver_type == ONECLASS_SVM)
3719 		return "prob->bias >=0, but this is ignored in ONECLASS_SVM";
3720 
3721 	if(param->regularize_bias == 0)
3722 	{
3723 		if(prob->bias != 1.0)
3724 			return "To not regularize bias, must specify -B 1 along with -R";
3725 		if(param->solver_type != L2R_LR
3726 			&& param->solver_type != L2R_L2LOSS_SVC
3727 			&& param->solver_type != L1R_L2LOSS_SVC
3728 			&& param->solver_type != L1R_LR
3729 			&& param->solver_type != L2R_L2LOSS_SVR)
3730 			return "-R option supported only for solver L2R_LR, L2R_L2LOSS_SVC, L1R_L2LOSS_SVC, L1R_LR, and L2R_L2LOSS_SVR";
3731 	}
3732 
3733 	if(param->solver_type != L2R_LR
3734 		&& param->solver_type != L2R_L2LOSS_SVC_DUAL
3735 		&& param->solver_type != L2R_L2LOSS_SVC
3736 		&& param->solver_type != L2R_L1LOSS_SVC_DUAL
3737 		&& param->solver_type != MCSVM_CS
3738 		&& param->solver_type != L1R_L2LOSS_SVC
3739 		&& param->solver_type != L1R_LR
3740 		&& param->solver_type != L2R_LR_DUAL
3741 		&& param->solver_type != L2R_L2LOSS_SVR
3742 		&& param->solver_type != L2R_L2LOSS_SVR_DUAL
3743 		&& param->solver_type != L2R_L1LOSS_SVR_DUAL
3744 		&& param->solver_type != ONECLASS_SVM)
3745 		return "unknown solver type";
3746 
3747 	if(param->init_sol != NULL
3748 		&& param->solver_type != L2R_LR
3749 		&& param->solver_type != L2R_L2LOSS_SVC
3750 		&& param->solver_type != L2R_L2LOSS_SVR)
3751 		return "Initial-solution specification supported only for solvers L2R_LR, L2R_L2LOSS_SVC, and L2R_L2LOSS_SVR";
3752 
3753 	return NULL;
3754 }
3755 
3756 int check_probability_model(const struct model *model_)
3757 {
3758 	return (model_->param.solver_type==L2R_LR ||
3759 			model_->param.solver_type==L2R_LR_DUAL ||
3760 			model_->param.solver_type==L1R_LR);
3761 }
3762 
3763 int check_regression_model(const struct model *model_)
3764 {
3765 	return (model_->param.solver_type==L2R_L2LOSS_SVR ||
3766 			model_->param.solver_type==L2R_L1LOSS_SVR_DUAL ||
3767 			model_->param.solver_type==L2R_L2LOSS_SVR_DUAL);
3768 }
3769 
3770 int check_oneclass_model(const struct model *model_)
3771 {
3772 	return model_->param.solver_type == ONECLASS_SVM;
3773 }
3774 
3775 void set_print_string_function(void (*print_func)(const char*))
3776 {
3777 	if (print_func == NULL)
3778 		liblinear_print_string = &print_string_stdout;
3779 	else
3780 		liblinear_print_string = print_func;
3781 }
3782 
3783