1 /** @file svm.c
2  ** @brief Support Vector Machines (SVM) - Implementation
3  ** @author Milan Sulc
4  ** @author Daniele Perrone
5  ** @author Andrea Vedaldi
6  **/
7 
8 /*
9 Copyright (C) 2013 Milan Sulc.
10 Copyright (C) 2012 Daniele Perrone.
11 Copyright (C) 2011-13 Andrea Vedaldi.
12 
13 All rights reserved.
14 
15 This file is part of the VLFeat library and is made available under
16 the terms of the BSD license (see the COPYING file).
17 */
18 
19 /** @file svm.h
20  ** @see @ref svm.
21  **/
22 
23 /**
24 <!-- ------------------------------------------------------------- -->
25 @page svm Support Vector Machines (SVM)
26 @author Milan Sulc
27 @author Daniele Perrone
28 @author Andrea Vedaldi
29 @tableofcontents
30 <!-- ------------------------------------------------------------- -->
31 
32 *Support Vector Machines* (SVMs) are one of the most popular types of
33 discriminative classifiers. VLFeat implements two solvers, SGD and SDCA,
34 capable of learning linear SVMs on a large scale. These linear solvers
35 can be combined with explicit feature maps to learn non-linear models
36 as well. The solvers support a few variants of the standard
37 SVM formulation, including using loss functions other than the hinge
38 loss.
39 
40 @ref svm-starting demonstrates how to use VLFeat to learn an SVM.
41 Information on SVMs and the corresponding optimization algorithms as
42 implemented by VLFeat is given in:
43 
44 - @subpage svm-fundamentals - Linear SVMs and their learning.
45 - @subpage svm-advanced - Loss functions, dual objective, and other details.
46 - @subpage svm-sgd - The SGD algorithm.
47 - @subpage svm-sdca - The SDCA algorithm.
48 
49 <!-- ------------------------------------------------------------- -->
50 @section svm-starting Getting started
51 <!-- ------------------------------------------------------------- -->
52 
53 This section demonstrates how to learn an SVM by using VLFeat. SVM
54 learning is implemented by the ::VlSvm object type. Let's
55 start by a complete example:
56 
57 @code
58 #include <stdio.h>
59 #include <vl/svm.h>
60 
61 int main()
62 {
63   vl_size const numData = 4 ;
64   vl_size const dimension = 2 ;
65   double x [dimension * numData] = {
66     0.0, -0.5,
67     0.6, -0.3,
68     0.0,  0.5,
69     0.6,  0.0} ;
70   double y [numData] = {1, 1, -1, 1} ;
71   double lambda = 0.01;
72   double const * model ;
73   double bias ;
74 
75   VlSvm * svm = vl_svm_new(VlSvmSolverSgd,
76                            x, dimension, numData,
77                            y,
78                            lambda) ;
79   vl_svm_train(svm) ;
80 
81   model = vl_svm_get_model(svm) ;
82   bias = vl_svm_get_bias(svm) ;
83 
84   printf("model w = [ %f , %f ] , bias b = %f \n",
85          model[0],
86          model[1],
87          bias);
88 
89   vl_svm_delete(svm) ;
90   return 0;
91 }
92 @endcode
93 
94 This code learns a binary linear SVM using the SGD algorithm on
95 four two-dimensional points using 0.01 as regularization parameter.
96 
97 ::VlSvmSolverSdca can be specified in place of ::VlSvmSolverSgd
98 in order to use the SDCA algorithm instead.
99 
100 Convergence and other diagnostic information can be obtained after
101 training by using the ::vl_svm_get_statistics function. Algorithms
102 regularly check for convergence (usually after each pass over the data).
103 The ::vl_svm_set_diagnostic_function can be used to specify a callback
104 to be invoked when diagnostic is run. This can be used, for example,
105 to dump information on the screen as the algorithm progresses.
106 
107 Convergence is reached after a maximum number of iterations
108 (::vl_svm_set_max_num_iterations) or after a given criterion falls
109 below a threshold (::vl_svm_set_epsilon). The meaning of these criteria
110 may depend on the specific algorithm (see @ref svm for further details).
111 
112 ::VlSvm is a quite powerful object. Algorithms only need to perform
113 inner product and accumulation operation on the data (see @ref svm-advanced).
114 This is used to abstract from the data type and support almost anything
115 by specifying just two functions (::vl_svm_set_data_functions).
116 
117 A simple interface to this advanced functionality is provided by the
118 ::VlSvmDataset object. This supports natively @c float and @c double
119 data types, as well as applying on the fly the homogeneous kernel map
120 (@ref homkermap). This is exemplified in @ref svmdataset-starting.
121 
122 */
123 
124 /**
125 <!-- ------------------------------------------------------------- -->
126 @page svm-fundamentals SVM fundamentals
127 @tableofcontents
128 <!-- ------------------------------------------------------------- -->
129 
130 This page introduces the SVM formulation used in VLFeat. See @ref svm
131 for more information on VLFeat SVM support.
132 
133 Let $ \bx \in \real^d $ be a vector representing, for example, an
134 image, an audio track, or a fragment of text. Our goal is to design a
135 *classifier*, i.e. a function that associates to each vector $\bx$ a
136 positive or negative label based on a desired criterion, for example
137 the fact that the image contains or not a cat, that the audio track
138 contains or not English speech, or that the text is or not a
139 scientific paper.
140 
141 The vector $\bx$ is classified by looking at the sign of a *linear
142 scoring function* $\langle \bx, \bw \rangle$. The goal of learning is
143 to estimate the parameter $\bw \in \real^d$ in such a way that the
144 score is positive if the vector $\bx$ belongs to the positive class
145 and negative otherwise. In fact, in the standard SVM formulation
146 the goal is to have a score of *at least 1* in the first case, and of
147 *at most -1* in the second one, imposing a *margin*.
148 
149 The parameter $\bw$ is estimated or *learned* by fitting the scoring
150 function to a training set of $n$ example pairs $(\bx_i,y_i),
151 i=1,\dots,n$. Here $y_i \in \{-1,1\}$ are the *ground truth labels* of
152 the corresponding example vectors. The fit quality is measured by a
153 *loss function* which, in standard SVMs, is the *hinge loss*:
154 
155 \[
156 \ell_i(\langle \bw,\bx\rangle) = \max\{0, 1 - y_i \langle \bw,\bx\rangle\}.
157 \]
158 
159 Note that the hinge loss is zero only if the score $\langle
160 \bw,\bx\rangle$ is at least 1 or at most -1, depending on the label
161 $y_i$.
162 
163 Fitting the training data is usually insufficient. In order for the
164 scoring function to *generalize to future data* as well, it is usually
165 preferable to trade off the fitting accuracy with the *regularity* of
166 the learned scoring function $\langle \bx, \bw \rangle$. Regularity in
167 the standard formulation is measured by the norm of the parameter
168 vector $\|\bw\|^2$ (see @ref svm-advanced). Averaging the loss on all
169 training samples and adding to it the regularizer weighed by a
170 parameter $\lambda$ yields the *regularized loss objective*
171 
172 @f{equation}{
173 \boxed{\displaystyle
174 E(\bw) =  \frac{\lambda}{2} \left\| \bw \right\|^2
175 + \frac{1}{n} \sum_{i=1}^n \max\{0, 1 - y_i \langle \bw,\bx\rangle\}.
176 \label{e:svm-primal-hinge}
177 }
178 @f}
179 
180 Note that this objective function is *convex*, so that there exists a
181 single global optimum.
182 
183 The scoring function $\langle \bx, \bw \rangle$ considered so far has
184 been linear and unbiased. @ref svm-bias discusses how a bias term can
185 be added to the SVM and @ref svm-feature-maps shows how non-linear
186 SVMs can be reduced to the linear case by computing suitable feature
187 maps.
188 
189 @ref svm-learning shows how VLFeat can be used to learn an SVM by
190 minimizing $E(\bw)$.
191 
192 <!-- ------------------------------------------------------------- -->
193 @section svm-learning Learning
194 <!-- ------------------------------------------------------------- -->
195 
196 Learning an SVM amounts to finding the minimizer $\bw^*$ of the cost
197 function $E(\bw)$. While there are dozens of methods that can be used
198 to do so, VLFeat implements two large scale methods, designed to work
199 with linear SVMs (see @ref svm-feature-maps to go beyond linear):
200 
201 - @ref svm-sgd
202 - @ref svm-sdca
203 
204 Using these solvers is exemplified in @ref svm-starting.
205 
206 <!-- ------------------------------------------------------------- -->
207 @section svm-bias Adding a bias
208 <!-- ------------------------------------------------------------- -->
209 
210 It is common to add to the SVM scoring function a *bias term* $b$, and
211 to consider the score $\langle \bx,\bw \rangle + b$. In practice the
212 bias term can be crucial to fit the training data optimally, as there
213 is no reason why the inner products $\langle \bx,\bw \rangle$ should
214 be naturally centered at zero.
215 
216 Some SVM learning algorithms can estimate both $\bw$ and $b$
217 directly. However, other algorithms such as SGD and SDCA cannot. In
218 this case, a simple workaround is to add a constant component $B > 0$
219 (we call this constant the *bias multiplier*) to the data,
220 i.e. consider the extended data vectors:
221 \[
222 \bar \bx = \begin{bmatrix} \bx \\ B \end{bmatrix},
223 \quad
224 \bar \bw = \begin{bmatrix} \bw \\ w_b \end{bmatrix}.
225 \]
226 In this manner the scoring function incorporates implicitly a bias $b = B w_b$:
227 \[
228 \langle \bar\bx, \bar\bw \rangle =
229 \langle \bx, \bw \rangle + B w_b.
230 \]
231 
232 The disadvantage of this reduction is that the term $w_b^2$ becomes
233 part of the SVM regularizer, which shrinks the bias $b$ towards
234 zero. This effect can be alleviated by making $B$ sufficiently large,
235 because in this case $\|\bw\|^2 \gg w_b^2$ and the shrinking effect is
236 negligible.
237 
238 Unfortunately, making $B$ too large makes the problem numerically
239 unbalanced, so a reasonable trade-off between shrinkage and stability
240 is generally sought. Typically, a good trade-off is obtained by
241 normalizing the data to have unitary Euclidean norm and then choosing
242 $B \in [1, 10]$.
243 
244 Specific implementations of SGD and SDCA may provide explicit support
245 to learn the bias in this manner, but it is important to understand
246 the implications on speed and accuracy of the learning if this is
247 done.
248 
249 <!-- ------------------------------------------------------------- -->
250 @section svm-feature-maps Non-linear SVMs and feature maps
251 <!-- ------------------------------------------------------------- -->
252 
253 So far only linear scoring function $\langle \bx,\bw \rangle$ have
254 been considered. Implicitly, however, this assumes that the objects to
255 be classified (e.g. images) have been encoded as vectors $\bx$ in a
256 way that makes linear classification possible. This encoding step can
257 be made explicit by introducing the *feature map* $\Phi(\bx) \in
258 \real^d$. Including the feature map yields a scoring function
259 *non-linear* in $\bx$:
260 \[
261 \bx\in\mathcal{X} \quad\longrightarrow\quad \langle \Phi(\bx), \bw \rangle.
262 \]
263 The nature of the input space $\mathcal{X}$ can be arbitrary and might
264 not have a vector space structure at all.
265 
266 The representation or encoding captures a notion of *similarity*
267 between objects: if two vectors $\Phi(\bx_1)$ and $\Phi(\bx_2)$ are
268 similar, then their scores will also be similar. Note that choosing a
269 feature map amounts to incorporating this information in the model
270 *prior* to learning.
271 
272 The relation of feature maps to similarity functions is formalized by
273 the notion of a *kernel*, a positive definite function $K(\bx,\bx')$
274 measuring the similarity of a pair of objects. A feature map defines a
275 kernel by
276 
277 \[
278 K(\bx,\bx') = \langle \Phi(\bx),\Phi(\bx') \rangle.
279 \]
280 
281 Vice versa, any kernel function can be represented by a feature map in
282 this manner, establishing an equivalence.
283 
284 So far, all solvers in VLFeat assume that the feature map $\Phi(\bx)$
285 can be explicitly computed. Although classically kernels were
286 introduced to generalize solvers to non-linear SVMs for which a
287 feature map *cannot* be computed (e.g. for a Gaussian kernel the
288 feature map is infinite dimensional), in practice using explicit
289 feature representations allows the use of much faster solvers, so it makes
290 sense to *reverse* this process.
291 */
292 
293 /**
294 <!-- ------------------------------------------------------------- -->
295 @page svm-advanced Advanced SVM topics
296 @tableofcontents
297 <!-- ------------------------------------------------------------- -->
298 
299 This page discusses advanced SVM topics. For an introduction to SVMs,
300 please refer to @ref svm and @ref svm-fundamentals.
301 
302 <!-- ------------------------------------------------------------- -->
303 @section svm-loss-functions Loss functions
304 <!-- ------------------------------------------------------------- -->
305 
306 The SVM formulation given in @ref svm-fundamentals uses the
307 hinge loss, which is only one of a variety of loss functions that
308 are often used for SVMs. More generally, one
309 can consider the objective
310 
311 @f{equation}{
312 E(\bw) =  \frac{\lambda}{2} \left\| \bw \right\|^2 + \frac{1}{n} \sum_{i=1}^n \ell_i(\langle \bw,\bx\rangle).
313 \label{e:svm-primal}
314 @f}
315 
316 where the loss $\ell_i(z)$ is a convex function of the scalar variable
317 $z$. Losses differ by: (i) their purpose (some are suitable for
318 classification, other for regression), (ii) their smoothness (which
319 usually affects how quickly the SVM objective function can be
320 minimized), and (iii) their statistical interpretation (for example
321 the logistic loss can be used to learn logistic models).
322 
323 Concrete examples are the:
324 
325 <table>
326 <tr>
327 <td>Name</td>
328 <td>Loss $\ell_i(z)$</td>
329 <td>Description</td>
330 </tr>
331 <tr>
332 <td>Hinge</td>
333 <td>$\max\{0, 1-y_i z\}$</td>
334 <td>The standard SVM loss function.</td>
335 </tr>
336 <tr>
337 <td>Square hinge</td>
338 <td>$\max\{0, 1-y_i z\}^2$</td>
339 <td>The standard SVM loss function, but squared. This version is
340 smoother and may yield numerically easier problems.</td>
341 </tr>
342 <tr>
343 <td>Square or l2</td>
344 <td>$(y_i - z)^2$</td>
345 <td>This loss yields the ridge regression model (l2 regularised least
346 square).</td>
347 </tr>
348 <tr>
349 <td>Linear or l1</td>
350 <td>$|y_i - z|$</td>
351 <td>Another loss suitable for regression, usually more robust but
352 harder to optimize than the squared one.</td>
353 </tr>
354 <tr>
355 <td>Insensitive l1</td>
356 <td>$\max\{0, |y_i - z| - \epsilon\}$.</td>
357 <td>This is a variant of the previous loss, proposed in the original
358 Support Vector Regression formulation. Differently from the previous
359 two losses, the insensitivity may yield a sparse selection of
360 support vectors.</td>
361 </tr>
362 <tr>
363 <td>Logistic</td>
364 <td>$\log(1 + e^{-y_i z})$</td>
365 <td>This corresponds to regularized logistic regression. The loss can
366 be seen as a negative log-likelihood: $\ell_i(z) = -\log P[y_i | z] =
367 - \log \sigma(y_iz/2)$, where $\sigma(z) = e^z/(1 + e^z)$ is the
368 sigmoid function, mapping a score $z$ to a probability. The $1/2$
369 factor in the sigmoid is due to the fact that labels are in $\{-1,1\}$
370 rather than $\{0,1\}$ as more common for the standard sigmoid
371 model.</td>
372 </tr>
373 </table>
374 
375 <!-- ------------------------------------------------------------- -->
376 @section svm-data-abstraction Data abstraction: working with compressed data
377 <!-- ------------------------------------------------------------- -->
378 
379 VLFeat learning algorithms (SGD and SDCA) access the data by means of
380 only two operations:
381 
382 - *inner product*: computing the inner product between the model and
383 a data vector, i.e. $\langle \bw, \bx \rangle$.
384 - *accumulation*: summing a data vector to the model, i.e. $\bw
385 \leftarrow \bw + \beta \bx$.
386 
387 VLFeat learning algorithms are *parameterized* in these two
388 operations. As a consequence, the data can be stored in any format
389 suitable to the user (e.g. dense matrices, sparse matrices,
390 block-sparse matrices, disk caches, and so on) provided that these two
391 operations can be implemented efficiently. Differently from the data,
392 however, the model vector $\bw$ is represented simply as a dense array
393 of doubles. This choice is adequate in almost any case.
394 
395 A particularly useful aspect of this design choice is that the
396 training data can be stored in *compressed format* (for example by
397 using product quantization (PQ)). Furthermore, higher-dimensional
398 encodings such as the homogeneous kernel map (@ref homkermap) and the
399 intersection kernel map can be *computed on the fly*. Such techniques
400 are very important when dealing with GBs of data.
401 
402 <!-- ------------------------------------------------------------- -->
403 @section svm-dual-problem Dual problem
404 <!-- ------------------------------------------------------------- -->
405 
406 In optimization, the *dual objective* $D(\balpha)$ of the SVM
407 objective $E(\bw)$ is of great interest. To obtain the dual objective,
408 one starts by approximating each loss term from below by a family of planes:
409 \[
410 \ell_i(z) = \sup_{u} (u z - \ell_i^*(u) ),
411 \qquad
412 \ell_i^*(u) = \sup_{z} (z u - \ell_i(z) )
413 \]
414 where $\ell_i^*(u)$ is the *dual conjugate* of the loss and gives the
415 intercept of each approximating plane as a function of the slope. When
416 the loss function is convex, the approximation is in fact exact. Examples
417 include:
418 
419 <table>
420 <tr>
421 <td>Name</td>
422 <td>Loss $\ell_i(z)$</td>
423 <td>Conjugate loss $\ell_i^*(u)$</td>
424 </tr>
425 <tr>
426 <td>Hinge</td>
427 <td>$\max\{0, 1-y_i z\}$</td>
428 <td>\[
429 \ell_i^*(u) =
430 \begin{cases}
431 y_i u, & -1 \leq y_i u \leq 0, \\
432 +\infty, & \text{otherwise}
433 \end{cases}
434 \]</td>
435 </tr>
436 <tr>
437 <td>Square hinge</td>
438 <td>$\max\{0, 1-y_i z\}^2$</td>
439 <td>\[\ell_i^*(u) =
440 \begin{cases}
441 y_i u + \frac{u^2}{4}, & y_i u \leq 0, \\
442 +\infty, & \text{otherwise} \\
443 \end{cases}\]</td>
444 </tr>
445 <tr>
446 <td>Linear or l1</td>
447 <td>$|y_i - z|$</td>
448 <td>\[\ell_i^*(u) =
449 \begin{cases}
450 y_i u, & -1 \leq y_i u \leq 1, \\
451 +\infty, & \text{otherwise} \\
452 \end{cases}\]</td>
453 </tr>
454 <tr>
455 <td>Square or l2</td>
456 <td>$(y_i - z)^2$</td>
457 <td>\[\ell_i^*(u)=y_iu + \frac{u^2}{4}\]</td>
458 </tr>
459 <tr>
460 <td>Insensitive l1</td>
461 <td>$\max\{0, |y_i - z| - \epsilon\}$.</td>
462 <td></td>
463 </tr>
464 <tr>
465 <td>Logistic</td>
466 <td>$\log(1 + e^{-y_i z})$</td>
467 <td>\[\ell_i^*(u) =
468  \begin{cases}
469  (1+u) \log(1+u) - u \log(-u), & -1 \leq y_i u \leq 0, \\
470  +\infty, & \text{otherwise} \\
471  \end{cases}\]
472 </td>
473 </tr>
474 </table>
475 
476 Since each plane $- z \alpha_i - \ell^*_i(-\alpha_i) \leq \ell_i(z)$
477 bounds the loss from below, by substituting in $E(\bw)$ one can write
478 a lower bound for the SVM objective
479 \[
480 F(\bw,\balpha) = \frac{\lambda}{2} \|\bw\|^2 -
481 \frac{1}{n}\sum_{i=1}^n (\bw^\top \bx_i\alpha_i + \ell_i^*(-\alpha_i))
482 \leq E(\bw).
483 \]
484 for each setting of the *dual variables* $\alpha_i$. The dual
485 objective function $D(\balpha)$ is obtained by minimizing the lower
486 bound $F(\bw,\balpha)$ w.r.t. to $\bw$:
487 \[
488 D(\balpha) = \inf_{\bw} F(\bw,\balpha) \leq E(\bw).
489 \]
490 The minimizer and the dual objective are now easy to find:
491 \[
492 \boxed{\displaystyle
493 \bw(\balpha) =
494 \frac{1}{\lambda n}
495 \sum_{i=1}^n \bx_i \alpha_i = \frac{1}{\lambda n} X\balpha,
496 \quad
497 D(\balpha) = - \frac{1}{2\lambda n^2} \balpha^\top X^\top X \balpha +
498 \frac{1}{n} \sum_{i=1}^n - \ell_i^*(-\alpha_i)
499 }
500 \]
501 where $X = [\bx_1, \dots, \bx_n]$ is the data matrix. Since the dual
502 is uniformly smaller than the primal, one has the *duality gap* bound:
503 \[
504 D(\balpha) \leq P(\bw^*) \leq P(\bw(\balpha))
505 \]
506 This bound can be used to evaluate how far off $\bw(\balpha)$ is from
507 the primal minimizer $\bw^*$. In fact, due to convexity, this bound
508 can be shown to be zero when $\balpha^*$ is the dual maximizer (strong
509 duality):
510 \[
511 D(\balpha^*) = P(\bw^*) = P(\bw(\balpha^*)),
512 \quad \bw^* = \bw(\balpha^*).
513 \]
514 
515 <!-- ------------------------------------------------------------- -->
516 @section svm-C Parametrization in C
517 <!-- ------------------------------------------------------------- -->
518 
519 Often a slightly different form of the SVM objective is considered,
520 where a parameter $C$ is used to scale the loss instead of the regularizer:
521 
522 \[
523 E_C(\bw) = \frac{1}{2} \|\bw\|^2 + C \sum_{i=1}^n \ell_i(\langle \bx_i, \bw\rangle)
524 \]
525 
526 This and the objective function $E(\bw)$ in $\lambda$ are equivalent
527 (proportional) if
528 
529 \[
530 \lambda = \frac{1}{nC},
531 \qquad C = \frac{1}{n\lambda}.
532 \] up to an overall scaling factor of the problem.
533 
534 **/
535 
536 /**
537 
538 <!-- ------------------------------------------------------------- -->
539 @page svm-sdca Stochastic Dual Coordinate Ascent
540 @tableofcontents
541 <!-- ------------------------------------------------------------- -->
542 
543 This page describes the *Stochastic Dual Coordinate Ascent* (SDCA)
544 linear SVM solver. Please see @ref svm for an overview of VLFeat SVM
545 support.
546 
547 SDCA maximizes the dual SVM objective (see @ref svm-dual-problem
548 for a derivation of this expression):
549 
550 \[
551 D(\balpha) = - \frac{1}{2\lambda n^2} \balpha^\top X^\top X \balpha +
552 \frac{1}{n} \sum_{i=1}^n - \ell_i^*(-\alpha_i)
553 \]
554 
555 where $X$ is the data matrix. Recall that the primal parameter
556 corresponding to a given setting of the dual variables is:
557 
558 \[
559 \bw(\balpha) = \frac{1}{\lambda n} \sum_{i=1}^n \bx_i \alpha_i = \frac{1}{\lambda n} X\balpha
560 \]
561 
562 In its most basic form, the *SDCA algorithm* can be summarized as follows:
563 
564 - Let $\balpha_0 = 0$.
565 - Until the duality gap $P(\bw(\balpha_t)) -  D(\balpha_t) < \epsilon$
566   - Pick a dual variable $q$ uniformly at random in $1, \dots, n$.
567   - Maximize the dual with respect to this variable: $\Delta\alpha_q = \max_{\Delta\alpha_q} D(\balpha_t + \Delta\alpha_q \be_q )$
568   - Update $\balpha_{t+1} = \balpha_{t} + \be_q \Delta\alpha_q$.
569 
570 In VLFeat, we partially use the nomenclature from @cite{shwartz13a-dual} and @cite{hsieh08a-dual}.
571 
572 <!-- ------------------------------------------------------------- -->
573 @section svm-sdca-dual-max Dual coordinate maximization
574 <!-- ------------------------------------------------------------- -->
575 
576 The updated dual objective can be expanded as:
577 \[
578 D(\balpha_t + \be_q \Delta\alpha_q) =
579 \text{const.}
580 - \frac{1}{2\lambda n^2} \bx_q^\top \bx_q (\Delta\alpha_q)^2
581 - \frac{1}{n} \bx_q^\top \frac{X\balpha_t}{\lambda n} \Delta\alpha_q
582 - \frac{1}{n} \ell^*_q(- \alpha_q - \Delta\alpha_q)
583 \]
584 This can also be written as
585 @f{align*}
586 D(\balpha_t + \be_q \Delta\alpha_q) &\propto
587 - \frac{A}{2} (\Delta\alpha_q)^2
588 - B \Delta\alpha_q
589 - \ell^*_q(- \alpha_q - \Delta\alpha_q),
590 \\
591 A &= \frac{1}{\lambda n} \bx_q^\top \bx_q = \frac{1}{\lambda n} \| \bx_q \|^2,
592 \\
593 B &= \bx_q^\top \frac{X\balpha_t}{\lambda n} = \bx_q^\top \bw_t.
594 @f}
595 Maximizing this quantity in the scalar variable $\Delta\alpha_q$ is usually
596 not difficult. It is convenient to store and incrementally
597 update the model $\bw_t$ after the optimal step $\Delta\alpha_q$ has been
598 determined:
599 \[
600 \bw_t = \frac{X \balpha_t}{\lambda n},
601 \quad \bw_{t+1} = \bw_t + \frac{1}{\lambda n} \bx_q \Delta\alpha_q.
602 \]
603 
604 For example, consider the hinge loss as given in @ref svm-advanced :
605 \[
606 \ell_q^*(u) =
607 \begin{cases}
608 y_q u, & -1 \leq y_q u \leq 0, \\
609 +\infty, & \text{otherwise}.
610 \end{cases}
611 \]
612 The maximizer $\Delta\alpha_q$ of the update objective must be in the
613 range where the conjugate loss is not infinite. Ignoring such bounds,
614 the update can be obtained by setting the derivative of the objective
615 to zero, obtaining
616 \[
617 \tilde {\Delta \alpha_q}= \frac{y_q - B}{A}.
618 \]
619 Note that $B$ is simply the current score associated by the SVM to
620 the sample $\bx_q$. Incorporating the constraint $-1 \leq - y_q
621 (\alpha_q + \Delta \alpha_q) \leq 0$,
622 i.e. $0 \leq y_q (\alpha_q + \Delta \alpha_q) \leq 1$, one obtains the update
623 \[
624 \Delta\alpha_q =  y_q \max\{0, \min\{1, y_q (\tilde {\Delta\alpha_q } + \alpha_q)\}\} - \alpha_q.
625 \]
626 
627 <!-- ------------------------------------------------------------ --->
628 @section svm-sdca-details Implementation details
629 <!-- ------------------------------------------------------------ --->
630 
631 Rather than visiting points completely at random, VLFeat SDCA follows
632 the best practice of visiting all the points at every epoch (pass
633 through the data), changing the order of the visit randomly by picking
634 every time a new random permutation.
635 **/
636 
637 /**
638 <!-- ------------------------------------------------------------- -->
639 @page svm-sgd Stochastic Gradient Descent
640 @tableofcontents
641 <!-- ------------------------------------------------------------- -->
642 
643 This page describes the *Stochastic Gradient Descent* (SGD) linear SVM
644 solver. SGD minimizes directly the primal SVM objective (see @ref svm):
645 
646 \[
647 E(\bw) = \frac{\lambda}{2} \left\| \bw \right\|^2 + \frac{1}{n} \sum_{i=1}^n
648 \ell_i(\langle \bw,\bx\rangle)
649 \]
650 
651 First, rewrite the objective as the average
652 
653 \[
654 E(\bw) = \frac{1}{n} \sum_{i=1}^n E_i(\bw),
655 \quad
656 E_i(\bw) = \frac{\lambda}{2}  \left\| \bw \right\|^2 + \ell_i(\langle \bw,\bx\rangle).
657 \]
658 
659 Then SGD performs gradient steps by considering at each iteration
660 one term $E_i(\bw)$ selected at random from this average.
661 In its most basic form, the algorithm is:
662 
663 - Start with $\bw_0 = 0$.
664 - For $t=1,2,\dots T$:
665   - Sample one index $i$ in $1,\dots,n$ uniformly at random.
666   - Compute a subgradient $\bg_t$ of $E_i(\bw)$ at $\bw_t$.
667   - Compute the learning rate $\eta_t$.
668   - Update $\bw_{t+1} = \bw_t - \eta_t \bg_t$.
669 
670 Provided that the learning rate $\eta_t$ is chosen correctly, this
671 simple algorithm is guaranteed to converge to the minimizer $\bw^*$ of
672 $E$.
673 
674 <!-- ------------------------------------------------------------- -->
675 @section svm-sgd-convergence Convergence and speed
676 <!-- ------------------------------------------------------------- -->
677 
678 The goal of the SGD algorithm is to bring the *primal suboptimality*
679 below a threshold $\epsilon_P$:
680 \[
681 E(\bw_t) - E(\bw^*) \leq \epsilon_P.
682 \]
683 
684 If the learning rate $\eta_t$ is selected appropriately, SGD can be
685 shown to converge properly. For example,
686 @cite{shalev-shwartz07pegasos} show that, since $E(\bw)$ is
687 $\lambda$-strongly convex, then using the learning rate
688 \[
689 \boxed{\eta_t = \frac{1}{\lambda t}}
690 \]
691 guarantees that the algorithm reaches primal-suboptimality $\epsilon_P$ in
692 \[
693 \tilde O\left( \frac{1}{\lambda \epsilon_P} \right).
694 \]
695 iterations. This particular SGD variant is sometimes known as PEGASOS
696 @cite{shalev-shwartz07pegasos} and is the version implemented in
697 VLFeat.
698 
699 The *convergence speed* is not sufficient to tell the *learning speed*,
700 i.e. how quickly an algorithm can learn an SVM that performs optimally
701 on the test set. The following two observations
702 can be used to link convergence speed to learning speed:
703 
704 - The regularizer strength is often heuristically selected to be
705   inversely proportional to the number of training samples: $\lambda =
706   \lambda_0 /n$. This reflects the fact that with more training data
707   the prior should count less.
708 - The primal suboptimality $\epsilon_P$ should be about the same as
709   the estimation error of the SVM primal. This estimation error is due
710   to the finite training set size and can be shown to be of the order
711   of $1/\lambda n = 1 / \lambda_0$.
712 
713 Under these two assumptions, PEGASOS can learn a linear SVM in time
714 $\tilde O(n)$, which is *linear in the number of training
715 examples*. This compares favorably with the $O(n^2)$ or worse cost of
716 non-linear SVM solvers.
717 
718 <!-- ------------------------------------------------------------- -->
719 @section svm-sgd-bias The bias term
720 <!-- ------------------------------------------------------------- -->
721 
722 Adding a bias $b$ to the SVM scoring function $\langle \bw, \bx
723 \rangle +b$ is done, as explained in @ref svm-bias, by appending a
724 constant feature $B$ (the *bias multiplier*) to the data vectors $\bx$
725 and a corresponding weight element $w_b$ to the weight vector $\bw$,
726 so that $b = B w_b$. As noted, the bias multiplier should be
727 relatively large in order to avoid shrinking the bias towards zero,
728 but small to make the optimization stable. In particular, setting $B$
729 to zero learns an unbiased SVM (::vl_svm_set_bias_multiplier).
730 
731 To counter instability caused by a large bias multiplier, the learning
732 rate of the bias is slowed down by multiplying the overall learning
733 rate $\eta_t$ by a bias-specific rate coefficient
734 (::vl_svm_set_bias_learning_rate).
735 
736 As a rule of thumb, if the data vectors $\bx$ are $l^2$ normalized (as
737 they typically should for optimal performance), then a reasonable bias
738 multiplier is in the range 1 to 10 and a reasonable bias learning rate
739 is somewhere in the range of the inverse of that (in this manner the
740 two parts of the extended feature vector $(\bx, B)$ are balanced).
741 
742 <!-- ------------------------------------------------------------- -->
743 @section svm-sgd-starting-iteration Adjusting the learning rate
744 <!-- ------------------------------------------------------------- -->
745 
746 Initially, the learning rate $\eta_t = 1/\lambda t$ is usually too
747 fast: as usually $\lambda \ll 1$, $\eta_1 \gg 1$. But this is clearly
748 excessive (for example, without a loss term, the best learning rate at
749 the first iteration is simply $\eta_1=1$, as this nails the optimum in
750 one step). Thus, the learning rate formula is modified to be $\eta_t =
751 1 / \lambda (t + t_0)$, where $t_0 \approx 2/\lambda$, which is
752 equivalent to start $t_0$ iterations later. In this manner $\eta_1
753 \approx 1/2$.
754 
755 <!-- ------------------------------------------------------------ --->
756 @subsection svm-sgd-warm-start Warm start
757 <!-- ------------------------------------------------------------ --->
758 
759 Starting from a given model $\bw$ is easy in SGD as the optimization
760 runs in the primal. However, the starting iteration index $t$ should
761 also be advanced for a warm start, as otherwise the initial setting of
762 $\bw$ is rapidly forgotten (::vl_svm_set_model, ::vl_svm_set_bias,
763 ::vl_svm_set_iteration_number).
764 
765 <!-- ------------------------------------------------------------- -->
766 @section svm-sgd-details Implementation details
767 <!-- ------------------------------------------------------------- -->
768 
769 @par "Random sampling of points"
770 
771 Rather than visiting points completely at random, VLFeat SGD follows
772 the best practice of visiting all the points at every epoch (pass
773 through the data), changing the order of the visit randomly by picking
774 every time a new random permutation.
775 
776 @par "Factored representation"
777 
778 At each iteration, the SGD algorithm updates the vector $\bw$
779 (including the additional bias component $w_b$) as $\bw_{t+1}
780 \leftarrow \bw_t - \lambda \eta_t \bw_t - \eta_t \bg_t$, where
781 $\eta_t$ is the learning rate. If the subgradient of the loss function
782 $\bg_t$ is zero at a given iteration, this amounts to simply shrink
783 $\bw$ towards the origin by multiplying it by the factor $1 - \lambda
784 \eta_t$. Thus such an iteration can be accelerated significantly by
785 representing internally $\bw_t = f_t \bu_t$, where $f_t$ is a scaling
786 factor. Then, the update becomes
787 \[
788    f_{t+1} \bu_{t+1}
789    = f_{t} \bu_{t} - \lambda \eta_t f_{t} \bu_{t} - \eta_t \bg_t
790    = (1-\lambda \eta_t) f_{t} \bu_{t} - \eta_t \bg_t.
791 \]
792 Setting $f_{t+1} = (1-\lambda \eta_t) f_{t}$, this gives the update
793 equation for $\bu_t$
794 \[
795 \bu_{t+1} = \bu_{t} - \frac{\eta_t}{f_{t+1}} \bg_t.
796 \]
797 but this step can be skipped whenever $\bg_t$ is equal to zero.
798 
799 When the bias component has a different learning rate, this scheme
800 must be adjusted slightly by adding a separated factor for the bias,
801 but it is otherwise identical.
802 
803 
804 **/
805 
806 /*
807 
808 <!-- ------------------------------------------------------------ --->
809 @section svm-pegasos PEGASOS
810 <!-- ------------------------------------------------------------ --->
811 
812 <!-- ------------------------------------------------------------ --->
813 @subsection svm-pegasos-algorithm Algorithm
814 <!-- ------------------------------------------------------------ --->
815 
816 PEGASOS @cite{shalev-shwartz07pegasos} is a stochastic subgradient
817 optimizer. At the <em>t</em>-th iteration the algorithm:
818 
- Samples uniformly at random a subset @f$ A_t @f$ of <em>k</em>
820 training pairs @f$(x,y)@f$ from the <em>m</em> pairs provided for
821 training (this subset is called mini batch).
822 - Computes a subgradient @f$ \nabla_t @f$ of the function @f$ E_t(w) =
823 \frac{1}{2}\|w\|^2 + \frac{1}{k} \sum_{(x,y) \in A_t} \ell(w;(x,y))
824 @f$ (this is the SVM objective function restricted to the
825 minibatch).
826 - Compute an intermediate weight vector @f$ w_{t+1/2} @f$ by doing a
827 step @f$ w_{t+1/2} = w_t - \alpha_t \nabla_t @f$ with learning rate
828 @f$ \alpha_t = 1/(\eta t) @f$ along the subgradient. Note that the
829 learning rate is inversely proportional to the iteration number.
830 - Back projects the weight vector @f$ w_{t+1/2} @f$ on the
831 hypersphere of radius @f$ \sqrt{\lambda} @f$ to obtain the next
832 model estimate @f$ w_{t+1} @f$:
833 @f[
834 w_t = \min\{1, \sqrt{\lambda}/\|w\|\} w_{t+1/2}.
835 @f]
836 The hypersphere is guaranteed to contain the optimal weight vector
837 @f$ w^* @f$.
838 
The VLFeat implementation fixes the size of the mini batches to
@f$ k = 1 @f$.
841 
842 
843 <!-- ------------------------------------------------------------ --->
844 @subsection svm-pegasos-permutation Permutation
845 <!-- ------------------------------------------------------------ --->
846 
847 VLFeat PEGASOS can use a user-defined permutation to decide the order
848 in which data points are visited (instead of using random
849 sampling). By specifying a permutation the algorithm is guaranteed to
visit each data point exactly once in each loop. The permutation need
not be bijective. This can be used to visit certain data samples
852 more or less often than others, implicitly reweighting their relative
853 importance in the SVM objective function. This can be used to balance
854 the data.
855 
856 <!-- ------------------------------------------------------------ --->
857 @subsection svm-pegasos-kernels Non-linear kernels
858 <!-- ------------------------------------------------------------ --->
859 
860 PEGASOS can be extended to non-linear kernels, but the algorithm is
861 not particularly efficient in this setting [1]. When possible, it may
862 be preferable to work with explicit feature maps.
863 
864 Let @f$ k(x,y) @f$ be a positive definite kernel. A <em>feature
865 map</em> is a function @f$ \Psi(x) @f$ such that @f$ k(x,y) = \langle
866 \Psi(x), \Psi(y) \rangle @f$. Using this representation the non-linear
867 SVM learning objective function writes:
868 
869 @f[
870 \min_{w} \frac{\lambda}{2} \|w\|^2 + \frac{1}{m} \sum_{i=1}^n
871 \ell(w; (\Psi(x)_i,y_i)).
872 @f]
873 
874 Thus the only difference with the linear case is that the feature @f$
875 \Psi(x) @f$ is used in place of the data @f$ x @f$.
876 
877 @f$ \Psi(x) @f$ can be learned off-line, for instance by using the
878 incomplete Cholesky decomposition @f$ V^\top V @f$ of the Gram matrix
879 @f$ K = [k(x_i,x_j)] @f$ (in this case @f$ \Psi(x_i) @f$ is the
880 <em>i</em>-th columns of <em>V</em>). Alternatively, for additive
881 kernels (e.g. intersection, Chi2) the explicit feature map computed by
882 @ref homkermap.h can be used.
883 
884 For additive kernels it is also possible to perform the feature
885 expansion online inside the solver, setting the specific feature map
via ::vl_svmdataset_set_map. This is particularly useful to keep the
887 size of the training data small, when the number of the samples is big
888 or the memory is limited.
889 */
890 
891 #include "svm.h"
892 #include "mathop.h"
893 #include <string.h>
894 
struct VlSvm_ {
  VlSvmSolverType solver ;      /**< SVM solver type. */

  vl_size dimension ;           /**< Model dimension. */
  double * model ;              /**< Model ($\bw$ vector). */
  double bias ;                 /**< Bias. */
  double biasMultiplier ;       /**< Bias feature multiplier. */

  /* valid during a run */
  double lambda ;               /**< Regularizer multiplier. */
  void const * data ;           /**< Training data (opaque; accessed only through the callbacks below). */
  vl_size numData ;             /**< Number of training data points. */
  double const * labels ;       /**< Data labels. */
  double const * weights ;      /**< Data weights (NULL means uniform weighting). */

  VlSvmDataset * ownDataset ;   /**< Optional owned dataset. */

  VlSvmDiagnosticFunction diagnosticFn ;  /**< Optional diagnostic callback. */
  void * diagnosticFnData ;               /**< Opaque pointer handed back to the diagnostic callback. */
  vl_size diagnosticFrequency ; /**< Frequency of diagnostic. */

  VlSvmLossFunction lossFn ;              /**< Loss function callback. */
  VlSvmLossFunction conjugateLossFn ;     /**< Conjugate loss callback. */
  VlSvmLossFunction lossDerivativeFn ;    /**< Loss derivative callback. */
  VlSvmDcaUpdateFunction dcaUpdateFn ;    /**< Dual variable update callback. */
  VlSvmInnerProductFunction innerProductFn ;  /**< Data/model inner product callback. */
  VlSvmAccumulateFunction accumulateFn ;      /**< Data accumulation callback. */

  vl_size iteration ;           /**< Current iterations number. */
  vl_size maxNumIterations ;    /**< Maximum number of iterations. */
  double epsilon ;              /**< Stopping threshold. */

  /* Book keeping */
  VlSvmStatistics statistics ;  /**< Statistics. */
  double * scores ;             /**< Cached per-sample scores (allocated at construction). */

  /* SGD specific */
  double  biasLearningRate ;    /**< Bias learning rate. */

  /* SDCA specific */
  double * alpha ;              /**< Dual variables. */
} ;
937 
938 /* ---------------------------------------------------------------- */
939 
940 /** @brief Create a new object with plain data.
 ** @param type type of SVM solver.
942  ** @param data a pointer to a matrix of data.
943  ** @param dimension dimension of the SVM model.
944  ** @param numData number of training samples.
945  ** @param labels training labels.
946  ** @param lambda regularizer parameter.
947  ** @return the new object.
948  **
949  ** @a data has one column per sample, in @c double format.
950  ** More advanced inputs can be used with ::vl_svm_new_with_dataset
951  ** and ::vl_svm_new_with_abstract_data.
952  **
953  ** @sa ::vl_svm_delete
954  **/
955 
956 VlSvm *
vl_svm_new(VlSvmSolverType type,double const * data,vl_size dimension,vl_size numData,double const * labels,double lambda)957 vl_svm_new (VlSvmSolverType type,
958             double const * data,
959             vl_size dimension,
960             vl_size numData,
961             double const * labels,
962             double lambda)
963 {
964   VlSvmDataset * dataset = vl_svmdataset_new(VL_TYPE_DOUBLE, (void*)data, dimension, numData) ;
965   VlSvm * self = vl_svm_new_with_dataset (type, dataset, labels, lambda) ;
966   self->ownDataset = dataset ;
967   return self ;
968 }
969 
970 /** @brief Create a new object with a dataset.
 ** @param solver type of SVM solver.
972  ** @param dataset SVM dataset object
973  ** @param labels training samples labels.
974  ** @param lambda regularizer parameter.
975  ** @return the new object.
976  ** @sa ::vl_svm_delete
977  **/
978 
979 VlSvm *
vl_svm_new_with_dataset(VlSvmSolverType solver,VlSvmDataset * dataset,double const * labels,double lambda)980 vl_svm_new_with_dataset (VlSvmSolverType solver,
981                          VlSvmDataset * dataset,
982                          double const * labels,
983                          double lambda)
984 {
985   VlSvm * self = vl_svm_new_with_abstract_data (solver,
986                                              dataset,
987                                              vl_svmdataset_get_dimension(dataset),
988                                              vl_svmdataset_get_num_data(dataset),
989                                              labels,
990                                              lambda) ;
991   vl_svm_set_data_functions (self,
992                              vl_svmdataset_get_inner_product_function(dataset),
993                              vl_svmdataset_get_accumulate_function(dataset)) ;
994   return self ;
995 }
996 
997 /** @brief Create a new object with abstract data.
 ** @param solver type of SVM solver.
999  ** @param data pointer to the data.
1000  ** @param dimension dimension of the SVM model.
1001  ** @param numData num training samples.
1002  ** @param labels training samples labels.
1003  ** @param lambda regularizer parameter.
1004  ** @return the new object.
1005  **
1006  ** After calling this function, ::vl_svm_set_data_functions *must*
1007  ** be used to setup suitable callbacks for the inner product
1008  ** and accumulation operations (@see svm-data-abstraction).
1009  **
1010  ** @sa ::vl_svm_delete
1011  **/
1012 
1013 VlSvm *
vl_svm_new_with_abstract_data(VlSvmSolverType solver,void * data,vl_size dimension,vl_size numData,double const * labels,double lambda)1014 vl_svm_new_with_abstract_data (VlSvmSolverType solver,
1015                                void * data,
1016                                vl_size dimension,
1017                                vl_size numData,
1018                                double const * labels,
1019                                double lambda)
1020 {
1021   VlSvm * self = vl_calloc(1,sizeof(VlSvm)) ;
1022 
1023   assert(dimension >= 1) ;
1024   assert(numData >= 1) ;
1025   assert(labels) ;
1026 
1027   self->solver = solver ;
1028 
1029   self->dimension = dimension ;
1030   self->model = 0 ;
1031   self->bias = 0 ;
1032   self->biasMultiplier = 1.0 ;
1033 
1034   self->lambda = lambda ;
1035   self->data = data ;
1036   self->numData = numData ;
1037   self->labels = labels ;
1038 
1039   self->diagnosticFrequency = numData ;
1040   self->diagnosticFn = 0 ;
1041   self->diagnosticFnData = 0 ;
1042 
1043   self->lossFn = vl_svm_hinge_loss ;
1044   self->conjugateLossFn = vl_svm_hinge_conjugate_loss ;
1045   self->lossDerivativeFn = vl_svm_hinge_loss_derivative ;
1046   self->dcaUpdateFn = vl_svm_hinge_dca_update ;
1047 
1048   self->innerProductFn = 0 ;
1049   self->accumulateFn = 0 ;
1050 
1051   self->iteration = 0 ;
1052   self->maxNumIterations = VL_MAX((double)numData, vl_ceil_f(10.0 / lambda)) ;
1053   self->epsilon = 1e-2 ;
1054 
1055   /* SGD */
1056   self->biasLearningRate = 0.01 ;
1057 
1058   /* SDCA */
1059   self->alpha = 0 ;
1060 
1061   /* allocations */
1062   self->model = vl_calloc(dimension, sizeof(double)) ;
1063   if (self->model == NULL) goto err_alloc ;
1064 
1065   if (self->solver == VlSvmSolverSdca) {
1066     self->alpha = vl_calloc(self->numData, sizeof(double)) ;
1067     if (self->alpha == NULL) goto err_alloc ;
1068   }
1069 
1070   self->scores = vl_calloc(numData, sizeof(double)) ;
1071   if (self->scores == NULL) goto err_alloc ;
1072 
1073   return self ;
1074 
1075 err_alloc:
1076   if (self->scores) {
1077     vl_free (self->scores) ;
1078     self->scores = 0 ;
1079   }
1080   if (self->model) {
1081     vl_free (self->model) ;
1082     self->model = 0 ;
1083   }
1084   if (self->alpha) {
1085     vl_free (self->alpha) ;
1086     self->alpha = 0 ;
1087   }
1088   return 0 ;
1089 }
1090 
1091 /** @brief Delete object.
1092  ** @param self object.
1093  ** @sa ::vl_svm_new
1094  **/
1095 
1096 void
vl_svm_delete(VlSvm * self)1097 vl_svm_delete (VlSvm * self)
1098 {
1099   if (self->model) {
1100     vl_free (self->model) ;
1101     self->model = 0 ;
1102   }
1103   if (self->alpha) {
1104     vl_free (self->alpha) ;
1105     self->alpha = 0 ;
1106   }
1107   if (self->ownDataset) {
1108     vl_svmdataset_delete(self->ownDataset) ;
1109     self->ownDataset = 0 ;
1110   }
1111   vl_free (self) ;
1112 }
1113 
1114 /* ---------------------------------------------------------------- */
1115 /*                                              Setters and getters */
1116 /* ---------------------------------------------------------------- */
1117 
1118 /** @brief Set the convergence threshold
1119  ** @param self object
1120  ** @param epsilon threshold (non-negative).
1121  **/
1122 
void vl_svm_set_epsilon (VlSvm *self, double epsilon)
{
  /* A negative threshold makes no sense for a convergence test. */
  assert(self && epsilon >= 0) ;
  self->epsilon = epsilon ;
}
1129 
1130 /** @brief Get the convergence threshold
1131  ** @param self object
1132  ** @return epsilon threshold.
1133  **/
1134 
double vl_svm_get_epsilon (VlSvm const *self)
{
  assert(self) ;
  /* Convergence threshold used by the solver's stopping test. */
  return self->epsilon ;
}
1140 
1141 /** @brief Set the bias learning rate
1142  ** @param self object
1143  ** @param rate bias learning rate (positive).
1144  **
1145  ** This parameter applies only to the SGD solver.
1146  **/
1147 
void vl_svm_set_bias_learning_rate (VlSvm *self, double rate)
{
  /* Only the SGD solver consults this parameter. */
  assert(self && rate > 0) ;
  self->biasLearningRate = rate ;
}
1154 
 ** @brief Get the bias learning rate.
1156  ** @param self object
1157  ** @return bias learning rate.
1158  **/
1159 
double vl_svm_get_bias_learning_rate (VlSvm const *self)
{
  assert(self) ;
  /* Learning rate applied to the bias term by the SGD solver. */
  return self->biasLearningRate ;
}
1165 
1166 /** @brief Set the bias multiplier.
1167  ** @param self object
1168  ** @param b bias multiplier.
1169  **
1170  ** The *bias multiplier* is the value of the constant feature
1171  ** appended to the data vectors to implement the bias (@ref svm-bias).
1172  **/
1173 
void vl_svm_set_bias_multiplier (VlSvm * self, double b)
{
  /* Zero disables the bias feature entirely. */
  assert(self && b >= 0) ;
  self->biasMultiplier = b ;
}
1180 
1181 /** @brief Get the bias multiplier.
1182  ** @param self object.
1183  ** @return bias multiplier.
1184  **/
1185 
double vl_svm_get_bias_multiplier (VlSvm const * self)
{
  assert(self) ;
  /* Value of the constant feature appended to implement the bias. */
  return self->biasMultiplier ;
}
1191 
 ** @brief Set the current iteration number.
1193  ** @param self object.
1194  ** @param n iteration number.
1195  **
1196  ** If called before training,
1197  ** this can be used with SGD for a warm start, as the net
1198  ** effect is to slow down the learning rate.
1199  **/
1200 
void vl_svm_set_iteration_number (VlSvm *self, vl_uindex n)
{
  assert(self) ;
  /* Advancing the counter before training slows the SGD learning-rate
     schedule accordingly (warm start). */
  self->iteration = n ;
}
1206 
1207 /** @brief Get the current iteration number.
1208  ** @param self object.
1209  ** @return current iteration number.
1210  **/
1211 
vl_size vl_svm_get_iteration_number (VlSvm const *self)
{
  assert(self) ;
  /* Iteration counter maintained by the solver. */
  return self->iteration ;
}
1217 
1218 /** @brief Set the maximum number of iterations.
1219  ** @param self object.
1220  ** @param n maximum number of iterations.
1221  **/
1222 
void vl_svm_set_max_num_iterations (VlSvm *self, vl_size n)
{
  assert(self) ;
  /* Overrides the default budget chosen by the constructor. */
  self->maxNumIterations = n ;
}
1228 
1229 /** @brief Get the maximum number of iterations.
1230  ** @param self object.
1231  ** @return maximum number of iterations.
1232  **/
1233 
vl_size vl_svm_get_max_num_iterations (VlSvm const *self)
{
  assert(self) ;
  /* Hard cap on the number of solver iterations. */
  return self->maxNumIterations ;
}
1239 
1240 /** @brief Set the diagnostic frequency.
1241  ** @param self object.
1242  ** @param f diagnostic frequency (@c >= 1).
1243  **
1244  ** A diagnostic round (to test for convergence and to printout
1245  ** information) is performed every @a f iterations.
1246  **/
1247 
void vl_svm_set_diagnostic_frequency (VlSvm *self, vl_size f)
{
  /* A diagnostic round runs every f iterations; f must be positive. */
  assert(self && f > 0) ;
  self->diagnosticFrequency = f ;
}
1254 
1255 /** @brief Get the diagnostic frequency.
1256  ** @param self object.
1257  ** @return diagnostic frequency.
1258  **/
1259 
vl_size vl_svm_get_diagnostic_frequency (VlSvm const *self)
{
  assert(self) ;
  /* Iterations between consecutive diagnostic/convergence checks. */
  return self->diagnosticFrequency ;
}
1265 
1266 /** @brief Get the SVM solver type.
1267  ** @param self object.
1268  ** @return SVM solver type.
1269  **/
1270 
VlSvmSolverType vl_svm_get_solver (VlSvm const * self)
{
  assert(self) ;
  /* Solver type chosen at construction (SGD, SDCA, ...). */
  return self->solver ;
}
1276 
1277 /** @brief Set the regularizer parameter lambda.
1278  ** @param self object.
1279  ** @param lambda regularizer parameter.
1280  **
1281  ** Note that @a lambda is usually set when calling a
1282  ** constructor for ::VlSvm as certain parameters, such
1283  ** as the maximum number of iterations, are tuned accordingly.
1284  ** This tuning is not performed when @a lambda is changed
1285  ** using this function.
1286  **/
1287 
void vl_svm_set_lambda (VlSvm * self, double lambda)
{
  /* Note: unlike the constructors, this does NOT re-tune dependent
     parameters such as the maximum number of iterations. */
  assert(self && lambda >= 0) ;
  self->lambda = lambda ;
}
1294 
1295 /** @brief Get the regularizer parameter lambda.
1296  ** @param self object.
 ** @return regularizer parameter lambda.
1298  **/
1299 
double vl_svm_get_lambda (VlSvm const * self)
{
  assert(self) ;
  /* Regularizer multiplier lambda. */
  return self->lambda ;
}
1305 
1306 /** @brief Set the data weights.
1307  ** @param self object.
1308  ** @param weights data weights.
1309  **
1310  ** @a weights must be an array of non-negative weights.
1311  ** The loss of each data point is multiplied by the corresponding
1312  ** weight.
1313  **
1314  ** Set @a weights to @c NULL to weight the data uniformly by 1 (default).
1315  **
1316  ** Note that the @a weights array is *not* copied and must be valid
 ** throughout the object lifetime (unless it is replaced).
1318  **/
1319 
void vl_svm_set_weights (VlSvm * self, double const *weights)
{
  assert(self) ;
  /* The array is NOT copied; NULL restores uniform weighting. */
  self->weights = weights ;
}
1325 
1326 /** @brief Get the data weights.
1327  ** @param self object.
1328  ** @return data weights.
1329  **/
1330 
double const *vl_svm_get_weights (VlSvm const * self)
{
  assert(self) ;
  /* May be NULL, meaning all data points are weighted by 1. */
  return self->weights ;
}
1336 
1337 /* ---------------------------------------------------------------- */
1338 /*                                                         Get data */
1339 /* ---------------------------------------------------------------- */
1340 
 ** @brief Get the model dimension.
1342  ** @param self object.
1343  ** @return model dimension.
1344  **
1345  ** This is the dimensionality of the weight vector $\bw$.
1346  **/
1347 
vl_size vl_svm_get_dimension (VlSvm *self)
{
  assert(self) ;
  /* Dimensionality of the weight vector $\bw$. */
  return self->dimension ;
}
1353 
1354 /** @brief Get the number of data samples.
1355  ** @param self object.
 ** @return number of data samples.
 **
 ** This is the number of training samples the SVM operates on.
1359  **/
1360 
vl_size vl_svm_get_num_data (VlSvm *self)
{
  assert(self) ;
  /* Number of training samples supplied at construction. */
  return self->numData ;
}
1366 
1367 /** @brief Get the SVM model.
1368  ** @param self object.
1369  ** @return model.
1370  **
1371  ** This is the weight vector $\bw$.
1372  **/
1373 
double const * vl_svm_get_model (VlSvm const *self)
{
  assert(self) ;
  /* Pointer to the internal weight vector $\bw$ (owned by the object). */
  return self->model ;
}
1379 
1380 /** @brief Set the SVM model.
1381  ** @param self object.
1382  ** @param model model.
1383  **
1384  ** The function *copies* the content of the vector @a model to the
1385  ** internal model buffer. This operation can be used for warm start
1386  ** with the SGD algorithm, but has undefined effect with the SDCA algorithm.
1387  **/
1388 
void vl_svm_set_model (VlSvm *self, double const *model)
{
  /* Copy the caller's vector into the internal model buffer
     (useful for an SGD warm start). */
  assert(self && model) ;
  memcpy(self->model, model, vl_svm_get_dimension(self) * sizeof(double)) ;
}
1395 
1396 /** @brief Set the SVM bias.
1397  ** @param self object.
1398  ** @param b bias.
1399  **
1400  ** The function set the internal representation of the SVM bias to
1401  ** be equal to @a b (the bias multiplier
1402  ** is applied). The same remark
1403  ** that applies to ::vl_svm_set_model applies here too.
1404  **/
1405 
void vl_svm_set_bias (VlSvm *self, double b)
{
  assert(self);
  /* The bias is stored pre-divided by the multiplier so that
     ::vl_svm_get_bias (which multiplies back) returns @a b.
     When the multiplier is zero the call is silently a no-op. */
  if (self->biasMultiplier) {
    self->bias = b / self->biasMultiplier ;
  }
}
1413 
1414 /** @brief Get the value of the bias.
1415  ** @param self object.
1416  ** @return bias $b$.
1417  **
 ** The value of the bias returned already includes the effect of the
 ** bias multiplier.
1420  **/
1421 
double vl_svm_get_bias (VlSvm const *self)
{
  assert(self) ;
  /* Undo the internal pre-division performed by ::vl_svm_set_bias. */
  return self->bias * self->biasMultiplier ;
}
1427 
1428 /** @brief Get the solver statistics.
1429  ** @param self object.
1430  ** @return statistics.
1431  **/
1432 
VlSvmStatistics const * vl_svm_get_statistics (VlSvm const *self)
{
  assert(self) ;
  /* Read-only view of the solver's bookkeeping statistics. */
  return &self->statistics ;
}
1438 
1439 /** @brief Get the scores of the data points.
1440  ** @param self object.
1441  ** @return vector of scores.
1442  **
1443  ** After training or during the diagnostic callback,
1444  ** this function can be used to retrieve the scores
1445  ** of the points, i.e. $\langle \bx_i, \bw \rangle + b$.
1446  **/
1447 
double const * vl_svm_get_scores (VlSvm const *self)
{
  /* Fix: validate self like every other accessor in this file. */
  assert(self) ;
  /* Per-sample scores <x_i, w> + b, valid after training or inside
     the diagnostic callback. */
  return self->scores ;
}
1452 
1453 /* ---------------------------------------------------------------- */
1454 /*                                                        Callbacks */
1455 /* ---------------------------------------------------------------- */
1456 
1457 /** @typedef VlSvmDiagnosticFunction
1458  ** @brief SVM diagnostic function pointer.
1459  ** @param svm is an instance of ::VlSvm .
1460  **/
1461 
1462 /** @typedef VlSvmAccumulateFunction
1463  ** @brief Pointer to a function that adds to @a model the data point at
1464  ** position @a element multiplied by the constant @a multiplier.
1465  **/
1466 
1467 /** @typedef VlSvmInnerProductFunction
1468  ** @brief Pointer to a function that defines the inner product
1469  ** between the data point at position @a element and the SVM model
1470  **/
1471 
1472 /** @brief Set the diagnostic function callback
1473  ** @param self object.
1474  ** @param f diagnostic function pointer.
1475  ** @param data pointer to data used by the diagnostic function.
1476  **/
1477 
1478 void
vl_svm_set_diagnostic_function(VlSvm * self,VlSvmDiagnosticFunction f,void * data)1479 vl_svm_set_diagnostic_function (VlSvm *self, VlSvmDiagnosticFunction f, void *data) {
1480   self->diagnosticFn = f ;
1481   self->diagnosticFnData = data ;
1482 }
1483 
1484 /** @brief Set the data functions.
1485  ** @param self object.
1486  ** @param inner inner product function.
1487  ** @param acc accumulate function.
1488  **
1489  ** See @ref svm-data-abstraction.
1490  **/
1491 
void vl_svm_set_data_functions (VlSvm *self, VlSvmInnerProductFunction inner, VlSvmAccumulateFunction acc)
{
  /* Both callbacks are mandatory: the solvers cannot touch the data
     without them (see @ref svm-data-abstraction). */
  assert(self && inner && acc) ;
  self->innerProductFn = inner ;
  self->accumulateFn = acc ;
}
1500 
1501 /** @brief Set the loss function callback.
1502  ** @param self object.
1503  ** @param f loss function callback.
1504  **
1505  ** Note that setting up a loss requires specifying more than just one
1506  ** callback. See @ref svm-loss-functions for details.
1507  **/
1508 
void vl_svm_set_loss_function (VlSvm *self, VlSvmLossFunction f)
{
  assert(self) ;
  /* Must be kept consistent with the derivative/conjugate/DCA callbacks
     (see @ref svm-loss-functions); ::vl_svm_set_loss sets all four. */
  self->lossFn = f ;
}
1514 
1515 /** @brief Set the loss derivative function callback.
1516  ** @copydetails vl_svm_set_loss_function.
1517  **/
1518 
void vl_svm_set_loss_derivative_function (VlSvm *self, VlSvmLossFunction f)
{
  assert(self) ;
  /* Derivative of the loss with respect to the score (used by SGD). */
  self->lossDerivativeFn = f ;
}
1524 
1525 /** @brief Set the conjugate loss function callback.
1526  ** @copydetails vl_svm_set_loss_function.
1527  **/
1528 
void vl_svm_set_conjugate_loss_function (VlSvm *self, VlSvmLossFunction f)
{
  assert(self) ;
  /* Conjugate of the loss function (see @ref svm-loss-functions). */
  self->conjugateLossFn = f ;
}
1534 
1535 /** @brief Set the DCA update function callback.
1536  ** @copydetails vl_svm_set_loss_function.
1537  **/
1538 
void vl_svm_set_dca_update_function (VlSvm *self, VlSvmDcaUpdateFunction f)
{
  assert(self) ;
  /* Dual-variable update rule matching the chosen loss. */
  self->dcaUpdateFn = f ;
}
1544 
1545 /** @brief Set the loss function to one of the default types.
1546  ** @param self object.
1547  ** @param loss type of loss function.
1548  ** @sa @ref svm-loss-functions.
1549  **/
1550 
void
vl_svm_set_loss (VlSvm *self, VlSvmLossType loss)
{
/* Helper: expands to the switch case installing the four callbacks
   (loss, derivative, conjugate, DCA update) named after `y` for the
   loss type `x`; they must always be set as a consistent group. */
#define SETLOSS(x,y) \
case VlSvmLoss ## x: \
  vl_svm_set_loss_function(self, vl_svm_ ## y ## _loss) ; \
  vl_svm_set_loss_derivative_function(self, vl_svm_ ## y ## _loss_derivative) ; \
  vl_svm_set_conjugate_loss_function(self, vl_svm_ ## y ## _conjugate_loss) ; \
  vl_svm_set_dca_update_function(self, vl_svm_ ## y ## _dca_update) ; \
  break;

  switch (loss) {
      SETLOSS(Hinge, hinge) ;
      SETLOSS(Hinge2, hinge2) ;
      SETLOSS(L1, l1) ;
      SETLOSS(L2, l2) ;
      SETLOSS(Logistic, logistic) ;
    default:
      /* unknown loss type */
      assert(0) ;
  }
}
1572 
1573 /* ---------------------------------------------------------------- */
1574 /*                                               Pre-defined losses */
1575 /* ---------------------------------------------------------------- */
1576 
1577 /** @typedef VlSvmLossFunction
1578  ** @brief SVM loss function pointer.
1579  ** @param inner inner product between sample and model $\bw^\top \bx$.
1580  ** @param label sample label $y$.
1581  ** @return value of the loss.
1582  **
1583  ** The interface is the same for a loss function, its derivative,
1584  ** or the conjugate loss.
1585  **
1586  ** @sa @ref svm-fundamentals
1587  **/
1588 
1589 /** @typedef VlSvmDcaUpdateFunction
1590  ** @brief SVM SDCA update function pointer.
1591  ** @param alpha current value of the dual variable.
1592  ** @param inner inner product $\bw^\top \bx$ of the sample with the SVM model.
1593  ** @param norm2 normalization factor $\|\bx\|^2/\lambda n$.
1594  ** @param label label $y$ of the sample.
1595  ** @return incremental update $\Delta\alpha$ of the dual variable.
1596  **
1597  ** @sa @ref svm-sdca
1598  **/
1599 
1600 /** @brief SVM hinge loss
1601  ** @copydetails VlSvmLossFunction */
double
vl_svm_hinge_loss (double inner, double label)
{
  /* max(0, 1 - y <w,x>), written out without the VL_MAX macro. */
  double margin = 1.0 - label * inner ;
  return (margin > 0.0) ? margin : 0.0 ;
}
1607 
1608 /** @brief SVM hinge loss derivative
1609  ** @copydetails VlSvmLossFunction */
double
vl_svm_hinge_loss_derivative (double inner, double label)
{
  /* Subgradient: -y inside the margin, 0 outside. */
  return (label * inner < 1.0) ? -label : 0.0 ;
}
1619 
1620 /** @brief SVM hinge loss conjugate
1621  ** @param u dual variable.
1622  ** @param label label value.
1623  ** @return conjugate loss.
1624  **/
1625 double
vl_svm_hinge_conjugate_loss(double u,double label)1626 vl_svm_hinge_conjugate_loss (double u, double label) {
1627   double z = label * u ;
1628   if (-1 <= z && z <= 0) {
1629     return label * u ;
1630   } else {
1631     return VL_INFINITY_D ;
1632   }
1633 }
1634 
1635 /** @brief SVM hinge loss DCA update
1636  ** @copydetails VlSvmDcaUpdateFunction */
double
vl_svm_hinge_dca_update (double alpha, double inner, double norm2, double label) {
  /* Unconstrained optimum, then clamp y*alpha to the feasible box [0,1]. */
  double proposal = (label - inner) / norm2 + alpha ;
  double z = label * proposal ;
  if (z < 0.0) z = 0.0 ;
  else if (z > 1.0) z = 1.0 ;
  return label * z - alpha ;
}
1642 
1643 /** @brief SVM square hinge loss
1644  ** @copydetails VlSvmLossFunction */
double
vl_svm_hinge2_loss (double inner, double label)
{
  /* Squared hinge: max(0, 1 - y <w,x>)^2. */
  double margin = 1.0 - label * inner ;
  if (margin <= 0.0) return 0.0 ;
  return margin * margin ;
}
1651 
1652 /** @brief SVM square hinge loss derivative
1653  ** @copydetails VlSvmLossFunction */
double
vl_svm_hinge2_loss_derivative (double inner, double label)
{
  /* 2(inner - y) inside the margin, 0 outside. */
  return (label * inner < 1.0) ? 2.0 * (inner - label) : 0.0 ;
}
1663 
1664 /** @brief SVM square hinge loss conjugate
1665  ** @copydetails vl_svm_hinge_conjugate_loss */
1666 double
vl_svm_hinge2_conjugate_loss(double u,double label)1667 vl_svm_hinge2_conjugate_loss (double u, double label) {
1668   if (label * u <= 0) {
1669     return (label + u/4) * u ;
1670   } else {
1671     return VL_INFINITY_D ;
1672   }
1673 }
1674 
1675 /** @brief SVM square hinge loss DCA update
1676  ** @copydetails VlSvmDcaUpdateFunction */
double
vl_svm_hinge2_dca_update (double alpha, double inner, double norm2, double label) {
  /* Unconstrained optimum, then clamp y*alpha from below at 0
     (the squared hinge has no upper box constraint). */
  double proposal = (label - inner - 0.5 * alpha) / (norm2 + 0.5) + alpha ;
  double z = label * proposal ;
  if (z < 0.0) z = 0.0 ;
  return label * z - alpha ;
}
1682 
1683 /** @brief SVM l1 loss
1684  ** @copydetails VlSvmLossFunction */
double
vl_svm_l1_loss (double inner, double label)
{
  /* |y - <w,x>| without the vl_abs_d helper. */
  double residual = label - inner ;
  return (residual >= 0.0) ? residual : -residual ;
}
1690 
/** @brief SVM l1 loss derivative
 ** @copydetails VlSvmLossFunction */
double
vl_svm_l1_loss_derivative (double inner, double label)
{
  /* Sign of (inner - label); the +1 branch covers equality too,
     which is a valid subgradient choice. */
  return (label > inner) ? -1.0 : +1.0 ;
}
1702 
1703 /** @brief SVM l1 loss conjugate
1704  ** @copydetails vl_svm_hinge_conjugate_loss */
1705 double
vl_svm_l1_conjugate_loss(double u,double label)1706 vl_svm_l1_conjugate_loss (double u, double label) {
1707   if (vl_abs_d(u) <= 1) {
1708     return label*u ;
1709   } else {
1710     return VL_INFINITY_D ;
1711   }
1712 }
1713 
1714 /** @brief SVM l1 loss DCA update
1715  ** @copydetails VlSvmDcaUpdateFunction */
1716 double
vl_svm_l1_dca_update(double alpha,double inner,double norm2,double label)1717 vl_svm_l1_dca_update (double alpha, double inner, double norm2, double label) {
1718   if (vl_abs_d(alpha) <= 1) {
1719     double palpha = (label - inner) / norm2 + alpha ;
1720     return VL_MAX(-1.0, VL_MIN(1.0, palpha)) - alpha ;
1721   } else {
1722     return VL_INFINITY_D ;
1723   }
1724 }
1725 
/** @brief SVM l2 loss
 ** @copydetails VlSvmLossFunction */
double
vl_svm_l2_loss (double inner, double label)
{
  /* Squared deviation (y - <w,x>)^2. */
  double residual = label - inner ;
  return residual * residual ;
}
1734 
/** @brief SVM l2 loss derivative
 ** @copydetails VlSvmLossFunction */
double
vl_svm_l2_loss_derivative (double inner, double label)
{
  /* d/d<inner> of (label - inner)^2. */
  return - 2 * (label - inner) ;
}
1742 
/** @brief SVM l2 loss conjugate
 ** @copydetails vl_svm_hinge_conjugate_loss */
double
vl_svm_l2_conjugate_loss (double u, double label)
{
  /* Conjugate of the squared loss: u * (label + u/4), finite everywhere. */
  return u * (label + u / 4) ;
}
1749 
/** @brief SVM l2 loss DCA update
 ** @copydetails VlSvmDcaUpdateFunction */
double
vl_svm_l2_dca_update (double alpha, double inner, double norm2, double label)
{
  /* Exact closed-form maximizer; the smooth loss needs no projection. */
  double numerator = label - inner - 0.5 * alpha ;
  return numerator / (norm2 + 0.5) ;
}
1756 
1757 /** @brief SVM l2 loss
1758  ** @copydetails VlSvmLossFunction */
1759 double
vl_svm_logistic_loss(double inner,double label)1760 vl_svm_logistic_loss (double inner,double label)
1761 {
1762   double z = label * inner ;
1763   if (z >= 0) {
1764     return log(1.0 + exp(-z)) ;
1765   } else {
1766     return -z + log(exp(z) + 1.0) ;
1767   }
1768 }
1769 
1770 /** @brief SVM l2 loss derivative
1771  ** @copydetails VlSvmLossFunction */
1772 double
vl_svm_logistic_loss_derivative(double inner,double label)1773 vl_svm_logistic_loss_derivative (double inner, double label)
1774 {
1775   double z = label * inner ;
1776   double t = 1 / (1 + exp(-z)) ; /* this is stable for z << 0 too */
1777   return label * (t - 1) ; /*  = -label exp(-z) / (1 + exp(-z)) */
1778 }
1779 
/* Compute x * log(x), extended by continuity to 0 near x = 0; the
   1e-10 threshold also guards against evaluating log(0) = -inf. */
VL_INLINE double xlogx(double x)
{
  if (x <= 1e-10) return 0 ;
  return x*log(x) ;
}
1785 
1786 /** @brief SVM l2 loss conjugate
1787  ** @copydetails vl_svm_hinge_conjugate_loss */
1788 double
vl_svm_logistic_conjugate_loss(double u,double label)1789 vl_svm_logistic_conjugate_loss (double u, double label) {
1790   double z = label * u ;
1791   if (-1 <= z && z <= 0) {
1792     return xlogx(-z) + xlogx(1+z) ;
1793   } else {
1794     return VL_INFINITY_D ;
1795   }
1796 }
1797 
1798 /** @brief SVM l2 loss DCA update
1799  ** @copydetails VlSvmDcaUpdateFunction */
1800 double
vl_svm_logistic_dca_update(double alpha,double inner,double norm2,double label)1801 vl_svm_logistic_dca_update (double alpha, double inner, double norm2, double label) {
1802   /*
1803    The goal is to solve the problem
1804 
1805    min_delta A/2 delta^2 + B delta + l*(-alpha - delta|y),  -1 <= - y (alpha+delta) <= 0
1806 
1807    where A = norm2, B = inner, and y = label. To simplify the notation, we set
1808 
1809      f(beta) = beta * log(beta) + (1 - beta) * log(1 - beta)
1810 
1811    where beta = y(alpha + delta) such that
1812 
1813      l*(-alpha - delta |y) = f(beta).
1814 
1815    Hence 0 <= beta <= 1, delta = + y beta - alpha. Substituting
1816 
1817      min_beta A/2 beta^2 + y (B - A alpha) beta + f(beta) + const
1818 
1819    The Newton step is then given by
1820 
1821      beta = beta - (A beta + y(B - A alpha) + df) / (A + ddf).
1822 
1823    However, the function is singluar for beta=0 and beta=1 (infinite
1824    first and second order derivatives). Since the function is monotonic
1825    (second derivarive always strictly greater than zero) and smooth,
1826    we canuse bisection to find the zero crossing of the first derivative.
1827    Once one is sufficiently close to the optimum, a one or two Newton
1828    steps are sufficien to land on it with excellent accuracy.
1829    */
1830 
1831   double  df, ddf, der, dder ;
1832   vl_index t ;
1833 
1834   /* bisection */
1835   double beta1 = 0 ;
1836   double beta2 = 1 ;
1837   double beta = 0.5 ;
1838 
1839   for (t = 0 ; t < 5 ; ++t) {
1840     df = log(beta) - log(1-beta) ;
1841     der = norm2 * beta + label * (inner - norm2*alpha) + df ;
1842     if (der >= 0) {
1843       beta2 = beta ;
1844     } else {
1845       beta1 = beta ;
1846     }
1847     beta = 0.5 * (beta1 + beta2) ;
1848   }
1849 
1850 #if 1
1851   /* a final Newton step, but not too close to the singularities */
1852   for (t = 0 ; (t < 2) & (beta > VL_EPSILON_D) & (beta < 1-VL_EPSILON_D) ; ++t) {
1853     df = log(beta) - log(1-beta) ;
1854     ddf = 1 / (beta * (1-beta)) ;
1855     der = norm2 * beta + label * (inner - norm2*alpha) + df ;
1856     dder = norm2 + ddf ;
1857     beta -= der / dder ;
1858     beta = VL_MAX(0, VL_MIN(1, beta)) ;
1859   }
1860 #endif
1861 
1862   return label * beta - alpha ;
1863 }
1864 
1865 /* ---------------------------------------------------------------- */
1866 
1867 /** @internal @brief Update SVM statistics
1868  ** @param self object.
1869  **/
1870 
_vl_svm_update_statistics(VlSvm * self)1871 void _vl_svm_update_statistics (VlSvm *self)
1872 {
1873   vl_size i, k ;
1874   double inner, p ;
1875 
1876   memset(&self->statistics, 0, sizeof(VlSvmStatistics)) ;
1877 
1878   self->statistics.regularizer = self->bias * self->bias ;
1879   for (i = 0; i < self->dimension; i++) {
1880     self->statistics.regularizer += self->model[i] * self->model[i] ;
1881   }
1882   self->statistics.regularizer *= self->lambda * 0.5 ;
1883 
1884   for (k = 0; k < self->numData ; k++) {
1885     p = (self->weights) ? self->weights[k] : 1.0 ;
1886     if (p <= 0) continue ;
1887     inner = self->innerProductFn(self->data, k, self->model) ;
1888     inner += self->bias * self->biasMultiplier ;
1889     self->scores[k] = inner ;
1890     self->statistics.loss += p * self->lossFn(inner, self->labels[k]) ;
1891     if (self->solver == VlSvmSolverSdca) {
1892 
1893       self->statistics.dualLoss -= p * self->conjugateLossFn(- self->alpha[k] / p, self->labels[k]) ;
1894     }
1895   }
1896 
1897   self->statistics.loss /= self->numData ;
1898   self->statistics.objective = self->statistics.regularizer + self->statistics.loss ;
1899 
1900   if (self->solver == VlSvmSolverSdca) {
1901     self->statistics.dualLoss /= self->numData ;
1902     self->statistics.dualObjective = - self->statistics.regularizer + self->statistics.dualLoss ;
1903     self->statistics.dualityGap = self->statistics.objective - self->statistics.dualObjective ;
1904   }
1905 }
1906 
1907 /* ---------------------------------------------------------------- */
1908 /*                                       Evaluate rather than solve */
1909 /* ---------------------------------------------------------------- */
1910 
/** @internal @brief Evaluate the current model without training.
 ** @param self object.
 **
 ** Computes the statistics for the model as-is (used by the
 ** ::VlSvmSolverNone "solver") and invokes the diagnostic callback.
 **/

void _vl_svm_evaluate (VlSvm *self)
{
  double tic = vl_get_cpu_time () ;

  _vl_svm_update_statistics (self) ;

  self->statistics.elapsedTime = vl_get_cpu_time() - tic ;
  self->statistics.iteration = 0 ;
  self->statistics.epoch = 0 ;
  self->statistics.status = VlSvmStatusConverged ;

  if (self->diagnosticFn) {
    self->diagnosticFn(self, self->diagnosticFnData) ;
  }
}
1926 
1927 /* ---------------------------------------------------------------- */
1928 /*                         Stochastic Dual Coordinate Ascent Solver */
1929 /* ---------------------------------------------------------------- */
1930 
/** @internal @brief Train an SVM using the SDCA solver.
 ** @param self object.
 **
 ** Runs stochastic dual coordinate ascent until the duality gap falls
 ** below @c epsilon or @c maxNumIterations is reached.
 **/

void _vl_svm_sdca_train (VlSvm *self)
{
  double * norm2 ;
  vl_index * permutation ;
  vl_uindex i, t  ;
  double inner, delta, multiplier, p ;

  double startTime = vl_get_cpu_time () ;
  VlRand * rand = vl_get_rand() ;

  /* No cast on vl_calloc: unnecessary in C and consistent with the
     allocation style used below. */
  norm2 = vl_calloc(self->numData, sizeof(double)) ;
  permutation = vl_calloc(self->numData, sizeof(vl_index)) ;

  /* Precompute the scaled squared norm of each data point (including
     the bias dimension); this is the curvature used by the DCA update. */
  {
    double * buffer = vl_calloc(self->dimension, sizeof(double)) ;
    for (i = 0 ; i < (unsigned)self->numData; i++) {
      double n2 ;
      permutation [i] = i ;
      memset(buffer, 0, self->dimension * sizeof(double)) ;
      self->accumulateFn (self->data, i, buffer, 1) ;
      n2 = self->innerProductFn (self->data, i, buffer) ;
      n2 += self->biasMultiplier * self->biasMultiplier ;
      norm2[i] = n2 / (self->lambda * self->numData) ;
    }
    vl_free(buffer) ;
  }

  for (t = 0 ; 1 ; ++t) {

    if (t % self->numData == 0) {
      /* once a new epoch is reached (all data have been visited),
       change permutation */
      vl_rand_permute_indexes(rand, permutation, self->numData) ;
    }

    /* pick a sample and compute the dual variable update; samples
       with non-positive weight are skipped */
    i = permutation[t % self->numData] ;
    p = (self->weights) ? self->weights[i] : 1.0 ;
    if (p > 0) {
      inner = self->innerProductFn(self->data, i, self->model) ;
      inner += self->bias * self->biasMultiplier ;
      delta = p * self->dcaUpdateFn(self->alpha[i] / p, inner, p * norm2[i], self->labels[i]) ;
    } else {
      delta = 0 ;
    }

    /* apply the update to the dual variable and to the primal model */
    if (delta != 0) {
      self->alpha[i] += delta ;
      multiplier = delta / (self->numData * self->lambda) ;
      self->accumulateFn(self->data,i,self->model,multiplier) ;
      self->bias += self->biasMultiplier * multiplier;
    }

    /* call the diagnostic occasionally and test for convergence */
    if ((t + 1) % self->diagnosticFrequency == 0 || t + 1 == self->maxNumIterations) {
      _vl_svm_update_statistics (self) ;
      self->statistics.elapsedTime = vl_get_cpu_time() - startTime ;
      self->statistics.iteration = t ;
      self->statistics.epoch = t / self->numData ;

      self->statistics.status = VlSvmStatusTraining ;
      if (self->statistics.dualityGap < self->epsilon) {
        self->statistics.status = VlSvmStatusConverged ;
      }
      else if (t + 1 == self->maxNumIterations) {
        self->statistics.status = VlSvmStatusMaxNumIterationsReached ;
      }

      if (self->diagnosticFn) {
        self->diagnosticFn(self, self->diagnosticFnData) ;
      }

      if (self->statistics.status != VlSvmStatusTraining) {
        break ;
      }
    }
  } /* next iteration */

  vl_free (norm2) ;
  vl_free (permutation) ;
}
2013 
2014 /* ---------------------------------------------------------------- */
2015 /*                               Stochastic Gradient Descent Solver */
2016 /* ---------------------------------------------------------------- */
2017 
/** @internal @brief Train an SVM using the SGD solver.
 ** @param self object.
 **
 ** Runs stochastic gradient descent until the score variation falls
 ** below @c epsilon or @c maxNumIterations is reached.
 **/

void _vl_svm_sgd_train (VlSvm *self)
{
  vl_index * permutation ;
  double * scores ;
  double * previousScores ;
  vl_uindex i, t, k ;
  double inner, gradient, rate, biasRate, p ;
  double factor = 1.0 ;
  double biasFactor = 1.0 ; /* to allow slower bias learning rate */
  /* t0 offsets the 1/(lambda (t + t0)) rate schedule so the first
     learning rates are bounded. */
  vl_index t0 = VL_MAX(2, vl_ceil_d(1.0 / self->lambda)) ;

  double startTime = vl_get_cpu_time () ;
  VlRand * rand = vl_get_rand() ;

  permutation = vl_calloc(self->numData, sizeof(vl_index)) ;
  /* scores and previousScores share one allocation of size 2 * numData */
  scores = vl_calloc(self->numData * 2, sizeof(double)) ;
  previousScores = scores + self->numData ;

  for (i = 0 ; i < (unsigned)self->numData; i++) {
    permutation [i] = i ;
    previousScores [i] = - VL_INFINITY_D ;
  }

  /*
   We store the w vector as the product fw (factor * model).
   We also use a different factor for the bias: biasFactor * biasMultiplier
   to enable a slower learning rate for the bias.

   Given this representation, it is easy to carry the two key operations:

   * Inner product: <fw,x> = f <w,x>

   * Model update: fp wp = fw - rate * lambda * w - rate * g
                         = f(1 - rate * lambda) w - rate * g

     Thus the update equations are:

                   fp = f(1 - rate * lambda), and
                   wp = w + rate / fp * g ;

   * Realization of the scaling factor. Before the statistics function
     is called, or training finishes, the factor (and biasFactor)
     are explicitly applied to the model and the bias.
  */

  for (t = 0 ; 1 ; ++t) {

    if (t % self->numData == 0) {
      /* once a new epoch is reached (all data have been visited),
       change permutation */
      vl_rand_permute_indexes(rand, permutation, self->numData) ;
    }

    /* pick a sample and compute update */
    i = permutation[t % self->numData] ;
    p = (self->weights) ? self->weights[i] : 1.0 ;
    p = VL_MAX(0.0, p) ; /* we assume non-negative weights, so this is just for robustness */
    inner = factor * self->innerProductFn(self->data, i, self->model) ;
    inner += biasFactor * (self->biasMultiplier * self->bias) ;
    gradient = p * self->lossDerivativeFn(inner, self->labels[i]) ;
    previousScores[i] = scores[i] ;
    scores[i] = inner ;

    /* apply update: the factors must be updated BEFORE the model so
       that wp = w + rate / fp * g uses the new factor fp */
    rate = 1.0 /  (self->lambda * (t + t0)) ;
    biasRate = rate * self->biasLearningRate ;
    factor *= (1.0 - self->lambda * rate) ;
    biasFactor *= (1.0 - self->lambda * biasRate) ;

    /* debug: realize the scaling factor all the times */
    /*
    for (k = 0 ; k < self->dimension ; ++k) self->model[k] *= factor ;
    self->bias *= biasFactor;
    factor = 1.0 ;
    biasFactor = 1.0 ;
    */

    if (gradient != 0) {
      self->accumulateFn(self->data, i, self->model, - gradient * rate / factor) ;
      self->bias += self->biasMultiplier * (- gradient * biasRate / biasFactor) ;
    }

    /* call diagnostic occasionally */
    if ((t + 1) % self->diagnosticFrequency == 0 || t + 1 == self->maxNumIterations) {

      /* realize factor before computing statistics or completing training */
      for (k = 0 ; k < self->dimension ; ++k) self->model[k] *= factor ;
      self->bias *= biasFactor;
      factor = 1.0 ;
      biasFactor = 1.0 ;

      _vl_svm_update_statistics (self) ;

      /* RMS change of the per-sample scores since the last diagnostic,
         used as the (heuristic) convergence test */
      for (k = 0 ; k < self->numData ; ++k) {
        double delta = scores[k] - previousScores[k] ;
        self->statistics.scoresVariation += delta * delta ;
      }
      self->statistics.scoresVariation = sqrt(self->statistics.scoresVariation) / self->numData ;

      self->statistics.elapsedTime = vl_get_cpu_time() - startTime ;
      self->statistics.iteration = t ;
      self->statistics.epoch = t / self->numData ;

      self->statistics.status = VlSvmStatusTraining ;
      if (self->statistics.scoresVariation < self->epsilon) {
        self->statistics.status = VlSvmStatusConverged ;
      }
      else if (t + 1 == self->maxNumIterations) {
        self->statistics.status = VlSvmStatusMaxNumIterationsReached ;
      }

      if (self->diagnosticFn) {
        self->diagnosticFn(self, self->diagnosticFnData) ;
      }

      if (self->statistics.status != VlSvmStatusTraining) {
        break ;
      }
    }
  } /* next iteration */

  /* previousScores is part of the scores allocation; a single free suffices */
  vl_free (scores) ;
  vl_free (permutation) ;
}
2143 
2144 /* ---------------------------------------------------------------- */
2145 /*                                                       Dispatcher */
2146 /* ---------------------------------------------------------------- */
2147 
2148 /** @brief Run the SVM solver
2149  ** @param self object.
2150  **
2151  ** The data on which the SVM operates is passed upon the cration of
2152  ** the ::VlSvm object. This function runs a solver to learn a
2153  ** corresponding model. See @ref svm-starting.
2154  **/
2155 
vl_svm_train(VlSvm * self)2156 void vl_svm_train (VlSvm * self)
2157 {
2158   assert (self) ;
2159   switch (self->solver) {
2160     case VlSvmSolverSdca:
2161       _vl_svm_sdca_train(self) ;
2162       break ;
2163     case VlSvmSolverSgd:
2164       _vl_svm_sgd_train(self) ;
2165       break ;
2166     case VlSvmSolverNone:
2167       _vl_svm_evaluate(self) ;
2168       break ;
2169     default:
2170       assert(0) ;
2171   }
2172 }
2173