1 /** @file svm.c
2 ** @brief Support Vector Machines (SVM) - Implementation
3 ** @author Milan Sulc
4 ** @author Daniele Perrone
5 ** @author Andrea Vedaldi
6 **/
7
8 /*
9 Copyright (C) 2013 Milan Sulc.
10 Copyright (C) 2012 Daniele Perrone.
11 Copyright (C) 2011-13 Andrea Vedaldi.
12
13 All rights reserved.
14
15 This file is part of the VLFeat library and is made available under
16 the terms of the BSD license (see the COPYING file).
17 */
18
19 /** @file svm.h
20 ** @see @ref svm.
21 **/
22
23 /**
24 <!-- ------------------------------------------------------------- -->
25 @page svm Support Vector Machines (SVM)
26 @author Milan Sulc
27 @author Daniele Perrone
28 @author Andrea Vedaldi
29 @tableofcontents
30 <!-- ------------------------------------------------------------- -->
31
32 *Support Vector Machines* (SVMs) are one of the most popular types of
33 discriminative classifiers. VLFeat implements two solvers, SGD and SDCA,
34 capable of learning linear SVMs on a large scale. These linear solvers
35 can be combined with explicit feature maps to learn non-linear models
36 as well. The solvers support a few variants of the standard
37 SVM formulation, including using loss functions other than the hinge
38 loss.
39
40 @ref svm-starting demonstrates how to use VLFeat to learn an SVM.
41 Information on SVMs and the corresponding optimization algorithms as
42 implemented by VLFeat are given in:
43
44 - @subpage svm-fundamentals - Linear SVMs and their learning.
45 - @subpage svm-advanced - Loss functions, dual objective, and other details.
46 - @subpage svm-sgd - The SGD algorithm.
47 - @subpage svm-sdca - The SDCA algorithm.
48
49 <!-- ------------------------------------------------------------- -->
50 @section svm-starting Getting started
51 <!-- ------------------------------------------------------------- -->
52
53 This section demonstrates how to learn an SVM by using VLFeat. SVM
54 learning is implemented by the ::VlSvm object type. Let's
55 start by a complete example:
56
57 @code
58 #include <stdio.h>
59 #include <vl/svm.h>
60
61 int main()
62 {
63 vl_size const numData = 4 ;
64 vl_size const dimension = 2 ;
65 double x [dimension * numData] = {
66 0.0, -0.5,
67 0.6, -0.3,
68 0.0, 0.5,
69 0.6, 0.0} ;
70 double y [numData] = {1, 1, -1, 1} ;
71 double lambda = 0.01;
72 double const * model ;
73 double bias ;
74
75 VlSvm * svm = vl_svm_new(VlSvmSolverSgd,
76 x, dimension, numData,
77 y,
78 lambda) ;
79 vl_svm_train(svm) ;
80
81 model = vl_svm_get_model(svm) ;
82 bias = vl_svm_get_bias(svm) ;
83
84 printf("model w = [ %f , %f ] , bias b = %f \n",
85 model[0],
86 model[1],
87 bias);
88
89 vl_svm_delete(svm) ;
90 return 0;
91 }
92 @endcode
93
94 This code learns a binary linear SVM using the SGD algorithm on
95 four two-dimensional points using 0.01 as regularization parameter.
96
97 ::VlSvmSolverSdca can be specified in place of ::VlSvmSolverSgd
98 in order to use the SDCA algorithm instead.
99
100 Convergence and other diagnostic information can be obtained after
101 training by using the ::vl_svm_get_statistics function. Algorithms
102 regularly check for convergence (usually after each pass over the data).
103 The ::vl_svm_set_diagnostic_function can be used to specify a callback
104 to be invoked when the diagnostics are run. This can be used, for example,
105 to dump information on the screen as the algorithm progresses.
106
107 Convergence is reached after a maximum number of iterations
108 (::vl_svm_set_max_num_iterations) or after a given criterion falls
109 below a threshold (::vl_svm_set_epsilon). The meaning of these
110 may depend on the specific algorithm (see @ref svm for further details).
111
112 ::VlSvm is a quite powerful object. Algorithms only need to perform
113 inner product and accumulation operation on the data (see @ref svm-advanced).
114 This is used to abstract from the data type and support almost anything
115 by specifying just two functions (::vl_svm_set_data_functions).
116
117 A simple interface to this advanced functionality is provided by the
118 ::VlSvmDataset object. This supports natively @c float and @c double
119 data types, as well as applying on the fly the homogeneous kernel map
120 (@ref homkermap). This is exemplified in @ref svmdataset-starting.
121
122 */
123
124 /**
125 <!-- ------------------------------------------------------------- -->
126 @page svm-fundamentals SVM fundamentals
127 @tableofcontents
128 <!-- ------------------------------------------------------------- -->
129
130 This page introduces the SVM formulation used in VLFeat. See @ref svm
131 for more information on VLFeat SVM support.
132
133 Let $ \bx \in \real^d $ be a vector representing, for example, an
134 image, an audio track, or a fragment of text. Our goal is to design a
135 *classifier*, i.e. a function that associates to each vector $\bx$ a
136 positive or negative label based on a desired criterion, for example
137 the fact that the image contains or not a cat, that the audio track
138 contains or not English speech, or that the text is or not a
139 scientific paper.
140
141 The vector $\bx$ is classified by looking at the sign of a *linear
142 scoring function* $\langle \bx, \bw \rangle$. The goal of learning is
143 to estimate the parameter $\bw \in \real^d$ in such a way that the
144 score is positive if the vector $\bx$ belongs to the positive class
145 and negative otherwise. In fact, in the standard SVM formulation
146 the goal is to have a score of *at least 1* in the first case, and of
147 *at most -1* in the second one, imposing a *margin*.
148
149 The parameter $\bw$ is estimated or *learned* by fitting the scoring
150 function to a training set of $n$ example pairs $(\bx_i,y_i),
151 i=1,\dots,n$. Here $y_i \in \{-1,1\}$ are the *ground truth labels* of
152 the corresponding example vectors. The fit quality is measured by a
153 *loss function* which, in standard SVMs, is the *hinge loss*:
154
155 \[
156 \ell_i(\langle \bw,\bx\rangle) = \max\{0, 1 - y_i \langle \bw,\bx\rangle\}.
157 \]
158
159 Note that the hinge loss is zero only if the score $\langle
160 \bw,\bx\rangle$ is at least 1 or at most -1, depending on the label
161 $y_i$.
162
163 Fitting the training data is usually insufficient. In order for the
164 scoring function to *generalize to future data* as well, it is usually
165 preferable to trade off the fitting accuracy with the *regularity* of
166 the learned scoring function $\langle \bx, \bw \rangle$. Regularity in
167 the standard formulation is measured by the norm of the parameter
168 vector $\|\bw\|^2$ (see @ref svm-advanced). Averaging the loss on all
169 training samples and adding to it the regularizer weighed by a
170 parameter $\lambda$ yields the *regularized loss objective*
171
172 @f{equation}{
173 \boxed{\displaystyle
174 E(\bw) = \frac{\lambda}{2} \left\| \bw \right\|^2
175 + \frac{1}{n} \sum_{i=1}^n \max\{0, 1 - y_i \langle \bw,\bx\rangle\}.
176 \label{e:svm-primal-hinge}
177 }
178 @f}
179
180 Note that this objective function is *convex*, so that there exists a
181 single global optimum.
182
183 The scoring function $\langle \bx, \bw \rangle$ considered so far has
184 been linear and unbiased. @ref svm-bias discusses how a bias term can
185 be added to the SVM and @ref svm-feature-maps shows how non-linear
186 SVMs can be reduced to the linear case by computing suitable feature
187 maps.
188
189 @ref svm-learning shows how VLFeat can be used to learn an SVM by
190 minimizing $E(\bw)$.
191
192 <!-- ------------------------------------------------------------- -->
193 @section svm-learning Learning
194 <!-- ------------------------------------------------------------- -->
195
196 Learning an SVM amounts to finding the minimizer $\bw^*$ of the cost
197 function $E(\bw)$. While there are dozens of methods that can be used
198 to do so, VLFeat implements two large scale methods, designed to work
199 with linear SVMs (see @ref svm-feature-maps to go beyond linear):
200
201 - @ref svm-sgd
202 - @ref svm-sdca
203
204 Using these solvers is exemplified in @ref svm-starting.
205
206 <!-- ------------------------------------------------------------- -->
207 @section svm-bias Adding a bias
208 <!-- ------------------------------------------------------------- -->
209
210 It is common to add to the SVM scoring function a *bias term* $b$, and
211 to consider the score $\langle \bx,\bw \rangle + b$. In practice the
212 bias term can be crucial to fit the training data optimally, as there
213 is no reason why the inner products $\langle \bx,\bw \rangle$ should
214 be naturally centered at zero.
215
216 Some SVM learning algorithms can estimate both $\bw$ and $b$
217 directly. However, other algorithms such as SGD and SDCA cannot. In
218 this case, a simple workaround is to add a constant component $B > 0$
219 (we call this constant the *bias multiplier*) to the data,
220 i.e. consider the extended data vectors:
221 \[
222 \bar \bx = \begin{bmatrix} \bx \\ B \end{bmatrix},
223 \quad
224 \bar \bw = \begin{bmatrix} \bw \\ w_b \end{bmatrix}.
225 \]
226 In this manner the scoring function incorporates implicitly a bias $b = B w_b$:
227 \[
228 \langle \bar\bx, \bar\bw \rangle =
229 \langle \bx, \bw \rangle + B w_b.
230 \]
231
232 The disadvantage of this reduction is that the term $w_b^2$ becomes
233 part of the SVM regularizer, which shrinks the bias $b$ towards
234 zero. This effect can be alleviated by making $B$ sufficiently large,
235 because in this case $\|\bw\|^2 \gg w_b^2$ and the shrinking effect is
236 negligible.
237
238 Unfortunately, making $B$ too large makes the problem numerically
239 unbalanced, so a reasonable trade-off between shrinkage and stability
240 is generally sought. Typically, a good trade-off is obtained by
241 normalizing the data to have unitary Euclidean norm and then choosing
242 $B \in [1, 10]$.
243
244 Specific implementations of SGD and SDCA may provide explicit support
245 to learn the bias in this manner, but it is important to understand
246 the implications on speed and accuracy of the learning if this is
247 done.
248
249 <!-- ------------------------------------------------------------- -->
250 @section svm-feature-maps Non-linear SVMs and feature maps
251 <!-- ------------------------------------------------------------- -->
252
253 So far only linear scoring functions $\langle \bx,\bw \rangle$ have
254 been considered. Implicitly, however, this assumes that the objects to
255 be classified (e.g. images) have been encoded as vectors $\bx$ in a
256 way that makes linear classification possible. This encoding step can
257 be made explicit by introducing the *feature map* $\Phi(\bx) \in
258 \real^d$. Including the feature map yields a scoring function
259 *non-linear* in $\bx$:
260 \[
261 \bx\in\mathcal{X} \quad\longrightarrow\quad \langle \Phi(\bx), \bw \rangle.
262 \]
263 The nature of the input space $\mathcal{X}$ can be arbitrary and might
264 not have a vector space structure at all.
265
266 The representation or encoding captures a notion of *similarity*
267 between objects: if two vectors $\Phi(\bx_1)$ and $\Phi(\bx_2)$ are
268 similar, then their scores will also be similar. Note that choosing a
269 feature map amounts to incorporating this information in the model
270 *prior* to learning.
271
272 The relation of feature maps to similarity functions is formalized by
273 the notion of a *kernel*, a positive definite function $K(\bx,\bx')$
274 measuring the similarity of a pair of objects. A feature map defines a
275 kernel by
276
277 \[
278 K(\bx,\bx') = \langle \Phi(\bx),\Phi(\bx') \rangle.
279 \]
280
281 Viceversa, any kernel function can be represented by a feature map in
282 this manner, establishing an equivalence.
283
284 So far, all solvers in VLFeat assume that the feature map $\Phi(\bx)$
285 can be explicitly computed. Although classically kernels were
286 introduced to generalize solvers to non-linear SVMs for which a
287 feature map *cannot* be computed (e.g. for a Gaussian kernel the
288 feature map is infinite dimensional), in practice using explicit
289 feature representations allows the use of much faster solvers, so it makes
290 sense to *reverse* this process.
291 */
292
293 /**
294 <!-- ------------------------------------------------------------- -->
295 @page svm-advanced Advanced SVM topics
296 @tableofcontents
297 <!-- ------------------------------------------------------------- -->
298
299 This page discusses advanced SVM topics. For an introduction to SVMs,
300 please refer to @ref svm and @ref svm-fundamentals.
301
302 <!-- ------------------------------------------------------------- -->
303 @section svm-loss-functions Loss functions
304 <!-- ------------------------------------------------------------- -->
305
306 The SVM formulation given in @ref svm-fundamentals uses the
307 hinge loss, which is only one of a variety of loss functions that
308 are often used for SVMs. More in general, one
309 can consider the objective
310
311 @f{equation}{
312 E(\bw) = \frac{\lambda}{2} \left\| \bw \right\|^2 + \frac{1}{n} \sum_{i=1}^n \ell_i(\langle \bw,\bx\rangle).
313 \label{e:svm-primal}
314 @f}
315
316 where the loss $\ell_i(z)$ is a convex function of the scalar variable
317 $z$. Losses differ by: (i) their purpose (some are suitable for
318 classification, other for regression), (ii) their smoothness (which
319 usually affects how quickly the SVM objective function can be
320 minimized), and (iii) their statistical interpretation (for example
321 the logistic loss can be used to learn logistic models).
322
323 Concrete examples are the:
324
325 <table>
326 <tr>
327 <td>Name</td>
328 <td>Loss $\ell_i(z)$</td>
329 <td>Description</td>
330 </tr>
331 <tr>
332 <td>Hinge</td>
333 <td>$\max\{0, 1-y_i z\}$</td>
334 <td>The standard SVM loss function.</td>
335 </tr>
336 <tr>
337 <td>Square hinge</td>
338 <td>$\max\{0, 1-y_i z\}^2$</td>
339 <td>The standard SVM loss function, but squared. This version is
340 smoother and may yield numerically easier problems.</td>
341 </tr>
342 <tr>
343 <td>Square or l2</td>
344 <td>$(y_i - z)^2$</td>
345 <td>This loss yields the ridge regression model (l2 regularised least
346 square).</td>
347 </tr>
348 <tr>
349 <td>Linear or l1</td>
350 <td>$|y_i - z|$</td>
351 <td>Another loss suitable for regression, usually more robust but
352 harder to optimize than the squared one.</td>
353 </tr>
354 <tr>
355 <td>Insensitive l1</td>
356 <td>$\max\{0, |y_i - z| - \epsilon\}$.</td>
357 <td>This is a variant of the previous loss, proposed in the original
358 Support Vector Regression formulation. Differently from the previous
359 two losses, the insensitivity may yield to a sparse selection of
360 support vectors.</td>
361 </tr>
362 <tr>
363 <td>Logistic</td>
364 <td>$\log(1 + e^{-y_i z})$</td>
365 <td>This corresponds to regularized logistic regression. The loss can
366 be seen as a negative log-likelihood: $\ell_i(z) = -\log P[y_i | z] =
367 - \log \sigma(y_iz/2)$, where $\sigma(z) = e^z/(1 + e^z)$ is the
368 sigmoid function, mapping a score $z$ to a probability. The $1/2$
369 factor in the sigmoid is due to the fact that labels are in $\{-1,1\}$
370 rather than $\{0,1\}$ as more common for the standard sigmoid
371 model.</td>
372 </tr>
373 </table>
374
375 <!-- ------------------------------------------------------------- -->
376 @section svm-data-abstraction Data abstraction: working with compressed data
377 <!-- ------------------------------------------------------------- -->
378
379 VLFeat learning algorithms (SGD and SDCA) access the data by means of
380 only two operations:
381
382 - *inner product*: computing the inner product between the model and
383 a data vector, i.e. $\langle \bw, \bx \rangle$.
384 - *accumulation*: summing a data vector to the model, i.e. $\bw
385 \leftarrow \bw + \beta \bx$.
386
387 VLFeat learning algorithms are *parameterized* in these two
388 operations. As a consequence, the data can be stored in any format
389 suitable to the user (e.g. dense matrices, sparse matrices,
390 block-sparse matrices, disk caches, and so on) provided that these two
391 operations can be implemented efficiently. Differently from the data,
392 however, the model vector $\bw$ is represented simply as a dense array
393 of doubles. This choice is adequate in almost any case.
394
395 A particularly useful aspect of this design choice is that the
396 training data can be stored in *compressed format* (for example by
397 using product quantization (PQ)). Furthermore, higher-dimensional
398 encodings such as the homogeneous kernel map (@ref homkermap) and the
399 intersection kernel map can be *computed on the fly*. Such techniques
400 are very important when dealing with GBs of data.
401
402 <!-- ------------------------------------------------------------- -->
403 @section svm-dual-problem Dual problem
404 <!-- ------------------------------------------------------------- -->
405
406 In optimization, the *dual objective* $D(\balpha)$ of the SVM
407 objective $E(\bw)$ is of great interest. To obtain the dual objective,
408 one starts by approximating each loss term from below by a family of planes:
409 \[
410 \ell_i(z) = \sup_{u} (u z - \ell_i^*(u) ),
411 \qquad
412 \ell_i^*(u) = \sup_{z} (z u - \ell_i(z) )
413 \]
414 where $\ell_i^*(u)$ is the *dual conjugate* of the loss and gives the
415 intercept of each approximating plane as a function of the slope. When
416 the loss function is convex, the approximation is in fact exact. Examples
417 include:
418
419 <table>
420 <tr>
421 <td>Name</td>
422 <td>Loss $\ell_i(z)$</td>
423 <td>Conjugate loss $\ell_i^*(u)$</td>
424 </tr>
425 <tr>
426 <td>Hinge</td>
427 <td>$\max\{0, 1-y_i z\}$</td>
428 <td>\[
429 \ell_i^*(u) =
430 \begin{cases}
431 y_i u, & -1 \leq y_i u \leq 0, \\
432 +\infty, & \text{otherwise}
433 \end{cases}
434 \]</td>
435 </tr>
436 <tr>
437 <td>Square hinge</td>
438 <td>$\max\{0, 1-y_i z\}^2$</td>
439 <td>\[\ell_i^*(u) =
440 \begin{cases}
441 y_i u + \frac{u^2}{4}, & y_i u \leq 0, \\
442 +\infty, & \text{otherwise} \\
443 \end{cases}\]</td>
444 </tr>
445 <tr>
446 <td>Linear or l1</td>
447 <td>$|y_i - z|$</td>
448 <td>\[\ell_i^*(u) =
449 \begin{cases}
450 y_i u, & -1 \leq y_i u \leq 1, \\
451 +\infty, & \text{otherwise} \\
452 \end{cases}\]</td>
453 </tr>
454 <tr>
455 <td>Square or l2</td>
456 <td>$(y_i - z)^2$</td>
457 <td>\[\ell_i^*(u)=y_iu + \frac{u^2}{4}\]</td>
458 </tr>
459 <tr>
460 <td>Insensitive l1</td>
461 <td>$\max\{0, |y_i - z| - \epsilon\}$.</td>
462 <td></td>
463 </tr>
464 <tr>
465 <td>Logistic</td>
466 <td>$\log(1 + e^{-y_i z})$</td>
467 <td>\[\ell_i^*(u) =
468 \begin{cases}
469 (1+u) \log(1+u) - u \log(-u), & -1 \leq y_i u \leq 0, \\
470 +\infty, & \text{otherwise} \\
471 \end{cases}\]
472 </td>
473 </tr>
474 </table>
475
476 Since each plane $- z \alpha_i - \ell^*_i(-\alpha_i) \leq \ell_i(z)$
477 bounds the loss from below, by substituting in $E(\bw)$ one can write
478 a lower bound for the SVM objective
479 \[
480 F(\bw,\balpha) = \frac{\lambda}{2} \|\bw\|^2 -
481 \frac{1}{n}\sum_{i=1}^n (\bw^\top \bx_i\alpha_i + \ell_i^*(-\alpha_i))
482 \leq E(\bw).
483 \]
484 for each setting of the *dual variables* $\alpha_i$. The dual
485 objective function $D(\balpha)$ is obtained by minimizing the lower
486 bound $F(\bw,\balpha)$ w.r.t. to $\bw$:
487 \[
488 D(\balpha) = \inf_{\bw} F(\bw,\balpha) \leq E(\bw).
489 \]
490 The minimizer and the dual objective are now easy to find:
491 \[
492 \boxed{\displaystyle
493 \bw(\balpha) =
494 \frac{1}{\lambda n}
495 \sum_{i=1}^n \bx_i \alpha_i = \frac{1}{\lambda n} X\balpha,
496 \quad
497 D(\balpha) = - \frac{1}{2\lambda n^2} \balpha^\top X^\top X \balpha +
498 \frac{1}{n} \sum_{i=1}^n - \ell_i^*(-\alpha_i)
499 }
500 \]
501 where $X = [\bx_1, \dots, \bx_n]$ is the data matrix. Since the dual
502 is uniformly smaller than the primal, one has the *duality gap* bound:
503 \[
504 D(\balpha) \leq E(\bw^*) \leq E(\bw(\balpha))
505 \]
506 This bound can be used to evaluate how far off $\bw(\balpha)$ is from
507 the primal minimizer $\bw^*$. In fact, due to convexity, this bound
508 can be shown to be zero when $\balpha^*$ is the dual maximizer (strong
509 duality):
510 \[
511 D(\balpha^*) = E(\bw^*) = E(\bw(\balpha^*)),
512 \quad \bw^* = \bw(\balpha^*).
513 \]
514
515 <!-- ------------------------------------------------------------- -->
516 @section svm-C Parametrization in C
517 <!-- ------------------------------------------------------------- -->
518
519 Often a slightly different form of the SVM objective is considered,
520 where a parameter $C$ is used to scale the loss instead of the regularizer:
521
522 \[
523 E_C(\bw) = \frac{1}{2} \|\bw\|^2 + C \sum_{i=1}^n \ell_i(\langle \bx_i, \bw\rangle)
524 \]
525
526 This objective and $E(\bw)$, the objective parameterized by $\lambda$,
527 are equivalent (proportional, up to an overall rescaling of the problem) if
528
529 \[
530 \lambda = \frac{1}{nC},
531 \qquad C = \frac{1}{n\lambda}.
532 \]
533
534 **/
535
536 /**
537
538 <!-- ------------------------------------------------------------- -->
539 @page svm-sdca Stochastic Dual Coordinate Ascent
540 @tableofcontents
541 <!-- ------------------------------------------------------------- -->
542
543 This page describes the *Stochastic Dual Coordinate Ascent* (SDCA)
544 linear SVM solver. Please see @ref svm for an overview of VLFeat SVM
545 support.
546
547 SDCA maximizes the dual SVM objective (see @ref svm-dual-problem
548 for a derivation of this expression):
549
550 \[
551 D(\balpha) = - \frac{1}{2\lambda n^2} \balpha^\top X^\top X \balpha +
552 \frac{1}{n} \sum_{i=1}^n - \ell_i^*(-\alpha_i)
553 \]
554
555 where $X$ is the data matrix. Recall that the primal parameter
556 corresponding to a given setting of the dual variables is:
557
558 \[
559 \bw(\balpha) = \frac{1}{\lambda n} \sum_{i=1}^n \bx_i \alpha_i = \frac{1}{\lambda n} X\balpha
560 \]
561
562 In its most basic form, the *SDCA algorithm* can be summarized as follows:
563
564 - Let $\balpha_0 = 0$.
565 - Until the duality gap $E(\bw(\balpha_t)) - D(\balpha_t) < \epsilon$
566 - Pick a dual variable $q$ uniformly at random in $1, \dots, n$.
567 - Maximize the dual with respect to this variable: $\Delta\alpha_q = \max_{\Delta\alpha_q} D(\balpha_t + \Delta\alpha_q \be_q )$
568 - Update $\balpha_{t+1} = \balpha_{t} + \be_q \Delta\alpha_q$.
569
570 In VLFeat, we partially use the nomenclature from @cite{shwartz13a-dual} and @cite{hsieh08a-dual}.
571
572 <!-- ------------------------------------------------------------- -->
573 @section svm-sdca-dual-max Dual coordinate maximization
574 <!-- ------------------------------------------------------------- -->
575
576 The updated dual objective can be expanded as:
577 \[
578 D(\balpha_t + \be_q \Delta\alpha_q) =
579 \text{const.}
580 - \frac{1}{2\lambda n^2} \bx_q^\top \bx_q (\Delta\alpha_q)^2
581 - \frac{1}{n} \bx_q^\top \frac{X\balpha_t}{\lambda n} \Delta\alpha_q
582 - \frac{1}{n} \ell^*_q(- \alpha_q - \Delta\alpha_q)
583 \]
584 This can also be written as
585 @f{align*}
586 D(\balpha_t + \be_q \Delta\alpha_q) &\propto
587 - \frac{A}{2} (\Delta\alpha_q)^2
588 - B \Delta\alpha_q
589 - \ell^*_q(- \alpha_q - \Delta\alpha_q),
590 \\
591 A &= \frac{1}{\lambda n} \bx_q^\top \bx_q = \frac{1}{\lambda n} \| \bx_q \|^2,
592 \\
593 B &= \bx_q^\top \frac{X\balpha_t}{\lambda n} = \bx_q^\top \bw_t.
594 @f}
595 Maximizing this quantity in the scalar variable $\Delta\alpha_q$ is usually
596 not difficult. It is convenient to store and incrementally
597 update the model $\bw_t$ after the optimal step $\Delta\alpha_q$ has been
598 determined:
599 \[
600 \bw_t = \frac{X \balpha_t}{\lambda n},
601 \quad \bw_{t+1} = \bw_t + \frac{1}{\lambda n }\bx_q \be_q \Delta\alpha_q.
602 \]
603
604 For example, consider the hinge loss as given in @ref svm-advanced :
605 \[
606 \ell_q^*(u) =
607 \begin{cases}
608 y_q u, & -1 \leq y_q u \leq 0, \\
609 +\infty, & \text{otherwise}.
610 \end{cases}
611 \]
612 The maximizer $\Delta\alpha_q$ of the update objective must be in the
613 range where the conjugate loss is not infinite. Ignoring such bounds,
614 the update can be obtained by setting the derivative of the objective
615 to zero, obtaining
616 \[
617 \tilde {\Delta \alpha_q}= \frac{y_q - B}{A}.
618 \]
619 Note that $B$ is simply the current score assigned by the SVM to
620 the sample $\bx_q$. Incorporating the constraint $-1 \leq - y_q
621 (\alpha_q + \Delta \alpha_q) \leq 0$,
622 i.e. $0 \leq y_q (\alpha_q + \Delta \alpha_q) \leq 1$, one obtains the update
623 \[
624 \Delta\alpha_q = y_q \max\{0, \min\{1, y_q (\tilde {\Delta\alpha_q } + \alpha_q)\}\} - \alpha_q.
625 \]
626
627 <!-- ------------------------------------------------------------ --->
628 @section svm-sdca-details Implementation details
629 <!-- ------------------------------------------------------------ --->
630
631 Rather than visiting points completely at random, VLFeat SDCA follows
632 the best practice of visiting all the points at every epoch (pass
633 through the data), changing the order of the visit randomly by picking
634 every time a new random permutation.
635 **/
636
637 /**
638 <!-- ------------------------------------------------------------- -->
639 @page svm-sgd Stochastic Gradient Descent
640 @tableofcontents
641 <!-- ------------------------------------------------------------- -->
642
643 This page describes the *Stochastic Gradient Descent* (SGD) linear SVM
644 solver. SGD minimizes directly the primal SVM objective (see @ref svm):
645
646 \[
647 E(\bw) = \frac{\lambda}{2} \left\| \bw \right\|^2 + \frac{1}{n} \sum_{i=1}^n
648 \ell_i(\langle \bw,\bx\rangle)
649 \]
650
651 First, rewrite the objective as the average
652
653 \[
654 E(\bw) = \frac{1}{n} \sum_{i=1}^n E_i(\bw),
655 \quad
656 E_i(\bw) = \frac{\lambda}{2} \left\| \bw \right\|^2 + \ell_i(\langle \bw,\bx\rangle).
657 \]
658
659 Then SGD performs gradient steps by considering at each iteration
660 one term $E_i(\bw)$ selected at random from this average.
661 In its most basic form, the algorithm is:
662
663 - Start with $\bw_0 = 0$.
664 - For $t=1,2,\dots T$:
665 - Sample one index $i$ in $1,\dots,n$ uniformly at random.
666 - Compute a subgradient $\bg_t$ of $E_i(\bw)$ at $\bw_t$.
667 - Compute the learning rate $\eta_t$.
668 - Update $\bw_{t+1} = \bw_t - \eta_t \bg_t$.
669
670 Provided that the learning rate $\eta_t$ is chosen correctly, this
671 simple algorithm is guaranteed to converge to the minimizer $\bw^*$ of
672 $E$.
673
674 <!-- ------------------------------------------------------------- -->
675 @section svm-sgd-convergence Convergence and speed
676 <!-- ------------------------------------------------------------- -->
677
678 The goal of the SGD algorithm is to bring the *primal suboptimality*
679 below a threshold $\epsilon_P$:
680 \[
681 E(\bw_t) - E(\bw^*) \leq \epsilon_P.
682 \]
683
684 If the learning rate $\eta_t$ is selected appropriately, SGD can be
685 shown to converge properly. For example,
686 @cite{shalev-shwartz07pegasos} show that, since $E(\bw)$ is
687 $\lambda$-strongly convex, then using the learning rate
688 \[
689 \boxed{\eta_t = \frac{1}{\lambda t}}
690 \]
691 guarantees that the algorithm reaches primal-suboptimality $\epsilon_P$ in
692 \[
693 \tilde O\left( \frac{1}{\lambda \epsilon_P} \right).
694 \]
695 iterations. This particular SGD variant is sometimes known as PEGASOS
696 @cite{shalev-shwartz07pegasos} and is the version implemented in
697 VLFeat.
698
699 The *convergence speed* is not sufficient to tell the *learning speed*,
700 i.e. how quickly an algorithm can learn an SVM that performs optimally
701 on the test set. The following two observations
702 can be used to link convergence speed to learning speed:
703
704 - The regularizer strength is often heuristically selected to be
705 inversely proportional to the number of training samples: $\lambda =
706 \lambda_0 /n$. This reflects the fact that with more training data
707 the prior should count less.
708 - The primal suboptimality $\epsilon_P$ should be about the same as
709 the estimation error of the SVM primal. This estimation error is due
710 to the finite training set size and can be shown to be of the order
711 of $1/\lambda n = 1 / \lambda_0$.
712
713 Under these two assumptions, PEGASOS can learn a linear SVM in time
714 $\tilde O(n)$, which is *linear in the number of training
715 examples*. This compares favorably with the $O(n^2)$ or worse cost of
716 non-linear SVM solvers.
717
718 <!-- ------------------------------------------------------------- -->
719 @section svm-sgd-bias The bias term
720 <!-- ------------------------------------------------------------- -->
721
722 Adding a bias $b$ to the SVM scoring function $\langle \bw, \bx
723 \rangle +b$ is done, as explained in @ref svm-bias, by appending a
724 constant feature $B$ (the *bias multiplier*) to the data vectors $\bx$
725 and a corresponding weight element $w_b$ to the weight vector $\bw$,
726 so that $b = B w_b$. As noted, the bias multiplier should be
727 relatively large in order to avoid shrinking the bias towards zero,
728 but small to make the optimization stable. In particular, setting $B$
729 to zero learns an unbiased SVM (::vl_svm_set_bias_multiplier).
730
731 To counter instability caused by a large bias multiplier, the learning
732 rate of the bias is slowed down by multiplying the overall learning
733 rate $\eta_t$ by a bias-specific rate coefficient
734 (::vl_svm_set_bias_learning_rate).
735
736 As a rule of thumb, if the data vectors $\bx$ are $l^2$ normalized (as
737 they typically should for optimal performance), then a reasonable bias
738 multiplier is in the range 1 to 10 and a reasonable bias learning rate
739 is somewhere in the range of the inverse of that (in this manner the
740 two parts of the extended feature vector $(\bx, B)$ are balanced).
741
742 <!-- ------------------------------------------------------------- -->
743 @section svm-sgd-starting-iteration Adjusting the learning rate
744 <!-- ------------------------------------------------------------- -->
745
746 Initially, the learning rate $\eta_t = 1/\lambda t$ is usually too
747 fast: as usually $\lambda \ll 1$, $\eta_1 \gg 1$. But this is clearly
748 excessive (for example, without a loss term, the best learning rate at
749 the first iteration is simply $\eta_1=1$, as this nails the optimum in
750 one step). Thus, the learning rate formula is modified to be $\eta_t =
751 1 / \lambda (t + t_0)$, where $t_0 \approx 2/\lambda$, which is
752 equivalent to start $t_0$ iterations later. In this manner $\eta_1
753 \approx 1/2$.
754
755 <!-- ------------------------------------------------------------ --->
756 @subsection svm-sgd-warm-start Warm start
757 <!-- ------------------------------------------------------------ --->
758
759 Starting from a given model $\bw$ is easy in SGD as the optimization
760 runs in the primal. However, the starting iteration index $t$ should
761 also be advanced for a warm start, as otherwise the initial setting of
762 $\bw$ is rapidly forgotten (::vl_svm_set_model, ::vl_svm_set_bias,
763 ::vl_svm_set_iteration_number).
764
765 <!-- ------------------------------------------------------------- -->
766 @section svm-sgd-details Implementation details
767 <!-- ------------------------------------------------------------- -->
768
769 @par "Random sampling of points"
770
771 Rather than visiting points completely at random, VLFeat SGD follows
772 the best practice of visiting all the points at every epoch (pass
773 through the data), changing the order of the visit randomly by picking
774 every time a new random permutation.
775
776 @par "Factored representation"
777
778 At each iteration, the SGD algorithm updates the vector $\bw$
779 (including the additional bias component $w_b$) as $\bw_{t+1}
780 \leftarrow \bw_t - \lambda \eta_t \bw_t - \eta_t \bg_t$, where
781 $\eta_t$ is the learning rate. If the subgradient of the loss function
782 $\bg_t$ is zero at a given iteration, this amounts to simply shrink
783 $\bw$ towards the origin by multiplying it by the factor $1 - \lambda
784 \eta_t$. Thus such an iteration can be accelerated significantly by
785 representing internally $\bw_t = f_t \bu_t$, where $f_t$ is a scaling
786 factor. Then, the update becomes
787 \[
788 f_{t+1} \bu_{t+1}
789 = f_{t} \bu_{t} - \lambda \eta_t f_{t} \bu_{t} - \eta_t \bg_t
790 = (1-\lambda \eta_t) f_{t} \bu_{t} - \eta_t \bg_t.
791 \]
792 Setting $f_{t+1} = (1-\lambda \eta_t) f_{t}$, this gives the update
793 equation for $\bu_t$
794 \[
795 \bu_{t+1} = \bu_{t} - \frac{\eta_t}{f_{t+1}} \bg_t.
796 \]
797 but this step can be skipped whenever $\bg_t$ is equal to zero.
798
799 When the bias component has a different learning rate, this scheme
800 must be adjusted slightly by adding a separated factor for the bias,
801 but it is otherwise identical.
802
803
804 **/
805
806 /*
807
808 <!-- ------------------------------------------------------------ --->
809 @section svm-pegasos PEGASOS
810 <!-- ------------------------------------------------------------ --->
811
812 <!-- ------------------------------------------------------------ --->
813 @subsection svm-pegasos-algorithm Algorithm
814 <!-- ------------------------------------------------------------ --->
815
816 PEGASOS @cite{shalev-shwartz07pegasos} is a stochastic subgradient
817 optimizer. At the <em>t</em>-th iteration the algorithm:
818
- Samples uniformly at random a subset @f$ A_t @f$ of <em>k</em>
  training pairs @f$(x,y)@f$ from the <em>m</em> pairs provided for
  training (this subset is called a mini-batch).
822 - Computes a subgradient @f$ \nabla_t @f$ of the function @f$ E_t(w) =
823 \frac{1}{2}\|w\|^2 + \frac{1}{k} \sum_{(x,y) \in A_t} \ell(w;(x,y))
824 @f$ (this is the SVM objective function restricted to the
825 minibatch).
826 - Compute an intermediate weight vector @f$ w_{t+1/2} @f$ by doing a
827 step @f$ w_{t+1/2} = w_t - \alpha_t \nabla_t @f$ with learning rate
828 @f$ \alpha_t = 1/(\eta t) @f$ along the subgradient. Note that the
829 learning rate is inversely proportional to the iteration number.
830 - Back projects the weight vector @f$ w_{t+1/2} @f$ on the
831 hypersphere of radius @f$ \sqrt{\lambda} @f$ to obtain the next
832 model estimate @f$ w_{t+1} @f$:
833 @f[
834 w_t = \min\{1, \sqrt{\lambda}/\|w\|\} w_{t+1/2}.
835 @f]
836 The hypersphere is guaranteed to contain the optimal weight vector
837 @f$ w^* @f$.
838
The VLFeat implementation fixes the size of the mini-batches to
@f$ k = 1 @f$.
841
842
843 <!-- ------------------------------------------------------------ --->
844 @subsection svm-pegasos-permutation Permutation
845 <!-- ------------------------------------------------------------ --->
846
847 VLFeat PEGASOS can use a user-defined permutation to decide the order
848 in which data points are visited (instead of using random
849 sampling). By specifying a permutation the algorithm is guaranteed to
visit each data point exactly once in each loop. The permutation need
not be bijective. This can be used to visit certain data samples
852 more or less often than others, implicitly reweighting their relative
853 importance in the SVM objective function. This can be used to balance
854 the data.
855
856 <!-- ------------------------------------------------------------ --->
857 @subsection svm-pegasos-kernels Non-linear kernels
858 <!-- ------------------------------------------------------------ --->
859
860 PEGASOS can be extended to non-linear kernels, but the algorithm is
861 not particularly efficient in this setting [1]. When possible, it may
862 be preferable to work with explicit feature maps.
863
864 Let @f$ k(x,y) @f$ be a positive definite kernel. A <em>feature
865 map</em> is a function @f$ \Psi(x) @f$ such that @f$ k(x,y) = \langle
866 \Psi(x), \Psi(y) \rangle @f$. Using this representation the non-linear
867 SVM learning objective function writes:
868
869 @f[
870 \min_{w} \frac{\lambda}{2} \|w\|^2 + \frac{1}{m} \sum_{i=1}^n
871 \ell(w; (\Psi(x)_i,y_i)).
872 @f]
873
874 Thus the only difference with the linear case is that the feature @f$
875 \Psi(x) @f$ is used in place of the data @f$ x @f$.
876
877 @f$ \Psi(x) @f$ can be learned off-line, for instance by using the
878 incomplete Cholesky decomposition @f$ V^\top V @f$ of the Gram matrix
879 @f$ K = [k(x_i,x_j)] @f$ (in this case @f$ \Psi(x_i) @f$ is the
880 <em>i</em>-th columns of <em>V</em>). Alternatively, for additive
881 kernels (e.g. intersection, Chi2) the explicit feature map computed by
882 @ref homkermap.h can be used.
883
884 For additive kernels it is also possible to perform the feature
885 expansion online inside the solver, setting the specific feature map
via ::vl_svmdataset_set_map. This is particularly useful to keep the
size of the training data small when the number of samples is large
or memory is limited.
889 */
890
891 #include "svm.h"
892 #include "mathop.h"
893 #include <string.h>
894
/** @internal @brief SVM solver state. */
struct VlSvm_ {
  VlSvmSolverType solver ;      /**< SVM solver type. */

  vl_size dimension ;           /**< Model dimension. */
  double * model ;              /**< Model ($\bw$ vector). */
  double bias ;                 /**< Bias (stored before applying the multiplier). */
  double biasMultiplier ;       /**< Bias feature multiplier. */

  /* valid during a run */
  double lambda ;               /**< Regularizer multiplier. */
  void const * data ;           /**< Training data (opaque; accessed via callbacks). */
  vl_size numData ;             /**< Number of training samples. */
  double const * labels ;       /**< Data labels. */
  double const * weights ;      /**< Data weights (may be @c NULL for uniform weights). */

  VlSvmDataset * ownDataset ;   /**< Optional owned dataset. */

  VlSvmDiagnosticFunction diagnosticFn ; /**< Optional diagnostic callback. */
  void * diagnosticFnData ;     /**< Opaque pointer passed to the diagnostic callback. */
  vl_size diagnosticFrequency ; /**< Frequency of diagnostic. */

  VlSvmLossFunction lossFn ;             /**< Loss function. */
  VlSvmLossFunction conjugateLossFn ;    /**< Conjugate loss (used by SDCA). */
  VlSvmLossFunction lossDerivativeFn ;   /**< Loss derivative (used by SGD). */
  VlSvmDcaUpdateFunction dcaUpdateFn ;   /**< SDCA update rule for the loss. */
  VlSvmInnerProductFunction innerProductFn ; /**< Inner product callback. */
  VlSvmAccumulateFunction accumulateFn ;     /**< Accumulate callback. */

  vl_size iteration ;           /**< Current iteration number. */
  vl_size maxNumIterations ;    /**< Maximum number of iterations. */
  double epsilon ;              /**< Stopping threshold. */

  /* Book keeping */
  VlSvmStatistics statistics ;  /**< Statistics. */
  double * scores ;             /**< Per-sample scores, refreshed at diagnostic rounds. */

  /* SGD specific */
  double biasLearningRate ;     /**< Bias learning rate. */

  /* SDCA specific */
  double * alpha ;              /**< Dual variables. */
} ;
937
938 /* ---------------------------------------------------------------- */
939
940 /** @brief Create a new object with plain data.
941 ** @param type type of SMV solver.
942 ** @param data a pointer to a matrix of data.
943 ** @param dimension dimension of the SVM model.
944 ** @param numData number of training samples.
945 ** @param labels training labels.
946 ** @param lambda regularizer parameter.
947 ** @return the new object.
948 **
949 ** @a data has one column per sample, in @c double format.
950 ** More advanced inputs can be used with ::vl_svm_new_with_dataset
951 ** and ::vl_svm_new_with_abstract_data.
952 **
953 ** @sa ::vl_svm_delete
954 **/
955
956 VlSvm *
vl_svm_new(VlSvmSolverType type,double const * data,vl_size dimension,vl_size numData,double const * labels,double lambda)957 vl_svm_new (VlSvmSolverType type,
958 double const * data,
959 vl_size dimension,
960 vl_size numData,
961 double const * labels,
962 double lambda)
963 {
964 VlSvmDataset * dataset = vl_svmdataset_new(VL_TYPE_DOUBLE, (void*)data, dimension, numData) ;
965 VlSvm * self = vl_svm_new_with_dataset (type, dataset, labels, lambda) ;
966 self->ownDataset = dataset ;
967 return self ;
968 }
969
970 /** @brief Create a new object with a dataset.
971 ** @param solver type of SMV solver.
972 ** @param dataset SVM dataset object
973 ** @param labels training samples labels.
974 ** @param lambda regularizer parameter.
975 ** @return the new object.
976 ** @sa ::vl_svm_delete
977 **/
978
979 VlSvm *
vl_svm_new_with_dataset(VlSvmSolverType solver,VlSvmDataset * dataset,double const * labels,double lambda)980 vl_svm_new_with_dataset (VlSvmSolverType solver,
981 VlSvmDataset * dataset,
982 double const * labels,
983 double lambda)
984 {
985 VlSvm * self = vl_svm_new_with_abstract_data (solver,
986 dataset,
987 vl_svmdataset_get_dimension(dataset),
988 vl_svmdataset_get_num_data(dataset),
989 labels,
990 lambda) ;
991 vl_svm_set_data_functions (self,
992 vl_svmdataset_get_inner_product_function(dataset),
993 vl_svmdataset_get_accumulate_function(dataset)) ;
994 return self ;
995 }
996
997 /** @brief Create a new object with abstract data.
998 ** @param solver type of SMV solver.
999 ** @param data pointer to the data.
1000 ** @param dimension dimension of the SVM model.
1001 ** @param numData num training samples.
1002 ** @param labels training samples labels.
1003 ** @param lambda regularizer parameter.
1004 ** @return the new object.
1005 **
1006 ** After calling this function, ::vl_svm_set_data_functions *must*
1007 ** be used to setup suitable callbacks for the inner product
1008 ** and accumulation operations (@see svm-data-abstraction).
1009 **
1010 ** @sa ::vl_svm_delete
1011 **/
1012
1013 VlSvm *
vl_svm_new_with_abstract_data(VlSvmSolverType solver,void * data,vl_size dimension,vl_size numData,double const * labels,double lambda)1014 vl_svm_new_with_abstract_data (VlSvmSolverType solver,
1015 void * data,
1016 vl_size dimension,
1017 vl_size numData,
1018 double const * labels,
1019 double lambda)
1020 {
1021 VlSvm * self = vl_calloc(1,sizeof(VlSvm)) ;
1022
1023 assert(dimension >= 1) ;
1024 assert(numData >= 1) ;
1025 assert(labels) ;
1026
1027 self->solver = solver ;
1028
1029 self->dimension = dimension ;
1030 self->model = 0 ;
1031 self->bias = 0 ;
1032 self->biasMultiplier = 1.0 ;
1033
1034 self->lambda = lambda ;
1035 self->data = data ;
1036 self->numData = numData ;
1037 self->labels = labels ;
1038
1039 self->diagnosticFrequency = numData ;
1040 self->diagnosticFn = 0 ;
1041 self->diagnosticFnData = 0 ;
1042
1043 self->lossFn = vl_svm_hinge_loss ;
1044 self->conjugateLossFn = vl_svm_hinge_conjugate_loss ;
1045 self->lossDerivativeFn = vl_svm_hinge_loss_derivative ;
1046 self->dcaUpdateFn = vl_svm_hinge_dca_update ;
1047
1048 self->innerProductFn = 0 ;
1049 self->accumulateFn = 0 ;
1050
1051 self->iteration = 0 ;
1052 self->maxNumIterations = VL_MAX((double)numData, vl_ceil_f(10.0 / lambda)) ;
1053 self->epsilon = 1e-2 ;
1054
1055 /* SGD */
1056 self->biasLearningRate = 0.01 ;
1057
1058 /* SDCA */
1059 self->alpha = 0 ;
1060
1061 /* allocations */
1062 self->model = vl_calloc(dimension, sizeof(double)) ;
1063 if (self->model == NULL) goto err_alloc ;
1064
1065 if (self->solver == VlSvmSolverSdca) {
1066 self->alpha = vl_calloc(self->numData, sizeof(double)) ;
1067 if (self->alpha == NULL) goto err_alloc ;
1068 }
1069
1070 self->scores = vl_calloc(numData, sizeof(double)) ;
1071 if (self->scores == NULL) goto err_alloc ;
1072
1073 return self ;
1074
1075 err_alloc:
1076 if (self->scores) {
1077 vl_free (self->scores) ;
1078 self->scores = 0 ;
1079 }
1080 if (self->model) {
1081 vl_free (self->model) ;
1082 self->model = 0 ;
1083 }
1084 if (self->alpha) {
1085 vl_free (self->alpha) ;
1086 self->alpha = 0 ;
1087 }
1088 return 0 ;
1089 }
1090
1091 /** @brief Delete object.
1092 ** @param self object.
1093 ** @sa ::vl_svm_new
1094 **/
1095
1096 void
vl_svm_delete(VlSvm * self)1097 vl_svm_delete (VlSvm * self)
1098 {
1099 if (self->model) {
1100 vl_free (self->model) ;
1101 self->model = 0 ;
1102 }
1103 if (self->alpha) {
1104 vl_free (self->alpha) ;
1105 self->alpha = 0 ;
1106 }
1107 if (self->ownDataset) {
1108 vl_svmdataset_delete(self->ownDataset) ;
1109 self->ownDataset = 0 ;
1110 }
1111 vl_free (self) ;
1112 }
1113
1114 /* ---------------------------------------------------------------- */
1115 /* Setters and getters */
1116 /* ---------------------------------------------------------------- */
1117
1118 /** @brief Set the convergence threshold
1119 ** @param self object
1120 ** @param epsilon threshold (non-negative).
1121 **/
1122
vl_svm_set_epsilon(VlSvm * self,double epsilon)1123 void vl_svm_set_epsilon (VlSvm *self, double epsilon)
1124 {
1125 assert(self) ;
1126 assert(epsilon >= 0) ;
1127 self->epsilon = epsilon ;
1128 }
1129
1130 /** @brief Get the convergence threshold
1131 ** @param self object
1132 ** @return epsilon threshold.
1133 **/
1134
vl_svm_get_epsilon(VlSvm const * self)1135 double vl_svm_get_epsilon (VlSvm const *self)
1136 {
1137 assert(self) ;
1138 return self->epsilon ;
1139 }
1140
1141 /** @brief Set the bias learning rate
1142 ** @param self object
1143 ** @param rate bias learning rate (positive).
1144 **
1145 ** This parameter applies only to the SGD solver.
1146 **/
1147
vl_svm_set_bias_learning_rate(VlSvm * self,double rate)1148 void vl_svm_set_bias_learning_rate (VlSvm *self, double rate)
1149 {
1150 assert(self) ;
1151 assert(rate > 0) ;
1152 self->biasLearningRate = rate ;
1153 }
1154
1155 /** @brief Get the bias leraning rate.
1156 ** @param self object
1157 ** @return bias learning rate.
1158 **/
1159
vl_svm_get_bias_learning_rate(VlSvm const * self)1160 double vl_svm_get_bias_learning_rate (VlSvm const *self)
1161 {
1162 assert(self) ;
1163 return self->biasLearningRate ;
1164 }
1165
1166 /** @brief Set the bias multiplier.
1167 ** @param self object
1168 ** @param b bias multiplier.
1169 **
1170 ** The *bias multiplier* is the value of the constant feature
1171 ** appended to the data vectors to implement the bias (@ref svm-bias).
1172 **/
1173
vl_svm_set_bias_multiplier(VlSvm * self,double b)1174 void vl_svm_set_bias_multiplier (VlSvm * self, double b)
1175 {
1176 assert(self) ;
1177 assert(b >= 0) ;
1178 self->biasMultiplier = b ;
1179 }
1180
1181 /** @brief Get the bias multiplier.
1182 ** @param self object.
1183 ** @return bias multiplier.
1184 **/
1185
vl_svm_get_bias_multiplier(VlSvm const * self)1186 double vl_svm_get_bias_multiplier (VlSvm const * self)
1187 {
1188 assert(self) ;
1189 return self->biasMultiplier ;
1190 }
1191
1192 /** @brief Set the current iteratio number.
1193 ** @param self object.
1194 ** @param n iteration number.
1195 **
1196 ** If called before training,
1197 ** this can be used with SGD for a warm start, as the net
1198 ** effect is to slow down the learning rate.
1199 **/
1200
vl_svm_set_iteration_number(VlSvm * self,vl_uindex n)1201 void vl_svm_set_iteration_number (VlSvm *self, vl_uindex n)
1202 {
1203 assert(self) ;
1204 self->iteration = n ;
1205 }
1206
1207 /** @brief Get the current iteration number.
1208 ** @param self object.
1209 ** @return current iteration number.
1210 **/
1211
vl_svm_get_iteration_number(VlSvm const * self)1212 vl_size vl_svm_get_iteration_number (VlSvm const *self)
1213 {
1214 assert(self) ;
1215 return self->iteration ;
1216 }
1217
1218 /** @brief Set the maximum number of iterations.
1219 ** @param self object.
1220 ** @param n maximum number of iterations.
1221 **/
1222
vl_svm_set_max_num_iterations(VlSvm * self,vl_size n)1223 void vl_svm_set_max_num_iterations (VlSvm *self, vl_size n)
1224 {
1225 assert(self) ;
1226 self->maxNumIterations = n ;
1227 }
1228
1229 /** @brief Get the maximum number of iterations.
1230 ** @param self object.
1231 ** @return maximum number of iterations.
1232 **/
1233
vl_svm_get_max_num_iterations(VlSvm const * self)1234 vl_size vl_svm_get_max_num_iterations (VlSvm const *self)
1235 {
1236 assert(self) ;
1237 return self->maxNumIterations ;
1238 }
1239
1240 /** @brief Set the diagnostic frequency.
1241 ** @param self object.
1242 ** @param f diagnostic frequency (@c >= 1).
1243 **
1244 ** A diagnostic round (to test for convergence and to printout
1245 ** information) is performed every @a f iterations.
1246 **/
1247
vl_svm_set_diagnostic_frequency(VlSvm * self,vl_size f)1248 void vl_svm_set_diagnostic_frequency (VlSvm *self, vl_size f)
1249 {
1250 assert(self) ;
1251 assert(f > 0) ;
1252 self->diagnosticFrequency = f ;
1253 }
1254
1255 /** @brief Get the diagnostic frequency.
1256 ** @param self object.
1257 ** @return diagnostic frequency.
1258 **/
1259
vl_svm_get_diagnostic_frequency(VlSvm const * self)1260 vl_size vl_svm_get_diagnostic_frequency (VlSvm const *self)
1261 {
1262 assert(self) ;
1263 return self->diagnosticFrequency ;
1264 }
1265
1266 /** @brief Get the SVM solver type.
1267 ** @param self object.
1268 ** @return SVM solver type.
1269 **/
1270
vl_svm_get_solver(VlSvm const * self)1271 VlSvmSolverType vl_svm_get_solver (VlSvm const * self)
1272 {
1273 assert(self) ;
1274 return self->solver ;
1275 }
1276
1277 /** @brief Set the regularizer parameter lambda.
1278 ** @param self object.
1279 ** @param lambda regularizer parameter.
1280 **
1281 ** Note that @a lambda is usually set when calling a
1282 ** constructor for ::VlSvm as certain parameters, such
1283 ** as the maximum number of iterations, are tuned accordingly.
1284 ** This tuning is not performed when @a lambda is changed
1285 ** using this function.
1286 **/
1287
vl_svm_set_lambda(VlSvm * self,double lambda)1288 void vl_svm_set_lambda (VlSvm * self, double lambda)
1289 {
1290 assert(self) ;
1291 assert(lambda >= 0) ;
1292 self->lambda = lambda ;
1293 }
1294
1295 /** @brief Get the regularizer parameter lambda.
1296 ** @param self object.
1297 ** @return diagnostic frequency.
1298 **/
1299
vl_svm_get_lambda(VlSvm const * self)1300 double vl_svm_get_lambda (VlSvm const * self)
1301 {
1302 assert(self) ;
1303 return self->lambda ;
1304 }
1305
1306 /** @brief Set the data weights.
1307 ** @param self object.
1308 ** @param weights data weights.
1309 **
1310 ** @a weights must be an array of non-negative weights.
1311 ** The loss of each data point is multiplied by the corresponding
1312 ** weight.
1313 **
1314 ** Set @a weights to @c NULL to weight the data uniformly by 1 (default).
1315 **
1316 ** Note that the @a weights array is *not* copied and must be valid
1317 ** througout the object lifetime (unless it is replaced).
1318 **/
1319
vl_svm_set_weights(VlSvm * self,double const * weights)1320 void vl_svm_set_weights (VlSvm * self, double const *weights)
1321 {
1322 assert(self) ;
1323 self->weights = weights ;
1324 }
1325
1326 /** @brief Get the data weights.
1327 ** @param self object.
1328 ** @return data weights.
1329 **/
1330
vl_svm_get_weights(VlSvm const * self)1331 double const *vl_svm_get_weights (VlSvm const * self)
1332 {
1333 assert(self) ;
1334 return self->weights ;
1335 }
1336
1337 /* ---------------------------------------------------------------- */
1338 /* Get data */
1339 /* ---------------------------------------------------------------- */
1340
1341 /** @brief Get the model dimenison.
1342 ** @param self object.
1343 ** @return model dimension.
1344 **
1345 ** This is the dimensionality of the weight vector $\bw$.
1346 **/
1347
vl_svm_get_dimension(VlSvm * self)1348 vl_size vl_svm_get_dimension (VlSvm *self)
1349 {
1350 assert(self) ;
1351 return self->dimension ;
1352 }
1353
1354 /** @brief Get the number of data samples.
1355 ** @param self object.
1356 ** @return model number of data samples
1357 **
1358 ** This is the dimensionality of the weight vector $\bw$.
1359 **/
1360
vl_svm_get_num_data(VlSvm * self)1361 vl_size vl_svm_get_num_data (VlSvm *self)
1362 {
1363 assert(self) ;
1364 return self->numData ;
1365 }
1366
1367 /** @brief Get the SVM model.
1368 ** @param self object.
1369 ** @return model.
1370 **
1371 ** This is the weight vector $\bw$.
1372 **/
1373
vl_svm_get_model(VlSvm const * self)1374 double const * vl_svm_get_model (VlSvm const *self)
1375 {
1376 assert(self) ;
1377 return self->model ;
1378 }
1379
1380 /** @brief Set the SVM model.
1381 ** @param self object.
1382 ** @param model model.
1383 **
1384 ** The function *copies* the content of the vector @a model to the
1385 ** internal model buffer. This operation can be used for warm start
1386 ** with the SGD algorithm, but has undefined effect with the SDCA algorithm.
1387 **/
1388
vl_svm_set_model(VlSvm * self,double const * model)1389 void vl_svm_set_model (VlSvm *self, double const *model)
1390 {
1391 assert(self) ;
1392 assert(model) ;
1393 memcpy(self->model, model, sizeof(double) * vl_svm_get_dimension(self)) ;
1394 }
1395
1396 /** @brief Set the SVM bias.
1397 ** @param self object.
1398 ** @param b bias.
1399 **
1400 ** The function set the internal representation of the SVM bias to
1401 ** be equal to @a b (the bias multiplier
1402 ** is applied). The same remark
1403 ** that applies to ::vl_svm_set_model applies here too.
1404 **/
1405
vl_svm_set_bias(VlSvm * self,double b)1406 void vl_svm_set_bias (VlSvm *self, double b)
1407 {
1408 assert(self);
1409 if (self->biasMultiplier) {
1410 self->bias = b / self->biasMultiplier ;
1411 }
1412 }
1413
1414 /** @brief Get the value of the bias.
1415 ** @param self object.
1416 ** @return bias $b$.
1417 **
1418 ** The value of the bias returned already include the effect of
1419 ** bias mutliplier.
1420 **/
1421
vl_svm_get_bias(VlSvm const * self)1422 double vl_svm_get_bias (VlSvm const *self)
1423 {
1424 assert(self) ;
1425 return self->bias * self->biasMultiplier ;
1426 }
1427
1428 /** @brief Get the solver statistics.
1429 ** @param self object.
1430 ** @return statistics.
1431 **/
1432
vl_svm_get_statistics(VlSvm const * self)1433 VlSvmStatistics const * vl_svm_get_statistics (VlSvm const *self)
1434 {
1435 assert(self) ;
1436 return &self->statistics ;
1437 }
1438
1439 /** @brief Get the scores of the data points.
1440 ** @param self object.
1441 ** @return vector of scores.
1442 **
1443 ** After training or during the diagnostic callback,
1444 ** this function can be used to retrieve the scores
1445 ** of the points, i.e. $\langle \bx_i, \bw \rangle + b$.
1446 **/
1447
vl_svm_get_scores(VlSvm const * self)1448 double const * vl_svm_get_scores (VlSvm const *self)
1449 {
1450 return self->scores ;
1451 }
1452
1453 /* ---------------------------------------------------------------- */
1454 /* Callbacks */
1455 /* ---------------------------------------------------------------- */
1456
1457 /** @typedef VlSvmDiagnosticFunction
1458 ** @brief SVM diagnostic function pointer.
1459 ** @param svm is an instance of ::VlSvm .
1460 **/
1461
1462 /** @typedef VlSvmAccumulateFunction
1463 ** @brief Pointer to a function that adds to @a model the data point at
1464 ** position @a element multiplied by the constant @a multiplier.
1465 **/
1466
1467 /** @typedef VlSvmInnerProductFunction
1468 ** @brief Pointer to a function that defines the inner product
1469 ** between the data point at position @a element and the SVM model
1470 **/
1471
1472 /** @brief Set the diagnostic function callback
1473 ** @param self object.
1474 ** @param f diagnostic function pointer.
1475 ** @param data pointer to data used by the diagnostic function.
1476 **/
1477
1478 void
vl_svm_set_diagnostic_function(VlSvm * self,VlSvmDiagnosticFunction f,void * data)1479 vl_svm_set_diagnostic_function (VlSvm *self, VlSvmDiagnosticFunction f, void *data) {
1480 self->diagnosticFn = f ;
1481 self->diagnosticFnData = data ;
1482 }
1483
1484 /** @brief Set the data functions.
1485 ** @param self object.
1486 ** @param inner inner product function.
1487 ** @param acc accumulate function.
1488 **
1489 ** See @ref svm-data-abstraction.
1490 **/
1491
vl_svm_set_data_functions(VlSvm * self,VlSvmInnerProductFunction inner,VlSvmAccumulateFunction acc)1492 void vl_svm_set_data_functions (VlSvm *self, VlSvmInnerProductFunction inner, VlSvmAccumulateFunction acc)
1493 {
1494 assert(self) ;
1495 assert(inner) ;
1496 assert(acc) ;
1497 self->innerProductFn = inner ;
1498 self->accumulateFn = acc ;
1499 }
1500
1501 /** @brief Set the loss function callback.
1502 ** @param self object.
1503 ** @param f loss function callback.
1504 **
1505 ** Note that setting up a loss requires specifying more than just one
1506 ** callback. See @ref svm-loss-functions for details.
1507 **/
1508
vl_svm_set_loss_function(VlSvm * self,VlSvmLossFunction f)1509 void vl_svm_set_loss_function (VlSvm *self, VlSvmLossFunction f)
1510 {
1511 assert(self) ;
1512 self->lossFn = f ;
1513 }
1514
1515 /** @brief Set the loss derivative function callback.
1516 ** @copydetails vl_svm_set_loss_function.
1517 **/
1518
vl_svm_set_loss_derivative_function(VlSvm * self,VlSvmLossFunction f)1519 void vl_svm_set_loss_derivative_function (VlSvm *self, VlSvmLossFunction f)
1520 {
1521 assert(self) ;
1522 self->lossDerivativeFn = f ;
1523 }
1524
1525 /** @brief Set the conjugate loss function callback.
1526 ** @copydetails vl_svm_set_loss_function.
1527 **/
1528
vl_svm_set_conjugate_loss_function(VlSvm * self,VlSvmLossFunction f)1529 void vl_svm_set_conjugate_loss_function (VlSvm *self, VlSvmLossFunction f)
1530 {
1531 assert(self) ;
1532 self->conjugateLossFn = f ;
1533 }
1534
1535 /** @brief Set the DCA update function callback.
1536 ** @copydetails vl_svm_set_loss_function.
1537 **/
1538
vl_svm_set_dca_update_function(VlSvm * self,VlSvmDcaUpdateFunction f)1539 void vl_svm_set_dca_update_function (VlSvm *self, VlSvmDcaUpdateFunction f)
1540 {
1541 assert(self) ;
1542 self->dcaUpdateFn = f ;
1543 }
1544
/** @brief Set the loss function to one of the default types.
 ** @param self object.
 ** @param loss type of loss function.
 **
 ** In a single call this installs all four callbacks associated with
 ** a predefined loss: the loss itself, its derivative (SGD), its
 ** conjugate, and its DCA update rule (SDCA).
 **
 ** @sa @ref svm-loss-functions.
 **/

void
vl_svm_set_loss (VlSvm *self, VlSvmLossType loss)
{
  /* Expands to a switch case that installs the four callbacks
     vl_svm_<y>_loss, vl_svm_<y>_loss_derivative,
     vl_svm_<y>_conjugate_loss, and vl_svm_<y>_dca_update. */
#define SETLOSS(x,y) \
case VlSvmLoss ## x: \
  vl_svm_set_loss_function(self, vl_svm_ ## y ## _loss) ; \
  vl_svm_set_loss_derivative_function(self, vl_svm_ ## y ## _loss_derivative) ; \
  vl_svm_set_conjugate_loss_function(self, vl_svm_ ## y ## _conjugate_loss) ; \
  vl_svm_set_dca_update_function(self, vl_svm_ ## y ## _dca_update) ; \
  break;

  switch (loss) {
    SETLOSS(Hinge, hinge) ;
    SETLOSS(Hinge2, hinge2) ;
    SETLOSS(L1, l1) ;
    SETLOSS(L2, l2) ;
    SETLOSS(Logistic, logistic) ;
  default:
    /* unknown loss type */
    assert(0) ;
  }
}
1572
1573 /* ---------------------------------------------------------------- */
1574 /* Pre-defined losses */
1575 /* ---------------------------------------------------------------- */
1576
1577 /** @typedef VlSvmLossFunction
1578 ** @brief SVM loss function pointer.
1579 ** @param inner inner product between sample and model $\bw^\top \bx$.
1580 ** @param label sample label $y$.
1581 ** @return value of the loss.
1582 **
1583 ** The interface is the same for a loss function, its derivative,
1584 ** or the conjugate loss.
1585 **
1586 ** @sa @ref svm-fundamentals
1587 **/
1588
1589 /** @typedef VlSvmDcaUpdateFunction
1590 ** @brief SVM SDCA update function pointer.
1591 ** @param alpha current value of the dual variable.
1592 ** @param inner inner product $\bw^\top \bx$ of the sample with the SVM model.
1593 ** @param norm2 normalization factor $\|\bx\|^2/\lambda n$.
1594 ** @param label label $y$ of the sample.
1595 ** @return incremental update $\Delta\alpha$ of the dual variable.
1596 **
1597 ** @sa @ref svm-sdca
1598 **/
1599
/** @brief SVM hinge loss $\max(0, 1 - y \langle \bw,\bx \rangle)$.
 ** @copydetails VlSvmLossFunction */
double
vl_svm_hinge_loss (double inner, double label)
{
  double margin = 1.0 - label * inner ;
  return (margin > 0.0) ? margin : 0.0 ;
}
1607
/** @brief SVM hinge loss derivative (subgradient at the kink).
 ** @copydetails VlSvmLossFunction */
double
vl_svm_hinge_loss_derivative (double inner, double label)
{
  /* zero outside the margin, -y inside */
  if (label * inner >= 1.0) return 0.0 ;
  return - label ;
}
1619
1620 /** @brief SVM hinge loss conjugate
1621 ** @param u dual variable.
1622 ** @param label label value.
1623 ** @return conjugate loss.
1624 **/
1625 double
vl_svm_hinge_conjugate_loss(double u,double label)1626 vl_svm_hinge_conjugate_loss (double u, double label) {
1627 double z = label * u ;
1628 if (-1 <= z && z <= 0) {
1629 return label * u ;
1630 } else {
1631 return VL_INFINITY_D ;
1632 }
1633 }
1634
/** @brief SVM hinge loss DCA update.
 ** @copydetails VlSvmDcaUpdateFunction */
double
vl_svm_hinge_dca_update (double alpha, double inner, double norm2, double label) {
  double palpha = (label - inner) / norm2 + alpha ;
  /* clamp y * palpha to the box [0, 1] */
  double boxed = label * palpha ;
  if (boxed > 1.0) boxed = 1.0 ;
  if (boxed < 0.0) boxed = 0.0 ;
  return label * boxed - alpha ;
}
1642
/** @brief SVM square hinge loss $\max(0, 1 - y \langle \bw,\bx \rangle)^2$.
 ** @copydetails VlSvmLossFunction */
double
vl_svm_hinge2_loss (double inner, double label)
{
  double margin = 1.0 - label * inner ;
  return (margin > 0.0) ? margin * margin : 0.0 ;
}
1651
/** @brief SVM square hinge loss derivative.
 ** @copydetails VlSvmLossFunction */
double
vl_svm_hinge2_loss_derivative (double inner, double label)
{
  /* zero outside the margin */
  return (label * inner < 1.0) ? 2.0 * (inner - label) : 0.0 ;
}
1663
1664 /** @brief SVM square hinge loss conjugate
1665 ** @copydetails vl_svm_hinge_conjugate_loss */
1666 double
vl_svm_hinge2_conjugate_loss(double u,double label)1667 vl_svm_hinge2_conjugate_loss (double u, double label) {
1668 if (label * u <= 0) {
1669 return (label + u/4) * u ;
1670 } else {
1671 return VL_INFINITY_D ;
1672 }
1673 }
1674
/** @brief SVM square hinge loss DCA update.
 ** @copydetails VlSvmDcaUpdateFunction */
double
vl_svm_hinge2_dca_update (double alpha, double inner, double norm2, double label) {
  double palpha = (label - inner - 0.5*alpha) / (norm2 + 0.5) + alpha ;
  double proj = label * palpha ;
  /* project y * palpha onto [0, +inf) */
  return (proj > 0.0) ? label * proj - alpha : - alpha ;
}
1682
/** @brief SVM l1 loss $|y - \langle \bw,\bx \rangle|$.
 ** @copydetails VlSvmLossFunction */
double
vl_svm_l1_loss (double inner, double label)
{
  double residual = label - inner ;
  return (residual >= 0.0) ? residual : - residual ;
}
1690
/** @brief SVM l1 loss derivative (sign of the residual).
 ** @copydetails VlSvmLossFunction */
double
vl_svm_l1_loss_derivative (double inner, double label)
{
  return (label > inner) ? -1.0 : +1.0 ;
}
1702
1703 /** @brief SVM l1 loss conjugate
1704 ** @copydetails vl_svm_hinge_conjugate_loss */
1705 double
vl_svm_l1_conjugate_loss(double u,double label)1706 vl_svm_l1_conjugate_loss (double u, double label) {
1707 if (vl_abs_d(u) <= 1) {
1708 return label*u ;
1709 } else {
1710 return VL_INFINITY_D ;
1711 }
1712 }
1713
1714 /** @brief SVM l1 loss DCA update
1715 ** @copydetails VlSvmDcaUpdateFunction */
1716 double
vl_svm_l1_dca_update(double alpha,double inner,double norm2,double label)1717 vl_svm_l1_dca_update (double alpha, double inner, double norm2, double label) {
1718 if (vl_abs_d(alpha) <= 1) {
1719 double palpha = (label - inner) / norm2 + alpha ;
1720 return VL_MAX(-1.0, VL_MIN(1.0, palpha)) - alpha ;
1721 } else {
1722 return VL_INFINITY_D ;
1723 }
1724 }
1725
/** @brief SVM l2 loss $(y - \langle \bw,\bx \rangle)^2$.
 ** @copydetails VlSvmLossFunction */
double
vl_svm_l2_loss (double inner, double label)
{
  double residual = label - inner ;
  return residual * residual ;
}
1734
/** @brief SVM l2 loss derivative.
 ** @copydetails VlSvmLossFunction */
double
vl_svm_l2_loss_derivative (double inner, double label)
{
  return 2.0 * (inner - label) ;
}
1742
/** @brief SVM l2 loss conjugate.
 ** @copydetails vl_svm_hinge_conjugate_loss */
double
vl_svm_l2_conjugate_loss (double u, double label) {
  /* finite everywhere: u (y + u/4) */
  return u * (label + u/4) ;
}
1749
/** @brief SVM l2 loss DCA update.
 ** @copydetails VlSvmDcaUpdateFunction */
double
vl_svm_l2_dca_update (double alpha, double inner, double norm2, double label) {
  double numerator = label - inner - 0.5*alpha ;
  return numerator / (norm2 + 0.5) ;
}
1756
1757 /** @brief SVM l2 loss
1758 ** @copydetails VlSvmLossFunction */
1759 double
vl_svm_logistic_loss(double inner,double label)1760 vl_svm_logistic_loss (double inner,double label)
1761 {
1762 double z = label * inner ;
1763 if (z >= 0) {
1764 return log(1.0 + exp(-z)) ;
1765 } else {
1766 return -z + log(exp(z) + 1.0) ;
1767 }
1768 }
1769
1770 /** @brief SVM l2 loss derivative
1771 ** @copydetails VlSvmLossFunction */
1772 double
vl_svm_logistic_loss_derivative(double inner,double label)1773 vl_svm_logistic_loss_derivative (double inner, double label)
1774 {
1775 double z = label * inner ;
1776 double t = 1 / (1 + exp(-z)) ; /* this is stable for z << 0 too */
1777 return label * (t - 1) ; /* = -label exp(-z) / (1 + exp(-z)) */
1778 }
1779
xlogx(double x)1780 VL_INLINE double xlogx(double x)
1781 {
1782 if (x <= 1e-10) return 0 ;
1783 return x*log(x) ;
1784 }
1785
1786 /** @brief SVM l2 loss conjugate
1787 ** @copydetails vl_svm_hinge_conjugate_loss */
1788 double
vl_svm_logistic_conjugate_loss(double u,double label)1789 vl_svm_logistic_conjugate_loss (double u, double label) {
1790 double z = label * u ;
1791 if (-1 <= z && z <= 0) {
1792 return xlogx(-z) + xlogx(1+z) ;
1793 } else {
1794 return VL_INFINITY_D ;
1795 }
1796 }
1797
1798 /** @brief SVM l2 loss DCA update
1799 ** @copydetails VlSvmDcaUpdateFunction */
1800 double
vl_svm_logistic_dca_update(double alpha,double inner,double norm2,double label)1801 vl_svm_logistic_dca_update (double alpha, double inner, double norm2, double label) {
1802 /*
1803 The goal is to solve the problem
1804
1805 min_delta A/2 delta^2 + B delta + l*(-alpha - delta|y), -1 <= - y (alpha+delta) <= 0
1806
1807 where A = norm2, B = inner, and y = label. To simplify the notation, we set
1808
1809 f(beta) = beta * log(beta) + (1 - beta) * log(1 - beta)
1810
1811 where beta = y(alpha + delta) such that
1812
1813 l*(-alpha - delta |y) = f(beta).
1814
1815 Hence 0 <= beta <= 1, delta = + y beta - alpha. Substituting
1816
1817 min_beta A/2 beta^2 + y (B - A alpha) beta + f(beta) + const
1818
1819 The Newton step is then given by
1820
1821 beta = beta - (A beta + y(B - A alpha) + df) / (A + ddf).
1822
1823 However, the function is singluar for beta=0 and beta=1 (infinite
1824 first and second order derivatives). Since the function is monotonic
1825 (second derivarive always strictly greater than zero) and smooth,
1826 we canuse bisection to find the zero crossing of the first derivative.
1827 Once one is sufficiently close to the optimum, a one or two Newton
1828 steps are sufficien to land on it with excellent accuracy.
1829 */
1830
1831 double df, ddf, der, dder ;
1832 vl_index t ;
1833
1834 /* bisection */
1835 double beta1 = 0 ;
1836 double beta2 = 1 ;
1837 double beta = 0.5 ;
1838
1839 for (t = 0 ; t < 5 ; ++t) {
1840 df = log(beta) - log(1-beta) ;
1841 der = norm2 * beta + label * (inner - norm2*alpha) + df ;
1842 if (der >= 0) {
1843 beta2 = beta ;
1844 } else {
1845 beta1 = beta ;
1846 }
1847 beta = 0.5 * (beta1 + beta2) ;
1848 }
1849
1850 #if 1
1851 /* a final Newton step, but not too close to the singularities */
1852 for (t = 0 ; (t < 2) & (beta > VL_EPSILON_D) & (beta < 1-VL_EPSILON_D) ; ++t) {
1853 df = log(beta) - log(1-beta) ;
1854 ddf = 1 / (beta * (1-beta)) ;
1855 der = norm2 * beta + label * (inner - norm2*alpha) + df ;
1856 dder = norm2 + ddf ;
1857 beta -= der / dder ;
1858 beta = VL_MAX(0, VL_MIN(1, beta)) ;
1859 }
1860 #endif
1861
1862 return label * beta - alpha ;
1863 }
1864
1865 /* ---------------------------------------------------------------- */
1866
1867 /** @internal @brief Update SVM statistics
1868 ** @param self object.
1869 **/
1870
_vl_svm_update_statistics(VlSvm * self)1871 void _vl_svm_update_statistics (VlSvm *self)
1872 {
1873 vl_size i, k ;
1874 double inner, p ;
1875
1876 memset(&self->statistics, 0, sizeof(VlSvmStatistics)) ;
1877
1878 self->statistics.regularizer = self->bias * self->bias ;
1879 for (i = 0; i < self->dimension; i++) {
1880 self->statistics.regularizer += self->model[i] * self->model[i] ;
1881 }
1882 self->statistics.regularizer *= self->lambda * 0.5 ;
1883
1884 for (k = 0; k < self->numData ; k++) {
1885 p = (self->weights) ? self->weights[k] : 1.0 ;
1886 if (p <= 0) continue ;
1887 inner = self->innerProductFn(self->data, k, self->model) ;
1888 inner += self->bias * self->biasMultiplier ;
1889 self->scores[k] = inner ;
1890 self->statistics.loss += p * self->lossFn(inner, self->labels[k]) ;
1891 if (self->solver == VlSvmSolverSdca) {
1892
1893 self->statistics.dualLoss -= p * self->conjugateLossFn(- self->alpha[k] / p, self->labels[k]) ;
1894 }
1895 }
1896
1897 self->statistics.loss /= self->numData ;
1898 self->statistics.objective = self->statistics.regularizer + self->statistics.loss ;
1899
1900 if (self->solver == VlSvmSolverSdca) {
1901 self->statistics.dualLoss /= self->numData ;
1902 self->statistics.dualObjective = - self->statistics.regularizer + self->statistics.dualLoss ;
1903 self->statistics.dualityGap = self->statistics.objective - self->statistics.dualObjective ;
1904 }
1905 }
1906
1907 /* ---------------------------------------------------------------- */
1908 /* Evaluate rather than solve */
1909 /* ---------------------------------------------------------------- */
1910
_vl_svm_evaluate(VlSvm * self)1911 void _vl_svm_evaluate (VlSvm *self)
1912 {
1913 double startTime = vl_get_cpu_time () ;
1914
1915 _vl_svm_update_statistics (self) ;
1916
1917 self->statistics.elapsedTime = vl_get_cpu_time() - startTime ;
1918 self->statistics.iteration = 0 ;
1919 self->statistics.epoch = 0 ;
1920 self->statistics.status = VlSvmStatusConverged ;
1921
1922 if (self->diagnosticFn) {
1923 self->diagnosticFn(self, self->diagnosticFnData) ;
1924 }
1925 }
1926
/* ---------------------------------------------------------------- */
/*                         Stochastic Dual Coordinate Ascent Solver */
/* ---------------------------------------------------------------- */

void _vl_svm_sdca_train (VlSvm *self)
{
  double * norm2 ;
  vl_index * permutation ;
  vl_uindex i, t ;
  double inner, delta, multiplier, p ;

  double startTime = vl_get_cpu_time () ;
  VlRand * rand = vl_get_rand() ;

  /* norm2[i] caches |x_i|^2 / (lambda n) for the DCA update; the
     bias dimension contributes biasMultiplier^2. */
  norm2 = (double*) vl_calloc(self->numData, sizeof(double));
  permutation = vl_calloc(self->numData, sizeof(vl_index)) ;

  {
    double * buffer = vl_calloc(self->dimension, sizeof(double)) ;
    for (i = 0 ; i < (unsigned)self->numData; i++) {
      double n2 ;
      permutation [i] = i ;
      /* Materialize x_i into buffer, then take <x_i, x_i>. */
      memset(buffer, 0, self->dimension * sizeof(double)) ;
      self->accumulateFn (self->data, i, buffer, 1) ;
      n2 = self->innerProductFn (self->data, i, buffer) ;
      n2 += self->biasMultiplier * self->biasMultiplier ;
      norm2[i] = n2 / (self->lambda * self->numData) ;
    }
    vl_free(buffer) ;
  }

  for (t = 0 ; 1 ; ++t) {

    if (t % self->numData == 0) {
      /* once a new epoch is reached (all data have been visited),
         change permutation */
      vl_rand_permute_indexes(rand, permutation, self->numData) ;
    }

    /* pick a sample and compute update */
    i = permutation[t % self->numData] ;
    p = (self->weights) ? self->weights[i] : 1.0 ;
    if (p > 0) {
      inner = self->innerProductFn(self->data, i, self->model) ;
      inner += self->bias * self->biasMultiplier ;
      /* The per-sample weight p rescales the dual variable and the
         cached norm passed to the update function. */
      delta = p * self->dcaUpdateFn(self->alpha[i] / p, inner, p * norm2[i], self->labels[i]) ;
    } else {
      /* Non-positive weights effectively remove the sample. */
      delta = 0 ;
    }

    /* apply update to the dual variable, the model, and the bias */
    if (delta != 0) {
      self->alpha[i] += delta ;
      multiplier = delta / (self->numData * self->lambda) ;
      self->accumulateFn(self->data,i,self->model,multiplier) ;
      self->bias += self->biasMultiplier * multiplier;
    }

    /* call diagnostic occasionally */
    if ((t + 1) % self->diagnosticFrequency == 0 || t + 1 == self->maxNumIterations) {
      _vl_svm_update_statistics (self) ;
      self->statistics.elapsedTime = vl_get_cpu_time() - startTime ;
      self->statistics.iteration = t ;
      self->statistics.epoch = t / self->numData ;

      /* Converged once the duality gap drops below epsilon. */
      self->statistics.status = VlSvmStatusTraining ;
      if (self->statistics.dualityGap < self->epsilon) {
        self->statistics.status = VlSvmStatusConverged ;
      }
      else if (t + 1 == self->maxNumIterations) {
        self->statistics.status = VlSvmStatusMaxNumIterationsReached ;
      }

      if (self->diagnosticFn) {
        self->diagnosticFn(self, self->diagnosticFnData) ;
      }

      if (self->statistics.status != VlSvmStatusTraining) {
        break ;
      }
    }
  } /* next iteration */

  vl_free (norm2) ;
  vl_free (permutation) ;
}
2013
/* ---------------------------------------------------------------- */
/*                               Stochastic Gradient Descent Solver */
/* ---------------------------------------------------------------- */

void _vl_svm_sgd_train (VlSvm *self)
{
  vl_index * permutation ;
  double * scores ;
  double * previousScores ;
  vl_uindex i, t, k ;
  double inner, gradient, rate, biasRate, p ;
  double factor = 1.0 ;
  double biasFactor = 1.0 ; /* to allow slower bias learning rate */
  /* Learning-rate offset: rate = 1 / (lambda (t + t0)). */
  vl_index t0 = VL_MAX(2, vl_ceil_d(1.0 / self->lambda)) ;

  double startTime = vl_get_cpu_time () ;
  VlRand * rand = vl_get_rand() ;

  permutation = vl_calloc(self->numData, sizeof(vl_index)) ;
  /* A single allocation backs both score buffers:
     [scores | previousScores]; only scores is freed. */
  scores = vl_calloc(self->numData * 2, sizeof(double)) ;
  previousScores = scores + self->numData ;

  for (i = 0 ; i < (unsigned)self->numData; i++) {
    permutation [i] = i ;
    previousScores [i] = - VL_INFINITY_D ;
  }

  /*
   We store the w vector as the product fw (factor * model).
   We also use a different factor for the bias: biasFactor * biasMultiplier
   to enable a slower learning rate for the bias.

   Given this representation, it is easy to carry the two key operations:

   * Inner product: <fw,x> = f <w,x>

   * Model update: fp wp = fw - rate * lambda * w - rate * g
                         = f(1 - rate * lambda) w - rate * g

     Thus the update equations are:

     fp = f(1 - rate * lambda), and
     wp = w + rate / fp * g ;

   * Realization of the scaling factor. Before the statistics function
     is called, or training finishes, the factor (and biasFactor)
     are explicitly applied to the model and the bias.
  */

  for (t = 0 ; 1 ; ++t) {

    if (t % self->numData == 0) {
      /* once a new epoch is reached (all data have been visited),
         change permutation */
      vl_rand_permute_indexes(rand, permutation, self->numData) ;
    }

    /* pick a sample and compute update */
    i = permutation[t % self->numData] ;
    p = (self->weights) ? self->weights[i] : 1.0 ;
    p = VL_MAX(0.0, p) ; /* we assume non-negative weights, so this is just for robustness */
    inner = factor * self->innerProductFn(self->data, i, self->model) ;
    inner += biasFactor * (self->biasMultiplier * self->bias) ;
    gradient = p * self->lossDerivativeFn(inner, self->labels[i]) ;
    /* Remember the previous score so convergence can be measured as
       the variation of the scores between diagnostic checks. */
    previousScores[i] = scores[i] ;
    scores[i] = inner ;

    /* apply update */
    rate = 1.0 / (self->lambda * (t + t0)) ;
    biasRate = rate * self->biasLearningRate ;
    factor *= (1.0 - self->lambda * rate) ;
    biasFactor *= (1.0 - self->lambda * biasRate) ;

    /* debug: realize the scaling factor all the times */
    /*
    for (k = 0 ; k < self->dimension ; ++k) self->model[k] *= factor ;
    self->bias *= biasFactor;
    factor = 1.0 ;
    biasFactor = 1.0 ;
    */

    if (gradient != 0) {
      /* wp = w + rate / fp * g (see derivation above) */
      self->accumulateFn(self->data, i, self->model, - gradient * rate / factor) ;
      self->bias += self->biasMultiplier * (- gradient * biasRate / biasFactor) ;
    }

    /* call diagnostic occasionally */
    if ((t + 1) % self->diagnosticFrequency == 0 || t + 1 == self->maxNumIterations) {

      /* realize factor before computing statistics or completing training */
      for (k = 0 ; k < self->dimension ; ++k) self->model[k] *= factor ;
      self->bias *= biasFactor;
      factor = 1.0 ;
      biasFactor = 1.0 ;

      _vl_svm_update_statistics (self) ;

      /* RMS-style variation of the scores since the last visit of
         each sample, used as the convergence criterion. */
      for (k = 0 ; k < self->numData ; ++k) {
        double delta = scores[k] - previousScores[k] ;
        self->statistics.scoresVariation += delta * delta ;
      }
      self->statistics.scoresVariation = sqrt(self->statistics.scoresVariation) / self->numData ;

      self->statistics.elapsedTime = vl_get_cpu_time() - startTime ;
      self->statistics.iteration = t ;
      self->statistics.epoch = t / self->numData ;

      self->statistics.status = VlSvmStatusTraining ;
      if (self->statistics.scoresVariation < self->epsilon) {
        self->statistics.status = VlSvmStatusConverged ;
      }
      else if (t + 1 == self->maxNumIterations) {
        self->statistics.status = VlSvmStatusMaxNumIterationsReached ;
      }

      if (self->diagnosticFn) {
        self->diagnosticFn(self, self->diagnosticFnData) ;
      }

      if (self->statistics.status != VlSvmStatusTraining) {
        break ;
      }
    }
  } /* next iteration */

  vl_free (scores) ;
  vl_free (permutation) ;
}
2143
2144 /* ---------------------------------------------------------------- */
2145 /* Dispatcher */
2146 /* ---------------------------------------------------------------- */
2147
2148 /** @brief Run the SVM solver
2149 ** @param self object.
2150 **
2151 ** The data on which the SVM operates is passed upon the cration of
2152 ** the ::VlSvm object. This function runs a solver to learn a
2153 ** corresponding model. See @ref svm-starting.
2154 **/
2155
vl_svm_train(VlSvm * self)2156 void vl_svm_train (VlSvm * self)
2157 {
2158 assert (self) ;
2159 switch (self->solver) {
2160 case VlSvmSolverSdca:
2161 _vl_svm_sdca_train(self) ;
2162 break ;
2163 case VlSvmSolverSgd:
2164 _vl_svm_sgd_train(self) ;
2165 break ;
2166 case VlSvmSolverNone:
2167 _vl_svm_evaluate(self) ;
2168 break ;
2169 default:
2170 assert(0) ;
2171 }
2172 }
2173