1 /** @file sift.c
2  ** @brief SIFT - Definition
3  ** @author Andrea Vedaldi
4  **/
5 
6 /*
7 Copyright (C) 2007-12 Andrea Vedaldi and Brian Fulkerson.
8 All rights reserved.
9 
10 This file is part of the VLFeat library and is made available under
11 the terms of the BSD license (see the COPYING file).
12 */
13 
14 /**
15 <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~  -->
16 @page sift Scale Invariant Feature Transform (SIFT)
17 @author Andrea Vedaldi
18 @par "Credits:" May people have contributed with suggestions and bug
19 reports. Although the following list is certainly incomplete, we would
20 like to thank: Wei Dong, Loic, Giuseppe, Liu, Erwin, P. Ivanov, and
21 Q. S. Luo.
22 @tableofcontents
23 <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~  -->
24 
25 @ref sift.h implements a @ref sift-usage "SIFT filter object", a
26 reusable object to extract SIFT features @cite{lowe99object} from one
27 or multiple images.
28 
29 This is the original VLFeat implementation of SIFT, designed to be
30 compatible with Lowe's original SIFT. See @ref covdet for a different
31 version of SIFT integrated in the more general covariant feature
32 detector engine.
33 
34 <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~  -->
35 @section sift-intro Overview
36 <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~  -->
37 
38 A SIFT feature is a selected image region (also called keypoint) with
39 an associated descriptor. Keypoints are extracted by the <b>@ref
40 sift-intro-detector "SIFT detector"</b> and their descriptors are
41 computed by the <b>@ref sift-intro-descriptor "SIFT descriptor"</b>.  It is
42 also common to use independently the SIFT detector (i.e. computing the
43 keypoints without descriptors) or the SIFT descriptor (i.e.  computing
44 descriptors of custom keypoints).
45 
46 <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~  -->
47 @subsection sift-intro-detector SIFT detector
48 <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~  -->
49 
50 A SIFT <em>keypoint</em> is a circular image region with an
51 orientation. It is described by a geometric <em>frame</em> of four
52 parameters: the keypoint center coordinates @e x and @e y, its @e
53 scale (the radius of the region), and its @e orientation (an angle
54 expressed in radians). The SIFT detector uses as keypoints image
55 structures which resemble &ldquo;blobs&rdquo;. By searching for blobs
56 at multiple scales and positions, the SIFT detector is invariant (or,
57 more accurately, covariant) to translation, rotations, and re scaling
58 of the image.
59 
60 The keypoint orientation is also determined from the local image
61 appearance and is covariant to image rotations. Depending on the
62 symmetry of the keypoint appearance, determining the orientation can
63 be ambiguous. In this case, the SIFT detectors returns a list of up to
64 four possible orientations, constructing up to four frames (differing
65 only by their orientation) for each detected image blob.
66 
67 @image html sift-frame.png "SIFT keypoints are circular image regions with an orientation."
68 
69 There are several parameters that influence the detection of SIFT
70 keypoints. First, searching keypoints at multiple scales is obtained
71 by constructing a so-called &ldquo;Gaussian scale space&rdquo;. The
72 scale space is just a collection of images obtained by progressively
73 smoothing the input image, which is analogous to gradually reducing
74 the image resolution. Conventionally, the smoothing level is called
75 <em>scale</em> of the image. The construction of the scale space is
76 influenced by the following parameters, set when creating the SIFT
77 filter object by ::vl_sift_new():
78 
79 - <b>Number of octaves</b>. Increasing the scale by an octave means
80   doubling the size of the smoothing kernel, whose effect is roughly
81   equivalent to halving the image resolution. By default, the scale
82   space spans as many octaves as possible (i.e. roughly <code>
83   log2(min(width,height)</code>), which has the effect of searching
84   keypoints of all possible sizes.
85 - <b>First octave index</b>. By convention, the octave of index 0
86   starts with the image full resolution. Specifying an index greater
87   than 0 starts the scale space at a lower resolution (e.g. 1 halves
88   the resolution). Similarly, specifying a negative index starts the
89   scale space at an higher resolution image, and can be useful to
90   extract very small features (since this is obtained by interpolating
91   the input image, it does not make much sense to go past -1).
92 - <b>Number of levels per octave</b>. Each octave is sampled at this
93   given number of intermediate scales (by default 3). Increasing this
94   number might in principle return more refined keypoints, but in
95   practice can make their selection unstable due to noise (see [1]).
96 
97 Keypoints are further refined by eliminating those that are likely to
98 be unstable, either because they are selected nearby an image edge,
99 rather than an image blob, or are found on image structures with low
100 contrast. Filtering is controlled by the follow:
101 
102 - <b>Peak threshold.</b> This is the minimum amount of contrast to
103   accept a keypoint. It is set by configuring the SIFT filter object
104   by ::vl_sift_set_peak_thresh().
105 - <b>Edge threshold.</b> This is the edge rejection threshold. It is
106   set by configuring the SIFT filter object by
107   ::vl_sift_set_edge_thresh().
108 
109 <table>
110  <caption>Summary of the parameters influencing the SIFT detector.</caption>
111  <tr style="font-weight:bold;">
112  <td>Parameter</td>
113  <td>See also</td>
114  <td>Controlled by</td>
115  <td>Comment</td>
116  </tr>
117  <tr>
118  <td>number of octaves</td>
119  <td> @ref sift-intro-detector </td>
120  <td>::vl_sift_new</td>
121  <td></td>
122  </tr>
123  <tr>
124  <td>first octave index</td>
125  <td> @ref sift-intro-detector </td>
126  <td>::vl_sift_new</td>
127  <td>set to -1 to extract very small features</td>
128  </tr>
129  <tr>
130  <td>number of scale levels per octave</td>
131  <td> @ref sift-intro-detector </td>
132  <td>::vl_sift_new</td>
133  <td>can affect the number of extracted keypoints</td>
134  </tr>
135  <tr>
136  <td>edge threshold</td>
137  <td> @ref sift-intro-detector </td>
138  <td>::vl_sift_set_edge_thresh</td>
139  <td>decrease to eliminate more keypoints</td>
140  </tr>
141  <tr>
142  <td>peak threshold</td>
143  <td> @ref sift-intro-detector </td>
144  <td>::vl_sift_set_peak_thresh</td>
145  <td>increase to eliminate more keypoints</td>
146  </tr>
147 </table>
148 
149 <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~  -->
150 @subsection sift-intro-descriptor SIFT Descriptor
151 <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~  -->
152 
153 @sa @ref sift-tech-descriptor "Descriptor technical details"
154 
155 A SIFT descriptor is a 3-D spatial histogram of the image gradients in
156 characterizing the appearance of a keypoint. The gradient at each
157 pixel is regarded as a sample of a three-dimensional elementary
158 feature vector, formed by the pixel location and the gradient
159 orientation. Samples are weighed by the gradient norm and accumulated
160 in a 3-D histogram @em h, which (up to normalization and clamping)
161 forms the SIFT descriptor of the region. An additional Gaussian
162 weighting function is applied to give less importance to gradients
163 farther away from the keypoint center. Orientations are quantized into
164 eight bins and the spatial coordinates into four each, as follows:
165 
166 @image html sift-descr-easy.png "The SIFT descriptor is a spatial histogram of the image gradient."
167 
168 SIFT descriptors are computed by either calling
169 ::vl_sift_calc_keypoint_descriptor or
170 ::vl_sift_calc_raw_descriptor. They accept as input a keypoint
171 frame, which specifies the descriptor center, its size, and its
172 orientation on the image plane. The following parameters influence the
173 descriptor calculation:
174 
175 - <b>magnification factor</b>. The descriptor size is determined by
176 multiplying the keypoint scale by this factor. It is set by
177 ::vl_sift_set_magnif.
178 - <b>Gaussian window size</b>. The descriptor support is determined by
179 a Gaussian window, which discounts gradient contributions farther away
180 from the descriptor center. The standard deviation of this window is
181 set by ::vl_sift_set_window_size and expressed in unit of bins.
182 
183 VLFeat SIFT descriptor uses the following convention. The @em y axis
184 points downwards and angles are measured clockwise (to be consistent
185 with the standard image convention). The 3-D histogram (consisting of
186 @f$ 8 \times 4 \times 4 = 128 @f$ bins) is stacked as a single
187 128-dimensional vector, where the fastest varying dimension is the
188 orientation and the slowest the @em y spatial coordinate. This is
189 illustrated by the following figure.
190 
191 @image html sift-conv-vlfeat.png "VLFeat conventions"
192 
193 @note Keypoints (frames) D. Lowe's SIFT implementation convention is
194 slightly different: The @em y axis points upwards and the angles are
195 measured counter-clockwise.
196 
197 @image html sift-conv.png "D. Lowes' SIFT implementation conventions"
198 
199 <table>
200  <caption>Summary of the parameters influencing the SIFT descriptor.</caption>
201  <tr style="font-weight:bold;">
202  <td>Parameter</td>
203  <td>See also</td>
204  <td>Controlled by</td>
205  <td>Comment</td>
206  </tr>
207  <tr>
208  <td>magnification factor</td>
209  <td> @ref sift-intro-descriptor </td>
210  <td>::vl_sift_set_magnif</td>
211  <td>increase this value to enlarge the image region described</td>
212  </tr>
213  <tr>
214  <td>Gaussian window size</td>
215  <td> @ref sift-intro-descriptor </td>
216  <td>::vl_sift_set_window_size</td>
217  <td>smaller values let the center of the descriptor count more</td>
218  </tr>
219 </table>
220 
221 
222 <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~  -->
223 @section sift-intro-extensions Extensions
224 <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~  -->
225 
226 <b>Eliminating low-contrast descriptors.</b> Near-uniform patches do
227 not yield stable keypoints or descriptors. ::vl_sift_set_norm_thresh()
228 can be used to set a threshold on the average norm of the local
229 gradient to zero-out descriptors that correspond to very low contrast
230 regions. By default, the threshold is equal to zero, which means that
231 no descriptor is zeroed. Normally this option is useful only with
232 custom keypoints, as detected keypoints are implicitly selected at
233 high contrast image regions.
234 
235 <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~  -->
236 @section sift-usage Using the SIFT filter object
237 <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~  -->
238 
239 The code provided in this module can be used in different ways.  You
240 can instantiate and use a <b>SIFT filter</b> to extract both SIFT
241 keypoints and descriptors from one or multiple images. Alternatively,
242 you can use one of the low level functions to run only a part of the
243 SIFT algorithm (for instance, to compute the SIFT descriptors of
244 custom keypoints).
245 
246 To use a <b>SIFT filter</b> object:
247 
248 - Initialize a SIFT filter object with ::vl_sift_new(). The filter can
249   be reused for multiple images of the same size (e.g. for an entire
250   video sequence).
251 - For each octave in the scale space:
252   - Compute the next octave of the DOG scale space using either
253    ::vl_sift_process_first_octave() or ::vl_sift_process_next_octave()
254    (stop processing if ::VL_ERR_EOF is returned).
255   - Run the SIFT detector with ::vl_sift_detect() to get the keypoints.
256   - For each keypoint:
257     - Use ::vl_sift_calc_keypoint_orientations() to get the keypoint orientation(s).
258     - For each orientation:
259       - Use ::vl_sift_calc_keypoint_descriptor() to get the keypoint descriptor.
260 - Delete the SIFT filter by ::vl_sift_delete().
261 
262 To compute SIFT descriptors of custom keypoints, use
263 ::vl_sift_calc_raw_descriptor().
264 
265 <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~  -->
266 @section sift-tech Technical details
267 <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~  -->
268 
269 <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~  -->
270 @subsection sift-tech-ss Scale space
271 <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~  -->
272 
273 In order to search for image blobs at multiple scale, the SIFT
274 detector construct a scale space, defined as follows. Let
275 @f$I_0(\mathbf{x})@f$ denote an idealized <em>infinite resolution</em>
276 image. Consider the  <em>Gaussian kernel</em>
277 
278 @f[
279  g_{\sigma}(\mathbf{x})
280  =
281  \frac{1}{2\pi\sigma^2}
282  \exp
283  \left(
284  -\frac{1}{2}
285  \frac{\mathbf{x}^\top\mathbf{x}}{\sigma^2}
286  \right)
287 @f]
288 
289 The <b>Gaussian scale space</b> is the collection of smoothed images
290 
291 @f[
292  I_\sigma = g_\sigma * I,  \quad \sigma \geq 0.
293 @f]
294 
295 The image at infinite resolution @f$ I_0 @f$ is useful conceptually,
296 but is not available to us; instead, the input image @f$ I_{\sigma_n}
297 @f$ is assumed to be pre-smoothed at a nominal level @f$ \sigma_n =
298 0.5 @f$ to account for the finite resolution of the pixels. Thus in
299 practice the scale space is computed by
300 
301 @f[
302 I_\sigma = g_{\sqrt{\sigma^2 - \sigma_n^2}} * I_{\sigma_n},
303 \quad \sigma \geq \sigma_n.
304 @f]
305 
306 Scales are sampled at logarithmic steps given by
307 
308 @f[
309 \sigma = \sigma_0 2^{o+s/S},
310 \quad s = 0,\dots,S-1,
311 \quad o = o_{\min}, \dots, o_{\min}+O-1,
312 @f]
313 
314 where @f$ \sigma_0 = 1.6 @f$ is the <em>base scale</em>, @f$ o_{\min}
315 @f$ is the <em>first octave index</em>, @em O the <em>number of
316 octaves</em> and @em S the <em>number of scales per octave</em>.
317 
318 Blobs are detected as local extrema of the <b>Difference of
319 Gaussians</b> (DoG) scale space, obtained by subtracting successive
320 scales of the Gaussian scale space:
321 
322 @f[
323 \mathrm{DoG}_{\sigma(o,s)} = I_{\sigma(o,s+1)} - I_{\sigma(o,s)}
324 @f]
325 
326 At each next octave, the resolution of the images is halved to save
327 computations. The images composing the Gaussian and DoG scale space
328 can then be arranged as in the following figure:
329 
330 @image html sift-ss.png  "GSS and DoG scale space  structures."
331 
332 The black vertical segments represent images of the Gaussian Scale
333 Space (GSS), arranged by increasing scale @f$\sigma@f$.  Notice that
334 the scale level index @e s varies in a slightly redundant set
335 
336 @f[
337 s = -1, \dots, S+2
338 @f]
339 
340 This simplifies glueing together different octaves and extracting DoG
341 maxima (required by the SIFT detector).
342 
343 <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~  -->
344 @subsection sift-tech-detector Detector
345 <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~  -->
346 
347 The SIFT frames (keypoints) are extracted based on local extrema
348 (peaks) of the DoG scale space. Numerically, local extrema are
349 elements whose @f$ 3 \times 3 \times 3 @f$ neighbors (in space and
350 scale) have all smaller (or larger) value.  Once extracted, local
351 extrema are quadratically interpolated (this is very important
352 especially at the lower resolution scales in order to have accurate
353 keypoint localization at the full resolution).  Finally, they are
354 filtered to eliminate low-contrast responses or responses close to
355 edges and the orientation(s) are assigned, as explained next.
356 
357 <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~  -->
358 @subsubsection sift-tech-detector-peak Eliminating low contrast responses
359 <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~  -->
360 
361 Peaks which are too short may have been generated by noise and are
362 discarded.  This is done by comparing the absolute value of the DoG
363 scale space at the peak with the <b>peak threshold</b> @f$t_p@f$ and
364 discarding the peak its value is below the threshold.
365 
366 <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~  -->
367 @subsubsection sift-tech-detector-edge Eliminating edge responses
368 <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~  -->
369 
370 Peaks which are too flat are often generated by edges and do not yield
371 stable features. These peaks are detected and removed as follows.
372 Given a peak @f$x,y,\sigma@f$, the algorithm evaluates the @em x,@em y
373 Hessian of of the DoG scale space at the scale @f$\sigma@f$.  Then the
374 following score (similar to the Harris function) is computed:
375 
376 @f[
377 \frac{(\mathrm{tr}\,D(x,y,\sigma))^2}{\det D(x,y,\sigma)},
378 \quad
379 D =
380 \left[
381 \begin{array}{cc}
382 \frac{\partial^2 \mathrm{DoG}}{\partial x^2} &
383 \frac{\partial^2 \mathrm{DoG}}{\partial x\partial y} \\
384 \frac{\partial^2 \mathrm{DoG}}{\partial x\partial y} &
385 \frac{\partial^2 \mathrm{DoG}}{\partial y^2}
386 \end{array}
387 \right].
388  @f]
389 
390 This score has a minimum (equal to 4) when both eigenvalues of the
391 Jacobian are equal (curved peak) and increases as one of the
392 eigenvalues grows and the other stays small. Peaks are retained if the
393 score is below the quantity @f$(t_e+1)(t_e+1)/t_e@f$, where @f$t_e@f$
394 is the <b>edge threshold</b>. Notice that this quantity has a minimum
395 equal to 4 when @f$t_e=1@f$ and grows thereafter. Therefore the range
396 of the edge threshold is @f$[1,\infty)@f$.
397 
398 <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~  -->
399 @subsection sift-tech-detector-orientation Orientation assignment
400 <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~  -->
401 
402 A peak in the DoG scale space fixes 2 parameters of the keypoint: the
403 position and scale. It remains to choose an orientation. In order to
404 do this, SIFT computes an histogram of the gradient orientations in a
405 Gaussian window with a standard deviation which is 1.5 times bigger
406 than the scale @f$\sigma@f$ of the keypoint.
407 
408 @image html sift-orient.png
409 
410 This histogram is then smoothed and the maximum is selected.  In
411 addition to the biggest mode, up to other three modes whose amplitude
412 is within the 80% of the biggest mode are retained and returned as
413 additional orientations.
414 
415 <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~  -->
416 @subsection sift-tech-descriptor Descriptor
417 <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~  -->
418 
419 A SIFT descriptor of a local region (keypoint) is a 3-D spatial
420 histogram of the image gradients.  The gradient at each pixel is
421 regarded as a sample of a three-dimensional elementary feature vector,
422 formed by the pixel location and the gradient orientation. Samples are
423 weighed by the gradient norm and accumulated in a 3-D histogram @em h,
424 which (up to normalization and clamping) forms the SIFT descriptor of
425 the region. An additional Gaussian weighting function is applied to
426 give less importance to gradients farther away from the keypoint
427 center.
428 
429 <!-- @image html sift-bins.png -->
430 
431 <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~  -->
432 @subsubsection sift-tech-descriptor-can Construction in the canonical frame
433 <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~  -->
434 
435 Denote the gradient vector field computed at the scale @f$ \sigma @f$ by
436 @f[
437   J(x,y) = \nabla I_\sigma(x,y)
438   =
439   \left[\begin{array}{cc}
440   \frac{\partial I_\sigma}{\partial x} &
441   \frac{\partial I_\sigma}{\partial y} &
442   \end{array}\right]
443 @f]
444 
445 The descriptor is a 3-D spatial histogram capturing the distribution
446 of @f$ J(x,y) @f$. It is convenient to describe its construction in
447 the <em>canonical frame</em>. In this frame, the image and descriptor
448 axes coincide and each spatial bin has side 1. The histogram has @f$
449 N_\theta \times N_x \times N_y @f$ bins (usually @f$ 8 \times 4 \times
450 4 @f$), as in the following figure:
451 
452 @image html sift-can.png Canonical SIFT descriptor and spatial binning functions
453 
454 Bins are indexed by a triplet of indexes <em>t, i, j</em> and their
455 centers are given by
456 
457 @f{eqnarray*}
458  \theta_t &=& \frac{2\pi}{N_\theta} t, \quad t = 0,\dots,N_{\theta}-1, \\
459  x_i &=& i - \frac{N_x -1}{2}, \quad i = 0,\dots,N_x-1, \\
460  y_j &=& j - \frac{N_x -1}{2}, \quad j = 0,\dots,N_y-1. \\
461 @f}
462 
463 The histogram is computed by using trilinear interpolation, i.e.  by
464 weighing contributions by the <em>binning functions</em>
465 
466 @f{eqnarray*}
467   \displaystyle
468   w(z) &=& \mathrm{max}(0, 1 - |z|),
469   \\
470   \displaystyle
471   w_\mathrm{ang}(z) &=& \sum_{k=-\infty}^{+\infty}
472   w\left(
473   \frac{N_\theta}{2\pi} z + N_\theta k
474   \right).
475 @f}
476 
477 The gradient vector field is transformed in a three-dimensional
478 density map of weighed contributions
479 
480 @f[
481    f(\theta, x, y) = |J(x,y)| \delta(\theta - \angle J(x,y))
482 @f]
483 
484 The historam is localized in the keypoint support by a Gaussian window
485 of standard deviation @f$ \sigma_{\mathrm{win}} @f$. The histogram is
486 then given by
487 
488 @f{eqnarray*}
489  h(t,i,j) &=& \int
490  g_{\sigma_\mathrm{win}}(x,y)
491  w_\mathrm{ang}(\theta - \theta_t) w(x-x_i) w(y-y_j)
492  f(\theta,x,y)
493  d\theta\,dx\,dy
494 \\
495 &=& \int
496  g_{\sigma_\mathrm{win}}(x,y)
497  w_\mathrm{ang}(\angle J(x,y) - \theta_t) w(x-x_i) w(y-y_j)
498  |J(x,y)|\,dx\,dy
499 @f}
500 
501 In post processing, the histogram is @f$ l^2 @f$ normalized, then
502 clamped at 0.2, and @f$ l^2 @f$ normalized again.
503 
504 <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~  -->
505 @subsubsection sift-tech-descriptor-image Calculation in the image frame
506 <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~  -->
507 
508 Invariance to similarity transformation is attained by attaching
509 descriptors to SIFT keypoints (or other similarity-covariant frames).
510 Then projecting the image in the canonical descriptor frames has the
511 effect of undoing the image deformation.
512 
513 In practice, however, it is convenient to compute the descriptor
514 directly in the image frame. To do this, denote with a hat quantities
515 relative to the canonical frame and without a hat quantities relative
516 to the image frame (so for instance @f$ \hat x @f$ is the @e
517 x-coordinate in the canonical frame and @f$ x @f$ the x-coordinate in
518 the image frame). Assume that canonical and image frame are
519 related by an affinity:
520 
521 @f[
522   \mathbf{x} = A \hat{\mathbf{x}} + T,
523   \qquad
524   \mathbf{x} =
525   \begin{bmatrix}{c}
526     x \\
527     y
528   \end{bmatrix},
529   \quad
530   \mathbf{x} =
531   \begin{bmatrix}{c}
532     \hat x \\
533     \hat y
534   \end{bmatrix}.
535 @f]
536 
537 @image html sift-image-frame.png
538 
539 Then all quantities can be computed in the image frame directly. For instance,
540 the image at infinite resolution in the two frames are related by
541 
542 @f[
543  \hat I_0(\hat{\mathbf{x}})  = I_0(\mathbf{x}),
544  \qquad
545  \mathbf{x} = A \hat{\mathbf{x}} + T.
546 @f]
547 
548 The canonized image at scale @f$ \hat \sigma @f$ is in relation with the scaled image
549 
550 @f[
551  \hat I_{\hat{\sigma}}(\hat{\mathbf{x}})  = I_{A\hat{\sigma}}(\mathbf{x}),
552  \qquad \mathbf{x} = A \hat{\mathbf{x}} + T
553 @f]
554 
555 where, by generalizing the previous definitions, we have
556 
557 @f[
558  I_{A\hat \sigma}(\mathbf{x}) = (g_{A\hat\sigma} * I_0)(\mathbf{x}),
559 \quad
560  g_{A\hat\sigma}(\mathbf{x})
561  =
562  \frac{1}{2\pi|A|\hat \sigma^2}
563  \exp
564  \left(
565  -\frac{1}{2}
566  \frac{\mathbf{x}^\top A^{-\top}A^{-1}\mathbf{x}}{\hat \sigma^2}
567  \right)
568 @f]
569 
570 Deriving shows that the gradient fields are in relation
571 
572 @f[
573   \hat J(\hat{\mathbf{x}}) = J(\mathbf{x}) A,
574  \quad J(\mathbf{x}) = (\nabla I_{A\hat\sigma})(\mathbf{x}),
575  \qquad \mathbf{x} = A \hat{\mathbf{x}} + T.
576 @f]
577 
578 Therefore we can compute the descriptor either in the image or canonical frame as:
579 
580 @f{eqnarray*}
581  h(t,i,j)
582  &=&
583  \int
584  g_{\hat \sigma_\mathrm{win}}(\hat{\mathbf{x}})\,
585  w_\mathrm{ang}(\angle \hat J(\hat{\mathbf{x}}) - \theta_t)\,
586  w_{ij}(\hat{\mathbf{x}})\,
587  |\hat J(\hat{\mathbf{x}})|\,
588  d\hat{\mathbf{x}}
589  \\
590  &=& \int
591  g_{A \hat \sigma_\mathrm{win}}(\mathbf{x} - T)\,
592  w_\mathrm{ang}(\angle J(\mathbf{x})A - \theta_t)\,
593  w_{ij}(A^{-1}(\mathbf{x} - T))\,
594  |J(\mathbf{x})A|\,
595  d\mathbf{x}.
596 @f}
597 
598 where we defined the product of the two spatial binning functions
599 
600 @f[
601  w_{ij}(\hat{\mathbf{x}}) = w(\hat x - \hat x_i) w(\hat y - \hat y_j)
602 @f]
603 
604 
605 In the actual implementation, this integral is computed by visiting a
606 rectangular area of the image that fully contains the keypoint grid
607 (along with half a bin border to fully include the bin windowing
608 function). Since the descriptor can be rotated, this area is a
609 rectangle of sides @f$m/2\sqrt{2} (N_x+1,N_y+1)@f$ (see also the
610 illustration).
611 
612 <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~  -->
613 @subsubsection sift-tech-descriptor-std Standard SIFT descriptor
614 <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~  -->
615 
616 For a SIFT-detected keypoint of center @f$ T @f$, scale @f$ \sigma @f$
617 and orientation @f$ \theta @f$, the affine transformation @f$ (A,T)
618 @f$ reduces to the similarity transformation
619 
620 @f[
621      \mathbf{x} = m \sigma R(\theta) \hat{\mathbf{x}} + T
622 @f]
623 
624 where @f$ R(\theta) @f$ is a counter-clockwise rotation of @f$ \theta
625 @f$ radians, @f$ m \mathcal{\sigma} @f$ is the size of a descriptor
626 bin in pixels, and @e m is the <b>descriptor magnification factor</b>
627 which expresses how much larger a descriptor bin is compared to
628 the scale of the keypoint @f$ \sigma @f$
629 (the default value is @e m = 3). Moreover, the
630 standard SIFT descriptor computes the image gradient at the scale of
631 the keypoints, which in the canonical frame is equivalent to a
632 smoothing of @f$ \hat \sigma = 1/m @f$. Finally, the default
633 Gaussian window size is set to have standard deviation
634  @f$ \hat \sigma_\mathrm{win} = 2 @f$. This yields the formula
635 
636 @f{eqnarray*}
637  h(t,i,j)
638  &=&
639  m \sigma \int
640  g_{\sigma_\mathrm{win}}(\mathbf{x} - T)\,
641  w_\mathrm{ang}(\angle J(\mathbf{x}) - \theta  - \theta_t)\,
642  w_{ij}\left(\frac{R(\theta)^\top \mathbf{x} - T}{m\sigma}\right)\,
643  |J(\mathbf{x})|\,
644  d\mathbf{x},
645 \\
646 \sigma_{\mathrm{win}} &=& m\sigma\hat \sigma_{\mathrm{win}},
647 \\
648  J(\mathbf{x})
649  &=& \nabla (g_{m \sigma \hat \sigma} * I)(\mathbf{x})
650  = \nabla (g_{\sigma} * I)(\mathbf{x})
651  = \nabla I_{\sigma} (\mathbf{x}).
652 @f}
653 
654 
655 
656 **/
657 
658 #include "sift.h"
659 #include "imopv.h"
660 #include "mathop.h"
661 
662 #include <assert.h>
663 #include <stdlib.h>
664 #include <string.h>
665 #include <math.h>
666 #include <stdio.h>
667 
668 /** @internal @brief Use bilinear interpolation to compute orientations */
669 #define VL_SIFT_BILINEAR_ORIENTATIONS 1
670 
671 #define EXPN_SZ  256          /**< ::fast_expn table size @internal */
672 #define EXPN_MAX 25.0         /**< ::fast_expn table max  @internal */
673 double expn_tab [EXPN_SZ+1] ; /**< ::fast_expn table      @internal */
674 
675 #define NBO 8
676 #define NBP 4
677 
678 #define log2(x) (log(x)/VL_LOG_OF_2)
679 
680 /** ------------------------------------------------------------------
681  ** @internal
682  ** @brief Fast @f$exp(-x)@f$ approximation
683  **
684  ** @param x argument.
685  **
686  ** The argument must be in the range [0, ::EXPN_MAX] .
687  **
688  ** @return approximation of @f$exp(-x)@f$.
689  **/
690 
691 VL_INLINE double
fast_expn(double x)692 fast_expn (double x)
693 {
694   double a,b,r ;
695   int i ;
696   /*assert(0 <= x && x <= EXPN_MAX) ;*/
697 
698   if (x > EXPN_MAX) return 0.0 ;
699 
700   x *= EXPN_SZ / EXPN_MAX ;
701   i = (int)vl_floor_d (x) ;
702   r = x - i ;
703   a = expn_tab [i    ] ;
704   b = expn_tab [i + 1] ;
705   return a + r * (b - a) ;
706 }
707 
708 /** ------------------------------------------------------------------
709  ** @internal
710  ** @brief Initialize tables for ::fast_expn
711  **/
712 
713 VL_INLINE void
fast_expn_init()714 fast_expn_init ()
715 {
716   int k  ;
717   for(k = 0 ; k < EXPN_SZ + 1 ; ++ k) {
718     expn_tab [k] = exp (- (double) k * (EXPN_MAX / EXPN_SZ)) ;
719   }
720 }
721 
722 /** ------------------------------------------------------------------
723  ** @internal
724  ** @brief Copy image, upsample rows and take transpose
725  **
726  ** @param dst     output image buffer.
727  ** @param src     input image buffer.
728  ** @param width   input image width.
729  ** @param height  input image height.
730  **
731  ** The output image has dimensions @a height by 2 @a width (so the
732  ** destination buffer must be at least as big as two times the
733  ** input buffer).
734  **
735  ** Upsampling is performed by linear interpolation.
736  **/
737 
738 static void
copy_and_upsample_rows(vl_sift_pix * dst,vl_sift_pix const * src,int width,int height)739 copy_and_upsample_rows
740 (vl_sift_pix       *dst,
741  vl_sift_pix const *src, int width, int height)
742 {
743   int x, y ;
744   vl_sift_pix a, b ;
745 
746   for(y = 0 ; y < height ; ++y) {
747     b = a = *src++ ;
748     for(x = 0 ; x < width - 1 ; ++x) {
749       b = *src++ ;
750       *dst = a ;             dst += height ;
751       *dst = 0.5 * (a + b) ; dst += height ;
752       a = b ;
753     }
754     *dst = b ; dst += height ;
755     *dst = b ; dst += height ;
756     dst += 1 - width * 2 * height ;
757   }
758 }
759 
760 /** ------------------------------------------------------------------
761  ** @internal
762  ** @brief Smooth an image
763  ** @param self        SIFT filter.
764  ** @param outputImage output imgae buffer.
765  ** @param tempImage   temporary image buffer.
766  ** @param inputImage  input image buffer.
767  ** @param width       input image width.
768  ** @param height      input image height.
769  ** @param sigma       smoothing.
770  **/
771 
772 static void
_vl_sift_smooth(VlSiftFilt * self,vl_sift_pix * outputImage,vl_sift_pix * tempImage,vl_sift_pix const * inputImage,vl_size width,vl_size height,double sigma)773 _vl_sift_smooth (VlSiftFilt * self,
774                  vl_sift_pix * outputImage,
775                  vl_sift_pix * tempImage,
776                  vl_sift_pix const * inputImage,
777                  vl_size width,
778                  vl_size height,
779                  double sigma)
780 {
781   /* prepare Gaussian filter */
782   if (self->gaussFilterSigma != sigma) {
783     vl_uindex j ;
784     vl_sift_pix acc = 0 ;
785     if (self->gaussFilter) vl_free (self->gaussFilter) ;
786     self->gaussFilterWidth = VL_MAX(ceil(4.0 * sigma), 1) ;
787     self->gaussFilterSigma = sigma ;
788     self->gaussFilter = vl_malloc (sizeof(vl_sift_pix) * (2 * self->gaussFilterWidth + 1)) ;
789 
790     for (j = 0 ; j < 2 * self->gaussFilterWidth + 1 ; ++j) {
791       vl_sift_pix d = ((vl_sift_pix)((signed)j - (signed)self->gaussFilterWidth)) / ((vl_sift_pix)sigma) ;
792       self->gaussFilter[j] = (vl_sift_pix) exp (- 0.5 * (d*d)) ;
793       acc += self->gaussFilter[j] ;
794     }
795     for (j = 0 ; j < 2 * self->gaussFilterWidth + 1 ; ++j) {
796       self->gaussFilter[j] /= acc ;
797     }
798   }
799 
800   if (self->gaussFilterWidth == 0) {
801     memcpy (outputImage, inputImage, sizeof(vl_sift_pix) * width * height) ;
802     return ;
803   }
804 
805   vl_imconvcol_vf (tempImage, height,
806                    inputImage, width, height, width,
807                    self->gaussFilter,
808                    - self->gaussFilterWidth, self->gaussFilterWidth,
809                    1, VL_PAD_BY_CONTINUITY | VL_TRANSPOSE) ;
810 
811   vl_imconvcol_vf (outputImage, width,
812                    tempImage, height, width, height,
813                    self->gaussFilter,
814                    - self->gaussFilterWidth, self->gaussFilterWidth,
815                    1, VL_PAD_BY_CONTINUITY | VL_TRANSPOSE) ;
816 }
817 
818 /** ------------------------------------------------------------------
819  ** @internal
820  ** @brief Copy and downsample an image
821  **
822  ** @param dst    output imgae buffer.
823  ** @param src    input  image buffer.
824  ** @param width  input  image width.
825  ** @param height input  image height.
826  ** @param d      octaves (non negative).
827  **
828  ** The function downsamples the image @a d times, reducing it to @c
829  ** 1/2^d of its original size. The parameters @a width and @a height
830  ** are the size of the input image. The destination image @a dst is
831  ** assumed to be <code>floor(width/2^d)</code> pixels wide and
832  ** <code>floor(height/2^d)</code> pixels high.
833  **/
834 
835 static void
copy_and_downsample(vl_sift_pix * dst,vl_sift_pix const * src,int width,int height,int d)836 copy_and_downsample
837 (vl_sift_pix       *dst,
838  vl_sift_pix const *src,
839  int width, int height, int d)
840 {
841   int x, y ;
842 
843   d = 1 << d ; /* d = 2^d */
844   for(y = 0 ; y < height ; y+=d) {
845     vl_sift_pix const * srcrowp = src + y * width ;
846     for(x = 0 ; x < width - (d-1) ; x+=d) {
847       *dst++ = *srcrowp ;
848       srcrowp += d ;
849     }
850   }
851 }
852 
853 /** ------------------------------------------------------------------
854  ** @brief Create a new SIFT filter
855  **
856  ** @param width    image width.
857  ** @param height   image height.
858  ** @param noctaves number of octaves.
859  ** @param nlevels  number of levels per octave.
860  ** @param o_min    first octave index.
861  **
862  ** The function allocates and returns a new SIFT filter for the
863  ** specified image and scale space geometry.
864  **
865  ** Setting @a O to a negative value sets the number of octaves to the
866  ** maximum possible value depending on the size of the image.
867  **
868  ** @return the new SIFT filter.
869  ** @sa ::vl_sift_delete().
870  **/
871 
872 VL_EXPORT
873 VlSiftFilt *
vl_sift_new(int width,int height,int noctaves,int nlevels,int o_min)874 vl_sift_new (int width, int height,
875              int noctaves, int nlevels,
876              int o_min)
877 {
878   VlSiftFilt *f = vl_malloc (sizeof(VlSiftFilt)) ;
879 
880   int w   = VL_SHIFT_LEFT (width,  -o_min) ;
881   int h   = VL_SHIFT_LEFT (height, -o_min) ;
882   int nel = w * h ;
883 
884   /* negative value O => calculate max. value */
885   if (noctaves < 0) {
886     noctaves = VL_MAX (floor (log2 (VL_MIN(width, height))) - o_min - 3, 1) ;
887   }
888 
889   f-> width   = width ;
890   f-> height  = height ;
891   f-> O       = noctaves ;
892   f-> S       = nlevels ;
893   f-> o_min   = o_min ;
894   f-> s_min   = -1 ;
895   f-> s_max   = nlevels + 1 ;
896   f-> o_cur   = o_min ;
897 
898   f-> temp    = vl_malloc (sizeof(vl_sift_pix) * nel    ) ;
899   f-> octave  = vl_malloc (sizeof(vl_sift_pix) * nel
900                         * (f->s_max - f->s_min + 1)  ) ;
901   f-> dog     = vl_malloc (sizeof(vl_sift_pix) * nel
902                         * (f->s_max - f->s_min    )  ) ;
903   f-> grad    = vl_malloc (sizeof(vl_sift_pix) * nel * 2
904                         * (f->s_max - f->s_min    )  ) ;
905 
906   f-> sigman  = 0.5 ;
907   f-> sigmak  = pow (2.0, 1.0 / nlevels) ;
908   f-> sigma0  = 1.6 * f->sigmak ;
909   f-> dsigma0 = f->sigma0 * sqrt (1.0 - 1.0 / (f->sigmak*f->sigmak)) ;
910 
911   f-> gaussFilter = NULL ;
912   f-> gaussFilterSigma = 0 ;
913   f-> gaussFilterWidth = 0 ;
914 
915   f-> octave_width  = 0 ;
916   f-> octave_height = 0 ;
917 
918   f-> keys     = 0 ;
919   f-> nkeys    = 0 ;
920   f-> keys_res = 0 ;
921 
922   f-> peak_thresh = 0.0 ;
923   f-> edge_thresh = 10.0 ;
924   f-> norm_thresh = 0.0 ;
925   f-> magnif      = 3.0 ;
926   f-> windowSize  = NBP / 2 ;
927 
928   f-> grad_o  = o_min - 1 ;
929 
930   /* initialize fast_expn stuff */
931   fast_expn_init () ;
932 
933   return f ;
934 }
935 
936 /** -------------------------------------------------------------------
937  ** @brief Delete SIFT filter
938  **
939  ** @param f SIFT filter to delete.
940  **
941  ** The function frees the resources allocated by ::vl_sift_new().
942  **/
943 
944 VL_EXPORT
945 void
vl_sift_delete(VlSiftFilt * f)946 vl_sift_delete (VlSiftFilt* f)
947 {
948   if (f) {
949     if (f->keys) vl_free (f->keys) ;
950     if (f->grad) vl_free (f->grad) ;
951     if (f->dog) vl_free (f->dog) ;
952     if (f->octave) vl_free (f->octave) ;
953     if (f->temp) vl_free (f->temp) ;
954     if (f->gaussFilter) vl_free (f->gaussFilter) ;
955     vl_free (f) ;
956   }
957 }
958 
959 /** ------------------------------------------------------------------
960  ** @brief Start processing a new image
961  **
962  ** @param f  SIFT filter.
963  ** @param im image data.
964  **
965  ** The function starts processing a new image by computing its
966  ** Gaussian scale space at the lower octave. It also empties the
967  ** internal keypoint buffer.
968  **
969  ** @return error code. The function returns ::VL_ERR_EOF if there are
970  ** no more octaves to process.
971  **
972  ** @sa ::vl_sift_process_next_octave().
973  **/
974 
975 VL_EXPORT
976 int
vl_sift_process_first_octave(VlSiftFilt * f,vl_sift_pix const * im)977 vl_sift_process_first_octave (VlSiftFilt *f, vl_sift_pix const *im)
978 {
979   int o, s, h, w ;
980   double sa, sb ;
981   vl_sift_pix *octave ;
982 
983   /* shortcuts */
984   vl_sift_pix *temp   = f-> temp ;
985   int width           = f-> width ;
986   int height          = f-> height ;
987   int o_min           = f-> o_min ;
988   int s_min           = f-> s_min ;
989   int s_max           = f-> s_max ;
990   double sigma0       = f-> sigma0 ;
991   double sigmak       = f-> sigmak ;
992   double sigman       = f-> sigman ;
993   double dsigma0      = f-> dsigma0 ;
994 
995   /* restart from the first */
996   f->o_cur = o_min ;
997   f->nkeys = 0 ;
998   w = f-> octave_width  = VL_SHIFT_LEFT(f->width,  - f->o_cur) ;
999   h = f-> octave_height = VL_SHIFT_LEFT(f->height, - f->o_cur) ;
1000 
1001   /* is there at least one octave? */
1002   if (f->O == 0)
1003     return VL_ERR_EOF ;
1004 
1005   /* ------------------------------------------------------------------
1006    *                     Compute the first sublevel of the first octave
1007    * --------------------------------------------------------------- */
1008 
1009   /*
1010    * If the first octave has negative index, we upscale the image; if
1011    * the first octave has positive index, we downscale the image; if
1012    * the first octave has index zero, we just copy the image.
1013    */
1014 
1015   octave = vl_sift_get_octave (f, s_min) ;
1016 
1017   if (o_min < 0) {
1018     /* double once */
1019     copy_and_upsample_rows (temp,   im,   width,      height) ;
1020     copy_and_upsample_rows (octave, temp, height, 2 * width ) ;
1021 
1022     /* double more */
1023     for(o = -1 ; o > o_min ; --o) {
1024       copy_and_upsample_rows (temp, octave,
1025                               width << -o,      height << -o ) ;
1026       copy_and_upsample_rows (octave, temp,
1027                               width << -o, 2 * (height << -o)) ;
1028     }
1029   }
1030   else if (o_min > 0) {
1031     /* downsample */
1032     copy_and_downsample (octave, im, width, height, o_min) ;
1033   }
1034   else {
1035     /* direct copy */
1036     memcpy(octave, im, sizeof(vl_sift_pix) * width * height) ;
1037   }
1038 
1039   /*
1040    * Here we adjust the smoothing of the first level of the octave.
1041    * The input image is assumed to have nominal smoothing equal to
1042    * f->simgan.
1043    */
1044 
1045   sa = sigma0 * pow (sigmak,   s_min) ;
1046   sb = sigman * pow (2.0,    - o_min) ;
1047 
1048   if (sa > sb) {
1049     double sd = sqrt (sa*sa - sb*sb) ;
1050     _vl_sift_smooth (f, octave, temp, octave, w, h, sd) ;
1051   }
1052 
1053   /* -----------------------------------------------------------------
1054    *                                          Compute the first octave
1055    * -------------------------------------------------------------- */
1056 
1057   for(s = s_min + 1 ; s <= s_max ; ++s) {
1058     double sd = dsigma0 * pow (sigmak, s) ;
1059     _vl_sift_smooth (f, vl_sift_get_octave(f, s), temp,
1060                      vl_sift_get_octave(f, s - 1), w, h, sd) ;
1061   }
1062 
1063   return VL_ERR_OK ;
1064 }
1065 
1066 /** ------------------------------------------------------------------
1067  ** @brief Process next octave
1068  **
1069  ** @param f SIFT filter.
1070  **
1071  ** The function computes the next octave of the Gaussian scale space.
1072  ** Notice that this clears the record of any feature detected in the
1073  ** previous octave.
1074  **
1075  ** @return error code. The function returns the error
1076  ** ::VL_ERR_EOF when there are no more octaves to process.
1077  **
1078  ** @sa ::vl_sift_process_first_octave().
1079  **/
1080 
1081 VL_EXPORT
1082 int
vl_sift_process_next_octave(VlSiftFilt * f)1083 vl_sift_process_next_octave (VlSiftFilt *f)
1084 {
1085 
1086   int s, h, w, s_best ;
1087   double sa, sb ;
1088   vl_sift_pix *octave, *pt ;
1089 
1090   /* shortcuts */
1091   vl_sift_pix *temp   = f-> temp ;
1092   int O               = f-> O ;
1093   int S               = f-> S ;
1094   int o_min           = f-> o_min ;
1095   int s_min           = f-> s_min ;
1096   int s_max           = f-> s_max ;
1097   double sigma0       = f-> sigma0 ;
1098   double sigmak       = f-> sigmak ;
1099   double dsigma0      = f-> dsigma0 ;
1100 
1101   /* is there another octave ? */
1102   if (f->o_cur == o_min + O - 1)
1103     return VL_ERR_EOF ;
1104 
1105   /* retrieve base */
1106   s_best = VL_MIN(s_min + S, s_max) ;
1107   w      = vl_sift_get_octave_width  (f) ;
1108   h      = vl_sift_get_octave_height (f) ;
1109   pt     = vl_sift_get_octave        (f, s_best) ;
1110   octave = vl_sift_get_octave        (f, s_min) ;
1111 
1112   /* next octave */
1113   copy_and_downsample (octave, pt, w, h, 1) ;
1114 
1115   f-> o_cur            += 1 ;
1116   f-> nkeys             = 0 ;
1117   w = f-> octave_width  = VL_SHIFT_LEFT(f->width,  - f->o_cur) ;
1118   h = f-> octave_height = VL_SHIFT_LEFT(f->height, - f->o_cur) ;
1119 
1120   sa = sigma0 * powf (sigmak, s_min     ) ;
1121   sb = sigma0 * powf (sigmak, s_best - S) ;
1122 
1123   if (sa > sb) {
1124     double sd = sqrt (sa*sa - sb*sb) ;
1125     _vl_sift_smooth (f, octave, temp, octave, w, h, sd) ;
1126   }
1127 
1128   /* ------------------------------------------------------------------
1129    *                                                        Fill octave
1130    * --------------------------------------------------------------- */
1131 
1132   for(s = s_min + 1 ; s <= s_max ; ++s) {
1133     double sd = dsigma0 * pow (sigmak, s) ;
1134     _vl_sift_smooth (f, vl_sift_get_octave(f, s), temp,
1135                      vl_sift_get_octave(f, s - 1), w, h, sd) ;
1136   }
1137 
1138   return VL_ERR_OK ;
1139 }
1140 
1141 /** ------------------------------------------------------------------
1142  ** @brief Detect keypoints
1143  **
1144  ** The function detect keypoints in the current octave filling the
1145  ** internal keypoint buffer. Keypoints can be retrieved by
1146  ** ::vl_sift_get_keypoints().
1147  **
1148  ** @param f SIFT filter.
1149  **/
1150 
1151 VL_EXPORT
1152 void
vl_sift_detect(VlSiftFilt * f)1153 vl_sift_detect (VlSiftFilt * f)
1154 {
1155   vl_sift_pix* dog   = f-> dog ;
1156   int          s_min = f-> s_min ;
1157   int          s_max = f-> s_max ;
1158   int          w     = f-> octave_width ;
1159   int          h     = f-> octave_height ;
1160   double       te    = f-> edge_thresh ;
1161   double       tp    = f-> peak_thresh ;
1162 
1163   int const    xo    = 1 ;      /* x-stride */
1164   int const    yo    = w ;      /* y-stride */
1165   int const    so    = w * h ;  /* s-stride */
1166 
1167   double       xper  = pow (2.0, f->o_cur) ;
1168 
1169   int x, y, s, i, ii, jj ;
1170   vl_sift_pix *pt, v ;
1171   VlSiftKeypoint *k ;
1172 
1173   /* clear current list */
1174   f-> nkeys = 0 ;
1175 
1176   /* compute difference of gaussian (DoG) */
1177   pt = f-> dog ;
1178   for (s = s_min ; s <= s_max - 1 ; ++s) {
1179     vl_sift_pix* src_a = vl_sift_get_octave (f, s    ) ;
1180     vl_sift_pix* src_b = vl_sift_get_octave (f, s + 1) ;
1181     vl_sift_pix* end_a = src_a + w * h ;
1182     while (src_a != end_a) {
1183       *pt++ = *src_b++ - *src_a++ ;
1184     }
1185   }
1186 
1187   /* -----------------------------------------------------------------
1188    *                                          Find local maxima of DoG
1189    * -------------------------------------------------------------- */
1190 
1191   /* start from dog [1,1,s_min+1] */
1192   pt  = dog + xo + yo + so ;
1193 
1194   for(s = s_min + 1 ; s <= s_max - 2 ; ++s) {
1195     for(y = 1 ; y < h - 1 ; ++y) {
1196       for(x = 1 ; x < w - 1 ; ++x) {
1197         v = *pt ;
1198 
1199 #define CHECK_NEIGHBORS(CMP,SGN)                    \
1200         ( v CMP ## = SGN 0.8 * tp &&                \
1201           v CMP *(pt + xo) &&                       \
1202           v CMP *(pt - xo) &&                       \
1203           v CMP *(pt + so) &&                       \
1204           v CMP *(pt - so) &&                       \
1205           v CMP *(pt + yo) &&                       \
1206           v CMP *(pt - yo) &&                       \
1207                                                     \
1208           v CMP *(pt + yo + xo) &&                  \
1209           v CMP *(pt + yo - xo) &&                  \
1210           v CMP *(pt - yo + xo) &&                  \
1211           v CMP *(pt - yo - xo) &&                  \
1212                                                     \
1213           v CMP *(pt + xo      + so) &&             \
1214           v CMP *(pt - xo      + so) &&             \
1215           v CMP *(pt + yo      + so) &&             \
1216           v CMP *(pt - yo      + so) &&             \
1217           v CMP *(pt + yo + xo + so) &&             \
1218           v CMP *(pt + yo - xo + so) &&             \
1219           v CMP *(pt - yo + xo + so) &&             \
1220           v CMP *(pt - yo - xo + so) &&             \
1221                                                     \
1222           v CMP *(pt + xo      - so) &&             \
1223           v CMP *(pt - xo      - so) &&             \
1224           v CMP *(pt + yo      - so) &&             \
1225           v CMP *(pt - yo      - so) &&             \
1226           v CMP *(pt + yo + xo - so) &&             \
1227           v CMP *(pt + yo - xo - so) &&             \
1228           v CMP *(pt - yo + xo - so) &&             \
1229           v CMP *(pt - yo - xo - so) )
1230 
1231         if (CHECK_NEIGHBORS(>,+) ||
1232             CHECK_NEIGHBORS(<,-) ) {
1233 
1234           /* make room for more keypoints */
1235           if (f->nkeys >= f->keys_res) {
1236             f->keys_res += 500 ;
1237             if (f->keys) {
1238               f->keys = vl_realloc (f->keys,
1239                                     f->keys_res *
1240                                     sizeof(VlSiftKeypoint)) ;
1241             } else {
1242               f->keys = vl_malloc (f->keys_res *
1243                                    sizeof(VlSiftKeypoint)) ;
1244             }
1245           }
1246 
1247           k = f->keys + (f->nkeys ++) ;
1248 
1249           k-> ix = x ;
1250           k-> iy = y ;
1251           k-> is = s ;
1252         }
1253         pt += 1 ;
1254       }
1255       pt += 2 ;
1256     }
1257     pt += 2 * yo ;
1258   }
1259 
1260   /* -----------------------------------------------------------------
1261    *                                               Refine local maxima
1262    * -------------------------------------------------------------- */
1263 
1264   /* this pointer is used to write the keypoints back */
1265   k = f->keys ;
1266 
1267   for (i = 0 ; i < f->nkeys ; ++i) {
1268 
1269     int x = f-> keys [i] .ix ;
1270     int y = f-> keys [i] .iy ;
1271     int s = f-> keys [i]. is ;
1272 
1273     double Dx=0,Dy=0,Ds=0,Dxx=0,Dyy=0,Dss=0,Dxy=0,Dxs=0,Dys=0 ;
1274     double A [3*3], b [3] ;
1275 
1276     int dx = 0 ;
1277     int dy = 0 ;
1278 
1279     int iter, i, j ;
1280 
1281     for (iter = 0 ; iter < 5 ; ++iter) {
1282 
1283       x += dx ;
1284       y += dy ;
1285 
1286       pt = dog
1287         + xo * x
1288         + yo * y
1289         + so * (s - s_min) ;
1290 
1291       /** @brief Index GSS @internal */
1292 #define at(dx,dy,ds) (*( pt + (dx)*xo + (dy)*yo + (ds)*so))
1293 
1294       /** @brief Index matrix A @internal */
1295 #define Aat(i,j)     (A[(i)+(j)*3])
1296 
1297       /* compute the gradient */
1298       Dx = 0.5 * (at(+1,0,0) - at(-1,0,0)) ;
1299       Dy = 0.5 * (at(0,+1,0) - at(0,-1,0));
1300       Ds = 0.5 * (at(0,0,+1) - at(0,0,-1)) ;
1301 
1302       /* compute the Hessian */
1303       Dxx = (at(+1,0,0) + at(-1,0,0) - 2.0 * at(0,0,0)) ;
1304       Dyy = (at(0,+1,0) + at(0,-1,0) - 2.0 * at(0,0,0)) ;
1305       Dss = (at(0,0,+1) + at(0,0,-1) - 2.0 * at(0,0,0)) ;
1306 
1307       Dxy = 0.25 * ( at(+1,+1,0) + at(-1,-1,0) - at(-1,+1,0) - at(+1,-1,0) ) ;
1308       Dxs = 0.25 * ( at(+1,0,+1) + at(-1,0,-1) - at(-1,0,+1) - at(+1,0,-1) ) ;
1309       Dys = 0.25 * ( at(0,+1,+1) + at(0,-1,-1) - at(0,-1,+1) - at(0,+1,-1) ) ;
1310 
1311       /* solve linear system ....................................... */
1312       Aat(0,0) = Dxx ;
1313       Aat(1,1) = Dyy ;
1314       Aat(2,2) = Dss ;
1315       Aat(0,1) = Aat(1,0) = Dxy ;
1316       Aat(0,2) = Aat(2,0) = Dxs ;
1317       Aat(1,2) = Aat(2,1) = Dys ;
1318 
1319       b[0] = - Dx ;
1320       b[1] = - Dy ;
1321       b[2] = - Ds ;
1322 
1323       /* Gauss elimination */
1324       for(j = 0 ; j < 3 ; ++j) {
1325         double maxa    = 0 ;
1326         double maxabsa = 0 ;
1327         int    maxi    = -1 ;
1328         double tmp ;
1329 
1330         /* look for the maximally stable pivot */
1331         for (i = j ; i < 3 ; ++i) {
1332           double a    = Aat (i,j) ;
1333           double absa = vl_abs_d (a) ;
1334           if (absa > maxabsa) {
1335             maxa    = a ;
1336             maxabsa = absa ;
1337             maxi    = i ;
1338           }
1339         }
1340 
1341         /* if singular give up */
1342         if (maxabsa < 1e-10f) {
1343           b[0] = 0 ;
1344           b[1] = 0 ;
1345           b[2] = 0 ;
1346           break ;
1347         }
1348 
1349         i = maxi ;
1350 
1351         /* swap j-th row with i-th row and normalize j-th row */
1352         for(jj = j ; jj < 3 ; ++jj) {
1353           tmp = Aat(i,jj) ; Aat(i,jj) = Aat(j,jj) ; Aat(j,jj) = tmp ;
1354           Aat(j,jj) /= maxa ;
1355         }
1356         tmp = b[j] ; b[j] = b[i] ; b[i] = tmp ;
1357         b[j] /= maxa ;
1358 
1359         /* elimination */
1360         for (ii = j+1 ; ii < 3 ; ++ii) {
1361           double x = Aat(ii,j) ;
1362           for (jj = j ; jj < 3 ; ++jj) {
1363             Aat(ii,jj) -= x * Aat(j,jj) ;
1364           }
1365           b[ii] -= x * b[j] ;
1366         }
1367       }
1368 
1369       /* backward substitution */
1370       for (i = 2 ; i > 0 ; --i) {
1371         double x = b[i] ;
1372         for (ii = i-1 ; ii >= 0 ; --ii) {
1373           b[ii] -= x * Aat(ii,i) ;
1374         }
1375       }
1376 
1377       /* .......................................................... */
1378       /* If the translation of the keypoint is big, move the keypoint
1379        * and re-iterate the computation. Otherwise we are all set.
1380        */
1381 
1382       dx= ((b[0] >  0.6 && x < w - 2) ?  1 : 0)
1383         + ((b[0] < -0.6 && x > 1    ) ? -1 : 0) ;
1384 
1385       dy= ((b[1] >  0.6 && y < h - 2) ?  1 : 0)
1386         + ((b[1] < -0.6 && y > 1    ) ? -1 : 0) ;
1387 
1388       if (dx == 0 && dy == 0) break ;
1389     }
1390 
1391     /* check threshold and other conditions */
1392     {
1393       double val   = at(0,0,0)
1394         + 0.5 * (Dx * b[0] + Dy * b[1] + Ds * b[2]) ;
1395       double score = (Dxx+Dyy)*(Dxx+Dyy) / (Dxx*Dyy - Dxy*Dxy) ;
1396       double xn = x + b[0] ;
1397       double yn = y + b[1] ;
1398       double sn = s + b[2] ;
1399 
1400       vl_bool good =
1401         vl_abs_d (val)  > tp                  &&
1402         score           < (te+1)*(te+1)/te    &&
1403         score           >= 0                  &&
1404         vl_abs_d (b[0]) <  1.5                &&
1405         vl_abs_d (b[1]) <  1.5                &&
1406         vl_abs_d (b[2]) <  1.5                &&
1407         xn              >= 0                  &&
1408         xn              <= w - 1              &&
1409         yn              >= 0                  &&
1410         yn              <= h - 1              &&
1411         sn              >= s_min              &&
1412         sn              <= s_max ;
1413 
1414       if (good) {
1415         k-> o     = f->o_cur ;
1416         k-> ix    = x ;
1417         k-> iy    = y ;
1418         k-> is    = s ;
1419         k-> s     = sn ;
1420         k-> x     = xn * xper ;
1421         k-> y     = yn * xper ;
1422         k-> sigma = f->sigma0 * pow (2.0, sn/f->S) * xper ;
1423         ++ k ;
1424       }
1425 
1426     } /* done checking */
1427   } /* next keypoint to refine */
1428 
1429   /* update keypoint count */
1430   f-> nkeys = (int)(k - f->keys) ;
1431 }
1432 
1433 
1434 /** ------------------------------------------------------------------
1435  ** @internal
1436  ** @brief Update gradients to current GSS octave
1437  **
1438  ** @param f SIFT filter.
1439  **
1440  ** The function makes sure that the gradient buffer is up-to-date
1441  ** with the current GSS data.
1442  **
1443  ** @remark The minimum octave size is 2x2xS.
1444  **/
1445 
1446 static void
update_gradient(VlSiftFilt * f)1447 update_gradient (VlSiftFilt *f)
1448 {
1449   int       s_min = f->s_min ;
1450   int       s_max = f->s_max ;
1451   int       w     = vl_sift_get_octave_width  (f) ;
1452   int       h     = vl_sift_get_octave_height (f) ;
1453   int const xo    = 1 ;
1454   int const yo    = w ;
1455   int const so    = h * w ;
1456   int y, s ;
1457 
1458   if (f->grad_o == f->o_cur) return ;
1459 
1460   for (s  = s_min + 1 ;
1461        s <= s_max - 2 ; ++ s) {
1462 
1463     vl_sift_pix *src, *end, *grad, gx, gy ;
1464 
1465 #define SAVE_BACK                                                       \
1466     *grad++ = vl_fast_sqrt_f (gx*gx + gy*gy) ;                          \
1467     *grad++ = vl_mod_2pi_f   (vl_fast_atan2_f (gy, gx) + 2*VL_PI) ;     \
1468     ++src ;                                                             \
1469 
1470     src  = vl_sift_get_octave (f,s) ;
1471     grad = f->grad + 2 * so * (s - s_min -1) ;
1472 
1473     /* first pixel of the first row */
1474     gx = src[+xo] - src[0] ;
1475     gy = src[+yo] - src[0] ;
1476     SAVE_BACK ;
1477 
1478     /* middle pixels of the  first row */
1479     end = (src - 1) + w - 1 ;
1480     while (src < end) {
1481       gx = 0.5 * (src[+xo] - src[-xo]) ;
1482       gy =        src[+yo] - src[0] ;
1483       SAVE_BACK ;
1484     }
1485 
1486     /* last pixel of the first row */
1487     gx = src[0]   - src[-xo] ;
1488     gy = src[+yo] - src[0] ;
1489     SAVE_BACK ;
1490 
1491     for (y = 1 ; y < h -1 ; ++y) {
1492 
1493       /* first pixel of the middle rows */
1494       gx =        src[+xo] - src[0] ;
1495       gy = 0.5 * (src[+yo] - src[-yo]) ;
1496       SAVE_BACK ;
1497 
1498       /* middle pixels of the middle rows */
1499       end = (src - 1) + w - 1 ;
1500       while (src < end) {
1501         gx = 0.5 * (src[+xo] - src[-xo]) ;
1502         gy = 0.5 * (src[+yo] - src[-yo]) ;
1503         SAVE_BACK ;
1504       }
1505 
1506       /* last pixel of the middle row */
1507       gx =        src[0]   - src[-xo] ;
1508       gy = 0.5 * (src[+yo] - src[-yo]) ;
1509       SAVE_BACK ;
1510     }
1511 
1512     /* first pixel of the last row */
1513     gx = src[+xo] - src[0] ;
1514     gy = src[  0] - src[-yo] ;
1515     SAVE_BACK ;
1516 
1517     /* middle pixels of the last row */
1518     end = (src - 1) + w - 1 ;
1519     while (src < end) {
1520       gx = 0.5 * (src[+xo] - src[-xo]) ;
1521       gy =        src[0]   - src[-yo] ;
1522       SAVE_BACK ;
1523     }
1524 
1525     /* last pixel of the last row */
1526     gx = src[0]   - src[-xo] ;
1527     gy = src[0]   - src[-yo] ;
1528     SAVE_BACK ;
1529   }
1530   f->grad_o = f->o_cur ;
1531 }
1532 
1533 /** ------------------------------------------------------------------
1534  ** @brief Calculate the keypoint orientation(s)
1535  **
1536  ** @param f        SIFT filter.
1537  ** @param angles   orientations (output).
1538  ** @param k        keypoint.
1539  **
1540  ** The function computes the orientation(s) of the keypoint @a k.
1541  ** The function returns the number of orientations found (up to
1542  ** four). The orientations themselves are written to the vector @a
1543  ** angles.
1544  **
1545  ** @remark The function requires the keypoint octave @a k->o to be
1546  ** equal to the filter current octave ::vl_sift_get_octave. If this
1547  ** is not the case, the function returns zero orientations.
1548  **
1549  ** @remark The function requires the keypoint scale level @c k->s to
1550  ** be in the range @c s_min+1 and @c s_max-2 (where usually @c
1551  ** s_min=0 and @c s_max=S+2). If this is not the case, the function
1552  ** returns zero orientations.
1553  **
1554  ** @return number of orientations found.
1555  **/
1556 
1557 VL_EXPORT
1558 int
vl_sift_calc_keypoint_orientations(VlSiftFilt * f,double angles[4],VlSiftKeypoint const * k)1559 vl_sift_calc_keypoint_orientations (VlSiftFilt *f,
1560                                     double angles [4],
1561                                     VlSiftKeypoint const *k)
1562 {
1563   double const winf   = 1.5 ;
1564   double       xper   = pow (2.0, f->o_cur) ;
1565 
1566   int          w      = f-> octave_width ;
1567   int          h      = f-> octave_height ;
1568   int const    xo     = 2 ;         /* x-stride */
1569   int const    yo     = 2 * w ;     /* y-stride */
1570   int const    so     = 2 * w * h ; /* s-stride */
1571   double       x      = k-> x     / xper ;
1572   double       y      = k-> y     / xper ;
1573   double       sigma  = k-> sigma / xper ;
1574 
1575   int          xi     = (int) (x + 0.5) ;
1576   int          yi     = (int) (y + 0.5) ;
1577   int          si     = k-> is ;
1578 
1579   double const sigmaw = winf * sigma ;
1580   int          W      = VL_MAX(floor (3.0 * sigmaw), 1) ;
1581 
1582   int          nangles= 0 ;
1583 
1584   enum {nbins = 36} ;
1585 
1586   double hist [nbins], maxh ;
1587   vl_sift_pix const * pt ;
1588   int xs, ys, iter, i ;
1589 
1590   /* skip if the keypoint octave is not current */
1591   if(k->o != f->o_cur)
1592     return 0 ;
1593 
1594   /* skip the keypoint if it is out of bounds */
1595   if(xi < 0            ||
1596      xi > w - 1        ||
1597      yi < 0            ||
1598      yi > h - 1        ||
1599      si < f->s_min + 1 ||
1600      si > f->s_max - 2  ) {
1601     return 0 ;
1602   }
1603 
1604   /* make gradient up to date */
1605   update_gradient (f) ;
1606 
1607   /* clear histogram */
1608   memset (hist, 0, sizeof(double) * nbins) ;
1609 
1610   /* compute orientation histogram */
1611   pt = f-> grad + xo*xi + yo*yi + so*(si - f->s_min - 1) ;
1612 
1613 #undef  at
1614 #define at(dx,dy) (*(pt + xo * (dx) + yo * (dy)))
1615 
1616   for(ys  =  VL_MAX (- W,       - yi) ;
1617       ys <=  VL_MIN (+ W, h - 1 - yi) ; ++ys) {
1618 
1619     for(xs  = VL_MAX (- W,       - xi) ;
1620         xs <= VL_MIN (+ W, w - 1 - xi) ; ++xs) {
1621 
1622 
1623       double dx = (double)(xi + xs) - x;
1624       double dy = (double)(yi + ys) - y;
1625       double r2 = dx*dx + dy*dy ;
1626       double wgt, mod, ang, fbin ;
1627 
1628       /* limit to a circular window */
1629       if (r2 >= W*W + 0.6) continue ;
1630 
1631       wgt  = fast_expn (r2 / (2*sigmaw*sigmaw)) ;
1632       mod  = *(pt + xs*xo + ys*yo    ) ;
1633       ang  = *(pt + xs*xo + ys*yo + 1) ;
1634       fbin = nbins * ang / (2 * VL_PI) ;
1635 
1636 #if defined(VL_SIFT_BILINEAR_ORIENTATIONS)
1637       {
1638         int bin = (int) vl_floor_d (fbin - 0.5) ;
1639         double rbin = fbin - bin - 0.5 ;
1640         hist [(bin + nbins) % nbins] += (1 - rbin) * mod * wgt ;
1641         hist [(bin + 1    ) % nbins] += (    rbin) * mod * wgt ;
1642       }
1643 #else
1644       {
1645         int    bin  = vl_floor_d (fbin) ;
1646         bin = vl_floor_d (nbins * ang / (2*VL_PI)) ;
1647         hist [(bin) % nbins] += mod * wgt ;
1648       }
1649 #endif
1650 
1651     } /* for xs */
1652   } /* for ys */
1653 
1654   /* smooth histogram */
1655   for (iter = 0; iter < 6; iter ++) {
1656     double prev  = hist [nbins - 1] ;
1657     double first = hist [0] ;
1658     int i ;
1659     for (i = 0; i < nbins - 1; i++) {
1660       double newh = (prev + hist[i] + hist[(i+1) % nbins]) / 3.0;
1661       prev = hist[i] ;
1662       hist[i] = newh ;
1663     }
1664     hist[i] = (prev + hist[i] + first) / 3.0 ;
1665   }
1666 
1667   /* find the histogram maximum */
1668   maxh = 0 ;
1669   for (i = 0 ; i < nbins ; ++i)
1670     maxh = VL_MAX (maxh, hist [i]) ;
1671 
1672   /* find peaks within 80% from max */
1673   nangles = 0 ;
1674   for(i = 0 ; i < nbins ; ++i) {
1675     double h0 = hist [i] ;
1676     double hm = hist [(i - 1 + nbins) % nbins] ;
1677     double hp = hist [(i + 1 + nbins) % nbins] ;
1678 
1679     /* is this a peak? */
1680     if (h0 > 0.8*maxh && h0 > hm && h0 > hp) {
1681 
1682       /* quadratic interpolation */
1683       double di = - 0.5 * (hp - hm) / (hp + hm - 2 * h0) ;
1684       double th = 2 * VL_PI * (i + di + 0.5) / nbins ;
1685       angles [ nangles++ ] = th ;
1686       if( nangles == 4 )
1687         goto enough_angles ;
1688     }
1689   }
1690  enough_angles:
1691   return nangles ;
1692 }
1693 
1694 
1695 /** ------------------------------------------------------------------
1696  ** @internal
1697  ** @brief Normalizes in norm L_2 a descriptor
1698  ** @param begin begin of histogram.
1699  ** @param end   end of histogram.
1700  **/
1701 
1702 VL_INLINE vl_sift_pix
normalize_histogram(vl_sift_pix * begin,vl_sift_pix * end)1703 normalize_histogram
1704 (vl_sift_pix *begin, vl_sift_pix *end)
1705 {
1706   vl_sift_pix* iter ;
1707   vl_sift_pix  norm = 0.0 ;
1708 
1709   for (iter = begin ; iter != end ; ++ iter)
1710     norm += (*iter) * (*iter) ;
1711 
1712   norm = vl_fast_sqrt_f (norm) + VL_EPSILON_F ;
1713 
1714   for (iter = begin; iter != end ; ++ iter)
1715     *iter /= norm ;
1716 
1717   return norm;
1718 }
1719 
1720 /** ------------------------------------------------------------------
1721  ** @brief Run the SIFT descriptor on raw data
1722  **
1723  ** @param f        SIFT filter.
1724  ** @param grad     image gradients.
1725  ** @param descr    SIFT descriptor (output).
1726  ** @param width    image width.
1727  ** @param height   image height.
1728  ** @param x        keypoint x coordinate.
1729  ** @param y        keypoint y coordinate.
1730  ** @param sigma    keypoint scale.
1731  ** @param angle0   keypoint orientation.
1732  **
1733  ** The function runs the SIFT descriptor on raw data. Here @a image
1734  ** is a 2 x @a width x @a height array (by convention, the memory
1735  ** layout is a s such the first index is the fastest varying
1736  ** one). The first @a width x @a height layer of the array contains
1737  ** the gradient magnitude and the second the gradient angle (in
1738  ** radians, between 0 and @f$ 2\pi @f$). @a x, @a y and @a sigma give
1739  ** the keypoint center and scale respectively.
1740  **
1741  ** In order to be equivalent to a standard SIFT descriptor the image
1742  ** gradient must be computed at a smoothing level equal to the scale
1743  ** of the keypoint. In practice, the actual SIFT algorithm makes the
1744  ** following additional approximation, which influence the result:
1745  **
1746  ** - Scale is discretized in @c S levels.
1747  ** - The image is downsampled once for each octave (if you do this,
1748  **   the parameters @a x, @a y and @a sigma must be
1749  **   scaled too).
1750  **/
1751 
1752 VL_EXPORT
1753 void
vl_sift_calc_raw_descriptor(VlSiftFilt const * f,vl_sift_pix const * grad,vl_sift_pix * descr,int width,int height,double x,double y,double sigma,double angle0)1754 vl_sift_calc_raw_descriptor (VlSiftFilt const *f,
1755                              vl_sift_pix const* grad,
1756                              vl_sift_pix *descr,
1757                              int width, int height,
1758                              double x, double y,
1759                              double sigma,
1760                              double angle0)
1761 {
1762   double const magnif = f-> magnif ;
1763 
1764   int          w      = width ;
1765   int          h      = height ;
1766   int const    xo     = 2 ;         /* x-stride */
1767   int const    yo     = 2 * w ;     /* y-stride */
1768 
1769   int          xi     = (int) (x + 0.5) ;
1770   int          yi     = (int) (y + 0.5) ;
1771 
1772   double const st0    = sin (angle0) ;
1773   double const ct0    = cos (angle0) ;
1774   double const SBP    = magnif * sigma + VL_EPSILON_D ;
1775   int    const W      = floor
1776     (sqrt(2.0) * SBP * (NBP + 1) / 2.0 + 0.5) ;
1777 
1778   int const binto = 1 ;          /* bin theta-stride */
1779   int const binyo = NBO * NBP ;  /* bin y-stride */
1780   int const binxo = NBO ;        /* bin x-stride */
1781 
1782   int bin, dxi, dyi ;
1783   vl_sift_pix const *pt ;
1784   vl_sift_pix       *dpt ;
1785 
1786   /* check bounds */
1787   if(xi    <  0               ||
1788      xi    >= w               ||
1789      yi    <  0               ||
1790      yi    >= h -    1        )
1791     return ;
1792 
1793   /* clear descriptor */
1794   memset (descr, 0, sizeof(vl_sift_pix) * NBO*NBP*NBP) ;
1795 
1796   /* Center the scale space and the descriptor on the current keypoint.
1797    * Note that dpt is pointing to the bin of center (SBP/2,SBP/2,0).
1798    */
1799   pt  = grad + xi*xo + yi*yo ;
1800   dpt = descr + (NBP/2) * binyo + (NBP/2) * binxo ;
1801 
1802 #undef atd
1803 #define atd(dbinx,dbiny,dbint) *(dpt + (dbint)*binto + (dbiny)*binyo + (dbinx)*binxo)
1804 
1805   /*
1806    * Process pixels in the intersection of the image rectangle
1807    * (1,1)-(M-1,N-1) and the keypoint bounding box.
1808    */
1809   for(dyi =  VL_MAX(- W,   - yi   ) ;
1810       dyi <= VL_MIN(+ W, h - yi -1) ; ++ dyi) {
1811 
1812     for(dxi =  VL_MAX(- W,   - xi   ) ;
1813         dxi <= VL_MIN(+ W, w - xi -1) ; ++ dxi) {
1814 
1815       /* retrieve */
1816       vl_sift_pix mod   = *( pt + dxi*xo + dyi*yo + 0 ) ;
1817       vl_sift_pix angle = *( pt + dxi*xo + dyi*yo + 1 ) ;
1818       vl_sift_pix theta = vl_mod_2pi_f (angle - angle0) ;
1819 
1820       /* fractional displacement */
1821       vl_sift_pix dx = xi + dxi - x;
1822       vl_sift_pix dy = yi + dyi - y;
1823 
1824       /* get the displacement normalized w.r.t. the keypoint
1825          orientation and extension */
1826       vl_sift_pix nx = ( ct0 * dx + st0 * dy) / SBP ;
1827       vl_sift_pix ny = (-st0 * dx + ct0 * dy) / SBP ;
1828       vl_sift_pix nt = NBO * theta / (2 * VL_PI) ;
1829 
1830       /* Get the Gaussian weight of the sample. The Gaussian window
1831        * has a standard deviation equal to NBP/2. Note that dx and dy
1832        * are in the normalized frame, so that -NBP/2 <= dx <=
1833        * NBP/2. */
1834       vl_sift_pix const wsigma = f->windowSize ;
1835       vl_sift_pix win = fast_expn
1836         ((nx*nx + ny*ny)/(2.0 * wsigma * wsigma)) ;
1837 
1838       /* The sample will be distributed in 8 adjacent bins.
1839          We start from the ``lower-left'' bin. */
1840       int         binx = (int)vl_floor_f (nx - 0.5) ;
1841       int         biny = (int)vl_floor_f (ny - 0.5) ;
1842       int         bint = (int)vl_floor_f (nt) ;
1843       vl_sift_pix rbinx = nx - (binx + 0.5) ;
1844       vl_sift_pix rbiny = ny - (biny + 0.5) ;
1845       vl_sift_pix rbint = nt - bint ;
1846       int         dbinx ;
1847       int         dbiny ;
1848       int         dbint ;
1849 
1850       /* Distribute the current sample into the 8 adjacent bins*/
1851       for(dbinx = 0 ; dbinx < 2 ; ++dbinx) {
1852         for(dbiny = 0 ; dbiny < 2 ; ++dbiny) {
1853           for(dbint = 0 ; dbint < 2 ; ++dbint) {
1854 
1855             if (binx + dbinx >= - (NBP/2) &&
1856                 binx + dbinx <    (NBP/2) &&
1857                 biny + dbiny >= - (NBP/2) &&
1858                 biny + dbiny <    (NBP/2) ) {
1859               vl_sift_pix weight = win
1860                 * mod
1861                 * vl_abs_f (1 - dbinx - rbinx)
1862                 * vl_abs_f (1 - dbiny - rbiny)
1863                 * vl_abs_f (1 - dbint - rbint) ;
1864 
1865               atd(binx+dbinx, biny+dbiny, (bint+dbint) % NBO) += weight ;
1866             }
1867           }
1868         }
1869       }
1870     }
1871   }
1872 
1873   /* Standard SIFT descriptors are normalized, truncated and normalized again */
1874   if(1) {
1875 
1876     /* normalize L2 norm */
1877     vl_sift_pix norm = normalize_histogram (descr, descr + NBO*NBP*NBP) ;
1878 
1879     /*
1880        Set the descriptor to zero if it is lower than our
1881        norm_threshold.  We divide by the number of samples in the
1882        descriptor region because the Gaussian window used in the
1883        calculation of the descriptor is not normalized.
1884      */
1885     int numSamples =
1886       (VL_MIN(W, w - xi -1) - VL_MAX(-W, - xi) + 1) *
1887       (VL_MIN(W, h - yi -1) - VL_MAX(-W, - yi) + 1) ;
1888 
1889     if(f-> norm_thresh && norm < f-> norm_thresh * numSamples) {
1890         for(bin = 0; bin < NBO*NBP*NBP ; ++ bin)
1891             descr [bin] = 0;
1892     }
1893     else {
1894       /* truncate at 0.2. */
1895       for(bin = 0; bin < NBO*NBP*NBP ; ++ bin) {
1896         if (descr [bin] > 0.2) descr [bin] = 0.2;
1897       }
1898 
1899       /* normalize again. */
1900       normalize_histogram (descr, descr + NBO*NBP*NBP) ;
1901     }
1902   }
1903 }
1904 
1905 /** ------------------------------------------------------------------
1906  ** @brief Compute the descriptor of a keypoint
1907  **
1908  ** @param f        SIFT filter.
1909  ** @param descr    SIFT descriptor (output)
1910  ** @param k        keypoint.
1911  ** @param angle0   keypoint direction.
1912  **
1913  ** The function computes the SIFT descriptor of the keypoint @a k of
1914  ** orientation @a angle0. The function fills the buffer @a descr
1915  ** which must be large enough to hold the descriptor.
1916  **
1917  ** The function assumes that the keypoint is on the current octave.
1918  ** If not, it does not do anything.
1919  **/
1920 
1921 VL_EXPORT
1922 void
vl_sift_calc_keypoint_descriptor(VlSiftFilt * f,vl_sift_pix * descr,VlSiftKeypoint const * k,double angle0)1923 vl_sift_calc_keypoint_descriptor (VlSiftFilt *f,
1924                                   vl_sift_pix *descr,
1925                                   VlSiftKeypoint const* k,
1926                                   double angle0)
1927 {
1928   /*
1929      The SIFT descriptor is a three dimensional histogram of the
1930      position and orientation of the gradient.  There are NBP bins for
1931      each spatial dimension and NBO bins for the orientation dimension,
1932      for a total of NBP x NBP x NBO bins.
1933 
1934      The support of each spatial bin has an extension of SBP = 3sigma
1935      pixels, where sigma is the scale of the keypoint.  Thus all the
1936      bins together have a support SBP x NBP pixels wide. Since
1937      weighting and interpolation of pixel is used, the support extends
1938      by another half bin. Therefore, the support is a square window of
1939      SBP x (NBP + 1) pixels. Finally, since the patch can be
1940      arbitrarily rotated, we need to consider a window 2W += sqrt(2) x
1941      SBP x (NBP + 1) pixels wide.
1942   */
1943 
1944   double const magnif      = f-> magnif ;
1945 
1946   double       xper        = pow (2.0, f->o_cur) ;
1947 
1948   int          w           = f-> octave_width ;
1949   int          h           = f-> octave_height ;
1950   int const    xo          = 2 ;         /* x-stride */
1951   int const    yo          = 2 * w ;     /* y-stride */
1952   int const    so          = 2 * w * h ; /* s-stride */
1953   double       x           = k-> x     / xper ;
1954   double       y           = k-> y     / xper ;
1955   double       sigma       = k-> sigma / xper ;
1956 
1957   int          xi          = (int) (x + 0.5) ;
1958   int          yi          = (int) (y + 0.5) ;
1959   int          si          = k-> is ;
1960 
1961   double const st0         = sin (angle0) ;
1962   double const ct0         = cos (angle0) ;
1963   double const SBP         = magnif * sigma + VL_EPSILON_D ;
1964   int    const W           = floor
1965     (sqrt(2.0) * SBP * (NBP + 1) / 2.0 + 0.5) ;
1966 
1967   int const binto = 1 ;          /* bin theta-stride */
1968   int const binyo = NBO * NBP ;  /* bin y-stride */
1969   int const binxo = NBO ;        /* bin x-stride */
1970 
1971   int bin, dxi, dyi ;
1972   vl_sift_pix const *pt ;
1973   vl_sift_pix       *dpt ;
1974 
1975   /* check bounds */
1976   if(k->o  != f->o_cur        ||
1977      xi    <  0               ||
1978      xi    >= w               ||
1979      yi    <  0               ||
1980      yi    >= h -    1        ||
1981      si    <  f->s_min + 1    ||
1982      si    >  f->s_max - 2     )
1983     return ;
1984 
1985   /* synchronize gradient buffer */
1986   update_gradient (f) ;
1987 
1988   /* VL_PRINTF("W = %d ; magnif = %g ; SBP = %g\n", W,magnif,SBP) ; */
1989 
1990   /* clear descriptor */
1991   memset (descr, 0, sizeof(vl_sift_pix) * NBO*NBP*NBP) ;
1992 
1993   /* Center the scale space and the descriptor on the current keypoint.
1994    * Note that dpt is pointing to the bin of center (SBP/2,SBP/2,0).
1995    */
1996   pt  = f->grad + xi*xo + yi*yo + (si - f->s_min - 1)*so ;
1997   dpt = descr + (NBP/2) * binyo + (NBP/2) * binxo ;
1998 
1999 #undef atd
2000 #define atd(dbinx,dbiny,dbint) *(dpt + (dbint)*binto + (dbiny)*binyo + (dbinx)*binxo)
2001 
2002   /*
2003    * Process pixels in the intersection of the image rectangle
2004    * (1,1)-(M-1,N-1) and the keypoint bounding box.
2005    */
2006   for(dyi =  VL_MAX (- W, 1 - yi    ) ;
2007       dyi <= VL_MIN (+ W, h - yi - 2) ; ++ dyi) {
2008 
2009     for(dxi =  VL_MAX (- W, 1 - xi    ) ;
2010         dxi <= VL_MIN (+ W, w - xi - 2) ; ++ dxi) {
2011 
2012       /* retrieve */
2013       vl_sift_pix mod   = *( pt + dxi*xo + dyi*yo + 0 ) ;
2014       vl_sift_pix angle = *( pt + dxi*xo + dyi*yo + 1 ) ;
2015       vl_sift_pix theta = vl_mod_2pi_f (angle - angle0) ;
2016 
2017       /* fractional displacement */
2018       vl_sift_pix dx = xi + dxi - x;
2019       vl_sift_pix dy = yi + dyi - y;
2020 
2021       /* get the displacement normalized w.r.t. the keypoint
2022          orientation and extension */
2023       vl_sift_pix nx = ( ct0 * dx + st0 * dy) / SBP ;
2024       vl_sift_pix ny = (-st0 * dx + ct0 * dy) / SBP ;
2025       vl_sift_pix nt = NBO * theta / (2 * VL_PI) ;
2026 
2027       /* Get the Gaussian weight of the sample. The Gaussian window
2028        * has a standard deviation equal to NBP/2. Note that dx and dy
2029        * are in the normalized frame, so that -NBP/2 <= dx <=
2030        * NBP/2. */
2031       vl_sift_pix const wsigma = f->windowSize ;
2032       vl_sift_pix win = fast_expn
2033         ((nx*nx + ny*ny)/(2.0 * wsigma * wsigma)) ;
2034 
2035       /* The sample will be distributed in 8 adjacent bins.
2036          We start from the ``lower-left'' bin. */
2037       int         binx = (int)vl_floor_f (nx - 0.5) ;
2038       int         biny = (int)vl_floor_f (ny - 0.5) ;
2039       int         bint = (int)vl_floor_f (nt) ;
2040       vl_sift_pix rbinx = nx - (binx + 0.5) ;
2041       vl_sift_pix rbiny = ny - (biny + 0.5) ;
2042       vl_sift_pix rbint = nt - bint ;
2043       int         dbinx ;
2044       int         dbiny ;
2045       int         dbint ;
2046 
2047       /* Distribute the current sample into the 8 adjacent bins*/
2048       for(dbinx = 0 ; dbinx < 2 ; ++dbinx) {
2049         for(dbiny = 0 ; dbiny < 2 ; ++dbiny) {
2050           for(dbint = 0 ; dbint < 2 ; ++dbint) {
2051 
2052             if (binx + dbinx >= - (NBP/2) &&
2053                 binx + dbinx <    (NBP/2) &&
2054                 biny + dbiny >= - (NBP/2) &&
2055                 biny + dbiny <    (NBP/2) ) {
2056               vl_sift_pix weight = win
2057                 * mod
2058                 * vl_abs_f (1 - dbinx - rbinx)
2059                 * vl_abs_f (1 - dbiny - rbiny)
2060                 * vl_abs_f (1 - dbint - rbint) ;
2061 
2062               atd(binx+dbinx, biny+dbiny, (bint+dbint) % NBO) += weight ;
2063             }
2064           }
2065         }
2066       }
2067     }
2068   }
2069 
2070   /* Standard SIFT descriptors are normalized, truncated and normalized again */
2071   if(1) {
2072 
2073     /* Normalize the histogram to L2 unit length. */
2074     vl_sift_pix norm = normalize_histogram (descr, descr + NBO*NBP*NBP) ;
2075 
2076     /* Set the descriptor to zero if it is lower than our norm_threshold */
2077     if(f-> norm_thresh && norm < f-> norm_thresh) {
2078         for(bin = 0; bin < NBO*NBP*NBP ; ++ bin)
2079             descr [bin] = 0;
2080     }
2081     else {
2082 
2083       /* Truncate at 0.2. */
2084       for(bin = 0; bin < NBO*NBP*NBP ; ++ bin) {
2085         if (descr [bin] > 0.2) descr [bin] = 0.2;
2086       }
2087 
2088       /* Normalize again. */
2089       normalize_histogram (descr, descr + NBO*NBP*NBP) ;
2090     }
2091   }
2092 
2093 }
2094 
2095 /** ------------------------------------------------------------------
2096  ** @brief Initialize a keypoint from its position and scale
2097  **
2098  ** @param f     SIFT filter.
2099  ** @param k     SIFT keypoint (output).
2100  ** @param x     x coordinate of the keypoint center.
2101  ** @param y     y coordinate of the keypoint center.
2102  ** @param sigma keypoint scale.
2103  **
2104  ** The function initializes a keypoint structure @a k from
2105  ** the location @a x
2106  ** and @a y and the scale @a sigma of the keypoint. The keypoint structure
2107  ** maps the keypoint to an octave and scale level of the discretized
2108  ** Gaussian scale space, which is required for instance to compute the
2109  ** keypoint SIFT descriptor.
2110  **
2111  ** @par Algorithm
2112  **
2113  ** The formula linking the keypoint scale sigma to the octave and
2114  ** scale indexes is
2115  **
2116  ** @f[ \sigma(o,s) = \sigma_0 2^{o+s/S} @f]
2117  **
2118  ** In addition to the scale index @e s (which can be fractional due
2119  ** to scale interpolation) a keypoint has an integer scale index @e
2120  ** is too (which is the index of the scale level where it was
2121  ** detected in the DoG scale space). We have the constraints (@ref
2122  ** sift-tech-detector see also the "SIFT detector"):
2123  **
2124  ** - @e o is integer in the range @f$ [o_\mathrm{min},
2125  **   o_{\mathrm{min}}+O-1] @f$.
2126  ** - @e is is integer in the range @f$ [s_\mathrm{min}+1,
2127  **   s_\mathrm{max}-2] @f$.  This depends on how the scale is
2128  **   determined during detection, and must be so here because
2129  **   gradients are computed only for this range of scale levels
2130  **   and are required for the calculation of the SIFT descriptor.
2131  ** - @f$ |s - is| < 0.5 @f$ for detected keypoints in most cases due
2132  **   to the interpolation technique used during detection. However
2133  **   this is not necessary.
2134  **
2135  ** Thus octave o represents scales @f$ \{ \sigma(o, s) : s \in
2136  ** [s_\mathrm{min}+1-.5, s_\mathrm{max}-2+.5] \} @f$. Note that some
2137  ** scales may be represented more than once. For each scale, we
2138  ** select the largest possible octave that contains it, i.e.
2139  **
2140  ** @f[
2141  **  o(\sigma)
2142  **  = \max \{ o \in \mathbb{Z} :
2143  **    \sigma_0 2^{\frac{s_\mathrm{min}+1-.5}{S}} \leq \sigma \}
2144  **  = \mathrm{floor}\,\left[
2145  **    \log_2(\sigma / \sigma_0) - \frac{s_\mathrm{min}+1-.5}{S}\right]
2146  ** @f]
2147  **
2148  ** and then
2149  **
2150  ** @f[
2151  ** s(\sigma) = S  \left[\log_2(\sigma / \sigma_0) - o(\sigma)\right],
2152  ** \quad
2153  ** is(\sigma) = \mathrm{round}\,(s(\sigma))
2154  ** @f]
2155  **
2156  ** In practice, both @f$ o(\sigma) @f$ and @f$ is(\sigma) @f$ are
2157  ** clamped to their feasible range as determined by the SIFT filter
2158  ** parameters.
2159  **/
2160 
2161 VL_EXPORT
2162 void
vl_sift_keypoint_init(VlSiftFilt const * f,VlSiftKeypoint * k,double x,double y,double sigma)2163 vl_sift_keypoint_init (VlSiftFilt const *f,
2164                        VlSiftKeypoint *k,
2165                        double x,
2166                        double y,
2167                        double sigma)
2168 {
2169   int    o, ix, iy, is ;
2170   double s, phi, xper ;
2171 
2172   phi = log2 ((sigma + VL_EPSILON_D) / f->sigma0) ;
2173   o   = (int)vl_floor_d (phi -  ((double) f->s_min + 0.5) / f->S) ;
2174   o   = VL_MIN (o, f->o_min + f->O - 1) ;
2175   o   = VL_MAX (o, f->o_min           ) ;
2176   s   = f->S * (phi - o) ;
2177 
2178   is  = (int)(s + 0.5) ;
2179   is  = VL_MIN(is, f->s_max - 2) ;
2180   is  = VL_MAX(is, f->s_min + 1) ;
2181 
2182   xper = pow (2.0, o) ;
2183   ix   = (int)(x / xper + 0.5) ;
2184   iy   = (int)(y / xper + 0.5) ;
2185 
2186   k -> o  = o ;
2187 
2188   k -> ix = ix ;
2189   k -> iy = iy ;
2190   k -> is = is ;
2191 
2192   k -> x = x ;
2193   k -> y = y ;
2194   k -> s = s ;
2195 
2196   k->sigma = sigma ;
2197 }
2198