1 /** @file sift.c
2 ** @brief SIFT - Definition
3 ** @author Andrea Vedaldi
4 **/
5
6 /*
7 Copyright (C) 2007-12 Andrea Vedaldi and Brian Fulkerson.
8 All rights reserved.
9
10 This file is part of the VLFeat library and is made available under
11 the terms of the BSD license (see the COPYING file).
12 */
13
14 /**
15 <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -->
16 @page sift Scale Invariant Feature Transform (SIFT)
17 @author Andrea Vedaldi
18 @par "Credits:" May people have contributed with suggestions and bug
19 reports. Although the following list is certainly incomplete, we would
20 like to thank: Wei Dong, Loic, Giuseppe, Liu, Erwin, P. Ivanov, and
21 Q. S. Luo.
22 @tableofcontents
23 <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -->
24
25 @ref sift.h implements a @ref sift-usage "SIFT filter object", a
26 reusable object to extract SIFT features @cite{lowe99object} from one
27 or multiple images.
28
29 This is the original VLFeat implementation of SIFT, designed to be
30 compatible with Lowe's original SIFT. See @ref covdet for a different
31 version of SIFT integrated in the more general covariant feature
32 detector engine.
33
34 <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -->
35 @section sift-intro Overview
36 <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -->
37
38 A SIFT feature is a selected image region (also called keypoint) with
39 an associated descriptor. Keypoints are extracted by the <b>@ref
40 sift-intro-detector "SIFT detector"</b> and their descriptors are
41 computed by the <b>@ref sift-intro-descriptor "SIFT descriptor"</b>. It is
42 also common to use independently the SIFT detector (i.e. computing the
43 keypoints without descriptors) or the SIFT descriptor (i.e. computing
44 descriptors of custom keypoints).
45
46 <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -->
47 @subsection sift-intro-detector SIFT detector
48 <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -->
49
50 A SIFT <em>keypoint</em> is a circular image region with an
51 orientation. It is described by a geometric <em>frame</em> of four
52 parameters: the keypoint center coordinates @e x and @e y, its @e
53 scale (the radius of the region), and its @e orientation (an angle
54 expressed in radians). The SIFT detector uses as keypoints image
55 structures which resemble “blobs”. By searching for blobs
56 at multiple scales and positions, the SIFT detector is invariant (or,
57 more accurately, covariant) to translation, rotations, and re scaling
58 of the image.
59
60 The keypoint orientation is also determined from the local image
61 appearance and is covariant to image rotations. Depending on the
62 symmetry of the keypoint appearance, determining the orientation can
63 be ambiguous. In this case, the SIFT detectors returns a list of up to
64 four possible orientations, constructing up to four frames (differing
65 only by their orientation) for each detected image blob.
66
67 @image html sift-frame.png "SIFT keypoints are circular image regions with an orientation."
68
69 There are several parameters that influence the detection of SIFT
70 keypoints. First, searching keypoints at multiple scales is obtained
71 by constructing a so-called “Gaussian scale space”. The
72 scale space is just a collection of images obtained by progressively
73 smoothing the input image, which is analogous to gradually reducing
74 the image resolution. Conventionally, the smoothing level is called
75 <em>scale</em> of the image. The construction of the scale space is
76 influenced by the following parameters, set when creating the SIFT
77 filter object by ::vl_sift_new():
78
79 - <b>Number of octaves</b>. Increasing the scale by an octave means
80 doubling the size of the smoothing kernel, whose effect is roughly
81 equivalent to halving the image resolution. By default, the scale
82 space spans as many octaves as possible (i.e. roughly <code>
83 log2(min(width,height)</code>), which has the effect of searching
84 keypoints of all possible sizes.
85 - <b>First octave index</b>. By convention, the octave of index 0
86 starts with the image full resolution. Specifying an index greater
87 than 0 starts the scale space at a lower resolution (e.g. 1 halves
88 the resolution). Similarly, specifying a negative index starts the
89 scale space at an higher resolution image, and can be useful to
90 extract very small features (since this is obtained by interpolating
91 the input image, it does not make much sense to go past -1).
92 - <b>Number of levels per octave</b>. Each octave is sampled at this
93 given number of intermediate scales (by default 3). Increasing this
94 number might in principle return more refined keypoints, but in
95 practice can make their selection unstable due to noise (see [1]).
96
97 Keypoints are further refined by eliminating those that are likely to
98 be unstable, either because they are selected nearby an image edge,
99 rather than an image blob, or are found on image structures with low
100 contrast. Filtering is controlled by the follow:
101
102 - <b>Peak threshold.</b> This is the minimum amount of contrast to
103 accept a keypoint. It is set by configuring the SIFT filter object
104 by ::vl_sift_set_peak_thresh().
105 - <b>Edge threshold.</b> This is the edge rejection threshold. It is
106 set by configuring the SIFT filter object by
107 ::vl_sift_set_edge_thresh().
108
109 <table>
110 <caption>Summary of the parameters influencing the SIFT detector.</caption>
111 <tr style="font-weight:bold;">
112 <td>Parameter</td>
113 <td>See also</td>
114 <td>Controlled by</td>
115 <td>Comment</td>
116 </tr>
117 <tr>
118 <td>number of octaves</td>
119 <td> @ref sift-intro-detector </td>
120 <td>::vl_sift_new</td>
121 <td></td>
122 </tr>
123 <tr>
124 <td>first octave index</td>
125 <td> @ref sift-intro-detector </td>
126 <td>::vl_sift_new</td>
127 <td>set to -1 to extract very small features</td>
128 </tr>
129 <tr>
130 <td>number of scale levels per octave</td>
131 <td> @ref sift-intro-detector </td>
132 <td>::vl_sift_new</td>
133 <td>can affect the number of extracted keypoints</td>
134 </tr>
135 <tr>
136 <td>edge threshold</td>
137 <td> @ref sift-intro-detector </td>
138 <td>::vl_sift_set_edge_thresh</td>
139 <td>decrease to eliminate more keypoints</td>
140 </tr>
141 <tr>
142 <td>peak threshold</td>
143 <td> @ref sift-intro-detector </td>
144 <td>::vl_sift_set_peak_thresh</td>
145 <td>increase to eliminate more keypoints</td>
146 </tr>
147 </table>
148
149 <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -->
150 @subsection sift-intro-descriptor SIFT Descriptor
151 <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -->
152
153 @sa @ref sift-tech-descriptor "Descriptor technical details"
154
155 A SIFT descriptor is a 3-D spatial histogram of the image gradients in
156 characterizing the appearance of a keypoint. The gradient at each
157 pixel is regarded as a sample of a three-dimensional elementary
158 feature vector, formed by the pixel location and the gradient
159 orientation. Samples are weighed by the gradient norm and accumulated
160 in a 3-D histogram @em h, which (up to normalization and clamping)
161 forms the SIFT descriptor of the region. An additional Gaussian
162 weighting function is applied to give less importance to gradients
163 farther away from the keypoint center. Orientations are quantized into
164 eight bins and the spatial coordinates into four each, as follows:
165
166 @image html sift-descr-easy.png "The SIFT descriptor is a spatial histogram of the image gradient."
167
168 SIFT descriptors are computed by either calling
169 ::vl_sift_calc_keypoint_descriptor or
170 ::vl_sift_calc_raw_descriptor. They accept as input a keypoint
171 frame, which specifies the descriptor center, its size, and its
172 orientation on the image plane. The following parameters influence the
173 descriptor calculation:
174
175 - <b>magnification factor</b>. The descriptor size is determined by
176 multiplying the keypoint scale by this factor. It is set by
177 ::vl_sift_set_magnif.
178 - <b>Gaussian window size</b>. The descriptor support is determined by
179 a Gaussian window, which discounts gradient contributions farther away
180 from the descriptor center. The standard deviation of this window is
181 set by ::vl_sift_set_window_size and expressed in unit of bins.
182
183 VLFeat SIFT descriptor uses the following convention. The @em y axis
184 points downwards and angles are measured clockwise (to be consistent
185 with the standard image convention). The 3-D histogram (consisting of
186 @f$ 8 \times 4 \times 4 = 128 @f$ bins) is stacked as a single
187 128-dimensional vector, where the fastest varying dimension is the
188 orientation and the slowest the @em y spatial coordinate. This is
189 illustrated by the following figure.
190
191 @image html sift-conv-vlfeat.png "VLFeat conventions"
192
193 @note Keypoints (frames) D. Lowe's SIFT implementation convention is
194 slightly different: The @em y axis points upwards and the angles are
195 measured counter-clockwise.
196
197 @image html sift-conv.png "D. Lowes' SIFT implementation conventions"
198
199 <table>
200 <caption>Summary of the parameters influencing the SIFT descriptor.</caption>
201 <tr style="font-weight:bold;">
202 <td>Parameter</td>
203 <td>See also</td>
204 <td>Controlled by</td>
205 <td>Comment</td>
206 </tr>
207 <tr>
208 <td>magnification factor</td>
209 <td> @ref sift-intro-descriptor </td>
210 <td>::vl_sift_set_magnif</td>
211 <td>increase this value to enlarge the image region described</td>
212 </tr>
213 <tr>
214 <td>Gaussian window size</td>
215 <td> @ref sift-intro-descriptor </td>
216 <td>::vl_sift_set_window_size</td>
217 <td>smaller values let the center of the descriptor count more</td>
218 </tr>
219 </table>
220
221
222 <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -->
223 @section sift-intro-extensions Extensions
224 <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -->
225
226 <b>Eliminating low-contrast descriptors.</b> Near-uniform patches do
227 not yield stable keypoints or descriptors. ::vl_sift_set_norm_thresh()
228 can be used to set a threshold on the average norm of the local
229 gradient to zero-out descriptors that correspond to very low contrast
230 regions. By default, the threshold is equal to zero, which means that
231 no descriptor is zeroed. Normally this option is useful only with
232 custom keypoints, as detected keypoints are implicitly selected at
233 high contrast image regions.
234
235 <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -->
236 @section sift-usage Using the SIFT filter object
237 <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -->
238
239 The code provided in this module can be used in different ways. You
240 can instantiate and use a <b>SIFT filter</b> to extract both SIFT
241 keypoints and descriptors from one or multiple images. Alternatively,
242 you can use one of the low level functions to run only a part of the
243 SIFT algorithm (for instance, to compute the SIFT descriptors of
244 custom keypoints).
245
246 To use a <b>SIFT filter</b> object:
247
248 - Initialize a SIFT filter object with ::vl_sift_new(). The filter can
249 be reused for multiple images of the same size (e.g. for an entire
250 video sequence).
251 - For each octave in the scale space:
252 - Compute the next octave of the DOG scale space using either
253 ::vl_sift_process_first_octave() or ::vl_sift_process_next_octave()
254 (stop processing if ::VL_ERR_EOF is returned).
255 - Run the SIFT detector with ::vl_sift_detect() to get the keypoints.
256 - For each keypoint:
257 - Use ::vl_sift_calc_keypoint_orientations() to get the keypoint orientation(s).
258 - For each orientation:
259 - Use ::vl_sift_calc_keypoint_descriptor() to get the keypoint descriptor.
260 - Delete the SIFT filter by ::vl_sift_delete().
261
262 To compute SIFT descriptors of custom keypoints, use
263 ::vl_sift_calc_raw_descriptor().
264
265 <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -->
266 @section sift-tech Technical details
267 <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -->
268
269 <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -->
270 @subsection sift-tech-ss Scale space
271 <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -->
272
273 In order to search for image blobs at multiple scale, the SIFT
274 detector construct a scale space, defined as follows. Let
275 @f$I_0(\mathbf{x})@f$ denote an idealized <em>infinite resolution</em>
276 image. Consider the <em>Gaussian kernel</em>
277
278 @f[
279 g_{\sigma}(\mathbf{x})
280 =
281 \frac{1}{2\pi\sigma^2}
282 \exp
283 \left(
284 -\frac{1}{2}
285 \frac{\mathbf{x}^\top\mathbf{x}}{\sigma^2}
286 \right)
287 @f]
288
289 The <b>Gaussian scale space</b> is the collection of smoothed images
290
291 @f[
292 I_\sigma = g_\sigma * I, \quad \sigma \geq 0.
293 @f]
294
295 The image at infinite resolution @f$ I_0 @f$ is useful conceptually,
296 but is not available to us; instead, the input image @f$ I_{\sigma_n}
297 @f$ is assumed to be pre-smoothed at a nominal level @f$ \sigma_n =
298 0.5 @f$ to account for the finite resolution of the pixels. Thus in
299 practice the scale space is computed by
300
301 @f[
302 I_\sigma = g_{\sqrt{\sigma^2 - \sigma_n^2}} * I_{\sigma_n},
303 \quad \sigma \geq \sigma_n.
304 @f]
305
306 Scales are sampled at logarithmic steps given by
307
308 @f[
309 \sigma = \sigma_0 2^{o+s/S},
310 \quad s = 0,\dots,S-1,
311 \quad o = o_{\min}, \dots, o_{\min}+O-1,
312 @f]
313
314 where @f$ \sigma_0 = 1.6 @f$ is the <em>base scale</em>, @f$ o_{\min}
315 @f$ is the <em>first octave index</em>, @em O the <em>number of
316 octaves</em> and @em S the <em>number of scales per octave</em>.
317
318 Blobs are detected as local extrema of the <b>Difference of
319 Gaussians</b> (DoG) scale space, obtained by subtracting successive
320 scales of the Gaussian scale space:
321
322 @f[
323 \mathrm{DoG}_{\sigma(o,s)} = I_{\sigma(o,s+1)} - I_{\sigma(o,s)}
324 @f]
325
326 At each next octave, the resolution of the images is halved to save
327 computations. The images composing the Gaussian and DoG scale space
328 can then be arranged as in the following figure:
329
330 @image html sift-ss.png "GSS and DoG scale space structures."
331
332 The black vertical segments represent images of the Gaussian Scale
333 Space (GSS), arranged by increasing scale @f$\sigma@f$. Notice that
334 the scale level index @e s varies in a slightly redundant set
335
336 @f[
337 s = -1, \dots, S+2
338 @f]
339
340 This simplifies glueing together different octaves and extracting DoG
341 maxima (required by the SIFT detector).
342
343 <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -->
344 @subsection sift-tech-detector Detector
345 <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -->
346
347 The SIFT frames (keypoints) are extracted based on local extrema
348 (peaks) of the DoG scale space. Numerically, local extrema are
349 elements whose @f$ 3 \times 3 \times 3 @f$ neighbors (in space and
350 scale) have all smaller (or larger) value. Once extracted, local
351 extrema are quadratically interpolated (this is very important
352 especially at the lower resolution scales in order to have accurate
353 keypoint localization at the full resolution). Finally, they are
354 filtered to eliminate low-contrast responses or responses close to
355 edges and the orientation(s) are assigned, as explained next.
356
357 <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -->
358 @subsubsection sift-tech-detector-peak Eliminating low contrast responses
359 <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -->
360
361 Peaks which are too short may have been generated by noise and are
362 discarded. This is done by comparing the absolute value of the DoG
363 scale space at the peak with the <b>peak threshold</b> @f$t_p@f$ and
364 discarding the peak its value is below the threshold.
365
366 <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -->
367 @subsubsection sift-tech-detector-edge Eliminating edge responses
368 <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -->
369
370 Peaks which are too flat are often generated by edges and do not yield
371 stable features. These peaks are detected and removed as follows.
372 Given a peak @f$x,y,\sigma@f$, the algorithm evaluates the @em x,@em y
373 Hessian of of the DoG scale space at the scale @f$\sigma@f$. Then the
374 following score (similar to the Harris function) is computed:
375
376 @f[
377 \frac{(\mathrm{tr}\,D(x,y,\sigma))^2}{\det D(x,y,\sigma)},
378 \quad
379 D =
380 \left[
381 \begin{array}{cc}
382 \frac{\partial^2 \mathrm{DoG}}{\partial x^2} &
383 \frac{\partial^2 \mathrm{DoG}}{\partial x\partial y} \\
384 \frac{\partial^2 \mathrm{DoG}}{\partial x\partial y} &
385 \frac{\partial^2 \mathrm{DoG}}{\partial y^2}
386 \end{array}
387 \right].
388 @f]
389
390 This score has a minimum (equal to 4) when both eigenvalues of the
391 Jacobian are equal (curved peak) and increases as one of the
392 eigenvalues grows and the other stays small. Peaks are retained if the
393 score is below the quantity @f$(t_e+1)(t_e+1)/t_e@f$, where @f$t_e@f$
394 is the <b>edge threshold</b>. Notice that this quantity has a minimum
395 equal to 4 when @f$t_e=1@f$ and grows thereafter. Therefore the range
396 of the edge threshold is @f$[1,\infty)@f$.
397
398 <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -->
399 @subsection sift-tech-detector-orientation Orientation assignment
400 <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -->
401
402 A peak in the DoG scale space fixes 2 parameters of the keypoint: the
403 position and scale. It remains to choose an orientation. In order to
404 do this, SIFT computes an histogram of the gradient orientations in a
405 Gaussian window with a standard deviation which is 1.5 times bigger
406 than the scale @f$\sigma@f$ of the keypoint.
407
408 @image html sift-orient.png
409
410 This histogram is then smoothed and the maximum is selected. In
411 addition to the biggest mode, up to other three modes whose amplitude
412 is within the 80% of the biggest mode are retained and returned as
413 additional orientations.
414
415 <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -->
416 @subsection sift-tech-descriptor Descriptor
417 <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -->
418
419 A SIFT descriptor of a local region (keypoint) is a 3-D spatial
420 histogram of the image gradients. The gradient at each pixel is
421 regarded as a sample of a three-dimensional elementary feature vector,
422 formed by the pixel location and the gradient orientation. Samples are
423 weighed by the gradient norm and accumulated in a 3-D histogram @em h,
424 which (up to normalization and clamping) forms the SIFT descriptor of
425 the region. An additional Gaussian weighting function is applied to
426 give less importance to gradients farther away from the keypoint
427 center.
428
429 <!-- @image html sift-bins.png -->
430
431 <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -->
432 @subsubsection sift-tech-descriptor-can Construction in the canonical frame
433 <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -->
434
435 Denote the gradient vector field computed at the scale @f$ \sigma @f$ by
436 @f[
437 J(x,y) = \nabla I_\sigma(x,y)
438 =
439 \left[\begin{array}{cc}
440 \frac{\partial I_\sigma}{\partial x} &
441 \frac{\partial I_\sigma}{\partial y} &
442 \end{array}\right]
443 @f]
444
445 The descriptor is a 3-D spatial histogram capturing the distribution
446 of @f$ J(x,y) @f$. It is convenient to describe its construction in
447 the <em>canonical frame</em>. In this frame, the image and descriptor
448 axes coincide and each spatial bin has side 1. The histogram has @f$
449 N_\theta \times N_x \times N_y @f$ bins (usually @f$ 8 \times 4 \times
450 4 @f$), as in the following figure:
451
452 @image html sift-can.png Canonical SIFT descriptor and spatial binning functions
453
454 Bins are indexed by a triplet of indexes <em>t, i, j</em> and their
455 centers are given by
456
457 @f{eqnarray*}
458 \theta_t &=& \frac{2\pi}{N_\theta} t, \quad t = 0,\dots,N_{\theta}-1, \\
459 x_i &=& i - \frac{N_x -1}{2}, \quad i = 0,\dots,N_x-1, \\
460 y_j &=& j - \frac{N_x -1}{2}, \quad j = 0,\dots,N_y-1. \\
461 @f}
462
463 The histogram is computed by using trilinear interpolation, i.e. by
464 weighing contributions by the <em>binning functions</em>
465
466 @f{eqnarray*}
467 \displaystyle
468 w(z) &=& \mathrm{max}(0, 1 - |z|),
469 \\
470 \displaystyle
471 w_\mathrm{ang}(z) &=& \sum_{k=-\infty}^{+\infty}
472 w\left(
473 \frac{N_\theta}{2\pi} z + N_\theta k
474 \right).
475 @f}
476
477 The gradient vector field is transformed in a three-dimensional
478 density map of weighed contributions
479
480 @f[
481 f(\theta, x, y) = |J(x,y)| \delta(\theta - \angle J(x,y))
482 @f]
483
484 The historam is localized in the keypoint support by a Gaussian window
485 of standard deviation @f$ \sigma_{\mathrm{win}} @f$. The histogram is
486 then given by
487
488 @f{eqnarray*}
489 h(t,i,j) &=& \int
490 g_{\sigma_\mathrm{win}}(x,y)
491 w_\mathrm{ang}(\theta - \theta_t) w(x-x_i) w(y-y_j)
492 f(\theta,x,y)
493 d\theta\,dx\,dy
494 \\
495 &=& \int
496 g_{\sigma_\mathrm{win}}(x,y)
497 w_\mathrm{ang}(\angle J(x,y) - \theta_t) w(x-x_i) w(y-y_j)
498 |J(x,y)|\,dx\,dy
499 @f}
500
501 In post processing, the histogram is @f$ l^2 @f$ normalized, then
502 clamped at 0.2, and @f$ l^2 @f$ normalized again.
503
504 <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -->
505 @subsubsection sift-tech-descriptor-image Calculation in the image frame
506 <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -->
507
508 Invariance to similarity transformation is attained by attaching
509 descriptors to SIFT keypoints (or other similarity-covariant frames).
510 Then projecting the image in the canonical descriptor frames has the
511 effect of undoing the image deformation.
512
513 In practice, however, it is convenient to compute the descriptor
514 directly in the image frame. To do this, denote with a hat quantities
515 relative to the canonical frame and without a hat quantities relative
516 to the image frame (so for instance @f$ \hat x @f$ is the @e
517 x-coordinate in the canonical frame and @f$ x @f$ the x-coordinate in
518 the image frame). Assume that canonical and image frame are
519 related by an affinity:
520
521 @f[
522 \mathbf{x} = A \hat{\mathbf{x}} + T,
523 \qquad
524 \mathbf{x} =
525 \begin{bmatrix}{c}
526 x \\
527 y
528 \end{bmatrix},
529 \quad
530 \mathbf{x} =
531 \begin{bmatrix}{c}
532 \hat x \\
533 \hat y
534 \end{bmatrix}.
535 @f]
536
537 @image html sift-image-frame.png
538
539 Then all quantities can be computed in the image frame directly. For instance,
540 the image at infinite resolution in the two frames are related by
541
542 @f[
543 \hat I_0(\hat{\mathbf{x}}) = I_0(\mathbf{x}),
544 \qquad
545 \mathbf{x} = A \hat{\mathbf{x}} + T.
546 @f]
547
548 The canonized image at scale @f$ \hat \sigma @f$ is in relation with the scaled image
549
550 @f[
551 \hat I_{\hat{\sigma}}(\hat{\mathbf{x}}) = I_{A\hat{\sigma}}(\mathbf{x}),
552 \qquad \mathbf{x} = A \hat{\mathbf{x}} + T
553 @f]
554
555 where, by generalizing the previous definitions, we have
556
557 @f[
558 I_{A\hat \sigma}(\mathbf{x}) = (g_{A\hat\sigma} * I_0)(\mathbf{x}),
559 \quad
560 g_{A\hat\sigma}(\mathbf{x})
561 =
562 \frac{1}{2\pi|A|\hat \sigma^2}
563 \exp
564 \left(
565 -\frac{1}{2}
566 \frac{\mathbf{x}^\top A^{-\top}A^{-1}\mathbf{x}}{\hat \sigma^2}
567 \right)
568 @f]
569
570 Deriving shows that the gradient fields are in relation
571
572 @f[
573 \hat J(\hat{\mathbf{x}}) = J(\mathbf{x}) A,
574 \quad J(\mathbf{x}) = (\nabla I_{A\hat\sigma})(\mathbf{x}),
575 \qquad \mathbf{x} = A \hat{\mathbf{x}} + T.
576 @f]
577
578 Therefore we can compute the descriptor either in the image or canonical frame as:
579
580 @f{eqnarray*}
581 h(t,i,j)
582 &=&
583 \int
584 g_{\hat \sigma_\mathrm{win}}(\hat{\mathbf{x}})\,
585 w_\mathrm{ang}(\angle \hat J(\hat{\mathbf{x}}) - \theta_t)\,
586 w_{ij}(\hat{\mathbf{x}})\,
587 |\hat J(\hat{\mathbf{x}})|\,
588 d\hat{\mathbf{x}}
589 \\
590 &=& \int
591 g_{A \hat \sigma_\mathrm{win}}(\mathbf{x} - T)\,
592 w_\mathrm{ang}(\angle J(\mathbf{x})A - \theta_t)\,
593 w_{ij}(A^{-1}(\mathbf{x} - T))\,
594 |J(\mathbf{x})A|\,
595 d\mathbf{x}.
596 @f}
597
598 where we defined the product of the two spatial binning functions
599
600 @f[
601 w_{ij}(\hat{\mathbf{x}}) = w(\hat x - \hat x_i) w(\hat y - \hat y_j)
602 @f]
603
604
605 In the actual implementation, this integral is computed by visiting a
606 rectangular area of the image that fully contains the keypoint grid
607 (along with half a bin border to fully include the bin windowing
608 function). Since the descriptor can be rotated, this area is a
609 rectangle of sides @f$m/2\sqrt{2} (N_x+1,N_y+1)@f$ (see also the
610 illustration).
611
612 <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -->
613 @subsubsection sift-tech-descriptor-std Standard SIFT descriptor
614 <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -->
615
616 For a SIFT-detected keypoint of center @f$ T @f$, scale @f$ \sigma @f$
617 and orientation @f$ \theta @f$, the affine transformation @f$ (A,T)
618 @f$ reduces to the similarity transformation
619
620 @f[
621 \mathbf{x} = m \sigma R(\theta) \hat{\mathbf{x}} + T
622 @f]
623
624 where @f$ R(\theta) @f$ is a counter-clockwise rotation of @f$ \theta
625 @f$ radians, @f$ m \mathcal{\sigma} @f$ is the size of a descriptor
626 bin in pixels, and @e m is the <b>descriptor magnification factor</b>
627 which expresses how much larger a descriptor bin is compared to
628 the scale of the keypoint @f$ \sigma @f$
629 (the default value is @e m = 3). Moreover, the
630 standard SIFT descriptor computes the image gradient at the scale of
631 the keypoints, which in the canonical frame is equivalent to a
632 smoothing of @f$ \hat \sigma = 1/m @f$. Finally, the default
633 Gaussian window size is set to have standard deviation
634 @f$ \hat \sigma_\mathrm{win} = 2 @f$. This yields the formula
635
636 @f{eqnarray*}
637 h(t,i,j)
638 &=&
639 m \sigma \int
640 g_{\sigma_\mathrm{win}}(\mathbf{x} - T)\,
641 w_\mathrm{ang}(\angle J(\mathbf{x}) - \theta - \theta_t)\,
642 w_{ij}\left(\frac{R(\theta)^\top \mathbf{x} - T}{m\sigma}\right)\,
643 |J(\mathbf{x})|\,
644 d\mathbf{x},
645 \\
646 \sigma_{\mathrm{win}} &=& m\sigma\hat \sigma_{\mathrm{win}},
647 \\
648 J(\mathbf{x})
649 &=& \nabla (g_{m \sigma \hat \sigma} * I)(\mathbf{x})
650 = \nabla (g_{\sigma} * I)(\mathbf{x})
651 = \nabla I_{\sigma} (\mathbf{x}).
652 @f}
653
654
655
656 **/
657
658 #include "sift.h"
659 #include "imopv.h"
660 #include "mathop.h"
661
662 #include <assert.h>
663 #include <stdlib.h>
664 #include <string.h>
665 #include <math.h>
666 #include <stdio.h>
667
668 /** @internal @brief Use bilinear interpolation to compute orientations */
669 #define VL_SIFT_BILINEAR_ORIENTATIONS 1
670
671 #define EXPN_SZ 256 /**< ::fast_expn table size @internal */
672 #define EXPN_MAX 25.0 /**< ::fast_expn table max @internal */
673 double expn_tab [EXPN_SZ+1] ; /**< ::fast_expn table @internal */
674
675 #define NBO 8
676 #define NBP 4
677
678 #define log2(x) (log(x)/VL_LOG_OF_2)
679
680 /** ------------------------------------------------------------------
681 ** @internal
682 ** @brief Fast @f$exp(-x)@f$ approximation
683 **
684 ** @param x argument.
685 **
686 ** The argument must be in the range [0, ::EXPN_MAX] .
687 **
688 ** @return approximation of @f$exp(-x)@f$.
689 **/
690
691 VL_INLINE double
fast_expn(double x)692 fast_expn (double x)
693 {
694 double a,b,r ;
695 int i ;
696 /*assert(0 <= x && x <= EXPN_MAX) ;*/
697
698 if (x > EXPN_MAX) return 0.0 ;
699
700 x *= EXPN_SZ / EXPN_MAX ;
701 i = (int)vl_floor_d (x) ;
702 r = x - i ;
703 a = expn_tab [i ] ;
704 b = expn_tab [i + 1] ;
705 return a + r * (b - a) ;
706 }
707
708 /** ------------------------------------------------------------------
709 ** @internal
710 ** @brief Initialize tables for ::fast_expn
711 **/
712
713 VL_INLINE void
fast_expn_init()714 fast_expn_init ()
715 {
716 int k ;
717 for(k = 0 ; k < EXPN_SZ + 1 ; ++ k) {
718 expn_tab [k] = exp (- (double) k * (EXPN_MAX / EXPN_SZ)) ;
719 }
720 }
721
722 /** ------------------------------------------------------------------
723 ** @internal
724 ** @brief Copy image, upsample rows and take transpose
725 **
726 ** @param dst output image buffer.
727 ** @param src input image buffer.
728 ** @param width input image width.
729 ** @param height input image height.
730 **
731 ** The output image has dimensions @a height by 2 @a width (so the
732 ** destination buffer must be at least as big as two times the
733 ** input buffer).
734 **
735 ** Upsampling is performed by linear interpolation.
736 **/
737
738 static void
copy_and_upsample_rows(vl_sift_pix * dst,vl_sift_pix const * src,int width,int height)739 copy_and_upsample_rows
740 (vl_sift_pix *dst,
741 vl_sift_pix const *src, int width, int height)
742 {
743 int x, y ;
744 vl_sift_pix a, b ;
745
746 for(y = 0 ; y < height ; ++y) {
747 b = a = *src++ ;
748 for(x = 0 ; x < width - 1 ; ++x) {
749 b = *src++ ;
750 *dst = a ; dst += height ;
751 *dst = 0.5 * (a + b) ; dst += height ;
752 a = b ;
753 }
754 *dst = b ; dst += height ;
755 *dst = b ; dst += height ;
756 dst += 1 - width * 2 * height ;
757 }
758 }
759
760 /** ------------------------------------------------------------------
761 ** @internal
762 ** @brief Smooth an image
763 ** @param self SIFT filter.
764 ** @param outputImage output imgae buffer.
765 ** @param tempImage temporary image buffer.
766 ** @param inputImage input image buffer.
767 ** @param width input image width.
768 ** @param height input image height.
769 ** @param sigma smoothing.
770 **/
771
772 static void
_vl_sift_smooth(VlSiftFilt * self,vl_sift_pix * outputImage,vl_sift_pix * tempImage,vl_sift_pix const * inputImage,vl_size width,vl_size height,double sigma)773 _vl_sift_smooth (VlSiftFilt * self,
774 vl_sift_pix * outputImage,
775 vl_sift_pix * tempImage,
776 vl_sift_pix const * inputImage,
777 vl_size width,
778 vl_size height,
779 double sigma)
780 {
781 /* prepare Gaussian filter */
782 if (self->gaussFilterSigma != sigma) {
783 vl_uindex j ;
784 vl_sift_pix acc = 0 ;
785 if (self->gaussFilter) vl_free (self->gaussFilter) ;
786 self->gaussFilterWidth = VL_MAX(ceil(4.0 * sigma), 1) ;
787 self->gaussFilterSigma = sigma ;
788 self->gaussFilter = vl_malloc (sizeof(vl_sift_pix) * (2 * self->gaussFilterWidth + 1)) ;
789
790 for (j = 0 ; j < 2 * self->gaussFilterWidth + 1 ; ++j) {
791 vl_sift_pix d = ((vl_sift_pix)((signed)j - (signed)self->gaussFilterWidth)) / ((vl_sift_pix)sigma) ;
792 self->gaussFilter[j] = (vl_sift_pix) exp (- 0.5 * (d*d)) ;
793 acc += self->gaussFilter[j] ;
794 }
795 for (j = 0 ; j < 2 * self->gaussFilterWidth + 1 ; ++j) {
796 self->gaussFilter[j] /= acc ;
797 }
798 }
799
800 if (self->gaussFilterWidth == 0) {
801 memcpy (outputImage, inputImage, sizeof(vl_sift_pix) * width * height) ;
802 return ;
803 }
804
805 vl_imconvcol_vf (tempImage, height,
806 inputImage, width, height, width,
807 self->gaussFilter,
808 - self->gaussFilterWidth, self->gaussFilterWidth,
809 1, VL_PAD_BY_CONTINUITY | VL_TRANSPOSE) ;
810
811 vl_imconvcol_vf (outputImage, width,
812 tempImage, height, width, height,
813 self->gaussFilter,
814 - self->gaussFilterWidth, self->gaussFilterWidth,
815 1, VL_PAD_BY_CONTINUITY | VL_TRANSPOSE) ;
816 }
817
818 /** ------------------------------------------------------------------
819 ** @internal
820 ** @brief Copy and downsample an image
821 **
822 ** @param dst output imgae buffer.
823 ** @param src input image buffer.
824 ** @param width input image width.
825 ** @param height input image height.
826 ** @param d octaves (non negative).
827 **
828 ** The function downsamples the image @a d times, reducing it to @c
829 ** 1/2^d of its original size. The parameters @a width and @a height
830 ** are the size of the input image. The destination image @a dst is
831 ** assumed to be <code>floor(width/2^d)</code> pixels wide and
832 ** <code>floor(height/2^d)</code> pixels high.
833 **/
834
835 static void
copy_and_downsample(vl_sift_pix * dst,vl_sift_pix const * src,int width,int height,int d)836 copy_and_downsample
837 (vl_sift_pix *dst,
838 vl_sift_pix const *src,
839 int width, int height, int d)
840 {
841 int x, y ;
842
843 d = 1 << d ; /* d = 2^d */
844 for(y = 0 ; y < height ; y+=d) {
845 vl_sift_pix const * srcrowp = src + y * width ;
846 for(x = 0 ; x < width - (d-1) ; x+=d) {
847 *dst++ = *srcrowp ;
848 srcrowp += d ;
849 }
850 }
851 }
852
853 /** ------------------------------------------------------------------
854 ** @brief Create a new SIFT filter
855 **
856 ** @param width image width.
857 ** @param height image height.
858 ** @param noctaves number of octaves.
859 ** @param nlevels number of levels per octave.
860 ** @param o_min first octave index.
861 **
862 ** The function allocates and returns a new SIFT filter for the
863 ** specified image and scale space geometry.
864 **
865 ** Setting @a O to a negative value sets the number of octaves to the
866 ** maximum possible value depending on the size of the image.
867 **
868 ** @return the new SIFT filter.
869 ** @sa ::vl_sift_delete().
870 **/
871
872 VL_EXPORT
873 VlSiftFilt *
vl_sift_new(int width,int height,int noctaves,int nlevels,int o_min)874 vl_sift_new (int width, int height,
875 int noctaves, int nlevels,
876 int o_min)
877 {
878 VlSiftFilt *f = vl_malloc (sizeof(VlSiftFilt)) ;
879
880 int w = VL_SHIFT_LEFT (width, -o_min) ;
881 int h = VL_SHIFT_LEFT (height, -o_min) ;
882 int nel = w * h ;
883
884 /* negative value O => calculate max. value */
885 if (noctaves < 0) {
886 noctaves = VL_MAX (floor (log2 (VL_MIN(width, height))) - o_min - 3, 1) ;
887 }
888
889 f-> width = width ;
890 f-> height = height ;
891 f-> O = noctaves ;
892 f-> S = nlevels ;
893 f-> o_min = o_min ;
894 f-> s_min = -1 ;
895 f-> s_max = nlevels + 1 ;
896 f-> o_cur = o_min ;
897
898 f-> temp = vl_malloc (sizeof(vl_sift_pix) * nel ) ;
899 f-> octave = vl_malloc (sizeof(vl_sift_pix) * nel
900 * (f->s_max - f->s_min + 1) ) ;
901 f-> dog = vl_malloc (sizeof(vl_sift_pix) * nel
902 * (f->s_max - f->s_min ) ) ;
903 f-> grad = vl_malloc (sizeof(vl_sift_pix) * nel * 2
904 * (f->s_max - f->s_min ) ) ;
905
906 f-> sigman = 0.5 ;
907 f-> sigmak = pow (2.0, 1.0 / nlevels) ;
908 f-> sigma0 = 1.6 * f->sigmak ;
909 f-> dsigma0 = f->sigma0 * sqrt (1.0 - 1.0 / (f->sigmak*f->sigmak)) ;
910
911 f-> gaussFilter = NULL ;
912 f-> gaussFilterSigma = 0 ;
913 f-> gaussFilterWidth = 0 ;
914
915 f-> octave_width = 0 ;
916 f-> octave_height = 0 ;
917
918 f-> keys = 0 ;
919 f-> nkeys = 0 ;
920 f-> keys_res = 0 ;
921
922 f-> peak_thresh = 0.0 ;
923 f-> edge_thresh = 10.0 ;
924 f-> norm_thresh = 0.0 ;
925 f-> magnif = 3.0 ;
926 f-> windowSize = NBP / 2 ;
927
928 f-> grad_o = o_min - 1 ;
929
930 /* initialize fast_expn stuff */
931 fast_expn_init () ;
932
933 return f ;
934 }
935
936 /** -------------------------------------------------------------------
937 ** @brief Delete SIFT filter
938 **
939 ** @param f SIFT filter to delete.
940 **
941 ** The function frees the resources allocated by ::vl_sift_new().
942 **/
943
944 VL_EXPORT
945 void
vl_sift_delete(VlSiftFilt * f)946 vl_sift_delete (VlSiftFilt* f)
947 {
948 if (f) {
949 if (f->keys) vl_free (f->keys) ;
950 if (f->grad) vl_free (f->grad) ;
951 if (f->dog) vl_free (f->dog) ;
952 if (f->octave) vl_free (f->octave) ;
953 if (f->temp) vl_free (f->temp) ;
954 if (f->gaussFilter) vl_free (f->gaussFilter) ;
955 vl_free (f) ;
956 }
957 }
958
959 /** ------------------------------------------------------------------
960 ** @brief Start processing a new image
961 **
962 ** @param f SIFT filter.
963 ** @param im image data.
964 **
965 ** The function starts processing a new image by computing its
966 ** Gaussian scale space at the lower octave. It also empties the
967 ** internal keypoint buffer.
968 **
969 ** @return error code. The function returns ::VL_ERR_EOF if there are
970 ** no more octaves to process.
971 **
972 ** @sa ::vl_sift_process_next_octave().
973 **/
974
975 VL_EXPORT
976 int
vl_sift_process_first_octave(VlSiftFilt * f,vl_sift_pix const * im)977 vl_sift_process_first_octave (VlSiftFilt *f, vl_sift_pix const *im)
978 {
979 int o, s, h, w ;
980 double sa, sb ;
981 vl_sift_pix *octave ;
982
983 /* shortcuts */
984 vl_sift_pix *temp = f-> temp ;
985 int width = f-> width ;
986 int height = f-> height ;
987 int o_min = f-> o_min ;
988 int s_min = f-> s_min ;
989 int s_max = f-> s_max ;
990 double sigma0 = f-> sigma0 ;
991 double sigmak = f-> sigmak ;
992 double sigman = f-> sigman ;
993 double dsigma0 = f-> dsigma0 ;
994
995 /* restart from the first */
996 f->o_cur = o_min ;
997 f->nkeys = 0 ;
998 w = f-> octave_width = VL_SHIFT_LEFT(f->width, - f->o_cur) ;
999 h = f-> octave_height = VL_SHIFT_LEFT(f->height, - f->o_cur) ;
1000
1001 /* is there at least one octave? */
1002 if (f->O == 0)
1003 return VL_ERR_EOF ;
1004
1005 /* ------------------------------------------------------------------
1006 * Compute the first sublevel of the first octave
1007 * --------------------------------------------------------------- */
1008
1009 /*
1010 * If the first octave has negative index, we upscale the image; if
1011 * the first octave has positive index, we downscale the image; if
1012 * the first octave has index zero, we just copy the image.
1013 */
1014
1015 octave = vl_sift_get_octave (f, s_min) ;
1016
1017 if (o_min < 0) {
1018 /* double once */
1019 copy_and_upsample_rows (temp, im, width, height) ;
1020 copy_and_upsample_rows (octave, temp, height, 2 * width ) ;
1021
1022 /* double more */
1023 for(o = -1 ; o > o_min ; --o) {
1024 copy_and_upsample_rows (temp, octave,
1025 width << -o, height << -o ) ;
1026 copy_and_upsample_rows (octave, temp,
1027 width << -o, 2 * (height << -o)) ;
1028 }
1029 }
1030 else if (o_min > 0) {
1031 /* downsample */
1032 copy_and_downsample (octave, im, width, height, o_min) ;
1033 }
1034 else {
1035 /* direct copy */
1036 memcpy(octave, im, sizeof(vl_sift_pix) * width * height) ;
1037 }
1038
1039 /*
1040 * Here we adjust the smoothing of the first level of the octave.
1041 * The input image is assumed to have nominal smoothing equal to
1042 * f->simgan.
1043 */
1044
1045 sa = sigma0 * pow (sigmak, s_min) ;
1046 sb = sigman * pow (2.0, - o_min) ;
1047
1048 if (sa > sb) {
1049 double sd = sqrt (sa*sa - sb*sb) ;
1050 _vl_sift_smooth (f, octave, temp, octave, w, h, sd) ;
1051 }
1052
1053 /* -----------------------------------------------------------------
1054 * Compute the first octave
1055 * -------------------------------------------------------------- */
1056
1057 for(s = s_min + 1 ; s <= s_max ; ++s) {
1058 double sd = dsigma0 * pow (sigmak, s) ;
1059 _vl_sift_smooth (f, vl_sift_get_octave(f, s), temp,
1060 vl_sift_get_octave(f, s - 1), w, h, sd) ;
1061 }
1062
1063 return VL_ERR_OK ;
1064 }
1065
1066 /** ------------------------------------------------------------------
1067 ** @brief Process next octave
1068 **
1069 ** @param f SIFT filter.
1070 **
1071 ** The function computes the next octave of the Gaussian scale space.
1072 ** Notice that this clears the record of any feature detected in the
1073 ** previous octave.
1074 **
1075 ** @return error code. The function returns the error
1076 ** ::VL_ERR_EOF when there are no more octaves to process.
1077 **
1078 ** @sa ::vl_sift_process_first_octave().
1079 **/
1080
1081 VL_EXPORT
1082 int
vl_sift_process_next_octave(VlSiftFilt * f)1083 vl_sift_process_next_octave (VlSiftFilt *f)
1084 {
1085
1086 int s, h, w, s_best ;
1087 double sa, sb ;
1088 vl_sift_pix *octave, *pt ;
1089
1090 /* shortcuts */
1091 vl_sift_pix *temp = f-> temp ;
1092 int O = f-> O ;
1093 int S = f-> S ;
1094 int o_min = f-> o_min ;
1095 int s_min = f-> s_min ;
1096 int s_max = f-> s_max ;
1097 double sigma0 = f-> sigma0 ;
1098 double sigmak = f-> sigmak ;
1099 double dsigma0 = f-> dsigma0 ;
1100
1101 /* is there another octave ? */
1102 if (f->o_cur == o_min + O - 1)
1103 return VL_ERR_EOF ;
1104
1105 /* retrieve base */
1106 s_best = VL_MIN(s_min + S, s_max) ;
1107 w = vl_sift_get_octave_width (f) ;
1108 h = vl_sift_get_octave_height (f) ;
1109 pt = vl_sift_get_octave (f, s_best) ;
1110 octave = vl_sift_get_octave (f, s_min) ;
1111
1112 /* next octave */
1113 copy_and_downsample (octave, pt, w, h, 1) ;
1114
1115 f-> o_cur += 1 ;
1116 f-> nkeys = 0 ;
1117 w = f-> octave_width = VL_SHIFT_LEFT(f->width, - f->o_cur) ;
1118 h = f-> octave_height = VL_SHIFT_LEFT(f->height, - f->o_cur) ;
1119
1120 sa = sigma0 * powf (sigmak, s_min ) ;
1121 sb = sigma0 * powf (sigmak, s_best - S) ;
1122
1123 if (sa > sb) {
1124 double sd = sqrt (sa*sa - sb*sb) ;
1125 _vl_sift_smooth (f, octave, temp, octave, w, h, sd) ;
1126 }
1127
1128 /* ------------------------------------------------------------------
1129 * Fill octave
1130 * --------------------------------------------------------------- */
1131
1132 for(s = s_min + 1 ; s <= s_max ; ++s) {
1133 double sd = dsigma0 * pow (sigmak, s) ;
1134 _vl_sift_smooth (f, vl_sift_get_octave(f, s), temp,
1135 vl_sift_get_octave(f, s - 1), w, h, sd) ;
1136 }
1137
1138 return VL_ERR_OK ;
1139 }
1140
1141 /** ------------------------------------------------------------------
1142 ** @brief Detect keypoints
1143 **
1144 ** The function detect keypoints in the current octave filling the
1145 ** internal keypoint buffer. Keypoints can be retrieved by
1146 ** ::vl_sift_get_keypoints().
1147 **
1148 ** @param f SIFT filter.
1149 **/
1150
1151 VL_EXPORT
1152 void
vl_sift_detect(VlSiftFilt * f)1153 vl_sift_detect (VlSiftFilt * f)
1154 {
1155 vl_sift_pix* dog = f-> dog ;
1156 int s_min = f-> s_min ;
1157 int s_max = f-> s_max ;
1158 int w = f-> octave_width ;
1159 int h = f-> octave_height ;
1160 double te = f-> edge_thresh ;
1161 double tp = f-> peak_thresh ;
1162
1163 int const xo = 1 ; /* x-stride */
1164 int const yo = w ; /* y-stride */
1165 int const so = w * h ; /* s-stride */
1166
1167 double xper = pow (2.0, f->o_cur) ;
1168
1169 int x, y, s, i, ii, jj ;
1170 vl_sift_pix *pt, v ;
1171 VlSiftKeypoint *k ;
1172
1173 /* clear current list */
1174 f-> nkeys = 0 ;
1175
1176 /* compute difference of gaussian (DoG) */
1177 pt = f-> dog ;
1178 for (s = s_min ; s <= s_max - 1 ; ++s) {
1179 vl_sift_pix* src_a = vl_sift_get_octave (f, s ) ;
1180 vl_sift_pix* src_b = vl_sift_get_octave (f, s + 1) ;
1181 vl_sift_pix* end_a = src_a + w * h ;
1182 while (src_a != end_a) {
1183 *pt++ = *src_b++ - *src_a++ ;
1184 }
1185 }
1186
1187 /* -----------------------------------------------------------------
1188 * Find local maxima of DoG
1189 * -------------------------------------------------------------- */
1190
1191 /* start from dog [1,1,s_min+1] */
1192 pt = dog + xo + yo + so ;
1193
1194 for(s = s_min + 1 ; s <= s_max - 2 ; ++s) {
1195 for(y = 1 ; y < h - 1 ; ++y) {
1196 for(x = 1 ; x < w - 1 ; ++x) {
1197 v = *pt ;
1198
1199 #define CHECK_NEIGHBORS(CMP,SGN) \
1200 ( v CMP ## = SGN 0.8 * tp && \
1201 v CMP *(pt + xo) && \
1202 v CMP *(pt - xo) && \
1203 v CMP *(pt + so) && \
1204 v CMP *(pt - so) && \
1205 v CMP *(pt + yo) && \
1206 v CMP *(pt - yo) && \
1207 \
1208 v CMP *(pt + yo + xo) && \
1209 v CMP *(pt + yo - xo) && \
1210 v CMP *(pt - yo + xo) && \
1211 v CMP *(pt - yo - xo) && \
1212 \
1213 v CMP *(pt + xo + so) && \
1214 v CMP *(pt - xo + so) && \
1215 v CMP *(pt + yo + so) && \
1216 v CMP *(pt - yo + so) && \
1217 v CMP *(pt + yo + xo + so) && \
1218 v CMP *(pt + yo - xo + so) && \
1219 v CMP *(pt - yo + xo + so) && \
1220 v CMP *(pt - yo - xo + so) && \
1221 \
1222 v CMP *(pt + xo - so) && \
1223 v CMP *(pt - xo - so) && \
1224 v CMP *(pt + yo - so) && \
1225 v CMP *(pt - yo - so) && \
1226 v CMP *(pt + yo + xo - so) && \
1227 v CMP *(pt + yo - xo - so) && \
1228 v CMP *(pt - yo + xo - so) && \
1229 v CMP *(pt - yo - xo - so) )
1230
1231 if (CHECK_NEIGHBORS(>,+) ||
1232 CHECK_NEIGHBORS(<,-) ) {
1233
1234 /* make room for more keypoints */
1235 if (f->nkeys >= f->keys_res) {
1236 f->keys_res += 500 ;
1237 if (f->keys) {
1238 f->keys = vl_realloc (f->keys,
1239 f->keys_res *
1240 sizeof(VlSiftKeypoint)) ;
1241 } else {
1242 f->keys = vl_malloc (f->keys_res *
1243 sizeof(VlSiftKeypoint)) ;
1244 }
1245 }
1246
1247 k = f->keys + (f->nkeys ++) ;
1248
1249 k-> ix = x ;
1250 k-> iy = y ;
1251 k-> is = s ;
1252 }
1253 pt += 1 ;
1254 }
1255 pt += 2 ;
1256 }
1257 pt += 2 * yo ;
1258 }
1259
1260 /* -----------------------------------------------------------------
1261 * Refine local maxima
1262 * -------------------------------------------------------------- */
1263
1264 /* this pointer is used to write the keypoints back */
1265 k = f->keys ;
1266
1267 for (i = 0 ; i < f->nkeys ; ++i) {
1268
1269 int x = f-> keys [i] .ix ;
1270 int y = f-> keys [i] .iy ;
1271 int s = f-> keys [i]. is ;
1272
1273 double Dx=0,Dy=0,Ds=0,Dxx=0,Dyy=0,Dss=0,Dxy=0,Dxs=0,Dys=0 ;
1274 double A [3*3], b [3] ;
1275
1276 int dx = 0 ;
1277 int dy = 0 ;
1278
1279 int iter, i, j ;
1280
1281 for (iter = 0 ; iter < 5 ; ++iter) {
1282
1283 x += dx ;
1284 y += dy ;
1285
1286 pt = dog
1287 + xo * x
1288 + yo * y
1289 + so * (s - s_min) ;
1290
1291 /** @brief Index GSS @internal */
1292 #define at(dx,dy,ds) (*( pt + (dx)*xo + (dy)*yo + (ds)*so))
1293
1294 /** @brief Index matrix A @internal */
1295 #define Aat(i,j) (A[(i)+(j)*3])
1296
1297 /* compute the gradient */
1298 Dx = 0.5 * (at(+1,0,0) - at(-1,0,0)) ;
1299 Dy = 0.5 * (at(0,+1,0) - at(0,-1,0));
1300 Ds = 0.5 * (at(0,0,+1) - at(0,0,-1)) ;
1301
1302 /* compute the Hessian */
1303 Dxx = (at(+1,0,0) + at(-1,0,0) - 2.0 * at(0,0,0)) ;
1304 Dyy = (at(0,+1,0) + at(0,-1,0) - 2.0 * at(0,0,0)) ;
1305 Dss = (at(0,0,+1) + at(0,0,-1) - 2.0 * at(0,0,0)) ;
1306
1307 Dxy = 0.25 * ( at(+1,+1,0) + at(-1,-1,0) - at(-1,+1,0) - at(+1,-1,0) ) ;
1308 Dxs = 0.25 * ( at(+1,0,+1) + at(-1,0,-1) - at(-1,0,+1) - at(+1,0,-1) ) ;
1309 Dys = 0.25 * ( at(0,+1,+1) + at(0,-1,-1) - at(0,-1,+1) - at(0,+1,-1) ) ;
1310
1311 /* solve linear system ....................................... */
1312 Aat(0,0) = Dxx ;
1313 Aat(1,1) = Dyy ;
1314 Aat(2,2) = Dss ;
1315 Aat(0,1) = Aat(1,0) = Dxy ;
1316 Aat(0,2) = Aat(2,0) = Dxs ;
1317 Aat(1,2) = Aat(2,1) = Dys ;
1318
1319 b[0] = - Dx ;
1320 b[1] = - Dy ;
1321 b[2] = - Ds ;
1322
1323 /* Gauss elimination */
1324 for(j = 0 ; j < 3 ; ++j) {
1325 double maxa = 0 ;
1326 double maxabsa = 0 ;
1327 int maxi = -1 ;
1328 double tmp ;
1329
1330 /* look for the maximally stable pivot */
1331 for (i = j ; i < 3 ; ++i) {
1332 double a = Aat (i,j) ;
1333 double absa = vl_abs_d (a) ;
1334 if (absa > maxabsa) {
1335 maxa = a ;
1336 maxabsa = absa ;
1337 maxi = i ;
1338 }
1339 }
1340
1341 /* if singular give up */
1342 if (maxabsa < 1e-10f) {
1343 b[0] = 0 ;
1344 b[1] = 0 ;
1345 b[2] = 0 ;
1346 break ;
1347 }
1348
1349 i = maxi ;
1350
1351 /* swap j-th row with i-th row and normalize j-th row */
1352 for(jj = j ; jj < 3 ; ++jj) {
1353 tmp = Aat(i,jj) ; Aat(i,jj) = Aat(j,jj) ; Aat(j,jj) = tmp ;
1354 Aat(j,jj) /= maxa ;
1355 }
1356 tmp = b[j] ; b[j] = b[i] ; b[i] = tmp ;
1357 b[j] /= maxa ;
1358
1359 /* elimination */
1360 for (ii = j+1 ; ii < 3 ; ++ii) {
1361 double x = Aat(ii,j) ;
1362 for (jj = j ; jj < 3 ; ++jj) {
1363 Aat(ii,jj) -= x * Aat(j,jj) ;
1364 }
1365 b[ii] -= x * b[j] ;
1366 }
1367 }
1368
1369 /* backward substitution */
1370 for (i = 2 ; i > 0 ; --i) {
1371 double x = b[i] ;
1372 for (ii = i-1 ; ii >= 0 ; --ii) {
1373 b[ii] -= x * Aat(ii,i) ;
1374 }
1375 }
1376
1377 /* .......................................................... */
1378 /* If the translation of the keypoint is big, move the keypoint
1379 * and re-iterate the computation. Otherwise we are all set.
1380 */
1381
1382 dx= ((b[0] > 0.6 && x < w - 2) ? 1 : 0)
1383 + ((b[0] < -0.6 && x > 1 ) ? -1 : 0) ;
1384
1385 dy= ((b[1] > 0.6 && y < h - 2) ? 1 : 0)
1386 + ((b[1] < -0.6 && y > 1 ) ? -1 : 0) ;
1387
1388 if (dx == 0 && dy == 0) break ;
1389 }
1390
1391 /* check threshold and other conditions */
1392 {
1393 double val = at(0,0,0)
1394 + 0.5 * (Dx * b[0] + Dy * b[1] + Ds * b[2]) ;
1395 double score = (Dxx+Dyy)*(Dxx+Dyy) / (Dxx*Dyy - Dxy*Dxy) ;
1396 double xn = x + b[0] ;
1397 double yn = y + b[1] ;
1398 double sn = s + b[2] ;
1399
1400 vl_bool good =
1401 vl_abs_d (val) > tp &&
1402 score < (te+1)*(te+1)/te &&
1403 score >= 0 &&
1404 vl_abs_d (b[0]) < 1.5 &&
1405 vl_abs_d (b[1]) < 1.5 &&
1406 vl_abs_d (b[2]) < 1.5 &&
1407 xn >= 0 &&
1408 xn <= w - 1 &&
1409 yn >= 0 &&
1410 yn <= h - 1 &&
1411 sn >= s_min &&
1412 sn <= s_max ;
1413
1414 if (good) {
1415 k-> o = f->o_cur ;
1416 k-> ix = x ;
1417 k-> iy = y ;
1418 k-> is = s ;
1419 k-> s = sn ;
1420 k-> x = xn * xper ;
1421 k-> y = yn * xper ;
1422 k-> sigma = f->sigma0 * pow (2.0, sn/f->S) * xper ;
1423 ++ k ;
1424 }
1425
1426 } /* done checking */
1427 } /* next keypoint to refine */
1428
1429 /* update keypoint count */
1430 f-> nkeys = (int)(k - f->keys) ;
1431 }
1432
1433
1434 /** ------------------------------------------------------------------
1435 ** @internal
1436 ** @brief Update gradients to current GSS octave
1437 **
1438 ** @param f SIFT filter.
1439 **
1440 ** The function makes sure that the gradient buffer is up-to-date
1441 ** with the current GSS data.
1442 **
1443 ** @remark The minimum octave size is 2x2xS.
1444 **/
1445
1446 static void
update_gradient(VlSiftFilt * f)1447 update_gradient (VlSiftFilt *f)
1448 {
1449 int s_min = f->s_min ;
1450 int s_max = f->s_max ;
1451 int w = vl_sift_get_octave_width (f) ;
1452 int h = vl_sift_get_octave_height (f) ;
1453 int const xo = 1 ;
1454 int const yo = w ;
1455 int const so = h * w ;
1456 int y, s ;
1457
1458 if (f->grad_o == f->o_cur) return ;
1459
1460 for (s = s_min + 1 ;
1461 s <= s_max - 2 ; ++ s) {
1462
1463 vl_sift_pix *src, *end, *grad, gx, gy ;
1464
1465 #define SAVE_BACK \
1466 *grad++ = vl_fast_sqrt_f (gx*gx + gy*gy) ; \
1467 *grad++ = vl_mod_2pi_f (vl_fast_atan2_f (gy, gx) + 2*VL_PI) ; \
1468 ++src ; \
1469
1470 src = vl_sift_get_octave (f,s) ;
1471 grad = f->grad + 2 * so * (s - s_min -1) ;
1472
1473 /* first pixel of the first row */
1474 gx = src[+xo] - src[0] ;
1475 gy = src[+yo] - src[0] ;
1476 SAVE_BACK ;
1477
1478 /* middle pixels of the first row */
1479 end = (src - 1) + w - 1 ;
1480 while (src < end) {
1481 gx = 0.5 * (src[+xo] - src[-xo]) ;
1482 gy = src[+yo] - src[0] ;
1483 SAVE_BACK ;
1484 }
1485
1486 /* last pixel of the first row */
1487 gx = src[0] - src[-xo] ;
1488 gy = src[+yo] - src[0] ;
1489 SAVE_BACK ;
1490
1491 for (y = 1 ; y < h -1 ; ++y) {
1492
1493 /* first pixel of the middle rows */
1494 gx = src[+xo] - src[0] ;
1495 gy = 0.5 * (src[+yo] - src[-yo]) ;
1496 SAVE_BACK ;
1497
1498 /* middle pixels of the middle rows */
1499 end = (src - 1) + w - 1 ;
1500 while (src < end) {
1501 gx = 0.5 * (src[+xo] - src[-xo]) ;
1502 gy = 0.5 * (src[+yo] - src[-yo]) ;
1503 SAVE_BACK ;
1504 }
1505
1506 /* last pixel of the middle row */
1507 gx = src[0] - src[-xo] ;
1508 gy = 0.5 * (src[+yo] - src[-yo]) ;
1509 SAVE_BACK ;
1510 }
1511
1512 /* first pixel of the last row */
1513 gx = src[+xo] - src[0] ;
1514 gy = src[ 0] - src[-yo] ;
1515 SAVE_BACK ;
1516
1517 /* middle pixels of the last row */
1518 end = (src - 1) + w - 1 ;
1519 while (src < end) {
1520 gx = 0.5 * (src[+xo] - src[-xo]) ;
1521 gy = src[0] - src[-yo] ;
1522 SAVE_BACK ;
1523 }
1524
1525 /* last pixel of the last row */
1526 gx = src[0] - src[-xo] ;
1527 gy = src[0] - src[-yo] ;
1528 SAVE_BACK ;
1529 }
1530 f->grad_o = f->o_cur ;
1531 }
1532
1533 /** ------------------------------------------------------------------
1534 ** @brief Calculate the keypoint orientation(s)
1535 **
1536 ** @param f SIFT filter.
1537 ** @param angles orientations (output).
1538 ** @param k keypoint.
1539 **
1540 ** The function computes the orientation(s) of the keypoint @a k.
1541 ** The function returns the number of orientations found (up to
1542 ** four). The orientations themselves are written to the vector @a
1543 ** angles.
1544 **
1545 ** @remark The function requires the keypoint octave @a k->o to be
1546 ** equal to the filter current octave ::vl_sift_get_octave. If this
1547 ** is not the case, the function returns zero orientations.
1548 **
1549 ** @remark The function requires the keypoint scale level @c k->s to
1550 ** be in the range @c s_min+1 and @c s_max-2 (where usually @c
1551 ** s_min=0 and @c s_max=S+2). If this is not the case, the function
1552 ** returns zero orientations.
1553 **
1554 ** @return number of orientations found.
1555 **/
1556
1557 VL_EXPORT
1558 int
vl_sift_calc_keypoint_orientations(VlSiftFilt * f,double angles[4],VlSiftKeypoint const * k)1559 vl_sift_calc_keypoint_orientations (VlSiftFilt *f,
1560 double angles [4],
1561 VlSiftKeypoint const *k)
1562 {
1563 double const winf = 1.5 ;
1564 double xper = pow (2.0, f->o_cur) ;
1565
1566 int w = f-> octave_width ;
1567 int h = f-> octave_height ;
1568 int const xo = 2 ; /* x-stride */
1569 int const yo = 2 * w ; /* y-stride */
1570 int const so = 2 * w * h ; /* s-stride */
1571 double x = k-> x / xper ;
1572 double y = k-> y / xper ;
1573 double sigma = k-> sigma / xper ;
1574
1575 int xi = (int) (x + 0.5) ;
1576 int yi = (int) (y + 0.5) ;
1577 int si = k-> is ;
1578
1579 double const sigmaw = winf * sigma ;
1580 int W = VL_MAX(floor (3.0 * sigmaw), 1) ;
1581
1582 int nangles= 0 ;
1583
1584 enum {nbins = 36} ;
1585
1586 double hist [nbins], maxh ;
1587 vl_sift_pix const * pt ;
1588 int xs, ys, iter, i ;
1589
1590 /* skip if the keypoint octave is not current */
1591 if(k->o != f->o_cur)
1592 return 0 ;
1593
1594 /* skip the keypoint if it is out of bounds */
1595 if(xi < 0 ||
1596 xi > w - 1 ||
1597 yi < 0 ||
1598 yi > h - 1 ||
1599 si < f->s_min + 1 ||
1600 si > f->s_max - 2 ) {
1601 return 0 ;
1602 }
1603
1604 /* make gradient up to date */
1605 update_gradient (f) ;
1606
1607 /* clear histogram */
1608 memset (hist, 0, sizeof(double) * nbins) ;
1609
1610 /* compute orientation histogram */
1611 pt = f-> grad + xo*xi + yo*yi + so*(si - f->s_min - 1) ;
1612
1613 #undef at
1614 #define at(dx,dy) (*(pt + xo * (dx) + yo * (dy)))
1615
1616 for(ys = VL_MAX (- W, - yi) ;
1617 ys <= VL_MIN (+ W, h - 1 - yi) ; ++ys) {
1618
1619 for(xs = VL_MAX (- W, - xi) ;
1620 xs <= VL_MIN (+ W, w - 1 - xi) ; ++xs) {
1621
1622
1623 double dx = (double)(xi + xs) - x;
1624 double dy = (double)(yi + ys) - y;
1625 double r2 = dx*dx + dy*dy ;
1626 double wgt, mod, ang, fbin ;
1627
1628 /* limit to a circular window */
1629 if (r2 >= W*W + 0.6) continue ;
1630
1631 wgt = fast_expn (r2 / (2*sigmaw*sigmaw)) ;
1632 mod = *(pt + xs*xo + ys*yo ) ;
1633 ang = *(pt + xs*xo + ys*yo + 1) ;
1634 fbin = nbins * ang / (2 * VL_PI) ;
1635
1636 #if defined(VL_SIFT_BILINEAR_ORIENTATIONS)
1637 {
1638 int bin = (int) vl_floor_d (fbin - 0.5) ;
1639 double rbin = fbin - bin - 0.5 ;
1640 hist [(bin + nbins) % nbins] += (1 - rbin) * mod * wgt ;
1641 hist [(bin + 1 ) % nbins] += ( rbin) * mod * wgt ;
1642 }
1643 #else
1644 {
1645 int bin = vl_floor_d (fbin) ;
1646 bin = vl_floor_d (nbins * ang / (2*VL_PI)) ;
1647 hist [(bin) % nbins] += mod * wgt ;
1648 }
1649 #endif
1650
1651 } /* for xs */
1652 } /* for ys */
1653
1654 /* smooth histogram */
1655 for (iter = 0; iter < 6; iter ++) {
1656 double prev = hist [nbins - 1] ;
1657 double first = hist [0] ;
1658 int i ;
1659 for (i = 0; i < nbins - 1; i++) {
1660 double newh = (prev + hist[i] + hist[(i+1) % nbins]) / 3.0;
1661 prev = hist[i] ;
1662 hist[i] = newh ;
1663 }
1664 hist[i] = (prev + hist[i] + first) / 3.0 ;
1665 }
1666
1667 /* find the histogram maximum */
1668 maxh = 0 ;
1669 for (i = 0 ; i < nbins ; ++i)
1670 maxh = VL_MAX (maxh, hist [i]) ;
1671
1672 /* find peaks within 80% from max */
1673 nangles = 0 ;
1674 for(i = 0 ; i < nbins ; ++i) {
1675 double h0 = hist [i] ;
1676 double hm = hist [(i - 1 + nbins) % nbins] ;
1677 double hp = hist [(i + 1 + nbins) % nbins] ;
1678
1679 /* is this a peak? */
1680 if (h0 > 0.8*maxh && h0 > hm && h0 > hp) {
1681
1682 /* quadratic interpolation */
1683 double di = - 0.5 * (hp - hm) / (hp + hm - 2 * h0) ;
1684 double th = 2 * VL_PI * (i + di + 0.5) / nbins ;
1685 angles [ nangles++ ] = th ;
1686 if( nangles == 4 )
1687 goto enough_angles ;
1688 }
1689 }
1690 enough_angles:
1691 return nangles ;
1692 }
1693
1694
1695 /** ------------------------------------------------------------------
1696 ** @internal
1697 ** @brief Normalizes in norm L_2 a descriptor
1698 ** @param begin begin of histogram.
1699 ** @param end end of histogram.
1700 **/
1701
1702 VL_INLINE vl_sift_pix
normalize_histogram(vl_sift_pix * begin,vl_sift_pix * end)1703 normalize_histogram
1704 (vl_sift_pix *begin, vl_sift_pix *end)
1705 {
1706 vl_sift_pix* iter ;
1707 vl_sift_pix norm = 0.0 ;
1708
1709 for (iter = begin ; iter != end ; ++ iter)
1710 norm += (*iter) * (*iter) ;
1711
1712 norm = vl_fast_sqrt_f (norm) + VL_EPSILON_F ;
1713
1714 for (iter = begin; iter != end ; ++ iter)
1715 *iter /= norm ;
1716
1717 return norm;
1718 }
1719
1720 /** ------------------------------------------------------------------
1721 ** @brief Run the SIFT descriptor on raw data
1722 **
1723 ** @param f SIFT filter.
1724 ** @param grad image gradients.
1725 ** @param descr SIFT descriptor (output).
1726 ** @param width image width.
1727 ** @param height image height.
1728 ** @param x keypoint x coordinate.
1729 ** @param y keypoint y coordinate.
1730 ** @param sigma keypoint scale.
1731 ** @param angle0 keypoint orientation.
1732 **
1733 ** The function runs the SIFT descriptor on raw data. Here @a image
1734 ** is a 2 x @a width x @a height array (by convention, the memory
1735 ** layout is a s such the first index is the fastest varying
1736 ** one). The first @a width x @a height layer of the array contains
1737 ** the gradient magnitude and the second the gradient angle (in
1738 ** radians, between 0 and @f$ 2\pi @f$). @a x, @a y and @a sigma give
1739 ** the keypoint center and scale respectively.
1740 **
1741 ** In order to be equivalent to a standard SIFT descriptor the image
1742 ** gradient must be computed at a smoothing level equal to the scale
1743 ** of the keypoint. In practice, the actual SIFT algorithm makes the
1744 ** following additional approximation, which influence the result:
1745 **
1746 ** - Scale is discretized in @c S levels.
1747 ** - The image is downsampled once for each octave (if you do this,
1748 ** the parameters @a x, @a y and @a sigma must be
1749 ** scaled too).
1750 **/
1751
1752 VL_EXPORT
1753 void
vl_sift_calc_raw_descriptor(VlSiftFilt const * f,vl_sift_pix const * grad,vl_sift_pix * descr,int width,int height,double x,double y,double sigma,double angle0)1754 vl_sift_calc_raw_descriptor (VlSiftFilt const *f,
1755 vl_sift_pix const* grad,
1756 vl_sift_pix *descr,
1757 int width, int height,
1758 double x, double y,
1759 double sigma,
1760 double angle0)
1761 {
1762 double const magnif = f-> magnif ;
1763
1764 int w = width ;
1765 int h = height ;
1766 int const xo = 2 ; /* x-stride */
1767 int const yo = 2 * w ; /* y-stride */
1768
1769 int xi = (int) (x + 0.5) ;
1770 int yi = (int) (y + 0.5) ;
1771
1772 double const st0 = sin (angle0) ;
1773 double const ct0 = cos (angle0) ;
1774 double const SBP = magnif * sigma + VL_EPSILON_D ;
1775 int const W = floor
1776 (sqrt(2.0) * SBP * (NBP + 1) / 2.0 + 0.5) ;
1777
1778 int const binto = 1 ; /* bin theta-stride */
1779 int const binyo = NBO * NBP ; /* bin y-stride */
1780 int const binxo = NBO ; /* bin x-stride */
1781
1782 int bin, dxi, dyi ;
1783 vl_sift_pix const *pt ;
1784 vl_sift_pix *dpt ;
1785
1786 /* check bounds */
1787 if(xi < 0 ||
1788 xi >= w ||
1789 yi < 0 ||
1790 yi >= h - 1 )
1791 return ;
1792
1793 /* clear descriptor */
1794 memset (descr, 0, sizeof(vl_sift_pix) * NBO*NBP*NBP) ;
1795
1796 /* Center the scale space and the descriptor on the current keypoint.
1797 * Note that dpt is pointing to the bin of center (SBP/2,SBP/2,0).
1798 */
1799 pt = grad + xi*xo + yi*yo ;
1800 dpt = descr + (NBP/2) * binyo + (NBP/2) * binxo ;
1801
1802 #undef atd
1803 #define atd(dbinx,dbiny,dbint) *(dpt + (dbint)*binto + (dbiny)*binyo + (dbinx)*binxo)
1804
1805 /*
1806 * Process pixels in the intersection of the image rectangle
1807 * (1,1)-(M-1,N-1) and the keypoint bounding box.
1808 */
1809 for(dyi = VL_MAX(- W, - yi ) ;
1810 dyi <= VL_MIN(+ W, h - yi -1) ; ++ dyi) {
1811
1812 for(dxi = VL_MAX(- W, - xi ) ;
1813 dxi <= VL_MIN(+ W, w - xi -1) ; ++ dxi) {
1814
1815 /* retrieve */
1816 vl_sift_pix mod = *( pt + dxi*xo + dyi*yo + 0 ) ;
1817 vl_sift_pix angle = *( pt + dxi*xo + dyi*yo + 1 ) ;
1818 vl_sift_pix theta = vl_mod_2pi_f (angle - angle0) ;
1819
1820 /* fractional displacement */
1821 vl_sift_pix dx = xi + dxi - x;
1822 vl_sift_pix dy = yi + dyi - y;
1823
1824 /* get the displacement normalized w.r.t. the keypoint
1825 orientation and extension */
1826 vl_sift_pix nx = ( ct0 * dx + st0 * dy) / SBP ;
1827 vl_sift_pix ny = (-st0 * dx + ct0 * dy) / SBP ;
1828 vl_sift_pix nt = NBO * theta / (2 * VL_PI) ;
1829
1830 /* Get the Gaussian weight of the sample. The Gaussian window
1831 * has a standard deviation equal to NBP/2. Note that dx and dy
1832 * are in the normalized frame, so that -NBP/2 <= dx <=
1833 * NBP/2. */
1834 vl_sift_pix const wsigma = f->windowSize ;
1835 vl_sift_pix win = fast_expn
1836 ((nx*nx + ny*ny)/(2.0 * wsigma * wsigma)) ;
1837
1838 /* The sample will be distributed in 8 adjacent bins.
1839 We start from the ``lower-left'' bin. */
1840 int binx = (int)vl_floor_f (nx - 0.5) ;
1841 int biny = (int)vl_floor_f (ny - 0.5) ;
1842 int bint = (int)vl_floor_f (nt) ;
1843 vl_sift_pix rbinx = nx - (binx + 0.5) ;
1844 vl_sift_pix rbiny = ny - (biny + 0.5) ;
1845 vl_sift_pix rbint = nt - bint ;
1846 int dbinx ;
1847 int dbiny ;
1848 int dbint ;
1849
1850 /* Distribute the current sample into the 8 adjacent bins*/
1851 for(dbinx = 0 ; dbinx < 2 ; ++dbinx) {
1852 for(dbiny = 0 ; dbiny < 2 ; ++dbiny) {
1853 for(dbint = 0 ; dbint < 2 ; ++dbint) {
1854
1855 if (binx + dbinx >= - (NBP/2) &&
1856 binx + dbinx < (NBP/2) &&
1857 biny + dbiny >= - (NBP/2) &&
1858 biny + dbiny < (NBP/2) ) {
1859 vl_sift_pix weight = win
1860 * mod
1861 * vl_abs_f (1 - dbinx - rbinx)
1862 * vl_abs_f (1 - dbiny - rbiny)
1863 * vl_abs_f (1 - dbint - rbint) ;
1864
1865 atd(binx+dbinx, biny+dbiny, (bint+dbint) % NBO) += weight ;
1866 }
1867 }
1868 }
1869 }
1870 }
1871 }
1872
1873 /* Standard SIFT descriptors are normalized, truncated and normalized again */
1874 if(1) {
1875
1876 /* normalize L2 norm */
1877 vl_sift_pix norm = normalize_histogram (descr, descr + NBO*NBP*NBP) ;
1878
1879 /*
1880 Set the descriptor to zero if it is lower than our
1881 norm_threshold. We divide by the number of samples in the
1882 descriptor region because the Gaussian window used in the
1883 calculation of the descriptor is not normalized.
1884 */
1885 int numSamples =
1886 (VL_MIN(W, w - xi -1) - VL_MAX(-W, - xi) + 1) *
1887 (VL_MIN(W, h - yi -1) - VL_MAX(-W, - yi) + 1) ;
1888
1889 if(f-> norm_thresh && norm < f-> norm_thresh * numSamples) {
1890 for(bin = 0; bin < NBO*NBP*NBP ; ++ bin)
1891 descr [bin] = 0;
1892 }
1893 else {
1894 /* truncate at 0.2. */
1895 for(bin = 0; bin < NBO*NBP*NBP ; ++ bin) {
1896 if (descr [bin] > 0.2) descr [bin] = 0.2;
1897 }
1898
1899 /* normalize again. */
1900 normalize_histogram (descr, descr + NBO*NBP*NBP) ;
1901 }
1902 }
1903 }
1904
1905 /** ------------------------------------------------------------------
1906 ** @brief Compute the descriptor of a keypoint
1907 **
1908 ** @param f SIFT filter.
1909 ** @param descr SIFT descriptor (output)
1910 ** @param k keypoint.
1911 ** @param angle0 keypoint direction.
1912 **
1913 ** The function computes the SIFT descriptor of the keypoint @a k of
1914 ** orientation @a angle0. The function fills the buffer @a descr
1915 ** which must be large enough to hold the descriptor.
1916 **
1917 ** The function assumes that the keypoint is on the current octave.
1918 ** If not, it does not do anything.
1919 **/
1920
1921 VL_EXPORT
1922 void
vl_sift_calc_keypoint_descriptor(VlSiftFilt * f,vl_sift_pix * descr,VlSiftKeypoint const * k,double angle0)1923 vl_sift_calc_keypoint_descriptor (VlSiftFilt *f,
1924 vl_sift_pix *descr,
1925 VlSiftKeypoint const* k,
1926 double angle0)
1927 {
1928 /*
1929 The SIFT descriptor is a three dimensional histogram of the
1930 position and orientation of the gradient. There are NBP bins for
1931 each spatial dimension and NBO bins for the orientation dimension,
1932 for a total of NBP x NBP x NBO bins.
1933
1934 The support of each spatial bin has an extension of SBP = 3sigma
1935 pixels, where sigma is the scale of the keypoint. Thus all the
1936 bins together have a support SBP x NBP pixels wide. Since
1937 weighting and interpolation of pixel is used, the support extends
1938 by another half bin. Therefore, the support is a square window of
1939 SBP x (NBP + 1) pixels. Finally, since the patch can be
1940 arbitrarily rotated, we need to consider a window 2W += sqrt(2) x
1941 SBP x (NBP + 1) pixels wide.
1942 */
1943
1944 double const magnif = f-> magnif ;
1945
1946 double xper = pow (2.0, f->o_cur) ;
1947
1948 int w = f-> octave_width ;
1949 int h = f-> octave_height ;
1950 int const xo = 2 ; /* x-stride */
1951 int const yo = 2 * w ; /* y-stride */
1952 int const so = 2 * w * h ; /* s-stride */
1953 double x = k-> x / xper ;
1954 double y = k-> y / xper ;
1955 double sigma = k-> sigma / xper ;
1956
1957 int xi = (int) (x + 0.5) ;
1958 int yi = (int) (y + 0.5) ;
1959 int si = k-> is ;
1960
1961 double const st0 = sin (angle0) ;
1962 double const ct0 = cos (angle0) ;
1963 double const SBP = magnif * sigma + VL_EPSILON_D ;
1964 int const W = floor
1965 (sqrt(2.0) * SBP * (NBP + 1) / 2.0 + 0.5) ;
1966
1967 int const binto = 1 ; /* bin theta-stride */
1968 int const binyo = NBO * NBP ; /* bin y-stride */
1969 int const binxo = NBO ; /* bin x-stride */
1970
1971 int bin, dxi, dyi ;
1972 vl_sift_pix const *pt ;
1973 vl_sift_pix *dpt ;
1974
1975 /* check bounds */
1976 if(k->o != f->o_cur ||
1977 xi < 0 ||
1978 xi >= w ||
1979 yi < 0 ||
1980 yi >= h - 1 ||
1981 si < f->s_min + 1 ||
1982 si > f->s_max - 2 )
1983 return ;
1984
1985 /* synchronize gradient buffer */
1986 update_gradient (f) ;
1987
1988 /* VL_PRINTF("W = %d ; magnif = %g ; SBP = %g\n", W,magnif,SBP) ; */
1989
1990 /* clear descriptor */
1991 memset (descr, 0, sizeof(vl_sift_pix) * NBO*NBP*NBP) ;
1992
1993 /* Center the scale space and the descriptor on the current keypoint.
1994 * Note that dpt is pointing to the bin of center (SBP/2,SBP/2,0).
1995 */
1996 pt = f->grad + xi*xo + yi*yo + (si - f->s_min - 1)*so ;
1997 dpt = descr + (NBP/2) * binyo + (NBP/2) * binxo ;
1998
1999 #undef atd
2000 #define atd(dbinx,dbiny,dbint) *(dpt + (dbint)*binto + (dbiny)*binyo + (dbinx)*binxo)
2001
2002 /*
2003 * Process pixels in the intersection of the image rectangle
2004 * (1,1)-(M-1,N-1) and the keypoint bounding box.
2005 */
2006 for(dyi = VL_MAX (- W, 1 - yi ) ;
2007 dyi <= VL_MIN (+ W, h - yi - 2) ; ++ dyi) {
2008
2009 for(dxi = VL_MAX (- W, 1 - xi ) ;
2010 dxi <= VL_MIN (+ W, w - xi - 2) ; ++ dxi) {
2011
2012 /* retrieve */
2013 vl_sift_pix mod = *( pt + dxi*xo + dyi*yo + 0 ) ;
2014 vl_sift_pix angle = *( pt + dxi*xo + dyi*yo + 1 ) ;
2015 vl_sift_pix theta = vl_mod_2pi_f (angle - angle0) ;
2016
2017 /* fractional displacement */
2018 vl_sift_pix dx = xi + dxi - x;
2019 vl_sift_pix dy = yi + dyi - y;
2020
2021 /* get the displacement normalized w.r.t. the keypoint
2022 orientation and extension */
2023 vl_sift_pix nx = ( ct0 * dx + st0 * dy) / SBP ;
2024 vl_sift_pix ny = (-st0 * dx + ct0 * dy) / SBP ;
2025 vl_sift_pix nt = NBO * theta / (2 * VL_PI) ;
2026
2027 /* Get the Gaussian weight of the sample. The Gaussian window
2028 * has a standard deviation equal to NBP/2. Note that dx and dy
2029 * are in the normalized frame, so that -NBP/2 <= dx <=
2030 * NBP/2. */
2031 vl_sift_pix const wsigma = f->windowSize ;
2032 vl_sift_pix win = fast_expn
2033 ((nx*nx + ny*ny)/(2.0 * wsigma * wsigma)) ;
2034
2035 /* The sample will be distributed in 8 adjacent bins.
2036 We start from the ``lower-left'' bin. */
2037 int binx = (int)vl_floor_f (nx - 0.5) ;
2038 int biny = (int)vl_floor_f (ny - 0.5) ;
2039 int bint = (int)vl_floor_f (nt) ;
2040 vl_sift_pix rbinx = nx - (binx + 0.5) ;
2041 vl_sift_pix rbiny = ny - (biny + 0.5) ;
2042 vl_sift_pix rbint = nt - bint ;
2043 int dbinx ;
2044 int dbiny ;
2045 int dbint ;
2046
2047 /* Distribute the current sample into the 8 adjacent bins*/
2048 for(dbinx = 0 ; dbinx < 2 ; ++dbinx) {
2049 for(dbiny = 0 ; dbiny < 2 ; ++dbiny) {
2050 for(dbint = 0 ; dbint < 2 ; ++dbint) {
2051
2052 if (binx + dbinx >= - (NBP/2) &&
2053 binx + dbinx < (NBP/2) &&
2054 biny + dbiny >= - (NBP/2) &&
2055 biny + dbiny < (NBP/2) ) {
2056 vl_sift_pix weight = win
2057 * mod
2058 * vl_abs_f (1 - dbinx - rbinx)
2059 * vl_abs_f (1 - dbiny - rbiny)
2060 * vl_abs_f (1 - dbint - rbint) ;
2061
2062 atd(binx+dbinx, biny+dbiny, (bint+dbint) % NBO) += weight ;
2063 }
2064 }
2065 }
2066 }
2067 }
2068 }
2069
2070 /* Standard SIFT descriptors are normalized, truncated and normalized again */
2071 if(1) {
2072
2073 /* Normalize the histogram to L2 unit length. */
2074 vl_sift_pix norm = normalize_histogram (descr, descr + NBO*NBP*NBP) ;
2075
2076 /* Set the descriptor to zero if it is lower than our norm_threshold */
2077 if(f-> norm_thresh && norm < f-> norm_thresh) {
2078 for(bin = 0; bin < NBO*NBP*NBP ; ++ bin)
2079 descr [bin] = 0;
2080 }
2081 else {
2082
2083 /* Truncate at 0.2. */
2084 for(bin = 0; bin < NBO*NBP*NBP ; ++ bin) {
2085 if (descr [bin] > 0.2) descr [bin] = 0.2;
2086 }
2087
2088 /* Normalize again. */
2089 normalize_histogram (descr, descr + NBO*NBP*NBP) ;
2090 }
2091 }
2092
2093 }
2094
2095 /** ------------------------------------------------------------------
2096 ** @brief Initialize a keypoint from its position and scale
2097 **
2098 ** @param f SIFT filter.
2099 ** @param k SIFT keypoint (output).
2100 ** @param x x coordinate of the keypoint center.
2101 ** @param y y coordinate of the keypoint center.
2102 ** @param sigma keypoint scale.
2103 **
2104 ** The function initializes a keypoint structure @a k from
2105 ** the location @a x
2106 ** and @a y and the scale @a sigma of the keypoint. The keypoint structure
2107 ** maps the keypoint to an octave and scale level of the discretized
2108 ** Gaussian scale space, which is required for instance to compute the
2109 ** keypoint SIFT descriptor.
2110 **
2111 ** @par Algorithm
2112 **
2113 ** The formula linking the keypoint scale sigma to the octave and
2114 ** scale indexes is
2115 **
2116 ** @f[ \sigma(o,s) = \sigma_0 2^{o+s/S} @f]
2117 **
2118 ** In addition to the scale index @e s (which can be fractional due
2119 ** to scale interpolation) a keypoint has an integer scale index @e
2120 ** is too (which is the index of the scale level where it was
2121 ** detected in the DoG scale space). We have the constraints (@ref
2122 ** sift-tech-detector see also the "SIFT detector"):
2123 **
2124 ** - @e o is integer in the range @f$ [o_\mathrm{min},
2125 ** o_{\mathrm{min}}+O-1] @f$.
2126 ** - @e is is integer in the range @f$ [s_\mathrm{min}+1,
2127 ** s_\mathrm{max}-2] @f$. This depends on how the scale is
2128 ** determined during detection, and must be so here because
2129 ** gradients are computed only for this range of scale levels
2130 ** and are required for the calculation of the SIFT descriptor.
2131 ** - @f$ |s - is| < 0.5 @f$ for detected keypoints in most cases due
2132 ** to the interpolation technique used during detection. However
2133 ** this is not necessary.
2134 **
2135 ** Thus octave o represents scales @f$ \{ \sigma(o, s) : s \in
2136 ** [s_\mathrm{min}+1-.5, s_\mathrm{max}-2+.5] \} @f$. Note that some
2137 ** scales may be represented more than once. For each scale, we
2138 ** select the largest possible octave that contains it, i.e.
2139 **
2140 ** @f[
2141 ** o(\sigma)
2142 ** = \max \{ o \in \mathbb{Z} :
2143 ** \sigma_0 2^{\frac{s_\mathrm{min}+1-.5}{S}} \leq \sigma \}
2144 ** = \mathrm{floor}\,\left[
2145 ** \log_2(\sigma / \sigma_0) - \frac{s_\mathrm{min}+1-.5}{S}\right]
2146 ** @f]
2147 **
2148 ** and then
2149 **
2150 ** @f[
2151 ** s(\sigma) = S \left[\log_2(\sigma / \sigma_0) - o(\sigma)\right],
2152 ** \quad
2153 ** is(\sigma) = \mathrm{round}\,(s(\sigma))
2154 ** @f]
2155 **
2156 ** In practice, both @f$ o(\sigma) @f$ and @f$ is(\sigma) @f$ are
2157 ** clamped to their feasible range as determined by the SIFT filter
2158 ** parameters.
2159 **/
2160
2161 VL_EXPORT
2162 void
vl_sift_keypoint_init(VlSiftFilt const * f,VlSiftKeypoint * k,double x,double y,double sigma)2163 vl_sift_keypoint_init (VlSiftFilt const *f,
2164 VlSiftKeypoint *k,
2165 double x,
2166 double y,
2167 double sigma)
2168 {
2169 int o, ix, iy, is ;
2170 double s, phi, xper ;
2171
2172 phi = log2 ((sigma + VL_EPSILON_D) / f->sigma0) ;
2173 o = (int)vl_floor_d (phi - ((double) f->s_min + 0.5) / f->S) ;
2174 o = VL_MIN (o, f->o_min + f->O - 1) ;
2175 o = VL_MAX (o, f->o_min ) ;
2176 s = f->S * (phi - o) ;
2177
2178 is = (int)(s + 0.5) ;
2179 is = VL_MIN(is, f->s_max - 2) ;
2180 is = VL_MAX(is, f->s_min + 1) ;
2181
2182 xper = pow (2.0, o) ;
2183 ix = (int)(x / xper + 0.5) ;
2184 iy = (int)(y / xper + 0.5) ;
2185
2186 k -> o = o ;
2187
2188 k -> ix = ix ;
2189 k -> iy = iy ;
2190 k -> is = is ;
2191
2192 k -> x = x ;
2193 k -> y = y ;
2194 k -> s = s ;
2195
2196 k->sigma = sigma ;
2197 }
2198