1 /*
2  * Software License Agreement (Simplified BSD License)
3  *
4  * Point Cloud Library (PCL) - www.pointclouds.org
5  * Copyright (c) 2013-, Open Perception, Inc.
6  * Copyright (c) 2012, Piotr Dollar & Ron Appel.  [pdollar-at-caltech.edu]
7  *
8  * All rights reserved.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions are met:
12  *
13  * 1. Redistributions of source code must retain the above copyright notice, this
14  *    list of conditions and the following disclaimer.
15  *
16  * 2. Redistributions in binary form must reproduce the above copyright notice,
17  *    this list of conditions and the following disclaimer in the documentation
18  *    and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
22  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
24  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
25  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
26  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
27  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
29  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30  *
31  * The views and conclusions contained in the software and documentation are those
32  * of the authors and should not be interpreted as representing official policies,
33  * either expressed or implied, of the FreeBSD Project.
34  *
35  * Derived from Piotr Dollar's MATLAB Image&Video Toolbox      Version 3.00.
36  *
37  * hog.cpp
38  * Created on: Nov 30, 2012
39  * Authors: Matteo Munaro, Stefano Ghidoni, Stefano Michieletto
40  */
41 
42 #include <pcl/people/hog.h>
43 
44 #include <cstring> // for memcpy
45 #include <algorithm> // for std::min
46 
47 #if defined(__SSE2__)
48   #include <pcl/sse.h> // sse methods
49 #else
50   #include <cstdlib>
51 #endif
52 
53 /** \brief Constructor. */
HOG()54 pcl::people::HOG::HOG ()
55 {
56   // set default values for optional parameters:
57   h_ = 128;
58   w_ = 64;
59   n_channels_ = 3;
60   bin_size_ = 8;
61   n_orients_ = 9;
62   soft_bin_ = true;
63   clip_ = 0.2;
64 }
65 
66 /** \brief Destructor. */
~HOG()67 pcl::people::HOG::~HOG () {}
68 
69 void
gradMag(float * I,int h,int w,int d,float * M,float * O) const70 pcl::people::HOG::gradMag( float *I, int h, int w, int d, float *M, float *O ) const
71 {
72 #if defined(__SSE2__)
73   int x, y, y1, c, h4, s; float *Gx, *Gy, *M2; __m128 *_Gx, *_Gy, *_M2, _m;
74   float *acost = acosTable(), acMult=25000/2.02f;
75 
76   // allocate memory for storing one column of output (padded so h4%4==0)
77   h4=(h%4==0) ? h : h-(h%4)+4; s=d*h4*sizeof(float);
78 
79   M2=(float*) alMalloc(s,16);
80   _M2=(__m128*) M2;
81   Gx=(float*) alMalloc(s,16); _Gx=(__m128*) Gx;
82   Gy=(float*) alMalloc(s,16); _Gy=(__m128*) Gy;
83 
84   // compute gradient magnitude and orientation for each column
85   for( x=0; x<w; x++ ) {
86   // compute gradients (Gx, Gy) and squared magnitude (M2) for each channel
87   for( c=0; c<d; c++ ) grad1( I+x*h+c*w*h, Gx+c*h4, Gy+c*h4, h, w, x );
88   for( y=0; y<d*h4/4; y++ ) _M2[y]=pcl::sse_add(pcl::sse_mul(_Gx[y],_Gx[y]),pcl::sse_mul(_Gy[y],_Gy[y]));
89   // store gradients with maximum response in the first channel
90   for(c=1; c<d; c++) {
91     for( y=0; y<h4/4; y++ ) {
92     y1=h4/4*c+y; _m = pcl::sse_cmpgt( _M2[y1], _M2[y] );
93     _M2[y] = pcl::sse_or( pcl::sse_and(_m,_M2[y1]), pcl::sse_andnot(_m,_M2[y]) );
94     _Gx[y] = pcl::sse_or( pcl::sse_and(_m,_Gx[y1]), pcl::sse_andnot(_m,_Gx[y]) );
95     _Gy[y] = pcl::sse_or( pcl::sse_and(_m,_Gy[y1]), pcl::sse_andnot(_m,_Gy[y]) );
96     }
97   }
98   // compute gradient magnitude (M) and normalize Gx
99   for( y=0; y<h4/4; y++ ) {
100     _m = pcl::sse_min( pcl::sse_rcpsqrt(_M2[y]), pcl::sse_set(1e10f) );
101     _M2[y] = pcl::sse_rcp(_m);
102     _Gx[y] = pcl::sse_mul( pcl::sse_mul(_Gx[y],_m), pcl::sse_set(acMult) );
103     _Gx[y] = pcl::sse_xor( _Gx[y], pcl::sse_and(_Gy[y], pcl::sse_set(-0.f)) );
104   };
105 
106   memcpy( M+x*h, M2, h*sizeof(float) );
107   // compute and store gradient orientation (O) via table lookup
108   if(O!=nullptr) for( y=0; y<h; y++ ) O[x*h+y] = acost[(int)Gx[y]];
109   }
110   alFree(Gx); alFree(Gy); alFree(M2);
111 #else
112   int x, y, y1, c, h4, s; float *Gx, *Gy, *M2;
113   float *acost = acosTable(), acMult=25000/2.02f;
114 
115   // allocate memory for storing one column of output (padded so h4%4==0)
116   h4=(h%4==0) ? h : h-(h%4)+4; s=d*h4*sizeof(float);
117 
118   M2=(float*) alMalloc(s,16);
119   Gx=(float*) alMalloc(s,16);
120   Gy=(float*) alMalloc(s,16);
121   float m;
122 
123   // compute gradient magnitude and orientation for each column
124   for( x=0; x<w; x++ ) {
125   // compute gradients (Gx, Gy) and squared magnitude (M2) for each channel
126   for( c=0; c<d; c++ ) grad1( I+x*h+c*w*h, Gx+c*h4, Gy+c*h4, h, w, x );
127   for( y=0; y<d*h4; y++ )
128   {
129     M2[y] = Gx[y] * Gx[y] + Gy[y] * Gy[y];
130   }
131 
132   // store gradients with maximum response in the first channel
133   for(c=1; c<d; c++)
134   {
135   for( y=0; y<h4/4; y++ )
136     {
137       y1=h4/4*c+y;
138 
139     for (int ii = 0; ii < 4; ++ii)
140     {
141       if (M2[y1 * 4 + ii] > M2[y * 4 + ii])
142       {
143         M2[y * 4 + ii] = M2[y1 * 4 + ii];
144         Gx[y * 4 + ii] = Gx[y1 * 4 + ii];
145         Gy[y * 4 + ii] = Gy[y1 * 4 + ii];
146       }
147     }
148     }
149   }
150   // compute gradient magnitude (M) and normalize Gx
151   for( y=0; y<h4; y++ )
152   {
153   m = 1.0f / ::sqrt (M2[y]);
154   m = m < 1e10f ? m : 1e10f;
155     M2[y] = 1.0f / m;
156     Gx[y] = ((Gx[y] * m) * acMult);
157     if (Gy[y] < 0)
158     Gx[y] = -Gx[y];
159   }
160 
161   memcpy( M+x*h, M2, h*sizeof(float) );
162   // compute and store gradient orientation (O) via table lookup
163   if(O!=0) for( y=0; y<h; y++ ) O[x*h+y] = acost[(int)Gx[y]];
164   }
165   alFree(Gx); alFree(Gy); alFree(M2);
166 #endif
167 }
168 
169 void
gradHist(float * M,float * O,int h,int w,int bin_size,int n_orients,bool soft_bin,float * H) const170 pcl::people::HOG::gradHist( float *M, float *O, int h, int w, int bin_size, int n_orients, bool soft_bin, float *H ) const
171 {
172   const int hb=h/bin_size, wb=w/bin_size, h0=hb*bin_size, w0=wb*bin_size, nb=wb*hb;
173   const float s=(float)bin_size, sInv=1/s, sInv2=1/s/s;
174   float *H0, *H1, *M0, *M1; int *O0, *O1;
175   O0=(int*)alMalloc(h*sizeof(int),16); M0=(float*) alMalloc(h*sizeof(float),16);
176   O1=(int*)alMalloc(h*sizeof(int),16); M1=(float*) alMalloc(h*sizeof(float),16);
177 
178   // main loop
179   float xb = 0;
180   float init = 0;
181   for(int x=0; x<w0; x++ ) {
182     // compute target orientation bins for entire column - very fast
183     gradQuantize( O+x*h, M+x*h, O0, O1, M0, M1, n_orients, nb, h0, sInv2 );
184 
185     if( !soft_bin || bin_size==1 ) {
186       // interpolate w.r.t. orientation only, not spatial bin_size
187       H1 = H + (x / bin_size) * hb;
188 
189       const auto GH = [&H1, &O0, &O1, &M0, &M1](int y) {
190         H1[O0[y]] += M0[y];
191         H1[O1[y]] += M1[y];
192       };
193 
194       for (int y = 0; y < h0;) {
195         for (int inner_loop = 0; inner_loop < bin_size; ++inner_loop, ++y) {
196           GH(y);
197         }
198         H1++;
199       }
200     } else {
201       // interpolate using trilinear interpolation
202 #if defined(__SSE2__)
203       float ms[4], xyd, yb, xd, yd; __m128 _m, _m0, _m1;
204       bool hasLf, hasRt; int xb0, yb0;
205       if( x==0 ) { init=(0+.5f)*sInv-0.5f; xb=init; }
206       hasLf = xb>=0; xb0 = hasLf?(int)xb:-1; hasRt = xb0 < wb-1;
207       xd=xb-xb0; xb+=sInv; yb=init;
208       int y=0;
209       // lambda for code conciseness
210       // @TODO: remove the very generic closure for specific variable one
211       const auto GHinit = [&]()
212       {
213           yd=yb-yb0;
214           yb+=sInv;
215           H0=H+xb0*hb+yb0;
216           xyd=xd*yd;
217 
218           ms[0]=1-xd-yd+xyd;
219           ms[1]=yd-xyd;
220           ms[2]=xd-xyd;
221           ms[3]=xyd;
222       };
223       const auto GH = [&H1](const auto&H, const auto&ma, const auto&mb)
224       {
225           H1=H;
226           pcl::sse_stru(*H1, pcl::sse_add(pcl::sse_ldu(*H1), pcl::sse_mul(ma,mb)));
227       };
228       // leading rows, no top bin_size
229       for( ; y<bin_size/2; y++ ) {
230         yb0=-1; GHinit();
231         if(hasLf) { H0[O0[y]+1]+=ms[1]*M0[y]; H0[O1[y]+1]+=ms[1]*M1[y]; }
232         if(hasRt) { H0[O0[y]+hb+1]+=ms[3]*M0[y]; H0[O1[y]+hb+1]+=ms[3]*M1[y]; }
233       }
234       // main rows, has top and bottom bins, use SSE for minor speedup
235       for( ; ; y++ ) {
236         yb0 = (int) yb; if(yb0>=hb-1) break; GHinit();
237         _m0=pcl::sse_set(M0[y]); _m1=pcl::sse_set(M1[y]);
238         if(hasLf) { _m=pcl::sse_set(0,0,ms[1],ms[0]);
239         GH(H0+O0[y],_m,_m0); GH(H0+O1[y],_m,_m1); }
240         if(hasRt) { _m=pcl::sse_set(0,0,ms[3],ms[2]);
241         GH(H0+O0[y]+hb,_m,_m0); GH(H0+O1[y]+hb,_m,_m1); }
242       }
243       // final rows, no bottom bin_size
244       for( ; y<h0; y++ ) {
245         yb0 = (int) yb; GHinit();
246         if(hasLf) { H0[O0[y]]+=ms[0]*M0[y]; H0[O1[y]]+=ms[0]*M1[y]; }
247         if(hasRt) { H0[O0[y]+hb]+=ms[2]*M0[y]; H0[O1[y]+hb]+=ms[2]*M1[y]; }
248       }
249 #else
250       float ms[4], xyd, yb, xd, yd;
251       bool hasLf, hasRt; int xb0, yb0;
252       if( x==0 ) { init=(0+.5f)*sInv-0.5f; xb=init; }
253       hasLf = xb>=0; xb0 = hasLf?(int)xb:-1; hasRt = xb0 < wb-1;
254       xd=xb-xb0; xb+=sInv; yb=init; int y=0;
255       // lambda for code conciseness
256       const auto GHinit = [&]()
257       {
258           yd=yb-yb0;
259           yb+=sInv;
260           H0=H+xb0*hb+yb0;
261           xyd=xd*yd;
262 
263           ms[0]=1-xd-yd+xyd;
264           ms[1]=yd-xyd;
265           ms[2]=xd-xyd;
266           ms[3]=xyd;
267       };
268       // leading rows, no top bin_size
269       for( ; y<bin_size/2; y++ ) {
270         yb0=-1; GHinit();
271         if(hasLf) { H0[O0[y]+1]+=ms[1]*M0[y]; H0[O1[y]+1]+=ms[1]*M1[y]; }
272         if(hasRt) { H0[O0[y]+hb+1]+=ms[3]*M0[y]; H0[O1[y]+hb+1]+=ms[3]*M1[y]; }
273       }
274       // main rows, has top and bottom bins
275       for( ; ; y++ ) {
276         yb0 = (int) yb;
277         if(yb0>=hb-1)
278           break;
279         GHinit();
280 
281         if(hasLf)
282         {
283           H0[O0[y]+1]+=ms[1]*M0[y];
284           H0[O1[y]+1]+=ms[1]*M1[y];
285           H0[O0[y]]+=ms[0]*M0[y];
286           H0[O1[y]]+=ms[0]*M1[y];
287         }
288             if(hasRt)
289         {
290           H0[O0[y]+hb+1]+=ms[3]*M0[y];
291           H0[O1[y]+hb+1]+=ms[3]*M1[y];
292           H0[O0[y]+hb]+=ms[2]*M0[y];
293           H0[O1[y]+hb]+=ms[2]*M1[y];
294         }
295       }
296       // final rows, no bottom bin_size
297       for( ; y<h0; y++ ) {
298         yb0 = (int) yb; GHinit();
299         if(hasLf) { H0[O0[y]]+=ms[0]*M0[y]; H0[O1[y]]+=ms[0]*M1[y]; }
300         if(hasRt) { H0[O0[y]+hb]+=ms[2]*M0[y]; H0[O1[y]+hb]+=ms[2]*M1[y]; }
301       }
302 #endif
303     }
304   }
305 
306   alFree(O0); alFree(O1); alFree(M0); alFree(M1);
307 }
308 
309 void
normalization(float * H,int h,int w,int bin_size,int n_orients,float clip,float * G) const310 pcl::people::HOG::normalization (float *H, int h, int w, int bin_size, int n_orients, float clip, float *G) const
311 {
312   float *N, *N1, *H1; int o, x, y, hb=h/bin_size, wb=w/bin_size, nb=wb*hb;
313   float eps = 1e-4f/4/bin_size/bin_size/bin_size/bin_size; // precise backward equality
314   // compute 2x2 block normalization values
315   N = (float*) calloc(nb,sizeof(float));
316   for( o=0; o<n_orients; o++ ) for( x=0; x<nb; x++ ) N[x]+=H[x+o*nb]*H[x+o*nb];
317   for( x=0; x<wb-1; x++ ) for( y=0; y<hb-1; y++ ) {
318     N1=N+x*hb+y; *N1=1/float(::sqrt( N1[0] + N1[1] + N1[hb] + N1[hb+1] +eps )); }
319   // perform 4 normalizations per spatial block (handling boundary regions)
320   for( o=0; o<n_orients; o++ ) for( x=0; x<wb; x++ ) {
321     H1=H+o*nb+x*hb; N1=N+x*hb; float *Gs[4]; Gs[0]=G+o*nb+x*hb;
322     const auto U = [&Gs, &H1, &N1, &y, &clip](const auto&a, const auto&b)
323     {
324         Gs[a][y] = std::min(clip, H1[y] * N1[y - b]);
325     };
326     for( y=1; y<4; y++ ) Gs[y]=Gs[y-1]+nb*n_orients;
327     bool lf, md, rt; lf=(x==0); rt=(x==wb-1); md=(!lf && !rt);
328     y=0; if(!rt) U(0,0); if(!lf) U(2,hb);
329     if(lf) for( y=1; y<hb-1; y++ ) { U(0,0); U(1,1); }
330     if(md) for( y=1; y<hb-1; y++ ) { U(0,0); U(1,1); U(2,hb); U(3,hb+1); }
331     if(rt) for( y=1; y<hb-1; y++ ) { U(2,hb); U(3,hb+1); }
332     y=hb-1; if(!rt) U(1,1); if(!lf) U(3,hb+1);
333   } free(N);
334 }
335 
336 void
compute(float * I,int h,int w,int n_channels,int bin_size,int n_orients,bool soft_bin,float * descriptor)337 pcl::people::HOG::compute (float *I, int h, int w, int n_channels, int bin_size, int n_orients, bool soft_bin, float *descriptor)
338 {
339   h_ = h;
340   w_ = w;
341   n_channels_ = n_channels;
342   bin_size_ = bin_size;
343   n_orients_ = n_orients;
344   soft_bin_ = soft_bin;
345 
346   compute (I, descriptor);
347 }
348 
349 void
compute(float * I,float * descriptor) const350 pcl::people::HOG::compute (float *I, float *descriptor) const
351 {
352   // HOG computation:
353   float *M, *O, *G, *H;
354   M = new float[h_ * w_];
355   O = new float[h_ * w_];
356   H = new float[(w_ / bin_size_) * (h_ / bin_size_) * n_orients_]();
357   G = new float[(w_ / bin_size_) * (h_ / bin_size_) * n_orients_ *4]();
358 
359   // Compute gradient magnitude and orientation at each location (uses sse):
360   gradMag (I, h_, w_, n_channels_, M, O );
361 
362   // Compute n_orients gradient histograms per bin_size x bin_size block of pixels:
363   gradHist ( M, O, h_, w_, bin_size_, n_orients_, soft_bin_, H );
364 
365   // Apply normalizations:
366   normalization ( H, h_, w_, bin_size_, n_orients_, clip_, G );
367 
368   // Select descriptor of internal part of the image (remove borders):
369   int k = 0;
370   for (int l = 0; l < (n_orients_ * 4); l++)
371   {
372     for (int j = 1; j < (w_ / bin_size_ - 1); j++)
373     {
374       for (int i = 1; i < (h_ / bin_size_ - 1); i++)
375       {
376         descriptor[k] = G[i + j * h_ / bin_size_ + l * (h_ / bin_size_) * (w_ / bin_size_)];
377         k++;
378       }
379     }
380   }
381   delete[] M; delete[] O; delete[] H; delete[] G;
382 }
383 
384 void
grad1(float * I,float * Gx,float * Gy,int h,int w,int x) const385 pcl::people::HOG::grad1 (float *I, float *Gx, float *Gy, int h, int w, int x) const
386 {
387 #if defined(__SSE2__)
388   int y, y1;
389   float *Ip, *In, r;
390   __m128 *_G, _r;
391   // compute column of Gx
392   Ip = I-h;
393   In = I+h;
394   r = .5f;
395   if(x == 0)
396   {
397     r = 1;
398     Ip += h;
399   }
400   else if(x == w-1)
401   {
402     r = 1;
403     In -= h;
404   }
405   if( h<4 || h%4>0 || (std::size_t(I)&15) || (std::size_t(Gx)&15) )
406   {
407     for( y = 0; y < h; y++ )
408       *Gx++ = (*In++ - *Ip++) * r;
409   } else {
410     _G = (__m128*) Gx;
411     __m128 *_Ip = (__m128*) Ip;
412     __m128 *_In = (__m128*) In;
413     _r = pcl::sse_set(r);
414     for(y = 0; y < h; y += 4)
415       *_G++ = pcl::sse_mul(pcl::sse_sub(*_In++,*_Ip++), _r);
416   }
417   // compute column of Gy
418   const auto GRADY = [&Gy, &In, &Ip](const auto& r)
419   {
420       *Gy++ = (*In++ - *Ip++) * r;
421   };
422   Ip = I;
423   In = Ip + 1;
424   // GRADY(1); Ip--; for(y = 1; y < h-1; y++) GRADY(.5f); In--; GRADY(1);
425   y1 = ((~((std::size_t) Gy) + 1) & 15)/4;
426   if(y1 == 0) y1 = 4;
427   if(y1 > h-1) y1 = h-1;
428   GRADY(1);
429   Ip--;
430   for(y = 1; y < y1; y++) GRADY(.5f);
431   _r = pcl::sse_set(.5f);
432   _G = (__m128*) Gy;
433   for(; y+4 < h-1; y += 4, Ip += 4, In += 4, Gy += 4)
434     *_G++ = pcl::sse_mul(pcl::sse_sub(pcl::sse_ldu(*In),pcl::sse_ldu(*Ip)), _r);
435   for(; y < h-1; y++) GRADY(.5f);
436   In--;
437   GRADY(1);
438 #else
439   int y, y1;
440   float *Ip, *In, r;
441 
442   // compute column of Gx
443   Ip = I - h;
444   In = I + h;
445   r = .5f;
446 
447   if(x == 0)
448   {
449     r = 1;
450     Ip += h;
451   }
452   else if (x == w-1)
453   {
454     r = 1;
455     In -= h;
456   }
457 
458   for (y = 0; y < h; y++)
459     *Gx++=(*In++ - *Ip++)*r;
460 
461   // compute column of Gy
462   const auto GRADY = [&Gy, &In, &Ip](const auto& r)
463   {
464       *Gy++ = (*In++ - *Ip++) * r;
465   };
466   Ip=I; In=Ip+1;
467   // GRADY(1); Ip--; for(y=1; y<h-1; y++) GRADY(.5f); In--; GRADY(1);
468   y1=((~((std::size_t) Gy) + 1) & 15)/4; if(y1==0) y1=4; if(y1>h-1) y1=h-1;
469   GRADY(1); Ip--; for(y=1; y<y1; y++) GRADY(.5f);
470 
471   r = 0.5f;
472   for(; y<h-1; y++)
473     GRADY(.5f); In--; GRADY(1);
474 #endif
475 }
476 
477 float*
acosTable() const478 pcl::people::HOG::acosTable () const
479 {
480   int n = 25000;
481   int n2 = n / 2;
482   static float a[25000];
483   static bool init = false;
484   if( init ) return a+n2;
485   float ni = 2.02f/(float) n;
486   for(int i=0; i<n; i++ )
487   {
488     float t = i*ni - 1.01f;
489     t = t<-1 ? -1 : (t>1 ? 1 : t);
490     t = (float) std::acos( t );
491     a[i] = (t <= M_PI-1e-5f) ? t : 0;
492   }
493   init = true;
494   return a+n2;
495 }
496 
497 void
gradQuantize(float * O,float * M,int * O0,int * O1,float * M0,float * M1,int n_orients,int nb,int n,float norm) const498 pcl::people::HOG::gradQuantize (float *O, float *M, int *O0, int *O1, float *M0, float *M1, int n_orients, int nb, int n, float norm) const
499 {
500   // define useful constants
501   const float oMult = (float)n_orients/M_PI;
502   const int oMax = n_orients * nb;
503 
504   int i = 0;
505 #if defined(__SSE2__)
506   // assumes all *OUTPUT* matrices are 4-byte aligned
507   __m128i _o0, _o1, *_O0, *_O1; __m128 _o, _o0f, _m, *_M0, *_M1;
508   const __m128 _norm = pcl::sse_set(norm);
509   const __m128 _oMult = pcl::sse_set(oMult);
510   const __m128 _nbf = pcl::sse_set((float)nb);
511   const __m128i _oMax = pcl::sse_set(oMax);
512   const __m128i _nb = pcl::sse_set(nb);
513 
514   // perform the majority of the work with sse
515   _O0=(__m128i*) O0;
516   _O1=(__m128i*) O1;
517   _M0=(__m128*) M0;
518   _M1=(__m128*) M1;
519   for( ; i <= n-4; i += 4 ) {
520     _o = pcl::sse_mul(pcl::sse_ldu(O[i]),_oMult);
521     _o0f = pcl::sse_cvt(pcl::sse_cvt(_o));
522     _o0 = pcl::sse_cvt(pcl::sse_mul(_o0f,_nbf));
523     _o1 = pcl::sse_add(_o0,_nb);
524     _o1 = pcl::sse_and(pcl::sse_cmpgt(_oMax,_o1),_o1);
525     *_O0++ = _o0;
526     *_O1++ = _o1;
527     _m = pcl::sse_mul(pcl::sse_ldu(M[i]),_norm);
528     *_M1 = pcl::sse_mul(pcl::sse_sub(_o,_o0f),_m);
529     *_M0 = pcl::sse_sub(_m,*_M1); _M0++; _M1++;
530   }
531 #endif
532 
533   // compute trailing locations without sse
534   for( ; i < n; i++ ) {
535     float o = O[i] * oMult;
536     float m = M[i] * norm;
537     int o0 = (int) o;
538     float od = o - o0;
539     o0 *= nb;
540     int o1 = o0 + nb;
541     if(o1 == oMax) o1 = 0;
542     O0[i] = o0;
543     O1[i] = o1;
544     M1[i] = od * m;
545     M0[i] = m - M1[i];
546   }
547 }
548 
549 inline void*
alMalloc(std::size_t size,int alignment) const550 pcl::people::HOG::alMalloc (std::size_t size, int alignment) const
551 {
552   const std::size_t pSize = sizeof(void*), a = alignment-1;
553   void *raw = malloc(size + a + pSize);
554   void *aligned = (void*) (((std::size_t) raw + pSize + a) & ~a);
555   *(void**) ((std::size_t) aligned-pSize) = raw;
556   return aligned;
557 }
558 
559 inline void
alFree(void * aligned) const560 pcl::people::HOG::alFree (void* aligned) const
561 {
562   void* raw = *(void**)((char*)aligned-sizeof(void*));
563   free(raw);
564 }
565 
566