1 /*
2 * Software License Agreement (Simplified BSD License)
3 *
4 * Point Cloud Library (PCL) - www.pointclouds.org
5 * Copyright (c) 2013-, Open Perception, Inc.
6 * Copyright (c) 2012, Piotr Dollar & Ron Appel. [pdollar-at-caltech.edu]
7 *
8 * All rights reserved.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions are met:
12 *
13 * 1. Redistributions of source code must retain the above copyright notice, this
14 * list of conditions and the following disclaimer.
15 *
16 * 2. Redistributions in binary form must reproduce the above copyright notice,
17 * this list of conditions and the following disclaimer in the documentation
18 * and/or other materials provided with the distribution.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
22 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
24 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
25 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
26 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
27 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
29 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 *
31 * The views and conclusions contained in the software and documentation are those
32 * of the authors and should not be interpreted as representing official policies,
33 * either expressed or implied, of the FreeBSD Project.
34 *
35 * Derived from Piotr Dollar's MATLAB Image&Video Toolbox Version 3.00.
36 *
37 * hog.cpp
38 * Created on: Nov 30, 2012
39 * Authors: Matteo Munaro, Stefano Ghidoni, Stefano Michieletto
40 */
41
42 #include <pcl/people/hog.h>
43
44 #include <cstring> // for memcpy
45 #include <algorithm> // for std::min
46
47 #if defined(__SSE2__)
48 #include <pcl/sse.h> // sse methods
49 #else
50 #include <cstdlib>
51 #endif
52
53 /** \brief Constructor. */
HOG()54 pcl::people::HOG::HOG ()
55 {
56 // set default values for optional parameters:
57 h_ = 128;
58 w_ = 64;
59 n_channels_ = 3;
60 bin_size_ = 8;
61 n_orients_ = 9;
62 soft_bin_ = true;
63 clip_ = 0.2;
64 }
65
66 /** \brief Destructor. */
~HOG()67 pcl::people::HOG::~HOG () {}
68
69 void
gradMag(float * I,int h,int w,int d,float * M,float * O) const70 pcl::people::HOG::gradMag( float *I, int h, int w, int d, float *M, float *O ) const
71 {
72 #if defined(__SSE2__)
73 int x, y, y1, c, h4, s; float *Gx, *Gy, *M2; __m128 *_Gx, *_Gy, *_M2, _m;
74 float *acost = acosTable(), acMult=25000/2.02f;
75
76 // allocate memory for storing one column of output (padded so h4%4==0)
77 h4=(h%4==0) ? h : h-(h%4)+4; s=d*h4*sizeof(float);
78
79 M2=(float*) alMalloc(s,16);
80 _M2=(__m128*) M2;
81 Gx=(float*) alMalloc(s,16); _Gx=(__m128*) Gx;
82 Gy=(float*) alMalloc(s,16); _Gy=(__m128*) Gy;
83
84 // compute gradient magnitude and orientation for each column
85 for( x=0; x<w; x++ ) {
86 // compute gradients (Gx, Gy) and squared magnitude (M2) for each channel
87 for( c=0; c<d; c++ ) grad1( I+x*h+c*w*h, Gx+c*h4, Gy+c*h4, h, w, x );
88 for( y=0; y<d*h4/4; y++ ) _M2[y]=pcl::sse_add(pcl::sse_mul(_Gx[y],_Gx[y]),pcl::sse_mul(_Gy[y],_Gy[y]));
89 // store gradients with maximum response in the first channel
90 for(c=1; c<d; c++) {
91 for( y=0; y<h4/4; y++ ) {
92 y1=h4/4*c+y; _m = pcl::sse_cmpgt( _M2[y1], _M2[y] );
93 _M2[y] = pcl::sse_or( pcl::sse_and(_m,_M2[y1]), pcl::sse_andnot(_m,_M2[y]) );
94 _Gx[y] = pcl::sse_or( pcl::sse_and(_m,_Gx[y1]), pcl::sse_andnot(_m,_Gx[y]) );
95 _Gy[y] = pcl::sse_or( pcl::sse_and(_m,_Gy[y1]), pcl::sse_andnot(_m,_Gy[y]) );
96 }
97 }
98 // compute gradient magnitude (M) and normalize Gx
99 for( y=0; y<h4/4; y++ ) {
100 _m = pcl::sse_min( pcl::sse_rcpsqrt(_M2[y]), pcl::sse_set(1e10f) );
101 _M2[y] = pcl::sse_rcp(_m);
102 _Gx[y] = pcl::sse_mul( pcl::sse_mul(_Gx[y],_m), pcl::sse_set(acMult) );
103 _Gx[y] = pcl::sse_xor( _Gx[y], pcl::sse_and(_Gy[y], pcl::sse_set(-0.f)) );
104 };
105
106 memcpy( M+x*h, M2, h*sizeof(float) );
107 // compute and store gradient orientation (O) via table lookup
108 if(O!=nullptr) for( y=0; y<h; y++ ) O[x*h+y] = acost[(int)Gx[y]];
109 }
110 alFree(Gx); alFree(Gy); alFree(M2);
111 #else
112 int x, y, y1, c, h4, s; float *Gx, *Gy, *M2;
113 float *acost = acosTable(), acMult=25000/2.02f;
114
115 // allocate memory for storing one column of output (padded so h4%4==0)
116 h4=(h%4==0) ? h : h-(h%4)+4; s=d*h4*sizeof(float);
117
118 M2=(float*) alMalloc(s,16);
119 Gx=(float*) alMalloc(s,16);
120 Gy=(float*) alMalloc(s,16);
121 float m;
122
123 // compute gradient magnitude and orientation for each column
124 for( x=0; x<w; x++ ) {
125 // compute gradients (Gx, Gy) and squared magnitude (M2) for each channel
126 for( c=0; c<d; c++ ) grad1( I+x*h+c*w*h, Gx+c*h4, Gy+c*h4, h, w, x );
127 for( y=0; y<d*h4; y++ )
128 {
129 M2[y] = Gx[y] * Gx[y] + Gy[y] * Gy[y];
130 }
131
132 // store gradients with maximum response in the first channel
133 for(c=1; c<d; c++)
134 {
135 for( y=0; y<h4/4; y++ )
136 {
137 y1=h4/4*c+y;
138
139 for (int ii = 0; ii < 4; ++ii)
140 {
141 if (M2[y1 * 4 + ii] > M2[y * 4 + ii])
142 {
143 M2[y * 4 + ii] = M2[y1 * 4 + ii];
144 Gx[y * 4 + ii] = Gx[y1 * 4 + ii];
145 Gy[y * 4 + ii] = Gy[y1 * 4 + ii];
146 }
147 }
148 }
149 }
150 // compute gradient magnitude (M) and normalize Gx
151 for( y=0; y<h4; y++ )
152 {
153 m = 1.0f / ::sqrt (M2[y]);
154 m = m < 1e10f ? m : 1e10f;
155 M2[y] = 1.0f / m;
156 Gx[y] = ((Gx[y] * m) * acMult);
157 if (Gy[y] < 0)
158 Gx[y] = -Gx[y];
159 }
160
161 memcpy( M+x*h, M2, h*sizeof(float) );
162 // compute and store gradient orientation (O) via table lookup
163 if(O!=0) for( y=0; y<h; y++ ) O[x*h+y] = acost[(int)Gx[y]];
164 }
165 alFree(Gx); alFree(Gy); alFree(M2);
166 #endif
167 }
168
169 void
gradHist(float * M,float * O,int h,int w,int bin_size,int n_orients,bool soft_bin,float * H) const170 pcl::people::HOG::gradHist( float *M, float *O, int h, int w, int bin_size, int n_orients, bool soft_bin, float *H ) const
171 {
172 const int hb=h/bin_size, wb=w/bin_size, h0=hb*bin_size, w0=wb*bin_size, nb=wb*hb;
173 const float s=(float)bin_size, sInv=1/s, sInv2=1/s/s;
174 float *H0, *H1, *M0, *M1; int *O0, *O1;
175 O0=(int*)alMalloc(h*sizeof(int),16); M0=(float*) alMalloc(h*sizeof(float),16);
176 O1=(int*)alMalloc(h*sizeof(int),16); M1=(float*) alMalloc(h*sizeof(float),16);
177
178 // main loop
179 float xb = 0;
180 float init = 0;
181 for(int x=0; x<w0; x++ ) {
182 // compute target orientation bins for entire column - very fast
183 gradQuantize( O+x*h, M+x*h, O0, O1, M0, M1, n_orients, nb, h0, sInv2 );
184
185 if( !soft_bin || bin_size==1 ) {
186 // interpolate w.r.t. orientation only, not spatial bin_size
187 H1 = H + (x / bin_size) * hb;
188
189 const auto GH = [&H1, &O0, &O1, &M0, &M1](int y) {
190 H1[O0[y]] += M0[y];
191 H1[O1[y]] += M1[y];
192 };
193
194 for (int y = 0; y < h0;) {
195 for (int inner_loop = 0; inner_loop < bin_size; ++inner_loop, ++y) {
196 GH(y);
197 }
198 H1++;
199 }
200 } else {
201 // interpolate using trilinear interpolation
202 #if defined(__SSE2__)
203 float ms[4], xyd, yb, xd, yd; __m128 _m, _m0, _m1;
204 bool hasLf, hasRt; int xb0, yb0;
205 if( x==0 ) { init=(0+.5f)*sInv-0.5f; xb=init; }
206 hasLf = xb>=0; xb0 = hasLf?(int)xb:-1; hasRt = xb0 < wb-1;
207 xd=xb-xb0; xb+=sInv; yb=init;
208 int y=0;
209 // lambda for code conciseness
210 // @TODO: remove the very generic closure for specific variable one
211 const auto GHinit = [&]()
212 {
213 yd=yb-yb0;
214 yb+=sInv;
215 H0=H+xb0*hb+yb0;
216 xyd=xd*yd;
217
218 ms[0]=1-xd-yd+xyd;
219 ms[1]=yd-xyd;
220 ms[2]=xd-xyd;
221 ms[3]=xyd;
222 };
223 const auto GH = [&H1](const auto&H, const auto&ma, const auto&mb)
224 {
225 H1=H;
226 pcl::sse_stru(*H1, pcl::sse_add(pcl::sse_ldu(*H1), pcl::sse_mul(ma,mb)));
227 };
228 // leading rows, no top bin_size
229 for( ; y<bin_size/2; y++ ) {
230 yb0=-1; GHinit();
231 if(hasLf) { H0[O0[y]+1]+=ms[1]*M0[y]; H0[O1[y]+1]+=ms[1]*M1[y]; }
232 if(hasRt) { H0[O0[y]+hb+1]+=ms[3]*M0[y]; H0[O1[y]+hb+1]+=ms[3]*M1[y]; }
233 }
234 // main rows, has top and bottom bins, use SSE for minor speedup
235 for( ; ; y++ ) {
236 yb0 = (int) yb; if(yb0>=hb-1) break; GHinit();
237 _m0=pcl::sse_set(M0[y]); _m1=pcl::sse_set(M1[y]);
238 if(hasLf) { _m=pcl::sse_set(0,0,ms[1],ms[0]);
239 GH(H0+O0[y],_m,_m0); GH(H0+O1[y],_m,_m1); }
240 if(hasRt) { _m=pcl::sse_set(0,0,ms[3],ms[2]);
241 GH(H0+O0[y]+hb,_m,_m0); GH(H0+O1[y]+hb,_m,_m1); }
242 }
243 // final rows, no bottom bin_size
244 for( ; y<h0; y++ ) {
245 yb0 = (int) yb; GHinit();
246 if(hasLf) { H0[O0[y]]+=ms[0]*M0[y]; H0[O1[y]]+=ms[0]*M1[y]; }
247 if(hasRt) { H0[O0[y]+hb]+=ms[2]*M0[y]; H0[O1[y]+hb]+=ms[2]*M1[y]; }
248 }
249 #else
250 float ms[4], xyd, yb, xd, yd;
251 bool hasLf, hasRt; int xb0, yb0;
252 if( x==0 ) { init=(0+.5f)*sInv-0.5f; xb=init; }
253 hasLf = xb>=0; xb0 = hasLf?(int)xb:-1; hasRt = xb0 < wb-1;
254 xd=xb-xb0; xb+=sInv; yb=init; int y=0;
255 // lambda for code conciseness
256 const auto GHinit = [&]()
257 {
258 yd=yb-yb0;
259 yb+=sInv;
260 H0=H+xb0*hb+yb0;
261 xyd=xd*yd;
262
263 ms[0]=1-xd-yd+xyd;
264 ms[1]=yd-xyd;
265 ms[2]=xd-xyd;
266 ms[3]=xyd;
267 };
268 // leading rows, no top bin_size
269 for( ; y<bin_size/2; y++ ) {
270 yb0=-1; GHinit();
271 if(hasLf) { H0[O0[y]+1]+=ms[1]*M0[y]; H0[O1[y]+1]+=ms[1]*M1[y]; }
272 if(hasRt) { H0[O0[y]+hb+1]+=ms[3]*M0[y]; H0[O1[y]+hb+1]+=ms[3]*M1[y]; }
273 }
274 // main rows, has top and bottom bins
275 for( ; ; y++ ) {
276 yb0 = (int) yb;
277 if(yb0>=hb-1)
278 break;
279 GHinit();
280
281 if(hasLf)
282 {
283 H0[O0[y]+1]+=ms[1]*M0[y];
284 H0[O1[y]+1]+=ms[1]*M1[y];
285 H0[O0[y]]+=ms[0]*M0[y];
286 H0[O1[y]]+=ms[0]*M1[y];
287 }
288 if(hasRt)
289 {
290 H0[O0[y]+hb+1]+=ms[3]*M0[y];
291 H0[O1[y]+hb+1]+=ms[3]*M1[y];
292 H0[O0[y]+hb]+=ms[2]*M0[y];
293 H0[O1[y]+hb]+=ms[2]*M1[y];
294 }
295 }
296 // final rows, no bottom bin_size
297 for( ; y<h0; y++ ) {
298 yb0 = (int) yb; GHinit();
299 if(hasLf) { H0[O0[y]]+=ms[0]*M0[y]; H0[O1[y]]+=ms[0]*M1[y]; }
300 if(hasRt) { H0[O0[y]+hb]+=ms[2]*M0[y]; H0[O1[y]+hb]+=ms[2]*M1[y]; }
301 }
302 #endif
303 }
304 }
305
306 alFree(O0); alFree(O1); alFree(M0); alFree(M1);
307 }
308
309 void
normalization(float * H,int h,int w,int bin_size,int n_orients,float clip,float * G) const310 pcl::people::HOG::normalization (float *H, int h, int w, int bin_size, int n_orients, float clip, float *G) const
311 {
312 float *N, *N1, *H1; int o, x, y, hb=h/bin_size, wb=w/bin_size, nb=wb*hb;
313 float eps = 1e-4f/4/bin_size/bin_size/bin_size/bin_size; // precise backward equality
314 // compute 2x2 block normalization values
315 N = (float*) calloc(nb,sizeof(float));
316 for( o=0; o<n_orients; o++ ) for( x=0; x<nb; x++ ) N[x]+=H[x+o*nb]*H[x+o*nb];
317 for( x=0; x<wb-1; x++ ) for( y=0; y<hb-1; y++ ) {
318 N1=N+x*hb+y; *N1=1/float(::sqrt( N1[0] + N1[1] + N1[hb] + N1[hb+1] +eps )); }
319 // perform 4 normalizations per spatial block (handling boundary regions)
320 for( o=0; o<n_orients; o++ ) for( x=0; x<wb; x++ ) {
321 H1=H+o*nb+x*hb; N1=N+x*hb; float *Gs[4]; Gs[0]=G+o*nb+x*hb;
322 const auto U = [&Gs, &H1, &N1, &y, &clip](const auto&a, const auto&b)
323 {
324 Gs[a][y] = std::min(clip, H1[y] * N1[y - b]);
325 };
326 for( y=1; y<4; y++ ) Gs[y]=Gs[y-1]+nb*n_orients;
327 bool lf, md, rt; lf=(x==0); rt=(x==wb-1); md=(!lf && !rt);
328 y=0; if(!rt) U(0,0); if(!lf) U(2,hb);
329 if(lf) for( y=1; y<hb-1; y++ ) { U(0,0); U(1,1); }
330 if(md) for( y=1; y<hb-1; y++ ) { U(0,0); U(1,1); U(2,hb); U(3,hb+1); }
331 if(rt) for( y=1; y<hb-1; y++ ) { U(2,hb); U(3,hb+1); }
332 y=hb-1; if(!rt) U(1,1); if(!lf) U(3,hb+1);
333 } free(N);
334 }
335
336 void
compute(float * I,int h,int w,int n_channels,int bin_size,int n_orients,bool soft_bin,float * descriptor)337 pcl::people::HOG::compute (float *I, int h, int w, int n_channels, int bin_size, int n_orients, bool soft_bin, float *descriptor)
338 {
339 h_ = h;
340 w_ = w;
341 n_channels_ = n_channels;
342 bin_size_ = bin_size;
343 n_orients_ = n_orients;
344 soft_bin_ = soft_bin;
345
346 compute (I, descriptor);
347 }
348
349 void
compute(float * I,float * descriptor) const350 pcl::people::HOG::compute (float *I, float *descriptor) const
351 {
352 // HOG computation:
353 float *M, *O, *G, *H;
354 M = new float[h_ * w_];
355 O = new float[h_ * w_];
356 H = new float[(w_ / bin_size_) * (h_ / bin_size_) * n_orients_]();
357 G = new float[(w_ / bin_size_) * (h_ / bin_size_) * n_orients_ *4]();
358
359 // Compute gradient magnitude and orientation at each location (uses sse):
360 gradMag (I, h_, w_, n_channels_, M, O );
361
362 // Compute n_orients gradient histograms per bin_size x bin_size block of pixels:
363 gradHist ( M, O, h_, w_, bin_size_, n_orients_, soft_bin_, H );
364
365 // Apply normalizations:
366 normalization ( H, h_, w_, bin_size_, n_orients_, clip_, G );
367
368 // Select descriptor of internal part of the image (remove borders):
369 int k = 0;
370 for (int l = 0; l < (n_orients_ * 4); l++)
371 {
372 for (int j = 1; j < (w_ / bin_size_ - 1); j++)
373 {
374 for (int i = 1; i < (h_ / bin_size_ - 1); i++)
375 {
376 descriptor[k] = G[i + j * h_ / bin_size_ + l * (h_ / bin_size_) * (w_ / bin_size_)];
377 k++;
378 }
379 }
380 }
381 delete[] M; delete[] O; delete[] H; delete[] G;
382 }
383
384 void
grad1(float * I,float * Gx,float * Gy,int h,int w,int x) const385 pcl::people::HOG::grad1 (float *I, float *Gx, float *Gy, int h, int w, int x) const
386 {
387 #if defined(__SSE2__)
388 int y, y1;
389 float *Ip, *In, r;
390 __m128 *_G, _r;
391 // compute column of Gx
392 Ip = I-h;
393 In = I+h;
394 r = .5f;
395 if(x == 0)
396 {
397 r = 1;
398 Ip += h;
399 }
400 else if(x == w-1)
401 {
402 r = 1;
403 In -= h;
404 }
405 if( h<4 || h%4>0 || (std::size_t(I)&15) || (std::size_t(Gx)&15) )
406 {
407 for( y = 0; y < h; y++ )
408 *Gx++ = (*In++ - *Ip++) * r;
409 } else {
410 _G = (__m128*) Gx;
411 __m128 *_Ip = (__m128*) Ip;
412 __m128 *_In = (__m128*) In;
413 _r = pcl::sse_set(r);
414 for(y = 0; y < h; y += 4)
415 *_G++ = pcl::sse_mul(pcl::sse_sub(*_In++,*_Ip++), _r);
416 }
417 // compute column of Gy
418 const auto GRADY = [&Gy, &In, &Ip](const auto& r)
419 {
420 *Gy++ = (*In++ - *Ip++) * r;
421 };
422 Ip = I;
423 In = Ip + 1;
424 // GRADY(1); Ip--; for(y = 1; y < h-1; y++) GRADY(.5f); In--; GRADY(1);
425 y1 = ((~((std::size_t) Gy) + 1) & 15)/4;
426 if(y1 == 0) y1 = 4;
427 if(y1 > h-1) y1 = h-1;
428 GRADY(1);
429 Ip--;
430 for(y = 1; y < y1; y++) GRADY(.5f);
431 _r = pcl::sse_set(.5f);
432 _G = (__m128*) Gy;
433 for(; y+4 < h-1; y += 4, Ip += 4, In += 4, Gy += 4)
434 *_G++ = pcl::sse_mul(pcl::sse_sub(pcl::sse_ldu(*In),pcl::sse_ldu(*Ip)), _r);
435 for(; y < h-1; y++) GRADY(.5f);
436 In--;
437 GRADY(1);
438 #else
439 int y, y1;
440 float *Ip, *In, r;
441
442 // compute column of Gx
443 Ip = I - h;
444 In = I + h;
445 r = .5f;
446
447 if(x == 0)
448 {
449 r = 1;
450 Ip += h;
451 }
452 else if (x == w-1)
453 {
454 r = 1;
455 In -= h;
456 }
457
458 for (y = 0; y < h; y++)
459 *Gx++=(*In++ - *Ip++)*r;
460
461 // compute column of Gy
462 const auto GRADY = [&Gy, &In, &Ip](const auto& r)
463 {
464 *Gy++ = (*In++ - *Ip++) * r;
465 };
466 Ip=I; In=Ip+1;
467 // GRADY(1); Ip--; for(y=1; y<h-1; y++) GRADY(.5f); In--; GRADY(1);
468 y1=((~((std::size_t) Gy) + 1) & 15)/4; if(y1==0) y1=4; if(y1>h-1) y1=h-1;
469 GRADY(1); Ip--; for(y=1; y<y1; y++) GRADY(.5f);
470
471 r = 0.5f;
472 for(; y<h-1; y++)
473 GRADY(.5f); In--; GRADY(1);
474 #endif
475 }
476
477 float*
acosTable() const478 pcl::people::HOG::acosTable () const
479 {
480 int n = 25000;
481 int n2 = n / 2;
482 static float a[25000];
483 static bool init = false;
484 if( init ) return a+n2;
485 float ni = 2.02f/(float) n;
486 for(int i=0; i<n; i++ )
487 {
488 float t = i*ni - 1.01f;
489 t = t<-1 ? -1 : (t>1 ? 1 : t);
490 t = (float) std::acos( t );
491 a[i] = (t <= M_PI-1e-5f) ? t : 0;
492 }
493 init = true;
494 return a+n2;
495 }
496
497 void
gradQuantize(float * O,float * M,int * O0,int * O1,float * M0,float * M1,int n_orients,int nb,int n,float norm) const498 pcl::people::HOG::gradQuantize (float *O, float *M, int *O0, int *O1, float *M0, float *M1, int n_orients, int nb, int n, float norm) const
499 {
500 // define useful constants
501 const float oMult = (float)n_orients/M_PI;
502 const int oMax = n_orients * nb;
503
504 int i = 0;
505 #if defined(__SSE2__)
506 // assumes all *OUTPUT* matrices are 4-byte aligned
507 __m128i _o0, _o1, *_O0, *_O1; __m128 _o, _o0f, _m, *_M0, *_M1;
508 const __m128 _norm = pcl::sse_set(norm);
509 const __m128 _oMult = pcl::sse_set(oMult);
510 const __m128 _nbf = pcl::sse_set((float)nb);
511 const __m128i _oMax = pcl::sse_set(oMax);
512 const __m128i _nb = pcl::sse_set(nb);
513
514 // perform the majority of the work with sse
515 _O0=(__m128i*) O0;
516 _O1=(__m128i*) O1;
517 _M0=(__m128*) M0;
518 _M1=(__m128*) M1;
519 for( ; i <= n-4; i += 4 ) {
520 _o = pcl::sse_mul(pcl::sse_ldu(O[i]),_oMult);
521 _o0f = pcl::sse_cvt(pcl::sse_cvt(_o));
522 _o0 = pcl::sse_cvt(pcl::sse_mul(_o0f,_nbf));
523 _o1 = pcl::sse_add(_o0,_nb);
524 _o1 = pcl::sse_and(pcl::sse_cmpgt(_oMax,_o1),_o1);
525 *_O0++ = _o0;
526 *_O1++ = _o1;
527 _m = pcl::sse_mul(pcl::sse_ldu(M[i]),_norm);
528 *_M1 = pcl::sse_mul(pcl::sse_sub(_o,_o0f),_m);
529 *_M0 = pcl::sse_sub(_m,*_M1); _M0++; _M1++;
530 }
531 #endif
532
533 // compute trailing locations without sse
534 for( ; i < n; i++ ) {
535 float o = O[i] * oMult;
536 float m = M[i] * norm;
537 int o0 = (int) o;
538 float od = o - o0;
539 o0 *= nb;
540 int o1 = o0 + nb;
541 if(o1 == oMax) o1 = 0;
542 O0[i] = o0;
543 O1[i] = o1;
544 M1[i] = od * m;
545 M0[i] = m - M1[i];
546 }
547 }
548
549 inline void*
alMalloc(std::size_t size,int alignment) const550 pcl::people::HOG::alMalloc (std::size_t size, int alignment) const
551 {
552 const std::size_t pSize = sizeof(void*), a = alignment-1;
553 void *raw = malloc(size + a + pSize);
554 void *aligned = (void*) (((std::size_t) raw + pSize + a) & ~a);
555 *(void**) ((std::size_t) aligned-pSize) = raw;
556 return aligned;
557 }
558
559 inline void
alFree(void * aligned) const560 pcl::people::HOG::alFree (void* aligned) const
561 {
562 void* raw = *(void**)((char*)aligned-sizeof(void*));
563 free(raw);
564 }
565
566