/** @file mathop_avx.c
 ** @brief mathop for AVX - Definition
 ** @author Andrea Vedaldi, David Novotny
 **/

/*
Copyright (C) 2007-12 Andrea Vedaldi and Brian Fulkerson.
All rights reserved.

This file is part of the VLFeat library and is made available under
the terms of the BSD license (see the COPYING file).
*/

/* ---------------------------------------------------------------- */
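/* This file compiles itself twice. On the first pass (the #if branch
   below) it defines FLT and re-includes itself once per data type; on
   each re-inclusion (the #else branch) the functions are instantiated
   for the corresponding scalar type. */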
#if ! defined(VL_MATHOP_AVX_INSTANTIATING)

#include "mathop_avx.h"

#undef FLT
#define FLT VL_TYPE_DOUBLE
#define VL_MATHOP_AVX_INSTANTIATING
#include "mathop_avx.c"

#undef FLT
#define FLT VL_TYPE_FLOAT
#define VL_MATHOP_AVX_INSTANTIATING
#include "mathop_avx.c"
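/* After both passes this translation unit defines the _d and _f
   variants of each function below (e.g. _vl_distance_l2_avx_d and
   _vl_distance_l2_avx_f), which callers can select at run time,
   typically behind a CPU feature check. */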

/* ---------------------------------------------------------------- */
/* VL_MATHOP_AVX_INSTANTIATING */
#else
#ifndef VL_DISABLE_AVX

#ifndef __AVX__
#error Compiling AVX functions but AVX does not seem to be supported by the compiler.
#endif

#include <immintrin.h>
#include "generic.h"
#include "mathop.h"
#include "float.h"

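/* T, SFX, VTYPEavx, VSIZEavx and the V*avx operation macros used
   below are defined in float.h according to the current value of
   FLT: T is the scalar type (float or double), SFX the function
   suffix (f or d), VTYPEavx the 256-bit vector type (__m256 or
   __m256d) and VSIZEavx the number of scalars per vector (8 or 4). */

/* Horizontally sum the VSIZEavx lanes of @a x into a single scalar. */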
VL_INLINE T
VL_XCAT(_vl_vhsum_avx_, SFX)(VTYPEavx x)
{
  T acc ;
#if (VSIZEavx == 8)
  {
    /* Single precision: in intrinsics, this expands to
       hsum = _mm256_hadd_ps(x, x) ;
       hsum = _mm256_add_ps(hsum, _mm256_permute2f128_ps(hsum, hsum, 0x1)) ;
       _mm_store_ss(&acc, _mm_hadd_ps(_mm256_castps256_ps128(hsum),
                                      _mm256_castps256_ps128(hsum))) ; */
    VTYPEavx hsum = VHADD2avx(x, x) ;
    hsum = VADDavx(hsum, VPERMavx(hsum, hsum, 0x1)) ;
    VST1(&acc, VHADDavx(VCSTavx(hsum), VCSTavx(hsum))) ;
  }
#else
  {
    /* Double precision: in intrinsics, this expands to
       hsum = _mm256_add_pd(x, _mm256_permute2f128_pd(x, x, 0x1)) ;
       _mm_store_sd(&acc, _mm_hadd_pd(_mm256_castpd256_pd128(hsum),
                                      _mm256_castpd256_pd128(hsum))) ; */
    VTYPEavx hsum = VADDavx(x, VPERMavx(x, x, 0x1)) ;
    VST1(&acc, VHADDavx(VCSTavx(hsum), VCSTavx(hsum))) ;
  }
#endif
  return acc ;
}

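/* Compute the squared Euclidean (L2) distance between @a X and @a Y
   (no square root is taken). The main loop consumes VSIZEavx elements
   per iteration, using aligned vector loads when both arrays are
   suitably aligned and unaligned loads otherwise; the scalar loop at
   the end handles the dimension % VSIZEavx leftover elements. */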
VL_EXPORT T
VL_XCAT(_vl_distance_l2_avx_, SFX)
(vl_size dimension, T const * X, T const * Y)
{
  T const * X_end = X + dimension ;
  T const * X_vec_end = X_end - VSIZEavx + 1 ;
  T acc ;
  VTYPEavx vacc = VSTZavx() ;
  vl_bool dataAligned = VALIGNEDavx(X) & VALIGNEDavx(Y) ;

  if (dataAligned) {
    while (X < X_vec_end) {
      VTYPEavx a = *(VTYPEavx*)X ;
      VTYPEavx b = *(VTYPEavx*)Y ;
      VTYPEavx delta = VSUBavx(a, b) ;
      VTYPEavx delta2 = VMULavx(delta, delta) ;
      vacc = VADDavx(vacc, delta2) ;
      X += VSIZEavx ;
      Y += VSIZEavx ;
    }
  } else {
    while (X < X_vec_end) {
      VTYPEavx a = VLDUavx(X) ;
      VTYPEavx b = VLDUavx(Y) ;
      VTYPEavx delta = VSUBavx(a, b) ;
      VTYPEavx delta2 = VMULavx(delta, delta) ;
      vacc = VADDavx(vacc, delta2) ;
      X += VSIZEavx ;
      Y += VSIZEavx ;
    }
  }

  acc = VL_XCAT(_vl_vhsum_avx_, SFX)(vacc) ;

  while (X < X_end) {
    T a = *X++ ;
    T b = *Y++ ;
    T delta = a - b ;
    acc += delta * delta ;
  }

  return acc ;
}

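/* Compute the squared Mahalanobis distance between @a X and the mean
   @a MU under a diagonal covariance. @a S is expected to hold the
   reciprocal of each variance, so the distance accumulates
   (X[i] - MU[i])^2 * S[i]. The structure mirrors the L2 distance
   above: a vectorized main loop followed by a scalar tail. */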
VL_EXPORT T
VL_XCAT(_vl_distance_mahalanobis_sq_avx_, SFX)
(vl_size dimension, T const * X, T const * MU, T const * S)
{
  T const * X_end = X + dimension ;
  T const * X_vec_end = X_end - VSIZEavx + 1 ;
  T acc ;
  VTYPEavx vacc = VSTZavx() ;
  vl_bool dataAligned = VALIGNEDavx(X) & VALIGNEDavx(MU) & VALIGNEDavx(S) ;

  if (dataAligned) {
    while (X < X_vec_end) {
      VTYPEavx a = *(VTYPEavx*)X ;
      VTYPEavx b = *(VTYPEavx*)MU ;
      VTYPEavx c = *(VTYPEavx*)S ;

      VTYPEavx delta = VSUBavx(a, b) ;
      VTYPEavx delta2 = VMULavx(delta, delta) ;
      VTYPEavx delta2div = VMULavx(delta2, c) ;

      vacc = VADDavx(vacc, delta2div) ;

      X  += VSIZEavx ;
      MU += VSIZEavx ;
      S  += VSIZEavx ;
    }
  } else {
    while (X < X_vec_end) {
      VTYPEavx a = VLDUavx(X) ;
      VTYPEavx b = VLDUavx(MU) ;
      VTYPEavx c = VLDUavx(S) ;

      VTYPEavx delta = VSUBavx(a, b) ;
      VTYPEavx delta2 = VMULavx(delta, delta) ;
      VTYPEavx delta2div = VMULavx(delta2, c) ;

      vacc = VADDavx(vacc, delta2div) ;

      X  += VSIZEavx ;
      MU += VSIZEavx ;
      S  += VSIZEavx ;
    }
  }

  acc = VL_XCAT(_vl_vhsum_avx_, SFX)(vacc) ;

  while (X < X_end) {
    T a = *X++ ;
    T b = *MU++ ;
    T c = *S++ ;
    T delta = a - b ;
    acc += (delta * delta) * c ;
  }

  return acc ;
}

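/* Accumulate the weighted data point @a X into @a MU, computing
   MU[i] += W * X[i] for each dimension. The scalar weight @a W is
   broadcast once into a vector register outside the loops. */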
VL_EXPORT void
VL_XCAT(_vl_weighted_mean_avx_, SFX)
(vl_size dimension, T * MU, T const * X, T const W)
{
  T const * X_end = X + dimension ;
  T const * X_vec_end = X_end - VSIZEavx + 1 ;

  vl_bool dataAligned = VALIGNEDavx(X) & VALIGNEDavx(MU) ;
  VTYPEavx w = VLD1avx (&W) ;

  if (dataAligned) {
    while (X < X_vec_end) {
      VTYPEavx a = *(VTYPEavx*)X ;
      VTYPEavx mu = *(VTYPEavx*)MU ;

      VTYPEavx aw = VMULavx(a, w) ;
      VTYPEavx meanStore = VADDavx(aw, mu) ;

      *(VTYPEavx *)MU = meanStore ;

      X += VSIZEavx ;
      MU += VSIZEavx ;
    }
  } else {
    while (X < X_vec_end) {
      VTYPEavx a  = VLDUavx(X) ;
      VTYPEavx mu = VLDUavx(MU) ;

      VTYPEavx aw = VMULavx(a, w) ;
      VTYPEavx meanStore = VADDavx(aw, mu) ;

      VST2Uavx(MU, meanStore) ;

      X += VSIZEavx ;
      MU += VSIZEavx ;
    }
  }

  while (X < X_end) {
    T a = *X++ ;
    *MU += a * W ;
    MU++ ;
  }
}

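/* Accumulate the weighted squared difference of @a X and @a Y into
   @a S, computing S[i] += W * (X[i] - Y[i])^2 for each dimension. */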
VL_EXPORT void
VL_XCAT(_vl_weighted_sigma_avx_, SFX)
(vl_size dimension, T * S, T const * X, T const * Y, T const W)
{
  T const * X_end = X + dimension ;
  T const * X_vec_end = X_end - VSIZEavx + 1 ;

  vl_bool dataAligned = VALIGNEDavx(X) & VALIGNEDavx(Y) & VALIGNEDavx(S) ;

  VTYPEavx w = VLD1avx (&W) ;

  if (dataAligned) {
    while (X < X_vec_end) {
      VTYPEavx a = *(VTYPEavx*)X ;
      VTYPEavx b = *(VTYPEavx*)Y ;
      VTYPEavx s = *(VTYPEavx*)S ;

      VTYPEavx delta = VSUBavx(a, b) ;
      VTYPEavx delta2 = VMULavx(delta, delta) ;
      VTYPEavx delta2w = VMULavx(delta2, w) ;
      VTYPEavx sigmaStore = VADDavx(s, delta2w) ;

      *(VTYPEavx *)S = sigmaStore ;

      X += VSIZEavx ;
      Y += VSIZEavx ;
      S += VSIZEavx ;
    }
  } else {
    while (X < X_vec_end) {
      VTYPEavx a = VLDUavx(X) ;
      VTYPEavx b = VLDUavx(Y) ;
      VTYPEavx s = VLDUavx(S) ;

      VTYPEavx delta = VSUBavx(a, b) ;
      VTYPEavx delta2 = VMULavx(delta, delta) ;
      VTYPEavx delta2w = VMULavx(delta2, w) ;
      VTYPEavx sigmaStore = VADDavx(s, delta2w) ;

      VST2Uavx(S, sigmaStore) ;

      X += VSIZEavx ;
      Y += VSIZEavx ;
      S += VSIZEavx ;
    }
  }

  while (X < X_end) {
    T a = *X++ ;
    T b = *Y++ ;
    T delta = a - b ;
    *S += (delta * delta) * W ;
    S++ ;
  }
}

/* VL_DISABLE_AVX */
#endif
#undef VL_MATHOP_AVX_INSTANTIATING
#endif