/** @file mathop_avx.c
 ** @brief mathop for AVX - Definition
 ** @author Andrea Vedaldi, David Novotny
 **/

/*
Copyright (C) 2007-12 Andrea Vedaldi and Brian Fulkerson.
All rights reserved.

This file is part of the VLFeat library and is made available under
the terms of the BSD license (see the COPYING file).
*/

/* ---------------------------------------------------------------- */
#if ! defined(VL_MATHOP_AVX_INSTANTIATING)

#include "mathop_avx.h"

#undef FLT
#define FLT VL_TYPE_DOUBLE
#define VL_MATHOP_AVX_INSTANTIATING
#include "mathop_avx.c"

#undef FLT
#define FLT VL_TYPE_FLOAT
#define VL_MATHOP_AVX_INSTANTIATING
#include "mathop_avx.c"
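
/* This file instantiates itself: the block above re-includes mathop_avx.c
 * twice, once with FLT set to VL_TYPE_DOUBLE and once with VL_TYPE_FLOAT,
 * so the kernels below are compiled for both data types. float.h expands
 * FLT into the scalar type T, the name suffix SFX, the AVX vector type
 * VTYPEavx (__m256 for float, __m256d for double), the lane count
 * VSIZEavx (8 or 4), and the VADDavx/VMULavx/... wrappers over the
 * corresponding _mm256_* intrinsics.
 */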

/* ---------------------------------------------------------------- */
/* VL_MATHOP_AVX_INSTANTIATING */
#else
#ifndef VL_DISABLE_AVX

#ifndef __AVX__
#error Compiling AVX functions but AVX does not seem to be supported by the compiler.
#endif

#include <immintrin.h>
#include "generic.h"
#include "mathop.h"
#include "float.h"

/** @internal @brief Horizontal sum of all lanes of an AVX register @a x. */
VL_INLINE T
VL_XCAT(_vl_vhsum_avx_, SFX)(VTYPEavx x)
{
  T acc ;
#if (VSIZEavx == 8)
  {
    /* float case (8 lanes); the wrapped intrinsics are:
       VTYPEavx hsum = _mm256_hadd_ps(x, x);
       hsum = _mm256_add_ps(hsum, _mm256_permute2f128_ps(hsum, hsum, 0x1));
       _mm_store_ss(&acc, _mm_hadd_ps( _mm256_castps256_ps128(hsum), _mm256_castps256_ps128(hsum) ) ); */
    VTYPEavx hsum = VHADD2avx(x, x);
    hsum = VADDavx(hsum, VPERMavx(hsum, hsum, 0x1));
    VST1(&acc, VHADDavx( VCSTavx(hsum), VCSTavx(hsum) ) );
  }
#else
  {
    /* double case (4 lanes); the wrapped intrinsics are:
       VTYPEavx hsum = _mm256_add_pd(x, _mm256_permute2f128_pd(x, x, 0x1));
       _mm_store_sd(&acc, _mm_hadd_pd( _mm256_castpd256_pd128(hsum), _mm256_castpd256_pd128(hsum) ) ); */
    VTYPEavx hsum = VADDavx(x, VPERMavx(x, x, 0x1));
    VST1(&acc, VHADDavx( VCSTavx(hsum), VCSTavx(hsum) ) );
  }
#endif
  return acc ;
}
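
/* Worked example of the reduction above (float case, a sketch to read the
 * macros by): for x = (x0,...,x7), _mm256_hadd_ps(x, x) yields
 * (x0+x1, x2+x3, x0+x1, x2+x3, x4+x5, x6+x7, x4+x5, x6+x7); adding the
 * 128-bit-swapped copy makes every pair sum span both halves, and the
 * final _mm_hadd_ps leaves x0+...+x7 in the low lane, which VST1 stores
 * into acc.
 */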

/** @internal @brief Squared l2 (Euclidean) distance between @a X and @a Y.
 ** Vectorized with AVX; no square root is taken.
 **/
VL_EXPORT T
VL_XCAT(_vl_distance_l2_avx_, SFX)
(vl_size dimension, T const * X, T const * Y)
{
  T const * X_end = X + dimension ;
  T const * X_vec_end = X_end - VSIZEavx + 1 ;
  T acc ;
  VTYPEavx vacc = VSTZavx() ;
  vl_bool dataAligned = VALIGNEDavx(X) & VALIGNEDavx(Y) ;

  if (dataAligned) {
    /* both arrays aligned: aligned vector loads */
    while (X < X_vec_end) {
      VTYPEavx a = *(VTYPEavx*)X ;
      VTYPEavx b = *(VTYPEavx*)Y ;
      VTYPEavx delta = VSUBavx(a, b) ;
      VTYPEavx delta2 = VMULavx(delta, delta) ;
      vacc = VADDavx(vacc, delta2) ;
      X += VSIZEavx ;
      Y += VSIZEavx ;
    }
  } else {
    /* unaligned vector loads */
    while (X < X_vec_end) {
      VTYPEavx a = VLDUavx(X) ;
      VTYPEavx b = VLDUavx(Y) ;
      VTYPEavx delta = VSUBavx(a, b) ;
      VTYPEavx delta2 = VMULavx(delta, delta) ;
      vacc = VADDavx(vacc, delta2) ;
      X += VSIZEavx ;
      Y += VSIZEavx ;
    }
  }

  acc = VL_XCAT(_vl_vhsum_avx_, SFX)(vacc) ;

  /* scalar tail: remaining dimension % VSIZEavx components */
  while (X < X_end) {
    T a = *X++ ;
    T b = *Y++ ;
    T delta = a - b ;
    acc += delta * delta ;
  }

  return acc ;
}
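
/* Usage sketch (an assumption about the surrounding library, not part of
 * this file): these kernels are internal and are normally reached through
 * the run-time dispatcher declared in mathop.h, which selects the AVX
 * variant when the host CPU supports it:
 *
 *   #include <vl/mathop.h>
 *
 *   float x [8], y [8] ;  (fill with data)
 *   VlFloatVectorComparisonFunction dist =
 *     vl_get_vector_comparison_function_f (VlDistanceL2) ;
 *   float d = dist (8, x, y) ;  (squared l2 distance)
 */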

/** @internal @brief Squared Mahalanobis distance with diagonal covariance.
 ** Computes sum_i (X[i] - MU[i])^2 * S[i]; for the result to be a squared
 ** Mahalanobis distance, @a S must hold the reciprocals of the variances
 ** (the diagonal of the inverse covariance).
 **/
VL_EXPORT T
VL_XCAT(_vl_distance_mahalanobis_sq_avx_, SFX)
(vl_size dimension, T const * X, T const * MU, T const * S)
{
  T const * X_end = X + dimension ;
  T const * X_vec_end = X_end - VSIZEavx + 1 ;
  T acc ;
  VTYPEavx vacc = VSTZavx() ;
  vl_bool dataAligned = VALIGNEDavx(X) & VALIGNEDavx(MU) & VALIGNEDavx(S) ;

  if (dataAligned) {
    while (X < X_vec_end) {
      VTYPEavx a = *(VTYPEavx*)X ;
      VTYPEavx b = *(VTYPEavx*)MU ;
      VTYPEavx c = *(VTYPEavx*)S ;

      VTYPEavx delta = VSUBavx(a, b) ;
      VTYPEavx delta2 = VMULavx(delta, delta) ;
      VTYPEavx delta2div = VMULavx(delta2, c) ;

      vacc = VADDavx(vacc, delta2div) ;

      X += VSIZEavx ;
      MU += VSIZEavx ;
      S += VSIZEavx ;
    }
  } else {
    while (X < X_vec_end) {
      VTYPEavx a = VLDUavx(X) ;
      VTYPEavx b = VLDUavx(MU) ;
      VTYPEavx c = VLDUavx(S) ;

      VTYPEavx delta = VSUBavx(a, b) ;
      VTYPEavx delta2 = VMULavx(delta, delta) ;
      VTYPEavx delta2div = VMULavx(delta2, c) ;

      vacc = VADDavx(vacc, delta2div) ;

      X += VSIZEavx ;
      MU += VSIZEavx ;
      S += VSIZEavx ;
    }
  }

  acc = VL_XCAT(_vl_vhsum_avx_, SFX)(vacc) ;

  /* scalar tail */
  while (X < X_end) {
    T a = *X++ ;
    T b = *MU++ ;
    T c = *S++ ;
    T delta = a - b ;
    acc += (delta * delta) * c ;
  }

  return acc ;
}
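
/* In matrix form (a note, not VLFeat documentation): with diagonal
 * covariance Sigma = diag(sigma_1^2, ..., sigma_n^2), the squared
 * Mahalanobis distance (x - mu)' Sigma^{-1} (x - mu) reduces to
 * sum_i (x_i - mu_i)^2 / sigma_i^2, which is what the loop above
 * accumulates when S[i] = 1 / sigma_i^2.
 */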

/** @internal @brief Accumulate a weighted vector: MU[i] += W * X[i]. */
VL_EXPORT void
VL_XCAT(_vl_weighted_mean_avx_, SFX)
(vl_size dimension, T * MU, T const * X, T const W)
{
  T const * X_end = X + dimension ;
  T const * X_vec_end = X_end - VSIZEavx + 1 ;

  vl_bool dataAligned = VALIGNEDavx(X) & VALIGNEDavx(MU) ;
  VTYPEavx w = VLD1avx (&W) ;  /* broadcast W to all lanes */

  if (dataAligned) {
    while (X < X_vec_end) {
      VTYPEavx a = *(VTYPEavx*)X ;
      VTYPEavx mu = *(VTYPEavx*)MU ;

      VTYPEavx aw = VMULavx(a, w) ;
      VTYPEavx meanStore = VADDavx(aw, mu) ;

      *(VTYPEavx *)MU = meanStore ;

      X += VSIZEavx ;
      MU += VSIZEavx ;
    }
  } else {
    while (X < X_vec_end) {
      VTYPEavx a = VLDUavx(X) ;
      VTYPEavx mu = VLDUavx(MU) ;

      VTYPEavx aw = VMULavx(a, w) ;
      VTYPEavx meanStore = VADDavx(aw, mu) ;

      VST2Uavx(MU, meanStore) ;

      X += VSIZEavx ;
      MU += VSIZEavx ;
    }
  }

  /* scalar tail */
  while (X < X_end) {
    T a = *X++ ;
    *MU += a * W ;
    MU++ ;
  }
}
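
/* Note (an assumption about intended use, based on the operation itself):
 * calling this once per sample x_k with weight w_k accumulates
 * MU = sum_k w_k x_k, so the weighted mean is obtained by dividing MU by
 * sum_k w_k afterwards; that normalization is left to the caller.
 */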

/** @internal @brief Accumulate weighted squared differences:
 ** S[i] += W * (X[i] - Y[i])^2.
 **/
VL_EXPORT void
VL_XCAT(_vl_weighted_sigma_avx_, SFX)
(vl_size dimension, T * S, T const * X, T const * Y, T const W)
{
  T const * X_end = X + dimension ;
  T const * X_vec_end = X_end - VSIZEavx + 1 ;

  vl_bool dataAligned = VALIGNEDavx(X) & VALIGNEDavx(Y) & VALIGNEDavx(S) ;

  VTYPEavx w = VLD1avx (&W) ;  /* broadcast W to all lanes */

  if (dataAligned) {
    while (X < X_vec_end) {
      VTYPEavx a = *(VTYPEavx*)X ;
      VTYPEavx b = *(VTYPEavx*)Y ;
      VTYPEavx s = *(VTYPEavx*)S ;

      VTYPEavx delta = VSUBavx(a, b) ;
      VTYPEavx delta2 = VMULavx(delta, delta) ;
      VTYPEavx delta2w = VMULavx(delta2, w) ;
      VTYPEavx sigmaStore = VADDavx(s, delta2w) ;

      *(VTYPEavx *)S = sigmaStore ;

      X += VSIZEavx ;
      Y += VSIZEavx ;
      S += VSIZEavx ;
    }
  } else {
    while (X < X_vec_end) {
      VTYPEavx a = VLDUavx(X) ;
      VTYPEavx b = VLDUavx(Y) ;
      VTYPEavx s = VLDUavx(S) ;

      VTYPEavx delta = VSUBavx(a, b) ;
      VTYPEavx delta2 = VMULavx(delta, delta) ;
      VTYPEavx delta2w = VMULavx(delta2, w) ;
      VTYPEavx sigmaStore = VADDavx(s, delta2w) ;

      VST2Uavx(S, sigmaStore) ;

      X += VSIZEavx ;
      Y += VSIZEavx ;
      S += VSIZEavx ;
    }
  }

  /* scalar tail */
  while (X < X_end) {
    T a = *X++ ;
    T b = *Y++ ;
    T delta = a - b ;
    *S += (delta * delta) * W ;
    S++ ;
  }
}
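
/* Note (an assumption about intended use, based on the operation itself):
 * with X a sample, Y the current mean estimate, and W the sample's weight,
 * repeated calls accumulate S = sum_k w_k (x_k - mu)^2 per component, the
 * unnormalized diagonal of a weighted covariance estimate.
 */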

/* VL_DISABLE_AVX */
#endif
#undef VL_MATHOP_AVX_INSTANTIATING
#endif