1 #![feature(test)]
2 #![feature(cfg_target_feature)]
3 extern crate test;
4 extern crate simd;
5
6 use test::black_box as bb;
7 use test::Bencher as B;
8 use simd::f32x4;
9 #[cfg(target_feature = "avx")]
10 use simd::x86::avx::{f32x8, f64x4};
11 // #[cfg(target_feature = "avx2")]
12 // use simd::x86::avx2::Avx2F32x8;
13
14
15 #[bench]
multiply_naive(b: &mut B)16 fn multiply_naive(b: &mut B) {
17 let x = [[1.0_f32; 4]; 4];
18 let y = [[2.0; 4]; 4];
19 b.iter(|| {
20 for _ in 0..100 {
21 let (x, y) = bb((&x, &y));
22
23 bb(&[[x[0][0] * y[0][0] + x[1][0] * y[0][1] + x[2][0] * y[0][2] + x[3][0] * y[0][3],
24 x[0][1] * y[0][0] + x[1][1] * y[0][1] + x[2][1] * y[0][2] + x[3][1] * y[0][3],
25 x[0][2] * y[0][0] + x[1][2] * y[0][1] + x[2][2] * y[0][2] + x[3][2] * y[0][3],
26 x[0][3] * y[0][0] + x[1][3] * y[0][1] + x[2][3] * y[0][2] + x[3][3] * y[0][3]],
27 [x[0][0] * y[1][0] + x[1][0] * y[1][1] + x[2][0] * y[1][2] + x[3][0] * y[1][3],
28 x[0][1] * y[1][0] + x[1][1] * y[1][1] + x[2][1] * y[1][2] + x[3][1] * y[1][3],
29 x[0][2] * y[1][0] + x[1][2] * y[1][1] + x[2][2] * y[1][2] + x[3][2] * y[1][3],
30 x[0][3] * y[1][0] + x[1][3] * y[1][1] + x[2][3] * y[1][2] + x[3][3] * y[1][3]],
31 [x[0][0] * y[2][0] + x[1][0] * y[2][1] + x[2][0] * y[2][2] + x[3][0] * y[2][3],
32 x[0][1] * y[2][0] + x[1][1] * y[2][1] + x[2][1] * y[2][2] + x[3][1] * y[2][3],
33 x[0][2] * y[2][0] + x[1][2] * y[2][1] + x[2][2] * y[2][2] + x[3][2] * y[2][3],
34 x[0][3] * y[2][0] + x[1][3] * y[2][1] + x[2][3] * y[2][2] + x[3][3] * y[2][3]],
35 [x[0][0] * y[3][0] + x[1][0] * y[3][1] + x[2][0] * y[3][2] + x[3][0] * y[3][3],
36 x[0][1] * y[3][0] + x[1][1] * y[3][1] + x[2][1] * y[3][2] + x[3][1] * y[3][3],
37 x[0][2] * y[3][0] + x[1][2] * y[3][1] + x[2][2] * y[3][2] + x[3][2] * y[3][3],
38 x[0][3] * y[3][0] + x[1][3] * y[3][1] + x[2][3] * y[3][2] + x[3][3] * y[3][3]],
39 ]);
40 }
41 })
42 }
43
44 #[bench]
multiply_simd4_32(b: &mut B)45 fn multiply_simd4_32(b: &mut B) {
46 let x = [f32x4::splat(1.0_f32); 4];
47 let y = [f32x4::splat(2.0); 4];
48 b.iter(|| {
49 for _ in 0..100 {
50 let (x, y) = bb((&x, &y));
51
52 let y0 = y[0];
53 let y1 = y[1];
54 let y2 = y[2];
55 let y3 = y[3];
56 bb(&[f32x4::splat(y0.extract(0)) * x[0] +
57 f32x4::splat(y0.extract(1)) * x[1] +
58 f32x4::splat(y0.extract(2)) * x[2] +
59 f32x4::splat(y0.extract(3)) * x[3],
60 f32x4::splat(y1.extract(0)) * x[0] +
61 f32x4::splat(y1.extract(1)) * x[1] +
62 f32x4::splat(y1.extract(2)) * x[2] +
63 f32x4::splat(y1.extract(3)) * x[3],
64 f32x4::splat(y2.extract(0)) * x[0] +
65 f32x4::splat(y2.extract(1)) * x[1] +
66 f32x4::splat(y2.extract(2)) * x[2] +
67 f32x4::splat(y2.extract(3)) * x[3],
68 f32x4::splat(y3.extract(0)) * x[0] +
69 f32x4::splat(y3.extract(1)) * x[1] +
70 f32x4::splat(y3.extract(2)) * x[2] +
71 f32x4::splat(y3.extract(3)) * x[3],
72 ]);
73 }
74 })
75 }
76
77 #[cfg(target_feature = "avx")]
78 #[bench]
multiply_simd4_64(b: &mut B)79 fn multiply_simd4_64(b: &mut B) {
80 let x = [f64x4::splat(1.0_f64); 4];
81 let y = [f64x4::splat(2.0); 4];
82 b.iter(|| {
83 for _ in 0..100 {
84 let (x, y) = bb((&x, &y));
85
86 let y0 = y[0];
87 let y1 = y[1];
88 let y2 = y[2];
89 let y3 = y[3];
90 bb(&[f64x4::splat(y0.extract(0)) * x[0] +
91 f64x4::splat(y0.extract(1)) * x[1] +
92 f64x4::splat(y0.extract(2)) * x[2] +
93 f64x4::splat(y0.extract(3)) * x[3],
94 f64x4::splat(y1.extract(0)) * x[0] +
95 f64x4::splat(y1.extract(1)) * x[1] +
96 f64x4::splat(y1.extract(2)) * x[2] +
97 f64x4::splat(y1.extract(3)) * x[3],
98 f64x4::splat(y2.extract(0)) * x[0] +
99 f64x4::splat(y2.extract(1)) * x[1] +
100 f64x4::splat(y2.extract(2)) * x[2] +
101 f64x4::splat(y2.extract(3)) * x[3],
102 f64x4::splat(y3.extract(0)) * x[0] +
103 f64x4::splat(y3.extract(1)) * x[1] +
104 f64x4::splat(y3.extract(2)) * x[2] +
105 f64x4::splat(y3.extract(3)) * x[3],
106 ]);
107 }
108 })
109 }
110
111 #[bench]
inverse_naive(b: &mut B)112 fn inverse_naive(b: &mut B) {
113 let mut x = [[0_f32; 4]; 4];
114 for i in 0..4 { x[i][i] = 1.0 }
115
116 b.iter(|| {
117 for _ in 0..100 {
118 let x = bb(&x);
119
120 let mut t = [[0_f32; 4]; 4];
121 for i in 0..4 {
122 t[0][i] = x[i][0];
123 t[1][i] = x[i][1];
124 t[2][i] = x[i][2];
125 t[3][i] = x[i][3];
126 }
127
128 let _0 = t[2][2] * t[3][3];
129 let _1 = t[2][3] * t[3][2];
130 let _2 = t[2][1] * t[3][3];
131 let _3 = t[2][3] * t[3][1];
132 let _4 = t[2][1] * t[3][2];
133 let _5 = t[2][2] * t[3][1];
134 let _6 = t[2][0] * t[3][3];
135 let _7 = t[2][3] * t[3][0];
136 let _8 = t[2][0] * t[3][2];
137 let _9 = t[2][2] * t[3][0];
138 let _10 = t[2][0] * t[3][1];
139 let _11 = t[2][1] * t[3][0];
140
141 let d00 = _0 * t[1][1] + _3 * t[1][2] + _4 * t[1][3] -
142 (_1 * t[1][1] + _2 * t[1][2] + _5 * t[1][3]);
143 let d01 = _1 * t[1][0] + _6 * t[1][2] + _9 * t[1][3] -
144 (_0 * t[1][0] + _7 * t[1][2] + _8 * t[1][3]);
145 let d02 = _2 * t[1][0] + _7 * t[1][1] + _10 * t[1][3] -
146 (_3 * t[1][0] + _6 * t[1][1] + _11 * t[1][3]);
147 let d03 = _5 * t[1][0] + _8 * t[1][1] + _11 * t[1][2] -
148 (_4 * t[1][0] + _9 * t[1][1] + _10 * t[1][2]);
149 let d10 = _1 * t[0][1] + _2 * t[0][2] + _5 * t[0][3] -
150 (_0 * t[0][1] + _3 * t[0][2] + _4 * t[0][3]);
151 let d11 = _0 * t[0][0] + _7 * t[0][2] + _8 * t[0][3] -
152 (_1 * t[0][0] + _6 * t[0][2] + _9 * t[0][3]);
153 let d12 = _3 * t[0][0] + _6 * t[0][1] + _11 * t[0][3] -
154 (_2 * t[0][0] + _7 * t[0][1] + _10 * t[0][3]);
155 let d13 = _4 * t[0][0] + _9 * t[0][1] + _10 * t[0][2] -
156 (_5 * t[0][0] + _8 * t[0][1] + _11 * t[0][2]);
157
158 let _0 = t[0][2] * t[1][3];
159 let _1 = t[0][3] * t[1][2];
160 let _2 = t[0][1] * t[1][3];
161 let _3 = t[0][3] * t[1][1];
162 let _4 = t[0][1] * t[1][2];
163 let _5 = t[0][2] * t[1][1];
164 let _6 = t[0][0] * t[1][3];
165 let _7 = t[0][3] * t[1][0];
166 let _8 = t[0][0] * t[1][2];
167 let _9 = t[0][2] * t[1][0];
168 let _10 = t[0][0] * t[1][1];
169 let _11 = t[0][1] * t[1][0];
170
171 let d20 = _0*t[3][1] + _3*t[3][2] + _4*t[3][3]-
172 (_1*t[3][1] + _2*t[3][2] + _5*t[3][3]);
173 let d21 = _1*t[3][0] + _6*t[3][2] + _9*t[3][3]-
174 (_0*t[3][0] + _7*t[3][2] + _8*t[3][3]);
175 let d22 = _2*t[3][0] + _7*t[3][1] + _10*t[3][3]-
176 (_3*t[3][0] + _6*t[3][1] + _11*t[3][3]);
177 let d23 = _5*t[3][0] + _8*t[3][1] + _11*t[3][2]-
178 (_4*t[3][0] + _9*t[3][1] + _10*t[3][2]);
179 let d30 = _2*t[2][2] + _5*t[2][3] + _1*t[2][1]-
180 (_4*t[2][3] + _0*t[2][1] + _3*t[2][2]);
181 let d31 = _8*t[2][3] + _0*t[2][0] + _7*t[2][2]-
182 (_6*t[2][2] + _9*t[2][3] + _1*t[2][0]);
183 let d32 = _6*t[2][1] + _11*t[2][3] + _3*t[2][0]-
184 (_10*t[2][3] + _2*t[2][0] + _7*t[2][1]);
185 let d33 = _10*t[2][2] + _4*t[2][0] + _9*t[2][1]-
186 (_8*t[2][1] + _11*t[2][2] + _5*t[2][0]);
187
188 let det = t[0][0] * d00 + t[0][1] * d01 + t[0][2] * d02 + t[0][3] * d03;
189
190 let det = 1.0 / det;
191 let mut ret = [[d00, d01, d02, d03],
192 [d10, d11, d12, d13],
193 [d20, d21, d22, d23],
194 [d30, d31, d32, d33]];
195 for i in 0..4 {
196 for j in 0..4 {
197 ret[i][j] *= det;
198 }
199 }
200 bb(&ret);
201 }
202 })
203 }
204
205 #[bench]
inverse_simd4(b: &mut B)206 fn inverse_simd4(b: &mut B) {
207 let mut x = [f32x4::splat(0_f32); 4];
208 for i in 0..4 { x[i] = x[i].replace(i as u32, 1.0); }
209
210 fn shuf0145(v: f32x4, w: f32x4) -> f32x4 {
211 f32x4::new(v.extract(0), v.extract(1),
212 w.extract(4 - 4), w.extract(5 - 4))
213 }
214 fn shuf0246(v: f32x4, w: f32x4) -> f32x4 {
215 f32x4::new(v.extract(0), v.extract(2),
216 w.extract(4 - 4), w.extract(6 - 4))
217 }
218 fn shuf1357(v: f32x4, w: f32x4) -> f32x4 {
219 f32x4::new(v.extract(1), v.extract(3),
220 w.extract(5 - 4), w.extract(7 - 4))
221 }
222 fn shuf2367(v: f32x4, w: f32x4) -> f32x4 {
223 f32x4::new(v.extract(2), v.extract(3),
224 w.extract(6 - 4), w.extract(7 - 4))
225 }
226
227 fn swiz1032(v: f32x4) -> f32x4 {
228 f32x4::new(v.extract(1), v.extract(0),
229 v.extract(3), v.extract(2))
230 }
231 fn swiz2301(v: f32x4) -> f32x4 {
232 f32x4::new(v.extract(2), v.extract(3),
233 v.extract(0), v.extract(1))
234 }
235
236 b.iter(|| {
237 for _ in 0..100 {
238 let src0;
239 let src1;
240 let src2;
241 let src3;
242 let mut tmp1;
243 let row0;
244 let mut row1;
245 let mut row2;
246 let mut row3;
247 let mut minor0;
248 let mut minor1;
249 let mut minor2;
250 let mut minor3;
251 let mut det;
252
253 let x = bb(&x);
254 src0 = x[0];
255 src1 = x[1];
256 src2 = x[2];
257 src3 = x[3];
258
259 tmp1 = shuf0145(src0, src1);
260 row1 = shuf0145(src2, src3);
261 row0 = shuf0246(tmp1, row1);
262 row1 = shuf1357(row1, tmp1);
263
264 tmp1 = shuf2367(src0, src1);
265 row3 = shuf2367(src2, src3);
266 row2 = shuf0246(tmp1, row3);
267 row3 = shuf0246(row3, tmp1);
268
269
270 tmp1 = row2 * row3;
271 tmp1 = swiz1032(tmp1);
272 minor0 = row1 * tmp1;
273 minor1 = row0 * tmp1;
274 tmp1 = swiz2301(tmp1);
275 minor0 = (row1 * tmp1) - minor0;
276 minor1 = (row0 * tmp1) - minor1;
277 minor1 = swiz2301(minor1);
278
279
280 tmp1 = row1 * row2;
281 tmp1 = swiz1032(tmp1);
282 minor0 = (row3 * tmp1) + minor0;
283 minor3 = row0 * tmp1;
284 tmp1 = swiz2301(tmp1);
285
286 minor0 = minor0 - row3 * tmp1;
287 minor3 = row0 * tmp1 - minor3;
288 minor3 = swiz2301(minor3);
289
290
291 tmp1 = row3 * swiz2301(row1);
292 tmp1 = swiz1032(tmp1);
293 row2 = swiz2301(row2);
294 minor0 = row2 * tmp1 + minor0;
295 minor2 = row0 * tmp1;
296 tmp1 = swiz2301(tmp1);
297 minor0 = minor0 - row2 * tmp1;
298 minor2 = row0 * tmp1 - minor2;
299 minor2 = swiz2301(minor2);
300
301
302 tmp1 = row0 * row1;
303 tmp1 = swiz1032(tmp1);
304 minor2 = minor2 + row3 * tmp1;
305 minor3 = row2 * tmp1 - minor3;
306 tmp1 = swiz2301(tmp1);
307 minor2 = row3 * tmp1 - minor2;
308 minor3 = minor3 - row2 * tmp1;
309
310
311
312 tmp1 = row0 * row3;
313 tmp1 = swiz1032(tmp1);
314 minor1 = minor1 - row2 * tmp1;
315 minor2 = row1 * tmp1 + minor2;
316 tmp1 = swiz2301(tmp1);
317 minor1 = row2 * tmp1 + minor1;
318 minor2 = minor2 - row1 * tmp1;
319
320 tmp1 = row0 * row2;
321 tmp1 = swiz1032(tmp1);
322 minor1 = row3 * tmp1 + minor1;
323 minor3 = minor3 - row1 * tmp1;
324 tmp1 = swiz2301(tmp1);
325 minor1 = minor1 - row3 * tmp1;
326 minor3 = row1 * tmp1 + minor3;
327
328 det = row0 * minor0;
329 det = swiz2301(det) + det;
330 det = swiz1032(det) + det;
331 //tmp1 = det.approx_reciprocal(); det = tmp1 * (f32x4::splat(2.0) - det * tmp1);
332 det = f32x4::splat(1.0) / det;
333
334 bb(&[minor0 * det, minor1 * det, minor2 * det, minor3 * det]);
335 }
336 })
337
338 }
339
340 #[bench]
transpose_naive(b: &mut B)341 fn transpose_naive(b: &mut B) {
342 let x = [[0_f32; 4]; 4];
343 b.iter(|| {
344 for _ in 0..100 {
345 let x = bb(&x);
346 bb(&[[x[0][0], x[1][0], x[2][0], x[3][0]],
347 [x[0][1], x[1][1], x[2][1], x[3][1]],
348 [x[0][2], x[1][2], x[2][2], x[3][2]],
349 [x[0][3], x[1][3], x[2][3], x[3][3]]]);
350 }
351 })
352 }
353
354 #[bench]
transpose_simd4(b: &mut B)355 fn transpose_simd4(b: &mut B) {
356 let x = [f32x4::splat(0_f32); 4];
357
358 fn shuf0246(v: f32x4, w: f32x4) -> f32x4 {
359 f32x4::new(v.extract(0), v.extract(2),
360 w.extract(4 - 4), w.extract(6 - 4))
361 }
362 fn shuf1357(v: f32x4, w: f32x4) -> f32x4 {
363 f32x4::new(v.extract(1), v.extract(3),
364 w.extract(5 - 4), w.extract(7 - 4))
365 }
366 b.iter(|| {
367 for _ in 0..100 {
368 let x = bb(&x);
369 let x0 = x[0];
370 let x1 = x[1];
371 let x2 = x[2];
372 let x3 = x[3];
373
374 let a0 = shuf0246(x0, x1);
375 let a1 = shuf0246(x2, x3);
376 let a2 = shuf1357(x0, x1);
377 let a3 = shuf1357(x2, x3);
378
379 let b0 = shuf0246(a0, a1);
380 let b1 = shuf0246(a2, a3);
381 let b2 = shuf1357(a0, a1);
382 let b3 = shuf1357(a2, a3);
383 bb(&[b0, b1, b2, b3]);
384 }
385 })
386 }
387
388 #[cfg(target_feature = "avx")]
389 #[bench]
transpose_simd8_naive(b: &mut B)390 fn transpose_simd8_naive(b: &mut B) {
391 let x = [f32x8::splat(0_f32); 2];
392
393 fn shuf0246(v: f32x8, w: f32x8) -> f32x8 {
394 f32x8::new(v.extract(0), v.extract(2), v.extract(4), v.extract(6),
395 w.extract(0), w.extract(2), w.extract(4), w.extract(6))
396 }
397 fn shuf1357(v: f32x8, w: f32x8) -> f32x8 {
398 f32x8::new(v.extract(1), v.extract(3), v.extract(5), v.extract(7),
399 w.extract(1), w.extract(3), w.extract(5), w.extract(7),)
400 }
401 b.iter(|| {
402 for _ in 0..100 {
403 let x = bb(&x);
404 let x01 = x[0];
405 let x23 = x[1];
406
407 let a01 = shuf0246(x01, x23);
408 let a23 = shuf1357(x01, x23);
409
410 let b01 = shuf0246(a01, a23);
411 let b23 = shuf1357(a01, a23);
412 bb(&[b01, b23]);
413 }
414 })
415 }
416
417 #[cfg(target_feature = "avx")]
418 #[bench]
transpose_simd8_avx2_vpermps(b: &mut B)419 fn transpose_simd8_avx2_vpermps(b: &mut B) {
420 let x = [f32x8::splat(0_f32); 2];
421
422 // efficient on AVX2 using vpermps
423 fn perm04152637(v: f32x8) -> f32x8 {
424 // broken on rustc 1.7.0-nightly (1ddaf8bdf 2015-12-12)
425 // v.permutevar(i32x8::new(0, 4, 1, 5, 2, 6, 3, 7))
426 f32x8::new(v.extract(0), v.extract(4), v.extract(1), v.extract(5),
427 v.extract(2), v.extract(6), v.extract(3), v.extract(7))
428 }
429 fn shuf_lo(v: f32x8, w: f32x8) -> f32x8 {
430 f32x8::new(v.extract(0), v.extract(1), w.extract(0), w.extract(1),
431 v.extract(4), v.extract(5), w.extract(4), w.extract(5),)
432 }
433 fn shuf_hi(v: f32x8, w: f32x8) -> f32x8 {
434 f32x8::new(v.extract(2), v.extract(3), w.extract(2), w.extract(3),
435 v.extract(6), v.extract(7), w.extract(6), w.extract(7),)
436 }
437 b.iter(|| {
438 for _ in 0..100 {
439 let x = bb(&x);
440 let x01 = x[0];
441 let x23 = x[1];
442
443 let a01 = perm04152637(x01);
444 let a23 = perm04152637(x23);
445
446 let b01 = shuf_lo(a01, a23);
447 let b23 = shuf_hi(a01, a23);
448 bb(&[b01, b23]);
449 }
450 })
451 }
452
453 #[cfg(target_feature = "avx")]
454 #[bench]
transpose_simd8_avx2_vpermpd(b: &mut B)455 fn transpose_simd8_avx2_vpermpd(b: &mut B) {
456 let x = [f32x8::splat(0_f32); 2];
457
458 // efficient on AVX2 using vpermpd
459 fn perm01452367(v: f32x8) -> f32x8 {
460 f32x8::new(v.extract(0), v.extract(1), v.extract(4), v.extract(5),
461 v.extract(2), v.extract(3), v.extract(6), v.extract(7))
462 }
463 fn shuf_lo_ps(v: f32x8, w: f32x8) -> f32x8 {
464 f32x8::new(v.extract(0), w.extract(0), v.extract(1), w.extract(1),
465 v.extract(4), w.extract(4), v.extract(5), w.extract(5),)
466 }
467 fn shuf_hi_ps(v: f32x8, w: f32x8) -> f32x8 {
468 f32x8::new(v.extract(2), w.extract(2), v.extract(3), w.extract(3),
469 v.extract(6), w.extract(6), v.extract(7), w.extract(7),)
470 }
471 b.iter(|| {
472 for _ in 0..100 {
473 let x = bb(&x);
474 let x01 = x[0];
475 let x23 = x[1];
476
477 let a01 = perm01452367(x01);
478 let a23 = perm01452367(x23);
479
480 let b01 = shuf_lo_ps(a01, a23);
481 let b23 = shuf_hi_ps(a01, a23);
482 bb(&[b01, b23]);
483 }
484 })
485 }
486