1 #![feature(test)]
2 #![feature(cfg_target_feature)]
3 extern crate test;
4 extern crate simd;
5 
6 use test::black_box as bb;
7 use test::Bencher as B;
8 use simd::f32x4;
9 #[cfg(target_feature = "avx")]
10 use simd::x86::avx::{f32x8, f64x4};
11 // #[cfg(target_feature = "avx2")]
12 // use simd::x86::avx2::Avx2F32x8;
13 
14 
15 #[bench]
multiply_naive(b: &mut B)16 fn multiply_naive(b: &mut B) {
17     let x = [[1.0_f32; 4]; 4];
18     let y = [[2.0; 4]; 4];
19     b.iter(|| {
20         for _ in 0..100 {
21         let (x, y) = bb((&x, &y));
22 
23         bb(&[[x[0][0] * y[0][0] + x[1][0] * y[0][1] + x[2][0] * y[0][2] + x[3][0] * y[0][3],
24             x[0][1] * y[0][0] + x[1][1] * y[0][1] + x[2][1] * y[0][2] + x[3][1] * y[0][3],
25             x[0][2] * y[0][0] + x[1][2] * y[0][1] + x[2][2] * y[0][2] + x[3][2] * y[0][3],
26             x[0][3] * y[0][0] + x[1][3] * y[0][1] + x[2][3] * y[0][2] + x[3][3] * y[0][3]],
27            [x[0][0] * y[1][0] + x[1][0] * y[1][1] + x[2][0] * y[1][2] + x[3][0] * y[1][3],
28             x[0][1] * y[1][0] + x[1][1] * y[1][1] + x[2][1] * y[1][2] + x[3][1] * y[1][3],
29             x[0][2] * y[1][0] + x[1][2] * y[1][1] + x[2][2] * y[1][2] + x[3][2] * y[1][3],
30             x[0][3] * y[1][0] + x[1][3] * y[1][1] + x[2][3] * y[1][2] + x[3][3] * y[1][3]],
31            [x[0][0] * y[2][0] + x[1][0] * y[2][1] + x[2][0] * y[2][2] + x[3][0] * y[2][3],
32             x[0][1] * y[2][0] + x[1][1] * y[2][1] + x[2][1] * y[2][2] + x[3][1] * y[2][3],
33             x[0][2] * y[2][0] + x[1][2] * y[2][1] + x[2][2] * y[2][2] + x[3][2] * y[2][3],
34             x[0][3] * y[2][0] + x[1][3] * y[2][1] + x[2][3] * y[2][2] + x[3][3] * y[2][3]],
35            [x[0][0] * y[3][0] + x[1][0] * y[3][1] + x[2][0] * y[3][2] + x[3][0] * y[3][3],
36             x[0][1] * y[3][0] + x[1][1] * y[3][1] + x[2][1] * y[3][2] + x[3][1] * y[3][3],
37             x[0][2] * y[3][0] + x[1][2] * y[3][1] + x[2][2] * y[3][2] + x[3][2] * y[3][3],
38             x[0][3] * y[3][0] + x[1][3] * y[3][1] + x[2][3] * y[3][2] + x[3][3] * y[3][3]],
39              ]);
40         }
41     })
42 }
43 
44 #[bench]
multiply_simd4_32(b: &mut B)45 fn multiply_simd4_32(b: &mut B) {
46     let x = [f32x4::splat(1.0_f32); 4];
47     let y = [f32x4::splat(2.0); 4];
48     b.iter(|| {
49         for _ in 0..100 {
50         let (x, y) = bb((&x, &y));
51 
52         let y0 = y[0];
53         let y1 = y[1];
54         let y2 = y[2];
55         let y3 = y[3];
56         bb(&[f32x4::splat(y0.extract(0)) * x[0] +
57              f32x4::splat(y0.extract(1)) * x[1] +
58              f32x4::splat(y0.extract(2)) * x[2] +
59              f32x4::splat(y0.extract(3)) * x[3],
60              f32x4::splat(y1.extract(0)) * x[0] +
61              f32x4::splat(y1.extract(1)) * x[1] +
62              f32x4::splat(y1.extract(2)) * x[2] +
63              f32x4::splat(y1.extract(3)) * x[3],
64              f32x4::splat(y2.extract(0)) * x[0] +
65              f32x4::splat(y2.extract(1)) * x[1] +
66              f32x4::splat(y2.extract(2)) * x[2] +
67              f32x4::splat(y2.extract(3)) * x[3],
68              f32x4::splat(y3.extract(0)) * x[0] +
69              f32x4::splat(y3.extract(1)) * x[1] +
70              f32x4::splat(y3.extract(2)) * x[2] +
71              f32x4::splat(y3.extract(3)) * x[3],
72              ]);
73         }
74     })
75 }
76 
77 #[cfg(target_feature = "avx")]
78 #[bench]
multiply_simd4_64(b: &mut B)79 fn multiply_simd4_64(b: &mut B) {
80     let x = [f64x4::splat(1.0_f64); 4];
81     let y = [f64x4::splat(2.0); 4];
82     b.iter(|| {
83         for _ in 0..100 {
84         let (x, y) = bb((&x, &y));
85 
86         let y0 = y[0];
87         let y1 = y[1];
88         let y2 = y[2];
89         let y3 = y[3];
90         bb(&[f64x4::splat(y0.extract(0)) * x[0] +
91              f64x4::splat(y0.extract(1)) * x[1] +
92              f64x4::splat(y0.extract(2)) * x[2] +
93              f64x4::splat(y0.extract(3)) * x[3],
94              f64x4::splat(y1.extract(0)) * x[0] +
95              f64x4::splat(y1.extract(1)) * x[1] +
96              f64x4::splat(y1.extract(2)) * x[2] +
97              f64x4::splat(y1.extract(3)) * x[3],
98              f64x4::splat(y2.extract(0)) * x[0] +
99              f64x4::splat(y2.extract(1)) * x[1] +
100              f64x4::splat(y2.extract(2)) * x[2] +
101              f64x4::splat(y2.extract(3)) * x[3],
102              f64x4::splat(y3.extract(0)) * x[0] +
103              f64x4::splat(y3.extract(1)) * x[1] +
104              f64x4::splat(y3.extract(2)) * x[2] +
105              f64x4::splat(y3.extract(3)) * x[3],
106              ]);
107         }
108     })
109 }
110 
111 #[bench]
inverse_naive(b: &mut B)112 fn inverse_naive(b: &mut B) {
113     let mut x = [[0_f32; 4]; 4];
114     for i in 0..4 { x[i][i] = 1.0 }
115 
116     b.iter(|| {
117         for _ in 0..100 {
118             let x = bb(&x);
119 
120             let mut t = [[0_f32; 4]; 4];
121             for i in 0..4 {
122                 t[0][i] = x[i][0];
123                 t[1][i] = x[i][1];
124                 t[2][i] = x[i][2];
125                 t[3][i] = x[i][3];
126             }
127 
128             let _0 = t[2][2] * t[3][3];
129             let _1 = t[2][3] * t[3][2];
130             let _2 = t[2][1] * t[3][3];
131             let _3 = t[2][3] * t[3][1];
132             let _4 = t[2][1] * t[3][2];
133             let _5 = t[2][2] * t[3][1];
134             let _6 = t[2][0] * t[3][3];
135             let _7 = t[2][3] * t[3][0];
136             let _8 = t[2][0] * t[3][2];
137             let _9 = t[2][2] * t[3][0];
138             let _10 = t[2][0] * t[3][1];
139             let _11 = t[2][1] * t[3][0];
140 
141             let d00 = _0 * t[1][1] + _3 * t[1][2] + _4 * t[1][3] -
142                 (_1 * t[1][1] + _2 * t[1][2] + _5 * t[1][3]);
143             let d01 = _1 * t[1][0] + _6 * t[1][2] + _9 * t[1][3] -
144                 (_0 * t[1][0] + _7 * t[1][2] + _8 * t[1][3]);
145             let d02 = _2 * t[1][0] + _7 * t[1][1] + _10 * t[1][3] -
146                 (_3 * t[1][0] + _6 * t[1][1] + _11 * t[1][3]);
147             let d03 = _5 * t[1][0] + _8 * t[1][1] + _11 * t[1][2] -
148                 (_4 * t[1][0] + _9 * t[1][1] + _10 * t[1][2]);
149             let d10 = _1 * t[0][1] + _2 * t[0][2] + _5 * t[0][3] -
150                 (_0 * t[0][1] + _3 * t[0][2] + _4 * t[0][3]);
151             let d11 = _0 * t[0][0] + _7 * t[0][2] + _8 * t[0][3] -
152                 (_1 * t[0][0] + _6 * t[0][2] + _9 * t[0][3]);
153             let d12 = _3 * t[0][0] + _6 * t[0][1] + _11 * t[0][3] -
154                 (_2 * t[0][0] + _7 * t[0][1] + _10 * t[0][3]);
155             let d13 = _4 * t[0][0] + _9 * t[0][1] + _10 * t[0][2] -
156                 (_5 * t[0][0] + _8 * t[0][1] + _11 * t[0][2]);
157 
158             let _0 = t[0][2] * t[1][3];
159             let _1 = t[0][3] * t[1][2];
160             let _2 = t[0][1] * t[1][3];
161             let _3 = t[0][3] * t[1][1];
162             let _4 = t[0][1] * t[1][2];
163             let _5 = t[0][2] * t[1][1];
164             let _6 = t[0][0] * t[1][3];
165             let _7 = t[0][3] * t[1][0];
166             let _8 = t[0][0] * t[1][2];
167             let _9 = t[0][2] * t[1][0];
168             let _10 = t[0][0] * t[1][1];
169             let _11 = t[0][1] * t[1][0];
170 
171             let d20  = _0*t[3][1]  + _3*t[3][2]  + _4*t[3][3]-
172                 (_1*t[3][1]  + _2*t[3][2]  + _5*t[3][3]);
173             let d21  = _1*t[3][0]  + _6*t[3][2]  + _9*t[3][3]-
174                 (_0*t[3][0]  + _7*t[3][2]  + _8*t[3][3]);
175             let d22 = _2*t[3][0]  + _7*t[3][1]  + _10*t[3][3]-
176                 (_3*t[3][0]  + _6*t[3][1]  + _11*t[3][3]);
177             let d23 = _5*t[3][0]  + _8*t[3][1]  + _11*t[3][2]-
178                 (_4*t[3][0]  + _9*t[3][1]  + _10*t[3][2]);
179             let d30 = _2*t[2][2]  + _5*t[2][3]  + _1*t[2][1]-
180                 (_4*t[2][3]  + _0*t[2][1]   + _3*t[2][2]);
181             let d31 = _8*t[2][3]  + _0*t[2][0]   + _7*t[2][2]-
182                 (_6*t[2][2]  + _9*t[2][3]  + _1*t[2][0]);
183             let d32 = _6*t[2][1]   + _11*t[2][3] + _3*t[2][0]-
184                 (_10*t[2][3] + _2*t[2][0]   + _7*t[2][1]);
185             let d33 = _10*t[2][2] + _4*t[2][0]   + _9*t[2][1]-
186                 (_8*t[2][1]   + _11*t[2][2] + _5*t[2][0]);
187 
188             let det = t[0][0] * d00 + t[0][1] * d01 + t[0][2] * d02 + t[0][3] * d03;
189 
190             let det = 1.0 / det;
191             let mut ret = [[d00, d01, d02, d03],
192                            [d10, d11, d12, d13],
193                            [d20, d21, d22, d23],
194                            [d30, d31, d32, d33]];
195             for i in 0..4 {
196                 for j in 0..4 {
197                     ret[i][j] *= det;
198                 }
199             }
200             bb(&ret);
201         }
202     })
203 }
204 
205 #[bench]
inverse_simd4(b: &mut B)206 fn inverse_simd4(b: &mut B) {
207     let mut x = [f32x4::splat(0_f32); 4];
208     for i in 0..4 { x[i] = x[i].replace(i as u32, 1.0); }
209 
210     fn shuf0145(v: f32x4, w: f32x4) -> f32x4 {
211         f32x4::new(v.extract(0), v.extract(1),
212                    w.extract(4 - 4), w.extract(5 - 4))
213     }
214     fn shuf0246(v: f32x4, w: f32x4) -> f32x4 {
215         f32x4::new(v.extract(0), v.extract(2),
216                    w.extract(4 - 4), w.extract(6 - 4))
217     }
218     fn shuf1357(v: f32x4, w: f32x4) -> f32x4 {
219         f32x4::new(v.extract(1), v.extract(3),
220                    w.extract(5 - 4), w.extract(7 - 4))
221     }
222     fn shuf2367(v: f32x4, w: f32x4) -> f32x4 {
223         f32x4::new(v.extract(2), v.extract(3),
224                    w.extract(6 - 4), w.extract(7 - 4))
225     }
226 
227     fn swiz1032(v: f32x4) -> f32x4 {
228         f32x4::new(v.extract(1), v.extract(0),
229                    v.extract(3), v.extract(2))
230     }
231     fn swiz2301(v: f32x4) -> f32x4 {
232         f32x4::new(v.extract(2), v.extract(3),
233                    v.extract(0), v.extract(1))
234     }
235 
236     b.iter(|| {
237         for _ in 0..100 {
238             let src0;
239             let src1;
240             let src2;
241             let src3;
242             let mut tmp1;
243             let row0;
244             let mut row1;
245             let mut row2;
246             let mut row3;
247             let mut minor0;
248             let mut minor1;
249             let mut minor2;
250             let mut minor3;
251             let mut det;
252 
253             let x = bb(&x);
254             src0 = x[0];
255             src1 = x[1];
256             src2 = x[2];
257             src3 = x[3];
258 
259             tmp1 = shuf0145(src0, src1);
260             row1 = shuf0145(src2, src3);
261             row0 = shuf0246(tmp1, row1);
262             row1 = shuf1357(row1, tmp1);
263 
264             tmp1 = shuf2367(src0, src1);
265             row3 = shuf2367(src2, src3);
266             row2 = shuf0246(tmp1, row3);
267             row3 = shuf0246(row3, tmp1);
268 
269 
270             tmp1 = row2 * row3;
271             tmp1 = swiz1032(tmp1);
272             minor0 = row1 * tmp1;
273             minor1 = row0 * tmp1;
274             tmp1 = swiz2301(tmp1);
275             minor0 = (row1 * tmp1) - minor0;
276             minor1 = (row0 * tmp1) - minor1;
277             minor1 = swiz2301(minor1);
278 
279 
280             tmp1 = row1 * row2;
281             tmp1 = swiz1032(tmp1);
282             minor0 = (row3 * tmp1) + minor0;
283             minor3 = row0 * tmp1;
284             tmp1 = swiz2301(tmp1);
285 
286             minor0 = minor0 - row3 * tmp1;
287             minor3 = row0 * tmp1 - minor3;
288             minor3 = swiz2301(minor3);
289 
290 
291             tmp1 = row3 * swiz2301(row1);
292             tmp1 = swiz1032(tmp1);
293             row2 = swiz2301(row2);
294             minor0 = row2 * tmp1 + minor0;
295             minor2 = row0 * tmp1;
296             tmp1 = swiz2301(tmp1);
297             minor0 = minor0 - row2 * tmp1;
298             minor2 = row0 * tmp1 - minor2;
299             minor2 = swiz2301(minor2);
300 
301 
302             tmp1 = row0 * row1;
303             tmp1 = swiz1032(tmp1);
304             minor2 = minor2 + row3 * tmp1;
305             minor3 = row2 * tmp1 - minor3;
306             tmp1 = swiz2301(tmp1);
307             minor2 = row3 * tmp1 - minor2;
308             minor3 = minor3 - row2 * tmp1;
309 
310 
311 
312             tmp1 = row0 * row3;
313             tmp1 = swiz1032(tmp1);
314             minor1 = minor1 - row2 * tmp1;
315             minor2 = row1 * tmp1 + minor2;
316             tmp1 = swiz2301(tmp1);
317             minor1 = row2 * tmp1 + minor1;
318             minor2 = minor2 - row1 * tmp1;
319 
320             tmp1 = row0 * row2;
321             tmp1 = swiz1032(tmp1);
322             minor1 = row3 * tmp1 + minor1;
323             minor3 = minor3 - row1 * tmp1;
324             tmp1 = swiz2301(tmp1);
325             minor1 = minor1 - row3 * tmp1;
326             minor3 = row1 * tmp1 + minor3;
327 
328             det = row0 * minor0;
329             det = swiz2301(det) + det;
330             det = swiz1032(det) + det;
331             //tmp1 = det.approx_reciprocal(); det = tmp1 * (f32x4::splat(2.0) - det * tmp1);
332             det = f32x4::splat(1.0) / det;
333 
334             bb(&[minor0 * det, minor1 * det, minor2 * det, minor3 * det]);
335         }
336      })
337 
338 }
339 
340 #[bench]
transpose_naive(b: &mut B)341 fn transpose_naive(b: &mut B) {
342     let x = [[0_f32; 4]; 4];
343     b.iter(|| {
344         for _ in 0..100 {
345             let x = bb(&x);
346             bb(&[[x[0][0], x[1][0], x[2][0], x[3][0]],
347                  [x[0][1], x[1][1], x[2][1], x[3][1]],
348                  [x[0][2], x[1][2], x[2][2], x[3][2]],
349                  [x[0][3], x[1][3], x[2][3], x[3][3]]]);
350         }
351     })
352 }
353 
354 #[bench]
transpose_simd4(b: &mut B)355 fn transpose_simd4(b: &mut B) {
356     let x = [f32x4::splat(0_f32); 4];
357 
358     fn shuf0246(v: f32x4, w: f32x4) -> f32x4 {
359         f32x4::new(v.extract(0), v.extract(2),
360                    w.extract(4 - 4), w.extract(6 - 4))
361     }
362     fn shuf1357(v: f32x4, w: f32x4) -> f32x4 {
363         f32x4::new(v.extract(1), v.extract(3),
364                    w.extract(5 - 4), w.extract(7 - 4))
365     }
366     b.iter(|| {
367         for _ in 0..100 {
368             let x = bb(&x);
369             let x0 = x[0];
370             let x1 = x[1];
371             let x2 = x[2];
372             let x3 = x[3];
373 
374             let a0 = shuf0246(x0, x1);
375             let a1 = shuf0246(x2, x3);
376             let a2 = shuf1357(x0, x1);
377             let a3 = shuf1357(x2, x3);
378 
379             let b0 = shuf0246(a0, a1);
380             let b1 = shuf0246(a2, a3);
381             let b2 = shuf1357(a0, a1);
382             let b3 = shuf1357(a2, a3);
383             bb(&[b0, b1, b2, b3]);
384         }
385     })
386 }
387 
388 #[cfg(target_feature = "avx")]
389 #[bench]
transpose_simd8_naive(b: &mut B)390 fn transpose_simd8_naive(b: &mut B) {
391     let x = [f32x8::splat(0_f32); 2];
392 
393     fn shuf0246(v: f32x8, w: f32x8) -> f32x8 {
394         f32x8::new(v.extract(0), v.extract(2), v.extract(4), v.extract(6),
395                    w.extract(0), w.extract(2), w.extract(4), w.extract(6))
396     }
397     fn shuf1357(v: f32x8, w: f32x8) -> f32x8 {
398         f32x8::new(v.extract(1), v.extract(3), v.extract(5), v.extract(7),
399                    w.extract(1), w.extract(3), w.extract(5), w.extract(7),)
400     }
401     b.iter(|| {
402         for _ in 0..100 {
403             let x = bb(&x);
404             let x01 = x[0];
405             let x23 = x[1];
406 
407             let a01 = shuf0246(x01, x23);
408             let a23 = shuf1357(x01, x23);
409 
410             let b01 = shuf0246(a01, a23);
411             let b23 = shuf1357(a01, a23);
412             bb(&[b01, b23]);
413         }
414     })
415 }
416 
417 #[cfg(target_feature = "avx")]
418 #[bench]
transpose_simd8_avx2_vpermps(b: &mut B)419 fn transpose_simd8_avx2_vpermps(b: &mut B) {
420     let x = [f32x8::splat(0_f32); 2];
421 
422     // efficient on AVX2 using vpermps
423     fn perm04152637(v: f32x8) -> f32x8 {
424         // broken on rustc 1.7.0-nightly (1ddaf8bdf 2015-12-12)
425         // v.permutevar(i32x8::new(0, 4, 1, 5, 2, 6, 3, 7))
426         f32x8::new(v.extract(0), v.extract(4), v.extract(1), v.extract(5),
427                     v.extract(2), v.extract(6), v.extract(3), v.extract(7))
428     }
429     fn shuf_lo(v: f32x8, w: f32x8) -> f32x8 {
430         f32x8::new(v.extract(0), v.extract(1), w.extract(0), w.extract(1),
431                    v.extract(4), v.extract(5), w.extract(4), w.extract(5),)
432     }
433     fn shuf_hi(v: f32x8, w: f32x8) -> f32x8 {
434         f32x8::new(v.extract(2), v.extract(3), w.extract(2), w.extract(3),
435                    v.extract(6), v.extract(7), w.extract(6), w.extract(7),)
436     }
437     b.iter(|| {
438         for _ in 0..100 {
439             let x = bb(&x);
440             let x01 = x[0];
441             let x23 = x[1];
442 
443             let a01 = perm04152637(x01);
444             let a23 = perm04152637(x23);
445 
446             let b01 = shuf_lo(a01, a23);
447             let b23 = shuf_hi(a01, a23);
448             bb(&[b01, b23]);
449         }
450     })
451 }
452 
453 #[cfg(target_feature = "avx")]
454 #[bench]
transpose_simd8_avx2_vpermpd(b: &mut B)455 fn transpose_simd8_avx2_vpermpd(b: &mut B) {
456     let x = [f32x8::splat(0_f32); 2];
457 
458     // efficient on AVX2 using vpermpd
459     fn perm01452367(v: f32x8) -> f32x8 {
460         f32x8::new(v.extract(0), v.extract(1), v.extract(4), v.extract(5),
461                     v.extract(2), v.extract(3), v.extract(6), v.extract(7))
462     }
463     fn shuf_lo_ps(v: f32x8, w: f32x8) -> f32x8 {
464         f32x8::new(v.extract(0), w.extract(0), v.extract(1), w.extract(1),
465                    v.extract(4), w.extract(4), v.extract(5), w.extract(5),)
466     }
467     fn shuf_hi_ps(v: f32x8, w: f32x8) -> f32x8 {
468         f32x8::new(v.extract(2), w.extract(2), v.extract(3), w.extract(3),
469                    v.extract(6), w.extract(6), v.extract(7), w.extract(7),)
470     }
471     b.iter(|| {
472         for _ in 0..100 {
473             let x = bb(&x);
474             let x01 = x[0];
475             let x23 = x[1];
476 
477             let a01 = perm01452367(x01);
478             let a23 = perm01452367(x23);
479 
480             let b01 = shuf_lo_ps(a01, a23);
481             let b23 = shuf_hi_ps(a01, a23);
482             bb(&[b01, b23]);
483         }
484     })
485 }
486