/**
 * Copyright 2016 Andreas Schäfer
 *
 * Distributed under the Boost Software License, Version 1.0. (See accompanying
 * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt)
 */

#ifndef FLAT_ARRAY_ESTIMATE_OPTIMUM_SHORT_VEC_TYPE_HPP
#define FLAT_ARRAY_ESTIMATE_OPTIMUM_SHORT_VEC_TYPE_HPP

#include <libflatarray/detail/streaming_short_vec_switch.hpp>

namespace LibFlatArray {
/**
 * This class serves as a type switch to select an appropriate
 * short_vec type based on the machine architecture and working set
 * size. This is just a heuristic. Users are advised that an
 * analytical performance model can yield much better results.
 *
 * We're primarily concerned with two choices: temporal vs.
 * non-temporal stores and the arity of the vector type. Smaller
 * working sets should use short_vec if they fit well into the cache;
 * larger sets should use streaming_short_vec to benefit from
 * streaming stores.
 *
 * The arity of the vector type should not be smaller than the arity
 * of the supported assembly instructions (e.g. >=8 for AVX512 and
 * doubles). If the arity is larger, we effectively perform
 * loop unrolling. This may be beneficial for architectures that
 * struggle with out-of-order execution, as it lengthens the loop body
 * and gives them more independent instructions to work on (e.g. Intel
 * Core 2). Modern Intel architectures, however, may suffer from
 * unrolling as this might make the loop body exceed the size of the
 * loop buffer which holds previously decoded microinstructions.
 *
 * Arguments should be:
 *
 * - CARGO: the main machine data type used inside the kernel, e.g.
 *     float or double. Most kernels will operate on various data
 *     types, but the vector arity should usually be chosen based on
 *     the type which is used most, as it has the strongest impact on
 *     register scheduling.
 *
 * - ACCESSOR: an soa_accessor produced by LibFlatArray that provides
 *     the number of elements in the working set. We assume the size
 *     of the working set to be the product of the size of CARGO and
 *     the number of elements in the set.
 *
 * - LAST_LEVEL_CACHE_SIZE_ESTIMATE: if available, the user can give
 *     an estimate of the size of the CPU's last level cache. Our
 *     hard-coded value will overestimate that size for most
 *     architectures, but that's generally fine. The consequence of
 *     overestimating is that for some medium-sized sets the code will
 *     use temporal stores instead of non-temporal stores, resulting
 *     in a performance hit of less than 30% (true for most codes and
 *     current architectures). Underestimating the cache size will
 *     result in the use of streaming stores even if the working set
 *     would fit just fine into the caches, easily resulting in a
 *     performance hit of 1500% (e.g. 0.4 GLUPS instead of 6 GLUPS for
 *     a 3D Jacobi on an Intel i7-6700HQ). Bottom line: never
 *     underestimate the cache size!
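 *
 * Usage sketch (my_accessor below is a hypothetical stand-in for any
 * soa_accessor generated by LibFlatArray for a user-defined cell
 * type):
 *
 *   typedef typename estimate_optimum_short_vec_type<double, my_accessor>::VALUE my_vec;
 *
 *   // my_vec now aliases either short_vec<double, ARITY> or
 *   // streaming_short_vec<double, ARITY> and can be used for loads,
 *   // arithmetic, and stores in the kernel's inner loop.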
 */
template<typename CARGO, typename ACCESSOR, int LAST_LEVEL_CACHE_SIZE_ESTIMATE = (1 << 25)>
class estimate_optimum_short_vec_type
{
public:
    // Revert to scalar values when running on a CUDA device. The
    // vector unit is much wider, but from a programming PoV it's
    // scalar:
#ifdef __CUDA_ARCH__
    static const int ARITY = 1;
#else
    // for IBM Blue Gene/Q's QPX, which is mutually exclusive with
    // Intel/AMD's AVX/SSE and ARM's NEON ISAs:
#  ifdef __VECTOR4DOUBLE__
    static const int BIT_WIDTH = 256;
#  endif

    // Ditto for ARM NEON:
#  ifdef __ARM_NEON__
    static const int BIT_WIDTH = 128;
#  endif

    // Ditto for Intel's MIC (Knights Corner), whose 512-bit vector
    // ISA predates AVX512:
#  ifdef __MIC__
    static const int BIT_WIDTH = 512;
#  endif

    // Only the case of the IBM PC is complicated. No thanks to you,
    // history!
#  if !defined(__VECTOR4DOUBLE__) && !defined(__ARM_NEON__) && !defined(__MIC__)
#    if defined(__AVX512F__)
    static const int BIT_WIDTH = 512;
#    elif defined(__AVX__)
    static const int BIT_WIDTH = 256;
#    elif defined(__SSE__)
    static const int BIT_WIDTH = 128;
#    else
    // no vector ISA detected, fall back to the scalar width of CARGO:
    static const int BIT_WIDTH = sizeof(CARGO) * 8;
#    endif
#  endif

    // rule of thumb: 2x loop unrolling for CPUs:
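    // (e.g. AVX with BIT_WIDTH = 256 and double with sizeof(CARGO) = 8
    // yields ARITY = 2 * 256 / 8 / 8 = 8, i.e. two 4-lane vectors per
    // loop iteration)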
    static const int ARITY = 2 * BIT_WIDTH / sizeof(CARGO) / 8;
#endif
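    // Integer division truncates: STREAMING_FLAG is 0 if the working
    // set fits into the estimated last level cache (temporal stores),
    // and non-zero otherwise (streaming stores):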
    static const int STREAMING_FLAG =
        ACCESSOR::DIM_PROD * sizeof(typename ACCESSOR::element_type) / LAST_LEVEL_CACHE_SIZE_ESTIMATE;
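    // The switch resolves to short_vec<CARGO, ARITY> for a zero
    // STREAMING_FLAG and to streaming_short_vec<CARGO, ARITY>
    // otherwise: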
    typedef typename detail::flat_array::streaming_short_vec_switch<CARGO, ARITY, STREAMING_FLAG>::VALUE VALUE;
};

}

#endif