1 /** 2 * Copyright 2016 Andreas Schäfer 3 * 4 * Distributed under the Boost Software License, Version 1.0. (See accompanying 5 * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) 6 */ 7 8 #ifndef FLAT_ARRAY_ESTIMATE_OPTIMUM_SHORT_VEC_TYPE_HPP 9 #define FLAT_ARRAY_ESTIMATE_OPTIMUM_SHORT_VEC_TYPE_HPP 10 11 #include <libflatarray/detail/streaming_short_vec_switch.hpp> 12 13 namespace LibFlatArray { 14 15 /** 16 * This class serves as a type switch to select an appropriate 17 * short_vec type based on the machine architecture and working set 18 * size. This is just a heuristic. Users are advised that an 19 * analytical performance model can yield much better results. 20 * 21 * We're primarily concerned with two choices: temporal vs. 22 * non-temporal stores and the arity of the vector type. Smaller 23 * working sets should use short_vec if they fit well into the cache, 24 * larger sets should use streaming_short_vec to benefit from 25 * streaming stores. 26 * 27 * The arity of the vector type should not be smaller than the arity 28 * of the supported assembly instructions (e.g. >=8 for AVX512 and 29 * doubles).If the arity is larger then we effectively perform 30 * loop-unrolling. This may be beneficial for architectures that 31 * struggle with out-of-order execution as if lenghtens the loop body 32 * and gives them more independent instructions to work on (e.g. Intel 33 * Core 2). Modern Intel architectures however may suffer from 34 * unrolling as this might make the loop body exceed the size of the 35 * loop buffer which holds previously decoded microinstructions. 36 * 37 * Arguments should be: 38 * 39 * - CARGO: the main machine data type used inside the kernel, e.g. 40 * float or double. Most kernels will operate on various data 41 * types, but the vector arity should usually be chosen based on 42 * that type which is used most as it has the strongest impact on 43 * register scheduling. 44 * 45 * - ACCESSOR: an soa_accessor produced by LibFlatArray that provides 46 * the number of elements in the working set. We assume the size 47 * of the working set to be the product of the size of CARGO and 48 * the number of elements in the set. 49 * 50 * - LAST_LEVEL_CACHE_SIZE_ESTIMATE: if available, the user can give 51 * an estimate of the CPU's cache. Our hard-coded value will 52 * overestimate that size for most architectures, but that's 53 * generally fine. The consequence of overestimating is that for 54 * some medium-sized sets the code will use temporal stores 55 * instead of non-temporal stores, reulting in a performance hit 56 * of less than 30% (true for most codes and current 57 * architectures). Underestimating the cache size will result in 58 * the use of steaming stores even if the working set would fit 59 * just fine into the caches, easily resulting in a performance 60 * hit of 1500% (e.g. 0.4 GLUPS instead of 6 GLUPS for a 3D Jacobi 61 * on an Intel i7-6700HQ). Bottom line: never underestimate the 62 * cache size! 63 */ 64 template<typename CARGO, typename ACCESSOR, int LAST_LEVEL_CACHE_SIZE_ESTIMATE = (1 << 25)> 65 class estimate_optimum_short_vec_type 66 { 67 public: 68 // Revert to scalar values when running on a CUDA device. The 69 // vector unit is much wider, but from a programming PoV it's 70 // scalar: 71 #ifdef __CUDA_ARCH__ 72 static const int ARITY = 1; 73 #else 74 // for IBM Blue Gene/Q's QPX, which is mutually exclusive to 75 // Intel/AMD's AVX/SSE or ARM's NEON ISAs: 76 # ifdef __VECTOR4DOUBLE__ 77 static const int BIT_WIDTH = 256; 78 # endif 79 80 // Dito for ARM NEON: 81 # ifdef __ARM_NEON__ 82 static const int BIT_WIDTH = 128; 83 # endif 84 85 // Only the case of the IBM PC is complicated. No thanks to you, 86 // history! 87 # if !defined(__CUDA_ARCH__) && !defined(__ARM_NEON__) && !defined(__MIC__) 88 # ifdef __AVX512F__ 89 static const int BIT_WIDTH = 512; 90 # else 91 # ifdef __AVX__ 92 static const int BIT_WIDTH = 256; 93 # else 94 # ifdef __SSE__ 95 static const int BIT_WIDTH = 128; 96 # else 97 static const int BIT_WIDTH = sizeof(CARGO) * 8; 98 # endif 99 # endif 100 # endif 101 # endif 102 103 // rule of thumb: 2x loop unrolling for CPUs: 104 static const int ARITY = 2 * BIT_WIDTH / sizeof(CARGO) / 8; 105 #endif 106 107 static const int STREAMING_FLAG = 108 ACCESSOR::DIM_PROD * sizeof(typename ACCESSOR::element_type) / LAST_LEVEL_CACHE_SIZE_ESTIMATE; 109 110 typedef typename detail::flat_array::streaming_short_vec_switch<CARGO, ARITY, STREAMING_FLAG>::VALUE VALUE; 111 }; 112 113 } 114 115 #endif 116