1 /*
2 * Copyright (c) 2019-2020 The University of Tennessee and The University
3 * of Tennessee Research Foundation. All rights
4 * reserved.
5 * Copyright (c) 2020 Research Organization for Information Science
6 * and Technology (RIST). All rights reserved.
7 * Copyright (c) 2021 Cisco Systems, Inc. All rights reserved.
8 * $COPYRIGHT$
9 *
10 * Additional copyrights may follow
11 *
12 * $HEADER$
13 */
14
15 /** @file
16 *
17 * This is the "avx" component source code.
18 *
19 */
20
21 #include "ompi_config.h"
22
23 #include "opal/util/printf.h"
24 #include "ompi/include/mpi_portable_platform.h"
25
26 #include "ompi/constants.h"
27 #include "ompi/op/op.h"
28 #include "ompi/mca/op/op.h"
29 #include "ompi/mca/op/base/base.h"
30 #include "ompi/mca/op/avx/op_avx.h"
31
/* Forward declarations of the component entry points wired into
 * mca_op_avx_component below. */
static int avx_component_open(void);
static int avx_component_close(void);
static int avx_component_init_query(bool enable_progress_threads,
                                    bool enable_mpi_thread_multiple);
static struct ompi_op_base_module_1_0_0_t *
    avx_component_op_query(struct ompi_op_t *op, int *priority);
static int avx_component_register(void);
39
/* Human-readable names for the capability bits reported by the
 * "capabilities"/"support" MCA variables.  The .flag values mirror the
 * OMPI_OP_AVX_HAS_*_FLAG bit positions used by has_intel_AVX_features().
 * NOTE(review): bits 0x040 and 0x080 are absent here — presumably reserved
 * in op_avx.h; confirm against the OMPI_OP_AVX_HAS_* definitions. */
static mca_base_var_enum_value_flag_t avx_support_flags[] = {
    { .flag = 0x001, .string = "SSE" },
    { .flag = 0x002, .string = "SSE2" },
    { .flag = 0x004, .string = "SSE3" },
    { .flag = 0x008, .string = "SSE4.1" },
    { .flag = 0x010, .string = "AVX" },
    { .flag = 0x020, .string = "AVX2" },
    { .flag = 0x100, .string = "AVX512F" },
    { .flag = 0x200, .string = "AVX512BW" },
    { .flag = 0, .string = NULL },        /* terminator */
};
51
52 /**
53 * A slightly modified code from
54 * https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family
55 */
56 #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1300)
57
58 #include <immintrin.h>
59
/**
 * Detect the SIMD capabilities of the current CPU (Intel compiler path).
 *
 * Uses the Intel compiler's _may_i_use_cpu_feature() intrinsic, which
 * accounts for both hardware and OS support.
 *
 * @return bitwise OR of the OMPI_OP_AVX_HAS_*_FLAG capability bits.
 */
static uint32_t has_intel_AVX_features(void)
{
    uint32_t flags = 0;

    flags |= _may_i_use_cpu_feature(_FEATURE_AVX512F)  ? OMPI_OP_AVX_HAS_AVX512F_FLAG   : 0;
    flags |= _may_i_use_cpu_feature(_FEATURE_AVX512BW) ? OMPI_OP_AVX_HAS_AVX512BW_FLAG  : 0;
    flags |= _may_i_use_cpu_feature(_FEATURE_AVX2)     ? OMPI_OP_AVX_HAS_AVX2_FLAG      : 0;
    flags |= _may_i_use_cpu_feature(_FEATURE_AVX)      ? OMPI_OP_AVX_HAS_AVX_FLAG       : 0;
    flags |= _may_i_use_cpu_feature(_FEATURE_SSE4_1)   ? OMPI_OP_AVX_HAS_SSE4_1_FLAG    : 0;
    flags |= _may_i_use_cpu_feature(_FEATURE_SSE3)     ? OMPI_OP_AVX_HAS_SSE3_FLAG      : 0;
    flags |= _may_i_use_cpu_feature(_FEATURE_SSE2)     ? OMPI_OP_AVX_HAS_SSE2_FLAG      : 0;
    flags |= _may_i_use_cpu_feature(_FEATURE_SSE)      ? OMPI_OP_AVX_HAS_SSE_FLAG       : 0;
    return flags;
}
74 #else /* non-Intel compiler */
75 #include <stdint.h>
76
77 #if defined(_MSC_VER)
78 #include <intrin.h>
79 #endif
80
/**
 * Execute the CPUID instruction for leaf @eax / sub-leaf @ecx and store
 * the resulting EAX, EBX, ECX, EDX registers into abcd[0..3].
 *
 * On MSVC this delegates to the __cpuidex intrinsic; elsewhere it uses
 * inline assembly.  Under 32-bit PIC, EBX holds the GOT pointer and must
 * not appear in the clobber list, hence the save/restore through EDI.
 */
static void run_cpuid(uint32_t eax, uint32_t ecx, uint32_t* abcd)
{
#if defined(_MSC_VER)
    __cpuidex(abcd, eax, ecx);
#else
    uint32_t ebx = 0, edx = 0;
#if defined( __i386__ ) && defined ( __PIC__ )
    /* in case of PIC under 32-bit EBX cannot be clobbered */
    __asm__ ( "movl %%ebx, %%edi \n\t cpuid \n\t xchgl %%ebx, %%edi" : "=D" (ebx),
#else
    __asm__ ( "cpuid" : "+b" (ebx),
#endif /* defined( __i386__ ) && defined ( __PIC__ ) */
              "+a" (eax), "+c" (ecx), "=d" (edx) );
    abcd[0] = eax; abcd[1] = ebx; abcd[2] = ecx; abcd[3] = edx;
#endif
}
97
/**
 * Detect the SIMD capabilities of the current CPU (generic compiler path)
 * by querying CPUID leaves 1 and 7 directly.
 *
 * @return bitwise OR of the OMPI_OP_AVX_HAS_*_FLAG capability bits, or 0
 *         when the required support is absent.
 *
 * NOTE(review): except on Apple, this only checks the CPU feature bits;
 * it does not verify via OSXSAVE/XGETBV that the OS actually saves the
 * YMM/ZMM register state — confirm whether that is guaranteed elsewhere.
 */
static uint32_t has_intel_AVX_features(void)
{
    /* From https://en.wikipedia.org/wiki/CPUID#EAX=1:_Processor_Info_and_Feature_Bits */
    const uint32_t avx512f_mask   = (1U << 16);  // AVX512F   (EAX = 7, ECX = 0) : EBX
    const uint32_t avx512_bw_mask = (1U << 30);  // AVX512BW  (EAX = 7, ECX = 0) : EBX
    const uint32_t avx2_mask      = (1U << 5);   // AVX2      (EAX = 7, ECX = 0) : EBX
    const uint32_t avx_mask       = (1U << 28);  // AVX       (EAX = 1, ECX = 0) : ECX
    const uint32_t sse4_1_mask    = (1U << 19);  // SSE4.1    (EAX = 1, ECX = 0) : ECX
    const uint32_t sse3_mask      = (1U << 0);   // SSE3      (EAX = 1, ECX = 0) : ECX
    const uint32_t sse2_mask      = (1U << 26);  // SSE2      (EAX = 1, ECX = 0) : EDX
    const uint32_t sse_mask       = (1U << 15);  // SSE       (EAX = 1, ECX = 0) : EDX
    uint32_t flags = 0, abcd[4];

    /* Leaf 1: baseline SSE/AVX feature bits in ECX/EDX. */
    run_cpuid( 1, 0, abcd );
    flags |= (abcd[2] & avx_mask)    ? OMPI_OP_AVX_HAS_AVX_FLAG    : 0;
    flags |= (abcd[2] & sse4_1_mask) ? OMPI_OP_AVX_HAS_SSE4_1_FLAG : 0;
    flags |= (abcd[2] & sse3_mask)   ? OMPI_OP_AVX_HAS_SSE3_FLAG   : 0;
    flags |= (abcd[3] & sse2_mask)   ? OMPI_OP_AVX_HAS_SSE2_FLAG   : 0;
    flags |= (abcd[3] & sse_mask)    ? OMPI_OP_AVX_HAS_SSE_FLAG    : 0;
#if defined(__APPLE__)
    uint32_t fma_movbe_osxsave_mask = ((1U << 12) | (1U << 22) | (1U << 27)); /* FMA(12) + MOVBE (22) OSXSAVE (27) */
    // OS supports extended processor state management ?
    /* If any of FMA/MOVBE/OSXSAVE is missing, report no SIMD support at
     * all (the flags accumulated above are intentionally discarded). */
    if ( (abcd[2] & fma_movbe_osxsave_mask) != fma_movbe_osxsave_mask )
        return 0;
#endif /* defined(__APPLE__) */

    /* Leaf 7, sub-leaf 0: AVX2 and AVX-512 feature bits in EBX. */
    run_cpuid( 7, 0, abcd );
    flags |= (abcd[1] & avx512f_mask)   ? OMPI_OP_AVX_HAS_AVX512F_FLAG  : 0;
    flags |= (abcd[1] & avx512_bw_mask) ? OMPI_OP_AVX_HAS_AVX512BW_FLAG : 0;
    flags |= (abcd[1] & avx2_mask)      ? OMPI_OP_AVX_HAS_AVX2_FLAG     : 0;
    return flags;
}
130 #endif /* non-Intel compiler */
131
/* The single global instance of this component, found by the op framework
 * via its well-known symbol name.  Only the framework hooks are set here;
 * the .supported/.flags capability masks are filled in during
 * avx_component_register(). */
ompi_op_avx_component_t mca_op_avx_component = {
    {
        .opc_version = {
            OMPI_OP_BASE_VERSION_1_0_0,

            .mca_component_name = "avx",
            MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION,
                                  OMPI_RELEASE_VERSION),
            .mca_open_component = avx_component_open,
            .mca_close_component = avx_component_close,
            .mca_register_component_params = avx_component_register,
        },
        .opc_data = {
            /* The component is checkpoint ready */
            MCA_BASE_METADATA_PARAM_CHECKPOINT
        },

        .opc_init_query = avx_component_init_query,
        .opc_op_query = avx_component_op_query,
    },
};
153
154 /*
155 * Component open
156 */
avx_component_open(void)157 static int avx_component_open(void)
158 {
159 /* We checked the flags during register, so if they are set to
160 * zero either the architecture is not suitable or the user disabled
161 * AVX support.
162 *
163 * A first level check to see what level of AVX is available on the
164 * hardware.
165 *
166 * Note that if this function returns non-OMPI_SUCCESS, then this
167 * component won't even be shown in ompi_info output (which is
168 * probably not what you want).
169 */
170 return OMPI_SUCCESS;
171 }
172
173 /*
174 * Component close
175 */
avx_component_close(void)176 static int avx_component_close(void)
177 {
178 /* If avx was opened successfully, close it (i.e., release any
179 resources that may have been allocated on this component).
180 Note that _component_close() will always be called at the end
181 of the process, so it may have been after any/all of the other
182 component functions have been invoked (and possibly even after
183 modules have been created and/or destroyed). */
184
185 return OMPI_SUCCESS;
186 }
187
188 /*
189 * Register MCA params.
190 */
191 static int
avx_component_register(void)192 avx_component_register(void)
193 {
194 mca_op_avx_component.supported =
195 mca_op_avx_component.flags = has_intel_AVX_features();
196
197 // MCA var enum flag for conveniently seeing SSE/MMX/AVX support
198 // values
199 mca_base_var_enum_flag_t *new_enum_flag = NULL;
200 (void) mca_base_var_enum_create_flag("op_avx_support_flags",
201 avx_support_flags, &new_enum_flag);
202
203 (void) mca_base_component_var_register(&mca_op_avx_component.super.opc_version,
204 "capabilities",
205 "Level of SSE/MMX/AVX support available in the current environment",
206 MCA_BASE_VAR_TYPE_INT,
207 &(new_enum_flag->super), 0, 0,
208 OPAL_INFO_LVL_4,
209 MCA_BASE_VAR_SCOPE_CONSTANT,
210 &mca_op_avx_component.supported);
211
212 (void) mca_base_component_var_register(&mca_op_avx_component.super.opc_version,
213 "support",
214 "Level of SSE/MMX/AVX support to be used, capped by the local architecture capabilities",
215 MCA_BASE_VAR_TYPE_INT,
216 &(new_enum_flag->super), 0, 0,
217 OPAL_INFO_LVL_4,
218 MCA_BASE_VAR_SCOPE_LOCAL,
219 &mca_op_avx_component.flags);
220 OBJ_RELEASE(new_enum_flag);
221
222 mca_op_avx_component.flags &= mca_op_avx_component.supported;
223
224 return OMPI_SUCCESS;
225 }
226
227 /*
228 * Query whether this component wants to be used in this process.
229 */
230 static int
avx_component_init_query(bool enable_progress_threads,bool enable_mpi_thread_multiple)231 avx_component_init_query(bool enable_progress_threads,
232 bool enable_mpi_thread_multiple)
233 {
234 if( 0 == mca_op_avx_component.flags )
235 return OMPI_ERR_NOT_SUPPORTED;
236 return OMPI_SUCCESS;
237 }
238
/* Per-ISA dispatch tables, defined in the separately-compiled
 * op_avx_functions.c instances (one build per ISA level), indexed by
 * [Fortran op index][datatype].  Entries may be NULL when a given
 * op/type combination has no vectorized implementation. */
#if OMPI_MCA_OP_HAVE_AVX512
extern ompi_op_base_handler_fn_t ompi_op_avx_functions_avx512[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX];
extern ompi_op_base_3buff_handler_fn_t ompi_op_avx_3buff_functions_avx512[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX];
#endif
#if OMPI_MCA_OP_HAVE_AVX2
extern ompi_op_base_handler_fn_t ompi_op_avx_functions_avx2[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX];
extern ompi_op_base_3buff_handler_fn_t ompi_op_avx_3buff_functions_avx2[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX];
#endif
#if OMPI_MCA_OP_HAVE_AVX
extern ompi_op_base_handler_fn_t ompi_op_avx_functions_avx[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX];
extern ompi_op_base_3buff_handler_fn_t ompi_op_avx_3buff_functions_avx[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX];
#endif
251 /*
252 * Query whether this component can be used for a specific op
253 */
254 static struct ompi_op_base_module_1_0_0_t*
avx_component_op_query(struct ompi_op_t * op,int * priority)255 avx_component_op_query(struct ompi_op_t *op, int *priority)
256 {
257 ompi_op_base_module_t *module = NULL;
258 /* Sanity check -- although the framework should never invoke the
259 _component_op_query() on non-intrinsic MPI_Op's, we'll put a
260 check here just to be sure. */
261 if (0 == (OMPI_OP_FLAGS_INTRINSIC & op->o_flags)) {
262 return NULL;
263 }
264
265 switch (op->o_f_to_c_index) {
266 case OMPI_OP_BASE_FORTRAN_MAX:
267 case OMPI_OP_BASE_FORTRAN_MIN:
268 case OMPI_OP_BASE_FORTRAN_SUM:
269 case OMPI_OP_BASE_FORTRAN_PROD:
270 case OMPI_OP_BASE_FORTRAN_BOR:
271 case OMPI_OP_BASE_FORTRAN_BAND:
272 case OMPI_OP_BASE_FORTRAN_BXOR:
273 module = OBJ_NEW(ompi_op_base_module_t);
274 for (int i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) {
275 #if OMPI_MCA_OP_HAVE_AVX512
276 if( mca_op_avx_component.flags & OMPI_OP_AVX_HAS_AVX512F_FLAG ) {
277 module->opm_fns[i] = ompi_op_avx_functions_avx512[op->o_f_to_c_index][i];
278 module->opm_3buff_fns[i] = ompi_op_avx_3buff_functions_avx512[op->o_f_to_c_index][i];
279 }
280 #endif
281 #if OMPI_MCA_OP_HAVE_AVX2
282 if( mca_op_avx_component.flags & OMPI_OP_AVX_HAS_AVX2_FLAG ) {
283 if( NULL == module->opm_fns[i] ) {
284 module->opm_fns[i] = ompi_op_avx_functions_avx2[op->o_f_to_c_index][i];
285 }
286 if( NULL == module->opm_3buff_fns[i] ) {
287 module->opm_3buff_fns[i] = ompi_op_avx_3buff_functions_avx2[op->o_f_to_c_index][i];
288 }
289 }
290 #endif
291 #if OMPI_MCA_OP_HAVE_AVX
292 if( mca_op_avx_component.flags & OMPI_OP_AVX_HAS_AVX_FLAG ) {
293 if( NULL == module->opm_fns[i] ) {
294 module->opm_fns[i] = ompi_op_avx_functions_avx[op->o_f_to_c_index][i];
295 }
296 if( NULL == module->opm_3buff_fns[i] ) {
297 module->opm_3buff_fns[i] = ompi_op_avx_3buff_functions_avx[op->o_f_to_c_index][i];
298 }
299 }
300 #endif
301 if( NULL != module->opm_fns[i] ) {
302 OBJ_RETAIN(module);
303 }
304 if( NULL != module->opm_3buff_fns[i] ) {
305 OBJ_RETAIN(module);
306 }
307 }
308 break;
309 case OMPI_OP_BASE_FORTRAN_LAND:
310 case OMPI_OP_BASE_FORTRAN_LOR:
311 case OMPI_OP_BASE_FORTRAN_LXOR:
312 case OMPI_OP_BASE_FORTRAN_MAXLOC:
313 case OMPI_OP_BASE_FORTRAN_MINLOC:
314 case OMPI_OP_BASE_FORTRAN_REPLACE:
315 default:
316 break;
317 }
318 /* If we got a module from above, we'll return it. Otherwise,
319 we'll return NULL, indicating that this component does not want
320 to be considered for selection for this MPI_Op. Note that the
321 functions each returned a *avx* component pointer
322 (vs. a *base* component pointer -- where an *avx* component
323 is a base component plus some other module-specific cached
324 information), so we have to cast it to the right pointer type
325 before returning. */
326 if (NULL != module) {
327 *priority = 50;
328 }
329 return (ompi_op_base_module_1_0_0_t *) module;
330 }
331