/*
 * Copyright (c) 2019-2020 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2020      Research Organization for Information Science
 *                         and Technology (RIST).  All rights reserved.
 * Copyright (c) 2021      Cisco Systems, Inc.  All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

/** @file
 *
 * This is the "avx" component source code.
 *
 */
20 
21 #include "ompi_config.h"
22 
23 #include "opal/util/printf.h"
24 #include "ompi/include/mpi_portable_platform.h"
25 
26 #include "ompi/constants.h"
27 #include "ompi/op/op.h"
28 #include "ompi/mca/op/op.h"
29 #include "ompi/mca/op/base/base.h"
30 #include "ompi/mca/op/avx/op_avx.h"
31 
32 static int avx_component_open(void);
33 static int avx_component_close(void);
34 static int avx_component_init_query(bool enable_progress_threads,
35                                     bool enable_mpi_thread_multiple);
36 static struct ompi_op_base_module_1_0_0_t *
37     avx_component_op_query(struct ompi_op_t *op, int *priority);
38 static int avx_component_register(void);
39 
/*
 * Human-readable names for the capability bits exposed through the
 * "capabilities" and "support" MCA variables registered below.
 * NOTE(review): the flag values presumably mirror the
 * OMPI_OP_AVX_HAS_*_FLAG masks used by has_intel_AVX_features()
 * (including the gap at 0x040/0x080) -- confirm against op_avx.h.
 */
static mca_base_var_enum_value_flag_t avx_support_flags[] = {
    { .flag = 0x001, .string = "SSE" },
    { .flag = 0x002, .string = "SSE2" },
    { .flag = 0x004, .string = "SSE3" },
    { .flag = 0x008, .string = "SSE4.1" },
    { .flag = 0x010, .string = "AVX" },
    { .flag = 0x020, .string = "AVX2" },
    { .flag = 0x100, .string = "AVX512F" },
    { .flag = 0x200, .string = "AVX512BW" },
    { .flag = 0,     .string = NULL },  /* sentinel: terminates the enum */
};
51 
52 /**
53  * A slightly modified code from
54  * https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family
55  */
56 #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1300)
57 
58 #include <immintrin.h>
59 
has_intel_AVX_features(void)60 static uint32_t has_intel_AVX_features(void)
61 {
62     uint32_t flags = 0;
63 
64     flags |= _may_i_use_cpu_feature(_FEATURE_AVX512F)  ? OMPI_OP_AVX_HAS_AVX512F_FLAG   : 0;
65     flags |= _may_i_use_cpu_feature(_FEATURE_AVX512BW) ? OMPI_OP_AVX_HAS_AVX512BW_FLAG : 0;
66     flags |= _may_i_use_cpu_feature(_FEATURE_AVX2)     ? OMPI_OP_AVX_HAS_AVX2_FLAG      : 0;
67     flags |= _may_i_use_cpu_feature(_FEATURE_AVX)      ? OMPI_OP_AVX_HAS_AVX_FLAG       : 0;
68     flags |= _may_i_use_cpu_feature(_FEATURE_SSE4_1)   ? OMPI_OP_AVX_HAS_SSE4_1_FLAG    : 0;
69     flags |= _may_i_use_cpu_feature(_FEATURE_SSE3)     ? OMPI_OP_AVX_HAS_SSE3_FLAG      : 0;
70     flags |= _may_i_use_cpu_feature(_FEATURE_SSE2)     ? OMPI_OP_AVX_HAS_SSE2_FLAG      : 0;
71     flags |= _may_i_use_cpu_feature(_FEATURE_SSE)      ? OMPI_OP_AVX_HAS_SSE_FLAG       : 0;
72     return flags;
73 }
74 #else /* non-Intel compiler */
75 #include <stdint.h>
76 
77 #if defined(_MSC_VER)
78 #include <intrin.h>
79 #endif
80 
/*
 * Execute the CPUID instruction with the given EAX (leaf) and ECX
 * (sub-leaf) inputs, storing the resulting EAX/EBX/ECX/EDX registers
 * into abcd[0..3] respectively.
 */
static void run_cpuid(uint32_t eax, uint32_t ecx, uint32_t* abcd)
{
#if defined(_MSC_VER)
    /* MSVC provides an intrinsic with exactly these semantics. */
    __cpuidex(abcd, eax, ecx);
#else
    uint32_t ebx = 0, edx = 0;
#if defined( __i386__ ) && defined ( __PIC__ )
    /* in case of PIC under 32-bit EBX cannot be clobbered
       (it holds the GOT pointer), so save it in EDI around CPUID. */
    __asm__ ( "movl %%ebx, %%edi \n\t cpuid \n\t xchgl %%ebx, %%edi" : "=D" (ebx),
#else
    __asm__ ( "cpuid" : "+b" (ebx),
#endif  /* defined( __i386__ ) && defined ( __PIC__ ) */
              "+a" (eax), "+c" (ecx), "=d" (edx) );
    abcd[0] = eax; abcd[1] = ebx; abcd[2] = ecx; abcd[3] = edx;
#endif
}
97 
/*
 * Probe the processor via CPUID and translate the reported feature
 * bits into the component's OMPI_OP_AVX_HAS_* capability flags.
 *
 * NOTE(review): outside of __APPLE__ this trusts the raw CPU feature
 * bits and never checks OSXSAVE/XGETBV to verify that the OS actually
 * saves and restores the YMM/ZMM register state -- confirm this is
 * acceptable on all supported platforms.
 */
static uint32_t has_intel_AVX_features(void)
{
    /* From https://en.wikipedia.org/wiki/CPUID#EAX=1:_Processor_Info_and_Feature_Bits */
    const uint32_t avx512f_mask   = (1U << 16);  // AVX512F   (EAX = 7, ECX = 0) : EBX
    const uint32_t avx512_bw_mask = (1U << 30);  // AVX512BW  (EAX = 7, ECX = 0) : EBX
    const uint32_t avx2_mask      = (1U << 5);   // AVX2      (EAX = 7, ECX = 0) : EBX
    const uint32_t avx_mask       = (1U << 28);  // AVX       (EAX = 1, ECX = 0) : ECX
    const uint32_t sse4_1_mask    = (1U << 19);  // SSE4.1    (EAX = 1, ECX = 0) : ECX
    const uint32_t sse3_mask      = (1U << 0);   // SSE3      (EAX = 1, ECX = 0) : ECX
    const uint32_t sse2_mask      = (1U << 26);  // SSE2      (EAX = 1, ECX = 0) : EDX
    const uint32_t sse_mask       = (1U << 15);  // SSE       (EAX = 1, ECX = 0) : EDX
    uint32_t flags = 0, abcd[4];

    /* Leaf 1: legacy SSE / AVX feature bits. */
    run_cpuid( 1, 0, abcd );
    flags |= (abcd[2] & avx_mask)       ? OMPI_OP_AVX_HAS_AVX_FLAG      : 0;
    flags |= (abcd[2] & sse4_1_mask)    ? OMPI_OP_AVX_HAS_SSE4_1_FLAG   : 0;
    flags |= (abcd[2] & sse3_mask)      ? OMPI_OP_AVX_HAS_SSE3_FLAG     : 0;
    flags |= (abcd[3] & sse2_mask)      ? OMPI_OP_AVX_HAS_SSE2_FLAG     : 0;
    flags |= (abcd[3] & sse_mask)       ? OMPI_OP_AVX_HAS_SSE_FLAG      : 0;
#if defined(__APPLE__)
    uint32_t fma_movbe_osxsave_mask = ((1U << 12) | (1U << 22) | (1U << 27));  /* FMA(12) + MOVBE (22) OSXSAVE (27) */
    // OS supports extended processor state management ?
    /* NOTE(review): returning 0 here also discards the SSE flags
       gathered above, not just the AVX-level ones -- verify that a
       full bail-out is the intended behavior. */
    if ( (abcd[2] & fma_movbe_osxsave_mask) != fma_movbe_osxsave_mask )
        return 0;
#endif  /* defined(__APPLE__) */

    /* Leaf 7, sub-leaf 0: AVX2 and AVX-512 feature bits (reported in EBX). */
    run_cpuid( 7, 0, abcd );
    flags |= (abcd[1] & avx512f_mask)   ? OMPI_OP_AVX_HAS_AVX512F_FLAG  : 0;
    flags |= (abcd[1] & avx512_bw_mask) ? OMPI_OP_AVX_HAS_AVX512BW_FLAG : 0;
    flags |= (abcd[1] & avx2_mask)      ? OMPI_OP_AVX_HAS_AVX2_FLAG     : 0;
    return flags;
}
130 #endif /* non-Intel compiler */
131 
/*
 * The component structure handed to the op framework: MCA version
 * information, lifecycle callbacks (open/close/register), and the
 * op-framework query entry points.
 */
ompi_op_avx_component_t mca_op_avx_component = {
    {
        .opc_version = {
            OMPI_OP_BASE_VERSION_1_0_0,

            .mca_component_name = "avx",
            MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION,
                                  OMPI_RELEASE_VERSION),
            .mca_open_component = avx_component_open,
            .mca_close_component = avx_component_close,
            .mca_register_component_params = avx_component_register,
        },
        .opc_data = {
            /* The component is checkpoint ready */
            MCA_BASE_METADATA_PARAM_CHECKPOINT
        },

        .opc_init_query = avx_component_init_query,
        .opc_op_query = avx_component_op_query,
    },
};
153 
154 /*
155  * Component open
156  */
avx_component_open(void)157 static int avx_component_open(void)
158 {
159     /* We checked the flags during register, so if they are set to
160      * zero either the architecture is not suitable or the user disabled
161      * AVX support.
162      *
163      * A first level check to see what level of AVX is available on the
164      * hardware.
165      *
166      * Note that if this function returns non-OMPI_SUCCESS, then this
167      * component won't even be shown in ompi_info output (which is
168      * probably not what you want).
169      */
170     return OMPI_SUCCESS;
171 }
172 
173 /*
174  * Component close
175  */
avx_component_close(void)176 static int avx_component_close(void)
177 {
178     /* If avx was opened successfully, close it (i.e., release any
179        resources that may have been allocated on this component).
180        Note that _component_close() will always be called at the end
181        of the process, so it may have been after any/all of the other
182        component functions have been invoked (and possibly even after
183        modules have been created and/or destroyed). */
184 
185     return OMPI_SUCCESS;
186 }
187 
188 /*
189  * Register MCA params.
190  */
191 static int
avx_component_register(void)192 avx_component_register(void)
193 {
194     mca_op_avx_component.supported =
195         mca_op_avx_component.flags = has_intel_AVX_features();
196 
197     // MCA var enum flag for conveniently seeing SSE/MMX/AVX support
198     // values
199     mca_base_var_enum_flag_t *new_enum_flag = NULL;
200     (void) mca_base_var_enum_create_flag("op_avx_support_flags",
201                                          avx_support_flags, &new_enum_flag);
202 
203     (void) mca_base_component_var_register(&mca_op_avx_component.super.opc_version,
204                                            "capabilities",
205                                            "Level of SSE/MMX/AVX support available in the current environment",
206                                            MCA_BASE_VAR_TYPE_INT,
207                                            &(new_enum_flag->super), 0, 0,
208                                            OPAL_INFO_LVL_4,
209                                            MCA_BASE_VAR_SCOPE_CONSTANT,
210                                            &mca_op_avx_component.supported);
211 
212     (void) mca_base_component_var_register(&mca_op_avx_component.super.opc_version,
213                                            "support",
214                                            "Level of SSE/MMX/AVX support to be used, capped by the local architecture capabilities",
215                                            MCA_BASE_VAR_TYPE_INT,
216                                            &(new_enum_flag->super), 0, 0,
217                                            OPAL_INFO_LVL_4,
218                                            MCA_BASE_VAR_SCOPE_LOCAL,
219                                            &mca_op_avx_component.flags);
220     OBJ_RELEASE(new_enum_flag);
221 
222     mca_op_avx_component.flags &= mca_op_avx_component.supported;
223 
224     return OMPI_SUCCESS;
225 }
226 
227 /*
228  * Query whether this component wants to be used in this process.
229  */
230 static int
avx_component_init_query(bool enable_progress_threads,bool enable_mpi_thread_multiple)231 avx_component_init_query(bool enable_progress_threads,
232                          bool enable_mpi_thread_multiple)
233 {
234     if( 0 == mca_op_avx_component.flags )
235         return OMPI_ERR_NOT_SUPPORTED;
236     return OMPI_SUCCESS;
237 }
238 
239 #if OMPI_MCA_OP_HAVE_AVX512
240  extern ompi_op_base_handler_fn_t ompi_op_avx_functions_avx512[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX];
241  extern ompi_op_base_3buff_handler_fn_t ompi_op_avx_3buff_functions_avx512[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX];
242 #endif
243 #if OMPI_MCA_OP_HAVE_AVX2
244  extern ompi_op_base_handler_fn_t ompi_op_avx_functions_avx2[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX];
245  extern ompi_op_base_3buff_handler_fn_t ompi_op_avx_3buff_functions_avx2[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX];
246 #endif
247 #if OMPI_MCA_OP_HAVE_AVX
248  extern ompi_op_base_handler_fn_t ompi_op_avx_functions_avx[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX];
249  extern ompi_op_base_3buff_handler_fn_t ompi_op_avx_3buff_functions_avx[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX];
250 #endif
/*
 * Query whether this component can be used for a specific op.
 *
 * For the supported reductions (MAX/MIN/SUM/PROD/BOR/BAND/BXOR) a
 * module is created whose per-datatype function tables are filled from
 * the most capable instruction set enabled in
 * mca_op_avx_component.flags; lower tiers (AVX2, then AVX) only fill
 * the slots the higher tiers left NULL.  Returns NULL for every other
 * op, which removes this component from selection for that op.
 */
static struct ompi_op_base_module_1_0_0_t*
avx_component_op_query(struct ompi_op_t *op, int *priority)
{
    ompi_op_base_module_t *module = NULL;
    /* Sanity check -- although the framework should never invoke the
       _component_op_query() on non-intrinsic MPI_Op's, we'll put a
       check here just to be sure. */
    if (0 == (OMPI_OP_FLAGS_INTRINSIC & op->o_flags)) {
        return NULL;
    }

    switch (op->o_f_to_c_index) {
    case OMPI_OP_BASE_FORTRAN_MAX:
    case OMPI_OP_BASE_FORTRAN_MIN:
    case OMPI_OP_BASE_FORTRAN_SUM:
    case OMPI_OP_BASE_FORTRAN_PROD:
    case OMPI_OP_BASE_FORTRAN_BOR:
    case OMPI_OP_BASE_FORTRAN_BAND:
    case OMPI_OP_BASE_FORTRAN_BXOR:
        module = OBJ_NEW(ompi_op_base_module_t);
        for (int i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) {
#if OMPI_MCA_OP_HAVE_AVX512
            /* Highest tier first: AVX-512 handlers win when enabled. */
            if( mca_op_avx_component.flags & OMPI_OP_AVX_HAS_AVX512F_FLAG ) {
                module->opm_fns[i] = ompi_op_avx_functions_avx512[op->o_f_to_c_index][i];
                module->opm_3buff_fns[i] = ompi_op_avx_3buff_functions_avx512[op->o_f_to_c_index][i];
            }
#endif
#if OMPI_MCA_OP_HAVE_AVX2
            /* AVX2 fills only the slots AVX-512 left empty. */
            if( mca_op_avx_component.flags & OMPI_OP_AVX_HAS_AVX2_FLAG ) {
                if( NULL == module->opm_fns[i] ) {
                    module->opm_fns[i] = ompi_op_avx_functions_avx2[op->o_f_to_c_index][i];
                }
                if( NULL == module->opm_3buff_fns[i] ) {
                    module->opm_3buff_fns[i] = ompi_op_avx_3buff_functions_avx2[op->o_f_to_c_index][i];
                }
            }
#endif
#if OMPI_MCA_OP_HAVE_AVX
            /* Plain AVX is the lowest tier considered. */
            if( mca_op_avx_component.flags & OMPI_OP_AVX_HAS_AVX_FLAG ) {
                if( NULL == module->opm_fns[i] ) {
                    module->opm_fns[i] = ompi_op_avx_functions_avx[op->o_f_to_c_index][i];
                }
                if( NULL == module->opm_3buff_fns[i] ) {
                    module->opm_3buff_fns[i] = ompi_op_avx_3buff_functions_avx[op->o_f_to_c_index][i];
                }
            }
#endif
            /* NOTE(review): one OBJ_RETAIN per installed handler --
               presumably balanced by a release when each handler slot
               is torn down; confirm against the op framework's module
               lifetime rules. */
            if( NULL != module->opm_fns[i] ) {
                OBJ_RETAIN(module);
            }
            if( NULL != module->opm_3buff_fns[i] ) {
                OBJ_RETAIN(module);
            }
        }
        break;
    case OMPI_OP_BASE_FORTRAN_LAND:
    case OMPI_OP_BASE_FORTRAN_LOR:
    case OMPI_OP_BASE_FORTRAN_LXOR:
    case OMPI_OP_BASE_FORTRAN_MAXLOC:
    case OMPI_OP_BASE_FORTRAN_MINLOC:
    case OMPI_OP_BASE_FORTRAN_REPLACE:
    default:
        /* Not vectorized by this component. */
        break;
    }
    /* If we got a module from above, we'll return it.  Otherwise,
       we'll return NULL, indicating that this component does not want
       to be considered for selection for this MPI_Op.  Note that the
       functions each returned a *avx* component pointer
       (vs. a *base* component pointer -- where an *avx* component
       is a base component plus some other module-specific cached
       information), so we have to cast it to the right pointer type
       before returning. */
    if (NULL != module) {
        *priority = 50;
    }
    return (ompi_op_base_module_1_0_0_t *) module;
}
331