1 //    This file is part of ELPA.
2 //
3 //    The ELPA library was originally created by the ELPA consortium,
4 //    consisting of the following organizations:
5 //
6 //    - Max Planck Computing and Data Facility (MPCDF), formerly known as
7 //      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
8 //    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
9 //      Informatik,
10 //    - Technische Universität München, Lehrstuhl für Informatik mit
11 //      Schwerpunkt Wissenschaftliches Rechnen,
12 //    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
13 //    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
14 //      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
15 //      and
16 //    - IBM Deutschland GmbH
17 //
18 //    This particular source code file contains additions, changes and
19 //    enhancements authored by Intel Corporation which is not part of
20 //    the ELPA consortium.
21 //
22 //    More information can be found here:
23 //    http://elpa.mpcdf.mpg.de/
24 //
25 //    ELPA is free software: you can redistribute it and/or modify
26 //    it under the terms of the version 3 of the license of the
27 //    GNU Lesser General Public License as published by the Free
28 //    Software Foundation.
29 //
30 //    ELPA is distributed in the hope that it will be useful,
31 //    but WITHOUT ANY WARRANTY; without even the implied warranty of
32 //    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
33 //    GNU Lesser General Public License for more details.
34 //
35 //    You should have received a copy of the GNU Lesser General Public License
36 //    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
37 //
38 //    ELPA reflects a substantial effort on the part of the original
39 //    ELPA consortium, and we ask you to respect the spirit of the
40 //    license that we chose: i.e., please contribute any changes you
41 //    may have back to the original ELPA library distribution, and keep
42 //    any derivatives of ELPA under the same license that we chose for
43 //    the original distribution, the GNU Lesser General Public License.
44 //
45 //    Authors: L. Huedepohl and A. Marek, MPCDF
46 #include <assert.h>
47 #include <stdio.h>
48 #include <stdlib.h>
49 #include <elpa/elpa.h>
50 #include "elpa_index.h"
51 
52 #include "config.h"
53 
54 #ifdef WITH_OPENMP
55 #include <omp.h>
56 #endif
57 
/* File-scope state with external linkage.
 * NOTE(review): nothing in this part of the file reads or writes these two
 * variables; judging by the names they cache an OpenMP thread limit and a
 * "has it been recorded" flag -- confirm against the omp_threads_* helpers. */
58 int max_threads_glob;
59 int set_max_threads_glob=0;
60 
61 static int enumerate_identity(elpa_index_t index, int i);
62 static int cardinality_bool(elpa_index_t index);
63 static int valid_bool(elpa_index_t index, int n, int new_value);
64 
65 static int number_of_solvers(elpa_index_t index);
66 static int solver_enumerate(elpa_index_t index, int i);
67 static int solver_is_valid(elpa_index_t index, int n, int new_value);
68 static const char* elpa_solver_name(int solver);
69 
70 static int number_of_real_kernels(elpa_index_t index);
71 static int real_kernel_enumerate(elpa_index_t index, int i);
72 static int real_kernel_is_valid(elpa_index_t index, int n, int new_value);
73 static const char *real_kernel_name(int kernel);
74 
75 static int number_of_complex_kernels(elpa_index_t index);
76 static int complex_kernel_enumerate(elpa_index_t index, int i);
77 static int complex_kernel_is_valid(elpa_index_t index, int n, int new_value);
78 static const char *complex_kernel_name(int kernel);
79 
80 static int band_to_full_cardinality(elpa_index_t index);
81 static int band_to_full_enumerate(elpa_index_t index, int i);
82 static int band_to_full_is_valid(elpa_index_t index, int n, int new_value);
83 
84 static int stripewidth_real_cardinality(elpa_index_t index);
85 static int stripewidth_real_enumerate(elpa_index_t index, int i);
86 static int stripewidth_real_is_valid(elpa_index_t index, int n, int new_value);
87 
88 static int stripewidth_complex_cardinality(elpa_index_t index);
89 static int stripewidth_complex_enumerate(elpa_index_t index, int i);
90 static int stripewidth_complex_is_valid(elpa_index_t index, int n, int new_value);
91 
92 static int omp_threads_cardinality(elpa_index_t index);
93 static int omp_threads_enumerate(elpa_index_t index, int i);
94 static int omp_threads_is_valid(elpa_index_t index, int n, int new_value);
95 
96 static int max_stored_rows_cardinality(elpa_index_t index);
97 static int max_stored_rows_enumerate(elpa_index_t index, int i);
98 static int max_stored_rows_is_valid(elpa_index_t index, int n, int new_value);
99 
100 static int min_tile_size_cardinality(elpa_index_t index);
101 static int min_tile_size_enumerate(elpa_index_t index, int i);
102 static int min_tile_size_is_valid(elpa_index_t index, int n, int new_value);
103 
104 static int valid_with_gpu(elpa_index_t index, int n, int new_value);
105 static int valid_with_gpu_elpa1(elpa_index_t index, int n, int new_value);
106 static int valid_with_gpu_elpa2(elpa_index_t index, int n, int new_value);
107 
108 static int intermediate_bandwidth_cardinality(elpa_index_t index);
109 static int intermediate_bandwidth_enumerate(elpa_index_t index, int i);
110 static int intermediate_bandwidth_is_valid(elpa_index_t index, int n, int new_value);
111 
112 static int cannon_buffer_size_cardinality(elpa_index_t index);
113 static int cannon_buffer_size_enumerate(elpa_index_t index, int i);
114 static int cannon_buffer_size_is_valid(elpa_index_t index, int n, int new_value);
115 
116 static int na_is_valid(elpa_index_t index, int n, int new_value);
117 static int nev_is_valid(elpa_index_t index, int n, int new_value);
118 static int bw_is_valid(elpa_index_t index, int n, int new_value);
119 static int gpu_is_valid(elpa_index_t index, int n, int new_value);
120 
121 static int is_positive(elpa_index_t index, int n, int new_value);
122 
123 static int elpa_double_string_to_value(char *name, char *string, double *value);
124 static int elpa_double_value_to_string(char *name, double value, const char **string);
125 
/* Designated-initializer fragment for the common `.base` part of an option
 * entry. The two environment variable names are produced by string-literal
 * concatenation ("ELPA_DEFAULT_" / "ELPA_FORCE_" followed by the option
 * name), so option_name has to be a string literal. */
126 #define BASE_ENTRY(option_name, option_description, once_value, readonly_value, print_flag_value) \
127                 .base = { \
128                         .name = option_name, \
129                         .description = option_description, \
130                         .once = once_value, \
131                         .readonly = readonly_value, \
132                         .env_default = "ELPA_DEFAULT_" option_name, \
133                         .env_force = "ELPA_FORCE_" option_name, \
134                         .print_flag = print_flag_value, \
135                 }
136 
/* Entry for an int parameter flagged `once` (once = 1, readonly = 0). Only
 * the validator is set; valid_func may be NULL, in which case the setter
 * performs no validation. */
137 #define INT_PARAMETER_ENTRY(option_name, option_description, valid_func, print_flag) \
138         { \
139                 BASE_ENTRY(option_name, option_description, 1, 0, print_flag), \
140                 .valid = valid_func, \
141         }
142 
/* Entry for a 0/1 option: repeatedly settable (once = 0), with the generic
 * bool helpers for cardinality, enumeration and validation. No to_string
 * callback is set. */
143 #define BOOL_ENTRY(option_name, option_description, default, tune_level, tune_domain, print_flag) \
144         { \
145                 BASE_ENTRY(option_name, option_description, 0, 0, print_flag), \
146                 .default_value = default, \
147                 .autotune_level = tune_level, \
148                 .autotune_domain = tune_domain, \
149                 .cardinality = cardinality_bool, \
150                 .enumerate = enumerate_identity, \
151                 .valid = valid_bool, \
152         }
153 
/* Fully specified int option: default value, autotune level/domain and the
 * four callbacks (cardinality, enumerate, valid, to_string). to_string_func
 * may be NULL for options without a symbolic name per value. */
154 #define INT_ENTRY(option_name, option_description, default, tune_level, tune_domain, card_func, enumerate_func, valid_func, to_string_func, print_flag) \
155         { \
156                 BASE_ENTRY(option_name, option_description, 0, 0, print_flag), \
157                 .default_value = default, \
158                 .autotune_level = tune_level, \
159                 .autotune_domain = tune_domain, \
160                 .cardinality = card_func, \
161                 .enumerate = enumerate_func, \
162                 .valid = valid_func, \
163                 .to_string = to_string_func, \
164         }
165 
/* Int option that only has a `.base` part: no default, no callbacks, hence
 * no validation of the stored value. */
166 #define INT_ANY_ENTRY(option_name, option_description, print_flag) \
167         { \
168                 BASE_ENTRY(option_name, option_description, 0, 0, print_flag), \
169         }
170 
/* Table of all int-valued options. The position of an entry in this array is
 * its option number: find_int_entry() returns it, and the generated accessors
 * use it to index index->int_options.values / is_set / notified. */
171 /* The order here is important! Tunable options that are dependent on other
172  * tunable options must appear later in the list than their prerequisites */
173 static const elpa_index_int_entry_t int_entries[] = {
174         INT_PARAMETER_ENTRY("na", "Global matrix has size (na * na)", na_is_valid, PRINT_STRUCTURE),
175         INT_PARAMETER_ENTRY("nev", "Number of eigenvectors to be computed, 0 <= nev <= na", nev_is_valid, PRINT_STRUCTURE),
176         INT_PARAMETER_ENTRY("nblk", "Block size of scalapack block-cyclic distribution", is_positive, PRINT_STRUCTURE),
177         INT_PARAMETER_ENTRY("local_nrows", "Number of matrix rows stored on this process", NULL, PRINT_NO),
178         INT_PARAMETER_ENTRY("local_ncols", "Number of matrix columns stored on this process", NULL, PRINT_NO),
179         INT_PARAMETER_ENTRY("process_row", "Process row number in the 2D domain decomposition", NULL, PRINT_NO),
180         INT_PARAMETER_ENTRY("process_col", "Process column number in the 2D domain decomposition", NULL, PRINT_NO),
181         INT_PARAMETER_ENTRY("process_id", "Process rank", NULL, PRINT_NO),
182         INT_PARAMETER_ENTRY("num_process_rows", "Number of process row number in the 2D domain decomposition", NULL, PRINT_STRUCTURE),
183         INT_PARAMETER_ENTRY("num_process_cols", "Number of process column number in the 2D domain decomposition", NULL, PRINT_STRUCTURE),
184         INT_PARAMETER_ENTRY("num_processes", "Total number of processes", NULL, PRINT_STRUCTURE),
185         INT_PARAMETER_ENTRY("bandwidth", "If specified, a band matrix with this bandwidth is expected as input; bandwidth must be multiply of nblk", bw_is_valid, PRINT_YES),
186         INT_ANY_ENTRY("mpi_comm_rows", "Communicator for inter-row communication", PRINT_NO),
187         INT_ANY_ENTRY("mpi_comm_cols", "Communicator for inter-column communication", PRINT_NO),
188         INT_ANY_ENTRY("mpi_comm_parent", "Parent communicator", PRINT_NO),
189         INT_ANY_ENTRY("blacs_context", "BLACS context", PRINT_NO),
190         INT_ENTRY("solver", "Solver to use", ELPA_SOLVER_1STAGE, ELPA_AUTOTUNE_FAST, ELPA_AUTOTUNE_DOMAIN_ANY, \
191                         number_of_solvers, solver_enumerate, solver_is_valid, elpa_solver_name, PRINT_YES),
192         INT_ENTRY("gpu", "Use GPU acceleration", 0, ELPA_AUTOTUNE_MEDIUM, ELPA_AUTOTUNE_DOMAIN_ANY,
193                         cardinality_bool, enumerate_identity, gpu_is_valid, NULL, PRINT_YES),
194         //default of gpu usage for individual phases is 1. However, it is only evaluated, if GPU is used at all, which first has to be determined
195         //by the parameter gpu and presence of the device
196         INT_ENTRY("gpu_tridiag", "Use GPU acceleration for ELPA1 tridiagonalization", 1, ELPA_AUTOTUNE_MEDIUM, ELPA_AUTOTUNE_DOMAIN_ANY,
197                         cardinality_bool, enumerate_identity, valid_with_gpu_elpa1, NULL, PRINT_YES),
198         INT_ENTRY("gpu_solve_tridi", "Use GPU acceleration for ELPA solve tridi", 1, ELPA_AUTOTUNE_MEDIUM, ELPA_AUTOTUNE_DOMAIN_ANY,
199                         cardinality_bool, enumerate_identity, valid_with_gpu, NULL, PRINT_YES),
200         INT_ENTRY("gpu_trans_ev", "Use GPU acceleration for ELPA1 trans ev", 1, ELPA_AUTOTUNE_MEDIUM, ELPA_AUTOTUNE_DOMAIN_ANY,
201                         cardinality_bool, enumerate_identity, valid_with_gpu_elpa1, NULL, PRINT_YES),
202         INT_ENTRY("gpu_bandred", "Use GPU acceleration for ELPA2 band reduction", 1, ELPA_AUTOTUNE_MEDIUM, ELPA_AUTOTUNE_DOMAIN_ANY,
203                         cardinality_bool, enumerate_identity, valid_with_gpu_elpa2, NULL, PRINT_YES),
204         INT_ENTRY("gpu_tridiag_band", "Use GPU acceleration for ELPA2 tridiagonalization", 1, ELPA_AUTOTUNE_MEDIUM, ELPA_AUTOTUNE_DOMAIN_ANY,
205                         cardinality_bool, enumerate_identity, valid_with_gpu_elpa2, NULL, PRINT_YES),
206         INT_ENTRY("gpu_trans_ev_tridi_to_band", "Use GPU acceleration for ELPA2 trans_ev_tridi_to_band", 1, ELPA_AUTOTUNE_MEDIUM, ELPA_AUTOTUNE_DOMAIN_ANY,
207                         cardinality_bool, enumerate_identity, valid_with_gpu_elpa2, NULL, PRINT_YES),
208         INT_ENTRY("gpu_trans_ev_band_to_full", "Use GPU acceleration for ELPA2 trans_ev_band_to_full", 1, ELPA_AUTOTUNE_MEDIUM, ELPA_AUTOTUNE_DOMAIN_ANY,
209                         cardinality_bool, enumerate_identity, valid_with_gpu_elpa2, NULL, PRINT_YES),
210         INT_ENTRY("real_kernel", "Real kernel to use if 'solver' is set to ELPA_SOLVER_2STAGE", ELPA_2STAGE_REAL_DEFAULT, ELPA_AUTOTUNE_FAST, ELPA_AUTOTUNE_DOMAIN_REAL, \
211                         number_of_real_kernels, real_kernel_enumerate, real_kernel_is_valid, real_kernel_name, PRINT_YES),
212         INT_ENTRY("complex_kernel", "Complex kernel to use if 'solver' is set to ELPA_SOLVER_2STAGE", ELPA_2STAGE_COMPLEX_DEFAULT, ELPA_AUTOTUNE_FAST, ELPA_AUTOTUNE_DOMAIN_COMPLEX, \
213                         number_of_complex_kernels, complex_kernel_enumerate, complex_kernel_is_valid, complex_kernel_name, PRINT_YES),
214 
215         INT_ENTRY("min_tile_size", "Minimal tile size used internally in elpa1_tridiag and elpa2_bandred", 0, ELPA_AUTOTUNE_NOT_TUNABLE, ELPA_AUTOTUNE_DOMAIN_ANY,
216                         min_tile_size_cardinality, min_tile_size_enumerate, min_tile_size_is_valid, NULL, PRINT_YES),
217         INT_ENTRY("intermediate_bandwidth", "Specifies the intermediate bandwidth in ELPA2 full->banded step. Must be a multiple of nblk", 0, ELPA_AUTOTUNE_NOT_TUNABLE, ELPA_AUTOTUNE_DOMAIN_ANY,
218                         intermediate_bandwidth_cardinality, intermediate_bandwidth_enumerate, intermediate_bandwidth_is_valid, NULL, PRINT_YES),
219 
220         INT_ENTRY("blocking_in_band_to_full", "Loop blocking, default 3", 3, ELPA_AUTOTUNE_EXTENSIVE, ELPA_AUTOTUNE_DOMAIN_ANY,
221                         band_to_full_cardinality, band_to_full_enumerate, band_to_full_is_valid, NULL, PRINT_YES),
222         INT_ENTRY("stripewidth_real", "Stripewidth_real, default 48. Must be a multiple of 4", 48, ELPA_AUTOTUNE_EXTENSIVE, ELPA_AUTOTUNE_DOMAIN_REAL,
223                         stripewidth_real_cardinality, stripewidth_real_enumerate, stripewidth_real_is_valid, NULL, PRINT_YES),
224         INT_ENTRY("stripewidth_complex", "Stripewidth_complex, default 96. Must be a multiple of 8", 96, ELPA_AUTOTUNE_EXTENSIVE, ELPA_AUTOTUNE_DOMAIN_COMPLEX,
225                         stripewidth_complex_cardinality, stripewidth_complex_enumerate, stripewidth_complex_is_valid, NULL, PRINT_YES),
226 
227         INT_ENTRY("max_stored_rows", "Maximum number of stored rows used in ELPA 1 backtransformation, default 63", 63, ELPA_AUTOTUNE_EXTENSIVE, ELPA_AUTOTUNE_DOMAIN_ANY,
228                         max_stored_rows_cardinality, max_stored_rows_enumerate, max_stored_rows_is_valid, NULL, PRINT_YES),
229 #ifdef WITH_OPENMP
230         INT_ENTRY("omp_threads", "OpenMP threads used in ELPA, default 1", 1, ELPA_AUTOTUNE_FAST, ELPA_AUTOTUNE_DOMAIN_ANY,
231                         omp_threads_cardinality, omp_threads_enumerate, omp_threads_is_valid, NULL, PRINT_YES),
232 #else
233         INT_ENTRY("omp_threads", "OpenMP threads used in ELPA, default 1", 1, ELPA_AUTOTUNE_NOT_TUNABLE, ELPA_AUTOTUNE_DOMAIN_ANY,
234                         omp_threads_cardinality, omp_threads_enumerate, omp_threads_is_valid, NULL, PRINT_YES),
235 #endif
236         INT_ENTRY("cannon_buffer_size", "Increasing the buffer size might make it faster, but costs memory", 0, ELPA_AUTOTUNE_NOT_TUNABLE, ELPA_AUTOTUNE_DOMAIN_ANY,
237                         cannon_buffer_size_cardinality, cannon_buffer_size_enumerate, cannon_buffer_size_is_valid, NULL, PRINT_YES),
238         //BOOL_ENTRY("qr", "Use QR decomposition, only used for ELPA_SOLVER_2STAGE, real case", 0, ELPA_AUTOTUNE_MEDIUM, ELPA_AUTOTUNE_DOMAIN_REAL),
239         BOOL_ENTRY("qr", "Use QR decomposition, only used for ELPA_SOLVER_2STAGE, real case", 0, ELPA_AUTOTUNE_NOT_TUNABLE, ELPA_AUTOTUNE_DOMAIN_REAL, PRINT_YES),
240         BOOL_ENTRY("timings", "Enable time measurement", 0, ELPA_AUTOTUNE_NOT_TUNABLE, 0, PRINT_YES),
241         BOOL_ENTRY("debug", "Emit verbose debugging messages", 0, ELPA_AUTOTUNE_NOT_TUNABLE, 0, PRINT_YES),
242         BOOL_ENTRY("print_flops", "Print FLOP rates on task 0", 0, ELPA_AUTOTUNE_NOT_TUNABLE, 0, PRINT_YES),
243         BOOL_ENTRY("measure_performance", "Also measure with flops (via papi) with the timings", 0, ELPA_AUTOTUNE_NOT_TUNABLE, 0, PRINT_YES),
244         BOOL_ENTRY("check_pd", "Check eigenvalues to be positive", 0, ELPA_AUTOTUNE_NOT_TUNABLE, 0, PRINT_YES),
245         BOOL_ENTRY("cannon_for_generalized", "Whether to use Cannons algorithm for the generalized EVP", 1, ELPA_AUTOTUNE_NOT_TUNABLE, 0, PRINT_YES),
246 };
247 
/* Entry initializer for a read-only double option: once = 0, readonly = 1,
 * print flag = 0. Neither option table visible here uses it. */
248 #define READONLY_DOUBLE_ENTRY(option_name, option_description) \
249         { \
250                 BASE_ENTRY(option_name, option_description, 0, 1, 0) \
251         }
252 
/* Table of double-valued options. It has no elements, which is why the
 * accessors generated per type test sizeof(<TYPE>_entries) == 0 before
 * searching it.
 * NOTE(review): an empty initializer for an unsized array is not ISO C11;
 * presumably a compiler extension is relied upon -- confirm. */
253 static const elpa_index_double_entry_t double_entries[] = {
254         /* Empty for now */
255 };
256 
/**
 * Release an index object together with the per-type option arrays it owns.
 *
 * For every option type listed by FOR_ALL_TYPES the `values`, `is_set` and
 * `notified` arrays are freed, then the index structure itself.
 * Passing NULL is a no-op, mirroring free().
 *
 * @param index  index to destroy, or NULL
 */
void elpa_index_free(elpa_index_t index) {
        /* Without this guard the member accesses below would dereference NULL. */
        if (index == NULL) {
                return;
        }

#define FREE_OPTION(TYPE, ...) \
        free(index->TYPE##_options.values); \
        free(index->TYPE##_options.is_set); \
        free(index->TYPE##_options.notified);

        FOR_ALL_TYPES(FREE_OPTION);

        free(index);
}
267 
compar(const void * a,const void * b)268 static int compar(const void *a, const void *b) {
269         return strcmp(((elpa_index_int_entry_t *) a)->base.name,
270                       ((elpa_index_int_entry_t *) b)->base.name);
271 }
272 
273 #define IMPLEMENT_FIND_ENTRY(TYPE, ...) \
274         static int find_##TYPE##_entry(char *name) { \
275                 elpa_index_##TYPE##_entry_t *entry; \
276                 elpa_index_##TYPE##_entry_t key = { .base = {.name = name} } ; \
277                 size_t nmembers = nelements(TYPE##_entries); \
278                 entry = lfind((const void*) &key, (const void *) TYPE##_entries, &nmembers, sizeof(elpa_index_##TYPE##_entry_t), compar); \
279                 if (entry) { \
280                         return (entry - &TYPE##_entries[0]); \
281                 } else { \
282                         return -1; \
283                 } \
284         }
285 FOR_ALL_TYPES(IMPLEMENT_FIND_ENTRY)
286 
287 
288 #define IMPLEMENT_GETENV(TYPE, PRINTF_SPEC, ...) \
289         static int getenv_##TYPE(elpa_index_t index, const char *env_variable, enum NOTIFY_FLAGS notify_flag, int n, TYPE *value, const char *error_string) { \
290                 int err; \
291                 char *env_value = getenv(env_variable); \
292                 if (env_value) { \
293                         err = elpa_##TYPE##_string_to_value(TYPE##_entries[n].base.name, env_value, value); \
294                         if (err != ELPA_OK) { \
295                                 fprintf(stderr, "ELPA: Error interpreting environment variable %s with value '%s': %s\n", \
296                                                 TYPE##_entries[n].base.name, env_value, elpa_strerr(err)); \
297                         } else {\
298                                 const char *value_string = NULL; \
299                                 if (elpa_##TYPE##_value_to_string(TYPE##_entries[n].base.name, *value, &value_string) == ELPA_OK) { \
300                                         if (!(index->TYPE##_options.notified[n] & notify_flag)) { \
301                                                 if (elpa_index_is_printing_mpi_rank(index)) { \
302                                                         fprintf(stderr, "ELPA: %s '%s' is set to %s due to environment variable %s\n", \
303                                                                       error_string, TYPE##_entries[n].base.name, value_string, env_variable); \
304                                                 } \
305                                                 index->TYPE##_options.notified[n] |= notify_flag; \
306                                         } \
307                                 } else { \
308                                         if (elpa_index_is_printing_mpi_rank(index)) { \
309                                                 fprintf(stderr, "ELPA: %s '%s' is set to '" PRINTF_SPEC "' due to environment variable %s\n", \
310                                                         error_string, TYPE##_entries[n].base.name, *value, env_variable);\
311                                         } \
312                                 } \
313                                 return 1; \
314                         } \
315                 } \
316                 return 0; \
317         }
FOR_ALL_TYPES(IMPLEMENT_GETENV)318 FOR_ALL_TYPES(IMPLEMENT_GETENV)
319 
320 
321 #define IMPLEMENT_GET_FUNCTION(TYPE, PRINTF_SPEC, SCANF_SPEC, ERROR_VALUE) \
322         TYPE elpa_index_get_##TYPE##_value(elpa_index_t index, char *name, int *error) { \
323                 TYPE ret; \
324                 if (sizeof(TYPE##_entries) == 0) { \
325                         return ELPA_ERROR_ENTRY_NOT_FOUND; \
326                 } \
327                 int n = find_##TYPE##_entry(name); \
328                 if (n >= 0) { \
329                         int from_env = 0; \
330                         if (!TYPE##_entries[n].base.once && !TYPE##_entries[n].base.readonly) { \
331                                 from_env = getenv_##TYPE(index, TYPE##_entries[n].base.env_force, NOTIFY_ENV_FORCE, n, &ret, "Option"); \
332                         } \
333                         if (!from_env) { \
334                                 ret = index->TYPE##_options.values[n]; \
335                         } \
336                         if (error != NULL) { \
337                                 *error = ELPA_OK; \
338                         } \
339                         return ret; \
340                 } else { \
341                         if (error != NULL) { \
342                                 *error = ELPA_ERROR_ENTRY_NOT_FOUND; \
343                         } \
344                         return ERROR_VALUE; \
345                 } \
346         }
347 FOR_ALL_TYPES(IMPLEMENT_GET_FUNCTION)
348 
349 
350 #define IMPLEMENT_LOC_FUNCTION(TYPE, ...) \
351         TYPE* elpa_index_get_##TYPE##_loc(elpa_index_t index, char *name) { \
352                 if (sizeof(TYPE##_entries) == 0) { \
353                         return NULL; \
354                 } \
355                 int n = find_##TYPE##_entry(name); \
356                 if (n >= 0) { \
357                         return &index->TYPE##_options.values[n]; \
358                 } else { \
359                         return NULL; \
360                 } \
361         }
362 FOR_ALL_TYPES(IMPLEMENT_LOC_FUNCTION)
363 
364 
365 #define IMPLEMENT_SET_FUNCTION(TYPE, PRINTF_SPEC, ...) \
366         int elpa_index_set_##TYPE##_value(elpa_index_t index, char *name, TYPE value) { \
367                 if (sizeof(TYPE##_entries) == 0) { \
368                         return ELPA_ERROR_ENTRY_NOT_FOUND; \
369                 } \
370                 int n = find_##TYPE##_entry(name); \
371                 if (n < 0) { \
372                         return ELPA_ERROR_ENTRY_NOT_FOUND; \
373                 }; \
374                 if (TYPE##_entries[n].valid != NULL) { \
375                         if(!TYPE##_entries[n].valid(index, n, value)) { \
376                                 return ELPA_ERROR_ENTRY_INVALID_VALUE; \
377                         }; \
378                 } \
379                 if (TYPE##_entries[n].base.once & index->TYPE##_options.is_set[n]) { \
380                         return ELPA_ERROR_ENTRY_ALREADY_SET; \
381                 } \
382                 if (TYPE##_entries[n].base.readonly) { \
383                         return ELPA_ERROR_ENTRY_READONLY; \
384                 } \
385                 index->TYPE##_options.values[n] = value; \
386                 index->TYPE##_options.is_set[n] = 1; \
387                 return ELPA_OK; \
388         }
389 FOR_ALL_TYPES(IMPLEMENT_SET_FUNCTION)
390 
391 #define IMPLEMENT_SET_FROM_LOAD_FUNCTION(TYPE, PRINTF_SPEC, ...) \
392         int elpa_index_set_from_load_##TYPE##_value(elpa_index_t index, char *name, TYPE value, int explicit) { \
393                 if (sizeof(TYPE##_entries) == 0) { \
394                         return ELPA_ERROR_ENTRY_NOT_FOUND; \
395                 } \
396                 int n = find_##TYPE##_entry(name); \
397                 if (n < 0) { \
398                         return ELPA_ERROR_ENTRY_NOT_FOUND; \
399                 }; \
400                 index->TYPE##_options.values[n] = value; \
401                 if(explicit) \
402                         index->TYPE##_options.is_set[n] = 1; \
403                 return ELPA_OK; \
404         }
405 FOR_ALL_TYPES(IMPLEMENT_SET_FROM_LOAD_FUNCTION)
406 
407 
408 #define IMPLEMENT_IS_SET_FUNCTION(TYPE, ...) \
409         int elpa_index_##TYPE##_value_is_set(elpa_index_t index, char *name) { \
410                 if (sizeof(TYPE##_entries) == 0) { \
411                         return ELPA_ERROR_ENTRY_NOT_FOUND; \
412                 } \
413                 int n = find_##TYPE##_entry(name); \
414                 if (n >= 0) { \
415                         if (index->TYPE##_options.is_set[n]) { \
416                                 return 1; \
417                         } else { \
418                                 return 0; \
419                         } \
420                 } else { \
421                         return ELPA_ERROR_ENTRY_NOT_FOUND; \
422                 } \
423         }
424 FOR_ALL_TYPES(IMPLEMENT_IS_SET_FUNCTION)
425 
426 
427 int elpa_index_value_is_set(elpa_index_t index, char *name) {
428         int res = ELPA_ERROR;
429 
430 #define RET_IF_SET(TYPE, ...) \
431         res = elpa_index_##TYPE##_value_is_set(index, name); \
432         if (res >= 0) { \
433                 return res; \
434         }
435 
436         FOR_ALL_TYPES(RET_IF_SET)
437 
438         fprintf(stderr, "ELPA Error: Could not find entry '%s'\n", name);
439         return res;
440 }
441 
/**
 * Check whether `new_value` would be accepted for the int option `name`.
 *
 * @return ELPA_ERROR_ENTRY_NOT_FOUND if there is no such option,
 *         ELPA_OK if the option has no validator or the validator accepts
 *         the value, ELPA_ERROR otherwise.
 */
int elpa_index_int_is_valid(elpa_index_t index, char *name, int new_value) {
        int n = find_int_entry(name);

        if (n < 0) {
                return ELPA_ERROR_ENTRY_NOT_FOUND;
        }
        if (int_entries[n].valid == NULL) {
                return ELPA_OK;
        }
        if (int_entries[n].valid(index, n, new_value)) {
                return ELPA_OK;
        }
        return ELPA_ERROR;
}
453 
elpa_int_value_to_string(char * name,int value,const char ** string)454 int elpa_int_value_to_string(char *name, int value, const char **string) {
455         int n = find_int_entry(name);
456         if (n < 0) {
457                 return ELPA_ERROR_ENTRY_NOT_FOUND;
458         }
459         if (int_entries[n].to_string == NULL) {
460                 return ELPA_ERROR_ENTRY_NO_STRING_REPRESENTATION;
461         }
462         *string = int_entries[n].to_string(value);
463         return ELPA_OK;
464 }
465 
466 
/**
 * Length of the symbolic name of `value` for the int option `name`.
 *
 * @return strlen() of the name, or 0 when elpa_int_value_to_string() does
 *         not provide one.
 */
int elpa_int_value_to_strlen(char *name, int value) {
        const char *text = NULL;

        /* The status is deliberately ignored: on failure `text` stays NULL. */
        elpa_int_value_to_string(name, value, &text);
        if (text != NULL) {
                return strlen(text);
        }
        return 0;
}
476 
477 
/**
 * Length of the symbolic name of the value currently stored in `index` for
 * the int option `name`; 0 if the option does not exist or the value has no
 * symbolic name.
 */
int elpa_index_int_value_to_strlen(elpa_index_t index, char *name) {
        int n = find_int_entry(name);

        if (n >= 0) {
                int stored = index->int_options.values[n];
                return elpa_int_value_to_strlen(name, stored);
        }
        return 0;
}
485 
486 
elpa_int_string_to_value(char * name,char * string,int * value)487 int elpa_int_string_to_value(char *name, char *string, int *value) {
488         int n = find_int_entry(name);
489         if (n < 0) {
490                 return ELPA_ERROR_ENTRY_NOT_FOUND;
491         }
492 
493         if (int_entries[n].to_string == NULL) {
494                 int val, ret;
495                 ret = sscanf(string, "%d", &val);
496                 if (ret == 1) {
497                         *value = val;
498                         return ELPA_OK;
499                 } else {
500                         return ELPA_ERROR_ENTRY_INVALID_VALUE;
501                 }
502         }
503 
504         for (int i = 0; i < int_entries[n].cardinality(NULL); i++) {
505                 int candidate = int_entries[n].enumerate(NULL, i);
506                 if (strcmp(string, int_entries[n].to_string(candidate)) == 0) {
507                         *value = candidate;
508                         return ELPA_OK;
509                 }
510         }
511         return ELPA_ERROR_ENTRY_INVALID_VALUE;
512 }
513 
elpa_double_string_to_value(char * name,char * string,double * value)514 int elpa_double_string_to_value(char *name, char *string, double *value) {
515         double val;
516         int ret = sscanf(string, "%lf", &val);
517         if (ret == 1) {
518                 *value = val;
519                 return ELPA_OK;
520         } else {
521                 /* \todo: remove */
522                 fprintf(stderr, "ELPA: DEBUG: Could not parse double value '%s' for option '%s'\n", string, name);
523                 return ELPA_ERROR_ENTRY_INVALID_VALUE;
524         }
525 }
526 
elpa_double_value_to_string(char * name,double value,const char ** string)527 int elpa_double_value_to_string(char *name, double value, const char **string) {
528         return ELPA_ERROR_ENTRY_NO_STRING_REPRESENTATION;
529 }
530 
elpa_option_cardinality(char * name)531 int elpa_option_cardinality(char *name) {
532         int n = find_int_entry(name);
533         if (n < 0 || !int_entries[n].cardinality) {
534                 return ELPA_ERROR_ENTRY_NOT_FOUND;
535         }
536         return int_entries[n].cardinality(NULL);
537 }
538 
elpa_option_enumerate(char * name,int i)539 int elpa_option_enumerate(char *name, int i) {
540         int n = find_int_entry(name);
541         if (n < 0 || !int_entries[n].enumerate) {
542                 return 0;
543         }
544         return int_entries[n].enumerate(NULL, i);
545 }
546 
547 
548 /* Helper functions for simple int entries */
/* Cardinality callback for 0/1 options: always two values; `index` is not
 * consulted. */
static int cardinality_bool(elpa_index_t index) {
        (void) index;
        return 2;
}
552 
/* Validator for 0/1 options: accepts exactly 0 and 1. `index` and `n` are
 * not consulted. */
static int valid_bool(elpa_index_t index, int n, int new_value) {
        (void) index;
        (void) n;
        return new_value == 0 || new_value == 1;
}
556 
/* Enumerate callback for options whose i-th value is simply i. */
static int enumerate_identity(elpa_index_t index, int i) {
        (void) index;
        return i;
}
560 
561 /* Helper functions for specific options */
562 
/* The three macros below are passed to the ELPA_FOR_ALL_* iterator macros to
 * generate one `case` label per enumerated (name, value) pair inside a
 * switch statement. */

/* case <value>: return the identifier `name` as a string literal. */
563 #define NAME_CASE(name, value, ...) \
564         case value: \
565                 return #name;
566 
/* case <value>: the value is unconditionally valid. */
567 #define VALID_CASE(name, value) \
568         case value: \
569                 return 1;
570 
/* case <value>: valid only if `available` is non-zero and the function-like
 * macro or function `other_checks`, applied to the value, is non-zero too. */
571 #define VALID_CASE_3(name, value, available, other_checks) \
572         case value: \
573                 return available && (other_checks(value));
574 
elpa_solver_name(int solver)575 static const char* elpa_solver_name(int solver) {
576         switch(solver) {
577                 ELPA_FOR_ALL_SOLVERS(NAME_CASE)
578                 default:
579                         return "(Invalid solver)";
580         }
581 }
582 
/* Cardinality callback of the "solver" option; `index` is not consulted. */
static int number_of_solvers(elpa_index_t index) {
        (void) index;
        return ELPA_NUMBER_OF_SOLVERS;
}
586 
/* Enumerate callback of the "solver" option: returns the solver constant
 * whose rank among all solver values is `i`, or 0 if `i` matches no rank.
 *
 * The switch is generated by the preprocessor. For every solver (name, value)
 * ENUMERATE_CASE emits
 *     { const int array_of_size_value[value]; case 0 +r1 +r2 ...: return value; }
 * where the +r terms come from running the solver iterator a second time with
 * OPTION_RANK. Each term is 1 when that other solver's value is smaller than
 * the array length (i.e. smaller than the current value) and 0 otherwise, so
 * the case label is the number of solvers with a strictly smaller value.
 *
 * The helper macros defined inside this function stay defined afterwards and
 * are reused by the other *_enumerate functions. */
solver_enumerate(elpa_index_t index,int i)587 static int solver_enumerate(elpa_index_t index, int i) {
/* Contributes +1 if `value` is smaller than the element count of the
 * enclosing array_of_size_value, +0 otherwise. */
588 #define OPTION_RANK(name, value, ...) \
589         +(value >= sizeof(array_of_size_value)/sizeof(int) ? 0 : 1)
590 
/* DEFER1 postpones the expansion of its argument by one preprocessor scan and
 * EVAL supplies that additional scan; together they let the iterator macro
 * be expanded inside its own expansion. */
591 #define EMPTY()
592 #define DEFER1(m) m EMPTY()
593 #define EVAL(...) __VA_ARGS__
594 
/* One switch case per option; INNER_ITERATOR must name the iterator macro
 * to use for the rank computation at the point of expansion. */
595 #define ENUMERATE_CASE(name, value, ...) \
596         { const int array_of_size_value[value]; \
597         case 0 DEFER1(INNER_ITERATOR)()(OPTION_RANK): \
598                 return value; }
599 
600         switch(i) {
601 #define INNER_ITERATOR() ELPA_FOR_ALL_SOLVERS
602                 EVAL(ELPA_FOR_ALL_SOLVERS(ENUMERATE_CASE))
603 #undef INNER_ITERATOR
604                 default:
605                         return 0;
606         }
607 }
608 
609 
/* Validity callback of the solver option: 1 if new_value is one of the known solvers, else 0. */
static int solver_is_valid(elpa_index_t index, int n, int new_value) {
        switch (new_value) {
                ELPA_FOR_ALL_SOLVERS(VALID_CASE)
                default:
                        break;
        }
        return 0;
}
617 
/* Cardinality callback of the real 2-stage kernel option. `index` is not used. */
static int number_of_real_kernels(elpa_index_t index) {
        (void) index;
        return ELPA_2STAGE_NUMBER_OF_REAL_KERNELS;
}
621 
/*
 * Enumerate callback of the real 2-stage kernel option: one case per kernel
 * is generated by ENUMERATE_CASE (with INNER_ITERATOR bound to the real
 * kernel list); returns 0 when i matches no generated case.
 */
static int real_kernel_enumerate(elpa_index_t index,int i) {
#define INNER_ITERATOR() ELPA_FOR_ALL_2STAGE_REAL_KERNELS
        switch (i) {
                EVAL(ELPA_FOR_ALL_2STAGE_REAL_KERNELS(ENUMERATE_CASE))
                default:
                        break;
        }
#undef INNER_ITERATOR
        return 0;
}
631 
real_kernel_name(int kernel)632 static const char *real_kernel_name(int kernel) {
633         switch(kernel) {
634                 ELPA_FOR_ALL_2STAGE_REAL_KERNELS(NAME_CASE)
635                 default:
636                         return "(Invalid real kernel)";
637         }
638 }
639 
640 #define REAL_GPU_KERNEL_ONLY_WHEN_GPU_IS_ACTIVE(kernel_number) \
641         kernel_number == ELPA_2STAGE_REAL_GPU ? gpu_is_active : 1
642 
real_kernel_is_valid(elpa_index_t index,int n,int new_value)643 static int real_kernel_is_valid(elpa_index_t index, int n, int new_value) {
644         int solver = elpa_index_get_int_value(index, "solver", NULL);
645         if (solver == ELPA_SOLVER_1STAGE) {
646                 return new_value == ELPA_2STAGE_REAL_DEFAULT;
647         }
648         int gpu_is_active = elpa_index_get_int_value(index, "gpu", NULL);
649         switch(new_value) {
650                 ELPA_FOR_ALL_2STAGE_REAL_KERNELS(VALID_CASE_3, REAL_GPU_KERNEL_ONLY_WHEN_GPU_IS_ACTIVE)
651                 default:
652                         return 0;
653         }
654 }
655 
/* Cardinality callback of the complex 2-stage kernel option. `index` is not used. */
static int number_of_complex_kernels(elpa_index_t index) {
        (void) index;
        return ELPA_2STAGE_NUMBER_OF_COMPLEX_KERNELS;
}
659 
660 
/*
 * Enumerate callback of the complex 2-stage kernel option: one case per
 * kernel is generated by ENUMERATE_CASE (with INNER_ITERATOR bound to the
 * complex kernel list); returns 0 when i matches no generated case.
 */
static int complex_kernel_enumerate(elpa_index_t index,int i) {
#define INNER_ITERATOR() ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS
        switch (i) {
                EVAL(ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS(ENUMERATE_CASE))
                default:
                        break;
        }
#undef INNER_ITERATOR
        return 0;
}
670 
complex_kernel_name(int kernel)671 static const char *complex_kernel_name(int kernel) {
672         switch(kernel) {
673                 ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS(NAME_CASE)
674                 default:
675                         return "(Invalid complex kernel)";
676         }
677 }
678 
679 #define COMPLEX_GPU_KERNEL_ONLY_WHEN_GPU_IS_ACTIVE(kernel_number) \
680         kernel_number == ELPA_2STAGE_COMPLEX_GPU ? gpu_is_active : 1
681 
complex_kernel_is_valid(elpa_index_t index,int n,int new_value)682 static int complex_kernel_is_valid(elpa_index_t index, int n, int new_value) {
683         int solver = elpa_index_get_int_value(index, "solver", NULL);
684         if (solver == ELPA_SOLVER_1STAGE) {
685                 return new_value == ELPA_2STAGE_COMPLEX_DEFAULT;
686         }
687         int gpu_is_active = elpa_index_get_int_value(index, "gpu", NULL);
688         switch(new_value) {
689                 ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS(VALID_CASE_3, COMPLEX_GPU_KERNEL_ONLY_WHEN_GPU_IS_ACTIVE)
690                 default:
691                         return 0;
692         }
693 }
694 
elpa_autotune_level_name(int level)695 static const char* elpa_autotune_level_name(int level) {
696         switch(level) {
697                 ELPA_FOR_ALL_AUTOTUNE_LEVELS(NAME_CASE)
698                 default:
699                         return "(Invalid autotune level)";
700         }
701 }
702 
elpa_autotune_domain_name(int domain)703 static const char* elpa_autotune_domain_name(int domain) {
704         switch(domain) {
705                 ELPA_FOR_ALL_AUTOTUNE_DOMAINS(NAME_CASE)
706                 default:
707                         return "(Invalid autotune domain)";
708         }
709 }
710 
/* Validity callback for "na": the value must be strictly positive. */
static int na_is_valid(elpa_index_t index, int n, int new_value) {
        (void) index;
        (void) n;
        return 0 < new_value;
}
714 
/*
 * Validity callback for "nev": requires "na" to be set already and
 * 0 <= new_value <= na.
 */
static int nev_is_valid(elpa_index_t index, int n, int new_value) {
        if (!elpa_index_int_value_is_set(index, "na")) {
                return 0;
        }
        if (new_value < 0) {
                return 0;
        }
        return new_value <= elpa_index_get_int_value(index, "na", NULL);
}
721 
/* Generic validity callback: accepts strictly positive values only. */
static int is_positive(elpa_index_t index, int n, int new_value) {
        (void) index;
        (void) n;
        return 0 < new_value;
}
725 
/*
 * Validity callback for the bandwidth option: requires "na" to be set and
 * 0 <= new_value < na.
 */
static int bw_is_valid(elpa_index_t index, int n, int new_value) {
        if (elpa_index_int_value_is_set(index, "na") != 1) {
                return 0;
        }

        const int na = elpa_index_get_int_value(index, "na", NULL);
        if (new_value < 0) {
                return 0;
        }
        return new_value < na;
}
735 
/* Validity callback for "gpu": only 0 and 1 are accepted. */
static int gpu_is_valid(elpa_index_t index, int n, int new_value) {
        (void) index;
        (void) n;
        return (0 <= new_value) && (new_value <= 1);
}
739 
/* Cardinality callback for the band-to-full blocking option: ten candidate values. */
static int band_to_full_cardinality(elpa_index_t index) {
        (void) index;
        return 10;
}
/* Enumerate callback for the band-to-full blocking option: candidate i is i + 1. */
static int band_to_full_enumerate(elpa_index_t index, int i) {
        (void) index;
        return 1 + i;
}
746 
// TODO: shouldn't this be valid only for ELPA2?
/* Validity callback for the band-to-full blocking option: accepts 1..10 inclusive. */
static int band_to_full_is_valid(elpa_index_t index, int n, int new_value) {
        const int max_block = 10;
        if (new_value < 1) {
                return 0;
        }
        return new_value <= max_block;
}
752 
/* Cardinality callback for the real stripe width option: 17 candidate values. */
static int stripewidth_real_cardinality(elpa_index_t index) {
        (void) index;
        return 17;
}
756 
/* Cardinality callback for the complex stripe width option: 17 candidate values. */
static int stripewidth_complex_cardinality(elpa_index_t index) {
        (void) index;
        return 17;
}
760 
/*
 * Enumerate callback for the real stripe width option.
 *
 * Candidate i (0 <= i <= 16) is 32 + 4*i, i.e. 32, 36, ..., 96.
 * Any other i yields 0; the previous switch had no default and fell off the
 * end of the function for such input, leaving the result undefined.
 */
static int stripewidth_real_enumerate(elpa_index_t index, int i) {
        enum { FIRST_WIDTH = 32, WIDTH_STEP = 4, NUMBER_OF_WIDTHS = 17 };

        (void) index;
        if (i < 0 || i >= NUMBER_OF_WIDTHS) {
                return 0;
        }
        return FIRST_WIDTH + i * WIDTH_STEP;
}
799 
/*
 * Enumerate callback for the complex stripe width option.
 *
 * Candidate i (0 <= i <= 16) is 48 + 8*i, i.e. 48, 56, ..., 176.
 * Any other i yields 0; the previous switch had no default and fell off the
 * end of the function for such input, leaving the result undefined.
 */
static int stripewidth_complex_enumerate(elpa_index_t index, int i) {
        enum { FIRST_WIDTH = 48, WIDTH_STEP = 8, NUMBER_OF_WIDTHS = 17 };

        (void) index;
        if (i < 0 || i >= NUMBER_OF_WIDTHS) {
                return 0;
        }
        return FIRST_WIDTH + i * WIDTH_STEP;
}
838 
/* Validity callback for the real stripe width: accepts 32..96 inclusive. */
static int stripewidth_real_is_valid(elpa_index_t index, int n, int new_value) {
        (void) index;
        (void) n;
        return !(new_value < 32 || new_value > 96);
}
842 
/* Validity callback for the complex stripe width: accepts 48..176 inclusive. */
static int stripewidth_complex_is_valid(elpa_index_t index, int n, int new_value) {
        (void) index;
        (void) n;
        return !(new_value < 48 || new_value > 176);
}
846 
/*
 * Cardinality callback for the OpenMP thread count option.
 *
 * With WITH_OPENMP the value of omp_get_max_threads() is cached in
 * max_threads_glob on first use (guarded by set_max_threads_glob); without
 * OpenMP the cached maximum is forced to 1. Returns the cached maximum.
 */
static int omp_threads_cardinality(elpa_index_t index) {
#ifdef WITH_OPENMP
        if (set_max_threads_glob == 0) {
                max_threads_glob = omp_get_max_threads();
                set_max_threads_glob = 1;
        }
#else
        max_threads_glob = 1;
        set_max_threads_glob = 1;
#endif
        return max_threads_glob;
}
861 
/* Enumerate callback for the OpenMP thread count option: candidate i is i + 1 threads. */
static int omp_threads_enumerate(elpa_index_t index, int i) {
        (void) index;
        return 1 + i;
}
865 
/*
 * Validity callback for the OpenMP thread count option: accepts
 * 1..max_threads_glob inclusive.
 *
 * With WITH_OPENMP the maximum is taken from omp_get_max_threads() once and
 * cached; without OpenMP it is forced to 1.
 */
static int omp_threads_is_valid(elpa_index_t index, int n, int new_value) {
#ifdef WITH_OPENMP
        if (set_max_threads_glob == 0) {
                max_threads_glob = omp_get_max_threads();
                set_max_threads_glob = 1;
        }
#else
        max_threads_glob = 1;
        set_max_threads_glob = 1;
#endif
        if (new_value < 1) {
                return 0;
        }
        return new_value <= max_threads_glob;
}
880 
881 
/*
 * Validity callback for 0/1 options that may only be switched on when the
 * "gpu" option equals 1; otherwise only 0 is accepted.
 */
static int valid_with_gpu(elpa_index_t index, int n, int new_value) {
        if (elpa_index_get_int_value(index, "gpu", NULL) != 1) {
                return new_value == 0;
        }
        return (new_value == 0) || (new_value == 1);
}
891 
/*
 * Validity callback for 0/1 options that may only be switched on when the
 * 1-stage solver is selected and "gpu" equals 1; otherwise only 0 is accepted.
 */
static int valid_with_gpu_elpa1(elpa_index_t index, int n, int new_value) {
        const int solver = elpa_index_get_int_value(index, "solver", NULL);
        const int gpu_is_active = elpa_index_get_int_value(index, "gpu", NULL);
        const int may_enable = (solver == ELPA_SOLVER_1STAGE) && (gpu_is_active == 1);

        return may_enable ? ((new_value == 0) || (new_value == 1))
                          : (new_value == 0);
}
902 
/*
 * Validity callback for 0/1 options that may only be switched on when the
 * 2-stage solver is selected and "gpu" equals 1; otherwise only 0 is accepted.
 */
static int valid_with_gpu_elpa2(elpa_index_t index, int n, int new_value) {
        const int solver = elpa_index_get_int_value(index, "solver", NULL);
        const int gpu_is_active = elpa_index_get_int_value(index, "gpu", NULL);
        const int may_enable = (solver == ELPA_SOLVER_2STAGE) && (gpu_is_active == 1);

        return may_enable ? ((new_value == 0) || (new_value == 1))
                          : (new_value == 0);
}
913 
/* Cardinality callback for the max_stored_rows option: eight candidate values. */
static int max_stored_rows_cardinality(elpa_index_t index) {
        (void) index;
        return 8;
}
917 
/*
 * Enumerate callback for the max_stored_rows option.
 *
 * Candidate i (0 <= i <= 7) is 15 + 16*i, i.e. 15, 31, ..., 127.
 * Any other i yields 0; the previous switch had no default and fell off the
 * end of the function for such input, leaving the result undefined.
 */
static int max_stored_rows_enumerate(elpa_index_t index, int i) {
        enum { FIRST_ROWS = 15, ROWS_STEP = 16, NUMBER_OF_CHOICES = 8 };

        (void) index;
        if (i < 0 || i >= NUMBER_OF_CHOICES) {
                return 0;
        }
        return FIRST_ROWS + i * ROWS_STEP;
}
938 
/*
 * Validity callback for max_stored_rows: with the 2-stage solver only 15 is
 * accepted, otherwise any value in 15..127 inclusive.
 */
static int max_stored_rows_is_valid(elpa_index_t index, int n, int new_value) {
        if (elpa_index_get_int_value(index, "solver", NULL) == ELPA_SOLVER_2STAGE) {
                return new_value == 15;
        }
        return !(new_value < 15 || new_value > 127);
}
947 
948 
// TODO: this should definitely be improved (too many options to test in autotuning)
950 static const int TILE_SIZE_STEP = 128;
951 
min_tile_size_cardinality(elpa_index_t index)952 static int min_tile_size_cardinality(elpa_index_t index) {
953         int na;
954         if(index == NULL)
955                 return 0;
956         if (elpa_index_int_value_is_set(index, "na") != 1) {
957                 return 0;
958         }
959         na = elpa_index_get_int_value(index, "na", NULL);
960         return na/TILE_SIZE_STEP;
961 }
962 
/* Enumerate callback for min_tile_size: candidate i is the (i+1)-th multiple of TILE_SIZE_STEP. */
static int min_tile_size_enumerate(elpa_index_t index, int i) {
        (void) index;
        return TILE_SIZE_STEP * (i + 1);
}
966 
/* Validity callback for min_tile_size: the value must be a multiple of TILE_SIZE_STEP. */
static int min_tile_size_is_valid(elpa_index_t index, int n, int new_value) {
        (void) index;
        (void) n;
        return (new_value % TILE_SIZE_STEP) == 0;
}
970 
/*
 * Cardinality callback for intermediate_bandwidth: na / nblk candidates.
 * Returns 0 when index is NULL or either "na" or "nblk" has not been set.
 */
static int intermediate_bandwidth_cardinality(elpa_index_t index) {
        if (index == NULL) {
                return 0;
        }
        if (elpa_index_int_value_is_set(index, "na") != 1) {
                return 0;
        }
        const int na = elpa_index_get_int_value(index, "na", NULL);

        if (elpa_index_int_value_is_set(index, "nblk") != 1) {
                return 0;
        }
        const int nblk = elpa_index_get_int_value(index, "nblk", NULL);

        return na / nblk;
}
987 
/*
 * Enumerate callback for intermediate_bandwidth: candidate i is (i+1) * nblk.
 * Returns 0 when index is NULL or "nblk" has not been set.
 */
static int intermediate_bandwidth_enumerate(elpa_index_t index, int i) {
        if (index == NULL || elpa_index_int_value_is_set(index, "nblk") != 1) {
                return 0;
        }
        return elpa_index_get_int_value(index, "nblk", NULL) * (i + 1);
}
999 
/*
 * Validity callback for intermediate_bandwidth.
 *
 * Requires "na" and "nblk" to be set. With the 1-stage solver the only
 * accepted value is nblk. Otherwise the value must satisfy
 * 1 < new_value <= na and be a multiple of nblk.
 *
 * Returns 1 when the value is acceptable, 0 otherwise. The original code
 * reached the end of the function without a return statement on the
 * "acceptable" 2-stage path, so the result was undefined there.
 */
static int intermediate_bandwidth_is_valid(elpa_index_t index, int n, int new_value) {
        int na, nblk;
        if (elpa_index_int_value_is_set(index, "na") != 1) {
                return 0;
        }
        na = elpa_index_get_int_value(index, "na", NULL);

        if (elpa_index_int_value_is_set(index, "nblk") != 1) {
                return 0;
        }
        nblk = elpa_index_get_int_value(index, "nblk", NULL);

        int solver = elpa_index_get_int_value(index, "solver", NULL);
        if (solver == ELPA_SOLVER_1STAGE) {
                return new_value == nblk;
        }

        if ((new_value <= 1) || (new_value > na)) {
                return 0;
        }
        /* a non-positive nblk cannot divide anything; also avoids a modulo by zero */
        if (nblk <= 0) {
                return 0;
        }
        if (new_value % nblk != 0) {
                fprintf(stderr, "intermediate bandwidth has to be multiple of nblk\n");
                return 0;
        }
        return 1;
}
1024 
/* Cardinality callback for cannon_buffer_size: two candidate values. */
static int cannon_buffer_size_cardinality(elpa_index_t index) {
        (void) index;
        return 2;
}
1028 
/*
 * Enumerate callback for cannon_buffer_size: candidate 0 is 0, every other
 * i maps to num_process_rows - 1.
 * Returns 0 when index is NULL or "num_process_rows" has not been set.
 */
static int cannon_buffer_size_enumerate(elpa_index_t index, int i) {
        if (index == NULL || elpa_index_int_value_is_set(index, "num_process_rows") != 1) {
                return 0;
        }
        const int np_rows = elpa_index_get_int_value(index, "num_process_rows", NULL);

        // TODO: 0 is both error code and legal value?
        return (i == 0) ? 0 : (np_rows - 1);
}
1044 
/*
 * Validity callback for cannon_buffer_size: accepts 0 <= new_value <
 * num_process_rows. Returns 0 when index is NULL or "num_process_rows" has
 * not been set.
 */
static int cannon_buffer_size_is_valid(elpa_index_t index, int n, int new_value) {
        if (index == NULL || elpa_index_int_value_is_set(index, "num_process_rows") != 1) {
                return 0;
        }
        const int np_rows = elpa_index_get_int_value(index, "num_process_rows", NULL);

        if (new_value < 0) {
                return 0;
        }
        return new_value < np_rows;
}
1056 
elpa_index_instance()1057 elpa_index_t elpa_index_instance() {
1058         elpa_index_t index = (elpa_index_t) calloc(1, sizeof(struct elpa_index_struct));
1059 
1060 #define ALLOCATE(TYPE, PRINTF_SPEC, ...) \
1061         index->TYPE##_options.values = (TYPE*) calloc(nelements(TYPE##_entries), sizeof(TYPE)); \
1062         index->TYPE##_options.is_set = (int*) calloc(nelements(TYPE##_entries), sizeof(int)); \
1063         index->TYPE##_options.notified = (int*) calloc(nelements(TYPE##_entries), sizeof(int)); \
1064         for (int n = 0; n < nelements(TYPE##_entries); n++) { \
1065                 TYPE default_value = TYPE##_entries[n].default_value; \
1066                 if (!TYPE##_entries[n].base.once && !TYPE##_entries[n].base.readonly) { \
1067                         getenv_##TYPE(index, TYPE##_entries[n].base.env_default, NOTIFY_ENV_DEFAULT, n, &default_value, "Default for option"); \
1068                 } \
1069                 index->TYPE##_options.values[n] = default_value; \
1070         }
1071 
1072         FOR_ALL_TYPES(ALLOCATE)
1073 
1074         return index;
1075 }
1076 
/*
 * 1 if int option i takes part in autotuning at the given level and domain
 * (non-zero autotune_level not above `autotune_level`, domain bits
 * overlapping `autotune_domain`) but has been explicitly set in `index`;
 * 0 otherwise.
 */
static int is_tunable_but_overriden(elpa_index_t index, int i, int autotune_level, int autotune_domain) {
        if (int_entries[i].autotune_level == 0) {
                return 0;
        }
        if (int_entries[i].autotune_level > autotune_level) {
                return 0;
        }
        if (!(int_entries[i].autotune_domain & autotune_domain)) {
                return 0;
        }
        return index->int_options.is_set[i] != 0;
}
1083 
/*
 * 1 if int option i takes part in autotuning at the given level and domain
 * (non-zero autotune_level not above `autotune_level`, domain bits
 * overlapping `autotune_domain`) and has not been explicitly set in `index`;
 * 0 otherwise.
 */
static int is_tunable(elpa_index_t index, int i, int autotune_level, int autotune_domain) {
        if (int_entries[i].autotune_level == 0) {
                return 0;
        }
        if (int_entries[i].autotune_level > autotune_level) {
                return 0;
        }
        if (!(int_entries[i].autotune_domain & autotune_domain)) {
                return 0;
        }
        return index->int_options.is_set[i] == 0;
}
1090 
/*
 * Size of the autotuning search space: the product of the cardinalities of
 * all int options that are tunable for the given level and domain
 * (1 when none is tunable).
 */
int elpa_index_autotune_cardinality(elpa_index_t index, int autotune_level, int autotune_domain) {
        int combinations = 1;

        for (int i = 0; i < nelements(int_entries); i++) {
                if (!is_tunable(index, i, autotune_level, autotune_domain)) {
                        continue;
                }
                combinations *= int_entries[i].cardinality(index);
        }
        return combinations;
}
1101 
/*
 * Format int option i of `index` into `buff` as "name = value\n", or as
 * "name = value -> text\n" when the option has a to_string callback.
 *
 * The line is produced with a single sprintf call. The original appended
 * with sprintf(buff, "%s...", buff, ...), i.e. with overlapping source and
 * destination, which is undefined behaviour.
 *
 * NOTE: the size of `buff` is not known here, so the caller must provide a
 * buffer large enough for the whole line (callers in this file pass 100 bytes).
 */
void elpa_index_print_int_parameter(elpa_index_t index, char* buff, int i)
{
        const int value = index->int_options.values[i];

        if (int_entries[i].to_string) {
                sprintf(buff, "%s = %d -> %s\n", int_entries[i].base.name, value, int_entries[i].to_string(value));
        } else {
                sprintf(buff, "%s = %d\n", int_entries[i].base.name, value);
        }
}
1112 
/*
 * Decode the autotuning step number `current` into one value per tunable
 * int option and store those values in `index`.
 *
 * `current` is treated as a mixed-radix number: for each tunable option, in
 * entry order, the digit (remaining % cardinality) selects the value via the
 * option's enumerate callback and the remainder is carried on.
 *
 * Returns 1 if every tunable option could be set, 0 as soon as a selected
 * value is rejected by the option's valid callback or an option reports a
 * non-positive cardinality (which previously caused a division by zero).
 * Options processed before a failure keep their newly assigned values.
 */
int elpa_index_set_autotune_parameters(elpa_index_t index, int autotune_level, int autotune_domain, int current) {
        int remaining = current;
        int debug = elpa_index_get_int_value(index, "debug", NULL);

        for (int i = 0; i < nelements(int_entries); i++) {
                if (!is_tunable(index, i, autotune_level, autotune_domain)) {
                        continue;
                }

                const int option_cardinality = int_entries[i].cardinality(index);
                if (option_cardinality <= 0) {
                        /* no candidate values: this combination cannot be set */
                        return 0;
                }

                const int value = int_entries[i].enumerate(index, remaining % option_cardinality);
                /* Try to set option i to that value */
                if (!int_entries[i].valid(index, i, value)) {
                        return 0;
                }
                index->int_options.values[i] = value;
                remaining /= option_cardinality;
        }
        if (debug == 1 && elpa_index_is_printing_mpi_rank(index)) {
                fprintf(stderr, "\n*** AUTOTUNING: setting a new combination of parameters, idx %d ***\n", current);
                elpa_index_print_autotune_parameters(index, autotune_level, autotune_domain);
                fprintf(stderr, "***\n\n");
        }

        /* Could set all values */
        return 1;
}
1142 
/*
 * On the printing MPI rank, write every int option that is tunable for the
 * given level and domain to stderr, one formatted line per option.
 * Always returns 1.
 */
int elpa_index_print_autotune_parameters(elpa_index_t index, int autotune_level, int autotune_domain) {
        char line[100];

        if (!elpa_index_is_printing_mpi_rank(index)) {
                return 1;
        }
        for (int i = 0; i < nelements(int_entries); i++) {
                if (!is_tunable(index, i, autotune_level, autotune_domain)) {
                        continue;
                }
                elpa_index_print_int_parameter(index, line, i);
                fprintf(stderr, "%s", line);
        }
        return 1;
}
1155 
/*
 * Print the state of an autotuning run.
 *
 * A temporary index is created and, when min_loc > -1, filled with the
 * values encoded by the best step number `min_loc` (same mixed-radix
 * decoding as used when setting autotune parameters). On the printing MPI
 * rank the state is then written to `file_name`, or to stdout when
 * `file_name` is the empty string.
 *
 * Returns 1 on success and 0 if the temporary index cannot be created or the
 * output file cannot be opened. The temporary index is released on every
 * path; the original leaked it when fopen failed.
 */
int elpa_index_print_autotune_state(elpa_index_t index, int autotune_level, int autotune_domain, int min_loc,
                                    double min_val, int current, int cardinality, char* file_name) {
        char buff[100];
        const int have_best = (min_loc > -1);
        elpa_index_t index_best;
        FILE *f;

        // get index with the currently best parameters
        index_best = elpa_index_instance();
        if (index_best == NULL) {
                return 0;
        }

        if (have_best) {
                int remaining = min_loc;
                for (int i = 0; i < nelements(int_entries); i++) {
                        if (!is_tunable(index, i, autotune_level, autotune_domain)) {
                                continue;
                        }
                        const int option_cardinality = int_entries[i].cardinality(index);
                        if (option_cardinality <= 0) {
                                /* nothing to decode for this option; avoids a division by zero */
                                continue;
                        }
                        /* we are setting the value for output only, we do not need to check consistency */
                        index_best->int_options.values[i] = int_entries[i].enumerate(index, remaining % option_cardinality);
                        remaining /= option_cardinality;
                }
        }
        if (elpa_index_is_printing_mpi_rank(index)) {
                const int output_to_file = (strlen(file_name) > 0);
                if (output_to_file) {
                        f = fopen(file_name, "w");
                        if (f == NULL) {
                                fprintf(stderr, "Cannot open file %s in elpa_index_print_autotune_state\n", file_name);
                                elpa_index_free(index_best);
                                return 0;
                        }
                } else {
                        f = stdout;
                }

                if (!output_to_file) {
                        fprintf(f, "\n");
                }
                fprintf(f, "*** AUTOTUNING STATE ***\n");
                fprintf(f, "** This is the state of the autotuning object\n");
                fprintf(f, "autotune_level = %d -> %s\n", autotune_level, elpa_autotune_level_name(autotune_level));
                fprintf(f, "autotune_domain = %d -> %s\n", autotune_domain, elpa_autotune_domain_name(autotune_domain));
                fprintf(f, "autotune_cardinality = %d\n", cardinality);
                fprintf(f, "current_idx = %d\n", current);
                fprintf(f, "best_idx = %d\n", min_loc);
                fprintf(f, "best_time = %g\n", min_val);
                if (have_best) {
                        fprintf(f, "** The following parameters are autotuned with so far the best values\n");
                        for (int i = 0; i < nelements(int_entries); i++) {
                                if (is_tunable(index, i, autotune_level, autotune_domain)) {
                                        elpa_index_print_int_parameter(index_best, buff, i);
                                        fprintf(f, "%s", buff);
                                }
                        }
                        fprintf(f, "** The following parameters would be autotuned on the selected autotuning level, but were overridden by the set() method\n");
                        for (int i = 0; i < nelements(int_entries); i++) {
                                if (is_tunable_but_overriden(index, i, autotune_level, autotune_domain)) {
                                        elpa_index_print_int_parameter(index, buff, i);
                                        fprintf(f, "%s", buff);
                                }
                        }
                } else {
                        fprintf(f, "** No output after first step\n");
                }
                fprintf(f, "*** END OF AUTOTUNING STATE ***\n");

                if (output_to_file) {
                        fclose(f);
                }
        }
        elpa_index_free(index_best);

        return 1;
}
1227 
1228 const int LEN =1000;
1229 
1230 #define IMPLEMENT_LOAD_LINE(TYPE, PRINTF_SPEC, SCANF_SPEC, ...) \
1231         static int load_##TYPE##_line(FILE* f, const char* expected, TYPE* val) { \
1232                 char line[LEN], s[LEN]; \
1233                 int error = 0; \
1234                 TYPE n; \
1235                 if(fgets(line, LEN, f) == NULL){ \
1236                         fprintf(stderr, "Loading autotuning state error: line is not there\n"); \
1237                         error = 1; \
1238                 } else{ \
1239                         sscanf(line, "%s = " SCANF_SPEC "\n", s, &n); \
1240                         if(strcmp(s, expected) != 0){ \
1241                                 fprintf(stderr, "Loading autotuning state error: expected %s, got %s\n", expected, s); \
1242                                 error = 1;\
1243                         } else{ \
1244                                 *val = n; \
1245                         } \
1246                 } \
1247                 if(error){ \
1248                         fprintf(stderr, "Autotuning state file corrupted\n"); \
1249                         return 0; \
1250                 } \
1251                 return 1; \
1252         }
FOR_ALL_TYPES(IMPLEMENT_LOAD_LINE)1253 FOR_ALL_TYPES(IMPLEMENT_LOAD_LINE)
1254 
1255 int elpa_index_load_autotune_state(elpa_index_t index, int* autotune_level, int* autotune_domain, int* min_loc,
1256                                     double* min_val, int* current, int* cardinality, char* file_name) {
1257         char line[LEN];
1258         FILE *f;
1259 
1260         //TODO: should be broadcasted, instead of read on all ranks
1261         //if(elpa_index_is_printing_mpi_rank(index)){
1262                 f = fopen(file_name, "r");
1263 
1264                 if (f == NULL) {
1265                         fprintf(stderr, "Cannont open file %s\n", file_name);
1266                         return(0);
1267                 }
1268 
1269 
1270                 if(fgets(line, LEN, f) == NULL) return 0;
1271                 if(fgets(line, LEN, f) == NULL) return 0;
1272                 if(! load_int_line(f, "autotune_level", autotune_level)) return 0;
1273                 if(! load_int_line(f, "autotune_domain", autotune_domain)) return 0;
1274                 if(! load_int_line(f, "autotune_cardinality", cardinality)) return 0;
1275                 if(! load_int_line(f, "current_idx", current)) return 0;
1276                 if(! load_int_line(f, "best_idx", min_loc)) return 0;
1277                 if(! load_double_line(f, "best_time", min_val)) return 0;
1278                 fclose(f);
1279        // }
1280 
1281         return 1;
1282 }
1283 
/*
 * Section header lines of the settings file. The same strings are written
 * when printing the settings and compared against when loading them.
 */
const char STRUCTURE_PARAMETERS[] =
        "* Parameters describing structure of the computation:\n";
const char EXPLICIT_PARAMETERS[] =
        "* Parameters explicitly set by the user:\n";
const char DEFAULT_PARAMETERS[] =
        "* Parameters with default or environment value:\n";
1287 
/* Append `text` to the NUL-terminated string in `dst`, never writing past `capacity` bytes in total. */
static void print_settings_append(char *dst, size_t capacity, const char *text) {
        const size_t used = strlen(dst);
        if (used + 1 < capacity) {
                snprintf(dst + used, capacity - used, "%s", text);
        }
}

/*
 * Print all int options of `index`, grouped into three sections: structure
 * parameters, parameters explicitly set by the user, and parameters holding
 * a default or environment value. Options whose print flag is neither
 * PRINT_STRUCTURE nor PRINT_YES are omitted.
 *
 * Output happens only on the printing MPI rank and goes to `file_name`, or
 * to stdout when `file_name` is the empty string.
 * Returns 1 on success (also on non-printing ranks), 0 if the file cannot be
 * opened.
 *
 * Sections are built with bounded appends; the original used
 * sprintf(*out, "%s%s", *out, buff), whose overlapping source and
 * destination is undefined behaviour and which could overrun the buffer.
 */
int elpa_index_print_settings(elpa_index_t index, char *file_name) {
        enum { SECTION_CAPACITY = 10000 };
        char out_structure[SECTION_CAPACITY], out_set[SECTION_CAPACITY], out_defaults[SECTION_CAPACITY], buff[100];
        FILE *f;

        if (!elpa_index_is_printing_mpi_rank(index)) {
                return 1;
        }

        snprintf(out_structure, SECTION_CAPACITY, "%s", STRUCTURE_PARAMETERS);
        snprintf(out_set, SECTION_CAPACITY, "%s", EXPLICIT_PARAMETERS);
        snprintf(out_defaults, SECTION_CAPACITY, "%s", DEFAULT_PARAMETERS);

        for (int i = 0; i < nelements(int_entries); i++) {
                char *section;
                if (int_entries[i].base.print_flag == PRINT_STRUCTURE) {
                        section = out_structure;
                } else if (int_entries[i].base.print_flag == PRINT_YES) {
                        section = index->int_options.is_set[i] ? out_set : out_defaults;
                } else {
                        /* not to be printed */
                        continue;
                }
                elpa_index_print_int_parameter(index, buff, i);
                print_settings_append(section, SECTION_CAPACITY, buff);
        }

        const int output_to_file = (strlen(file_name) > 0);
        if (output_to_file) {
                f = fopen(file_name, "w");
                if (f == NULL) {
                        fprintf(stderr, "Cannot open file %s in elpa_index_print_settings\n", file_name);
                        return 0;
                }
        } else {
                f = stdout;
        }

        fprintf(f, "*** ELPA STATE ***\n");
        fprintf(f, "%s%s%s", out_structure, out_set, out_defaults);
        fprintf(f, "*** END OF ELPA STATE ***\n");
        if (output_to_file) {
                fclose(f);
        }

        return 1;
}
1332 
/*
 * Load integer options into `index` from a text file.
 *
 * The file is read line by line. Lines are ignored until a line equal to
 * EXPLICIT_PARAMETERS or DEFAULT_PARAMETERS is seen; those header lines
 * switch the mode passed on as the last argument of
 * elpa_index_set_from_load_int_value (1 after EXPLICIT_PARAMETERS, 0 after
 * DEFAULT_PARAMETERS). Within a section, every line of the form
 * "<name> = <integer>" is forwarded to elpa_index_set_from_load_int_value.
 * Empty lines, lines starting with '*', and lines that do not match that
 * form are skipped.
 *
 * index     - the index receiving the values
 * file_name - path of the file to read
 *
 * Returns 1 if the file was read, 0 if it could not be opened.
 */
int elpa_index_load_settings(elpa_index_t index, char *file_name) {
        /* Compile-time constant so the buffers are ordinary arrays, not VLAs. */
        enum { LEN = 1000 };
        char line[LEN], s[LEN];
        int n;
        FILE *f;
        int skip = 1;
        int explicit = 0;

        if (file_name == NULL) {
                fprintf(stderr, "Cannot open file: no file name given\n");
                return 0;
        }

        //TODO: should be broadcasted, instead of read on all ranks
        f = fopen(file_name, "r");
        if (f == NULL) {
                fprintf(stderr, "Cannot open file %s\n", file_name);
                return 0;
        }

        while (fgets(line, sizeof line, f) != NULL) {
                /* Section headers only switch the mode; they carry no value. */
                if (strcmp(line, EXPLICIT_PARAMETERS) == 0) {
                        skip = 0;
                        explicit = 1;
                        continue;
                }
                if (strcmp(line, DEFAULT_PARAMETERS) == 0) {
                        skip = 0;
                        explicit = 0;
                        continue;
                }

                if (skip || line[0] == '\n' || line[0] == '*') {
                        continue;
                }

                /* Only use s and n when both conversions succeeded; otherwise
                 * they would hold indeterminate or stale values. s is as large
                 * as line, so the unbounded %s cannot overflow it. */
                if (sscanf(line, "%s = %d", s, &n) != 2) {
                        continue;
                }

                /* The result is not acted upon here: an entry that cannot be
                 * applied does not abort loading of the remaining ones. */
                (void) elpa_index_set_from_load_int_value(index, s, n, explicit);
        }
        fclose(f);

        return 1;
}
1374 
1375 
/*
 * Decide whether the calling process should produce printed output.
 *
 * If the "process_id" option is set in `index`, returns nonzero only when
 * its value is 0. If it is not set, a warning is written to stdout and 1
 * is returned, so every caller prints.
 */
int elpa_index_is_printing_mpi_rank(elpa_index_t index)
{
  if (elpa_index_int_value_is_set(index, "process_id")) {
    int process_id = elpa_index_get_int_value(index, "process_id", NULL);
    return (process_id == 0);
  }
  /* Terminate the line so the warning does not run into whatever is
   * printed to stdout next. */
  printf("Warning: process_id not set, printing on all MPI ranks. This can happen with legacy API.\n");
  return 1;
}
1386