1 // This file is part of ELPA.
2 //
3 // The ELPA library was originally created by the ELPA consortium,
4 // consisting of the following organizations:
5 //
6 // - Max Planck Computing and Data Facility (MPCDF), formerly known as
7 // Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
8 // - Bergische Universität Wuppertal, Lehrstuhl für angewandte
9 // Informatik,
10 // - Technische Universität München, Lehrstuhl für Informatik mit
11 // Schwerpunkt Wissenschaftliches Rechnen ,
12 // - Fritz-Haber-Institut, Berlin, Abt. Theorie,
// - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
// Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
15 // and
16 // - IBM Deutschland GmbH
17 //
18 // This particular source code file contains additions, changes and
19 // enhancements authored by Intel Corporation which is not part of
20 // the ELPA consortium.
21 //
22 // More information can be found here:
23 // http://elpa.mpcdf.mpg.de/
24 //
25 // ELPA is free software: you can redistribute it and/or modify
26 // it under the terms of the version 3 of the license of the
27 // GNU Lesser General Public License as published by the Free
28 // Software Foundation.
29 //
30 // ELPA is distributed in the hope that it will be useful,
31 // but WITHOUT ANY WARRANTY; without even the implied warranty of
32 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
33 // GNU Lesser General Public License for more details.
34 //
35 // You should have received a copy of the GNU Lesser General Public License
36 // along with ELPA. If not, see <http://www.gnu.org/licenses/>
37 //
38 // ELPA reflects a substantial effort on the part of the original
39 // ELPA consortium, and we ask you to respect the spirit of the
40 // license that we chose: i.e., please contribute any changes you
41 // may have back to the original ELPA library distribution, and keep
42 // any derivatives of ELPA under the same license that we chose for
43 // the original distribution, the GNU Lesser General Public License.
44 //
45 // Authors: L. Huedepohl and A. Marek, MPCDF
46 #include <assert.h>
47 #include <stdio.h>
48 #include <stdlib.h>
49 #include <elpa/elpa.h>
50 #include "elpa_index.h"
51
52 #include "config.h"
53
54 #ifdef WITH_OPENMP
55 #include <omp.h>
56 #endif
57
/* File-scope thread bookkeeping with external linkage.
 * NOTE(review): presumably max_threads_glob caches a maximum thread count and
 * set_max_threads_glob records whether it has been initialised; neither is
 * written in the visible part of this file -- confirm against the users. */
int max_threads_glob;
int set_max_threads_glob=0;
60
/* Forward declarations of the callbacks referenced by the option tables below.
 * Naming pattern: *_cardinality / number_of_* feed the .cardinality slot,
 * *_enumerate the .enumerate slot, *_is_valid / valid_* the .valid slot and
 * *_name the .to_string slot of an int entry. */

/* Generic helpers for boolean-like options */
static int enumerate_identity(elpa_index_t index, int i);
static int cardinality_bool(elpa_index_t index);
static int valid_bool(elpa_index_t index, int n, int new_value);

/* "solver" */
static int number_of_solvers(elpa_index_t index);
static int solver_enumerate(elpa_index_t index, int i);
static int solver_is_valid(elpa_index_t index, int n, int new_value);
static const char* elpa_solver_name(int solver);

/* "real_kernel" */
static int number_of_real_kernels(elpa_index_t index);
static int real_kernel_enumerate(elpa_index_t index, int i);
static int real_kernel_is_valid(elpa_index_t index, int n, int new_value);
static const char *real_kernel_name(int kernel);

/* "complex_kernel" */
static int number_of_complex_kernels(elpa_index_t index);
static int complex_kernel_enumerate(elpa_index_t index, int i);
static int complex_kernel_is_valid(elpa_index_t index, int n, int new_value);
static const char *complex_kernel_name(int kernel);

/* "blocking_in_band_to_full" */
static int band_to_full_cardinality(elpa_index_t index);
static int band_to_full_enumerate(elpa_index_t index, int i);
static int band_to_full_is_valid(elpa_index_t index, int n, int new_value);

/* "stripewidth_real" */
static int stripewidth_real_cardinality(elpa_index_t index);
static int stripewidth_real_enumerate(elpa_index_t index, int i);
static int stripewidth_real_is_valid(elpa_index_t index, int n, int new_value);

/* "stripewidth_complex" */
static int stripewidth_complex_cardinality(elpa_index_t index);
static int stripewidth_complex_enumerate(elpa_index_t index, int i);
static int stripewidth_complex_is_valid(elpa_index_t index, int n, int new_value);

/* "omp_threads" */
static int omp_threads_cardinality(elpa_index_t index);
static int omp_threads_enumerate(elpa_index_t index, int i);
static int omp_threads_is_valid(elpa_index_t index, int n, int new_value);

/* "max_stored_rows" */
static int max_stored_rows_cardinality(elpa_index_t index);
static int max_stored_rows_enumerate(elpa_index_t index, int i);
static int max_stored_rows_is_valid(elpa_index_t index, int n, int new_value);

/* "min_tile_size" */
static int min_tile_size_cardinality(elpa_index_t index);
static int min_tile_size_enumerate(elpa_index_t index, int i);
static int min_tile_size_is_valid(elpa_index_t index, int n, int new_value);

/* Validators for the per-phase "gpu_*" switches */
static int valid_with_gpu(elpa_index_t index, int n, int new_value);
static int valid_with_gpu_elpa1(elpa_index_t index, int n, int new_value);
static int valid_with_gpu_elpa2(elpa_index_t index, int n, int new_value);

/* "intermediate_bandwidth" */
static int intermediate_bandwidth_cardinality(elpa_index_t index);
static int intermediate_bandwidth_enumerate(elpa_index_t index, int i);
static int intermediate_bandwidth_is_valid(elpa_index_t index, int n, int new_value);

/* "cannon_buffer_size" */
static int cannon_buffer_size_cardinality(elpa_index_t index);
static int cannon_buffer_size_enumerate(elpa_index_t index, int i);
static int cannon_buffer_size_is_valid(elpa_index_t index, int n, int new_value);

/* Validators for the set-once parameters and the "gpu" switch */
static int na_is_valid(elpa_index_t index, int n, int new_value);
static int nev_is_valid(elpa_index_t index, int n, int new_value);
static int bw_is_valid(elpa_index_t index, int n, int new_value);
static int gpu_is_valid(elpa_index_t index, int n, int new_value);

static int is_positive(elpa_index_t index, int n, int new_value);

/* String conversion for double options (used by the generated getenv_double) */
static int elpa_double_string_to_value(char *name, char *string, double *value);
static int elpa_double_value_to_string(char *name, double value, const char **string);
125
/* Designated initialiser for the common `.base` part of an option entry.
 * option_name must be a string literal: the environment variable names are
 * built by literal concatenation ("ELPA_DEFAULT_" / "ELPA_FORCE_" + name). */
#define BASE_ENTRY(option_name, option_description, once_value, readonly_value, print_flag_value) \
        .base = { \
                .name = option_name, \
                .description = option_description, \
                .once = once_value, \
                .readonly = readonly_value, \
                .env_default = "ELPA_DEFAULT_" option_name, \
                .env_force = "ELPA_FORCE_" option_name, \
                .print_flag = print_flag_value, \
        }

/* Int entry with once = 1, readonly = 0 and only a validator
 * (valid_func may be NULL); no default, autotune or enumeration fields. */
#define INT_PARAMETER_ENTRY(option_name, option_description, valid_func, print_flag) \
        { \
                BASE_ENTRY(option_name, option_description, 1, 0, print_flag), \
                .valid = valid_func, \
        }

/* Int entry restricted to 0/1: wires in cardinality_bool, enumerate_identity
 * and valid_bool; once = 0, readonly = 0. */
#define BOOL_ENTRY(option_name, option_description, default, tune_level, tune_domain, print_flag) \
        { \
                BASE_ENTRY(option_name, option_description, 0, 0, print_flag), \
                .default_value = default, \
                .autotune_level = tune_level, \
                .autotune_domain = tune_domain, \
                .cardinality = cardinality_bool, \
                .enumerate = enumerate_identity, \
                .valid = valid_bool, \
        }

/* General int entry: default value, autotune level/domain and the four
 * callbacks (cardinality, enumerate, valid, to_string); once = 0, readonly = 0. */
#define INT_ENTRY(option_name, option_description, default, tune_level, tune_domain, card_func, enumerate_func, valid_func, to_string_func, print_flag) \
        { \
                BASE_ENTRY(option_name, option_description, 0, 0, print_flag), \
                .default_value = default, \
                .autotune_level = tune_level, \
                .autotune_domain = tune_domain, \
                .cardinality = card_func, \
                .enumerate = enumerate_func, \
                .valid = valid_func, \
                .to_string = to_string_func, \
        }

/* Int entry that sets only the base fields: no validator, so the generated
 * setter accepts any value (once = 0, readonly = 0). */
#define INT_ANY_ENTRY(option_name, option_description, print_flag) \
        { \
                BASE_ENTRY(option_name, option_description, 0, 0, print_flag), \
        }
170
/* Table of all integer options known to the index.
 *
 * The order here is important! Tunable options that depend on other
 * tunable options must appear later in the list than their prerequisites.
 * An option's position in this array is also the index `n` used for the
 * per-object values/is_set/notified arrays. */
static const elpa_index_int_entry_t int_entries[] = {
        INT_PARAMETER_ENTRY("na", "Global matrix has size (na * na)", na_is_valid, PRINT_STRUCTURE),
        INT_PARAMETER_ENTRY("nev", "Number of eigenvectors to be computed, 0 <= nev <= na", nev_is_valid, PRINT_STRUCTURE),
        INT_PARAMETER_ENTRY("nblk", "Block size of scalapack block-cyclic distribution", is_positive, PRINT_STRUCTURE),
        INT_PARAMETER_ENTRY("local_nrows", "Number of matrix rows stored on this process", NULL, PRINT_NO),
        INT_PARAMETER_ENTRY("local_ncols", "Number of matrix columns stored on this process", NULL, PRINT_NO),
        INT_PARAMETER_ENTRY("process_row", "Process row number in the 2D domain decomposition", NULL, PRINT_NO),
        INT_PARAMETER_ENTRY("process_col", "Process column number in the 2D domain decomposition", NULL, PRINT_NO),
        INT_PARAMETER_ENTRY("process_id", "Process rank", NULL, PRINT_NO),
        INT_PARAMETER_ENTRY("num_process_rows", "Number of process row number in the 2D domain decomposition", NULL, PRINT_STRUCTURE),
        INT_PARAMETER_ENTRY("num_process_cols", "Number of process column number in the 2D domain decomposition", NULL, PRINT_STRUCTURE),
        INT_PARAMETER_ENTRY("num_processes", "Total number of processes", NULL, PRINT_STRUCTURE),
        INT_PARAMETER_ENTRY("bandwidth", "If specified, a band matrix with this bandwidth is expected as input; bandwidth must be multiply of nblk", bw_is_valid, PRINT_YES),
        INT_ANY_ENTRY("mpi_comm_rows", "Communicator for inter-row communication", PRINT_NO),
        INT_ANY_ENTRY("mpi_comm_cols", "Communicator for inter-column communication", PRINT_NO),
        INT_ANY_ENTRY("mpi_comm_parent", "Parent communicator", PRINT_NO),
        INT_ANY_ENTRY("blacs_context", "BLACS context", PRINT_NO),
        INT_ENTRY("solver", "Solver to use", ELPA_SOLVER_1STAGE, ELPA_AUTOTUNE_FAST, ELPA_AUTOTUNE_DOMAIN_ANY, \
                        number_of_solvers, solver_enumerate, solver_is_valid, elpa_solver_name, PRINT_YES),
        INT_ENTRY("gpu", "Use GPU acceleration", 0, ELPA_AUTOTUNE_MEDIUM, ELPA_AUTOTUNE_DOMAIN_ANY,
                        cardinality_bool, enumerate_identity, gpu_is_valid, NULL, PRINT_YES),
        // The default for GPU usage in the individual phases is 1. However, it is only evaluated if the GPU
        // is used at all, which first has to be determined by the parameter "gpu" and the presence of the device.
        INT_ENTRY("gpu_tridiag", "Use GPU acceleration for ELPA1 tridiagonalization", 1, ELPA_AUTOTUNE_MEDIUM, ELPA_AUTOTUNE_DOMAIN_ANY,
                        cardinality_bool, enumerate_identity, valid_with_gpu_elpa1, NULL, PRINT_YES),
        INT_ENTRY("gpu_solve_tridi", "Use GPU acceleration for ELPA solve tridi", 1, ELPA_AUTOTUNE_MEDIUM, ELPA_AUTOTUNE_DOMAIN_ANY,
                        cardinality_bool, enumerate_identity, valid_with_gpu, NULL, PRINT_YES),
        INT_ENTRY("gpu_trans_ev", "Use GPU acceleration for ELPA1 trans ev", 1, ELPA_AUTOTUNE_MEDIUM, ELPA_AUTOTUNE_DOMAIN_ANY,
                        cardinality_bool, enumerate_identity, valid_with_gpu_elpa1, NULL, PRINT_YES),
        INT_ENTRY("gpu_bandred", "Use GPU acceleration for ELPA2 band reduction", 1, ELPA_AUTOTUNE_MEDIUM, ELPA_AUTOTUNE_DOMAIN_ANY,
                        cardinality_bool, enumerate_identity, valid_with_gpu_elpa2, NULL, PRINT_YES),
        INT_ENTRY("gpu_tridiag_band", "Use GPU acceleration for ELPA2 tridiagonalization", 1, ELPA_AUTOTUNE_MEDIUM, ELPA_AUTOTUNE_DOMAIN_ANY,
                        cardinality_bool, enumerate_identity, valid_with_gpu_elpa2, NULL, PRINT_YES),
        INT_ENTRY("gpu_trans_ev_tridi_to_band", "Use GPU acceleration for ELPA2 trans_ev_tridi_to_band", 1, ELPA_AUTOTUNE_MEDIUM, ELPA_AUTOTUNE_DOMAIN_ANY,
                        cardinality_bool, enumerate_identity, valid_with_gpu_elpa2, NULL, PRINT_YES),
        INT_ENTRY("gpu_trans_ev_band_to_full", "Use GPU acceleration for ELPA2 trans_ev_band_to_full", 1, ELPA_AUTOTUNE_MEDIUM, ELPA_AUTOTUNE_DOMAIN_ANY,
                        cardinality_bool, enumerate_identity, valid_with_gpu_elpa2, NULL, PRINT_YES),
        INT_ENTRY("real_kernel", "Real kernel to use if 'solver' is set to ELPA_SOLVER_2STAGE", ELPA_2STAGE_REAL_DEFAULT, ELPA_AUTOTUNE_FAST, ELPA_AUTOTUNE_DOMAIN_REAL, \
                        number_of_real_kernels, real_kernel_enumerate, real_kernel_is_valid, real_kernel_name, PRINT_YES),
        INT_ENTRY("complex_kernel", "Complex kernel to use if 'solver' is set to ELPA_SOLVER_2STAGE", ELPA_2STAGE_COMPLEX_DEFAULT, ELPA_AUTOTUNE_FAST, ELPA_AUTOTUNE_DOMAIN_COMPLEX, \
                        number_of_complex_kernels, complex_kernel_enumerate, complex_kernel_is_valid, complex_kernel_name, PRINT_YES),

        INT_ENTRY("min_tile_size", "Minimal tile size used internally in elpa1_tridiag and elpa2_bandred", 0, ELPA_AUTOTUNE_NOT_TUNABLE, ELPA_AUTOTUNE_DOMAIN_ANY,
                        min_tile_size_cardinality, min_tile_size_enumerate, min_tile_size_is_valid, NULL, PRINT_YES),
        INT_ENTRY("intermediate_bandwidth", "Specifies the intermediate bandwidth in ELPA2 full->banded step. Must be a multiple of nblk", 0, ELPA_AUTOTUNE_NOT_TUNABLE, ELPA_AUTOTUNE_DOMAIN_ANY,
                        intermediate_bandwidth_cardinality, intermediate_bandwidth_enumerate, intermediate_bandwidth_is_valid, NULL, PRINT_YES),

        INT_ENTRY("blocking_in_band_to_full", "Loop blocking, default 3", 3, ELPA_AUTOTUNE_EXTENSIVE, ELPA_AUTOTUNE_DOMAIN_ANY,
                        band_to_full_cardinality, band_to_full_enumerate, band_to_full_is_valid, NULL, PRINT_YES),
        INT_ENTRY("stripewidth_real", "Stripewidth_real, default 48. Must be a multiple of 4", 48, ELPA_AUTOTUNE_EXTENSIVE, ELPA_AUTOTUNE_DOMAIN_REAL,
                        stripewidth_real_cardinality, stripewidth_real_enumerate, stripewidth_real_is_valid, NULL, PRINT_YES),
        INT_ENTRY("stripewidth_complex", "Stripewidth_complex, default 96. Must be a multiple of 8", 96, ELPA_AUTOTUNE_EXTENSIVE, ELPA_AUTOTUNE_DOMAIN_COMPLEX,
                        stripewidth_complex_cardinality, stripewidth_complex_enumerate, stripewidth_complex_is_valid, NULL, PRINT_YES),

        INT_ENTRY("max_stored_rows", "Maximum number of stored rows used in ELPA 1 backtransformation, default 63", 63, ELPA_AUTOTUNE_EXTENSIVE, ELPA_AUTOTUNE_DOMAIN_ANY,
                        max_stored_rows_cardinality, max_stored_rows_enumerate, max_stored_rows_is_valid, NULL, PRINT_YES),
        /* "omp_threads" is only offered to the autotuner in OpenMP builds. */
#ifdef WITH_OPENMP
        INT_ENTRY("omp_threads", "OpenMP threads used in ELPA, default 1", 1, ELPA_AUTOTUNE_FAST, ELPA_AUTOTUNE_DOMAIN_ANY,
                        omp_threads_cardinality, omp_threads_enumerate, omp_threads_is_valid, NULL, PRINT_YES),
#else
        INT_ENTRY("omp_threads", "OpenMP threads used in ELPA, default 1", 1, ELPA_AUTOTUNE_NOT_TUNABLE, ELPA_AUTOTUNE_DOMAIN_ANY,
                        omp_threads_cardinality, omp_threads_enumerate, omp_threads_is_valid, NULL, PRINT_YES),
#endif
        INT_ENTRY("cannon_buffer_size", "Increasing the buffer size might make it faster, but costs memory", 0, ELPA_AUTOTUNE_NOT_TUNABLE, ELPA_AUTOTUNE_DOMAIN_ANY,
                        cannon_buffer_size_cardinality, cannon_buffer_size_enumerate, cannon_buffer_size_is_valid, NULL, PRINT_YES),
        //BOOL_ENTRY("qr", "Use QR decomposition, only used for ELPA_SOLVER_2STAGE, real case", 0, ELPA_AUTOTUNE_MEDIUM, ELPA_AUTOTUNE_DOMAIN_REAL),
        BOOL_ENTRY("qr", "Use QR decomposition, only used for ELPA_SOLVER_2STAGE, real case", 0, ELPA_AUTOTUNE_NOT_TUNABLE, ELPA_AUTOTUNE_DOMAIN_REAL, PRINT_YES),
        BOOL_ENTRY("timings", "Enable time measurement", 0, ELPA_AUTOTUNE_NOT_TUNABLE, 0, PRINT_YES),
        BOOL_ENTRY("debug", "Emit verbose debugging messages", 0, ELPA_AUTOTUNE_NOT_TUNABLE, 0, PRINT_YES),
        BOOL_ENTRY("print_flops", "Print FLOP rates on task 0", 0, ELPA_AUTOTUNE_NOT_TUNABLE, 0, PRINT_YES),
        BOOL_ENTRY("measure_performance", "Also measure with flops (via papi) with the timings", 0, ELPA_AUTOTUNE_NOT_TUNABLE, 0, PRINT_YES),
        BOOL_ENTRY("check_pd", "Check eigenvalues to be positive", 0, ELPA_AUTOTUNE_NOT_TUNABLE, 0, PRINT_YES),
        BOOL_ENTRY("cannon_for_generalized", "Whether to use Cannons algorithm for the generalized EVP", 1, ELPA_AUTOTUNE_NOT_TUNABLE, 0, PRINT_YES),
};
247
/* Double entry with readonly = 1 (once = 0, print flag 0); the generated
 * setter rejects such entries with ELPA_ERROR_ENTRY_READONLY. */
#define READONLY_DOUBLE_ENTRY(option_name, option_description) \
        { \
                BASE_ENTRY(option_name, option_description, 0, 1, 0) \
        }

/* Table of double options. It currently has no elements, which is why the
 * generated accessors test `sizeof(TYPE##_entries) == 0` before searching. */
static const elpa_index_double_entry_t double_entries[] = {
        /* Empty for now */
};
256
/*
 * Release an index object.
 *
 * For every option type listed by FOR_ALL_TYPES the three per-type arrays
 * (values, is_set, notified) are freed, then the index itself.
 *
 * index: object to destroy; NULL is accepted and ignored, mirroring free().
 */
void elpa_index_free(elpa_index_t index) {
        /* The per-type arrays are reached through index, so bail out early
         * instead of dereferencing a NULL handle. */
        if (index == NULL) {
                return;
        }

#define FREE_OPTION(TYPE, ...) \
        free(index->TYPE##_options.values); \
        free(index->TYPE##_options.is_set); \
        free(index->TYPE##_options.notified);

        FOR_ALL_TYPES(FREE_OPTION);

        free(index);
}
267
compar(const void * a,const void * b)268 static int compar(const void *a, const void *b) {
269 return strcmp(((elpa_index_int_entry_t *) a)->base.name,
270 ((elpa_index_int_entry_t *) b)->base.name);
271 }
272
/* Generates find_<TYPE>_entry(name) for every option type.
 *
 * The function does a linear lfind() over <TYPE>_entries, comparing only
 * base.name via compar(), and returns the array index of the matching entry
 * or -1 when no entry has that name.
 *
 * NOTE(review): compar() views every element as elpa_index_int_entry_t, so
 * this presumably relies on `base` sitting at the same offset in all entry
 * types -- confirm against the struct definitions. */
#define IMPLEMENT_FIND_ENTRY(TYPE, ...) \
        static int find_##TYPE##_entry(char *name) { \
                elpa_index_##TYPE##_entry_t *entry; \
                elpa_index_##TYPE##_entry_t key = { .base = {.name = name} } ; \
                size_t nmembers = nelements(TYPE##_entries); \
                entry = lfind((const void*) &key, (const void *) TYPE##_entries, &nmembers, sizeof(elpa_index_##TYPE##_entry_t), compar); \
                if (entry) { \
                        return (entry - &TYPE##_entries[0]); \
                } else { \
                        return -1; \
                } \
        }
FOR_ALL_TYPES(IMPLEMENT_FIND_ENTRY)
286
287
288 #define IMPLEMENT_GETENV(TYPE, PRINTF_SPEC, ...) \
289 static int getenv_##TYPE(elpa_index_t index, const char *env_variable, enum NOTIFY_FLAGS notify_flag, int n, TYPE *value, const char *error_string) { \
290 int err; \
291 char *env_value = getenv(env_variable); \
292 if (env_value) { \
293 err = elpa_##TYPE##_string_to_value(TYPE##_entries[n].base.name, env_value, value); \
294 if (err != ELPA_OK) { \
295 fprintf(stderr, "ELPA: Error interpreting environment variable %s with value '%s': %s\n", \
296 TYPE##_entries[n].base.name, env_value, elpa_strerr(err)); \
297 } else {\
298 const char *value_string = NULL; \
299 if (elpa_##TYPE##_value_to_string(TYPE##_entries[n].base.name, *value, &value_string) == ELPA_OK) { \
300 if (!(index->TYPE##_options.notified[n] & notify_flag)) { \
301 if (elpa_index_is_printing_mpi_rank(index)) { \
302 fprintf(stderr, "ELPA: %s '%s' is set to %s due to environment variable %s\n", \
303 error_string, TYPE##_entries[n].base.name, value_string, env_variable); \
304 } \
305 index->TYPE##_options.notified[n] |= notify_flag; \
306 } \
307 } else { \
308 if (elpa_index_is_printing_mpi_rank(index)) { \
309 fprintf(stderr, "ELPA: %s '%s' is set to '" PRINTF_SPEC "' due to environment variable %s\n", \
310 error_string, TYPE##_entries[n].base.name, *value, env_variable);\
311 } \
312 } \
313 return 1; \
314 } \
315 } \
316 return 0; \
317 }
FOR_ALL_TYPES(IMPLEMENT_GETENV)318 FOR_ALL_TYPES(IMPLEMENT_GETENV)
319
320
321 #define IMPLEMENT_GET_FUNCTION(TYPE, PRINTF_SPEC, SCANF_SPEC, ERROR_VALUE) \
322 TYPE elpa_index_get_##TYPE##_value(elpa_index_t index, char *name, int *error) { \
323 TYPE ret; \
324 if (sizeof(TYPE##_entries) == 0) { \
325 return ELPA_ERROR_ENTRY_NOT_FOUND; \
326 } \
327 int n = find_##TYPE##_entry(name); \
328 if (n >= 0) { \
329 int from_env = 0; \
330 if (!TYPE##_entries[n].base.once && !TYPE##_entries[n].base.readonly) { \
331 from_env = getenv_##TYPE(index, TYPE##_entries[n].base.env_force, NOTIFY_ENV_FORCE, n, &ret, "Option"); \
332 } \
333 if (!from_env) { \
334 ret = index->TYPE##_options.values[n]; \
335 } \
336 if (error != NULL) { \
337 *error = ELPA_OK; \
338 } \
339 return ret; \
340 } else { \
341 if (error != NULL) { \
342 *error = ELPA_ERROR_ENTRY_NOT_FOUND; \
343 } \
344 return ERROR_VALUE; \
345 } \
346 }
347 FOR_ALL_TYPES(IMPLEMENT_GET_FUNCTION)
348
349
350 #define IMPLEMENT_LOC_FUNCTION(TYPE, ...) \
351 TYPE* elpa_index_get_##TYPE##_loc(elpa_index_t index, char *name) { \
352 if (sizeof(TYPE##_entries) == 0) { \
353 return NULL; \
354 } \
355 int n = find_##TYPE##_entry(name); \
356 if (n >= 0) { \
357 return &index->TYPE##_options.values[n]; \
358 } else { \
359 return NULL; \
360 } \
361 }
362 FOR_ALL_TYPES(IMPLEMENT_LOC_FUNCTION)
363
364
365 #define IMPLEMENT_SET_FUNCTION(TYPE, PRINTF_SPEC, ...) \
366 int elpa_index_set_##TYPE##_value(elpa_index_t index, char *name, TYPE value) { \
367 if (sizeof(TYPE##_entries) == 0) { \
368 return ELPA_ERROR_ENTRY_NOT_FOUND; \
369 } \
370 int n = find_##TYPE##_entry(name); \
371 if (n < 0) { \
372 return ELPA_ERROR_ENTRY_NOT_FOUND; \
373 }; \
374 if (TYPE##_entries[n].valid != NULL) { \
375 if(!TYPE##_entries[n].valid(index, n, value)) { \
376 return ELPA_ERROR_ENTRY_INVALID_VALUE; \
377 }; \
378 } \
379 if (TYPE##_entries[n].base.once & index->TYPE##_options.is_set[n]) { \
380 return ELPA_ERROR_ENTRY_ALREADY_SET; \
381 } \
382 if (TYPE##_entries[n].base.readonly) { \
383 return ELPA_ERROR_ENTRY_READONLY; \
384 } \
385 index->TYPE##_options.values[n] = value; \
386 index->TYPE##_options.is_set[n] = 1; \
387 return ELPA_OK; \
388 }
389 FOR_ALL_TYPES(IMPLEMENT_SET_FUNCTION)
390
391 #define IMPLEMENT_SET_FROM_LOAD_FUNCTION(TYPE, PRINTF_SPEC, ...) \
392 int elpa_index_set_from_load_##TYPE##_value(elpa_index_t index, char *name, TYPE value, int explicit) { \
393 if (sizeof(TYPE##_entries) == 0) { \
394 return ELPA_ERROR_ENTRY_NOT_FOUND; \
395 } \
396 int n = find_##TYPE##_entry(name); \
397 if (n < 0) { \
398 return ELPA_ERROR_ENTRY_NOT_FOUND; \
399 }; \
400 index->TYPE##_options.values[n] = value; \
401 if(explicit) \
402 index->TYPE##_options.is_set[n] = 1; \
403 return ELPA_OK; \
404 }
405 FOR_ALL_TYPES(IMPLEMENT_SET_FROM_LOAD_FUNCTION)
406
407
408 #define IMPLEMENT_IS_SET_FUNCTION(TYPE, ...) \
409 int elpa_index_##TYPE##_value_is_set(elpa_index_t index, char *name) { \
410 if (sizeof(TYPE##_entries) == 0) { \
411 return ELPA_ERROR_ENTRY_NOT_FOUND; \
412 } \
413 int n = find_##TYPE##_entry(name); \
414 if (n >= 0) { \
415 if (index->TYPE##_options.is_set[n]) { \
416 return 1; \
417 } else { \
418 return 0; \
419 } \
420 } else { \
421 return ELPA_ERROR_ENTRY_NOT_FOUND; \
422 } \
423 }
424 FOR_ALL_TYPES(IMPLEMENT_IS_SET_FUNCTION)
425
426
427 int elpa_index_value_is_set(elpa_index_t index, char *name) {
428 int res = ELPA_ERROR;
429
430 #define RET_IF_SET(TYPE, ...) \
431 res = elpa_index_##TYPE##_value_is_set(index, name); \
432 if (res >= 0) { \
433 return res; \
434 }
435
436 FOR_ALL_TYPES(RET_IF_SET)
437
438 fprintf(stderr, "ELPA Error: Could not find entry '%s'\n", name);
439 return res;
440 }
441
/*
 * Check whether new_value would be accepted for the int option `name`.
 *
 * Returns ELPA_ERROR_ENTRY_NOT_FOUND for an unknown name, ELPA_OK when the
 * entry has no validator or the validator accepts the value, and ELPA_ERROR
 * when the validator rejects it.
 */
int elpa_index_int_is_valid(elpa_index_t index, char *name, int new_value) {
        int pos = find_int_entry(name);

        if (pos < 0) {
                return ELPA_ERROR_ENTRY_NOT_FOUND;
        }
        if (int_entries[pos].valid == NULL) {
                return ELPA_OK;
        }
        if (int_entries[pos].valid(index, pos, new_value)) {
                return ELPA_OK;
        }
        return ELPA_ERROR;
}
453
elpa_int_value_to_string(char * name,int value,const char ** string)454 int elpa_int_value_to_string(char *name, int value, const char **string) {
455 int n = find_int_entry(name);
456 if (n < 0) {
457 return ELPA_ERROR_ENTRY_NOT_FOUND;
458 }
459 if (int_entries[n].to_string == NULL) {
460 return ELPA_ERROR_ENTRY_NO_STRING_REPRESENTATION;
461 }
462 *string = int_entries[n].to_string(value);
463 return ELPA_OK;
464 }
465
466
/*
 * Length of the symbolic name of `value` for int option `name`.
 * Returns 0 when elpa_int_value_to_string() yields no string.
 */
int elpa_int_value_to_strlen(char *name, int value) {
        const char *text = NULL;

        elpa_int_value_to_string(name, value, &text);
        if (text != NULL) {
                return strlen(text);
        }
        return 0;
}
476
477
/*
 * Length of the symbolic name of the value currently stored in `index` for
 * int option `name`; 0 when the option is unknown.
 */
int elpa_index_int_value_to_strlen(elpa_index_t index, char *name) {
        int pos = find_int_entry(name);

        return (pos < 0) ? 0
                         : elpa_int_value_to_strlen(name, index->int_options.values[pos]);
}
485
486
elpa_int_string_to_value(char * name,char * string,int * value)487 int elpa_int_string_to_value(char *name, char *string, int *value) {
488 int n = find_int_entry(name);
489 if (n < 0) {
490 return ELPA_ERROR_ENTRY_NOT_FOUND;
491 }
492
493 if (int_entries[n].to_string == NULL) {
494 int val, ret;
495 ret = sscanf(string, "%d", &val);
496 if (ret == 1) {
497 *value = val;
498 return ELPA_OK;
499 } else {
500 return ELPA_ERROR_ENTRY_INVALID_VALUE;
501 }
502 }
503
504 for (int i = 0; i < int_entries[n].cardinality(NULL); i++) {
505 int candidate = int_entries[n].enumerate(NULL, i);
506 if (strcmp(string, int_entries[n].to_string(candidate)) == 0) {
507 *value = candidate;
508 return ELPA_OK;
509 }
510 }
511 return ELPA_ERROR_ENTRY_INVALID_VALUE;
512 }
513
elpa_double_string_to_value(char * name,char * string,double * value)514 int elpa_double_string_to_value(char *name, char *string, double *value) {
515 double val;
516 int ret = sscanf(string, "%lf", &val);
517 if (ret == 1) {
518 *value = val;
519 return ELPA_OK;
520 } else {
521 /* \todo: remove */
522 fprintf(stderr, "ELPA: DEBUG: Could not parse double value '%s' for option '%s'\n", string, name);
523 return ELPA_ERROR_ENTRY_INVALID_VALUE;
524 }
525 }
526
elpa_double_value_to_string(char * name,double value,const char ** string)527 int elpa_double_value_to_string(char *name, double value, const char **string) {
528 return ELPA_ERROR_ENTRY_NO_STRING_REPRESENTATION;
529 }
530
elpa_option_cardinality(char * name)531 int elpa_option_cardinality(char *name) {
532 int n = find_int_entry(name);
533 if (n < 0 || !int_entries[n].cardinality) {
534 return ELPA_ERROR_ENTRY_NOT_FOUND;
535 }
536 return int_entries[n].cardinality(NULL);
537 }
538
elpa_option_enumerate(char * name,int i)539 int elpa_option_enumerate(char *name, int i) {
540 int n = find_int_entry(name);
541 if (n < 0 || !int_entries[n].enumerate) {
542 return 0;
543 }
544 return int_entries[n].enumerate(NULL, i);
545 }
546
547
548 /* Helper functions for simple int entries */
/* A boolean option has exactly two values; `index` is not consulted. */
static int cardinality_bool(elpa_index_t index) {
        enum { BOOL_VALUE_COUNT = 2 };

        (void) index;
        return BOOL_VALUE_COUNT;
}
552
/* Accepts only 0 and 1; `index` and `n` are not consulted. */
static int valid_bool(elpa_index_t index, int n, int new_value) {
        (void) index;
        (void) n;
        return new_value == 0 || new_value == 1;
}
556
/* Enumeration in which the i-th value is i itself. */
static int enumerate_identity(elpa_index_t index, int i) {
        (void) index;
        return i;
}
560
561 /* Helper functions for specific options */
562
/* X-macro callbacks meant to be expanded inside a switch statement. */

/* Emits `case value: return "name";` -- the identifier is stringified. */
#define NAME_CASE(name, value, ...) \
        case value: \
                return #name;

/* Emits `case value: return 1;` -- every listed value is valid. */
#define VALID_CASE(name, value) \
        case value: \
                return 1;

/* Emits `case value: return available && (other_checks(value));` --
 * other_checks is a macro or function name applied to the value. */
#define VALID_CASE_3(name, value, available, other_checks) \
        case value: \
                return available && (other_checks(value));
574
elpa_solver_name(int solver)575 static const char* elpa_solver_name(int solver) {
576 switch(solver) {
577 ELPA_FOR_ALL_SOLVERS(NAME_CASE)
578 default:
579 return "(Invalid solver)";
580 }
581 }
582
/* Cardinality callback of the "solver" option; `index` is not consulted. */
static int number_of_solvers(elpa_index_t index) {
        (void) index;
        return ELPA_NUMBER_OF_SOLVERS;
}
586
/*
 * Enumerate callback of the "solver" option: maps a dense position i to a
 * solver constant, returning 0 for positions without a matching case.
 *
 * The switch is generated from ELPA_FOR_ALL_SOLVERS. The helper macros
 * defined here stay defined after the function and are reused by the kernel
 * enumerate functions further down.
 */
static int solver_enumerate(elpa_index_t index, int i) {
/* Contributes +1 for an option whose value is smaller than the length of
 * the enclosing array_of_size_value, +0 otherwise. */
#define OPTION_RANK(name, value, ...) \
        +(value >= sizeof(array_of_size_value)/sizeof(int) ? 0 : 1)

/* Deferred-expansion helpers: they let the X-macro iterator be invoked
 * again from inside its own expansion (via INNER_ITERATOR and EVAL). */
#define EMPTY()
#define DEFER1(m) m EMPTY()
#define EVAL(...) __VA_ARGS__

/* For one option: declares an array with `value` elements and emits
 * `case 0 + <sum of OPTION_RANK over all options>: return value;`.
 * The case label is therefore the number of options with a smaller value. */
#define ENUMERATE_CASE(name, value, ...) \
        { const int array_of_size_value[value]; \
        case 0 DEFER1(INNER_ITERATOR)()(OPTION_RANK): \
                return value; }

        switch(i) {
                /* INNER_ITERATOR selects the list that the nested rank sum walks. */
#define INNER_ITERATOR() ELPA_FOR_ALL_SOLVERS
                EVAL(ELPA_FOR_ALL_SOLVERS(ENUMERATE_CASE))
#undef INNER_ITERATOR
                default:
                        return 0;
        }
}
608
609
/* Validator of the "solver" option: 1 for every value listed in
 * ELPA_FOR_ALL_SOLVERS, 0 otherwise. `index` and `n` are not consulted. */
static int solver_is_valid(elpa_index_t index, int n, int new_value) {
        switch (new_value) {
                /* one `case value: return 1;` per known solver */
                ELPA_FOR_ALL_SOLVERS(VALID_CASE)
                default:
                        break;
        }
        return 0;
}
617
/* Cardinality callback of the "real_kernel" option; `index` is not consulted. */
static int number_of_real_kernels(elpa_index_t index) {
        (void) index;
        return ELPA_2STAGE_NUMBER_OF_REAL_KERNELS;
}
621
/*
 * Enumerate callback of the "real_kernel" option: maps a dense position i
 * to a real-kernel constant, returning 0 when no generated case matches.
 * Relies on the EVAL / ENUMERATE_CASE helper macros defined earlier in this
 * file (inside solver_enumerate).
 */
static int real_kernel_enumerate(elpa_index_t index,int i) {
        switch(i) {
                /* INNER_ITERATOR selects the list used for the nested rank sum. */
#define INNER_ITERATOR() ELPA_FOR_ALL_2STAGE_REAL_KERNELS
                EVAL(ELPA_FOR_ALL_2STAGE_REAL_KERNELS(ENUMERATE_CASE))
#undef INNER_ITERATOR
                default:
                        return 0;
        }
}
631
real_kernel_name(int kernel)632 static const char *real_kernel_name(int kernel) {
633 switch(kernel) {
634 ELPA_FOR_ALL_2STAGE_REAL_KERNELS(NAME_CASE)
635 default:
636 return "(Invalid real kernel)";
637 }
638 }
639
640 #define REAL_GPU_KERNEL_ONLY_WHEN_GPU_IS_ACTIVE(kernel_number) \
641 kernel_number == ELPA_2STAGE_REAL_GPU ? gpu_is_active : 1
642
real_kernel_is_valid(elpa_index_t index,int n,int new_value)643 static int real_kernel_is_valid(elpa_index_t index, int n, int new_value) {
644 int solver = elpa_index_get_int_value(index, "solver", NULL);
645 if (solver == ELPA_SOLVER_1STAGE) {
646 return new_value == ELPA_2STAGE_REAL_DEFAULT;
647 }
648 int gpu_is_active = elpa_index_get_int_value(index, "gpu", NULL);
649 switch(new_value) {
650 ELPA_FOR_ALL_2STAGE_REAL_KERNELS(VALID_CASE_3, REAL_GPU_KERNEL_ONLY_WHEN_GPU_IS_ACTIVE)
651 default:
652 return 0;
653 }
654 }
655
/* Cardinality callback of the "complex_kernel" option; `index` is not consulted. */
static int number_of_complex_kernels(elpa_index_t index) {
        (void) index;
        return ELPA_2STAGE_NUMBER_OF_COMPLEX_KERNELS;
}
659
660
/*
 * Enumerate callback of the "complex_kernel" option: maps a dense position
 * i to a complex-kernel constant, returning 0 when no generated case
 * matches. Relies on the EVAL / ENUMERATE_CASE helper macros defined earlier
 * in this file (inside solver_enumerate).
 */
static int complex_kernel_enumerate(elpa_index_t index,int i) {
        switch(i) {
                /* INNER_ITERATOR selects the list used for the nested rank sum. */
#define INNER_ITERATOR() ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS
                EVAL(ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS(ENUMERATE_CASE))
#undef INNER_ITERATOR
                default:
                        return 0;
        }
}
670
complex_kernel_name(int kernel)671 static const char *complex_kernel_name(int kernel) {
672 switch(kernel) {
673 ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS(NAME_CASE)
674 default:
675 return "(Invalid complex kernel)";
676 }
677 }
678
679 #define COMPLEX_GPU_KERNEL_ONLY_WHEN_GPU_IS_ACTIVE(kernel_number) \
680 kernel_number == ELPA_2STAGE_COMPLEX_GPU ? gpu_is_active : 1
681
complex_kernel_is_valid(elpa_index_t index,int n,int new_value)682 static int complex_kernel_is_valid(elpa_index_t index, int n, int new_value) {
683 int solver = elpa_index_get_int_value(index, "solver", NULL);
684 if (solver == ELPA_SOLVER_1STAGE) {
685 return new_value == ELPA_2STAGE_COMPLEX_DEFAULT;
686 }
687 int gpu_is_active = elpa_index_get_int_value(index, "gpu", NULL);
688 switch(new_value) {
689 ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS(VALID_CASE_3, COMPLEX_GPU_KERNEL_ONLY_WHEN_GPU_IS_ACTIVE)
690 default:
691 return 0;
692 }
693 }
694
elpa_autotune_level_name(int level)695 static const char* elpa_autotune_level_name(int level) {
696 switch(level) {
697 ELPA_FOR_ALL_AUTOTUNE_LEVELS(NAME_CASE)
698 default:
699 return "(Invalid autotune level)";
700 }
701 }
702
elpa_autotune_domain_name(int domain)703 static const char* elpa_autotune_domain_name(int domain) {
704 switch(domain) {
705 ELPA_FOR_ALL_AUTOTUNE_DOMAINS(NAME_CASE)
706 default:
707 return "(Invalid autotune domain)";
708 }
709 }
710
/* Validator of "na": the matrix size must be strictly positive. */
static int na_is_valid(elpa_index_t index, int n, int new_value) {
        (void) index;
        (void) n;
        return 0 < new_value;
}
714
/*
 * Validator of "nev": requires "na" to have been set and accepts
 * 0 <= new_value <= na.
 *
 * The is-set query returns 1, 0 or a negative error code; only an explicit
 * 1 counts as "set" (same test as bw_is_valid), so an error code is not
 * mistaken for a set value.
 */
static int nev_is_valid(elpa_index_t index, int n, int new_value) {
        if (elpa_index_int_value_is_set(index, "na") != 1) {
                return 0;
        }
        return 0 <= new_value && new_value <= elpa_index_get_int_value(index, "na", NULL);
}
721
/* Generic validator: accepts strictly positive values only. */
static int is_positive(elpa_index_t index, int n, int new_value) {
        (void) index;
        (void) n;
        return 0 < new_value;
}
725
/*
 * Validator of "bandwidth": requires "na" to have been set and accepts
 * 0 <= new_value < na.
 */
static int bw_is_valid(elpa_index_t index, int n, int new_value) {
        if (elpa_index_int_value_is_set(index, "na") != 1) {
                return 0;
        }

        const int matrix_size = elpa_index_get_int_value(index, "na", NULL);
        if (new_value < 0) {
                return 0;
        }
        return new_value < matrix_size;
}
735
/* Validator of "gpu": only 0 and 1 are accepted. */
static int gpu_is_valid(elpa_index_t index, int n, int new_value) {
        (void) index;
        (void) n;
        return (0 <= new_value) && (new_value <= 1);
}
739
/* Cardinality callback of "blocking_in_band_to_full": ten candidate values. */
static int band_to_full_cardinality(elpa_index_t index) {
        enum { BLOCKING_CANDIDATES = 10 };

        (void) index;
        return BLOCKING_CANDIDATES;
}
/* Enumerate callback of "blocking_in_band_to_full": position i maps to i + 1. */
static int band_to_full_enumerate(elpa_index_t index, int i) {
        (void) index;
        return 1 + i;
}
746
// TODO: shouldn't this option be valid only for ELPA2?
/* Validator of "blocking_in_band_to_full": accepts 1..10 inclusive. */
static int band_to_full_is_valid(elpa_index_t index, int n, int new_value) {
        const int max_block = 10;

        (void) index;
        (void) n;
        if (new_value < 1) {
                return 0;
        }
        return new_value <= max_block;
}
752
/*
 * Number of candidate stripe widths offered by
 * stripewidth_real_enumerate() (positions 0..16, i.e. 17 values).
 */
static int stripewidth_real_cardinality(elpa_index_t index) {
        enum { CANDIDATE_COUNT = 17 };

        return CANDIDATE_COUNT;
}
756
/*
 * Number of candidate stripe widths offered by
 * stripewidth_complex_enumerate() (positions 0..16, i.e. 17 values).
 */
static int stripewidth_complex_cardinality(elpa_index_t index) {
        enum { CANDIDATE_COUNT = 17 };

        return CANDIDATE_COUNT;
}
760
/*
 * Map the zero-based enumeration position i to a candidate stripe width.
 *
 * Positions 0..16 yield 32, 36, ..., 96 (step 4). Any other position
 * returns 0; previously control fell off the end of the function, which
 * is undefined behaviour once the caller uses the result. 0 lies outside
 * the range accepted by stripewidth_real_is_valid(), so such a candidate
 * is rejected rather than silently used.
 *
 * The index argument is not consulted.
 */
static int stripewidth_real_enumerate(elpa_index_t index, int i) {
        static const int widths[] = {
                32, 36, 40, 44, 48, 52, 56, 60, 64,
                68, 72, 76, 80, 84, 88, 92, 96
        };
        const int count = (int) (sizeof(widths) / sizeof(widths[0]));

        if (i < 0 || i >= count) {
                return 0;
        }
        return widths[i];
}
799
/*
 * Map the zero-based enumeration position i to a candidate stripe width.
 *
 * Positions 0..16 yield 48, 56, ..., 176 (step 8). Any other position
 * returns 0; previously control fell off the end of the function, which
 * is undefined behaviour once the caller uses the result. 0 lies outside
 * the range accepted by stripewidth_complex_is_valid(), so such a
 * candidate is rejected rather than silently used.
 *
 * The index argument is not consulted.
 */
static int stripewidth_complex_enumerate(elpa_index_t index, int i) {
        static const int widths[] = {
                48, 56, 64, 72, 80, 88, 96, 104, 112,
                120, 128, 136, 144, 152, 160, 168, 176
        };
        const int count = (int) (sizeof(widths) / sizeof(widths[0]));

        if (i < 0 || i >= count) {
                return 0;
        }
        return widths[i];
}
838
/*
 * Validity predicate: accepts any value in the closed range [32, 96].
 * The index and n arguments are not consulted.
 */
static int stripewidth_real_is_valid(elpa_index_t index, int n, int new_value) {
        if (new_value < 32) {
                return 0;
        }
        return new_value <= 96;
}
842
/*
 * Validity predicate: accepts any value in the closed range [48, 176].
 * The index and n arguments are not consulted.
 */
static int stripewidth_complex_is_valid(elpa_index_t index, int n, int new_value) {
        if (new_value < 48) {
                return 0;
        }
        return new_value <= 176;
}
846
/*
 * Number of candidate thread counts.
 *
 * With WITH_OPENMP defined, the file-level max_threads_glob is filled from
 * omp_get_max_threads() the first time it is needed (tracked through
 * set_max_threads_glob); without it the value is forced to 1 on every
 * call. The current max_threads_glob is returned.
 */
static int omp_threads_cardinality(elpa_index_t index) {
#ifdef WITH_OPENMP
        if (!set_max_threads_glob) {
                max_threads_glob = omp_get_max_threads();
                set_max_threads_glob = 1;
        }
#else
        max_threads_glob = 1;
        set_max_threads_glob = 1;
#endif
        return max_threads_glob;
}
861
/*
 * Map the zero-based enumeration position i to the thread count i + 1.
 * The index argument is not consulted.
 */
static int omp_threads_enumerate(elpa_index_t index, int i) {
        const int thread_count = 1 + i;

        return thread_count;
}
865
/*
 * Validity predicate: accepts new_value in [1, max_threads_glob].
 *
 * max_threads_glob is initialised exactly as in omp_threads_cardinality():
 * from omp_get_max_threads() on first use when WITH_OPENMP is defined,
 * otherwise forced to 1.
 */
static int omp_threads_is_valid(elpa_index_t index, int n, int new_value) {
#ifdef WITH_OPENMP
        if (!set_max_threads_glob) {
                max_threads_glob = omp_get_max_threads();
                set_max_threads_glob = 1;
        }
#else
        max_threads_glob = 1;
        set_max_threads_glob = 1;
#endif
        if (new_value < 1) {
                return 0;
        }
        return new_value <= max_threads_glob;
}
880
881
/*
 * Validity predicate for a switch that depends on the "gpu" option:
 * 0 is always accepted, 1 is accepted only while "gpu" equals 1.
 */
static int valid_with_gpu(elpa_index_t index, int n, int new_value) {
        const int gpu_enabled = (elpa_index_get_int_value(index, "gpu", NULL) == 1);

        if (new_value == 0) {
                return 1;
        }
        return gpu_enabled && (new_value == 1);
}
891
/*
 * Validity predicate for a switch that depends on both "solver" and "gpu":
 * 0 is always accepted, 1 is accepted only while "solver" equals
 * ELPA_SOLVER_1STAGE and "gpu" equals 1.
 */
static int valid_with_gpu_elpa1(elpa_index_t index, int n, int new_value) {
        const int solver = elpa_index_get_int_value(index, "solver", NULL);
        const int gpu_is_active = elpa_index_get_int_value(index, "gpu", NULL);
        const int one_allowed = (solver == ELPA_SOLVER_1STAGE) && (gpu_is_active == 1);

        if (new_value == 0) {
                return 1;
        }
        return one_allowed && (new_value == 1);
}
902
/*
 * Validity predicate for a switch that depends on both "solver" and "gpu":
 * 0 is always accepted, 1 is accepted only while "solver" equals
 * ELPA_SOLVER_2STAGE and "gpu" equals 1.
 */
static int valid_with_gpu_elpa2(elpa_index_t index, int n, int new_value) {
        const int solver = elpa_index_get_int_value(index, "solver", NULL);
        const int gpu_is_active = elpa_index_get_int_value(index, "gpu", NULL);
        const int one_allowed = (solver == ELPA_SOLVER_2STAGE) && (gpu_is_active == 1);

        if (new_value == 0) {
                return 1;
        }
        return one_allowed && (new_value == 1);
}
913
/*
 * Number of candidate values offered by max_stored_rows_enumerate()
 * (positions 0..7, i.e. 8 values). The index argument is not consulted.
 */
static int max_stored_rows_cardinality(elpa_index_t index) {
        enum { CANDIDATE_COUNT = 8 };

        return CANDIDATE_COUNT;
}
917
/*
 * Map the zero-based enumeration position i to a candidate value.
 *
 * Positions 0..7 yield 15, 31, ..., 127 (step 16). Any other position
 * returns 0; previously control fell off the end of the function, which
 * is undefined behaviour once the caller uses the result. 0 is rejected
 * by max_stored_rows_is_valid(), so such a candidate is never applied.
 *
 * The index argument is not consulted.
 */
static int max_stored_rows_enumerate(elpa_index_t index, int i) {
        static const int row_counts[] = { 15, 31, 47, 63, 79, 95, 111, 127 };
        const int count = (int) (sizeof(row_counts) / sizeof(row_counts[0]));

        if (i < 0 || i >= count) {
                return 0;
        }
        return row_counts[i];
}
938
/*
 * Validity predicate that depends on the "solver" option: while the solver
 * is ELPA_SOLVER_2STAGE only the value 15 is accepted, otherwise any value
 * in the closed range [15, 127] is accepted.
 */
static int max_stored_rows_is_valid(elpa_index_t index, int n, int new_value) {
        if (elpa_index_get_int_value(index, "solver", NULL) == ELPA_SOLVER_2STAGE) {
                return new_value == 15;
        }
        if (new_value < 15) {
                return 0;
        }
        return new_value <= 127;
}
947
948
949 // TODO: this shoudl definitely be improved (too many options to test in autotuning)
950 static const int TILE_SIZE_STEP = 128;
951
min_tile_size_cardinality(elpa_index_t index)952 static int min_tile_size_cardinality(elpa_index_t index) {
953 int na;
954 if(index == NULL)
955 return 0;
956 if (elpa_index_int_value_is_set(index, "na") != 1) {
957 return 0;
958 }
959 na = elpa_index_get_int_value(index, "na", NULL);
960 return na/TILE_SIZE_STEP;
961 }
962
/*
 * Map the zero-based enumeration position i to the (i + 1)-th multiple of
 * TILE_SIZE_STEP. The index argument is not consulted.
 */
static int min_tile_size_enumerate(elpa_index_t index, int i) {
        const int multiple = i + 1;

        return multiple * TILE_SIZE_STEP;
}
966
/*
 * Validity predicate: accepts every value that leaves no remainder when
 * divided by TILE_SIZE_STEP (this includes 0). The index and n arguments
 * are not consulted.
 */
static int min_tile_size_is_valid(elpa_index_t index, int n, int new_value) {
        const int remainder = new_value % TILE_SIZE_STEP;

        return (remainder != 0) ? 0 : 1;
}
970
/*
 * Number of candidate intermediate bandwidths: na / nblk (integer
 * division). Returns 0 when index is NULL or when either "na" or "nblk"
 * has not been set.
 */
static int intermediate_bandwidth_cardinality(elpa_index_t index) {
        int na_value;
        int block_size;

        if (index == NULL) {
                return 0;
        }

        if (elpa_index_int_value_is_set(index, "na") != 1) {
                return 0;
        }
        na_value = elpa_index_get_int_value(index, "na", NULL);

        if (elpa_index_int_value_is_set(index, "nblk") != 1) {
                return 0;
        }
        block_size = elpa_index_get_int_value(index, "nblk", NULL);

        return na_value / block_size;
}
987
/*
 * Map the zero-based enumeration position i to the (i + 1)-th multiple of
 * "nblk". Returns 0 when index is NULL or "nblk" has not been set.
 */
static int intermediate_bandwidth_enumerate(elpa_index_t index, int i) {
        if (index == NULL || elpa_index_int_value_is_set(index, "nblk") != 1) {
                return 0;
        }

        const int block_size = elpa_index_get_int_value(index, "nblk", NULL);
        return (i + 1) * block_size;
}
999
/*
 * Validity predicate for the intermediate bandwidth.
 *
 * Requires both "na" and "nblk" to be set (returns 0 otherwise).
 *  - When "solver" is ELPA_SOLVER_1STAGE the only accepted value is nblk.
 *  - Otherwise the value must satisfy 1 < new_value <= na and be a
 *    multiple of nblk; a message is written to stderr when it is not a
 *    multiple.
 *
 * Returns 1 if the value is accepted, 0 otherwise. The accepting path of
 * the non-1-stage branch used to fall off the end of the function without
 * a return statement; it now returns 1 explicitly.
 */
static int intermediate_bandwidth_is_valid(elpa_index_t index, int n, int new_value) {
        int na, nblk, solver;

        if (elpa_index_int_value_is_set(index, "na") != 1) {
                return 0;
        }
        na = elpa_index_get_int_value(index, "na", NULL);

        if (elpa_index_int_value_is_set(index, "nblk") != 1) {
                return 0;
        }
        nblk = elpa_index_get_int_value(index, "nblk", NULL);

        solver = elpa_index_get_int_value(index, "solver", NULL);
        if (solver == ELPA_SOLVER_1STAGE) {
                return new_value == nblk;
        }

        if ((new_value <= 1) || (new_value > na)) {
                return 0;
        }
        /* a zero block size can never divide the value; also avoids % 0 */
        if (nblk == 0) {
                return 0;
        }
        if (new_value % nblk != 0) {
                fprintf(stderr, "intermediate bandwidth has to be multiple of nblk\n");
                return 0;
        }
        return 1;
}
1024
/*
 * Number of candidate values offered by cannon_buffer_size_enumerate()
 * (always 2). The index argument is not consulted.
 */
static int cannon_buffer_size_cardinality(elpa_index_t index) {
        enum { CANDIDATE_COUNT = 2 };

        return CANDIDATE_COUNT;
}
1028
/*
 * Map the enumeration position i to a candidate buffer size: position 0
 * yields 0, every other position yields num_process_rows - 1.
 * Returns 0 when index is NULL or "num_process_rows" has not been set.
 */
static int cannon_buffer_size_enumerate(elpa_index_t index, int i) {
        if (index == NULL || elpa_index_int_value_is_set(index, "num_process_rows") != 1) {
                return 0;
        }

        const int process_rows = elpa_index_get_int_value(index, "num_process_rows", NULL);

        // TODO: 0 is both error code and legal value?
        return (i == 0) ? 0 : process_rows - 1;
}
1044
/*
 * Validity predicate: accepts new_value when 0 <= new_value <
 * num_process_rows. Returns 0 when index is NULL or "num_process_rows"
 * has not been set.
 */
static int cannon_buffer_size_is_valid(elpa_index_t index, int n, int new_value) {
        if (index == NULL || elpa_index_int_value_is_set(index, "num_process_rows") != 1) {
                return 0;
        }

        const int process_rows = elpa_index_get_int_value(index, "num_process_rows", NULL);

        if (new_value < 0) {
                return 0;
        }
        return new_value < process_rows;
}
1056
elpa_index_instance()1057 elpa_index_t elpa_index_instance() {
1058 elpa_index_t index = (elpa_index_t) calloc(1, sizeof(struct elpa_index_struct));
1059
1060 #define ALLOCATE(TYPE, PRINTF_SPEC, ...) \
1061 index->TYPE##_options.values = (TYPE*) calloc(nelements(TYPE##_entries), sizeof(TYPE)); \
1062 index->TYPE##_options.is_set = (int*) calloc(nelements(TYPE##_entries), sizeof(int)); \
1063 index->TYPE##_options.notified = (int*) calloc(nelements(TYPE##_entries), sizeof(int)); \
1064 for (int n = 0; n < nelements(TYPE##_entries); n++) { \
1065 TYPE default_value = TYPE##_entries[n].default_value; \
1066 if (!TYPE##_entries[n].base.once && !TYPE##_entries[n].base.readonly) { \
1067 getenv_##TYPE(index, TYPE##_entries[n].base.env_default, NOTIFY_ENV_DEFAULT, n, &default_value, "Default for option"); \
1068 } \
1069 index->TYPE##_options.values[n] = default_value; \
1070 }
1071
1072 FOR_ALL_TYPES(ALLOCATE)
1073
1074 return index;
1075 }
1076
/*
 * Returns 1 when integer option i takes part in autotuning for the given
 * level and domain (non-zero entry level that does not exceed
 * autotune_level, and a domain bit shared with autotune_domain) AND its
 * is_set flag in the index is non-zero. Returns 0 otherwise.
 */
static int is_tunable_but_overriden(elpa_index_t index, int i, int autotune_level, int autotune_domain) {
        if (int_entries[i].autotune_level == 0) {
                return 0;
        }
        if (int_entries[i].autotune_level > autotune_level) {
                return 0;
        }
        if (!(int_entries[i].autotune_domain & autotune_domain)) {
                return 0;
        }
        return index->int_options.is_set[i] != 0;
}
1083
/*
 * Returns 1 when integer option i takes part in autotuning for the given
 * level and domain (non-zero entry level that does not exceed
 * autotune_level, and a domain bit shared with autotune_domain) AND its
 * is_set flag in the index is zero. Returns 0 otherwise.
 */
static int is_tunable(elpa_index_t index, int i, int autotune_level, int autotune_domain) {
        if (int_entries[i].autotune_level == 0) {
                return 0;
        }
        if (int_entries[i].autotune_level > autotune_level) {
                return 0;
        }
        if (!(int_entries[i].autotune_domain & autotune_domain)) {
                return 0;
        }
        return !index->int_options.is_set[i];
}
1090
/*
 * Size of the autotuning search space: the product of cardinality(index)
 * over all integer options for which is_tunable() holds at the given
 * level and domain. Returns 1 when no option is tunable.
 */
int elpa_index_autotune_cardinality(elpa_index_t index, int autotune_level, int autotune_domain) {
        int combinations = 1;

        for (int entry = 0; entry < nelements(int_entries); entry++) {
                if (!is_tunable(index, entry, autotune_level, autotune_domain)) {
                        continue;
                }
                combinations *= int_entries[entry].cardinality(index);
        }
        return combinations;
}
1101
/*
 * Format integer option i of the index into buff as one text line:
 *   "<name> = <value> -> <to_string(value)>\n"  when the entry has a
 *                                                to_string callback,
 *   "<name> = <value>\n"                         otherwise.
 *
 * buff must be large enough for the formatted line; its size is not known
 * here.
 *
 * The line is now produced by a single sprintf call. The previous version
 * passed buff as both destination and "%s" argument, which is undefined
 * behaviour because the objects overlap.
 */
void elpa_index_print_int_parameter(elpa_index_t index, char* buff, int i)
{
        const int value = index->int_options.values[i];
        const char *name = int_entries[i].base.name;

        if (int_entries[i].to_string) {
                sprintf(buff, "%s = %d -> %s\n", name, value, int_entries[i].to_string(value));
        } else {
                sprintf(buff, "%s = %d\n", name, value);
        }
}
1112
/*
 * Apply the parameter combination number `current` to all tunable options.
 *
 * `current` is decoded as a mixed-radix number: for each tunable integer
 * option (in table order) the digit current % cardinality selects the
 * candidate via enumerate(), and current is then divided by that
 * cardinality. Each candidate is checked with the option's valid()
 * callback before it is stored.
 *
 * Returns 1 if every tunable option could be set, 0 as soon as one
 * candidate is rejected. Options processed before the rejected one keep
 * their newly assigned values. 0 is also returned for a negative
 * `current` or when an option reports a non-positive cardinality; both
 * previously led to a negative enumeration position or a division by
 * zero.
 *
 * When the "debug" option is 1, the printing rank reports the chosen
 * combination on stderr.
 */
int elpa_index_set_autotune_parameters(elpa_index_t index, int autotune_level, int autotune_domain, int current) {
        int remaining = current;
        int debug = elpa_index_get_int_value(index, "debug", NULL);

        if (current < 0) {
                return 0;
        }

        for (int i = 0; i < nelements(int_entries); i++) {
                if (!is_tunable(index, i, autotune_level, autotune_domain)) {
                        continue;
                }

                /* queried once per option and reused for the digit and the division */
                const int option_cardinality = int_entries[i].cardinality(index);
                if (option_cardinality <= 0) {
                        return 0;
                }

                const int value = int_entries[i].enumerate(index, remaining % option_cardinality);

                /* Try to set option i to that value */
                if (!int_entries[i].valid(index, i, value)) {
                        return 0;
                }
                index->int_options.values[i] = value;
                remaining /= option_cardinality;
        }

        if (debug == 1 && elpa_index_is_printing_mpi_rank(index)) {
                fprintf(stderr, "\n*** AUTOTUNING: setting a new combination of parameters, idx %d ***\n", current);
                elpa_index_print_autotune_parameters(index, autotune_level, autotune_domain);
                fprintf(stderr, "***\n\n");
        }

        /* Could set all values */
        return 1;
}
1142
/*
 * Write one line per tunable integer option (as formatted by
 * elpa_index_print_int_parameter()) to stderr. Nothing is printed unless
 * elpa_index_is_printing_mpi_rank() holds. Always returns 1.
 */
int elpa_index_print_autotune_parameters(elpa_index_t index, int autotune_level, int autotune_domain) {
        char text[100];

        if (!elpa_index_is_printing_mpi_rank(index)) {
                return 1;
        }

        for (int entry = 0; entry < nelements(int_entries); entry++) {
                if (!is_tunable(index, entry, autotune_level, autotune_domain)) {
                        continue;
                }
                elpa_index_print_int_parameter(index, text, entry);
                fprintf(stderr, "%s", text);
        }
        return 1;
}
1155
/*
 * Write the current autotuning state as text.
 *
 * A temporary index is filled with the parameter values that belong to the
 * best combination so far (min_loc, decoded the same way as in
 * elpa_index_set_autotune_parameters(), without validity checks). On the
 * printing rank the state is then written to file_name, or to stdout when
 * file_name is NULL or empty. When min_loc is negative no parameter
 * values are printed.
 *
 * Returns 1 on success and 0 if the temporary index cannot be created or
 * the output file cannot be opened. The temporary index is now released
 * on the file-open failure path as well (it used to leak), and options
 * reporting a non-positive cardinality are skipped while decoding instead
 * of causing a division by zero.
 */
int elpa_index_print_autotune_state(elpa_index_t index, int autotune_level, int autotune_domain, int min_loc,
                                    double min_val, int current, int cardinality, char* file_name) {
        char buff[100];
        elpa_index_t index_best;
        int remaining = min_loc;
        const int have_best = (min_loc > -1);
        int status = 1;
        FILE *f;

        // get index with the currently best parameters
        index_best = elpa_index_instance();
        if (index_best == NULL) {
                fprintf(stderr, "Cannot allocate index in elpa_index_print_autotune_state\n");
                return 0;
        }

        if (have_best) {
                for (int i = 0; i < nelements(int_entries); i++) {
                        if (!is_tunable(index, i, autotune_level, autotune_domain)) {
                                continue;
                        }
                        const int option_cardinality = int_entries[i].cardinality(index);
                        if (option_cardinality <= 0) {
                                continue;
                        }
                        /* we are setting the value for output only, we do not need to check consistency */
                        index_best->int_options.values[i] = int_entries[i].enumerate(index, remaining % option_cardinality);
                        remaining /= option_cardinality;
                }
        }

        if (elpa_index_is_printing_mpi_rank(index)) {
                const int output_to_file = (file_name != NULL && strlen(file_name) > 0);

                f = output_to_file ? fopen(file_name, "w") : stdout;
                if (f == NULL) {
                        fprintf(stderr, "Cannot open file %s in elpa_index_print_autotune_state\n", file_name);
                        status = 0;
                } else {
                        if (!output_to_file) {
                                fprintf(f, "\n");
                        }
                        fprintf(f, "*** AUTOTUNING STATE ***\n");
                        fprintf(f, "** This is the state of the autotuning object\n");
                        fprintf(f, "autotune_level = %d -> %s\n", autotune_level, elpa_autotune_level_name(autotune_level));
                        fprintf(f, "autotune_domain = %d -> %s\n", autotune_domain, elpa_autotune_domain_name(autotune_domain));
                        fprintf(f, "autotune_cardinality = %d\n", cardinality);
                        fprintf(f, "current_idx = %d\n", current);
                        fprintf(f, "best_idx = %d\n", min_loc);
                        fprintf(f, "best_time = %g\n", min_val);
                        if (have_best) {
                                fprintf(f, "** The following parameters are autotuned with so far the best values\n");
                                for (int i = 0; i < nelements(int_entries); i++) {
                                        if (is_tunable(index, i, autotune_level, autotune_domain)) {
                                                elpa_index_print_int_parameter(index_best, buff, i);
                                                fprintf(f, "%s", buff);
                                        }
                                }
                                fprintf(f, "** The following parameters would be autotuned on the selected autotuning level, but were overridden by the set() method\n");
                                for (int i = 0; i < nelements(int_entries); i++) {
                                        if (is_tunable_but_overriden(index, i, autotune_level, autotune_domain)) {
                                                elpa_index_print_int_parameter(index, buff, i);
                                                fprintf(f, "%s", buff);
                                        }
                                }
                        } else {
                                fprintf(f, "** No output after first step\n");
                        }
                        fprintf(f, "*** END OF AUTOTUNING STATE ***\n");

                        if (output_to_file) {
                                fclose(f);
                        }
                }
        }

        elpa_index_free(index_best);

        return status;
}
1227
1228 const int LEN =1000;
1229
1230 #define IMPLEMENT_LOAD_LINE(TYPE, PRINTF_SPEC, SCANF_SPEC, ...) \
1231 static int load_##TYPE##_line(FILE* f, const char* expected, TYPE* val) { \
1232 char line[LEN], s[LEN]; \
1233 int error = 0; \
1234 TYPE n; \
1235 if(fgets(line, LEN, f) == NULL){ \
1236 fprintf(stderr, "Loading autotuning state error: line is not there\n"); \
1237 error = 1; \
1238 } else{ \
1239 sscanf(line, "%s = " SCANF_SPEC "\n", s, &n); \
1240 if(strcmp(s, expected) != 0){ \
1241 fprintf(stderr, "Loading autotuning state error: expected %s, got %s\n", expected, s); \
1242 error = 1;\
1243 } else{ \
1244 *val = n; \
1245 } \
1246 } \
1247 if(error){ \
1248 fprintf(stderr, "Autotuning state file corrupted\n"); \
1249 return 0; \
1250 } \
1251 return 1; \
1252 }
FOR_ALL_TYPES(IMPLEMENT_LOAD_LINE)1253 FOR_ALL_TYPES(IMPLEMENT_LOAD_LINE)
1254
1255 int elpa_index_load_autotune_state(elpa_index_t index, int* autotune_level, int* autotune_domain, int* min_loc,
1256 double* min_val, int* current, int* cardinality, char* file_name) {
1257 char line[LEN];
1258 FILE *f;
1259
1260 //TODO: should be broadcasted, instead of read on all ranks
1261 //if(elpa_index_is_printing_mpi_rank(index)){
1262 f = fopen(file_name, "r");
1263
1264 if (f == NULL) {
1265 fprintf(stderr, "Cannont open file %s\n", file_name);
1266 return(0);
1267 }
1268
1269
1270 if(fgets(line, LEN, f) == NULL) return 0;
1271 if(fgets(line, LEN, f) == NULL) return 0;
1272 if(! load_int_line(f, "autotune_level", autotune_level)) return 0;
1273 if(! load_int_line(f, "autotune_domain", autotune_domain)) return 0;
1274 if(! load_int_line(f, "autotune_cardinality", cardinality)) return 0;
1275 if(! load_int_line(f, "current_idx", current)) return 0;
1276 if(! load_int_line(f, "best_idx", min_loc)) return 0;
1277 if(! load_double_line(f, "best_time", min_val)) return 0;
1278 fclose(f);
1279 // }
1280
1281 return 1;
1282 }
1283
1284 const char STRUCTURE_PARAMETERS[] = "* Parameters describing structure of the computation:\n";
1285 const char EXPLICIT_PARAMETERS[] = "* Parameters explicitly set by the user:\n";
1286 const char DEFAULT_PARAMETERS[] = "* Parameters with default or environment value:\n";
1287
elpa_index_print_settings(elpa_index_t index,char * file_name)1288 int elpa_index_print_settings(elpa_index_t index, char *file_name) {
1289 const int LEN =10000;
1290 char out_structure[LEN], out_set[LEN], out_defaults[LEN], out_nowhere[LEN], buff[100];
1291 char (*out)[LEN];
1292 FILE *f;
1293
1294 sprintf(out_structure, "%s", STRUCTURE_PARAMETERS);
1295 sprintf(out_set, "%s", EXPLICIT_PARAMETERS);
1296 sprintf(out_defaults, "%s", DEFAULT_PARAMETERS);
1297 sprintf(out_nowhere, "Not to be printed:\n");
1298 if(elpa_index_is_printing_mpi_rank(index)){
1299 for (int i = 0; i < nelements(int_entries); i++) {
1300 if(int_entries[i].base.print_flag == PRINT_STRUCTURE) {
1301 out = &out_structure;
1302 } else if(int_entries[i].base.print_flag == PRINT_YES && index->int_options.is_set[i]) {
1303 out = &out_set;
1304 } else if(int_entries[i].base.print_flag == PRINT_YES && !index->int_options.is_set[i]) {
1305 out = &out_defaults;
1306 } else
1307 out = &out_nowhere;
1308 elpa_index_print_int_parameter(index, buff, i);
1309 sprintf(*out, "%s%s", *out, buff);
1310 }
1311 int output_to_file = (strlen(file_name) > 0);
1312 if(output_to_file) {
1313 f = fopen(file_name, "w");
1314 if(f == NULL){
1315 fprintf(stderr, "Cannot open file %s in elpa_index_print_settings\n", file_name);
1316 return 0;
1317 }
1318 }
1319 else {
1320 f = stdout;
1321 }
1322
1323 fprintf(f, "*** ELPA STATE ***\n");
1324 fprintf(f, "%s%s%s", out_structure, out_set, out_defaults);
1325 fprintf(f, "*** END OF ELPA STATE ***\n");
1326 if(output_to_file)
1327 fclose(f);
1328 }
1329
1330 return 1;
1331 }
1332
/*
 * Read integer options from a settings file.
 *
 * Lines are ignored until the EXPLICIT_PARAMETERS or DEFAULT_PARAMETERS
 * header is seen. From then on every line that is neither empty nor
 * starts with '*' is parsed as "<name> = <value>" and handed to
 * elpa_index_set_from_load_int_value(), together with a flag telling
 * whether the line belongs to the explicit section (1) or the default
 * section (0).
 *
 * Returns 0 if the file cannot be opened, 1 otherwise. Lines that cannot
 * be parsed are reported on stderr and skipped; previously the sscanf
 * result was ignored and uninitialised name/value data could be passed
 * on.
 */
int elpa_index_load_settings(elpa_index_t index, char *file_name) {
        enum { LINE_LEN = 1000 };
        char line[LINE_LEN], s[LINE_LEN];
        int n;
        FILE *f;
        int skip = 1;
        int is_explicit = 0;

        //TODO: should be broadcasted, instead of read on all ranks
        f = fopen(file_name, "r");

        if (f == NULL) {
                fprintf(stderr, "Cannont open file %s\n", file_name);
                return 0;
        }

        while (fgets(line, LINE_LEN, f) != NULL) {
                if (strcmp(line, EXPLICIT_PARAMETERS) == 0) {
                        skip = 0;
                        is_explicit = 1;
                }
                if (strcmp(line, DEFAULT_PARAMETERS) == 0) {
                        skip = 0;
                        is_explicit = 0;
                }

                if (skip || line[0] == '\n' || line[0] == '*') {
                        continue;
                }
                if (sscanf(line, "%s = %d\n", s, &n) != 2) {
                        fprintf(stderr, "Skipping unparsable line in file %s\n", file_name);
                        continue;
                }
                /* The result is ignored, as before. NOTE(review): consider reporting failures to the caller. */
                (void) elpa_index_set_from_load_int_value(index, s, n, is_explicit);
        }
        fclose(f);

        return 1;
}
1374
1375
/*
 * Decide whether the calling process should produce output.
 *
 * When the "process_id" option is set, returns 1 only if it equals 0.
 * When it is not set, a warning is written to stdout and 1 is returned.
 */
int elpa_index_is_printing_mpi_rank(elpa_index_t index)
{
        if (!elpa_index_int_value_is_set(index, "process_id")) {
                printf("Warning: process_id not set, printing on all MPI ranks. This can happen with legacy API.");
                return 1;
        }

        const int rank = elpa_index_get_int_value(index, "process_id", NULL);
        return rank == 0;
}
1386