1 /* -*- Mode: C; c-basic-offset:4 ; -*- */
2 /*
3 * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
4 * University Research and Technology
5 * Corporation. All rights reserved.
6 * Copyright (c) 2004-2019 The University of Tennessee and The University
7 * of Tennessee Research Foundation. All rights
8 * reserved.
9 * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart,
10 * University of Stuttgart. All rights reserved.
11 * Copyright (c) 2004-2006 The Regents of the University of California.
12 * All rights reserved.
13 * Copyright (c) 2009 Oak Ridge National Labs. All rights reserved.
14 * Copyright (c) 2011 NVIDIA Corporation. All rights reserved.
15 * Copyright (c) 2013-2018 Research Organization for Information Science
16 * and Technology (RIST). All rights reserved.
17 * Copyright (c) 2017 Intel, Inc. All rights reserved
18 * $COPYRIGHT$
19 *
20 * Additional copyrights may follow
21 *
22 * $HEADER$
23 */
24
25 #include "opal_config.h"
26
27 #include <stddef.h>
28 #include <stdio.h>
29 #include <stdint.h>
30
31 #include "opal/prefetch.h"
32 #include "opal/util/arch.h"
33 #include "opal/util/output.h"
34
35 #include "opal/datatype/opal_datatype_internal.h"
36 #include "opal/datatype/opal_datatype.h"
37 #include "opal/datatype/opal_convertor.h"
38 #include "opal/datatype/opal_datatype_checksum.h"
39 #include "opal/datatype/opal_datatype_prototypes.h"
40 #include "opal/datatype/opal_convertor_internal.h"
41 #if OPAL_CUDA_SUPPORT
42 #include "opal/datatype/opal_datatype_cuda.h"
/* Forward a copy request to the cbmemcpy callback stored in the convertor.
 * Every argument is parenthesized so that any expression can be passed. */
#define MEMCPY_CUDA( DST, SRC, BLENGTH, CONVERTOR ) \
    (CONVERTOR)->cbmemcpy( (DST), (SRC), (BLENGTH), (CONVERTOR) )
45 #endif
46
/**
 * Object constructor for opal_convertor_t.
 *
 * Leaves the convertor in its idle state: flagged as gap-free and completed,
 * targeting the local architecture, with no partial data pending and using
 * the stack embedded in the convertor itself.
 */
static void opal_convertor_construct( opal_convertor_t* convertor )
{
    /* Idle state: nothing to convert */
    convertor->flags          = OPAL_DATATYPE_FLAG_NO_GAPS | CONVERTOR_COMPLETED;
    convertor->remoteArch     = opal_local_arch;
    convertor->partial_length = 0;

    /* Use the embedded stack by default */
    convertor->stack_size = DT_STATIC_STACK_SIZE;
    convertor->pStack     = convertor->static_stack;
#if OPAL_CUDA_SUPPORT
    convertor->cbmemcpy = &opal_cuda_memcpy;
#endif
}
58
59
/**
 * Object destructor for opal_convertor_t: all the work is delegated to
 * opal_convertor_cleanup().
 */
static void opal_convertor_destruct(opal_convertor_t *convertor)
{
    opal_convertor_cleanup(convertor);
}
64
65 OBJ_CLASS_INSTANCE(opal_convertor_t, opal_object_t, opal_convertor_construct, opal_convertor_destruct );
66
67 static opal_convertor_master_t* opal_convertor_master_list = NULL;
68
69 extern conversion_fct_t opal_datatype_heterogeneous_copy_functions[OPAL_DATATYPE_MAX_PREDEFINED];
70 extern conversion_fct_t opal_datatype_copy_functions[OPAL_DATATYPE_MAX_PREDEFINED];
71
opal_convertor_destroy_masters(void)72 void opal_convertor_destroy_masters( void )
73 {
74 opal_convertor_master_t* master = opal_convertor_master_list;
75
76 while( NULL != master ) {
77 opal_convertor_master_list = master->next;
78 master->next = NULL;
79 /* Cleanup the conversion function if not one of the defaults */
80 if( (master->pFunctions != opal_datatype_heterogeneous_copy_functions) &&
81 (master->pFunctions != opal_datatype_copy_functions) )
82 free( master->pFunctions );
83
84 free( master );
85 master = opal_convertor_master_list;
86 }
87 }
88
89 /**
90 * Find or create a convertor suitable for the remote architecture. If there
91 * is already a master convertor for this architecture then return it.
92 * Otherwise, create and initialize a full featured master convertor.
93 */
opal_convertor_find_or_create_master(uint32_t remote_arch)94 opal_convertor_master_t* opal_convertor_find_or_create_master( uint32_t remote_arch )
95 {
96 opal_convertor_master_t* master = opal_convertor_master_list;
97 int i;
98 size_t* remote_sizes;
99
100 while( NULL != master ) {
101 if( master->remote_arch == remote_arch )
102 return master;
103 master = master->next;
104 }
105 /**
106 * Create a new convertor matching the specified architecture and add it to the
107 * master convertor list.
108 */
109 master = (opal_convertor_master_t*)malloc( sizeof(opal_convertor_master_t) );
110 master->next = opal_convertor_master_list;
111 opal_convertor_master_list = master;
112 master->remote_arch = remote_arch;
113 master->flags = 0;
114 master->hetero_mask = 0;
115 /**
116 * Most of the sizes will be identical, so for now just make a copy of
117 * the local ones. As master->remote_sizes is defined as being an array of
118 * consts we have to manually cast it before using it for writing purposes.
119 */
120 remote_sizes = (size_t*)master->remote_sizes;
121 memcpy(remote_sizes, opal_datatype_local_sizes, sizeof(size_t) * OPAL_DATATYPE_MAX_PREDEFINED);
122 /**
123 * If the local and remote architecture are the same there is no need
124 * to check for the remote data sizes. They will always be the same as
125 * the local ones.
126 */
127 if( master->remote_arch == opal_local_arch ) {
128 master->pFunctions = opal_datatype_copy_functions;
129 master->flags |= CONVERTOR_HOMOGENEOUS;
130 return master;
131 }
132
133 /* Find out the remote bool size */
134 if( opal_arch_checkmask( &master->remote_arch, OPAL_ARCH_BOOLIS8 ) ) {
135 remote_sizes[OPAL_DATATYPE_BOOL] = 1;
136 } else if( opal_arch_checkmask( &master->remote_arch, OPAL_ARCH_BOOLIS16 ) ) {
137 remote_sizes[OPAL_DATATYPE_BOOL] = 2;
138 } else if( opal_arch_checkmask( &master->remote_arch, OPAL_ARCH_BOOLIS32 ) ) {
139 remote_sizes[OPAL_DATATYPE_BOOL] = 4;
140 } else {
141 opal_output( 0, "Unknown sizeof(bool) for the remote architecture\n" );
142 }
143
144 /**
145 * Now we can compute the conversion mask. For all sizes where the remote
146 * and local architecture differ a conversion is needed. Moreover, if the
147 * 2 architectures don't have the same endianess all data with a length
148 * over 2 bytes (with the exception of logicals) have to be byte-swapped.
149 */
150 for( i = OPAL_DATATYPE_FIRST_TYPE; i < OPAL_DATATYPE_MAX_PREDEFINED; i++ ) {
151 if( remote_sizes[i] != opal_datatype_local_sizes[i] )
152 master->hetero_mask |= (((uint32_t)1) << i);
153 }
154 if( opal_arch_checkmask( &master->remote_arch, OPAL_ARCH_ISBIGENDIAN ) !=
155 opal_arch_checkmask( &opal_local_arch, OPAL_ARCH_ISBIGENDIAN ) ) {
156 uint32_t hetero_mask = 0;
157
158 for( i = OPAL_DATATYPE_FIRST_TYPE; i < OPAL_DATATYPE_MAX_PREDEFINED; i++ ) {
159 if( remote_sizes[i] > 1 )
160 hetero_mask |= (((uint32_t)1) << i);
161 }
162 hetero_mask &= ~(((uint32_t)1) << OPAL_DATATYPE_BOOL);
163 master->hetero_mask |= hetero_mask;
164 }
165 master->pFunctions = (conversion_fct_t*)malloc( sizeof(opal_datatype_heterogeneous_copy_functions) );
166 /**
167 * Usually the heterogeneous functions are slower than the copy ones. Let's
168 * try to minimize the usage of the heterogeneous versions.
169 */
170 for( i = OPAL_DATATYPE_FIRST_TYPE; i < OPAL_DATATYPE_MAX_PREDEFINED; i++ ) {
171 if( master->hetero_mask & (((uint32_t)1) << i) )
172 master->pFunctions[i] = opal_datatype_heterogeneous_copy_functions[i];
173 else
174 master->pFunctions[i] = opal_datatype_copy_functions[i];
175 }
176
177 /* We're done so far, return the mater convertor */
178 return master;
179 }
180
181
opal_convertor_create(int32_t remote_arch,int32_t mode)182 opal_convertor_t* opal_convertor_create( int32_t remote_arch, int32_t mode )
183 {
184 opal_convertor_t* convertor = OBJ_NEW(opal_convertor_t);
185 opal_convertor_master_t* master;
186
187 master = opal_convertor_find_or_create_master( remote_arch );
188
189 convertor->remoteArch = remote_arch;
190 convertor->stack_pos = 0;
191 convertor->flags = master->flags;
192 convertor->master = master;
193
194 return convertor;
195 }
196
/*
 * Common prologue of the pack and unpack entry points. If the convertor is
 * already completed, the first iovec length, the iovec count and the data
 * length are all reported as zero and the ENCLOSING function returns 1.
 * Otherwise the checksum state of the convertor is reset.
 */
#define OPAL_CONVERTOR_SET_STATUS_BEFORE_PACK_UNPACK( CONVERTOR, IOV, OUT, MAX_DATA ) \
    do { \
        /* guard against packing more than the convertor describes */ \
        if( OPAL_UNLIKELY((CONVERTOR)->flags & CONVERTOR_COMPLETED) ) { \
            (IOV)[0].iov_len = 0; \
            *(OUT)           = 0; \
            *(MAX_DATA)      = 0; \
            return 1;  /* nothing to do */ \
        } \
        (CONVERTOR)->checksum = OPAL_CSUM_ZERO; \
        (CONVERTOR)->csum_ui1 = 0; \
        (CONVERTOR)->csum_ui2 = 0; \
        assert( (CONVERTOR)->bConverted < (CONVERTOR)->local_size ); \
    } while(0)
211
212 /**
213 * Return 0 if everything went OK and if there is still room before the complete
214 * conversion of the data (need additional call with others input buffers )
215 * 1 if everything went fine and the data was completly converted
216 * -1 something wrong occurs.
217 */
opal_convertor_pack(opal_convertor_t * pConv,struct iovec * iov,uint32_t * out_size,size_t * max_data)218 int32_t opal_convertor_pack( opal_convertor_t* pConv,
219 struct iovec* iov, uint32_t* out_size,
220 size_t* max_data )
221 {
222 OPAL_CONVERTOR_SET_STATUS_BEFORE_PACK_UNPACK( pConv, iov, out_size, max_data );
223
224 if( OPAL_LIKELY(pConv->flags & CONVERTOR_NO_OP) ) {
225 /**
226 * We are doing conversion on a contiguous datatype on a homogeneous
227 * environment. The convertor contain minimal information, we only
228 * use the bConverted to manage the conversion.
229 */
230 uint32_t i;
231 unsigned char* base_pointer;
232 size_t pending_length = pConv->local_size - pConv->bConverted;
233
234 *max_data = pending_length;
235 opal_convertor_get_current_pointer( pConv, (void**)&base_pointer );
236
237 for( i = 0; i < *out_size; i++ ) {
238 if( iov[i].iov_len >= pending_length ) {
239 goto complete_contiguous_data_pack;
240 }
241 if( OPAL_LIKELY(NULL == iov[i].iov_base) )
242 iov[i].iov_base = (IOVBASE_TYPE *) base_pointer;
243 else
244 #if OPAL_CUDA_SUPPORT
245 MEMCPY_CUDA( iov[i].iov_base, base_pointer, iov[i].iov_len, pConv );
246 #else
247 MEMCPY( iov[i].iov_base, base_pointer, iov[i].iov_len );
248 #endif
249 pending_length -= iov[i].iov_len;
250 base_pointer += iov[i].iov_len;
251 }
252 *max_data -= pending_length;
253 pConv->bConverted += (*max_data);
254 return 0;
255
256 complete_contiguous_data_pack:
257 iov[i].iov_len = pending_length;
258 if( OPAL_LIKELY(NULL == iov[i].iov_base) )
259 iov[i].iov_base = (IOVBASE_TYPE *) base_pointer;
260 else
261 #if OPAL_CUDA_SUPPORT
262 MEMCPY_CUDA( iov[i].iov_base, base_pointer, iov[i].iov_len, pConv );
263 #else
264 MEMCPY( iov[i].iov_base, base_pointer, iov[i].iov_len );
265 #endif
266 pConv->bConverted = pConv->local_size;
267 *out_size = i + 1;
268 pConv->flags |= CONVERTOR_COMPLETED;
269 return 1;
270 }
271
272 return pConv->fAdvance( pConv, iov, out_size, max_data );
273 }
274
275
/**
 * Unpack the content of the supplied iovec array into the user buffer
 * described by the convertor.
 *
 * @param pConv     convertor driving the operation
 * @param iov       iovec array holding the packed data
 * @param out_size  number of usable entries in iov; updated to the number of
 *                  entries consumed when the contiguous fast path completes
 * @param max_data  receives the number of bytes handled by this call on the
 *                  fast path
 *
 * @return 0 if there is still data to convert, 1 if the conversion is
 *         complete; otherwise whatever the selected fAdvance function returns.
 */
int32_t opal_convertor_unpack( opal_convertor_t* pConv,
                               struct iovec* iov, uint32_t* out_size,
                               size_t* max_data )
{
    OPAL_CONVERTOR_SET_STATUS_BEFORE_PACK_UNPACK( pConv, iov, out_size, max_data );

    if( OPAL_UNLIKELY(0 == (pConv->flags & CONVERTOR_NO_OP)) ) {
        /* General case: delegate to the function selected at prepare time */
        return pConv->fAdvance( pConv, iov, out_size, max_data );
    }

    /**
     * Contiguous datatype on a homogeneous environment. The convertor
     * contains minimal information; only bConverted is used to track the
     * progress of the conversion.
     */
    unsigned char* dst;
    size_t remaining = pConv->local_size - pConv->bConverted;
    uint32_t idx;

    *max_data = remaining;
    opal_convertor_get_current_pointer( pConv, (void**)&dst );

    for( idx = 0; idx < *out_size; idx++ ) {
        /* This entry provides at least everything that is still expected */
        const int is_last = (iov[idx].iov_len >= remaining);

        if( is_last ) {
            iov[idx].iov_len = remaining;
        }
#if OPAL_CUDA_SUPPORT
        MEMCPY_CUDA( dst, iov[idx].iov_base, iov[idx].iov_len, pConv );
#else
        MEMCPY( dst, iov[idx].iov_base, iov[idx].iov_len );
#endif
        if( is_last ) {
            pConv->bConverted = pConv->local_size;
            *out_size = idx + 1;
            pConv->flags |= CONVERTOR_COMPLETED;
            return 1;
        }
        remaining -= iov[idx].iov_len;
        dst       += iov[idx].iov_len;
    }

    /* All entries were consumed and some data is still expected */
    *max_data -= remaining;
    pConv->bConverted += (*max_data);
    return 0;
}
326
327 static inline int
opal_convertor_create_stack_with_pos_contig(opal_convertor_t * pConvertor,size_t starting_point,const size_t * sizes)328 opal_convertor_create_stack_with_pos_contig( opal_convertor_t* pConvertor,
329 size_t starting_point, const size_t* sizes )
330 {
331 dt_stack_t* pStack; /* pointer to the position on the stack */
332 const opal_datatype_t* pData = pConvertor->pDesc;
333 dt_elem_desc_t* pElems;
334 size_t count;
335 ptrdiff_t extent;
336
337 pStack = pConvertor->pStack;
338 /**
339 * The prepare function already make the selection on which data representation
340 * we have to use: normal one or the optimized version ?
341 */
342 pElems = pConvertor->use_desc->desc;
343
344 count = starting_point / pData->size;
345 extent = pData->ub - pData->lb;
346
347 pStack[0].type = OPAL_DATATYPE_LOOP; /* the first one is always the loop */
348 pStack[0].count = pConvertor->count - count;
349 pStack[0].index = -1;
350 pStack[0].disp = count * extent;
351
352 /* now compute the number of pending bytes */
353 count = starting_point % pData->size;
354 /**
355 * We save the current displacement starting from the begining
356 * of this data.
357 */
358 if( OPAL_LIKELY(0 == count) ) {
359 pStack[1].type = pElems->elem.common.type;
360 pStack[1].count = pElems->elem.blocklen;
361 } else {
362 pStack[1].type = OPAL_DATATYPE_UINT1;
363 pStack[1].count = pData->size - count;
364 }
365 pStack[1].disp = count;
366 pStack[1].index = 0; /* useless */
367
368 pConvertor->bConverted = starting_point;
369 pConvertor->stack_pos = 1;
370 assert( 0 == pConvertor->partial_length );
371 return OPAL_SUCCESS;
372 }
373
374 static inline int
opal_convertor_create_stack_at_begining(opal_convertor_t * convertor,const size_t * sizes)375 opal_convertor_create_stack_at_begining( opal_convertor_t* convertor,
376 const size_t* sizes )
377 {
378 dt_stack_t* pStack = convertor->pStack;
379 dt_elem_desc_t* pElems;
380
381 /**
382 * The prepare function already make the selection on which data representation
383 * we have to use: normal one or the optimized version ?
384 */
385 pElems = convertor->use_desc->desc;
386
387 convertor->stack_pos = 1;
388 convertor->partial_length = 0;
389 convertor->bConverted = 0;
390 /**
391 * Fill the first position on the stack. This one correspond to the
392 * last fake OPAL_DATATYPE_END_LOOP that we add to the data representation and
393 * allow us to move quickly inside the datatype when we have a count.
394 */
395 pStack[0].index = -1;
396 pStack[0].count = convertor->count;
397 pStack[0].disp = 0;
398 pStack[0].type = OPAL_DATATYPE_LOOP;
399
400 pStack[1].index = 0;
401 pStack[1].disp = 0;
402 if( pElems[0].elem.common.type == OPAL_DATATYPE_LOOP ) {
403 pStack[1].count = pElems[0].loop.loops;
404 pStack[1].type = OPAL_DATATYPE_LOOP;
405 } else {
406 pStack[1].count = (size_t)pElems[0].elem.count * pElems[0].elem.blocklen;
407 pStack[1].type = pElems[0].elem.common.type;
408 }
409 return OPAL_SUCCESS;
410 }
411
412
/**
 * Move the convertor to the requested position without validating it.
 *
 * @param convertor  convertor to reposition
 * @param position   in: requested byte position; out: the position really
 *                   reached (bConverted)
 * @return the status of the last positioning helper that was called
 */
int32_t opal_convertor_set_position_nocheck( opal_convertor_t* convertor,
                                             size_t* position )
{
    int32_t rc;

    /**
     * The contiguous helper always sets the position relative to the ZERO
     * position, so no special handling is needed for it.
     */
    if( OPAL_LIKELY(convertor->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS) ) {
        rc = opal_convertor_create_stack_with_pos_contig( convertor, (*position),
                                                          opal_datatype_local_sizes );
        *position = convertor->bConverted;
        return rc;
    }

    /* Position zero only requires a reset */
    if( 0 == (*position) ) {
        return opal_convertor_create_stack_at_begining( convertor, opal_datatype_local_sizes );
    }
    /* Rolling back: first reset the convertor, then move forward again */
    if( (*position) < convertor->bConverted ) {
        (void)opal_convertor_create_stack_at_begining( convertor, opal_datatype_local_sizes );
    }
    rc = opal_convertor_generic_simple_position( convertor, position );
    /**
     * A non-contiguous send convertor is not allowed to stop in the middle
     * of a predefined datatype, as it would not be able to copy out the
     * left-overs anyway. Instead the position is forced to stay on
     * predefined datatype boundaries. As partial predefined datatypes are
     * allowed in the contiguous case, this should be accepted by any
     * receiver convertor.
     */
    if( CONVERTOR_SEND & convertor->flags ) {
        convertor->bConverted    -= convertor->partial_length;
        convertor->partial_length = 0;
    }
    *position = convertor->bConverted;
    return rc;
}
448
449 static size_t
opal_datatype_compute_remote_size(const opal_datatype_t * pData,const size_t * sizes)450 opal_datatype_compute_remote_size( const opal_datatype_t* pData,
451 const size_t* sizes )
452 {
453 uint32_t typeMask = pData->bdt_used;
454 size_t length = 0;
455
456 if (opal_datatype_is_predefined(pData)) {
457 return sizes[pData->desc.desc->elem.common.type];
458 }
459
460 if( OPAL_UNLIKELY(NULL == pData->ptypes) ) {
461 /* Allocate and fill the array of types used in the datatype description */
462 opal_datatype_compute_ptypes( (opal_datatype_t*)pData );
463 }
464
465 for( int i = OPAL_DATATYPE_FIRST_TYPE; typeMask && (i < OPAL_DATATYPE_MAX_PREDEFINED); i++ ) {
466 if( typeMask & ((uint32_t)1 << i) ) {
467 length += (pData->ptypes[i] * sizes[i]);
468 typeMask ^= ((uint32_t)1 << i);
469 }
470 }
471 return length;
472 }
473
474 /**
475 * Compute the remote size. If necessary remove the homogeneous flag
476 * and redirect the convertor description toward the non-optimized
477 * datatype representation.
478 */
opal_convertor_compute_remote_size(opal_convertor_t * pConvertor)479 size_t opal_convertor_compute_remote_size( opal_convertor_t* pConvertor )
480 {
481 opal_datatype_t* datatype = (opal_datatype_t*)pConvertor->pDesc;
482
483 pConvertor->remote_size = pConvertor->local_size;
484 if( OPAL_UNLIKELY(datatype->bdt_used & pConvertor->master->hetero_mask) ) {
485 pConvertor->flags &= (~CONVERTOR_HOMOGENEOUS);
486 /* Can we use the optimized description? */
487 if (pConvertor->flags & OPAL_DATATYPE_OPTIMIZED_RESTRICTED) {
488 pConvertor->use_desc = &(datatype->desc);
489 }
490 if( 0 == (pConvertor->flags & CONVERTOR_HAS_REMOTE_SIZE) ) {
491 /* This is for a single datatype, we must update it with the count */
492 pConvertor->remote_size = opal_datatype_compute_remote_size(datatype,
493 pConvertor->master->remote_sizes);
494 pConvertor->remote_size *= pConvertor->count;
495 }
496 }
497 pConvertor->flags |= CONVERTOR_HAS_REMOTE_SIZE;
498 return pConvertor->remote_size;
499 }
500
/**
 * Initialize a convertor based on a previously created convertor. The idea
 * is to move the heavy selection of architecture features outside of the
 * prepare functions. The convertor is assumed to be clean: either never
 * initialized or already cleaned.
 *
 * The macro must be expanded inside a function returning an OPAL status: it
 * executes `return OPAL_SUCCESS` as soon as no further setup is needed.
 * Its arguments are evaluated several times, so they must be free of side
 * effects.
 *
 * The parameter names are deliberately distinct from the structure member
 * names used in the body (a parameter called `count` would also rewrite the
 * `->count` member access), and every use is parenthesized.
 */
#define OPAL_CONVERTOR_PREPARE( CONVERTOR, DATATYPE, COUNT, USERBUF ) \
    { \
        (CONVERTOR)->local_size = (COUNT) * (DATATYPE)->size; \
        (CONVERTOR)->pBaseBuf   = (unsigned char*)(USERBUF); \
        (CONVERTOR)->count      = (COUNT); \
        (CONVERTOR)->pDesc      = (opal_datatype_t*)(DATATYPE); \
        (CONVERTOR)->bConverted = 0; \
        (CONVERTOR)->use_desc   = &((DATATYPE)->opt_desc); \
        /* If the data is empty we just mark the convertor as \
         * completed. With this flag set the pack and unpack functions \
         * will not do anything. \
         */ \
        if( OPAL_UNLIKELY((0 == (COUNT)) || (0 == (DATATYPE)->size)) ) { \
            (CONVERTOR)->flags |= (OPAL_DATATYPE_FLAG_NO_GAPS | CONVERTOR_COMPLETED | CONVERTOR_HAS_REMOTE_SIZE); \
            (CONVERTOR)->local_size = (CONVERTOR)->remote_size = 0; \
            return OPAL_SUCCESS; \
        } \
        \
        /* Grab the datatype part of the flags */ \
        (CONVERTOR)->flags &= CONVERTOR_TYPE_MASK; \
        (CONVERTOR)->flags |= (CONVERTOR_DATATYPE_MASK & (DATATYPE)->flags); \
        (CONVERTOR)->flags |= (CONVERTOR_NO_OP | CONVERTOR_HOMOGENEOUS); \
        \
        (CONVERTOR)->remote_size = (CONVERTOR)->local_size; \
        if( OPAL_LIKELY((CONVERTOR)->remoteArch == opal_local_arch) ) { \
            if( !((CONVERTOR)->flags & CONVERTOR_WITH_CHECKSUM) && \
                (((CONVERTOR)->flags & OPAL_DATATYPE_FLAG_NO_GAPS) || \
                 (((CONVERTOR)->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS) && (1 == (COUNT)))) ) { \
                return OPAL_SUCCESS; \
            } \
        } \
        \
        assert( (CONVERTOR)->pDesc == (DATATYPE) ); \
        opal_convertor_compute_remote_size( (CONVERTOR) ); \
        assert( NULL != (CONVERTOR)->use_desc->desc ); \
        /* For predefined datatypes (contiguous) do nothing more */ \
        /* if checksum is enabled then always continue */ \
        if( (((CONVERTOR)->flags & (CONVERTOR_WITH_CHECKSUM | OPAL_DATATYPE_FLAG_NO_GAPS)) \
             == OPAL_DATATYPE_FLAG_NO_GAPS) && \
            (((CONVERTOR)->flags & (CONVERTOR_SEND | CONVERTOR_HOMOGENEOUS)) == \
             (CONVERTOR_SEND | CONVERTOR_HOMOGENEOUS)) ) { \
            return OPAL_SUCCESS; \
        } \
        (CONVERTOR)->flags &= ~CONVERTOR_NO_OP; \
        { \
            uint32_t required_stack_length = (DATATYPE)->loops + 1; \
            \
            /* Switch to a heap stack when the embedded one is too small */ \
            if( required_stack_length > (CONVERTOR)->stack_size ) { \
                assert( (CONVERTOR)->pStack == (CONVERTOR)->static_stack ); \
                (CONVERTOR)->stack_size = required_stack_length; \
                (CONVERTOR)->pStack = (dt_stack_t*)malloc( sizeof(dt_stack_t) * \
                                                           (CONVERTOR)->stack_size ); \
            } \
        } \
        opal_convertor_create_stack_at_begining( (CONVERTOR), opal_datatype_local_sizes ); \
    }
564
565
/**
 * Prepare a convertor for receiving (unpacking) count elements of datatype
 * into pUserBuf, and select the matching unpack function.
 *
 * @param convertor  clean convertor to prepare
 * @param datatype   datatype describing the user data
 * @param count      number of datatype elements
 * @param pUserBuf   user buffer
 * @return OPAL_SUCCESS
 */
int32_t opal_convertor_prepare_for_recv( opal_convertor_t* convertor,
                                         const struct opal_datatype_t* datatype,
                                         size_t count,
                                         const void* pUserBuf )
{
    /* Here I should check that the data is not overlapping */

    convertor->flags |= CONVERTOR_RECV;
#if OPAL_CUDA_SUPPORT
    if( 0 == (convertor->flags & CONVERTOR_SKIP_CUDA_INIT) ) {
        mca_cuda_convertor_init(convertor, pUserBuf);
    }
#endif

    assert( 0 == (convertor->flags & CONVERTOR_SEND) );
    /* May return OPAL_SUCCESS directly when no further setup is needed */
    OPAL_CONVERTOR_PREPARE( convertor, datatype, count, pUserBuf );

#if defined(CHECKSUM)
    if( OPAL_UNLIKELY(convertor->flags & CONVERTOR_WITH_CHECKSUM) ) {
        if( OPAL_UNLIKELY(!(convertor->flags & CONVERTOR_HOMOGENEOUS)) ) {
            convertor->fAdvance = opal_unpack_general_checksum;
        } else if( convertor->pDesc->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) {
            convertor->fAdvance = opal_unpack_homogeneous_contig_checksum;
        } else {
            convertor->fAdvance = opal_generic_simple_unpack_checksum;
        }
        return OPAL_SUCCESS;
    }
#endif  /* defined(CHECKSUM) */

    if( OPAL_UNLIKELY(!(convertor->flags & CONVERTOR_HOMOGENEOUS)) ) {
        convertor->fAdvance = opal_unpack_general;
    } else if( convertor->pDesc->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) {
        convertor->fAdvance = opal_unpack_homogeneous_contig;
    } else {
        convertor->fAdvance = opal_generic_simple_unpack;
    }
    return OPAL_SUCCESS;
}
607
608
/**
 * Prepare a convertor for sending (packing) count elements of datatype taken
 * from pUserBuf, and select the matching pack function.
 *
 * @param convertor  clean convertor to prepare
 * @param datatype   datatype describing the user data
 * @param count      number of datatype elements
 * @param pUserBuf   user buffer
 * @return OPAL_SUCCESS
 */
int32_t opal_convertor_prepare_for_send( opal_convertor_t* convertor,
                                         const struct opal_datatype_t* datatype,
                                         size_t count,
                                         const void* pUserBuf )
{
    convertor->flags |= CONVERTOR_SEND;
#if OPAL_CUDA_SUPPORT
    if( 0 == (convertor->flags & CONVERTOR_SKIP_CUDA_INIT) ) {
        mca_cuda_convertor_init(convertor, pUserBuf);
    }
#endif

    /* May return OPAL_SUCCESS directly when no further setup is needed */
    OPAL_CONVERTOR_PREPARE( convertor, datatype, count, pUserBuf );

#if defined(CHECKSUM)
    if( convertor->flags & CONVERTOR_WITH_CHECKSUM ) {
        if( CONVERTOR_SEND_CONVERSION == (convertor->flags & (CONVERTOR_SEND_CONVERSION|CONVERTOR_HOMOGENEOUS)) ) {
            convertor->fAdvance = opal_pack_general_checksum;
        } else if( 0 == (datatype->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS) ) {
            convertor->fAdvance = opal_generic_simple_pack_checksum;
        } else if( ((datatype->ub - datatype->lb) == (ptrdiff_t)datatype->size)
                   || (1 >= convertor->count) ) {
            /* extent equal to size, or at most one element */
            convertor->fAdvance = opal_pack_homogeneous_contig_checksum;
        } else {
            convertor->fAdvance = opal_pack_homogeneous_contig_with_gaps_checksum;
        }
        return OPAL_SUCCESS;
    }
#endif  /* defined(CHECKSUM) */

    if( CONVERTOR_SEND_CONVERSION == (convertor->flags & (CONVERTOR_SEND_CONVERSION|CONVERTOR_HOMOGENEOUS)) ) {
        convertor->fAdvance = opal_pack_general;
    } else if( 0 == (datatype->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS) ) {
        convertor->fAdvance = opal_generic_simple_pack;
    } else if( ((datatype->ub - datatype->lb) == (ptrdiff_t)datatype->size)
               || (1 >= convertor->count) ) {
        /* extent equal to size, or at most one element */
        convertor->fAdvance = opal_pack_homogeneous_contig;
    } else {
        convertor->fAdvance = opal_pack_homogeneous_contig_with_gaps;
    }
    return OPAL_SUCCESS;
}
655
656 /*
657 * These functions can be used in order to create an IDENTICAL copy of one convertor. In this
658 * context IDENTICAL means that the datatype and count and all other properties of the basic
659 * convertor get replicated on this new convertor. However, the references to the datatype
660 * are not increased. This function take special care about the stack. If all the cases the
661 * stack is created with the correct number of entries but if the copy_stack is true (!= 0)
662 * then the content of the old stack is copied on the new one. The result will be a convertor
663 * ready to use starting from the old position. If copy_stack is false then the convertor
664 * is created with a empty stack (you have to use opal_convertor_set_position before using it).
665 */
opal_convertor_clone(const opal_convertor_t * source,opal_convertor_t * destination,int32_t copy_stack)666 int opal_convertor_clone( const opal_convertor_t* source,
667 opal_convertor_t* destination,
668 int32_t copy_stack )
669 {
670 destination->remoteArch = source->remoteArch;
671 destination->flags = source->flags;
672 destination->pDesc = source->pDesc;
673 destination->use_desc = source->use_desc;
674 destination->count = source->count;
675 destination->pBaseBuf = source->pBaseBuf;
676 destination->fAdvance = source->fAdvance;
677 destination->master = source->master;
678 destination->local_size = source->local_size;
679 destination->remote_size = source->remote_size;
680 /* create the stack */
681 if( OPAL_UNLIKELY(source->stack_size > DT_STATIC_STACK_SIZE) ) {
682 destination->pStack = (dt_stack_t*)malloc(sizeof(dt_stack_t) * source->stack_size );
683 } else {
684 destination->pStack = destination->static_stack;
685 }
686 destination->stack_size = source->stack_size;
687
688 /* initialize the stack */
689 if( OPAL_LIKELY(0 == copy_stack) ) {
690 destination->bConverted = -1;
691 destination->stack_pos = -1;
692 } else {
693 memcpy( destination->pStack, source->pStack, sizeof(dt_stack_t) * (source->stack_pos+1) );
694 destination->bConverted = source->bConverted;
695 destination->stack_pos = source->stack_pos;
696 }
697 #if OPAL_CUDA_SUPPORT
698 destination->cbmemcpy = source->cbmemcpy;
699 #endif
700 return OPAL_SUCCESS;
701 }
702
703
/**
 * Print the state of a convertor through opal_output(): its counters, one
 * label per flag that is set, the datatype it uses and, when available, its
 * stack.
 */
void opal_convertor_dump( opal_convertor_t* convertor )
{
    const uint32_t flags = convertor->flags;

    opal_output( 0, "Convertor %p count %" PRIsize_t " stack position %u bConverted %" PRIsize_t "\n"
                 "\tlocal_size %" PRIsize_t " remote_size %" PRIsize_t " flags %X stack_size %u pending_length %" PRIsize_t "\n"
                 "\tremote_arch %u local_arch %u\n",
                 (void*)convertor,
                 convertor->count, convertor->stack_pos, convertor->bConverted,
                 convertor->local_size, convertor->remote_size,
                 convertor->flags, convertor->stack_size, convertor->partial_length,
                 convertor->remoteArch, opal_local_arch );

    if( flags & CONVERTOR_RECV ) {
        opal_output( 0, "unpack ");
    }
    if( flags & CONVERTOR_SEND ) {
        opal_output( 0, "pack ");
    }
    if( flags & CONVERTOR_SEND_CONVERSION ) {
        opal_output( 0, "conversion ");
    }
    if( flags & CONVERTOR_HOMOGENEOUS ) {
        opal_output( 0, "homogeneous " );
    } else {
        opal_output( 0, "heterogeneous ");
    }
    if( flags & CONVERTOR_NO_OP ) {
        opal_output( 0, "no_op ");
    }
    if( flags & CONVERTOR_WITH_CHECKSUM ) {
        opal_output( 0, "checksum ");
    }
    if( flags & CONVERTOR_CUDA ) {
        opal_output( 0, "CUDA ");
    }
    if( flags & CONVERTOR_CUDA_ASYNC ) {
        opal_output( 0, "CUDA Async ");
    }
    if( flags & CONVERTOR_COMPLETED ) {
        opal_output( 0, "COMPLETED ");
    }

    opal_datatype_dump( convertor->pDesc );

    /* The stack is only shown if the convertor is completely initialized */
    if( (0 != convertor->stack_pos) ||
        ((size_t)convertor->pStack[convertor->stack_pos].index <= convertor->pDesc->desc.length) ) {
        opal_output( 0, "Actual stack representation\n" );
        opal_datatype_dump_stack( convertor->pStack, convertor->stack_pos,
                                  convertor->pDesc->desc.desc, convertor->pDesc->name );
    }
}
734
735
opal_datatype_dump_stack(const dt_stack_t * pStack,int stack_pos,const union dt_elem_desc * pDesc,const char * name)736 void opal_datatype_dump_stack( const dt_stack_t* pStack, int stack_pos,
737 const union dt_elem_desc* pDesc, const char* name )
738 {
739 opal_output( 0, "\nStack %p stack_pos %d name %s\n", (void*)pStack, stack_pos, name );
740 for( ; stack_pos >= 0; stack_pos-- ) {
741 opal_output( 0, "%d: pos %d count %" PRIsize_t " disp %ld ", stack_pos, pStack[stack_pos].index,
742 pStack[stack_pos].count, pStack[stack_pos].disp );
743 if( pStack->index != -1 )
744 opal_output( 0, "\t[desc count %lu disp %ld extent %ld]\n",
745 (unsigned long)pDesc[pStack[stack_pos].index].elem.count,
746 (long)pDesc[pStack[stack_pos].index].elem.disp,
747 (long)pDesc[pStack[stack_pos].index].elem.extent );
748 else
749 opal_output( 0, "\n" );
750 }
751 opal_output( 0, "\n" );
752 }
753