1 /* -*- Mode: C; c-basic-offset:4 ; -*- */
2 /*
3  * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
4  *                         University Research and Technology
5  *                         Corporation.  All rights reserved.
6  * Copyright (c) 2004-2019 The University of Tennessee and The University
7  *                         of Tennessee Research Foundation.  All rights
8  *                         reserved.
9  * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart,
10  *                         University of Stuttgart.  All rights reserved.
11  * Copyright (c) 2004-2006 The Regents of the University of California.
12  *                         All rights reserved.
13  * Copyright (c) 2009      Oak Ridge National Labs.  All rights reserved.
14  * Copyright (c) 2011      NVIDIA Corporation.  All rights reserved.
15  * Copyright (c) 2013-2018 Research Organization for Information Science
16  *                         and Technology (RIST).  All rights reserved.
17  * Copyright (c) 2017      Intel, Inc. All rights reserved
18  * $COPYRIGHT$
19  *
20  * Additional copyrights may follow
21  *
22  * $HEADER$
23  */
24 
25 #include "opal_config.h"
26 
27 #include <stddef.h>
28 #include <stdio.h>
29 #include <stdint.h>
30 
31 #include "opal/prefetch.h"
32 #include "opal/util/arch.h"
33 #include "opal/util/output.h"
34 
35 #include "opal/datatype/opal_datatype_internal.h"
36 #include "opal/datatype/opal_datatype.h"
37 #include "opal/datatype/opal_convertor.h"
38 #include "opal/datatype/opal_datatype_checksum.h"
39 #include "opal/datatype/opal_datatype_prototypes.h"
40 #include "opal/datatype/opal_convertor_internal.h"
41 #if OPAL_CUDA_SUPPORT
42 #include "opal/datatype/opal_datatype_cuda.h"
/* Copy BLENGTH bytes from SRC to DST through the cbmemcpy function pointer
 * stored in the convertor. Every argument, including CONVERTOR, is
 * parenthesized so any pointer expression can be passed safely. */
#define MEMCPY_CUDA( DST, SRC, BLENGTH, CONVERTOR ) \
    (CONVERTOR)->cbmemcpy( (DST), (SRC), (BLENGTH), (CONVERTOR) )
45 #endif
46 
/**
 * Object constructor for opal_convertor_t.
 *
 * The convertor starts on its embedded static stack, targets the local
 * architecture and is flagged as completed, so pack/unpack return
 * immediately until one of the prepare functions is called.
 */
static void opal_convertor_construct( opal_convertor_t* convertor )
{
#if OPAL_CUDA_SUPPORT
    convertor->cbmemcpy       = &opal_cuda_memcpy;
#endif
    convertor->flags          = CONVERTOR_COMPLETED | OPAL_DATATYPE_FLAG_NO_GAPS;
    convertor->remoteArch     = opal_local_arch;
    convertor->partial_length = 0;
    convertor->stack_size     = DT_STATIC_STACK_SIZE;
    convertor->pStack         = convertor->static_stack;
}
58 
59 
/**
 * Object destructor for opal_convertor_t: all the work is delegated to
 * opal_convertor_cleanup().
 */
static void
opal_convertor_destruct( opal_convertor_t* convertor )
{
    opal_convertor_cleanup( convertor );
}
64 
65 OBJ_CLASS_INSTANCE(opal_convertor_t, opal_object_t, opal_convertor_construct, opal_convertor_destruct );
66 
67 static opal_convertor_master_t* opal_convertor_master_list = NULL;
68 
69 extern conversion_fct_t opal_datatype_heterogeneous_copy_functions[OPAL_DATATYPE_MAX_PREDEFINED];
70 extern conversion_fct_t opal_datatype_copy_functions[OPAL_DATATYPE_MAX_PREDEFINED];
71 
opal_convertor_destroy_masters(void)72 void opal_convertor_destroy_masters( void )
73 {
74     opal_convertor_master_t* master = opal_convertor_master_list;
75 
76     while( NULL != master ) {
77         opal_convertor_master_list = master->next;
78         master->next = NULL;
79         /* Cleanup the conversion function if not one of the defaults */
80         if( (master->pFunctions != opal_datatype_heterogeneous_copy_functions) &&
81             (master->pFunctions != opal_datatype_copy_functions) )
82             free( master->pFunctions );
83 
84         free( master );
85         master = opal_convertor_master_list;
86     }
87 }
88 
89 /**
90  * Find or create a convertor suitable for the remote architecture. If there
91  * is already a master convertor for this architecture then return it.
92  * Otherwise, create and initialize a full featured master convertor.
93  */
opal_convertor_find_or_create_master(uint32_t remote_arch)94 opal_convertor_master_t* opal_convertor_find_or_create_master( uint32_t remote_arch )
95 {
96     opal_convertor_master_t* master = opal_convertor_master_list;
97     int i;
98     size_t* remote_sizes;
99 
100     while( NULL != master ) {
101         if( master->remote_arch == remote_arch )
102             return master;
103         master = master->next;
104     }
105     /**
106      * Create a new convertor matching the specified architecture and add it to the
107      * master convertor list.
108      */
109     master = (opal_convertor_master_t*)malloc( sizeof(opal_convertor_master_t) );
110     master->next = opal_convertor_master_list;
111     opal_convertor_master_list = master;
112     master->remote_arch = remote_arch;
113     master->flags       = 0;
114     master->hetero_mask = 0;
115     /**
116      * Most of the sizes will be identical, so for now just make a copy of
117      * the local ones. As master->remote_sizes is defined as being an array of
118      * consts we have to manually cast it before using it for writing purposes.
119      */
120     remote_sizes = (size_t*)master->remote_sizes;
121     memcpy(remote_sizes, opal_datatype_local_sizes, sizeof(size_t) * OPAL_DATATYPE_MAX_PREDEFINED);
122     /**
123      * If the local and remote architecture are the same there is no need
124      * to check for the remote data sizes. They will always be the same as
125      * the local ones.
126      */
127     if( master->remote_arch == opal_local_arch ) {
128         master->pFunctions = opal_datatype_copy_functions;
129         master->flags |= CONVERTOR_HOMOGENEOUS;
130         return master;
131     }
132 
133     /* Find out the remote bool size */
134     if( opal_arch_checkmask( &master->remote_arch, OPAL_ARCH_BOOLIS8 ) ) {
135         remote_sizes[OPAL_DATATYPE_BOOL] = 1;
136     } else if( opal_arch_checkmask( &master->remote_arch, OPAL_ARCH_BOOLIS16 ) ) {
137         remote_sizes[OPAL_DATATYPE_BOOL] = 2;
138     } else if( opal_arch_checkmask( &master->remote_arch, OPAL_ARCH_BOOLIS32 ) ) {
139         remote_sizes[OPAL_DATATYPE_BOOL] = 4;
140     } else {
141         opal_output( 0, "Unknown sizeof(bool) for the remote architecture\n" );
142     }
143 
144     /**
145      * Now we can compute the conversion mask. For all sizes where the remote
146      * and local architecture differ a conversion is needed. Moreover, if the
147      * 2 architectures don't have the same endianess all data with a length
148      * over 2 bytes (with the exception of logicals) have to be byte-swapped.
149      */
150     for( i = OPAL_DATATYPE_FIRST_TYPE; i < OPAL_DATATYPE_MAX_PREDEFINED; i++ ) {
151         if( remote_sizes[i] != opal_datatype_local_sizes[i] )
152             master->hetero_mask |= (((uint32_t)1) << i);
153     }
154     if( opal_arch_checkmask( &master->remote_arch, OPAL_ARCH_ISBIGENDIAN ) !=
155         opal_arch_checkmask( &opal_local_arch, OPAL_ARCH_ISBIGENDIAN ) ) {
156         uint32_t hetero_mask = 0;
157 
158         for( i = OPAL_DATATYPE_FIRST_TYPE; i < OPAL_DATATYPE_MAX_PREDEFINED; i++ ) {
159             if( remote_sizes[i] > 1 )
160                 hetero_mask |= (((uint32_t)1) << i);
161         }
162         hetero_mask &= ~(((uint32_t)1) << OPAL_DATATYPE_BOOL);
163         master->hetero_mask |= hetero_mask;
164     }
165     master->pFunctions = (conversion_fct_t*)malloc( sizeof(opal_datatype_heterogeneous_copy_functions) );
166     /**
167      * Usually the heterogeneous functions are slower than the copy ones. Let's
168      * try to minimize the usage of the heterogeneous versions.
169      */
170     for( i = OPAL_DATATYPE_FIRST_TYPE; i < OPAL_DATATYPE_MAX_PREDEFINED; i++ ) {
171         if( master->hetero_mask & (((uint32_t)1) << i) )
172             master->pFunctions[i] = opal_datatype_heterogeneous_copy_functions[i];
173         else
174             master->pFunctions[i] = opal_datatype_copy_functions[i];
175     }
176 
177     /* We're done so far, return the mater convertor */
178     return master;
179 }
180 
181 
opal_convertor_create(int32_t remote_arch,int32_t mode)182 opal_convertor_t* opal_convertor_create( int32_t remote_arch, int32_t mode )
183 {
184     opal_convertor_t* convertor = OBJ_NEW(opal_convertor_t);
185     opal_convertor_master_t* master;
186 
187     master = opal_convertor_find_or_create_master( remote_arch );
188 
189     convertor->remoteArch = remote_arch;
190     convertor->stack_pos  = 0;
191     convertor->flags      = master->flags;
192     convertor->master     = master;
193 
194     return convertor;
195 }
196 
/*
 * Common prologue of the pack and unpack entry points.
 *
 * If the convertor is already completed, the outputs are zeroed and the
 * ENCLOSING FUNCTION returns 1. Otherwise the checksum state is reset and,
 * in debug builds, the macro asserts that some data is left to convert.
 */
#define OPAL_CONVERTOR_SET_STATUS_BEFORE_PACK_UNPACK( CONVERTOR, IOV, OUT, MAX_DATA ) \
    do {                                                                \
        /* guard against packing/unpacking more than the data holds */  \
        if( OPAL_UNLIKELY((CONVERTOR)->flags & CONVERTOR_COMPLETED) ) { \
            *(MAX_DATA) = 0;                                            \
            *(OUT) = 0;                                                 \
            (IOV)[0].iov_len = 0;                                       \
            return 1;  /* nothing left to do */                         \
        }                                                               \
        (CONVERTOR)->csum_ui2 = 0;                                      \
        (CONVERTOR)->csum_ui1 = 0;                                      \
        (CONVERTOR)->checksum = OPAL_CSUM_ZERO;                         \
        assert( (CONVERTOR)->bConverted < (CONVERTOR)->local_size );    \
    } while(0)
211 
212 /**
213  * Return 0 if everything went OK and if there is still room before the complete
214  *          conversion of the data (need additional call with others input buffers )
215  *        1 if everything went fine and the data was completly converted
216  *       -1 something wrong occurs.
217  */
opal_convertor_pack(opal_convertor_t * pConv,struct iovec * iov,uint32_t * out_size,size_t * max_data)218 int32_t opal_convertor_pack( opal_convertor_t* pConv,
219                              struct iovec* iov, uint32_t* out_size,
220                              size_t* max_data )
221 {
222     OPAL_CONVERTOR_SET_STATUS_BEFORE_PACK_UNPACK( pConv, iov, out_size, max_data );
223 
224     if( OPAL_LIKELY(pConv->flags & CONVERTOR_NO_OP) ) {
225         /**
226          * We are doing conversion on a contiguous datatype on a homogeneous
227          * environment. The convertor contain minimal information, we only
228          * use the bConverted to manage the conversion.
229          */
230         uint32_t i;
231         unsigned char* base_pointer;
232         size_t pending_length = pConv->local_size - pConv->bConverted;
233 
234         *max_data = pending_length;
235         opal_convertor_get_current_pointer( pConv, (void**)&base_pointer );
236 
237         for( i = 0; i < *out_size; i++ ) {
238             if( iov[i].iov_len >= pending_length ) {
239                 goto complete_contiguous_data_pack;
240             }
241             if( OPAL_LIKELY(NULL == iov[i].iov_base) )
242                 iov[i].iov_base = (IOVBASE_TYPE *) base_pointer;
243             else
244 #if OPAL_CUDA_SUPPORT
245                 MEMCPY_CUDA( iov[i].iov_base, base_pointer, iov[i].iov_len, pConv );
246 #else
247                 MEMCPY( iov[i].iov_base, base_pointer, iov[i].iov_len );
248 #endif
249             pending_length -= iov[i].iov_len;
250             base_pointer += iov[i].iov_len;
251         }
252         *max_data -= pending_length;
253         pConv->bConverted += (*max_data);
254         return 0;
255 
256 complete_contiguous_data_pack:
257         iov[i].iov_len = pending_length;
258         if( OPAL_LIKELY(NULL == iov[i].iov_base) )
259             iov[i].iov_base = (IOVBASE_TYPE *) base_pointer;
260         else
261 #if OPAL_CUDA_SUPPORT
262             MEMCPY_CUDA( iov[i].iov_base, base_pointer, iov[i].iov_len, pConv );
263 #else
264             MEMCPY( iov[i].iov_base, base_pointer, iov[i].iov_len );
265 #endif
266         pConv->bConverted = pConv->local_size;
267         *out_size = i + 1;
268         pConv->flags |= CONVERTOR_COMPLETED;
269         return 1;
270     }
271 
272     return pConv->fAdvance( pConv, iov, out_size, max_data );
273 }
274 
275 
/**
 * Unpack data from the caller's iovec array into the user buffer described
 * by the convertor.
 *
 * In the contiguous/homogeneous case handled here, returns 0 when data is
 * still pending after all iovec entries were consumed and 1 when the
 * conversion completed; otherwise the result of the prepared fAdvance
 * routine is returned.
 */
int32_t opal_convertor_unpack( opal_convertor_t* pConv,
                               struct iovec* iov, uint32_t* out_size,
                               size_t* max_data )
{
    unsigned char* dst;
    size_t remaining;
    uint32_t idx;

    OPAL_CONVERTOR_SET_STATUS_BEFORE_PACK_UNPACK( pConv, iov, out_size, max_data );

    /* Anything but the contiguous/homogeneous case goes to the prepared routine. */
    if( OPAL_UNLIKELY(0 == (pConv->flags & CONVERTOR_NO_OP)) ) {
        return pConv->fAdvance( pConv, iov, out_size, max_data );
    }

    /**
     * Contiguous datatype in a homogeneous environment. The convertor holds
     * minimal information: only bConverted tracks the progress.
     */
    remaining = pConv->local_size - pConv->bConverted;
    *max_data = remaining;
    opal_convertor_get_current_pointer( pConv, (void**)&dst );

    for( idx = 0; idx < *out_size; idx++ ) {
        /* This entry is the last one if it covers all the remaining bytes. */
        const int is_last = (iov[idx].iov_len >= remaining);

        if( is_last ) {
            iov[idx].iov_len = remaining;
        }

#if OPAL_CUDA_SUPPORT
        MEMCPY_CUDA( dst, iov[idx].iov_base, iov[idx].iov_len, pConv );
#else
        MEMCPY( dst, iov[idx].iov_base, iov[idx].iov_len );
#endif

        if( is_last ) {
            pConv->bConverted = pConv->local_size;
            *out_size = idx + 1;
            pConv->flags |= CONVERTOR_COMPLETED;
            return 1;
        }

        remaining -= iov[idx].iov_len;
        dst       += iov[idx].iov_len;
    }

    /* Every entry was consumed and there is still data pending. */
    *max_data -= remaining;
    pConv->bConverted += (*max_data);
    return 0;
}
326 
327 static inline int
opal_convertor_create_stack_with_pos_contig(opal_convertor_t * pConvertor,size_t starting_point,const size_t * sizes)328 opal_convertor_create_stack_with_pos_contig( opal_convertor_t* pConvertor,
329                                              size_t starting_point, const size_t* sizes )
330 {
331     dt_stack_t* pStack;   /* pointer to the position on the stack */
332     const opal_datatype_t* pData = pConvertor->pDesc;
333     dt_elem_desc_t* pElems;
334     size_t count;
335     ptrdiff_t extent;
336 
337     pStack = pConvertor->pStack;
338     /**
339      * The prepare function already make the selection on which data representation
340      * we have to use: normal one or the optimized version ?
341      */
342     pElems = pConvertor->use_desc->desc;
343 
344     count = starting_point / pData->size;
345     extent = pData->ub - pData->lb;
346 
347     pStack[0].type     = OPAL_DATATYPE_LOOP;  /* the first one is always the loop */
348     pStack[0].count    = pConvertor->count - count;
349     pStack[0].index    = -1;
350     pStack[0].disp     = count * extent;
351 
352     /* now compute the number of pending bytes */
353     count = starting_point % pData->size;
354     /**
355      * We save the current displacement starting from the begining
356      * of this data.
357      */
358     if( OPAL_LIKELY(0 == count) ) {
359         pStack[1].type     = pElems->elem.common.type;
360         pStack[1].count    = pElems->elem.blocklen;
361     } else {
362         pStack[1].type  = OPAL_DATATYPE_UINT1;
363         pStack[1].count = pData->size - count;
364     }
365     pStack[1].disp  = count;
366     pStack[1].index = 0;  /* useless */
367 
368     pConvertor->bConverted = starting_point;
369     pConvertor->stack_pos = 1;
370     assert( 0 == pConvertor->partial_length );
371     return OPAL_SUCCESS;
372 }
373 
374 static inline int
opal_convertor_create_stack_at_begining(opal_convertor_t * convertor,const size_t * sizes)375 opal_convertor_create_stack_at_begining( opal_convertor_t* convertor,
376                                          const size_t* sizes )
377 {
378     dt_stack_t* pStack = convertor->pStack;
379     dt_elem_desc_t* pElems;
380 
381     /**
382      * The prepare function already make the selection on which data representation
383      * we have to use: normal one or the optimized version ?
384      */
385     pElems = convertor->use_desc->desc;
386 
387     convertor->stack_pos      = 1;
388     convertor->partial_length = 0;
389     convertor->bConverted     = 0;
390     /**
391      * Fill the first position on the stack. This one correspond to the
392      * last fake OPAL_DATATYPE_END_LOOP that we add to the data representation and
393      * allow us to move quickly inside the datatype when we have a count.
394      */
395     pStack[0].index = -1;
396     pStack[0].count = convertor->count;
397     pStack[0].disp  = 0;
398     pStack[0].type  = OPAL_DATATYPE_LOOP;
399 
400     pStack[1].index = 0;
401     pStack[1].disp = 0;
402     if( pElems[0].elem.common.type == OPAL_DATATYPE_LOOP ) {
403         pStack[1].count = pElems[0].loop.loops;
404         pStack[1].type  = OPAL_DATATYPE_LOOP;
405     } else {
406         pStack[1].count = (size_t)pElems[0].elem.count * pElems[0].elem.blocklen;
407         pStack[1].type  = pElems[0].elem.common.type;
408     }
409     return OPAL_SUCCESS;
410 }
411 
412 
/**
 * Move the convertor to the byte offset *position. On return *position holds
 * the offset actually reached (convertor->bConverted), except when the
 * non-contiguous path is asked for offset 0, where it is left untouched.
 *
 * @return the status of the stack creation / positioning routine used
 */
int32_t opal_convertor_set_position_nocheck( opal_convertor_t* convertor,
                                             size_t* position )
{
    int32_t rc;

    /**
     * The contiguous stack builder always positions relative to offset zero,
     * so it needs no special handling.
     */
    if( OPAL_LIKELY(convertor->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS) ) {
        rc = opal_convertor_create_stack_with_pos_contig( convertor, (*position),
                                                          opal_datatype_local_sizes );
        *position = convertor->bConverted;
        return rc;
    }

    /**
     * In all other cases a rollback (or a rewind to zero) first requires
     * resetting the convertor to the beginning.
     */
    if( (0 == (*position)) || ((*position) < convertor->bConverted) ) {
        rc = opal_convertor_create_stack_at_begining( convertor, opal_datatype_local_sizes );
        if( 0 == (*position) ) {
            return rc;
        }
    }
    rc = opal_convertor_generic_simple_position( convertor, position );

    /**
     * A non-contiguous send convertor must not stop in the middle of a
     * predefined datatype, as it could not copy out the left-overs anyway.
     * Force the position back onto a predefined datatype boundary. Partial
     * predefined datatypes are allowed in the contiguous case, so any
     * receiver convertor should accept this.
     */
    if( CONVERTOR_SEND & convertor->flags ) {
        convertor->bConverted -= convertor->partial_length;
        convertor->partial_length = 0;
    }

    *position = convertor->bConverted;
    return rc;
}
448 
449 static size_t
opal_datatype_compute_remote_size(const opal_datatype_t * pData,const size_t * sizes)450 opal_datatype_compute_remote_size( const opal_datatype_t* pData,
451                                    const size_t* sizes )
452 {
453     uint32_t typeMask = pData->bdt_used;
454     size_t length = 0;
455 
456     if (opal_datatype_is_predefined(pData)) {
457         return sizes[pData->desc.desc->elem.common.type];
458     }
459 
460     if( OPAL_UNLIKELY(NULL == pData->ptypes) ) {
461         /* Allocate and fill the array of types used in the datatype description */
462         opal_datatype_compute_ptypes( (opal_datatype_t*)pData );
463     }
464 
465     for( int i = OPAL_DATATYPE_FIRST_TYPE; typeMask && (i < OPAL_DATATYPE_MAX_PREDEFINED); i++ ) {
466         if( typeMask & ((uint32_t)1 << i) ) {
467             length += (pData->ptypes[i] * sizes[i]);
468             typeMask ^= ((uint32_t)1 << i);
469         }
470     }
471     return length;
472 }
473 
474 /**
475  * Compute the remote size. If necessary remove the homogeneous flag
476  * and redirect the convertor description toward the non-optimized
477  * datatype representation.
478  */
opal_convertor_compute_remote_size(opal_convertor_t * pConvertor)479 size_t opal_convertor_compute_remote_size( opal_convertor_t* pConvertor )
480 {
481     opal_datatype_t* datatype = (opal_datatype_t*)pConvertor->pDesc;
482 
483     pConvertor->remote_size = pConvertor->local_size;
484     if( OPAL_UNLIKELY(datatype->bdt_used & pConvertor->master->hetero_mask) ) {
485         pConvertor->flags &= (~CONVERTOR_HOMOGENEOUS);
486         /* Can we use the optimized description? */
487         if (pConvertor->flags & OPAL_DATATYPE_OPTIMIZED_RESTRICTED) {
488             pConvertor->use_desc = &(datatype->desc);
489         }
490         if( 0 == (pConvertor->flags & CONVERTOR_HAS_REMOTE_SIZE) ) {
491             /* This is for a single datatype, we must update it with the count */
492             pConvertor->remote_size = opal_datatype_compute_remote_size(datatype,
493                                                                         pConvertor->master->remote_sizes);
494             pConvertor->remote_size *= pConvertor->count;
495         }
496     }
497     pConvertor->flags |= CONVERTOR_HAS_REMOTE_SIZE;
498     return pConvertor->remote_size;
499 }
500 
/**
 * This macro initializes a convertor for a datatype, a count and a user
 * buffer. The idea is to move the heavy selection of architecture features
 * outside of the prepare functions. The convertor is expected to be clean,
 * either never initialized or already cleaned.
 *
 * NOTE: the macro contains return statements and therefore returns from the
 * ENCLOSING FUNCTION:
 *  - OPAL_SUCCESS when the data is empty or when no further setup is needed,
 *  - OPAL_ERR_OUT_OF_RESOURCE when a larger stack cannot be allocated.
 * When it falls through, the convertor stack has been set at the beginning
 * of the data and the caller still has to select the fAdvance routine.
 */
#define OPAL_CONVERTOR_PREPARE( convertor, datatype, count, pUserBuf )  \
    {                                                                   \
        convertor->local_size = count * datatype->size;                 \
        convertor->pBaseBuf   = (unsigned char*)pUserBuf;               \
        convertor->count      = count;                                  \
        convertor->pDesc      = (opal_datatype_t*)datatype;             \
        convertor->bConverted = 0;                                      \
        convertor->use_desc   = &(datatype->opt_desc);                  \
        /* If the data is empty we just mark the convertor as           \
         * completed. With this flag set the pack and unpack functions  \
         * will not do anything.                                        \
         */                                                             \
        if( OPAL_UNLIKELY((0 == count) || (0 == datatype->size)) ) {    \
            convertor->flags |= (OPAL_DATATYPE_FLAG_NO_GAPS | CONVERTOR_COMPLETED | CONVERTOR_HAS_REMOTE_SIZE); \
            convertor->local_size = convertor->remote_size = 0;         \
            return OPAL_SUCCESS;                                        \
        }                                                               \
                                                                        \
        /* Grab the datatype part of the flags */                       \
        convertor->flags     &= CONVERTOR_TYPE_MASK;                    \
        convertor->flags     |= (CONVERTOR_DATATYPE_MASK & datatype->flags); \
        convertor->flags     |= (CONVERTOR_NO_OP | CONVERTOR_HOMOGENEOUS); \
                                                                        \
        convertor->remote_size = convertor->local_size;                 \
        if( OPAL_LIKELY(convertor->remoteArch == opal_local_arch) ) {   \
            if( !(convertor->flags & CONVERTOR_WITH_CHECKSUM) &&        \
                ((convertor->flags & OPAL_DATATYPE_FLAG_NO_GAPS) ||     \
                 ((convertor->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS) && (1 == count))) ) { \
                return OPAL_SUCCESS;                                    \
            }                                                           \
        }                                                               \
                                                                        \
        assert( (convertor)->pDesc == (datatype) );                     \
        opal_convertor_compute_remote_size( convertor );                \
        assert( NULL != convertor->use_desc->desc );                    \
        /* For predefined datatypes (contiguous) do nothing more */     \
        /* if checksum is enabled then always continue */               \
        if( ((convertor->flags & (CONVERTOR_WITH_CHECKSUM | OPAL_DATATYPE_FLAG_NO_GAPS)) \
             == OPAL_DATATYPE_FLAG_NO_GAPS) &&                          \
            ((convertor->flags & (CONVERTOR_SEND | CONVERTOR_HOMOGENEOUS)) == \
             (CONVERTOR_SEND | CONVERTOR_HOMOGENEOUS)) ) {              \
            return OPAL_SUCCESS;                                        \
        }                                                               \
        convertor->flags &= ~CONVERTOR_NO_OP;                           \
        {                                                               \
            uint32_t required_stack_length = datatype->loops + 1;       \
                                                                        \
            if( required_stack_length > convertor->stack_size ) {       \
                dt_stack_t* new_stack;                                  \
                                                                        \
                assert(convertor->pStack == convertor->static_stack);   \
                new_stack = (dt_stack_t*)malloc(sizeof(dt_stack_t) *    \
                                                required_stack_length); \
                /* Only switch stacks once the allocation succeeded,    \
                 * so a failure leaves pStack/stack_size consistent. */ \
                if( OPAL_UNLIKELY(NULL == new_stack) ) {                \
                    return OPAL_ERR_OUT_OF_RESOURCE;                    \
                }                                                       \
                convertor->stack_size = required_stack_length;          \
                convertor->pStack     = new_stack;                      \
            }                                                           \
        }                                                               \
        opal_convertor_create_stack_at_begining( convertor, opal_datatype_local_sizes ); \
    }
564 
565 
/**
 * Prepare a convertor for receiving (unpacking) count elements of datatype
 * into pUserBuf, and select the matching unpack routine.
 *
 * No check is made that the described data does not overlap.
 *
 * @return OPAL_SUCCESS, possibly returned early from inside
 *         OPAL_CONVERTOR_PREPARE
 */
int32_t opal_convertor_prepare_for_recv( opal_convertor_t* convertor,
                                         const struct opal_datatype_t* datatype,
                                         size_t count,
                                         const void* pUserBuf )
{
    convertor->flags |= CONVERTOR_RECV;
#if OPAL_CUDA_SUPPORT
    if( 0 == (convertor->flags & CONVERTOR_SKIP_CUDA_INIT) ) {
        mca_cuda_convertor_init(convertor, pUserBuf);
    }
#endif

    assert( 0 == (convertor->flags & CONVERTOR_SEND) );
    /* May return from this function when no unpack routine is needed. */
    OPAL_CONVERTOR_PREPARE( convertor, datatype, count, pUserBuf );

    const int heterogeneous = (0 == (convertor->flags & CONVERTOR_HOMOGENEOUS));
    const int contiguous    = (0 != (convertor->pDesc->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS));

#if defined(CHECKSUM)
    if( OPAL_UNLIKELY(convertor->flags & CONVERTOR_WITH_CHECKSUM) ) {
        if( OPAL_UNLIKELY(heterogeneous) ) {
            convertor->fAdvance = opal_unpack_general_checksum;
        } else if( contiguous ) {
            convertor->fAdvance = opal_unpack_homogeneous_contig_checksum;
        } else {
            convertor->fAdvance = opal_generic_simple_unpack_checksum;
        }
        return OPAL_SUCCESS;
    }
#endif  /* defined(CHECKSUM) */

    if( OPAL_UNLIKELY(heterogeneous) ) {
        convertor->fAdvance = opal_unpack_general;
    } else if( contiguous ) {
        convertor->fAdvance = opal_unpack_homogeneous_contig;
    } else {
        convertor->fAdvance = opal_generic_simple_unpack;
    }
    return OPAL_SUCCESS;
}
607 
608 
/**
 * Prepare a convertor for sending (packing) count elements of datatype from
 * pUserBuf, and select the matching pack routine.
 *
 * @return OPAL_SUCCESS, possibly returned early from inside
 *         OPAL_CONVERTOR_PREPARE
 */
int32_t opal_convertor_prepare_for_send( opal_convertor_t* convertor,
                                         const struct opal_datatype_t* datatype,
                                         size_t count,
                                         const void* pUserBuf )
{
    convertor->flags |= CONVERTOR_SEND;
#if OPAL_CUDA_SUPPORT
    if( 0 == (convertor->flags & CONVERTOR_SKIP_CUDA_INIT) ) {
        mca_cuda_convertor_init(convertor, pUserBuf);
    }
#endif

    /* May return from this function when no pack routine is needed. */
    OPAL_CONVERTOR_PREPARE( convertor, datatype, count, pUserBuf );

    /* Send-side conversion requested and the data is not homogeneous. */
    const int convert_on_send =
        (CONVERTOR_SEND_CONVERSION ==
         (convertor->flags & (CONVERTOR_SEND_CONVERSION|CONVERTOR_HOMOGENEOUS)));
    const int contiguous = (0 != (datatype->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS));
    /* Extent equals size, or at most one element: no gaps to skip. */
    const int gapless =
        ((datatype->ub - datatype->lb) == (ptrdiff_t)datatype->size)
        || (1 >= convertor->count);

#if defined(CHECKSUM)
    if( convertor->flags & CONVERTOR_WITH_CHECKSUM ) {
        if( convert_on_send ) {
            convertor->fAdvance = opal_pack_general_checksum;
        } else if( !contiguous ) {
            convertor->fAdvance = opal_generic_simple_pack_checksum;
        } else if( gapless ) {
            convertor->fAdvance = opal_pack_homogeneous_contig_checksum;
        } else {
            convertor->fAdvance = opal_pack_homogeneous_contig_with_gaps_checksum;
        }
        return OPAL_SUCCESS;
    }
#endif  /* defined(CHECKSUM) */

    if( convert_on_send ) {
        convertor->fAdvance = opal_pack_general;
    } else if( !contiguous ) {
        convertor->fAdvance = opal_generic_simple_pack;
    } else if( gapless ) {
        convertor->fAdvance = opal_pack_homogeneous_contig;
    } else {
        convertor->fAdvance = opal_pack_homogeneous_contig_with_gaps;
    }
    return OPAL_SUCCESS;
}
655 
656 /*
657  * These functions can be used in order to create an IDENTICAL copy of one convertor. In this
658  * context IDENTICAL means that the datatype and count and all other properties of the basic
659  * convertor get replicated on this new convertor. However, the references to the datatype
660  * are not increased. This function take special care about the stack. If all the cases the
661  * stack is created with the correct number of entries but if the copy_stack is true (!= 0)
662  * then the content of the old stack is copied on the new one. The result will be a convertor
663  * ready to use starting from the old position. If copy_stack is false then the convertor
664  * is created with a empty stack (you have to use opal_convertor_set_position before using it).
665  */
int opal_convertor_clone( const opal_convertor_t* source,
                          opal_convertor_t* destination,
                          int32_t copy_stack )
{
    dt_stack_t* stack;

    /* Returns OPAL_SUCCESS, or OPAL_ERR_OUT_OF_RESOURCE when the stack cannot be
     * allocated. The stack storage is obtained before anything is written into the
     * destination, so a failed allocation leaves the destination untouched instead
     * of leaving it with a NULL stack that would be dereferenced below or later on.
     */
    if( OPAL_UNLIKELY(source->stack_size > DT_STATIC_STACK_SIZE) ) {
        /* the source stack does not fit in the embedded static stack */
        stack = (dt_stack_t*)malloc( sizeof(dt_stack_t) * source->stack_size );
        if( OPAL_UNLIKELY(NULL == stack) ) {
            return OPAL_ERR_OUT_OF_RESOURCE;
        }
    } else {
        stack = destination->static_stack;
    }

    /* replicate the description of the source; pDesc is copied as a plain pointer */
    destination->remoteArch        = source->remoteArch;
    destination->flags             = source->flags;
    destination->pDesc             = source->pDesc;
    destination->use_desc          = source->use_desc;
    destination->count             = source->count;
    destination->pBaseBuf          = source->pBaseBuf;
    destination->fAdvance          = source->fAdvance;
    destination->master            = source->master;
    destination->local_size        = source->local_size;
    destination->remote_size       = source->remote_size;

    /* attach the stack */
    destination->pStack     = stack;
    destination->stack_size = source->stack_size;

    /* initialize the stack */
    if( OPAL_LIKELY(0 == copy_stack) ) {
        /* no position is inherited: mark both the converted amount and the stack
         * position with -1 */
        destination->bConverted = -1;
        destination->stack_pos  = -1;
    } else {
        /* only the entries in use, [0 .. stack_pos], are copied */
        memcpy( destination->pStack, source->pStack, sizeof(dt_stack_t) * (source->stack_pos+1) );
        destination->bConverted = source->bConverted;
        destination->stack_pos  = source->stack_pos;
    }
#if OPAL_CUDA_SUPPORT
    destination->cbmemcpy   = source->cbmemcpy;
#endif
    return OPAL_SUCCESS;
}
702 
703 
/*
 * Print a description of a convertor through opal_output (stream 0): its main
 * counters and sizes, one keyword per flag that is set, the associated datatype
 * and, when available, the current stack.
 */
void opal_convertor_dump( opal_convertor_t* convertor )
{
    /* general state of the convertor */
    opal_output( 0, "Convertor %p count %" PRIsize_t " stack position %u bConverted %" PRIsize_t "\n"
                 "\tlocal_size %" PRIsize_t " remote_size %" PRIsize_t " flags %X stack_size %u pending_length %" PRIsize_t "\n"
                 "\tremote_arch %u local_arch %u\n",
                 (void*)convertor,
                 convertor->count, convertor->stack_pos, convertor->bConverted,
                 convertor->local_size, convertor->remote_size,
                 convertor->flags, convertor->stack_size, convertor->partial_length,
                 convertor->remoteArch, opal_local_arch );

    /* one keyword for each flag that is set */
    if( convertor->flags & CONVERTOR_RECV ) {
        opal_output( 0, "unpack ");
    }
    if( convertor->flags & CONVERTOR_SEND ) {
        opal_output( 0, "pack ");
    }
    if( convertor->flags & CONVERTOR_SEND_CONVERSION ) {
        opal_output( 0, "conversion ");
    }
    if( convertor->flags & CONVERTOR_HOMOGENEOUS ) {
        opal_output( 0, "homogeneous " );
    } else {
        opal_output( 0, "heterogeneous ");
    }
    if( convertor->flags & CONVERTOR_NO_OP ) {
        opal_output( 0, "no_op ");
    }
    if( convertor->flags & CONVERTOR_WITH_CHECKSUM ) {
        opal_output( 0, "checksum ");
    }
    if( convertor->flags & CONVERTOR_CUDA ) {
        opal_output( 0, "CUDA ");
    }
    if( convertor->flags & CONVERTOR_CUDA_ASYNC ) {
        opal_output( 0, "CUDA Async ");
    }
    if( convertor->flags & CONVERTOR_COMPLETED ) {
        opal_output( 0, "COMPLETED ");
    }

    opal_datatype_dump( convertor->pDesc );

    /* The stack is shown only for a completely initialized convertor: nothing more
     * to print when it sits at position 0 with an index past the description length. */
    if( (0 == convertor->stack_pos) &&
        ((size_t)convertor->pStack[convertor->stack_pos].index > convertor->pDesc->desc.length) ) {
        return;
    }

    opal_output( 0, "Actual stack representation\n" );
    opal_datatype_dump_stack( convertor->pStack, convertor->stack_pos,
                              convertor->pDesc->desc.desc, convertor->pDesc->name );
}
734 
735 
opal_datatype_dump_stack(const dt_stack_t * pStack,int stack_pos,const union dt_elem_desc * pDesc,const char * name)736 void opal_datatype_dump_stack( const dt_stack_t* pStack, int stack_pos,
737                                const union dt_elem_desc* pDesc, const char* name )
738 {
739     opal_output( 0, "\nStack %p stack_pos %d name %s\n", (void*)pStack, stack_pos, name );
740     for( ; stack_pos >= 0; stack_pos-- ) {
741         opal_output( 0, "%d: pos %d count %" PRIsize_t " disp %ld ", stack_pos, pStack[stack_pos].index,
742                      pStack[stack_pos].count, pStack[stack_pos].disp );
743         if( pStack->index != -1 )
744             opal_output( 0, "\t[desc count %lu disp %ld extent %ld]\n",
745                          (unsigned long)pDesc[pStack[stack_pos].index].elem.count,
746                          (long)pDesc[pStack[stack_pos].index].elem.disp,
747                          (long)pDesc[pStack[stack_pos].index].elem.extent );
748         else
749             opal_output( 0, "\n" );
750     }
751     opal_output( 0, "\n" );
752 }
753