1 //------------------------------------------------------------------------------
2 // GB_builder: build a matrix from tuples
3 //------------------------------------------------------------------------------
4 
5 // SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2021, All Rights Reserved.
6 // SPDX-License-Identifier: Apache-2.0
7 
8 //------------------------------------------------------------------------------
9 
10 // CALLED BY: GB_build, GB_Matrix_wait, GB_transpose, GB_concat_hyper
11 // CALLS:     Generated/GB__red_build__* workers
12 
13 // This function is called by GB_build to build a matrix T for GrB_Matrix_build
14 // or GrB_Vector_build, by GB_Matrix_wait to build a matrix T from the list of
15 // pending tuples, and by GB_transpose to transpose a matrix or vector.
16 // Duplicates can appear if called by GB_build or GB_Matrix_wait, but not
17 // GB_transpose.
18 
19 // The indices are provided either as (I_input,J_input) or (I_work,J_work), not
20 // both.  The values are provided as S_input or S_work, not both.  On return,
21 // the *work arrays are either transplanted into T, or freed, since they are
22 // temporary workspaces.
23 
24 // The work is done in 5 major Steps, some of which can be skipped, depending
25 // on how the tuples are provided (*_work or *_input), and whether or not they
26 // are sorted, or have duplicates.  If vdim <= 1, some work is skipped (for
27 // GrB_Vectors, and single-vector GrB_Matrices).  Let e be the # of tuples on
28 // input.  Let p be the # of threads used.
29 
30 // STEP 1: copy user input.  O(e/p) read/write per thread, or skipped.
31 
32 // STEP 2: sort the tuples.  Time: O((e log e)/p), read/write, or skipped if
33 //         the tuples are already sorted.
34 
35 // STEP 3: count vectors and duplicates.  O(e/p) reads, per thread, if no
36 //         duplicates, or skipped if already done.  O(e/p) read/writes
37 //         per thread if duplicates appear.
38 
39 // STEP 4: construct T->h and T->p.  O(e/p) reads per thread, or skipped if
40 //         T is a vector.
41 
42 // STEP 5: assemble the tuples.  O(e/p) read/writes per thread, or O(1) if the
43 //         values can be transplanted into T as-is.
44 
45 // For GrB_Matrix_build:  If the input (I_input, J_input, S_input) is already
46 // sorted with no duplicates, and no typecasting needs to be done, then Step 1
47 // still must be done (each thread does O(e/p) reads of (I_input,J_input) and
48 // writes to I_work), but Step 1 also does the work for Step 3.  Steps 2 and 3
49 // are skipped.  Step 4 does O(e/p) reads per thread (J_input only).  Then
50 // I_work is transplanted into T->i.  Step 5 does O(e/p) read/writes per thread
51 // to copy S into T->x.
52 
53 // For GrB_Vector_build: as GrB_Matrix_build, Step 1 does O(e/p) read/writes
54 // per thread.  The input is always a vector, so vdim == 1 always holds.  Step
55 // 2 is skipped if the indices are already sorted, and Step 3 does no work at
56 // all unless duplicates appear.  Step 4 takes no time, for any vector. Step 5
57 // does O(e/p) reads/writes per thread.
58 
59 // For GB_Matrix_wait:  the pending tuples are provided as I_work, J_work, and
60 // S_work, so Step 1 is skipped (no need to check for invalid indices).  The
61 // input J_work may be null (vdim can be anything, since GB_Matrix_wait is used
62 // for both vectors and matrices).  The tuples might be in sorted order
63 // already, which is precisely known from A->Pending->sorted.  Step 2
64 // does O((e log e)/p) work to sort the tuples.  Duplicates may appear, and
65 // out-of-order tuples are likely.  Step 3 does O(e/p) read/writes.  Step 4
66 // does O(e/p) reads per thread of (I_work,J_work), or just I_work.  Step 5
67 // does O(e/p) read/writes per thread, or O(1) time if S_work can be
68 // transplanted into T->x.
69 
70 // For GB_transpose: uses I_work, J_work, and either S_input (if no op applied
71 // to the values) or S_work (if an op was applied to the A->x values).  This is
72 // only done for matrices, not vectors, so vdim > 1 will always hold.  The
73 // indices are valid so Step 1 is skipped.  The tuples are not sorted, so Step
74 // 2 takes O((e log e)/p) time to do the sort.  There are no duplicates, so
75 // Step 3 only does O(e/p) reads of J_work to count the vectors in each slice.
76 // Step 4 only does O(e/p) reads of J_work to compute T->h and T->p.  Step 5
77 // does O(e/p) read/writes per thread, but it uses the simpler case in
78 // GB_reduce_build_template since no duplicates can appear.  It is unlikely
79 // able to transplant S_work into T->x since the input will almost always be
80 // unsorted.
81 
82 // For GB_concat_hyper:  uses I_work, J_work, and S_work.  No duplicates
83 // appear.  Tuples are not sorted on input.  I_work is transplanted into C->i.
84 // J_work and S_work are freed on output.  S_work is not transplanted into
85 // C->x.
86 
87 // This method always returns T as hypersparse, and has no matrix inputs.  If
88 // the final C should become full or bitmap, that conversion is done by
89 // GB_transplant_conform.
90 
91 #include "GB_build.h"
92 #include "GB_sort.h"
93 #include "GB_binop.h"
94 #ifndef GBCOMPACT
95 #include "GB_red__include.h"
96 #endif
97 
// Tuple accessors: map a tuple position t to one component of its (i,j,k)
// triple, returning -1 for t < 0 (the sentinel position used when looking at
// the "tuple just before the start of a slice"; see GB_J_WORK (tstart-1)).
98 #define GB_I_WORK(t) (((t) < 0) ? -1 : I_work [t])
// GB_J_WORK: vector index j of tuple t.  J_work may be NULL when T has one or
// zero vectors, in which case all tuples lie in vector 0.
99 #define GB_J_WORK(t) (((t) < 0) ? -1 : ((J_work == NULL) ? 0 : J_work [t]))
// GB_K_WORK: original input position k of tuple t.  K_work == NULL implicitly
// means K_work [t] = t (the tuples were never sorted, so positions are
// unchanged).
100 #define GB_K_WORK(t) (((t) < 0) ? -1 : ((K_work == NULL) ? t : K_work [t]))
101 
// GB_FREE_WORK: free all workspace held by GB_builder (Work, I_work, J_work,
// S_work, and K_work).  Used before every error return and at the end of the
// build.  Note that J_work_handle is not set when J_work is a shallow copy of
// J_input, so user input is never freed here.
102 #define GB_FREE_WORK                                \
103 {                                                   \
104     GB_WERK_POP (Work, int64_t) ;                   \
105     GB_FREE (I_work_handle, *I_work_size_handle) ;  \
106     GB_FREE (J_work_handle, *J_work_size_handle) ;  \
107     GB_FREE (S_work_handle, *S_work_size_handle) ;  \
108     GB_FREE_WERK (&K_work, K_work_size) ;           \
109 }
110 
111 //------------------------------------------------------------------------------
112 // GB_builder
113 //------------------------------------------------------------------------------
114 
115 GrB_Info GB_builder                 // build a matrix from tuples
116 (
117     GrB_Matrix T,                   // matrix to build, static or dynamic header
118     const GrB_Type ttype,           // type of output matrix T
119     const int64_t vlen,             // length of each vector of T
120     const int64_t vdim,             // number of vectors in T
121     const bool is_csc,              // true if T is CSC, false if CSR
122     int64_t **I_work_handle,        // for (i,k) or (j,i,k) tuples
123     size_t *I_work_size_handle,
124     int64_t **J_work_handle,        // for (j,i,k) tuples
125     size_t *J_work_size_handle,
126     GB_void **S_work_handle,        // array of values of tuples, size ijslen
127     size_t *S_work_size_handle,
128     bool known_sorted,              // true if tuples known to be sorted
129     bool known_no_duplicates,       // true if tuples known to not have dupl
130     int64_t ijslen,                 // size of I_work and J_work arrays
131     const bool is_matrix,           // true if T a GrB_Matrix, false if vector
132     const int64_t *restrict I_input,// original indices, size nvals
133     const int64_t *restrict J_input,// original indices, size nvals
134     const GB_void *restrict S_input,// array of values of tuples, size nvals
135     const int64_t nvals,            // number of tuples, and size of K_work
136     const GrB_BinaryOp dup,         // binary function to assemble duplicates,
137                                     // if NULL use the SECOND operator to
138                                     // keep the most recent duplicate.
139     const GB_Type_code scode,       // GB_Type_code of S_work or S_input array
140     GB_Context Context
141 )
142 {
143 
144     //--------------------------------------------------------------------------
145     // check inputs
146     //--------------------------------------------------------------------------
147 
148     ASSERT (T != NULL) ;            // T is a static or dynamic header on input
149     ASSERT (nvals >= 0) ;
150     ASSERT (scode <= GB_UDT_code) ;
151     ASSERT_TYPE_OK (ttype, "ttype for builder", GB0) ;
152     ASSERT_BINARYOP_OK_OR_NULL (dup, "dup for builder", GB0) ;
153     ASSERT (I_work_handle != NULL) ;
154     ASSERT (J_work_handle != NULL) ;
155     ASSERT (S_work_handle != NULL) ;
156     ASSERT (!GB_OP_IS_POSITIONAL (dup)) ;
157     ASSERT (I_work_size_handle != NULL) ;
158     ASSERT (J_work_size_handle != NULL) ;
159     ASSERT (S_work_size_handle != NULL) ;
160 
161     //--------------------------------------------------------------------------
162     // get S
163     //--------------------------------------------------------------------------
164 
165     GB_void *restrict S_work = (*S_work_handle) ;
166 
167     const GB_void *restrict S = (S_work == NULL) ? S_input : S_work ;
168     size_t tsize = ttype->size ;
169     size_t ssize = GB_code_size (scode, tsize) ;
170     ASSERT (GB_IMPLIES (nvals > 0, S != NULL)) ;
171 
172     //==========================================================================
173     // symbolic phase of the build =============================================
174     //==========================================================================
175 
176     // The symbolic phase sorts the tuples and finds any duplicates.  The
177     // output matrix T is constructed (not including T->i and T->x), and T->h
178     // and T->p are computed.  Then I_work is transplanted into T->i, or T->i is
179     // allocated.  T->x is then allocated.  It is not computed until the
180     // numeric phase.
181 
182     // When this function returns, I_work is either freed or transplanted into
183     // T->i.  J_work is freed, and the I_work and J_work pointers (in the
184     // caller) are set to NULL by setting their handles to NULL.  Note that
185     // J_work may already be NULL on input, if T has one or zero vectors
186     // (J_work_handle is always non-NULL however).
187 
188     GrB_Info info ;
189     int64_t *restrict I_work = (*I_work_handle) ;
190     int64_t *restrict J_work = (*J_work_handle) ;
191     int64_t *restrict K_work = NULL ; size_t K_work_size = 0 ;
192     ASSERT (*J_work_size_handle == GB_Global_memtable_size (J_work)) ;
193 
194     //--------------------------------------------------------------------------
195     // determine the number of threads to use
196     //--------------------------------------------------------------------------
197 
198     GB_GET_NTHREADS_MAX (nthreads_max, chunk, Context) ;
199     int nthreads = GB_nthreads (nvals, chunk, nthreads_max) ;
200 
201     //--------------------------------------------------------------------------
202     // allocate workspace
203     //--------------------------------------------------------------------------
204 
205     GB_WERK_DECLARE (Work, int64_t) ;
206     GB_WERK_PUSH (Work, 5*(nthreads+1), int64_t) ;
207     if (Work == NULL)
208     {
209         // out of memory
210         GB_FREE_WORK ;
211         return (GrB_OUT_OF_MEMORY) ;
212     }
213 
214     memset (Work, 0, Work_nitems * sizeof (int64_t)) ;
215     int64_t *restrict tstart_slice = Work ;                  // nthreads+1
216     int64_t *restrict tnvec_slice  = Work +   (nthreads+1) ; // nthreads+1
217     int64_t *restrict tnz_slice    = Work + 2*(nthreads+1) ; // nthreads+1
218     int64_t *restrict kbad         = Work + 3*(nthreads+1) ; // nthreads
219     int64_t *restrict ilast_slice  = Work + 4*(nthreads+1) ; // nthreads
220 
221     //--------------------------------------------------------------------------
222     // partition the tuples for the threads
223     //--------------------------------------------------------------------------
224 
225     // Thread tid handles tuples tstart_slice [tid] to tstart_slice [tid+1]-1.
226     // Each thread handles about the same number of tuples.  This partition
227     // depends only on nvals.
228 
229     GB_eslice (tstart_slice, nvals, nthreads) ;
230 
231     // tstart_slice [tid]: first tuple in slice tid
232     // tnvec_slice [tid]: # of vectors that start in a slice.  If a vector
233     //                    starts in one slice and ends in another, it is
234     //                    counted as being in the first slice.
235     // tnz_slice   [tid]: # of entries in a slice after removing duplicates
236 
237     // sentinel values for the final cumulative sum
238     tnvec_slice [nthreads] = 0 ;
239     tnz_slice   [nthreads] = 0 ;
240 
241     // this becomes true if the first pass computes tnvec_slice and tnz_slice,
242     // and if the (I_input,J_input) tuples were found to be already sorted with
243     // no duplicates present.
244     bool tnvec_and_tnz_slice_computed = false ;
245 
246     //--------------------------------------------------------------------------
247     // STEP 1: copy user input and check if valid
248     //--------------------------------------------------------------------------
249 
250     // If the indices are provided by (I_input,J_input), then import them into
251     // (I_work,J_work) and check if they are valid, and sorted.   If the input
252     // happens to be already sorted, then duplicates are detected and the # of
253     // vectors in each slice is counted.
254 
255     if (I_work == NULL)
256     {
257 
258         //----------------------------------------------------------------------
259         // allocate I_work
260         //----------------------------------------------------------------------
261 
262         // allocate workspace to load and sort the index tuples:
263 
264         // vdim <= 1: I_work and K_work for (i,k) tuples, where i = I_input [k]
265 
266         // vdim > 1: also J_work for (j,i,k) tuples where i = I_input [k] and
267         // j = J_input [k].  If the tuples are found to be already sorted on
268         // input, then J_work is not allocated, and J_input is used instead.
269 
270         // The k value in the tuple gives the position in the original set of
271         // tuples: I_input [k] and S [k] when vdim <= 1, and also J_input [k]
272         // for matrices with vdim > 1.
273 
274         // The workspace I_work and J_work are allocated here but freed (or
275         // transplanted) inside GB_builder.  K_work is allocated, used, and
276         // freed in GB_builder.
277 
278         ASSERT (J_work == NULL) ;
279         I_work = GB_MALLOC (nvals, int64_t, I_work_size_handle) ;
280         (*I_work_handle) = I_work ;
281         ijslen = nvals ;
282         if (I_work == NULL)
283         {
284             // out of memory
285             GB_FREE_WORK ;
286             return (GrB_OUT_OF_MEMORY) ;
287         }
288 
289         //----------------------------------------------------------------------
290         // create the tuples to sort, and check for any invalid indices
291         //----------------------------------------------------------------------
292 
293         known_sorted = true ;
294         bool no_duplicates_found = true ;
295 
296         if (nvals == 0)
297         {
298 
299             // nothing to do
300 
301         }
302         else if (is_matrix)
303         {
304 
305             //------------------------------------------------------------------
306             // C is a matrix; check both I_input and J_input
307             //------------------------------------------------------------------
308 
309             ASSERT (J_input != NULL) ;
310             ASSERT (I_work != NULL) ;
311             ASSERT (vdim >= 0) ;
312             ASSERT (I_input != NULL) ;
313 
314             int tid ;
315             #pragma omp parallel for num_threads(nthreads) schedule(static) \
316                 reduction(&&:known_sorted) reduction(&&:no_duplicates_found)
317             for (tid = 0 ; tid < nthreads ; tid++)
318             {
319 
320                 kbad [tid] = -1 ;
321                 int64_t my_tnvec = 0 ;
322                 int64_t kstart   = tstart_slice [tid] ;
323                 int64_t kend     = tstart_slice [tid+1] ;
324                 int64_t ilast = (kstart == 0) ? -1 : I_input [kstart-1] ;
325                 int64_t jlast = (kstart == 0) ? -1 : J_input [kstart-1] ;
326 
327                 for (int64_t k = kstart ; k < kend ; k++)
328                 {
329                     // get k-th index from user input: (i,j)
330                     int64_t i = I_input [k] ;
331                     int64_t j = J_input [k] ;
332 
333                     if (i < 0 || i >= vlen || j < 0 || j >= vdim)
334                     {
335                         // halt if out of bounds
336                         kbad [tid] = k ;
337                         break ;
338                     }
339 
340                     // check if the tuples are already sorted
341                     known_sorted = known_sorted &&
342                         ((jlast < j) || (jlast == j && ilast <= i)) ;
343 
344                     // check if this entry is a duplicate of the one before it
345                     no_duplicates_found = no_duplicates_found &&
346                         (!(jlast == j && ilast == i)) ;
347 
348                     // copy the tuple into I_work.  J_work is done later.
349                     I_work [k] = i ;
350 
351                     if (j > jlast)
352                     {
353                         // vector j starts in this slice (but this is
354                         // valid only if J_input is sorted on input)
355                         my_tnvec++ ;
356                     }
357 
358                     // log the last index seen
359                     ilast = i ; jlast = j ;
360                 }
361 
362                 // these are valid only if I_input and J_input are sorted on
363                 // input, with no duplicates present.
364                 tnvec_slice [tid] = my_tnvec ;
365                 tnz_slice   [tid] = kend - kstart ;
366 
367             }
368 
369             // collect the report from each thread
370             for (int tid = 0 ; tid < nthreads ; tid++)
371             {
372                 if (kbad [tid] >= 0)
373                 {
374                     // invalid index
375                     int64_t i = I_input [kbad [tid]] ;
376                     int64_t j = J_input [kbad [tid]] ;
377                     int64_t row = is_csc ? i : j ;
378                     int64_t col = is_csc ? j : i ;
379                     int64_t nrows = is_csc ? vlen : vdim ;
380                     int64_t ncols = is_csc ? vdim : vlen ;
381                     GB_FREE_WORK ;
382                     GB_ERROR (GrB_INDEX_OUT_OF_BOUNDS,
383                         "index (" GBd "," GBd ") out of bounds,"
384                         " must be < (" GBd ", " GBd ")",
385                         row, col, nrows, ncols) ;
386                 }
387             }
388 
389             // if the tuples were found to be already in sorted order, and if
390             // no duplicates were found, then tnvec_slice and tnz_slice are now
391             // valid.  Otherwise, they can only be computed after sorting.
392             tnvec_and_tnz_slice_computed = known_sorted && no_duplicates_found ;
393 
394             //------------------------------------------------------------------
395             // allocate J_work, if needed
396             //------------------------------------------------------------------
397 
398             if (vdim > 1 && !known_sorted)
399             {
400                 // copy J_input into J_work, so the tuples can be sorted
401                 J_work = GB_MALLOC (nvals, int64_t, J_work_size_handle) ;
402                 (*J_work_handle) = J_work ;
403                 if (J_work == NULL)
404                 {
405                     // out of memory
406                     GB_FREE_WORK ;
407                     return (GrB_OUT_OF_MEMORY) ;
408                 }
409                 GB_memcpy (J_work, J_input, nvals * sizeof (int64_t), nthreads);
410             }
411             else
412             {
413                 // J_work is a shallow copy of J_input.  The pointer is not
414                 // copied into (*J_work_handle), so it will not be freed.
415                 // J_input is not modified, even though it is typecast to the
416                 // int64_t *J_work, since J_work is not modified in this case.
417                 J_work = (int64_t *) J_input ;
418             }
419 
420         }
421         else
422         {
423 
424             //------------------------------------------------------------------
425             // C is a typecasted GrB_Vector; check only I_input
426             //------------------------------------------------------------------
427 
428             ASSERT (I_input != NULL) ;
429             ASSERT (J_input == NULL) ;
430             ASSERT (vdim == 1) ;
431 
432             int tid ;
433             #pragma omp parallel for num_threads(nthreads) schedule(static) \
434                 reduction(&&:known_sorted) reduction(&&:no_duplicates_found)
435             for (tid = 0 ; tid < nthreads ; tid++)
436             {
437 
438                 kbad [tid] = -1 ;
439                 int64_t kstart   = tstart_slice [tid] ;
440                 int64_t kend     = tstart_slice [tid+1] ;
441                 int64_t ilast = (kstart == 0) ? -1 : I_input [kstart-1] ;
442 
443                 for (int64_t k = kstart ; k < kend ; k++)
444                 {
445                     // get k-th index from user input: (i)
446                     int64_t i = I_input [k] ;
447 
448                     if (i < 0 || i >= vlen)
449                     {
450                         // halt if out of bounds
451                         kbad [tid] = k ;
452                         break ;
453                     }
454 
455                     // check if the tuples are already sorted
456                     known_sorted = known_sorted && (ilast <= i) ;
457 
458                     // check if this entry is a duplicate of the one before it
459                     no_duplicates_found = no_duplicates_found &&
460                         (!(ilast == i)) ;
461 
462                     // copy the tuple into the work arrays to be sorted
463                     I_work [k] = i ;
464 
465                     // log the last index seen
466                     ilast = i ;
467                 }
468             }
469 
470             // collect the report from each thread
471             for (int tid = 0 ; tid < nthreads ; tid++)
472             {
473                 if (kbad [tid] >= 0)
474                 {
475                     // invalid index
476                     int64_t i = I_input [kbad [tid]] ;
477                     GB_FREE_WORK ;
478                     GB_ERROR (GrB_INDEX_OUT_OF_BOUNDS,
479                         "index (" GBd ") out of bounds, must be < (" GBd ")",
480                         i, vlen) ;
481                 }
482             }
483         }
484 
485         //----------------------------------------------------------------------
486         // determine if duplicates are possible
487         //----------------------------------------------------------------------
488 
489         // The input is now known to be sorted, or not.  If it is sorted, and
490         // if no duplicates were found, then it is known to have no duplicates.
491         // Otherwise, duplicates might appear, but a sort is required first to
492         // check for duplicates.
493 
494         known_no_duplicates = known_sorted && no_duplicates_found ;
495     }
496 
497     //--------------------------------------------------------------------------
498     // STEP 2: sort the tuples in ascending order
499     //--------------------------------------------------------------------------
500 
501     // If the tuples are known to already be sorted, Step 2 is skipped.  In
502     // that case, K_work is NULL (not allocated), which implicitly means that
503     // K_work [k] = k for all k = 0:nvals-1.
504 
505     if (!known_sorted)
506     {
507 
508         // create the k part of each tuple
509         K_work = GB_MALLOC_WERK (nvals, int64_t, &K_work_size) ;
510         if (K_work == NULL)
511         {
512             // out of memory
513             GB_FREE_WORK ;
514             return (GrB_OUT_OF_MEMORY) ;
515         }
516 
517         // The k part of each tuple (i,k) or (j,i,k) records the original
518         // position of the tuple in the input list.  This allows an unstable
519         // sorting algorithm to be used.  Since k is unique, it forces the
520         // result of the sort to be stable regardless of whether or not the
521         // sorting algorithm is stable.  It also keeps track of where the
522         // numerical value of the tuple can be found; it is in S[k] for the
523         // tuple (i,k) or (j,i,k), regardless of where the tuple appears in the
524         // list after it is sorted.
525         int64_t k ;
526         #pragma omp parallel for num_threads(nthreads) schedule(static)
527         for (k = 0 ; k < nvals ; k++)
528         {
529             K_work [k] = k ;
530         }
531 
532         // sort all the tuples
533         if (vdim > 1)
534         {
535 
536             // sort a set of (j,i,k) tuples
537             info = GB_msort_3b (J_work, I_work, K_work, nvals, nthreads) ;
538 
539             #ifdef GB_DEBUG
540             if (info == GrB_SUCCESS)
541             {
542                 int64_t ilast = -1 ;
543                 int64_t jlast = -1 ;
544                 for (int64_t k = 0 ; k < nvals ; k++)
545                 {
546                     int64_t i = I_work [k] ;
547                     int64_t j = J_work [k] ;
548                     ASSERT ((jlast < j) || (jlast == j && ilast <= i)) ;
549                     ilast = i ;
550                     jlast = j ;
551                 }
552             }
553             #endif
554 
555         }
556         else
557         {
558             // sort a set of (i,k) tuples
559             info = GB_msort_2b (I_work, K_work, nvals, nthreads) ;
560 
561             #ifdef GB_DEBUG
562             if (info == GrB_SUCCESS)
563             {
564                 int64_t ilast = -1 ;
565                 for (int64_t k = 0 ; k < nvals ; k++)
566                 {
567                     int64_t i = I_work [k] ;
568                     ASSERT (ilast <= i) ;
569                     ilast = i ;
570                 }
571             }
572             #endif
573 
574         }
575 
576         if (info != GrB_SUCCESS)
577         {
578             // out of memory in GB_msort_*
579             GB_FREE_WORK ;
580             return (GrB_OUT_OF_MEMORY) ;
581         }
582     }
583 
584     //--------------------------------------------------------------------------
585     // STEP 3: count vectors and duplicates in each slice
586     //--------------------------------------------------------------------------
587 
588     // Duplicates are located, counted and their indices negated.  The # of
589     // vectors in each slice is counted.  If the indices are known to not have
590     // duplicates, then only the vectors are counted.  Counting the # of
591     // vectors is skipped if already done by Step 1.
592 
593     if (known_no_duplicates)
594     {
595 
596         //----------------------------------------------------------------------
597         // no duplicates: just count # vectors in each slice
598         //----------------------------------------------------------------------
599 
600         // This is much faster, particularly if the # of vectors in each slice
601         // has already been computed.
602 
603         #ifdef GB_DEBUG
604         {
605             // assert that there are no duplicates
606             int64_t ilast = -1, jlast = -1 ;
607             for (int64_t t = 0 ; t < nvals ; t++)
608             {
609                 int64_t i = GB_I_WORK (t), j = GB_J_WORK (t) ;
610                 bool is_duplicate = (i == ilast && j == jlast) ;
611                 ASSERT (!is_duplicate) ;
612                 ilast = i ; jlast = j ;
613             }
614         }
615         #endif
616 
617         if (vdim <= 1)
618         {
619 
620             // all tuples appear in at most one vector, and there are no
621             // duplicates, so there is no need to scan I_work or J_work.
622 
623             for (int tid = 0 ; tid < nthreads ; tid++)
624             {
625                 int64_t tstart = tstart_slice [tid] ;
626                 int64_t tend   = tstart_slice [tid+1] ;
627                 tnvec_slice [tid] = 0 ;
628                 tnz_slice   [tid] = tend - tstart ;
629             }
630             tnvec_slice [0] = (nvals == 0) ? 0 : 1 ;
631 
632         }
633         else
634         {
635 
636             // count the # of unique vector indices in J_work.  No need to scan
637             // I_work since there are no duplicates to be found.  Also no need
638             // to compute them if already found in Step 1.
639 
640             if (!tnvec_and_tnz_slice_computed)
641             {
642 
643                 int tid ;
644                 #pragma omp parallel for num_threads(nthreads) schedule(static)
645                 for (tid = 0 ; tid < nthreads ; tid++)
646                 {
647                     int64_t my_tnvec = 0 ;
648                     int64_t tstart = tstart_slice [tid] ;
649                     int64_t tend   = tstart_slice [tid+1] ;
650                     int64_t jlast  = GB_J_WORK (tstart-1) ;
651 
652                     for (int64_t t = tstart ; t < tend ; t++)
653                     {
654                         // get the t-th tuple
655                         int64_t j = J_work [t] ;
656                         if (j > jlast)
657                         {
658                             // vector j starts in this slice
659                             my_tnvec++ ;
660                             jlast = j ;
661                         }
662                     }
663 
664                     tnvec_slice [tid] = my_tnvec ;
665                     tnz_slice   [tid] = tend - tstart ;
666                 }
667             }
668         }
669 
670     }
    else
    {

        //----------------------------------------------------------------------
        // look for duplicates and count # vectors in each slice
        //----------------------------------------------------------------------

        // Record the i index of the tuple just prior to each slice, so that
        // each thread can detect a duplicate that spans a slice boundary.
        for (int tid = 0 ; tid < nthreads ; tid++)
        {
            int64_t tstart = tstart_slice [tid] ;
            ilast_slice [tid] = GB_I_WORK (tstart-1) ;
        }

        int tid ;
        #pragma omp parallel for num_threads(nthreads) schedule(static)
        for (tid = 0 ; tid < nthreads ; tid++)
        {

            int64_t my_tnvec = 0 ;      // # vectors that start in this slice
            int64_t my_ndupl = 0 ;      // # duplicates found in this slice
            int64_t tstart   = tstart_slice [tid] ;
            int64_t tend     = tstart_slice [tid+1] ;
            int64_t ilast    = ilast_slice [tid] ;
            int64_t jlast    = GB_J_WORK (tstart-1) ;

            for (int64_t t = tstart ; t < tend ; t++)
            {
                // get the t-th tuple
                int64_t i = I_work [t] ;
                int64_t j = GB_J_WORK (t) ;

                // tuples are now sorted but there may be duplicates
                ASSERT ((jlast < j) || (jlast == j && ilast <= i)) ;

                // check if (j,i,k) is a duplicate
                if (i == ilast && j == jlast)
                {
                    // flag the tuple as a duplicate
                    I_work [t] = -1 ;
                    my_ndupl++ ;
                    // the sort places earlier duplicate tuples (with smaller
                    // k) before later ones (with larger k).
                    ASSERT (GB_K_WORK (t-1) < GB_K_WORK (t)) ;
                }
                else
                {
                    // this is a new tuple
                    if (j > jlast)
                    {
                        // vector j starts in this slice
                        my_tnvec++ ;
                        jlast = j ;
                    }
                    ilast = i ;
                }
            }
            tnvec_slice [tid] = my_tnvec ;
            // # of unique entries in this slice, excluding its duplicates
            tnz_slice   [tid] = (tend - tstart) - my_ndupl ;
        }
    }
731 
732     //--------------------------------------------------------------------------
733     // find total # of vectors and duplicates in all tuples
734     //--------------------------------------------------------------------------
735 
736     // Replace tnvec_slice with its cumulative sum, after which each slice tid
737     // will be responsible for the # vectors in T that range from tnvec_slice
738     // [tid] to tnvec_slice [tid+1]-1.
739     GB_cumsum (tnvec_slice, nthreads, NULL, 1, NULL) ;
740     int64_t tnvec = tnvec_slice [nthreads] ;
741 
742     // Replace tnz_slice with its cumulative sum
743     GB_cumsum (tnz_slice, nthreads, NULL, 1, NULL) ;
744 
745     // find the total # of final entries, after assembling duplicates
746     int64_t tnz = tnz_slice [nthreads] ;
747     int64_t ndupl = nvals - tnz ;
748 
749     //--------------------------------------------------------------------------
750     // allocate T; always hypersparse
751     //--------------------------------------------------------------------------
752 
753     // allocate T; allocate T->p and T->h but do not initialize them.
754     // T is always hypersparse.  The header T always exists on input, as
755     // either a static or dynamic header.
756     bool static_header = T->static_header ;
757     info = GB_new (&T, static_header, // always hyper, static or dynamic header
758         ttype, vlen, vdim, GB_Ap_malloc, is_csc,
759         GxB_HYPERSPARSE, GB_ALWAYS_HYPER, tnvec, Context) ;
760     if (info != GrB_SUCCESS)
761     {
762         // out of memory
763         GB_FREE_WORK ;
764         return (info) ;
765     }
766 
767     ASSERT (T->p != NULL) ;
768     ASSERT (T->h != NULL) ;
769     ASSERT (T->nzmax == 0) ;        // T->i and T->x not yet allocated
770     ASSERT (T->b == NULL) ;
771     ASSERT (T->i == NULL) ;
772     ASSERT (T->x == NULL) ;
773 
774     //--------------------------------------------------------------------------
775     // STEP 4: construct the vector pointers and hyperlist for T
776     //--------------------------------------------------------------------------
777 
778     // Step 4 scans the J_work indices and constructs T->h and T->p.
779 
780     int64_t *restrict Th = T->h ;
781     int64_t *restrict Tp = T->p ;
782 
783     if (vdim <= 1)
784     {
785 
786         //----------------------------------------------------------------------
787         // special case for vectors
788         //----------------------------------------------------------------------
789 
790         ASSERT (tnvec == 0 || tnvec == 1) ;
791         if (tnvec > 0)
792         {
793             Th [0] = 0 ;
794             Tp [0] = 0 ;
795         }
796 
797     }
798     else if (ndupl == 0)
799     {
800 
801         //----------------------------------------------------------------------
802         // no duplicates appear
803         //----------------------------------------------------------------------
804 
805         int tid ;
806         #pragma omp parallel for num_threads(nthreads) schedule(static)
807         for (tid = 0 ; tid < nthreads ; tid++)
808         {
809 
810             int64_t my_tnvec = tnvec_slice [tid] ;
811             int64_t tstart   = tstart_slice [tid] ;
812             int64_t tend     = tstart_slice [tid+1] ;
813             int64_t jlast    = GB_J_WORK (tstart-1) ;
814 
815             for (int64_t t = tstart ; t < tend ; t++)
816             {
817                 // get the t-th tuple
818                 int64_t j = GB_J_WORK (t) ;
819                 if (j > jlast)
820                 {
821                     // vector j starts in this slice
822                     Th [my_tnvec] = j ;
823                     Tp [my_tnvec] = t ;
824                     my_tnvec++ ;
825                     jlast = j ;
826                 }
827             }
828         }
829 
830     }
831     else
832     {
833 
834         //----------------------------------------------------------------------
835         // it is known that at least one duplicate appears
836         //----------------------------------------------------------------------
837 
838         int tid ;
839         #pragma omp parallel for num_threads(nthreads) schedule(static)
840         for (tid = 0 ; tid < nthreads ; tid++)
841         {
842 
843             int64_t my_tnz   = tnz_slice [tid] ;
844             int64_t my_tnvec = tnvec_slice [tid] ;
845             int64_t tstart   = tstart_slice [tid] ;
846             int64_t tend     = tstart_slice [tid+1] ;
847             int64_t jlast    = GB_J_WORK (tstart-1) ;
848 
849             for (int64_t t = tstart ; t < tend ; t++)
850             {
851                 // get the t-th tuple
852                 int64_t i = I_work [t] ;
853                 int64_t j = GB_J_WORK (t) ;
854                 if (i >= 0)
855                 {
856                     // this is a new tuple
857                     if (j > jlast)
858                     {
859                         // vector j starts in this slice
860                         Th [my_tnvec] = j ;
861                         Tp [my_tnvec] = my_tnz ;
862                         my_tnvec++ ;
863                         jlast = j ;
864                     }
865                     my_tnz++ ;
866                 }
867             }
868         }
869     }
870 
871     // log the end of the last vector
872     T->nvec_nonempty = tnvec ;
873     T->nvec = tnvec ;
874     Tp [tnvec] = tnz ;
875     ASSERT (T->nvec == T->plen) ;
876     T->magic = GB_MAGIC ;
877 
878     //--------------------------------------------------------------------------
879     // free J_work if it exists
880     //--------------------------------------------------------------------------
881 
882     ASSERT (J_work_handle != NULL) ;
883     GB_FREE (J_work_handle, *J_work_size_handle) ;
884     J_work = NULL ;
885 
886     //--------------------------------------------------------------------------
887     // allocate T->i
888     //--------------------------------------------------------------------------
889 
890     T->nzmax = GB_IMAX (tnz, 1) ;
891 
892     if (ndupl == 0)
893     {
894         // shrink I_work from size ijslen to size T->nzmax
895         if (T->nzmax < ijslen)
896         {
897             // this cannot fail since the size is shrinking.
898             bool ok ;
899             GB_REALLOC (I_work, T->nzmax, ijslen, int64_t, I_work_size_handle,
900                 &ok, Context) ;
901             ASSERT (ok) ;
902         }
903         // transplant I_work into T->i
904         T->i = I_work ; T->i_size = (*I_work_size_handle) ;
905         I_work = NULL ;
906         (*I_work_handle) = NULL ;
907         (*I_work_size_handle) = 0 ;
908     }
909     else
910     {
911         // duplicates exist, so allocate a new T->i.  I_work must be freed later
912         T->i = GB_MALLOC (T->nzmax, int64_t, &(T->i_size)) ;
913         if (T->i == NULL)
914         {
915             // out of memory
916             GB_phbix_free (T) ;
917             GB_FREE_WORK ;
918             return (GrB_OUT_OF_MEMORY) ;
919         }
920     }
921 
922     int64_t *restrict Ti = T->i ;
923 
924     //==========================================================================
925     // numerical phase of the build: assemble any duplicates
926     //==========================================================================
927 
928     // The tuples have been sorted.  Assemble any duplicates with a switch
929     // factory of built-in workers, or four generic workers.  The vector
930     // pointers T->p and hyperlist T->h (if hypersparse) have already been
931     // computed.
932 
933     // If there are no duplicates, T->i holds the row indices of the tuple.
934     // Otherwise, the row indices are still in I_work.  K_work holds the
935     // positions of each tuple in the array S.  The tuples are sorted so that
936     // duplicates are adjacent to each other and they appear in the order they
937     // appeared in the original tuples.  This method assembles the duplicates
938     // and computes T->i and T->x from I_work, K_work, and S.  into T, becoming
939     // T->i.  If no duplicates appear, T->i is already computed, and S just
940     // needs to be copied and permuted into T->x.
941 
942     // The (i,k,S[k]) tuples are held in two integer arrays: (1) I_work or T->i,
943     // and (2) K_work, and an array S of numerical values.  S has not been
944     // sorted, nor even accessed yet.  It is identical to the original unsorted
945     // tuples.  The (i,k,S[k]) tuple holds the row index i, the position k, and
946     // the value S [k].  This entry becomes T(i,j) = S [k] in the matrix T, and
947     // duplicates (if any) are assembled via the dup operator.
948 
949     //--------------------------------------------------------------------------
950     // get opcodes and check types
951     //--------------------------------------------------------------------------
952 
953     // With GB_build, there can be 1 to 2 different types.
954     //      T->type is identical to the types of x,y,z for z=dup(x,y).
955     //      dup is never NULL and all its three types are the same
956     //      The type of S (scode) can different but must be compatible
957     //          with T->type
958 
959     // With GB_Matrix_wait, there can be 1 to 5 different types:
960     //      The pending tuples are in S, of type scode which must be
961     //          compatible with dup->ytype and T->type
962     //      z = dup (x,y): can be NULL or have 1 to 3 different types
963     //      T->type: must be compatible with all above types.
964     //      dup may be NULL, in which case it is assumed be the implicit SECOND
965     //          operator, with all three types equal to T->type
966 
967     GrB_Type xtype, ytype, ztype ;
968     GxB_binary_function fdup ;
969     #ifndef GBCOMPACT
970     GB_Opcode opcode ;
971     #endif
972 
973     GB_Type_code tcode = ttype->code ;
974     bool op_2nd ;
975 
976     ASSERT_TYPE_OK (ttype, "ttype for build_factory", GB0) ;
977 
978     if (dup == NULL)
979     {
980 
981         //----------------------------------------------------------------------
982         // dup is the implicit SECOND operator
983         //----------------------------------------------------------------------
984 
985         // z = SECOND (x,y) where all three types are the same as ttype
986         // T(i,j) = (ttype) S(k) will be done for all tuples.
987 
988         #ifndef GBCOMPACT
989         opcode = GB_SECOND_opcode ;
990         #endif
991         ASSERT (GB_op_is_second (dup, ttype)) ;
992         xtype = ttype ;
993         ytype = ttype ;
994         ztype = ttype ;
995         fdup = NULL ;
996         op_2nd = true ;
997 
998     }
999     else
1000     {
1001 
1002         //----------------------------------------------------------------------
1003         // dup is an explicit operator
1004         //----------------------------------------------------------------------
1005 
1006         // T(i,j) = (ttype) S[k] will be done for the first tuple.
1007         // for subsequent tuples: T(i,j) += S[k], via the dup operator and
1008         // typecasting:
1009         //
1010         //      y = (dup->ytype) S[k]
1011         //      x = (dup->xtype) T(i,j)
1012         //      z = (dup->ztype) dup (x,y)
1013         //      T(i,j) = (ttype) z
1014 
1015         ASSERT_BINARYOP_OK (dup, "dup for build_factory", GB0) ;
1016         #ifndef GBCOMPACT
1017         opcode = dup->opcode ;
1018         #endif
1019         xtype = dup->xtype ;
1020         ytype = dup->ytype ;
1021         ztype = dup->ztype ;
1022         fdup = dup->function ;
1023         op_2nd = GB_op_is_second (dup, ttype) ;
1024     }
1025 
1026     //--------------------------------------------------------------------------
1027     // get the sizes and codes of each type
1028     //--------------------------------------------------------------------------
1029 
1030     GB_Type_code zcode = ztype->code ;
1031     GB_Type_code xcode = xtype->code ;
1032     GB_Type_code ycode = ytype->code ;
1033 
1034     ASSERT (GB_code_compatible (tcode, scode)) ;    // T(i,j) = (ttype) S
1035     ASSERT (GB_code_compatible (ycode, scode)) ;    // y = (ytype) S
1036     ASSERT (GB_Type_compatible (xtype, ttype)) ;    // x = (xtype) T(i,j)
1037     ASSERT (GB_Type_compatible (ttype, ztype)) ;    // T(i,j) = (ttype) z
1038 
1039     size_t zsize = ztype->size ;
1040     size_t xsize = xtype->size ;
1041     size_t ysize = ytype->size ;
1042 
1043     // no typecasting if all 5 types are the same
1044     bool nocasting = (tcode == scode) &&
1045         (ttype == xtype) && (ttype == ytype) && (ttype == ztype) ;
1046 
1047     //--------------------------------------------------------------------------
1048     // STEP 5: assemble the tuples
1049     //--------------------------------------------------------------------------
1050 
1051     bool copy_S_into_T = (nocasting && known_sorted && ndupl == 0) ;
1052 
1053     if (copy_S_into_T && S_work != NULL)
1054     {
1055 
1056         //----------------------------------------------------------------------
1057         // transplant S_work into T->x
1058         //----------------------------------------------------------------------
1059 
1060         // No typecasting is needed, the tuples were originally in sorted
1061         // order, and no duplicates appear.  All that is required is to copy S
1062         // into Tx.  S can be directly transplanted into T->x since S is
1063         // provided as S_work.  GB_builder must either transplant or free
1064         // S_work.  The transplant can be used by GB_Matrix_wait, whenever the
1065         // tuples are already sorted, with no duplicates, and no typecasting is
1066         // needed, since S_work is always A->Pending->x.  This transplant can
1067         // rarely be used for GB_transpose, in the case when op is NULL and the
1068         // transposed tuples happen to be sorted (which is unlikely).
1069 
1070         T->x = S_work ; T->x_size = (*S_work_size_handle) ;
1071         S_work = NULL ;
1072         (*S_work_handle) = NULL ;
1073         (*S_work_size_handle) = 0 ;
1074 
1075     }
1076     else
1077     {
1078 
1079         //----------------------------------------------------------------------
1080         // allocate T->x
1081         //----------------------------------------------------------------------
1082 
1083         T->x = GB_MALLOC (T->nzmax * ttype->size, GB_void, &(T->x_size)) ;
1084         if (T->x == NULL)
1085         {
1086             // out of memory
1087             GB_phbix_free (T) ;
1088             GB_FREE_WORK ;
1089             return (GrB_OUT_OF_MEMORY) ;
1090         }
1091 
1092         GB_void *restrict Tx = (GB_void *) T->x ;
1093 
1094         ASSERT (GB_IMPLIES (nvals > 0, S != NULL)) ;
1095 
1096         if (nvals == 0)
1097         {
1098 
1099             // nothing to do
1100 
1101         }
1102         else if (copy_S_into_T)
1103         {
1104 
1105             //------------------------------------------------------------------
1106             // copy S into T->x
1107             //------------------------------------------------------------------
1108 
1109             // No typecasting is needed, the tuples were originally in sorted
1110             // order, and no duplicates appear.  All that is required is to
1111             // copy S into Tx.  S cannot be transplanted into T->x since
1112             // S_work is NULL and S_input cannot be modified by GB_builder.
1113 
1114             GBURBLE ("(build:memcpy) ") ;
1115             ASSERT (S_work == NULL) ;
1116             ASSERT (S == S_input) ;
1117             GB_memcpy (Tx, S, nvals * tsize, nthreads) ;
1118 
1119         }
1120         else if (nocasting)
1121         {
1122 
1123             //------------------------------------------------------------------
1124             // assemble the values, S, into T, no typecasting needed
1125             //------------------------------------------------------------------
1126 
1127             // S (either S_work or S_input) must be permuted and copied into
1128             // T->x, since the tuples had to be sorted, or duplicates appear.
1129             // Any duplicates are now assembled.
1130 
1131             // There are 44 common cases of this function for built-in types
1132             // and 8 associative operators: MIN, MAX, PLUS, TIMES for 10 types
1133             // (all but boolean; and OR, AND, XOR, and EQ for boolean.
1134 
1135             // In addition, the FIRST and SECOND operators are hard-coded, for
1136             // another 22 workers, since SECOND is used by GB_Matrix_wait and
1137             // since FIRST is useful for keeping the first tuple seen.  It is
1138             // controlled by the GB_INCLUDE_SECOND_OPERATOR definition, so they
1139             // do not appear in GB_reduce_to_* where the FIRST and SECOND
1140             // operators are not needed.
1141 
1142             // Early exit cannot be exploited, so the terminal is ignored.
1143 
1144             GBURBLE ("(build:assemble) ") ;
1145             bool done = false ;
1146 
1147             #ifndef GBCOMPACT
1148 
1149                 //--------------------------------------------------------------
1150                 // define the worker for the switch factory
1151                 //--------------------------------------------------------------
1152 
1153                 #define GB_INCLUDE_SECOND_OPERATOR
1154 
1155                 #define GB_red(opname,aname) \
1156                     GB (_red_build_ ## opname ## aname)
1157 
1158                 #define GB_RED_WORKER(opname,aname,atype)                   \
1159                 {                                                           \
1160                     info = GB_red (opname, aname) ((atype *) Tx, Ti,        \
1161                         (atype *) S, nvals, ndupl, I_work, K_work,          \
1162                         tstart_slice, tnz_slice, nthreads) ;                \
1163                     done = (info != GrB_NO_VALUE) ;                         \
1164                 }                                                           \
1165                 break ;
1166 
1167                 //--------------------------------------------------------------
1168                 // launch the switch factory
1169                 //--------------------------------------------------------------
1170 
1171                 // controlled by opcode and typecode
1172                 GB_Type_code typecode = tcode ;
1173                 #include "GB_red_factory.c"
1174 
1175             #endif
1176 
1177             //------------------------------------------------------------------
1178             // generic worker
1179             //------------------------------------------------------------------
1180 
1181             if (!done)
1182             {
1183                 GB_BURBLE_N (nvals, "(generic) ") ;
1184 
1185                 //--------------------------------------------------------------
1186                 // no typecasting, but use the fdup function pointer and memcpy
1187                 //--------------------------------------------------------------
1188 
1189                 // Either the fdup operator or type of S and T are
1190                 // user-defined, or fdup is not an associative operator handled
1191                 // by the GB_red_factory, or some combination of these
1192                 // conditions.  User-defined types cannot be typecasted, so
1193                 // this handles all user-defined types.
1194 
1195                 // Tx [p] = (ttype) S [k], but with no typecasting
1196                 #define GB_CAST_ARRAY_TO_ARRAY(Tx,p,S,k)                \
1197                     memcpy (Tx +((p)*tsize), S +((k)*tsize), tsize) ;
1198 
1199                 if (op_2nd)
1200                 {
1201 
1202                     //----------------------------------------------------------
1203                     // dup is the SECOND operator, with no typecasting
1204                     //----------------------------------------------------------
1205 
1206                     // Tx [p] += (ttype) S [k], but 2nd op and no typecasting
1207                     #define GB_ADD_CAST_ARRAY_TO_ARRAY(Tx,p,S,k)        \
1208                         GB_CAST_ARRAY_TO_ARRAY(Tx,p,S,k)
1209                     #include "GB_reduce_build_template.c"
1210 
1211                 }
1212                 else
1213                 {
1214 
1215                     //----------------------------------------------------------
1216                     // dup is another operator, with no typecasting needed
1217                     //----------------------------------------------------------
1218 
1219                     // Tx [p] += (ttype) S [k], but with no typecasting
1220                     #undef  GB_ADD_CAST_ARRAY_TO_ARRAY
1221                     #define GB_ADD_CAST_ARRAY_TO_ARRAY(Tx,p,S,k)        \
1222                         fdup (Tx +((p)*tsize), Tx +((p)*tsize), S +((k)*tsize));
1223                     #include "GB_reduce_build_template.c"
1224                 }
1225             }
1226 
1227         }
1228         else
1229         {
1230 
1231             //------------------------------------------------------------------
1232             // assemble the values S into T, typecasting as needed
1233             //------------------------------------------------------------------
1234 
1235             GB_BURBLE_N (nvals, "(generic with typecast) ") ;
1236 
1237             // S (either S_work or S_input) must be permuted and copied into
1238             // T->x, since the tuples had to be sorted, or duplicates appear.
1239             // Any duplicates are now assembled.  Not all of the 5 types are
1240             // the same, but all of them are built-in since user-defined types
1241             // cannot be typecasted.
1242 
1243             GB_cast_function cast_S_to_T = GB_cast_factory (tcode, scode) ;
1244             GB_cast_function cast_S_to_Y = GB_cast_factory (ycode, scode) ;
1245             GB_cast_function cast_T_to_X = GB_cast_factory (xcode, tcode) ;
1246             GB_cast_function cast_Z_to_T = GB_cast_factory (tcode, zcode) ;
1247 
1248             ASSERT (scode <= GB_FC64_code) ;
1249             ASSERT (tcode <= GB_FC64_code) ;
1250             ASSERT (xcode <= GB_FC64_code) ;
1251             ASSERT (ycode <= GB_FC64_code) ;
1252             ASSERT (zcode <= GB_FC64_code) ;
1253 
1254             // Tx [p] = (ttype) S [k], with typecasting
1255             #undef  GB_CAST_ARRAY_TO_ARRAY
1256             #define GB_CAST_ARRAY_TO_ARRAY(Tx,p,S,k)                        \
1257                 cast_S_to_T (Tx +((p)*tsize), S +((k)*ssize), ssize) ;
1258 
1259             if (op_2nd)
1260             {
1261 
1262                 //--------------------------------------------------------------
1263                 // dup operator is the SECOND operator, with typecasting
1264                 //--------------------------------------------------------------
1265 
1266                 // Tx [p] += (ttype) S [k], but 2nd op, with typecasting
1267                 #undef  GB_ADD_CAST_ARRAY_TO_ARRAY
1268                 #define GB_ADD_CAST_ARRAY_TO_ARRAY(Tx,p,S,k)        \
1269                     GB_CAST_ARRAY_TO_ARRAY(Tx,p,S,k)
1270                 #include "GB_reduce_build_template.c"
1271 
1272             }
1273             else
1274             {
1275 
1276                 //--------------------------------------------------------------
1277                 // dup is another operator, with typecasting required
1278                 //--------------------------------------------------------------
1279 
1280                 // Tx [p] += S [k], with typecasting
1281                 #undef  GB_ADD_CAST_ARRAY_TO_ARRAY
1282                 #define GB_ADD_CAST_ARRAY_TO_ARRAY(Tx,p,S,k)                \
1283                 {                                                           \
1284                     /* ywork = (ytype) S [k] */                             \
1285                     GB_void ywork [GB_VLA(ysize)] ;                         \
1286                     cast_S_to_Y (ywork, S +((k)*ssize), ssize) ;            \
1287                     /* xwork = (xtype) Tx [p] */                            \
1288                     GB_void xwork [GB_VLA(xsize)] ;                         \
1289                     cast_T_to_X (xwork, Tx +((p)*tsize), tsize) ;           \
1290                     /* zwork = f (xwork, ywork) */                          \
1291                     GB_void zwork [GB_VLA(zsize)] ;                         \
1292                     fdup (zwork, xwork, ywork) ;                            \
1293                     /* Tx [tnz-1] = (ttype) zwork */                        \
1294                     cast_Z_to_T (Tx +((p)*tsize), zwork, zsize) ;           \
1295                 }
1296 
1297                 #include "GB_reduce_build_template.c"
1298             }
1299         }
1300     }
1301 
1302     //--------------------------------------------------------------------------
1303     // free workspace and return result
1304     //--------------------------------------------------------------------------
1305 
1306     GB_FREE_WORK ;
1307     T->jumbled = false ;
1308     ASSERT_MATRIX_OK (T, "T built", GB0) ;
1309     ASSERT (GB_IS_HYPERSPARSE (T)) ;
1310     return (GrB_SUCCESS) ;
1311 }
1312 
1313