1 //------------------------------------------------------------------------------
2 // GB_builder: build a matrix from tuples
3 //------------------------------------------------------------------------------
4
5 // SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2021, All Rights Reserved.
6 // SPDX-License-Identifier: Apache-2.0
7
8 //------------------------------------------------------------------------------
9
10 // CALLED BY: GB_build, GB_Matrix_wait, GB_transpose, GB_concat_hyper
11 // CALLS: Generated/GB__red_build__* workers
12
13 // This function is called by GB_build to build a matrix T for GrB_Matrix_build
14 // or GrB_Vector_build, by GB_Matrix_wait to build a matrix T from the list of
15 // pending tuples, and by GB_transpose to transpose a matrix or vector.
16 // Duplicates can appear if called by GB_build or GB_Matrix_wait, but not
17 // GB_transpose.
18
19 // The indices are provided either as (I_input,J_input) or (I_work,J_work), not
20 // both. The values are provided as S_input or S_work, not both. On return,
21 // the *work arrays are either transplanted into T, or freed, since they are
22 // temporary workspaces.
23
24 // The work is done in 5 major Steps, some of which can be skipped, depending
25 // on how the tuples are provided (*_work or *_input), and whether or not they
26 // are sorted, or have duplicates. If vdim <= 1, some work is skipped (for
27 // GrB_Vectors, and single-vector GrB_Matrices). Let e be the # of tuples on
28 // input. Let p be the # of threads used.
29
30 // STEP 1: copy user input. O(e/p) read/write per thread, or skipped.
31
32 // STEP 2: sort the tuples. Time: O((e log e)/p), read/write, or skipped if
33 // the tuples are already sorted.
34
35 // STEP 3: count vectors and duplicates. O(e/p) reads, per thread, if no
36 // duplicates, or skipped if already done. O(e/p) read/writes
37 // per thread if duplicates appear.
38
39 // STEP 4: construct T->h and T->p. O(e/p) reads per thread, or skipped if
40 // T is a vector.
41
42 // STEP 5: assemble the tuples. O(e/p) read/writes per thread, or O(1) if the
43 // values can be transplanted into T as-is.
44
45 // For GrB_Matrix_build: If the input (I_input, J_input, S_input) is already
46 // sorted with no duplicates, and no typecasting needs to be done, then Step 1
47 // still must be done (each thread does O(e/p) reads of (I_input,J_input) and
48 // writes to I_work), but Step 1 also does the work for Step 3. Step 2 and 3
49 // are skipped. Step 4 does O(e/p) reads per thread (J_input only). Then
50 // I_work is transplanted into T->i. Step 5 does O(e/p) read/writes per thread
51 // to copy S into T->x.
52
53 // For GrB_Vector_build: as GrB_Matrix_build, Step 1 does O(e/p) read/writes
54 // per thread. The input is always a vector, so vdim == 1 always holds. Step
55 // 2 is skipped if the indices are already sorted, and Step 3 does no work at
56 // all unless duplicates appear. Step 4 takes no time, for any vector. Step 5
57 // does O(e/p) reads/writes per thread.
58
59 // For GB_Matrix_wait: the pending tuples are provided as I_work, J_work, and
60 // S_work, so Step 1 is skipped (no need to check for invalid indices). The
61 // input J_work may be null (vdim can be anything, since GB_Matrix_wait is used
62 // for both vectors and matrices). The tuples might be in sorted order
63 // already, which is known precisely from A->Pending->sorted. Step 2
64 // does O((e log e)/p) work to sort the tuples. Duplicates may appear, and
65 // out-of-order tuples are likely. Step 3 does O(e/p) read/writes. Step 4
66 // does O(e/p) reads per thread of (I_work,J_work), or just I_work. Step 5
67 // does O(e/p) read/writes per thread, or O(1) time if S_work can be
68 // transplanted into T->x.
69
70 // For GB_transpose: uses I_work, J_work, and either S_input (if no op applied
71 // to the values) or S_work (if an op was applied to the A->x values). This is
72 // only done for matrices, not vectors, so vdim > 1 will always hold. The
73 // indices are valid so Step 1 is skipped. The tuples are not sorted, so Step
74 // 2 takes O((e log e)/p) time to do the sort. There are no duplicates, so
75 // Step 3 only does O(e/p) reads of J_work to count the vectors in each slice.
76 // Step 4 only does O(e/p) reads of J_work to compute T->h and T->p. Step 5
77 // does O(e/p) read/writes per thread, but it uses the simpler case in
78 // GB_reduce_build_template since no duplicates can appear. It is unlikely
79 // able to transplant S_work into T->x since the input will almost always be
80 // unsorted.
81
82 // For GB_concat_hyper: uses I_work, J_work, and S_work. No duplicates
83 // appear. Tuples are not sorted on input. I_work is transplanted into C->i.
84 // J_work and S_work are freed on output. S_work is not transplanted into
85 // C->x.
86
87 // This method always returns T as hypersparse, and has no matrix inputs. If
88 // the final C should become full or bitmap, that conversion is done by
89 // GB_transplant_conform.
90
91 #include "GB_build.h"
92 #include "GB_sort.h"
93 #include "GB_binop.h"
94 #ifndef GBCOMPACT
95 #include "GB_red__include.h"
96 #endif
97
// Accessors for the tuple work arrays.  A negative t (used as a sentinel for
// "no prior tuple", e.g. GB_I_WORK (tstart-1) at a slice boundary) returns -1.
// GB_J_WORK returns 0 when J_work is NULL (at most one vector, so all tuples
// lie in vector 0).  GB_K_WORK returns t itself when K_work is NULL, which
// implicitly means K_work [t] = t (tuples already in sorted order, so no
// permutation was recorded).
98 #define GB_I_WORK(t) (((t) < 0) ? -1 : I_work [t])
99 #define GB_J_WORK(t) (((t) < 0) ? -1 : ((J_work == NULL) ? 0 : J_work [t]))
100 #define GB_K_WORK(t) (((t) < 0) ? -1 : ((K_work == NULL) ? t : K_work [t]))

// GB_FREE_WORK: free all workspace on any exit path (success or error):
// the Work werkspace, the I_work/J_work/S_work arrays via their handles
// (which also sets the caller's pointers to NULL), and the K_work array.
102 #define GB_FREE_WORK \
103 { \
104 GB_WERK_POP (Work, int64_t) ; \
105 GB_FREE (I_work_handle, *I_work_size_handle) ; \
106 GB_FREE (J_work_handle, *J_work_size_handle) ; \
107 GB_FREE (S_work_handle, *S_work_size_handle) ; \
108 GB_FREE_WERK (&K_work, K_work_size) ; \
109 }
110
111 //------------------------------------------------------------------------------
112 // GB_builder
113 //------------------------------------------------------------------------------
114
GB_builder(GrB_Matrix T,const GrB_Type ttype,const int64_t vlen,const int64_t vdim,const bool is_csc,int64_t ** I_work_handle,size_t * I_work_size_handle,int64_t ** J_work_handle,size_t * J_work_size_handle,GB_void ** S_work_handle,size_t * S_work_size_handle,bool known_sorted,bool known_no_duplicates,int64_t ijslen,const bool is_matrix,const int64_t * restrict I_input,const int64_t * restrict J_input,const GB_void * restrict S_input,const int64_t nvals,const GrB_BinaryOp dup,const GB_Type_code scode,GB_Context Context)115 GrB_Info GB_builder // build a matrix from tuples
116 (
117 GrB_Matrix T, // matrix to build, static or dynamic header
118 const GrB_Type ttype, // type of output matrix T
119 const int64_t vlen, // length of each vector of T
120 const int64_t vdim, // number of vectors in T
121 const bool is_csc, // true if T is CSC, false if CSR
122 int64_t **I_work_handle, // for (i,k) or (j,i,k) tuples
123 size_t *I_work_size_handle,
124 int64_t **J_work_handle, // for (j,i,k) tuples
125 size_t *J_work_size_handle,
126 GB_void **S_work_handle, // array of values of tuples, size ijslen
127 size_t *S_work_size_handle,
128 bool known_sorted, // true if tuples known to be sorted
129 bool known_no_duplicates, // true if tuples known to not have dupl
130 int64_t ijslen, // size of I_work and J_work arrays
131 const bool is_matrix, // true if T a GrB_Matrix, false if vector
132 const int64_t *restrict I_input,// original indices, size nvals
133 const int64_t *restrict J_input,// original indices, size nvals
134 const GB_void *restrict S_input,// array of values of tuples, size nvals
135 const int64_t nvals, // number of tuples, and size of K_work
136 const GrB_BinaryOp dup, // binary function to assemble duplicates,
137 // if NULL use the SECOND operator to
138 // keep the most recent duplicate.
139 const GB_Type_code scode, // GB_Type_code of S_work or S_input array
140 GB_Context Context
141 )
142 {
143
144 //--------------------------------------------------------------------------
145 // check inputs
146 //--------------------------------------------------------------------------
147
148 ASSERT (T != NULL) ; // T is a static or dynamic header on input
149 ASSERT (nvals >= 0) ;
150 ASSERT (scode <= GB_UDT_code) ;
151 ASSERT_TYPE_OK (ttype, "ttype for builder", GB0) ;
152 ASSERT_BINARYOP_OK_OR_NULL (dup, "dup for builder", GB0) ;
153 ASSERT (I_work_handle != NULL) ;
154 ASSERT (J_work_handle != NULL) ;
155 ASSERT (S_work_handle != NULL) ;
156 ASSERT (!GB_OP_IS_POSITIONAL (dup)) ;
157 ASSERT (I_work_size_handle != NULL) ;
158 ASSERT (J_work_size_handle != NULL) ;
159 ASSERT (S_work_size_handle != NULL) ;
160
161 //--------------------------------------------------------------------------
162 // get S
163 //--------------------------------------------------------------------------
164
165 GB_void *restrict S_work = (*S_work_handle) ;
166
167 const GB_void *restrict S = (S_work == NULL) ? S_input : S_work ;
168 size_t tsize = ttype->size ;
169 size_t ssize = GB_code_size (scode, tsize) ;
170 ASSERT (GB_IMPLIES (nvals > 0, S != NULL)) ;
171
172 //==========================================================================
173 // symbolic phase of the build =============================================
174 //==========================================================================
175
176 // The symbolic phase sorts the tuples and finds any duplicates. The
177 // output matrix T is constructed (not including T->i and T->x), and T->h
178 // and T->p are computed. Then I_work is transplanted into T->i, or T->i is
179 // allocated. T->x is then allocated. It is not computed until the
180 // numeric phase.
181
182 // When this function returns, I_work is either freed or transplanted into
183 // T->i. J_work is freed, and the I_work and J_work pointers (in the
184 // caller) are set to NULL by setting their handles to NULL. Note that
185 // J_work may already be NULL on input, if T has one or zero vectors
186 // (J_work_handle is always non-NULL however).
187
188 GrB_Info info ;
189 int64_t *restrict I_work = (*I_work_handle) ;
190 int64_t *restrict J_work = (*J_work_handle) ;
191 int64_t *restrict K_work = NULL ; size_t K_work_size = 0 ;
192 ASSERT (*J_work_size_handle == GB_Global_memtable_size (J_work)) ;
193
194 //--------------------------------------------------------------------------
195 // determine the number of threads to use
196 //--------------------------------------------------------------------------
197
198 GB_GET_NTHREADS_MAX (nthreads_max, chunk, Context) ;
199 int nthreads = GB_nthreads (nvals, chunk, nthreads_max) ;
200
201 //--------------------------------------------------------------------------
202 // allocate workspace
203 //--------------------------------------------------------------------------
204
205 GB_WERK_DECLARE (Work, int64_t) ;
206 GB_WERK_PUSH (Work, 5*(nthreads+1), int64_t) ;
207 if (Work == NULL)
208 {
209 // out of memory
210 GB_FREE_WORK ;
211 return (GrB_OUT_OF_MEMORY) ;
212 }
213
214 memset (Work, 0, Work_nitems * sizeof (int64_t)) ;
215 int64_t *restrict tstart_slice = Work ; // nthreads+1
216 int64_t *restrict tnvec_slice = Work + (nthreads+1) ; // nthreads+1
217 int64_t *restrict tnz_slice = Work + 2*(nthreads+1) ; // nthreads+1
218 int64_t *restrict kbad = Work + 3*(nthreads+1) ; // nthreads
219 int64_t *restrict ilast_slice = Work + 4*(nthreads+1) ; // nthreads
220
221 //--------------------------------------------------------------------------
222 // partition the tuples for the threads
223 //--------------------------------------------------------------------------
224
225 // Thread tid handles tuples tstart_slice [tid] to tstart_slice [tid+1]-1.
226 // Each thread handles about the same number of tuples. This partition
227 // depends only on nvals.
228
229 GB_eslice (tstart_slice, nvals, nthreads) ;
230
231 // tstart_slice [tid]: first tuple in slice tid
232 // tnvec_slice [tid]: # of vectors that start in a slice. If a vector
233 // starts in one slice and ends in another, it is
234 // counted as being in the first slice.
235 // tnz_slice [tid]: # of entries in a slice after removing duplicates
236
237 // sentinel values for the final cumulative sum
238 tnvec_slice [nthreads] = 0 ;
239 tnz_slice [nthreads] = 0 ;
240
241 // this becomes true if the first pass computes tnvec_slice and tnz_slice,
242 // and if the (I_input,J_input) tuples were found to be already sorted with
243 // no duplicates present.
244 bool tnvec_and_tnz_slice_computed = false ;
245
246 //--------------------------------------------------------------------------
247 // STEP 1: copy user input and check if valid
248 //--------------------------------------------------------------------------
249
250 // If the indices are provided by (I_input,J_input), then import them into
251 // (I_work,J_work) and check if they are valid, and sorted. If the input
252 // happens to be already sorted, then duplicates are detected and the # of
253 // vectors in each slice is counted.
254
255 if (I_work == NULL)
256 {
257
258 //----------------------------------------------------------------------
259 // allocate I_work
260 //----------------------------------------------------------------------
261
262 // allocate workspace to load and sort the index tuples:
263
264 // vdim <= 1: I_work and K_work for (i,k) tuples, where i = I_input [k]
265
266 // vdim > 1: also J_work for (j,i,k) tuples where i = I_input [k] and
267 // j = J_input [k]. If the tuples are found to be already sorted on
268 // input, then J_work is not allocated, and J_input is used instead.
269
270 // The k value in the tuple gives the position in the original set of
271 // tuples: I_input [k] and S [k] when vdim <= 1, and also J_input [k]
272 // for matrices with vdim > 1.
273
274 // The workspace I_work and J_work are allocated here but freed (or
275 // transplanted) inside GB_builder. K_work is allocated, used, and
276 // freed in GB_builder.
277
278 ASSERT (J_work == NULL) ;
279 I_work = GB_MALLOC (nvals, int64_t, I_work_size_handle) ;
280 (*I_work_handle) = I_work ;
281 ijslen = nvals ;
282 if (I_work == NULL)
283 {
284 // out of memory
285 GB_FREE_WORK ;
286 return (GrB_OUT_OF_MEMORY) ;
287 }
288
289 //----------------------------------------------------------------------
290 // create the tuples to sort, and check for any invalid indices
291 //----------------------------------------------------------------------
292
293 known_sorted = true ;
294 bool no_duplicates_found = true ;
295
296 if (nvals == 0)
297 {
298
299 // nothing to do
300
301 }
302 else if (is_matrix)
303 {
304
305 //------------------------------------------------------------------
306 // C is a matrix; check both I_input and J_input
307 //------------------------------------------------------------------
308
309 ASSERT (J_input != NULL) ;
310 ASSERT (I_work != NULL) ;
311 ASSERT (vdim >= 0) ;
312 ASSERT (I_input != NULL) ;
313
314 int tid ;
315 #pragma omp parallel for num_threads(nthreads) schedule(static) \
316 reduction(&&:known_sorted) reduction(&&:no_duplicates_found)
317 for (tid = 0 ; tid < nthreads ; tid++)
318 {
319
320 kbad [tid] = -1 ;
321 int64_t my_tnvec = 0 ;
322 int64_t kstart = tstart_slice [tid] ;
323 int64_t kend = tstart_slice [tid+1] ;
324 int64_t ilast = (kstart == 0) ? -1 : I_input [kstart-1] ;
325 int64_t jlast = (kstart == 0) ? -1 : J_input [kstart-1] ;
326
327 for (int64_t k = kstart ; k < kend ; k++)
328 {
329 // get k-th index from user input: (i,j)
330 int64_t i = I_input [k] ;
331 int64_t j = J_input [k] ;
332
333 if (i < 0 || i >= vlen || j < 0 || j >= vdim)
334 {
335 // halt if out of bounds
336 kbad [tid] = k ;
337 break ;
338 }
339
340 // check if the tuples are already sorted
341 known_sorted = known_sorted &&
342 ((jlast < j) || (jlast == j && ilast <= i)) ;
343
344 // check if this entry is a duplicate of the one before it
345 no_duplicates_found = no_duplicates_found &&
346 (!(jlast == j && ilast == i)) ;
347
348 // copy the tuple into I_work. J_work is done later.
349 I_work [k] = i ;
350
351 if (j > jlast)
352 {
353 // vector j starts in this slice (but this is
354 // valid only if J_input is sorted on input)
355 my_tnvec++ ;
356 }
357
358 // log the last index seen
359 ilast = i ; jlast = j ;
360 }
361
362 // these are valid only if I_input and J_input are sorted on
363 // input, with no duplicates present.
364 tnvec_slice [tid] = my_tnvec ;
365 tnz_slice [tid] = kend - kstart ;
366
367 }
368
369 // collect the report from each thread
370 for (int tid = 0 ; tid < nthreads ; tid++)
371 {
372 if (kbad [tid] >= 0)
373 {
374 // invalid index
375 int64_t i = I_input [kbad [tid]] ;
376 int64_t j = J_input [kbad [tid]] ;
377 int64_t row = is_csc ? i : j ;
378 int64_t col = is_csc ? j : i ;
379 int64_t nrows = is_csc ? vlen : vdim ;
380 int64_t ncols = is_csc ? vdim : vlen ;
381 GB_FREE_WORK ;
382 GB_ERROR (GrB_INDEX_OUT_OF_BOUNDS,
383 "index (" GBd "," GBd ") out of bounds,"
384 " must be < (" GBd ", " GBd ")",
385 row, col, nrows, ncols) ;
386 }
387 }
388
389 // if the tuples were found to be already in sorted order, and if
390 // no duplicates were found, then tnvec_slice and tnz_slice are now
391 // valid. Otherwise, they can only be computed after sorting.
392 tnvec_and_tnz_slice_computed = known_sorted && no_duplicates_found ;
393
394 //------------------------------------------------------------------
395 // allocate J_work, if needed
396 //------------------------------------------------------------------
397
398 if (vdim > 1 && !known_sorted)
399 {
400 // copy J_input into J_work, so the tuples can be sorted
401 J_work = GB_MALLOC (nvals, int64_t, J_work_size_handle) ;
402 (*J_work_handle) = J_work ;
403 if (J_work == NULL)
404 {
405 // out of memory
406 GB_FREE_WORK ;
407 return (GrB_OUT_OF_MEMORY) ;
408 }
409 GB_memcpy (J_work, J_input, nvals * sizeof (int64_t), nthreads);
410 }
411 else
412 {
413 // J_work is a shallow copy of J_input. The pointer is not
414 // copied into (*J_work_handle), so it will not be freed.
415 // J_input is not modified, even though it is typecast to the
416 // int64_t *J_work, since J_work is not modified in this case.
417 J_work = (int64_t *) J_input ;
418 }
419
420 }
421 else
422 {
423
424 //------------------------------------------------------------------
425 // C is a typecasted GrB_Vector; check only I_input
426 //------------------------------------------------------------------
427
428 ASSERT (I_input != NULL) ;
429 ASSERT (J_input == NULL) ;
430 ASSERT (vdim == 1) ;
431
432 int tid ;
433 #pragma omp parallel for num_threads(nthreads) schedule(static) \
434 reduction(&&:known_sorted) reduction(&&:no_duplicates_found)
435 for (tid = 0 ; tid < nthreads ; tid++)
436 {
437
438 kbad [tid] = -1 ;
439 int64_t kstart = tstart_slice [tid] ;
440 int64_t kend = tstart_slice [tid+1] ;
441 int64_t ilast = (kstart == 0) ? -1 : I_input [kstart-1] ;
442
443 for (int64_t k = kstart ; k < kend ; k++)
444 {
445 // get k-th index from user input: (i)
446 int64_t i = I_input [k] ;
447
448 if (i < 0 || i >= vlen)
449 {
450 // halt if out of bounds
451 kbad [tid] = k ;
452 break ;
453 }
454
455 // check if the tuples are already sorted
456 known_sorted = known_sorted && (ilast <= i) ;
457
458 // check if this entry is a duplicate of the one before it
459 no_duplicates_found = no_duplicates_found &&
460 (!(ilast == i)) ;
461
462 // copy the tuple into the work arrays to be sorted
463 I_work [k] = i ;
464
465 // log the last index seen
466 ilast = i ;
467 }
468 }
469
470 // collect the report from each thread
471 for (int tid = 0 ; tid < nthreads ; tid++)
472 {
473 if (kbad [tid] >= 0)
474 {
475 // invalid index
476 int64_t i = I_input [kbad [tid]] ;
477 GB_FREE_WORK ;
478 GB_ERROR (GrB_INDEX_OUT_OF_BOUNDS,
479 "index (" GBd ") out of bounds, must be < (" GBd ")",
480 i, vlen) ;
481 }
482 }
483 }
484
485 //----------------------------------------------------------------------
486 // determine if duplicates are possible
487 //----------------------------------------------------------------------
488
489 // The input is now known to be sorted, or not. If it is sorted, and
490 // if no duplicates were found, then it is known to have no duplicates.
491 // Otherwise, duplicates might appear, but a sort is required first to
492 // check for duplicates.
493
494 known_no_duplicates = known_sorted && no_duplicates_found ;
495 }
496
497 //--------------------------------------------------------------------------
498 // STEP 2: sort the tuples in ascending order
499 //--------------------------------------------------------------------------
500
501 // If the tuples are known to already be sorted, Step 2 is skipped. In
502 // that case, K_work is NULL (not allocated), which implicitly means that
503 // K_work [k] = k for all k = 0:nvals-1.
504
505 if (!known_sorted)
506 {
507
508 // create the k part of each tuple
509 K_work = GB_MALLOC_WERK (nvals, int64_t, &K_work_size) ;
510 if (K_work == NULL)
511 {
512 // out of memory
513 GB_FREE_WORK ;
514 return (GrB_OUT_OF_MEMORY) ;
515 }
516
517 // The k part of each tuple (i,k) or (j,i,k) records the original
518 // position of the tuple in the input list. This allows an unstable
519 // sorting algorithm to be used. Since k is unique, it forces the
520 // result of the sort to be stable regardless of whether or not the
521 // sorting algorithm is stable. It also keeps track of where the
522 // numerical value of the tuple can be found; it is in S[k] for the
523 // tuple (i,k) or (j,i,k), regardless of where the tuple appears in the
524 // list after it is sorted.
525 int64_t k ;
526 #pragma omp parallel for num_threads(nthreads) schedule(static)
527 for (k = 0 ; k < nvals ; k++)
528 {
529 K_work [k] = k ;
530 }
531
532 // sort all the tuples
533 if (vdim > 1)
534 {
535
536 // sort a set of (j,i,k) tuples
537 info = GB_msort_3b (J_work, I_work, K_work, nvals, nthreads) ;
538
539 #ifdef GB_DEBUG
540 if (info == GrB_SUCCESS)
541 {
542 int64_t ilast = -1 ;
543 int64_t jlast = -1 ;
544 for (int64_t k = 0 ; k < nvals ; k++)
545 {
546 int64_t i = I_work [k] ;
547 int64_t j = J_work [k] ;
548 ASSERT ((jlast < j) || (jlast == j && ilast <= i)) ;
549 ilast = i ;
550 jlast = j ;
551 }
552 }
553 #endif
554
555 }
556 else
557 {
558 // sort a set of (i,k) tuples
559 info = GB_msort_2b (I_work, K_work, nvals, nthreads) ;
560
561 #ifdef GB_DEBUG
562 if (info == GrB_SUCCESS)
563 {
564 int64_t ilast = -1 ;
565 for (int64_t k = 0 ; k < nvals ; k++)
566 {
567 int64_t i = I_work [k] ;
568 ASSERT (ilast <= i) ;
569 ilast = i ;
570 }
571 }
572 #endif
573
574 }
575
576 if (info != GrB_SUCCESS)
577 {
578 // out of memory in GB_msort_*
579 GB_FREE_WORK ;
580 return (GrB_OUT_OF_MEMORY) ;
581 }
582 }
583
584 //--------------------------------------------------------------------------
585 // STEP 3: count vectors and duplicates in each slice
586 //--------------------------------------------------------------------------
587
588 // Duplicates are located, counted and their indices negated. The # of
589 // vectors in each slice is counted. If the indices are known to not have
590 // duplicates, then only the vectors are counted. Counting the # of
591 // vectors is skipped if already done by Step 1.
592
593 if (known_no_duplicates)
594 {
595
596 //----------------------------------------------------------------------
597 // no duplicates: just count # vectors in each slice
598 //----------------------------------------------------------------------
599
600 // This is much faster, particularly if the # of vectors in each slice
601 // has already been computed.
602
603 #ifdef GB_DEBUG
604 {
605 // assert that there are no duplicates
606 int64_t ilast = -1, jlast = -1 ;
607 for (int64_t t = 0 ; t < nvals ; t++)
608 {
609 int64_t i = GB_I_WORK (t), j = GB_J_WORK (t) ;
610 bool is_duplicate = (i == ilast && j == jlast) ;
611 ASSERT (!is_duplicate) ;
612 ilast = i ; jlast = j ;
613 }
614 }
615 #endif
616
617 if (vdim <= 1)
618 {
619
620 // all tuples appear in at most one vector, and there are no
621 // duplicates, so there is no need to scan I_work or J_work.
622
623 for (int tid = 0 ; tid < nthreads ; tid++)
624 {
625 int64_t tstart = tstart_slice [tid] ;
626 int64_t tend = tstart_slice [tid+1] ;
627 tnvec_slice [tid] = 0 ;
628 tnz_slice [tid] = tend - tstart ;
629 }
630 tnvec_slice [0] = (nvals == 0) ? 0 : 1 ;
631
632 }
633 else
634 {
635
636 // count the # of unique vector indices in J_work. No need to scan
637 // I_work since there are no duplicates to be found. Also no need
638 // to compute them if already found in Step 1.
639
640 if (!tnvec_and_tnz_slice_computed)
641 {
642
643 int tid ;
644 #pragma omp parallel for num_threads(nthreads) schedule(static)
645 for (tid = 0 ; tid < nthreads ; tid++)
646 {
647 int64_t my_tnvec = 0 ;
648 int64_t tstart = tstart_slice [tid] ;
649 int64_t tend = tstart_slice [tid+1] ;
650 int64_t jlast = GB_J_WORK (tstart-1) ;
651
652 for (int64_t t = tstart ; t < tend ; t++)
653 {
654 // get the t-th tuple
655 int64_t j = J_work [t] ;
656 if (j > jlast)
657 {
658 // vector j starts in this slice
659 my_tnvec++ ;
660 jlast = j ;
661 }
662 }
663
664 tnvec_slice [tid] = my_tnvec ;
665 tnz_slice [tid] = tend - tstart ;
666 }
667 }
668 }
669
670 }
671 else
672 {
673
674 //----------------------------------------------------------------------
675 // look for duplicates and count # vectors in each slice
676 //----------------------------------------------------------------------
677
678 for (int tid = 0 ; tid < nthreads ; tid++)
679 {
680 int64_t tstart = tstart_slice [tid] ;
681 ilast_slice [tid] = GB_I_WORK (tstart-1) ;
682 }
683
684 int tid ;
685 #pragma omp parallel for num_threads(nthreads) schedule(static)
686 for (tid = 0 ; tid < nthreads ; tid++)
687 {
688
689 int64_t my_tnvec = 0 ;
690 int64_t my_ndupl = 0 ;
691 int64_t tstart = tstart_slice [tid] ;
692 int64_t tend = tstart_slice [tid+1] ;
693 int64_t ilast = ilast_slice [tid] ;
694 int64_t jlast = GB_J_WORK (tstart-1) ;
695
696 for (int64_t t = tstart ; t < tend ; t++)
697 {
698 // get the t-th tuple
699 int64_t i = I_work [t] ;
700 int64_t j = GB_J_WORK (t) ;
701
702 // tuples are now sorted but there may be duplicates
703 ASSERT ((jlast < j) || (jlast == j && ilast <= i)) ;
704
705 // check if (j,i,k) is a duplicate
706 if (i == ilast && j == jlast)
707 {
708 // flag the tuple as a duplicate
709 I_work [t] = -1 ;
710 my_ndupl++ ;
711 // the sort places earlier duplicate tuples (with smaller
712 // k) after later ones (with larger k).
713 ASSERT (GB_K_WORK (t-1) < GB_K_WORK (t)) ;
714 }
715 else
716 {
717 // this is a new tuple
718 if (j > jlast)
719 {
720 // vector j starts in this slice
721 my_tnvec++ ;
722 jlast = j ;
723 }
724 ilast = i ;
725 }
726 }
727 tnvec_slice [tid] = my_tnvec ;
728 tnz_slice [tid] = (tend - tstart) - my_ndupl ;
729 }
730 }
731
732 //--------------------------------------------------------------------------
733 // find total # of vectors and duplicates in all tuples
734 //--------------------------------------------------------------------------
735
736 // Replace tnvec_slice with its cumulative sum, after which each slice tid
737 // will be responsible for the # vectors in T that range from tnvec_slice
738 // [tid] to tnvec_slice [tid+1]-1.
739 GB_cumsum (tnvec_slice, nthreads, NULL, 1, NULL) ;
740 int64_t tnvec = tnvec_slice [nthreads] ;
741
742 // Replace tnz_slice with its cumulative sum
743 GB_cumsum (tnz_slice, nthreads, NULL, 1, NULL) ;
744
745 // find the total # of final entries, after assembling duplicates
746 int64_t tnz = tnz_slice [nthreads] ;
747 int64_t ndupl = nvals - tnz ;
748
749 //--------------------------------------------------------------------------
750 // allocate T; always hypersparse
751 //--------------------------------------------------------------------------
752
753 // allocate T; allocate T->p and T->h but do not initialize them.
754 // T is always hypersparse. The header T always exists on input, as
755 // either a static or dynamic header.
756 bool static_header = T->static_header ;
757 info = GB_new (&T, static_header, // always hyper, static or dynamic header
758 ttype, vlen, vdim, GB_Ap_malloc, is_csc,
759 GxB_HYPERSPARSE, GB_ALWAYS_HYPER, tnvec, Context) ;
760 if (info != GrB_SUCCESS)
761 {
762 // out of memory
763 GB_FREE_WORK ;
764 return (info) ;
765 }
766
767 ASSERT (T->p != NULL) ;
768 ASSERT (T->h != NULL) ;
769 ASSERT (T->nzmax == 0) ; // T->i and T->x not yet allocated
770 ASSERT (T->b == NULL) ;
771 ASSERT (T->i == NULL) ;
772 ASSERT (T->x == NULL) ;
773
774 //--------------------------------------------------------------------------
775 // STEP 4: construct the vector pointers and hyperlist for T
776 //--------------------------------------------------------------------------
777
778 // Step 4 scans the J_work indices and constructs T->h and T->p.
779
780 int64_t *restrict Th = T->h ;
781 int64_t *restrict Tp = T->p ;
782
783 if (vdim <= 1)
784 {
785
786 //----------------------------------------------------------------------
787 // special case for vectors
788 //----------------------------------------------------------------------
789
790 ASSERT (tnvec == 0 || tnvec == 1) ;
791 if (tnvec > 0)
792 {
793 Th [0] = 0 ;
794 Tp [0] = 0 ;
795 }
796
797 }
798 else if (ndupl == 0)
799 {
800
801 //----------------------------------------------------------------------
802 // no duplicates appear
803 //----------------------------------------------------------------------
804
805 int tid ;
806 #pragma omp parallel for num_threads(nthreads) schedule(static)
807 for (tid = 0 ; tid < nthreads ; tid++)
808 {
809
810 int64_t my_tnvec = tnvec_slice [tid] ;
811 int64_t tstart = tstart_slice [tid] ;
812 int64_t tend = tstart_slice [tid+1] ;
813 int64_t jlast = GB_J_WORK (tstart-1) ;
814
815 for (int64_t t = tstart ; t < tend ; t++)
816 {
817 // get the t-th tuple
818 int64_t j = GB_J_WORK (t) ;
819 if (j > jlast)
820 {
821 // vector j starts in this slice
822 Th [my_tnvec] = j ;
823 Tp [my_tnvec] = t ;
824 my_tnvec++ ;
825 jlast = j ;
826 }
827 }
828 }
829
830 }
831 else
832 {
833
834 //----------------------------------------------------------------------
835 // it is known that at least one duplicate appears
836 //----------------------------------------------------------------------
837
838 int tid ;
839 #pragma omp parallel for num_threads(nthreads) schedule(static)
840 for (tid = 0 ; tid < nthreads ; tid++)
841 {
842
843 int64_t my_tnz = tnz_slice [tid] ;
844 int64_t my_tnvec = tnvec_slice [tid] ;
845 int64_t tstart = tstart_slice [tid] ;
846 int64_t tend = tstart_slice [tid+1] ;
847 int64_t jlast = GB_J_WORK (tstart-1) ;
848
849 for (int64_t t = tstart ; t < tend ; t++)
850 {
851 // get the t-th tuple
852 int64_t i = I_work [t] ;
853 int64_t j = GB_J_WORK (t) ;
854 if (i >= 0)
855 {
856 // this is a new tuple
857 if (j > jlast)
858 {
859 // vector j starts in this slice
860 Th [my_tnvec] = j ;
861 Tp [my_tnvec] = my_tnz ;
862 my_tnvec++ ;
863 jlast = j ;
864 }
865 my_tnz++ ;
866 }
867 }
868 }
869 }
870
871 // log the end of the last vector
872 T->nvec_nonempty = tnvec ;
873 T->nvec = tnvec ;
874 Tp [tnvec] = tnz ;
875 ASSERT (T->nvec == T->plen) ;
876 T->magic = GB_MAGIC ;
877
878 //--------------------------------------------------------------------------
879 // free J_work if it exists
880 //--------------------------------------------------------------------------
881
// The vector indices are no longer needed: T->p and T->h are now built.
882 ASSERT (J_work_handle != NULL) ;
883 GB_FREE (J_work_handle, *J_work_size_handle) ;
884 J_work = NULL ;
885
886 //--------------------------------------------------------------------------
887 // allocate T->i
888 //--------------------------------------------------------------------------
889
// nzmax is at least 1 so that T->i / T->x are never zero-size allocations
890 T->nzmax = GB_IMAX (tnz, 1) ;
891
892 if (ndupl == 0)
893 {
// no duplicates: the sorted row indices in I_work are already final, so
// I_work can be shrunk in place and transplanted directly into T->i
894 // shrink I_work from size ijslen to size T->nzmax
895 if (T->nzmax < ijslen)
896 {
897 // this cannot fail since the size is shrinking.
898 bool ok ;
899 GB_REALLOC (I_work, T->nzmax, ijslen, int64_t, I_work_size_handle,
900 &ok, Context) ;
901 ASSERT (ok) ;
902 }
903 // transplant I_work into T->i
// ownership moves to T; clear the caller's handle so it is not freed twice
904 T->i = I_work ; T->i_size = (*I_work_size_handle) ;
905 I_work = NULL ;
906 (*I_work_handle) = NULL ;
907 (*I_work_size_handle) = 0 ;
908 }
909 else
910 {
// duplicates exist: I_work is still needed by the assembly phase below,
// so a fresh T->i of exactly T->nzmax entries is allocated instead
911 // duplicates exist, so allocate a new T->i. I_work must be freed later
912 T->i = GB_MALLOC (T->nzmax, int64_t, &(T->i_size)) ;
913 if (T->i == NULL)
914 {
915 // out of memory
916 GB_phbix_free (T) ;
917 GB_FREE_WORK ;
918 return (GrB_OUT_OF_MEMORY) ;
919 }
920 }
921
922 int64_t *restrict Ti = T->i ;
923
924 //==========================================================================
925 // numerical phase of the build: assemble any duplicates
926 //==========================================================================
927
928 // The tuples have been sorted. Assemble any duplicates with a switch
929 // factory of built-in workers, or four generic workers. The vector
930 // pointers T->p and hyperlist T->h (if hypersparse) have already been
931 // computed.
932
933 // If there are no duplicates, T->i holds the row indices of the tuples.
934 // Otherwise, the row indices are still in I_work. K_work holds the
935 // positions of each tuple in the array S. The tuples are sorted so that
936 // duplicates are adjacent to each other and they appear in the order they
937 // appeared in the original tuples. This method assembles the duplicates
938 // and computes T->i and T->x from I_work, K_work, and S.
939 // If no duplicates appear, T->i is already computed, and S just
940 // needs to be copied and permuted into T->x.
941
942 // The (i,k,S[k]) tuples are held in two integer arrays: (1) I_work or T->i,
943 // and (2) K_work, and an array S of numerical values. S has not been
944 // sorted, nor even accessed yet. It is identical to the original unsorted
945 // tuples. The (i,k,S[k]) tuple holds the row index i, the position k, and
946 // the value S [k]. This entry becomes T(i,j) = S [k] in the matrix T, and
947 // duplicates (if any) are assembled via the dup operator.
948
949 //--------------------------------------------------------------------------
950 // get opcodes and check types
951 //--------------------------------------------------------------------------
952
953 // With GB_build, there can be 1 to 2 different types.
954 // T->type is identical to the types of x,y,z for z=dup(x,y).
955 // dup is never NULL and all its three types are the same
956 // The type of S (scode) can be different but must be compatible
957 // with T->type
958
959 // With GB_Matrix_wait, there can be 1 to 5 different types:
960 // The pending tuples are in S, of type scode which must be
961 // compatible with dup->ytype and T->type
962 // z = dup (x,y): can be NULL or have 1 to 3 different types
963 // T->type: must be compatible with all above types.
964 // dup may be NULL, in which case it is assumed to be the implicit SECOND
965 // operator, with all three types equal to T->type
966
967 GrB_Type xtype, ytype, ztype ;
968 GxB_binary_function fdup ;
969 #ifndef GBCOMPACT
970 GB_Opcode opcode ;
971 #endif
972
973 GB_Type_code tcode = ttype->code ;
974 bool op_2nd ;
975
976 ASSERT_TYPE_OK (ttype, "ttype for build_factory", GB0) ;
977
978 if (dup == NULL)
979 {
980
981 //----------------------------------------------------------------------
982 // dup is the implicit SECOND operator
983 //----------------------------------------------------------------------
984
985 // z = SECOND (x,y) where all three types are the same as ttype
986 // T(i,j) = (ttype) S(k) will be done for all tuples.
987
988 #ifndef GBCOMPACT
989 opcode = GB_SECOND_opcode ;
990 #endif
// NOTE(review): GB_op_is_second presumably treats dup == NULL as the
// implicit SECOND operator -- confirm against its definition
991 ASSERT (GB_op_is_second (dup, ttype)) ;
992 xtype = ttype ;
993 ytype = ttype ;
994 ztype = ttype ;
// fdup is NULL: the SECOND semantics are handled directly (each duplicate
// simply overwrites the prior value), so no function pointer is called
995 fdup = NULL ;
996 op_2nd = true ;
997
998 }
999 else
1000 {
1001
1002 //----------------------------------------------------------------------
1003 // dup is an explicit operator
1004 //----------------------------------------------------------------------
1005
1006 // T(i,j) = (ttype) S[k] will be done for the first tuple.
1007 // for subsequent tuples: T(i,j) += S[k], via the dup operator and
1008 // typecasting:
1009 //
1010 // y = (dup->ytype) S[k]
1011 // x = (dup->xtype) T(i,j)
1012 // z = (dup->ztype) dup (x,y)
1013 // T(i,j) = (ttype) z
1014
1015 ASSERT_BINARYOP_OK (dup, "dup for build_factory", GB0) ;
1016 #ifndef GBCOMPACT
1017 opcode = dup->opcode ;
1018 #endif
1019 xtype = dup->xtype ;
1020 ytype = dup->ytype ;
1021 ztype = dup->ztype ;
1022 fdup = dup->function ;
// an explicit SECOND operator also takes the simpler op_2nd path below
1023 op_2nd = GB_op_is_second (dup, ttype) ;
1024 }
1025
1026 //--------------------------------------------------------------------------
1027 // get the sizes and codes of each type
1028 //--------------------------------------------------------------------------
1029
1030 GB_Type_code zcode = ztype->code ;
1031 GB_Type_code xcode = xtype->code ;
1032 GB_Type_code ycode = ytype->code ;
1033
1034 ASSERT (GB_code_compatible (tcode, scode)) ; // T(i,j) = (ttype) S
1035 ASSERT (GB_code_compatible (ycode, scode)) ; // y = (ytype) S
1036 ASSERT (GB_Type_compatible (xtype, ttype)) ; // x = (xtype) T(i,j)
1037 ASSERT (GB_Type_compatible (ttype, ztype)) ; // T(i,j) = (ttype) z
1038
1039 size_t zsize = ztype->size ;
1040 size_t xsize = xtype->size ;
1041 size_t ysize = ytype->size ;
1042
1043 // no typecasting if all 5 types are the same
1044 bool nocasting = (tcode == scode) &&
1045 (ttype == xtype) && (ttype == ytype) && (ttype == ztype) ;
1046
1047 //--------------------------------------------------------------------------
1048 // STEP 5: assemble the tuples
1049 //--------------------------------------------------------------------------
1050
// fast path: no typecast, tuples arrived sorted, and no duplicates, so S
// can be copied (or transplanted) verbatim into T->x with no permutation
1051 bool copy_S_into_T = (nocasting && known_sorted && ndupl == 0) ;
1052
1053 if (copy_S_into_T && S_work != NULL)
1054 {
1055
1056 //----------------------------------------------------------------------
1057 // transplant S_work into T->x
1058 //----------------------------------------------------------------------
1059
1060 // No typecasting is needed, the tuples were originally in sorted
1061 // order, and no duplicates appear. All that is required is to copy S
1062 // into Tx. S can be directly transplanted into T->x since S is
1063 // provided as S_work. GB_builder must either transplant or free
1064 // S_work. The transplant can be used by GB_Matrix_wait, whenever the
1065 // tuples are already sorted, with no duplicates, and no typecasting is
1066 // needed, since S_work is always A->Pending->x. This transplant can
1067 // rarely be used for GB_transpose, in the case when op is NULL and the
1068 // transposed tuples happen to be sorted (which is unlikely).
1069
// ownership of S_work moves to T->x; clear the caller's handle so the
// same block is not freed twice
1070 T->x = S_work ; T->x_size = (*S_work_size_handle) ;
1071 S_work = NULL ;
1072 (*S_work_handle) = NULL ;
1073 (*S_work_size_handle) = 0 ;
1074
1075 }
1076 else
1077 {
1078
1079 //----------------------------------------------------------------------
1080 // allocate T->x
1081 //----------------------------------------------------------------------
1082
1083 T->x = GB_MALLOC (T->nzmax * ttype->size, GB_void, &(T->x_size)) ;
1084 if (T->x == NULL)
1085 {
1086 // out of memory
1087 GB_phbix_free (T) ;
1088 GB_FREE_WORK ;
1089 return (GrB_OUT_OF_MEMORY) ;
1090 }
1091
1092 GB_void *restrict Tx = (GB_void *) T->x ;
1093
1094 ASSERT (GB_IMPLIES (nvals > 0, S != NULL)) ;
1095
1096 if (nvals == 0)
1097 {
1098
1099 // nothing to do: T is empty; T->x was allocated above but unused
1100
1101 }
1102 else if (copy_S_into_T)
1103 {
1104
1105 //------------------------------------------------------------------
1106 // copy S into T->x
1107 //------------------------------------------------------------------
1108
1109 // No typecasting is needed, the tuples were originally in sorted
1110 // order, and no duplicates appear. All that is required is to
1111 // copy S into Tx. S cannot be transplanted into T->x since
1112 // S_work is NULL and S_input cannot be modified by GB_builder.
1113
1114 GBURBLE ("(build:memcpy) ") ;
1115 ASSERT (S_work == NULL) ;
1116 ASSERT (S == S_input) ;
// parallel memcpy of the caller's read-only values into T->x
1117 GB_memcpy (Tx, S, nvals * tsize, nthreads) ;
1118
1119 }
1120 else if (nocasting)
1121 {
1122
1123 //------------------------------------------------------------------
1124 // assemble the values, S, into T, no typecasting needed
1125 //------------------------------------------------------------------
1126
1127 // S (either S_work or S_input) must be permuted and copied into
1128 // T->x, since the tuples had to be sorted, or duplicates appear.
1129 // Any duplicates are now assembled.
1130
1131 // There are 44 common cases of this function for built-in types
1132 // and 8 associative operators: MIN, MAX, PLUS, TIMES for 10 types
1133 // (all but boolean); and OR, AND, XOR, and EQ for boolean.
1134
1135 // In addition, the FIRST and SECOND operators are hard-coded, for
1136 // another 22 workers, since SECOND is used by GB_Matrix_wait and
1137 // since FIRST is useful for keeping the first tuple seen. It is
1138 // controlled by the GB_INCLUDE_SECOND_OPERATOR definition, so they
1139 // do not appear in GB_reduce_to_* where the FIRST and SECOND
1140 // operators are not needed.
1141
1142 // Early exit cannot be exploited, so the terminal is ignored.
1143
1144 GBURBLE ("(build:assemble) ") ;
1145 bool done = false ;
1146
1147 #ifndef GBCOMPACT
1148
1149 //--------------------------------------------------------------
1150 // define the worker for the switch factory
1151 //--------------------------------------------------------------
1152
1153 #define GB_INCLUDE_SECOND_OPERATOR
1154
// GB_red expands to the name of a generated worker, e.g.
// GB (_red_build_plus_int32) for opname=plus, aname=int32
1155 #define GB_red(opname,aname) \
1156 GB (_red_build_ ## opname ## aname)
1157
// GB_RED_WORKER calls one generated worker; it sets done = true unless
// the worker declines (GrB_NO_VALUE), in which case the generic worker
// below takes over
1158 #define GB_RED_WORKER(opname,aname,atype) \
1159 { \
1160 info = GB_red (opname, aname) ((atype *) Tx, Ti, \
1161 (atype *) S, nvals, ndupl, I_work, K_work, \
1162 tstart_slice, tnz_slice, nthreads) ; \
1163 done = (info != GrB_NO_VALUE) ; \
1164 } \
1165 break ;
1166
1167 //--------------------------------------------------------------
1168 // launch the switch factory
1169 //--------------------------------------------------------------
1170
1171 // controlled by opcode and typecode
1172 GB_Type_code typecode = tcode ;
1173 #include "GB_red_factory.c"
1174
1175 #endif
1176
1177 //------------------------------------------------------------------
1178 // generic worker
1179 //------------------------------------------------------------------
1180
1181 if (!done)
1182 {
1183 GB_BURBLE_N (nvals, "(generic) ") ;
1184
1185 //--------------------------------------------------------------
1186 // no typecasting, but use the fdup function pointer and memcpy
1187 //--------------------------------------------------------------
1188
1189 // Either the fdup operator or type of S and T are
1190 // user-defined, or fdup is not an associative operator handled
1191 // by the GB_red_factory, or some combination of these
1192 // conditions. User-defined types cannot be typecasted, so
1193 // this handles all user-defined types.
1194
1195 // Tx [p] = (ttype) S [k], but with no typecasting
1196 #define GB_CAST_ARRAY_TO_ARRAY(Tx,p,S,k) \
1197 memcpy (Tx +((p)*tsize), S +((k)*tsize), tsize) ;
1198
1199 if (op_2nd)
1200 {
1201
1202 //----------------------------------------------------------
1203 // dup is the SECOND operator, with no typecasting
1204 //----------------------------------------------------------
1205
// SECOND just keeps the later value, so "add" is a plain copy
1206 // Tx [p] += (ttype) S [k], but 2nd op and no typecasting
1207 #define GB_ADD_CAST_ARRAY_TO_ARRAY(Tx,p,S,k) \
1208 GB_CAST_ARRAY_TO_ARRAY(Tx,p,S,k)
1209 #include "GB_reduce_build_template.c"
1210
1211 }
1212 else
1213 {
1214
1215 //----------------------------------------------------------
1216 // dup is another operator, with no typecasting needed
1217 //----------------------------------------------------------
1218
// duplicates are combined in place: Tx [p] = fdup (Tx [p], S [k])
1219 // Tx [p] += (ttype) S [k], but with no typecasting
1220 #undef GB_ADD_CAST_ARRAY_TO_ARRAY
1221 #define GB_ADD_CAST_ARRAY_TO_ARRAY(Tx,p,S,k) \
1222 fdup (Tx +((p)*tsize), Tx +((p)*tsize), S +((k)*tsize));
1223 #include "GB_reduce_build_template.c"
1224 }
1225 }
1226
1227 }
1228 else
1229 {
1230
1231 //------------------------------------------------------------------
1232 // assemble the values S into T, typecasting as needed
1233 //------------------------------------------------------------------
1234
1235 GB_BURBLE_N (nvals, "(generic with typecast) ") ;
1236
1237 // S (either S_work or S_input) must be permuted and copied into
1238 // T->x, since the tuples had to be sorted, or duplicates appear.
1239 // Any duplicates are now assembled. Not all of the 5 types are
1240 // the same, but all of them are built-in since user-defined types
1241 // cannot be typecasted.
1242
// one cast function per edge of the z = dup (x, y) dataflow diagram
1243 GB_cast_function cast_S_to_T = GB_cast_factory (tcode, scode) ;
1244 GB_cast_function cast_S_to_Y = GB_cast_factory (ycode, scode) ;
1245 GB_cast_function cast_T_to_X = GB_cast_factory (xcode, tcode) ;
1246 GB_cast_function cast_Z_to_T = GB_cast_factory (tcode, zcode) ;
1247
// all 5 types must be built-in (GB_FC64_code is the last built-in code)
1248 ASSERT (scode <= GB_FC64_code) ;
1249 ASSERT (tcode <= GB_FC64_code) ;
1250 ASSERT (xcode <= GB_FC64_code) ;
1251 ASSERT (ycode <= GB_FC64_code) ;
1252 ASSERT (zcode <= GB_FC64_code) ;
1253
1254 // Tx [p] = (ttype) S [k], with typecasting
1255 #undef GB_CAST_ARRAY_TO_ARRAY
1256 #define GB_CAST_ARRAY_TO_ARRAY(Tx,p,S,k) \
1257 cast_S_to_T (Tx +((p)*tsize), S +((k)*ssize), ssize) ;
1258
1259 if (op_2nd)
1260 {
1261
1262 //--------------------------------------------------------------
1263 // dup operator is the SECOND operator, with typecasting
1264 //--------------------------------------------------------------
1265
// SECOND keeps the later value, so "add" is just a cast-and-copy
1266 // Tx [p] += (ttype) S [k], but 2nd op, with typecasting
1267 #undef GB_ADD_CAST_ARRAY_TO_ARRAY
1268 #define GB_ADD_CAST_ARRAY_TO_ARRAY(Tx,p,S,k) \
1269 GB_CAST_ARRAY_TO_ARRAY(Tx,p,S,k)
1270 #include "GB_reduce_build_template.c"
1271
1272 }
1273 else
1274 {
1275
1276 //--------------------------------------------------------------
1277 // dup is another operator, with typecasting required
1278 //--------------------------------------------------------------
1279
// full cast chain: y = (ytype) S [k], x = (xtype) Tx [p],
// z = fdup (x, y), Tx [p] = (ttype) z, using stack temporaries
1280 // Tx [p] += S [k], with typecasting
1281 #undef GB_ADD_CAST_ARRAY_TO_ARRAY
1282 #define GB_ADD_CAST_ARRAY_TO_ARRAY(Tx,p,S,k) \
1283 { \
1284 /* ywork = (ytype) S [k] */ \
1285 GB_void ywork [GB_VLA(ysize)] ; \
1286 cast_S_to_Y (ywork, S +((k)*ssize), ssize) ; \
1287 /* xwork = (xtype) Tx [p] */ \
1288 GB_void xwork [GB_VLA(xsize)] ; \
1289 cast_T_to_X (xwork, Tx +((p)*tsize), tsize) ; \
1290 /* zwork = f (xwork, ywork) */ \
1291 GB_void zwork [GB_VLA(zsize)] ; \
1292 fdup (zwork, xwork, ywork) ; \
1293 /* Tx [p] = (ttype) zwork */ \
1294 cast_Z_to_T (Tx +((p)*tsize), zwork, zsize) ; \
1295 }
1296
1297 #include "GB_reduce_build_template.c"
1298 }
1299 }
1300 }
1301
1302 //--------------------------------------------------------------------------
1303 // free workspace and return result
1304 //--------------------------------------------------------------------------
1305
1306 GB_FREE_WORK ;
// the tuples were sorted above, so T is not jumbled
1307 T->jumbled = false ;
1308 ASSERT_MATRIX_OK (T, "T built", GB0) ;
// T is always constructed in hypersparse form by this method
1309 ASSERT (GB_IS_HYPERSPARSE (T)) ;
1310 return (GrB_SUCCESS) ;
1311 }
1312
1313