1 //------------------------------------------------------------------------------
2 // GB_bitmap_add_template: C = A+B, C<M>=A+B, and C<!M>=A+B, C bitmap
3 //------------------------------------------------------------------------------
4 
5 // SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2021, All Rights Reserved.
6 // SPDX-License-Identifier: Apache-2.0
7 
8 //------------------------------------------------------------------------------
9 
10 // C is bitmap.  The mask M can have any sparsity structure, and is efficient
11 // to apply (all methods are asymptotically optimal).  All cases (no M, M, !M)
12 // are handled.
13 
14 {
15 
16     // TODO: the input C can be modified in-place, if it is also bitmap
17     int64_t cnvals = 0 ;
18 
19     if (M == NULL)
20     {
21 
22         //----------------------------------------------------------------------
23         // M is not present
24         //----------------------------------------------------------------------
25 
26         //      ------------------------------------------
27         //      C       =           A       +       B
28         //      ------------------------------------------
29         //      bitmap  .           sparse          bitmap
30         //      bitmap  .           bitmap          sparse
31         //      bitmap  .           bitmap          bitmap
32 
33         ASSERT (A_is_bitmap || B_is_bitmap) ;
34         ASSERT (!A_is_full) ;
35         ASSERT (!B_is_full) ;
36 
37         if (A_is_bitmap && B_is_bitmap)
38         {
39 
40             //------------------------------------------------------------------
41             // Method21: C, A, and B are all bitmap
42             //------------------------------------------------------------------
43 
44             int tid ;
45             #pragma omp parallel for num_threads(C_nthreads) schedule(static) \
46                 reduction(+:cnvals)
47             for (tid = 0 ; tid < C_nthreads ; tid++)
48             {
49                 int64_t pstart, pend, task_cnvals = 0 ;
50                 GB_PARTITION (pstart, pend, cnz, tid, C_nthreads) ;
51                 for (int64_t p = pstart ; p < pend ; p++)
52                 {
53                     int8_t c = 0 ;
54                     if (Ab [p] && Bb [p])
55                     {
56                         // C (i,j) = A (i,j) + B (i,j)
57                         GB_GETA (aij, Ax, p) ;
58                         GB_GETB (bij, Bx, p) ;
59                         GB_BINOP (GB_CX (p), aij, bij, p % vlen, p / vlen) ;
60                         c = 1 ;
61                     }
62                     else if (Bb [p])
63                     {
64                         // C (i,j) = B (i,j)
65                         GB_COPY_B_TO_C (GB_CX (p), Bx, p) ;
66                         c = 1 ;
67                     }
68                     else if (Ab [p])
69                     {
70                         // C (i,j) = A (i,j)
71                         GB_COPY_A_TO_C (GB_CX (p), Ax, p) ;
72                         c = 1 ;
73                     }
74                     Cb [p] = c ;
75                     task_cnvals += c ;
76                 }
77                 cnvals += task_cnvals ;
78             }
79 
80         }
81         else if (A_is_bitmap)
82         {
83 
84             //------------------------------------------------------------------
85             // Method22: C and A are bitmap; B is sparse or hypersparse
86             //------------------------------------------------------------------
87 
88             int64_t p ;
89             #pragma omp parallel for num_threads(C_nthreads) schedule(static)
90             for (p = 0 ; p < cnz ; p++)
91             {
92                 // C (i,j) = A (i,j)
93                 int8_t a = Ab [p] ;
94                 if (a) GB_COPY_A_TO_C (GB_CX (p), Ax, p) ;
95                 Cb [p] = a ;
96             }
97             cnvals = A->nvals ;
98 
99             GB_SLICE_MATRIX (B, 8, chunk) ;
100 
101             #pragma omp parallel for num_threads(B_nthreads) \
102                 schedule(dynamic,1) reduction(+:cnvals)
103             for (taskid = 0 ; taskid < B_ntasks ; taskid++)
104             {
105                 int64_t kfirst = kfirst_Bslice [taskid] ;
106                 int64_t klast  = klast_Bslice  [taskid] ;
107                 int64_t task_cnvals = 0 ;
108                 for (int64_t k = kfirst ; k <= klast ; k++)
109                 {
110                     // find the part of B(:,k) for this task
111                     int64_t j = GBH (Bh, k) ;
112                     int64_t pB_start, pB_end ;
113                     GB_get_pA (&pB_start, &pB_end, taskid, k, kfirst,
114                         klast, pstart_Bslice, Bp, vlen) ;
115                     int64_t pC_start = j * vlen ;
116                     // traverse over B(:,j), the kth vector of B
117                     for (int64_t pB = pB_start ; pB < pB_end ; pB++)
118                     {
119                         int64_t i = Bi [pB] ;
120                         int64_t p = pC_start + i ;
121                         if (Cb [p])
122                         {
123                             // C (i,j) = A (i,j) + B (i,j)
124                             GB_GETA (aij, Ax, p) ;
125                             GB_GETB (bij, Bx, pB) ;
126                             GB_BINOP (GB_CX (p), aij, bij, i, j) ;
127                         }
128                         else
129                         {
130                             // C (i,j) = B (i,j)
131                             GB_COPY_B_TO_C (GB_CX (p), Bx, pB) ;
132                             Cb [p] = 1 ;
133                             task_cnvals++ ;
134                         }
135                     }
136                 }
137                 cnvals += task_cnvals ;
138             }
139 
140         }
141         else
142         {
143 
144             //------------------------------------------------------------------
145             // Method23: C and B are bitmap; A is sparse or hypersparse
146             //------------------------------------------------------------------
147 
148             int64_t p ;
149             #pragma omp parallel for num_threads(C_nthreads) schedule(static)
150             for (p = 0 ; p < cnz ; p++)
151             {
152                 // C (i,j) = B (i,j)
153                 int8_t b = Bb [p] ;
154                 if (b) GB_COPY_B_TO_C (GB_CX (p), Bx, p) ;
155                 Cb [p] = b ;
156             }
157             cnvals = B->nvals ;
158 
159             GB_SLICE_MATRIX (A, 8, chunk) ;
160 
161             #pragma omp parallel for num_threads(A_nthreads) \
162                 schedule(dynamic,1) reduction(+:cnvals)
163             for (taskid = 0 ; taskid < A_ntasks ; taskid++)
164             {
165                 int64_t kfirst = kfirst_Aslice [taskid] ;
166                 int64_t klast  = klast_Aslice  [taskid] ;
167                 int64_t task_cnvals = 0 ;
168                 for (int64_t k = kfirst ; k <= klast ; k++)
169                 {
170                     // find the part of A(:,k) for this task
171                     int64_t j = GBH (Ah, k) ;
172                     int64_t pA_start, pA_end ;
173                     GB_get_pA (&pA_start, &pA_end, taskid, k, kfirst,
174                         klast, pstart_Aslice, Ap, vlen) ;
175                     int64_t pC_start = j * vlen ;
176                     // traverse over A(:,j), the kth vector of A
177                     for (int64_t pA = pA_start ; pA < pA_end ; pA++)
178                     {
179                         int64_t i = Ai [pA] ;
180                         int64_t p = pC_start + i ;
181                         if (Cb [p])
182                         {
183                             // C (i,j) = A (i,j) + B (i,j)
184                             GB_GETA (aij, Ax, pA) ;
185                             GB_GETB (bij, Bx, p) ;
186                             GB_BINOP (GB_CX (p), aij, bij, i, j) ;
187                         }
188                         else
189                         {
190                             // C (i,j) = A (i,j)
191                             GB_COPY_A_TO_C (GB_CX (p), Ax, pA) ;
192                             Cb [p] = 1 ;
193                             task_cnvals++ ;
194                         }
195                     }
196                 }
197                 cnvals += task_cnvals ;
198             }
199         }
200 
201     }
202     else if (M_is_sparse_or_hyper)
203     {
204 
205         //----------------------------------------------------------------------
206         // C is bitmap, M is sparse or hyper and complemented
207         //----------------------------------------------------------------------
208 
209         //      ------------------------------------------
210         //      C     <!M> =        A       +       B
211         //      ------------------------------------------
212         //      bitmap  sparse      sparse          bitmap
213         //      bitmap  sparse      sparse          full
214         //      bitmap  sparse      bitmap          sparse
215         //      bitmap  sparse      bitmap          bitmap
216         //      bitmap  sparse      bitmap          full
217         //      bitmap  sparse      full            sparse
218         //      bitmap  sparse      full            bitmap
219         //      bitmap  sparse      full            full
220 
221         // M is sparse and complemented.  If M is sparse and not
222         // complemented, then C is constructed as sparse, not bitmap.
223         ASSERT (Mask_comp) ;
224 
225         // C(i,j) = A(i,j) + B(i,j) can only be computed where M(i,j) is
226         // not present in the sparse pattern of M, and where it is present
227         // but equal to zero.
228 
229         //----------------------------------------------------------------------
230         // scatter M into the C bitmap
231         //----------------------------------------------------------------------
232 
233         GB_SLICE_MATRIX (M, 8, chunk) ;
234 
235         #pragma omp parallel for num_threads(M_nthreads) schedule(dynamic,1)
236         for (taskid = 0 ; taskid < M_ntasks ; taskid++)
237         {
238             int64_t kfirst = kfirst_Mslice [taskid] ;
239             int64_t klast  = klast_Mslice  [taskid] ;
240             for (int64_t k = kfirst ; k <= klast ; k++)
241             {
242                 // find the part of M(:,k) for this task
243                 int64_t j = GBH (Mh, k) ;
244                 int64_t pM_start, pM_end ;
245                 GB_get_pA (&pM_start, &pM_end, taskid, k, kfirst,
246                     klast, pstart_Mslice, Mp, vlen) ;
247                 int64_t pC_start = j * vlen ;
248                 // traverse over M(:,j), the kth vector of M
249                 for (int64_t pM = pM_start ; pM < pM_end ; pM++)
250                 {
251                     // mark C(i,j) if M(i,j) is true
252                     bool mij = GB_mcast (Mx, pM, msize) ;
253                     if (mij)
254                     {
255                         int64_t i = Mi [pM] ;
256                         int64_t p = pC_start + i ;
257                         Cb [p] = 2 ;
258                     }
259                 }
260             }
261         }
262 
263         // C(i,j) has been marked, in Cb, with the value 2 where M(i,j)=1.
264         // These positions will not be computed in C(i,j).  C(i,j) can only
265         // be modified where Cb [p] is zero.
266 
267         //----------------------------------------------------------------------
268         // compute C<!M>=A+B using the mask scattered in C
269         //----------------------------------------------------------------------
270 
271         bool M_cleared = false ;
272 
273         if ((A_is_bitmap || A_is_full) && (B_is_bitmap || B_is_full))
274         {
275 
276             //------------------------------------------------------------------
277             // Method24(!M,sparse): C is bitmap, both A and B are bitmap or full
278             //------------------------------------------------------------------
279 
280             int tid ;
281             #pragma omp parallel for num_threads(C_nthreads) schedule(static) \
282                 reduction(+:cnvals)
283             for (tid = 0 ; tid < C_nthreads ; tid++)
284             {
285                 int64_t pstart, pend, task_cnvals = 0 ;
286                 GB_PARTITION (pstart, pend, cnz, tid, C_nthreads) ;
287                 for (int64_t p = pstart ; p < pend ; p++)
288                 {
289                     int8_t c = Cb [p] ;
290                     if (c == 0)
291                     {
292                         // M(i,j) is zero, so C(i,j) can be computed
293                         int8_t a = GBB (Ab, p) ;
294                         int8_t b = GBB (Bb, p) ;
295                         if (a && b)
296                         {
297                             // C (i,j) = A (i,j) + B (i,j)
298                             GB_GETA (aij, Ax, p) ;
299                             GB_GETB (bij, Bx, p) ;
300                             GB_BINOP (GB_CX (p), aij, bij, p % vlen, p / vlen) ;
301                             c = 1 ;
302                         }
303                         else if (b)
304                         {
305                             // C (i,j) = B (i,j)
306                             GB_COPY_B_TO_C (GB_CX (p), Bx, p) ;
307                             c = 1 ;
308                         }
309                         else if (a)
310                         {
311                             // C (i,j) = A (i,j)
312                             GB_COPY_A_TO_C (GB_CX (p), Ax, p) ;
313                             c = 1 ;
314                         }
315                         Cb [p] = c ;
316                         task_cnvals += c ;
317                     }
318                     else
319                     {
320                         // M(i,j) == 1, so C(i,j) is not computed
321                         Cb [p] = 0 ;
322                     }
323                 }
324                 cnvals += task_cnvals ;
325             }
326             M_cleared = true ;      // M has also been cleared from C
327 
328         }
329         else if (A_is_bitmap || A_is_full)
330         {
331 
332             //------------------------------------------------------------------
333             // Method25(!M,sparse): C bitmap, A bitmap or full, B sparse/hyper
334             //------------------------------------------------------------------
335 
336             int tid ;
337             #pragma omp parallel for num_threads(C_nthreads) schedule(static) \
338                 reduction(+:cnvals)
339             for (tid = 0 ; tid < C_nthreads ; tid++)
340             {
341                 int64_t pstart, pend, task_cnvals = 0 ;
342                 GB_PARTITION (pstart, pend, cnz, tid, C_nthreads) ;
343                 for (int64_t p = pstart ; p < pend ; p++)
344                 {
345                     if (Cb [p] == 0)
346                     {
347                         // C (i,j) = A (i,j)
348                         int8_t a = GBB (Ab, p) ;
349                         if (a) GB_COPY_A_TO_C (GB_CX (p), Ax, p) ;
350                         Cb [p] = a ;
351                         task_cnvals += a ;
352                     }
353                 }
354                 cnvals += task_cnvals ;
355             }
356 
357             GB_SLICE_MATRIX (B, 8, chunk) ;
358 
359             #pragma omp parallel for num_threads(B_nthreads) \
360                 schedule(dynamic,1) reduction(+:cnvals)
361             for (taskid = 0 ; taskid < B_ntasks ; taskid++)
362             {
363                 int64_t kfirst = kfirst_Bslice [taskid] ;
364                 int64_t klast  = klast_Bslice  [taskid] ;
365                 int64_t task_cnvals = 0 ;
366                 for (int64_t k = kfirst ; k <= klast ; k++)
367                 {
368                     // find the part of B(:,k) for this task
369                     int64_t j = GBH (Bh, k) ;
370                     int64_t pB_start, pB_end ;
371                     GB_get_pA (&pB_start, &pB_end, taskid, k, kfirst,
372                         klast, pstart_Bslice, Bp, vlen) ;
373                     int64_t pC_start = j * vlen ;
374                     // traverse over B(:,j), the kth vector of B
375                     for (int64_t pB = pB_start ; pB < pB_end ; pB++)
376                     {
377                         int64_t i = Bi [pB] ;
378                         int64_t p = pC_start + i ;
379                         int8_t c = Cb [p] ;
380                         if (c == 1)
381                         {
382                             // C (i,j) = A (i,j) + B (i,j)
383                             GB_GETA (aij, Ax, p) ;
384                             GB_GETB (bij, Bx, pB) ;
385                             GB_BINOP (GB_CX (p), aij, bij, i, j) ;
386                         }
387                         else if (c == 0)
388                         {
389                             // C (i,j) = B (i,j)
390                             GB_COPY_B_TO_C (GB_CX (p), Bx, pB) ;
391                             Cb [p] = 1 ;
392                             task_cnvals++ ;
393                         }
394                     }
395                 }
396                 cnvals += task_cnvals ;
397             }
398 
399         }
400         else
401         {
402 
403             //------------------------------------------------------------------
404             // Method26: C bitmap, A sparse or hypersparse, B bitmap or full
405             //------------------------------------------------------------------
406 
407             int tid ;
408             #pragma omp parallel for num_threads(C_nthreads) schedule(static) \
409                 reduction(+:cnvals)
410             for (tid = 0 ; tid < C_nthreads ; tid++)
411             {
412                 int64_t pstart, pend, task_cnvals = 0 ;
413                 GB_PARTITION (pstart, pend, cnz, tid, C_nthreads) ;
414                 for (int64_t p = pstart ; p < pend ; p++)
415                 {
416                     if (Cb [p] == 0)
417                     {
418                         // C (i,j) = B (i,j)
419                         int8_t b = GBB (Bb, p) ;
420                         if (b) GB_COPY_B_TO_C (GB_CX (p), Bx, p) ;
421                         Cb [p] = b ;
422                         task_cnvals += b ;
423                     }
424                 }
425                 cnvals += task_cnvals ;
426             }
427 
428             GB_SLICE_MATRIX (A, 8, chunk) ;
429 
430             #pragma omp parallel for num_threads(A_nthreads) \
431                 schedule(dynamic,1) reduction(+:cnvals)
432             for (taskid = 0 ; taskid < A_ntasks ; taskid++)
433             {
434                 int64_t kfirst = kfirst_Aslice [taskid] ;
435                 int64_t klast  = klast_Aslice  [taskid] ;
436                 int64_t task_cnvals = 0 ;
437                 for (int64_t k = kfirst ; k <= klast ; k++)
438                 {
439                     // find the part of A(:,k) for this task
440                     int64_t j = GBH (Ah, k) ;
441                     int64_t pA_start, pA_end ;
442                     GB_get_pA (&pA_start, &pA_end, taskid, k, kfirst,
443                         klast, pstart_Aslice, Ap, vlen) ;
444                     int64_t pC_start = j * vlen ;
445                     // traverse over A(:,j), the kth vector of A
446                     for (int64_t pA = pA_start ; pA < pA_end ; pA++)
447                     {
448                         int64_t i = Ai [pA] ;
449                         int64_t p = pC_start + i ;
450                         int8_t c = Cb [p] ;
451                         if (c == 1)
452                         {
453                             // C (i,j) = A (i,j) + B (i,j)
454                             GB_GETA (aij, Ax, pA) ;
455                             GB_GETB (bij, Bx, p) ;
456                             GB_BINOP (GB_CX (p), aij, bij, i, j) ;
457                         }
458                         else if (c == 0)
459                         {
460                             // C (i,j) = A (i,j)
461                             GB_COPY_A_TO_C (GB_CX (p), Ax, pA) ;
462                             Cb [p] = 1 ;
463                             task_cnvals++ ;
464                         }
465                     }
466                 }
467                 cnvals += task_cnvals ;
468             }
469         }
470 
471         //---------------------------------------------------------------------
472         // clear M from C
473         //---------------------------------------------------------------------
474 
475         if (!M_cleared)
476         {
477             // This step is required if either A or B are sparse/hyper (if
478             // one is sparse/hyper, the other must be bitmap).  It requires
479             // an extra pass over the mask M, so this might be slower than
480             // postponing the application of the mask, and doing it later.
481 
482             #pragma omp parallel for num_threads(M_nthreads) schedule(dynamic,1)
483             for (taskid = 0 ; taskid < M_ntasks ; taskid++)
484             {
485                 int64_t kfirst = kfirst_Mslice [taskid] ;
486                 int64_t klast  = klast_Mslice  [taskid] ;
487                 for (int64_t k = kfirst ; k <= klast ; k++)
488                 {
489                     // find the part of M(:,k) for this task
490                     int64_t j = GBH (Mh, k) ;
491                     int64_t pM_start, pM_end ;
492                     GB_get_pA (&pM_start, &pM_end, taskid, k, kfirst,
493                         klast, pstart_Mslice, Mp, vlen) ;
494                     int64_t pC_start = j * vlen ;
495                     // traverse over M(:,j), the kth vector of M
496                     for (int64_t pM = pM_start ; pM < pM_end ; pM++)
497                     {
498                         // mark C(i,j) if M(i,j) is true
499                         bool mij = GB_mcast (Mx, pM, msize) ;
500                         if (mij)
501                         {
502                             int64_t i = Mi [pM] ;
503                             int64_t p = pC_start + i ;
504                             Cb [p] = 0 ;
505                         }
506                     }
507                 }
508             }
509         }
510 
511     }
512     else
513     {
514 
515         //----------------------------------------------------------------------
516         // C is bitmap; M is bitmap or full
517         //----------------------------------------------------------------------
518 
519         //      ------------------------------------------
520         //      C      <M> =        A       +       B
521         //      ------------------------------------------
522         //      bitmap  bitmap      sparse          bitmap
523         //      bitmap  bitmap      sparse          full
524         //      bitmap  bitmap      bitmap          sparse
525         //      bitmap  bitmap      bitmap          bitmap
526         //      bitmap  bitmap      bitmap          full
527         //      bitmap  bitmap      full            sparse
528         //      bitmap  bitmap      full            bitmap
529         //      bitmap  bitmap      full            full
530 
531         //      ------------------------------------------
532         //      C      <M> =        A       +       B
533         //      ------------------------------------------
534         //      bitmap  full        sparse          bitmap
535         //      bitmap  full        sparse          full
536         //      bitmap  full        bitmap          sparse
537         //      bitmap  full        bitmap          bitmap
538         //      bitmap  full        bitmap          full
539         //      bitmap  full        full            sparse
540         //      bitmap  full        full            bitmap
541         //      bitmap  full        full            full
542 
543         //      ------------------------------------------
544         //      C     <!M> =        A       +       B
545         //      ------------------------------------------
546         //      bitmap  bitmap      sparse          sparse
547         //      bitmap  bitmap      sparse          bitmap
548         //      bitmap  bitmap      sparse          full
549         //      bitmap  bitmap      bitmap          sparse
550         //      bitmap  bitmap      bitmap          bitmap
551         //      bitmap  bitmap      bitmap          full
552         //      bitmap  bitmap      full            sparse
553         //      bitmap  bitmap      full            bitmap
554         //      bitmap  bitmap      full            full
555 
556         //      ------------------------------------------
557         //      C     <!M> =        A       +       B
558         //      ------------------------------------------
559         //      bitmap  full        sparse          sparse
560         //      bitmap  full        sparse          bitmap
561         //      bitmap  full        sparse          full
562         //      bitmap  full        bitmap          sparse
563         //      bitmap  full        bitmap          bitmap
564         //      bitmap  full        bitmap          full
565         //      bitmap  full        full            sparse
566         //      bitmap  full        full            bitmap
567         //      bitmap  full        full            full
568 
569 
570         ASSERT (M_is_bitmap || M_is_full) ;
571         ASSERT (A_is_bitmap || A_is_full || B_is_bitmap || B_is_full) ;
572 
573         #undef  GB_GET_MIJ
574         #define GB_GET_MIJ(p)                                           \
575             bool mij = GBB (Mb, p) && GB_mcast (Mx, p, msize) ;         \
576             if (Mask_comp) mij = !mij ;
577 
578         if ((A_is_bitmap || A_is_full) && (B_is_bitmap || B_is_full))
579         {
580 
581             //------------------------------------------------------------------
582             // Method27: C is bitmap; M, A, and B are bitmap or full
583             //------------------------------------------------------------------
584 
585             int tid ;
586             #pragma omp parallel for num_threads(C_nthreads) schedule(static) \
587                 reduction(+:cnvals)
588             for (tid = 0 ; tid < C_nthreads ; tid++)
589             {
590                 int64_t pstart, pend, task_cnvals = 0 ;
591                 GB_PARTITION (pstart, pend, cnz, tid, C_nthreads) ;
592                 for (int64_t p = pstart ; p < pend ; p++)
593                 {
594                     GB_GET_MIJ (p) ;
595                     if (mij)
596                     {
597                         // M(i,j) is true, so C(i,j) can be computed
598                         int8_t a = GBB (Ab, p) ;
599                         int8_t b = GBB (Bb, p) ;
600                         int8_t c = 0 ;
601                         if (a && b)
602                         {
603                             // C (i,j) = A (i,j) + B (i,j)
604                             GB_GETA (aij, Ax, p) ;
605                             GB_GETB (bij, Bx, p) ;
606                             GB_BINOP (GB_CX (p), aij, bij, p % vlen, p / vlen) ;
607                             c = 1 ;
608                         }
609                         else if (b)
610                         {
611                             // C (i,j) = B (i,j)
612                             GB_COPY_B_TO_C (GB_CX (p), Bx, p) ;
613                             c = 1 ;
614                         }
615                         else if (a)
616                         {
617                             // C (i,j) = A (i,j)
618                             GB_COPY_A_TO_C (GB_CX (p), Ax, p) ;
619                             c = 1 ;
620                         }
621                         Cb [p] = c ;
622                         task_cnvals += c ;
623                     }
624                     else
625                     {
626                         // M(i,j) == 1, so C(i,j) is not computed
627                         Cb [p] = 0 ;
628                     }
629                 }
630                 cnvals += task_cnvals ;
631             }
632 
633         }
634         else if (A_is_bitmap || A_is_full)
635         {
636 
637             //------------------------------------------------------------------
638             // Method28: C bitmap; M and A bitmap or full; B sparse or hyper
639             //------------------------------------------------------------------
640 
641             int tid ;
642             #pragma omp parallel for num_threads(C_nthreads) schedule(static) \
643                 reduction(+:cnvals)
644             for (tid = 0 ; tid < C_nthreads ; tid++)
645             {
646                 int64_t pstart, pend, task_cnvals = 0 ;
647                 GB_PARTITION (pstart, pend, cnz, tid, C_nthreads) ;
648                 for (int64_t p = pstart ; p < pend ; p++)
649                 {
650                     GB_GET_MIJ (p) ;
651                     if (mij)
652                     {
653                         // C (i,j) = A (i,j)
654                         int8_t a = GBB (Ab, p) ;
655                         if (a) GB_COPY_A_TO_C (GB_CX (p), Ax, p) ;
656                         Cb [p] = a ;
657                         task_cnvals += a ;
658                     }
659                     else
660                     {
661                         Cb [p] = 0 ;
662                     }
663                 }
664                 cnvals += task_cnvals ;
665             }
666 
667             GB_SLICE_MATRIX (B, 8, chunk) ;
668 
669             #pragma omp parallel for num_threads(B_nthreads) \
670                 schedule(dynamic,1) reduction(+:cnvals)
671             for (taskid = 0 ; taskid < B_ntasks ; taskid++)
672             {
673                 int64_t kfirst = kfirst_Bslice [taskid] ;
674                 int64_t klast  = klast_Bslice  [taskid] ;
675                 int64_t task_cnvals = 0 ;
676                 for (int64_t k = kfirst ; k <= klast ; k++)
677                 {
678                     // find the part of B(:,k) for this task
679                     int64_t j = GBH (Bh, k) ;
680                     int64_t pB_start, pB_end ;
681                     GB_get_pA (&pB_start, &pB_end, taskid, k, kfirst,
682                         klast, pstart_Bslice, Bp, vlen) ;
683                     int64_t pC_start = j * vlen ;
684                     // traverse over B(:,j), the kth vector of B
685                     for (int64_t pB = pB_start ; pB < pB_end ; pB++)
686                     {
687                         int64_t i = Bi [pB] ;
688                         int64_t p = pC_start + i ;
689                         GB_GET_MIJ (p) ;
690                         if (mij)
691                         {
692                             int8_t c = Cb [p] ;
693                             if (c == 1)
694                             {
695                                 // C (i,j) = A (i,j) + B (i,j)
696                                 GB_GETA (aij, Ax, p) ;
697                                 GB_GETB (bij, Bx, pB) ;
698                                 GB_BINOP (GB_CX (p), aij, bij, i, j) ;
699                             }
700                             else
701                             {
702                                 // C (i,j) = B (i,j)
703                                 GB_COPY_B_TO_C (GB_CX (p), Bx, pB) ;
704                                 Cb [p] = 1 ;
705                                 task_cnvals++ ;
706                             }
707                         }
708                     }
709                 }
710                 cnvals += task_cnvals ;
711             }
712 
713         }
714         else
715         {
716 
717             //------------------------------------------------------------------
718             // Method29: C bitmap; M and B bitmap or full; A sparse or hyper
719             //------------------------------------------------------------------
720 
721             int tid ;
722             #pragma omp parallel for num_threads(C_nthreads) schedule(static) \
723                 reduction(+:cnvals)
724             for (tid = 0 ; tid < C_nthreads ; tid++)
725             {
726                 int64_t pstart, pend, task_cnvals = 0 ;
727                 GB_PARTITION (pstart, pend, cnz, tid, C_nthreads) ;
728                 for (int64_t p = pstart ; p < pend ; p++)
729                 {
730                     GB_GET_MIJ (p) ;
731                     if (mij)
732                     {
733                         // C (i,j) = B (i,j)
734                         int8_t b = GBB (Bb, p) ;
735                         if (b) GB_COPY_B_TO_C (GB_CX (p), Bx, p) ;
736                         Cb [p] = b ;
737                         task_cnvals += b ;
738                     }
739                     else
740                     {
741                         Cb [p] = 0 ;
742                     }
743                 }
744                 cnvals += task_cnvals ;
745             }
746 
747             GB_SLICE_MATRIX (A, 8, chunk) ;
748 
749             #pragma omp parallel for num_threads(A_nthreads) \
750                 schedule(dynamic,1) reduction(+:cnvals)
751             for (taskid = 0 ; taskid < A_ntasks ; taskid++)
752             {
753                 int64_t kfirst = kfirst_Aslice [taskid] ;
754                 int64_t klast  = klast_Aslice  [taskid] ;
755                 int64_t task_cnvals = 0 ;
756                 for (int64_t k = kfirst ; k <= klast ; k++)
757                 {
758                     // find the part of A(:,k) for this task
759                     int64_t j = GBH (Ah, k) ;
760                     int64_t pA_start, pA_end ;
761                     GB_get_pA (&pA_start, &pA_end, taskid, k, kfirst,
762                         klast, pstart_Aslice, Ap, vlen) ;
763                     int64_t pC_start = j * vlen ;
764                     // traverse over A(:,j), the kth vector of A
765                     for (int64_t pA = pA_start ; pA < pA_end ; pA++)
766                     {
767                         int64_t i = Ai [pA] ;
768                         int64_t p = pC_start + i ;
769                         GB_GET_MIJ (p) ;
770                         if (mij)
771                         {
772                             int8_t c = Cb [p] ;
773                             if (c == 1)
774                             {
775                                 // C (i,j) = A (i,j) + B (i,j)
776                                 GB_GETA (aij, Ax, pA) ;
777                                 GB_GETB (bij, Bx, p) ;
778                                 GB_BINOP (GB_CX (p), aij, bij, i, j) ;
779                             }
780                             else
781                             {
782                                 // C (i,j) = A (i,j)
783                                 GB_COPY_A_TO_C (GB_CX (p), Ax, pA) ;
784                                 Cb [p] = 1 ;
785                                 task_cnvals++ ;
786                             }
787                         }
788                     }
789                 }
790                 cnvals += task_cnvals ;
791             }
792         }
793     }
794 
795     C->nvals = cnvals ;
796 }
797 
798