1 //------------------------------------------------------------------------------
2 // GB_sparse_add_template:  C=A+B, C<M>=A+B when C is sparse/hypersparse
3 //------------------------------------------------------------------------------
4 
5 // SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2021, All Rights Reserved.
6 // SPDX-License-Identifier: Apache-2.0
7 
8 //------------------------------------------------------------------------------
9 
10 // C is sparse or hypersparse:
11 
12         //      ------------------------------------------
13         //      C       =           A       +       B
14         //      ------------------------------------------
15         //      sparse  .           sparse          sparse
16 
17         //      ------------------------------------------
18         //      C      <M> =        A       +       B
19         //      ------------------------------------------
20         //      sparse  sparse      sparse          sparse
21         //      sparse  sparse      sparse          bitmap
22         //      sparse  sparse      sparse          full
23         //      sparse  sparse      bitmap          sparse
24         //      sparse  sparse      bitmap          bitmap
25         //      sparse  sparse      bitmap          full
26         //      sparse  sparse      full            sparse
27         //      sparse  sparse      full            bitmap
28         //      sparse  sparse      full            full        (same as emult)
29 
30         //      sparse  bitmap      sparse          sparse
31         //      sparse  full        sparse          sparse
32 
33         //      ------------------------------------------
34         //      C     <!M> =        A       +       B
35         //      ------------------------------------------
36         //      sparse  bitmap      sparse          sparse
37         //      sparse  full        sparse          sparse
38 
39 // If all four matrices are sparse/hypersparse, and C<!M>=A+B is being
40 // computed, then M is passed in as NULL to GB_add_phase*.  GB_add_sparsity
41 // returns apply_mask as false.  The methods below do not handle the case when
42 // C is sparse, M is sparse, and !M is used.  All other uses of !M when M
43 // is sparse result in a bitmap structure for C, and this is handled by
44 // GB_bitmap_add_template.
45 
46         // For this case: the mask is done later, so C=A+B is computed here:
47 
48         //      ------------------------------------------
49         //      C     <!M> =        A       +       B
50         //      ------------------------------------------
51         //      sparse  sparse      sparse          sparse      (mask later)
52 
53 {
54 
    #ifdef GB_DEBUG
    // When no mask is present, or when M is bitmap or full, the sparsity
    // tables above only permit A and B to be sparse or hypersparse; bitmap
    // or full A and B in those cases are handled elsewhere.
    if (M == NULL || M_is_bitmap || M_is_full)
    {
        ASSERT (A_is_sparse || A_is_hyper) ;
        ASSERT (B_is_sparse || B_is_hyper) ;
    }
    #endif
62 
63     //--------------------------------------------------------------------------
64     // phase1: count entries in each C(:,j)
65     // phase2: compute C
66     //--------------------------------------------------------------------------
67 
68     #pragma omp parallel for num_threads(C_nthreads) schedule(dynamic,1)
69     for (taskid = 0 ; taskid < C_ntasks ; taskid++)
70     {
71 
72         //----------------------------------------------------------------------
73         // get the task descriptor
74         //----------------------------------------------------------------------
75 
        int64_t kfirst = TaskList [taskid].kfirst ;
        int64_t klast  = TaskList [taskid].klast ;
        // klast == -1 is the tag for a fine task: it operates on a slice of
        // the single vector C(:,kfirst) rather than on whole vectors
        bool fine_task = (klast == -1) ;
        int64_t len ;
        if (fine_task)
        {
            // a fine task operates on a slice of a single vector
            klast = kfirst ;
            len = TaskList [taskid].len ;
        }
        else
        {
            // a coarse task operates on one or more whole vectors, each of
            // full length vlen
            len = vlen ;
        }
91 
92         //----------------------------------------------------------------------
93         // compute all vectors in this task
94         //----------------------------------------------------------------------
95 
96         for (int64_t k = kfirst ; k <= klast ; k++)
97         {
98 
99             //------------------------------------------------------------------
100             // get j, the kth vector of C
101             //------------------------------------------------------------------
102 
103             int64_t j = GBH (Ch, k) ;
104 
            #if defined ( GB_PHASE_1_OF_2 )
            // phase 1: only count the entries in C(:,j); nothing is written
            int64_t cjnz = 0 ;
            #else
            // phase 2: Cp has already been computed by phase 1, so the
            // position of C(:,j) in Ci and Cx is known before it is computed
            int64_t pC, pC_end ;
            if (fine_task)
            {
                // A fine task computes a slice of C(:,j); the slice ends
                // where the next fine task's slice begins
                pC     = TaskList [taskid  ].pC ;
                pC_end = TaskList [taskid+1].pC ;
                ASSERT (Cp [k] <= pC && pC <= pC_end && pC_end <= Cp [k+1]) ;
            }
            else
            {
                // The vectors of C are never sliced for a coarse task.
                pC     = Cp [k  ] ;
                pC_end = Cp [k+1] ;
            }
            // number of entries to compute in this slice of C(:,j); skip the
            // vector entirely if it is empty
            int64_t cjnz = pC_end - pC ;
            if (cjnz == 0) continue ;
            #endif
125 
126             //------------------------------------------------------------------
127             // get A(:,j)
128             //------------------------------------------------------------------
129 
            int64_t pA = -1, pA_end = -1 ;
            if (fine_task)
            {
                // A fine task operates on Ai,Ax [pA...pA_end-1], which is
                // a subset of the vector A(:,j)
                pA     = TaskList [taskid].pA ;
                pA_end = TaskList [taskid].pA_end ;
            }
            else
            {
                // A coarse task operates on the entire vector A (:,j).
                // C_to_A [k] maps the kth vector of C to the corresponding
                // vector of A when their hyperlists differ (C_to_A is NULL
                // when no mapping is needed).  kA < 0 means A(:,j) has no
                // entries: pA and pA_end stay -1, so ajnz is 0 below.
                int64_t kA = (C_to_A == NULL) ? j : C_to_A [k] ;
                if (kA >= 0)
                {
                    pA     = GBP (Ap, kA, vlen) ;
                    pA_end = GBP (Ap, kA+1, vlen) ;
                }
            }

            int64_t ajnz = pA_end - pA ;    // nnz in A(:,j) for this slice
            int64_t pA_start = pA ;
            // adense: all len entries in this slice of A(:,j) are present
            bool adense = (ajnz == len) ;

            // get the first and last indices in A(:,j) for this vector
            int64_t iA_first = -1, iA_last = -1 ;
            if (ajnz > 0)
            {
                iA_first = GBI (Ai, pA, vlen) ;
                iA_last  = GBI (Ai, pA_end-1, vlen) ;
            }

            //------------------------------------------------------------------
            // get B(:,j)
            //------------------------------------------------------------------

            int64_t pB = -1, pB_end = -1 ;
            if (fine_task)
            {
                // A fine task operates on Bi,Bx [pB...pB_end-1], which is
                // a subset of the vector B(:,j)
                pB     = TaskList [taskid].pB ;
                pB_end = TaskList [taskid].pB_end ;
            }
            else
            {
                // A coarse task operates on the entire vector B (:,j);
                // C_to_B and kB play the same roles as C_to_A and kA above
                int64_t kB = (C_to_B == NULL) ? j : C_to_B [k] ;
                if (kB >= 0)
                {
                    pB     = GBP (Bp, kB, vlen) ;
                    pB_end = GBP (Bp, kB+1, vlen) ;
                }
            }

            int64_t bjnz = pB_end - pB ;    // nnz in B(:,j) for this slice
            int64_t pB_start = pB ;
            // bdense: all len entries in this slice of B(:,j) are present
            bool bdense = (bjnz == len) ;

            // get the first and last indices in B(:,j) for this vector
            int64_t iB_first = -1, iB_last = -1 ;
            if (bjnz > 0)
            {
                iB_first = GBI (Bi, pB, vlen) ;
                iB_last  = GBI (Bi, pB_end-1, vlen) ;
            }
195 
196             //------------------------------------------------------------------
197             // get M(:,j) if M is sparse or hypersparse
198             //------------------------------------------------------------------
199 
            // pM...pM_end-1 will delimit M(:,j), or its slice for a fine
            // task; pM stays -1 if M is not sparse/hypersparse, or if
            // M(:,j) is empty
            bool sparse_mask_is_easy = false ;
            int64_t pM = -1 ;
            int64_t pM_end = -1 ;
            if (M_is_sparse_or_hyper)
            {
                if (fine_task)
                {
                    // A fine task operates on Mi,Mx [pM...pM_end-1],
                    // which is a subset of the vector M(:,j)
                    pM     = TaskList [taskid].pM ;
                    pM_end = TaskList [taskid].pM_end ;
                }
                else
                {
                    int64_t kM = -1 ;
                    if (Ch_is_Mh)
                    {
                        // Ch is the same as Mh (a deep copy), so the kth
                        // vector of C is also the kth vector of M
                        ASSERT (Ch != NULL) ;
                        ASSERT (M_is_hyper) ;
                        ASSERT (Ch [k] == M->h [k]) ;
                        kM = k ;
                    }
                    else
                    {
                        // map the kth vector of C to its vector in M, as
                        // for C_to_A and C_to_B above
                        kM = (C_to_M == NULL) ? j : C_to_M [k] ;
                    }
                    if (kM >= 0)
                    {
                        pM     = GBP (Mp, kM  , vlen) ;
                        pM_end = GBP (Mp, kM+1, vlen) ;
                    }
                }

                // The "easy mask" condition requires M to be sparse/hyper
                // and structural.  A and B cannot be bitmap.  Also one of
                // the following 3 conditions must hold:
                // (1) all entries are present in A(:,j) and B == M
                // (2) all entries are present in B(:,j) and A == M
                // (3) both A and B are aliased to M
                sparse_mask_is_easy =
                    Mask_struct &&          // M must be structural
                    !A_is_bitmap &&         // A must not be bitmap
                    !B_is_bitmap &&         // B must not be bitmap
                    ((adense && B == M) ||  // one of 3 conditions holds
                     (bdense && A == M) ||
                     (A == M && B == M)) ;

                // When the easy-mask condition holds, every entry of M(:,j)
                // appears in A(:,j)+B(:,j), so the pattern of C(:,j) is
                // exactly the pattern of M(:,j) (see Methods 11 to 13).

                // TODO: add the condition above to GB_add_sparsity,
                // where adense/bdense are true for the whole matrix
                // (adense is true if A is full, or sparse/hypersparse with
                // all entries present).  The test here is done vector by
                // vector, for each A(:,j) and B(:,j).  This is a finer grain
                // test, as compared to a test for all of A and B.

            }
256 
257             //------------------------------------------------------------------
258             // C(:,j)<optional mask> = A (:,j) + B (:,j) or subvector
259             //------------------------------------------------------------------
260 
261             if (M == NULL)
262             {
263 
264                 //--------------------------------------------------------------
265                 // M is not present, or !M is sparse but not applied here
266                 //--------------------------------------------------------------
267 
268                 //      ------------------------------------------
269                 //      C       =           A       +       B
270                 //      ------------------------------------------
271                 //      sparse  .           sparse          sparse
272 
273                 //      ------------------------------------------
274                 //      C     <!M> =        A       +       B
275                 //      ------------------------------------------
276                 //      sparse  sparse      sparse          sparse  (mask later)
277 
278                 // If all four matrices are sparse or hypersparse, and
279                 // Mask_comp is true, the mask M is passed in to this method as
280                 // NULL.  C=A+B is computed with no mask, and !M is applied
281                 // later.
282 
283                 // A and B are both sparse or hypersparse, not bitmap or
284                 // full, but individual vectors of A and B might have all
285                 // entries present (adense and/or bdense).
286                 ASSERT (A_is_sparse || A_is_hyper) ;
287                 ASSERT (B_is_sparse || B_is_hyper) ;
288 
289                 #if defined ( GB_PHASE_1_OF_2 )
290 
291                 if (A_and_B_are_disjoint)
292                 {
293 
294                     // only used by GB_Matrix_wait, which computes A+T
295                     // where T is the matrix of pending tuples for A.  The
296                     // pattern of pending tuples is always disjoint with
297                     // the pattern of A.
298 
299                     cjnz = ajnz + bjnz ;
300 
301                 }
302                 else
303 
304                 #endif
305 
                if (adense && bdense)
                {

                    //----------------------------------------------------------
                    // Method01: A(:,j) and B(:,j) dense: thus C(:,j) dense
                    //----------------------------------------------------------

                    // Both slices have all len entries present, so they cover
                    // the same contiguous index range (the ASSERTs below),
                    // and C(:,j) takes that same pattern: i = iA_first + p.
                    ASSERT (ajnz == bjnz) ;
                    ASSERT (iA_first == iB_first) ;
                    ASSERT (iA_last  == iB_last ) ;
                    #if defined ( GB_PHASE_1_OF_2 )
                    cjnz = ajnz ;
                    #else
                    ASSERT (cjnz == ajnz) ;
                    GB_PRAGMA_SIMD_VECTORIZE
                    for (int64_t p = 0 ; p < ajnz ; p++)
                    {
                        // C (i,j) = A (i,j) + B (i,j)
                        int64_t i = p + iA_first ;
                        Ci [pC + p] = i ;
                        ASSERT (Ai [pA + p] == i) ;
                        ASSERT (Bi [pB + p] == i) ;
                        GB_GETA (aij, Ax, pA + p) ;
                        GB_GETB (bij, Bx, pB + p) ;
                        GB_BINOP (GB_CX (pC + p), aij, bij, i, j) ;
                    }
                    #endif

                }
335                 else if (adense)
336                 {
337 
338                     //----------------------------------------------------------
339                     // Method02: A(:,j) dense, B(:,j) sparse: C(:,j) dense
340                     //----------------------------------------------------------
341 
342                     #if defined ( GB_PHASE_1_OF_2 )
343                     cjnz = ajnz ;
344                     #else
345                     ASSERT (cjnz == ajnz) ;
346                     GB_PRAGMA_SIMD_VECTORIZE
347                     for (int64_t p = 0 ; p < ajnz ; p++)
348                     {
349                         // C (i,j) = A (i,j)
350                         int64_t i = p + iA_first ;
351                         Ci [pC + p] = i ;
352                         ASSERT (Ai [pA + p] == i) ;
353                         GB_COPY_A_TO_C (GB_CX (pC + p), Ax, pA + p) ;
354                     }
355                     GB_PRAGMA_SIMD_VECTORIZE
356                     for (int64_t p = 0 ; p < bjnz ; p++)
357                     {
358                         // C (i,j) = A (i,j) + B (i,j)
359                         int64_t i = Bi [pB + p] ;
360                         int64_t ii = i - iA_first ;
361                         ASSERT (Ai [pA + ii] == i) ;
362                         GB_GETA (aij, Ax, pA + ii) ;
363                         GB_GETB (bij, Bx, pB + p) ;
364                         GB_BINOP (GB_CX (pC + ii), aij, bij, i, j) ;
365                     }
366                     #endif
367 
368                 }
369                 else if (bdense)
370                 {
371 
372                     //----------------------------------------------------------
373                     // Method03: A(:,j) sparse, B(:,j) dense: C(:,j) dense
374                     //----------------------------------------------------------
375 
376                     #if defined ( GB_PHASE_1_OF_2 )
377                     cjnz = bjnz ;
378                     #else
379                     ASSERT (cjnz == bjnz) ;
380                     GB_PRAGMA_SIMD_VECTORIZE
381                     for (int64_t p = 0 ; p < bjnz ; p++)
382                     {
383                         // C (i,j) = B (i,j)
384                         int64_t i = p + iB_first ;
385                         Ci [pC + p] = i ;
386                         ASSERT (Bi [pB + p] == i) ;
387                         GB_COPY_B_TO_C (GB_CX (pC + p), Bx, pB + p) ;
388                     }
389                     GB_PRAGMA_SIMD_VECTORIZE
390                     for (int64_t p = 0 ; p < ajnz ; p++)
391                     {
392                         // C (i,j) = A (i,j) + B (i,j)
393                         int64_t i = Ai [pA + p] ;
394                         int64_t ii = i - iB_first ;
395                         ASSERT (Bi [pB + ii] == i) ;
396                         GB_GETA (aij, Ax, pA + p) ;
397                         GB_GETB (bij, Bx, pB + ii) ;
398                         GB_BINOP (GB_CX (pC + ii), aij, bij, i, j) ;
399                     }
400                     #endif
401 
402                 }
403                 else if (ajnz == 0)
404                 {
405 
406                     //----------------------------------------------------------
407                     // Method04: A(:,j) is empty
408                     //----------------------------------------------------------
409 
410                     #if defined ( GB_PHASE_1_OF_2 )
411                     cjnz = bjnz ;
412                     #else
413                     ASSERT (cjnz == bjnz) ;
414                     memcpy (Ci + pC, Bi + pB, bjnz * sizeof (int64_t)) ;
415                     GB_PRAGMA_SIMD_VECTORIZE
416                     for (int64_t p = 0 ; p < bjnz ; p++)
417                     {
418                         // C (i,j) = B (i,j)
419                         GB_COPY_B_TO_C (GB_CX (pC + p), Bx, pB + p) ;
420                     }
421                     #endif
422 
423                 }
424                 else if (bjnz == 0)
425                 {
426 
427                     //----------------------------------------------------------
428                     // Method05: B(:,j) is empty
429                     //----------------------------------------------------------
430 
431                     #if defined ( GB_PHASE_1_OF_2 )
432                     cjnz = ajnz ;
433                     #else
434                     ASSERT (cjnz == ajnz) ;
435                     memcpy (Ci + pC, Ai + pA, ajnz * sizeof (int64_t)) ;
436                     GB_PRAGMA_SIMD_VECTORIZE
437                     for (int64_t p = 0 ; p < ajnz ; p++)
438                     {
439                         // C (i,j) = A (i,j)
440                         GB_COPY_A_TO_C (GB_CX (pC + p), Ax, pA + p) ;
441                     }
442                     #endif
443 
444                 }
445                 else if (iA_last < iB_first)
446                 {
447 
448                     //----------------------------------------------------------
449                     // Method06: last A(:,j) comes before 1st B(:,j)
450                     //----------------------------------------------------------
451 
452                     #if defined ( GB_PHASE_1_OF_2 )
453                     cjnz = ajnz + bjnz ;
454                     #else
455                     ASSERT (cjnz == ajnz + bjnz) ;
456                     memcpy (Ci + pC, Ai + pA, ajnz * sizeof (int64_t)) ;
457                     GB_PRAGMA_SIMD_VECTORIZE
458                     for (int64_t p = 0 ; p < ajnz ; p++)
459                     {
460                         // C (i,j) = A (i,j)
461                         GB_COPY_A_TO_C (GB_CX (pC + p), Ax, pA + p) ;
462                     }
463                     pC += ajnz ;
464                     memcpy (Ci + pC, Bi + pB, bjnz * sizeof (int64_t)) ;
465                     GB_PRAGMA_SIMD_VECTORIZE
466                     for (int64_t p = 0 ; p < bjnz ; p++)
467                     {
468                         // C (i,j) = B (i,j)
469                         GB_COPY_B_TO_C (GB_CX (pC + p), Bx, pB + p) ;
470                     }
471                     #endif
472 
473                 }
474                 else if (iB_last < iA_first)
475                 {
476 
477                     //----------------------------------------------------------
478                     // Method07: last B(:,j) comes before 1st A(:,j)
479                     //----------------------------------------------------------
480 
481                     #if defined ( GB_PHASE_1_OF_2 )
482                     cjnz = ajnz + bjnz ;
483                     #else
484                     ASSERT (cjnz == ajnz + bjnz) ;
485                     memcpy (Ci + pC, Bi + pB, bjnz * sizeof (int64_t)) ;
486                     GB_PRAGMA_SIMD_VECTORIZE
487                     for (int64_t p = 0 ; p < bjnz ; p++)
488                     {
489                         // C (i,j) = B (i,j)
490                         GB_COPY_B_TO_C (GB_CX (pC + p), Bx, pB + p) ;
491                     }
492                     pC += bjnz ;
493                     memcpy (Ci + pC, Ai + pA, ajnz * sizeof (int64_t)) ;
494                     GB_PRAGMA_SIMD_VECTORIZE
495                     for (int64_t p = 0 ; p < ajnz ; p++)
496                     {
497                         // C (i,j) = A (i,j)
498                         GB_COPY_A_TO_C (GB_CX (pC + p), Ax, pA + p) ;
499                     }
500                     #endif
501 
502                 }
503 
504                 #if defined ( GB_PHASE_1_OF_2 )
505                 else if (ajnz > 32 * bjnz)
506                 {
507 
508                     //----------------------------------------------------------
509                     // Method08: A(:,j) is much denser than B(:,j)
510                     //----------------------------------------------------------
511 
512                     // cjnz = ajnz + bjnz - nnz in the intersection
513 
514                     cjnz = ajnz + bjnz ;
515                     for ( ; pB < pB_end ; pB++)
516                     {
517                         int64_t i = Bi [pB] ;
518                         // find i in A(:,j)
519                         int64_t pright = pA_end - 1 ;
520                         bool found ;
521                         GB_BINARY_SEARCH (i, Ai, pA, pright, found) ;
522                         if (found) cjnz-- ;
523                     }
524 
525                 }
526                 else if (bjnz > 32 * ajnz)
527                 {
528 
529                     //----------------------------------------------------------
530                     // Method09: B(:,j) is much denser than A(:,j)
531                     //----------------------------------------------------------
532 
533                     // cjnz = ajnz + bjnz - nnz in the intersection
534 
535                     cjnz = ajnz + bjnz ;
536                     for ( ; pA < pA_end ; pA++)
537                     {
538                         int64_t i = Ai [pA] ;
539                         // find i in B(:,j)
540                         int64_t pright = pB_end - 1 ;
541                         bool found ;
542                         GB_BINARY_SEARCH (i, Bi, pB, pright, found) ;
543                         if (found) cjnz-- ;
544                     }
545 
546                 }
547                 #endif
548 
                else
                {

                    //----------------------------------------------------------
                    // Method10: A(:,j) and B(:,j) about the same sparsity
                    //----------------------------------------------------------

                    // Ordered merge of the two sorted index lists Ai [pA..]
                    // and Bi [pB..].  Phase 1 only counts (cjnz); phase 2
                    // writes the merged pattern and values into C(:,j).
                    while (pA < pA_end && pB < pB_end)
                    {
                        int64_t iA = Ai [pA] ;
                        int64_t iB = Bi [pB] ;
                        if (iA < iB)
                        {
                            // C (iA,j) = A (iA,j)
                            #if defined ( GB_PHASE_2_OF_2 )
                            Ci [pC] = iA ;
                            GB_COPY_A_TO_C (GB_CX (pC), Ax, pA) ;
                            #endif
                            pA++ ;
                        }
                        else if (iA > iB)
                        {
                            // C (iB,j) = B (iB,j)
                            #if defined ( GB_PHASE_2_OF_2 )
                            Ci [pC] = iB ;
                            GB_COPY_B_TO_C (GB_CX (pC), Bx, pB) ;
                            #endif
                            pB++ ;
                        }
                        else
                        {
                            // index present in both: C (i,j) = A (i,j) + B (i,j)
                            #if defined ( GB_PHASE_2_OF_2 )
                            Ci [pC] = iB ;
                            GB_GETA (aij, Ax, pA) ;
                            GB_GETB (bij, Bx, pB) ;
                            GB_BINOP (GB_CX (pC), aij, bij, iB, j) ;
                            #endif
                            pA++ ;
                            pB++ ;
                        }
                        #if defined ( GB_PHASE_2_OF_2 )
                        pC++ ;
                        #else
                        cjnz++ ;
                        #endif
                    }

                    //----------------------------------------------------------
                    // A (:,j) or B (:,j) have entries left; not both
                    //----------------------------------------------------------

                    // The merge loop exhausted at least one of the two lists,
                    // so at most one of ajnz and bjnz is nonzero here; the
                    // two memcpy/loop pairs below share the same pC because
                    // the other pair copies nothing (length 0).
                    ajnz = (pA_end - pA) ;
                    bjnz = (pB_end - pB) ;
                    ASSERT (ajnz == 0 || bjnz == 0) ;
                    #if defined ( GB_PHASE_1_OF_2 )
                    cjnz += ajnz + bjnz ;
                    #else
                    memcpy (Ci + pC, Ai + pA, ajnz * sizeof (int64_t)) ;
                    for (int64_t p = 0 ; p < ajnz ; p++)
                    {
                        // C (i,j) = A (i,j)
                        GB_COPY_A_TO_C (GB_CX (pC + p), Ax, pA + p) ;
                    }
                    memcpy (Ci + pC, Bi + pB, bjnz * sizeof (int64_t)) ;
                    for (int64_t p = 0 ; p < bjnz ; p++)
                    {
                        // C (i,j) = B (i,j)
                        GB_COPY_B_TO_C (GB_CX (pC + p), Bx, pB + p) ;
                    }
                    ASSERT (pC + ajnz + bjnz == pC_end) ;
                    #endif
                }
622 
623             }
624             else if (sparse_mask_is_easy)
625             {
626 
627                 //--------------------------------------------------------------
628                 // special case: M is present and very easy to use
629                 //--------------------------------------------------------------
630 
631                 //      ------------------------------------------
632                 //      C      <M> =        A       +       B
633                 //      ------------------------------------------
634                 //      sparse  sparse      sparse          sparse
635                 //      sparse  sparse      sparse          full
636                 //      sparse  sparse      full            sparse
637                 //      sparse  sparse      full            full
638 
639                 // A and B are sparse, hypersparse or full, not bitmap.
640                 ASSERT (!A_is_bitmap) ;
641                 ASSERT (!B_is_bitmap) ;
642                 ASSERT (Mask_struct) ;
643 
644                 int64_t mjnz = pM_end - pM ;        // nnz (M (:,j))
645 
646                 #if defined ( GB_PHASE_1_OF_2 )
647 
648                 // M is structural, and sparse or hypersparse, so every entry
649                 // in the mask is guaranteed to appear in A+B.  The symbolic
650                 // count is thus trivial.
651 
652                 cjnz = mjnz ;
653 
654                 #else
655 
656                 // copy the pattern into C (:,j)
657                 int64_t pC_start = pC ;
658                 int64_t pM_start = pM ;
659                 memcpy (Ci + pC, Mi + pM, mjnz * sizeof (int64_t)) ;
660                 int64_t pA_offset = pA_start - iA_first ;
661                 int64_t pB_offset = pB_start - iB_first ;
662 
663                 if (adense && B == M)
664                 {
665 
666                     //----------------------------------------------------------
667                     // Method11: A dense, B == M
668                     //----------------------------------------------------------
669 
670                     GB_PRAGMA_SIMD_VECTORIZE
671                     for (int64_t p = 0 ; p < mjnz ; p++)
672                     {
673                         int64_t pM = p + pM_start ;
674                         int64_t pC = p + pC_start ;
675                         int64_t i = Mi [pM] ;
676                         ASSERT (GB_mcast (Mx, pM, msize)) ;
677                         ASSERT (GBI (Ai, pA_offset + i, vlen) == i) ;
678                         ASSERT (GBI (Bi, pM, vlen) == i) ;
679                         GB_GETA (aij, Ax, pA_offset + i) ;
680                         GB_GETB (bij, Bx, pM) ;
681                         GB_BINOP (GB_CX (pC), aij, bij, i, j) ;
682                     }
683 
684                 }
685                 else if (bdense && A == M)
686                 {
687 
688                     //----------------------------------------------------------
689                     // Method12: B dense, A == M
690                     //----------------------------------------------------------
691 
692                     GB_PRAGMA_SIMD_VECTORIZE
693                     for (int64_t p = 0 ; p < mjnz ; p++)
694                     {
695                         int64_t pM = p + pM_start ;
696                         int64_t pC = p + pC_start ;
697                         int64_t i = Mi [pM] ;
698                         ASSERT (GB_mcast (Mx, pM, msize)) ;
699                         ASSERT (GBI (Ai, pM, vlen) == i) ;
700                         ASSERT (GBI (Bi, pB_offset + i, vlen) == i) ;
701                         GB_GETA (aij, Ax, pM) ;
702                         GB_GETB (bij, Bx, pB_offset + i) ;
703                         GB_BINOP (GB_CX (pC), aij, bij, i, j) ;
704                     }
705 
706                 }
                else // (A == M) && (B == M)
                {

                    //----------------------------------------------------------
                    // Method13: A == M == B: all three matrices the same
                    //----------------------------------------------------------

                    // C(i,j) = M(i,j) + M(i,j): only one value t is read per
                    // entry.  When the operator is SECOND, t is typecast from
                    // the B side (GB_GETB); otherwise from the A side
                    // (GB_GETA), so the typecast matches the operand the
                    // operator actually uses.

                    GB_PRAGMA_SIMD_VECTORIZE
                    for (int64_t p = 0 ; p < mjnz ; p++)
                    {
                        int64_t pM = p + pM_start ;
                        int64_t pC = p + pC_start ;
                        #if GB_OP_IS_SECOND
                        GB_GETB (t, Bx, pM) ;
                        #else
                        GB_GETA (t, Ax, pM) ;
                        #endif
                        GB_BINOP (GB_CX (pC), t, t, Mi [pM], j) ;
                    }
                }
727                 #endif
728 
729             }
            else if (M_is_sparse_or_hyper)
            {

                //--------------------------------------------------------------
                // Method14: C and M are sparse or hypersparse
                //--------------------------------------------------------------

                //      ------------------------------------------
                //      C      <M> =        A       +       B
                //      ------------------------------------------
                //      sparse  sparse      sparse          sparse  (*)
                //      sparse  sparse      sparse          bitmap  (*)
                //      sparse  sparse      sparse          full    (*)
                //      sparse  sparse      bitmap          sparse  (*)
                //      sparse  sparse      bitmap          bitmap
                //      sparse  sparse      bitmap          full
                //      sparse  sparse      full            sparse  (*)
                //      sparse  sparse      full            bitmap
                //      sparse  sparse      full            full

                // (*) This method is efficient except when either A or B are
                // sparse, and when M is sparse but with many entries.  When M
                // is sparse and either A or B are sparse, the method is
                // designed to be very efficient when M is very sparse compared
                // with A and/or B.  It traverses all entries in the sparse M,
                // and (for sparse A or B) does a binary search for entries in
                // A or B.  In that case, if M has many entries, the mask M
                // should be ignored, and C=A+B should be computed without any
                // mask.  The test for when to use M here should ignore A or B
                // if they are bitmap or full.

                // A and B can have any sparsity pattern (hypersparse,
                // sparse, bitmap, or full).

                for ( ; pM < pM_end ; pM++)
                {

                    //----------------------------------------------------------
                    // get M(i,j) for A(i,j) + B (i,j)
                    //----------------------------------------------------------

                    // entries where M(i,j) is false are skipped entirely;
                    // they contribute nothing to C(:,j)
                    int64_t i = Mi [pM] ;
                    bool mij = GB_mcast (Mx, pM, msize) ;
                    if (!mij) continue ;

                    //----------------------------------------------------------
                    // get A(i,j)
                    //----------------------------------------------------------

                    bool afound ;
                    if (adense)
                    {
                        // A is dense, bitmap, or full; use quick lookup
                        pA = pA_start + (i - iA_first) ;
                        afound = GBB (Ab, pA) ;
                    }
                    else if (A == M)
                    {
                        // A is aliased to M
                        pA = pM ;
                        afound = true ;
                    }
                    else
                    {
                        // A is sparse; use binary search.  This is slow unless
                        // M is very sparse compared with A.
                        // NOTE(review): pA carries over from the previous
                        // iteration, so the search range [pA, pA_end-1]
                        // shrinks as pM advances — this presumes Mi is sorted
                        // ascending (a GraphBLAS invariant; confirm).
                        int64_t apright = pA_end - 1 ;
                        GB_BINARY_SEARCH (i, Ai, pA, apright, afound) ;
                    }

                    ASSERT (GB_IMPLIES (afound, GBI (Ai, pA, vlen) == i)) ;

                    //----------------------------------------------------------
                    // get B(i,j)
                    //----------------------------------------------------------

                    bool bfound ;
                    if (bdense)
                    {
                        // B is dense; use quick lookup
                        pB = pB_start + (i - iB_first) ;
                        bfound = GBB (Bb, pB) ;
                    }
                    else if (B == M)
                    {
                        // B is aliased to M
                        pB = pM ;
                        bfound = true ;
                    }
                    else
                    {
                        // B is sparse; use binary search.  This is slow unless
                        // M is very sparse compared with B.
                        int64_t bpright = pB_end - 1 ;
                        GB_BINARY_SEARCH (i, Bi, pB, bpright, bfound) ;
                    }

                    ASSERT (GB_IMPLIES (bfound, GBI (Bi, pB, vlen) == i)) ;

                    //----------------------------------------------------------
                    // C(i,j) = A(i,j) + B(i,j)
                    //----------------------------------------------------------

                    // phase 1 only counts the entry (cjnz); phase 2 writes the
                    // row index Ci [pC] and the value of C(i,j)
                    if (afound && bfound)
                    {
                        // C (i,j) = A (i,j) + B (i,j)
                        #if defined ( GB_PHASE_1_OF_2 )
                        cjnz++ ;
                        #else
                        Ci [pC] = i ;
                        GB_GETA (aij, Ax, pA) ;
                        GB_GETB (bij, Bx, pB) ;
                        GB_BINOP (GB_CX (pC), aij, bij, i, j) ;
                        pC++ ;
                        #endif
                    }
                    else if (afound)
                    {
                        // C (i,j) = A (i,j)
                        #if defined ( GB_PHASE_1_OF_2 )
                        cjnz++ ;
                        #else
                        Ci [pC] = i ;
                        GB_COPY_A_TO_C (GB_CX (pC), Ax, pA) ;
                        pC++ ;
                        #endif
                    }
                    else if (bfound)
                    {
                        // C (i,j) = B (i,j)
                        #if defined ( GB_PHASE_1_OF_2 )
                        cjnz++ ;
                        #else
                        Ci [pC] = i ;
                        GB_COPY_B_TO_C (GB_CX (pC), Bx, pB) ;
                        pC++ ;
                        #endif
                    }
                }

                // in phase 2, exactly the number of entries counted by phase 1
                // must have been written
                #if defined ( GB_PHASE_2_OF_2 )
                ASSERT (pC == pC_end) ;
                #endif

            }
875             else
876             {
877 
878                 //--------------------------------------------------------------
879                 // M is bitmap or full, for either C<M>=A+B or C<!M>=A+B
880                 //--------------------------------------------------------------
881 
882                 //      ------------------------------------------
883                 //      C      <M> =        A       +       B
884                 //      ------------------------------------------
885                 //      sparse  bitmap      sparse          sparse
886                 //      sparse  full        sparse          sparse
887 
888                 //      ------------------------------------------
889                 //      C      <!M> =       A       +       B
890                 //      ------------------------------------------
891                 //      sparse  bitmap      sparse          sparse
892                 //      sparse  full        sparse          sparse
893 
894                 // This method is very efficient for any mask, and should
895                 // always be used if M is bitmap or full, even if the mask must
896                 // also be applied later in GB_mask or GB_accum_mask.
897                 // Exploiting the mask here adds no extra search time, and it
898                 // reduces the size of C on output.
899 
                // GB_GET_MIJ: get M(i,j) where M is bitmap or full
                #undef  GB_GET_MIJ
                #define GB_GET_MIJ(i)                                     \
                    int64_t pM = pM_start + i ;                           \
                    bool mij = GBB (Mb, pM) && GB_mcast (Mx, pM, msize) ; \
                    if (Mask_comp) mij = !mij ;

                // GB_GET_MIJ already folds in Mask_comp, so Methods 15-20
                // below only need to test mij, for both C<M>=A+B and
                // C<!M>=A+B.

                // A and B are sparse or hypersparse, not bitmap or full,
                // but individual vectors of A and B might have all entries
                // present (adense and/or bdense).
                ASSERT (A_is_sparse || A_is_hyper) ;
                ASSERT (B_is_sparse || B_is_hyper) ;

                // M is bitmap or full, so M(:,j) starts at position j*vlen;
                // GB_GET_MIJ above indexes relative to this pM_start
                int64_t pM_start = j * vlen ;
914 
                if (adense && bdense)
                {

                    //----------------------------------------------------------
                    // Method15: A(:,j) and B(:,j) dense, M bitmap/full
                    //----------------------------------------------------------

                    // both vectors cover the same index range, so a single
                    // loop over that range visits each pair A(i,j), B(i,j)
                    ASSERT (ajnz == bjnz) ;
                    ASSERT (iA_first == iB_first) ;
                    ASSERT (iA_last  == iB_last ) ;
                    for (int64_t p = 0 ; p < ajnz ; p++)
                    {
                        int64_t i = p + iA_first ;
                        ASSERT (Ai [pA + p] == i) ;
                        ASSERT (Bi [pB + p] == i) ;
                        GB_GET_MIJ (i) ;
                        if (mij)
                        {
                            // C (i,j) = A (i,j) + B (i,j)
                            #if defined ( GB_PHASE_1_OF_2 )
                            cjnz++ ;
                            #else
                            Ci [pC] = i ;
                            GB_GETA (aij, Ax, pA + p) ;
                            GB_GETB (bij, Bx, pB + p) ;
                            GB_BINOP (GB_CX (pC), aij, bij, i, j) ;
                            pC++ ;
                            #endif
                        }
                    }

                }
                else if (ajnz == 0)
                {

                    //----------------------------------------------------------
                    // Method16: A(:,j) is empty, M bitmap/full
                    //----------------------------------------------------------

                    // C(:,j) gets only the entries of B(:,j) that pass the
                    // mask, since A(:,j) contributes nothing
                    for ( ; pB < pB_end ; pB++)
                    {
                        int64_t i = Bi [pB] ;
                        GB_GET_MIJ (i) ;
                        if (mij)
                        {
                            // C (i,j) = B (i,j)
                            #if defined ( GB_PHASE_1_OF_2 )
                            cjnz++ ;
                            #else
                            Ci [pC] = i ;
                            GB_COPY_B_TO_C (GB_CX (pC), Bx, pB) ;
                            pC++ ;
                            #endif
                        }
                    }

                }
                else if (bjnz == 0)
                {

                    //----------------------------------------------------------
                    // Method17: B(:,j) is empty, M bitmap/full
                    //----------------------------------------------------------

                    // C(:,j) gets only the entries of A(:,j) that pass the
                    // mask, since B(:,j) contributes nothing
                    for ( ; pA < pA_end ; pA++)
                    {
                        int64_t i = Ai [pA] ;
                        GB_GET_MIJ (i) ;
                        if (mij)
                        {
                            // C (i,j) = A (i,j)
                            #if defined ( GB_PHASE_1_OF_2 )
                            cjnz++ ;
                            #else
                            Ci [pC] = i ;
                            GB_COPY_A_TO_C (GB_CX (pC), Ax, pA) ;
                            pC++ ;
                            #endif
                        }
                    }

                }
                else if (iA_last < iB_first)
                {

                    //----------------------------------------------------------
                    // Method18: last A(:,j) before 1st B(:,j), M bitmap/full
                    //----------------------------------------------------------

                    // the two index ranges do not overlap, so no merge is
                    // needed: copy all masked entries of A(:,j), then all
                    // masked entries of B(:,j), preserving ascending order

                    for ( ; pA < pA_end ; pA++)
                    {
                        int64_t i = Ai [pA] ;
                        GB_GET_MIJ (i) ;
                        if (mij)
                        {
                            // C (i,j) = A (i,j)
                            #if defined ( GB_PHASE_1_OF_2 )
                            cjnz++ ;
                            #else
                            Ci [pC] = i ;
                            GB_COPY_A_TO_C (GB_CX (pC), Ax, pA) ;
                            pC++ ;
                            #endif
                        }
                    }

                    for ( ; pB < pB_end ; pB++)
                    {
                        int64_t i = Bi [pB] ;
                        GB_GET_MIJ (i) ;
                        if (mij)
                        {
                            // C (i,j) = B (i,j)
                            #if defined ( GB_PHASE_1_OF_2 )
                            cjnz++ ;
                            #else
                            Ci [pC] = i ;
                            GB_COPY_B_TO_C (GB_CX (pC), Bx, pB) ;
                            pC++ ;
                            #endif
                        }
                    }

                }
                else if (iB_last < iA_first)
                {

                    //----------------------------------------------------------
                    // Method19: last B(:,j) before 1st A(:,j), M bitmap/full
                    //----------------------------------------------------------

                    // mirror of Method18: B(:,j) entirely precedes A(:,j), so
                    // copy masked B entries first, then masked A entries

                    for ( ; pB < pB_end ; pB++)
                    {
                        int64_t i = Bi [pB] ;
                        GB_GET_MIJ (i) ;
                        if (mij)
                        {
                            // C (i,j) = B (i,j)
                            #if defined ( GB_PHASE_1_OF_2 )
                            cjnz++ ;
                            #else
                            Ci [pC] = i ;
                            GB_COPY_B_TO_C (GB_CX (pC), Bx, pB) ;
                            pC++ ;
                            #endif
                        }
                    }

                    for ( ; pA < pA_end ; pA++)
                    {
                        int64_t i = Ai [pA] ;
                        GB_GET_MIJ (i) ;
                        if (mij)
                        {
                            // C (i,j) = A (i,j)
                            #if defined ( GB_PHASE_1_OF_2 )
                            cjnz++ ;
                            #else
                            Ci [pC] = i ;
                            GB_COPY_A_TO_C (GB_CX (pC), Ax, pA) ;
                            pC++ ;
                            #endif
                        }
                    }

                }
                else
                {

                    //----------------------------------------------------------
                    // Method20: merge A(:,j) and B(:,j), M bitmap/full
                    //----------------------------------------------------------

                    // standard two-way merge of the sorted index lists of
                    // A(:,j) and B(:,j); where an index appears in both,
                    // C(i,j) = A(i,j) + B(i,j) via GB_BINOP, otherwise the
                    // single entry is copied (subject to the mask)

                    while (pA < pA_end && pB < pB_end)
                    {
                        int64_t iA = Ai [pA] ;
                        int64_t iB = Bi [pB] ;
                        if (iA < iB)
                        {
                            GB_GET_MIJ (iA) ;
                            if (mij)
                            {
                                // C (iA,j) = A (iA,j)
                                #if defined ( GB_PHASE_1_OF_2 )
                                cjnz++ ;
                                #else
                                Ci [pC] = iA ;
                                GB_COPY_A_TO_C (GB_CX (pC), Ax, pA) ;
                                pC++ ;
                                #endif
                            }
                            pA++ ;
                        }
                        else if (iA > iB)
                        {
                            GB_GET_MIJ (iB) ;
                            if (mij)
                            {
                                // C (iB,j) = B (iB,j)
                                #if defined ( GB_PHASE_1_OF_2 )
                                cjnz++ ;
                                #else
                                Ci [pC] = iB ;
                                GB_COPY_B_TO_C (GB_CX (pC), Bx, pB) ;
                                pC++ ;
                                #endif
                            }
                            pB++ ;
                        }
                        else
                        {
                            // iA == iB: the entry is present in both A and B
                            GB_GET_MIJ (iB) ;
                            if (mij)
                            {
                                // C (i,j) = A (i,j) + B (i,j)
                                #if defined ( GB_PHASE_1_OF_2 )
                                cjnz++ ;
                                #else
                                Ci [pC] = iB ;
                                GB_GETA (aij, Ax, pA) ;
                                GB_GETB (bij, Bx, pB) ;
                                GB_BINOP (GB_CX (pC), aij, bij, iB, j) ;
                                pC++ ;
                                #endif
                            }
                            pA++ ;
                            pB++ ;
                        }
                    }

                    //----------------------------------------------------------
                    // A (:,j) or B (:,j) have entries left; not both
                    //----------------------------------------------------------

                    for ( ; pA < pA_end ; pA++)
                    {
                        int64_t iA = Ai [pA] ;
                        GB_GET_MIJ (iA) ;
                        if (mij)
                        {
                            // C (iA,j) = A (iA,j)
                            #if defined ( GB_PHASE_1_OF_2 )
                            cjnz++ ;
                            #else
                            Ci [pC] = iA ;
                            GB_COPY_A_TO_C (GB_CX (pC), Ax, pA) ;
                            pC++ ;
                            #endif
                        }
                    }

                    for ( ; pB < pB_end ; pB++)
                    {
                        int64_t iB = Bi [pB] ;
                        GB_GET_MIJ (iB) ;
                        if (mij)
                        {
                            // C (iB,j) = B (iB,j)
                            #if defined ( GB_PHASE_1_OF_2 )
                            cjnz++ ;
                            #else
                            Ci [pC] = iB ;
                            GB_COPY_B_TO_C (GB_CX (pC), Bx, pB) ;
                            pC++ ;
                            #endif
                        }
                    }
                }
1183             }
1184 
1185             //------------------------------------------------------------------
1186             // final count of nnz (C (:,j))
1187             //------------------------------------------------------------------
1188 
1189             #if defined ( GB_PHASE_1_OF_2 )
1190             if (fine_task)
1191             {
1192                 TaskList [taskid].pC = cjnz ;
1193             }
1194             else
1195             {
1196                 Cp [k] = cjnz ;
1197             }
1198             #endif
1199         }
1200     }
1201 }
1202 
1203