//------------------------------------------------------------------------------
// GB_subref_template: C = A(I,J), or C = pattern (A(I,J))
//------------------------------------------------------------------------------

// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2021, All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0

//------------------------------------------------------------------------------

#if defined ( GB_SYMBOLIC )
// symbolic method must tolerate zombies
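// (a zombie is an entry marked for deletion; its index in Ai is stored in a
// flipped, negative form, and GBI_UNFLIP recovers the original index)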
#define GB_Ai(p) GBI_UNFLIP (Ai, p, avlen)
#else
// numeric method will not see any zombies
#define GB_Ai(p) GBI (Ai, p, avlen)
#endif

// to iterate across all entries in a bucket:
#define GB_for_each_index_in_bucket(inew,i)     \
    for (int64_t inew = Mark [i] - 1 ; inew >= 0 ; inew = Inext [inew])

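// Mark and Inext encode the inverse of the index list I: Mark [i] - 1 is the
// first position inew with I [inew] == i (or is negative if there is none),
// and Inext [inew] links to the next such position, a negative link ending the
// chain.  As an illustrative sketch: if I = [4 1 4 7], then bucket 4 holds
// positions 0 and 2, and the loop above visits both (in whatever order the
// inverse buckets were built).
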
// copy values from A(:,kA) to C(:,kC): Cx [pC:pC+len-1] = ... (pA:pA+len-1).
#if defined ( GB_SYMBOLIC )
    // symbolic copy: Cx is int64_t; Ax is ignored
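    // (each Cx [pC+k] records the position pA+k of the corresponding entry of
    // A, so a caller of the symbolic method can locate that entry in Ai and Ax
    // directly)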
    #define GB_COPY_RANGE(pC,pA,len)            \
        for (int64_t k = 0 ; k < (len) ; k++)   \
        {                                       \
            Cx [(pC) + k] = (pA) + k ;          \
        }
#else
    // numeric copy: Cx and Ax are both (GB_void *), and point to the same type
    #define GB_COPY_RANGE(pC,pA,len)            \
        memcpy (Cx + (pC)*asize, Ax + (pA)*asize, (len) * asize) ;
#endif

// copy a single value from A(:,kA) to C(:,kC): Cx [pC] = ... (pA)
#if defined ( GB_SYMBOLIC )
    // symbolic copy: Cx is int64_t; Ax is ignored
    #define GB_COPY_ENTRY(pC,pA)                \
        Cx [pC] = (pA) ;
#else
    // numeric copy: Cx and Ax are both (GB_void *), and point to the same type
    #define GB_COPY_ENTRY(pC,pA)                \
        /* Cx [pC] = Ax [pA] */                 \
        memcpy (Cx + (pC)*asize, Ax + (pA)*asize, asize) ;
#endif

// the type of Cx
#if defined ( GB_SYMBOLIC )
// C is an int64_t array; the type of A is ignored
#define GB_CTYPE int64_t
#define GB_CSIZE1 1
#define GB_CSIZE2 (sizeof (int64_t))
#else
// C and A have the same type
#define GB_CTYPE GB_void
#define GB_CSIZE1 asize
#define GB_CSIZE2 asize
#endif

{

    //--------------------------------------------------------------------------
    // get A
    //--------------------------------------------------------------------------

    const int64_t *restrict Ai = A->i ;
    const int64_t avlen = A->vlen ;

    #if defined ( GB_SYMBOLIC )
    const int64_t nzombies = A->nzombies ;
    #endif

    #if defined ( GB_PHASE_2_OF_2 ) && defined ( GB_NUMERIC )
    ASSERT (C->type == A->type) ;
    const GB_void *restrict Ax = (GB_void *) A->x ;
    const int64_t asize = A->type->size ;
    #endif

    //--------------------------------------------------------------------------
    // get C
    //--------------------------------------------------------------------------

    #if defined ( GB_PHASE_2_OF_2 )
    int64_t  *restrict Ci = C->i ;
    GB_CTYPE *restrict Cx = (GB_CTYPE *) C->x ;
    #endif

    //--------------------------------------------------------------------------
    // get I
    //--------------------------------------------------------------------------

    // these values are ignored if Ikind == GB_LIST
    int64_t ibegin = Icolon [GxB_BEGIN] ;
    int64_t iinc   = Icolon [GxB_INC  ] ;
    int64_t inc    = (iinc < 0) ? (-iinc) : iinc ;
    #ifdef GB_DEBUG
    int64_t iend   = Icolon [GxB_END  ] ;
    #endif

    //--------------------------------------------------------------------------
    // phase1: count entries in each C(:,kC); phase2: compute C
    //--------------------------------------------------------------------------
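    // Note: in phase 1, a coarse task writes its count for C(:,kC) into
    // Cp [kC] and a fine task writes its count into TaskList [taskid].pC (see
    // the end of the loop below); the caller is assumed to cumulative-sum
    // these counts to finalize Cp before phase 2 fills Ci and Cx.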

    int taskid ;
    #pragma omp parallel for num_threads(nthreads) schedule(dynamic,1)
    for (taskid = 0 ; taskid < ntasks ; taskid++)
    {

        //----------------------------------------------------------------------
        // get the task descriptor
        //----------------------------------------------------------------------

        int64_t kfirst = TaskList [taskid].kfirst ;
        int64_t klast  = TaskList [taskid].klast ;
        bool fine_task = (klast < 0) ;
        if (fine_task)
        {
            // a fine task operates on a slice of a single vector
            klast = kfirst ;
        }

        // a coarse task accesses all of I for all its vectors
        int64_t pI     = 0 ;
        int64_t pI_end = nI ;
        int64_t ilen   = nI ;

        ASSERT (0 <= kfirst && kfirst <= klast && klast < Cnvec) ;

        //----------------------------------------------------------------------
        // compute all vectors C(:,kfirst:klast) for this task
        //----------------------------------------------------------------------

        for (int64_t kC = kfirst ; kC <= klast ; kC++)
        {

            //------------------------------------------------------------------
            // get C(:,kC)
            //------------------------------------------------------------------

            #if defined ( GB_PHASE_1_OF_2 )
            // phase1 simply counts the # of entries in C(*,kC).
            int64_t clen = 0 ;
            #else
            // This task computes all or part of C(:,kC), which are the entries
            // in Ci,Cx [pC:pC_end-1].
            int64_t pC, pC_end ;
            if (fine_task)
            {
                // A fine task computes a slice of C(:,kC)
                pC     = TaskList [taskid  ].pC ;
                pC_end = TaskList [taskid+1].pC ;
                ASSERT (Cp [kC] <= pC && pC <= pC_end && pC_end <= Cp [kC+1]) ;
            }
            else
            {
                // The vectors of C are never sliced for a coarse task, so this
                // task computes all of C(:,kC).
                pC     = Cp [kC] ;
                pC_end = Cp [kC+1] ;
            }
            int64_t clen = pC_end - pC ;
            if (clen == 0) continue ;
            #endif

            //------------------------------------------------------------------
            // get A(:,kA)
            //------------------------------------------------------------------

            int64_t pA, pA_end ;

            if (fine_task)
            {
                // a fine task computes a slice of a single vector C(:,kC).
                // The task accesses Ai,Ax [pA:pA_end-1], which holds either
                // the entire vector A(imin:imax,kA) for method 6, the entire
                // dense A(:,kA) for methods 1 and 2, or a slice of the
                // A(imin:imax,kA) vector for all other methods.
                pA     = TaskList [taskid].pA ;
                pA_end = TaskList [taskid].pA_end ;
            }
            else
            {
                // a coarse task computes the entire vector C(:,kC).  The task
                // accesses all of A(imin:imax,kA), for most methods, or all of
                // A(:,kA) for methods 1 and 2.  The vector A(*,kA) appears in
                // Ai,Ax [pA:pA_end-1].
                pA     = Ap_start [kC] ;
                pA_end = Ap_end   [kC] ;
            }

            int64_t alen = pA_end - pA ;
            if (alen == 0) continue ;

            //------------------------------------------------------------------
            // get I
            //------------------------------------------------------------------

            if (fine_task)
            {
                // A fine task accesses I [pI:pI_end-1].  For methods 2 and 6,
                // pI:pI_end is a subset of the entire 0:nI-1 list.  For all
                // other methods, pI = 0 and pI_end = nI, and the task can
                // access all of I.
                pI     = TaskList [taskid].pB ;
                pI_end = TaskList [taskid].pB_end ;
                ilen   = pI_end - pI ;
            }

            //------------------------------------------------------------------
            // determine the method to use
            //------------------------------------------------------------------

            int method ;
            if (fine_task)
            {
                // The method that the fine task uses for its slice of A(*,kA)
                // and C(*,kC) has already been determined by GB_subref_slice.
                method = (int) (-TaskList [taskid].klast) ;
            }
            else
            {
                // determine the method based on A(*,kA) and I
                method = GB_subref_method (NULL, NULL, alen, avlen, Ikind, nI,
                    (Mark != NULL), need_qsort, iinc, nduplicates) ;
            }

            //------------------------------------------------------------------
            // extract C (:,kC) = A (I,kA): consider all cases
            //------------------------------------------------------------------

            switch (method)
            {

                //--------------------------------------------------------------
                case 1 : // C(:,kC) = A(:,kA) where A(:,kA) is dense
                //--------------------------------------------------------------

                    // A (:,kA) has not been sliced
                    ASSERT (Ikind == GB_ALL) ;
                    ASSERT (pA     == Ap_start [kC]) ;
                    ASSERT (pA_end == Ap_end   [kC]) ;
                    // copy the entire vector and construct indices
                    #if defined ( GB_PHASE_1_OF_2 )
                    clen = ilen ;
                    #else
                    for (int64_t k = 0 ; k < ilen ; k++)
                    {
                        int64_t inew = k + pI ;
                        ASSERT (inew == GB_ijlist (I, inew, Ikind, Icolon)) ;
                        ASSERT (inew == GB_Ai (pA + inew)) ;
                        Ci [pC + k] = inew ;
                    }
                    GB_COPY_RANGE (pC, pA + pI, ilen) ;
                    #endif
                    break ;

                //--------------------------------------------------------------
                case 2 : // C(:,kC) = A(I,kA) where A(I,kA) is dense
                //--------------------------------------------------------------

                    // This method handles any kind of list I, but A(:,kA)
                    // must be dense.  A(:,kA) has not been sliced.
                    ASSERT (pA     == Ap_start [kC]) ;
                    ASSERT (pA_end == Ap_end   [kC]) ;
                    // scan I and get the entry in A(:,kA) via direct lookup
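                    // (A(:,kA) is dense, so the entry for row index i resides
                    // at position pA + i in Ai and Ax)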
                    #if defined ( GB_PHASE_1_OF_2 )
                    clen = ilen ;
                    #else
                    for (int64_t k = 0 ; k < ilen ; k++)
                    {
                        // C(inew,kC) = A(i,kA), and it always exists.
                        int64_t inew = k + pI ;
                        int64_t i = GB_ijlist (I, inew, Ikind, Icolon) ;
                        ASSERT (i == GB_Ai (pA + i)) ;
                        Ci [pC + k] = inew ;
                        GB_COPY_ENTRY (pC + k, pA + i) ;
                    }
                    #endif
                    break ;

                //--------------------------------------------------------------
                case 3 : // the list I has a single index, ibegin
                //--------------------------------------------------------------

                    // binary search in GB_subref_phase0 has already found it.
                    // This can be any Ikind with nI=1: GB_ALL with A->vlen=1,
                    // GB_RANGE with ibegin==iend, GB_STRIDE such as 0:-1:0
                    // (with length 1), or a GB_LIST with ni=1.

                    // Time: 50x faster than MATLAB

                    ASSERT (!fine_task) ;
                    ASSERT (alen == 1) ;
                    ASSERT (nI == 1) ;
                    ASSERT (GB_Ai (pA) == GB_ijlist (I, 0, Ikind, Icolon)) ;
                    #if defined ( GB_PHASE_1_OF_2 )
                    clen = 1 ;
                    #else
                    Ci [pC] = 0 ;
                    GB_COPY_ENTRY (pC, pA) ;
                    #endif
                    break ;

                //--------------------------------------------------------------
                case 4 : // Ikind is ":", thus C(:,kC) = A (:,kA)
                //--------------------------------------------------------------

                    // Time: about 1x MATLAB, but low speedup on the Mac;
                    // probably memory bound, since it is just memcpy's.

                    ASSERT (Ikind == GB_ALL && ibegin == 0) ;
                    #if defined ( GB_PHASE_1_OF_2 )
                    clen = alen ;
                    #else
                    #if defined ( GB_SYMBOLIC )
                    if (nzombies == 0)
                    {
                        memcpy (Ci + pC, Ai + pA, alen * sizeof (int64_t)) ;
                    }
                    else
                    {
                        // with zombies
                        for (int64_t k = 0 ; k < alen ; k++)
                        {
                            // symbolic C(:,kC) = A(:,kA) where A has zombies
                            int64_t i = GB_Ai (pA + k) ;
                            ASSERT (i == GB_ijlist (I, i, Ikind, Icolon)) ;
                            Ci [pC + k] = i ;
                        }
                    }
                    #else
                    memcpy (Ci + pC, Ai + pA, alen * sizeof (int64_t)) ;
                    #endif
                    GB_COPY_RANGE (pC, pA, alen) ;
                    #endif
                    break ;

                //--------------------------------------------------------------
                case 5 : // Ikind is GB_RANGE = ibegin:iend
                //--------------------------------------------------------------

                    // Time: much faster than MATLAB.  Good speedup too.

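                    // (for example, with I = 5:9 as a GB_RANGE, an entry of
                    // A(:,kA) with index i = 7 becomes index inew = i - ibegin
                    // = 2 in C(:,kC))
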
                    ASSERT (Ikind == GB_RANGE) ;
                    #if defined ( GB_PHASE_1_OF_2 )
                    clen = alen ;
                    #else
                    for (int64_t k = 0 ; k < alen ; k++)
                    {
                        int64_t i = GB_Ai (pA + k) ;
                        int64_t inew = i - ibegin ;
                        ASSERT (i == GB_ijlist (I, inew, Ikind, Icolon)) ;
                        Ci [pC + k] = inew ;
                    }
                    GB_COPY_RANGE (pC, pA, alen) ;
                    #endif
                    break ;

                //--------------------------------------------------------------
                case 6 : // I is short vs nnz (A (:,kA)), use binary search
                //--------------------------------------------------------------

                    // Time: very slow unless I is very short and A(:,kA) is
                    // very long.

                    // This case can handle any kind of I, and A(:,kA) of any
                    // properties.  For a fine task, A(:,kA) has not been
                    // sliced; I has been sliced instead.

                    // If the I bucket inverse has not been created, this
                    // method is the only option.  Alternatively, if nI =
                    // length (I) is << nnz (A (:,kA)), then scanning I and
                    // doing a binary search of A (:,kA) is faster than doing a
                    // linear-time search of A(:,kA) and a lookup into the I
                    // bucket inverse.

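                    // (roughly speaking, this method costs about
                    // ilen * log2 (alen) comparisons, versus work proportional
                    // to alen for the bucket-based methods in cases 10 to 12)
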
                    // The vector of C is constructed in sorted order, so no
                    // sort is needed.

                    // A(:,kA) has not been sliced.
                    ASSERT (pA     == Ap_start [kC]) ;
                    ASSERT (pA_end == Ap_end   [kC]) ;

                    // scan I, in order, and search for the entry in A(:,kA)
                    for (int64_t k = 0 ; k < ilen ; k++)
                    {
                        // C(inew,kC) = A (i,kA), if it exists.
                        // i = I [inew] ; or from a colon expression
                        int64_t inew = k + pI ;
                        int64_t i = GB_ijlist (I, inew, Ikind, Icolon) ;
                        bool found ;
                        int64_t pleft = pA ;
                        int64_t pright = pA_end - 1 ;
                        #if defined ( GB_SYMBOLIC )
                        bool is_zombie ;
                        GB_BINARY_SEARCH_ZOMBIE (i, Ai, pleft, pright, found,
                            nzombies, is_zombie) ;
                        #else
                        GB_BINARY_SEARCH (i, Ai, pleft, pright, found) ;
                        #endif
                        if (found)
                        {
                            ASSERT (i == GB_Ai (pleft)) ;
                            #if defined ( GB_PHASE_1_OF_2 )
                            clen++ ;
                            #else
                            ASSERT (pC < pC_end) ;
                            Ci [pC] = inew ;
                            GB_COPY_ENTRY (pC, pleft) ;
                            pC++ ;
                            #endif
                        }
                    }
                    #if defined ( GB_PHASE_2_OF_2 )
                    ASSERT (pC == pC_end) ;
                    #endif
                    break ;

                //--------------------------------------------------------------
                case 7 : // I is ibegin:iinc:iend with iinc > 1
                //--------------------------------------------------------------

                    // Time: with 1 thread, C=A(1:2:n,:) is 3x slower than
                    // MATLAB, but it has good speedup and is about as fast as
                    // MATLAB with enough threads.

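                    // (for example, with I = 2:3:20, an entry with index
                    // i = 11 gives i - ibegin = 9, and 9 % 3 == 0, so it is in
                    // the sequence and lands at index inew = 9 / 3 = 3 in C)
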
                    ASSERT (Ikind == GB_STRIDE && iinc > 1) ;
                    for (int64_t k = 0 ; k < alen ; k++)
                    {
                        // A(i,kA) present; see if it is in ibegin:iinc:iend
                        int64_t i = GB_Ai (pA + k) ;
                        ASSERT (ibegin <= i && i <= iend) ;
                        i = i - ibegin ;
                        if (i % iinc == 0)
                        {
                            // i is in the sequence ibegin:iinc:iend
                            #if defined ( GB_PHASE_1_OF_2 )
                            clen++ ;
                            #else
                            int64_t inew = i / iinc ;
                            ASSERT (pC < pC_end) ;
                            Ci [pC] = inew ;
                            GB_COPY_ENTRY (pC, pA + k) ;
                            pC++ ;
                            #endif
                        }
                    }
                    #if defined ( GB_PHASE_2_OF_2 )
                    ASSERT (pC == pC_end) ;
                    #endif
                    break ;

                //--------------------------------------------------------------
                case 8 : // I = ibegin:iinc:iend, with iinc < -1
                //--------------------------------------------------------------

                    // Time: 2x slower than MATLAB for iinc = -2 or -8.
                    // Good speedup though.  Faster than MATLAB for
                    // large values (iinc = -128).

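                    // (for example, with I = 10:-2:2, an entry with index
                    // i = 6 gives ibegin - i = 4, and 4 % 2 == 0, so it lands
                    // at index inew = 4 / 2 = 2 in C; iterating k downwards
                    // keeps the indices of C(:,kC) in ascending order)
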
                    ASSERT (Ikind == GB_STRIDE && iinc < -1) ;
                    for (int64_t k = alen - 1 ; k >= 0 ; k--)
                    {
                        // A(i,kA) present; see if it is in ibegin:iinc:iend
                        int64_t i = GB_Ai (pA + k) ;
                        ASSERT (iend <= i && i <= ibegin) ;
                        i = ibegin - i ;
                        if (i % inc == 0)
                        {
                            // i is in the sequence ibegin:iinc:iend
                            #if defined ( GB_PHASE_1_OF_2 )
                            clen++ ;
                            #else
                            int64_t inew = i / inc ;
                            ASSERT (pC < pC_end) ;
                            Ci [pC] = inew ;
                            GB_COPY_ENTRY (pC, pA + k) ;
                            pC++ ;
                            #endif
                        }
                    }
                    #if defined ( GB_PHASE_2_OF_2 )
                    ASSERT (pC == pC_end) ;
                    #endif
                    break ;

                //--------------------------------------------------------------
                case 9 : // I = ibegin:(-1):iend
                //--------------------------------------------------------------

                    // Time: much faster than MATLAB.  Good speedup.

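                    // (for example, with I = 9:-1:5, an entry with index i = 7
                    // lands at index inew = ibegin - i = 2 in C; iterating k
                    // downwards keeps the indices of C(:,kC) in ascending
                    // order)
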
                    ASSERT (Ikind == GB_STRIDE && iinc == -1) ;
                    #if defined ( GB_PHASE_1_OF_2 )
                    clen = alen ;
                    #else
                    for (int64_t k = alen - 1 ; k >= 0 ; k--)
                    {
                        // A(i,kA) is present
                        int64_t i = GB_Ai (pA + k) ;
                        int64_t inew = (ibegin - i) ;
                        ASSERT (i == GB_ijlist (I, inew, Ikind, Icolon)) ;
                        Ci [pC] = inew ;
                        GB_COPY_ENTRY (pC, pA + k) ;
                        pC++ ;
                    }
                    #endif
                    break ;

                //--------------------------------------------------------------
                case 10 : // I unsorted, and C needs qsort, duplicates OK
                //--------------------------------------------------------------

                    // Time: with one thread, 2x slower than MATLAB, probably
                    // because of the qsort.  Good speedup, however.  This case
                    // is also used if qsort is needed but nduplicates == 0.
                    // TODO: try a method that needs qsort but has no
                    // duplicates?

                    // Case 10 works well when I has many entries and A(:,kA)
                    // has few entries.  C(:,kC) must be sorted after this pass.

                    ASSERT (Ikind == GB_LIST) ;
                    for (int64_t k = 0 ; k < alen ; k++)
                    {
                        // A(i,kA) present, look it up in the I inverse buckets
                        int64_t i = GB_Ai (pA + k) ;
                        // traverse bucket i for all indices inew where
                        // i == I [inew] or where i is from a colon expression
                        GB_for_each_index_in_bucket (inew, i)
                        {
                            ASSERT (inew >= 0 && inew < nI) ;
                            ASSERT (i == GB_ijlist (I, inew, Ikind, Icolon)) ;
                            #if defined ( GB_PHASE_1_OF_2 )
                            clen++ ;
                            #else
                            Ci [pC] = inew ;
                            GB_COPY_ENTRY (pC, pA + k) ;
                            pC++ ;
                            #endif
                        }
                    }

                    // TODO: skip the sort if C is allowed to be jumbled on
                    // output.  Flag C as jumbled instead.

                    #if defined ( GB_PHASE_2_OF_2 )
                    ASSERT (pC == pC_end) ;
                    if (!fine_task)
                    {
                        // a coarse task owns this entire C(:,kC) vector, so
                        // the sort can be done now.  The sort for vectors
                        // handled by multiple fine tasks must wait until all
                        // tasks are completed, below in the post sort.
                        pC = Cp [kC] ;
                        GB_qsort_1b (Ci + pC, (GB_void *) (Cx + pC*GB_CSIZE1),
                            GB_CSIZE2, clen) ;
                    }
                    #endif
                    break ;

                //--------------------------------------------------------------
                case 11 : // I not contiguous, with duplicates.  No qsort needed
                //--------------------------------------------------------------

                    // Case 11 works well when I has many entries and A(:,kA)
                    // has few entries.  It requires that I be sorted on input,
                    // so that no sort is required for C(:,kC).  It is
                    // otherwise identical to Case 10.

                    ASSERT (Ikind == GB_LIST) ;
                    for (int64_t k = 0 ; k < alen ; k++)
                    {
                        // A(i,kA) present, look it up in the I inverse buckets
                        int64_t i = GB_Ai (pA + k) ;
                        // traverse bucket i for all indices inew where
                        // i == I [inew] or where i is from a colon expression
                        GB_for_each_index_in_bucket (inew, i)
                        {
                            ASSERT (inew >= 0 && inew < nI) ;
                            ASSERT (i == GB_ijlist (I, inew, Ikind, Icolon)) ;
                            #if defined ( GB_PHASE_1_OF_2 )
                            clen++ ;
                            #else
                            Ci [pC] = inew ;
                            GB_COPY_ENTRY (pC, pA + k) ;
                            pC++ ;
                            #endif
                        }
                    }

                    #if defined ( GB_PHASE_2_OF_2 )
                    ASSERT (pC == pC_end) ;
                    #endif
                    break ;

                //--------------------------------------------------------------
                case 12 : // I not contiguous, no duplicates.  No qsort needed.
                //--------------------------------------------------------------

                    // Identical to Case 11, except GB_for_each_index_in_bucket
                    // just needs to iterate 0 or 1 times.  Works well when I
                    // has many entries and A(:,kA) has few entries.

                    ASSERT (Ikind == GB_LIST && nduplicates == 0) ;
                    for (int64_t k = 0 ; k < alen ; k++)
                    {
                        // A(i,kA) present, look it up in the I inverse buckets
                        int64_t i = GB_Ai (pA + k) ;
                        // bucket i has at most one index inew such that
                        // i == I [inew]
                        int64_t inew = Mark [i] - 1 ;
                        if (inew >= 0)
                        {
                            ASSERT (inew >= 0 && inew < nI) ;
                            ASSERT (i == GB_ijlist (I, inew, Ikind, Icolon)) ;
                            #if defined ( GB_PHASE_1_OF_2 )
                            clen++ ;
                            #else
                            Ci [pC] = inew ;
                            GB_COPY_ENTRY (pC, pA + k) ;
                            pC++ ;
                            #endif
                        }
                    }

                    #if defined ( GB_PHASE_2_OF_2 )
                    ASSERT (pC == pC_end) ;
                    #endif
                    break ;

                //--------------------------------------------------------------
                default: ;
                //--------------------------------------------------------------
            }

            //------------------------------------------------------------------
            // final count of nnz (C (:,j))
            //------------------------------------------------------------------

            #if defined ( GB_PHASE_1_OF_2 )
            if (fine_task)
            {
                TaskList [taskid].pC = clen ;
            }
            else
            {
                Cp [kC] = clen ;
            }
            #endif
        }
    }

    //--------------------------------------------------------------------------
    // phase2: post sort for any vectors handled by fine tasks with method 10
    //--------------------------------------------------------------------------

    // TODO: skip the sort if C is allowed to be jumbled on output.
    // Flag C as jumbled instead.

    #if defined ( GB_PHASE_2_OF_2 )
    if (post_sort)
    {
        int taskid ;
        #pragma omp parallel for num_threads(nthreads) schedule(dynamic,1)
        for (taskid = 0 ; taskid < ntasks ; taskid++)
        {
            int64_t kC = TaskList [taskid].kfirst ;
            bool do_post_sort = (TaskList [taskid].len != 0) ;
            if (do_post_sort)
            {
                // This is the first fine task with method 10 for C(:,kC).  The
                // vector C(:,kC) must be sorted, since method 10 left it with
                // unsorted indices.
                int64_t pC = Cp [kC] ;
                int64_t clen = Cp [kC+1] - pC ;
                GB_qsort_1b (Ci + pC, (GB_void *) (Cx + pC*GB_CSIZE1),
                    GB_CSIZE2, clen) ;
            }
        }
    }

    #endif

}

#undef GB_Ai
#undef GB_for_each_index_in_bucket
#undef GB_COPY_RANGE
#undef GB_COPY_ENTRY
#undef GB_CTYPE
#undef GB_CSIZE1
#undef GB_CSIZE2
