1 //------------------------------------------------------------------------------
2 // GB_transpose_method: select method for GB_transpose
3 //------------------------------------------------------------------------------
4 
5 // SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2021, All Rights Reserved.
6 // SPDX-License-Identifier: Apache-2.0
7 
8 //------------------------------------------------------------------------------
9 
10 #include "GB_transpose.h"
11 
12 // GB_transpose can use choose between a merge-sort-based method that takes
13 // O(anz*log(anz)) time, or a bucket-sort method that takes O(anz+m+n) time.
14 // The bucket sort has 3 methods: sequential, atomic, and non-atomic.
15 
GB_transpose_method(const GrB_Matrix A,int * nworkspaces_bucket,int * nthreads_bucket,GB_Context Context)16 bool GB_transpose_method        // if true: use GB_builder, false: use bucket
17 (
18     const GrB_Matrix A,         // matrix to transpose
19     int *nworkspaces_bucket,    // # of slices of A for the bucket method
20     int *nthreads_bucket,       // # of threads to use for the bucket method
21     GB_Context Context
22 )
23 {
24 
25     //--------------------------------------------------------------------------
26     // get inputs
27     //--------------------------------------------------------------------------
28 
29     int64_t anvec = (A->nvec_nonempty < 0) ? A->nvec : A->nvec_nonempty ;
30     int64_t anz = GB_NNZ (A) ;
31     int64_t avlen = A->vlen ;
32     int64_t avdim = A->vdim ;
33     int anzlog = (anz   == 0) ? 1 : (int) GB_CEIL_LOG2 (anz) ;
34     int mlog   = (avlen == 0) ? 1 : (int) GB_CEIL_LOG2 (avlen) ;
35     double alpha ;
36 
37     // determine # of threads for bucket method
38     GB_GET_NTHREADS_MAX (nthreads_max, chunk, Context) ;
39     int nthreads = GB_nthreads (anz + avlen, chunk, nthreads_max) ;
40 
41     //--------------------------------------------------------------------------
42     // select between the atomic and non-atomic bucket method
43     //--------------------------------------------------------------------------
44 
45     bool atomics ;
46     if (nthreads == 1)
47     {
48         // sequential bucket method, no atomics needed
49         atomics = false ;
50     }
51     else if ((double) nthreads * (double) avlen > (double) anz)
52     {
53         // non-atomic workspace is too high; use atomic method
54         atomics = true ;
55     }
56     else
57     {
58         // select between atomic and non-atomic methods.  This rule is based on
59         // performance on a 4-core system with 4 threads with gcc 7.5.  The icc
60         // compiler has much slower atomics than gcc and so beta should likely
61         // be smaller when using icc.
62 
63         int beta ;
64         if (anzlog < 14)
65         {
66             beta = -4 ;     // fewer than 16K entries in A
67         }
68         else
69         {
70             switch (anzlog)
71             {
72                 case 14: beta = -4 ; break ;        // 16K entried in A
73                 case 15: beta = -3 ; break ;        // 32K
74                 case 16: beta = -2 ; break ;        // 64K
75                 case 17: beta = -1 ; break ;        // 128K
76                 case 18: beta =  0 ; break ;        // 256K
77                 case 19: beta =  1 ; break ;        // 512K
78                 case 20: beta =  2 ; break ;        // 1M
79                 case 21: beta =  3 ; break ;        // 2M
80                 case 22: beta =  4 ; break ;        // 4M
81                 case 23: beta =  5 ; break ;        // 8M
82                 case 24: beta =  6 ; break ;        // 16M
83                 case 25: beta =  8 ; break ;        // 32M
84                 case 26: beta =  9 ; break ;        // 64M
85                 case 27: beta =  9 ; break ;        // 128M
86                 case 28: beta = 10 ; break ;        // 256M
87                 default: beta = 10 ; break ;        // > 256M
88             }
89         }
90 
91         if (anzlog - mlog <= beta)
92         {
93             // use atomic method
94             // anzlog - mlog is the log2 of the average row degree, rounded.
95             // If the average row degree is <= 2^beta, use the atomic method.
96             // That is, the atomic method works better for sparser matrices,
97             // and the non-atomic works better or denser matrices.  However,
98             // the threshold changes as the problem gets larger, in terms of #
99             // of entries in A, when the atomic method becomes more attractive
100             // relative to the non-atomic method.  The atomic has the
101             // advantange of needing much less workspace, which becomes more
102             // important for larger problems.
103             atomics = true ;
104         }
105         else
106         {
107             // use non-atomic method
108             atomics = false ;
109         }
110     }
111 
112     (*nworkspaces_bucket) = (atomics) ? 1 : nthreads ;
113     (*nthreads_bucket) = nthreads ;
114 
115     //--------------------------------------------------------------------------
116     // select between GB_builder method and bucket method
117     //--------------------------------------------------------------------------
118 
119     // As the problem gets larger, the GB_builder method gets faster relative
120     // to the bucket method, in terms of the "constants" in the O(a log a) work
121     // for GB_builder, or O (a+m+n) for the bucket method.  Clearly, O (a log
122     // a) and O (a+m+n) do not fully model the performance of these two
123     // methods.  Perhaps this is because of cache effects.  The bucket method
124     // has more irregular memory accesses.  The GB_builder method uses
125     // mergesort, which has good memory locality.
126 
127     if (anzlog < 14)
128     {
129         alpha = 0.5 ;       // fewer than 2^14 = 16K entries
130     }
131     else
132     {
133         switch (anzlog)
134         {
135             case 14: alpha = 0.6 ; break ;      // 16K entries in A
136             case 15: alpha = 0.7 ; break ;      // 32K
137             case 16: alpha = 1.0 ; break ;      // 64K
138             case 17: alpha = 1.7 ; break ;      // 128K
139             case 18: alpha = 3.0 ; break ;      // 256K
140             case 19: alpha = 4.0 ; break ;      // 512K
141             case 20: alpha = 6.0 ; break ;      // 1M
142             case 21: alpha = 7.0 ; break ;      // 2M
143             case 22: alpha = 8.0 ; break ;      // 4M
144             case 23: alpha = 5.0 ; break ;      // 8M
145             case 24: alpha = 5.0 ; break ;      // 16M
146             case 25: alpha = 5.0 ; break ;      // 32M
147             case 26: alpha = 5.0 ; break ;      // 64M
148             case 27: alpha = 5.0 ; break ;      // 128M
149             case 28: alpha = 5.0 ; break ;      // 256M
150             default: alpha = 5.0 ; break ;      // > 256M
151         }
152     }
153 
154     double bucket_work  = (double) (anz + avlen + anvec) * alpha ;
155     double builder_work = (log2 ((double) anz + 1) * (anz)) ;
156 
157     //--------------------------------------------------------------------------
158     // select the method with the least amount of work
159     //--------------------------------------------------------------------------
160 
161     return (builder_work < bucket_work) ;
162 }
163 
164