1 //------------------------------------------------------------------------------
2 // GB_transpose_method: select method for GB_transpose
3 //------------------------------------------------------------------------------
4
5 // SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2021, All Rights Reserved.
6 // SPDX-License-Identifier: Apache-2.0
7
8 //------------------------------------------------------------------------------
9
10 #include "GB_transpose.h"
11
12 // GB_transpose can use choose between a merge-sort-based method that takes
13 // O(anz*log(anz)) time, or a bucket-sort method that takes O(anz+m+n) time.
14 // The bucket sort has 3 methods: sequential, atomic, and non-atomic.
15
GB_transpose_method(const GrB_Matrix A,int * nworkspaces_bucket,int * nthreads_bucket,GB_Context Context)16 bool GB_transpose_method // if true: use GB_builder, false: use bucket
17 (
18 const GrB_Matrix A, // matrix to transpose
19 int *nworkspaces_bucket, // # of slices of A for the bucket method
20 int *nthreads_bucket, // # of threads to use for the bucket method
21 GB_Context Context
22 )
23 {
24
25 //--------------------------------------------------------------------------
26 // get inputs
27 //--------------------------------------------------------------------------
28
29 int64_t anvec = (A->nvec_nonempty < 0) ? A->nvec : A->nvec_nonempty ;
30 int64_t anz = GB_NNZ (A) ;
31 int64_t avlen = A->vlen ;
32 int64_t avdim = A->vdim ;
33 int anzlog = (anz == 0) ? 1 : (int) GB_CEIL_LOG2 (anz) ;
34 int mlog = (avlen == 0) ? 1 : (int) GB_CEIL_LOG2 (avlen) ;
35 double alpha ;
36
37 // determine # of threads for bucket method
38 GB_GET_NTHREADS_MAX (nthreads_max, chunk, Context) ;
39 int nthreads = GB_nthreads (anz + avlen, chunk, nthreads_max) ;
40
41 //--------------------------------------------------------------------------
42 // select between the atomic and non-atomic bucket method
43 //--------------------------------------------------------------------------
44
45 bool atomics ;
46 if (nthreads == 1)
47 {
48 // sequential bucket method, no atomics needed
49 atomics = false ;
50 }
51 else if ((double) nthreads * (double) avlen > (double) anz)
52 {
53 // non-atomic workspace is too high; use atomic method
54 atomics = true ;
55 }
56 else
57 {
58 // select between atomic and non-atomic methods. This rule is based on
59 // performance on a 4-core system with 4 threads with gcc 7.5. The icc
60 // compiler has much slower atomics than gcc and so beta should likely
61 // be smaller when using icc.
62
63 int beta ;
64 if (anzlog < 14)
65 {
66 beta = -4 ; // fewer than 16K entries in A
67 }
68 else
69 {
70 switch (anzlog)
71 {
72 case 14: beta = -4 ; break ; // 16K entried in A
73 case 15: beta = -3 ; break ; // 32K
74 case 16: beta = -2 ; break ; // 64K
75 case 17: beta = -1 ; break ; // 128K
76 case 18: beta = 0 ; break ; // 256K
77 case 19: beta = 1 ; break ; // 512K
78 case 20: beta = 2 ; break ; // 1M
79 case 21: beta = 3 ; break ; // 2M
80 case 22: beta = 4 ; break ; // 4M
81 case 23: beta = 5 ; break ; // 8M
82 case 24: beta = 6 ; break ; // 16M
83 case 25: beta = 8 ; break ; // 32M
84 case 26: beta = 9 ; break ; // 64M
85 case 27: beta = 9 ; break ; // 128M
86 case 28: beta = 10 ; break ; // 256M
87 default: beta = 10 ; break ; // > 256M
88 }
89 }
90
91 if (anzlog - mlog <= beta)
92 {
93 // use atomic method
94 // anzlog - mlog is the log2 of the average row degree, rounded.
95 // If the average row degree is <= 2^beta, use the atomic method.
96 // That is, the atomic method works better for sparser matrices,
97 // and the non-atomic works better or denser matrices. However,
98 // the threshold changes as the problem gets larger, in terms of #
99 // of entries in A, when the atomic method becomes more attractive
100 // relative to the non-atomic method. The atomic has the
101 // advantange of needing much less workspace, which becomes more
102 // important for larger problems.
103 atomics = true ;
104 }
105 else
106 {
107 // use non-atomic method
108 atomics = false ;
109 }
110 }
111
112 (*nworkspaces_bucket) = (atomics) ? 1 : nthreads ;
113 (*nthreads_bucket) = nthreads ;
114
115 //--------------------------------------------------------------------------
116 // select between GB_builder method and bucket method
117 //--------------------------------------------------------------------------
118
119 // As the problem gets larger, the GB_builder method gets faster relative
120 // to the bucket method, in terms of the "constants" in the O(a log a) work
121 // for GB_builder, or O (a+m+n) for the bucket method. Clearly, O (a log
122 // a) and O (a+m+n) do not fully model the performance of these two
123 // methods. Perhaps this is because of cache effects. The bucket method
124 // has more irregular memory accesses. The GB_builder method uses
125 // mergesort, which has good memory locality.
126
127 if (anzlog < 14)
128 {
129 alpha = 0.5 ; // fewer than 2^14 = 16K entries
130 }
131 else
132 {
133 switch (anzlog)
134 {
135 case 14: alpha = 0.6 ; break ; // 16K entries in A
136 case 15: alpha = 0.7 ; break ; // 32K
137 case 16: alpha = 1.0 ; break ; // 64K
138 case 17: alpha = 1.7 ; break ; // 128K
139 case 18: alpha = 3.0 ; break ; // 256K
140 case 19: alpha = 4.0 ; break ; // 512K
141 case 20: alpha = 6.0 ; break ; // 1M
142 case 21: alpha = 7.0 ; break ; // 2M
143 case 22: alpha = 8.0 ; break ; // 4M
144 case 23: alpha = 5.0 ; break ; // 8M
145 case 24: alpha = 5.0 ; break ; // 16M
146 case 25: alpha = 5.0 ; break ; // 32M
147 case 26: alpha = 5.0 ; break ; // 64M
148 case 27: alpha = 5.0 ; break ; // 128M
149 case 28: alpha = 5.0 ; break ; // 256M
150 default: alpha = 5.0 ; break ; // > 256M
151 }
152 }
153
154 double bucket_work = (double) (anz + avlen + anvec) * alpha ;
155 double builder_work = (log2 ((double) anz + 1) * (anz)) ;
156
157 //--------------------------------------------------------------------------
158 // select the method with the least amount of work
159 //--------------------------------------------------------------------------
160
161 return (builder_work < bucket_work) ;
162 }
163
164