1 // =============================================================================
2 // === spqrgpu_computeFrontStaging =============================================
3 // =============================================================================
4
5 // Returns a front staging and whether the staging is feasible.
6 // A front staging is infeasible if a front and its children do not fit on
7 // the GPU at the same time.
8
9 #ifdef GPU_BLAS
10 #include "spqr.hpp"
11 #include "GPUQREngine_Scheduler.hpp"
12
spqrgpu_computeFrontStaging(Long numFronts,Long * Parent,Long * Childp,Long * Child,Long * Fm,Long * Cm,Long * Rp,Long * Sp,Long * Sleft,Long * Super,Long * Post,Long RimapSize,Long RjmapSize,bool * feasible,Long * numStages,Long * Stagingp,Long * StageMap,size_t * FSize,size_t * RSize,size_t * SSize,Long * FOffsets,Long * ROffsets,Long * SOffsets,cholmod_common * cc)13 void spqrgpu_computeFrontStaging
14 (
15 // inputs, not modified on output
16 Long numFronts, // total number of fronts (nf in caller)
17 Long *Parent, // size nf+1, assembly tree (f=nf is placeholder)
18 Long *Childp, // size nf+2, children of f are
19 // Child [Childp [f] ... Childp [f+1]-1]
20 Long *Child, // size nf+1.
21
22 Long *Fm, // size nf+1, front f has Fm [f] rows
23 Long *Cm, // size nf+1, front f has Cm [f] rows in contrib
24 Long *Rp, // size nf+1, Rj[Rp[f]...Rp[f+1]-1] are the cols in f
25 Long *Sp, // size m+1, row pointers for sparse matrix S
26 Long *Sleft, // size n+2, see spqr_stranspose for description
27 Long *Super, // size nf+1, front f pivotal cols are
28 // Super[f]..Super[f+1]-1
29 Long *Post, // size nf+1, front f is kth, if f = Post [k]
30
31 Long RimapSize, // scalar, size of Rimap on the GPU (# of int's)
32 Long RjmapSize, // scalar, size of Rimap on the GPU (# of int's)
33
34 // output, not defined on input:
35 bool *feasible, // scalar, true if feasible, false if GPU memory too low
36 Long *numStages, // scalar, number of stages
37 Long *Stagingp, // size nf+2, fronts are in the list
38 // Post [Stagingp [stage]...Stagingp[stage+1]-1]
39 Long *StageMap, // size nf, front f is in stage StageMap [f]
40
41 size_t *FSize, // size nf+1, FSize[stage]: size in bytes of MongoF
42 size_t *RSize, // size nf+1, Rsize[stage]: size in bytes of MongoR
43 size_t *SSize, // size nf+1, Ssize[stage]: size in bytes of S
44 Long *FOffsets, // size nf, front f in MondoF [FOffsets[f]...] on GPU
45 Long *ROffsets, // size nf, R block in MondoR [Roffsets[f]...] on CPU
46 Long *SOffsets, // size nf, S entries for front f are in
47 // wsS [SOffsets[f]...]
48
49 // input/output:
50 cholmod_common *cc
51 )
52 {
53
54 // -------------------------------------------------------------------------
55 // determine available GPU memory, required for all stages
56 // -------------------------------------------------------------------------
57
58 // gpuMemorySize = 0 is for testing only. This value forces each front to
59 // appear in its own stage. gpuMemorySize = 1 is also for testing, to
60 // check how this function handles problem that is infeasible due to lack
61 // of GPU memory. We must also ensure that gpuMemorySize does not
62 // accidentally become negative.
63
64 size_t gpuMemorySize = cc->gpuMemorySize;
65
66 // printf ("GPU mem starts %g MB\n", (double) gpuMemorySize / (1024*1024)) ;
67
68 // account for two Scheduler work queues in the GPU memory
69 if (gpuMemorySize > 1)
70 {
71 // The GPU must hold two workspace queues, each of size maxQueueSize,
72 // and where each entry is of size sizeof(TaskDescriptor)
73 size_t maxQueueSize = ssgpu_maxQueueSize (gpuMemorySize) ;
74 size_t s = 2 * maxQueueSize * sizeof (TaskDescriptor) ;
75 gpuMemorySize = (gpuMemorySize > s) ? (gpuMemorySize-s) : 0 ;
76 }
77
78 // account for Rimap in the GPU memory
79 if (gpuMemorySize > 1)
80 {
81 size_t s = RimapSize * sizeof (int) ;
82 gpuMemorySize = (gpuMemorySize > s) ? (gpuMemorySize-s) : 0 ;
83 }
84
85 // account for Rjmap in the GPU memory
86 if (gpuMemorySize > 1)
87 {
88 size_t s = RjmapSize * sizeof (int) ;
89 gpuMemorySize = (gpuMemorySize > s) ? (gpuMemorySize-s) : 0 ;
90 }
91
92 // account for cudaMalloc memory manager overhead in the GPU memory
93 if (gpuMemorySize > 1)
94 {
95 size_t s = 1024 * 1024 ; // just 1 MB for good measure
96 gpuMemorySize = (gpuMemorySize > s) ? (gpuMemorySize-s) : 0 ;
97 }
98
99 // printf ("GPU mem now %g MB\n", (double) gpuMemorySize / (1024*1024)) ;
100
101 // -------------------------------------------------------------------------
102 // assign fronts to stages based on remaining GPU memory availability
103 // -------------------------------------------------------------------------
104
105 /* The memory requirement for a front is the summation of the memory
106 requirements from its children plus its own memory requirement.
107 If we use a postorder traversal, we only need to keep a rolling sum. */
108 size_t ReqMem = 0; // also used in FOffsets
109
110 /* RMem is always < ReqMem.
111 RMem is not just for R, but an upper-bound on the amount of front data
112 we have to pull back from the GPU in the event that the front is staged.
113 We already would have room to pull back the CBlock if we account for it.
114 The QREngine is intelligent enough to pull only the needed FrontData.
115 The language is clearer in QREngine as well ("R" renamed "FrontData") */
116 size_t RMem = 0; // also used in ROffsets
117
118 /* SMem is twice the number of S values (one for index, one for value). */
119 size_t SMem = 0; // also used in SOffsets
120
121 /* VTMem is the amount of memory required for the VT blocks. */
122 size_t VTMem = 0;
123
124 Long stage = 0;
125 Stagingp[0] = 0;
126 for(Long p=0; p<numFronts; p++)
127 {
128 Long f = Post[p]; // The postordering ensures we visit children first
129
130 Long fm = Fm[f];
131 Long fn = Rp[f+1] - Rp[f];
132 Long fp = Super[f+1] - Super[f];
133 Long frank = MIN(fm, fp);
134 // Long cn = fn - fp ;
135 Long cm = Cm[f] ;
136 size_t frontMem = fm * fn; // F
137 size_t rMem = (frank + cm) * fn; // R + C
138
139 // for sMem, "2 *" assumes sizeof (SEntry) is 2*sizeof(double)
140 size_t sMem = 2 * (Sp[Sleft[Super[f+1]]] - Sp[Sleft[Super[f]]]);
141
142 // CEIL is defined in GPUQREngine
143 size_t vtMem = CEIL(fm, TILESIZE) * (TILESIZE+1) * TILESIZE ;
144
145 size_t childMemOld = 0;
146 size_t childMemNew = 0;
147
148 /* Add contribution block memory from children in earlier stages. */
149 for(Long cp=Childp[f]; cp<Childp[f+1]; cp++)
150 {
151 Long c = Child[cp];
152
153 // Long cfm = Fm[c];
154 Long cfn = Rp[c+1] - Rp[c];
155 Long cfp = Super[c+1] - Super[c];
156 // Long crank = MIN(cfm, cfp);
157 Long ccn = cfn - cfp ;
158 Long ccm = Cm[c];
159
160 if(StageMap[c] < stage)
161 {
162 childMemOld += ccm * ccn;
163 }
164 else
165 {
166 childMemNew += ccm * ccn;
167 }
168 }
169
170 /* determine which stage will contain this front */
171 if((ReqMem + (frontMem + childMemOld + sMem + vtMem)) * sizeof(double)
172 < gpuMemorySize)
173 {
174 /* If we can add the front to the current stage, accum its mem. */
175 FOffsets[f] = ReqMem;
176 ROffsets[f] = RMem;
177 SOffsets[f] = SMem / 2; // correct for data width
178 ReqMem += frontMem + childMemOld;
179 RMem += rMem;
180 SMem += sMem;
181 VTMem += vtMem;
182 }
183 else if (gpuMemorySize == 0 ||
184 ((frontMem + childMemOld + childMemNew + sMem + vtMem)
185 * sizeof(double) < gpuMemorySize))
186 {
187 /* Else if the front and its children fit on the GPU, add it
188 to the next stage and reset the mem accumulator. */
189 PR (("gpuMemorySize: move front to next stage\n")) ;
190 FSize[stage] = ReqMem; // save the sizes for the stage
191 RSize[stage] = RMem;
192 SSize[stage] = SMem / 2; // correct for data width
193 Stagingp[++stage] = p;
194 ReqMem = 0; // move onto the next stage
195 RMem = 0;
196 SMem = 0;
197 VTMem = 0;
198 FOffsets[f] = ReqMem;
199 ROffsets[f] = RMem;
200 SOffsets[f] = SMem / 2; // correct for data width
201 ReqMem += frontMem + childMemOld + childMemNew;
202 RMem += rMem;
203 SMem += sMem;
204 VTMem += vtMem;
205 }
206 else
207 {
208 /* Else the front and its children can't fit on the GPU,
209 so we have an infeasible schedule. */
210 PR (("gpuMemorySize too small: schedule infeasible\n")) ;
211 ERROR (CHOLMOD_GPU_PROBLEM, "GPU memory too small\n") ;
212 *numStages = 0 ;
213 *feasible = false ;
214 return ;
215 }
216 StageMap[f] = stage;
217 }
218
219 /* Make sure that even if everything fits in one stage
220 that we finalize the stage. */
221 FSize[stage] = ReqMem;
222 RSize[stage] = RMem;
223 SSize[stage] = SMem / 2; // correct for data width
224 Stagingp[++stage] = numFronts;
225 if(Stagingp[stage] == Stagingp[stage-1]) stage--;
226
227 *numStages = stage;
228 *feasible = true;
229 return;
230 }
231 #endif
232