1 // =============================================================================
2 // === spqrgpu_computeFrontStaging =============================================
3 // =============================================================================
4 
5 // Returns a front staging and whether the staging is feasible.
6 // A front staging is infeasible if a front and its children do not fit on
7 // the GPU at the same time.
8 
9 #ifdef GPU_BLAS
10 #include "spqr.hpp"
11 #include "GPUQREngine_Scheduler.hpp"
12 
spqrgpu_computeFrontStaging(Long numFronts,Long * Parent,Long * Childp,Long * Child,Long * Fm,Long * Cm,Long * Rp,Long * Sp,Long * Sleft,Long * Super,Long * Post,Long RimapSize,Long RjmapSize,bool * feasible,Long * numStages,Long * Stagingp,Long * StageMap,size_t * FSize,size_t * RSize,size_t * SSize,Long * FOffsets,Long * ROffsets,Long * SOffsets,cholmod_common * cc)13 void spqrgpu_computeFrontStaging
14 (
15     // inputs, not modified on output
16     Long numFronts,     // total number of fronts (nf in caller)
17     Long *Parent,       // size nf+1, assembly tree (f=nf is placeholder)
18     Long *Childp,       // size nf+2, children of f are
19                         //      Child [Childp [f] ... Childp [f+1]-1]
20     Long *Child,        // size nf+1.
21 
22     Long *Fm,           // size nf+1, front f has Fm [f] rows
23     Long *Cm,           // size nf+1, front f has Cm [f] rows in contrib
24     Long *Rp,           // size nf+1, Rj[Rp[f]...Rp[f+1]-1] are the cols in f
25     Long *Sp,           // size m+1, row pointers for sparse matrix S
26     Long *Sleft,        // size n+2, see spqr_stranspose for description
27     Long *Super,        // size nf+1, front f pivotal cols are
28                         //      Super[f]..Super[f+1]-1
29     Long *Post,         // size nf+1, front f is kth, if f = Post [k]
30 
31     Long RimapSize,     // scalar, size of Rimap on the GPU (# of int's)
32     Long RjmapSize,     // scalar, size of Rimap on the GPU (# of int's)
33 
34     // output, not defined on input:
35     bool *feasible,     // scalar, true if feasible, false if GPU memory too low
36     Long *numStages,    // scalar, number of stages
37     Long *Stagingp,     // size nf+2, fronts are in the list
38                         //      Post [Stagingp [stage]...Stagingp[stage+1]-1]
39     Long *StageMap,     // size nf, front f is in stage StageMap [f]
40 
41     size_t *FSize,      // size nf+1, FSize[stage]: size in bytes of MongoF
42     size_t *RSize,      // size nf+1, Rsize[stage]: size in bytes of MongoR
43     size_t *SSize,      // size nf+1, Ssize[stage]: size in bytes of S
44     Long *FOffsets,     // size nf, front f in MondoF [FOffsets[f]...] on GPU
45     Long *ROffsets,     // size nf, R block in MondoR [Roffsets[f]...] on CPU
46     Long *SOffsets,     // size nf, S entries for front f are in
47                         //      wsS [SOffsets[f]...]
48 
49     // input/output:
50     cholmod_common *cc
51 )
52 {
53 
54     // -------------------------------------------------------------------------
55     // determine available GPU memory, required for all stages
56     // -------------------------------------------------------------------------
57 
58     // gpuMemorySize = 0 is for testing only.  This value forces each front to
59     // appear in its own stage.  gpuMemorySize = 1 is also for testing, to
60     // check how this function handles problem that is infeasible due to lack
61     // of GPU memory.  We must also ensure that gpuMemorySize does not
62     // accidentally become negative.
63 
64     size_t gpuMemorySize = cc->gpuMemorySize;
65 
66 //  printf ("GPU mem starts %g MB\n", (double) gpuMemorySize / (1024*1024)) ;
67 
68     // account for two Scheduler work queues in the GPU memory
69     if (gpuMemorySize > 1)
70     {
71         // The GPU must hold two workspace queues, each of size maxQueueSize,
72         // and where each entry is of size sizeof(TaskDescriptor)
73         size_t maxQueueSize = ssgpu_maxQueueSize (gpuMemorySize) ;
74         size_t s = 2 * maxQueueSize * sizeof (TaskDescriptor) ;
75         gpuMemorySize = (gpuMemorySize > s) ? (gpuMemorySize-s) : 0 ;
76     }
77 
78     // account for Rimap in the GPU memory
79     if (gpuMemorySize > 1)
80     {
81         size_t s = RimapSize * sizeof (int) ;
82         gpuMemorySize = (gpuMemorySize > s) ? (gpuMemorySize-s) : 0 ;
83     }
84 
85     // account for Rjmap in the GPU memory
86     if (gpuMemorySize > 1)
87     {
88         size_t s = RjmapSize * sizeof (int) ;
89         gpuMemorySize = (gpuMemorySize > s) ? (gpuMemorySize-s) : 0 ;
90     }
91 
92     // account for cudaMalloc memory manager overhead in the GPU memory
93     if (gpuMemorySize > 1)
94     {
95         size_t s = 1024 * 1024 ;        // just 1 MB for good measure
96         gpuMemorySize = (gpuMemorySize > s) ? (gpuMemorySize-s) : 0 ;
97     }
98 
99 //  printf ("GPU mem now    %g MB\n", (double) gpuMemorySize / (1024*1024)) ;
100 
101     // -------------------------------------------------------------------------
102     // assign fronts to stages based on remaining GPU memory availability
103     // -------------------------------------------------------------------------
104 
105     /* The memory requirement for a front is the summation of the memory
106        requirements from its children plus its own memory requirement.
107        If we use a postorder traversal, we only need to keep a rolling sum. */
108     size_t ReqMem = 0;   // also used in FOffsets
109 
110     /* RMem is always < ReqMem.
111        RMem is not just for R, but an upper-bound on the amount of front data
112        we have to pull back from the GPU in the event that the front is staged.
113        We already would have room to pull back the CBlock if we account for it.
114        The QREngine is intelligent enough to pull only the needed FrontData.
115        The language is clearer in QREngine as well ("R" renamed "FrontData") */
116     size_t RMem = 0;     // also used in ROffsets
117 
118     /* SMem is twice the number of S values (one for index, one for value). */
119     size_t SMem = 0;     // also used in SOffsets
120 
121     /* VTMem is the amount of memory required for the VT blocks. */
122     size_t VTMem = 0;
123 
124     Long stage = 0;
125     Stagingp[0] = 0;
126     for(Long p=0; p<numFronts; p++)
127     {
128         Long f = Post[p]; // The postordering ensures we visit children first
129 
130         Long fm = Fm[f];
131         Long fn = Rp[f+1] - Rp[f];
132         Long fp = Super[f+1] - Super[f];
133         Long frank = MIN(fm, fp);
134         // Long cn = fn - fp ;
135         Long cm = Cm[f] ;
136         size_t frontMem = fm * fn;          // F
137         size_t rMem = (frank + cm) * fn;    // R + C
138 
139         // for sMem, "2 *" assumes sizeof (SEntry) is 2*sizeof(double)
140         size_t sMem = 2 * (Sp[Sleft[Super[f+1]]] - Sp[Sleft[Super[f]]]);
141 
142         // CEIL is defined in GPUQREngine
143         size_t vtMem = CEIL(fm, TILESIZE) * (TILESIZE+1) * TILESIZE ;
144 
145         size_t childMemOld = 0;
146         size_t childMemNew = 0;
147 
148         /* Add contribution block memory from children in earlier stages. */
149         for(Long cp=Childp[f]; cp<Childp[f+1]; cp++)
150         {
151             Long c = Child[cp];
152 
153             // Long cfm = Fm[c];
154             Long cfn = Rp[c+1] - Rp[c];
155             Long cfp = Super[c+1] - Super[c];
156             // Long crank = MIN(cfm, cfp);
157             Long ccn = cfn - cfp ;
158             Long ccm = Cm[c];
159 
160             if(StageMap[c] < stage)
161             {
162                 childMemOld += ccm * ccn;
163             }
164             else
165             {
166                 childMemNew += ccm * ccn;
167             }
168         }
169 
170         /* determine which stage will contain this front */
171         if((ReqMem + (frontMem + childMemOld + sMem + vtMem)) * sizeof(double)
172             < gpuMemorySize)
173         {
174             /* If we can add the front to the current stage, accum its mem. */
175             FOffsets[f] = ReqMem;
176             ROffsets[f] = RMem;
177             SOffsets[f] = SMem / 2; // correct for data width
178             ReqMem += frontMem + childMemOld;
179             RMem += rMem;
180             SMem += sMem;
181             VTMem += vtMem;
182         }
183         else if (gpuMemorySize == 0 ||
184             ((frontMem + childMemOld + childMemNew + sMem + vtMem)
185              * sizeof(double) < gpuMemorySize))
186         {
187             /* Else if the front and its children fit on the GPU, add it
188                to the next stage and reset the mem accumulator. */
189             PR (("gpuMemorySize: move front to next stage\n")) ;
190             FSize[stage] = ReqMem;  // save the sizes for the stage
191             RSize[stage] = RMem;
192             SSize[stage] = SMem / 2; // correct for data width
193             Stagingp[++stage] = p;
194             ReqMem = 0;             // move onto the next stage
195             RMem = 0;
196             SMem = 0;
197             VTMem = 0;
198             FOffsets[f] = ReqMem;
199             ROffsets[f] = RMem;
200             SOffsets[f] = SMem / 2; // correct for data width
201             ReqMem += frontMem + childMemOld + childMemNew;
202             RMem += rMem;
203             SMem += sMem;
204             VTMem += vtMem;
205         }
206         else
207         {
208             /* Else the front and its children can't fit on the GPU,
209                so we have an infeasible schedule. */
210             PR (("gpuMemorySize too small: schedule infeasible\n")) ;
211             ERROR (CHOLMOD_GPU_PROBLEM, "GPU memory too small\n") ;
212             *numStages = 0 ;
213             *feasible = false ;
214             return ;
215         }
216         StageMap[f] = stage;
217     }
218 
219     /* Make sure that even if everything fits in one stage
220        that we finalize the stage. */
221     FSize[stage] = ReqMem;
222     RSize[stage] = RMem;
223     SSize[stage] = SMem / 2;  // correct for data width
224     Stagingp[++stage] = numFronts;
225     if(Stagingp[stage] == Stagingp[stage-1]) stage--;
226 
227     *numStages = stage;
228     *feasible = true;
229     return;
230 }
231 #endif
232