1 // =============================================================================
2 // === GPUQREngine/Source/GPUQREngine_ExpertDense.cpp ==========================
3 // =============================================================================
4 //
5 // This file contains the dense GPUQREngine wrapper that finds the staircase,
6 // makes a copy of the user's front data, then calls down into the Internal
7 // GPUQREngine factorization routine.
8 //
9 // Other functions include:
10 //  - GPUQREngine_Cleanup:       Cleans up relevant workspaces in the dense
11 //                               factorization depending on how we're exiting.
12 //  - GPUQREngine_FindStaircase: Finds the staircase for a front and returns
13 //                               the staircase as an Int* list
14 // =============================================================================
15 
16 #include "GPUQREngine_Internal.hpp"
17 
18 
19 QREngineResultCode GPUQREngine_Cleanup
20 (
21     QREngineResultCode code,    // The result code that we're exiting with
22     Front *userFronts,          // The user-provided list of fronts
23     Front *fronts,              // The internal copy of the user's fronts
24     Int numFronts,              // The number of fronts to be factorized
25     Workspace *wsMongoF,        // Pointer to the total GPU Front workspace
26     Workspace *wsMongoR         // Pointer to the total CPU R workspace
27 );
28 
GPUQREngine(size_t gpuMemorySize,Front * userFronts,Int numFronts,QREngineStats * stats)29 QREngineResultCode GPUQREngine
30 (
31     size_t gpuMemorySize,   // The total available GPU memory size in bytes
32     Front *userFronts,      // The list of fronts to factorize
33     Int numFronts,          // The number of fronts to factorize
34     QREngineStats *stats    // An optional parameter. If present, statistics
35                             // are collected and passed back to the caller
36                             // via this struct
37 )
38 {
39     /* Allocate workspaces */
40     Front *fronts = (Front*) SuiteSparse_calloc(numFronts, sizeof(Front));
41     if(!fronts)
42     {
43         return QRENGINE_OUTOFMEMORY;
44     }
45 
46     size_t FSize, RSize;
47     FSize = RSize = 0;
48     for(int f=0; f<numFronts; f++)
49     {
50         /* Configure the front */
51         Front *userFront = &(userFronts[f]);
52         Int m = userFront->fm;
53         Int n = userFront->fn;
54         Front *front = new (&fronts[f]) Front(f, EMPTY, m, n);
55         FSize += front->getNumFrontValues();
56         RSize += front->getNumRValues();
57     }
58 
59     // We have to allocate page-locked CPU-GPU space to leverage asynchronous
60     // memory transfers.  This has to be done in a way that the CUDA driver is
61     // aware of, which unfortunately means making a copy of the user input.
62 
63     // calloc pagelocked space on CPU, and calloc space on the GPU
64     Workspace *wsMongoF = Workspace::allocate(FSize,    // CPU and GPU
65         sizeof(double), true, true, true, true);
66 
67     // calloc pagelocked space on the CPU.  Nothing on the GPU
68     Workspace *wsMongoR = Workspace::allocate(RSize,    // CPU
69         sizeof(double), true, true, false, true);
70 
71     /* Cleanup and return if we ran out of memory. */
72     if(!wsMongoF || !wsMongoR)
73     {
74         return GPUQREngine_Cleanup (QRENGINE_OUTOFMEMORY,
75             userFronts, fronts, numFronts, wsMongoF, wsMongoR);
76     }
77 
78     /* Prepare the fronts for GPU execution. */
79     size_t FOffset, ROffset;
80     FOffset = ROffset = 0;
81     for(int f=0; f<numFronts; f++)
82     {
83         // Set the front pointers; make the copy from user data into front data.
84         Front *front = &(fronts[f]);
85         front->F    = CPU_REFERENCE(wsMongoF, double*) + FOffset;
86         front->gpuF = GPU_REFERENCE(wsMongoF, double*) + FOffset;
87         front->cpuR = CPU_REFERENCE(wsMongoR, double*) + ROffset;
88         FOffset += front->getNumFrontValues();
89         ROffset += front->getNumRValues();
90 
91         /* COPY USER DATA (user's F to our F) */
92         Front *userFront = &(userFronts[f]);
93         double *userF = userFront->F;
94         double *F = front->F;
95         Int m = userFront->fm;
96         Int n = userFront->fn;
97         bool isColMajor = userFront->isColMajor;
98         Int ldn = userFront->ldn;
99         for(Int i=0; i<m; i++)
100         {
101             for(Int j=0; j<n; j++)
102             {
103                 F[i*n+j] = (isColMajor ? userF[j*ldn+i] : userF[i*ldn+j]);
104             }
105         }
106 
107         /* Attach either the user-specified Stair, or compute it. */
108         front->Stair = userFront->Stair;
109         if(!front->Stair) front->Stair = GPUQREngine_FindStaircase(front);
110 
111         /* Cleanup and return if we ran out of memory building the staircase */
112         if(!front->Stair)
113         {
114             return GPUQREngine_Cleanup (QRENGINE_OUTOFMEMORY,
115                 userFronts, fronts, numFronts, wsMongoF, wsMongoR);
116         }
117     }
118 
119     /* Transfer the fronts to the GPU. */
120     if(!wsMongoF->transfer(cudaMemcpyHostToDevice))
121     {
122         return GPUQREngine_Cleanup (QRENGINE_GPUERROR,
123             userFronts, fronts, numFronts, wsMongoF, wsMongoR);
124     }
125 
126     /* Do the factorization for this set of fronts. */
127     QREngineResultCode result = GPUQREngine_Internal(gpuMemorySize, fronts,
128         numFronts, NULL, NULL, NULL, stats);
129     if(result != QRENGINE_SUCCESS)
130     {
131         return GPUQREngine_Cleanup (result,
132             userFronts, fronts, numFronts, wsMongoF, wsMongoR);
133     }
134 
135     /* COPY USER DATA (our R back to user's R) */
136     for(int f=0; f<numFronts; f++)
137     {
138         Front *userFront = &(userFronts[f]);
139         double *R = (&fronts[f])->cpuR;
140         double *userR = userFront->cpuR;
141         Int m = userFront->fm;
142         Int n = userFront->fn;
143         Int rank = userFront->rank;
144         bool isColMajor = userFront->isColMajor;
145         Int ldn = userFront->ldn;
146         for(Int i=0; i<rank; i++)
147         {
148             for(Int j=0; j<n; j++)
149             {
150                 userR[i*ldn+j] = (isColMajor ? R[j*n+i] : R[i*n+j]);
151             }
152         }
153     }
154 
155     /* Return that the factorization was successful. */
156     return GPUQREngine_Cleanup (QRENGINE_SUCCESS,
157         userFronts, fronts, numFronts, wsMongoF, wsMongoR);
158 }
159 
GPUQREngine_Cleanup(QREngineResultCode code,Front * userFronts,Front * fronts,Int numFronts,Workspace * wsMongoF,Workspace * wsMongoR)160 QREngineResultCode GPUQREngine_Cleanup
161 (
162     QREngineResultCode code,    // The result code that we're exiting with
163     Front *userFronts,          // The user-provided list of fronts
164     Front *fronts,              // The internal copy of the user's fronts
165     Int numFronts,              // The number of fronts to be factorized
166     Workspace *wsMongoF,        // Pointer to the total GPU Front workspace
167     Workspace *wsMongoR         // Pointer to the total CPU R workspace
168 )
169 {
170     /* Cleanup fronts. */
171     for(int f=0; f<numFronts; f++)
172     {
173         Front *userFront = (&userFronts[f]);
174         Front *front = &(fronts[f]);
175         if(front != NULL)
176         {
177             /* If we had to attach our own stair, clean it up. */
178             if(userFront->Stair == NULL && front->Stair != NULL)
179             {
180                 front->Stair = (Int *) SuiteSparse_free(front->Stair);
181             }
182 
183             /* Detach front data since it's managed by the mongo. */
184             front->F = NULL;
185         }
186     }
187     fronts = (Front *) SuiteSparse_free(fronts);
188 
189     /* Free the mongo structures. Note that Workspace checks for NULL. */
190     wsMongoF = Workspace::destroy(wsMongoF);
191     wsMongoR = Workspace::destroy(wsMongoR);
192 
193     return code;
194 }
195 
GPUQREngine_FindStaircase(Front * front)196 Int *GPUQREngine_FindStaircase
197 (
198     Front *front                // The front whose staircase we are computing
199 )
200 {
201     Int fm = front->fm;
202     Int fn = front->fn;
203 
204     double *F = front->F;
205     Int *Stair = (Int*) SuiteSparse_malloc(fn, sizeof(Int));
206     if(!F || !Stair) return NULL;
207 
208     Int lastStair = 0;
209     for(int j=0; j<fn; j++)
210     {
211         int i;
212         for(i=fm-1; i>lastStair && F[i*fn+j] == 0.0; i--);
213         Stair[j] = lastStair = i;
214     }
215 
216     return Stair;
217 }
218