1 // =============================================================================
2 // === GPUQREngine/Source/GPUQREngine_ExpertDense.cpp ==========================
3 // =============================================================================
4 //
5 // This file contains the dense GPUQREngine wrapper that finds the staircase,
6 // makes a copy of the user's front data, then calls down into the Internal
7 // GPUQREngine factorization routine.
8 //
9 // Other functions include:
10 // - GPUQREngine_Cleanup: Cleans up relevant workspaces in the dense
11 // factorization depending on how we're exiting.
12 // - GPUQREngine_FindStaircase: Finds the staircase for a front and returns
13 // the staircase as an Int* list
14 // =============================================================================
15
16 #include "GPUQREngine_Internal.hpp"
17
18
19 QREngineResultCode GPUQREngine_Cleanup
20 (
21 QREngineResultCode code, // The result code that we're exiting with
22 Front *userFronts, // The user-provided list of fronts
23 Front *fronts, // The internal copy of the user's fronts
24 Int numFronts, // The number of fronts to be factorized
25 Workspace *wsMongoF, // Pointer to the total GPU Front workspace
26 Workspace *wsMongoR // Pointer to the total CPU R workspace
27 );
28
GPUQREngine(size_t gpuMemorySize,Front * userFronts,Int numFronts,QREngineStats * stats)29 QREngineResultCode GPUQREngine
30 (
31 size_t gpuMemorySize, // The total available GPU memory size in bytes
32 Front *userFronts, // The list of fronts to factorize
33 Int numFronts, // The number of fronts to factorize
34 QREngineStats *stats // An optional parameter. If present, statistics
35 // are collected and passed back to the caller
36 // via this struct
37 )
38 {
39 /* Allocate workspaces */
40 Front *fronts = (Front*) SuiteSparse_calloc(numFronts, sizeof(Front));
41 if(!fronts)
42 {
43 return QRENGINE_OUTOFMEMORY;
44 }
45
46 size_t FSize, RSize;
47 FSize = RSize = 0;
48 for(int f=0; f<numFronts; f++)
49 {
50 /* Configure the front */
51 Front *userFront = &(userFronts[f]);
52 Int m = userFront->fm;
53 Int n = userFront->fn;
54 Front *front = new (&fronts[f]) Front(f, EMPTY, m, n);
55 FSize += front->getNumFrontValues();
56 RSize += front->getNumRValues();
57 }
58
59 // We have to allocate page-locked CPU-GPU space to leverage asynchronous
60 // memory transfers. This has to be done in a way that the CUDA driver is
61 // aware of, which unfortunately means making a copy of the user input.
62
63 // calloc pagelocked space on CPU, and calloc space on the GPU
64 Workspace *wsMongoF = Workspace::allocate(FSize, // CPU and GPU
65 sizeof(double), true, true, true, true);
66
67 // calloc pagelocked space on the CPU. Nothing on the GPU
68 Workspace *wsMongoR = Workspace::allocate(RSize, // CPU
69 sizeof(double), true, true, false, true);
70
71 /* Cleanup and return if we ran out of memory. */
72 if(!wsMongoF || !wsMongoR)
73 {
74 return GPUQREngine_Cleanup (QRENGINE_OUTOFMEMORY,
75 userFronts, fronts, numFronts, wsMongoF, wsMongoR);
76 }
77
78 /* Prepare the fronts for GPU execution. */
79 size_t FOffset, ROffset;
80 FOffset = ROffset = 0;
81 for(int f=0; f<numFronts; f++)
82 {
83 // Set the front pointers; make the copy from user data into front data.
84 Front *front = &(fronts[f]);
85 front->F = CPU_REFERENCE(wsMongoF, double*) + FOffset;
86 front->gpuF = GPU_REFERENCE(wsMongoF, double*) + FOffset;
87 front->cpuR = CPU_REFERENCE(wsMongoR, double*) + ROffset;
88 FOffset += front->getNumFrontValues();
89 ROffset += front->getNumRValues();
90
91 /* COPY USER DATA (user's F to our F) */
92 Front *userFront = &(userFronts[f]);
93 double *userF = userFront->F;
94 double *F = front->F;
95 Int m = userFront->fm;
96 Int n = userFront->fn;
97 bool isColMajor = userFront->isColMajor;
98 Int ldn = userFront->ldn;
99 for(Int i=0; i<m; i++)
100 {
101 for(Int j=0; j<n; j++)
102 {
103 F[i*n+j] = (isColMajor ? userF[j*ldn+i] : userF[i*ldn+j]);
104 }
105 }
106
107 /* Attach either the user-specified Stair, or compute it. */
108 front->Stair = userFront->Stair;
109 if(!front->Stair) front->Stair = GPUQREngine_FindStaircase(front);
110
111 /* Cleanup and return if we ran out of memory building the staircase */
112 if(!front->Stair)
113 {
114 return GPUQREngine_Cleanup (QRENGINE_OUTOFMEMORY,
115 userFronts, fronts, numFronts, wsMongoF, wsMongoR);
116 }
117 }
118
119 /* Transfer the fronts to the GPU. */
120 if(!wsMongoF->transfer(cudaMemcpyHostToDevice))
121 {
122 return GPUQREngine_Cleanup (QRENGINE_GPUERROR,
123 userFronts, fronts, numFronts, wsMongoF, wsMongoR);
124 }
125
126 /* Do the factorization for this set of fronts. */
127 QREngineResultCode result = GPUQREngine_Internal(gpuMemorySize, fronts,
128 numFronts, NULL, NULL, NULL, stats);
129 if(result != QRENGINE_SUCCESS)
130 {
131 return GPUQREngine_Cleanup (result,
132 userFronts, fronts, numFronts, wsMongoF, wsMongoR);
133 }
134
135 /* COPY USER DATA (our R back to user's R) */
136 for(int f=0; f<numFronts; f++)
137 {
138 Front *userFront = &(userFronts[f]);
139 double *R = (&fronts[f])->cpuR;
140 double *userR = userFront->cpuR;
141 Int m = userFront->fm;
142 Int n = userFront->fn;
143 Int rank = userFront->rank;
144 bool isColMajor = userFront->isColMajor;
145 Int ldn = userFront->ldn;
146 for(Int i=0; i<rank; i++)
147 {
148 for(Int j=0; j<n; j++)
149 {
150 userR[i*ldn+j] = (isColMajor ? R[j*n+i] : R[i*n+j]);
151 }
152 }
153 }
154
155 /* Return that the factorization was successful. */
156 return GPUQREngine_Cleanup (QRENGINE_SUCCESS,
157 userFronts, fronts, numFronts, wsMongoF, wsMongoR);
158 }
159
GPUQREngine_Cleanup(QREngineResultCode code,Front * userFronts,Front * fronts,Int numFronts,Workspace * wsMongoF,Workspace * wsMongoR)160 QREngineResultCode GPUQREngine_Cleanup
161 (
162 QREngineResultCode code, // The result code that we're exiting with
163 Front *userFronts, // The user-provided list of fronts
164 Front *fronts, // The internal copy of the user's fronts
165 Int numFronts, // The number of fronts to be factorized
166 Workspace *wsMongoF, // Pointer to the total GPU Front workspace
167 Workspace *wsMongoR // Pointer to the total CPU R workspace
168 )
169 {
170 /* Cleanup fronts. */
171 for(int f=0; f<numFronts; f++)
172 {
173 Front *userFront = (&userFronts[f]);
174 Front *front = &(fronts[f]);
175 if(front != NULL)
176 {
177 /* If we had to attach our own stair, clean it up. */
178 if(userFront->Stair == NULL && front->Stair != NULL)
179 {
180 front->Stair = (Int *) SuiteSparse_free(front->Stair);
181 }
182
183 /* Detach front data since it's managed by the mongo. */
184 front->F = NULL;
185 }
186 }
187 fronts = (Front *) SuiteSparse_free(fronts);
188
189 /* Free the mongo structures. Note that Workspace checks for NULL. */
190 wsMongoF = Workspace::destroy(wsMongoF);
191 wsMongoR = Workspace::destroy(wsMongoR);
192
193 return code;
194 }
195
GPUQREngine_FindStaircase(Front * front)196 Int *GPUQREngine_FindStaircase
197 (
198 Front *front // The front whose staircase we are computing
199 )
200 {
201 Int fm = front->fm;
202 Int fn = front->fn;
203
204 double *F = front->F;
205 Int *Stair = (Int*) SuiteSparse_malloc(fn, sizeof(Int));
206 if(!F || !Stair) return NULL;
207
208 Int lastStair = 0;
209 for(int j=0; j<fn; j++)
210 {
211 int i;
212 for(i=fm-1; i>lastStair && F[i*fn+j] == 0.0; i--);
213 Stair[j] = lastStair = i;
214 }
215
216 return Stair;
217 }
218