1 // =============================================================================
2 // === GPUQREngine/Demo/dense_demo.cpp =========================================
3 // =============================================================================
4
5 // GPUQREngine can be used to factorize a set of dense matrices of
6 // various sizes. This is the demo for this 'dense' usage case.
7 // The 'sparse' case is exercised by SuiteSparseQR.
8
9 #ifndef GPU_BLAS
10 #define GPU_BLAS
11 #endif
12
13 #include "GPUQREngine.hpp"
14 #include "cholmod.h"
15 #include <time.h>
16 #include <stdio.h>
17 #include "GPUQREngine_Timing.hpp"
18
19 //------------------------------------------------------------------------------
20
randfill(double * x,Int m,Int n)21 void randfill(double *x, Int m, Int n)
22 {
23 for(int i=0; i<m; i++) {
24 for(int j=0; j<n; j++) {
25 x[i*n+j] = rand() / 1e8;
26 }
27 }
28 }
29
30 //------------------------------------------------------------------------------
31
randfill_stair(double * x,Int m,Int n,Int aggression)32 void randfill_stair(double *x, Int m, Int n, Int aggression)
33 {
34 int i = 0;
35 for(int j=0; j<n; j++)
36 {
37 int iend = MIN(i + (rand() % aggression) + 1, m);
38 for(i=0; i<m; i++)
39 {
40 x[i*n+j] = (i < iend ? rand() / 1e8 : 0.0);
41 }
42 i = iend;
43 }
44 }
45
46 //------------------------------------------------------------------------------
47
randfill_blocktriu(double * x,Int m,Int n)48 void randfill_blocktriu(double *x, Int m, Int n)
49 {
50 int ybmax = CEIL(m, TILESIZE);
51 int xbmax = CEIL(n, TILESIZE);
52
53 for(int yb=0; yb<ybmax; yb++)
54 {
55 for(int xb=0; xb<xbmax; xb++)
56 {
57 if(yb > xb) continue;
58
59 int imax = MIN(m, TILESIZE*(yb+1));
60 for(int i=TILESIZE*yb; i<imax; i++)
61 {
62 int jmax = MIN(n, TILESIZE*(xb+1));
63 for(int j=TILESIZE*xb; j<jmax; j++)
64 {
65 x[i*n+j] = rand() / 1e8;
66 }
67 }
68 }
69 }
70 }
71
72 //------------------------------------------------------------------------------
73
printMatrix(FILE * troll,const char * name,int which,int f,double * inM,int m,int n)74 void printMatrix(FILE *troll, const char *name,
75 int which, int f, double *inM, int m, int n)
76 {
77
78 double (*M)[n] = (double (*)[n]) inM;
79
80 fprintf(troll, "%s%d {%d} = [\n", name, which, 1+f);
81 for(int i=0; i<m; i++)
82 {
83 for(int j=0; j<n; j++)
84 {
85 fprintf(troll, "%30.16e ", M[i][j]);
86 }
87 fprintf(troll, "\n");
88 }
89 fprintf(troll, "];\n");
90 }
91
92 //------------------------------------------------------------------------------
93
printQRScript(FILE * troll)94 void printQRScript (FILE *troll)
95 {
96
97 fprintf (troll, "format compact;\n");
98 fprintf (troll, "res = norm(triu(R)'*triu(R)-A'*A) / norm(A'*A)\n");
99 fprintf (troll, "if isnan(res), error ('failure'); end\n");
100 fprintf (troll, "if res > 1e-12, error ('failure'); end\n");
101
102 // fprintf (troll, "[%d %d] = size (A) ;\n") ;
103 // int rank = MIN(m, n);
104 // fprintf (troll, "[Davis_R V1 T] = qrsigma_concise(A);\n");
105 // fprintf (troll, "norm(triu(Davis_R) - triu(R))\n");
106 // fprintf (troll, "norm(Davis_R(%d:end,:)-R(%d:end,:))\n", rank+1, rank+1);
107
108 fprintf (troll, "MR = qr(A);\n");
109 fprintf (troll, "tR = triu(abs(R));");
110 fprintf (troll, "tMR = triu(abs(MR));");
111 fprintf (troll, "norm(tMR - tR)\n");
112 }
113
114 //------------------------------------------------------------------------------
115
driver(cholmod_common * cc,const char * filename,Front * fronts,int numFronts,QREngineStats * stats,int which)116 void driver
117 (
118 cholmod_common *cc,
119 const char *filename,
120 Front *fronts,
121 int numFronts,
122 QREngineStats *stats,
123 int which
124 )
125 {
126 FILE *troll = NULL ;
127
128 if (filename != NULL)
129 {
130 troll = fopen(filename, "a");
131 }
132 if (troll != NULL)
133 {
134 fprintf (troll, "clear ;\n") ;
135 fprintf (troll, "A_%d = cell (1,%d) ;\n", which, numFronts) ;
136 fprintf (troll, "R_%d = cell (1,%d) ;\n", which, numFronts) ;
137 }
138
139 for(int f=0; f<numFronts; f++)
140 {
141 Front *front = (&fronts[f]);
142 int m = front->fm;
143 int n = front->fn;
144
145 /* Attach the front data. */
146 double *F = front->cpuR = front->F = (double*)
147 SuiteSparse_calloc(m*n, sizeof(double));
148 randfill(F, m, n);
149
150 /* Print the A matrix. */
151 if (troll != NULL)
152 {
153 printMatrix (troll, "A_", which, f, F, m, n);
154 }
155 }
156
157 /* Run the QREngine code */
158 QREngineResultCode result = GPUQREngine (cc->gpuMemorySize,
159 fronts, numFronts, stats);
160 if (result != QRENGINE_SUCCESS)
161 {
162 printf ("test failure!\n'") ;
163 exit (0) ;
164 }
165
166 /* Do something with R factors. */
167 for(int f=0; f<numFronts; f++)
168 {
169 Front *front = (&fronts[f]);
170 int m = front->fm;
171 int n = front->fn;
172 double *R = front->cpuR;
173
174 /* Print the R matrix. */
175 if (troll != NULL)
176 {
177 printMatrix(troll, "R_", which, f, R, m, n);
178 fprintf (troll, "R = R_%d {%d} ; A = A_%d {%d} ;\n",
179 which, 1+f, which, 1+f) ;
180 printQRScript (troll) ;
181 }
182
183 /* Detach the front data. */
184 SuiteSparse_free(front->F);
185 front->F = NULL;
186 }
187
188 if (troll != NULL)
189 {
190 fprintf (troll, "disp ('all tests passed') ;\n") ;
191 fclose (troll) ;
192 }
193 }
194
195 //------------------------------------------------------------------------------
196
printStats(QREngineStats stats,int numFronts,double m,double n)197 void printStats(QREngineStats stats, int numFronts, double m, double n)
198 {
199 float kernelTime = stats.kernelTime;
200 Int numLaunches = stats.numLaunches;
201 Int gpuFlops = stats.flopsActual;
202
203 /* Compute & Print FLOPS */
204 double time = (double) kernelTime;
205 double flops;
206 if(m >= n)
207 {
208 flops = 2.0 * n*n * (m - (n/3));
209 }
210 else
211 {
212 flops = 2.0 * m*m * (n - (m/3));
213 }
214
215 flops *= (double) numFronts;
216 flops /= (time / 1e3);
217 double gflops = flops / 1e9;
218 double gpugflops = (gpuFlops / (time / 1e3)) / 1e9;
219 printf("m: %.0f, n: %.0f, nf: %d, nl: %ld, gpuFlops: %ld, t: %fms, gflops: %f, gpugflops: %f\n", m, n, numFronts, numLaunches, gpuFlops, time, gflops, gpugflops);
220 }
221
222 //------------------------------------------------------------------------------
223
experiment1(cholmod_common * cc,int which,int numFronts,int m,int n)224 void experiment1(cholmod_common *cc, int which, int numFronts, int m, int n)
225 {
226 /* Configure problem set. */
227 Front *fronts = (Front*) SuiteSparse_calloc(numFronts, sizeof(Front));
228 for(int f=0; f<numFronts; f++)
229 {
230 new (&fronts[f]) Front(f, EMPTY, m, n);
231 }
232
233 /* Run the driver code. */
234 QREngineStats stats;
235 driver(cc, "troll.m", fronts, numFronts, &stats, which);
236 // driver(cc, NULL, fronts, numFronts, &stats, which);
237 printStats(stats, numFronts, m, n);
238
239 /* Cleanup resources. */
240 for(int f=0; f<numFronts; f++) (&fronts[f])->~Front();
241 SuiteSparse_free(fronts);
242 }
243
244 //------------------------------------------------------------------------------
245
experiment2(cholmod_common * cc,int m,int n,int numFronts)246 void experiment2(cholmod_common *cc, int m, int n, int numFronts)
247 {
248 /* See if this experiment would blow out the memory. */
249 size_t threshold = 3.50 * 1024 * 1024 * 1024;
250 size_t memoryReq = (size_t) (numFronts * (CEIL(m, 32) * 32 * 33 + m * n)) ;
251 if(memoryReq * sizeof(double) > threshold) return;
252
253 /* Configure problem set. */
254 Front *fronts = (Front*) SuiteSparse_calloc(numFronts, sizeof(Front));
255 for(int f=0; f<numFronts; f++)
256 {
257 new (&fronts[f]) Front(f, EMPTY, m, n);
258 }
259
260 /* Run the driver code if we won't run out of memory. */
261 QREngineStats stats;
262 driver(cc, NULL, fronts, numFronts, &stats, 0); // no troll.m output
263 printStats(stats, numFronts, m, n);
264
265 /* Cleanup resources. */
266 for(int f=0; f<numFronts; f++) (&fronts[f])->~Front();
267 SuiteSparse_free(fronts);
268 }
269
270
271 //------------------------------------------------------------------------------
272
main(int argn,char ** argv)273 int main(int argn, char **argv)
274 {
275 double t ;
276 size_t total_mem, available_mem ;
277
278 /* Clear the troll file. */
279 FILE *troll;
280 troll = fopen("troll.m", "w");
281 fclose(troll);
282
283 srand(1);
284
285 // start CHOLMOD
286 cholmod_common *cc, Common ;
287 cc = &Common ;
288 cholmod_l_start (cc) ;
289
290 // warmup the GPU. This can take some time, but only needs
291 // to be done once
292 cc->useGPU = true ;
293 t = SuiteSparse_time ( ) ;
294 cholmod_l_gpu_memorysize (&total_mem, &available_mem, cc) ;
295 cc->gpuMemorySize = available_mem ;
296 t = SuiteSparse_time ( ) - t ;
297 if (cc->gpuMemorySize <= 1)
298 {
299 printf ("no GPU available\n") ;
300 return (0) ;
301 }
302 printf ("available GPU memory: %g MB, warmup time: %g\n",
303 (double) (cc->gpuMemorySize) / (1024 * 1024), t) ;
304
305 experiment1(cc, 1, 2, 8, 8);
306 experiment1(cc, 2, 2, 12, 8);
307 experiment1(cc, 3, 2, 64, 32);
308 experiment1(cc, 4, 1, 100, 200);
309
310 printf ("to check results, run 'troll.m' in MATLAB\n") ;
311
312 #if 0
313 for(int numFronts=1; numFronts<=128; numFronts*=2)
314 {
315 for(int dim=128; dim<=6144; dim+=128)
316 {
317 experiment2(cc, dim, dim, numFronts);
318 }
319 }
320 #endif
321 #if 0
322 for(int numFronts=1; numFronts<=128; numFronts*=2)
323 {
324 for(int smdim=128; smdim<=6144/4; smdim+=128)
325 {
326 experiment2(cc, smdim, 4*smdim, numFronts);
327 experiment2(cc, 4*smdim, smdim, numFronts);
328 }
329 for(int smdim=128; smdim<=6144/16; smdim+=128)
330 {
331 experiment2(cc, smdim, 16*smdim, numFronts);
332 experiment2(cc, 16*smdim, smdim, numFronts);
333 }
334 }
335 #endif
336 }
337
338