1 // =============================================================================
2 // === GPUQREngine/Demo/dense_demo.cpp =========================================
3 // =============================================================================
4 
5 // GPUQREngine can be used to factorize a set of dense matrices of
6 // various sizes.  This is the demo for this 'dense' usage case.
7 // The 'sparse' case is exercised by SuiteSparseQR.
8 
9 #ifndef GPU_BLAS
10 #define GPU_BLAS
11 #endif
12 
13 #include "GPUQREngine.hpp"
14 #include "cholmod.h"
15 #include <time.h>
16 #include <stdio.h>
17 #include "GPUQREngine_Timing.hpp"
18 
19 //------------------------------------------------------------------------------
20 
randfill(double * x,Int m,Int n)21 void randfill(double *x, Int m, Int n)
22 {
23     for(int i=0; i<m; i++) {
24         for(int j=0; j<n; j++) {
25             x[i*n+j] = rand() / 1e8;
26         }
27     }
28 }
29 
30 //------------------------------------------------------------------------------
31 
randfill_stair(double * x,Int m,Int n,Int aggression)32 void randfill_stair(double *x, Int m, Int n, Int aggression)
33 {
34     int i = 0;
35     for(int j=0; j<n; j++)
36     {
37         int iend = MIN(i + (rand() % aggression) + 1, m);
38         for(i=0; i<m; i++)
39         {
40             x[i*n+j] = (i < iend ? rand() / 1e8 : 0.0);
41         }
42         i = iend;
43     }
44 }
45 
46 //------------------------------------------------------------------------------
47 
randfill_blocktriu(double * x,Int m,Int n)48 void randfill_blocktriu(double *x, Int m, Int n)
49 {
50     int ybmax = CEIL(m, TILESIZE);
51     int xbmax = CEIL(n, TILESIZE);
52 
53     for(int yb=0; yb<ybmax; yb++)
54     {
55         for(int xb=0; xb<xbmax; xb++)
56         {
57             if(yb > xb) continue;
58 
59             int imax = MIN(m, TILESIZE*(yb+1));
60             for(int i=TILESIZE*yb; i<imax; i++)
61             {
62                 int jmax = MIN(n, TILESIZE*(xb+1));
63                 for(int j=TILESIZE*xb; j<jmax; j++)
64                 {
65                     x[i*n+j] = rand() / 1e8;
66                 }
67             }
68         }
69     }
70 }
71 
72 //------------------------------------------------------------------------------
73 
printMatrix(FILE * troll,const char * name,int which,int f,double * inM,int m,int n)74 void printMatrix(FILE *troll, const char *name,
75     int which, int f, double *inM, int m, int n)
76 {
77 
78     double (*M)[n] = (double (*)[n]) inM;
79 
80     fprintf(troll, "%s%d {%d} = [\n", name, which, 1+f);
81     for(int i=0; i<m; i++)
82     {
83         for(int j=0; j<n; j++)
84         {
85             fprintf(troll, "%30.16e ", M[i][j]);
86         }
87         fprintf(troll, "\n");
88     }
89     fprintf(troll, "];\n");
90 }
91 
92 //------------------------------------------------------------------------------
93 
printQRScript(FILE * troll)94 void printQRScript (FILE *troll)
95 {
96 
97     fprintf (troll, "format compact;\n");
98     fprintf (troll, "res = norm(triu(R)'*triu(R)-A'*A) / norm(A'*A)\n");
99     fprintf (troll, "if isnan(res), error ('failure'); end\n");
100     fprintf (troll, "if res > 1e-12, error ('failure'); end\n");
101 
102 //  fprintf (troll, "[%d %d] = size (A) ;\n") ;
103 //  int rank = MIN(m, n);
104 //  fprintf (troll, "[Davis_R V1 T] = qrsigma_concise(A);\n");
105 //  fprintf (troll, "norm(triu(Davis_R) - triu(R))\n");
106 //  fprintf (troll, "norm(Davis_R(%d:end,:)-R(%d:end,:))\n", rank+1, rank+1);
107 
108     fprintf (troll, "MR = qr(A);\n");
109     fprintf (troll, "tR = triu(abs(R));");
110     fprintf (troll, "tMR = triu(abs(MR));");
111     fprintf (troll, "norm(tMR - tR)\n");
112 }
113 
114 //------------------------------------------------------------------------------
115 
driver(cholmod_common * cc,const char * filename,Front * fronts,int numFronts,QREngineStats * stats,int which)116 void driver
117 (
118     cholmod_common *cc,
119     const char *filename,
120     Front *fronts,
121     int numFronts,
122     QREngineStats *stats,
123     int which
124 )
125 {
126     FILE *troll = NULL ;
127 
128     if (filename != NULL)
129     {
130         troll = fopen(filename, "a");
131     }
132     if (troll != NULL)
133     {
134         fprintf (troll, "clear ;\n") ;
135         fprintf (troll, "A_%d = cell (1,%d) ;\n", which, numFronts) ;
136         fprintf (troll, "R_%d = cell (1,%d) ;\n", which, numFronts) ;
137     }
138 
139     for(int f=0; f<numFronts; f++)
140     {
141         Front *front = (&fronts[f]);
142         int m = front->fm;
143         int n = front->fn;
144 
145         /* Attach the front data. */
146         double *F = front->cpuR = front->F = (double*)
147             SuiteSparse_calloc(m*n, sizeof(double));
148         randfill(F, m, n);
149 
150         /* Print the A matrix. */
151         if (troll != NULL)
152         {
153             printMatrix (troll, "A_", which, f, F, m, n);
154         }
155     }
156 
157     /* Run the QREngine code */
158     QREngineResultCode result = GPUQREngine (cc->gpuMemorySize,
159         fronts, numFronts, stats);
160     if (result != QRENGINE_SUCCESS)
161     {
162         printf ("test failure!\n'") ;
163         exit (0) ;
164     }
165 
166     /* Do something with R factors. */
167     for(int f=0; f<numFronts; f++)
168     {
169         Front *front = (&fronts[f]);
170         int m = front->fm;
171         int n = front->fn;
172         double *R = front->cpuR;
173 
174         /* Print the R matrix. */
175         if (troll != NULL)
176         {
177             printMatrix(troll, "R_", which, f, R, m, n);
178             fprintf (troll, "R = R_%d {%d} ; A = A_%d {%d} ;\n",
179                 which, 1+f, which, 1+f) ;
180             printQRScript (troll) ;
181         }
182 
183         /* Detach the front data. */
184         SuiteSparse_free(front->F);
185         front->F = NULL;
186     }
187 
188     if (troll != NULL)
189     {
190         fprintf (troll, "disp ('all tests passed') ;\n") ;
191         fclose (troll) ;
192     }
193 }
194 
195 //------------------------------------------------------------------------------
196 
printStats(QREngineStats stats,int numFronts,double m,double n)197 void printStats(QREngineStats stats, int numFronts, double m, double n)
198 {
199     float kernelTime = stats.kernelTime;
200     Int numLaunches = stats.numLaunches;
201     Int gpuFlops = stats.flopsActual;
202 
203     /* Compute & Print FLOPS */
204     double time = (double) kernelTime;
205     double flops;
206     if(m >= n)
207     {
208         flops = 2.0 * n*n * (m - (n/3));
209     }
210     else
211     {
212         flops = 2.0 * m*m * (n - (m/3));
213     }
214 
215     flops *= (double) numFronts;
216     flops /= (time / 1e3);
217     double gflops = flops / 1e9;
218     double gpugflops = (gpuFlops / (time / 1e3)) / 1e9;
219     printf("m: %.0f, n: %.0f, nf: %d, nl: %ld, gpuFlops: %ld, t: %fms, gflops: %f, gpugflops: %f\n", m, n, numFronts, numLaunches, gpuFlops, time, gflops, gpugflops);
220 }
221 
222 //------------------------------------------------------------------------------
223 
experiment1(cholmod_common * cc,int which,int numFronts,int m,int n)224 void experiment1(cholmod_common *cc, int which, int numFronts, int m, int n)
225 {
226     /* Configure problem set. */
227     Front *fronts = (Front*) SuiteSparse_calloc(numFronts, sizeof(Front));
228     for(int f=0; f<numFronts; f++)
229     {
230         new (&fronts[f]) Front(f, EMPTY, m, n);
231     }
232 
233     /* Run the driver code. */
234     QREngineStats stats;
235     driver(cc, "troll.m", fronts, numFronts, &stats, which);
236 //  driver(cc, NULL,      fronts, numFronts, &stats, which);
237     printStats(stats, numFronts, m, n);
238 
239     /* Cleanup resources. */
240     for(int f=0; f<numFronts; f++) (&fronts[f])->~Front();
241     SuiteSparse_free(fronts);
242 }
243 
244 //------------------------------------------------------------------------------
245 
experiment2(cholmod_common * cc,int m,int n,int numFronts)246 void experiment2(cholmod_common *cc, int m, int n, int numFronts)
247 {
248     /* See if this experiment would blow out the memory. */
249     size_t threshold = 3.50 * 1024 * 1024 * 1024;
250     size_t memoryReq = (size_t) (numFronts * (CEIL(m, 32) * 32 * 33 + m * n)) ;
251     if(memoryReq * sizeof(double) > threshold) return;
252 
253     /* Configure problem set. */
254     Front *fronts = (Front*) SuiteSparse_calloc(numFronts, sizeof(Front));
255     for(int f=0; f<numFronts; f++)
256     {
257         new (&fronts[f]) Front(f, EMPTY, m, n);
258     }
259 
260     /* Run the driver code if we won't run out of memory. */
261     QREngineStats stats;
262     driver(cc, NULL,      fronts, numFronts, &stats, 0);   // no troll.m output
263     printStats(stats, numFronts, m, n);
264 
265     /* Cleanup resources. */
266     for(int f=0; f<numFronts; f++) (&fronts[f])->~Front();
267     SuiteSparse_free(fronts);
268 }
269 
270 
271 //------------------------------------------------------------------------------
272 
main(int argn,char ** argv)273 int main(int argn, char **argv)
274 {
275     double t ;
276     size_t total_mem, available_mem ;
277 
278     /* Clear the troll file. */
279     FILE *troll;
280     troll = fopen("troll.m", "w");
281     fclose(troll);
282 
283     srand(1);
284 
285     // start CHOLMOD
286     cholmod_common *cc, Common ;
287     cc = &Common ;
288     cholmod_l_start (cc) ;
289 
290     // warmup the GPU.  This can take some time, but only needs
291     // to be done once
292     cc->useGPU = true ;
293     t = SuiteSparse_time ( ) ;
294     cholmod_l_gpu_memorysize (&total_mem, &available_mem, cc) ;
295     cc->gpuMemorySize = available_mem ;
296     t = SuiteSparse_time ( ) - t ;
297     if (cc->gpuMemorySize <= 1)
298     {
299         printf ("no GPU available\n") ;
300         return (0) ;
301     }
302     printf ("available GPU memory: %g MB, warmup time: %g\n",
303         (double) (cc->gpuMemorySize) / (1024 * 1024), t) ;
304 
305     experiment1(cc, 1, 2, 8, 8);
306     experiment1(cc, 2, 2, 12, 8);
307     experiment1(cc, 3, 2, 64, 32);
308     experiment1(cc, 4, 1, 100, 200);
309 
310     printf ("to check results, run 'troll.m' in MATLAB\n") ;
311 
312 #if 0
313     for(int numFronts=1; numFronts<=128; numFronts*=2)
314     {
315         for(int dim=128; dim<=6144; dim+=128)
316         {
317             experiment2(cc, dim, dim, numFronts);
318         }
319     }
320 #endif
321 #if 0
322     for(int numFronts=1; numFronts<=128; numFronts*=2)
323     {
324         for(int smdim=128; smdim<=6144/4; smdim+=128)
325         {
326             experiment2(cc, smdim, 4*smdim, numFronts);
327             experiment2(cc, 4*smdim, smdim, numFronts);
328         }
329         for(int smdim=128; smdim<=6144/16; smdim+=128)
330         {
331             experiment2(cc, smdim, 16*smdim, numFronts);
332             experiment2(cc, 16*smdim, smdim, numFronts);
333         }
334     }
335 #endif
336 }
337 
338