1 // This file is part of BOINC.
2 // http://boinc.berkeley.edu
3 // Copyright (C) 2013 University of California
4 //
5 // BOINC is free software; you can redistribute it and/or modify it
6 // under the terms of the GNU Lesser General Public License
7 // as published by the Free Software Foundation,
8 // either version 3 of the License, or (at your option) any later version.
9 //
10 // BOINC is distributed in the hope that it will be useful,
11 // but WITHOUT ANY WARRANTY; without even the implied warranty of
12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
13 // See the GNU Lesser General Public License for more details.
14 //
15 // You should have received a copy of the GNU Lesser General Public License
16 // along with BOINC.  If not, see <http://www.gnu.org/licenses/>.
17 //
18 // This program serves as both
19 // - An example BOINC-OpenCL application, illustrating the use of the BOINC API
20 //   and OpenCL API.
21 // - A program for testing various features of BOINC.
22 //
23 // The program reads the input nxn matrix from the "input" file, inverts the
24 // matrix NUM_ITERATIONS times and write to "output" file.
25 //
26 // To run, place the executable in the same directory as an init_data.xml
27 // file specifying the gpu_type (vendor) and gpu_device_num, then invoke
28 // form the command line as follows:
29 // $ cd to/the/directory/containing/executable/and/init_data.mxl/file
30 // $ ./openclapp [options]
31 //
32 // command line options
33 // -run_slow: sleep 1 second after each character
34 // -cpu_time N: use about N CPU seconds after copying files
35 // -early_exit: exit(10) after 30 iterations
36 // -early_crash: crash after 30 iterations
37 //
38 // See http://boinc.berkeley.edu/trac/wiki/GPUApp for any compiling issues.
39 // Original contributor: Tuan Le (tuanle86@berkeley.edu)
40 
41 #include "openclapp.hpp"
42 #include "boinc_opencl.h"
43 
44 using std::string;
45 
main(int argc,char * argv[])46 int main(int argc, char * argv[]) {
47     int i, retval, lastInversion=0, checkpointExists=0, matrixSize=0;
48     double fd;
49     char input_path[512], output_path[512], chkpt_path[512], buf[256];
50     MFILE out;
51     FILE* state, *infile;
52 
53     generate_random_input_file(MATRIX_SIZE); //call this if you don't want to
54                                              //construct the input file manually
55 	for (i=0; i<argc; i++) {
56         if (!strcmp(argv[i], "-early_exit")) early_exit = true;
57         if (!strcmp(argv[i], "-early_crash")) early_crash = true;
58         if (!strcmp(argv[i], "-run_slow")) run_slow = true;
59         if (!strcmp(argv[i], "-cpu_time")) {
60             cpu_time = atof(argv[++i]);
61         }
62     }
63 
64     retval = boinc_init();
65     if (retval) {
66         fprintf(stderr,
67             "ERROR: %s boinc_init returned %d\n",
68             boinc_msg_prefix(buf, sizeof(buf)), retval
69         );
70         exit(retval);
71     }
72 
73     // open the input file (resolve logical name first)
74     //
75     boinc_resolve_filename(INPUT_FILENAME, input_path, sizeof(input_path));
76     infile = boinc_fopen(input_path, "r");
77     if (!infile) {
78         fprintf(stderr,
79             "ERROR: %s Couldn't find input file, resolved name %s.\n",
80             boinc_msg_prefix(buf, sizeof(buf)), input_path
81         );
82         getchar();
83         exit(-1);
84     }
85 
86     boinc_resolve_filename(OUTPUT_FILENAME, output_path, sizeof(output_path));
87 
88     // See if there's a valid checkpoint file.
89     // If so retrieve the current matrix and inversion number
90     //
91     boinc_resolve_filename(CHECKPOINT_FILE, chkpt_path, sizeof(chkpt_path));
92     state = boinc_fopen(chkpt_path, "r");
93     if (state) {
94         printf("Checkpoint file is detected. Read from checkpoint file ... \n");
95         checkpointExists=fscanf(state, "%d", &lastInversion);
96         if (checkpointExists == 1) {
97             isStateFileInUse=true;
98             printf("Last inversion # is : %d\n",lastInversion);
99             fscanf(state,"%d",&matrixSize);
100             width=height=matrixSize;
101             printf("Initialize host ....\n");
102             initialize_host(state);
103         }
104         fclose(state);
105     } else {
106         printf("There's no valid checkpoint file!\n");
107     }
108 
109     retval = out.open(output_path, "wb");
110 
111     if (retval) {
112         fprintf(stderr,
113             "ERROR: %s APP: matrix_inversion output open failed:\n",
114             boinc_msg_prefix(buf, sizeof(buf))
115         );
116         fprintf(stderr,
117             "ERROR: %s resolved name %s, retval %d\n",
118             boinc_msg_prefix(buf, sizeof(buf)), output_path, retval
119         );
120         perror("open");
121         exit(1);
122     }
123 
124 #ifdef APP_GRAPHICS
125     // create shared mem segment for graphics, and arrange to update it
126     //
127     shmem = (UC_SHMEM*)boinc_graphics_make_shmem("matrix_inversion", sizeof(UC_SHMEM));
128     if (!shmem) {
129         fprintf(stderr,
130             "ERROR: %s failed to create shared mem segment\n",
131             boinc_msg_prefix(buf, sizeof(buf))
132         );
133     }
134     update_shmem();
135     boinc_register_timer_callback(update_shmem);
136 #endif
137 
138     if (checkpointExists != 1) { //checkpoint file is not found.
139         matrixSize=get_matrix_size(infile);
140         printf("Matrix Size: width = height = %d\n",matrixSize);
141         width=height=matrixSize;
142         // Initialize Host application
143         printf("Initialize host ....\n");
144         if (initialize_host(infile)==1) {
145             return 1;
146         }
147         out.printf("\n----------------- Before being inversed ----------------\n\n");
148         printf("Computation is running ... Inverse the matrix %d times. Start at inversion #1\n",
149                NUM_ITERATIONS);
150     } else {
151         out.printf("\n----------------- Last checkpointed inversion #%d ----------------\n\n",
152                    lastInversion);
153         printf("Computation is resumed ... Inverse the matrix %d more times. Start at inversion #%d\n",
154                NUM_ITERATIONS-lastInversion,lastInversion+1);
155     }
156 
157     // Initialize OpenCL resources
158     if (initialize_cl(argc, argv) != 0) {
159         return 1;
160     }
161 
162     print_to_file(&out,input,matrixSize);
163 
164     for (int i=lastInversion+1;i<=NUM_ITERATIONS;++i) {
165         //the invert function will trigger kernel calls.
166         invert(input,output,matrixSize);
167         printf("Finish inversion #%d\n",i);
168         for (int j=0;j<matrixSize*matrixSize;++j) {
169             input[j]=output[j]; //change the input for the next iteration
170         }
171         if (run_slow) {
172             boinc_sleep(1.);
173         }
174 
175         if (early_exit && i>30) {
176             exit(-10);
177         }
178 
179         if (early_crash && i>30) {
180             boinc_crash();
181         }
182 
183         if (boinc_time_to_checkpoint()) {
184             printf("Perform checkpointing at inversion # %d\n",i);
185             //we'll need to write the current matrix to the state file.
186             retval = do_checkpoint(out, i, input, matrixSize);
187             if (retval) {
188                 fprintf(stderr,
189                     "ERROR: %s APP: matrix_inversion checkpoint failed %d\n",
190                     boinc_msg_prefix(buf, sizeof(buf)), retval
191                 );
192                 exit(retval);
193             }
194             boinc_checkpoint_completed();
195         }
196         fd = i/NUM_ITERATIONS;
197         if (cpu_time) fd /= 2;
198         boinc_fraction_done(fd);
199     }
200 
201     out.printf("\n\n----------------- Final inversion #%d ----------------\n\n",
202                NUM_ITERATIONS);
203     print_to_file(&out,output,matrixSize);
204 
205     retval = out.flush(); //force the output file to be closed.
206     if (retval) {
207         fprintf(stderr,
208             "ERROR: %s APP: matrix_inversion flush failed %d\n",
209             boinc_msg_prefix(buf, sizeof(buf)), retval
210         );
211         exit(1);
212     }
213 
214     // Releases OpenCL resources
215     if (cleanup_cl()==1) {
216         fprintf(stderr, "Error from cleanup_cl() !");
217         return 1;
218     }
219 
220     // Release host resources
221     cleanup_host();
222 
223     // burn up some CPU time if needed
224     //
225     if (cpu_time) {
226         printf("\nBurning up some CPU time ... \n");
227         double start = dtime();
228         for (int i=0; ; i++) {
229             double e = dtime()-start;
230             if (e > cpu_time) break;
231             fd = .5 + .5*(e/cpu_time);
232             boinc_fraction_done(fd);
233 
234             if (boinc_time_to_checkpoint()) {
235                 retval = do_checkpoint(out, NUM_ITERATIONS, input, matrixSize);
236                 if (retval) {
237                     fprintf(stderr,
238                         "ERROR: %s APP: maxtrix_inversion checkpoint failed %d\n",
239                         boinc_msg_prefix(buf, sizeof(buf)), retval
240                     );
241                     exit(1);
242                 }
243                 boinc_checkpoint_completed();
244             }
245             comp_result = do_a_giga_flop(i);
246         }
247     }
248     boinc_fraction_done(1);
249 
250 #ifdef APP_GRAPHICS
251     update_shmem();
252 #endif
253 
254     if (boinc_is_standalone()) {
255         printf("\nDone! Please press ENTER to exit. ");
256         getchar();
257     }
258     boinc_finish(0);
259 }
260 
261 
262 /*** BOINC FUNCTION DEFINITIONS ***/
263 
264 /* Do a billion floating-point ops */
do_a_giga_flop(int foo)265 static double do_a_giga_flop(int foo) {
266     double x = 3.14159*foo;
267     int i;
268     for (i=0; i<500000000; i++) {
269         x += 5.12313123;
270         x *= 0.5398394834;
271     }
272     return x;
273 }
274 
275 /* Save the computation state into checkpoint file */
do_checkpoint(MFILE & mf,int n,cl_float * input,int matrixSize)276 int do_checkpoint(MFILE& mf, int n, cl_float *input, int matrixSize) {
277     int retval;
278     string resolved_name;
279 
280     FILE* f = fopen("temp", "w");
281     if (!f) return 1;
282     fprintf(f, "%d", n); //write inversion number
283     fprintf(f, " ");
284     fprintf(f, "%d", matrixSize); //write matrixSize
285     fprintf(f, " ");
286     for (int i=0;i<matrixSize*matrixSize;++i) {
287         fprintf(f, " ");
288         fprintf(f, "%f", input[i]);
289     }
290     fclose(f);
291     retval = mf.flush();
292     if (retval) return retval;
293     boinc_resolve_filename_s(CHECKPOINT_FILE, resolved_name);
294     retval = boinc_rename("temp", resolved_name.c_str());
295     if (retval) return retval;
296     return 0; //return 0 to indicate success.
297 }
298 
299 /*** FUNCTION DEFINITIONS ***/
300 
301 /* Create an input file filled with random data of type cl_float. */
generate_random_input_file(int n)302 void generate_random_input_file(int n) {
303     FILE *infile;
304 
305     infile=fopen(INPUT_FILENAME,"w");
306     cl_float *input = (cl_float *)malloc(sizeof(cl_float)*(n*n));
307     srand(n);
308     for( int i = 0; i < n; i++ ) {
309         for (int j = 0; j < n; j++) {
310             input[i*n+j] = 2.0*(rand()%32768)/32768.0 - 1.0;
311         }
312         input[i*n+i] += sqrt((float)n);
313     }
314     int j=0;
315     for (int i=0;i<n*n;++i) {
316         fprintf(infile,"%15f",input[i]);
317         if (j+1==n) {
318             fprintf(infile,"\n");
319             j=0;
320         } else {
321             ++j;
322         }
323     }
324     fclose(infile);
325     free(input);
326 }
327 
328 /*
329  * Parse the input file and determine the size of the matrix.
330  * This is an nxn matrix. Note: if width<> height, the matrix is
331  * non-invertible.
332  */
get_matrix_size(FILE * infile)333 int get_matrix_size(FILE *infile) {
334     int w=0;
335     char c;
336 
337     fseek(infile,0,SEEK_SET);
338     while (true) {
339         do {
340             c=fgetc(infile);
341             if (c == EOF || c == '\n') {
342                 goto exitLoop;
343             }
344         } while (isspace(c));
345 
346         if (isdigit(c) || c=='.' || c=='-') {
347             ++w;
348         }
349 
350         do {
351             c=fgetc(infile);
352             if (c == EOF || c == '\n') {
353                 goto exitLoop;
354             }
355         } while (isdigit(c) || c=='.' || c=='-');
356 
357         if (c==EOF || c == '\n') {
358             break;
359         }
360     }
361     exitLoop:
362     return w;
363 }
364 
365 /*
366  * \brief Host Initialization
367  *        Allocate and initialize memory
368  *        on the host. Print input array.
369  */
initialize_host(FILE * infile)370 int initialize_host(FILE *infile) {
371     input  = NULL;
372     output = NULL;
373 
374     if (width!=height) {
375         fprintf(stderr, "Error: non nxn matrix cannot be invertiable.\n");
376         return 1;
377     }
378 
379     /////////////////////////////////////////////////////////////////
380     // Allocate and initialize memory used by host
381     /////////////////////////////////////////////////////////////////
382     cl_uint sizeInBytes = width * height * sizeof(cl_float);
383     input = (cl_float *) malloc(sizeInBytes);
384     if (input == NULL) {
385         fprintf(stderr, "Error: Failed to allocate input memory on host\n");
386         return 1;
387     }
388 
389     output = (cl_float *) malloc(sizeInBytes);
390     if(output == NULL) {
391         fprintf(stderr, "Error: Failed to allocate output memory on host\n");
392         return 1;
393     }
394 
395     //fillRandom(input,width,height);
396     fetch_elements_into_host_memory(infile,input);
397     return 0;
398 }
399 
400 /*
401  * Read the float values from input file into "input" array.
402  */
fetch_elements_into_host_memory(FILE * infile,cl_float * input)403 void fetch_elements_into_host_memory(FILE *infile, cl_float *input) {
404     float num=0;
405     int i=0;
406     if (!isStateFileInUse) {
407         fseek(infile,0,SEEK_SET);
408     }
409     while (fscanf(infile,"%f",&num)==1) {
410         input[i]=num;
411         ++i;
412     }
413 }
414 
415 /*
416  * Converts the contents of a file into a string
417  */
convert_to_string(const char * fileName)418 char * convert_to_string(const char *fileName) {
419     int count=0;
420     char *s;
421     char c;
422     int i=0;
423 
424     // look for "openclapp_kernels.cl" in "boinc/samples/openclapp/debug" or
425     // in "boinc/samples/openclapp/release". Note that "openclapp_kernels.cl"
426     // is automatically copied to these directories along the building process.
427     FILE *infile=fopen(fileName,"r");
428     if (!infile) { //not found. This typically happens on Linux or Mac.
429         //look for "openclapp_kernels.cl" in "boinc/sample/openclapp/" instead.
430         infile = fopen(KERNELS_FILEPATH,"r");
431         if (!infile) {
432             fprintf(stderr, "ERROR: Failed to open file %s!", fileName);
433             exit(0);
434         }
435     }
436     fseek(infile,0,SEEK_SET);
437     while (fgetc(infile)!=EOF) count++;
438     s=(char *) malloc(sizeof(char)*(count+1)); //add 1 for string terminator.
439     fseek(infile,0,SEEK_SET);
440     while ((c=fgetc(infile))!=EOF) {
441         s[i++]=c;
442     }
443     s[i]='\0';
444     fclose(infile);
445     return s;
446 }
447 
448 /*
449  * \brief OpenCL related initialization
450  *        Create Context, Device list, Command Queue
451  *        Load CL file, compile, link CL source
452  *		  Build program and kernel objects
453  */
454 
455  // Note: OpenCL memory buffer objects will be created in invert
456  //       function before kernel calls are made.
initialize_cl(int argc,char * argv[])457 int initialize_cl(int argc, char * argv[]) {
458     cl_int status = 0;
459     int retval = 0;
460 
461     localThreads[0]  = LOCAL_WORK_SIZE;
462     globalThreads[0] = GLOBAL_WORK_SIZE;
463     cl_platform_id platform = NULL;
464     cl_device_id device;
465 
466     // IMPORTANT NOTE: production applications should always specify
467     // the GPU type (vendor) in the call to boinc_get_opencl_ids as
468     // the third argument: it must be either PROC_TYPE_NVIDIA_GPU,
469     // PROC_TYPE_AMD_GPU or PROC_TYPE_INTEL_GPU.  This is to support
470     // older versions of the BOINC client which do not include the
471     // <gpu-type> field in the init_data.xml file.
472     //
473     // This sample passes -1 for the type argument to allow using
474     // just one sample for any GPU vendor (AMD, NVIDIA or Intel.)
475     // As a result, the init_data.xml file for this sample must
476     // specify the GPU type (vendor) and either gpu_device_num (the
477     // GPU's index from that vendor) or gpu_opencl_dev_index (the
478     // GPU's index among OpenCL-capable devices from that vendor.)
479     //
480     // See the ReadMe file for more details, including an explanation
481     // of the difference between the gpu_device_num and the
482     // gpu_opencl_dev_index.
483     retval = boinc_get_opencl_ids(argc, argv, -1, &device, &platform);
484     if (retval) {
485         fprintf(stderr,
486             "Error: boinc_get_opencl_ids() failed with error %d\n",
487             retval
488         );
489         return 1;
490     }
491 
492    cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM,
493                                      (cl_context_properties)platform,
494                                      0
495                                    };
496     context = clCreateContext(cps, 1, &device, NULL, NULL, &status);
497     if (status != CL_SUCCESS) {
498         fprintf(stderr, "Error: clCreateContext() returned %d\n", status);
499         return 1;
500     }
501 
502     /////////////////////////////////////////////////////////////////
503     // Create an OpenCL command queue
504     /////////////////////////////////////////////////////////////////
505     commandQueue = clCreateCommandQueue(context, device, 0, &status);
506     if(status != CL_SUCCESS) {
507         fprintf(stderr,
508             "Error: Creating Command Queue. (clCreateCommandQueue) returned %d\n",
509             status
510         );
511         return 1;
512     }
513 
514     /////////////////////////////////////////////////////////////////
515     // Load CL file, build CL program object, create CL kernel object
516     /////////////////////////////////////////////////////////////////
517     source = convert_to_string(KERNELS_FILENAME);
518     size_t sourceSize[]    = { strlen(source) };
519     program = clCreateProgramWithSource(context, 1, &source, sourceSize, &status);
520     if (status != CL_SUCCESS) {
521         fprintf(stderr,
522             "Error: Loading Binary into cl_program (clCreateProgramWithBinary) returned %d\n",
523             status
524         );
525 
526         return 1;
527     }
528 
529     /* create a cl program executable for all the devices specified */
530     status = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
531     if (status != CL_SUCCESS)  {
532         fprintf(stderr,
533             "Error: Building Program (clBuildProgram) returned %d\n",
534             status
535         );
536 
537         return 1;
538     }
539 
540     /* get a kernel object handle for a kernel with the given name */
541     GEStep1A_kernel = clCreateKernel(program, "GEStep1A", &status);
542     if (status != CL_SUCCESS) {
543         fprintf(stderr,
544             "Error: clCreateKernel (GEStep1A) returned %d\n",
545             status
546         );
547 
548         return 1;
549     }
550 
551     GEStep2_kernel = clCreateKernel(program, "GEStep2", &status);
552     if (status != CL_SUCCESS) {
553         fprintf(stderr,
554             "Error: clCreateKernel (GEStep2) returned %d\n",
555             status
556         );
557 
558         return 1;
559     }
560 
561 	GEStep3_kernel = clCreateKernel(program, "GEStep3", &status);
562     if (status != CL_SUCCESS) {
563         fprintf(stderr,
564             "Error: clCreateKernel (GEStep3) returned %d\n",
565             status
566         );
567 
568         return 1;
569     }
570 
571     return 0;
572 }
573 
574 /*
575  * \brief Release OpenCL resources (Context, Memory etc.)
576  */
cleanup_cl(void)577 int cleanup_cl(void) {
578     cl_int status;
579 
580     status = clReleaseKernel(GEStep1A_kernel);
581     if (status != CL_SUCCESS) {
582         fprintf(stderr,
583             "Error: In clReleaseKernel (GEStep1A_kernel) returned %d\n",
584             status
585         );
586         return 1;
587     }
588 
589     status = clReleaseKernel(GEStep2_kernel);
590     if (status != CL_SUCCESS) {
591         fprintf(stderr,
592             "Error: In clReleaseKernel (GEStep2_kernel) returned %d\n",
593             status
594         );
595         return 1;
596 	}
597 
598     status = clReleaseKernel(GEStep3_kernel);
599     if (status != CL_SUCCESS) {
600         fprintf(stderr,
601             "Error: In clReleaseKernel (GEStep3_kernel) returned %d\n",
602             status
603         );
604         return 1;
605     }
606 
607     status = clReleaseProgram(program);
608     if (status != CL_SUCCESS) {
609         fprintf(stderr,
610             "Error: clReleaseProgram returned %d\n",
611             status
612         );
613         return 1;
614     }
615 
616     status = clReleaseCommandQueue(commandQueue);
617     if (status != CL_SUCCESS) {
618         fprintf(stderr,
619             "Error: In clReleaseCommandQueue returned %d\n",
620             status
621         );
622         return 1;
623     }
624 
625     status = clReleaseContext(context);
626     if (status != CL_SUCCESS) {
627         fprintf(stderr,
628             "Error: In clReleaseContext returned %d\n",
629             status
630         );
631         return 1;
632     }
633 
634     return 0;
635 }
636 
637 /*
638  * \brief Releases program's resources
639  */
cleanup_host(void)640 void cleanup_host(void) {
641     if (input != NULL) {
642         free(input);
643         input = NULL;
644     }
645 
646     if (output != NULL) {
647         free(output);
648         output = NULL;
649     }
650 
651     if (source != NULL) {
652         free((char *)source);
653         source = NULL;
654     }
655 }
656 
657 /*
658  * Write the result to output file
659  */
print_to_file(MFILE * out,float * h_odata,int n)660 void print_to_file(MFILE *out, float *h_odata, int n) {
661     int count=0;
662     int move=0;
663     int num_elements=n*n;
664     while (num_elements>0) {
665         out->printf("%15f    ",h_odata[move]);
666         ++count;
667         ++move;
668         if (count==n) {
669             out->printf("\n");
670             count=0;
671         }
672         --num_elements;
673     }
674 }
675 
676 /*
677  * \brief Run OpenCL program
678  *
679  *        Bind host variables to kernel arguments
680  *		  Run the CL kernel
681  */
run_GEStep1A_kernel(cl_float * AI,int i,int n2,int lda2)682 int run_GEStep1A_kernel(cl_float * AI, int i, int n2, int lda2) {
683     cl_int status;
684 
685     /*
686      * the input array to the kernel. This array will eventually be modified
687      * to the inverted array.
688      */
689     status = clSetKernelArg(GEStep1A_kernel, 0, sizeof(cl_mem), (void *)&inputBuffer);
690     if (status != CL_SUCCESS) {
691         fprintf(stderr,
692             "Error: Setting kernel argument. (input) returned %d\n",
693             status
694         );
695         return 1;
696     }
697 
698     /*i*/
699     status = clSetKernelArg(GEStep1A_kernel, 1, sizeof(int), (void *)&i);
700     if (status != CL_SUCCESS) {
701         fprintf(stderr,
702             "Error: Setting kernel argument. (i) returned %d\n",
703             status
704         );
705         return 1;
706     }
707 
708     /*n2*/
709     status = clSetKernelArg(GEStep1A_kernel, 2, sizeof(int), (void *)&n2);
710     if (status != CL_SUCCESS) {
711         fprintf(stderr,
712             "Error: Setting kernel argument. (n2) returned %d\n",
713             status
714         );
715         return 1;
716     }
717 
718     /*lda2*/
719     status = clSetKernelArg(GEStep1A_kernel, 3, sizeof(int), (void *)&lda2);
720     if (status != CL_SUCCESS) {
721         fprintf(stderr,
722             "Error: Setting kernel argument. (lda2) returned %d\n",
723             status
724         );
725         return 1;
726     }
727 
728     /*
729      * Enqueue a kernel run call.
730      */
731     status = clEnqueueNDRangeKernel(commandQueue,
732                                     GEStep1A_kernel,
733                                     1,
734                                     NULL,
735                                     globalThreads,
736                                     localThreads,
737                                     0,
738                                     NULL,
739                                     NULL);
740 
741     if (status != CL_SUCCESS) {
742         fprintf(stderr,
743             "Error: Enqueueing kernel onto command queue. (clEnqueueNDRangeKernel) returned %d\n",
744             status
745         );
746         return 1;
747     }
748 
749     clFinish(commandQueue);
750 
751     /* Enqueue readBuffer*/  //Note: we are reading back from inputBuffer since AI is modified directly in kernel
752     status = clEnqueueReadBuffer(commandQueue,
753                                  inputBuffer,
754                                  CL_FALSE,
755                                  0,
756                                  globalThreads[0] * sizeof(cl_float),
757                                  AI,
758                                  0,
759                                  NULL,
760                                  NULL);
761 
762     if(status != CL_SUCCESS) {
763         fprintf(stderr,
764             "Error: clEnqueueReadBuffer failed. (clEnqueueReadBuffer) returned %d\n",
765             status
766         );
767         return 1;
768     }
769 
770     return 0;
771 }
772 
run_GEStep2_kernel(cl_float * AI,cl_float diag,int i,int n2,int lda2)773 int run_GEStep2_kernel(cl_float * AI, cl_float diag, int i, int n2, int lda2) {
774     cl_int status;
775 
776     /*
777      * the input array to the kernel. This array will eventually be modified
778      * to the inverted array.
779      */
780     status = clSetKernelArg(GEStep2_kernel, 0, sizeof(cl_mem), (void *)&inputBuffer);
781     if (status != CL_SUCCESS) {
782         fprintf(stderr,
783             "Error: Setting kernel argument. (AI) returned %d\n",
784             status
785         );
786         return 1;
787     }
788 
789     /*diag*/
790     status = clSetKernelArg(GEStep2_kernel, 1, sizeof(cl_float), (void *)&diag);
791     if (status != CL_SUCCESS) {
792         fprintf(stderr,
793             "Error: Setting kernel argument. (diag) returned %d\n",
794             status
795         );
796         return 1;
797     }
798 
799     /*i*/
800     status = clSetKernelArg(GEStep2_kernel, 2, sizeof(int), (void *)&i);
801     if (status != CL_SUCCESS) {
802         fprintf(stderr,
803             "Error: Setting kernel argument. (i) returned %d\n",
804             status
805         );
806         return 1;
807     }
808 
809     /*n2*/
810     status = clSetKernelArg(GEStep2_kernel, 3, sizeof(int), (void *)&n2);
811     if (status != CL_SUCCESS) {
812         fprintf(stderr,
813             "Error: Setting kernel argument. (n2) returned %d\n",
814             status
815         );
816         return 1;
817     }
818 
819     /*lda2*/
820     status = clSetKernelArg(GEStep2_kernel, 4, sizeof(int), (void *)&lda2);
821     if (status != CL_SUCCESS) {
822         fprintf(stderr,
823             "Error: Setting kernel argument. (lda2) returned %d\n",
824             status
825         );
826         return 1;
827     }
828 
829     /*
830      * Enqueue a kernel run call.
831      */
832     status = clEnqueueNDRangeKernel(commandQueue,
833                                     GEStep2_kernel,
834                                     1,
835 									NULL,
836                                     globalThreads,
837                                     localThreads,
838                                     0,
839                                     NULL,
840                                     NULL);
841 
842     if (status != CL_SUCCESS) {
843         fprintf(stderr,
844             "Error: Enqueueing kernel onto command queue. (clEnqueueNDRangeKernel) returned %d\n",
845             status
846         );
847         return 1;
848     }
849 
850     clFinish(commandQueue);
851 
852     /* Enqueue readBuffer*/
853 	//Note: we are reading back from inputBuffer since AI is modified directly in kernel
854     status = clEnqueueReadBuffer(commandQueue,
855                                  inputBuffer,
856                                  CL_FALSE,
857                                  0,
858                                  globalThreads[0] * sizeof(cl_float),
859                                  AI,
860                                  0,
861                                  NULL,
862                                  NULL);
863     if (status != CL_SUCCESS) {
864         fprintf(stderr, "Error: clEnqueueReadBuffer failed. (clEnqueueReadBuffer) returned %d\n", status);
865         return 1;
866     }
867 
868     return 0;
869 }
870 
run_GEStep3_kernel(cl_float * AI,int i,int n2,int lda2)871 int run_GEStep3_kernel(cl_float * AI, int i, int n2, int lda2) {
872     cl_int status;
873 
874     /*
875      * The input array to the kernel. This array will eventually be modified
876      * to the inverted array.
877      */
878     status = clSetKernelArg(GEStep3_kernel, 0, sizeof(cl_mem), (void *)&inputBuffer);
879     if (status != CL_SUCCESS) {
880         fprintf(stderr,
881             "Error: Setting kernel argument. (input) returned %d\n",
882             status
883         );
884         return 1;
885     }
886 
887     /*i*/
888     status = clSetKernelArg(GEStep3_kernel, 1, sizeof(int), (void *)&i);
889     if (status != CL_SUCCESS) {
890         fprintf(stderr,
891             "Error: Setting kernel argument. (i) returned %d\n",
892             status
893         );
894         return 1;
895     }
896 
897     /*n2*/
898     status = clSetKernelArg(GEStep3_kernel, 2, sizeof(int), (void *)&n2);
899     if (status != CL_SUCCESS) {
900         fprintf(stderr,
901             "Error: Setting kernel argument. (n2) returned %d\n",
902             status
903         );
904         return 1;
905     }
906 
907 	/*lda2*/
908     status = clSetKernelArg(GEStep3_kernel, 3, sizeof(int), (void *)&lda2);
909     if (status != CL_SUCCESS) {
910         fprintf(stderr,
911             "Error: Setting kernel argument. (lda2) returned %d\n",
912             status
913         );
914         return 1;
915     }
916 
917     /*
918      * Enqueue a kernel run call.
919      */
920     status = clEnqueueNDRangeKernel(commandQueue,
921                                     GEStep3_kernel,
922                                     1,
923                                     NULL,
924                                     globalThreads,
925                                     localThreads,
926                                     0,
927                                     NULL,
928                                     NULL);
929 
930     if (status != CL_SUCCESS) {
931         fprintf(stderr,
932             "Error: Enqueueing kernel onto command queue. (clEnqueueNDRangeKernel) returned %d\n",
933             status
934         );
935         return 1;
936     }
937 
938     clFinish(commandQueue);
939 
940     /* Enqueue readBuffer*/
941 	//Note: we are reading back from inputBuffer since AI is modified directly in kernel
942     status = clEnqueueReadBuffer(commandQueue,
943                                  inputBuffer,
944                                  CL_TRUE,
945                                  0,
946                                  globalThreads[0] * sizeof(cl_float),
947                                  AI,
948                                  0,
949                                  NULL,
950                                  NULL);
951     if (status != CL_SUCCESS) {
952         fprintf(stderr,
953             "Error: clEnqueueReadBuffer failed. (clEnqueueReadBuffer) returned %d\n",
954             status
955         );
956         return 1;
957     }
958 
959 	return 0;
960 }
961 
invertge(cl_float * AI_d,int lda,int n)962 void invertge(cl_float * AI_d, int lda, int n) {
963     int lda2 = lda * 2;
964     // perform elementary row operations till A in AI becomes identity matrix
965     for (int i = 0; i < n; i++) {
966         // execute kernel
967         run_GEStep1A_kernel(AI_d,i,n*2, lda2);
968     }
969 
970     for (int i = n-1; i >= 0; i--) {
971         cl_float diag = 1.0;
972         diag=AI_d[i*lda2+i];
973         // execute kernels
974         run_GEStep2_kernel(AI_d,diag,i,n*2, lda2);
975         run_GEStep3_kernel(AI_d,i,n*2, lda2);
976     }
977 }
978 
979 /* inverts nxn matrix input and stores the result in output */
invert(cl_float * input,cl_float * output,int n)980 void invert(cl_float * input, cl_float *output, int n) {
981     printf("starting inversion n = %d ", n);
982     volatile clock_t gputime;
983     gputime=clock();
984 
985     int lda = (((n+15)&(~15))|16);
986     cl_float * AI_d = (cl_float *)malloc(sizeof(cl_float)*n*lda*2);
987     memset(AI_d,0,sizeof(cl_float)*n*lda*2);
988     for (int i = 0; i < n; i++) {
989         memcpy(&AI_d[lda*i*2], &input[n*i], sizeof(cl_float)*n);
990         AI_d[lda*i*2+n+i] = 1;
991     }
992 
993     cl_int status;
994 
995     /////////////////////////////////////////////////////////////////
996     // Create OpenCL memory buffer
997     /////////////////////////////////////////////////////////////////
998     inputBuffer = clCreateBuffer(context,
999                                  CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
1000                                  sizeof(cl_float) * globalThreads[0],
1001                                  AI_d,
1002                                  &status);
1003     if (status != CL_SUCCESS) {
1004         fprintf(stderr,
1005             "Error: clCreateBuffer (inputBuffer) returned %d\n",
1006             status
1007         );
1008         exit(0);
1009     }
1010 
1011     // Note: there's no output buffer. In kernel, AI_d is modified directly.
1012 	// Thus, we should read the result back to host from inputBuffer as well.
1013     invertge(AI_d, lda, n);
1014     gputime=clock()-gputime;
1015     fprintf(stderr, " %7.1f ms ",gputime/1.e3f);
1016     fprintf(stderr, " %7.2f Gflops", 1e-3*(3.0)*n*n*n/3.0/gputime);
1017 
1018 #ifdef VERIFY
1019     // let's verify that
1020     cl_float error=0.0;
1021 
1022     // multiply inverse*xcopy, should be Identity matrix
1023     for (int k = 0; k < n; k++) {
1024         for (int j = 0; j < n; j++) {
1025             cl_float sum = 0;
1026             for (int i = 0; i < n; i++) {
1027                 sum += AI[j*lda*2+n+i]*A[i*n+k];
1028 	        }
1029             if (j!=k) {
1030                 error += sum * sum;
1031             } else {
1032                 error += (1.0-sum) * (1.0-sum);
1033             }
1034         }
1035     }
1036     fprintf(stderr, " %6.2f SSE", error);
1037 #endif
1038 
1039     //copy the result to output
1040     for (int i = 0; i < n; i++) {
1041         memcpy(&output[n*i], &AI_d[lda*i*2+n], sizeof(cl_float)*n);
1042     }
1043 
1044     status = clReleaseMemObject(inputBuffer);
1045     if (status != CL_SUCCESS) {
1046         fprintf(stderr,
1047             "Error: In clReleaseMemObject (inputBuffer) returned %d\n",
1048             status
1049         );
1050     }
1051 
1052     free(AI_d);
1053     fprintf(stderr," done!\n");
1054 }
1055