1 // This file is part of BOINC.
2 // http://boinc.berkeley.edu
3 // Copyright (C) 2013 University of California
4 //
5 // BOINC is free software; you can redistribute it and/or modify it
6 // under the terms of the GNU Lesser General Public License
7 // as published by the Free Software Foundation,
8 // either version 3 of the License, or (at your option) any later version.
9 //
10 // BOINC is distributed in the hope that it will be useful,
11 // but WITHOUT ANY WARRANTY; without even the implied warranty of
12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
13 // See the GNU Lesser General Public License for more details.
14 //
15 // You should have received a copy of the GNU Lesser General Public License
16 // along with BOINC. If not, see <http://www.gnu.org/licenses/>.
17 //
18 // This program serves as both
19 // - An example BOINC-OpenCL application, illustrating the use of the BOINC API
20 // and OpenCL API.
21 // - A program for testing various features of BOINC.
22 //
23 // The program reads the input nxn matrix from the "input" file, inverts the
24 // matrix NUM_ITERATIONS times and write to "output" file.
25 //
26 // To run, place the executable in the same directory as an init_data.xml
27 // file specifying the gpu_type (vendor) and gpu_device_num, then invoke
28 // form the command line as follows:
29 // $ cd to/the/directory/containing/executable/and/init_data.mxl/file
30 // $ ./openclapp [options]
31 //
32 // command line options
33 // -run_slow: sleep 1 second after each character
34 // -cpu_time N: use about N CPU seconds after copying files
35 // -early_exit: exit(10) after 30 iterations
36 // -early_crash: crash after 30 iterations
37 //
38 // See http://boinc.berkeley.edu/trac/wiki/GPUApp for any compiling issues.
39 // Original contributor: Tuan Le (tuanle86@berkeley.edu)
40
41 #include "openclapp.hpp"
42 #include "boinc_opencl.h"
43
44 using std::string;
45
main(int argc,char * argv[])46 int main(int argc, char * argv[]) {
47 int i, retval, lastInversion=0, checkpointExists=0, matrixSize=0;
48 double fd;
49 char input_path[512], output_path[512], chkpt_path[512], buf[256];
50 MFILE out;
51 FILE* state, *infile;
52
53 generate_random_input_file(MATRIX_SIZE); //call this if you don't want to
54 //construct the input file manually
55 for (i=0; i<argc; i++) {
56 if (!strcmp(argv[i], "-early_exit")) early_exit = true;
57 if (!strcmp(argv[i], "-early_crash")) early_crash = true;
58 if (!strcmp(argv[i], "-run_slow")) run_slow = true;
59 if (!strcmp(argv[i], "-cpu_time")) {
60 cpu_time = atof(argv[++i]);
61 }
62 }
63
64 retval = boinc_init();
65 if (retval) {
66 fprintf(stderr,
67 "ERROR: %s boinc_init returned %d\n",
68 boinc_msg_prefix(buf, sizeof(buf)), retval
69 );
70 exit(retval);
71 }
72
73 // open the input file (resolve logical name first)
74 //
75 boinc_resolve_filename(INPUT_FILENAME, input_path, sizeof(input_path));
76 infile = boinc_fopen(input_path, "r");
77 if (!infile) {
78 fprintf(stderr,
79 "ERROR: %s Couldn't find input file, resolved name %s.\n",
80 boinc_msg_prefix(buf, sizeof(buf)), input_path
81 );
82 getchar();
83 exit(-1);
84 }
85
86 boinc_resolve_filename(OUTPUT_FILENAME, output_path, sizeof(output_path));
87
88 // See if there's a valid checkpoint file.
89 // If so retrieve the current matrix and inversion number
90 //
91 boinc_resolve_filename(CHECKPOINT_FILE, chkpt_path, sizeof(chkpt_path));
92 state = boinc_fopen(chkpt_path, "r");
93 if (state) {
94 printf("Checkpoint file is detected. Read from checkpoint file ... \n");
95 checkpointExists=fscanf(state, "%d", &lastInversion);
96 if (checkpointExists == 1) {
97 isStateFileInUse=true;
98 printf("Last inversion # is : %d\n",lastInversion);
99 fscanf(state,"%d",&matrixSize);
100 width=height=matrixSize;
101 printf("Initialize host ....\n");
102 initialize_host(state);
103 }
104 fclose(state);
105 } else {
106 printf("There's no valid checkpoint file!\n");
107 }
108
109 retval = out.open(output_path, "wb");
110
111 if (retval) {
112 fprintf(stderr,
113 "ERROR: %s APP: matrix_inversion output open failed:\n",
114 boinc_msg_prefix(buf, sizeof(buf))
115 );
116 fprintf(stderr,
117 "ERROR: %s resolved name %s, retval %d\n",
118 boinc_msg_prefix(buf, sizeof(buf)), output_path, retval
119 );
120 perror("open");
121 exit(1);
122 }
123
124 #ifdef APP_GRAPHICS
125 // create shared mem segment for graphics, and arrange to update it
126 //
127 shmem = (UC_SHMEM*)boinc_graphics_make_shmem("matrix_inversion", sizeof(UC_SHMEM));
128 if (!shmem) {
129 fprintf(stderr,
130 "ERROR: %s failed to create shared mem segment\n",
131 boinc_msg_prefix(buf, sizeof(buf))
132 );
133 }
134 update_shmem();
135 boinc_register_timer_callback(update_shmem);
136 #endif
137
138 if (checkpointExists != 1) { //checkpoint file is not found.
139 matrixSize=get_matrix_size(infile);
140 printf("Matrix Size: width = height = %d\n",matrixSize);
141 width=height=matrixSize;
142 // Initialize Host application
143 printf("Initialize host ....\n");
144 if (initialize_host(infile)==1) {
145 return 1;
146 }
147 out.printf("\n----------------- Before being inversed ----------------\n\n");
148 printf("Computation is running ... Inverse the matrix %d times. Start at inversion #1\n",
149 NUM_ITERATIONS);
150 } else {
151 out.printf("\n----------------- Last checkpointed inversion #%d ----------------\n\n",
152 lastInversion);
153 printf("Computation is resumed ... Inverse the matrix %d more times. Start at inversion #%d\n",
154 NUM_ITERATIONS-lastInversion,lastInversion+1);
155 }
156
157 // Initialize OpenCL resources
158 if (initialize_cl(argc, argv) != 0) {
159 return 1;
160 }
161
162 print_to_file(&out,input,matrixSize);
163
164 for (int i=lastInversion+1;i<=NUM_ITERATIONS;++i) {
165 //the invert function will trigger kernel calls.
166 invert(input,output,matrixSize);
167 printf("Finish inversion #%d\n",i);
168 for (int j=0;j<matrixSize*matrixSize;++j) {
169 input[j]=output[j]; //change the input for the next iteration
170 }
171 if (run_slow) {
172 boinc_sleep(1.);
173 }
174
175 if (early_exit && i>30) {
176 exit(-10);
177 }
178
179 if (early_crash && i>30) {
180 boinc_crash();
181 }
182
183 if (boinc_time_to_checkpoint()) {
184 printf("Perform checkpointing at inversion # %d\n",i);
185 //we'll need to write the current matrix to the state file.
186 retval = do_checkpoint(out, i, input, matrixSize);
187 if (retval) {
188 fprintf(stderr,
189 "ERROR: %s APP: matrix_inversion checkpoint failed %d\n",
190 boinc_msg_prefix(buf, sizeof(buf)), retval
191 );
192 exit(retval);
193 }
194 boinc_checkpoint_completed();
195 }
196 fd = i/NUM_ITERATIONS;
197 if (cpu_time) fd /= 2;
198 boinc_fraction_done(fd);
199 }
200
201 out.printf("\n\n----------------- Final inversion #%d ----------------\n\n",
202 NUM_ITERATIONS);
203 print_to_file(&out,output,matrixSize);
204
205 retval = out.flush(); //force the output file to be closed.
206 if (retval) {
207 fprintf(stderr,
208 "ERROR: %s APP: matrix_inversion flush failed %d\n",
209 boinc_msg_prefix(buf, sizeof(buf)), retval
210 );
211 exit(1);
212 }
213
214 // Releases OpenCL resources
215 if (cleanup_cl()==1) {
216 fprintf(stderr, "Error from cleanup_cl() !");
217 return 1;
218 }
219
220 // Release host resources
221 cleanup_host();
222
223 // burn up some CPU time if needed
224 //
225 if (cpu_time) {
226 printf("\nBurning up some CPU time ... \n");
227 double start = dtime();
228 for (int i=0; ; i++) {
229 double e = dtime()-start;
230 if (e > cpu_time) break;
231 fd = .5 + .5*(e/cpu_time);
232 boinc_fraction_done(fd);
233
234 if (boinc_time_to_checkpoint()) {
235 retval = do_checkpoint(out, NUM_ITERATIONS, input, matrixSize);
236 if (retval) {
237 fprintf(stderr,
238 "ERROR: %s APP: maxtrix_inversion checkpoint failed %d\n",
239 boinc_msg_prefix(buf, sizeof(buf)), retval
240 );
241 exit(1);
242 }
243 boinc_checkpoint_completed();
244 }
245 comp_result = do_a_giga_flop(i);
246 }
247 }
248 boinc_fraction_done(1);
249
250 #ifdef APP_GRAPHICS
251 update_shmem();
252 #endif
253
254 if (boinc_is_standalone()) {
255 printf("\nDone! Please press ENTER to exit. ");
256 getchar();
257 }
258 boinc_finish(0);
259 }
260
261
262 /*** BOINC FUNCTION DEFINITIONS ***/
263
264 /* Do a billion floating-point ops */
do_a_giga_flop(int foo)265 static double do_a_giga_flop(int foo) {
266 double x = 3.14159*foo;
267 int i;
268 for (i=0; i<500000000; i++) {
269 x += 5.12313123;
270 x *= 0.5398394834;
271 }
272 return x;
273 }
274
275 /* Save the computation state into checkpoint file */
do_checkpoint(MFILE & mf,int n,cl_float * input,int matrixSize)276 int do_checkpoint(MFILE& mf, int n, cl_float *input, int matrixSize) {
277 int retval;
278 string resolved_name;
279
280 FILE* f = fopen("temp", "w");
281 if (!f) return 1;
282 fprintf(f, "%d", n); //write inversion number
283 fprintf(f, " ");
284 fprintf(f, "%d", matrixSize); //write matrixSize
285 fprintf(f, " ");
286 for (int i=0;i<matrixSize*matrixSize;++i) {
287 fprintf(f, " ");
288 fprintf(f, "%f", input[i]);
289 }
290 fclose(f);
291 retval = mf.flush();
292 if (retval) return retval;
293 boinc_resolve_filename_s(CHECKPOINT_FILE, resolved_name);
294 retval = boinc_rename("temp", resolved_name.c_str());
295 if (retval) return retval;
296 return 0; //return 0 to indicate success.
297 }
298
299 /*** FUNCTION DEFINITIONS ***/
300
301 /* Create an input file filled with random data of type cl_float. */
generate_random_input_file(int n)302 void generate_random_input_file(int n) {
303 FILE *infile;
304
305 infile=fopen(INPUT_FILENAME,"w");
306 cl_float *input = (cl_float *)malloc(sizeof(cl_float)*(n*n));
307 srand(n);
308 for( int i = 0; i < n; i++ ) {
309 for (int j = 0; j < n; j++) {
310 input[i*n+j] = 2.0*(rand()%32768)/32768.0 - 1.0;
311 }
312 input[i*n+i] += sqrt((float)n);
313 }
314 int j=0;
315 for (int i=0;i<n*n;++i) {
316 fprintf(infile,"%15f",input[i]);
317 if (j+1==n) {
318 fprintf(infile,"\n");
319 j=0;
320 } else {
321 ++j;
322 }
323 }
324 fclose(infile);
325 free(input);
326 }
327
328 /*
329 * Parse the input file and determine the size of the matrix.
330 * This is an nxn matrix. Note: if width<> height, the matrix is
331 * non-invertible.
332 */
get_matrix_size(FILE * infile)333 int get_matrix_size(FILE *infile) {
334 int w=0;
335 char c;
336
337 fseek(infile,0,SEEK_SET);
338 while (true) {
339 do {
340 c=fgetc(infile);
341 if (c == EOF || c == '\n') {
342 goto exitLoop;
343 }
344 } while (isspace(c));
345
346 if (isdigit(c) || c=='.' || c=='-') {
347 ++w;
348 }
349
350 do {
351 c=fgetc(infile);
352 if (c == EOF || c == '\n') {
353 goto exitLoop;
354 }
355 } while (isdigit(c) || c=='.' || c=='-');
356
357 if (c==EOF || c == '\n') {
358 break;
359 }
360 }
361 exitLoop:
362 return w;
363 }
364
365 /*
366 * \brief Host Initialization
367 * Allocate and initialize memory
368 * on the host. Print input array.
369 */
initialize_host(FILE * infile)370 int initialize_host(FILE *infile) {
371 input = NULL;
372 output = NULL;
373
374 if (width!=height) {
375 fprintf(stderr, "Error: non nxn matrix cannot be invertiable.\n");
376 return 1;
377 }
378
379 /////////////////////////////////////////////////////////////////
380 // Allocate and initialize memory used by host
381 /////////////////////////////////////////////////////////////////
382 cl_uint sizeInBytes = width * height * sizeof(cl_float);
383 input = (cl_float *) malloc(sizeInBytes);
384 if (input == NULL) {
385 fprintf(stderr, "Error: Failed to allocate input memory on host\n");
386 return 1;
387 }
388
389 output = (cl_float *) malloc(sizeInBytes);
390 if(output == NULL) {
391 fprintf(stderr, "Error: Failed to allocate output memory on host\n");
392 return 1;
393 }
394
395 //fillRandom(input,width,height);
396 fetch_elements_into_host_memory(infile,input);
397 return 0;
398 }
399
400 /*
401 * Read the float values from input file into "input" array.
402 */
fetch_elements_into_host_memory(FILE * infile,cl_float * input)403 void fetch_elements_into_host_memory(FILE *infile, cl_float *input) {
404 float num=0;
405 int i=0;
406 if (!isStateFileInUse) {
407 fseek(infile,0,SEEK_SET);
408 }
409 while (fscanf(infile,"%f",&num)==1) {
410 input[i]=num;
411 ++i;
412 }
413 }
414
415 /*
416 * Converts the contents of a file into a string
417 */
convert_to_string(const char * fileName)418 char * convert_to_string(const char *fileName) {
419 int count=0;
420 char *s;
421 char c;
422 int i=0;
423
424 // look for "openclapp_kernels.cl" in "boinc/samples/openclapp/debug" or
425 // in "boinc/samples/openclapp/release". Note that "openclapp_kernels.cl"
426 // is automatically copied to these directories along the building process.
427 FILE *infile=fopen(fileName,"r");
428 if (!infile) { //not found. This typically happens on Linux or Mac.
429 //look for "openclapp_kernels.cl" in "boinc/sample/openclapp/" instead.
430 infile = fopen(KERNELS_FILEPATH,"r");
431 if (!infile) {
432 fprintf(stderr, "ERROR: Failed to open file %s!", fileName);
433 exit(0);
434 }
435 }
436 fseek(infile,0,SEEK_SET);
437 while (fgetc(infile)!=EOF) count++;
438 s=(char *) malloc(sizeof(char)*(count+1)); //add 1 for string terminator.
439 fseek(infile,0,SEEK_SET);
440 while ((c=fgetc(infile))!=EOF) {
441 s[i++]=c;
442 }
443 s[i]='\0';
444 fclose(infile);
445 return s;
446 }
447
448 /*
449 * \brief OpenCL related initialization
450 * Create Context, Device list, Command Queue
451 * Load CL file, compile, link CL source
452 * Build program and kernel objects
453 */
454
455 // Note: OpenCL memory buffer objects will be created in invert
456 // function before kernel calls are made.
initialize_cl(int argc,char * argv[])457 int initialize_cl(int argc, char * argv[]) {
458 cl_int status = 0;
459 int retval = 0;
460
461 localThreads[0] = LOCAL_WORK_SIZE;
462 globalThreads[0] = GLOBAL_WORK_SIZE;
463 cl_platform_id platform = NULL;
464 cl_device_id device;
465
466 // IMPORTANT NOTE: production applications should always specify
467 // the GPU type (vendor) in the call to boinc_get_opencl_ids as
468 // the third argument: it must be either PROC_TYPE_NVIDIA_GPU,
469 // PROC_TYPE_AMD_GPU or PROC_TYPE_INTEL_GPU. This is to support
470 // older versions of the BOINC client which do not include the
471 // <gpu-type> field in the init_data.xml file.
472 //
473 // This sample passes -1 for the type argument to allow using
474 // just one sample for any GPU vendor (AMD, NVIDIA or Intel.)
475 // As a result, the init_data.xml file for this sample must
476 // specify the GPU type (vendor) and either gpu_device_num (the
477 // GPU's index from that vendor) or gpu_opencl_dev_index (the
478 // GPU's index among OpenCL-capable devices from that vendor.)
479 //
480 // See the ReadMe file for more details, including an explanation
481 // of the difference between the gpu_device_num and the
482 // gpu_opencl_dev_index.
483 retval = boinc_get_opencl_ids(argc, argv, -1, &device, &platform);
484 if (retval) {
485 fprintf(stderr,
486 "Error: boinc_get_opencl_ids() failed with error %d\n",
487 retval
488 );
489 return 1;
490 }
491
492 cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM,
493 (cl_context_properties)platform,
494 0
495 };
496 context = clCreateContext(cps, 1, &device, NULL, NULL, &status);
497 if (status != CL_SUCCESS) {
498 fprintf(stderr, "Error: clCreateContext() returned %d\n", status);
499 return 1;
500 }
501
502 /////////////////////////////////////////////////////////////////
503 // Create an OpenCL command queue
504 /////////////////////////////////////////////////////////////////
505 commandQueue = clCreateCommandQueue(context, device, 0, &status);
506 if(status != CL_SUCCESS) {
507 fprintf(stderr,
508 "Error: Creating Command Queue. (clCreateCommandQueue) returned %d\n",
509 status
510 );
511 return 1;
512 }
513
514 /////////////////////////////////////////////////////////////////
515 // Load CL file, build CL program object, create CL kernel object
516 /////////////////////////////////////////////////////////////////
517 source = convert_to_string(KERNELS_FILENAME);
518 size_t sourceSize[] = { strlen(source) };
519 program = clCreateProgramWithSource(context, 1, &source, sourceSize, &status);
520 if (status != CL_SUCCESS) {
521 fprintf(stderr,
522 "Error: Loading Binary into cl_program (clCreateProgramWithBinary) returned %d\n",
523 status
524 );
525
526 return 1;
527 }
528
529 /* create a cl program executable for all the devices specified */
530 status = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
531 if (status != CL_SUCCESS) {
532 fprintf(stderr,
533 "Error: Building Program (clBuildProgram) returned %d\n",
534 status
535 );
536
537 return 1;
538 }
539
540 /* get a kernel object handle for a kernel with the given name */
541 GEStep1A_kernel = clCreateKernel(program, "GEStep1A", &status);
542 if (status != CL_SUCCESS) {
543 fprintf(stderr,
544 "Error: clCreateKernel (GEStep1A) returned %d\n",
545 status
546 );
547
548 return 1;
549 }
550
551 GEStep2_kernel = clCreateKernel(program, "GEStep2", &status);
552 if (status != CL_SUCCESS) {
553 fprintf(stderr,
554 "Error: clCreateKernel (GEStep2) returned %d\n",
555 status
556 );
557
558 return 1;
559 }
560
561 GEStep3_kernel = clCreateKernel(program, "GEStep3", &status);
562 if (status != CL_SUCCESS) {
563 fprintf(stderr,
564 "Error: clCreateKernel (GEStep3) returned %d\n",
565 status
566 );
567
568 return 1;
569 }
570
571 return 0;
572 }
573
574 /*
575 * \brief Release OpenCL resources (Context, Memory etc.)
576 */
cleanup_cl(void)577 int cleanup_cl(void) {
578 cl_int status;
579
580 status = clReleaseKernel(GEStep1A_kernel);
581 if (status != CL_SUCCESS) {
582 fprintf(stderr,
583 "Error: In clReleaseKernel (GEStep1A_kernel) returned %d\n",
584 status
585 );
586 return 1;
587 }
588
589 status = clReleaseKernel(GEStep2_kernel);
590 if (status != CL_SUCCESS) {
591 fprintf(stderr,
592 "Error: In clReleaseKernel (GEStep2_kernel) returned %d\n",
593 status
594 );
595 return 1;
596 }
597
598 status = clReleaseKernel(GEStep3_kernel);
599 if (status != CL_SUCCESS) {
600 fprintf(stderr,
601 "Error: In clReleaseKernel (GEStep3_kernel) returned %d\n",
602 status
603 );
604 return 1;
605 }
606
607 status = clReleaseProgram(program);
608 if (status != CL_SUCCESS) {
609 fprintf(stderr,
610 "Error: clReleaseProgram returned %d\n",
611 status
612 );
613 return 1;
614 }
615
616 status = clReleaseCommandQueue(commandQueue);
617 if (status != CL_SUCCESS) {
618 fprintf(stderr,
619 "Error: In clReleaseCommandQueue returned %d\n",
620 status
621 );
622 return 1;
623 }
624
625 status = clReleaseContext(context);
626 if (status != CL_SUCCESS) {
627 fprintf(stderr,
628 "Error: In clReleaseContext returned %d\n",
629 status
630 );
631 return 1;
632 }
633
634 return 0;
635 }
636
637 /*
638 * \brief Releases program's resources
639 */
cleanup_host(void)640 void cleanup_host(void) {
641 if (input != NULL) {
642 free(input);
643 input = NULL;
644 }
645
646 if (output != NULL) {
647 free(output);
648 output = NULL;
649 }
650
651 if (source != NULL) {
652 free((char *)source);
653 source = NULL;
654 }
655 }
656
657 /*
658 * Write the result to output file
659 */
print_to_file(MFILE * out,float * h_odata,int n)660 void print_to_file(MFILE *out, float *h_odata, int n) {
661 int count=0;
662 int move=0;
663 int num_elements=n*n;
664 while (num_elements>0) {
665 out->printf("%15f ",h_odata[move]);
666 ++count;
667 ++move;
668 if (count==n) {
669 out->printf("\n");
670 count=0;
671 }
672 --num_elements;
673 }
674 }
675
676 /*
677 * \brief Run OpenCL program
678 *
679 * Bind host variables to kernel arguments
680 * Run the CL kernel
681 */
run_GEStep1A_kernel(cl_float * AI,int i,int n2,int lda2)682 int run_GEStep1A_kernel(cl_float * AI, int i, int n2, int lda2) {
683 cl_int status;
684
685 /*
686 * the input array to the kernel. This array will eventually be modified
687 * to the inverted array.
688 */
689 status = clSetKernelArg(GEStep1A_kernel, 0, sizeof(cl_mem), (void *)&inputBuffer);
690 if (status != CL_SUCCESS) {
691 fprintf(stderr,
692 "Error: Setting kernel argument. (input) returned %d\n",
693 status
694 );
695 return 1;
696 }
697
698 /*i*/
699 status = clSetKernelArg(GEStep1A_kernel, 1, sizeof(int), (void *)&i);
700 if (status != CL_SUCCESS) {
701 fprintf(stderr,
702 "Error: Setting kernel argument. (i) returned %d\n",
703 status
704 );
705 return 1;
706 }
707
708 /*n2*/
709 status = clSetKernelArg(GEStep1A_kernel, 2, sizeof(int), (void *)&n2);
710 if (status != CL_SUCCESS) {
711 fprintf(stderr,
712 "Error: Setting kernel argument. (n2) returned %d\n",
713 status
714 );
715 return 1;
716 }
717
718 /*lda2*/
719 status = clSetKernelArg(GEStep1A_kernel, 3, sizeof(int), (void *)&lda2);
720 if (status != CL_SUCCESS) {
721 fprintf(stderr,
722 "Error: Setting kernel argument. (lda2) returned %d\n",
723 status
724 );
725 return 1;
726 }
727
728 /*
729 * Enqueue a kernel run call.
730 */
731 status = clEnqueueNDRangeKernel(commandQueue,
732 GEStep1A_kernel,
733 1,
734 NULL,
735 globalThreads,
736 localThreads,
737 0,
738 NULL,
739 NULL);
740
741 if (status != CL_SUCCESS) {
742 fprintf(stderr,
743 "Error: Enqueueing kernel onto command queue. (clEnqueueNDRangeKernel) returned %d\n",
744 status
745 );
746 return 1;
747 }
748
749 clFinish(commandQueue);
750
751 /* Enqueue readBuffer*/ //Note: we are reading back from inputBuffer since AI is modified directly in kernel
752 status = clEnqueueReadBuffer(commandQueue,
753 inputBuffer,
754 CL_FALSE,
755 0,
756 globalThreads[0] * sizeof(cl_float),
757 AI,
758 0,
759 NULL,
760 NULL);
761
762 if(status != CL_SUCCESS) {
763 fprintf(stderr,
764 "Error: clEnqueueReadBuffer failed. (clEnqueueReadBuffer) returned %d\n",
765 status
766 );
767 return 1;
768 }
769
770 return 0;
771 }
772
run_GEStep2_kernel(cl_float * AI,cl_float diag,int i,int n2,int lda2)773 int run_GEStep2_kernel(cl_float * AI, cl_float diag, int i, int n2, int lda2) {
774 cl_int status;
775
776 /*
777 * the input array to the kernel. This array will eventually be modified
778 * to the inverted array.
779 */
780 status = clSetKernelArg(GEStep2_kernel, 0, sizeof(cl_mem), (void *)&inputBuffer);
781 if (status != CL_SUCCESS) {
782 fprintf(stderr,
783 "Error: Setting kernel argument. (AI) returned %d\n",
784 status
785 );
786 return 1;
787 }
788
789 /*diag*/
790 status = clSetKernelArg(GEStep2_kernel, 1, sizeof(cl_float), (void *)&diag);
791 if (status != CL_SUCCESS) {
792 fprintf(stderr,
793 "Error: Setting kernel argument. (diag) returned %d\n",
794 status
795 );
796 return 1;
797 }
798
799 /*i*/
800 status = clSetKernelArg(GEStep2_kernel, 2, sizeof(int), (void *)&i);
801 if (status != CL_SUCCESS) {
802 fprintf(stderr,
803 "Error: Setting kernel argument. (i) returned %d\n",
804 status
805 );
806 return 1;
807 }
808
809 /*n2*/
810 status = clSetKernelArg(GEStep2_kernel, 3, sizeof(int), (void *)&n2);
811 if (status != CL_SUCCESS) {
812 fprintf(stderr,
813 "Error: Setting kernel argument. (n2) returned %d\n",
814 status
815 );
816 return 1;
817 }
818
819 /*lda2*/
820 status = clSetKernelArg(GEStep2_kernel, 4, sizeof(int), (void *)&lda2);
821 if (status != CL_SUCCESS) {
822 fprintf(stderr,
823 "Error: Setting kernel argument. (lda2) returned %d\n",
824 status
825 );
826 return 1;
827 }
828
829 /*
830 * Enqueue a kernel run call.
831 */
832 status = clEnqueueNDRangeKernel(commandQueue,
833 GEStep2_kernel,
834 1,
835 NULL,
836 globalThreads,
837 localThreads,
838 0,
839 NULL,
840 NULL);
841
842 if (status != CL_SUCCESS) {
843 fprintf(stderr,
844 "Error: Enqueueing kernel onto command queue. (clEnqueueNDRangeKernel) returned %d\n",
845 status
846 );
847 return 1;
848 }
849
850 clFinish(commandQueue);
851
852 /* Enqueue readBuffer*/
853 //Note: we are reading back from inputBuffer since AI is modified directly in kernel
854 status = clEnqueueReadBuffer(commandQueue,
855 inputBuffer,
856 CL_FALSE,
857 0,
858 globalThreads[0] * sizeof(cl_float),
859 AI,
860 0,
861 NULL,
862 NULL);
863 if (status != CL_SUCCESS) {
864 fprintf(stderr, "Error: clEnqueueReadBuffer failed. (clEnqueueReadBuffer) returned %d\n", status);
865 return 1;
866 }
867
868 return 0;
869 }
870
run_GEStep3_kernel(cl_float * AI,int i,int n2,int lda2)871 int run_GEStep3_kernel(cl_float * AI, int i, int n2, int lda2) {
872 cl_int status;
873
874 /*
875 * The input array to the kernel. This array will eventually be modified
876 * to the inverted array.
877 */
878 status = clSetKernelArg(GEStep3_kernel, 0, sizeof(cl_mem), (void *)&inputBuffer);
879 if (status != CL_SUCCESS) {
880 fprintf(stderr,
881 "Error: Setting kernel argument. (input) returned %d\n",
882 status
883 );
884 return 1;
885 }
886
887 /*i*/
888 status = clSetKernelArg(GEStep3_kernel, 1, sizeof(int), (void *)&i);
889 if (status != CL_SUCCESS) {
890 fprintf(stderr,
891 "Error: Setting kernel argument. (i) returned %d\n",
892 status
893 );
894 return 1;
895 }
896
897 /*n2*/
898 status = clSetKernelArg(GEStep3_kernel, 2, sizeof(int), (void *)&n2);
899 if (status != CL_SUCCESS) {
900 fprintf(stderr,
901 "Error: Setting kernel argument. (n2) returned %d\n",
902 status
903 );
904 return 1;
905 }
906
907 /*lda2*/
908 status = clSetKernelArg(GEStep3_kernel, 3, sizeof(int), (void *)&lda2);
909 if (status != CL_SUCCESS) {
910 fprintf(stderr,
911 "Error: Setting kernel argument. (lda2) returned %d\n",
912 status
913 );
914 return 1;
915 }
916
917 /*
918 * Enqueue a kernel run call.
919 */
920 status = clEnqueueNDRangeKernel(commandQueue,
921 GEStep3_kernel,
922 1,
923 NULL,
924 globalThreads,
925 localThreads,
926 0,
927 NULL,
928 NULL);
929
930 if (status != CL_SUCCESS) {
931 fprintf(stderr,
932 "Error: Enqueueing kernel onto command queue. (clEnqueueNDRangeKernel) returned %d\n",
933 status
934 );
935 return 1;
936 }
937
938 clFinish(commandQueue);
939
940 /* Enqueue readBuffer*/
941 //Note: we are reading back from inputBuffer since AI is modified directly in kernel
942 status = clEnqueueReadBuffer(commandQueue,
943 inputBuffer,
944 CL_TRUE,
945 0,
946 globalThreads[0] * sizeof(cl_float),
947 AI,
948 0,
949 NULL,
950 NULL);
951 if (status != CL_SUCCESS) {
952 fprintf(stderr,
953 "Error: clEnqueueReadBuffer failed. (clEnqueueReadBuffer) returned %d\n",
954 status
955 );
956 return 1;
957 }
958
959 return 0;
960 }
961
invertge(cl_float * AI_d,int lda,int n)962 void invertge(cl_float * AI_d, int lda, int n) {
963 int lda2 = lda * 2;
964 // perform elementary row operations till A in AI becomes identity matrix
965 for (int i = 0; i < n; i++) {
966 // execute kernel
967 run_GEStep1A_kernel(AI_d,i,n*2, lda2);
968 }
969
970 for (int i = n-1; i >= 0; i--) {
971 cl_float diag = 1.0;
972 diag=AI_d[i*lda2+i];
973 // execute kernels
974 run_GEStep2_kernel(AI_d,diag,i,n*2, lda2);
975 run_GEStep3_kernel(AI_d,i,n*2, lda2);
976 }
977 }
978
979 /* inverts nxn matrix input and stores the result in output */
invert(cl_float * input,cl_float * output,int n)980 void invert(cl_float * input, cl_float *output, int n) {
981 printf("starting inversion n = %d ", n);
982 volatile clock_t gputime;
983 gputime=clock();
984
985 int lda = (((n+15)&(~15))|16);
986 cl_float * AI_d = (cl_float *)malloc(sizeof(cl_float)*n*lda*2);
987 memset(AI_d,0,sizeof(cl_float)*n*lda*2);
988 for (int i = 0; i < n; i++) {
989 memcpy(&AI_d[lda*i*2], &input[n*i], sizeof(cl_float)*n);
990 AI_d[lda*i*2+n+i] = 1;
991 }
992
993 cl_int status;
994
995 /////////////////////////////////////////////////////////////////
996 // Create OpenCL memory buffer
997 /////////////////////////////////////////////////////////////////
998 inputBuffer = clCreateBuffer(context,
999 CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
1000 sizeof(cl_float) * globalThreads[0],
1001 AI_d,
1002 &status);
1003 if (status != CL_SUCCESS) {
1004 fprintf(stderr,
1005 "Error: clCreateBuffer (inputBuffer) returned %d\n",
1006 status
1007 );
1008 exit(0);
1009 }
1010
1011 // Note: there's no output buffer. In kernel, AI_d is modified directly.
1012 // Thus, we should read the result back to host from inputBuffer as well.
1013 invertge(AI_d, lda, n);
1014 gputime=clock()-gputime;
1015 fprintf(stderr, " %7.1f ms ",gputime/1.e3f);
1016 fprintf(stderr, " %7.2f Gflops", 1e-3*(3.0)*n*n*n/3.0/gputime);
1017
1018 #ifdef VERIFY
1019 // let's verify that
1020 cl_float error=0.0;
1021
1022 // multiply inverse*xcopy, should be Identity matrix
1023 for (int k = 0; k < n; k++) {
1024 for (int j = 0; j < n; j++) {
1025 cl_float sum = 0;
1026 for (int i = 0; i < n; i++) {
1027 sum += AI[j*lda*2+n+i]*A[i*n+k];
1028 }
1029 if (j!=k) {
1030 error += sum * sum;
1031 } else {
1032 error += (1.0-sum) * (1.0-sum);
1033 }
1034 }
1035 }
1036 fprintf(stderr, " %6.2f SSE", error);
1037 #endif
1038
1039 //copy the result to output
1040 for (int i = 0; i < n; i++) {
1041 memcpy(&output[n*i], &AI_d[lda*i*2+n], sizeof(cl_float)*n);
1042 }
1043
1044 status = clReleaseMemObject(inputBuffer);
1045 if (status != CL_SUCCESS) {
1046 fprintf(stderr,
1047 "Error: In clReleaseMemObject (inputBuffer) returned %d\n",
1048 status
1049 );
1050 }
1051
1052 free(AI_d);
1053 fprintf(stderr," done!\n");
1054 }
1055