1 /* ************************************************************************ 2 * Copyright 2013 Advanced Micro Devices, Inc. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 * ************************************************************************/ 16 17 18 #pragma once 19 #if !defined( CLFFT_CLTRANSFORM_H ) 20 #define CLFFT_CLTRANSFORM_H 21 22 #include <iostream> 23 #include <vector> 24 #include "clFFT.h" 25 #include "../library/private.h" 26 #include "../client/openCL.misc.h" 27 #include "buffer.h" 28 #include "test_constants.h" 29 30 // Custom deleter functions for our unique_ptr smart pointer class 31 struct clMem_deleter 32 { operatorclMem_deleter33 template <class T> void operator()(T* clMemObj) 34 { 35 if( clMemObj != NULL ) 36 OPENCL_V_THROW( ::clReleaseMemObject( clMemObj ), "Error: In clReleaseMemObject\n" ); 37 }; 38 }; 39 40 struct plan_handle_deleter 41 { operatorplan_handle_deleter42 template <class T> void operator()(T* handle) 43 { 44 if( *handle ) 45 { 46 clfftDestroyPlan( handle ); 47 } 48 clfftTeardown( ); // when multi-GPU tests are written, this will need to occur in the gtest cleanup 49 }; 50 }; 51 52 struct clEvent_deleter 53 { operatorclEvent_deleter54 template <class T> void operator()(T* clEventObj) 55 { 56 if( clEventObj != NULL ) 57 OPENCL_V_THROW( clReleaseEvent( clEventObj ), "Error: In clReleaseEvent\n" ); 58 }; 59 }; 60 61 struct clCommQueue_deleter 62 { operatorclCommQueue_deleter63 template <class T> void operator()(T* clQueueObj) 64 { 65 if( clQueueObj != NULL ) 66 OPENCL_V_THROW( clReleaseCommandQueue( clQueueObj ), "Error: In clReleaseCommandQueue\n" ); 67 }; 68 }; 69 70 struct clContext_deleter 71 { operatorclContext_deleter72 template <class T> void operator()(T* clContextObj) 73 { 74 if( clContextObj != NULL ) 75 OPENCL_V_THROW( clReleaseContext( clContextObj ), "Error: In clReleaseContext\n" ); 76 }; 77 }; 78 79 template <class T> 80 class Precision_Setter 81 { 82 public: Precision_Setter(clfftPlanHandle plan_handle)83 Precision_Setter(clfftPlanHandle plan_handle) 84 { 85 throw std::runtime_error("Precision_Setter: this code path should never be executed"); 86 } 87 88 private: Precision_Setter()89 Precision_Setter(){} 90 }; 91 92 template<> 93 class Precision_Setter<float> 94 { 95 public: Precision_Setter(clfftPlanHandle plan_handle)96 Precision_Setter(clfftPlanHandle plan_handle) 97 { 98 EXPECT_EQ( CLFFT_SUCCESS, clfftSetPlanPrecision( plan_handle, CLFFT_SINGLE )); 99 } 100 101 private: Precision_Setter()102 Precision_Setter(){} 103 }; 104 105 template<> 106 class Precision_Setter<double> 107 { 108 public: Precision_Setter(clfftPlanHandle plan_handle)109 Precision_Setter(clfftPlanHandle plan_handle) 110 { 111 clfftStatus ret = clfftSetPlanPrecision( plan_handle, CLFFT_DOUBLE ); 112 113 // If device does not support double precision, skip this test, don't fail it 114 if( ret == CLFFT_DEVICE_NO_DOUBLE ) 115 throw std::runtime_error("CLFFT_DEVICE_NO_DOUBLE"); 116 117 EXPECT_EQ( CLFFT_SUCCESS, ret ); 118 } 119 120 private: Precision_Setter()121 Precision_Setter(){} 122 }; 123 124 /*@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@*/ 125 /*@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@*/ 126 template <class T, class cl_T> 127 class clfft { 128 private: 129 clfftLayout _input_layout, _output_layout; 130 clfftResultLocation _placeness; 131 132 buffer<T> input; 133 buffer<T> output; 134 135 size_t number_of_data_points; 136 T _forward_scale, _backward_scale; 137 cl_uint commandQueueFlags; 138 bool init_failure; 139 bool dataset_too_large; 140 141 cl_device_type deviceType; 142 std::unique_ptr< clfftPlanHandle, plan_handle_deleter > plan_handle; 143 144 clfftDirection _transformation_direction; 145 clfftDim dimension; 146 147 std::vector<size_t> lengths; 148 149 static const bool printInfo = false; 150 151 // OpenCL resources that need to be carefully managed 152 std::unique_ptr< _cl_context, clContext_deleter > context; 153 std::unique_ptr< _cl_command_queue, clCommQueue_deleter > queue; 154 std::vector< std::unique_ptr< _cl_mem, clMem_deleter > > cl_mem_input; 155 std::vector< std::unique_ptr< _cl_mem, clMem_deleter > > cl_mem_output; 156 std::vector< cl_device_id > device_id; 157 public: 158 /*****************************************************/ clfft(const clfftDim dimensions_in,const size_t * lengths_in,const size_t * input_strides_in,const size_t * output_strides_in,const size_t batch_size_in,const size_t input_distance_in,const size_t output_distance_in,const clfftLayout input_layout_in,const clfftLayout output_layout_in,const clfftResultLocation placeness_in)159 clfft( const clfftDim dimensions_in, const size_t* lengths_in, 160 const size_t* input_strides_in, const size_t* output_strides_in, 161 const size_t batch_size_in, 162 const size_t input_distance_in, const size_t output_distance_in, 163 const clfftLayout input_layout_in, const clfftLayout output_layout_in, 164 const clfftResultLocation placeness_in ) 165 try 166 : _input_layout( input_layout_in ) 167 , _output_layout( output_layout_in ) 168 , _placeness( placeness_in ) 169 , input( static_cast<size_t>(dimensions_in), 170 lengths_in, 171 input_strides_in, 172 batch_size_in, 173 input_distance_in, 174 cl_layout_to_buffer_layout( _input_layout ), 175 _placeness 176 ) 177 , output( static_cast<size_t>(dimensions_in), 178 lengths_in, 179 output_strides_in, 180 batch_size_in, 181 output_distance_in, 182 cl_layout_to_buffer_layout( _output_layout ), 183 _placeness 184 ) 185 , number_of_data_points( input.number_of_data_points()) 186 , _forward_scale( 1.0f ) 187 , _backward_scale( 1.0f/T(number_of_data_points) ) 188 , commandQueueFlags( 0 ) 189 , init_failure( false ) 190 , dataset_too_large( false ) 191 , deviceType( 0 ) 192 , plan_handle( new clfftPlanHandle ) 193 , _transformation_direction( ENDDIRECTION ) 194 , dimension( dimensions_in ) 195 196 { 197 if( _placeness == CLFFT_INPLACE ) 198 { 199 if( ( is_real( _input_layout ) && is_planar( _output_layout ) ) || 200 ( is_planar( _input_layout ) && is_real( _output_layout ) ) ) 201 { 202 throw std::runtime_error( "in-place transforms may not be real<->planar" ); 203 } 204 } 205 206 *plan_handle = 0; 207 clfftSetupData setupData; 208 clfftInitSetupData( &setupData ); 209 clfftSetup( &setupData ); 210 211 for( int i = 0; i < max_dimension; i++ ) 212 { 213 if( i < dimension ) 214 lengths.push_back( lengths_in[i] ); 215 else 216 lengths.push_back( 1 ); 217 } 218 219 initialize_openCL(); 220 initialize_plan(); 221 } catch(const std::exception &)222 catch( const std::exception& ) { 223 throw; 224 } 225 226 /*****************************************************/ ~clfft()227 ~clfft() 228 {} 229 230 /*****************************************************/ is_real(const clfftLayout layout)231 bool is_real( const clfftLayout layout ) 232 { 233 return layout == CLFFT_REAL; 234 } 235 236 /*****************************************************/ is_planar(const clfftLayout layout)237 bool is_planar( const clfftLayout layout ) 238 { 239 return (layout == CLFFT_COMPLEX_PLANAR || layout == CLFFT_HERMITIAN_PLANAR); 240 } 241 242 /*****************************************************/ is_interleaved(const clfftLayout layout)243 bool is_interleaved( const clfftLayout layout ) 244 { 245 return (layout == CLFFT_COMPLEX_INTERLEAVED || layout == CLFFT_HERMITIAN_INTERLEAVED); 246 } 247 248 /*****************************************************/ is_complex(const clfftLayout layout)249 bool is_complex( const clfftLayout layout ) 250 { 251 return (layout == CLFFT_COMPLEX_INTERLEAVED || layout == CLFFT_COMPLEX_PLANAR); 252 } 253 254 /*****************************************************/ is_hermitian(const clfftLayout layout)255 bool is_hermitian( const clfftLayout layout ) 256 { 257 return (layout == CLFFT_HERMITIAN_INTERLEAVED || layout == CLFFT_HERMITIAN_PLANAR); 258 } 259 260 /*****************************************************/ initialize_openCL()261 void initialize_openCL() { 262 try 263 { 264 cl_context tempContext = NULL; 265 device_id = initializeCL( 266 g_device_type, 267 g_device_id, 268 g_platform_id, 269 tempContext, 270 printInfo 271 ); 272 context = std::unique_ptr< _cl_context, clContext_deleter >( tempContext ); 273 274 if( input.size_in_bytes() > cl_device_max_memory_to_allocate(0) || 275 output.size_in_bytes() > cl_device_max_memory_to_allocate(0)) 276 { 277 throw std::runtime_error("problem too large for device"); 278 } 279 280 cl_int status = 0; 281 queue = std::unique_ptr< _cl_command_queue, clCommQueue_deleter >( 282 ::clCreateCommandQueue( context.get( ), device_id[ 0 ], commandQueueFlags, &status ) ); 283 OPENCL_V_THROW( status, "Creating Command Queue ( ::clCreateCommandQueue() )" ); 284 285 // make the new buffer 286 const size_t bufferSizeBytes = input.size_in_bytes( ); 287 288 for( cl_int i = 0; i < CLFFT_COMPLEX_INTERLEAVED; ++i ) 289 { 290 cl_int status = 0; 291 std::unique_ptr< _cl_mem, clMem_deleter > inBuff( 292 ::clCreateBuffer( context.get( ), CL_MEM_READ_WRITE, bufferSizeBytes, NULL, &status) ); 293 OPENCL_V_THROW( status, "Creating Buffer ( ::clCreateBuffer() )" ); 294 295 cl_mem_input.push_back( std::move( inBuff ) ); 296 297 std::unique_ptr< _cl_mem, clMem_deleter > outBuff( 298 ::clCreateBuffer( context.get( ), CL_MEM_READ_WRITE, bufferSizeBytes, NULL, &status) ); 299 OPENCL_V_THROW( status, "Creating Buffer ( ::clCreateBuffer() )" ); 300 301 cl_mem_output.push_back( std::move( outBuff ) ); 302 } 303 } 304 catch( const std::exception& ) 305 { 306 throw; 307 } 308 } 309 310 /*****************************************************/ initialize_plan()311 void initialize_plan() 312 { 313 EXPECT_EQ( CLFFT_SUCCESS, clfftCreateDefaultPlan( plan_handle.get(), context.get( ), dimension, &lengths[0] ) ); 314 set_layouts( _input_layout, _output_layout ); 315 placeness( _placeness ); 316 EXPECT_EQ( CLFFT_SUCCESS, clfftSetPlanInStride( *plan_handle, dimension, input.strides())); 317 EXPECT_EQ( CLFFT_SUCCESS, clfftSetPlanOutStride( *plan_handle, dimension, output.strides())); 318 EXPECT_EQ( CLFFT_SUCCESS, clfftSetPlanBatchSize( *plan_handle, input.batch_size())); 319 EXPECT_EQ( CLFFT_SUCCESS, clfftSetPlanDistance( *plan_handle, input.distance(), output.distance())); 320 Precision_Setter<T> setter(*plan_handle); 321 } 322 323 /*****************************************************/ input_strides_plaintext()324 std::string input_strides_plaintext() 325 { 326 size_t strides[3]; 327 clfftGetPlanInStride( *plan_handle, dimension, &strides[0] ); 328 329 std::ostringstream my_strides_stream; 330 331 for( int i = 0; i < dimension; i++ ) 332 my_strides_stream << strides[i] << " "; 333 334 std::string my_strides( my_strides_stream.str() ); 335 my_strides.erase( my_strides.end() - 1 ); // chomp off trailing space 336 337 return my_strides; 338 } 339 340 /*****************************************************/ output_strides_plaintext()341 std::string output_strides_plaintext() 342 { 343 size_t strides[3]; 344 clfftGetPlanOutStride( *plan_handle, dimension, &strides[0] ); 345 346 std::ostringstream my_strides_stream; 347 348 for( int i = 0; i < dimension; i++ ) 349 my_strides_stream << strides[i] << " "; 350 351 std::string my_strides( my_strides_stream.str() ); 352 my_strides.erase( my_strides.end() - 1 ); // chomp off trailing space 353 354 return my_strides; 355 } 356 357 /*****************************************************/ lengths_plaintext()358 std::string lengths_plaintext() 359 { 360 size_t lengths[3]; 361 clfftGetPlanLength( *plan_handle, dimension, &lengths[0] ); 362 363 std::ostringstream my_lengths_stream; 364 365 for( int i = 0; i < dimension; i++ ) 366 my_lengths_stream << lengths[i] << " "; 367 368 std::string my_lengths( my_lengths_stream.str() ); 369 my_lengths.erase( my_lengths.end() - 1 ); // chomp off trailing space 370 371 return my_lengths; 372 } 373 374 /*****************************************************/ layout_plaintext(clfftLayout layout)375 std::string layout_plaintext( clfftLayout layout ) 376 { 377 switch( layout ) 378 { 379 case CLFFT_REAL: 380 return "real"; 381 case CLFFT_HERMITIAN_INTERLEAVED: 382 return "hermitian interleaved"; 383 case CLFFT_HERMITIAN_PLANAR: 384 return "hermitian planar"; 385 case CLFFT_COMPLEX_INTERLEAVED: 386 return "complex interleaved"; 387 case CLFFT_COMPLEX_PLANAR: 388 return "complex planar"; 389 default: 390 throw std::runtime_error( "invalid layout in layout_plaintext()" ); 391 } 392 } 393 394 /*****************************************************/ refresh_plan()395 void refresh_plan() 396 { 397 clfftDestroyPlan(plan_handle.get()); 398 initialize_plan(); 399 } 400 401 /*****************************************************/ cl_layout_to_buffer_layout(clfftLayout cl_layout)402 layout::buffer_layout_t cl_layout_to_buffer_layout( clfftLayout cl_layout ) 403 { 404 if( cl_layout == CLFFT_REAL ) 405 return layout::real; 406 else if( cl_layout == CLFFT_HERMITIAN_PLANAR ) 407 return layout::hermitian_planar; 408 else if( cl_layout == CLFFT_COMPLEX_PLANAR ) 409 return layout::complex_planar; 410 else if( cl_layout == CLFFT_HERMITIAN_INTERLEAVED ) 411 return layout::hermitian_interleaved; 412 else if( cl_layout == CLFFT_COMPLEX_INTERLEAVED ) 413 return layout::complex_interleaved; 414 else 415 throw std::runtime_error( "invalid cl_layout" ); 416 } 417 418 /*****************************************************/ verbose_output()419 void verbose_output() 420 { 421 if(verbose) 422 { 423 std::cout << "transform parameters as seen by clfft:" << std::endl; 424 425 clfftDim dim; 426 cl_uint dimensions; 427 clfftGetPlanDim( *plan_handle, &dim, &dimensions ); 428 429 std::cout << dimensions << " dimension(s): " << lengths_plaintext() << std::endl; 430 431 size_t batch; 432 clfftGetPlanBatchSize( *plan_handle, &batch ); 433 std::cout << "batch: " << batch << std::endl; 434 435 clfftPrecision precision; 436 clfftGetPlanPrecision( *plan_handle, &precision ); 437 if( precision == CLFFT_SINGLE ) std::cout << "single precision" << std::endl; 438 else if( precision == CLFFT_DOUBLE ) std::cout << "double precision" << std::endl; 439 else throw std::runtime_error( "can't figure out the precision in verbose_output()" ); 440 441 if( placeness() == CLFFT_INPLACE ) std::cout << "in-place" << std::endl; 442 else std::cout << "out-of-place" << std::endl; 443 444 get_layouts(); 445 std::cout << layout_plaintext(_input_layout) << " -> " << layout_plaintext(_output_layout) << std::endl; 446 447 std::cout << "input stride(s): " << input_strides_plaintext() << std::endl; 448 std::cout << "output stride(s): " << output_strides_plaintext() << std::endl; 449 450 size_t input_distance, output_distance; 451 clfftGetPlanDistance( *plan_handle, &input_distance, &output_distance ); 452 std::cout << "input distance: " << input_distance << std::endl; 453 std::cout << "output distance: " << output_distance << std::endl; 454 } 455 } 456 457 /*****************************************************/ placeness()458 clfftResultLocation placeness() { 459 clfftResultLocation res; 460 EXPECT_EQ( CLFFT_SUCCESS, clfftGetResultLocation( *plan_handle, &res ) ); 461 return res; 462 } 463 464 /*****************************************************/ set_forward_transform()465 void set_forward_transform() { 466 _transformation_direction = CLFFT_FORWARD; 467 } 468 469 /*****************************************************/ set_backward_transform()470 void set_backward_transform() { 471 _transformation_direction = CLFFT_BACKWARD; 472 } 473 474 /*****************************************************/ set_transposed()475 void set_transposed() { 476 EXPECT_EQ( CLFFT_SUCCESS, clfftSetPlanTransposeResult( *plan_handle, CLFFT_TRANSPOSED ) ); 477 } 478 479 /*****************************************************/ set_layouts(clfftLayout new_input_layout,clfftLayout new_output_layout)480 void set_layouts( clfftLayout new_input_layout, clfftLayout new_output_layout ) 481 { 482 cl_mem_input.clear( ); 483 cl_mem_output.clear( ); 484 485 // make the new input buffer 486 const size_t input_buffer_size_in_bytes = input.size_in_bytes(); 487 488 size_t number_of_input_buffers; 489 490 if( is_planar( new_input_layout ) ) 491 number_of_input_buffers = 2; 492 else if( is_real( new_input_layout ) || is_interleaved( new_input_layout ) ) 493 number_of_input_buffers = 1; 494 else 495 throw std::runtime_error( "we shouldn't make it here [set_layouts(), input]" ); 496 497 for( size_t i = 0; i < number_of_input_buffers; ++i ) 498 { 499 cl_int status = 0; 500 std::unique_ptr< _cl_mem, clMem_deleter > buff( 501 ::clCreateBuffer( context.get( ), CL_MEM_READ_WRITE, input_buffer_size_in_bytes, NULL, &status) ); 502 OPENCL_V_THROW( status, "Creating Buffer ( ::clCreateBuffer() )" ); 503 504 cl_mem_input.push_back( std::move( buff ) ); 505 } 506 507 // make the new output buffer 508 const size_t output_buffer_size_in_bytes = output.size_in_bytes(); 509 510 size_t number_of_output_buffers; 511 512 if( is_planar( new_output_layout ) ) 513 number_of_output_buffers = 2; 514 else if( is_real( new_output_layout ) || is_interleaved( new_output_layout ) ) 515 number_of_output_buffers = 1; 516 else 517 throw std::runtime_error( "we shouldn't make it here [set_layouts(), input]" ); 518 519 for( size_t i = 0; i < number_of_output_buffers; ++i ) 520 { 521 cl_int status = 0; 522 std::unique_ptr< _cl_mem, clMem_deleter > buff( 523 ::clCreateBuffer( context.get( ), CL_MEM_READ_WRITE, output_buffer_size_in_bytes, NULL, &status) ); 524 OPENCL_V_THROW( status, "Creating Buffer ( ::clCreateBuffer() )" ); 525 526 cl_mem_output.push_back( std::move( buff ) ); 527 } 528 529 EXPECT_EQ( CLFFT_SUCCESS, clfftSetLayout( *plan_handle, new_input_layout, new_output_layout ) ); 530 get_layouts(); 531 } 532 533 /*****************************************************/ 534 // swap_layouts should only be used with in-place real-to-complex or complex-to-real transforms swap_layouts()535 void swap_layouts() 536 { 537 get_layouts(); 538 clfftLayout new_input_layout = _output_layout; 539 clfftLayout new_output_layout = _input_layout; 540 541 EXPECT_EQ( CLFFT_SUCCESS, clfftSetLayout( *plan_handle, new_input_layout, new_output_layout ) ); 542 get_layouts(); 543 544 refresh_plan(); 545 } 546 547 /*****************************************************/ input_layout()548 clfftLayout input_layout() { 549 get_layouts(); 550 return _input_layout; 551 } 552 553 /*****************************************************/ output_layout()554 clfftLayout output_layout() { 555 get_layouts(); 556 return _output_layout; 557 } 558 559 /*****************************************************/ forward_scale(T in)560 void forward_scale( T in ) { 561 EXPECT_EQ( CLFFT_SUCCESS, clfftSetPlanScale( *plan_handle, CLFFT_FORWARD, static_cast<float>( in ) ) ); 562 _forward_scale = forward_scale(); 563 } 564 565 /*****************************************************/ backward_scale(T in)566 void backward_scale( T in ) { 567 EXPECT_EQ( CLFFT_SUCCESS, clfftSetPlanScale( *plan_handle, CLFFT_BACKWARD, static_cast<float>( in ) ) ); 568 _backward_scale = backward_scale(); 569 } 570 571 /*****************************************************/ forward_scale()572 T forward_scale() { 573 cl_T scale; 574 EXPECT_EQ( CLFFT_SUCCESS, clfftGetPlanScale( *plan_handle, CLFFT_FORWARD, reinterpret_cast<cl_float*>(&scale) )); 575 return scale; 576 } 577 578 /*****************************************************/ backward_scale()579 T backward_scale() { 580 cl_T scale; 581 EXPECT_EQ( CLFFT_SUCCESS, clfftGetPlanScale( *plan_handle, CLFFT_BACKWARD, reinterpret_cast<cl_float*>(&scale) )); 582 return scale; 583 } 584 585 /*****************************************************/ set_input_to_value(T real)586 void set_input_to_value( T real ) 587 { 588 input.set_all_to_value( real ); 589 } 590 591 /*****************************************************/ set_input_to_value(T real,T imag)592 void set_input_to_value( T real, T imag ) 593 { 594 input.set_all_to_value( real, imag ); 595 } 596 597 /*****************************************************/ set_input_to_sawtooth(T max)598 void set_input_to_sawtooth(T max) { 599 input.set_all_to_sawtooth(max); 600 } 601 602 /*****************************************************/ set_input_to_impulse()603 void set_input_to_impulse() { 604 input.set_all_to_impulse(); 605 } 606 607 /*****************************************************/ 608 // yes, the "super duper global seed" is horrible 609 // alas, i'll have TODO it better later set_input_to_random()610 void set_input_to_random() 611 { 612 input.set_all_to_random_data( 10, super_duper_global_seed ); 613 } 614 615 /*****************************************************/ set_input_to_buffer(buffer<T> other_buffer)616 void set_input_to_buffer( buffer<T> other_buffer ) { 617 input = other_buffer; 618 } 619 620 /*****************************************************/ 621 void set_input_precallback(unsigned int localMemSize = 0) { 622 cl_int status = 0; 623 clfftPrecision precision; 624 clfftGetPlanPrecision( *plan_handle, &precision ); 625 626 const char* precallbackstr; 627 628 if (localMemSize > 0) 629 { 630 //Test for LDS in precallback function 631 precallbackstr = STRINGIFY(PRE_MULVAL_LDS); 632 } 633 else 634 { 635 if (input.is_interleaved() ) 636 { 637 precallbackstr = (precision == CLFFT_SINGLE) ? STRINGIFY(PRE_MULVAL) : STRINGIFY(PRE_MULVAL_DP); 638 } 639 else if (input.is_planar()) 640 { 641 precallbackstr = (precision == CLFFT_SINGLE) ? STRINGIFY(PRE_MULVAL_PLANAR) : STRINGIFY(PRE_MULVAL_PLANAR_DP); 642 } 643 else if (input.is_real()) 644 { 645 precallbackstr = (precision == CLFFT_SINGLE) ? STRINGIFY(PRE_MULVAL_REAL) : STRINGIFY(PRE_MULVAL_REAL_DP); 646 } 647 } 648 649 //precallback user data 650 buffer<T> userdata( static_cast<size_t>(dimension), 651 input.lengths(), 652 input.strides(), 653 input.batch_size(), 654 input.distance(), 655 layout::real, 656 _placeness 657 ); 658 659 userdata.set_all_to_random_data(lengths[0], 10); 660 661 // make the new buffer 662 const size_t bufferSizeBytes = userdata.size_in_bytes( ); 663 664 cl_mem userdataBuff = clCreateBuffer( context.get( ), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, bufferSizeBytes, userdata.real_ptr(), &status); 665 OPENCL_V_THROW( status, "Creating Buffer ( ::clCreateBuffer() )" ); 666 667 //Register the callback 668 OPENCL_V_THROW (clfftSetPlanCallback(*plan_handle, "mulval_pre", precallbackstr, localMemSize, PRECALLBACK, &userdataBuff, 1), "clFFTSetPlanCallback failed"); 669 } 670 671 /*****************************************************/ set_input_precallback_userdatatype()672 void set_input_precallback_userdatatype() { 673 cl_int status = 0; 674 675 const char* precallbackstr = STRINGIFY(PRE_MULVAL_UDT); 676 677 size_t totalPts = input.total_number_of_points_including_data_and_intervening(); 678 679 buffer<T> temp( static_cast<size_t>(dimension), 680 input.lengths(), 681 input.strides(), 682 input.batch_size(), 683 input.distance(), 684 layout::real, 685 _placeness 686 ); 687 688 temp.set_all_to_random_data(lengths[0], 10); 689 690 std::vector<USER_DATA> userdata(totalPts); 691 size_t the_index; 692 for( size_t batch = 0; batch < input.batch_size(); batch++) 693 for( size_t z = 0; z < input.length(dimz); z++) 694 for( size_t y = 0; y < input.length(dimy); y++) 695 for( size_t x = 0; x < input.length(dimx); x++) 696 { 697 the_index = ( input.stride(dimx) * x + input.stride(dimy) * y + input.stride(dimz) * z + input.distance() * batch ); 698 699 userdata[the_index].scalar1 = (float)temp.real(x, y, z, batch); 700 userdata[the_index].scalar2 = 1; 701 } 702 703 cl_mem userdataBuff = clCreateBuffer(context.get( ), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(USER_DATA) * totalPts, (void*)&userdata[0], &status); 704 OPENCL_V_THROW( status, "Creating Buffer ( ::clCreateBuffer() )" ); 705 706 //Register the callback 707 OPENCL_V_THROW (clfftSetPlanCallback(*plan_handle, "mulval_pre", precallbackstr, 0, PRECALLBACK, &userdataBuff, 1), "clFFTSetPlanCallback failed"); 708 } 709 710 /*****************************************************/ 711 void set_output_postcallback(unsigned int localMemSize = 0) { 712 cl_int status = 0; 713 clfftPrecision precision; 714 clfftGetPlanPrecision( *plan_handle, &precision ); 715 716 const char* postcallbackstr; 717 718 if (localMemSize > 0) 719 { 720 //Test for LDS in postcallback function 721 postcallbackstr = STRINGIFY(POST_MULVAL_LDS); 722 } 723 else 724 { 725 if (output.is_interleaved() ) 726 { 727 postcallbackstr = (precision == CLFFT_SINGLE) ? STRINGIFY(POST_MULVAL) : STRINGIFY(POST_MULVAL_DP); 728 } 729 else if (output.is_planar()) 730 { 731 postcallbackstr = (precision == CLFFT_SINGLE) ? STRINGIFY(POST_MULVAL_PLANAR) : STRINGIFY(POST_MULVAL_PLANAR_DP); 732 } 733 else if (output.is_real()) 734 { 735 postcallbackstr = (precision == CLFFT_SINGLE) ? STRINGIFY(POST_MULVAL_REAL) : STRINGIFY(POST_MULVAL_REAL_DP); 736 } 737 } 738 739 //post-callback user data 740 buffer<T> userdata( static_cast<size_t>(dimension), 741 output.lengths(), 742 output.strides(), 743 output.batch_size(), 744 output.distance(), 745 layout::real, 746 _placeness 747 ); 748 749 userdata.set_all_to_random_data(lengths[0], 10); 750 751 // make the new buffer 752 const size_t bufferSizeBytes = userdata.size_in_bytes( ); 753 754 cl_mem userdataBuff = clCreateBuffer( context.get( ), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, bufferSizeBytes, userdata.real_ptr(), &status); 755 OPENCL_V_THROW( status, "Creating Buffer ( ::clCreateBuffer() )" ); 756 757 //Register the post-callback 758 OPENCL_V_THROW (clfftSetPlanCallback(*plan_handle, "mulval_post", postcallbackstr, localMemSize, POSTCALLBACK, &userdataBuff, 1), "clFFTSetPlanCallback failed"); 759 } 760 761 /*****************************************************/ device_list_has_devices()762 bool device_list_has_devices() { 763 return !device_id.empty(); 764 } 765 766 /*****************************************************/ 767 // returns true if the memory required for input + output (if applicable) + intermediate (if applicable) buffers 768 // is too large compared with the OpenCL device's memory size total_memory_footprint_is_too_large_for_device()769 bool total_memory_footprint_is_too_large_for_device() { 770 throw_if_device_list_is_empty(); 771 772 // In order to call clfftEnqueueTransform, we need to pass naked pointers 773 cl_command_queue tempQueue = queue.get( ); 774 size_t buffer_size = 0; 775 776 EXPECT_EQ( CLFFT_SUCCESS, clfftBakePlan(*plan_handle, 1, &tempQueue, NULL, NULL )); 777 EXPECT_EQ( CLFFT_SUCCESS, clfftGetTmpBufSize(*plan_handle, &buffer_size )); 778 779 cl_ulong total_memory_size = input.size_in_bytes() + buffer_size; 780 781 // we are only going to include the result space if the transform is out of place 782 if( placeness() == CLFFT_OUTOFPLACE ) 783 { 784 total_memory_size += output.size_in_bytes(); 785 } 786 787 cl_ulong global_memory_size = cl_device_max_global_memory(0); 788 789 // we don't want to bog down the CPU with ginormous problem sizes 790 // so we chop the global memory way down to keep things manageable 791 if( g_device_type == CL_DEVICE_TYPE_CPU ) 792 { 793 global_memory_size /= 8; 794 } 795 796 return total_memory_size > global_memory_size; 797 } 798 799 /*****************************************************/ throw_if_total_memory_footprint_is_too_large_for_device()800 void throw_if_total_memory_footprint_is_too_large_for_device() 801 { 802 if( total_memory_footprint_is_too_large_for_device() ) 803 { 804 throw std::runtime_error("problem too large for device"); 805 } 806 } 807 808 /*****************************************************/ throw_if_device_list_is_empty()809 void throw_if_device_list_is_empty() 810 { 811 if( !device_list_has_devices() ) { 812 throw std::runtime_error("device list is empty at transform"); 813 } 814 } 815 816 /*****************************************************/ 817 void transform(bool explicit_intermediate_buffer = use_explicit_intermediate_buffer) { 818 verbose_output(); 819 820 throw_if_device_list_is_empty(); 821 822 cl_int status; 823 824 // In order to call clfftEnqueueTransform, we need to pass naked pointers 825 cl_command_queue tempQueue = queue.get( ); 826 std::unique_ptr< _cl_event, clEvent_deleter > tempEvent; 827 std::unique_ptr< _cl_mem, clMem_deleter > intermediate_buffer; 828 829 throw_if_total_memory_footprint_is_too_large_for_device(); 830 831 write_local_input_buffer_to_gpu(); 832 if( placeness() == CLFFT_OUTOFPLACE ) 833 write_local_output_buffer_to_gpu(); 834 835 try 836 { 837 size_t buffer_size = 0; 838 EXPECT_EQ( CLFFT_SUCCESS, clfftBakePlan(*plan_handle, 1, &tempQueue, NULL, NULL )); 839 EXPECT_EQ( CLFFT_SUCCESS, clfftGetTmpBufSize(*plan_handle, &buffer_size )); 840 841 if( explicit_intermediate_buffer ) 842 { 843 // the buffer size is already stashed above 844 // now we want to make the intermediate buffer to pass in (if necessary) 845 if (buffer_size) 846 { 847 // because unique_ptrs are funky, we have to create a temp_buffer 848 // and then std::move it to the intermediate_buffer 849 std::unique_ptr< _cl_mem, clMem_deleter > temp_buffer( 850 ::clCreateBuffer( context.get( ), 851 CL_MEM_READ_WRITE, 852 buffer_size, 853 NULL, 854 &status) ); 855 OPENCL_V_THROW( status, "Creating intermediate Buffer ( ::clCreateBuffer() )" ); 856 857 intermediate_buffer = std::move( temp_buffer ); 858 } 859 } 860 861 cl_mem tempInput[2]; 862 cl_mem tempOutput[2]; 863 for( cl_uint i = 0; i < cl_mem_input.size( ); ++i ) 864 tempInput[ i ] = cl_mem_input[ i ].get( ); 865 866 for( cl_uint i = 0; i < cl_mem_output.size( ); ++i ) 867 tempOutput[ i ] = cl_mem_output[ i ].get( ); 868 869 cl_event tevent = NULL; 870 if( buffer_size ) 871 { 872 status = clfftEnqueueTransform(*plan_handle, 873 _transformation_direction, 874 1, 875 &tempQueue, 876 0, 877 NULL, 878 &tevent, 879 &tempInput[ 0 ], 880 &tempOutput[ 0 ], 881 intermediate_buffer.get() ); 882 } 883 else 884 { 885 status = clfftEnqueueTransform(*plan_handle, 886 _transformation_direction, 887 1, 888 &tempQueue, 889 0, 890 NULL, 891 &tevent, 892 &tempInput[ 0 ], 893 &tempOutput[ 0 ], 894 NULL ); 895 } 896 clFinish(tempQueue); 897 tempEvent.reset(tevent); tevent = NULL; 898 899 if( status != CLFFT_SUCCESS ) 900 { 901 throw std::runtime_error(prettyPrintclFFTStatus(status).c_str()); 902 } 903 904 // wait for the kernel call to finish execution 905 const cl_event revent = tempEvent.get(); 906 cl_int wait_status = clWaitForEvents(1, &revent); 907 if( wait_status == CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST ) 908 { 909 cl_int error_code; 910 clGetEventInfo( revent, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &error_code, NULL ); 911 throw std::runtime_error(prettyPrintclFFTStatus(error_code).c_str()); 912 } 913 else if( wait_status != CL_SUCCESS ) 914 { 915 throw std::runtime_error(prettyPrintclFFTStatus(wait_status).c_str()); 916 } 917 } catch(const std::exception &)918 catch (const std::exception& ) { 919 std::cout << "Exception occurred during clfftEnqueueTransform" 920 << __FILE__ << __LINE__ << std::endl; 921 throw; 922 } 923 924 if( in_place() ) { 925 capture_input(); 926 } 927 else { 928 capture_output(); 929 } 930 931 get_layouts(); 932 if( placeness() == CLFFT_INPLACE ) 933 { 934 if( is_real( _input_layout ) && is_hermitian( _output_layout ) ) 935 { 936 input.change_real_to_hermitian( output.strides(), output.distance() ); 937 } 938 else if( is_hermitian( _input_layout ) && is_real( _output_layout ) ) 939 { 940 input.change_hermitian_to_real( output.strides(), output.distance() ); 941 } 942 } 943 944 // there's no way to know if in-place transforms have written in bad places, 945 // because depending on input and output strides, the state of the memory 946 // between points is not necessarily the NaN that we set it to 947 if( _placeness != CLFFT_INPLACE ) 948 { 949 input.make_sure_padding_was_not_overwritten(); 950 output.make_sure_padding_was_not_overwritten(); 951 } 952 } 953 954 /*****************************************************/ maximum_problem_size()955 size_t maximum_problem_size() { 956 int device_index = 0; 957 //N.B. if this class ever needs to support more than one device at once 958 //(i.e., multiple GPUs or CPU+GPU), device index will need to be variable 959 //to choose the device of interest 960 return cl_device_max_memory_to_allocate(device_index)/(sizeof(T)*2); 961 //TODO *2 needs to be either *1 or *2, depending, once real numbers are implemented in clfft 962 } 963 964 /*****************************************************/ number_of_opencl_devices()965 size_t number_of_opencl_devices() { 966 return device_id.size(); 967 } 968 969 970 /*****************************************************/ initialize_failed()971 bool initialize_failed() { 972 return init_failure; 973 } 974 975 /*****************************************************/ dataset_is_too_large_for_device()976 bool dataset_is_too_large_for_device() { 977 return dataset_too_large; 978 } 979 980 /*****************************************************/ input_buffer()981 buffer<T> & input_buffer() 982 { 983 return input; 984 } 985 986 /*****************************************************/ output_buffer()987 buffer<T> & output_buffer() 988 { 989 return output; 990 } 991 992 /*****************************************************/ result()993 buffer<T> & result() 994 { 995 if( placeness() == CLFFT_INPLACE ) 996 return input; 997 else if( placeness() == CLFFT_OUTOFPLACE ) 998 return output; 999 else 1000 throw std::runtime_error( "invalid placeness" ); 1001 } 1002 1003 private: 1004 /*****************************************************/ get_layouts()1005 void get_layouts() { 1006 EXPECT_EQ( CLFFT_SUCCESS, clfftGetLayout( *plan_handle, &_input_layout, &_output_layout ) ); 1007 } 1008 1009 /*****************************************************/ 1010 // after transform() is run: 1011 // if in-place transformation -- the results will be in the input buffer 1012 // otherwise -- the results will be in the output buffer placeness(clfftResultLocation placeness)1013 void placeness( clfftResultLocation placeness ) 1014 { 1015 EXPECT_EQ( CLFFT_SUCCESS, clfftSetResultLocation( *plan_handle, placeness ) ); 1016 } 1017 1018 /*****************************************************/ in_place()1019 bool in_place() { 1020 clfftResultLocation placeness; 1021 clfftGetResultLocation( *plan_handle, &placeness ); 1022 return (placeness == CLFFT_INPLACE) ? true : false; 1023 } 1024 1025 /*****************************************************/ capture_output()1026 void capture_output() { 1027 if( is_planar( output_layout() ) ) { 1028 OPENCL_V_THROW( clEnqueueReadBuffer( queue.get( ), cl_mem_output[REAL].get( ), CL_TRUE, 0, 1029 output.size_in_bytes(), output.real_ptr(), 0, NULL, NULL), "reading output buffer - planar real ( ::clEnqueueReadBuffer() )" ); 1030 OPENCL_V_THROW( clEnqueueReadBuffer( queue.get( ), cl_mem_output[IMAG].get( ), CL_TRUE, 0, 1031 output.size_in_bytes(), output.imag_ptr(), 0, NULL, NULL), "reading output buffer - planar imaginary ( ::clEnqueueReadBuffer() )" ); 1032 } 1033 else if( is_interleaved( output_layout() ) ) { 1034 OPENCL_V_THROW( clEnqueueReadBuffer( queue.get( ), cl_mem_output[0].get( ), CL_TRUE, 0, 1035 output.size_in_bytes(), output.interleaved_ptr(), 0, NULL, NULL), "reading output buffer - interleaved ( ::clEnqueueReadBuffer() )" ); 1036 } 1037 else if( is_real( output_layout() ) ) { 1038 OPENCL_V_THROW( clEnqueueReadBuffer( queue.get( ), cl_mem_output[REAL].get( ), CL_TRUE, 0, 1039 output.size_in_bytes(), output.real_ptr(), 0, NULL, NULL), "reading output buffer - planar real ( ::clEnqueueReadBuffer() )" ); 1040 } 1041 else 1042 { 1043 throw std::runtime_error( "we shouldn't make it here [capture_output()]" ); 1044 } 1045 } 1046 1047 /*****************************************************/ capture_input()1048 void capture_input() { 1049 if( is_planar( input_layout() ) ) { 1050 OPENCL_V_THROW( clEnqueueReadBuffer( queue.get( ), cl_mem_input[REAL].get( ), CL_TRUE, 0, 1051 input.size_in_bytes(), input.real_ptr(), 0, NULL, NULL), "reading input buffer - planar real ( ::clEnqueueReadBuffer() )" ); 1052 OPENCL_V_THROW( clEnqueueReadBuffer( queue.get( ), cl_mem_input[IMAG].get( ), CL_TRUE, 0, 1053 input.size_in_bytes(), input.imag_ptr(), 0, NULL, NULL), "reading input buffer - planar imaginary ( ::clEnqueueReadBuffer() )" ); 1054 } 1055 else if( is_interleaved ( input_layout() ) ) { 1056 OPENCL_V_THROW( clEnqueueReadBuffer( queue.get( ), cl_mem_input[0].get( ), CL_TRUE, 0, 1057 input.size_in_bytes(), input.interleaved_ptr(), 0, NULL, NULL), "reading input buffer - interleaved ( ::clEnqueueReadBuffer() )" ); 1058 } 1059 else if( is_real( input_layout() ) ) { 1060 OPENCL_V_THROW( clEnqueueReadBuffer( queue.get( ), cl_mem_input[REAL].get( ), CL_TRUE, 0, 1061 input.size_in_bytes(), input.real_ptr(), 0, NULL, NULL), "reading input buffer - planar real ( ::clEnqueueReadBuffer() )" ); 1062 } 1063 else 1064 { 1065 throw std::runtime_error( "we shouldn't make it here [capture_input()]" ); 1066 } 1067 } 1068 1069 /*****************************************************/ write_local_output_buffer_to_gpu()1070 void write_local_output_buffer_to_gpu() { 1071 if( is_planar( output_layout() ) ) { 1072 OPENCL_V_THROW( clEnqueueWriteBuffer( queue.get( ), cl_mem_output[REAL].get( ), CL_TRUE, 0, 1073 output.size_in_bytes(), output.real_ptr(), 0, NULL, NULL), "writing output buffer - planar real ( ::clEnqueueWriteBuffer() )" ); 1074 OPENCL_V_THROW( clEnqueueWriteBuffer( queue.get( ), cl_mem_output[IMAG].get( ), CL_TRUE, 0, 1075 output.size_in_bytes(), output.imag_ptr(), 0, NULL, NULL), "writing output buffer - planar imaginary ( ::clEnqueueWriteBuffer() )" ); 1076 } 1077 else if( is_interleaved ( output_layout() ) ) { 1078 OPENCL_V_THROW( clEnqueueWriteBuffer( queue.get( ), cl_mem_output[0].get( ), CL_TRUE, 0, 1079 output.size_in_bytes(), output.interleaved_ptr(), 0, NULL, NULL), "writing output buffer - interleaved ( ::clEnqueueWriteBuffer() )" ); 1080 } 1081 else if( is_real( output_layout() ) ) { 1082 OPENCL_V_THROW( clEnqueueWriteBuffer( queue.get( ), cl_mem_output[REAL].get( ), CL_TRUE, 0, 1083 output.size_in_bytes(), output.real_ptr(), 0, NULL, NULL), "writing output buffer - planar real ( ::clEnqueueWriteBuffer() )" ); 1084 } 1085 else 1086 { 1087 throw std::runtime_error( "we shouldn't make it here [write_local_output_buffer_to_gpu()]" ); 1088 } 1089 } 1090 1091 /*****************************************************/ write_local_input_buffer_to_gpu()1092 void write_local_input_buffer_to_gpu() { 1093 if( is_planar( input_layout() ) ) { 1094 OPENCL_V_THROW( clEnqueueWriteBuffer( queue.get( ), cl_mem_input[REAL].get( ), CL_TRUE, 0, 1095 input.size_in_bytes(), input.real_ptr(), 0, NULL, NULL), "writing input buffer - planar real ( ::clEnqueueWriteBuffer() )" ); 1096 OPENCL_V_THROW( clEnqueueWriteBuffer( queue.get( ), cl_mem_input[IMAG].get( ), CL_TRUE, 0, 1097 input.size_in_bytes(), input.imag_ptr(), 0, NULL, NULL), "writing input buffer - planar imaginary ( ::clEnqueueWriteBuffer() )" ); 1098 } 1099 else if( is_interleaved( input_layout() ) ) { 1100 OPENCL_V_THROW( clEnqueueWriteBuffer( queue.get( ), cl_mem_input[0].get( ), CL_TRUE, 0, 1101 input.size_in_bytes(), input.interleaved_ptr(), 0, NULL, NULL), "writing input buffer - interleaved ( ::clEnqueueWriteBuffer() )" ); 1102 } 1103 else if( is_real( input_layout() ) ) { 1104 OPENCL_V_THROW( clEnqueueWriteBuffer( queue.get( ), cl_mem_input[REAL].get( ), CL_TRUE, 0, 1105 input.size_in_bytes(), input.real_ptr(), 0, NULL, NULL), "writing input buffer - planar real ( ::clEnqueueWriteBuffer() )" ); 1106 } 1107 else 1108 { 1109 throw std::runtime_error( "we shouldn't make it here [write_local_input_buffer_to_gpu()]" ); 1110 } 1111 } 1112 1113 1114 /*****************************************************/ cl_device_max_memory_to_allocate(size_t device_index)1115 cl_ulong cl_device_max_memory_to_allocate(size_t device_index) { 1116 if( number_of_opencl_devices() == 0 || device_index > number_of_opencl_devices() ) 1117 { 1118 return 0; 1119 } 1120 else 1121 { 1122 cl_ulong device_max_to_allocate = 0; 1123 OPENCL_V_THROW( ::clGetDeviceInfo( device_id[device_index], CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof( cl_ulong ), &device_max_to_allocate, NULL ), 1124 "Getting CL_DEVICE_MAX_MEM_ALLOC_SIZE device info ( ::clGetDeviceInfo() )" ); 1125 1126 return device_max_to_allocate; 1127 } 1128 } 1129 1130 1131 /*****************************************************/ cl_device_max_global_memory(size_t device_index)1132 cl_ulong cl_device_max_global_memory(size_t device_index) { 1133 if( number_of_opencl_devices() == 0 || device_index > number_of_opencl_devices() ) 1134 { 1135 return 0; 1136 } 1137 else 1138 { 1139 cl_ulong global_mem_size = 0; 1140 OPENCL_V_THROW( ::clGetDeviceInfo( device_id[device_index], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof( cl_ulong ), &global_mem_size, NULL ), 1141 "Getting CL_DEVICE_GLOBAL_MEM_SIZE device info ( ::clGetDeviceInfo() )" ); 1142 1143 return global_mem_size; 1144 } 1145 } 1146 1147 #if defined(PERSISTENT_PLANS_FEATURE_HAS_BEEN_DEFEATURED_WHICH_MEANS_IT_IS_NO_LONGER_A_FEATURE) 1148 /*****************************************************/ write_plan_to_file(std::string filename)1149 void write_plan_to_file(std::string filename) 1150 { 1151 cl_command_queue tempQueue = queue.get( ); 1152 EXPECT_EQ( CLFFT_SUCCESS, clfftBakePlan(*plan_handle, 1, &tempQueue, NULL, NULL )); 1153 // we need to make sure the plan is baked before we write it out, or we won't get any juicy binaries along with it 1154 1155 clfftWritePlanToDisk(*plan_handle, filename.c_str()); 1156 } 1157 1158 /*****************************************************/ read_plan_from_file(std::string filename)1159 void read_plan_from_file(std::string filename) 1160 { 1161 clfftReadPlanFromDisk( *plan_handle, filename.c_str() ); 1162 1163 // if we've changed from the default for input and output layouts, we need to re-set the layouts to make sure buffers get set up completely 1164 set_layouts( input_layout(), output_layout() ); 1165 } 1166 #endif 1167 }; 1168 1169 #endif 1170