1 /* ************************************************************************
2  * Copyright 2013 Advanced Micro Devices, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  * ************************************************************************/
16 
17 
18 #pragma once
19 #if !defined( CLFFT_CLTRANSFORM_H )
20 #define CLFFT_CLTRANSFORM_H
21 
22 #include <iostream>
23 #include <vector>
24 #include "clFFT.h"
25 #include "../library/private.h"
26 #include "../client/openCL.misc.h"
27 #include "buffer.h"
28 #include "test_constants.h"
29 
30 //	Custom deleter functions for our unique_ptr smart pointer class
31 struct clMem_deleter
32 {
operatorclMem_deleter33 	template <class T> void operator()(T* clMemObj)
34 	{
35 		if( clMemObj != NULL )
36 			OPENCL_V_THROW( ::clReleaseMemObject( clMemObj ), "Error: In clReleaseMemObject\n" );
37 	};
38 };
39 
40 struct plan_handle_deleter
41 {
operatorplan_handle_deleter42 	template <class T> void operator()(T* handle)
43 	{
44 		if( *handle )
45 		{
46 			clfftDestroyPlan( handle );
47 		}
48 		clfftTeardown( ); // when multi-GPU tests are written, this will need to occur in the gtest cleanup
49 	};
50 };
51 
// Deleter that releases an OpenCL event.
struct clEvent_deleter
{
	// Calls clReleaseEvent on clEventObj; a null pointer is ignored.
	// The release status is checked by OPENCL_V_THROW.
	template <class T>
	void operator()( T* clEventObj )
	{
		if( clEventObj == NULL )
			return;

		OPENCL_V_THROW( clReleaseEvent( clEventObj ), "Error: In clReleaseEvent\n" );
	}
};
60 
// Deleter that releases an OpenCL command queue.
struct clCommQueue_deleter
{
	// Calls clReleaseCommandQueue on clQueueObj; a null pointer is ignored.
	// The release status is checked by OPENCL_V_THROW.
	template <class T>
	void operator()( T* clQueueObj )
	{
		if( clQueueObj == NULL )
			return;

		OPENCL_V_THROW( clReleaseCommandQueue( clQueueObj ), "Error: In clReleaseCommandQueue\n" );
	}
};
69 
// Deleter that releases an OpenCL context.
struct clContext_deleter
{
	// Calls clReleaseContext on clContextObj; a null pointer is ignored.
	// The release status is checked by OPENCL_V_THROW.
	template <class T>
	void operator()( T* clContextObj )
	{
		if( clContextObj == NULL )
			return;

		OPENCL_V_THROW( clReleaseContext( clContextObj ), "Error: In clReleaseContext\n" );
	}
};
78 
79 template <class T>
80 class Precision_Setter
81 {
82 public:
Precision_Setter(clfftPlanHandle plan_handle)83     Precision_Setter(clfftPlanHandle plan_handle)
84     {
85         throw std::runtime_error("Precision_Setter: this code path should never be executed");
86     }
87 
88 private:
Precision_Setter()89     Precision_Setter(){}
90 };
91 
92 template<>
93 class Precision_Setter<float>
94 {
95 public:
Precision_Setter(clfftPlanHandle plan_handle)96     Precision_Setter(clfftPlanHandle plan_handle)
97     {
98 	    EXPECT_EQ( CLFFT_SUCCESS, clfftSetPlanPrecision( plan_handle, CLFFT_SINGLE ));
99     }
100 
101 private:
Precision_Setter()102     Precision_Setter(){}
103 };
104 
105 template<>
106 class Precision_Setter<double>
107 {
108 public:
Precision_Setter(clfftPlanHandle plan_handle)109     Precision_Setter(clfftPlanHandle plan_handle)
110     {
111 		clfftStatus ret = clfftSetPlanPrecision( plan_handle, CLFFT_DOUBLE );
112 
113 		//	If device does not support double precision, skip this test, don't fail it
114 		if( ret == CLFFT_DEVICE_NO_DOUBLE )
115 			throw std::runtime_error("CLFFT_DEVICE_NO_DOUBLE");
116 
117 		EXPECT_EQ( CLFFT_SUCCESS, ret );
118     }
119 
120 private:
Precision_Setter()121     Precision_Setter(){}
122 };
123 
124  /*@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@*/
125  /*@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@*/
126 template <class T, class cl_T>
127 class clfft {
128 private:
129 	clfftLayout _input_layout, _output_layout;
130 	clfftResultLocation _placeness;
131 
132 	buffer<T> input;
133 	buffer<T> output;
134 
135 	size_t number_of_data_points;
136 	T _forward_scale, _backward_scale;
137 	cl_uint commandQueueFlags;
138 	bool init_failure;
139 	bool dataset_too_large;
140 
141 	cl_device_type deviceType;
142 	std::unique_ptr< clfftPlanHandle, plan_handle_deleter > plan_handle;
143 
144 	clfftDirection _transformation_direction;
145 	clfftDim dimension;
146 
147 	std::vector<size_t> lengths;
148 
149 	static const bool printInfo = false;
150 
151 	//	OpenCL resources that need to be carefully managed
152 	std::unique_ptr< _cl_context, clContext_deleter > context;
153 	std::unique_ptr< _cl_command_queue, clCommQueue_deleter > queue;
154 	std::vector< std::unique_ptr< _cl_mem, clMem_deleter > > cl_mem_input;
155 	std::vector< std::unique_ptr< _cl_mem, clMem_deleter > > cl_mem_output;
156 	std::vector< cl_device_id >	device_id;
157 public:
158 	/*****************************************************/
clfft(const clfftDim dimensions_in,const size_t * lengths_in,const size_t * input_strides_in,const size_t * output_strides_in,const size_t batch_size_in,const size_t input_distance_in,const size_t output_distance_in,const clfftLayout input_layout_in,const clfftLayout output_layout_in,const clfftResultLocation placeness_in)159 	clfft(  const clfftDim dimensions_in, const size_t* lengths_in,
160 			const size_t* input_strides_in, const size_t* output_strides_in,
161 			const size_t batch_size_in,
162 			const size_t input_distance_in, const size_t output_distance_in,
163 			const clfftLayout input_layout_in, const clfftLayout output_layout_in,
164 			const clfftResultLocation placeness_in )
165 		try
166 		: _input_layout( input_layout_in )
167 		, _output_layout( output_layout_in )
168 		, _placeness( placeness_in )
169 		, input( 	static_cast<size_t>(dimensions_in),
170 					lengths_in,
171 					input_strides_in,
172 					batch_size_in,
173 					input_distance_in,
174 					cl_layout_to_buffer_layout( _input_layout ),
175 					_placeness
176 				)
177 		, output(	static_cast<size_t>(dimensions_in),
178 					lengths_in,
179 					output_strides_in,
180 					batch_size_in,
181 					output_distance_in,
182 					cl_layout_to_buffer_layout( _output_layout ),
183 					_placeness
184 				)
185 		, number_of_data_points( input.number_of_data_points())
186 		, _forward_scale( 1.0f )
187 		, _backward_scale( 1.0f/T(number_of_data_points) )
188 		, commandQueueFlags( 0 )
189 		, init_failure( false )
190 		, dataset_too_large( false )
191 		, deviceType( 0 )
192 		, plan_handle( new clfftPlanHandle )
193 		, _transformation_direction( ENDDIRECTION )
194 		, dimension( dimensions_in )
195 
196 	{
197 		if( _placeness == CLFFT_INPLACE )
198 		{
199 			if( ( is_real( _input_layout ) && is_planar( _output_layout ) ) ||
200 				( is_planar( _input_layout ) && is_real( _output_layout ) ) )
201 			{
202 				throw std::runtime_error( "in-place transforms may not be real<->planar" );
203 			}
204 		}
205 
206 		*plan_handle = 0;
207 		clfftSetupData setupData;
208 		clfftInitSetupData( &setupData );
209 		clfftSetup( &setupData );
210 
211 		for( int i = 0; i < max_dimension; i++ )
212 		{
213 			if( i < dimension )
214 				lengths.push_back( lengths_in[i] );
215 			else
216 				lengths.push_back( 1 );
217 		}
218 
219 		initialize_openCL();
220 		initialize_plan();
221 	}
catch(const std::exception &)222 	catch( const std::exception& ) {
223 		throw;
224 	}
225 
226 	/*****************************************************/
~clfft()227 	~clfft()
228 	{}
229 
230 	/*****************************************************/
is_real(const clfftLayout layout)231 	bool is_real( const clfftLayout layout )
232 	{
233 		return layout == CLFFT_REAL;
234 	}
235 
236 	/*****************************************************/
is_planar(const clfftLayout layout)237 	bool is_planar( const clfftLayout layout )
238 	{
239 		return (layout == CLFFT_COMPLEX_PLANAR || layout == CLFFT_HERMITIAN_PLANAR);
240 	}
241 
242 	/*****************************************************/
is_interleaved(const clfftLayout layout)243 	bool is_interleaved( const clfftLayout layout )
244 	{
245 		return (layout == CLFFT_COMPLEX_INTERLEAVED || layout == CLFFT_HERMITIAN_INTERLEAVED);
246 	}
247 
248 	/*****************************************************/
is_complex(const clfftLayout layout)249 	bool is_complex( const clfftLayout layout )
250 	{
251 		return (layout == CLFFT_COMPLEX_INTERLEAVED || layout == CLFFT_COMPLEX_PLANAR);
252 	}
253 
254 	/*****************************************************/
is_hermitian(const clfftLayout layout)255 	bool is_hermitian( const clfftLayout layout )
256 	{
257 		return (layout == CLFFT_HERMITIAN_INTERLEAVED || layout == CLFFT_HERMITIAN_PLANAR);
258 	}
259 
260 	/*****************************************************/
initialize_openCL()261 	void initialize_openCL() {
262 		try
263 		{
264 			cl_context tempContext = NULL;
265 			device_id = initializeCL(
266 				g_device_type,
267 				g_device_id,
268 				g_platform_id,
269 				tempContext,
270 				printInfo
271 			);
272 			context = std::unique_ptr< _cl_context, clContext_deleter >( tempContext );
273 
274 			if( input.size_in_bytes() > cl_device_max_memory_to_allocate(0) ||
275 				output.size_in_bytes() > cl_device_max_memory_to_allocate(0))
276 			{
277 				throw std::runtime_error("problem too large for device");
278 			}
279 
280 			cl_int status = 0;
281 			queue	= std::unique_ptr< _cl_command_queue, clCommQueue_deleter >(
282 					::clCreateCommandQueue( context.get( ), device_id[ 0 ], commandQueueFlags, &status ) );
283 			OPENCL_V_THROW( status, "Creating Command Queue ( ::clCreateCommandQueue() )" );
284 
285 			// make the new buffer
286 			const size_t bufferSizeBytes = input.size_in_bytes( );
287 
288 			for( cl_int i = 0; i < CLFFT_COMPLEX_INTERLEAVED; ++i )
289 			{
290 				cl_int status = 0;
291 				std::unique_ptr< _cl_mem, clMem_deleter > inBuff(
292 						::clCreateBuffer( context.get( ), CL_MEM_READ_WRITE, bufferSizeBytes, NULL, &status) );
293 				OPENCL_V_THROW( status, "Creating Buffer ( ::clCreateBuffer() )" );
294 
295 				cl_mem_input.push_back( std::move( inBuff ) );
296 
297 				std::unique_ptr< _cl_mem, clMem_deleter > outBuff(
298 						::clCreateBuffer( context.get( ), CL_MEM_READ_WRITE, bufferSizeBytes, NULL, &status) );
299 				OPENCL_V_THROW( status, "Creating Buffer ( ::clCreateBuffer() )" );
300 
301 				cl_mem_output.push_back( std::move( outBuff ) );
302 			}
303 		}
304 		catch( const std::exception& )
305 		{
306 			throw;
307 		}
308 	}
309 
310 	/*****************************************************/
initialize_plan()311 	void initialize_plan()
312 	{
313 		EXPECT_EQ( CLFFT_SUCCESS, clfftCreateDefaultPlan( plan_handle.get(), context.get( ), dimension, &lengths[0] ) );
314 		set_layouts( _input_layout, _output_layout );
315 		placeness( _placeness );
316 		EXPECT_EQ( CLFFT_SUCCESS, clfftSetPlanInStride( *plan_handle, dimension, input.strides()));
317 		EXPECT_EQ( CLFFT_SUCCESS, clfftSetPlanOutStride( *plan_handle, dimension, output.strides()));
318 		EXPECT_EQ( CLFFT_SUCCESS, clfftSetPlanBatchSize( *plan_handle, input.batch_size()));
319 		EXPECT_EQ( CLFFT_SUCCESS, clfftSetPlanDistance( *plan_handle, input.distance(), output.distance()));
320 		Precision_Setter<T> setter(*plan_handle);
321 	}
322 
323 	/*****************************************************/
input_strides_plaintext()324 	std::string input_strides_plaintext()
325 	{
326 		size_t strides[3];
327 		clfftGetPlanInStride( *plan_handle, dimension, &strides[0] );
328 
329 		std::ostringstream my_strides_stream;
330 
331 		for( int i = 0; i < dimension; i++ )
332 			my_strides_stream << strides[i] << " ";
333 
334 		std::string my_strides( my_strides_stream.str() );
335 		my_strides.erase( my_strides.end() - 1 ); // chomp off trailing space
336 
337 		return my_strides;
338 	}
339 
340 	/*****************************************************/
output_strides_plaintext()341 	std::string output_strides_plaintext()
342 	{
343 		size_t strides[3];
344 		clfftGetPlanOutStride( *plan_handle, dimension, &strides[0] );
345 
346 		std::ostringstream my_strides_stream;
347 
348 		for( int i = 0; i < dimension; i++ )
349 			my_strides_stream << strides[i] << " ";
350 
351 		std::string my_strides( my_strides_stream.str() );
352 		my_strides.erase( my_strides.end() - 1 ); // chomp off trailing space
353 
354 		return my_strides;
355 	}
356 
357 	/*****************************************************/
lengths_plaintext()358 	std::string lengths_plaintext()
359 	{
360 		size_t lengths[3];
361 		clfftGetPlanLength( *plan_handle, dimension, &lengths[0] );
362 
363 		std::ostringstream my_lengths_stream;
364 
365 		for( int i = 0; i < dimension; i++ )
366 			my_lengths_stream << lengths[i] << " ";
367 
368 		std::string my_lengths( my_lengths_stream.str() );
369 		my_lengths.erase( my_lengths.end() - 1 ); // chomp off trailing space
370 
371 		return my_lengths;
372 	}
373 
374 	/*****************************************************/
layout_plaintext(clfftLayout layout)375 	std::string layout_plaintext( clfftLayout layout )
376 	{
377 		switch( layout )
378 		{
379 		case CLFFT_REAL:
380 			return "real";
381 		case CLFFT_HERMITIAN_INTERLEAVED:
382 			return "hermitian interleaved";
383 		case CLFFT_HERMITIAN_PLANAR:
384 			return "hermitian planar";
385 		case CLFFT_COMPLEX_INTERLEAVED:
386 			return "complex interleaved";
387 		case CLFFT_COMPLEX_PLANAR:
388 			return "complex planar";
389 		default:
390 			throw std::runtime_error( "invalid layout in layout_plaintext()" );
391 		}
392 	}
393 
394 	/*****************************************************/
refresh_plan()395 	void refresh_plan()
396 	{
397 		clfftDestroyPlan(plan_handle.get());
398 		initialize_plan();
399 	}
400 
401 	/*****************************************************/
cl_layout_to_buffer_layout(clfftLayout cl_layout)402 	layout::buffer_layout_t cl_layout_to_buffer_layout( clfftLayout cl_layout )
403 	{
404 		if( cl_layout == CLFFT_REAL )
405 			return layout::real;
406 		else if( cl_layout == CLFFT_HERMITIAN_PLANAR )
407 			return layout::hermitian_planar;
408 		else if( cl_layout == CLFFT_COMPLEX_PLANAR )
409 			return layout::complex_planar;
410 		else if( cl_layout == CLFFT_HERMITIAN_INTERLEAVED )
411 			return layout::hermitian_interleaved;
412 		else if( cl_layout == CLFFT_COMPLEX_INTERLEAVED )
413 			return layout::complex_interleaved;
414 		else
415 			throw std::runtime_error( "invalid cl_layout" );
416 	}
417 
418 	/*****************************************************/
verbose_output()419 	void verbose_output()
420 	{
421 		if(verbose)
422 		{
423 			std::cout << "transform parameters as seen by clfft:" << std::endl;
424 
425 			clfftDim dim;
426 			cl_uint dimensions;
427 			clfftGetPlanDim( *plan_handle, &dim, &dimensions );
428 
429 			std::cout << dimensions << " dimension(s): " << lengths_plaintext() << std::endl;
430 
431 			size_t batch;
432 			clfftGetPlanBatchSize( *plan_handle, &batch );
433 			std::cout << "batch: " << batch << std::endl;
434 
435 			clfftPrecision precision;
436 			clfftGetPlanPrecision( *plan_handle, &precision );
437 			if( precision == CLFFT_SINGLE ) std::cout << "single precision" << std::endl;
438 			else if( precision == CLFFT_DOUBLE ) std::cout << "double precision" << std::endl;
439 			else throw std::runtime_error( "can't figure out the precision in verbose_output()" );
440 
441 			if( placeness() == CLFFT_INPLACE ) std::cout << "in-place" << std::endl;
442 			else std::cout << "out-of-place" << std::endl;
443 
444 			get_layouts();
445 			std::cout << layout_plaintext(_input_layout) << " -> " << layout_plaintext(_output_layout) << std::endl;
446 
447 			std::cout << "input stride(s): " << input_strides_plaintext() << std::endl;
448 			std::cout << "output stride(s): " << output_strides_plaintext() << std::endl;
449 
450 			size_t input_distance, output_distance;
451 			clfftGetPlanDistance( *plan_handle, &input_distance, &output_distance );
452 			std::cout << "input distance: " << input_distance << std::endl;
453 			std::cout << "output distance: " << output_distance << std::endl;
454 		}
455 	}
456 
457 	/*****************************************************/
placeness()458 	clfftResultLocation placeness() {
459 		clfftResultLocation res;
460 		EXPECT_EQ( CLFFT_SUCCESS, clfftGetResultLocation( *plan_handle, &res ) );
461 		return res;
462 	}
463 
464 	/*****************************************************/
set_forward_transform()465 	void set_forward_transform() {
466 		_transformation_direction = CLFFT_FORWARD;
467 	}
468 
469 	/*****************************************************/
set_backward_transform()470 	void set_backward_transform() {
471 		_transformation_direction = CLFFT_BACKWARD;
472 	}
473 
474 	/*****************************************************/
set_transposed()475 	void set_transposed() {
476 		EXPECT_EQ( CLFFT_SUCCESS, clfftSetPlanTransposeResult( *plan_handle, CLFFT_TRANSPOSED ) );
477 	}
478 
479 	/*****************************************************/
set_layouts(clfftLayout new_input_layout,clfftLayout new_output_layout)480 	void set_layouts( clfftLayout new_input_layout, clfftLayout new_output_layout )
481 	{
482 		cl_mem_input.clear( );
483 		cl_mem_output.clear( );
484 
485 		// make the new input buffer
486 		const size_t input_buffer_size_in_bytes = input.size_in_bytes();
487 
488 		size_t number_of_input_buffers;
489 
490 		if( is_planar( new_input_layout ) )
491 			number_of_input_buffers = 2;
492 		else if( is_real( new_input_layout ) || is_interleaved( new_input_layout ) )
493 			number_of_input_buffers = 1;
494 		else
495 			throw std::runtime_error( "we shouldn't make it here [set_layouts(), input]" );
496 
497 		for( size_t i = 0; i < number_of_input_buffers; ++i )
498 		{
499 			cl_int status = 0;
500 			std::unique_ptr< _cl_mem, clMem_deleter > buff(
501 				::clCreateBuffer( context.get( ), CL_MEM_READ_WRITE, input_buffer_size_in_bytes, NULL, &status) );
502 			OPENCL_V_THROW( status, "Creating Buffer ( ::clCreateBuffer() )" );
503 
504 			cl_mem_input.push_back( std::move( buff ) );
505 		}
506 
507 		// make the new output buffer
508 		const size_t output_buffer_size_in_bytes = output.size_in_bytes();
509 
510 		size_t number_of_output_buffers;
511 
512 		if( is_planar( new_output_layout ) )
513 			number_of_output_buffers = 2;
514 		else if( is_real( new_output_layout ) || is_interleaved( new_output_layout ) )
515 			number_of_output_buffers = 1;
516 		else
517 			throw std::runtime_error( "we shouldn't make it here [set_layouts(), input]" );
518 
519 		for( size_t i = 0; i < number_of_output_buffers; ++i )
520 		{
521 			cl_int status = 0;
522 			std::unique_ptr< _cl_mem, clMem_deleter > buff(
523 				::clCreateBuffer( context.get( ), CL_MEM_READ_WRITE, output_buffer_size_in_bytes, NULL, &status) );
524 			OPENCL_V_THROW( status, "Creating Buffer ( ::clCreateBuffer() )" );
525 
526 			cl_mem_output.push_back( std::move( buff ) );
527 		}
528 
529 		EXPECT_EQ( CLFFT_SUCCESS, clfftSetLayout( *plan_handle, new_input_layout, new_output_layout ) );
530 		get_layouts();
531 	}
532 
533 	/*****************************************************/
534 	// swap_layouts should only be used with in-place real-to-complex or complex-to-real transforms
swap_layouts()535 	void swap_layouts()
536 	{
537 		get_layouts();
538 		clfftLayout new_input_layout = _output_layout;
539 		clfftLayout new_output_layout = _input_layout;
540 
541 		EXPECT_EQ( CLFFT_SUCCESS, clfftSetLayout( *plan_handle, new_input_layout, new_output_layout ) );
542 		get_layouts();
543 
544 		refresh_plan();
545 	}
546 
547 	/*****************************************************/
input_layout()548 	clfftLayout input_layout() {
549 		get_layouts();
550 		return _input_layout;
551 	}
552 
553 	/*****************************************************/
output_layout()554 	clfftLayout output_layout() {
555 		get_layouts();
556 		return _output_layout;
557 	}
558 
559 	/*****************************************************/
forward_scale(T in)560 	void forward_scale( T in ) {
561 		EXPECT_EQ( CLFFT_SUCCESS, clfftSetPlanScale( *plan_handle, CLFFT_FORWARD, static_cast<float>( in ) ) );
562 		_forward_scale = forward_scale();
563 	}
564 
565 	/*****************************************************/
backward_scale(T in)566 	void backward_scale( T in ) {
567 		EXPECT_EQ( CLFFT_SUCCESS, clfftSetPlanScale( *plan_handle, CLFFT_BACKWARD, static_cast<float>( in ) ) );
568 		_backward_scale = backward_scale();
569 	}
570 
571 	/*****************************************************/
forward_scale()572 	T forward_scale() {
573 		cl_T scale;
574 		EXPECT_EQ( CLFFT_SUCCESS, clfftGetPlanScale( *plan_handle, CLFFT_FORWARD, reinterpret_cast<cl_float*>(&scale) ));
575 		return scale;
576 	}
577 
578 	/*****************************************************/
backward_scale()579 	T backward_scale() {
580 		cl_T scale;
581 		EXPECT_EQ( CLFFT_SUCCESS, clfftGetPlanScale( *plan_handle, CLFFT_BACKWARD, reinterpret_cast<cl_float*>(&scale) ));
582 		return scale;
583 	}
584 
585 	/*****************************************************/
set_input_to_value(T real)586 	void set_input_to_value( T real )
587 	{
588 		input.set_all_to_value( real );
589 	}
590 
591 	/*****************************************************/
set_input_to_value(T real,T imag)592 	void set_input_to_value( T real, T imag )
593 	{
594 		input.set_all_to_value( real, imag );
595 	}
596 
597 	/*****************************************************/
set_input_to_sawtooth(T max)598 	void set_input_to_sawtooth(T max) {
599 		input.set_all_to_sawtooth(max);
600 	}
601 
602 	/*****************************************************/
set_input_to_impulse()603 	void set_input_to_impulse() {
604 		input.set_all_to_impulse();
605 	}
606 
607 	/*****************************************************/
	// yes, the "super duper global seed" is horrible;
	// alas, I'll have to do it better later (TODO)
set_input_to_random()610 	void set_input_to_random()
611 	{
612 		input.set_all_to_random_data( 10, super_duper_global_seed );
613 	}
614 
615 	/*****************************************************/
set_input_to_buffer(buffer<T> other_buffer)616 	void set_input_to_buffer( buffer<T> other_buffer ) {
617 		input = other_buffer;
618 	}
619 
620 	/*****************************************************/
621 	void set_input_precallback(unsigned int localMemSize = 0) {
622 		cl_int status = 0;
623 		clfftPrecision precision;
624 		clfftGetPlanPrecision( *plan_handle, &precision );
625 
626 		const char* precallbackstr;
627 
628 		if (localMemSize > 0)
629 		{
630 			//Test for LDS in precallback function
631 			precallbackstr = STRINGIFY(PRE_MULVAL_LDS);
632 		}
633 		else
634 		{
635 			if (input.is_interleaved() )
636 			{
637 				precallbackstr = (precision == CLFFT_SINGLE) ? STRINGIFY(PRE_MULVAL) : STRINGIFY(PRE_MULVAL_DP);
638 			}
639 			else if (input.is_planar())
640 			{
641 				precallbackstr = (precision == CLFFT_SINGLE) ? STRINGIFY(PRE_MULVAL_PLANAR) : STRINGIFY(PRE_MULVAL_PLANAR_DP);
642 			}
643 			else if (input.is_real())
644 			{
645 				precallbackstr = (precision == CLFFT_SINGLE) ? STRINGIFY(PRE_MULVAL_REAL) : STRINGIFY(PRE_MULVAL_REAL_DP);
646 			}
647 		}
648 
649 		//precallback user data
650 		buffer<T> userdata( 	static_cast<size_t>(dimension),
651 					input.lengths(),
652 					input.strides(),
653 					input.batch_size(),
654 					input.distance(),
655 					layout::real,
656 					_placeness
657 					);
658 
659 		userdata.set_all_to_random_data(lengths[0], 10);
660 
661 		// make the new buffer
662 		const size_t bufferSizeBytes = userdata.size_in_bytes( );
663 
664 		cl_mem userdataBuff = clCreateBuffer( context.get( ), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, bufferSizeBytes, userdata.real_ptr(), &status);
665 		OPENCL_V_THROW( status, "Creating Buffer ( ::clCreateBuffer() )" );
666 
667 		//Register the callback
668 		OPENCL_V_THROW (clfftSetPlanCallback(*plan_handle, "mulval_pre", precallbackstr, localMemSize, PRECALLBACK, &userdataBuff, 1), "clFFTSetPlanCallback failed");
669 	}
670 
671 		/*****************************************************/
set_input_precallback_userdatatype()672 	void set_input_precallback_userdatatype() {
673 		cl_int status = 0;
674 
675 		const char* precallbackstr = STRINGIFY(PRE_MULVAL_UDT);
676 
677 		size_t totalPts = input.total_number_of_points_including_data_and_intervening();
678 
679 		buffer<T> temp( 	static_cast<size_t>(dimension),
680 					input.lengths(),
681 					input.strides(),
682 					input.batch_size(),
683 					input.distance(),
684 					layout::real,
685 					_placeness
686 					);
687 
688 		temp.set_all_to_random_data(lengths[0], 10);
689 
690 		std::vector<USER_DATA> userdata(totalPts);
691 		size_t the_index;
692 		for( size_t batch = 0; batch < input.batch_size(); batch++)
693 			for( size_t z = 0; z < input.length(dimz); z++)
694 				for( size_t y = 0; y < input.length(dimy); y++)
695 					for( size_t x = 0; x < input.length(dimx); x++)
696 					{
697 						the_index = ( input.stride(dimx) * x + input.stride(dimy) * y + input.stride(dimz) * z + input.distance() * batch );
698 
699 						userdata[the_index].scalar1 = (float)temp.real(x, y, z, batch);
700 						userdata[the_index].scalar2 = 1;
701 					}
702 
703 		cl_mem userdataBuff = clCreateBuffer(context.get( ), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(USER_DATA) * totalPts, (void*)&userdata[0], &status);
704 		OPENCL_V_THROW( status, "Creating Buffer ( ::clCreateBuffer() )" );
705 
706 		//Register the callback
707 		OPENCL_V_THROW (clfftSetPlanCallback(*plan_handle, "mulval_pre", precallbackstr, 0, PRECALLBACK, &userdataBuff, 1), "clFFTSetPlanCallback failed");
708 	}
709 
710 		/*****************************************************/
711 	void set_output_postcallback(unsigned int localMemSize = 0) {
712 		cl_int status = 0;
713 		clfftPrecision precision;
714 		clfftGetPlanPrecision( *plan_handle, &precision );
715 
716 		const char* postcallbackstr;
717 
718 		if (localMemSize > 0)
719 		{
720 			//Test for LDS in postcallback function
721 			postcallbackstr = STRINGIFY(POST_MULVAL_LDS);
722 		}
723 		else
724 		{
725 			if (output.is_interleaved() )
726 			{
727 				postcallbackstr = (precision == CLFFT_SINGLE) ? STRINGIFY(POST_MULVAL) : STRINGIFY(POST_MULVAL_DP);
728 			}
729 			else if (output.is_planar())
730 			{
731 				postcallbackstr = (precision == CLFFT_SINGLE) ? STRINGIFY(POST_MULVAL_PLANAR) : STRINGIFY(POST_MULVAL_PLANAR_DP);
732 			}
733 			else if (output.is_real())
734 			{
735 				postcallbackstr = (precision == CLFFT_SINGLE) ? STRINGIFY(POST_MULVAL_REAL) : STRINGIFY(POST_MULVAL_REAL_DP);
736 			}
737 		}
738 
739 		//post-callback user data
740 		buffer<T> userdata( 	static_cast<size_t>(dimension),
741 					output.lengths(),
742 					output.strides(),
743 					output.batch_size(),
744 					output.distance(),
745 					layout::real,
746 					_placeness
747 					);
748 
749 		userdata.set_all_to_random_data(lengths[0], 10);
750 
751 		// make the new buffer
752 		const size_t bufferSizeBytes = userdata.size_in_bytes( );
753 
754 		cl_mem userdataBuff = clCreateBuffer( context.get( ), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, bufferSizeBytes, userdata.real_ptr(), &status);
755 		OPENCL_V_THROW( status, "Creating Buffer ( ::clCreateBuffer() )" );
756 
757 		//Register the post-callback
758 		OPENCL_V_THROW (clfftSetPlanCallback(*plan_handle, "mulval_post", postcallbackstr, localMemSize, POSTCALLBACK, &userdataBuff, 1), "clFFTSetPlanCallback failed");
759 	}
760 
761 	/*****************************************************/
device_list_has_devices()762 	bool device_list_has_devices() {
763 		return !device_id.empty();
764 	}
765 
766 	/*****************************************************/
767 	// returns true if the memory required for input + output (if applicable) + intermediate (if applicable) buffers
768 	// is too large compared with the OpenCL device's memory size
total_memory_footprint_is_too_large_for_device()769 	bool total_memory_footprint_is_too_large_for_device() {
770 		throw_if_device_list_is_empty();
771 
772 		// In order to call clfftEnqueueTransform, we need to pass naked pointers
773 		cl_command_queue tempQueue = queue.get( );
774 		size_t buffer_size = 0;
775 
776 		EXPECT_EQ( CLFFT_SUCCESS, clfftBakePlan(*plan_handle, 1, &tempQueue, NULL, NULL ));
777 		EXPECT_EQ( CLFFT_SUCCESS, clfftGetTmpBufSize(*plan_handle, &buffer_size ));
778 
779 		cl_ulong total_memory_size = input.size_in_bytes() + buffer_size;
780 
781 		// we are only going to include the result space if the transform is out of place
782 		if( placeness() == CLFFT_OUTOFPLACE )
783 		{
784 			total_memory_size += output.size_in_bytes();
785 		}
786 
787 		cl_ulong global_memory_size = cl_device_max_global_memory(0);
788 
789 		// we don't want to bog down the CPU with ginormous problem sizes
790 		// so we chop the global memory way down to keep things manageable
791 		if( g_device_type == CL_DEVICE_TYPE_CPU )
792 		{
793 			global_memory_size /= 8;
794 		}
795 
796 		return total_memory_size > global_memory_size;
797 	}
798 
799 	/*****************************************************/
throw_if_total_memory_footprint_is_too_large_for_device()800 	void throw_if_total_memory_footprint_is_too_large_for_device()
801 	{
802 		if( total_memory_footprint_is_too_large_for_device() )
803 		{
804 			throw std::runtime_error("problem too large for device");
805 		}
806 	}
807 
808 	/*****************************************************/
throw_if_device_list_is_empty()809 	void throw_if_device_list_is_empty()
810 	{
811 		if( !device_list_has_devices() ) {
812 			throw std::runtime_error("device list is empty at transform");
813 		}
814 	}
815 
816 	/*****************************************************/
817 	void transform(bool explicit_intermediate_buffer = use_explicit_intermediate_buffer) {
818 		verbose_output();
819 
820 		throw_if_device_list_is_empty();
821 
822 		cl_int status;
823 
824 		// In order to call clfftEnqueueTransform, we need to pass naked pointers
825 		cl_command_queue tempQueue = queue.get( );
826 		std::unique_ptr< _cl_event, clEvent_deleter > tempEvent;
827 		std::unique_ptr< _cl_mem, clMem_deleter > intermediate_buffer;
828 
829 		throw_if_total_memory_footprint_is_too_large_for_device();
830 
831 		write_local_input_buffer_to_gpu();
832 		if( placeness() == CLFFT_OUTOFPLACE )
833 			write_local_output_buffer_to_gpu();
834 
835 		try
836 		{
837 			size_t buffer_size = 0;
838 			EXPECT_EQ( CLFFT_SUCCESS, clfftBakePlan(*plan_handle, 1, &tempQueue, NULL, NULL ));
839 			EXPECT_EQ( CLFFT_SUCCESS, clfftGetTmpBufSize(*plan_handle, &buffer_size ));
840 
841 			if( explicit_intermediate_buffer )
842 			{
843 				// the buffer size is already stashed above
844 				// now we want to make the intermediate buffer to pass in (if necessary)
845 				if (buffer_size)
846 				{
847 					// because unique_ptrs are funky, we have to create a temp_buffer
848 					// and then std::move it to the intermediate_buffer
849 					std::unique_ptr< _cl_mem, clMem_deleter > temp_buffer(
850 						::clCreateBuffer( context.get( ),
851 								  CL_MEM_READ_WRITE,
852 								  buffer_size,
853 								  NULL,
854 								  &status) );
855 					OPENCL_V_THROW( status, "Creating intermediate Buffer ( ::clCreateBuffer() )" );
856 
857 					intermediate_buffer = std::move( temp_buffer );
858 				}
859 			}
860 
861 			cl_mem	tempInput[2];
862 			cl_mem	tempOutput[2];
863 			for( cl_uint i = 0; i < cl_mem_input.size( ); ++i )
864 				tempInput[ i ] = cl_mem_input[ i ].get( );
865 
866 			for( cl_uint i = 0; i < cl_mem_output.size( ); ++i )
867 				tempOutput[ i ] = cl_mem_output[ i ].get( );
868 
869 			cl_event tevent = NULL;
870 			if( buffer_size )
871 			{
872 				status = clfftEnqueueTransform(*plan_handle,
873 								  _transformation_direction,
874 								  1,
875 								  &tempQueue,
876 								  0,
877 								  NULL,
878 								  &tevent,
879 								  &tempInput[ 0 ],
880 								  &tempOutput[ 0 ],
881 								  intermediate_buffer.get() );
882 			}
883 			else
884 			{
885 				status = clfftEnqueueTransform(*plan_handle,
886 								  _transformation_direction,
887 								  1,
888 								  &tempQueue,
889 								  0,
890 								  NULL,
891 								  &tevent,
892 								  &tempInput[ 0 ],
893 								  &tempOutput[ 0 ],
894 								  NULL );
895 			}
896             clFinish(tempQueue);
897 			tempEvent.reset(tevent); tevent = NULL;
898 
899 			if( status != CLFFT_SUCCESS )
900 			{
901 				throw std::runtime_error(prettyPrintclFFTStatus(status).c_str());
902 			}
903 
904 			// wait for the kernel call to finish execution
905 			const cl_event revent = tempEvent.get();
906             cl_int wait_status = clWaitForEvents(1, &revent);
907 			if( wait_status == CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST )
908 			{
909 				cl_int error_code;
910 				clGetEventInfo( revent, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &error_code, NULL );
911 				throw std::runtime_error(prettyPrintclFFTStatus(error_code).c_str());
912 			}
913             else if( wait_status != CL_SUCCESS )
914             {
915 				throw std::runtime_error(prettyPrintclFFTStatus(wait_status).c_str());
916             }
917 		}
catch(const std::exception &)918 		catch (const std::exception& ) {
919 			std::cout << "Exception occurred during clfftEnqueueTransform"
920 					  << __FILE__ << __LINE__ << std::endl;
921 			throw;
922 		}
923 
924 		if( in_place() ) {
925 			capture_input();
926 		}
927 		else {
928 			capture_output();
929 		}
930 
931 		get_layouts();
932 		if( placeness() == CLFFT_INPLACE )
933 		{
934 			if( is_real( _input_layout ) && is_hermitian( _output_layout ) )
935 			{
936 				input.change_real_to_hermitian( output.strides(), output.distance() );
937 			}
938 			else if( is_hermitian( _input_layout ) && is_real( _output_layout ) )
939 			{
940 				input.change_hermitian_to_real( output.strides(), output.distance() );
941 			}
942 		}
943 
944 		// there's no way to know if in-place transforms have written in bad places,
945 		// because depending on input and output strides, the state of the memory
946 		// between points is not necessarily the NaN that we set it to
947 		if( _placeness != CLFFT_INPLACE )
948 		{
949 			input.make_sure_padding_was_not_overwritten();
950 			output.make_sure_padding_was_not_overwritten();
951 		}
952 	}
953 
954 	/*****************************************************/
maximum_problem_size()955 	size_t maximum_problem_size() {
956 		int device_index = 0;
957 		//N.B. if this class ever needs to support more than one device at once
958 		//(i.e., multiple GPUs or CPU+GPU), device index will need to be variable
959 		//to choose the device of interest
960 		return cl_device_max_memory_to_allocate(device_index)/(sizeof(T)*2);
961 		//TODO *2 needs to be either *1 or *2, depending, once real numbers are implemented in clfft
962 	}
963 
964 	/*****************************************************/
number_of_opencl_devices()965 	size_t number_of_opencl_devices() {
966 		return device_id.size();
967 	}
968 
969 
970 	/*****************************************************/
initialize_failed()971 	bool initialize_failed() {
972 		return init_failure;
973 	}
974 
975 	/*****************************************************/
dataset_is_too_large_for_device()976 	bool dataset_is_too_large_for_device() {
977 		return dataset_too_large;
978 	}
979 
980 	/*****************************************************/
input_buffer()981 	buffer<T> & input_buffer()
982 	{
983 		return input;
984 	}
985 
986 	/*****************************************************/
output_buffer()987 	buffer<T> & output_buffer()
988 	{
989 		return output;
990 	}
991 
992 	/*****************************************************/
result()993 	buffer<T> & result()
994 	{
995 		if( placeness() == CLFFT_INPLACE )
996 			return input;
997 		else if( placeness() == CLFFT_OUTOFPLACE )
998 			return output;
999 		else
1000 			throw std::runtime_error( "invalid placeness" );
1001 	}
1002 
1003 private:
1004 	/*****************************************************/
	// Asks the plan for its input and output layouts and stores them in
	// _input_layout and _output_layout. The clfftGetLayout() status is checked
	// with EXPECT_EQ, so a failing query is recorded as a test expectation
	// failure rather than thrown.
	void get_layouts() {
		EXPECT_EQ( CLFFT_SUCCESS, clfftGetLayout( *plan_handle, &_input_layout, &_output_layout ) );
	}
1008 
1009 	/*****************************************************/
1010 	// after transform() is run:
1011 	//   if in-place transformation -- the results will be in the input buffer
1012 	//	 otherwise -- the results will be in the output buffer
	// Sets the plan's result location through clfftSetResultLocation(); the
	// returned status is checked with EXPECT_EQ.
	// NOTE: inside the body the parameter name hides this method's own name.
	void placeness( clfftResultLocation placeness )
	{
		EXPECT_EQ( CLFFT_SUCCESS, clfftSetResultLocation( *plan_handle, placeness ) );
	}
1017 
1018 	/*****************************************************/
in_place()1019 	bool in_place() {
1020 		clfftResultLocation placeness;
1021 		clfftGetResultLocation( *plan_handle, &placeness );
1022 		return (placeness == CLFFT_INPLACE) ? true : false;
1023 	}
1024 
1025 	/*****************************************************/
capture_output()1026 	void capture_output() {
1027 		if( is_planar( output_layout() ) ) {
1028 			OPENCL_V_THROW( clEnqueueReadBuffer( queue.get( ), cl_mem_output[REAL].get( ), CL_TRUE, 0,
1029 					output.size_in_bytes(), output.real_ptr(), 0, NULL, NULL), "reading output buffer - planar real ( ::clEnqueueReadBuffer() )" );
1030 			OPENCL_V_THROW( clEnqueueReadBuffer( queue.get( ), cl_mem_output[IMAG].get( ), CL_TRUE, 0,
1031 					output.size_in_bytes(), output.imag_ptr(), 0, NULL, NULL), "reading output buffer - planar imaginary ( ::clEnqueueReadBuffer() )" );
1032 		}
1033 		else if( is_interleaved( output_layout() ) ) {
1034 			OPENCL_V_THROW( clEnqueueReadBuffer( queue.get( ), cl_mem_output[0].get( ), CL_TRUE, 0,
1035 					output.size_in_bytes(), output.interleaved_ptr(), 0, NULL, NULL), "reading output buffer - interleaved ( ::clEnqueueReadBuffer() )" );
1036 		}
1037 		else if( is_real( output_layout() ) ) {
1038 			OPENCL_V_THROW( clEnqueueReadBuffer( queue.get( ), cl_mem_output[REAL].get( ), CL_TRUE, 0,
1039 					output.size_in_bytes(), output.real_ptr(), 0, NULL, NULL), "reading output buffer - planar real ( ::clEnqueueReadBuffer() )" );
1040 		}
1041 		else
1042 		{
1043 			throw std::runtime_error( "we shouldn't make it here [capture_output()]" );
1044 		}
1045 	}
1046 
1047 	/*****************************************************/
capture_input()1048 	void capture_input() {
1049 		if( is_planar( input_layout() ) ) {
1050 			OPENCL_V_THROW( clEnqueueReadBuffer( queue.get( ), cl_mem_input[REAL].get( ), CL_TRUE, 0,
1051 					input.size_in_bytes(), input.real_ptr(), 0, NULL, NULL), "reading input buffer - planar real ( ::clEnqueueReadBuffer() )" );
1052 			OPENCL_V_THROW( clEnqueueReadBuffer( queue.get( ), cl_mem_input[IMAG].get( ), CL_TRUE, 0,
1053 					input.size_in_bytes(), input.imag_ptr(), 0, NULL, NULL), "reading input buffer - planar imaginary ( ::clEnqueueReadBuffer() )" );
1054 		}
1055 		else if( is_interleaved ( input_layout() ) ) {
1056 			OPENCL_V_THROW( clEnqueueReadBuffer( queue.get( ), cl_mem_input[0].get( ), CL_TRUE, 0,
1057 					input.size_in_bytes(), input.interleaved_ptr(), 0, NULL, NULL), "reading input buffer - interleaved ( ::clEnqueueReadBuffer() )" );
1058 		}
1059 		else if( is_real( input_layout() ) ) {
1060 			OPENCL_V_THROW( clEnqueueReadBuffer( queue.get( ), cl_mem_input[REAL].get( ), CL_TRUE, 0,
1061 					input.size_in_bytes(), input.real_ptr(), 0, NULL, NULL), "reading input buffer - planar real ( ::clEnqueueReadBuffer() )" );
1062 		}
1063 		else
1064 		{
1065 			throw std::runtime_error( "we shouldn't make it here [capture_input()]" );
1066 		}
1067 	}
1068 
1069 	/*****************************************************/
write_local_output_buffer_to_gpu()1070 	void write_local_output_buffer_to_gpu() {
1071 		if( is_planar( output_layout() ) ) {
1072 			OPENCL_V_THROW( clEnqueueWriteBuffer( queue.get( ), cl_mem_output[REAL].get( ), CL_TRUE, 0,
1073 					output.size_in_bytes(), output.real_ptr(), 0, NULL, NULL), "writing output buffer - planar real ( ::clEnqueueWriteBuffer() )" );
1074 			OPENCL_V_THROW( clEnqueueWriteBuffer( queue.get( ), cl_mem_output[IMAG].get( ), CL_TRUE, 0,
1075 					output.size_in_bytes(), output.imag_ptr(), 0, NULL, NULL), "writing output buffer - planar imaginary ( ::clEnqueueWriteBuffer() )" );
1076 		}
1077 		else if( is_interleaved ( output_layout() ) ) {
1078 			OPENCL_V_THROW( clEnqueueWriteBuffer( queue.get( ), cl_mem_output[0].get( ), CL_TRUE, 0,
1079 					output.size_in_bytes(), output.interleaved_ptr(), 0, NULL, NULL), "writing output buffer - interleaved ( ::clEnqueueWriteBuffer() )" );
1080 		}
1081 		else if( is_real( output_layout() ) ) {
1082 			OPENCL_V_THROW( clEnqueueWriteBuffer( queue.get( ), cl_mem_output[REAL].get( ), CL_TRUE, 0,
1083 					output.size_in_bytes(), output.real_ptr(), 0, NULL, NULL), "writing output buffer - planar real ( ::clEnqueueWriteBuffer() )" );
1084 		}
1085 		else
1086 		{
1087 			throw std::runtime_error( "we shouldn't make it here [write_local_output_buffer_to_gpu()]" );
1088 		}
1089 	}
1090 
1091 	/*****************************************************/
write_local_input_buffer_to_gpu()1092 	void write_local_input_buffer_to_gpu() {
1093 		if( is_planar( input_layout() ) ) {
1094 			OPENCL_V_THROW( clEnqueueWriteBuffer( queue.get( ), cl_mem_input[REAL].get( ), CL_TRUE, 0,
1095 					input.size_in_bytes(), input.real_ptr(), 0, NULL, NULL), "writing input buffer - planar real ( ::clEnqueueWriteBuffer() )" );
1096 			OPENCL_V_THROW( clEnqueueWriteBuffer( queue.get( ), cl_mem_input[IMAG].get( ), CL_TRUE, 0,
1097 					input.size_in_bytes(), input.imag_ptr(), 0, NULL, NULL), "writing input buffer - planar imaginary ( ::clEnqueueWriteBuffer() )" );
1098 		}
1099 		else if( is_interleaved( input_layout() ) ) {
1100 			OPENCL_V_THROW( clEnqueueWriteBuffer( queue.get( ), cl_mem_input[0].get( ), CL_TRUE, 0,
1101 					input.size_in_bytes(), input.interleaved_ptr(), 0, NULL, NULL), "writing input buffer - interleaved ( ::clEnqueueWriteBuffer() )" );
1102 		}
1103 		else if( is_real( input_layout() ) ) {
1104 			OPENCL_V_THROW( clEnqueueWriteBuffer( queue.get( ), cl_mem_input[REAL].get( ), CL_TRUE, 0,
1105 					input.size_in_bytes(), input.real_ptr(), 0, NULL, NULL), "writing input buffer - planar real ( ::clEnqueueWriteBuffer() )" );
1106 		}
1107 		else
1108 		{
1109 			throw std::runtime_error( "we shouldn't make it here [write_local_input_buffer_to_gpu()]" );
1110 		}
1111 	}
1112 
1113 
1114 	/*****************************************************/
cl_device_max_memory_to_allocate(size_t device_index)1115 	cl_ulong cl_device_max_memory_to_allocate(size_t device_index) {
1116 		if( number_of_opencl_devices() == 0 || device_index > number_of_opencl_devices() )
1117 		{
1118 			return 0;
1119 		}
1120 		else
1121 		{
1122 			cl_ulong device_max_to_allocate = 0;
1123 			OPENCL_V_THROW( ::clGetDeviceInfo( device_id[device_index], CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof( cl_ulong ), &device_max_to_allocate, NULL ),
1124 				"Getting CL_DEVICE_MAX_MEM_ALLOC_SIZE device info ( ::clGetDeviceInfo() )" );
1125 
1126 			return device_max_to_allocate;
1127 		}
1128 	}
1129 
1130 
1131 	/*****************************************************/
cl_device_max_global_memory(size_t device_index)1132 	cl_ulong cl_device_max_global_memory(size_t device_index) {
1133 		if( number_of_opencl_devices() == 0 || device_index > number_of_opencl_devices() )
1134 		{
1135 			return 0;
1136 		}
1137 		else
1138 		{
1139 			cl_ulong global_mem_size = 0;
1140 			OPENCL_V_THROW( ::clGetDeviceInfo( device_id[device_index], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof( cl_ulong ), &global_mem_size, NULL ),
1141 				"Getting CL_DEVICE_GLOBAL_MEM_SIZE device info ( ::clGetDeviceInfo() )" );
1142 
1143 			return global_mem_size;
1144 		}
1145 	}
1146 
1147 	#if defined(PERSISTENT_PLANS_FEATURE_HAS_BEEN_DEFEATURED_WHICH_MEANS_IT_IS_NO_LONGER_A_FEATURE)
1148 	/*****************************************************/
write_plan_to_file(std::string filename)1149 	void write_plan_to_file(std::string filename)
1150 	{
1151 		cl_command_queue tempQueue = queue.get( );
1152 		EXPECT_EQ( CLFFT_SUCCESS, clfftBakePlan(*plan_handle, 1, &tempQueue, NULL, NULL ));
1153 		// we need to make sure the plan is baked before we write it out, or we won't get any juicy binaries along with it
1154 
1155 		clfftWritePlanToDisk(*plan_handle, filename.c_str());
1156 	}
1157 
1158 	/*****************************************************/
read_plan_from_file(std::string filename)1159 	void read_plan_from_file(std::string filename)
1160 	{
1161 		clfftReadPlanFromDisk( *plan_handle, filename.c_str() );
1162 
1163 		// if we've changed from the default for input and output layouts, we need to re-set the layouts to make sure buffers get set up completely
1164 		set_layouts( input_layout(), output_layout() );
1165 	}
1166 	#endif
1167 };
1168 
1169 #endif
1170