1 /* ************************************************************************
2  * Copyright 2013 Advanced Micro Devices, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  * ************************************************************************/
16 
17 
18 #pragma once
19 #if !defined( CLFFT_BUFFER_H )
20 #define CLFFT_BUFFER_H
21 
22 #include <cmath>
23 #include <complex>
24 #include <stdexcept>
25 #include <memory>
26 #include <vector>
27 #include <utility>
28 #include <sstream>
29 #include "../include/clFFT.h"
30 #include "test_constants.h"
31 #include <boost/random.hpp>
32 #include <stdint.h>
33 #include "buffer_memory.h"
34 
35 /*****************************************************/
36 /*****************************************************/
37 template< typename T >
floats_are_about_equal(T a,T b)38 bool floats_are_about_equal( T a, T b) {
39 	// explicit check to see if a and b are both zero-ish . . .
40 	if( fabs(a) < 0.00001f && fabs(b) < 0.00001f) return true;
41 	// . . . and if not, we'll see if they're the same-ish
42 	return ( fabs(a-b) > fabs(a*tolerance) ) ? false : true;
43 }
44 
45 /*@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@*/
46 /*@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@*/
47 struct index_t {
48 	size_t x, y, z, batch;
49 
index_tindex_t50 	index_t( size_t inx, size_t iny, size_t inz, size_t inbatch )
51 		: x(inx)
52 		, y(iny)
53 		, z(inz)
54 		, batch(inbatch)
55 	{}
56 };
57 
58 namespace layout
59 {
60 	// buffer_layout_t will be used to let class buffer know how many instances of buffer_memory to make and their sizes
61 	enum buffer_layout_t
62 	{
63 		real,
64 		complex_interleaved,
65 		complex_planar,
66 		hermitian_interleaved,
67 		hermitian_planar
68 	};
69 }
70 
71 /*@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@*/
72 /*@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@*/
73 template <class T>
74 class buffer {
75 private:
76 	// we need to save the requested length x, because
77 	// if we change the buffer from complex to real,
78 	// (as in a round-trip test) we need to be able to
79 	// get back to the original length of x. in the case
80 	// of an odd transform length, that's not possible
81 	// due to round-off error unless we explicitly save it
82 	size_t _requested_length_x;
83 	size_t _number_of_dimensions;
84 	size_t _batch_size;
85 	size_t _distance;
86 	layout::buffer_layout_t _layout;
87 	clfftResultLocation _placeness;
88 
89 	std::vector< size_t > _lengths;
90 	std::vector< size_t > _strides;
91 	bool _tightly_packed_strides;
92 	bool _tightly_packed_distance;
93 
94 	static const size_t tightly_packed = 0;
95 
96 	// if real or planar:
97 	// _the_buffers[re] will hold the real portion
98 	// _the_buffers[im] will hold the imaginary portion (planar only)
99 	// if interleaved:
100 	// _the_buffers[interleaved] will hold the whole banana
101 	std::vector< buffer_memory< T > > _the_buffers;
102 
103 	enum
104 	{
105 		interleaved = 0,
106 		re = 0, // real
107 		im = 1 // imaginary
108 	};
109 
110 public:
111 	/*****************************************************/
buffer(const size_t dimensions_in,const size_t * lengths_in,const size_t * strides_in,const size_t batch_size_in,const size_t distance_in,const layout::buffer_layout_t layout_in,const clfftResultLocation placeness_in)112 	buffer( const size_t dimensions_in,
113 			const size_t* lengths_in,
114 			const size_t* strides_in,
115 			const size_t batch_size_in,
116 			const size_t distance_in,
117 			const layout::buffer_layout_t layout_in,
118 			const clfftResultLocation placeness_in
119 		  )
120 		: _number_of_dimensions( dimensions_in )
121 		, _batch_size( batch_size_in )
122 		, _distance( distance_in )
123 		, _layout( layout_in )
124 		, _placeness( placeness_in )
125 		, _lengths()
126 		, _strides()
127 		, _the_buffers()
128 	{
129 		initialize_lengths(lengths_in);
130 		initialize_strides(strides_in);
131 		initialize_distance(distance_in);
132 		create_buffer_memory();
133 		clear();
134 	}
135 
136 	/*****************************************************/
~buffer()137 	~buffer()
138 	{}
139 
140 	/*****************************************************/
141 	// this assignment operator only copies _data_.
142 	// it does not change the rest of the buffer information
143 	// and in fact, it requires that the buffer sizes be the same going in
144 	buffer<T> & operator=( buffer<T> & that )
145 	{
146 		if( this->is_real() != that.is_real() ||
147 			this->is_hermitian() != that.is_hermitian() ||
148 			this->is_complex() != that.is_complex() )
149 		{
150 			throw std::runtime_error( "Buffers must be the same layout type for assignment operator" );
151 		}
152 
153 		if( this->_number_of_dimensions != that._number_of_dimensions ||
154 			this->_batch_size != that._batch_size ||
155 			this->_lengths != that._lengths )
156 		{
157 			throw std::runtime_error( "Buffers must be the same size for assignment operator" );
158 		}
159 
160 		if( this->is_real() )
161 		{
162 			for( size_t batch = 0; batch < batch_size(); batch++ ) {
163 				for( size_t z = 0; z < length(dimz); z++ ) {
164 					for( size_t y = 0; y < length(dimy); y++ ) {
165 						for( size_t x = 0; x < length(dimx); x++ ) {
166 							this->set_one_data_point( that.real(x,y,z,batch), x, y, z, batch );
167 						}
168 					}
169 				}
170 			}
171 		}
172 		else
173 		{
174 			for( size_t batch = 0; batch < batch_size(); batch++ ) {
175 				for( size_t z = 0; z < length(dimz); z++ ) {
176 					for( size_t y = 0; y < length(dimy); y++ ) {
177 						for( size_t x = 0; x < length(dimx); x++ ) {
178 							this->set_one_data_point( that.real(x,y,z,batch), that.imag(x,y,z,batch), x, y, z, batch );
179 						}
180 					}
181 				}
182 			}
183 		}
184 
185 		return *this;
186 	}
187 
188 private:
189 	/*****************************************************/
preinitialize_lengths_to_1_1_1()190 	void preinitialize_lengths_to_1_1_1()
191 	{
192 		_lengths.clear();
193 
194 		for( int i = 0; i < max_dimension; ++i ) {
195 			_lengths.push_back(1);
196 		}
197 	}
198 
199 	/*****************************************************/
initialize_lengths(const size_t * lengths_in)200 	void initialize_lengths(const size_t* lengths_in)
201 	{
202 		preinitialize_lengths_to_1_1_1();
203 
204 		for( size_t i = 0; i < _number_of_dimensions; ++i )
205 		{
206 			_lengths[i] = lengths_in[i];
207 		}
208 
209 		_requested_length_x = _lengths[dimx];
210 		adjust_length_x_for_hermitian_buffers();
211 	}
212 
213 	/*****************************************************/
adjust_length_x_for_hermitian_buffers()214 	void adjust_length_x_for_hermitian_buffers()
215 	{
216 		// complex-to-complex transforms do not require any change
217 		// to the number of points in the buffer
218 
219 		// real buffers also never require a change to the number of
220 		// points in the buffer
221 
222 		// a hermitian buffer with a length of "X" will actually
223 		// have X/2 + 1 points (the other half-ish are conjugates
224 		// and do not need to be stored). lenY and lenZ are never
225 		// modified
226 		if( is_hermitian() )
227 		{
228 			_lengths[dimx] = _lengths[dimx] / 2 + 1;
229 		}
230 	}
231 
232 	/*****************************************************/
preinitialize_strides_to_1_1_1()233 	void preinitialize_strides_to_1_1_1()
234 	{
235 		_strides.clear();
236 
237 		for( int i = 0; i < max_dimension; ++i ) {
238 			_strides.push_back(1);
239 		}
240 	}
241 
242 	/*****************************************************/
initialize_strides(const size_t * strides_in)243 	void initialize_strides(const size_t* strides_in)
244 	{
245 		preinitialize_strides_to_1_1_1();
246 
247 		// we need to calculate the strides if tightly packed
248 		if( strides_in == nullptr ) {
249 			_strides[dimx] = 1;
250 			for( size_t i = 1; i < _number_of_dimensions; ++i )
251 			{
252 				_strides[i] = _strides[i-1]*_lengths[i-1];
253 			}
254 
255 			_tightly_packed_strides = true;
256 		}
257 		// we do not need to calculate anything if the user specifies strides
258 		// we just copy the input strides into place
259 		else
260 		{
261 			for( size_t i = 0; i < _number_of_dimensions; ++i )
262 			{
263 				_strides[i] = strides_in[i];
264 			}
265 
266 			_tightly_packed_strides = false;
267 		}
268 	}
269 
270 	/*****************************************************/
initialize_distance(const size_t distance_in)271 	void initialize_distance(const size_t distance_in)
272 	{
273 		if( distance_in == tightly_packed )
274 		{
275 			// calculate distance if not passed in
276 			_distance = _lengths[_number_of_dimensions-1] * _strides[_number_of_dimensions-1];
277 
278 			_tightly_packed_distance = true;
279 		}
280 		else
281 		{
282 			// or copy it if passed in
283 			_distance = distance_in;
284 
285 			_tightly_packed_distance = false;
286 		}
287 	}
288 
289 	/*****************************************************/
create_buffer_memory()290 	void create_buffer_memory()
291 	{
292 		if( is_real() )
293 		{
294 			// just one real buffer
295 			_the_buffers.push_back( buffer_memory< T >( total_number_of_points_including_data_and_intervening() ) );
296 
297 			increase_memory_allocation_for_real_in_place_buffers();
298 		}
299 		else if( is_planar() )
300 		{
301 			// one real buffer
302 			_the_buffers.push_back( buffer_memory< T >( total_number_of_points_including_data_and_intervening() ) );
303 			// and one imaginary buffer
304 			_the_buffers.push_back( buffer_memory< T >( total_number_of_points_including_data_and_intervening() ) );
305 		}
306 		else if( is_interleaved() )
307 		{
308 			// one double-wide interleaved buffer
309 			_the_buffers.push_back( buffer_memory< T >( 2 * total_number_of_points_including_data_and_intervening() ) );
310 		}
311 	}
312 
313 	/*****************************************************/
amount_of_extra_padding_per_x()314 	size_t amount_of_extra_padding_per_x()
315 	{
316 		if( length(dimx) % 2 == 0 ) // even lengths of x add 2 per row
317 			return 2;
318 		else // odd lengths of x add 1 per row
319 			return 1;
320 	}
321 
322 	/*****************************************************/
adjust_strides_and_distance_for_in_place_real_buffer()323 	void adjust_strides_and_distance_for_in_place_real_buffer()
324 	{
325 		if( is_real() )
326 		{
327 			if( is_in_place() )
328 			{
329 				size_t amount_to_add_for_this_dimension = stride(dimx) * amount_of_extra_padding_per_x();
330 
331 				// strides first
332 				if( number_of_dimensions() >= 2 )
333 				{
334 					_strides[dimy] += amount_to_add_for_this_dimension;
335 				}
336 
337 				if( number_of_dimensions() == 3 )
338 				{
339 					amount_to_add_for_this_dimension *= length(dimy);
340 					_strides[dimz] += amount_to_add_for_this_dimension;
341 				}
342 
343 				// distance next
344 				if( number_of_dimensions() == 1 )
345 				{
346 					_distance += amount_to_add_for_this_dimension;
347 				}
348 				else if( number_of_dimensions() == 2 )
349 				{
350 					_distance += ( amount_to_add_for_this_dimension * length(dimy) );
351 				}
352 				else if( number_of_dimensions() == 3 )
353 				{
354 					_distance += ( amount_to_add_for_this_dimension * length(dimz) );
355 				}
356 				else throw std::runtime_error( "invalid dimensions in adjust_strides_and_distance_for_in_place_real_buffer()" );
357 			}
358 			else throw std::runtime_error( "this buffer is out of place and shouldn't be adjusting strides" );
359 		}
360 		else throw std::runtime_error( "this buffer is unreal and shouldn't be adjusting strides" );
361 	}
362 
363 	/*****************************************************/
increase_memory_allocation_for_real_in_place_buffers()364 	void increase_memory_allocation_for_real_in_place_buffers()
365 	{
366 		// when performing an in-place, real-to-hermitian transform,
367 		// we want a little extra space to account for the larger size
368 		// of the hermitian output.
369 
370 		// each row in the X dimension should have enough space for 2 extra reals
371 		// (to account for the one extra complex number that will be put
372 		// into the buffer after the transform)
373 
374 		// we don't want to change the length, because the number of points
375 		// in the transform isn't changing. we only want to change the
376 		// amount of memory reserved
377 		if( is_real() )
378 		{
379 			if( is_in_place() )
380 			{
381 				if( _tightly_packed_strides && _tightly_packed_distance )
382 				{
383 					// request extra memory
384 					_the_buffers[re].increase_allocated_memory( amount_of_extra_padding_per_x() * stride(dimx) * length(dimy) * length(dimz) * batch_size() );
385 
386 					// adjust strides/distances so that the padding is at the end of each row in the Xth dimension
387 					adjust_strides_and_distance_for_in_place_real_buffer();
388 				}
389 			}
390 		}
391 	}
392 
393 	/*****************************************************/
394 	size_t index( const size_t x, const size_t y=0, const size_t z=0, const size_t batch=0)
395 	{
396 		size_t interleaved_offset = 1;
397 
398 		// if this buffer is interleaved, the index should actually be double what it appears.
399 		// interleaved_offset will accomplish this magical doubling.
400 		if( is_interleaved() )
401 			interleaved_offset = 2;
402 
403 		size_t the_index = ( stride(dimx) * x + stride(dimy) * y + stride(dimz) * z + distance() * batch ) * interleaved_offset;
404 
405 		return the_index;
406 	}
407 
408 	/*****************************************************/
409 	size_t next_index( const size_t x, const size_t y=0, const size_t z=0, const size_t batch=0)
410 	{
411 		if( x+1 < length(dimx))
412 			return index( x+1, y, z, batch );
413 		else if( y+1 < length(dimy) )
414 			return index( 0, y+1, z, batch );
415 		else if( z+1 < length(dimz) )
416 			return index( 0, 0, z+1, batch );
417 		else if( batch+1 < batch_size() )
418 			return index( 0, 0, 0, batch+1 );
419 		else
420 			// we are at the last point
421 			// return the location immediately after the last point
422 			return index( 0, 0, 0, batch+1 );
423 	}
424 
425 	/*****************************************************/
points_are_about_equal(buffer<T> & other_buffer,size_t x,size_t y,size_t z,size_t batch)426 	bool points_are_about_equal( buffer<T> & other_buffer, size_t x, size_t y, size_t z, size_t batch )
427 	{
428 		if( is_real() )
429 			return floats_are_about_equal<T>( real(x, y, z, batch), other_buffer.real(x, y, z, batch) );
430 		else if( is_complex() || is_hermitian() )
431 			return ( floats_are_about_equal<T>( real(x, y, z, batch), other_buffer.real(x, y, z, batch) ) &&
432 					 floats_are_about_equal<T>( imag(x, y, z, batch), other_buffer.imag(x, y, z, batch) ) );
433 		else
434 			throw std::runtime_error( "invalid layout in points_are_about_equal()" );
435 	}
436 
437 	/*****************************************************/
buffer_mismatches(buffer<T> & other_buffer,bool compare_method)438 	size_t buffer_mismatches( buffer<T> & other_buffer, bool compare_method)
439 	{
440 		std::vector< index_t > mismatched_point_indices;
441 
442 		if (compare_method == pointwise_compare)
443 		{
444 			for( size_t batch = 0; batch < batch_size(); batch++ )
445 				for( size_t z = 0; z < length(dimz); z++ )
446 					for( size_t y = 0; y < length(dimy); y++ )
447 						for( size_t x = 0; x < length(dimx); x++ )
448 							if( !points_are_about_equal( other_buffer, x, y, z, batch ) )
449 							{
450 								mismatched_point_indices.push_back( index_t(x, y, z, batch));
451 							}
452 
453 			const size_t max_mismatches_output = default_number_of_mismatches_to_output;
454 
455 			if( mismatched_point_indices.size() != 0 && max_mismatches_output != 0 && suppress_output == false) {
456 				std::cout << std::endl << std::dec << mismatched_point_indices.size() << " of " << batch_size() * number_of_data_points_single_batch()
457 					<<" data points did not match.  The first " << max_mismatches_output << " (max) mismatching points follow:" << std::endl;
458 
459 				std::cout << std::endl << "(array index)(index) ";
460 				std::cout << "[test value (dec)] / [expected value (dec)]";
461 				std::cout << std::endl;
462 				for( size_t i = 0; i < max_mismatches_output && i < mismatched_point_indices.size(); i++ )
463 				{
464 					index_t mismatch = mismatched_point_indices[i];
465 
466 					std::cout
467 						<< std::dec << "(" << mismatched_point_indices.at(i).batch << ")"
468 						<< std::dec << "(" << mismatched_point_indices.at(i).x << "," << mismatched_point_indices.at(i).y << "," << mismatched_point_indices.at(i).z << ") ";
469 					std::cout
470 						<< real( mismatch.x, mismatch.y, mismatch.z, mismatch.batch );
471 
472 					if( is_complex() || is_hermitian() )
473 					{
474 						std::cout << "+i*" << imag( mismatch.x, mismatch.y, mismatch.z, mismatch.batch );
475 					}
476 					std::cout
477 						<< " / " << other_buffer.real( mismatch.x, mismatch.y, mismatch.z, mismatch.batch );
478 
479 					if( is_complex() || is_hermitian() )
480 					{
481 						std::cout << "+i*" << other_buffer.imag( mismatch.x, mismatch.y, mismatch.z, mismatch.batch );
482 					}
483 					std::cout << std::endl;
484 				}
485 				std::cout << std::endl;
486 			}
487 			return mismatched_point_indices.size();
488 		}
489 		else
490 		{
491 			//RMS accuracy judgement
492 
493 			size_t problem_size_per_transform = length(dimx) * length(dimy) * length(dimz);
494 			double rmse_tolerance_this = rmse_tolerance * sqrt((double)problem_size_per_transform / 4096.0);
495 
496 			for (size_t batch = 0; batch < batch_size(); batch++) {
497 
498 				double maxMag = 0.0, maxMagInv = 1.0;
499 
500 				// Compute RMS error relative to maximum magnitude
501 				double rms = 0;
502 
503 				for (size_t z = 0; z < length(dimz); z++) {
504 					for (size_t y = 0; y < length(dimy); y++) {
505 						for (size_t x = 0; x < length(dimx); x++) {
506 							double ex_r, ex_i, ac_r, ac_i;
507 							double mag;
508 
509 							ex_r = other_buffer.real(x, y, z, batch);
510 							ac_r = real(x, y, z, batch);
511 
512 							if (other_buffer.is_complex() || other_buffer.is_hermitian())
513 								ex_i = other_buffer.imag(x, y, z, batch);
514 							else
515 								ex_i = 0;
516 
517 							if (other_buffer.is_complex() || other_buffer.is_hermitian())
518 								ac_i = imag(x, y, z, batch);
519 							else
520 								ac_i = 0;
521 
522 							// find maximum magnitude
523 							mag = ex_r*ex_r + ex_i*ex_i;
524 							maxMag = (mag > maxMag) ? mag : maxMag;
525 
526 							// compute square error
527 							rms += ((ex_r - ac_r)*(ex_r - ac_r) + (ex_i - ac_i)*(ex_i - ac_i));
528 						}
529 					}
530 				}
531 
532 				if (maxMag > magnitude_lower_limit)
533 				{
534 					maxMagInv = 1.0 / maxMag;
535 				}
536 
537 				rms = sqrt(rms*maxMagInv);
538 
539 				if (fabs(rms) > rmse_tolerance_this)
540 				{
541 					if (suppress_output == false)
542 						std::cout << std::endl << "RMSE accuracy judgement failure -- RMSE = " << std::dec << rms <<
543 							", maximum allowed RMSE = " << std::dec << rmse_tolerance_this << std::endl;
544 					return 1;
545 				}
546 			}
547 
548 			return 0;
549 		}
550 	}
551 
552 public:
553 	/*****************************************************/
554 	bool operator==( buffer<T> & other_buffer )
555 	{
556 		// complexity of each dimension must be the same
557 		if( ( is_real() && !other_buffer.is_real() ) || ( !is_real() && other_buffer.is_real() ) ||
558 			( is_hermitian() && !other_buffer.is_hermitian() ) || ( !is_hermitian() && other_buffer.is_hermitian() ) ||
559 			( is_complex() && !other_buffer.is_complex() ) || ( !is_complex() && other_buffer.is_complex() ) )
560 		{
561 			return false;
562 		}
563 
564 		// batch_size of the data must be the same
565 		if( batch_size() != other_buffer.batch_size() )
566 		{
567 			return false;
568 		}
569 
570 		// dimensionality of the data must be the same
571 		if( number_of_dimensions() != other_buffer.number_of_dimensions() )
572 		{
573 			return false;
574 		}
575 
576 		// size of each dimension must be the same
577 		for( size_t i = 0; i < number_of_dimensions(); ++i )
578 		{
579 			if( length(i) != other_buffer.length(i)) return false;
580 		}
581 
582 		size_t number_deaths = 0;
583 		number_deaths += buffer_mismatches( other_buffer, comparison_type);
584 
585 		if( number_deaths == 0 ) return true;
586 		else return false;
587 	}
588 
589 	/*****************************************************/
590 	bool operator!=( buffer<T> & other_buffer )
591 	{
592 		return !( *this == other_buffer );
593 	}
594 
595 	void operator*=( buffer<T> & other_buffer )
596 	{
597 		size_t the_index;
598 		T* base_ptr;
599 		T* real_ptr;
600 		T* imag_ptr;
601 
602 		if( is_interleaved() )
603 		{
604 			base_ptr = _the_buffers[interleaved].ptr();
605 		}
606 		else if ( is_planar() )
607 		{
608 			real_ptr = _the_buffers[re].ptr();
609 			imag_ptr = _the_buffers[im].ptr();
610 		}
611 		else if ( is_real() )
612 		{
613 			base_ptr = _the_buffers[re].ptr();
614 		}
615 
616 		for( size_t batch = 0; batch < batch_size(); batch++ )
617 			for( size_t z = 0; z < length(dimz); z++ )
618 				for( size_t y = 0; y < length(dimy); y++ )
619 					for( size_t x = 0; x < length(dimx); x++ )
620 					{
621 						the_index = index(x, y, z, batch);
622 						if( is_interleaved() )
623 						{
624 							*( base_ptr + the_index ) *= other_buffer.real(x, y, z, batch);
625 
626 							the_index = the_index + 1; // the imaginary component immediately follows the real
627 							if (other_buffer.is_real())
628 							{
629 								*( base_ptr + the_index ) *= other_buffer.real(x, y, z, batch);
630 							}
631 							else
632 							{
633 								*( base_ptr + the_index ) *= other_buffer.imag(x, y, z, batch);
634 							}
635 						}
636 						else if ( is_planar() )
637 						{
638 							*( real_ptr + the_index ) *= other_buffer.real(x, y, z, batch);
639 
640 							if (other_buffer.is_real())
641 							{
642 								*( imag_ptr + the_index ) *= other_buffer.real(x, y, z, batch);
643 							}
644 							else
645 							{
646 								*( imag_ptr + the_index ) *= other_buffer.imag(x, y, z, batch);
647 							}
648 						}
649 						else if ( is_real() )
650 						{
651 							*( base_ptr + the_index ) *= other_buffer.real(x, y, z, batch);
652 						}
653 					}
654 	}
655 
656 	//Calculates a 3 point average of other_buffer and
657 	//multiplies with buffer
658 	//only real layout is supported for other_buffer currently
multiply_3pt_average(buffer<T> & other_buffer)659 	void multiply_3pt_average( buffer<T> & other_buffer )
660 	{
661 		if (!other_buffer.is_real())
662 		{
663 			throw std::runtime_error( "only real layout is supported currently for other_buffer" );
664 		}
665 
666 		size_t the_index, o_the_index;
667 		T *base_ptr, *o_base_ptr;
668 		T *real_ptr;
669 		T *imag_ptr;
670 		T o_prev_val, o_next_val;
671 		T average;
672 
673 		if( is_interleaved() )
674 		{
675 			base_ptr = _the_buffers[interleaved].ptr();
676 		}
677 		else if ( is_planar() )
678 		{
679 			real_ptr = _the_buffers[re].ptr();
680 			imag_ptr = _the_buffers[im].ptr();
681 		}
682 		else if ( is_real() )
683 		{
684 			base_ptr = _the_buffers[re].ptr();
685 		}
686 		o_base_ptr = other_buffer.real_ptr();
687 
688 		for( size_t batch = 0; batch < batch_size(); batch++ )
689 			for( size_t z = 0; z < length(dimz); z++ )
690 				for( size_t y = 0; y < length(dimy); y++ )
691 					for( size_t x = 0; x < length(dimx); x++ )
692 					{
693 						the_index = index(x, y, z, batch);
694 						o_the_index = other_buffer.index(x, y, z, batch);
695 						o_prev_val = o_the_index <= 0 ? 0 : *(o_base_ptr + o_the_index - 1);
696 						o_next_val = o_the_index >= (other_buffer.total_number_of_points_including_data_and_intervening() - 1) ? 0 : *(o_base_ptr + o_the_index +  1);
697 
698 						average = (o_prev_val + *(o_base_ptr + o_the_index) + o_next_val)/ 3.0f ;
699 
700 						if( is_interleaved() )
701 						{
702 							*( base_ptr + the_index ) *= average;
703 
704 							the_index = the_index + 1; // the imaginary component immediately follows the real
705 							*( base_ptr + the_index ) *= average;
706 						}
707 						else if ( is_planar() )
708 						{
709 							*( real_ptr + the_index ) *= average;
710 
711 							*( imag_ptr + the_index ) *= average;
712 						}
713 						else if ( is_real() )
714 						{
715 							*( base_ptr + the_index ) *= average;
716 						}
717 					}
718 	}
719 
720 	/*****************************************************/
721 	// strides and distance are those of the output (that is, the new hermitian buffer)
change_real_to_hermitian(const size_t * strides_in,const size_t distance_in)722 	void change_real_to_hermitian( const size_t* strides_in, const size_t distance_in )
723 	{
724 		if( !is_real() || !is_in_place() )
725 		{
726 			throw std::runtime_error( "can only change a real buffer used in an in-place transform to a hermitian one" );
727 		}
728 
729 		// we currently only support hermitian interleaved for in-place transforms
730 		_layout = layout::hermitian_interleaved;
731 		adjust_length_x_for_hermitian_buffers();
732 		initialize_strides(strides_in);
733 		initialize_distance(distance_in);
734 	}
735 
736 	/*****************************************************/
737 	// strides and distance are those of the output (that is, the new real buffer)
change_hermitian_to_real(const size_t * strides_in,const size_t distance_in)738 	void change_hermitian_to_real( const size_t* strides_in, const size_t distance_in )
739 	{
740 		// we currently only support hermitian interleaved for in-place transforms
741 		if( _layout != layout::hermitian_interleaved || !is_in_place() )
742 		{
743 			throw std::runtime_error( "can only change a hermitian interleaved buffer used in an in-place transform to a real one" );
744 		}
745 
746 		_layout = layout::real;
747 		_lengths[dimx] = _requested_length_x;
748 		initialize_strides(strides_in);
749 		initialize_distance(distance_in);
750 	}
751 
752 	/*****************************************************/
is_real()753 	bool is_real()
754 	{
755 		return _layout == layout::real;
756 	}
757 
758 	/*****************************************************/
is_complex()759 	bool is_complex()
760 	{
761 		return _layout == layout::complex_interleaved || _layout == layout::complex_planar;
762 	}
763 
764 	/*****************************************************/
is_hermitian()765 	bool is_hermitian()
766 	{
767 		return _layout == layout::hermitian_interleaved || _layout == layout::hermitian_planar;
768 	}
769 
770 	/*****************************************************/
is_planar()771 	bool is_planar()
772 	{
773 		return _layout == layout::complex_planar || _layout == layout::hermitian_planar;
774 	}
775 
776 	/*****************************************************/
is_interleaved()777 	bool is_interleaved()
778 	{
779 		return _layout == layout::complex_interleaved || _layout == layout::hermitian_interleaved;
780 	}
781 
782 	/*****************************************************/
is_in_place()783 	bool is_in_place()
784 	{
785 		if( _placeness == CLFFT_INPLACE ) return true;
786 		else if( _placeness == CLFFT_OUTOFPLACE) return false;
787 		else throw std::runtime_error( "invalid placeness value in is_in_place()" );
788 	}
789 
790 	/*****************************************************/
interleaved_ptr()791 	T* interleaved_ptr()
792 	{
793 		if( is_interleaved() )
794 			return _the_buffers[interleaved].ptr();
795 		else
796 			throw std::runtime_error( "interleaved_ptr() is only available on interleaved buffers" );
797 	}
798 
799 	/*****************************************************/
real_ptr()800 	T* real_ptr()
801 	{
802 		if( is_planar() || is_real() )
803 			return _the_buffers[re].ptr();
804 		else
805 			throw std::runtime_error( "real() is only available on real and planar buffers" );
806 	}
807 
808 	/*****************************************************/
imag_ptr()809 	T* imag_ptr()
810 	{
811 		if( is_planar() )
812 			return _the_buffers[im].ptr();
813 		else
814 			throw std::runtime_error( "imag_ptr() is only available on planar buffers" );
815 	}
816 
817 	/*****************************************************/
818 	T real( const size_t x, const size_t y=0, const size_t z=0, const size_t batch=0 )
819 	{
820 		size_t this_index = index( x, y, z, batch );
821 
822 		// all layouts will have a real component
823 		// using [re] will catch the real component for
824 		// layout::interleaved as well
825 		T this_value = _the_buffers[re][this_index];
826 		return this_value;
827 	}
828 
829 	/*****************************************************/
830 	T imag( const size_t x, const size_t y=0, const size_t z=0, const size_t batch=0 )
831 	{
832 		size_t this_index = index( x, y, z, batch );
833 
834 		if( is_real() )
835 			throw std::runtime_error( "imag() is not available for this real buffer" );
836 		else if( is_planar() )
837 			return _the_buffers[im][this_index];
838 		else if( is_interleaved() )
839 			// index always points to the real component of an interleaved number
840 			// the following memory location is the imaginary component
841 			return _the_buffers[interleaved][this_index + 1];
842 		else
843 			throw std::runtime_error( "invalid layout type in imag()" );
844 	}
845 
846 	/*****************************************************/
847 	std::complex<T> complex( const size_t x, const size_t y=0, const size_t z=0, const size_t batch=0 )
848 	{
849 		if( is_real() )
850 			throw std::runtime_error( "complex() is not available for this real buffer" );
851 		else if( is_complex() || is_hermitian() )
852 		{
853 			std::complex<T> this_complex( real( x, y, z, batch ), imag( x, y, z, batch ) );
854 			return this_complex;
855 		}
856 		else
857 			throw std::runtime_error( "invalid layout type in complex()" );
858 	}
859 
860 	/*****************************************************/
number_of_dimensions()861 	size_t number_of_dimensions()
862 	{
863 		return _number_of_dimensions;
864 	}
865 
866 	/*****************************************************/
number_of_data_points_single_batch()867 	size_t number_of_data_points_single_batch()
868 	{
869 		size_t number_of_points = 1;
870 		for( size_t i = 0; i < _number_of_dimensions; ++i )
871 		{
872 			number_of_points *= length(i);
873 		}
874 		return number_of_points;
875 	}
876 
877 	/*****************************************************/
number_of_data_points()878 	size_t number_of_data_points()
879 	{
880 		return number_of_data_points_single_batch() * batch_size();
881 	}
882 
883 	/*****************************************************/
884 	// note that this returns the size in number of points and
885 	// does not take layout into consideration. this will yield
886 	// the same number for real, interleaved, and planar layouts.
887 	// whomever uses this information will need to know if they
888 	// want 1x buffer of this size (real), 2x buffer of this
889 	// size (planar), or 1x double-wide buffer (interleaved)
total_number_of_points_including_data_and_intervening()890 	size_t total_number_of_points_including_data_and_intervening()
891 	{
892 		return distance() * batch_size();
893 	}
894 
895 	/*****************************************************/
896 	// note that this will return the size of ONE BUFFER in bytes
897 	// for real and interleaved, that doesn't change anything
898 	// for planar, you will get the size of the real _or_ the imaginary
899 	//			(which should always be the same)
size_in_bytes()900 	size_t size_in_bytes()
901 	{
902 		return _the_buffers[0].size_in_bytes();
903 	}
904 
905 	/*****************************************************/
length(size_t dim)906 	size_t length(size_t dim)
907 	{
908 		return _lengths[dim];
909 	}
910 
911 	/*****************************************************/
stride(size_t dim)912 	size_t stride(size_t dim)
913 	{
914 		return _strides[dim];
915 	}
916 
917 	/*****************************************************/
lengths()918 	size_t* lengths()
919 	{
920 		return &_lengths[0];
921 	}
922 
923 	/*****************************************************/
strides()924 	size_t* strides()
925 	{
926 		return &_strides[0];
927 	}
928 
929 	/*****************************************************/
batch_size()930 	size_t batch_size()
931 	{
932 		return _batch_size;
933 	}
934 
935 	/*****************************************************/
distance()936 	size_t distance()
937 	{
938 		return _distance;
939 	}
940 
941 	/*****************************************************/
clear()942 	void clear()
943 	{
944 		// for all batches
945 
946 		if( is_real() )
947 			set_all_to_value( 0.0f );
948 		else
949 			set_all_to_value( 0.0f, 0.0f );
950 	}
951 
952 	/*****************************************************/
set_one_data_point(T real,const size_t x,const size_t y,const size_t z,const size_t batch)953 	void set_one_data_point( T real, const size_t x, const size_t y, const size_t z, const size_t batch )
954 	{
955 		if( is_real() )
956 		{
957 			T* base_ptr = _the_buffers[re].ptr();
958 			size_t real_index = index(x, y, z, batch);
959 
960 			*( base_ptr + real_index ) = real;
961 		}
962 		else
963 			throw std::runtime_error( "attempting to use real data point setter for complex or hermitian buffer" );
964 	}
965 
966 	/*****************************************************/
set_one_data_point(T real,T imag,const size_t x,const size_t y,const size_t z,const size_t batch)967 	void set_one_data_point( T real, T imag, const size_t x, const size_t y, const size_t z, const size_t batch )
968 	{
969 		if( is_real() )
970 			throw std::runtime_error( "attempting to use complex data point setter for real buffer" );
971 		else if( is_interleaved() )
972 		{
973 			T* base_ptr = _the_buffers[interleaved].ptr();
974 			size_t real_index = index(x, y, z, batch);
975 			size_t imag_index = real_index + 1; // the imaginary component immediately follows the real
976 
977 			*( base_ptr + real_index ) = real;
978 			*( base_ptr + imag_index ) = imag;
979 		}
980 		else // planar
981 		{
982 			T* real_ptr = _the_buffers[re].ptr();
983 			T* imag_ptr = _the_buffers[im].ptr();
984 			size_t the_index = index(x, y, z, batch);
985 
986 			*( real_ptr + the_index ) = real;
987 			*( imag_ptr + the_index ) = imag;
988 		}
989 	}
990 
991 	/*****************************************************/
set_all_to_value(T real)992 	void set_all_to_value( T real )
993 	{
994 		// for all batches
995 
996 		for( size_t batch = 0; batch < batch_size(); batch++ ) {
997 			for( size_t z = 0; z < length(dimz); z++ ) {
998 				for( size_t y = 0; y < length(dimy); y++ ) {
999 					for( size_t x = 0; x < length(dimx); x++ ) {
1000 						set_one_data_point( real, x, y, z, batch );
1001 					}
1002 				}
1003 			}
1004 		}
1005 	}
1006 
1007 	/*****************************************************/
set_all_to_value(T real,T imag)1008 	void set_all_to_value( T real, T imag )
1009 	{
1010 		// for all batches
1011 
1012 		for( size_t batch = 0; batch < batch_size(); batch++ ) {
1013 			for( size_t z = 0; z < length(dimz); z++ ) {
1014 				for( size_t y = 0; y < length(dimy); y++ ) {
1015 					for( size_t x = 0; x < length(dimx); x++ ) {
1016 						set_one_data_point( real, imag, x, y, z, batch );
1017 					}
1018 				}
1019 			}
1020 		}
1021 	}
1022 
1023 	/*****************************************************/
set_all_to_linear_increase()1024 	void set_all_to_linear_increase()
1025 	{
1026 		// for all batches
1027 
1028 		size_t val = 1;
1029 		for( size_t batch = 0; batch < batch_size(); batch++ ) {
1030 			for( size_t z = 0; z < length(dimz); z++ ) {
1031 				for( size_t y = 0; y < length(dimy); y++ ) {
1032 					for( size_t x = 0; x < length(dimx); x++ ) {
1033 						if( is_real() )
1034 						{
1035 							set_one_data_point( static_cast<T>(val), x, y, z, batch );
1036 						}
1037 
1038 						else
1039 						{
1040 							set_one_data_point( static_cast<T>(val), static_cast<T>(val) + 0.5f, x, y, z, batch );
1041 						}
1042 
1043 						++val;
1044 					}
1045 				}
1046 			}
1047 		}
1048 	}
1049 
1050 	/*****************************************************/
set_all_to_sawtooth(T amplitude)1051 	void set_all_to_sawtooth( T amplitude )
1052 	{
1053 		// for all batches
1054 
1055 		for( size_t batch = 0; batch < batch_size(); batch++ )
1056 		{
1057 			for( size_t z = 0; z < length(dimz); z++ )
1058 			{
1059 				for( size_t y = 0; y < length(dimy); y++ )
1060 				{
1061 					// waveform will be 1 period of sawtooth
1062 					size_t number_of_points_in_one_period = length(dimx);
1063 					size_t number_of_points_on_one_line = number_of_points_in_one_period / 2;
1064 
1065 					// the sawtooth will start at 0 and increase to amplitude at T/2
1066 					// at T/2, value will change to -amplitude and increase back up to 0 at T
1067 					// if there are an odd number of points in the whole period,
1068 					// we'll make a stop at 0 in the middle of the jump
1069 					T value = 0.0f;
1070 					T per_point_delta = amplitude / (number_of_points_on_one_line - 1);
1071 
1072 					for( size_t x = 0; x < number_of_points_in_one_period; x++) {
1073 						if( is_real() )
1074 						{
1075 							set_one_data_point( value, x, y, z, batch);
1076 						}
1077 						else
1078 						{
1079 							// for the real value, we want the sawtooth as described above
1080 							// for the imaginary value, we want the 2 times the inverse
1081 							//		(so that real and imaginary don't match, possibly obscuring errors)
1082 							set_one_data_point( value, -2.0f * value, x, y, z, batch);
1083 						}
1084 
1085 						// if we're at T/2, we want to saw on down to the negative amplitude . . .
1086 						if( floats_are_about_equal( value, amplitude ) )
1087 						{
1088 							if( number_of_points_in_one_period % 2 != 0 ) // odd, we need to add the 0
1089 							{
1090 								x++;
1091 								if( is_real() )
1092 								{
1093 									set_one_data_point( 0.0f, x, y, z, batch);
1094 								}
1095 								else
1096 								{
1097 									set_one_data_point( 0.0f, 0.0f, x, y, z, batch);
1098 								}
1099 							}
1100 							value = -1 * amplitude;
1101 						}
1102 						// . . . otherwise, keep going up
1103 						else value += per_point_delta;
1104 					}
1105 				}
1106 			}
1107 		}
1108 	}
1109 
1110 	/*****************************************************/
set_all_to_random_data(size_t max_value,size_t seed)1111 	void set_all_to_random_data( size_t max_value, size_t seed ) {
1112 		// for all batches
1113 
1114 		boost::mt19937 random_data_generator;
1115 		boost::uniform_int<> distribution(1, INT_MAX);
1116 		boost::variate_generator<boost::mt19937&, boost::uniform_int<> >
1117 			random_value(random_data_generator, distribution);
1118 		random_data_generator.seed( static_cast<boost::uint32_t>( seed ) );
1119 
1120 		for( size_t batch = 0; batch < batch_size(); batch++) {
1121 			for( size_t z = 0; z < length(dimz); z++) {
1122 				for( size_t y = 0; y < length(dimy); y++) {
1123 					for( size_t x = 0; x < length(dimx); x++) {
1124 						int val = random_value() % (max_value + 1); // pluck a random value
1125 						if( random_value() % 2 ) val *= -1; // make it negative about 50% of the time
1126 
1127 						if( is_real() )
1128 						{
1129 							set_one_data_point( static_cast<T>(val), x, y, z, batch );
1130 						}
1131 
1132 						else
1133 						{
1134 							set_one_data_point( static_cast<T>(val), static_cast<T>(val), x, y, z, batch );
1135 						}
1136 					}
1137 				}
1138 			}
1139 		}
1140 	}
1141 
1142 	/*****************************************************/
set_all_to_impulse()1143 	void set_all_to_impulse()
1144 	{
1145 		// for all batches
1146 		clear();
1147 
1148 		for( size_t batch = 0; batch < batch_size(); batch++ )
1149 		{
1150 			if( is_real() )
1151 				set_one_data_point( static_cast<T>(number_of_data_points_single_batch()), 0, 0, 0, batch);
1152 			else
1153 				set_one_data_point( static_cast<T>(number_of_data_points_single_batch()), 0.0f, 0, 0, 0, batch);
1154 		}
1155 	}
1156 
1157 
1158 
1159 	/*****************************************************/
scale_data(T scale)1160 	void scale_data( T scale) {
1161 		// for all batches
1162 
1163 		for( size_t batch = 0; batch < batch_size(); batch++ )
1164 		{
1165 			for( size_t z = 0; z < length(dimz); z++ )
1166 			{
1167 				for( size_t y = 0; y < length(dimy); y++ )
1168 				{
1169 					for( size_t x = 0; x < length(dimx); x++ )
1170 					{
1171 						if( is_real() )
1172 						{
1173 							T this_value = real(x, y, z, batch);
1174 							T scaled_value = this_value * scale;
1175 							set_one_data_point( scaled_value, x, y, z, batch );
1176 						}
1177 						else
1178 						{
1179 							T this_real = real(x, y, z, batch);
1180 							T this_imag = imag(x, y, z, batch);
1181 
1182 							T scaled_real = this_real * scale;
1183 							T scaled_imag = this_imag * scale;
1184 							set_one_data_point( scaled_real, scaled_imag, x, y, z, batch );
1185 						}
1186 					}
1187 				}
1188 			}
1189 		}
1190 	}
1191 
1192 	/*****************************************************/
make_sure_padding_was_not_overwritten()1193 	void make_sure_padding_was_not_overwritten()
1194 	{
1195 		// check before and after memory first
1196 		for( size_t i = 0; i < _the_buffers.size(); i++ )
1197 		{
1198 			_the_buffers[i].check_memory_boundaries();
1199 		}
1200 
1201 		if( _tightly_packed_strides && _tightly_packed_distance) return; // nothing worth checking
1202 
1203 		size_t intervening_point_touched = 0;
1204 
1205 		for( size_t batch = 0; batch < batch_size(); batch++)
1206 		{
1207 			for( size_t z = 0; z < length(dimz); z++)
1208 			{
1209 				for( size_t y = 0; y < length(dimy); y++)
1210 				{
1211 					for( size_t x = 0; x < length(dimx); x++)
1212 					{
1213 						size_t this_point = index(x, y, z, batch);
1214 						size_t next_point = next_index(x, y, z, batch);
1215 
1216 						if( is_planar() )
1217 						{
1218 							if( this_point < _the_buffers[re].size() && this_point + 1 != next_point)
1219 							{
1220 								for( size_t i = this_point+1; i < next_point; i++)
1221 								{
1222 									T this_real = _the_buffers[re][i];
1223 									T this_imag = _the_buffers[im][i];
1224 
1225 									if( nan_as_hex(this_real) != float_as_hex(this_real)
1226 										|| nan_as_hex(this_imag) != float_as_hex(this_imag) )
1227 									{
1228 										++intervening_point_touched;
1229 									}
1230 								}
1231 							}
1232 						}
1233 						else if( is_real() )
1234 						{
1235 							if( this_point < _the_buffers[re].size() && this_point + 1 != next_point)
1236 							{
1237 								for( size_t i = this_point+1; i < next_point; i++)
1238 								{
1239 									T this_real = _the_buffers[re][i];
1240 
1241 									if( nan_as_hex(this_real) != float_as_hex(this_real) )
1242 									{
1243 										++intervening_point_touched;
1244 									}
1245 								}
1246 							}
1247 						}
1248 						else if( is_interleaved() )
1249 						{
1250 							if( this_point < _the_buffers[re].size() && this_point + 1 != next_point)
1251 							{
1252 								// NOTE whereas real and planar initialize i = this_point+1,
1253 								// we want this_point+2 for interleaved so that we skip the
1254 								// imaginary value of the point
1255 								for( size_t i = this_point+2; i < next_point; i++)
1256 								{
1257 									T this_real = _the_buffers[interleaved][i];
1258 
1259 									if( nan_as_hex(this_real) != float_as_hex(this_real) )
1260 									{
1261 										++intervening_point_touched;
1262 									}
1263 								}
1264 							}
1265 						}
1266 						else
1267 							throw std::runtime_error( "invalid layout in make_sure_memory_between_data_points_was_not_touched()" );
1268 					}
1269 				}
1270 			}
1271 		}
1272 
1273 		EXPECT_EQ( 0, intervening_point_touched );
1274 	}
1275 };
1276 
1277 #endif
1278