1 /////////////////////////////////////////////////////////////////////////////// 2 // // 3 // The Template Matrix/Vector Library for C++ was created by Mike Jarvis // 4 // Copyright (C) 1998 - 2016 // 5 // All rights reserved // 6 // // 7 // The project is hosted at https://code.google.com/p/tmv-cpp/ // 8 // where you can find the current version and current documention. // 9 // // 10 // For concerns or problems with the software, Mike may be contacted at // 11 // mike_jarvis17 [at] gmail. // 12 // // 13 // This software is licensed under a FreeBSD license. The file // 14 // TMV_LICENSE should have bee included with this distribution. // 15 // It not, you can get a copy from https://code.google.com/p/tmv-cpp/. // 16 // // 17 // Essentially, you can use this software however you want provided that // 18 // you include the TMV_LICENSE file in any distribution that uses it. // 19 // // 20 /////////////////////////////////////////////////////////////////////////////// 21 22 23 // This file defines two classes that are used by TMV for allocating 24 // memory. 25 // 26 // First, AlignedArray works basically like a regular new v[] 27 // allocation, except that if we are doing SSE or SSE2, then we enforce 28 // that the allocation is aligned on a 128 byte boundary. 29 // 30 // Second, StackArray is used for small arrays and emulates a normal C array 31 // on the stack: T v[N]. However, when N is large, it uses the 32 // heap instead to avoid stack overflows. 33 34 #ifndef Array_H 35 #define Array_H 36 37 const ptrdiff_t TMV_MaxStack = 1024; // bytes 38 39 #include <complex> 40 41 namespace tmv 42 { 43 template <typename T> TMV_Aligned(const T * p)44 inline bool TMV_Aligned(const T* p) 45 { return (reinterpret_cast<ptrdiff_t>(p) & 0xf) == 0; } 46 47 // There doesn't seem to be any portable C++ function that guarantees 48 // that the memory allocated will be aligned as necessary for 49 // SSE functions. 50 // Sometimes posix_memalign or memalign does this job. 51 // But it doesn't seem to be standard, since some systems don't have it. 52 // Other systems have _mm_malloc (ICPC) or _aligned_malloc (WIN). 53 // So probably I should have ifdefs that check to see if one of these 54 // is available and use that instead. 55 // 56 // But for now we have this simple class that simply loads a bit more 57 // memory than necessary and then finds the starting point that is 58 // 16 byte aligned. 59 60 // Also, the TMV_END_PADDING option is implemented here. If it is 61 // defined, then we write 0's to the end of the full 16 byte word. 62 // This is mostly useful when running valgrind with a BLAS library 63 // that isn't careful about reading past the end of the allocated 64 // memory. GotoBLAS for example. 65 // So if end padding is enabled, we make sure to allocate enough memory 66 // to finish the block of 16 bytes. And we write 0's to the values 67 // that aren't part of the requested memory. 68 69 // First the regular non-SSE version, where we don't need aligment. 70 template <typename T> 71 class AlignedMemory 72 { 73 public: AlignedMemory()74 inline AlignedMemory() : p(0) {} allocate(const ptrdiff_t n)75 inline void allocate(const ptrdiff_t n) 76 { 77 #ifdef TMV_END_PADDING 78 const ptrdiff_t nn = n + 16/sizeof(T); 79 p = new T[nn]; 80 for(ptrdiff_t i=n;i<nn;++i) p[i] = T(0); 81 #else 82 p = new T[n]; 83 #endif 84 } deallocate()85 inline void deallocate() 86 { if (p) delete [] p; p=0; } swapWith(AlignedMemory<T> & rhs)87 inline void swapWith(AlignedMemory<T>& rhs) 88 { T* temp = p; p = rhs.p; rhs.p = temp; } get()89 inline T* get() { return p; } get()90 inline const T* get() const { return p; } 91 private: 92 T* p; 93 }; 94 95 // Now specialize float and double 96 // We do this regardless of whether __SSE__ or __SSE2__ is defined, 97 // since these might be allocated in a unit that doesn't defined them 98 // and then have get() called in a unit that does. This leads to problems! 99 // So we always make the float and double allocations aligned at 100 // 16 byte boundaries. 101 template <> 102 class AlignedMemory<float> 103 { 104 public: AlignedMemory()105 AlignedMemory() : mem(0), p(0) {} allocate(const ptrdiff_t n)106 inline void allocate(const ptrdiff_t n) 107 { 108 #ifdef TMV_END_PADDING 109 const ptrdiff_t nn = (n<<2)+15 + 16; 110 mem = new char[nn]; 111 p = calculateP(); 112 for(ptrdiff_t i=n;i<(nn>>2);++i) p[i] = 0.F; 113 #else 114 mem = new char[(n<<2)+15]; 115 p = calculateP(); 116 #endif 117 TMVAssert((void*)(mem+(n<<2)+15) >= (void*)(p+n)); 118 } deallocate()119 inline void deallocate() 120 { if (mem) delete [] mem; mem=0; p=0; } swapWith(AlignedMemory<float> & rhs)121 inline void swapWith(AlignedMemory<float>& rhs) 122 { 123 char* temp1 = mem; mem = rhs.mem; rhs.mem = temp1; 124 float* temp2 = p; p = rhs.p; rhs.p = temp2; 125 } get()126 inline float* get() { return p; } calculateP()127 inline float* calculateP() 128 { 129 float* pf = reinterpret_cast<float*>( 130 mem + ((0x10-((ptrdiff_t)(mem) & 0xf)) & ~0x10)); 131 TMVAssert( ((ptrdiff_t)(pf) & 0xf) == 0); 132 return pf; 133 } get()134 inline const float* get() const { return p; } 135 private: 136 char* mem; 137 float* p; 138 }; 139 template <> 140 class AlignedMemory<double> 141 { 142 public: AlignedMemory()143 AlignedMemory() : mem(0), p(0) {} allocate(const ptrdiff_t n)144 inline void allocate(const ptrdiff_t n) 145 { 146 #ifdef TMV_END_PADDING 147 const ptrdiff_t nn = (n<<3)+15 + 16; 148 mem = new char[nn]; 149 p = calculateP(); 150 for(ptrdiff_t i=n;i<(nn>>3);++i) p[i] = 0.; 151 #else 152 mem = new char[(n<<3)+15]; 153 p = calculateP(); 154 #endif 155 TMVAssert((void*)(mem+(n<<3)+15) >= (void*)(p+n)); 156 } deallocate()157 inline void deallocate() 158 { if (mem) delete [] mem; mem=0; p=0; } swapWith(AlignedMemory<double> & rhs)159 inline void swapWith(AlignedMemory<double>& rhs) 160 { 161 char* temp1 = mem; mem = rhs.mem; rhs.mem = temp1; 162 double* temp2 = p; p = rhs.p; rhs.p = temp2; 163 } calculateP()164 inline double* calculateP() 165 { 166 double* pd = reinterpret_cast<double*>( 167 mem + ((0x10-((ptrdiff_t)(mem) & 0xf)) & ~0x10)); 168 TMVAssert( ((ptrdiff_t)(pd) & 0xf) == 0); 169 return pd; 170 } get()171 inline double* get() { return p; } get()172 inline const double* get() const { return p; } 173 174 private : 175 char* mem; 176 double* p; 177 }; 178 179 #ifdef TMV_INITIALIZE_NAN 180 // This option is to stress test the code to make sure it works 181 // ok if uninitialized data happens to have a nan in it. 182 // Naive BLAS calls can fail when there are nan's, since it recasts 183 // the equation y = A*x as y = beta*y + alpha*A*x with alpha=1 and 184 // beta=0. So if y initially has nan's, then the beta*y 185 // part has 0*nan => nan, which is then added to whatever 186 // alpha*A*x has, so stays nan on output. 187 // TMV should be accounting for this kind of thing correctly, and 188 // this section here helps test for it. 189 template <typename T> 190 struct TMV_Nan 191 { getTMV_Nan192 static inline T get() 193 { 194 static T zero(0); 195 static T nan = 196 std::numeric_limits<T>::is_integer ? zero : zero/zero; 197 return nan; 198 } 199 }; 200 template <typename T> 201 struct TMV_Nan<std::complex<T> > 202 { 203 static inline std::complex<T> get() 204 { return std::complex<T>(TMV_Nan<T>::get(),TMV_Nan<T>::get()); } 205 }; 206 #endif 207 208 // Now the actual class that makes AlignedMemory work like a normal 209 // pointer. 210 template <typename T> 211 class AlignedArray 212 { 213 public : 214 215 inline AlignedArray() 216 { 217 #ifdef TMV_INITIALIZE_NAN 218 _n = 0; 219 #endif 220 } 221 inline AlignedArray(const ptrdiff_t n) 222 { 223 if (n > 0) p.allocate(n); 224 #ifdef TMV_INITIALIZE_NAN 225 _n = n; 226 for(ptrdiff_t i=0;i<_n;++i) get()[i] = TMV_Nan<T>::get(); 227 #endif 228 } 229 inline ~AlignedArray() 230 { 231 #ifdef TMV_INITIALIZE_NAN 232 for(ptrdiff_t i=0;i<_n;++i) get()[i] = T(-999); 233 #endif 234 p.deallocate(); 235 } 236 237 inline T& operator*() { return *get(); } 238 inline T* operator->() { return get(); } 239 inline operator T*() { return get(); } 240 241 inline const T& operator*() const { return *get(); } 242 inline const T* operator->() const { return get(); } 243 inline operator const T*() const { return get(); } 244 245 inline void swapWith(AlignedArray<T>& rhs) 246 { 247 #ifdef TMV_INITIALIZE_NAN 248 TMV_SWAP(_n,rhs._n); 249 #endif 250 p.swapWith(rhs.p); 251 } 252 inline void resize(const ptrdiff_t n) 253 { 254 #ifdef TMV_INITIALIZE_NAN 255 for(ptrdiff_t i=0;i<_n;++i) get()[i] = T(-999); 256 #endif 257 p.deallocate(); 258 if (n > 0) p.allocate(n); 259 #ifdef TMV_INITIALIZE_NAN 260 _n = n; 261 for(ptrdiff_t i=0;i<_n;++i) get()[i] = TMV_Nan<T>::get(); 262 #endif 263 } 264 265 inline T* get() { return p.get(); } 266 inline const T* get() const { return p.get(); } 267 268 private : 269 270 AlignedMemory<T> p; 271 #ifdef TMV_INITIALIZE_NAN 272 ptrdiff_t _n; 273 #endif 274 275 AlignedArray& operator=(const AlignedArray& p2); 276 AlignedArray(const AlignedArray& p2); 277 }; 278 279 template <class RT> 280 class AlignedArray<std::complex<RT> > 281 { 282 public : 283 typedef std::complex<RT> T; 284 285 inline AlignedArray() 286 { 287 #ifdef TMV_INITIALIZE_NAN 288 _n = 0; 289 #endif 290 } 291 inline AlignedArray(const ptrdiff_t n) 292 { 293 if (n > 0) p.allocate(n<<1); 294 #ifdef TMV_INITIALIZE_NAN 295 _n = n; 296 for(ptrdiff_t i=0;i<_n;++i) 297 get()[i] = TMV_Nan<std::complex<RT> >::get(); 298 #endif 299 } 300 inline ~AlignedArray() 301 { 302 #ifdef TMV_INITIALIZE_NAN 303 for(ptrdiff_t i=0;i<_n;++i) get()[i] = std::complex<RT>(-999,-888); 304 #endif 305 p.deallocate(); 306 } 307 308 inline T& operator*() { return *get(); } 309 inline T* operator->() { return get(); } 310 inline operator T*() { return get(); } 311 312 inline const T& operator*() const { return *get(); } 313 inline const T* operator->() const { return get(); } 314 inline operator const T*() const { return get(); } 315 316 inline void swapWith(AlignedArray<T>& rhs) 317 { 318 #ifdef TMV_INITIALIZE_NAN 319 TMV_SWAP(_n,rhs._n); 320 #endif 321 p.swapWith(rhs.p); 322 } 323 inline void resize(const ptrdiff_t n) 324 { 325 #ifdef TMV_INITIALIZE_NAN 326 for(ptrdiff_t i=0;i<_n;++i) get()[i] = std::complex<RT>(-999,-888); 327 #endif 328 p.deallocate(); 329 if (n > 0) p.allocate(n<<1); 330 #ifdef TMV_INITIALIZE_NAN 331 _n = n; 332 for(ptrdiff_t i=0;i<_n;++i) 333 get()[i] = TMV_Nan<std::complex<RT> >::get(); 334 #endif 335 } 336 337 inline T* get() { return reinterpret_cast<T*>(p.get()); } 338 inline const T* get() const 339 { return reinterpret_cast<const T*>(p.get()); } 340 341 private : 342 343 AlignedMemory<RT> p; 344 #ifdef TMV_INITIALIZE_NAN 345 ptrdiff_t _n; 346 #endif 347 348 AlignedArray& operator=(const AlignedArray& p2); 349 AlignedArray(const AlignedArray& p2); 350 }; 351 352 353 354 // The rest of this file implements memory that is allocated on 355 // the stack rather than on the heap. Here things are a bit easier, 356 // since this will always align __m128 objects on 16 byte boundaries. 357 // So the way to get float or double aligned is simply to union 358 // the array with one of these. 359 360 // Another wrinkle though is that we don't actually want to use the 361 // stack for very large arrays, since it will crash. And very small 362 // arrays don't need the alignement. 363 // So here is a helper class that has bigN as a template parameter 364 // to decide whether to use the stack or not. 365 // And smallN is for N < 4 or N < 2 where the SSE alignment isn't 366 // necessary, to make sure we don't gratuitously use extra memory when 367 // we have a lot of SmallVector<float,2>'s or something like that. 368 template <typename T, int N, bool bigN, bool smallN> 369 class StackArray2; 370 371 template <typename T, int N> 372 class StackArray2<T,N,false,false> 373 // !bigN, !smallN 374 { 375 public: 376 #ifdef TMV_END_PADDING 377 inline StackArray2() { for(int i=N;i<NN;++i) get()[i] = T(0); } 378 #endif 379 inline T* get() { return p; } 380 inline const T* get() const { return p; } 381 private: 382 #ifdef TMV_END_PADDING 383 enum { NN = N + (16/sizeof(T)) }; 384 #else 385 enum { NN = N }; 386 #endif 387 #ifdef __GNUC__ 388 T p[NN] __attribute__ ((aligned (16))); 389 #else 390 T p[NN]; 391 #endif 392 }; 393 template <typename T, int N> 394 class StackArray2<T,N,false,true> 395 // smallN 396 { 397 public: 398 #ifdef TMV_END_PADDING 399 inline StackArray2() { for(int i=N;i<NN;++i) get()[i] = T(0); } 400 #endif 401 inline T* get() { return p; } 402 inline const T* get() const { return p; } 403 private: 404 #ifdef TMV_END_PADDING 405 enum { NN = N + 4 }; 406 #else 407 enum { NN = N==0 ? 1 : N }; 408 #endif 409 T p[NN]; 410 }; 411 412 #ifdef __SSE__ 413 template <int N> 414 class StackArray2<float,N,false,false> 415 // !bigN, !smallN 416 { 417 public: 418 #ifdef TMV_END_PADDING 419 inline StackArray2() { for(int i=N;i<NN;++i) get()[i] = 0.F; } 420 #endif 421 inline float* get() { return xp.p; } 422 inline const float* get() const { return xp.p; } 423 private: 424 #ifdef TMV_END_PADDING 425 enum { NN = N + 4 }; 426 #else 427 enum { NN = N==0 ? 1 : N }; 428 #endif 429 union { float p[NN]; __m128 x; } xp; 430 }; 431 template <int N> 432 class StackArray2<float,N,false,true> 433 // smallN 434 { 435 public: 436 #ifdef TMV_END_PADDING 437 inline StackArray2() { for(int i=N;i<NN;++i) get()[i] = 0.F; } 438 #endif 439 inline float* get() { return p; } 440 inline const float* get() const { return p; } 441 private: 442 #ifdef TMV_END_PADDING 443 enum { NN = N + 4 }; 444 #else 445 enum { NN = N==0 ? 1 : N }; 446 #endif 447 float p[NN]; 448 }; 449 #endif 450 #ifdef __SSE2__ 451 template <int N> 452 class StackArray2<double,N,false,false> 453 // !bigN, !smallN 454 { 455 public: 456 #ifdef TMV_END_PADDING 457 inline StackArray2() { for(int i=N;i<NN;++i) get()[i] = 0.; } 458 #endif 459 inline double* get() { return xp.p; } 460 inline const double* get() const { return xp.p; } 461 private: 462 #ifdef TMV_END_PADDING 463 enum { NN = N + 2 }; 464 #else 465 enum { NN = N==0 ? 1 : N }; 466 #endif 467 union { double p[NN]; __m128d x; } xp; 468 }; 469 template <int N> 470 class StackArray2<double,N,false,true> 471 // smallN 472 { 473 public: 474 #ifdef TMV_END_PADDING 475 inline StackArray2() { for(int i=N;i<NN;++i) get()[i] = 0.; } 476 #endif 477 inline double* get() { return p; } 478 inline const double* get() const { return p; } 479 private: 480 #ifdef TMV_END_PADDING 481 enum { NN = N + 2 }; 482 #else 483 enum { NN = N==0 ? 1 : N }; 484 #endif 485 double p[NN]; 486 }; 487 #endif 488 489 template <typename T, int N> 490 class StackArray2<T,N,true,false> 491 // bigN 492 { 493 public: 494 inline StackArray2() : p(N) {} 495 inline T* get() { return p.get(); } 496 inline const T* get() const { return p.get(); } 497 private: 498 AlignedArray<T> p; 499 }; 500 501 // Now the real class that we use: StackArray<T,N> 502 template <typename T, int N> 503 class StackArray 504 { 505 public : 506 inline StackArray() 507 { 508 #ifdef TMV_INITIALIZE_NAN 509 for(int i=0;i<N;++i) get()[i] = TMV_Nan<T>::get(); 510 #endif 511 } 512 inline ~StackArray() 513 { 514 #ifdef TMV_INITIALIZE_NAN 515 for(int i=0;i<N;++i) get()[i] = T(-999); 516 #endif 517 } 518 519 inline T& operator*() { return *get(); } 520 inline T* operator->() { return get(); } 521 inline operator T*() { return get(); } 522 523 inline const T& operator*() const { return *get(); } 524 inline const T* operator->() const { return get(); } 525 inline operator const T*() const { return get(); } 526 527 inline T* get() { return p.get(); } 528 inline const T* get() const { return p.get(); } 529 530 private : 531 enum { bigN = N*int(sizeof(T)) > TMV_MaxStack }; 532 enum { smallN = ( 533 #ifdef __SSE__ 534 Traits2<T,float>::sametype ? ( N < 4 ) : 535 #endif 536 #ifdef __SSE2__ 537 Traits2<T,double>::sametype ? ( N < 2 ) : 538 #endif 539 false ) }; 540 541 StackArray2<T,N,bigN,smallN> p; 542 543 StackArray& operator=(const StackArray& p2); 544 StackArray(const StackArray& p2); 545 }; 546 547 template <class RT, int N> 548 class StackArray<std::complex<RT>,N> 549 { 550 public : 551 typedef std::complex<RT> T; 552 553 inline StackArray() 554 { 555 #ifdef TMV_INITIALIZE_NAN 556 for(int i=0;i<N;++i) get()[i] = TMV_Nan<std::complex<RT> >::get(); 557 #endif 558 } 559 inline ~StackArray() 560 { 561 #ifdef TMV_INITIALIZE_NAN 562 for(int i=0;i<N;++i) get()[i] = std::complex<RT>(-999,-888); 563 #endif 564 } 565 566 inline T& operator*() { return *get(); } 567 inline T* operator->() { return get(); } 568 inline operator T*() { return get(); } 569 570 inline const T& operator*() const { return *get(); } 571 inline const T* operator->() const { return get(); } 572 inline operator const T*() const { return get(); } 573 574 inline T* get() { return reinterpret_cast<T*>(p.get()); } 575 inline const T* get() const 576 { return reinterpret_cast<const T*>(p.get()); } 577 578 private : 579 580 enum { bigN = N*int(sizeof(T)) > TMV_MaxStack }; 581 enum { smallN = ( 582 #ifdef __SSE__ 583 Traits2<T,float>::sametype ? ( N < 2 ) : 584 #endif 585 #ifdef __SSE2__ 586 Traits2<T,double>::sametype ? ( N < 1 ) : 587 #endif 588 false ) }; 589 590 StackArray2<RT,(N<<1),bigN,smallN> p; 591 592 StackArray& operator=(const StackArray& p2); 593 StackArray(const StackArray& p2); 594 }; 595 596 } // namespace tmv 597 598 #endif 599