1 ///////////////////////////////////////////////////////////////////////////////
2 //                                                                           //
3 // The Template Matrix/Vector Library for C++ was created by Mike Jarvis     //
4 // Copyright (C) 1998 - 2016                                                 //
5 // All rights reserved                                                       //
6 //                                                                           //
7 // The project is hosted at https://code.google.com/p/tmv-cpp/               //
8 // where you can find the current version and current documention.           //
9 //                                                                           //
10 // For concerns or problems with the software, Mike may be contacted at      //
11 // mike_jarvis17 [at] gmail.                                                 //
12 //                                                                           //
13 // This software is licensed under a FreeBSD license.  The file              //
14 // TMV_LICENSE should have bee included with this distribution.              //
15 // It not, you can get a copy from https://code.google.com/p/tmv-cpp/.       //
16 //                                                                           //
17 // Essentially, you can use this software however you want provided that     //
18 // you include the TMV_LICENSE file in any distribution that uses it.        //
19 //                                                                           //
20 ///////////////////////////////////////////////////////////////////////////////
21 
22 
23 // This file defines two classes that are used by TMV for allocating
24 // memory.
25 //
26 // First, AlignedArray works basically like a regular new v[]
27 // allocation, except that if we are doing SSE or SSE2, then we enforce
28 // that the allocation is aligned on a 128 byte boundary.
29 //
30 // Second, StackArray is used for small arrays and emulates a normal C array
31 // on the stack: T v[N].  However, when N is large, it uses the
32 // heap instead to avoid stack overflows.
33 
34 #ifndef Array_H
35 #define Array_H
36 
37 const ptrdiff_t TMV_MaxStack = 1024; // bytes
38 
39 #include <complex>
40 
41 namespace tmv
42 {
43     template <typename T>
TMV_Aligned(const T * p)44     inline bool TMV_Aligned(const T* p)
45     { return (reinterpret_cast<ptrdiff_t>(p) & 0xf) == 0; }
46 
47     // There doesn't seem to be any portable C++ function that guarantees
48     // that the memory allocated will be aligned as necessary for
49     // SSE functions.
50     // Sometimes posix_memalign or memalign does this job.
51     // But it doesn't seem to be standard, since some systems don't have it.
52     // Other systems have _mm_malloc (ICPC) or _aligned_malloc (WIN).
53     // So probably I should have ifdefs that check to see if one of these
54     // is available and use that instead.
55     //
56     // But for now we have this simple class that simply loads a bit more
57     // memory than necessary and then finds the starting point that is
58     // 16 byte aligned.
59 
60     // Also, the TMV_END_PADDING option is implemented here.  If it is
61     // defined, then we write 0's to the end of the full 16 byte word.
62     // This is mostly useful when running valgrind with a BLAS library
63     // that isn't careful about reading past the end of the allocated
64     // memory.  GotoBLAS for example.
65     // So if end padding is enabled, we make sure to allocate enough memory
66     // to finish the block of 16 bytes.  And we write 0's to the values
67     // that aren't part of the requested memory.
68 
69     // First the regular non-SSE version, where we don't need aligment.
70     template <typename T>
71     class AlignedMemory
72     {
73     public:
AlignedMemory()74         inline AlignedMemory() : p(0) {}
allocate(const ptrdiff_t n)75         inline void allocate(const ptrdiff_t n)
76         {
77 #ifdef TMV_END_PADDING
78             const ptrdiff_t nn = n + 16/sizeof(T);
79             p = new T[nn];
80             for(ptrdiff_t i=n;i<nn;++i) p[i] = T(0);
81 #else
82             p = new T[n];
83 #endif
84         }
deallocate()85         inline void deallocate()
86         { if (p) delete [] p; p=0; }
swapWith(AlignedMemory<T> & rhs)87         inline void swapWith(AlignedMemory<T>& rhs)
88         { T* temp = p; p = rhs.p; rhs.p = temp; }
get()89         inline T* get() { return p; }
get()90         inline const T* get() const { return p; }
91     private:
92         T* p;
93     };
94 
95     // Now specialize float and double
96     // We do this regardless of whether __SSE__ or __SSE2__ is defined,
97     // since these might be allocated in a unit that doesn't defined them
98     // and then have get() called in a unit that does.  This leads to problems!
99     // So we always make the float and double allocations aligned at
100     // 16 byte boundaries.
101     template <>
102     class AlignedMemory<float>
103     {
104     public:
AlignedMemory()105         AlignedMemory() : mem(0), p(0) {}
allocate(const ptrdiff_t n)106         inline void allocate(const ptrdiff_t n)
107         {
108 #ifdef TMV_END_PADDING
109             const ptrdiff_t nn = (n<<2)+15 + 16;
110             mem = new char[nn];
111             p = calculateP();
112             for(ptrdiff_t i=n;i<(nn>>2);++i) p[i] = 0.F;
113 #else
114             mem = new char[(n<<2)+15];
115             p = calculateP();
116 #endif
117             TMVAssert((void*)(mem+(n<<2)+15) >= (void*)(p+n));
118         }
deallocate()119         inline void deallocate()
120         { if (mem) delete [] mem; mem=0; p=0; }
swapWith(AlignedMemory<float> & rhs)121         inline void swapWith(AlignedMemory<float>& rhs)
122         {
123             char* temp1 = mem; mem = rhs.mem; rhs.mem = temp1;
124             float* temp2 = p; p = rhs.p; rhs.p = temp2;
125         }
get()126         inline float* get() { return p; }
calculateP()127         inline float* calculateP()
128         {
129             float* pf = reinterpret_cast<float*>(
130                 mem + ((0x10-((ptrdiff_t)(mem) & 0xf)) & ~0x10));
131             TMVAssert( ((ptrdiff_t)(pf) & 0xf) == 0);
132             return pf;
133         }
get()134         inline const float* get() const  { return p; }
135     private:
136         char* mem;
137         float* p;
138     };
139     template <>
140     class AlignedMemory<double>
141     {
142     public:
AlignedMemory()143         AlignedMemory() : mem(0), p(0) {}
allocate(const ptrdiff_t n)144         inline void allocate(const ptrdiff_t n)
145         {
146 #ifdef TMV_END_PADDING
147             const ptrdiff_t nn = (n<<3)+15 + 16;
148             mem = new char[nn];
149             p = calculateP();
150             for(ptrdiff_t i=n;i<(nn>>3);++i) p[i] = 0.;
151 #else
152             mem = new char[(n<<3)+15];
153             p = calculateP();
154 #endif
155             TMVAssert((void*)(mem+(n<<3)+15) >= (void*)(p+n));
156         }
deallocate()157         inline void deallocate()
158         { if (mem) delete [] mem; mem=0; p=0; }
swapWith(AlignedMemory<double> & rhs)159         inline void swapWith(AlignedMemory<double>& rhs)
160         {
161             char* temp1 = mem; mem = rhs.mem; rhs.mem = temp1;
162             double* temp2 = p; p = rhs.p; rhs.p = temp2;
163         }
calculateP()164         inline double* calculateP()
165         {
166             double* pd = reinterpret_cast<double*>(
167                 mem + ((0x10-((ptrdiff_t)(mem) & 0xf)) & ~0x10));
168             TMVAssert( ((ptrdiff_t)(pd) & 0xf) == 0);
169             return pd;
170         }
get()171         inline double* get() { return p; }
get()172         inline const double* get() const { return p; }
173 
174     private :
175         char* mem;
176         double* p;
177     };
178 
179 #ifdef TMV_INITIALIZE_NAN
180     // This option is to stress test the code to make sure it works
181     // ok if uninitialized data happens to have a nan in it.
182     // Naive BLAS calls can fail when there are nan's, since it recasts
183     // the equation y = A*x as y = beta*y + alpha*A*x with alpha=1 and
184     // beta=0.  So if y initially has nan's, then the beta*y
185     // part has 0*nan => nan, which is then added to whatever
186     // alpha*A*x has, so stays nan on output.
187     // TMV should be accounting for this kind of thing correctly, and
188     // this section here helps test for it.
189     template <typename T>
190     struct TMV_Nan
191     {
getTMV_Nan192         static inline T get()
193         {
194             static T zero(0);
195             static T nan =
196                 std::numeric_limits<T>::is_integer ? zero : zero/zero;
197             return nan;
198         }
199     };
200     template <typename T>
201     struct TMV_Nan<std::complex<T> >
202     {
203         static inline std::complex<T> get()
204         { return std::complex<T>(TMV_Nan<T>::get(),TMV_Nan<T>::get()); }
205     };
206 #endif
207 
208     // Now the actual class that makes AlignedMemory work like a normal
209     // pointer.
210     template <typename T>
211     class AlignedArray
212     {
213     public :
214 
215         inline AlignedArray()
216         {
217 #ifdef TMV_INITIALIZE_NAN
218             _n = 0;
219 #endif
220         }
221         inline AlignedArray(const ptrdiff_t n)
222         {
223             if (n > 0) p.allocate(n);
224 #ifdef TMV_INITIALIZE_NAN
225             _n = n;
226             for(ptrdiff_t i=0;i<_n;++i) get()[i] = TMV_Nan<T>::get();
227 #endif
228         }
229         inline ~AlignedArray()
230         {
231 #ifdef TMV_INITIALIZE_NAN
232             for(ptrdiff_t i=0;i<_n;++i) get()[i] = T(-999);
233 #endif
234             p.deallocate();
235         }
236 
237         inline T& operator*() { return *get(); }
238         inline T* operator->() { return get(); }
239         inline operator T*() { return get(); }
240 
241         inline const T& operator*() const { return *get(); }
242         inline const T* operator->() const { return get(); }
243         inline operator const T*() const { return get(); }
244 
245         inline void swapWith(AlignedArray<T>& rhs)
246         {
247 #ifdef TMV_INITIALIZE_NAN
248             TMV_SWAP(_n,rhs._n);
249 #endif
250             p.swapWith(rhs.p);
251         }
252         inline void resize(const ptrdiff_t n)
253         {
254 #ifdef TMV_INITIALIZE_NAN
255             for(ptrdiff_t i=0;i<_n;++i) get()[i] = T(-999);
256 #endif
257             p.deallocate();
258             if (n > 0) p.allocate(n);
259 #ifdef TMV_INITIALIZE_NAN
260             _n = n;
261             for(ptrdiff_t i=0;i<_n;++i) get()[i] = TMV_Nan<T>::get();
262 #endif
263         }
264 
265         inline T* get() { return p.get(); }
266         inline const T* get() const  { return p.get(); }
267 
268     private :
269 
270         AlignedMemory<T> p;
271 #ifdef TMV_INITIALIZE_NAN
272         ptrdiff_t _n;
273 #endif
274 
275         AlignedArray& operator=(const AlignedArray& p2);
276         AlignedArray(const AlignedArray& p2);
277     };
278 
279     template <class RT>
280     class AlignedArray<std::complex<RT> >
281     {
282     public :
283         typedef std::complex<RT> T;
284 
285         inline AlignedArray()
286         {
287 #ifdef TMV_INITIALIZE_NAN
288             _n = 0;
289 #endif
290         }
291         inline AlignedArray(const ptrdiff_t n)
292         {
293             if (n > 0) p.allocate(n<<1);
294 #ifdef TMV_INITIALIZE_NAN
295             _n = n;
296             for(ptrdiff_t i=0;i<_n;++i)
297                 get()[i] = TMV_Nan<std::complex<RT> >::get();
298 #endif
299         }
300         inline ~AlignedArray()
301         {
302 #ifdef TMV_INITIALIZE_NAN
303             for(ptrdiff_t i=0;i<_n;++i) get()[i] = std::complex<RT>(-999,-888);
304 #endif
305             p.deallocate();
306         }
307 
308         inline T& operator*() { return *get(); }
309         inline T* operator->() { return get(); }
310         inline operator T*() { return get(); }
311 
312         inline const T& operator*() const { return *get(); }
313         inline const T* operator->() const { return get(); }
314         inline operator const T*() const { return get(); }
315 
316         inline void swapWith(AlignedArray<T>& rhs)
317         {
318 #ifdef TMV_INITIALIZE_NAN
319             TMV_SWAP(_n,rhs._n);
320 #endif
321             p.swapWith(rhs.p);
322         }
323         inline void resize(const ptrdiff_t n)
324         {
325 #ifdef TMV_INITIALIZE_NAN
326             for(ptrdiff_t i=0;i<_n;++i) get()[i] = std::complex<RT>(-999,-888);
327 #endif
328             p.deallocate();
329             if (n > 0) p.allocate(n<<1);
330 #ifdef TMV_INITIALIZE_NAN
331             _n = n;
332             for(ptrdiff_t i=0;i<_n;++i)
333                 get()[i] = TMV_Nan<std::complex<RT> >::get();
334 #endif
335         }
336 
337         inline T* get() { return reinterpret_cast<T*>(p.get()); }
338         inline const T* get() const
339         { return reinterpret_cast<const T*>(p.get()); }
340 
341     private :
342 
343         AlignedMemory<RT> p;
344 #ifdef TMV_INITIALIZE_NAN
345         ptrdiff_t _n;
346 #endif
347 
348         AlignedArray& operator=(const AlignedArray& p2);
349         AlignedArray(const AlignedArray& p2);
350     };
351 
352 
353 
354     // The rest of this file implements memory that is allocated on
355     // the stack rather than on the heap.  Here things are a bit easier,
356     // since this will always align __m128 objects on 16 byte boundaries.
357     // So the way to get float or double aligned is simply to union
358     // the array with one of these.
359 
360     // Another wrinkle though is that we don't actually want to use the
361     // stack for very large arrays, since it will crash.  And very small
362     // arrays don't need the alignement.
363     // So here is a helper class that has bigN as a template parameter
364     // to decide whether to use the stack or not.
365     // And smallN is for N < 4 or N < 2 where the SSE alignment isn't
366     // necessary, to make sure we don't gratuitously use extra memory when
367     // we have a lot of SmallVector<float,2>'s or something like that.
368     template <typename T, int N, bool bigN, bool smallN>
369     class StackArray2;
370 
371     template <typename T, int N>
372     class StackArray2<T,N,false,false>
373     // !bigN, !smallN
374     {
375     public:
376 #ifdef TMV_END_PADDING
377         inline StackArray2() { for(int i=N;i<NN;++i) get()[i] = T(0); }
378 #endif
379         inline T* get() { return p; }
380         inline const T* get() const { return p; }
381     private:
382 #ifdef TMV_END_PADDING
383         enum { NN = N + (16/sizeof(T)) };
384 #else
385         enum { NN = N };
386 #endif
387 #ifdef __GNUC__
388         T p[NN] __attribute__ ((aligned (16)));
389 #else
390         T p[NN];
391 #endif
392     };
393     template <typename T, int N>
394     class StackArray2<T,N,false,true>
395     // smallN
396     {
397     public:
398 #ifdef TMV_END_PADDING
399         inline StackArray2() { for(int i=N;i<NN;++i) get()[i] = T(0); }
400 #endif
401         inline T* get() { return p; }
402         inline const T* get() const { return p; }
403     private:
404 #ifdef TMV_END_PADDING
405         enum { NN = N + 4 };
406 #else
407         enum { NN = N==0 ? 1 : N };
408 #endif
409         T p[NN];
410     };
411 
412 #ifdef __SSE__
413     template <int N>
414     class StackArray2<float,N,false,false>
415     // !bigN, !smallN
416     {
417     public:
418 #ifdef TMV_END_PADDING
419         inline StackArray2() { for(int i=N;i<NN;++i) get()[i] = 0.F; }
420 #endif
421         inline float* get() { return xp.p; }
422         inline const float* get() const { return xp.p; }
423     private:
424 #ifdef TMV_END_PADDING
425         enum { NN = N + 4 };
426 #else
427         enum { NN = N==0 ? 1 : N };
428 #endif
429         union { float p[NN]; __m128 x; } xp;
430     };
431     template <int N>
432     class StackArray2<float,N,false,true>
433     // smallN
434     {
435     public:
436 #ifdef TMV_END_PADDING
437         inline StackArray2() { for(int i=N;i<NN;++i) get()[i] = 0.F; }
438 #endif
439         inline float* get() { return p; }
440         inline const float* get() const { return p; }
441     private:
442 #ifdef TMV_END_PADDING
443         enum { NN = N + 4 };
444 #else
445         enum { NN = N==0 ? 1 : N };
446 #endif
447         float p[NN];
448     };
449 #endif
450 #ifdef __SSE2__
451     template <int N>
452     class StackArray2<double,N,false,false>
453     // !bigN, !smallN
454     {
455     public:
456 #ifdef TMV_END_PADDING
457         inline StackArray2() { for(int i=N;i<NN;++i) get()[i] = 0.; }
458 #endif
459         inline double* get() { return xp.p; }
460         inline const double* get() const { return xp.p; }
461     private:
462 #ifdef TMV_END_PADDING
463         enum { NN = N + 2 };
464 #else
465         enum { NN = N==0 ? 1 : N };
466 #endif
467         union { double p[NN]; __m128d x; } xp;
468     };
469     template <int N>
470     class StackArray2<double,N,false,true>
471     // smallN
472     {
473     public:
474 #ifdef TMV_END_PADDING
475         inline StackArray2() { for(int i=N;i<NN;++i) get()[i] = 0.; }
476 #endif
477         inline double* get() { return p; }
478         inline const double* get() const { return p; }
479     private:
480 #ifdef TMV_END_PADDING
481         enum { NN = N + 2 };
482 #else
483         enum { NN = N==0 ? 1 : N };
484 #endif
485         double p[NN];
486     };
487 #endif
488 
489     template <typename T, int N>
490     class StackArray2<T,N,true,false>
491     // bigN
492     {
493     public:
494         inline StackArray2() : p(N) {}
495         inline T* get() { return p.get(); }
496         inline const T* get() const { return p.get(); }
497     private:
498         AlignedArray<T> p;
499     };
500 
501     // Now the real class that we use: StackArray<T,N>
502     template <typename T, int N>
503     class StackArray
504     {
505     public :
506         inline StackArray()
507         {
508 #ifdef TMV_INITIALIZE_NAN
509             for(int i=0;i<N;++i) get()[i] = TMV_Nan<T>::get();
510 #endif
511         }
512         inline ~StackArray()
513         {
514 #ifdef TMV_INITIALIZE_NAN
515             for(int i=0;i<N;++i) get()[i] = T(-999);
516 #endif
517         }
518 
519         inline T& operator*() { return *get(); }
520         inline T* operator->() { return get(); }
521         inline operator T*() { return get(); }
522 
523         inline const T& operator*() const { return *get(); }
524         inline const T* operator->() const { return get(); }
525         inline operator const T*() const { return get(); }
526 
527         inline T* get() { return p.get(); }
528         inline const T* get() const { return p.get(); }
529 
530     private :
531         enum { bigN = N*int(sizeof(T)) > TMV_MaxStack };
532         enum { smallN = (
533 #ifdef __SSE__
534                 Traits2<T,float>::sametype ? ( N < 4 ) :
535 #endif
536 #ifdef __SSE2__
537                 Traits2<T,double>::sametype ? ( N < 2 ) :
538 #endif
539                 false ) };
540 
541         StackArray2<T,N,bigN,smallN> p;
542 
543         StackArray& operator=(const StackArray& p2);
544         StackArray(const StackArray& p2);
545     };
546 
547     template <class RT, int N>
548     class StackArray<std::complex<RT>,N>
549     {
550     public :
551         typedef std::complex<RT> T;
552 
553         inline StackArray()
554         {
555 #ifdef TMV_INITIALIZE_NAN
556             for(int i=0;i<N;++i) get()[i] = TMV_Nan<std::complex<RT> >::get();
557 #endif
558         }
559         inline ~StackArray()
560         {
561 #ifdef TMV_INITIALIZE_NAN
562             for(int i=0;i<N;++i) get()[i] = std::complex<RT>(-999,-888);
563 #endif
564         }
565 
566         inline T& operator*() { return *get(); }
567         inline T* operator->() { return get(); }
568         inline operator T*() { return get(); }
569 
570         inline const T& operator*() const { return *get(); }
571         inline const T* operator->() const { return get(); }
572         inline operator const T*() const { return get(); }
573 
574         inline T* get() { return reinterpret_cast<T*>(p.get()); }
575         inline const T* get() const
576         { return reinterpret_cast<const T*>(p.get()); }
577 
578     private :
579 
580         enum { bigN = N*int(sizeof(T)) > TMV_MaxStack };
581         enum { smallN = (
582 #ifdef __SSE__
583                 Traits2<T,float>::sametype ? ( N < 2 ) :
584 #endif
585 #ifdef __SSE2__
586                 Traits2<T,double>::sametype ? ( N < 1 ) :
587 #endif
588                 false ) };
589 
590         StackArray2<RT,(N<<1),bigN,smallN> p;
591 
592         StackArray& operator=(const StackArray& p2);
593         StackArray(const StackArray& p2);
594     };
595 
596 } // namespace tmv
597 
598 #endif
599