1 ////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this
6 //  license. If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                           License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
14 // Third party copyrights are property of their respective owners.
15 //
16 // Redistribution and use in source and binary forms, with or without
17 // modification, are permitted provided that the following conditions are met:
18 //
19 //   * Redistribution's of source code must retain the above copyright notice,
20 //     this list of conditions and the following disclaimer.
21 //
22 //   * Redistribution's in binary form must reproduce the above copyright notice,
23 //     this list of conditions and the following disclaimer in the documentation
24 //     and/or other materials provided with the distribution.
25 //
26 //   * The name of the copyright holders may not be used to endorse or promote
27 //     products derived from this software without specific prior written
28 //     permission.
29 //
30 // This software is provided by the copyright holders and contributors "as is"
31 // and any express or implied warranties, including, but not limited to, the
32 // implied warranties of merchantability and fitness for a particular purpose
33 // are disclaimed. In no event shall the Intel Corporation or contributors be
34 // liable for any direct, indirect, incidental, special, exemplary, or
35 // consequential damages (including, but not limited to, procurement of
36 // substitute goods or services; loss of use, data, or profits; or business
37 // interruption) however caused and on any theory of liability, whether in
38 // contract, strict liability, or tort (including negligence or otherwise)
39 // arising in any way out of the use of this software, even if advised of the
40 // possibility of such damage.
41 //
42 ////////////////////////////////////////////////////////////////////////////////
43 #ifndef OPENCV_TRANSPOSE_HPP_
44 #define OPENCV_TRANSPOSE_HPP_
45 
46 //! @addtogroup matlab
47 //! @{
48 
// Transpose (and cast) one small block of at most 4x4 elements.
//
// For every fast < M and slow < N, the element src[fast + slow*lda] ends up in
// dst[slow + fast*ldb]. The InputScalar -> OutputScalar conversion happens by
// plain assignment when the staged value is written to dst.
//
//   M    number of adjacent source elements taken from each source line
//   N    number of source lines, lda elements apart
//   src  first element of the source block
//   lda  distance, in elements, between consecutive source lines
//   dst  first element of the destination block
//   ldb  distance, in elements, between consecutive destination lines
//
// The staging buffer holds 16 elements addressed with a stride of 4, so M and N
// must not exceed 4; this is not checked here.
template <typename InputScalar, typename OutputScalar>
void transposeBlock(const size_t M, const size_t N, const InputScalar* src, size_t lda, OutputScalar* dst, size_t ldb) {
  InputScalar tile[16];

  // stage the whole source block first, walking each source line in order
  for (size_t slow = 0; slow < N; ++slow) {
    const size_t srcBase  = slow*lda;
    const size_t tileBase = slow*4;
    for (size_t fast = 0; fast < M; ++fast) {
      tile[tileBase + fast] = src[srcBase + fast];
    }
  }

  // then emit it, walking each destination line in order
  for (size_t fast = 0; fast < M; ++fast) {
    const size_t dstBase = fast*ldb;
    for (size_t slow = 0; slow < N; ++slow) {
      dst[dstBase + slow] = tile[slow*4 + fast];
    }
  }
}
61 
// Transpose (and cast) exactly one 4x4 block.
//
// For r, c in [0, 4): dst[r*ldb + c] receives src[c*lda + r]. All sixteen source
// elements are read into a local buffer before the first destination element is
// written; the InputScalar -> OutputScalar conversion happens on that final
// assignment.
//
//   src  first element of the source block
//   lda  distance, in elements, between consecutive source lines
//   dst  first element of the destination block
//   ldb  distance, in elements, between consecutive destination lines
template <typename InputScalar, typename OutputScalar>
void transpose4x4(const InputScalar* src, size_t lda, OutputScalar* dst, size_t ldb) {
  InputScalar tile[16];

  // copy the source into the cache contiguously, one source line per pass
  for (size_t line = 0; line < 4; ++line) {
    const InputScalar* in = src + line*lda;
    tile[4*line + 0] = in[0];
    tile[4*line + 1] = in[1];
    tile[4*line + 2] = in[2];
    tile[4*line + 3] = in[3];
  }

  // copy the destination out of the cache, one destination line per pass
  for (size_t line = 0; line < 4; ++line) {
    OutputScalar* out = dst + line*ldb;
    out[0] = tile[line];
    out[1] = tile[line + 4];
    out[2] = tile[line + 8];
    out[3] = tile[line + 12];
  }
}
76 
77 
/*
 * Vanilla copy, transpose and cast
 *
 * Writes the transpose of the M x N matrix a into b, converting every element
 * from InputScalar to OutputScalar by assignment. The work is split into 4x4
 * tiles plus the narrower strips left over on each edge, and every piece is
 * handed to transposeBlock().
 *
 *   major  'R' makes N the primary (contiguous) extent of a, so that
 *          a[i*lda + j] is written to b[j*ldb + i];
 *          any other value makes M the primary extent, so that
 *          a[i + j*lda] is written to b[j + i*ldb]   (i < M, j < N)
 *   M, N   extents of the source matrix
 *   a      source elements
 *   lda    distance, in elements, between consecutive lines of a
 *   b      destination elements
 *   ldb    distance, in elements, between consecutive lines of b
 */
template <typename InputScalar, typename OutputScalar>
void gemt(const char major, const size_t M, const size_t N, const InputScalar* a, size_t lda, OutputScalar* b, size_t ldb) {

  // 1x1 transpose is just copy
  if (M == 1 && N == 1) { *b = *a; return; }

  // get the interior 4x4 blocks, and the extra skirting
  // F* describe the primary (contiguous) extent of a, S* the secondary one
  const size_t Fblock = (major == 'R') ? N/4 : M/4;
  const size_t Frem   = (major == 'R') ? N%4 : M%4;
  const size_t Sblock = (major == 'R') ? M/4 : N/4;
  const size_t Srem   = (major == 'R') ? M%4 : N%4;

  // if less than 4x4, invoke the block transpose immediately
  // (both remainders equal the full extents here)
  if (M < 4 && N < 4) { transposeBlock(Frem, Srem, a, lda, b, ldb); return; }

  // transpose 4x4 blocks
  const InputScalar* aptr = a;
  OutputScalar* bptr = b;
  for (size_t second = 0; second < Sblock; ++second) {
    // Every band is four source lines deep, so band `second` starts 4*second
    // lines into a and 4*second elements into each line of b. This matches the
    // 4*Sblock offsets used for the trailing band below.
    aptr = a + 4*second*lda;
    bptr = b + 4*second;
    for (size_t first = 0; first < Fblock; ++first) {
      transposeBlock(4, 4, aptr, lda, bptr, ldb);
      //transpose4x4(aptr, lda, bptr, ldb);
      aptr+=4;
      bptr+=4*ldb;
    }
    // transpose trailing blocks on primary dimension
    transposeBlock(Frem, 4, aptr, lda, bptr, ldb);
  }
  // transpose trailing blocks on secondary dimension
  aptr = a + 4*Sblock*lda;
  bptr = b + 4*Sblock;
  for (size_t first = 0; first < Fblock; ++first) {
    transposeBlock(4, Srem, aptr, lda, bptr, ldb);
    aptr+=4;
    bptr+=4*ldb;
  }
  // transpose bottom right-hand corner
  // (aptr/bptr were left just past the last full tile of the trailing band)
  transposeBlock(Frem, Srem, aptr, lda, bptr, ldb);
}
122 
123 #ifdef __SSE2__
124 /*
125  * SSE2 supported fast copy, transpose and cast
126  */
127 #include <emmintrin.h>
128 
129 template <>
transpose4x4(const float * src,size_t lda,float * dst,size_t ldb)130 void transpose4x4<float, float>(const float* src, size_t lda, float* dst, size_t ldb) {
131   __m128 row0, row1, row2, row3;
132   row0 = _mm_loadu_ps(src);
133   row1 = _mm_loadu_ps(src+lda);
134   row2 = _mm_loadu_ps(src+2*lda);
135   row3 = _mm_loadu_ps(src+3*lda);
136   _MM_TRANSPOSE4_PS(row0, row1, row2, row3);
137   _mm_storeu_ps(dst, row0);
138   _mm_storeu_ps(dst+ldb, row1);
139   _mm_storeu_ps(dst+2*ldb, row2);
140   _mm_storeu_ps(dst+3*ldb, row3);
141 }
142 
143 #endif
144 
145 //! @}
146 
147 #endif
148