1 // This file is part of QuTiP: Quantum Toolbox in Python.
2 //
3 // Copyright (c) 2011 and later, QuSTaR.
4 // All rights reserved.
5 //
6 // Redistribution and use in source and binary forms, with or without
7 // modification, are permitted provided that the following conditions are
8 // met:
9 //
10 // 1. Redistributions of source code must retain the above copyright notice,
11 // this list of conditions and the following disclaimer.
12 //
13 // 2. Redistributions in binary form must reproduce the above copyright
14 // notice, this list of conditions and the following disclaimer in the
15 // documentation and/or other materials provided with the distribution.
16 //
17 // 3. Neither the name of the QuTiP: Quantum Toolbox in Python nor the names
18 // of its contributors may be used to endorse or promote products derived
19 // from this software without specific prior written permission.
20 //
21 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
24 // PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 // HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 //#############################################################################
33 #include <complex>
34
35 #if defined(__GNUC__) && defined(__SSE3__) // Using GCC or CLANG and SSE3
36 #include <pmmintrin.h>
zspmvpy(const std::complex<double> * __restrict__ data,const int * __restrict__ ind,const int * __restrict__ ptr,const std::complex<double> * __restrict__ vec,const std::complex<double> a,std::complex<double> * __restrict__ out,const unsigned int nrows)37 void zspmvpy(const std::complex<double> * __restrict__ data, const int * __restrict__ ind,
38 const int * __restrict__ ptr,
39 const std::complex<double> * __restrict__ vec, const std::complex<double> a,
40 std::complex<double> * __restrict__ out, const unsigned int nrows)
41 {
42 size_t row, jj;
43 unsigned int row_start, row_end;
44 __m128d num1, num2, num3, num4;
45 for (row=0; row < nrows; row++)
46 {
47 num4 = _mm_setzero_pd();
48 row_start = ptr[row];
49 row_end = ptr[row+1];
50 for (jj=row_start; jj <row_end; jj++)
51 {
52 num1 = _mm_loaddup_pd(&reinterpret_cast<const double(&)[2]>(data[jj])[0]);
53 num2 = _mm_set_pd(std::imag(vec[ind[jj]]),std::real(vec[ind[jj]]));
54 num3 = _mm_mul_pd(num2, num1);
55 num1 = _mm_loaddup_pd(&reinterpret_cast<const double(&)[2]>(data[jj])[1]);
56 num2 = _mm_shuffle_pd(num2, num2, 1);
57 num2 = _mm_mul_pd(num2, num1);
58 num3 = _mm_addsub_pd(num3, num2);
59 num4 = _mm_add_pd(num3, num4);
60 }
61 num1 = _mm_loaddup_pd(&reinterpret_cast<const double(&)[2]>(a)[0]);
62 num3 = _mm_mul_pd(num4, num1);
63 num1 = _mm_loaddup_pd(&reinterpret_cast<const double(&)[2]>(a)[1]);
64 num4 = _mm_shuffle_pd(num4, num4, 1);
65 num4 = _mm_mul_pd(num4, num1);
66 num3 = _mm_addsub_pd(num3, num4);
67 num2 = _mm_loadu_pd((double *)&out[row]);
68 num3 = _mm_add_pd(num2, num3);
69 _mm_storeu_pd((double *)&out[row], num3);
70 }
71 }
72 #elif defined(__GNUC__) // Using GCC or CLANG but no SSE3
// Sparse matrix-vector multiply-accumulate: out += a * A * vec, where A is a
// CSR-format sparse matrix. Portable scalar fallback for GCC/Clang without SSE3.
//
// Parameters:
//   data  - nonzero elements of A (CSR order)
//   ind   - column index of each entry of `data`
//   ptr   - row pointers; row r occupies indices [ptr[r], ptr[r+1]) of data/ind
//   vec   - dense input vector
//   a     - scalar multiplier applied to the product A*vec
//   out   - dense output vector, updated in place
//   nrows - number of rows of A
void zspmvpy(const std::complex<double> * __restrict__ data, const int * __restrict__ ind,
             const int * __restrict__ ptr,
             const std::complex<double> * __restrict__ vec, const std::complex<double> a,
             std::complex<double> * __restrict__ out, const unsigned int nrows)
{
    for (size_t i = 0; i < nrows; ++i)
    {
        const unsigned int begin = ptr[i];
        const unsigned int end = ptr[i + 1];
        // Dot product of row i with vec.
        std::complex<double> acc(0.0, 0.0);
        for (size_t k = begin; k < end; ++k)
            acc += data[k] * vec[ind[k]];
        out[i] += a * acc;
    }
}
93 #elif defined(_MSC_VER) && defined(__AVX__) // Visual Studio with AVX
94 #include <pmmintrin.h>
zspmvpy(const std::complex<double> * __restrict data,const int * __restrict ind,const int * __restrict ptr,const std::complex<double> * __restrict vec,const std::complex<double> a,std::complex<double> * __restrict out,const unsigned int nrows)95 void zspmvpy(const std::complex<double> * __restrict data, const int * __restrict ind,
96 const int * __restrict ptr,
97 const std::complex<double> * __restrict vec, const std::complex<double> a,
98 std::complex<double> * __restrict out, const unsigned int nrows)
99 {
100 size_t row, jj;
101 unsigned int row_start, row_end;
102 __m128d num1, num2, num3, num4;
103 for (row=0; row < nrows; row++)
104 {
105 num4 = _mm_setzero_pd();
106 row_start = ptr[row];
107 row_end = ptr[row+1];
108 for (jj=row_start; jj <row_end; jj++)
109 {
110 num1 = _mm_loaddup_pd(&reinterpret_cast<const double(&)[2]>(data[jj])[0]);
111 num2 = _mm_set_pd(std::imag(vec[ind[jj]]),std::real(vec[ind[jj]]));
112 num3 = _mm_mul_pd(num2, num1);
113 num1 = _mm_loaddup_pd(&reinterpret_cast<const double(&)[2]>(data[jj])[1]);
114 num2 = _mm_shuffle_pd(num2, num2, 1);
115 num2 = _mm_mul_pd(num2, num1);
116 num3 = _mm_addsub_pd(num3, num2);
117 num4 = _mm_add_pd(num3, num4);
118 }
119 num1 = _mm_loaddup_pd(&reinterpret_cast<const double(&)[2]>(a)[0]);
120 num3 = _mm_mul_pd(num4, num1);
121 num1 = _mm_loaddup_pd(&reinterpret_cast<const double(&)[2]>(a)[1]);
122 num4 = _mm_shuffle_pd(num4, num4, 1);
123 num4 = _mm_mul_pd(num4, num1);
124 num3 = _mm_addsub_pd(num3, num4);
125 num2 = _mm_loadu_pd((double *)&out[row]);
126 num3 = _mm_add_pd(num2, num3);
127 _mm_storeu_pd((double *)&out[row], num3);
128 }
129 }
130 #elif defined(_MSC_VER) // Visual Studio no AVX
// Sparse matrix-vector multiply-accumulate: out += a * A * vec, where A is a
// CSR-format sparse matrix. Portable scalar fallback for MSVC without AVX.
//
// Parameters:
//   data  - nonzero elements of A (CSR order)
//   ind   - column index of each entry of `data`
//   ptr   - row pointers; row r occupies indices [ptr[r], ptr[r+1]) of data/ind
//   vec   - dense input vector
//   a     - scalar multiplier applied to the product A*vec
//   out   - dense output vector, updated in place
//   nrows - number of rows of A
void zspmvpy(const std::complex<double> * __restrict data, const int * __restrict ind,
             const int * __restrict ptr,
             const std::complex<double> * __restrict vec, const std::complex<double> a,
             std::complex<double> * __restrict out, const unsigned int nrows)
{
    size_t i = 0;
    while (i < nrows)
    {
        const unsigned int stop = ptr[i + 1];
        // Dot product of row i with vec.
        std::complex<double> rowsum = 0;
        for (unsigned int k = ptr[i]; k < stop; ++k)
        {
            rowsum += data[k] * vec[ind[k]];
        }
        out[i] += a * rowsum;
        ++i;
    }
}
151 #else // Everything else
// Sparse matrix-vector multiply-accumulate: out += a * A * vec, where A is a
// CSR-format sparse matrix. Generic scalar fallback for all other compilers
// (no restrict qualifiers available).
//
// Parameters:
//   data  - nonzero elements of A (CSR order)
//   ind   - column index of each entry of `data`
//   ptr   - row pointers; row r occupies indices [ptr[r], ptr[r+1]) of data/ind
//   vec   - dense input vector
//   a     - scalar multiplier applied to the product A*vec
//   out   - dense output vector, updated in place
//   nrows - number of rows of A
void zspmvpy(const std::complex<double> * data, const int * ind,
             const int * ptr,
             const std::complex<double> * vec, const std::complex<double> a,
             std::complex<double> * out, const unsigned int nrows)
{
    for (size_t i = 0; i < nrows; ++i)
    {
        // Accumulate the dot product of row i with vec.
        std::complex<double> acc = 0;
        for (unsigned int k = ptr[i], stop = ptr[i + 1]; k < stop; ++k)
            acc += data[k] * vec[ind[k]];
        out[i] += a * acc;
    }
}
172 #endif
173