1 // This file is part of QuTiP: Quantum Toolbox in Python.
2 //
3 //    Copyright (c) 2011 and later, QuSTaR.
4 //   All rights reserved.
5 //
6 //    Redistribution and use in source and binary forms, with or without
7 //    modification, are permitted provided that the following conditions are
8 //    met:
9 //
10 //   1. Redistributions of source code must retain the above copyright notice,
11 //       this list of conditions and the following disclaimer.
12 //
13 //    2. Redistributions in binary form must reproduce the above copyright
14 //       notice, this list of conditions and the following disclaimer in the
15 //      documentation and/or other materials provided with the distribution.
16 //
17 //   3. Neither the name of the QuTiP: Quantum Toolbox in Python nor the names
18 //       of its contributors may be used to endorse or promote products derived
19 //       from this software without specific prior written permission.
20 //
21 //    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 //    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 //    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
24 //    PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 //    HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 //    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 //    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 //    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 //    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 //    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 //    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 //#############################################################################
33 #include <complex>
34 
35 #if defined(__GNUC__) && defined(__SSE3__) // Using GCC or CLANG and SSE3
36 #include <pmmintrin.h>
zspmvpy(const std::complex<double> * __restrict__ data,const int * __restrict__ ind,const int * __restrict__ ptr,const std::complex<double> * __restrict__ vec,const std::complex<double> a,std::complex<double> * __restrict__ out,const unsigned int nrows)37 void zspmvpy(const std::complex<double> * __restrict__ data, const int * __restrict__ ind,
38             const int * __restrict__ ptr,
39             const std::complex<double> * __restrict__ vec, const std::complex<double> a,
40             std::complex<double> * __restrict__ out, const unsigned int nrows)
41 {
42     size_t row, jj;
43     unsigned int row_start, row_end;
44     __m128d num1, num2, num3, num4;
45     for (row=0; row < nrows; row++)
46     {
47         num4 = _mm_setzero_pd();
48         row_start = ptr[row];
49         row_end = ptr[row+1];
50         for (jj=row_start; jj <row_end; jj++)
51         {
52             num1 = _mm_loaddup_pd(&reinterpret_cast<const double(&)[2]>(data[jj])[0]);
53             num2 = _mm_set_pd(std::imag(vec[ind[jj]]),std::real(vec[ind[jj]]));
54             num3 = _mm_mul_pd(num2, num1);
55             num1 = _mm_loaddup_pd(&reinterpret_cast<const double(&)[2]>(data[jj])[1]);
56             num2 = _mm_shuffle_pd(num2, num2, 1);
57             num2 = _mm_mul_pd(num2, num1);
58             num3 = _mm_addsub_pd(num3, num2);
59             num4 = _mm_add_pd(num3, num4);
60         }
61         num1 = _mm_loaddup_pd(&reinterpret_cast<const double(&)[2]>(a)[0]);
62         num3 = _mm_mul_pd(num4, num1);
63         num1 = _mm_loaddup_pd(&reinterpret_cast<const double(&)[2]>(a)[1]);
64         num4 = _mm_shuffle_pd(num4, num4, 1);
65         num4 = _mm_mul_pd(num4, num1);
66         num3 = _mm_addsub_pd(num3, num4);
67         num2 = _mm_loadu_pd((double *)&out[row]);
68         num3 = _mm_add_pd(num2, num3);
69         _mm_storeu_pd((double *)&out[row], num3);
70     }
71 }
72 #elif defined(__GNUC__) // Using GCC or CLANG but no SSE3
zspmvpy(const std::complex<double> * __restrict__ data,const int * __restrict__ ind,const int * __restrict__ ptr,const std::complex<double> * __restrict__ vec,const std::complex<double> a,std::complex<double> * __restrict__ out,const unsigned int nrows)73 void zspmvpy(const std::complex<double> * __restrict__ data, const int * __restrict__ ind,
74             const int * __restrict__ ptr,
75             const std::complex<double> * __restrict__ vec, const std::complex<double> a,
76             std::complex<double> * __restrict__ out, const unsigned int nrows)
77 {
78     size_t row, jj;
79     unsigned int row_start, row_end;
80     std::complex<double> dot;
81     for (row=0; row < nrows; row++)
82     {
83         dot = 0;
84         row_start = ptr[row];
85         row_end = ptr[row+1];
86         for (jj=row_start; jj <row_end; jj++)
87         {
88             dot += data[jj]*vec[ind[jj]];
89         }
90         out[row] += a*dot;
91     }
92 }
93 #elif defined(_MSC_VER) && defined(__AVX__) // Visual Studio with AVX
94 #include <pmmintrin.h>
zspmvpy(const std::complex<double> * __restrict data,const int * __restrict ind,const int * __restrict ptr,const std::complex<double> * __restrict vec,const std::complex<double> a,std::complex<double> * __restrict out,const unsigned int nrows)95 void zspmvpy(const std::complex<double> * __restrict data, const int * __restrict ind,
96             const int * __restrict ptr,
97             const std::complex<double> * __restrict vec, const std::complex<double> a,
98             std::complex<double> * __restrict out, const unsigned int nrows)
99 {
100     size_t row, jj;
101     unsigned int row_start, row_end;
102     __m128d num1, num2, num3, num4;
103     for (row=0; row < nrows; row++)
104     {
105         num4 = _mm_setzero_pd();
106         row_start = ptr[row];
107         row_end = ptr[row+1];
108         for (jj=row_start; jj <row_end; jj++)
109         {
110             num1 = _mm_loaddup_pd(&reinterpret_cast<const double(&)[2]>(data[jj])[0]);
111             num2 = _mm_set_pd(std::imag(vec[ind[jj]]),std::real(vec[ind[jj]]));
112             num3 = _mm_mul_pd(num2, num1);
113             num1 = _mm_loaddup_pd(&reinterpret_cast<const double(&)[2]>(data[jj])[1]);
114             num2 = _mm_shuffle_pd(num2, num2, 1);
115             num2 = _mm_mul_pd(num2, num1);
116             num3 = _mm_addsub_pd(num3, num2);
117             num4 = _mm_add_pd(num3, num4);
118         }
119         num1 = _mm_loaddup_pd(&reinterpret_cast<const double(&)[2]>(a)[0]);
120         num3 = _mm_mul_pd(num4, num1);
121         num1 = _mm_loaddup_pd(&reinterpret_cast<const double(&)[2]>(a)[1]);
122         num4 = _mm_shuffle_pd(num4, num4, 1);
123         num4 = _mm_mul_pd(num4, num1);
124         num3 = _mm_addsub_pd(num3, num4);
125         num2 = _mm_loadu_pd((double *)&out[row]);
126         num3 = _mm_add_pd(num2, num3);
127         _mm_storeu_pd((double *)&out[row], num3);
128     }
129 }
130 #elif defined(_MSC_VER) // Visual Studio no AVX
zspmvpy(const std::complex<double> * __restrict data,const int * __restrict ind,const int * __restrict ptr,const std::complex<double> * __restrict vec,const std::complex<double> a,std::complex<double> * __restrict out,const unsigned int nrows)131 void zspmvpy(const std::complex<double> * __restrict data, const int * __restrict ind,
132             const int * __restrict ptr,
133             const std::complex<double> * __restrict vec, const std::complex<double> a,
134             std::complex<double> * __restrict out, const unsigned int nrows)
135 {
136     size_t row, jj;
137     unsigned int row_start, row_end;
138     std::complex<double> dot;
139     for (row=0; row < nrows; row++)
140     {
141         dot = 0;
142         row_start = ptr[row];
143         row_end = ptr[row+1];
144         for (jj=row_start; jj <row_end; jj++)
145         {
146             dot += data[jj]*vec[ind[jj]];
147         }
148         out[row] += a*dot;
149     }
150 }
151 #else // Everything else
zspmvpy(const std::complex<double> * data,const int * ind,const int * ptr,const std::complex<double> * vec,const std::complex<double> a,std::complex<double> * out,const unsigned int nrows)152 void zspmvpy(const std::complex<double> * data, const int * ind,
153             const int * ptr,
154             const std::complex<double> * vec, const std::complex<double> a,
155             std::complex<double> * out, const unsigned int nrows)
156 {
157     size_t row, jj;
158     unsigned int row_start, row_end;
159     std::complex<double> dot;
160     for (row=0; row < nrows; row++)
161     {
162         dot = 0;
163         row_start = ptr[row];
164         row_end = ptr[row+1];
165         for (jj=row_start; jj <row_end; jj++)
166         {
167             dot += data[jj]*vec[ind[jj]];
168         }
169         out[row] += a*dot;
170     }
171 }
172 #endif
173