/*******************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#include "common.h"
#include "macros_msa.h"

CNAME(BLASLONG n,BLASLONG dummy0,BLASLONG dummy1,FLOAT dummy3,FLOAT * srcx,BLASLONG inc_x,FLOAT * srcy,BLASLONG inc_y,FLOAT * dummy,BLASLONG dummy2)31 int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3,
32 FLOAT *srcx, BLASLONG inc_x, FLOAT *srcy, BLASLONG inc_y,
33 FLOAT *dummy, BLASLONG dummy2)
34 {
35 BLASLONG i = 0, pref_offsetx, pref_offsety;
36 FLOAT *px, *py;
37 FLOAT x0, x1, x2, x3, x4, x5, x6, x7;
38 FLOAT y0, y1, y2, y3, y4, y5, y6, y7;
39 v2f64 xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7;
40 v2f64 yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7;
41
42 if (n < 0) return (0);
43
44 pref_offsetx = (BLASLONG)srcx & (L1_DATA_LINESIZE - 1);
45 if (pref_offsetx > 0)
46 {
47 pref_offsetx = L1_DATA_LINESIZE - pref_offsetx;
48 pref_offsetx = pref_offsetx / sizeof(FLOAT);
49 }
50
51 pref_offsety = (BLASLONG)srcy & (L1_DATA_LINESIZE - 1);
52 if (pref_offsety > 0)
53 {
54 pref_offsety = L1_DATA_LINESIZE - pref_offsety;
55 pref_offsety = pref_offsety / sizeof(FLOAT);
56 }
57
58 px = srcx;
59 py = srcy;
60
61 if ((1 == inc_x) && (1 == inc_y))
62 {
63 if (n >> 4)
64 {
65 LD_DP8_INC(px, 2, xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7);
66
67 for (i = (n >> 4) - 1; i--;)
68 {
69 PREFETCH(px + pref_offsetx + 16);
70 PREFETCH(px + pref_offsetx + 20);
71 PREFETCH(px + pref_offsetx + 24);
72 PREFETCH(px + pref_offsetx + 28);
73
74 PREFETCH(py + pref_offsety + 16);
75 PREFETCH(py + pref_offsety + 20);
76 PREFETCH(py + pref_offsety + 24);
77 PREFETCH(py + pref_offsety + 28);
78
79 yv0 = LD_DP(py); py += 2;
80 ST_DP(xv0, srcy); srcy += 2;
81 yv1 = LD_DP(py); py += 2;
82 ST_DP(xv1, srcy); srcy += 2;
83 yv2 = LD_DP(py); py += 2;
84 ST_DP(xv2, srcy); srcy += 2;
85 yv3 = LD_DP(py); py += 2;
86 ST_DP(xv3, srcy); srcy += 2;
87 yv4 = LD_DP(py); py += 2;
88 ST_DP(xv4, srcy); srcy += 2;
89 yv5 = LD_DP(py); py += 2;
90 ST_DP(xv5, srcy); srcy += 2;
91 yv6 = LD_DP(py); py += 2;
92 ST_DP(xv6, srcy); srcy += 2;
93 yv7 = LD_DP(py); py += 2;
94 ST_DP(xv7, srcy); srcy += 2;
95
96 xv0 = LD_DP(px); px += 2;
97 ST_DP(yv0, srcx); srcx += 2;
98 xv1 = LD_DP(px); px += 2;
99 ST_DP(yv1, srcx); srcx += 2;
100 xv2 = LD_DP(px); px += 2;
101 ST_DP(yv2, srcx); srcx += 2;
102 xv3 = LD_DP(px); px += 2;
103 ST_DP(yv3, srcx); srcx += 2;
104 xv4 = LD_DP(px); px += 2;
105 ST_DP(yv4, srcx); srcx += 2;
106 xv5 = LD_DP(px); px += 2;
107 ST_DP(yv5, srcx); srcx += 2;
108 xv6 = LD_DP(px); px += 2;
109 ST_DP(yv6, srcx); srcx += 2;
110 xv7 = LD_DP(px); px += 2;
111 ST_DP(yv7, srcx); srcx += 2;
112 }
113
114 LD_DP8_INC(py, 2, yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7);
115 ST_DP8_INC(xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7, srcy, 2);
116 ST_DP8_INC(yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7, srcx, 2);
117 }
118
119 if (n & 15)
120 {
121 if ((n & 8) && (n & 4) && (n & 2))
122 {
123 LD_DP7_INC(px, 2, xv0, xv1, xv2, xv3, xv4, xv5, xv6);
124 LD_DP7_INC(py, 2, yv0, yv1, yv2, yv3, yv4, yv5, yv6);
125 ST_DP7_INC(xv0, xv1, xv2, xv3, xv4, xv5, xv6, srcy, 2);
126 ST_DP7_INC(yv0, yv1, yv2, yv3, yv4, yv5, yv6, srcx, 2);
127 }
128 else if ((n & 8) && (n & 4))
129 {
130 LD_DP6_INC(px, 2, xv0, xv1, xv2, xv3, xv4, xv5);
131 LD_DP6_INC(py, 2, yv0, yv1, yv2, yv3, yv4, yv5);
132 ST_DP6_INC(xv0, xv1, xv2, xv3, xv4, xv5, srcy, 2);
133 ST_DP6_INC(yv0, yv1, yv2, yv3, yv4, yv5, srcx, 2);
134 }
135 else if ((n & 8) && (n & 2))
136 {
137 LD_DP5_INC(px, 2, xv0, xv1, xv2, xv3, xv4);
138 LD_DP5_INC(py, 2, yv0, yv1, yv2, yv3, yv4);
139 ST_DP5_INC(xv0, xv1, xv2, xv3, xv4, srcy, 2);
140 ST_DP5_INC(yv0, yv1, yv2, yv3, yv4, srcx, 2);
141 }
142 else if ((n & 4) && (n & 2))
143 {
144 LD_DP3_INC(px, 2, xv0, xv1, xv2);
145 LD_DP3_INC(py, 2, yv0, yv1, yv2);
146 ST_DP3_INC(xv0, xv1, xv2, srcy, 2);
147 ST_DP3_INC(yv0, yv1, yv2, srcx, 2);
148 }
149 else if (n & 8)
150 {
151 LD_DP4_INC(px, 2, xv0, xv1, xv2, xv3);
152 LD_DP4_INC(py, 2, yv0, yv1, yv2, yv3);
153 ST_DP4_INC(xv0, xv1, xv2, xv3, srcy, 2);
154 ST_DP4_INC(yv0, yv1, yv2, yv3, srcx, 2);
155 }
156 else if (n & 4)
157 {
158 LD_DP2_INC(px, 2, xv0, xv1);
159 LD_DP2_INC(py, 2, yv0, yv1);
160 ST_DP2_INC(xv0, xv1, srcy, 2);
161 ST_DP2_INC(yv0, yv1, srcx, 2);
162 }
163 else if (n & 2)
164 {
165 xv0 = LD_DP(px);
166 yv0 = LD_DP(py);
167
168 px += 2;
169 py += 2;
170
171 ST_DP(xv0, srcy);
172 ST_DP(yv0, srcx);
173
174 srcx += 2;
175 srcy += 2;
176 }
177
178 if (n & 1)
179 {
180 x0 = px[0];
181 y0 = py[0];
182 srcx[0] = y0;
183 srcy[0] = x0;
184 }
185 }
186 }
187 else if ((inc_x != 0) && (inc_y != 0))
188 {
189 for (i = (n >> 3); i--;)
190 {
191 LD_GP8_INC(px, inc_x, x0, x1, x2, x3, x4, x5, x6, x7);
192 LD_GP8_INC(py, inc_y, y0, y1, y2, y3, y4, y5, y6, y7);
193 ST_GP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, srcy, inc_y);
194 ST_GP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, srcx, inc_x);
195 }
196
197 if (n & 7)
198 {
199 if ((n & 4) && (n & 2) && (n & 1))
200 {
201 LD_GP7_INC(px, inc_x, x0, x1, x2, x3, x4, x5, x6);
202 LD_GP7_INC(py, inc_y, y0, y1, y2, y3, y4, y5, y6);
203 ST_GP7_INC(x0, x1, x2, x3, x4, x5, x6, srcy, inc_y);
204 ST_GP7_INC(y0, y1, y2, y3, y4, y5, y6, srcx, inc_x);
205 }
206 else if ((n & 4) && (n & 2))
207 {
208 LD_GP6_INC(px, inc_x, x0, x1, x2, x3, x4, x5);
209 LD_GP6_INC(py, inc_y, y0, y1, y2, y3, y4, y5);
210 ST_GP6_INC(x0, x1, x2, x3, x4, x5, srcy, inc_y);
211 ST_GP6_INC(y0, y1, y2, y3, y4, y5, srcx, inc_x);
212 }
213 else if ((n & 4) && (n & 1))
214 {
215 LD_GP5_INC(px, inc_x, x0, x1, x2, x3, x4);
216 LD_GP5_INC(py, inc_y, y0, y1, y2, y3, y4);
217 ST_GP5_INC(x0, x1, x2, x3, x4, srcy, inc_y);
218 ST_GP5_INC(y0, y1, y2, y3, y4, srcx, inc_x);
219 }
220 else if ((n & 2) && (n & 1))
221 {
222 LD_GP3_INC(px, inc_x, x0, x1, x2);
223 LD_GP3_INC(py, inc_y, y0, y1, y2);
224 ST_GP3_INC(x0, x1, x2, srcy, inc_y);
225 ST_GP3_INC(y0, y1, y2, srcx, inc_x);
226 }
227 else if (n & 4)
228 {
229 LD_GP4_INC(px, inc_x, x0, x1, x2, x3);
230 LD_GP4_INC(py, inc_y, y0, y1, y2, y3);
231 ST_GP4_INC(x0, x1, x2, x3, srcy, inc_y);
232 ST_GP4_INC(y0, y1, y2, y3, srcx, inc_x);
233 }
234 else if (n & 2)
235 {
236 LD_GP2_INC(px, inc_x, x0, x1);
237 LD_GP2_INC(py, inc_y, y0, y1);
238 ST_GP2_INC(x0, x1, srcy, inc_y);
239 ST_GP2_INC(y0, y1, srcx, inc_x);
240 }
241 else if (n & 1)
242 {
243 x0 = *srcx;
244 y0 = *srcy;
245
246 *srcx = y0;
247 *srcy = x0;
248 }
249 }
250 }
251 else
252 {
253 if (inc_x == inc_y)
254 {
255 if (n & 1)
256 {
257 x0 = *srcx;
258 *srcx = *srcy;
259 *srcy = x0;
260 }
261 else
262 return (0);
263 }
264 else
265 {
266 BLASLONG ix = 0, iy = 0;
267 while (i < n)
268 {
269 x0 = srcx[ix];
270 srcx[ix] = srcy[iy];
271 srcy[iy] = x0;
272 ix += inc_x;
273 iy += inc_y;
274 i++;
275 }
276 }
277 }
278 return (0);
279 }
280