1 /*******************************************************************************
2 Copyright (c) 2016, The OpenBLAS Project
3 All rights reserved.
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are
6 met:
7 1. Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 2. Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the
12 distribution.
13 3. Neither the name of the OpenBLAS project nor the names of
14 its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *******************************************************************************/
27
28 #include "common.h"
29 #include "macros_msa.h"
30
/*
 * Swap kernel: exchanges n FLOAT elements between the vector at srcx
 * (stride inc_x) and the vector at srcy (stride inc_y).
 *
 * Parameters
 *   n              number of elements to exchange.  A negative n returns
 *                  immediately; n == 0 falls through every path below
 *                  without loading or storing anything.
 *   srcx, inc_x    first vector and its element stride.
 *   srcy, inc_y    second vector and its element stride.
 *   dummy0, dummy1, dummy3, dummy, dummy2
 *                  never read by this function.
 *
 * Returns 0 on every path.
 *
 * Three code paths:
 *   1. inc_x == 1 && inc_y == 1 : v4f32 vector path, 32 elements per main
 *      loop iteration, followed by vector and scalar tails.
 *   2. both increments non-zero : scalar path, 8 elements per iteration,
 *      followed by a scalar tail.
 *   3. at least one increment 0 : special handling, see the last branch.
 *
 * NOTE(review): the LD_* / ST_* "_INC" helpers and PREFETCH come from headers
 * that are not visible here.  The comments below assume they load/store the
 * listed variables in order and advance the pointer argument by the given
 * step after each access -- confirm against macros_msa.h.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3,
          FLOAT *srcx, BLASLONG inc_x, FLOAT *srcy, BLASLONG inc_y,
          FLOAT *dummy, BLASLONG dummy2)
{
    BLASLONG i = 0, pref_offsetx, pref_offsety;
    FLOAT *px, *py;
    FLOAT x0, x1, x2, x3, x4, x5, x6, x7;
    FLOAT y0, y1, y2, y3, y4, y5, y6, y7;
    v4f32 xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7;
    v4f32 yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7;

    if (n < 0)  return (0);

    /*
     * Distance, in FLOAT elements, from srcx to the next multiple of
     * L1_DATA_LINESIZE bytes (0 when srcx already sits on such a boundary).
     * Only used to bias the prefetch addresses in the unit-stride loop.
     * The mask form assumes L1_DATA_LINESIZE is a power of two.
     */
    pref_offsetx = (BLASLONG)srcx & (L1_DATA_LINESIZE - 1);
    if (pref_offsetx > 0)
    {
        pref_offsetx = L1_DATA_LINESIZE - pref_offsetx;
        pref_offsetx = pref_offsetx / sizeof(FLOAT);
    }

    /* Same computation for srcy. */
    pref_offsety = (BLASLONG)srcy & (L1_DATA_LINESIZE - 1);
    if (pref_offsety > 0)
    {
        pref_offsety = L1_DATA_LINESIZE - pref_offsety;
        pref_offsety = pref_offsety / sizeof(FLOAT);
    }

    /* px/py are the read cursors; srcx/srcy are reused as write cursors. */
    px = srcx;
    py = srcy;

    if ((1 == inc_x) && (1 == inc_y))
    {
        /* ---- unit stride: full blocks of 32 elements (8 vectors) ---- */
        if (n >> 5)
        {
            /* Prologue: preload the first 32 x elements. */
            LD_SP8_INC(px, 4, xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7);

            /*
             * Runs (n >> 5) - 1 times.  Each pass stores the x block that
             * is already in registers into y, then loads the next x block
             * while storing the just-loaded y block into x.  The last
             * block is drained after the loop.
             */
            for (i = (n >> 5) - 1; i--;)
            {
                PREFETCH(px + pref_offsetx + 32);
                PREFETCH(px + pref_offsetx + 40);
                PREFETCH(px + pref_offsetx + 48);
                PREFETCH(px + pref_offsetx + 56);

                PREFETCH(py + pref_offsety + 32);
                PREFETCH(py + pref_offsety + 40);
                PREFETCH(py + pref_offsety + 48);
                PREFETCH(py + pref_offsety + 56);

                /* Each y vector is read before its slot is overwritten. */
                yv0 = LD_SP(py); py += 4;
                ST_SP(xv0, srcy); srcy += 4;
                yv1 = LD_SP(py); py += 4;
                ST_SP(xv1, srcy); srcy += 4;
                yv2 = LD_SP(py); py += 4;
                ST_SP(xv2, srcy); srcy += 4;
                yv3 = LD_SP(py); py += 4;
                ST_SP(xv3, srcy); srcy += 4;
                yv4 = LD_SP(py); py += 4;
                ST_SP(xv4, srcy); srcy += 4;
                yv5 = LD_SP(py); py += 4;
                ST_SP(xv5, srcy); srcy += 4;
                yv6 = LD_SP(py); py += 4;
                ST_SP(xv6, srcy); srcy += 4;
                yv7 = LD_SP(py); py += 4;
                ST_SP(xv7, srcy); srcy += 4;

                /* Next x block is read before the old y block lands in x. */
                xv0 = LD_SP(px); px += 4;
                ST_SP(yv0, srcx); srcx += 4;
                xv1 = LD_SP(px); px += 4;
                ST_SP(yv1, srcx); srcx += 4;
                xv2 = LD_SP(px); px += 4;
                ST_SP(yv2, srcx); srcx += 4;
                xv3 = LD_SP(px); px += 4;
                ST_SP(yv3, srcx); srcx += 4;
                xv4 = LD_SP(px); px += 4;
                ST_SP(yv4, srcx); srcx += 4;
                xv5 = LD_SP(px); px += 4;
                ST_SP(yv5, srcx); srcx += 4;
                xv6 = LD_SP(px); px += 4;
                ST_SP(yv6, srcx); srcx += 4;
                xv7 = LD_SP(px); px += 4;
                ST_SP(yv7, srcx); srcx += 4;
            }

            /* Epilogue: exchange the final block of 32. */
            LD_SP8_INC(py, 4, yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7);
            ST_SP8_INC(xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7, srcy, 4);
            ST_SP8_INC(yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7, srcx, 4);
        }

        /* ---- unit stride: remaining n & 31 elements ---- */
        if (n & 31)
        {
            /*
             * Bits 16/8/4 of n select how many whole vectors are left
             * (7, 6, 5, 3, 4, 2 or 1).  Multi-bit combinations are tested
             * first so that exactly one branch handles all of them.
             */
            if ((n & 16) && (n & 8) && (n & 4))
            {
                LD_SP7_INC(px, 4, xv0, xv1, xv2, xv3, xv4, xv5, xv6);
                LD_SP7_INC(py, 4, yv0, yv1, yv2, yv3, yv4, yv5, yv6);
                ST_SP7_INC(xv0, xv1, xv2, xv3, xv4, xv5, xv6, srcy, 4);
                ST_SP7_INC(yv0, yv1, yv2, yv3, yv4, yv5, yv6, srcx, 4);
            }
            else if ((n & 16) && (n & 8))
            {
                LD_SP6_INC(px, 4, xv0, xv1, xv2, xv3, xv4, xv5);
                LD_SP6_INC(py, 4, yv0, yv1, yv2, yv3, yv4, yv5);
                ST_SP6_INC(xv0, xv1, xv2, xv3, xv4, xv5, srcy, 4);
                ST_SP6_INC(yv0, yv1, yv2, yv3, yv4, yv5, srcx, 4);
            }
            else if ((n & 16) && (n & 4))
            {
                LD_SP5_INC(px, 4, xv0, xv1, xv2, xv3, xv4);
                LD_SP5_INC(py, 4, yv0, yv1, yv2, yv3, yv4);
                ST_SP5_INC(xv0, xv1, xv2, xv3, xv4, srcy, 4);
                ST_SP5_INC(yv0, yv1, yv2, yv3, yv4, srcx, 4);
            }
            else if ((n & 8) && (n & 4))
            {
                LD_SP3_INC(px, 4, xv0, xv1, xv2);
                LD_SP3_INC(py, 4, yv0, yv1, yv2);
                ST_SP3_INC(xv0, xv1, xv2, srcy, 4);
                ST_SP3_INC(yv0, yv1, yv2, srcx, 4);
            }
            else if (n & 16)
            {
                LD_SP4_INC(px, 4, xv0, xv1, xv2, xv3);
                LD_SP4_INC(py, 4, yv0, yv1, yv2, yv3);
                ST_SP4_INC(xv0, xv1, xv2, xv3, srcy, 4);
                ST_SP4_INC(yv0, yv1, yv2, yv3, srcx, 4);
            }
            else if (n & 8)
            {
                LD_SP2_INC(px, 4, xv0, xv1);
                LD_SP2_INC(py, 4, yv0, yv1);
                ST_SP2_INC(xv0, xv1, srcy, 4);
                ST_SP2_INC(yv0, yv1, srcx, 4);
            }
            else if (n & 4)
            {
                xv0 = LD_SP(px);
                yv0 = LD_SP(py);

                px += 4;
                py += 4;

                ST_SP(xv0, srcy);
                ST_SP(yv0, srcx);

                srcx += 4;
                srcy += 4;
            }

            /* Last 3, 2 or 1 elements are exchanged as scalars. */
            if ((n & 2) && (n & 1))
            {
                LD_GP3_INC(px, 1, x0, x1, x3);
                LD_GP3_INC(py, 1, y0, y1, y3);
                ST_GP3_INC(x0, x1, x3, srcy, 1);
                ST_GP3_INC(y0, y1, y3, srcx, 1);
            }
            else if (n & 2)
            {
                LD_GP2_INC(px, 1, x0, x1);
                LD_GP2_INC(py, 1, y0, y1);
                ST_GP2_INC(x0, x1, srcy, 1);
                ST_GP2_INC(y0, y1, srcx, 1);
            }
            else if (n & 1)
            {
                x0 = px[0];
                y0 = py[0];
                srcx[0] = y0;
                srcy[0] = x0;
            }
        }
    }
    else if ((inc_x != 0) && (inc_y != 0))
    {
        /* ---- general non-zero strides: scalar blocks of 8 ---- */
        for (i = (n >> 3); i--;)
        {
            LD_GP8_INC(px, inc_x, x0, x1, x2, x3, x4, x5, x6, x7);
            LD_GP8_INC(py, inc_y, y0, y1, y2, y3, y4, y5, y6, y7);
            ST_GP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, srcy, inc_y);
            ST_GP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, srcx, inc_x);
        }

        /* Remaining n & 7 elements, dispatched on bits 4/2/1 of n. */
        if (n & 7)
        {
            if ((n & 4) && (n & 2) && (n & 1))
            {
                LD_GP7_INC(px, inc_x, x0, x1, x2, x3, x4, x5, x6);
                LD_GP7_INC(py, inc_y, y0, y1, y2, y3, y4, y5, y6);
                ST_GP7_INC(x0, x1, x2, x3, x4, x5, x6, srcy, inc_y);
                ST_GP7_INC(y0, y1, y2, y3, y4, y5, y6, srcx, inc_x);
            }
            else if ((n & 4) && (n & 2))
            {
                LD_GP6_INC(px, inc_x, x0, x1, x2, x3, x4, x5);
                LD_GP6_INC(py, inc_y, y0, y1, y2, y3, y4, y5);
                ST_GP6_INC(x0, x1, x2, x3, x4, x5, srcy, inc_y);
                ST_GP6_INC(y0, y1, y2, y3, y4, y5, srcx, inc_x);
            }
            else if ((n & 4) && (n & 1))
            {
                LD_GP5_INC(px, inc_x, x0, x1, x2, x3, x4);
                LD_GP5_INC(py, inc_y, y0, y1, y2, y3, y4);
                ST_GP5_INC(x0, x1, x2, x3, x4, srcy, inc_y);
                ST_GP5_INC(y0, y1, y2, y3, y4, srcx, inc_x);
            }
            else if ((n & 2) && (n & 1))
            {
                LD_GP3_INC(px, inc_x, x0, x1, x2);
                LD_GP3_INC(py, inc_y, y0, y1, y2);
                ST_GP3_INC(x0, x1, x2, srcy, inc_y);
                ST_GP3_INC(y0, y1, y2, srcx, inc_x);
            }
            else if (n & 4)
            {
                LD_GP4_INC(px, inc_x, x0, x1, x2, x3);
                LD_GP4_INC(py, inc_y, y0, y1, y2, y3);
                ST_GP4_INC(x0, x1, x2, x3, srcy, inc_y);
                ST_GP4_INC(y0, y1, y2, y3, srcx, inc_x);
            }
            else if (n & 2)
            {
                LD_GP2_INC(px, inc_x, x0, x1);
                LD_GP2_INC(py, inc_y, y0, y1);
                ST_GP2_INC(x0, x1, srcy, inc_y);
                ST_GP2_INC(y0, y1, srcx, inc_x);
            }
            else if (n & 1)
            {
                /*
                 * Reads through the write cursors directly; presumably
                 * the ST_GP8_INC calls above have kept srcx/srcy in step
                 * with px/py -- confirm against macros_msa.h.
                 */
                x0 = *srcx;
                y0 = *srcy;

                *srcx = y0;
                *srcy = x0;
            }
        }
    }
    else
    {
        /* ---- at least one increment is zero ---- */
        if (inc_x == inc_y)
        {
            /*
             * Both increments are zero here, so every step would exchange
             * the same two elements.  An even count cancels out; an odd
             * count is equivalent to a single exchange.
             */
            if (n & 1)
            {
                x0 = *srcx;
                *srcx = *srcy;
                *srcy = x0;
            }
            else
                return (0);
        }
        else
        {
            /*
             * Exactly one increment is zero: exchange element by element
             * in order.  i is still 0 at this point because neither loop
             * above has run on this path.
             */
            BLASLONG ix = 0, iy = 0;
            while (i < n)
            {
                x0 = srcx[ix];
                srcx[ix] = srcy[iy];
                srcy[iy] = x0;
                ix += inc_x;
                iy += inc_y;
                i++;
            }
        }
    }

    return (0);
}
295