1 /*******************************************************************************
2 Copyright (c) 2016, The OpenBLAS Project
3 All rights reserved.
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are
6 met:
7 1. Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 2. Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the
12 distribution.
13 3. Neither the name of the OpenBLAS project nor the names of
14 its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *******************************************************************************/
27
28 #include "common.h"
29 #include <math.h>
30 #include "macros_msa.h"
31
32 #define AND_VEC_D(in) ((v2f64) ((v2i64) in & and_vec))
33
CNAME(BLASLONG n,FLOAT * x,BLASLONG inc_x)34 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
35 {
36 BLASLONG i;
37 FLOAT sumf = 0.0;
38 v2f64 src0, src1, src2, src3, src4, src5, src6, src7;
39 v2f64 src8, src9, src10, src11, src12, src13, src14, src15;
40 v2f64 sum_abs0 = {0, 0};
41 v2f64 sum_abs1 = {0, 0};
42 v2f64 sum_abs2 = {0, 0};
43 v2f64 sum_abs3 = {0, 0};
44 v2i64 and_vec = {0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF};
45
46 if (n <= 0 || inc_x <= 0) return (sumf);
47
48 if (1 == inc_x)
49 {
50 if (n > 16)
51 {
52 FLOAT *x_pref;
53 BLASLONG pref_offset;
54
55 pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
56 if (pref_offset > 0)
57 {
58 pref_offset = L1_DATA_LINESIZE - pref_offset;
59 pref_offset = pref_offset / sizeof(FLOAT);
60 }
61 x_pref = x + pref_offset + 64 + 16;
62
63 LD_DP8_INC(x, 2, src0, src1, src2, src3, src4, src5, src6, src7);
64 for (i = (n >> 4) - 1; i--;)
65 {
66 PREF_OFFSET(x_pref, 0);
67 PREF_OFFSET(x_pref, 32);
68 PREF_OFFSET(x_pref, 64);
69 PREF_OFFSET(x_pref, 96);
70 PREF_OFFSET(x_pref, 128);
71 PREF_OFFSET(x_pref, 160);
72 PREF_OFFSET(x_pref, 192);
73 PREF_OFFSET(x_pref, 224);
74 x_pref += 32;
75
76 LD_DP8_INC(x, 2, src8, src9, src10, src11, src12, src13, src14, src15);
77
78 sum_abs0 += AND_VEC_D(src0);
79 sum_abs1 += AND_VEC_D(src1);
80 sum_abs2 += AND_VEC_D(src2);
81 sum_abs3 += AND_VEC_D(src3);
82 sum_abs0 += AND_VEC_D(src4);
83 sum_abs1 += AND_VEC_D(src5);
84 sum_abs2 += AND_VEC_D(src6);
85 sum_abs3 += AND_VEC_D(src7);
86
87 LD_DP8_INC(x, 2, src0, src1, src2, src3, src4, src5, src6, src7);
88
89 sum_abs0 += AND_VEC_D(src8);
90 sum_abs1 += AND_VEC_D(src9);
91 sum_abs2 += AND_VEC_D(src10);
92 sum_abs3 += AND_VEC_D(src11);
93 sum_abs0 += AND_VEC_D(src12);
94 sum_abs1 += AND_VEC_D(src13);
95 sum_abs2 += AND_VEC_D(src14);
96 sum_abs3 += AND_VEC_D(src15);
97 }
98
99 LD_DP8_INC(x, 2, src8, src9, src10, src11, src12, src13, src14, src15);
100
101 sum_abs0 += AND_VEC_D(src0);
102 sum_abs1 += AND_VEC_D(src1);
103 sum_abs2 += AND_VEC_D(src2);
104 sum_abs3 += AND_VEC_D(src3);
105 sum_abs0 += AND_VEC_D(src4);
106 sum_abs1 += AND_VEC_D(src5);
107 sum_abs2 += AND_VEC_D(src6);
108 sum_abs3 += AND_VEC_D(src7);
109 sum_abs0 += AND_VEC_D(src8);
110 sum_abs1 += AND_VEC_D(src9);
111 sum_abs2 += AND_VEC_D(src10);
112 sum_abs3 += AND_VEC_D(src11);
113 sum_abs0 += AND_VEC_D(src12);
114 sum_abs1 += AND_VEC_D(src13);
115 sum_abs2 += AND_VEC_D(src14);
116 sum_abs3 += AND_VEC_D(src15);
117 }
118
119 if (n & 15)
120 {
121 if (n & 8)
122 {
123 LD_DP8_INC(x, 2, src0, src1, src2, src3, src4, src5, src6, src7);
124
125 sum_abs0 += AND_VEC_D(src0);
126 sum_abs1 += AND_VEC_D(src1);
127 sum_abs2 += AND_VEC_D(src2);
128 sum_abs3 += AND_VEC_D(src3);
129 sum_abs0 += AND_VEC_D(src4);
130 sum_abs1 += AND_VEC_D(src5);
131 sum_abs2 += AND_VEC_D(src6);
132 sum_abs3 += AND_VEC_D(src7);
133 }
134
135 if (n & 4)
136 {
137 LD_DP4_INC(x, 2, src0, src1, src2, src3);
138
139 sum_abs0 += AND_VEC_D(src0);
140 sum_abs1 += AND_VEC_D(src1);
141 sum_abs2 += AND_VEC_D(src2);
142 sum_abs3 += AND_VEC_D(src3);
143 }
144
145 if (n & 2)
146 {
147 LD_DP2_INC(x, 2, src0, src1);
148
149 sum_abs0 += AND_VEC_D(src0);
150 sum_abs1 += AND_VEC_D(src1);
151 }
152
153 if (n & 1)
154 {
155 src0 = LD_DP(x);
156
157 sum_abs0 += AND_VEC_D(src0);
158 }
159 }
160
161 sum_abs0 += sum_abs1 + sum_abs2 + sum_abs3;
162 sumf = sum_abs0[0] + sum_abs0[1];
163 }
164 else
165 {
166 inc_x *= 2;
167
168 if (n > 16)
169 {
170 LD_DP8_INC(x, inc_x, src0, src1, src2, src3, src4, src5, src6, src7);
171 for (i = (n >> 4) - 1; i--;)
172 {
173 LD_DP8_INC(x, inc_x, src8, src9, src10, src11, src12, src13, src14, src15);
174
175 sum_abs0 += AND_VEC_D(src0);
176 sum_abs1 += AND_VEC_D(src1);
177 sum_abs2 += AND_VEC_D(src2);
178 sum_abs3 += AND_VEC_D(src3);
179 sum_abs0 += AND_VEC_D(src4);
180 sum_abs1 += AND_VEC_D(src5);
181 sum_abs2 += AND_VEC_D(src6);
182 sum_abs3 += AND_VEC_D(src7);
183
184 LD_DP8_INC(x, inc_x, src0, src1, src2, src3, src4, src5, src6, src7);
185
186 sum_abs0 += AND_VEC_D(src8);
187 sum_abs1 += AND_VEC_D(src9);
188 sum_abs2 += AND_VEC_D(src10);
189 sum_abs3 += AND_VEC_D(src11);
190 sum_abs0 += AND_VEC_D(src12);
191 sum_abs1 += AND_VEC_D(src13);
192 sum_abs2 += AND_VEC_D(src14);
193 sum_abs3 += AND_VEC_D(src15);
194 }
195
196 LD_DP8_INC(x, inc_x, src8, src9, src10, src11, src12, src13, src14, src15);
197
198 sum_abs0 += AND_VEC_D(src0);
199 sum_abs1 += AND_VEC_D(src1);
200 sum_abs2 += AND_VEC_D(src2);
201 sum_abs3 += AND_VEC_D(src3);
202 sum_abs0 += AND_VEC_D(src4);
203 sum_abs1 += AND_VEC_D(src5);
204 sum_abs2 += AND_VEC_D(src6);
205 sum_abs3 += AND_VEC_D(src7);
206 sum_abs0 += AND_VEC_D(src8);
207 sum_abs1 += AND_VEC_D(src9);
208 sum_abs2 += AND_VEC_D(src10);
209 sum_abs3 += AND_VEC_D(src11);
210 sum_abs0 += AND_VEC_D(src12);
211 sum_abs1 += AND_VEC_D(src13);
212 sum_abs2 += AND_VEC_D(src14);
213 sum_abs3 += AND_VEC_D(src15);
214 }
215
216 if (n & 15)
217 {
218 if (n & 8)
219 {
220 LD_DP8_INC(x, inc_x, src0, src1, src2, src3, src4, src5, src6, src7);
221
222 sum_abs0 += AND_VEC_D(src0);
223 sum_abs1 += AND_VEC_D(src1);
224 sum_abs2 += AND_VEC_D(src2);
225 sum_abs3 += AND_VEC_D(src3);
226 sum_abs0 += AND_VEC_D(src4);
227 sum_abs1 += AND_VEC_D(src5);
228 sum_abs2 += AND_VEC_D(src6);
229 sum_abs3 += AND_VEC_D(src7);
230 }
231
232 if (n & 4)
233 {
234 LD_DP4_INC(x, inc_x, src0, src1, src2, src3);
235
236 sum_abs0 += AND_VEC_D(src0);
237 sum_abs1 += AND_VEC_D(src1);
238 sum_abs2 += AND_VEC_D(src2);
239 sum_abs3 += AND_VEC_D(src3);
240 }
241
242 if (n & 2)
243 {
244 LD_DP2_INC(x, inc_x, src0, src1);
245
246 sum_abs0 += AND_VEC_D(src0);
247 sum_abs1 += AND_VEC_D(src1);
248 }
249
250 if (n & 1)
251 {
252 src0 = LD_DP(x);
253
254 sum_abs0 += AND_VEC_D(src0);
255 }
256 }
257
258 sum_abs0 += sum_abs1 + sum_abs2 + sum_abs3;
259 sumf = sum_abs0[0] + sum_abs0[1];
260 }
261
262 return (sumf);
263 }
264