/*******************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#include "common.h"
#include <math.h>
#include "macros_msa.h"

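/* Mask off the IEEE-754 sign bit of each double-precision lane, i.e. take |value|. */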
#define AND_VEC_D(in)   ((v2f64) ((v2i64) in & and_vec))

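/* Sum-of-absolute-values kernel vectorized with MIPS MSA v2f64 operations.
 * Each LD_DP load covers two consecutive doubles (a real/imaginary pair in the
 * complex case, which the doubled stride below suggests). Four independent
 * accumulators hide add latency, and the lanes are reduced to a scalar at the end. */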
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG i;
    FLOAT sumf = 0.0;
    v2f64 src0, src1, src2, src3, src4, src5, src6, src7;
    v2f64 src8, src9, src10, src11, src12, src13, src14, src15;
    v2f64 sum_abs0 = {0, 0};
    v2f64 sum_abs1 = {0, 0};
    v2f64 sum_abs2 = {0, 0};
    v2f64 sum_abs3 = {0, 0};
    v2i64 and_vec = {0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF};

    if (n <= 0 || inc_x <= 0) return (sumf);

    if (1 == inc_x)
    {
        if (n > 15)
        {
            FLOAT *x_pref;
            BLASLONG pref_offset;

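            /* Align the prefetch pointer to an L1 cache-line boundary; prefetches
             * are then issued a fixed distance ahead of the current loads. */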
            pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
            if (pref_offset > 0)
            {
                pref_offset = L1_DATA_LINESIZE - pref_offset;
                pref_offset = pref_offset / sizeof(FLOAT);
            }
            x_pref = x + pref_offset + 64 + 16;

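            /* Software-pipelined main loop: the first batch of loads is issued here,
             * and each iteration overlaps loading the next batch with accumulating
             * the previous one. */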
            LD_DP8_INC(x, 2, src0, src1, src2, src3, src4, src5, src6, src7);
            for (i = (n >> 4) - 1; i--;)
            {
                PREF_OFFSET(x_pref, 0);
                PREF_OFFSET(x_pref, 32);
                PREF_OFFSET(x_pref, 64);
                PREF_OFFSET(x_pref, 96);
                PREF_OFFSET(x_pref, 128);
                PREF_OFFSET(x_pref, 160);
                PREF_OFFSET(x_pref, 192);
                PREF_OFFSET(x_pref, 224);
                x_pref += 32;

                LD_DP8_INC(x, 2, src8, src9, src10, src11, src12, src13, src14, src15);

                sum_abs0 += AND_VEC_D(src0);
                sum_abs1 += AND_VEC_D(src1);
                sum_abs2 += AND_VEC_D(src2);
                sum_abs3 += AND_VEC_D(src3);
                sum_abs0 += AND_VEC_D(src4);
                sum_abs1 += AND_VEC_D(src5);
                sum_abs2 += AND_VEC_D(src6);
                sum_abs3 += AND_VEC_D(src7);

                LD_DP8_INC(x, 2, src0, src1, src2, src3, src4, src5, src6, src7);

                sum_abs0 += AND_VEC_D(src8);
                sum_abs1 += AND_VEC_D(src9);
                sum_abs2 += AND_VEC_D(src10);
                sum_abs3 += AND_VEC_D(src11);
                sum_abs0 += AND_VEC_D(src12);
                sum_abs1 += AND_VEC_D(src13);
                sum_abs2 += AND_VEC_D(src14);
                sum_abs3 += AND_VEC_D(src15);
            }

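            /* Drain the pipeline: accumulate the two in-flight batches left over
             * from the loop above. */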
            LD_DP8_INC(x, 2, src8, src9, src10, src11, src12, src13, src14, src15);

            sum_abs0 += AND_VEC_D(src0);
            sum_abs1 += AND_VEC_D(src1);
            sum_abs2 += AND_VEC_D(src2);
            sum_abs3 += AND_VEC_D(src3);
            sum_abs0 += AND_VEC_D(src4);
            sum_abs1 += AND_VEC_D(src5);
            sum_abs2 += AND_VEC_D(src6);
            sum_abs3 += AND_VEC_D(src7);
            sum_abs0 += AND_VEC_D(src8);
            sum_abs1 += AND_VEC_D(src9);
            sum_abs2 += AND_VEC_D(src10);
            sum_abs3 += AND_VEC_D(src11);
            sum_abs0 += AND_VEC_D(src12);
            sum_abs1 += AND_VEC_D(src13);
            sum_abs2 += AND_VEC_D(src14);
            sum_abs3 += AND_VEC_D(src15);
        }

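        /* Handle the remaining (n & 15) elements in blocks of 8, 4, 2 and 1. */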
        if (n & 15)
        {
            if (n & 8)
            {
                LD_DP8_INC(x, 2, src0, src1, src2, src3, src4, src5, src6, src7);

                sum_abs0 += AND_VEC_D(src0);
                sum_abs1 += AND_VEC_D(src1);
                sum_abs2 += AND_VEC_D(src2);
                sum_abs3 += AND_VEC_D(src3);
                sum_abs0 += AND_VEC_D(src4);
                sum_abs1 += AND_VEC_D(src5);
                sum_abs2 += AND_VEC_D(src6);
                sum_abs3 += AND_VEC_D(src7);
            }

            if (n & 4)
            {
                LD_DP4_INC(x, 2, src0, src1, src2, src3);

                sum_abs0 += AND_VEC_D(src0);
                sum_abs1 += AND_VEC_D(src1);
                sum_abs2 += AND_VEC_D(src2);
                sum_abs3 += AND_VEC_D(src3);
            }

            if (n & 2)
            {
                LD_DP2_INC(x, 2, src0, src1);

                sum_abs0 += AND_VEC_D(src0);
                sum_abs1 += AND_VEC_D(src1);
            }

            if (n & 1)
            {
                src0 = LD_DP(x);

                sum_abs0 += AND_VEC_D(src0);
            }
        }

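        /* Combine the four partial accumulators, then add the two lanes to get the scalar result. */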
        sum_abs0 += sum_abs1 + sum_abs2 + sum_abs3;
        sumf = sum_abs0[0] + sum_abs0[1];
    }
    else
    {
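        /* The caller's increment counts logical elements; each element spans two
         * doubles, so double it to step in units of FLOAT. */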
        inc_x *= 2;

        if (n > 15)
        {
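            /* Same software-pipelined pattern as the unit-stride path, using strided
             * vector loads and no explicit prefetch. */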
            LD_DP8_INC(x, inc_x, src0, src1, src2, src3, src4, src5, src6, src7);
            for (i = (n >> 4) - 1; i--;)
            {
                LD_DP8_INC(x, inc_x, src8, src9, src10, src11, src12, src13, src14, src15);

                sum_abs0 += AND_VEC_D(src0);
                sum_abs1 += AND_VEC_D(src1);
                sum_abs2 += AND_VEC_D(src2);
                sum_abs3 += AND_VEC_D(src3);
                sum_abs0 += AND_VEC_D(src4);
                sum_abs1 += AND_VEC_D(src5);
                sum_abs2 += AND_VEC_D(src6);
                sum_abs3 += AND_VEC_D(src7);

                LD_DP8_INC(x, inc_x, src0, src1, src2, src3, src4, src5, src6, src7);

                sum_abs0 += AND_VEC_D(src8);
                sum_abs1 += AND_VEC_D(src9);
                sum_abs2 += AND_VEC_D(src10);
                sum_abs3 += AND_VEC_D(src11);
                sum_abs0 += AND_VEC_D(src12);
                sum_abs1 += AND_VEC_D(src13);
                sum_abs2 += AND_VEC_D(src14);
                sum_abs3 += AND_VEC_D(src15);
            }

            LD_DP8_INC(x, inc_x, src8, src9, src10, src11, src12, src13, src14, src15);

            sum_abs0 += AND_VEC_D(src0);
            sum_abs1 += AND_VEC_D(src1);
            sum_abs2 += AND_VEC_D(src2);
            sum_abs3 += AND_VEC_D(src3);
            sum_abs0 += AND_VEC_D(src4);
            sum_abs1 += AND_VEC_D(src5);
            sum_abs2 += AND_VEC_D(src6);
            sum_abs3 += AND_VEC_D(src7);
            sum_abs0 += AND_VEC_D(src8);
            sum_abs1 += AND_VEC_D(src9);
            sum_abs2 += AND_VEC_D(src10);
            sum_abs3 += AND_VEC_D(src11);
            sum_abs0 += AND_VEC_D(src12);
            sum_abs1 += AND_VEC_D(src13);
            sum_abs2 += AND_VEC_D(src14);
            sum_abs3 += AND_VEC_D(src15);
        }

        if (n & 15)
        {
            if (n & 8)
            {
                LD_DP8_INC(x, inc_x, src0, src1, src2, src3, src4, src5, src6, src7);

                sum_abs0 += AND_VEC_D(src0);
                sum_abs1 += AND_VEC_D(src1);
                sum_abs2 += AND_VEC_D(src2);
                sum_abs3 += AND_VEC_D(src3);
                sum_abs0 += AND_VEC_D(src4);
                sum_abs1 += AND_VEC_D(src5);
                sum_abs2 += AND_VEC_D(src6);
                sum_abs3 += AND_VEC_D(src7);
            }

            if (n & 4)
            {
                LD_DP4_INC(x, inc_x, src0, src1, src2, src3);

                sum_abs0 += AND_VEC_D(src0);
                sum_abs1 += AND_VEC_D(src1);
                sum_abs2 += AND_VEC_D(src2);
                sum_abs3 += AND_VEC_D(src3);
            }

            if (n & 2)
            {
                LD_DP2_INC(x, inc_x, src0, src1);

                sum_abs0 += AND_VEC_D(src0);
                sum_abs1 += AND_VEC_D(src1);
            }

            if (n & 1)
            {
                src0 = LD_DP(x);

                sum_abs0 += AND_VEC_D(src0);
            }
        }

        sum_abs0 += sum_abs1 + sum_abs2 + sum_abs3;
        sumf = sum_abs0[0] + sum_abs0[1];
    }

    return (sumf);
}