1 /***************************************************************************
2 Copyright (c) 2020, The OpenBLAS Project
3 All rights reserved.
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are
6 met:
7 1. Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 2. Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the
12 distribution.
13 3. Neither the name of the OpenBLAS project nor the names of
14 its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *****************************************************************************/
27
28 #include "common.h"
29 #if !defined(DOUBLE)
30 #define VSETVL(n) vsetvl_e32m4(n)
31 #define VSETVL_MAX vsetvlmax_e32m1()
32 #define FLOAT_V_T vfloat32m4_t
33 #define FLOAT_V_T_M1 vfloat32m1_t
34 #define VLEV_FLOAT vle_v_f32m4
35 #define VLSEV_FLOAT vlse_v_f32m4
36 #define VFREDSUM_FLOAT vfredsum_vs_f32m4_f32m1
37 #define VFMACCVV_FLOAT vfmacc_vv_f32m4
38 #define VFMVVF_FLOAT vfmv_v_f_f32m4
39 #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
40 #define VFDOTVV_FLOAT vfdot_vv_f32m4
41 #define ABS fabsf
42 #define MASK_T vbool8_t
43 #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m4_m
44 #define VMFGTVF_FLOAT vmfgt_vf_f32m4_b8
45 #define VMFIRSTM vmfirst_m_b8
46 #define VFDIVVF_FLOAT vfdiv_vf_f32m4
47 #define VMFLTVF_FLOAT vmflt_vf_f32m4_b8
48 #define VFREDMAXVS_FLOAT vfredmax_vs_f32m4_f32m1
49 #else
50 #define VSETVL(n) vsetvl_e64m4(n)
51 #define VSETVL_MAX vsetvlmax_e64m1()
52 #define FLOAT_V_T vfloat64m4_t
53 #define FLOAT_V_T_M1 vfloat64m1_t
54 #define VLEV_FLOAT vle_v_f64m4
55 #define VLSEV_FLOAT vlse_v_f64m4
56 #define VFREDSUM_FLOAT vfredsum_vs_f64m4_f64m1
57 #define VFMACCVV_FLOAT vfmacc_vv_f64m4
58 #define VFMVVF_FLOAT vfmv_v_f_f64m4
59 #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
60 #define VFDOTVV_FLOAT vfdot_vv_f64m4
61 #define ABS fabs
62 #define MASK_T vbool16_t
63 #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m4_m
64 #define VMFGTVF_FLOAT vmfgt_vf_f64m4_b16
65 #define VMFIRSTM vmfirst_m_b16
66 #define VFDIVVF_FLOAT vfdiv_vf_f64m4
67 #define VMFLTVF_FLOAT vmflt_vf_f64m4_b16
68 #define VFREDMAXVS_FLOAT vfredmax_vs_f64m4_f64m1
69 #endif
70
CNAME(BLASLONG n,FLOAT * x,BLASLONG inc_x)71 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
72 {
73 BLASLONG i=0, j=0;
74
75 if ( n < 0 ) return(0.0);
76 // if(n == 1) return (ABS(x[0]));
77
78 FLOAT_V_T vr, v0, v_zero;
79 unsigned int gvl = 0;
80 FLOAT_V_T_M1 v_res, v_z0;
81 gvl = VSETVL_MAX;
82 v_res = VFMVVF_FLOAT_M1(0, gvl);
83 v_z0 = VFMVVF_FLOAT_M1(0, gvl);
84
85 FLOAT scale = 0.0, ssq = 0.0;
86 MASK_T mask;
87 BLASLONG index = 0;
88 if(inc_x == 1){
89 BLASLONG n2 = n * 2;
90 gvl = VSETVL(n2);
91 vr = VFMVVF_FLOAT(0, gvl);
92 v_zero = VFMVVF_FLOAT(0, gvl);
93 for(i=0,j=0; i<n2/gvl; i++){
94 v0 = VLEV_FLOAT(&x[j], gvl);
95 //fabs(vector)
96 mask = VMFLTVF_FLOAT(v0, 0, gvl);
97 v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl);
98 //if scale change
99 mask = VMFGTVF_FLOAT(v0, scale, gvl);
100 index = VMFIRSTM(mask, gvl);
101 if(index == -1){//no elements greater than scale
102 if(scale != 0.0){
103 v0 = VFDIVVF_FLOAT(v0, scale, gvl);
104 vr = VFMACCVV_FLOAT(vr, v0, v0, gvl);
105 }
106 }else{//found greater element
107 //ssq in vector vr: vr[0]
108 v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
109 //total ssq before current vector
110 ssq += v_res[0];
111 //find max
112 v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl);
113 //update ssq before max_index
114 ssq = ssq * (scale/v_res[0])*(scale/v_res[0]);
115 //update scale
116 scale = v_res[0];
117 //ssq in vector vr
118 v0 = VFDIVVF_FLOAT(v0, scale, gvl);
119 vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
120 }
121 j += gvl;
122 }
123 //ssq in vector vr: vr[0]
124 v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
125 //total ssq now
126 ssq += v_res[0];
127
128 //tail
129 if(j < n2){
130 gvl = VSETVL(n2-j);
131 v0 = VLEV_FLOAT(&x[j], gvl);
132 //fabs(vector)
133 mask = VMFLTVF_FLOAT(v0, 0, gvl);
134 v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl);
135 //if scale change
136 mask = VMFGTVF_FLOAT(v0, scale, gvl);
137 index = VMFIRSTM(mask, gvl);
138 if(index == -1){//no elements greater than scale
139 if(scale != 0.0)
140 v0 = VFDIVVF_FLOAT(v0, scale, gvl);
141 }else{//found greater element
142 //find max
143 v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl);
144 //update ssq before max_index
145 ssq = ssq * (scale/v_res[0])*(scale/v_res[0]);
146 //update scale
147 scale = v_res[0];
148 v0 = VFDIVVF_FLOAT(v0, scale, gvl);
149 }
150 vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
151 //ssq in vector vr: vr[0]
152 v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
153 //total ssq now
154 ssq += v_res[0];
155 }
156 }else{
157 gvl = VSETVL(n);
158 vr = VFMVVF_FLOAT(0, gvl);
159 v_zero = VFMVVF_FLOAT(0, gvl);
160 unsigned int stride_x = inc_x * sizeof(FLOAT) * 2;
161 int idx = 0, inc_v = inc_x * gvl * 2;
162 for(i=0,j=0; i<n/gvl; i++){
163 v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl);
164 //fabs(vector)
165 mask = VMFLTVF_FLOAT(v0, 0, gvl);
166 v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl);
167 //if scale change
168 mask = VMFGTVF_FLOAT(v0, scale, gvl);
169 index = VMFIRSTM(mask, gvl);
170 if(index == -1){//no elements greater than scale
171 if(scale != 0.0){
172 v0 = VFDIVVF_FLOAT(v0, scale, gvl);
173 vr = VFMACCVV_FLOAT(vr, v0, v0, gvl);
174 }
175 }else{//found greater element
176 //ssq in vector vr: vr[0]
177 v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
178 //total ssq before current vector
179 ssq += v_res[0];
180 //find max
181 v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl);
182 //update ssq before max_index
183 ssq = ssq * (scale/v_res[0])*(scale/v_res[0]);
184 //update scale
185 scale = v_res[0];
186 //ssq in vector vr
187 v0 = VFDIVVF_FLOAT(v0, scale, gvl);
188 vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
189 }
190
191 v0 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl);
192 //fabs(vector)
193 mask = VMFLTVF_FLOAT(v0, 0, gvl);
194 v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl);
195 //if scale change
196 mask = VMFGTVF_FLOAT(v0, scale, gvl);
197 index = VMFIRSTM(mask, gvl);
198 if(index == -1){//no elements greater than scale
199 if(scale != 0.0){
200 v0 = VFDIVVF_FLOAT(v0, scale, gvl);
201 vr = VFMACCVV_FLOAT(vr, v0, v0, gvl);
202 }
203 }else{//found greater element
204 //ssq in vector vr: vr[0]
205 v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
206 //total ssq before current vector
207 ssq += v_res[0];
208 //find max
209 v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl);
210 //update ssq before max_index
211 ssq = ssq * (scale/v_res[0])*(scale/v_res[0]);
212 //update scale
213 scale = v_res[0];
214 //ssq in vector vr
215 v0 = VFDIVVF_FLOAT(v0, scale, gvl);
216 vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
217 }
218 j += gvl;
219 idx += inc_v;
220 }
221 //ssq in vector vr: vr[0]
222 v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
223 //total ssq now
224 ssq += v_res[0];
225
226 //tail
227 if(j < n){
228 gvl = VSETVL(n-j);
229 v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl);
230 //fabs(vector)
231 mask = VMFLTVF_FLOAT(v0, 0, gvl);
232 v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl);
233 //if scale change
234 mask = VMFGTVF_FLOAT(v0, scale, gvl);
235 index = VMFIRSTM(mask, gvl);
236 if(index == -1){//no elements greater than scale
237 if(scale != 0.0){
238 v0 = VFDIVVF_FLOAT(v0, scale, gvl);
239 vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
240 }
241 }else{//found greater element
242 //find max
243 v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl);
244 //update ssq before max_index
245 ssq = ssq * (scale/v_res[0])*(scale/v_res[0]);
246 //update scale
247 scale = v_res[0];
248 v0 = VFDIVVF_FLOAT(v0, scale, gvl);
249 vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
250 }
251
252 v0 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl);
253 //fabs(vector)
254 mask = VMFLTVF_FLOAT(v0, 0, gvl);
255 v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl);
256 //if scale change
257 mask = VMFGTVF_FLOAT(v0, scale, gvl);
258 index = VMFIRSTM(mask, gvl);
259 if(index == -1){//no elements greater than scale
260 if(scale != 0.0){
261 v0 = VFDIVVF_FLOAT(v0, scale, gvl);
262 vr = VFMACCVV_FLOAT(vr, v0, v0, gvl);
263 }
264 }else{//found greater element
265 //ssq in vector vr: vr[0]
266 v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
267 //total ssq before current vector
268 ssq += v_res[0];
269 //find max
270 v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl);
271 //update ssq before max_index
272 ssq = ssq * (scale/v_res[0])*(scale/v_res[0]);
273 //update scale
274 scale = v_res[0];
275 v0 = VFDIVVF_FLOAT(v0, scale, gvl);
276 vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
277 }
278 //ssq in vector vr: vr[0]
279 v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
280 //total ssq now
281 ssq += v_res[0];
282 }
283 }
284 return(scale * sqrt(ssq));
285 }
286
287
288