1 /***************************************************************************
2 Copyright (c) 2020, The OpenBLAS Project
3 All rights reserved.
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are
6 met:
7 1. Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 2. Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the
12 distribution.
13 3. Neither the name of the OpenBLAS project nor the names of
14 its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *****************************************************************************/
27 
28 #include "common.h"
29 #if !defined(DOUBLE)
30 #define VSETVL(n) vsetvl_e32m4(n)
31 #define VSETVL_MAX vsetvlmax_e32m1()
32 #define FLOAT_V_T vfloat32m4_t
33 #define FLOAT_V_T_M1 vfloat32m1_t
34 #define VLEV_FLOAT vle_v_f32m4
35 #define VLSEV_FLOAT vlse_v_f32m4
36 #define VFREDSUM_FLOAT vfredsum_vs_f32m4_f32m1
37 #define VFMACCVV_FLOAT vfmacc_vv_f32m4
38 #define VFMVVF_FLOAT vfmv_v_f_f32m4
39 #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
40 #define VFDOTVV_FLOAT vfdot_vv_f32m4
41 #define ABS fabsf
42 #define MASK_T vbool8_t
43 #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m4_m
44 #define VMFGTVF_FLOAT vmfgt_vf_f32m4_b8
45 #define VMFIRSTM vmfirst_m_b8
46 #define VFDIVVF_FLOAT vfdiv_vf_f32m4
47 #define VMFLTVF_FLOAT vmflt_vf_f32m4_b8
48 #define VFREDMAXVS_FLOAT vfredmax_vs_f32m4_f32m1
49 #else
50 #define VSETVL(n) vsetvl_e64m4(n)
51 #define VSETVL_MAX vsetvlmax_e64m1()
52 #define FLOAT_V_T vfloat64m4_t
53 #define FLOAT_V_T_M1 vfloat64m1_t
54 #define VLEV_FLOAT vle_v_f64m4
55 #define VLSEV_FLOAT vlse_v_f64m4
56 #define VFREDSUM_FLOAT vfredsum_vs_f64m4_f64m1
57 #define VFMACCVV_FLOAT vfmacc_vv_f64m4
58 #define VFMVVF_FLOAT vfmv_v_f_f64m4
59 #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
60 #define VFDOTVV_FLOAT vfdot_vv_f64m4
61 #define ABS fabs
62 #define MASK_T vbool16_t
63 #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m4_m
64 #define VMFGTVF_FLOAT vmfgt_vf_f64m4_b16
65 #define VMFIRSTM vmfirst_m_b16
66 #define VFDIVVF_FLOAT vfdiv_vf_f64m4
67 #define VMFLTVF_FLOAT vmflt_vf_f64m4_b16
68 #define VFREDMAXVS_FLOAT vfredmax_vs_f64m4_f64m1
69 #endif
70 
CNAME(BLASLONG n,FLOAT * x,BLASLONG inc_x)71 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
72 {
73 	BLASLONG i=0, j=0;
74 
75 	if ( n < 0 )  return(0.0);
76 //        if(n == 1) return (ABS(x[0]));
77 
78         FLOAT_V_T vr, v0, v_zero;
79         unsigned int gvl = 0;
80         FLOAT_V_T_M1 v_res, v_z0;
81         gvl = VSETVL_MAX;
82         v_res = VFMVVF_FLOAT_M1(0, gvl);
83         v_z0 = VFMVVF_FLOAT_M1(0, gvl);
84 
85         FLOAT scale = 0.0, ssq = 0.0;
86         MASK_T mask;
87         BLASLONG index = 0;
88         if(inc_x == 1){
89                 BLASLONG n2 = n * 2;
90                 gvl = VSETVL(n2);
91                 vr = VFMVVF_FLOAT(0, gvl);
92                 v_zero = VFMVVF_FLOAT(0, gvl);
93                 for(i=0,j=0; i<n2/gvl; i++){
94                         v0 = VLEV_FLOAT(&x[j], gvl);
95                         //fabs(vector)
96                         mask = VMFLTVF_FLOAT(v0, 0, gvl);
97                         v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl);
98                         //if scale change
99                         mask = VMFGTVF_FLOAT(v0, scale, gvl);
100                         index = VMFIRSTM(mask, gvl);
101                         if(index == -1){//no elements greater than scale
102                                 if(scale != 0.0){
103                                         v0 = VFDIVVF_FLOAT(v0, scale, gvl);
104                                         vr = VFMACCVV_FLOAT(vr, v0, v0, gvl);
105                                 }
106                         }else{//found greater element
107                                 //ssq in vector vr: vr[0]
108                                 v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
109                                 //total ssq before current vector
110                                 ssq += v_res[0];
111                                 //find max
112                                 v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl);
113                                 //update ssq before max_index
114                                 ssq = ssq * (scale/v_res[0])*(scale/v_res[0]);
115                                 //update scale
116                                 scale = v_res[0];
117                                 //ssq in vector vr
118                                 v0 = VFDIVVF_FLOAT(v0, scale, gvl);
119                                 vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
120                         }
121                         j += gvl;
122                 }
123                 //ssq in vector vr: vr[0]
124                 v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
125                 //total ssq now
126                 ssq += v_res[0];
127 
128                 //tail
129                 if(j < n2){
130                         gvl = VSETVL(n2-j);
131                         v0 = VLEV_FLOAT(&x[j], gvl);
132                         //fabs(vector)
133                         mask = VMFLTVF_FLOAT(v0, 0, gvl);
134                         v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl);
135                         //if scale change
136                         mask = VMFGTVF_FLOAT(v0, scale, gvl);
137                         index = VMFIRSTM(mask, gvl);
138                         if(index == -1){//no elements greater than scale
139                                 if(scale != 0.0)
140                                         v0 = VFDIVVF_FLOAT(v0, scale, gvl);
141                         }else{//found greater element
142                                 //find max
143                                 v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl);
144                                 //update ssq before max_index
145                                 ssq = ssq * (scale/v_res[0])*(scale/v_res[0]);
146                                 //update scale
147                                 scale = v_res[0];
148                                 v0 = VFDIVVF_FLOAT(v0, scale, gvl);
149                         }
150                         vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
151                         //ssq in vector vr: vr[0]
152                         v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
153                         //total ssq now
154                         ssq += v_res[0];
155                 }
156         }else{
157                 gvl = VSETVL(n);
158                 vr = VFMVVF_FLOAT(0, gvl);
159                 v_zero = VFMVVF_FLOAT(0, gvl);
160                 unsigned int stride_x = inc_x * sizeof(FLOAT) * 2;
161                 int idx = 0, inc_v = inc_x * gvl * 2;
162                 for(i=0,j=0; i<n/gvl; i++){
163                         v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl);
164                         //fabs(vector)
165                         mask = VMFLTVF_FLOAT(v0, 0, gvl);
166                         v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl);
167                         //if scale change
168                         mask = VMFGTVF_FLOAT(v0, scale, gvl);
169                         index = VMFIRSTM(mask, gvl);
170                         if(index == -1){//no elements greater than scale
171                                 if(scale != 0.0){
172                                         v0 = VFDIVVF_FLOAT(v0, scale, gvl);
173                                         vr = VFMACCVV_FLOAT(vr, v0, v0, gvl);
174                                 }
175                         }else{//found greater element
176                                 //ssq in vector vr: vr[0]
177                                 v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
178                                 //total ssq before current vector
179                                 ssq += v_res[0];
180                                 //find max
181                                 v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl);
182                                 //update ssq before max_index
183                                 ssq = ssq * (scale/v_res[0])*(scale/v_res[0]);
184                                 //update scale
185                                 scale = v_res[0];
186                                 //ssq in vector vr
187                                 v0 = VFDIVVF_FLOAT(v0, scale, gvl);
188                                 vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
189                         }
190 
191                         v0 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl);
192                         //fabs(vector)
193                         mask = VMFLTVF_FLOAT(v0, 0, gvl);
194                         v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl);
195                         //if scale change
196                         mask = VMFGTVF_FLOAT(v0, scale, gvl);
197                         index = VMFIRSTM(mask, gvl);
198                         if(index == -1){//no elements greater than scale
199                                 if(scale != 0.0){
200                                         v0 = VFDIVVF_FLOAT(v0, scale, gvl);
201                                         vr = VFMACCVV_FLOAT(vr, v0, v0, gvl);
202                                 }
203                         }else{//found greater element
204                                 //ssq in vector vr: vr[0]
205                                 v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
206                                 //total ssq before current vector
207                                 ssq += v_res[0];
208                                 //find max
209                                 v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl);
210                                 //update ssq before max_index
211                                 ssq = ssq * (scale/v_res[0])*(scale/v_res[0]);
212                                 //update scale
213                                 scale = v_res[0];
214                                 //ssq in vector vr
215                                 v0 = VFDIVVF_FLOAT(v0, scale, gvl);
216                                 vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
217                         }
218                         j += gvl;
219                         idx += inc_v;
220                 }
221                 //ssq in vector vr: vr[0]
222                 v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
223                 //total ssq now
224                 ssq += v_res[0];
225 
226                 //tail
227                 if(j < n){
228                         gvl = VSETVL(n-j);
229                         v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl);
230                         //fabs(vector)
231                         mask = VMFLTVF_FLOAT(v0, 0, gvl);
232                         v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl);
233                         //if scale change
234                         mask = VMFGTVF_FLOAT(v0, scale, gvl);
235                         index = VMFIRSTM(mask, gvl);
236                         if(index == -1){//no elements greater than scale
237                                 if(scale != 0.0){
238                                         v0 = VFDIVVF_FLOAT(v0, scale, gvl);
239                                         vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
240                                 }
241                         }else{//found greater element
242                                 //find max
243                                 v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl);
244                                 //update ssq before max_index
245                                 ssq = ssq * (scale/v_res[0])*(scale/v_res[0]);
246                                 //update scale
247                                 scale = v_res[0];
248                                 v0 = VFDIVVF_FLOAT(v0, scale, gvl);
249                                 vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
250                         }
251 
252                         v0 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl);
253                         //fabs(vector)
254                         mask = VMFLTVF_FLOAT(v0, 0, gvl);
255                         v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl);
256                         //if scale change
257                         mask = VMFGTVF_FLOAT(v0, scale, gvl);
258                         index = VMFIRSTM(mask, gvl);
259                         if(index == -1){//no elements greater than scale
260                                 if(scale != 0.0){
261                                         v0 = VFDIVVF_FLOAT(v0, scale, gvl);
262                                         vr = VFMACCVV_FLOAT(vr, v0, v0, gvl);
263                                 }
264                         }else{//found greater element
265                                 //ssq in vector vr: vr[0]
266                                 v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
267                                 //total ssq before current vector
268                                 ssq += v_res[0];
269                                 //find max
270                                 v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl);
271                                 //update ssq before max_index
272                                 ssq = ssq * (scale/v_res[0])*(scale/v_res[0]);
273                                 //update scale
274                                 scale = v_res[0];
275                                 v0 = VFDIVVF_FLOAT(v0, scale, gvl);
276                                 vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
277                         }
278                         //ssq in vector vr: vr[0]
279                         v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
280                         //total ssq now
281                         ssq += v_res[0];
282                 }
283         }
284 	return(scale * sqrt(ssq));
285 }
286 
287 
288