1 /***************************************************************************
2 Copyright (c) 2017, The OpenBLAS Project
3 All rights reserved.
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are
6 met:
7 1. Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 2. Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the
12 distribution.
13 3. Neither the name of the OpenBLAS project nor the names of
14 its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *****************************************************************************/
27 
28 #include "common.h"
29 
30 #define N		"x0"	/* vector length */
31 #define X		"x1"	/* "X" vector address */
32 #define INC_X		"x2"	/* "X" stride */
33 #define INDEX		"x3"	/* index of max/min value */
34 #define Z		"x4"	/* vector index */
35 #define J		"x5"	/* loop variable */
36 
37 #if !defined(DOUBLE)
38 #define MAXF		"s0"
39 #define TMPF0		"s1"
40 #define TMPF0V		"v1.2s"
41 #define TMPF1		"d4"
42 #define TMPF1V		"v4.2s"
43 #define N_KERNEL_SIZE	"32"
44 #define SZ		"8"
45 #define N_DIV_SHIFT	"5"
46 #define N_REM_MASK	"31"
47 #define INC_SHIFT	"3"
48 #else
49 #define MAXF		"d0"
50 #define TMPF0		"d1"
51 #define TMPF0V		"v1.2d"
52 #define TMPF1		"q4"
53 #define TMPF1V		"v4.2d"
54 #define N_KERNEL_SIZE	"16"
55 #define SZ		"16"
56 #define N_DIV_SHIFT	"4"
57 #define N_REM_MASK	"15"
58 #define INC_SHIFT	"4"
59 #endif
60 
61 /******************************************************************************/
62 
63 #if !defined(DOUBLE)
64 #define KERNEL_F						\
65 	"ldp	q2, q3, ["X"]				\n"	\
66 	"ldp	q4, q5, ["X", #32]			\n"	\
67 	"ldp	q6, q7, ["X", #64]			\n"	\
68 	"ldp	q16, q17, ["X", #96]			\n"	\
69 	"ldp	q18, q19, ["X", #128]			\n"	\
70 	"ldp	q20, q21, ["X", #160]			\n"	\
71 	"ldp	q22, q23, ["X", #192]			\n"	\
72 	"ldp	q24, q25, ["X", #224]			\n"	\
73 	"add	"X", "X", #256				\n"	\
74 	"fabs	v2.4s, v2.4s				\n"	\
75 	"fabs	v3.4s, v3.4s				\n"	\
76 	"fabs	v4.4s, v4.4s				\n"	\
77 	"fabs	v5.4s, v5.4s				\n"	\
78 	"fabs	v6.4s, v6.4s				\n"	\
79 	"fabs	v7.4s, v7.4s				\n"	\
80 	"fabs	v16.4s, v16.4s				\n"	\
81 	"fabs	v17.4s, v17.4s				\n"	\
82 	"fabs	v18.4s, v18.4s				\n"	\
83 	"fabs	v19.4s, v19.4s				\n"	\
84 	"fabs	v20.4s, v20.4s				\n"	\
85 	"fabs	v21.4s, v21.4s				\n"	\
86 	"fabs	v22.4s, v22.4s				\n"	\
87 	"fabs	v23.4s, v23.4s				\n"	\
88 	"fabs	v24.4s, v24.4s				\n"	\
89 	"fabs	v25.4s, v25.4s				\n"	\
90 	"faddp	v2.4s, v2.4s, v3.4s			\n"	\
91 	"faddp	v4.4s, v4.4s, v5.4s			\n"	\
92 	"faddp	v6.4s, v6.4s, v7.4s			\n"	\
93 	"faddp	v16.4s, v16.4s, v17.4s			\n"	\
94 	"faddp	v18.4s, v18.4s, v19.4s			\n"	\
95 	"faddp	v20.4s, v20.4s, v21.4s			\n"	\
96 	"faddp	v22.4s, v22.4s, v23.4s			\n"	\
97 	"faddp	v24.4s, v24.4s, v25.4s			\n"	\
98 	"fmax	v2.4s, v2.4s, v4.4s			\n"	\
99 	"fmax	v6.4s, v6.4s, v16.4s			\n"	\
100 	"fmax	v18.4s, v18.4s, v20.4s			\n"	\
101 	"fmax	v22.4s, v22.4s, v24.4s			\n"	\
102 	"PRFM	PLDL1KEEP, ["X", #1024]			\n"	\
103 	"PRFM	PLDL1KEEP, ["X", #1024+64]		\n"	\
104 	"PRFM	PLDL1KEEP, ["X", #1024+128]		\n"	\
105 	"PRFM	PLDL1KEEP, ["X", #1024+192]		\n"	\
106 	"fmax	v2.4s, v2.4s, v6.4s			\n"	\
107 	"fmax	v18.4s, v18.4s, v22.4s			\n"	\
108 	"fmax	v2.4s, v2.4s, v18.4s			\n"	\
109 	"fmaxv	"TMPF0", v2.4s				\n"	\
110 	"fcmp	"MAXF", "TMPF0"				\n"	\
111 	"fcsel	"MAXF", "MAXF", "TMPF0", ge		\n"	\
112 	"csel	"INDEX", "INDEX", "Z", ge		\n"	\
113 	"add	"Z", "Z", #"N_KERNEL_SIZE"		\n"
114 
115 #else
116 
117 #define KERNEL_F						\
118 	"ldp	q2, q3, ["X"]				\n"	\
119 	"ldp	q4, q5, ["X", #32]			\n"	\
120 	"ldp	q6, q7, ["X", #64]			\n"	\
121 	"ldp	q16, q17, ["X", #96]			\n"	\
122 	"ldp	q18, q19, ["X", #128]			\n"	\
123 	"ldp	q20, q21, ["X", #160]			\n"	\
124 	"ldp	q22, q23, ["X", #192]			\n"	\
125 	"ldp	q24, q25, ["X", #224]			\n"	\
126 	"add	"X", "X", #256				\n"	\
127 	"fabs	v2.2d, v2.2d				\n"	\
128 	"fabs	v3.2d, v3.2d				\n"	\
129 	"fabs	v4.2d, v4.2d				\n"	\
130 	"fabs	v5.2d, v5.2d				\n"	\
131 	"fabs	v6.2d, v6.2d				\n"	\
132 	"fabs	v7.2d, v7.2d				\n"	\
133 	"fabs	v16.2d, v16.2d				\n"	\
134 	"fabs	v17.2d, v17.2d				\n"	\
135 	"fabs	v18.2d, v18.2d				\n"	\
136 	"fabs	v19.2d, v19.2d				\n"	\
137 	"fabs	v20.2d, v20.2d				\n"	\
138 	"fabs	v21.2d, v21.2d				\n"	\
139 	"fabs	v22.2d, v22.2d				\n"	\
140 	"fabs	v23.2d, v23.2d				\n"	\
141 	"fabs	v24.2d, v24.2d				\n"	\
142 	"fabs	v25.2d, v25.2d				\n"	\
143 	"faddp	v2.2d, v2.2d, v3.2d			\n"	\
144 	"faddp	v4.2d, v4.2d, v5.2d			\n"	\
145 	"faddp	v6.2d, v6.2d, v7.2d			\n"	\
146 	"faddp	v16.2d, v16.2d, v17.2d			\n"	\
147 	"faddp	v18.2d, v18.2d, v19.2d			\n"	\
148 	"faddp	v20.2d, v20.2d, v21.2d			\n"	\
149 	"faddp	v22.2d, v22.2d, v23.2d			\n"	\
150 	"faddp	v24.2d, v24.2d, v25.2d			\n"	\
151 	"fmax	v2.2d, v2.2d, v4.2d			\n"	\
152 	"fmax	v6.2d, v6.2d, v16.2d			\n"	\
153 	"fmax	v18.2d, v18.2d, v20.2d			\n"	\
154 	"fmax	v22.2d, v22.2d, v24.2d			\n"	\
155 	"PRFM	PLDL1KEEP, ["X", #1024]			\n"	\
156 	"PRFM	PLDL1KEEP, ["X", #1024+64]		\n"	\
157 	"PRFM	PLDL1KEEP, ["X", #1024+128]		\n"	\
158 	"PRFM	PLDL1KEEP, ["X", #1024+192]		\n"	\
159 	"fmax	v2.2d, v2.2d, v6.2d			\n"	\
160 	"fmax	v18.2d, v18.2d, v22.2d			\n"	\
161 	"fmax	v2.2d, v2.2d, v18.2d			\n"	\
162 	"ins	v3.d[0], v2.d[1]			\n"	\
163 	"fmax	"TMPF0", d3, d2				\n"	\
164 	"fcmp	"MAXF", "TMPF0"				\n"	\
165 	"fcsel	"MAXF", "MAXF", "TMPF0", ge		\n"	\
166 	"csel	"INDEX", "INDEX", "Z", ge		\n"	\
167 	"add	"Z", "Z", #"N_KERNEL_SIZE"		\n"
168 #endif
169 
170 #define KERNEL_F_FINALIZE					\
171 	"sub	x6, "INDEX", #1				\n"	\
172 	"lsl	x6, x6, #"INC_SHIFT" 			\n"	\
173 	"add	x7, x7, x6				\n"	\
174 	"mov	x6, #0					\n"	\
175 	"1:						\n"	\
176 	"add	x6, x6, #1				\n"	\
177 	"cmp	x6, #"N_KERNEL_SIZE"			\n"	\
178 	"bge	2f					\n"	\
179 	"ldr	"TMPF1", [x7] 				\n"	\
180 	"fabs	"TMPF1V", "TMPF1V"			\n"	\
181 	"faddp	"TMPF0V", "TMPF1V", "TMPF1V"		\n"	\
182 	"fcmp	"MAXF", "TMPF0"				\n"	\
183 	"add	x7, x7, #"SZ"				\n"	\
184 	"bne	1b					\n"	\
185 	"2:						\n"	\
186 	"sub	x6, x6, #1				\n"	\
187 	"add	"INDEX", "INDEX", x6			\n"
188 
189 
190 #define INIT							\
191 	"lsl	"INC_X", "INC_X", #"INC_SHIFT"		\n"	\
192 	"ldr	"TMPF1", ["X"] 				\n"	\
193 	"fabs	"TMPF1V", "TMPF1V"			\n"	\
194 	"faddp	"TMPF0V", "TMPF1V", "TMPF1V"		\n"	\
195 	"fmov	"MAXF" , "TMPF0"			\n"	\
196 	"add	"X", "X", "INC_X"			\n"	\
197 	"mov	"Z", #1					\n"	\
198 	"mov	"INDEX", "Z"				\n"	\
199 	"fabs	"MAXF", "MAXF"				\n"
200 
201 
202 #define KERNEL_S1						\
203 	"ldr	"TMPF1", ["X"]				\n"	\
204 	"add	"X", "X", "INC_X"			\n"	\
205 	"add	"Z", "Z", #1				\n"	\
206 	"fabs	"TMPF1V", "TMPF1V"			\n"	\
207 	"faddp	"TMPF0V", "TMPF1V", "TMPF1V"		\n"	\
208 	"fcmp	"MAXF", "TMPF0"				\n"	\
209 	"fcsel	"MAXF", "MAXF", "TMPF0", ge		\n"	\
210 	"csel	"INDEX", "INDEX", "Z", ge		\n"
211 
212 
213 #if defined(SMP)
214 extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n,
215 	BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb,
216 	void *c, BLASLONG ldc, int (*function)(), int nthreads);
217 #endif
218 
219 
izamax_compute(BLASLONG n,FLOAT * x,BLASLONG inc_x)220 static BLASLONG izamax_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
221 {
222 	BLASLONG index = 0;
223 
224 	if ( n < 0 )  return index;
225 
226 	__asm__ __volatile__ (
227 	"	mov	"N", %[N_]				\n"
228 	"	mov	"X", %[X_]				\n"
229 	"	mov	"INC_X", %[INCX_]			\n"
230 
231 	"	cmp	"N", xzr				\n"
232 	"	ble	10f //izamax_kernel_zero		\n"
233 	"	cmp	"INC_X", xzr				\n"
234 	"	ble	10f //izamax_kernel_zero		\n"
235 	"	cmp	"INC_X", #1				\n"
236 	"	bne	5f //izamax_kernel_S_BEGIN		\n"
237 	"	mov	x7, "X"					\n"
238 
239 	"1: //izamax_kernel_F_BEGIN:				\n"
240 	"	"INIT"						\n"
241 	"	subs	"N", "N", #1				\n"
242 	"	ble	9f //izamax_kernel_L999			\n"
243 	"	asr	"J", "N", #"N_DIV_SHIFT"		\n"
244 	"	cmp	"J", xzr				\n"
245 	"	beq	3f //izamax_kernel_F1			\n"
246 	"	add	"Z", "Z", #1				\n"
247 
248 	"2: //izamax_kernel_F:					\n"
249 	"	"KERNEL_F"					\n"
250 	"	subs	"J", "J", #1				\n"
251 	"	bne	2b //izamax_kernel_F			\n"
252 	"	"KERNEL_F_FINALIZE"				\n"
253 	"	sub	"Z", "Z", #1				\n"
254 
255 	"3: //izamax_kernel_F1:					\n"
256 	"	ands	"J", "N", #"N_REM_MASK"			\n"
257 	"	ble	9f //izamax_kernel_L999			\n"
258 
259 	"4: //izamax_kernel_F10:				\n"
260 	"	"KERNEL_S1"					\n"
261 	"	subs	"J", "J", #1				\n"
262 	"	bne	4b //izamax_kernel_F10			\n"
263 	"	b	9f //izamax_kernel_L999			\n"
264 
265 	"5: //izamax_kernel_S_BEGIN:				\n"
266 	"	"INIT"						\n"
267 	"	subs	"N", "N", #1				\n"
268 	"	ble	9f //izamax_kernel_L999			\n"
269 	"	asr	"J", "N", #2				\n"
270 	"	cmp	"J", xzr				\n"
271 	"	ble	7f //izamax_kernel_S1			\n"
272 
273 	"6: //izamax_kernel_S4:					\n"
274 	"	"KERNEL_S1"					\n"
275 	"	"KERNEL_S1"					\n"
276 	"	"KERNEL_S1"					\n"
277 	"	"KERNEL_S1"					\n"
278 	"	subs	"J", "J", #1				\n"
279 	"	bne	6b //izamax_kernel_S4			\n"
280 
281 	"7: //izamax_kernel_S1:					\n"
282 	"	ands	"J", "N", #3				\n"
283 	"	ble	9f //izamax_kernel_L999			\n"
284 
285 	"8: //izamax_kernel_S10:				\n"
286 	"	"KERNEL_S1"					\n"
287 	"	subs	"J", "J", #1				\n"
288 	"	bne	8b //izamax_kernel_S10			\n"
289 
290 	"9: //izamax_kernel_L999:				\n"
291 	"	mov	x0, "INDEX"				\n"
292 	"	b	11f //izamax_kernel_DONE		\n"
293 
294 	"10: //izamax_kernel_zero:				\n"
295 	"	mov	x0, xzr					\n"
296 
297 	"11: //izamax_kernel_DONE:				\n"
298 	"	mov	%[INDEX_], "INDEX"			\n"
299 
300 	: [INDEX_] "=r" (index)		//%0
301 	: [N_]    "r"  (n),		//%1
302 	  [X_]    "r"  (x),		//%2
303 	  [INCX_] "r"  (inc_x)		//%3
304 	: "cc",
305 	  "memory",
306 	  "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
307 	  "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"
308 	);
309 
310 	return index;
311 }
312 
313 #if defined(SMP)
izamax_thread_function(BLASLONG n,BLASLONG dummy0,BLASLONG dummy1,FLOAT dummy2,FLOAT * x,BLASLONG inc_x,FLOAT * y,BLASLONG inc_y,FLOAT * result,BLASLONG dummy3)314 static int izamax_thread_function(BLASLONG n, BLASLONG dummy0,
315 	BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y,
316 	BLASLONG inc_y, FLOAT *result, BLASLONG dummy3)
317 {
318 	*(BLASLONG *)result = izamax_compute(n, x, inc_x);
319 
320 	return 0;
321 }
322 #endif
323 
CNAME(BLASLONG n,FLOAT * x,BLASLONG inc_x)324 BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
325 {
326 #if defined(SMP)
327 	int nthreads;
328 	FLOAT dummy_alpha[2];
329 #endif
330 	BLASLONG max_index = 0;
331 
332 #if defined(SMP)
333 	if (inc_x == 0 || n <= 10000)
334 		nthreads = 1;
335 	else
336 		nthreads = num_cpu_avail(1);
337 
338 	if (nthreads == 1) {
339 		max_index = izamax_compute(n, x, inc_x);
340 	} else {
341 		BLASLONG i, width, cur_index;
342 		int num_cpu;
343 		int mode;
344 		char result[MAX_CPU_NUMBER * sizeof(double) * 2];
345 		FLOAT max = -1.0;
346 
347 #if !defined(DOUBLE)
348 		mode = BLAS_SINGLE | BLAS_COMPLEX;
349 #else
350 		mode = BLAS_DOUBLE | BLAS_COMPLEX;
351 #endif
352 
353 		blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha,
354 				   x, inc_x, NULL, 0, result, 0,
355 				   ( void *)izamax_thread_function, nthreads);
356 
357 		num_cpu = 0;
358 		i = n;
359 		cur_index = 0;
360 
361 		while (i > 0) {
362 			FLOAT elem_r, elem_i;
363 			BLASLONG cur_max_index;
364 
365 			cur_max_index = *(BLASLONG *)&result[num_cpu * sizeof(double) * 2];
366 			elem_r = x[((cur_index + cur_max_index - 1) * inc_x * 2) + 0];
367 			elem_i = x[((cur_index + cur_max_index - 1) * inc_x * 2) + 1];
368 			elem_r = fabs(elem_r) + fabs(elem_i);
369 
370 			if (elem_r >= max) {
371 				max = elem_r;
372 				max_index = cur_index + cur_max_index;
373 			}
374 
375 			width = blas_quickdivide(i + nthreads - num_cpu - 1,
376 				 nthreads - num_cpu);
377 			i -= width;
378 			cur_index += width;
379 			num_cpu ++;
380 		}
381 	}
382 #else
383 	max_index = izamax_compute(n, x, inc_x);
384 #endif
385 
386 	return max_index;
387 }
388