1/*******************************************************************************
2Copyright (c) 2015, The OpenBLAS Project
3All rights reserved.
4Redistribution and use in source and binary forms, with or without
5modification, are permitted provided that the following conditions are
6met:
71. Redistributions of source code must retain the above copyright
8notice, this list of conditions and the following disclaimer.
92. Redistributions in binary form must reproduce the above copyright
10notice, this list of conditions and the following disclaimer in
11the documentation and/or other materials provided with the
12distribution.
133. Neither the name of the OpenBLAS project nor the names of
14its contributors may be used to endorse or promote products
15derived from this software without specific prior written permission.
16THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26*******************************************************************************/
27
28#define ASSEMBLER
29#include "common.h"
30
/* Register assignments used throughout this kernel. */
#define	N	x0	/* element count; each element is two 32-bit floats */
#define	X	x1	/* address of the next element of X to load */
#define	INC_X	x2	/* X stride in elements; INIT_S rescales it to bytes */
#define I	x5	/* loop counter */
35
36/*******************************************************************************
37* Macro definitions
38*******************************************************************************/
39
#define REG0	wzr	/* 32-bit zero register; source for clearing SUMF */
#define SUMF	s0	/* running sum; scalar view of lane 0 of v0 */
#define TMPF	s1	/* per-element scratch; scalar view of lane 0 of v1 */
#define TMPVF	{v1.s}[0]	/* not referenced by the code visible in this file */
#define SZ	4	/* not referenced by the code visible here; presumably the float size in bytes */
45
46/******************************************************************************/
47
/*
 * KERNEL_F1: consume one contiguous element (two 32-bit floats at X) and
 * add the sum of their absolute values to SUMF.
 * X advances by 8 bytes.  Overwrites v1 (and therefore TMPF).
 */
.macro KERNEL_F1
	ld1	{v1.2s}, [X], #8	/* v1 = {a, b}, then X += 8 */
	fabs	v1.2s, v1.2s		/* v1 = {|a|, |b|} */
	faddp	TMPF, v1.2s		/* TMPF = |a| + |b| */
	fadd	SUMF, SUMF, TMPF	/* SUMF += TMPF */
.endm
55
/*
 * KERNEL_F8: consume 64 contiguous bytes (sixteen 32-bit floats, i.e. eight
 * two-float elements) and accumulate their absolute values lane-wise into
 * v0.4s.  The four lane totals are folded into SUMF by KERNEL_F8_FINALIZE.
 * X advances by 64 bytes.  Overwrites v1-v4.
 */
.macro KERNEL_F8
	/* Load four quad registers and step X past them in one instruction. */
	ld1	{v1.4s, v2.4s, v3.4s, v4.4s}, [X], #64

	/* Prefetch hint for data 1024 bytes beyond the updated X. */
	prfm	PLDL1KEEP, [X, #1024]

	/* First half of the block: v1 = |v1| + |v2|. */
	fabs	v1.4s, v1.4s
	fabs	v2.4s, v2.4s
	fadd	v1.4s, v1.4s, v2.4s

	/* Second half of the block: v3 = |v3| + |v4|. */
	fabs	v3.4s, v3.4s
	fabs	v4.4s, v4.4s
	fadd	v3.4s, v3.4s, v4.4s

	/* Fold both halves into the accumulator, first half first. */
	fadd	v0.4s, v0.4s, v1.4s
	fadd	v0.4s, v0.4s, v3.4s
.endm
71
/*
 * KERNEL_F8_FINALIZE: reduce the four lane totals in v0.4s to one scalar.
 * Result: SUMF = (lane0 + lane2) + (lane1 + lane3).  Overwrites v1.
 */
.macro KERNEL_F8_FINALIZE
	mov	d1, v0.d[1]		/* low half of v1 = lanes 2 and 3 of v0 */
	fadd	v0.2s, v0.2s, v1.2s	/* v0 = {lane0 + lane2, lane1 + lane3} */
	faddp	SUMF, v0.2s		/* SUMF = sum of the two remaining lanes */
.endm
77
/*
 * INIT_S: multiply INC_X by 8 (shift left by 3) so that it can be used
 * directly as the post-increment of the 8-byte loads in KERNEL_S1.
 */
.macro INIT_S
	lsl	INC_X, INC_X, #3
.endm
81
/*
 * KERNEL_S1: consume one element (two 32-bit floats at X) and add the sum
 * of their absolute values to SUMF.
 * X advances by INC_X, which INIT_S must already have scaled by 8.
 * Overwrites v1 (and therefore TMPF).
 */
.macro KERNEL_S1
	ld1	{v1.2s}, [X], INC_X	/* v1 = {a, b}, then X += INC_X */
	fabs	v1.2s, v1.2s		/* v1 = {|a|, |b|} */
	faddp	TMPF, v1.2s		/* TMPF = |a| + |b| */
	fadd	SUMF, SUMF, TMPF	/* SUMF += TMPF */
.endm
90
91/*******************************************************************************
92* End of macro definitions
93*******************************************************************************/
94
95	PROLOGUE
96
97	fmov	SUMF, REG0
98	fmov	s1, SUMF
99
100	cmp	N, xzr
101	ble	.Lcasum_kernel_L999
102	cmp	INC_X, xzr
103	ble	.Lcasum_kernel_L999
104
105	cmp	INC_X, #1
106	bne	.Lcasum_kernel_S_BEGIN
107
108.Lcasum_kernel_F_BEGIN:
109
110	asr	I, N, #3
111	cmp	I, xzr
112	beq	.Lcasum_kernel_F1
113
114.Lcasum_kernel_F8:
115
116	KERNEL_F8
117
118	subs	I, I, #1
119	bne	.Lcasum_kernel_F8
120
121	KERNEL_F8_FINALIZE
122
123.Lcasum_kernel_F1:
124
125	ands	I, N, #7
126	ble	.Lcasum_kernel_L999
127
128.Lcasum_kernel_F10:
129
130	KERNEL_F1
131
132	subs    I, I, #1
133        bne     .Lcasum_kernel_F10
134
135.Lcasum_kernel_L999:
136	ret
137
138.Lcasum_kernel_S_BEGIN:
139
140	INIT_S
141
142	asr	I, N, #2
143	cmp	I, xzr
144	ble	.Lcasum_kernel_S1
145
146.Lcasum_kernel_S4:
147
148	KERNEL_S1
149	KERNEL_S1
150	KERNEL_S1
151	KERNEL_S1
152
153	subs	I, I, #1
154	bne	.Lcasum_kernel_S4
155
156.Lcasum_kernel_S1:
157
158	ands	I, N, #3
159	ble	.Lcasum_kernel_L999
160
161.Lcasum_kernel_S10:
162
163	KERNEL_S1
164
165	subs    I, I, #1
166        bne     .Lcasum_kernel_S10
167
168	ret
169
170	EPILOGUE
171