1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or https://opensource.org/licenses/CDDL-1.0.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (C) 2016 Romain Dolbeau. All rights reserved.
23  */
24 
25 #include <sys/types.h>
26 #include <sys/simd.h>
27 
#ifdef __linux__
/* Kernel build: make the bare __asm used below expand to a volatile asm. */
#define	__asm __asm__ __volatile__
#endif

/*
 * REG_CNT(r...) evaluates to the number of arguments passed (1..8):
 * the trailing literals 8..1 shift right as more arguments are given,
 * so the correct count always lands in _REG_CNT's N slot.
 */
#define	_REG_CNT(_0, _1, _2, _3, _4, _5, _6, _7, N, ...) N
#define	REG_CNT(r...) _REG_CNT(r, 8, 7, 6, 5, 4, 3, 2, 1)
34 
/*
 * VRn_(...) selects its (n+1)-th argument REG and expands to the
 * asm operand name string "%[wREG]".
 */
#define	VR0_(REG, ...) "%[w"#REG"]"
#define	VR1_(_1, REG, ...) "%[w"#REG"]"
#define	VR2_(_1, _2, REG, ...) "%[w"#REG"]"
#define	VR3_(_1, _2, _3, REG, ...) "%[w"#REG"]"
#define	VR4_(_1, _2, _3, _4, REG, ...) "%[w"#REG"]"
#define	VR5_(_1, _2, _3, _4, _5, REG, ...) "%[w"#REG"]"
#define	VR6_(_1, _2, _3, _4, _5, _6, REG, ...) "%[w"#REG"]"
#define	VR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) "%[w"#REG"]"

/*
 * Here we need registers not used otherwise.
 * They will be used in unused ASM for the case
 * with more registers than required... but GCC
 * will still need to make sure the constraints
 * are correct, and duplicate constraints are illegal
 * ... and we use the "register" number as a name
 */

/*
 * VRn(r...) pads the caller's register list with the spare numbers
 * 36..30 so the VRn_ selectors always have enough arguments; the
 * padding only shows up in asm alternatives that are never executed
 * for that register count (see the note above).
 */
#define	VR0(r...) VR0_(r)
#define	VR1(r...) VR1_(r)
#define	VR2(r...) VR2_(r, 36)
#define	VR3(r...) VR3_(r, 36, 35)
#define	VR4(r...) VR4_(r, 36, 35, 34, 33)
#define	VR5(r...) VR5_(r, 36, 35, 34, 33, 32)
#define	VR6(r...) VR6_(r, 36, 35, 34, 33, 32, 31)
#define	VR7(r...) VR7_(r, 36, 35, 34, 33, 32, 31, 30)

/* VR(X): asm operand name for the fixed register variable wX. */
#define	VR(X) "%[w"#X"]"
63 
/*
 * RVRn_/RVRn: input operand definitions ([wREG] "w" (wREG)) for the
 * n-th name in the register list; RVR(X) likewise for a fixed number.
 * The "w" constraint places the variable in a NEON register.  Padding
 * of the list mirrors the VRn macros above.
 */
#define	RVR0_(REG, ...) [w##REG] "w" (w##REG)
#define	RVR1_(_1, REG, ...) [w##REG] "w" (w##REG)
#define	RVR2_(_1, _2, REG, ...) [w##REG] "w" (w##REG)
#define	RVR3_(_1, _2, _3, REG, ...) [w##REG] "w" (w##REG)
#define	RVR4_(_1, _2, _3, _4, REG, ...) [w##REG] "w" (w##REG)
#define	RVR5_(_1, _2, _3, _4, _5, REG, ...) [w##REG] "w" (w##REG)
#define	RVR6_(_1, _2, _3, _4, _5, _6, REG, ...) [w##REG] "w" (w##REG)
#define	RVR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) [w##REG] "w" (w##REG)

#define	RVR0(r...) RVR0_(r)
#define	RVR1(r...) RVR1_(r)
#define	RVR2(r...) RVR2_(r, 36)
#define	RVR3(r...) RVR3_(r, 36, 35)
#define	RVR4(r...) RVR4_(r, 36, 35, 34, 33)
#define	RVR5(r...) RVR5_(r, 36, 35, 34, 33, 32)
#define	RVR6(r...) RVR6_(r, 36, 35, 34, 33, 32, 31)
#define	RVR7(r...) RVR7_(r, 36, 35, 34, 33, 32, 31, 30)

#define	RVR(X) [w##X] "w" (w##X)
83 
/*
 * WVRn_/WVRn: write-only output operand definitions ("=w") for the
 * n-th name in the register list; WVR(X) for a fixed number.
 */
#define	WVR0_(REG, ...) [w##REG] "=w" (w##REG)
#define	WVR1_(_1, REG, ...) [w##REG] "=w" (w##REG)
#define	WVR2_(_1, _2, REG, ...) [w##REG] "=w" (w##REG)
#define	WVR3_(_1, _2, _3, REG, ...) [w##REG] "=w" (w##REG)
#define	WVR4_(_1, _2, _3, _4, REG, ...) [w##REG] "=w" (w##REG)
#define	WVR5_(_1, _2, _3, _4, _5, REG, ...) [w##REG] "=w" (w##REG)
#define	WVR6_(_1, _2, _3, _4, _5, _6, REG, ...) [w##REG] "=w" (w##REG)
#define	WVR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) [w##REG] "=w" (w##REG)

#define	WVR0(r...) WVR0_(r)
#define	WVR1(r...) WVR1_(r)
#define	WVR2(r...) WVR2_(r, 36)
#define	WVR3(r...) WVR3_(r, 36, 35)
#define	WVR4(r...) WVR4_(r, 36, 35, 34, 33)
#define	WVR5(r...) WVR5_(r, 36, 35, 34, 33, 32)
#define	WVR6(r...) WVR6_(r, 36, 35, 34, 33, 32, 31)
#define	WVR7(r...) WVR7_(r, 36, 35, 34, 33, 32, 31, 30)

#define	WVR(X) [w##X] "=w" (w##X)
103 
/*
 * UVRn_/UVRn: read-write, early-clobber output operand definitions
 * ("+&w") for the n-th name in the register list; UVR(X) for a fixed
 * number.  Early-clobber keeps GCC from aliasing these with inputs.
 */
#define	UVR0_(REG, ...) [w##REG] "+&w" (w##REG)
#define	UVR1_(_1, REG, ...) [w##REG] "+&w" (w##REG)
#define	UVR2_(_1, _2, REG, ...) [w##REG] "+&w" (w##REG)
#define	UVR3_(_1, _2, _3, REG, ...) [w##REG] "+&w" (w##REG)
#define	UVR4_(_1, _2, _3, _4, REG, ...) [w##REG] "+&w" (w##REG)
#define	UVR5_(_1, _2, _3, _4, _5, REG, ...) [w##REG] "+&w" (w##REG)
#define	UVR6_(_1, _2, _3, _4, _5, _6, REG, ...) [w##REG] "+&w" (w##REG)
#define	UVR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) [w##REG] "+&w" (w##REG)

#define	UVR0(r...) UVR0_(r)
#define	UVR1(r...) UVR1_(r)
#define	UVR2(r...) UVR2_(r, 36)
#define	UVR3(r...) UVR3_(r, 36, 35)
#define	UVR4(r...) UVR4_(r, 36, 35, 34, 33)
#define	UVR5(r...) UVR5_(r, 36, 35, 34, 33, 32)
#define	UVR6(r...) UVR6_(r, 36, 35, 34, 33, 32, 31)
#define	UVR7(r...) UVR7_(r, 36, 35, 34, 33, 32, 31, 30)

#define	UVR(X) [w##X] "+&w" (w##X)
123 
/* R_01(REG...): the first two names of a register list. */
#define	R_01(REG1, REG2, ...) REG1, REG2
/*
 * R_23(REG...): the third and fourth names; the list is padded with
 * 1, 2, 3 so the selector always has enough arguments.
 */
#define	_R_23(_0, _1, REG2, REG3, ...) REG2, REG3
#define	R_23(REG...) _R_23(REG, 1, 2, 3)

/* Reached when a macro below is invoked with an unsupported count. */
#define	ZFS_ASM_BUG()	ASSERT(0)
129 
/*
 * OFFSET(ptr, val): byte offset into ptr, as an unsigned char *.
 * val is parenthesized so expression arguments (e.g. a conditional)
 * expand safely; all current callers pass literal offsets.
 */
#define	OFFSET(ptr, val)	(((unsigned char *)(ptr))+(val))
131 
/* Nibble lookup tables used by the tbl-based multiply in _MULx2 below. */
extern const uint8_t gf_clmul_mod_lt[4*256][16];

/* One NEON-register-sized element, in bytes. */
#define	ELEM_SIZE 16

/* A single 16-byte, 16-byte-aligned data element. */
typedef struct v {
	uint8_t b[ELEM_SIZE] __attribute__((aligned(ELEM_SIZE)));
} v_t;
139 
/*
 * XOR_ACC(src, r...): load consecutive 16-byte chunks from src
 * (offsets 0, 16, ...) and XOR them into the listed registers.
 * Supports 8, 4 or 2 registers; v18-v21 (v20/v21 in the 2-register
 * case) are load scratch and declared as clobbered.
 */
#define	XOR_ACC(src, r...)						\
{									\
	switch (REG_CNT(r)) {						\
	case 8:								\
		__asm(							\
		"ld1 { v21.4s },%[SRC0]\n"				\
		"ld1 { v20.4s },%[SRC1]\n"				\
		"ld1 { v19.4s },%[SRC2]\n"				\
		"ld1 { v18.4s },%[SRC3]\n"				\
		"eor " VR0(r) ".16b," VR0(r) ".16b,v21.16b\n"		\
		"eor " VR1(r) ".16b," VR1(r) ".16b,v20.16b\n"		\
		"eor " VR2(r) ".16b," VR2(r) ".16b,v19.16b\n"		\
		"eor " VR3(r) ".16b," VR3(r) ".16b,v18.16b\n"		\
		"ld1 { v21.4s },%[SRC4]\n"				\
		"ld1 { v20.4s },%[SRC5]\n"				\
		"ld1 { v19.4s },%[SRC6]\n"				\
		"ld1 { v18.4s },%[SRC7]\n"				\
		"eor " VR4(r) ".16b," VR4(r) ".16b,v21.16b\n"		\
		"eor " VR5(r) ".16b," VR5(r) ".16b,v20.16b\n"		\
		"eor " VR6(r) ".16b," VR6(r) ".16b,v19.16b\n"		\
		"eor " VR7(r) ".16b," VR7(r) ".16b,v18.16b\n"		\
		:	UVR0(r), UVR1(r), UVR2(r), UVR3(r),		\
			UVR4(r), UVR5(r), UVR6(r), UVR7(r)		\
		:	[SRC0] "Q" (*(OFFSET(src, 0))),			\
		[SRC1] "Q" (*(OFFSET(src, 16))),			\
		[SRC2] "Q" (*(OFFSET(src, 32))),			\
		[SRC3] "Q" (*(OFFSET(src, 48))),			\
		[SRC4] "Q" (*(OFFSET(src, 64))),			\
		[SRC5] "Q" (*(OFFSET(src, 80))),			\
		[SRC6] "Q" (*(OFFSET(src, 96))),			\
		[SRC7] "Q" (*(OFFSET(src, 112)))			\
		:	"v18", "v19", "v20", "v21");			\
		break;							\
	case 4:								\
		__asm(							\
		"ld1 { v21.4s },%[SRC0]\n"				\
		"ld1 { v20.4s },%[SRC1]\n"				\
		"ld1 { v19.4s },%[SRC2]\n"				\
		"ld1 { v18.4s },%[SRC3]\n"				\
		"eor " VR0(r) ".16b," VR0(r) ".16b,v21.16b\n"		\
		"eor " VR1(r) ".16b," VR1(r) ".16b,v20.16b\n"		\
		"eor " VR2(r) ".16b," VR2(r) ".16b,v19.16b\n"		\
		"eor " VR3(r) ".16b," VR3(r) ".16b,v18.16b\n"		\
		:	UVR0(r), UVR1(r), UVR2(r), UVR3(r)		\
		:	[SRC0] "Q" (*(OFFSET(src, 0))),			\
		[SRC1] "Q" (*(OFFSET(src, 16))),			\
		[SRC2] "Q" (*(OFFSET(src, 32))),			\
		[SRC3] "Q" (*(OFFSET(src, 48)))				\
		:	"v18", "v19", "v20", "v21");			\
		break;							\
	case 2:								\
		__asm(							\
		"ld1 { v21.4s },%[SRC0]\n"				\
		"ld1 { v20.4s },%[SRC1]\n"				\
		"eor " VR0(r) ".16b," VR0(r) ".16b,v21.16b\n"		\
		"eor " VR1(r) ".16b," VR1(r) ".16b,v20.16b\n"		\
		:	UVR0(r), UVR1(r)				\
		:	[SRC0] "Q" (*(OFFSET(src, 0))),			\
		[SRC1] "Q" (*(OFFSET(src, 16)))				\
		:	"v20", "v21");					\
		break;							\
	default:							\
		ZFS_ASM_BUG();						\
	}								\
}
205 
/*
 * XOR(r...): with 2N registers, XOR the first N into the last N
 * (for 8: r4^=r0 ... r7^=r3; for 4: r2^=r0, r3^=r1).
 */
#define	XOR(r...)							\
{									\
	switch (REG_CNT(r)) {						\
	case 8:								\
		__asm(							\
		"eor " VR4(r) ".16b," VR4(r) ".16b," VR0(r) ".16b\n"	\
		"eor " VR5(r) ".16b," VR5(r) ".16b," VR1(r) ".16b\n"	\
		"eor " VR6(r) ".16b," VR6(r) ".16b," VR2(r) ".16b\n"	\
		"eor " VR7(r) ".16b," VR7(r) ".16b," VR3(r) ".16b\n"	\
		:	UVR4(r), UVR5(r), UVR6(r), UVR7(r)		\
		:	RVR0(r), RVR1(r), RVR2(r), RVR3(r));		\
		break;							\
	case 4:								\
		__asm(							\
		"eor " VR2(r) ".16b," VR2(r) ".16b," VR0(r) ".16b\n"	\
		"eor " VR3(r) ".16b," VR3(r) ".16b," VR1(r) ".16b\n"	\
		:	UVR2(r), UVR3(r)				\
		:	RVR0(r), RVR1(r));				\
		break;							\
	default:							\
		ZFS_ASM_BUG();						\
	}								\
}
229 
/*
 * ZERO(r...): clear each listed register by XORing it with itself.
 * Supports 8, 4 or 2 registers.
 */
#define	ZERO(r...)							\
{									\
	switch (REG_CNT(r)) {						\
	case 8:								\
		__asm(							\
		"eor " VR0(r) ".16b," VR0(r) ".16b," VR0(r) ".16b\n"	\
		"eor " VR1(r) ".16b," VR1(r) ".16b," VR1(r) ".16b\n"	\
		"eor " VR2(r) ".16b," VR2(r) ".16b," VR2(r) ".16b\n"	\
		"eor " VR3(r) ".16b," VR3(r) ".16b," VR3(r) ".16b\n"	\
		"eor " VR4(r) ".16b," VR4(r) ".16b," VR4(r) ".16b\n"	\
		"eor " VR5(r) ".16b," VR5(r) ".16b," VR5(r) ".16b\n"	\
		"eor " VR6(r) ".16b," VR6(r) ".16b," VR6(r) ".16b\n"	\
		"eor " VR7(r) ".16b," VR7(r) ".16b," VR7(r) ".16b\n"	\
		:	WVR0(r), WVR1(r), WVR2(r), WVR3(r),		\
			WVR4(r), WVR5(r), WVR6(r), WVR7(r));		\
		break;							\
	case 4:								\
		__asm(							\
		"eor " VR0(r) ".16b," VR0(r) ".16b," VR0(r) ".16b\n"	\
		"eor " VR1(r) ".16b," VR1(r) ".16b," VR1(r) ".16b\n"	\
		"eor " VR2(r) ".16b," VR2(r) ".16b," VR2(r) ".16b\n"	\
		"eor " VR3(r) ".16b," VR3(r) ".16b," VR3(r) ".16b\n"	\
		:	WVR0(r), WVR1(r), WVR2(r), WVR3(r));		\
		break;							\
	case 2:								\
		__asm(							\
		"eor " VR0(r) ".16b," VR0(r) ".16b," VR0(r) ".16b\n"	\
		"eor " VR1(r) ".16b," VR1(r) ".16b," VR1(r) ".16b\n"	\
		:	WVR0(r), WVR1(r));				\
		break;							\
	default:							\
		ZFS_ASM_BUG();						\
	}								\
}
264 
/*
 * COPY(r...): with 2N registers, copy the first N into the last N
 * (for 8: r4=r0 ... r7=r3; for 4: r2=r0, r3=r1).
 */
#define	COPY(r...)							\
{									\
	switch (REG_CNT(r)) {						\
	case 8:								\
		__asm(							\
		"mov " VR4(r) ".16b," VR0(r) ".16b\n"			\
		"mov " VR5(r) ".16b," VR1(r) ".16b\n"			\
		"mov " VR6(r) ".16b," VR2(r) ".16b\n"			\
		"mov " VR7(r) ".16b," VR3(r) ".16b\n"			\
		:	WVR4(r), WVR5(r), WVR6(r), WVR7(r)		\
		:	RVR0(r), RVR1(r), RVR2(r), RVR3(r));		\
		break;							\
	case 4:								\
		__asm(							\
		"mov " VR2(r) ".16b," VR0(r) ".16b\n"			\
		"mov " VR3(r) ".16b," VR1(r) ".16b\n"			\
		:	WVR2(r), WVR3(r)				\
		:	RVR0(r), RVR1(r));				\
		break;							\
	default:							\
		ZFS_ASM_BUG();						\
	}								\
}
288 
/*
 * LOAD(src, r...): load consecutive 16-byte chunks of src (offsets
 * 0, 16, ...) into the listed registers.  Supports 8, 4 or 2.
 */
#define	LOAD(src, r...)							\
{									\
	switch (REG_CNT(r)) {						\
	case 8:								\
		__asm(							\
		"ld1 { " VR0(r) ".4s },%[SRC0]\n"			\
		"ld1 { " VR1(r) ".4s },%[SRC1]\n"			\
		"ld1 { " VR2(r) ".4s },%[SRC2]\n"			\
		"ld1 { " VR3(r) ".4s },%[SRC3]\n"			\
		"ld1 { " VR4(r) ".4s },%[SRC4]\n"			\
		"ld1 { " VR5(r) ".4s },%[SRC5]\n"			\
		"ld1 { " VR6(r) ".4s },%[SRC6]\n"			\
		"ld1 { " VR7(r) ".4s },%[SRC7]\n"			\
		:	WVR0(r), WVR1(r), WVR2(r), WVR3(r),		\
			WVR4(r), WVR5(r), WVR6(r), WVR7(r)		\
		:	[SRC0] "Q" (*(OFFSET(src, 0))),			\
		[SRC1] "Q" (*(OFFSET(src, 16))),			\
		[SRC2] "Q" (*(OFFSET(src, 32))),			\
		[SRC3] "Q" (*(OFFSET(src, 48))),			\
		[SRC4] "Q" (*(OFFSET(src, 64))),			\
		[SRC5] "Q" (*(OFFSET(src, 80))),			\
		[SRC6] "Q" (*(OFFSET(src, 96))),			\
		[SRC7] "Q" (*(OFFSET(src, 112))));			\
		break;							\
	case 4:								\
		__asm(							\
		"ld1 { " VR0(r) ".4s },%[SRC0]\n"			\
		"ld1 { " VR1(r) ".4s },%[SRC1]\n"			\
		"ld1 { " VR2(r) ".4s },%[SRC2]\n"			\
		"ld1 { " VR3(r) ".4s },%[SRC3]\n"			\
		:	WVR0(r), WVR1(r), WVR2(r), WVR3(r)		\
		:	[SRC0] "Q" (*(OFFSET(src, 0))),			\
		[SRC1] "Q" (*(OFFSET(src, 16))),			\
		[SRC2] "Q" (*(OFFSET(src, 32))),			\
		[SRC3] "Q" (*(OFFSET(src, 48))));			\
		break;							\
	case 2:								\
		__asm(							\
		"ld1 { " VR0(r) ".4s },%[SRC0]\n"			\
		"ld1 { " VR1(r) ".4s },%[SRC1]\n"			\
		:	WVR0(r), WVR1(r)				\
		:	[SRC0] "Q" (*(OFFSET(src, 0))),			\
		[SRC1] "Q" (*(OFFSET(src, 16))));			\
		break;							\
	default:							\
		ZFS_ASM_BUG();						\
	}								\
}
337 
/*
 * STORE(dst, r...): store the listed registers to consecutive
 * 16-byte chunks of dst (offsets 0, 16, ...).  Supports 8, 4 or 2.
 */
#define	STORE(dst, r...)						\
{									\
	switch (REG_CNT(r)) {						\
	case 8:								\
		__asm(							\
		"st1 { " VR0(r) ".4s },%[DST0]\n"			\
		"st1 { " VR1(r) ".4s },%[DST1]\n"			\
		"st1 { " VR2(r) ".4s },%[DST2]\n"			\
		"st1 { " VR3(r) ".4s },%[DST3]\n"			\
		"st1 { " VR4(r) ".4s },%[DST4]\n"			\
		"st1 { " VR5(r) ".4s },%[DST5]\n"			\
		"st1 { " VR6(r) ".4s },%[DST6]\n"			\
		"st1 { " VR7(r) ".4s },%[DST7]\n"			\
		:	[DST0] "=Q" (*(OFFSET(dst, 0))),		\
		[DST1] "=Q" (*(OFFSET(dst, 16))),			\
		[DST2] "=Q" (*(OFFSET(dst, 32))),			\
		[DST3] "=Q" (*(OFFSET(dst, 48))),			\
		[DST4] "=Q" (*(OFFSET(dst, 64))),			\
		[DST5] "=Q" (*(OFFSET(dst, 80))),			\
		[DST6] "=Q" (*(OFFSET(dst, 96))),			\
		[DST7] "=Q" (*(OFFSET(dst, 112)))			\
		:	RVR0(r), RVR1(r), RVR2(r), RVR3(r),		\
			RVR4(r), RVR5(r), RVR6(r), RVR7(r));		\
		break;							\
	case 4:								\
		__asm(							\
		"st1 { " VR0(r) ".4s },%[DST0]\n"			\
		"st1 { " VR1(r) ".4s },%[DST1]\n"			\
		"st1 { " VR2(r) ".4s },%[DST2]\n"			\
		"st1 { " VR3(r) ".4s },%[DST3]\n"			\
		:	[DST0] "=Q" (*(OFFSET(dst, 0))),		\
		[DST1] "=Q" (*(OFFSET(dst, 16))),			\
		[DST2] "=Q" (*(OFFSET(dst, 32))),			\
		[DST3] "=Q" (*(OFFSET(dst, 48)))			\
		:	RVR0(r), RVR1(r), RVR2(r), RVR3(r));		\
		break;							\
	case 2:								\
		__asm(							\
		"st1 { " VR0(r) ".4s },%[DST0]\n"			\
		"st1 { " VR1(r) ".4s },%[DST1]\n"			\
		:	[DST0] "=Q" (*(OFFSET(dst, 0))),		\
		[DST1] "=Q" (*(OFFSET(dst, 16)))			\
		:	RVR0(r), RVR1(r));				\
		break;							\
	default:							\
		ZFS_ASM_BUG();						\
	}								\
}
386 
/*
 * Unfortunately cannot use the macro, because GCC
 * will try to use the macro name and not value
 * later on...
 * Kept as a reference to what a numbered variable is
 */
#define	_00	"v17"
#define	_1d	"v16"
#define	_temp0	"v19"
#define	_temp1	"v18"

/*
 * MUL2_SETUP(): prime the fixed helpers used by MUL2 below:
 * w17 (v17) is zeroed and w16 (v16) is filled with the byte 0x1d.
 */
#define	MUL2_SETUP()							\
{									\
	__asm(								\
	"eor " VR(17) ".16b," VR(17) ".16b," VR(17) ".16b\n"		\
	"movi " VR(16) ".16b,#0x1d\n"					\
	:	WVR(16), WVR(17));					\
}
405 
/*
 * MUL2(r...): multiply each byte of the listed registers by 2 in
 * GF(2^8).  cmgt against the zero register (w17) builds a mask of
 * bytes with the top bit set; those bytes get 0x1d (w16) XORed in
 * after the left shift by 1.  Requires MUL2_SETUP() to have run.
 * Supports 4 or 2 registers; v18-v21 (v18/v19 for the 2-register
 * case) are scratch and declared clobbered.
 */
#define	MUL2(r...)							\
{									\
	switch (REG_CNT(r)) {						\
	case 4:								\
		__asm(							\
		"cmgt v19.16b," VR(17) ".16b," VR0(r) ".16b\n"		\
		"cmgt v18.16b," VR(17) ".16b," VR1(r) ".16b\n"		\
		"cmgt v21.16b," VR(17) ".16b," VR2(r) ".16b\n"		\
		"cmgt v20.16b," VR(17) ".16b," VR3(r) ".16b\n"		\
		"and v19.16b,v19.16b," VR(16) ".16b\n"			\
		"and v18.16b,v18.16b," VR(16) ".16b\n"			\
		"and v21.16b,v21.16b," VR(16) ".16b\n"			\
		"and v20.16b,v20.16b," VR(16) ".16b\n"			\
		"shl " VR0(r) ".16b," VR0(r) ".16b,#1\n"		\
		"shl " VR1(r) ".16b," VR1(r) ".16b,#1\n"		\
		"shl " VR2(r) ".16b," VR2(r) ".16b,#1\n"		\
		"shl " VR3(r) ".16b," VR3(r) ".16b,#1\n"		\
		"eor " VR0(r) ".16b,v19.16b," VR0(r) ".16b\n"		\
		"eor " VR1(r) ".16b,v18.16b," VR1(r) ".16b\n"		\
		"eor " VR2(r) ".16b,v21.16b," VR2(r) ".16b\n"		\
		"eor " VR3(r) ".16b,v20.16b," VR3(r) ".16b\n"		\
		:	UVR0(r), UVR1(r), UVR2(r), UVR3(r)		\
		:	RVR(17), RVR(16)				\
		:	"v18", "v19", "v20", "v21");			\
		break;							\
	case 2:								\
		__asm(							\
		"cmgt v19.16b," VR(17) ".16b," VR0(r) ".16b\n"		\
		"cmgt v18.16b," VR(17) ".16b," VR1(r) ".16b\n"		\
		"and v19.16b,v19.16b," VR(16) ".16b\n"			\
		"and v18.16b,v18.16b," VR(16) ".16b\n"			\
		"shl " VR0(r) ".16b," VR0(r) ".16b,#1\n"		\
		"shl " VR1(r) ".16b," VR1(r) ".16b,#1\n"		\
		"eor " VR0(r) ".16b,v19.16b," VR0(r) ".16b\n"		\
		"eor " VR1(r) ".16b,v18.16b," VR1(r) ".16b\n"		\
		:	UVR0(r), UVR1(r)				\
		:	RVR(17), RVR(16)				\
		:	"v18", "v19");					\
		break;							\
	default:							\
		ZFS_ASM_BUG();						\
	}								\
}
449 
/* MUL4(r...): multiply by 4 in GF(2^8) = two applications of MUL2. */
#define	MUL4(r...)							\
{									\
	MUL2(r);							\
	MUL2(r);							\
}
455 
/*
 * Unfortunately cannot use the macro, because GCC
 * will try to use the macro name and not value
 * later on...
 * Kept as a reference to what a register is
 * (here we're using actual registers for the
 * clobbered ones)
 */
#define	_0f		"v15"
#define	_a_save		"v14"
#define	_b_save		"v13"
#define	_lt_mod_a	"v12"
#define	_lt_clmul_a	"v11"
#define	_lt_mod_b	"v10"
#define	_lt_clmul_b	"v15"

/*
 * _MULx2(c, r...): multiply two registers by the constant c using
 * the gf_clmul_mod_lt nibble tables: each byte is split into its
 * low nibble (and 0x0f) and high nibble (ushr #4), each nibble
 * indexes a 16-byte tbl lookup into the per-constant tables, and
 * the partial results are XORed together.  v10-v15 are scratch.
 */
#define	_MULx2(c, r...)							\
{									\
	switch (REG_CNT(r)) {						\
	case 2:								\
		__asm(							\
		/* lts for upper part */				\
		"movi v15.16b,#0x0f\n"					\
		"ld1 { v10.4s },%[lt0]\n"				\
		"ld1 { v11.4s },%[lt1]\n"				\
		/* upper part */					\
		"and v14.16b," VR0(r) ".16b,v15.16b\n"			\
		"and v13.16b," VR1(r) ".16b,v15.16b\n"			\
		"ushr " VR0(r) ".16b," VR0(r) ".16b,#4\n"		\
		"ushr " VR1(r) ".16b," VR1(r) ".16b,#4\n"		\
									\
		"tbl v12.16b,{v10.16b}," VR0(r) ".16b\n"		\
		"tbl v10.16b,{v10.16b}," VR1(r) ".16b\n"		\
		"tbl v15.16b,{v11.16b}," VR0(r) ".16b\n"		\
		"tbl v11.16b,{v11.16b}," VR1(r) ".16b\n"		\
									\
		"eor " VR0(r) ".16b,v15.16b,v12.16b\n"			\
		"eor " VR1(r) ".16b,v11.16b,v10.16b\n"			\
		/* lts for lower part */				\
		"ld1 { v10.4s },%[lt2]\n"				\
		"ld1 { v15.4s },%[lt3]\n"				\
		/* lower part */					\
		"tbl v12.16b,{v10.16b},v14.16b\n"			\
		"tbl v10.16b,{v10.16b},v13.16b\n"			\
		"tbl v11.16b,{v15.16b},v14.16b\n"			\
		"tbl v15.16b,{v15.16b},v13.16b\n"			\
									\
		"eor " VR0(r) ".16b," VR0(r) ".16b,v12.16b\n"		\
		"eor " VR1(r) ".16b," VR1(r) ".16b,v10.16b\n"		\
		"eor " VR0(r) ".16b," VR0(r) ".16b,v11.16b\n"		\
		"eor " VR1(r) ".16b," VR1(r) ".16b,v15.16b\n"		\
		:	UVR0(r), UVR1(r)				\
		:	[lt0] "Q" ((gf_clmul_mod_lt[4*(c)+0][0])),	\
		[lt1] "Q" ((gf_clmul_mod_lt[4*(c)+1][0])),		\
		[lt2] "Q" ((gf_clmul_mod_lt[4*(c)+2][0])),		\
		[lt3] "Q" ((gf_clmul_mod_lt[4*(c)+3][0]))		\
		:	"v10", "v11", "v12", "v13", "v14", "v15");	\
		break;							\
	default:							\
		ZFS_ASM_BUG();						\
	}								\
}
518 
/*
 * MUL(c, r...): multiply 2 or 4 registers by the constant c, two at
 * a time via _MULx2 (the last pair first for the 4-register case).
 */
#define	MUL(c, r...)							\
{									\
	switch (REG_CNT(r)) {						\
	case 4:								\
		_MULx2(c, R_23(r));					\
		_MULx2(c, R_01(r));					\
		break;							\
	case 2:								\
		_MULx2(c, R_01(r));					\
		break;							\
	default:							\
		ZFS_ASM_BUG();						\
	}								\
}
533 
/* Bracket raidz math with kernel FPU/SIMD context save/restore. */
#define	raidz_math_begin()	kfpu_begin()
#define	raidz_math_end()	kfpu_end()
536 
/* Overkill... */
/*
 * Declarations for the wN variables the macros above reference.
 * In the kernel they are pinned to specific NEON registers via GCC
 * local register variables; w32-w38 all alias v31 because they only
 * appear in never-executed asm alternatives (see the note near the
 * VRn macros).  In userland the compiler allocates them freely.
 */
#if defined(_KERNEL)
#define	GEN_X_DEFINE_0_3()	\
register unsigned char w0 asm("v0") __attribute__((vector_size(16)));	\
register unsigned char w1 asm("v1") __attribute__((vector_size(16)));	\
register unsigned char w2 asm("v2") __attribute__((vector_size(16)));	\
register unsigned char w3 asm("v3") __attribute__((vector_size(16)));
#define	GEN_X_DEFINE_4_5()	\
register unsigned char w4 asm("v4") __attribute__((vector_size(16)));	\
register unsigned char w5 asm("v5") __attribute__((vector_size(16)));
#define	GEN_X_DEFINE_6_7()	\
register unsigned char w6 asm("v6") __attribute__((vector_size(16)));	\
register unsigned char w7 asm("v7") __attribute__((vector_size(16)));
#define	GEN_X_DEFINE_8_9()	\
register unsigned char w8 asm("v8") __attribute__((vector_size(16)));	\
register unsigned char w9 asm("v9") __attribute__((vector_size(16)));
#define	GEN_X_DEFINE_10_11()	\
register unsigned char w10 asm("v10") __attribute__((vector_size(16)));	\
register unsigned char w11 asm("v11") __attribute__((vector_size(16)));
#define	GEN_X_DEFINE_12_15()	\
register unsigned char w12 asm("v12") __attribute__((vector_size(16)));	\
register unsigned char w13 asm("v13") __attribute__((vector_size(16)));	\
register unsigned char w14 asm("v14") __attribute__((vector_size(16)));	\
register unsigned char w15 asm("v15") __attribute__((vector_size(16)));
#define	GEN_X_DEFINE_16()	\
register unsigned char w16 asm("v16") __attribute__((vector_size(16)));
#define	GEN_X_DEFINE_17()	\
register unsigned char w17 asm("v17") __attribute__((vector_size(16)));
#define	GEN_X_DEFINE_18_21()	\
register unsigned char w18 asm("v18") __attribute__((vector_size(16)));	\
register unsigned char w19 asm("v19") __attribute__((vector_size(16)));	\
register unsigned char w20 asm("v20") __attribute__((vector_size(16)));	\
register unsigned char w21 asm("v21") __attribute__((vector_size(16)));
#define	GEN_X_DEFINE_22_23()	\
register unsigned char w22 asm("v22") __attribute__((vector_size(16)));	\
register unsigned char w23 asm("v23") __attribute__((vector_size(16)));
#define	GEN_X_DEFINE_24_27()	\
register unsigned char w24 asm("v24") __attribute__((vector_size(16)));	\
register unsigned char w25 asm("v25") __attribute__((vector_size(16)));	\
register unsigned char w26 asm("v26") __attribute__((vector_size(16)));	\
register unsigned char w27 asm("v27") __attribute__((vector_size(16)));
#define	GEN_X_DEFINE_28_30()	\
register unsigned char w28 asm("v28") __attribute__((vector_size(16)));	\
register unsigned char w29 asm("v29") __attribute__((vector_size(16)));	\
register unsigned char w30 asm("v30") __attribute__((vector_size(16)));
#define	GEN_X_DEFINE_31()	\
register unsigned char w31 asm("v31") __attribute__((vector_size(16)));
#define	GEN_X_DEFINE_32()	\
register unsigned char w32 asm("v31") __attribute__((vector_size(16)));
#define	GEN_X_DEFINE_33_36()	\
register unsigned char w33 asm("v31") __attribute__((vector_size(16)));	\
register unsigned char w34 asm("v31") __attribute__((vector_size(16)));	\
register unsigned char w35 asm("v31") __attribute__((vector_size(16)));	\
register unsigned char w36 asm("v31") __attribute__((vector_size(16)));
#define	GEN_X_DEFINE_37_38()	\
register unsigned char w37 asm("v31") __attribute__((vector_size(16)));	\
register unsigned char w38 asm("v31") __attribute__((vector_size(16)));
#define	GEN_X_DEFINE_ALL()	\
	GEN_X_DEFINE_0_3()	\
	GEN_X_DEFINE_4_5()	\
	GEN_X_DEFINE_6_7()	\
	GEN_X_DEFINE_8_9()	\
	GEN_X_DEFINE_10_11()	\
	GEN_X_DEFINE_12_15()	\
	GEN_X_DEFINE_16()	\
	GEN_X_DEFINE_17()	\
	GEN_X_DEFINE_18_21()	\
	GEN_X_DEFINE_22_23()	\
	GEN_X_DEFINE_24_27()	\
	GEN_X_DEFINE_28_30()	\
	GEN_X_DEFINE_31()	\
	GEN_X_DEFINE_32()	\
	GEN_X_DEFINE_33_36() 	\
	GEN_X_DEFINE_37_38()
#else
/* Userland: plain vector variables; the compiler picks registers. */
#define	GEN_X_DEFINE_0_3()	\
	unsigned char w0 __attribute__((vector_size(16)));	\
	unsigned char w1 __attribute__((vector_size(16)));	\
	unsigned char w2 __attribute__((vector_size(16)));	\
	unsigned char w3 __attribute__((vector_size(16)));
#define	GEN_X_DEFINE_4_5()	\
	unsigned char w4 __attribute__((vector_size(16)));	\
	unsigned char w5 __attribute__((vector_size(16)));
#define	GEN_X_DEFINE_6_7()	\
	unsigned char w6 __attribute__((vector_size(16)));	\
	unsigned char w7 __attribute__((vector_size(16)));
#define	GEN_X_DEFINE_8_9()	\
	unsigned char w8 __attribute__((vector_size(16)));	\
	unsigned char w9 __attribute__((vector_size(16)));
#define	GEN_X_DEFINE_10_11()	\
	unsigned char w10 __attribute__((vector_size(16)));	\
	unsigned char w11 __attribute__((vector_size(16)));
#define	GEN_X_DEFINE_12_15()	\
	unsigned char w12 __attribute__((vector_size(16)));	\
	unsigned char w13 __attribute__((vector_size(16)));	\
	unsigned char w14 __attribute__((vector_size(16)));	\
	unsigned char w15 __attribute__((vector_size(16)));
#define	GEN_X_DEFINE_16()	\
	unsigned char w16 __attribute__((vector_size(16)));
#define	GEN_X_DEFINE_17()	\
	unsigned char w17 __attribute__((vector_size(16)));
#define	GEN_X_DEFINE_18_21()	\
	unsigned char w18 __attribute__((vector_size(16)));	\
	unsigned char w19 __attribute__((vector_size(16)));	\
	unsigned char w20 __attribute__((vector_size(16)));	\
	unsigned char w21 __attribute__((vector_size(16)));
#define	GEN_X_DEFINE_22_23()	\
	unsigned char w22 __attribute__((vector_size(16)));	\
	unsigned char w23 __attribute__((vector_size(16)));
#define	GEN_X_DEFINE_24_27()	\
	unsigned char w24 __attribute__((vector_size(16)));	\
	unsigned char w25 __attribute__((vector_size(16)));	\
	unsigned char w26 __attribute__((vector_size(16)));	\
	unsigned char w27 __attribute__((vector_size(16)));
#define	GEN_X_DEFINE_28_30()	\
	unsigned char w28 __attribute__((vector_size(16)));	\
	unsigned char w29 __attribute__((vector_size(16)));	\
	unsigned char w30 __attribute__((vector_size(16)));
#define	GEN_X_DEFINE_31()	\
	unsigned char w31 __attribute__((vector_size(16)));
#define	GEN_X_DEFINE_32()	\
	unsigned char w32 __attribute__((vector_size(16)));
#define	GEN_X_DEFINE_33_36()	\
	unsigned char w33 __attribute__((vector_size(16)));	\
	unsigned char w34 __attribute__((vector_size(16)));	\
	unsigned char w35 __attribute__((vector_size(16)));	\
	unsigned char w36 __attribute__((vector_size(16)));
#define	GEN_X_DEFINE_37_38()	\
	unsigned char w37 __attribute__((vector_size(16)));	\
	unsigned char w38 __attribute__((vector_size(16)));
#define	GEN_X_DEFINE_ALL()	\
	GEN_X_DEFINE_0_3()	\
	GEN_X_DEFINE_4_5()	\
	GEN_X_DEFINE_6_7()	\
	GEN_X_DEFINE_8_9()	\
	GEN_X_DEFINE_10_11()	\
	GEN_X_DEFINE_12_15()	\
	GEN_X_DEFINE_16()	\
	GEN_X_DEFINE_17()	\
	GEN_X_DEFINE_18_21()	\
	GEN_X_DEFINE_22_23()	\
	GEN_X_DEFINE_24_27()	\
	GEN_X_DEFINE_28_30()	\
	GEN_X_DEFINE_31()	\
	GEN_X_DEFINE_32()	\
	GEN_X_DEFINE_33_36()	\
	GEN_X_DEFINE_37_38()
#endif
685