1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (C) 2019 Romain Dolbeau. All rights reserved.
23  *           <romain.dolbeau@european-processor-initiative.eu>
24  */
25 
26 #include <sys/types.h>
27 #include <sys/simd.h>
28 
/*
 * On Linux every __asm() statement written in this file expands to a
 * volatile GNU inline-assembly statement.
 */
#if defined(__linux__)
#define	__asm __asm__ __volatile__
#endif
32 
/*
 * REG_CNT(...) evaluates to the number of arguments it was given (1..8).
 * The caller's arguments push the descending literals 8..1 to the right,
 * so the literal that lands in the ninth slot is the argument count.
 */
#define	_REG_CNT(a0, a1, a2, a3, a4, a5, a6, a7, CNT, ...) CNT
#define	REG_CNT(...) _REG_CNT(__VA_ARGS__, 8, 7, 6, 5, 4, 3, 2, 1)
35 
/*
 * VRn_() picks its (n+1)-th argument and renders it as the named
 * inline-asm operand reference "%[w<arg>]".
 */
#define	VR0_(REG, ...) "%[w" #REG "]"
#define	VR1_(a0, REG, ...) "%[w" #REG "]"
#define	VR2_(a0, a1, REG, ...) "%[w" #REG "]"
#define	VR3_(a0, a1, a2, REG, ...) "%[w" #REG "]"
#define	VR4_(a0, a1, a2, a3, REG, ...) "%[w" #REG "]"
#define	VR5_(a0, a1, a2, a3, a4, REG, ...) "%[w" #REG "]"
#define	VR6_(a0, a1, a2, a3, a4, a5, REG, ...) "%[w" #REG "]"
#define	VR7_(a0, a1, a2, a3, a4, a5, a6, REG, ...) "%[w" #REG "]"

/*
 * VRn(regs) is the operand reference for the n-th register of a list.
 *
 * Every case of a register-count switch is compiled, including the cases
 * written for more registers than the list holds.  Those cases never run,
 * but GCC still checks their constraints, and the same operand must not
 * be named twice.  The list is therefore padded with otherwise unused
 * variable numbers (36 downwards) so that a missing position resolves to
 * a distinct name.
 */
#define	VR0(...) VR0_(__VA_ARGS__)
#define	VR1(...) VR1_(__VA_ARGS__)
#define	VR2(...) VR2_(__VA_ARGS__, 36)
#define	VR3(...) VR3_(__VA_ARGS__, 36, 35)
#define	VR4(...) VR4_(__VA_ARGS__, 36, 35, 34, 33)
#define	VR5(...) VR5_(__VA_ARGS__, 36, 35, 34, 33, 32)
#define	VR6(...) VR6_(__VA_ARGS__, 36, 35, 34, 33, 32, 31)
#define	VR7(...) VR7_(__VA_ARGS__, 36, 35, 34, 33, 32, 31, 30)

/* Operand reference for one explicitly numbered variable wX. */
#define	VR(X) "%[w" #X "]"
64 
/*
 * RVRn_() picks its (n+1)-th argument and emits a read-only ("v") asm
 * input operand named w<arg> that is bound to the C variable w<arg>.
 */
#define	RVR0_(REG, ...) [w##REG] "v" (w##REG)
#define	RVR1_(a0, REG, ...) [w##REG] "v" (w##REG)
#define	RVR2_(a0, a1, REG, ...) [w##REG] "v" (w##REG)
#define	RVR3_(a0, a1, a2, REG, ...) [w##REG] "v" (w##REG)
#define	RVR4_(a0, a1, a2, a3, REG, ...) [w##REG] "v" (w##REG)
#define	RVR5_(a0, a1, a2, a3, a4, REG, ...) [w##REG] "v" (w##REG)
#define	RVR6_(a0, a1, a2, a3, a4, a5, REG, ...) [w##REG] "v" (w##REG)
#define	RVR7_(a0, a1, a2, a3, a4, a5, a6, REG, ...) [w##REG] "v" (w##REG)

/*
 * RVRn(regs): input operand for the n-th register of a list.  The list
 * is padded with the spare variable numbers 36, 35, ... so that positions
 * past its end still resolve to distinct operands.
 */
#define	RVR0(...) RVR0_(__VA_ARGS__)
#define	RVR1(...) RVR1_(__VA_ARGS__)
#define	RVR2(...) RVR2_(__VA_ARGS__, 36)
#define	RVR3(...) RVR3_(__VA_ARGS__, 36, 35)
#define	RVR4(...) RVR4_(__VA_ARGS__, 36, 35, 34, 33)
#define	RVR5(...) RVR5_(__VA_ARGS__, 36, 35, 34, 33, 32)
#define	RVR6(...) RVR6_(__VA_ARGS__, 36, 35, 34, 33, 32, 31)
#define	RVR7(...) RVR7_(__VA_ARGS__, 36, 35, 34, 33, 32, 31, 30)

/* Input operand for one explicitly numbered variable wX. */
#define	RVR(X) [w##X] "v" (w##X)
84 
/*
 * WVRn_() picks its (n+1)-th argument and emits a write-only asm output
 * operand named w<arg> that is bound to the C variable w<arg>.
 *
 * The operand is marked early-clobber ("&").  The multi-instruction asm
 * bodies in this file write their first output before they have read all
 * of their inputs (COPY() is the case that matters), so an output must
 * never be allocated to a register that also carries an input.  Without
 * "&" the compiler is free to make that assignment.
 */
#define	WVR0_(REG, ...) [w##REG] "=&v" (w##REG)
#define	WVR1_(_1, REG, ...) [w##REG] "=&v" (w##REG)
#define	WVR2_(_1, _2, REG, ...) [w##REG] "=&v" (w##REG)
#define	WVR3_(_1, _2, _3, REG, ...) [w##REG] "=&v" (w##REG)
#define	WVR4_(_1, _2, _3, _4, REG, ...) [w##REG] "=&v" (w##REG)
#define	WVR5_(_1, _2, _3, _4, _5, REG, ...) [w##REG] "=&v" (w##REG)
#define	WVR6_(_1, _2, _3, _4, _5, _6, REG, ...) [w##REG] "=&v" (w##REG)
#define	WVR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) [w##REG] "=&v" (w##REG)

/*
 * WVRn(regs): output operand for the n-th register of a list.  The list
 * is padded with the spare variable numbers 36, 35, ... so that positions
 * past its end still resolve to distinct operands.
 */
#define	WVR0(r...) WVR0_(r)
#define	WVR1(r...) WVR1_(r)
#define	WVR2(r...) WVR2_(r, 36)
#define	WVR3(r...) WVR3_(r, 36, 35)
#define	WVR4(r...) WVR4_(r, 36, 35, 34, 33)
#define	WVR5(r...) WVR5_(r, 36, 35, 34, 33, 32)
#define	WVR6(r...) WVR6_(r, 36, 35, 34, 33, 32, 31)
#define	WVR7(r...) WVR7_(r, 36, 35, 34, 33, 32, 31, 30)

/* Output operand for one explicitly numbered variable wX. */
#define	WVR(X) [w##X] "=&v" (w##X)
104 
/*
 * UVRn_() picks its (n+1)-th argument and emits a read-write,
 * early-clobber ("+&v") asm operand named w<arg> that is bound to the
 * C variable w<arg>.
 */
#define	UVR0_(REG, ...) [w##REG] "+&v" (w##REG)
#define	UVR1_(a0, REG, ...) [w##REG] "+&v" (w##REG)
#define	UVR2_(a0, a1, REG, ...) [w##REG] "+&v" (w##REG)
#define	UVR3_(a0, a1, a2, REG, ...) [w##REG] "+&v" (w##REG)
#define	UVR4_(a0, a1, a2, a3, REG, ...) [w##REG] "+&v" (w##REG)
#define	UVR5_(a0, a1, a2, a3, a4, REG, ...) [w##REG] "+&v" (w##REG)
#define	UVR6_(a0, a1, a2, a3, a4, a5, REG, ...) [w##REG] "+&v" (w##REG)
#define	UVR7_(a0, a1, a2, a3, a4, a5, a6, REG, ...) [w##REG] "+&v" (w##REG)

/*
 * UVRn(regs): update operand for the n-th register of a list.  The list
 * is padded with the spare variable numbers 36, 35, ... so that positions
 * past its end still resolve to distinct operands.
 */
#define	UVR0(...) UVR0_(__VA_ARGS__)
#define	UVR1(...) UVR1_(__VA_ARGS__)
#define	UVR2(...) UVR2_(__VA_ARGS__, 36)
#define	UVR3(...) UVR3_(__VA_ARGS__, 36, 35)
#define	UVR4(...) UVR4_(__VA_ARGS__, 36, 35, 34, 33)
#define	UVR5(...) UVR5_(__VA_ARGS__, 36, 35, 34, 33, 32)
#define	UVR6(...) UVR6_(__VA_ARGS__, 36, 35, 34, 33, 32, 31)
#define	UVR7(...) UVR7_(__VA_ARGS__, 36, 35, 34, 33, 32, 31, 30)

/* Update operand for one explicitly numbered variable wX. */
#define	UVR(X) [w##X] "+&v" (w##X)
124 
/* R_01(list): the first two entries of a register list. */
#define	R_01(FIRST, SECOND, ...) FIRST, SECOND
/*
 * R_23(list): the third and fourth entries of a register list.  The
 * literals 1, 2, 3 are appended so that a shorter list still yields two
 * tokens.
 */
#define	_R_23(a0, a1, THIRD, FOURTH, ...) THIRD, FOURTH
#define	R_23(...) _R_23(__VA_ARGS__, 1, 2, 3)
128 
/*
 * Used as the default: arm of the register-count switches in this file;
 * expands to ASSERT(0).
 */
129 #define	ZFS_ASM_BUG()	ASSERT(0)
130 
/*
 * OFFSET(ptr, val): address `val` bytes past `ptr`, as unsigned char *.
 * Both arguments are parenthesised so that an expression such as
 * OFFSET(p, a | b) is added as a whole.
 */
#define	OFFSET(ptr, val)	(((unsigned char *)(ptr)) + (val))
132 
/*
 * Lookup tables read by _MULx2(): rows 4*c+0 .. 4*c+3 belong to the
 * multiplier c, and each row is 16 bytes wide.
 */
extern const uint8_t gf_clmul_mod_lt[4 * 256][16];

/* Size in bytes of one vector element. */
#define	ELEM_SIZE 16

/* One vector's worth of bytes, aligned to its own size. */
typedef struct v {
	uint8_t b[ELEM_SIZE] __attribute__((aligned(ELEM_SIZE)));
} v_t;
140 
/*
 * XOR_ACC(src, r...): XOR consecutive 16-byte vectors read from memory
 * into the listed vector registers, i.e. w[r_i] ^= *(src + 16 * i).
 *
 *   src   base address of the data to accumulate
 *   r...  list of 2, 4 or 8 register (variable) numbers; any other count
 *         reaches ZFS_ASM_BUG()
 *
 * The data is staged through the hard-coded scratch registers v18..v21,
 * which are listed as clobbers.  The asm reads memory through pointer
 * operands, which the compiler cannot see from the "r" constraints
 * alone, so "memory" is clobbered as well to keep earlier stores to the
 * source buffer ordered before the loads.
 */
#define	XOR_ACC(src, r...)					\
{								\
	switch (REG_CNT(r)) {					\
	case 8:							\
		__asm(						\
		"lvx 21,0,%[SRC0]\n"				\
		"lvx 20,0,%[SRC1]\n"				\
		"lvx 19,0,%[SRC2]\n"				\
		"lvx 18,0,%[SRC3]\n"				\
		"vxor " VR0(r) "," VR0(r) ",21\n"		\
		"vxor " VR1(r) "," VR1(r) ",20\n"		\
		"vxor " VR2(r) "," VR2(r) ",19\n"		\
		"vxor " VR3(r) "," VR3(r) ",18\n"		\
		"lvx 21,0,%[SRC4]\n"				\
		"lvx 20,0,%[SRC5]\n"				\
		"lvx 19,0,%[SRC6]\n"				\
		"lvx 18,0,%[SRC7]\n"				\
		"vxor " VR4(r) "," VR4(r) ",21\n"		\
		"vxor " VR5(r) "," VR5(r) ",20\n"		\
		"vxor " VR6(r) "," VR6(r) ",19\n"		\
		"vxor " VR7(r) "," VR7(r) ",18\n"		\
		:	UVR0(r), UVR1(r), UVR2(r), UVR3(r),	\
			UVR4(r), UVR5(r), UVR6(r), UVR7(r)	\
		:	[SRC0] "r" ((OFFSET(src, 0))),		\
		[SRC1] "r" ((OFFSET(src, 16))),			\
		[SRC2] "r" ((OFFSET(src, 32))),			\
		[SRC3] "r" ((OFFSET(src, 48))),			\
		[SRC4] "r" ((OFFSET(src, 64))),			\
		[SRC5] "r" ((OFFSET(src, 80))),			\
		[SRC6] "r" ((OFFSET(src, 96))),			\
		[SRC7] "r" ((OFFSET(src, 112)))			\
		:	"v18", "v19", "v20", "v21", "memory");	\
		break;						\
	case 4:							\
		__asm(						\
		"lvx 21,0,%[SRC0]\n"				\
		"lvx 20,0,%[SRC1]\n"				\
		"lvx 19,0,%[SRC2]\n"				\
		"lvx 18,0,%[SRC3]\n"				\
		"vxor " VR0(r) "," VR0(r) ",21\n"		\
		"vxor " VR1(r) "," VR1(r) ",20\n"		\
		"vxor " VR2(r) "," VR2(r) ",19\n"		\
		"vxor " VR3(r) "," VR3(r) ",18\n"		\
		:	UVR0(r), UVR1(r), UVR2(r), UVR3(r)	\
		:	[SRC0] "r" ((OFFSET(src, 0))),		\
		[SRC1] "r" ((OFFSET(src, 16))),			\
		[SRC2] "r" ((OFFSET(src, 32))),			\
		[SRC3] "r" ((OFFSET(src, 48)))			\
		:	"v18", "v19", "v20", "v21", "memory");	\
		break;						\
	case 2:							\
		__asm(						\
		"lvx 21,0,%[SRC0]\n"				\
		"lvx 20,0,%[SRC1]\n"				\
		"vxor " VR0(r) "," VR0(r) ",21\n"		\
		"vxor " VR1(r) "," VR1(r) ",20\n"		\
		:	UVR0(r), UVR1(r)			\
		:	[SRC0] "r" ((OFFSET(src, 0))),		\
		[SRC1] "r" ((OFFSET(src, 16)))			\
		:	"v20", "v21", "memory");		\
		break;						\
	default:						\
		ZFS_ASM_BUG();					\
	}							\
}
206 
/*
 * XOR(r...): XOR the first half of the register list into the second half.
 *
 *   8 registers: w[r4..r7] ^= w[r0..r3]
 *   4 registers: w[r2..r3] ^= w[r0..r1]
 *
 * Any other register count reaches ZFS_ASM_BUG().  The first half is
 * passed as read-only inputs (RVRn), the second half as read-write
 * operands (UVRn).
 */
207 #define	XOR(r...)						\
208 {								\
209 	switch (REG_CNT(r)) {					\
210 	case 8:							\
211 		__asm(						\
212 		"vxor " VR4(r) "," VR4(r) "," VR0(r) "\n"	\
213 		"vxor " VR5(r) "," VR5(r) "," VR1(r) "\n"	\
214 		"vxor " VR6(r) "," VR6(r) "," VR2(r) "\n"	\
215 		"vxor " VR7(r) "," VR7(r) "," VR3(r) "\n"	\
216 		:	UVR4(r), UVR5(r), UVR6(r), UVR7(r)	\
217 		:	RVR0(r), RVR1(r), RVR2(r), RVR3(r));	\
218 		break;						\
219 	case 4:							\
220 		__asm(						\
221 		"vxor " VR2(r) "," VR2(r) "," VR0(r) "\n"	\
222 		"vxor " VR3(r) "," VR3(r) "," VR1(r) "\n"	\
223 		:	UVR2(r), UVR3(r)			\
224 		:	RVR0(r), RVR1(r));			\
225 		break;						\
226 	default:						\
227 		ZFS_ASM_BUG();					\
228 	}							\
229 }
230 
/*
 * ZERO(r...): clear every listed vector register by XORing it with itself.
 *
 * Accepts 2, 4 or 8 registers; any other count reaches ZFS_ASM_BUG().
 * All registers are declared as outputs only (WVRn), so their previous
 * contents are not treated as inputs.
 */
231 #define	ZERO(r...)						\
232 {								\
233 	switch (REG_CNT(r)) {					\
234 	case 8:							\
235 		__asm(						\
236 		"vxor " VR0(r) "," VR0(r) "," VR0(r) "\n"	\
237 		"vxor " VR1(r) "," VR1(r) "," VR1(r) "\n"	\
238 		"vxor " VR2(r) "," VR2(r) "," VR2(r) "\n"	\
239 		"vxor " VR3(r) "," VR3(r) "," VR3(r) "\n"	\
240 		"vxor " VR4(r) "," VR4(r) "," VR4(r) "\n"	\
241 		"vxor " VR5(r) "," VR5(r) "," VR5(r) "\n"	\
242 		"vxor " VR6(r) "," VR6(r) "," VR6(r) "\n"	\
243 		"vxor " VR7(r) "," VR7(r) "," VR7(r) "\n"	\
244 		:	WVR0(r), WVR1(r), WVR2(r), WVR3(r),	\
245 			WVR4(r), WVR5(r), WVR6(r), WVR7(r));	\
246 		break;						\
247 	case 4:							\
248 		__asm(						\
249 		"vxor " VR0(r) "," VR0(r) "," VR0(r) "\n"	\
250 		"vxor " VR1(r) "," VR1(r) "," VR1(r) "\n"	\
251 		"vxor " VR2(r) "," VR2(r) "," VR2(r) "\n"	\
252 		"vxor " VR3(r) "," VR3(r) "," VR3(r) "\n"	\
253 		:	WVR0(r), WVR1(r), WVR2(r), WVR3(r));	\
254 		break;						\
255 	case 2:							\
256 		__asm(						\
257 		"vxor " VR0(r) "," VR0(r) "," VR0(r) "\n"	\
258 		"vxor " VR1(r) "," VR1(r) "," VR1(r) "\n"	\
259 		:	WVR0(r), WVR1(r));			\
260 		break;						\
261 	default:						\
262 		ZFS_ASM_BUG();					\
263 	}							\
264 }
265 
/*
 * COPY(r...): copy the first half of the register list into the second
 * half, using "vor dst,src,src" as the register move.
 *
 *   8 registers: w[r4..r7] = w[r0..r3]
 *   4 registers: w[r2..r3] = w[r0..r1]
 *
 * Any other register count reaches ZFS_ASM_BUG().  Sources are passed
 * through RVRn (inputs) and destinations through WVRn (outputs).
 */
266 #define	COPY(r...)						\
267 {								\
268 	switch (REG_CNT(r)) {					\
269 	case 8:							\
270 		__asm(						\
271 		"vor " VR4(r) "," VR0(r) "," VR0(r) "\n"	\
272 		"vor " VR5(r) "," VR1(r) "," VR1(r) "\n"	\
273 		"vor " VR6(r) "," VR2(r) "," VR2(r) "\n"	\
274 		"vor " VR7(r) "," VR3(r) "," VR3(r) "\n"	\
275 		:	WVR4(r), WVR5(r), WVR6(r), WVR7(r)	\
276 		:	RVR0(r), RVR1(r), RVR2(r), RVR3(r));	\
277 		break;						\
278 	case 4:							\
279 		__asm(						\
280 		"vor " VR2(r) "," VR0(r) "," VR0(r) "\n"	\
281 		"vor " VR3(r) "," VR1(r) "," VR1(r) "\n"	\
282 		:	WVR2(r), WVR3(r)			\
283 		:	RVR0(r), RVR1(r));			\
284 		break;						\
285 	default:						\
286 		ZFS_ASM_BUG();					\
287 	}							\
288 }
289 
/*
 * LOAD(src, r...): load consecutive 16-byte vectors from memory into the
 * listed vector registers, i.e. w[r_i] = *(src + 16 * i).
 *
 *   src   base address of the data to load
 *   r...  list of 2, 4 or 8 register (variable) numbers; any other count
 *         reaches ZFS_ASM_BUG()
 *
 * The asm reads memory through pointer operands, which the compiler
 * cannot see from the "r" constraints alone, so "memory" is clobbered
 * (as STORE() already does) to keep earlier stores to the source buffer
 * ordered before the loads.
 */
#define	LOAD(src, r...)						\
{								\
	switch (REG_CNT(r)) {					\
	case 8:							\
		__asm(						\
		"lvx " VR0(r) " ,0,%[SRC0]\n"			\
		"lvx " VR1(r) " ,0,%[SRC1]\n"			\
		"lvx " VR2(r) " ,0,%[SRC2]\n"			\
		"lvx " VR3(r) " ,0,%[SRC3]\n"			\
		"lvx " VR4(r) " ,0,%[SRC4]\n"			\
		"lvx " VR5(r) " ,0,%[SRC5]\n"			\
		"lvx " VR6(r) " ,0,%[SRC6]\n"			\
		"lvx " VR7(r) " ,0,%[SRC7]\n"			\
		:	WVR0(r), WVR1(r), WVR2(r), WVR3(r),	\
			WVR4(r), WVR5(r), WVR6(r), WVR7(r)	\
		:	[SRC0] "r" ((OFFSET(src, 0))),		\
		[SRC1] "r" ((OFFSET(src, 16))),			\
		[SRC2] "r" ((OFFSET(src, 32))),			\
		[SRC3] "r" ((OFFSET(src, 48))),			\
		[SRC4] "r" ((OFFSET(src, 64))),			\
		[SRC5] "r" ((OFFSET(src, 80))),			\
		[SRC6] "r" ((OFFSET(src, 96))),			\
		[SRC7] "r" ((OFFSET(src, 112)))			\
		:	"memory");				\
		break;						\
	case 4:							\
		__asm(						\
		"lvx " VR0(r) " ,0,%[SRC0]\n"			\
		"lvx " VR1(r) " ,0,%[SRC1]\n"			\
		"lvx " VR2(r) " ,0,%[SRC2]\n"			\
		"lvx " VR3(r) " ,0,%[SRC3]\n"			\
		:	WVR0(r), WVR1(r), WVR2(r), WVR3(r)	\
		:	[SRC0] "r" ((OFFSET(src, 0))),		\
		[SRC1] "r" ((OFFSET(src, 16))),			\
		[SRC2] "r" ((OFFSET(src, 32))),			\
		[SRC3] "r" ((OFFSET(src, 48)))			\
		:	"memory");				\
		break;						\
	case 2:							\
		__asm(						\
		"lvx " VR0(r) " ,0,%[SRC0]\n"			\
		"lvx " VR1(r) " ,0,%[SRC1]\n"			\
		:	WVR0(r), WVR1(r)			\
		:	[SRC0] "r" ((OFFSET(src, 0))),		\
		[SRC1] "r" ((OFFSET(src, 16)))			\
		:	"memory");				\
		break;						\
	default:						\
		ZFS_ASM_BUG();					\
	}							\
}
338 
/*
 * STORE(dst, r...): store the listed vector registers to consecutive
 * 16-byte slots in memory, i.e. *(dst + 16 * i) = w[r_i].
 *
 *   dst   base address of the destination buffer
 *   r...  list of 2, 4 or 8 register (variable) numbers; any other count
 *         reaches ZFS_ASM_BUG()
 *
 * The asm has no output operands; the stores are made visible to the
 * compiler through the "memory" clobber.
 * NOTE(review): stvx presumably expects dst to be 16-byte aligned --
 * confirm against the callers.
 */
339 #define	STORE(dst, r...)					\
340 {								\
341 	switch (REG_CNT(r)) {					\
342 	case 8:							\
343 		__asm(						\
344 		"stvx " VR0(r) " ,0,%[DST0]\n"			\
345 		"stvx " VR1(r) " ,0,%[DST1]\n"			\
346 		"stvx " VR2(r) " ,0,%[DST2]\n"			\
347 		"stvx " VR3(r) " ,0,%[DST3]\n"			\
348 		"stvx " VR4(r) " ,0,%[DST4]\n"			\
349 		"stvx " VR5(r) " ,0,%[DST5]\n"			\
350 		"stvx " VR6(r) " ,0,%[DST6]\n"			\
351 		"stvx " VR7(r) " ,0,%[DST7]\n"			\
352 		: :	[DST0] "r" ((OFFSET(dst, 0))),		\
353 		[DST1] "r" ((OFFSET(dst, 16))),			\
354 		[DST2] "r" ((OFFSET(dst, 32))),			\
355 		[DST3] "r" ((OFFSET(dst, 48))),			\
356 		[DST4] "r" ((OFFSET(dst, 64))),			\
357 		[DST5] "r" ((OFFSET(dst, 80))),			\
358 		[DST6] "r" ((OFFSET(dst, 96))),			\
359 		[DST7] "r" ((OFFSET(dst, 112))),		\
360 		RVR0(r), RVR1(r), RVR2(r), RVR3(r),		\
361 		RVR4(r), RVR5(r), RVR6(r), RVR7(r)		\
362 		:	"memory");				\
363 		break;						\
364 	case 4:							\
365 		__asm(						\
366 		"stvx " VR0(r) " ,0,%[DST0]\n"			\
367 		"stvx " VR1(r) " ,0,%[DST1]\n"			\
368 		"stvx " VR2(r) " ,0,%[DST2]\n"			\
369 		"stvx " VR3(r) " ,0,%[DST3]\n"			\
370 		: :	[DST0] "r" ((OFFSET(dst, 0))),		\
371 		[DST1] "r" ((OFFSET(dst, 16))),			\
372 		[DST2] "r" ((OFFSET(dst, 32))),			\
373 		[DST3] "r" ((OFFSET(dst, 48))),			\
374 		RVR0(r), RVR1(r), RVR2(r), RVR3(r)		\
375 		: "memory");					\
376 		break;						\
377 	case 2:							\
378 		__asm(						\
379 		"stvx " VR0(r) " ,0,%[DST0]\n"			\
380 		"stvx " VR1(r) " ,0,%[DST1]\n"			\
381 		: :	[DST0] "r" ((OFFSET(dst, 0))),		\
382 		[DST1] "r" ((OFFSET(dst, 16))),			\
383 		RVR0(r), RVR1(r) : "memory");			\
384 		break;						\
385 	default:						\
386 		ZFS_ASM_BUG();					\
387 	}							\
388 }
389 
390 /*
391  * Unfortunately cannot use the macro, because GCC
392  * will try to use the macro name and not value
393  * later on...
394  * Kept as a reference to what a numbered variable is
395  */
396 #define	_00	"17"	/* w17: all-zero vector, as left by MUL2_SETUP() */
397 #define	_1d	"16"	/* w16: 14 + 15 = 0x1d in every byte, see MUL2_SETUP() */
398 #define	_temp0	"19"	/* scratch vector register hard-coded in MUL2() */
399 #define	_temp1	"18"	/* scratch vector register hard-coded in MUL2() */
400 
/*
 * MUL2_SETUP(): prepare the two constant vectors that MUL2() reads.
 *
 * w16 is splatted with 14, w17 with 15, and their byte-wise sum
 * (29 = 0x1d) is written to w16; w17 is then cleared to zero by XORing
 * it with itself.  Both variables are declared as outputs only.
 */
401 #define	MUL2_SETUP()						\
402 {								\
403 	__asm(							\
404 		"vspltisb " VR(16) ",14\n"			\
405 		"vspltisb " VR(17) ",15\n"			\
406 		"vaddubm " VR(16) "," VR(17) "," VR(16) "\n"	\
407 		"vxor " VR(17) "," VR(17) "," VR(17) "\n"	\
408 		:	WVR(16), WVR(17));			\
409 }
410 
/*
 * MUL2(r...): double every byte of the listed registers, folding the
 * constant held in w16 back in wherever a byte had its top bit set.
 *
 * Per register x, with w17 and w16 as left by MUL2_SETUP() (zero and
 * 0x1d in every byte):
 *
 *   mask = (w17 > x), signed byte compare  -> 0xff where x is negative
 *   mask &= w16
 *   x = x + x                              (byte-wise, modulo 256)
 *   x ^= mask
 *
 * Accepts 4 or 2 registers; any other count reaches ZFS_ASM_BUG().  The
 * masks live in the hard-coded scratch registers v18..v21 (v18/v19 only
 * for two registers), which are listed as clobbers.
 */
411 #define	MUL2(r...)						\
412 {								\
413 	switch (REG_CNT(r)) {					\
414 	case 4:							\
415 		__asm(						\
416 		"vcmpgtsb 19," VR(17) "," VR0(r) "\n"		\
417 		"vcmpgtsb 18," VR(17) "," VR1(r) "\n"		\
418 		"vcmpgtsb 21," VR(17) "," VR2(r) "\n"		\
419 		"vcmpgtsb 20," VR(17) "," VR3(r) "\n"		\
420 		"vand 19,19," VR(16) "\n"			\
421 		"vand 18,18," VR(16) "\n"			\
422 		"vand 21,21," VR(16) "\n"			\
423 		"vand 20,20," VR(16) "\n"			\
424 		"vaddubm " VR0(r) "," VR0(r) "," VR0(r) "\n"	\
425 		"vaddubm " VR1(r) "," VR1(r) "," VR1(r) "\n"	\
426 		"vaddubm " VR2(r) "," VR2(r) "," VR2(r) "\n"	\
427 		"vaddubm " VR3(r) "," VR3(r) "," VR3(r) "\n"	\
428 		"vxor " VR0(r) ",19," VR0(r) "\n"		\
429 		"vxor " VR1(r) ",18," VR1(r) "\n"		\
430 		"vxor " VR2(r) ",21," VR2(r) "\n"		\
431 		"vxor " VR3(r) ",20," VR3(r) "\n"		\
432 		:	UVR0(r), UVR1(r), UVR2(r), UVR3(r)	\
433 		:	RVR(17), RVR(16)			\
434 		:	"v18", "v19", "v20", "v21");		\
435 		break;						\
436 	case 2:							\
437 		__asm(						\
438 		"vcmpgtsb 19," VR(17) "," VR0(r) "\n"		\
439 		"vcmpgtsb 18," VR(17) "," VR1(r) "\n"		\
440 		"vand 19,19," VR(16) "\n"			\
441 		"vand 18,18," VR(16) "\n"			\
442 		"vaddubm " VR0(r) "," VR0(r) "," VR0(r) "\n"	\
443 		"vaddubm " VR1(r) "," VR1(r) "," VR1(r) "\n"	\
444 		"vxor " VR0(r) ",19," VR0(r) "\n"		\
445 		"vxor " VR1(r) ",18," VR1(r) "\n"		\
446 		:	UVR0(r), UVR1(r)			\
447 		:	RVR(17), RVR(16)			\
448 		:	"v18", "v19");				\
449 		break;						\
450 	default:						\
451 		ZFS_ASM_BUG();					\
452 	}							\
453 }
454 
/* MUL4(regs): apply MUL2() to the same register list twice in a row. */
#define	MUL4(...)						\
{								\
	MUL2(__VA_ARGS__);					\
	MUL2(__VA_ARGS__);					\
}
460 
461 /*
462  * Unfortunately cannot use the macro, because GCC
463  * will try to use the macro name and not value
464  * later on...
465  * Kept as a reference to what a register is
466  * (here we're using actual registers for the
467  * clobbered ones)
468  */
469 #define	_0f		"15"	/* v15: first splatted with 15 (0x0f) in _MULx2() */
470 #define	_a_save		"14"	/* v14: first register ANDed with v15 */
471 #define	_b_save		"13"	/* v13: second register ANDed with v15 */
472 #define	_lt_mod_a	"12"	/* v12: vperm result in _MULx2() */
473 #define	_lt_clmul_a	"11"	/* v11: table row / vperm result in _MULx2() */
474 #define	_lt_mod_b	"10"	/* v10: table row / vperm result in _MULx2() */
475 #define	_lt_clmul_b	"15"	/* v15 again: shares the register named by _0f */
476 
/*
 * _MULx2(c, r...): transform exactly two vector registers through the
 * four 16-byte lookup rows gf_clmul_mod_lt[4*c + 0 .. 4*c + 3].
 *
 *   c     multiplier; selects the group of four table rows
 *   r...  exactly two register (variable) numbers; any other count
 *         reaches ZFS_ASM_BUG()
 *
 * Sequence, for each register x:
 *   1. v15 = 15 in every byte; rows 0 and 1 are loaded into v10 / v11.
 *   2. The bits selected by v15 are saved (v14 for the first register,
 *      v13 for the second), then x is shifted right by 4 per byte.
 *   3. x becomes row0[x] ^ row1[x], looked up with vperm ("upper part").
 *   4. Rows 2 and 3 are loaded into v10 / v15, and row2[saved] and
 *      row3[saved] are XORed into x ("lower part").
 *
 * v10..v15 are hard-coded scratch registers and are listed as clobbers.
 * v15 is reused as mask, shift count, table row and lookup result, so
 * the instruction order below must not be changed.
 */
477 #define	_MULx2(c, r...)						\
478 {								\
479 	switch (REG_CNT(r)) {					\
480 	case 2:							\
481 		__asm(						\
482 		/* lts for upper part */			\
483 		"vspltisb 15,15\n"				\
484 		"lvx 10,0,%[lt0]\n"				\
485 		"lvx 11,0,%[lt1]\n"				\
486 		/* upper part */				\
487 		"vand 14," VR0(r) ",15\n"			\
488 		"vand 13," VR1(r) ",15\n"			\
489 		"vspltisb 15,4\n"				\
490 		"vsrab " VR0(r) "," VR0(r) ",15\n"		\
491 		"vsrab " VR1(r) "," VR1(r) ",15\n"		\
492 								\
493 		"vperm 12,10,10," VR0(r) "\n"			\
494 		"vperm 10,10,10," VR1(r) "\n"			\
495 		"vperm 15,11,11," VR0(r) "\n"			\
496 		"vperm 11,11,11," VR1(r) "\n"			\
497 								\
498 		"vxor " VR0(r) ",15,12\n"			\
499 		"vxor " VR1(r) ",11,10\n"			\
500 		/* lts for lower part */			\
501 		"lvx 10,0,%[lt2]\n"				\
502 		"lvx 15,0,%[lt3]\n"				\
503 		/* lower part */				\
504 		"vperm 12,10,10,14\n"				\
505 		"vperm 10,10,10,13\n"				\
506 		"vperm 11,15,15,14\n"				\
507 		"vperm 15,15,15,13\n"				\
508 								\
509 		"vxor " VR0(r) "," VR0(r) ",12\n"		\
510 		"vxor " VR1(r) "," VR1(r) ",10\n"		\
511 		"vxor " VR0(r) "," VR0(r) ",11\n"		\
512 		"vxor " VR1(r) "," VR1(r) ",15\n"		\
513 		: UVR0(r), UVR1(r)				\
514 		: [lt0] "r" (&(gf_clmul_mod_lt[4*(c)+0][0])),	\
515 		[lt1] "r" (&(gf_clmul_mod_lt[4*(c)+1][0])),	\
516 		[lt2] "r" (&(gf_clmul_mod_lt[4*(c)+2][0])),	\
517 		[lt3] "r" (&(gf_clmul_mod_lt[4*(c)+3][0]))	\
518 		: "v10", "v11", "v12", "v13", "v14", "v15");	\
519 		break;						\
520 	default:						\
521 		ZFS_ASM_BUG();					\
522 	}							\
523 }
524 
/*
 * MUL(c, regs): run _MULx2() with multiplier c over the register list,
 * two registers at a time.  A four-register list is processed as its
 * third/fourth pair followed by its first/second pair; a two-register
 * list is processed directly.  Any other count reaches ZFS_ASM_BUG().
 */
#define	MUL(c, ...)						\
{								\
	if (REG_CNT(__VA_ARGS__) == 4) {			\
		_MULx2(c, R_23(__VA_ARGS__));			\
		_MULx2(c, R_01(__VA_ARGS__));			\
	} else if (REG_CNT(__VA_ARGS__) == 2) {			\
		_MULx2(c, R_01(__VA_ARGS__));			\
	} else {						\
		ZFS_ASM_BUG();					\
	}							\
}
539 
/*
 * Entry and exit hooks around a run of vector math; they expand to
 * kfpu_begin() and kfpu_end().
 * NOTE(review): those presumably come from <sys/simd.h> and save and
 * restore the vector state -- confirm.
 */
540 #define	raidz_math_begin()	kfpu_begin()
541 #define	raidz_math_end()	kfpu_end()
542 
/*
 * GEN_X_DEFINE_*(): declare the 16-byte vector variables wN that the asm
 * macros in this file name as operands.  Each macro declares the range
 * of variables given in its name, and GEN_X_DEFINE_ALL() declares all of
 * w0..w38.  The variables are ordinary locals; which hardware vector
 * register each one occupies is left to the compiler.
 *
 * Every declaration ends in its own semicolon, so the macros are invoked
 * without a trailing one.
 */
#define	GEN_X_DEFINE_VEC(n)	\
	unsigned char w##n __attribute__((vector_size(16)));

#define	GEN_X_DEFINE_0_3()	\
	GEN_X_DEFINE_VEC(0)	\
	GEN_X_DEFINE_VEC(1)	\
	GEN_X_DEFINE_VEC(2)	\
	GEN_X_DEFINE_VEC(3)
#define	GEN_X_DEFINE_4_5()	\
	GEN_X_DEFINE_VEC(4)	\
	GEN_X_DEFINE_VEC(5)
#define	GEN_X_DEFINE_6_7()	\
	GEN_X_DEFINE_VEC(6)	\
	GEN_X_DEFINE_VEC(7)
#define	GEN_X_DEFINE_8_9()	\
	GEN_X_DEFINE_VEC(8)	\
	GEN_X_DEFINE_VEC(9)
#define	GEN_X_DEFINE_10_11()	\
	GEN_X_DEFINE_VEC(10)	\
	GEN_X_DEFINE_VEC(11)
#define	GEN_X_DEFINE_12_15()	\
	GEN_X_DEFINE_VEC(12)	\
	GEN_X_DEFINE_VEC(13)	\
	GEN_X_DEFINE_VEC(14)	\
	GEN_X_DEFINE_VEC(15)
#define	GEN_X_DEFINE_16()	\
	GEN_X_DEFINE_VEC(16)
#define	GEN_X_DEFINE_17()	\
	GEN_X_DEFINE_VEC(17)
#define	GEN_X_DEFINE_18_21()	\
	GEN_X_DEFINE_VEC(18)	\
	GEN_X_DEFINE_VEC(19)	\
	GEN_X_DEFINE_VEC(20)	\
	GEN_X_DEFINE_VEC(21)
#define	GEN_X_DEFINE_22_23()	\
	GEN_X_DEFINE_VEC(22)	\
	GEN_X_DEFINE_VEC(23)
#define	GEN_X_DEFINE_24_27()	\
	GEN_X_DEFINE_VEC(24)	\
	GEN_X_DEFINE_VEC(25)	\
	GEN_X_DEFINE_VEC(26)	\
	GEN_X_DEFINE_VEC(27)
#define	GEN_X_DEFINE_28_30()	\
	GEN_X_DEFINE_VEC(28)	\
	GEN_X_DEFINE_VEC(29)	\
	GEN_X_DEFINE_VEC(30)
#define	GEN_X_DEFINE_31()	\
	GEN_X_DEFINE_VEC(31)
#define	GEN_X_DEFINE_32()	\
	GEN_X_DEFINE_VEC(32)
#define	GEN_X_DEFINE_33_36()	\
	GEN_X_DEFINE_VEC(33)	\
	GEN_X_DEFINE_VEC(34)	\
	GEN_X_DEFINE_VEC(35)	\
	GEN_X_DEFINE_VEC(36)
#define	GEN_X_DEFINE_37_38()	\
	GEN_X_DEFINE_VEC(37)	\
	GEN_X_DEFINE_VEC(38)
#define	GEN_X_DEFINE_ALL()	\
	GEN_X_DEFINE_0_3()	\
	GEN_X_DEFINE_4_5()	\
	GEN_X_DEFINE_6_7()	\
	GEN_X_DEFINE_8_9()	\
	GEN_X_DEFINE_10_11()	\
	GEN_X_DEFINE_12_15()	\
	GEN_X_DEFINE_16()	\
	GEN_X_DEFINE_17()	\
	GEN_X_DEFINE_18_21()	\
	GEN_X_DEFINE_22_23()	\
	GEN_X_DEFINE_24_27()	\
	GEN_X_DEFINE_28_30()	\
	GEN_X_DEFINE_31()	\
	GEN_X_DEFINE_32()	\
	GEN_X_DEFINE_33_36()	\
	GEN_X_DEFINE_37_38()
691