xref: /qemu/target/arm/tcg/crypto_helper.c (revision 940bb5fa)
/*
 * crypto_helper.c - emulate v8 Crypto Extensions instructions
 *
 * Copyright (C) 2013 - 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 */

#include "qemu/osdep.h"

#include "cpu.h"
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "crypto/aes-round.h"
#include "crypto/sm4.h"
#include "vec_internal.h"

union CRYPTO_STATE {
    uint8_t    bytes[16];
    uint32_t   words[4];
    uint64_t   l[2];
};

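/*
 * CR_ST_BYTE and CR_ST_WORD address the 128-bit state by Arm element
 * number (element 0 at the least significant end of l[0]), regardless
 * of the host byte order.
 */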
#if HOST_BIG_ENDIAN
#define CR_ST_BYTE(state, i)   ((state).bytes[(15 - (i)) ^ 8])
#define CR_ST_WORD(state, i)   ((state).words[(3 - (i)) ^ 2])
#else
#define CR_ST_BYTE(state, i)   ((state).bytes[i])
#define CR_ST_WORD(state, i)   ((state).words[i])
#endif

/*
 * The caller has not been converted to full gvec, and so only
 * modifies the low 16 bytes of the vector register.
 */
static void clear_tail_16(void *vd, uint32_t desc)
{
    int opr_sz = simd_oprsz(desc);
    int max_sz = simd_maxsz(desc);

    assert(opr_sz == 16);
    clear_tail(vd, opr_sz, max_sz);
}

static const AESState aes_zero = { };

void HELPER(crypto_aese)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);

    for (i = 0; i < opr_sz; i += 16) {
        AESState *ad = (AESState *)(vd + i);
        AESState *st = (AESState *)(vn + i);
        AESState *rk = (AESState *)(vm + i);
        AESState t;

        /*
         * Our uint64_t are in the wrong order for big-endian.
         * The Arm AddRoundKey comes first, while the API AddRoundKey
         * comes last: perform the xor here, and provide zero to the API.
         */
        if (HOST_BIG_ENDIAN) {
            t.d[0] = st->d[1] ^ rk->d[1];
            t.d[1] = st->d[0] ^ rk->d[0];
            aesenc_SB_SR_AK(&t, &t, &aes_zero, false);
            ad->d[0] = t.d[1];
            ad->d[1] = t.d[0];
        } else {
            t.v = st->v ^ rk->v;
            aesenc_SB_SR_AK(ad, &t, &aes_zero, false);
        }
    }
    clear_tail(vd, opr_sz, simd_maxsz(desc));
}

void HELPER(crypto_aesd)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);

    for (i = 0; i < opr_sz; i += 16) {
        AESState *ad = (AESState *)(vd + i);
        AESState *st = (AESState *)(vn + i);
        AESState *rk = (AESState *)(vm + i);
        AESState t;

        /* Our uint64_t are in the wrong order for big-endian. */
        if (HOST_BIG_ENDIAN) {
            t.d[0] = st->d[1] ^ rk->d[1];
            t.d[1] = st->d[0] ^ rk->d[0];
            aesdec_ISB_ISR_AK(&t, &t, &aes_zero, false);
            ad->d[0] = t.d[1];
            ad->d[1] = t.d[0];
        } else {
            t.v = st->v ^ rk->v;
            aesdec_ISB_ISR_AK(ad, &t, &aes_zero, false);
        }
    }
    clear_tail(vd, opr_sz, simd_maxsz(desc));
}

void HELPER(crypto_aesmc)(void *vd, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);

    for (i = 0; i < opr_sz; i += 16) {
        AESState *ad = (AESState *)(vd + i);
        AESState *st = (AESState *)(vm + i);
        AESState t;

        /* Our uint64_t are in the wrong order for big-endian. */
        if (HOST_BIG_ENDIAN) {
            t.d[0] = st->d[1];
            t.d[1] = st->d[0];
            aesenc_MC(&t, &t, false);
            ad->d[0] = t.d[1];
            ad->d[1] = t.d[0];
        } else {
            aesenc_MC(ad, st, false);
        }
    }
    clear_tail(vd, opr_sz, simd_maxsz(desc));
}

void HELPER(crypto_aesimc)(void *vd, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);

    for (i = 0; i < opr_sz; i += 16) {
        AESState *ad = (AESState *)(vd + i);
        AESState *st = (AESState *)(vm + i);
        AESState t;

        /* Our uint64_t are in the wrong order for big-endian. */
        if (HOST_BIG_ENDIAN) {
            t.d[0] = st->d[1];
            t.d[1] = st->d[0];
            aesdec_IMC(&t, &t, false);
            ad->d[0] = t.d[1];
            ad->d[1] = t.d[0];
        } else {
            aesdec_IMC(ad, st, false);
        }
    }
    clear_tail(vd, opr_sz, simd_maxsz(desc));
}

/*
 * SHA-1 logical functions
 */

static uint32_t cho(uint32_t x, uint32_t y, uint32_t z)
{
    return (x & (y ^ z)) ^ z;
}

static uint32_t par(uint32_t x, uint32_t y, uint32_t z)
{
    return x ^ y ^ z;
}

static uint32_t maj(uint32_t x, uint32_t y, uint32_t z)
{
    return (x & y) | ((x | y) & z);
}

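/*
 * SHA1SU0: in effect, T = Vn:Vd<191:64>; Vd = T EOR Vd EOR Vm,
 * i.e. the first half of the SHA-1 schedule update.
 */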
void HELPER(crypto_sha1su0)(void *vd, void *vn, void *vm, uint32_t desc)
{
    uint64_t *d = vd, *n = vn, *m = vm;
    uint64_t d0, d1;

    d0 = d[1] ^ d[0] ^ m[0];
    d1 = n[0] ^ d[1] ^ m[1];
    d[0] = d0;
    d[1] = d1;

    clear_tail_16(vd, desc);
}

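/*
 * Common helper for SHA1C, SHA1P and SHA1M: four SHA-1 rounds, with fn
 * selecting the Ch/Parity/Maj logical function.  Element 0 of n carries
 * the rotating 'e' value.
 */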
static inline void crypto_sha1_3reg(uint64_t *rd, uint64_t *rn,
                                    uint64_t *rm, uint32_t desc,
                                    uint32_t (*fn)(union CRYPTO_STATE *d))
{
    union CRYPTO_STATE d = { .l = { rd[0], rd[1] } };
    union CRYPTO_STATE n = { .l = { rn[0], rn[1] } };
    union CRYPTO_STATE m = { .l = { rm[0], rm[1] } };
    int i;

    for (i = 0; i < 4; i++) {
        uint32_t t = fn(&d);

        t += rol32(CR_ST_WORD(d, 0), 5) + CR_ST_WORD(n, 0)
             + CR_ST_WORD(m, i);

        CR_ST_WORD(n, 0) = CR_ST_WORD(d, 3);
        CR_ST_WORD(d, 3) = CR_ST_WORD(d, 2);
        CR_ST_WORD(d, 2) = ror32(CR_ST_WORD(d, 1), 2);
        CR_ST_WORD(d, 1) = CR_ST_WORD(d, 0);
        CR_ST_WORD(d, 0) = t;
    }
    rd[0] = d.l[0];
    rd[1] = d.l[1];

    clear_tail_16(rd, desc);
}

static uint32_t do_sha1c(union CRYPTO_STATE *d)
{
    return cho(CR_ST_WORD(*d, 1), CR_ST_WORD(*d, 2), CR_ST_WORD(*d, 3));
}

void HELPER(crypto_sha1c)(void *vd, void *vn, void *vm, uint32_t desc)
{
    crypto_sha1_3reg(vd, vn, vm, desc, do_sha1c);
}

static uint32_t do_sha1p(union CRYPTO_STATE *d)
{
    return par(CR_ST_WORD(*d, 1), CR_ST_WORD(*d, 2), CR_ST_WORD(*d, 3));
}

void HELPER(crypto_sha1p)(void *vd, void *vn, void *vm, uint32_t desc)
{
    crypto_sha1_3reg(vd, vn, vm, desc, do_sha1p);
}

static uint32_t do_sha1m(union CRYPTO_STATE *d)
{
    return maj(CR_ST_WORD(*d, 1), CR_ST_WORD(*d, 2), CR_ST_WORD(*d, 3));
}

void HELPER(crypto_sha1m)(void *vd, void *vn, void *vm, uint32_t desc)
{
    crypto_sha1_3reg(vd, vn, vm, desc, do_sha1m);
}

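/*
 * SHA1H: fixed rotate of the hash element by 30 bits (ror 2);
 * the upper three words of the result are zeroed.
 */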
void HELPER(crypto_sha1h)(void *vd, void *vm, uint32_t desc)
{
    uint64_t *rd = vd;
    uint64_t *rm = vm;
    union CRYPTO_STATE m = { .l = { rm[0], rm[1] } };

    CR_ST_WORD(m, 0) = ror32(CR_ST_WORD(m, 0), 2);
    CR_ST_WORD(m, 1) = CR_ST_WORD(m, 2) = CR_ST_WORD(m, 3) = 0;

    rd[0] = m.l[0];
    rd[1] = m.l[1];

    clear_tail_16(vd, desc);
}

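/*
 * SHA1SU1: second half of the schedule update: each result word is the
 * xor of its inputs rotated left by 1; the last lane reuses the freshly
 * computed lane 0.
 */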
void HELPER(crypto_sha1su1)(void *vd, void *vm, uint32_t desc)
{
    uint64_t *rd = vd;
    uint64_t *rm = vm;
    union CRYPTO_STATE d = { .l = { rd[0], rd[1] } };
    union CRYPTO_STATE m = { .l = { rm[0], rm[1] } };

    CR_ST_WORD(d, 0) = rol32(CR_ST_WORD(d, 0) ^ CR_ST_WORD(m, 1), 1);
    CR_ST_WORD(d, 1) = rol32(CR_ST_WORD(d, 1) ^ CR_ST_WORD(m, 2), 1);
    CR_ST_WORD(d, 2) = rol32(CR_ST_WORD(d, 2) ^ CR_ST_WORD(m, 3), 1);
    CR_ST_WORD(d, 3) = rol32(CR_ST_WORD(d, 3) ^ CR_ST_WORD(d, 0), 1);

    rd[0] = d.l[0];
    rd[1] = d.l[1];

    clear_tail_16(vd, desc);
}

/*
 * The SHA-256 logical functions, according to
 * http://csrc.nist.gov/groups/STM/cavp/documents/shs/sha256-384-512.pdf
 */

static uint32_t S0(uint32_t x)
{
    return ror32(x, 2) ^ ror32(x, 13) ^ ror32(x, 22);
}

static uint32_t S1(uint32_t x)
{
    return ror32(x, 6) ^ ror32(x, 11) ^ ror32(x, 25);
}

static uint32_t s0(uint32_t x)
{
    return ror32(x, 7) ^ ror32(x, 18) ^ (x >> 3);
}

static uint32_t s1(uint32_t x)
{
    return ror32(x, 17) ^ ror32(x, 19) ^ (x >> 10);
}

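/*
 * SHA256H: four rounds of SHA-256.  d holds the ABCD working variables,
 * n holds EFGH, and m supplies the four W+K values.
 */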
void HELPER(crypto_sha256h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    uint64_t *rd = vd;
    uint64_t *rn = vn;
    uint64_t *rm = vm;
    union CRYPTO_STATE d = { .l = { rd[0], rd[1] } };
    union CRYPTO_STATE n = { .l = { rn[0], rn[1] } };
    union CRYPTO_STATE m = { .l = { rm[0], rm[1] } };
    int i;

    for (i = 0; i < 4; i++) {
        uint32_t t = cho(CR_ST_WORD(n, 0), CR_ST_WORD(n, 1), CR_ST_WORD(n, 2))
                     + CR_ST_WORD(n, 3) + S1(CR_ST_WORD(n, 0))
                     + CR_ST_WORD(m, i);

        CR_ST_WORD(n, 3) = CR_ST_WORD(n, 2);
        CR_ST_WORD(n, 2) = CR_ST_WORD(n, 1);
        CR_ST_WORD(n, 1) = CR_ST_WORD(n, 0);
        CR_ST_WORD(n, 0) = CR_ST_WORD(d, 3) + t;

        t += maj(CR_ST_WORD(d, 0), CR_ST_WORD(d, 1), CR_ST_WORD(d, 2))
             + S0(CR_ST_WORD(d, 0));

        CR_ST_WORD(d, 3) = CR_ST_WORD(d, 2);
        CR_ST_WORD(d, 2) = CR_ST_WORD(d, 1);
        CR_ST_WORD(d, 1) = CR_ST_WORD(d, 0);
        CR_ST_WORD(d, 0) = t;
    }

    rd[0] = d.l[0];
    rd[1] = d.l[1];

    clear_tail_16(vd, desc);
}

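/*
 * SHA256H2: the same four rounds viewed from the EFGH half; d holds the
 * EFGH working variables and n supplies the previous ABCD values.
 */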
void HELPER(crypto_sha256h2)(void *vd, void *vn, void *vm, uint32_t desc)
{
    uint64_t *rd = vd;
    uint64_t *rn = vn;
    uint64_t *rm = vm;
    union CRYPTO_STATE d = { .l = { rd[0], rd[1] } };
    union CRYPTO_STATE n = { .l = { rn[0], rn[1] } };
    union CRYPTO_STATE m = { .l = { rm[0], rm[1] } };
    int i;

    for (i = 0; i < 4; i++) {
        uint32_t t = cho(CR_ST_WORD(d, 0), CR_ST_WORD(d, 1), CR_ST_WORD(d, 2))
                     + CR_ST_WORD(d, 3) + S1(CR_ST_WORD(d, 0))
                     + CR_ST_WORD(m, i);

        CR_ST_WORD(d, 3) = CR_ST_WORD(d, 2);
        CR_ST_WORD(d, 2) = CR_ST_WORD(d, 1);
        CR_ST_WORD(d, 1) = CR_ST_WORD(d, 0);
        CR_ST_WORD(d, 0) = CR_ST_WORD(n, 3 - i) + t;
    }

    rd[0] = d.l[0];
    rd[1] = d.l[1];

    clear_tail_16(vd, desc);
}

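/* SHA256SU0: schedule update part 1: w[t-16] + sigma0(w[t-15]) per lane. */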
void HELPER(crypto_sha256su0)(void *vd, void *vm, uint32_t desc)
{
    uint64_t *rd = vd;
    uint64_t *rm = vm;
    union CRYPTO_STATE d = { .l = { rd[0], rd[1] } };
    union CRYPTO_STATE m = { .l = { rm[0], rm[1] } };

    CR_ST_WORD(d, 0) += s0(CR_ST_WORD(d, 1));
    CR_ST_WORD(d, 1) += s0(CR_ST_WORD(d, 2));
    CR_ST_WORD(d, 2) += s0(CR_ST_WORD(d, 3));
    CR_ST_WORD(d, 3) += s0(CR_ST_WORD(m, 0));

    rd[0] = d.l[0];
    rd[1] = d.l[1];

    clear_tail_16(vd, desc);
}

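/*
 * SHA256SU1: schedule update part 2: add sigma1(w[t-2]) and w[t-7] per
 * lane; the last two lanes consume results computed just above.
 */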
void HELPER(crypto_sha256su1)(void *vd, void *vn, void *vm, uint32_t desc)
{
    uint64_t *rd = vd;
    uint64_t *rn = vn;
    uint64_t *rm = vm;
    union CRYPTO_STATE d = { .l = { rd[0], rd[1] } };
    union CRYPTO_STATE n = { .l = { rn[0], rn[1] } };
    union CRYPTO_STATE m = { .l = { rm[0], rm[1] } };

    CR_ST_WORD(d, 0) += s1(CR_ST_WORD(m, 2)) + CR_ST_WORD(n, 1);
    CR_ST_WORD(d, 1) += s1(CR_ST_WORD(m, 3)) + CR_ST_WORD(n, 2);
    CR_ST_WORD(d, 2) += s1(CR_ST_WORD(d, 0)) + CR_ST_WORD(n, 3);
    CR_ST_WORD(d, 3) += s1(CR_ST_WORD(d, 1)) + CR_ST_WORD(m, 0);

    rd[0] = d.l[0];
    rd[1] = d.l[1];

    clear_tail_16(vd, desc);
}

/*
 * The SHA-512 logical functions (same as above but using 64-bit operands)
 */

static uint64_t cho512(uint64_t x, uint64_t y, uint64_t z)
{
    return (x & (y ^ z)) ^ z;
}

static uint64_t maj512(uint64_t x, uint64_t y, uint64_t z)
{
    return (x & y) | ((x | y) & z);
}

static uint64_t S0_512(uint64_t x)
{
    return ror64(x, 28) ^ ror64(x, 34) ^ ror64(x, 39);
}

static uint64_t S1_512(uint64_t x)
{
    return ror64(x, 14) ^ ror64(x, 18) ^ ror64(x, 41);
}

static uint64_t s0_512(uint64_t x)
{
    return ror64(x, 1) ^ ror64(x, 8) ^ (x >> 7);
}

static uint64_t s1_512(uint64_t x)
{
    return ror64(x, 19) ^ ror64(x, 61) ^ (x >> 6);
}

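/*
 * SHA512H: first part of the SHA-512 hash update: in effect the Sigma1
 * and Ch contributions for two rounds.
 */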
void HELPER(crypto_sha512h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    uint64_t *rd = vd;
    uint64_t *rn = vn;
    uint64_t *rm = vm;
    uint64_t d0 = rd[0];
    uint64_t d1 = rd[1];

    d1 += S1_512(rm[1]) + cho512(rm[1], rn[0], rn[1]);
    d0 += S1_512(d1 + rm[0]) + cho512(d1 + rm[0], rm[1], rn[0]);

    rd[0] = d0;
    rd[1] = d1;

    clear_tail_16(vd, desc);
}

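/*
 * SHA512H2: second part of the hash update: the Sigma0 and Maj
 * contributions for the same two rounds.
 */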
void HELPER(crypto_sha512h2)(void *vd, void *vn, void *vm, uint32_t desc)
{
    uint64_t *rd = vd;
    uint64_t *rn = vn;
    uint64_t *rm = vm;
    uint64_t d0 = rd[0];
    uint64_t d1 = rd[1];

    d1 += S0_512(rm[0]) + maj512(rn[0], rm[1], rm[0]);
    d0 += S0_512(d1) + maj512(d1, rm[0], rm[1]);

    rd[0] = d0;
    rd[1] = d1;

    clear_tail_16(vd, desc);
}

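/* SHA512SU0: schedule update part 1: w[t-16] + sigma0(w[t-15]) per lane. */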
void HELPER(crypto_sha512su0)(void *vd, void *vn, uint32_t desc)
{
    uint64_t *rd = vd;
    uint64_t *rn = vn;
    uint64_t d0 = rd[0];
    uint64_t d1 = rd[1];

    d0 += s0_512(rd[1]);
    d1 += s0_512(rn[0]);

    rd[0] = d0;
    rd[1] = d1;

    clear_tail_16(vd, desc);
}

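/* SHA512SU1: schedule update part 2: add sigma1(w[t-2]) and w[t-7]. */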
void HELPER(crypto_sha512su1)(void *vd, void *vn, void *vm, uint32_t desc)
{
    uint64_t *rd = vd;
    uint64_t *rn = vn;
    uint64_t *rm = vm;

    rd[0] += s1_512(rn[0]) + rm[0];
    rd[1] += s1_512(rn[1]) + rm[1];

    clear_tail_16(vd, desc);
}

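/*
 * SM3PARTW1: part of the SM3 message expansion.  Each lane applies the
 * P1 permutation, P1(x) = x ^ rol32(x, 15) ^ rol32(x, 23), written here
 * with the equivalent right rotates; SM3PARTW2 below folds in the
 * remaining expansion terms.
 */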
void HELPER(crypto_sm3partw1)(void *vd, void *vn, void *vm, uint32_t desc)
{
    uint64_t *rd = vd;
    uint64_t *rn = vn;
    uint64_t *rm = vm;
    union CRYPTO_STATE d = { .l = { rd[0], rd[1] } };
    union CRYPTO_STATE n = { .l = { rn[0], rn[1] } };
    union CRYPTO_STATE m = { .l = { rm[0], rm[1] } };
    uint32_t t;

    t = CR_ST_WORD(d, 0) ^ CR_ST_WORD(n, 0) ^ ror32(CR_ST_WORD(m, 1), 17);
    CR_ST_WORD(d, 0) = t ^ ror32(t, 17) ^ ror32(t, 9);

    t = CR_ST_WORD(d, 1) ^ CR_ST_WORD(n, 1) ^ ror32(CR_ST_WORD(m, 2), 17);
    CR_ST_WORD(d, 1) = t ^ ror32(t, 17) ^ ror32(t, 9);

    t = CR_ST_WORD(d, 2) ^ CR_ST_WORD(n, 2) ^ ror32(CR_ST_WORD(m, 3), 17);
    CR_ST_WORD(d, 2) = t ^ ror32(t, 17) ^ ror32(t, 9);

    t = CR_ST_WORD(d, 3) ^ CR_ST_WORD(n, 3) ^ ror32(CR_ST_WORD(d, 0), 17);
    CR_ST_WORD(d, 3) = t ^ ror32(t, 17) ^ ror32(t, 9);

    rd[0] = d.l[0];
    rd[1] = d.l[1];

    clear_tail_16(vd, desc);
}

void HELPER(crypto_sm3partw2)(void *vd, void *vn, void *vm, uint32_t desc)
{
    uint64_t *rd = vd;
    uint64_t *rn = vn;
    uint64_t *rm = vm;
    union CRYPTO_STATE d = { .l = { rd[0], rd[1] } };
    union CRYPTO_STATE n = { .l = { rn[0], rn[1] } };
    union CRYPTO_STATE m = { .l = { rm[0], rm[1] } };
    uint32_t t = CR_ST_WORD(n, 0) ^ ror32(CR_ST_WORD(m, 0), 25);

    CR_ST_WORD(d, 0) ^= t;
    CR_ST_WORD(d, 1) ^= CR_ST_WORD(n, 1) ^ ror32(CR_ST_WORD(m, 1), 25);
    CR_ST_WORD(d, 2) ^= CR_ST_WORD(n, 2) ^ ror32(CR_ST_WORD(m, 2), 25);
    CR_ST_WORD(d, 3) ^= CR_ST_WORD(n, 3) ^ ror32(CR_ST_WORD(m, 3), 25) ^
                        ror32(t, 17) ^ ror32(t, 2) ^ ror32(t, 26);

    rd[0] = d.l[0];
    rd[1] = d.l[1];

    clear_tail_16(vd, desc);
}

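/*
 * Common helper for the SM3TT{1,2}{A,B} insns: one iteration of the SM3
 * compression function.  opcode selects the boolean function (parity for
 * the A forms, majority or choose for the B forms), imm2 selects which
 * word of m is consumed, and the TT2 forms additionally apply the P0
 * permutation, P0(x) = x ^ rol32(x, 9) ^ rol32(x, 17).
 */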
static inline void QEMU_ALWAYS_INLINE
crypto_sm3tt(uint64_t *rd, uint64_t *rn, uint64_t *rm,
             uint32_t desc, uint32_t opcode)
{
    union CRYPTO_STATE d = { .l = { rd[0], rd[1] } };
    union CRYPTO_STATE n = { .l = { rn[0], rn[1] } };
    union CRYPTO_STATE m = { .l = { rm[0], rm[1] } };
    uint32_t imm2 = simd_data(desc);
    uint32_t t;

    assert(imm2 < 4);

    if (opcode == 0 || opcode == 2) {
        /* SM3TT1A, SM3TT2A */
        t = par(CR_ST_WORD(d, 3), CR_ST_WORD(d, 2), CR_ST_WORD(d, 1));
    } else if (opcode == 1) {
        /* SM3TT1B */
        t = maj(CR_ST_WORD(d, 3), CR_ST_WORD(d, 2), CR_ST_WORD(d, 1));
    } else if (opcode == 3) {
        /* SM3TT2B */
        t = cho(CR_ST_WORD(d, 3), CR_ST_WORD(d, 2), CR_ST_WORD(d, 1));
    } else {
        qemu_build_not_reached();
    }

    t += CR_ST_WORD(d, 0) + CR_ST_WORD(m, imm2);

    CR_ST_WORD(d, 0) = CR_ST_WORD(d, 1);

    if (opcode < 2) {
        /* SM3TT1A, SM3TT1B */
        t += CR_ST_WORD(n, 3) ^ ror32(CR_ST_WORD(d, 3), 20);

        CR_ST_WORD(d, 1) = ror32(CR_ST_WORD(d, 2), 23);
    } else {
        /* SM3TT2A, SM3TT2B */
        t += CR_ST_WORD(n, 3);
        t ^= rol32(t, 9) ^ rol32(t, 17);

        CR_ST_WORD(d, 1) = ror32(CR_ST_WORD(d, 2), 13);
    }

    CR_ST_WORD(d, 2) = CR_ST_WORD(d, 3);
    CR_ST_WORD(d, 3) = t;

    rd[0] = d.l[0];
    rd[1] = d.l[1];

    clear_tail_16(rd, desc);
}

#define DO_SM3TT(NAME, OPCODE) \
    void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
    { crypto_sm3tt(vd, vn, vm, desc, OPCODE); }

DO_SM3TT(crypto_sm3tt1a, 0)
DO_SM3TT(crypto_sm3tt1b, 1)
DO_SM3TT(crypto_sm3tt2a, 2)
DO_SM3TT(crypto_sm3tt2b, 3)

#undef DO_SM3TT

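/*
 * SM4E: four SM4 rounds.  Each round applies T(x) = L(tau(x)), where tau
 * is the S-box substitution (sm4_subword) and
 * L(B) = B ^ rol32(B, 2) ^ rol32(B, 10) ^ rol32(B, 18) ^ rol32(B, 24).
 */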
static void do_crypto_sm4e(uint64_t *rd, uint64_t *rn, uint64_t *rm)
{
    union CRYPTO_STATE d = { .l = { rn[0], rn[1] } };
    union CRYPTO_STATE n = { .l = { rm[0], rm[1] } };
    uint32_t t, i;

    for (i = 0; i < 4; i++) {
        t = CR_ST_WORD(d, (i + 1) % 4) ^
            CR_ST_WORD(d, (i + 2) % 4) ^
            CR_ST_WORD(d, (i + 3) % 4) ^
            CR_ST_WORD(n, i);

        t = sm4_subword(t);

        CR_ST_WORD(d, i) ^= t ^ rol32(t, 2) ^ rol32(t, 10) ^ rol32(t, 18) ^
                            rol32(t, 24);
    }

    rd[0] = d.l[0];
    rd[1] = d.l[1];
}

void HELPER(crypto_sm4e)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);

    for (i = 0; i < opr_sz; i += 16) {
        do_crypto_sm4e(vd + i, vn + i, vm + i);
    }
    clear_tail(vd, opr_sz, simd_maxsz(desc));
}

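/*
 * SM4EKEY: SM4 key schedule, using the key-schedule linear transform
 * L'(B) = B ^ rol32(B, 13) ^ rol32(B, 23).
 */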
static void do_crypto_sm4ekey(uint64_t *rd, uint64_t *rn, uint64_t *rm)
{
    union CRYPTO_STATE d;
    union CRYPTO_STATE n = { .l = { rn[0], rn[1] } };
    union CRYPTO_STATE m = { .l = { rm[0], rm[1] } };
    uint32_t t, i;

    d = n;
    for (i = 0; i < 4; i++) {
        t = CR_ST_WORD(d, (i + 1) % 4) ^
            CR_ST_WORD(d, (i + 2) % 4) ^
            CR_ST_WORD(d, (i + 3) % 4) ^
            CR_ST_WORD(m, i);

        t = sm4_subword(t);

        CR_ST_WORD(d, i) ^= t ^ rol32(t, 13) ^ rol32(t, 23);
    }

    rd[0] = d.l[0];
    rd[1] = d.l[1];
}

void HELPER(crypto_sm4ekey)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);

    for (i = 0; i < opr_sz; i += 16) {
        do_crypto_sm4ekey(vd + i, vn + i, vm + i);
    }
    clear_tail(vd, opr_sz, simd_maxsz(desc));
}

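/* RAX1 (SHA-3 extension): per 64-bit lane, d = n ^ rol64(m, 1). */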
void HELPER(crypto_rax1)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = n[i] ^ rol64(m[i], 1);
    }
    clear_tail(vd, opr_sz, simd_maxsz(desc));
}