1 /* AesOpt.c -- AES optimized code for x86 AES hardware instructions
2 2021-04-01 : Igor Pavlov : Public domain */
3 
4 #include "Precomp.h"
5 
6 #include "CpuArch.h"
7 
8 #ifdef MY_CPU_X86_OR_AMD64
9 
10   #if defined(__clang__)
11     #if __clang_major__ > 3 || (__clang_major__ == 3 && __clang_minor__ >= 8)
12       #define USE_INTEL_AES
13         #define ATTRIB_AES __attribute__((__target__("aes")))
14       #if (__clang_major__ >= 8)
15         #define USE_INTEL_VAES
16         #define ATTRIB_VAES __attribute__((__target__("aes,vaes,avx2")))
17       #endif
18     #endif
19   #elif defined(__GNUC__)
20     #if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4)
21       #define USE_INTEL_AES
22       #ifndef __AES__
23         #define ATTRIB_AES __attribute__((__target__("aes")))
24       #endif
25       #if (__GNUC__ >= 8)
26         #define USE_INTEL_VAES
27         #define ATTRIB_VAES __attribute__((__target__("aes,vaes,avx2")))
28       #endif
29     #endif
30   #elif defined(__INTEL_COMPILER)
31     #if (__INTEL_COMPILER >= 1110)
32       #define USE_INTEL_AES
33       #if (__INTEL_COMPILER >= 1900)
34         #define USE_INTEL_VAES
35       #endif
36     #endif
37   #elif defined(_MSC_VER)
38     #if (_MSC_VER > 1500) || (_MSC_FULL_VER >= 150030729)
39       #define USE_INTEL_AES
40       #if (_MSC_VER >= 1910)
41         #define USE_INTEL_VAES
42       #endif
43     #endif
44   #endif
45 
46 #ifndef ATTRIB_AES
47   #define ATTRIB_AES
48 #endif
49 #ifndef ATTRIB_VAES
50   #define ATTRIB_VAES
51 #endif
52 
53 
54 #ifdef USE_INTEL_AES
55 
56 #include <wmmintrin.h>
57 
/* Parameter types used by the *_256 compatibility stubs (needed only without VAES). */
#ifndef USE_INTEL_VAES
#define AES_TYPE_keys __m128i
#define AES_TYPE_data __m128i
#endif

/* Common signature of the block functions:
     p         : state block, parameter block and round keys (layout is set up by the caller)
     data      : 16-byte blocks processed in place
     numBlocks : number of 16-byte blocks in data */
#define AES_FUNC_START(funcName) \
    void MY_FAST_CALL funcName(__m128i *p, __m128i *data, size_t numBlocks)

/* Prototype, then the header of the definition with the AES target attribute. */
#define AES_FUNC_START2(funcName) \
AES_FUNC_START (funcName); \
ATTRIB_AES \
AES_FUNC_START (funcName)

/* dst = fn(dst, arg);   The expansion already ends with ';'. */
#define MM_OP(fn, dst, arg)   dst = fn(dst, arg);
/* Same, applied to the local variable m of the calling function. */
#define MM_OP_m(fn, arg)      MM_OP(fn, m, arg);

#define MM_XOR( dst, arg)     MM_OP(_mm_xor_si128,    dst, arg);
#define AVX_XOR(dst, arg)     MM_OP(_mm256_xor_si256, dst, arg);
76 
77 
AES_FUNC_START2(AesCbc_Encode_HW)78 AES_FUNC_START2 (AesCbc_Encode_HW)
79 {
80   __m128i m = *p;
81   const __m128i k0 = p[2];
82   const __m128i k1 = p[3];
83   const UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1;
84   for (; numBlocks != 0; numBlocks--, data++)
85   {
86     UInt32 r = numRounds2;
87     const __m128i *w = p + 4;
88     __m128i temp = *data;
89     MM_XOR (temp, k0);
90     MM_XOR (m, temp);
91     MM_OP_m (_mm_aesenc_si128, k1);
92     do
93     {
94       MM_OP_m (_mm_aesenc_si128, w[0]);
95       MM_OP_m (_mm_aesenc_si128, w[1]);
96       w += 2;
97     }
98     while (--r);
99     MM_OP_m (_mm_aesenclast_si128, w[0]);
100     *data = m;
101   }
102   *p = m;
103 }
104 
105 
/* Lane chain: WOP_n (X) expands to "X (m1, 1); ... X (m(n-1), n-1);".
   Lane m0 is added by WOP itself, so WOP_M1 (X) covers every lane except m0. */
#define WOP_1(X)
#define WOP_2(X)   WOP_1 (X)  X (m1, 1);
#define WOP_3(X)   WOP_2 (X)  X (m2, 2);
#define WOP_4(X)   WOP_3 (X)  X (m3, 3);

/* Wider chains (WOP_9 .. WOP_14 for lanes m8 .. m13) are not enabled;
   they would follow the same pattern. */

#ifdef MY_CPU_AMD64
  #define WOP_5(X)   WOP_4 (X)  X (m4, 4);
  #define WOP_6(X)   WOP_5 (X)  X (m5, 5);
  #define WOP_7(X)   WOP_6 (X)  X (m6, 6);
  #define WOP_8(X)   WOP_7 (X)  X (m7, 7);

  #define NUM_WAYS      8
  #define WOP_M1    WOP_8
#else
  #define NUM_WAYS      4
  #define WOP_M1    WOP_4
#endif

/* Applies X to all NUM_WAYS lanes: m0 .. m(NUM_WAYS - 1). */
#define WOP(X)  X (m0, 0);  WOP_M1(X)
134 
135 
/* Per-lane operations meant to be passed to WOP / WOP_M1 as "X (lane, index)".
   They rely on names of the calling function: data, key, ctr, one, ctr2, two. */

/* 128-bit lanes */
#define DECLARE_VAR(x, i)  __m128i x
#define LOAD_data(  x, i)  x = data[i];
#define STORE_data( x, i)  data[i] = x;
#if (NUM_WAYS > 1)
/* x ^= the block that precedes block i */
#define XOR_data_M1(x, i)  MM_XOR (x, data[i- 1]);
#endif

/* 256-bit lanes: lane i covers the blocks data[2 * i] and data[2 * i + 1] */
#define AVX__DECLARE_VAR(x, i)  __m256i x
#define AVX__LOAD_data(  x, i)  x = ((const __m256i *)(const void *)data)[i];
#define AVX__STORE_data( x, i)  ((__m256i *)(void *)data)[i] = x;
/* same view shifted back by one 16-byte block */
#define AVX__XOR_data_M1(x, i)  AVX_XOR (x, (((const __m256i *)(const void *)(data - 1))[i]));

/* x = fn(x, key);   key is declared by WOP_KEY / AVX__WOP_KEY */
#define MM_OP_key(fn, x)  MM_OP(fn, x, key);

#define AES_DEC(      x, i)   MM_OP_key (_mm_aesdec_si128,     x)
#define AES_DEC_LAST( x, i)   MM_OP_key (_mm_aesdeclast_si128, x)
#define AES_ENC(      x, i)   MM_OP_key (_mm_aesenc_si128,     x)
#define AES_ENC_LAST( x, i)   MM_OP_key (_mm_aesenclast_si128, x)
#define AES_XOR(      x, i)   MM_OP_key (_mm_xor_si128,        x)


#define AVX__AES_DEC(      x, i)   MM_OP_key (_mm256_aesdec_epi128,     x)
#define AVX__AES_DEC_LAST( x, i)   MM_OP_key (_mm256_aesdeclast_epi128, x)
#define AVX__AES_ENC(      x, i)   MM_OP_key (_mm256_aesenc_epi128,     x)
#define AVX__AES_ENC_LAST( x, i)   MM_OP_key (_mm256_aesenclast_epi128, x)
#define AVX__AES_XOR(      x, i)   MM_OP_key (_mm256_xor_si256,         x)

/* CTR mode: advance the counter and copy it into the lane; later xor the lane into block i */
#define CTR_START(x, i)  MM_OP (_mm_add_epi64, ctr, one); x = ctr;
#define CTR_END(  x, i)  MM_XOR (data[i], x);

/* 256-bit CTR: ctr2 is advanced by two, and the lane starts as (ctr2 ^ key) */
#define AVX__CTR_START(x, i)  MM_OP (_mm256_add_epi64, ctr2, two); x = _mm256_xor_si256(ctr2, key);
#define AVX__CTR_END(  x, i)  AVX_XOR (((__m256i *)(void *)data)[i], x);

/* Loads w[idx] once into key and applies OP to all lanes. */
#define WOP_KEY(OP, idx) { \
    const __m128i key = w[idx]; \
    WOP(OP); }

#define AVX__WOP_KEY(OP, idx) { \
    const __m256i key = w[idx]; \
    WOP(OP); }
176 
177 
/* Loop scaffolding. The macros use these names of the calling function:
   data, dataEnd, numBlocks (and p, numRounds for the AVX variants). */

/* Opens the unrolled loop. It runs only if at least NUM_WAYS blocks are present and
   handles NUM_WAYS blocks per iteration. dataEnd is always set to (data + numBlocks). */
#define WIDE_LOOP_START  \
    dataEnd = data + numBlocks;  \
    if (numBlocks >= NUM_WAYS)  \
    {  \
      dataEnd -= NUM_WAYS;  \
      do  \
      {

/* Closes the unrolled loop and restores dataEnd to the real end of data. */
#define WIDE_LOOP_END  \
        data += NUM_WAYS;  \
      }  \
      while (data <= dataEnd);  \
      dataEnd += NUM_WAYS;  \
    }

/* Header of the loop for the blocks left over after the unrolled loop. */
#define SINGLE_LOOP  \
    for (; data < dataEnd; data++)


/* Capacity of the keys[] array declared by WIDE_LOOP_START_AVX.
   NOTE(review): numRounds is not checked against this limit here. */
#define NUM_AES_KEYS_MAX 15

/* 256-bit variant: runs only for at least (NUM_WAYS * 2) blocks. It declares keys[]
   and ii, executes OP, and copies p[0 .. numRounds - 1] into both 128-bit halves of
   the keys[] entries before the loop starts. */
#define WIDE_LOOP_START_AVX(OP)  \
    dataEnd = data + numBlocks;  \
    if (numBlocks >= NUM_WAYS * 2)  \
    {  \
      __m256i keys[NUM_AES_KEYS_MAX];  \
      UInt32 ii;  \
      OP  \
      for (ii = 0; ii < numRounds; ii++)  \
        keys[ii] = _mm256_broadcastsi128_si256(p[ii]);  \
      dataEnd -= NUM_WAYS * 2;  \
      do  \
      {

/* Closes the 256-bit loop, executes OP and clears the upper halves of the YMM registers. */
#define WIDE_LOOP_END_AVX(OP)  \
        data += NUM_WAYS * 2;  \
      }  \
      while (data <= dataEnd);  \
      dataEnd += NUM_WAYS * 2;  \
      OP  \
      _mm256_zeroupper();  \
    }

/* MSVC for x86: If we don't call _mm256_zeroupper(), and -arch:IA32 is not specified,
   MSVC still can insert vzeroupper instruction. */
217 
218 
AES_FUNC_START2(AesCbc_Decode_HW)219 AES_FUNC_START2 (AesCbc_Decode_HW)
220 {
221   __m128i iv = *p;
222   const __m128i *wStart = p + *(const UInt32 *)(p + 1) * 2 + 2 - 1;
223   const __m128i *dataEnd;
224   p += 2;
225 
226   WIDE_LOOP_START
227   {
228     const __m128i *w = wStart;
229 
230     WOP (DECLARE_VAR)
231     WOP (LOAD_data);
232     WOP_KEY (AES_XOR, 1)
233 
234     do
235     {
236       WOP_KEY (AES_DEC, 0)
237       w--;
238     }
239     while (w != p);
240     WOP_KEY (AES_DEC_LAST, 0)
241 
242     MM_XOR (m0, iv);
243     WOP_M1 (XOR_data_M1)
244     iv = data[NUM_WAYS - 1];
245     WOP (STORE_data);
246   }
247   WIDE_LOOP_END
248 
249   SINGLE_LOOP
250   {
251     const __m128i *w = wStart - 1;
252     __m128i m = _mm_xor_si128 (w[2], *data);
253     do
254     {
255       MM_OP_m (_mm_aesdec_si128, w[1]);
256       MM_OP_m (_mm_aesdec_si128, w[0]);
257       w -= 2;
258     }
259     while (w != p);
260     MM_OP_m (_mm_aesdec_si128,     w[1]);
261     MM_OP_m (_mm_aesdeclast_si128, w[0]);
262 
263     MM_XOR (m, iv);
264     iv = *data;
265     *data = m;
266   }
267 
268   p[-2] = iv;
269 }
270 
271 
AES_FUNC_START2(AesCtr_Code_HW)272 AES_FUNC_START2 (AesCtr_Code_HW)
273 {
274   __m128i ctr = *p;
275   UInt32 numRoundsMinus2 = *(const UInt32 *)(p + 1) * 2 - 1;
276   const __m128i *dataEnd;
277   __m128i one = _mm_cvtsi32_si128(1);
278 
279   p += 2;
280 
281   WIDE_LOOP_START
282   {
283     const __m128i *w = p;
284     UInt32 r = numRoundsMinus2;
285     WOP (DECLARE_VAR)
286     WOP (CTR_START);
287     WOP_KEY (AES_XOR, 0)
288     w += 1;
289     do
290     {
291       WOP_KEY (AES_ENC, 0)
292       w += 1;
293     }
294     while (--r);
295     WOP_KEY (AES_ENC_LAST, 0)
296 
297     WOP (CTR_END);
298   }
299   WIDE_LOOP_END
300 
301   SINGLE_LOOP
302   {
303     UInt32 numRounds2 = *(const UInt32 *)(p - 2 + 1) - 1;
304     const __m128i *w = p;
305     __m128i m;
306     MM_OP (_mm_add_epi64, ctr, one);
307     m = _mm_xor_si128 (ctr, p[0]);
308     w += 1;
309     do
310     {
311       MM_OP_m (_mm_aesenc_si128, w[0]);
312       MM_OP_m (_mm_aesenc_si128, w[1]);
313       w += 2;
314     }
315     while (--numRounds2);
316     MM_OP_m (_mm_aesenc_si128,     w[0]);
317     MM_OP_m (_mm_aesenclast_si128, w[1]);
318     MM_XOR (*data, m);
319   }
320 
321   p[-2] = ctr;
322 }
323 
324 
325 
326 #ifdef USE_INTEL_VAES
327 
#if defined(__clang__) && defined(_MSC_VER)
/* clang with MSVC compatibility: the feature macros are defined by hand before
   <immintrin.h> is included.
   NOTE(review): presumably needed so that the header exposes the AVX2 / VAES
   intrinsics in this mode - confirm against the clang headers. */
#define __SSE4_2__
#define __AES__
#define __AVX__
#define __AVX2__
#define __VAES__
#define __AVX512F__
#define __AVX512VL__
#endif

#include <immintrin.h>

/* Prototype, then the header of the definition with the VAES target attribute. */
#define VAES_FUNC_START2(funcName) \
AES_FUNC_START (funcName); \
ATTRIB_VAES \
AES_FUNC_START (funcName)
344 
VAES_FUNC_START2(AesCbc_Decode_HW_256)345 VAES_FUNC_START2 (AesCbc_Decode_HW_256)
346 {
347   __m128i iv = *p;
348   const __m128i *dataEnd;
349   UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1;
350   p += 2;
351 
352   WIDE_LOOP_START_AVX(;)
353   {
354     const __m256i *w = keys + numRounds - 2;
355 
356     WOP (AVX__DECLARE_VAR)
357     WOP (AVX__LOAD_data);
358     AVX__WOP_KEY (AVX__AES_XOR, 1)
359 
360     do
361     {
362       AVX__WOP_KEY (AVX__AES_DEC, 0)
363       w--;
364     }
365     while (w != keys);
366     AVX__WOP_KEY (AVX__AES_DEC_LAST, 0)
367 
368     AVX_XOR (m0, _mm256_setr_m128i(iv, data[0]));
369     WOP_M1 (AVX__XOR_data_M1)
370     iv = data[NUM_WAYS * 2 - 1];
371     WOP (AVX__STORE_data);
372   }
373   WIDE_LOOP_END_AVX(;)
374 
375   SINGLE_LOOP
376   {
377     const __m128i *w = p + *(const UInt32 *)(p + 1 - 2) * 2 + 1 - 3;
378     __m128i m = _mm_xor_si128 (w[2], *data);
379     do
380     {
381       MM_OP_m (_mm_aesdec_si128, w[1]);
382       MM_OP_m (_mm_aesdec_si128, w[0]);
383       w -= 2;
384     }
385     while (w != p);
386     MM_OP_m (_mm_aesdec_si128,     w[1]);
387     MM_OP_m (_mm_aesdeclast_si128, w[0]);
388 
389     MM_XOR (m, iv);
390     iv = *data;
391     *data = m;
392   }
393 
394   p[-2] = iv;
395 }
396 
397 
/*
Instructions behind the intrinsics used by the 256-bit CTR code:
SSE2: _mm_cvtsi32_si128 : movd
AVX:  _mm256_setr_m128i            : vinsertf128
AVX2: _mm256_add_epi64             : vpaddq ymm, ymm, ymm
      _mm256_extracti128_si256     : vextracti128
      _mm256_broadcastsi128_si256  : vbroadcasti128
*/

/* Setup for the 256-bit CTR loop:
     ctr2 = (ctr - one, ctr)   low half, high half
     two  = (one, one) + (one, one)
   Each AVX__CTR_START then adds two to ctr2, so the first lane gets (ctr + 1, ctr + 2). */
#define AVX__CTR_LOOP_START  \
    ctr2 = _mm256_setr_m128i(_mm_sub_epi64(ctr, one), ctr);  \
    two = _mm256_setr_m128i(one, one);  \
    two = _mm256_add_epi64(two, two);

// two = _mm256_setr_epi64x(2, 0, 2, 0);

/* After the loop: the high half of ctr2 is the last counter value that was used. */
#define AVX__CTR_LOOP_ENC  \
    ctr = _mm256_extracti128_si256 (ctr2, 1);
415 
VAES_FUNC_START2(AesCtr_Code_HW_256)416 VAES_FUNC_START2 (AesCtr_Code_HW_256)
417 {
418   __m128i ctr = *p;
419   UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1;
420   const __m128i *dataEnd;
421   __m128i one = _mm_cvtsi32_si128(1);
422   __m256i ctr2, two;
423   p += 2;
424 
425   WIDE_LOOP_START_AVX (AVX__CTR_LOOP_START)
426   {
427     const __m256i *w = keys;
428     UInt32 r = numRounds - 2;
429     WOP (AVX__DECLARE_VAR)
430     AVX__WOP_KEY (AVX__CTR_START, 0);
431 
432     w += 1;
433     do
434     {
435       AVX__WOP_KEY (AVX__AES_ENC, 0)
436       w += 1;
437     }
438     while (--r);
439     AVX__WOP_KEY (AVX__AES_ENC_LAST, 0)
440 
441     WOP (AVX__CTR_END);
442   }
443   WIDE_LOOP_END_AVX (AVX__CTR_LOOP_ENC)
444 
445   SINGLE_LOOP
446   {
447     UInt32 numRounds2 = *(const UInt32 *)(p - 2 + 1) - 1;
448     const __m128i *w = p;
449     __m128i m;
450     MM_OP (_mm_add_epi64, ctr, one);
451     m = _mm_xor_si128 (ctr, p[0]);
452     w += 1;
453     do
454     {
455       MM_OP_m (_mm_aesenc_si128, w[0]);
456       MM_OP_m (_mm_aesenc_si128, w[1]);
457       w += 2;
458     }
459     while (--numRounds2);
460     MM_OP_m (_mm_aesenc_si128,     w[0]);
461     MM_OP_m (_mm_aesenclast_si128, w[1]);
462     MM_XOR (*data, m);
463   }
464 
465   p[-2] = ctr;
466 }
467 
468 #endif // USE_INTEL_VAES
469 
470 #else // USE_INTEL_AES
471 
472 /* no USE_INTEL_AES */
473 
474 #pragma message("AES  HW_SW stub was used")
475 
476 #define AES_TYPE_keys UInt32
477 #define AES_TYPE_data Byte
478 
479 #define AES_FUNC_START(name) \
480     void MY_FAST_CALL name(UInt32 *p, Byte *data, size_t numBlocks) \
481 
482 #define AES_COMPAT_STUB(name) \
483     AES_FUNC_START(name); \
484     AES_FUNC_START(name ## _HW) \
485     { name(p, data, numBlocks); }
486 
487 AES_COMPAT_STUB (AesCbc_Encode)
488 AES_COMPAT_STUB (AesCbc_Decode)
489 AES_COMPAT_STUB (AesCtr_Code)
490 
491 #endif // USE_INTEL_AES
492 
493 
494 #ifndef USE_INTEL_VAES
495 
496 #pragma message("VAES HW_SW stub was used")
497 
498 #define VAES_COMPAT_STUB(name) \
499     void MY_FAST_CALL name ## _256(UInt32 *p, Byte *data, size_t numBlocks); \
500     void MY_FAST_CALL name ## _256(UInt32 *p, Byte *data, size_t numBlocks) \
501     { name((AES_TYPE_keys *)(void *)p, (AES_TYPE_data *)(void *)data, numBlocks); }
502 
503 VAES_COMPAT_STUB (AesCbc_Decode_HW)
504 VAES_COMPAT_STUB (AesCtr_Code_HW)
505 
506 #endif // ! USE_INTEL_VAES
507 
508 
509 #elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE)
510 
511   #if defined(__clang__)
512     #if (__clang_major__ >= 8) // fix that check
513       #define USE_HW_AES
514     #endif
515   #elif defined(__GNUC__)
516     #if (__GNUC__ >= 6) // fix that check
517       #define USE_HW_AES
518     #endif
519   #elif defined(_MSC_VER)
520     #if _MSC_VER >= 1910
521       #define USE_HW_AES
522     #endif
523   #endif
524 
525 #ifdef USE_HW_AES
526 
527 // #pragma message("=== AES HW === ")
528 
/* ATTRIB_AES: function attribute that requests the crypto extension (clang / GCC).
   For other compilers it is empty. */
#if defined(__clang__) || defined(__GNUC__)
  #if defined(MY_CPU_ARM64)
    #define ATTRIB_AES __attribute__((__target__("+crypto")))
  #else
    #define ATTRIB_AES __attribute__((__target__("fpu=crypto-neon-fp-armv8")))
  #endif
#else
  // _MSC_VER
  // for arm32
  #define _ARM_USE_NEW_NEON_INTRINSICS
#endif

#if !defined(ATTRIB_AES)
  #define ATTRIB_AES
#endif
544 
545 #if defined(_MSC_VER) && defined(MY_CPU_ARM64)
546 #include <arm64_neon.h>
547 #else
548 #include <arm_neon.h>
549 #endif
550 
/* One 16-byte block as a NEON vector of bytes. */
typedef uint8x16_t v128;

/* Common signature of the block functions:
     p         : state block, parameter block and round keys (layout is set up by the caller)
     data      : 16-byte blocks processed in place
     numBlocks : number of 16-byte blocks in data */
#define AES_FUNC_START(funcName) \
    void MY_FAST_CALL funcName(v128 *p, v128 *data, size_t numBlocks)

/* Prototype, then the header of the definition with the crypto target attribute. */
#define AES_FUNC_START2(funcName) \
AES_FUNC_START (funcName); \
ATTRIB_AES \
AES_FUNC_START (funcName)

/* dst = fn(dst, arg);   The expansion already ends with ';'. */
#define MM_OP(fn, dst, arg)   dst = fn(dst, arg);
/* The *_m forms work on the local variable m of the calling function. */
#define MM_OP_m(fn, arg)      MM_OP(fn, m, arg);
#define MM_OP1_m(fn)          m = fn(m);

#define MM_XOR( dst, arg)     MM_OP(veorq_u8, dst, arg);
#define MM_XOR_m( arg)        MM_XOR(m, arg);

/* m = vaeseq_u8(m, rk), optionally followed by m = vaesmcq_u8(m) */
#define AES_E_m(rk)     MM_OP_m (vaeseq_u8, rk);
#define AES_E_MC_m(rk)  AES_E_m (rk);  MM_OP1_m(vaesmcq_u8);
570 
571 
/* CBC-mode encryption with the ARMv8 AES instructions.
   p[0] is the chaining value: it is read on entry and receives the last ciphertext
   block on exit. The UInt32 at the start of p[1] is a count n; round keys are read
   from p[2] up to p[2 + 2 * n].
   NOTE(review): n is presumably half the number of AES rounds - confirm against the
   key setup code, which is outside this file.
   The sequence below is unrolled for n == 5; two more rounds are added for n >= 6
   and another two for n > 6. The blocks in data are encrypted in place. */
AES_FUNC_START2 (AesCbc_Encode_HW)
{
  const UInt32 numRounds2 = *(const UInt32 *)(p + 1);
  const v128 *tail = p + ((size_t)numRounds2 * 2);
  const v128 rk0 = p[2];
  const v128 rk1 = p[3];
  const v128 rk2 = p[4];
  const v128 rk3 = p[5];
  const v128 rk4 = p[6];
  const v128 rk5 = p[7];
  const v128 rk6 = p[8];
  const v128 rk7 = p[9];
  const v128 rk8 = p[10];
  const v128 rk9 = p[11];
  /* the last two round keys */
  const v128 rkBeforeLast = tail[1];
  const v128 rkLast = tail[2];
  v128 chain = *p;

  while (numBlocks != 0)
  {
    chain = veorq_u8 (chain, *data);
    chain = vaesmcq_u8 (vaeseq_u8 (chain, rk0));
    chain = vaesmcq_u8 (vaeseq_u8 (chain, rk1));
    chain = vaesmcq_u8 (vaeseq_u8 (chain, rk2));
    chain = vaesmcq_u8 (vaeseq_u8 (chain, rk3));
    chain = vaesmcq_u8 (vaeseq_u8 (chain, rk4));
    chain = vaesmcq_u8 (vaeseq_u8 (chain, rk5));
    chain = vaesmcq_u8 (vaeseq_u8 (chain, rk6));
    chain = vaesmcq_u8 (vaeseq_u8 (chain, rk7));
    chain = vaesmcq_u8 (vaeseq_u8 (chain, rk8));
    if (numRounds2 >= 6)
    {
      chain = vaesmcq_u8 (vaeseq_u8 (chain, rk9));
      chain = vaesmcq_u8 (vaeseq_u8 (chain, p[12]));
      if (numRounds2 != 6)
      {
        chain = vaesmcq_u8 (vaeseq_u8 (chain, p[13]));
        chain = vaesmcq_u8 (vaeseq_u8 (chain, p[14]));
      }
    }
    /* the final round has no vaesmcq_u8 step; the last key is applied by xor */
    chain = vaeseq_u8 (chain, rkBeforeLast);
    chain = veorq_u8 (chain, rkLast);

    *data = chain;
    data++;
    numBlocks--;
  }

  *p = chain;
}
617 
618 
/* Lane chain: WOP_n (X) expands to "X (m1, 1); ... X (m(n-1), n-1);".
   Lane m0 is added by WOP itself, so WOP_M1 (X) covers every lane except m0. */
#define WOP_1(X)
#define WOP_2(X)   WOP_1 (X)  X (m1, 1);
#define WOP_3(X)   WOP_2 (X)  X (m2, 2);
#define WOP_4(X)   WOP_3 (X)  X (m3, 3);
#define WOP_5(X)   WOP_4 (X)  X (m4, 4);
#define WOP_6(X)   WOP_5 (X)  X (m5, 5);
#define WOP_7(X)   WOP_6 (X)  X (m6, 6);
#define WOP_8(X)   WOP_7 (X)  X (m7, 7);

  /* number of blocks handled per iteration of the unrolled loops */
  #define NUM_WAYS      8
  #define WOP_M1    WOP_8

/* Applies X to all NUM_WAYS lanes: m0 .. m7. */
#define WOP(X)  X (m0, 0);  WOP_M1(X)
632 
/* Per-lane operations meant to be passed to WOP / WOP_M1 as "X (lane, index)".
   They rely on names of the calling function: data, key, ctr, one. */
#define DECLARE_VAR(x, i)  v128 x
#define LOAD_data(  x, i)  x = data[i];
#define STORE_data( x, i)  data[i] = x;
#if (NUM_WAYS > 1)
/* x ^= the block that precedes block i */
#define XOR_data_M1(x, i)  MM_XOR (x, data[i- 1]);
#endif

/* x = fn(x, key);   key is declared by WOP_KEY */
#define MM_OP_key(fn, x)  MM_OP (fn, x, key);

/* Single-block forms working on the local variable m:
   m = vaesdq_u8(m, rk), optionally followed by m = vaesimcq_u8(m) */
#define AES_D_m(rk)      MM_OP_m (vaesdq_u8, rk);
#define AES_D_IMC_m(rk)  AES_D_m (rk);  MM_OP1_m (vaesimcq_u8);

#define AES_XOR(   x, i)  MM_OP_key (veorq_u8,  x)
#define AES_D(     x, i)  MM_OP_key (vaesdq_u8, x)
#define AES_E(     x, i)  MM_OP_key (vaeseq_u8, x)

#define AES_D_IMC( x, i)  AES_D (x, i);  x = vaesimcq_u8(x)
#define AES_E_MC(  x, i)  AES_E (x, i);  x = vaesmcq_u8(x)

/* CTR mode: advance the counter and copy it into the lane; later xor the lane into block i */
#define CTR_START(x, i)  MM_OP (vaddq_u64, ctr, one);  x = vreinterpretq_u8_u64(ctr);
#define CTR_END(  x, i)  MM_XOR (data[i], x);

/* Loads w[idx] once into key and applies OP to all lanes. */
#define WOP_KEY(OP, idx) { \
    const v128 key = w[idx]; \
    WOP(OP); }
658 
/* Loop scaffolding. The macros use these names of the calling function:
   data, dataEnd, numBlocks. */

/* Opens the unrolled loop. It runs only if at least NUM_WAYS blocks are present and
   handles NUM_WAYS blocks per iteration. dataEnd is always set to (data + numBlocks). */
#define WIDE_LOOP_START  \
    dataEnd = data + numBlocks;  \
    if (numBlocks >= NUM_WAYS)  \
    {  \
      dataEnd -= NUM_WAYS;  \
      do  \
      {

/* Closes the unrolled loop and restores dataEnd to the real end of data. */
#define WIDE_LOOP_END  \
        data += NUM_WAYS;  \
      }  \
      while (data <= dataEnd);  \
      dataEnd += NUM_WAYS;  \
    }

/* Header of the loop for the blocks left over after the unrolled loop. */
#define SINGLE_LOOP  \
    for (; data < dataEnd; data++)
671 
672 
/* CBC-mode decryption with the ARMv8 AES instructions.
   On entry p[0] holds the previous ciphertext block (the IV for the first call).
   The UInt32 at the start of p[1] is a count n, and (2 * n + 1) round keys are read
   starting at p[2]; they are applied from the highest index down to index 0.
   NOTE(review): n is presumably half the number of AES rounds - confirm against the
   key setup code, which is outside this file.
   The blocks in data are decrypted in place: NUM_WAYS blocks at a time while enough
   blocks remain, then one by one. On exit p[0] holds the last ciphertext block. */
AES_FUNC_START2 (AesCbc_Decode_HW)
{
  v128 prev = *p;
  /* keyTop == (p + 2) + 2 * n - 2, so keyTop[2] is the highest round key */
  const v128 *keyTop = p + ((size_t)*(const UInt32 *)(p + 1)) * 2;
  const v128 *dataEnd;
  p += 2;

  WIDE_LOOP_START
  {
    const v128 *w = keyTop;
    WOP (DECLARE_VAR)
    WOP (LOAD_data);
    WOP_KEY (AES_D_IMC, 2)
    do
    {
      WOP_KEY (AES_D_IMC, 1)
      WOP_KEY (AES_D_IMC, 0)
      w -= 2;
    }
    while (w != p);
    /* the final round has no vaesimcq_u8 step; key 0 is applied by xor */
    WOP_KEY (AES_D,   1)
    WOP_KEY (AES_XOR, 0)
    /* CBC: xor every decrypted block with the ciphertext block that precedes it.
       The ciphertext is still in data here because the results are stored last. */
    m0 = veorq_u8 (m0, prev);
    WOP_M1 (XOR_data_M1)
    prev = data[NUM_WAYS - 1];
    WOP (STORE_data);
  }
  WIDE_LOOP_END

  SINGLE_LOOP
  {
    const v128 *w = keyTop;
    const v128 cipher = *data;
    v128 m = vaesimcq_u8 (vaesdq_u8 (cipher, w[2]));
    do
    {
      m = vaesimcq_u8 (vaesdq_u8 (m, w[1]));
      m = vaesimcq_u8 (vaesdq_u8 (m, w[0]));
      w -= 2;
    }
    while (w != p);
    m = vaesdq_u8 (m, w[1]);
    m = veorq_u8 (m, w[0]);

    *data = veorq_u8 (m, prev);
    prev = cipher;
  }

  p[-2] = prev;
}
723 
724 
/* CTR-mode processing with the ARMv8 AES instructions.
   p[0] holds the counter block, handled as two 64-bit lanes. Before each data block
   lane 0 is incremented by 1 (no carry into lane 1), the counter is encrypted and
   the result is xored into the data block. The UInt32 at the start of p[1] is a
   count n, and the round keys p[2] .. p[2 + 2 * n] are used in ascending order.
   NOTE(review): n is presumably half the number of AES rounds - confirm against the
   key setup code, which is outside this file.
   On exit p[0] holds the updated counter. */
AES_FUNC_START2 (AesCtr_Code_HW)
{
  uint64x2_t ctr = vreinterpretq_u64_u8 (*p);
  /* keyStop == (p + 2) + 2 * n - 2: the last three round keys start here */
  const v128 *keyStop = p + ((size_t)*(const UInt32 *)(p + 1)) * 2;
  const v128 *dataEnd;
  /* (1, 0): increment for lane 0 only */
  uint64x2_t one = vsetq_lane_u64 (1, vdupq_n_u64 (0), 0);
  p += 2;

  WIDE_LOOP_START
  {
    const v128 *w = p;
    WOP (DECLARE_VAR)
    WOP (CTR_START);
    do
    {
      WOP_KEY (AES_E_MC, 0)
      WOP_KEY (AES_E_MC, 1)
      w += 2;
    }
    while (w != keyStop);
    /* the final round has no vaesmcq_u8 step; the last key is applied by xor */
    WOP_KEY (AES_E_MC, 0)
    WOP_KEY (AES_E,    1)
    WOP_KEY (AES_XOR,  2)
    WOP (CTR_END);
  }
  WIDE_LOOP_END

  SINGLE_LOOP
  {
    const v128 *w = p;
    v128 m;
    ctr = vaddq_u64 (ctr, one);
    m = vreinterpretq_u8_u64 (ctr);
    do
    {
      m = vaesmcq_u8 (vaeseq_u8 (m, w[0]));
      m = vaesmcq_u8 (vaeseq_u8 (m, w[1]));
      w += 2;
    }
    while (w != keyStop);
    m = vaesmcq_u8 (vaeseq_u8 (m, w[0]));
    m = vaeseq_u8 (m, w[1]);
    m = veorq_u8 (m, w[2]);
    *data = veorq_u8 (*data, m);
  }

  p[-2] = vreinterpretq_u8_u64 (ctr);
}
773 
774 #endif // USE_HW_AES
775 
776 #endif // MY_CPU_ARM_OR_ARM64
777