1 /* This Source Code Form is subject to the terms of the Mozilla Public
2  * License, v. 2.0. If a copy of the MPL was not distributed with this
3  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
4 
5 #include "secerr.h"
6 #include "rijndael.h"
7 
8 #if ((defined(__clang__) ||                                         \
9       (defined(__GNUC__) && defined(__GNUC_MINOR__) &&              \
10        (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 8)))) && \
11      defined(IS_LITTLE_ENDIAN))
12 
13 #ifndef __ARM_FEATURE_CRYPTO
14 #error "Compiler option is invalid"
15 #endif
16 
17 #include <arm_neon.h>
18 
19 SECStatus
arm_aes_encrypt_ecb_128(AESContext * cx,unsigned char * output,unsigned int * outputLen,unsigned int maxOutputLen,const unsigned char * input,unsigned int inputLen,unsigned int blocksize)20 arm_aes_encrypt_ecb_128(AESContext *cx, unsigned char *output,
21                         unsigned int *outputLen,
22                         unsigned int maxOutputLen,
23                         const unsigned char *input,
24                         unsigned int inputLen,
25                         unsigned int blocksize)
26 {
27 #if !defined(HAVE_UNALIGNED_ACCESS)
28     pre_align unsigned char buf[16] post_align;
29 #endif
30     uint8x16_t key1, key2, key3, key4, key5, key6, key7, key8, key9, key10;
31     uint8x16_t key11;
32     const PRUint8 *key = (const PRUint8 *)cx->k.expandedKey;
33 
34     if (!inputLen) {
35         return SECSuccess;
36     }
37 
38     key1 = vld1q_u8(key);
39     key2 = vld1q_u8(key + 16);
40     key3 = vld1q_u8(key + 32);
41     key4 = vld1q_u8(key + 48);
42     key5 = vld1q_u8(key + 64);
43     key6 = vld1q_u8(key + 80);
44     key7 = vld1q_u8(key + 96);
45     key8 = vld1q_u8(key + 112);
46     key9 = vld1q_u8(key + 128);
47     key10 = vld1q_u8(key + 144);
48     key11 = vld1q_u8(key + 160);
49 
50     while (inputLen > 0) {
51         uint8x16_t state;
52 #if defined(HAVE_UNALIGNED_ACCESS)
53         state = vld1q_u8(input);
54 #else
55         if ((uintptr_t)input & 0x7) {
56             memcpy(buf, input, 16);
57             state = vld1q_u8(__builtin_assume_aligned(buf, 16));
58         } else {
59             state = vld1q_u8(__builtin_assume_aligned(input, 8));
60         }
61 #endif
62         input += 16;
63         inputLen -= 16;
64 
65         /* Rounds */
66         state = vaeseq_u8(state, key1);
67         state = vaesmcq_u8(state);
68         state = vaeseq_u8(state, key2);
69         state = vaesmcq_u8(state);
70         state = vaeseq_u8(state, key3);
71         state = vaesmcq_u8(state);
72         state = vaeseq_u8(state, key4);
73         state = vaesmcq_u8(state);
74         state = vaeseq_u8(state, key5);
75         state = vaesmcq_u8(state);
76         state = vaeseq_u8(state, key6);
77         state = vaesmcq_u8(state);
78         state = vaeseq_u8(state, key7);
79         state = vaesmcq_u8(state);
80         state = vaeseq_u8(state, key8);
81         state = vaesmcq_u8(state);
82         state = vaeseq_u8(state, key9);
83         state = vaesmcq_u8(state);
84         state = vaeseq_u8(state, key10);
85         /* AddRoundKey */
86         state = veorq_u8(state, key11);
87 
88 #if defined(HAVE_UNALIGNED_ACCESS)
89         vst1q_u8(output, state);
90 #else
91         if ((uintptr_t)output & 0x7) {
92             vst1q_u8(__builtin_assume_aligned(buf, 16), state);
93             memcpy(output, buf, 16);
94         } else {
95             vst1q_u8(__builtin_assume_aligned(output, 8), state);
96         }
97 #endif
98         output += 16;
99     }
100 
101     return SECSuccess;
102 }
103 
104 SECStatus
arm_aes_decrypt_ecb_128(AESContext * cx,unsigned char * output,unsigned int * outputLen,unsigned int maxOutputLen,const unsigned char * input,unsigned int inputLen,unsigned int blocksize)105 arm_aes_decrypt_ecb_128(AESContext *cx, unsigned char *output,
106                         unsigned int *outputLen,
107                         unsigned int maxOutputLen,
108                         const unsigned char *input,
109                         unsigned int inputLen,
110                         unsigned int blocksize)
111 {
112 #if !defined(HAVE_UNALIGNED_ACCESS)
113     pre_align unsigned char buf[16] post_align;
114 #endif
115     uint8x16_t key1, key2, key3, key4, key5, key6, key7, key8, key9, key10;
116     uint8x16_t key11;
117     const PRUint8 *key = (const PRUint8 *)cx->k.expandedKey;
118 
119     if (inputLen == 0) {
120         return SECSuccess;
121     }
122 
123     key1 = vld1q_u8(key);
124     key2 = vld1q_u8(key + 16);
125     key3 = vld1q_u8(key + 32);
126     key4 = vld1q_u8(key + 48);
127     key5 = vld1q_u8(key + 64);
128     key6 = vld1q_u8(key + 80);
129     key7 = vld1q_u8(key + 96);
130     key8 = vld1q_u8(key + 112);
131     key9 = vld1q_u8(key + 128);
132     key10 = vld1q_u8(key + 144);
133     key11 = vld1q_u8(key + 160);
134 
135     while (inputLen > 0) {
136         uint8x16_t state;
137 #if defined(HAVE_UNALIGNED_ACCESS)
138         state = vld1q_u8(input);
139 #else
140         if ((uintptr_t)input & 0x7) {
141             memcpy(buf, input, 16);
142             state = vld1q_u8(__builtin_assume_aligned(buf, 16));
143         } else {
144             state = vld1q_u8(__builtin_assume_aligned(input, 8));
145         }
146 #endif
147         input += 16;
148         inputLen -= 16;
149 
150         /* Rounds */
151         state = vaesdq_u8(state, key11);
152         state = vaesimcq_u8(state);
153         state = vaesdq_u8(state, key10);
154         state = vaesimcq_u8(state);
155         state = vaesdq_u8(state, key9);
156         state = vaesimcq_u8(state);
157         state = vaesdq_u8(state, key8);
158         state = vaesimcq_u8(state);
159         state = vaesdq_u8(state, key7);
160         state = vaesimcq_u8(state);
161         state = vaesdq_u8(state, key6);
162         state = vaesimcq_u8(state);
163         state = vaesdq_u8(state, key5);
164         state = vaesimcq_u8(state);
165         state = vaesdq_u8(state, key4);
166         state = vaesimcq_u8(state);
167         state = vaesdq_u8(state, key3);
168         state = vaesimcq_u8(state);
169         state = vaesdq_u8(state, key2);
170         /* AddRoundKey */
171         state = veorq_u8(state, key1);
172 
173 #if defined(HAVE_UNALIGNED_ACCESS)
174         vst1q_u8(output, state);
175 #else
176         if ((uintptr_t)output & 0x7) {
177             vst1q_u8(__builtin_assume_aligned(buf, 16), state);
178             memcpy(output, buf, 16);
179         } else {
180             vst1q_u8(__builtin_assume_aligned(output, 8), state);
181         }
182 #endif
183         output += 16;
184     }
185 
186     return SECSuccess;
187 }
188 
189 SECStatus
arm_aes_encrypt_cbc_128(AESContext * cx,unsigned char * output,unsigned int * outputLen,unsigned int maxOutputLen,const unsigned char * input,unsigned int inputLen,unsigned int blocksize)190 arm_aes_encrypt_cbc_128(AESContext *cx, unsigned char *output,
191                         unsigned int *outputLen,
192                         unsigned int maxOutputLen,
193                         const unsigned char *input,
194                         unsigned int inputLen,
195                         unsigned int blocksize)
196 {
197 #if !defined(HAVE_UNALIGNED_ACCESS)
198     pre_align unsigned char buf[16] post_align;
199 #endif
200     uint8x16_t key1, key2, key3, key4, key5, key6, key7, key8, key9, key10;
201     uint8x16_t key11;
202     uint8x16_t iv;
203     const PRUint8 *key = (const PRUint8 *)cx->k.expandedKey;
204 
205     if (!inputLen) {
206         return SECSuccess;
207     }
208 
209     /* iv */
210     iv = vld1q_u8(cx->iv);
211 
212     /* expanedKey */
213     key1 = vld1q_u8(key);
214     key2 = vld1q_u8(key + 16);
215     key3 = vld1q_u8(key + 32);
216     key4 = vld1q_u8(key + 48);
217     key5 = vld1q_u8(key + 64);
218     key6 = vld1q_u8(key + 80);
219     key7 = vld1q_u8(key + 96);
220     key8 = vld1q_u8(key + 112);
221     key9 = vld1q_u8(key + 128);
222     key10 = vld1q_u8(key + 144);
223     key11 = vld1q_u8(key + 160);
224 
225     while (inputLen > 0) {
226         uint8x16_t state;
227 #if defined(HAVE_UNALIGNED_ACCESS)
228         state = vld1q_u8(input);
229 #else
230         if ((uintptr_t)input & 0x7) {
231             memcpy(buf, input, 16);
232             state = vld1q_u8(__builtin_assume_aligned(buf, 16));
233         } else {
234             state = vld1q_u8(__builtin_assume_aligned(input, 8));
235         }
236 #endif
237         input += 16;
238         inputLen -= 16;
239 
240         state = veorq_u8(state, iv);
241 
242         /* Rounds */
243         state = vaeseq_u8(state, key1);
244         state = vaesmcq_u8(state);
245         state = vaeseq_u8(state, key2);
246         state = vaesmcq_u8(state);
247         state = vaeseq_u8(state, key3);
248         state = vaesmcq_u8(state);
249         state = vaeseq_u8(state, key4);
250         state = vaesmcq_u8(state);
251         state = vaeseq_u8(state, key5);
252         state = vaesmcq_u8(state);
253         state = vaeseq_u8(state, key6);
254         state = vaesmcq_u8(state);
255         state = vaeseq_u8(state, key7);
256         state = vaesmcq_u8(state);
257         state = vaeseq_u8(state, key8);
258         state = vaesmcq_u8(state);
259         state = vaeseq_u8(state, key9);
260         state = vaesmcq_u8(state);
261         state = vaeseq_u8(state, key10);
262         /* AddRoundKey */
263         state = veorq_u8(state, key11);
264 
265 #if defined(HAVE_UNALIGNED_ACCESS)
266         vst1q_u8(output, state);
267 #else
268         if ((uintptr_t)output & 0x7) {
269             vst1q_u8(__builtin_assume_aligned(buf, 16), state);
270             memcpy(output, buf, 16);
271         } else {
272             vst1q_u8(__builtin_assume_aligned(output, 8), state);
273         }
274 #endif
275         output += 16;
276         iv = state;
277     }
278     vst1q_u8(cx->iv, iv);
279 
280     return SECSuccess;
281 }
282 
283 SECStatus
arm_aes_decrypt_cbc_128(AESContext * cx,unsigned char * output,unsigned int * outputLen,unsigned int maxOutputLen,const unsigned char * input,unsigned int inputLen,unsigned int blocksize)284 arm_aes_decrypt_cbc_128(AESContext *cx, unsigned char *output,
285                         unsigned int *outputLen,
286                         unsigned int maxOutputLen,
287                         const unsigned char *input,
288                         unsigned int inputLen,
289                         unsigned int blocksize)
290 {
291 #if !defined(HAVE_UNALIGNED_ACCESS)
292     pre_align unsigned char buf[16] post_align;
293 #endif
294     uint8x16_t iv;
295     uint8x16_t key1, key2, key3, key4, key5, key6, key7, key8, key9, key10;
296     uint8x16_t key11;
297     const PRUint8 *key = (const PRUint8 *)cx->k.expandedKey;
298 
299     if (!inputLen) {
300         return SECSuccess;
301     }
302 
303     /* iv */
304     iv = vld1q_u8(cx->iv);
305 
306     /* expanedKey */
307     key1 = vld1q_u8(key);
308     key2 = vld1q_u8(key + 16);
309     key3 = vld1q_u8(key + 32);
310     key4 = vld1q_u8(key + 48);
311     key5 = vld1q_u8(key + 64);
312     key6 = vld1q_u8(key + 80);
313     key7 = vld1q_u8(key + 96);
314     key8 = vld1q_u8(key + 112);
315     key9 = vld1q_u8(key + 128);
316     key10 = vld1q_u8(key + 144);
317     key11 = vld1q_u8(key + 160);
318 
319     while (inputLen > 0) {
320         uint8x16_t state, old_state;
321 #if defined(HAVE_UNALIGNED_ACCESS)
322         state = vld1q_u8(input);
323 #else
324         if ((uintptr_t)input & 0x7) {
325             memcpy(buf, input, 16);
326             state = vld1q_u8(__builtin_assume_aligned(buf, 16));
327         } else {
328             state = vld1q_u8(__builtin_assume_aligned(input, 8));
329         }
330 #endif
331         old_state = state;
332         input += 16;
333         inputLen -= 16;
334 
335         /* Rounds */
336         state = vaesdq_u8(state, key11);
337         state = vaesimcq_u8(state);
338         state = vaesdq_u8(state, key10);
339         state = vaesimcq_u8(state);
340         state = vaesdq_u8(state, key9);
341         state = vaesimcq_u8(state);
342         state = vaesdq_u8(state, key8);
343         state = vaesimcq_u8(state);
344         state = vaesdq_u8(state, key7);
345         state = vaesimcq_u8(state);
346         state = vaesdq_u8(state, key6);
347         state = vaesimcq_u8(state);
348         state = vaesdq_u8(state, key5);
349         state = vaesimcq_u8(state);
350         state = vaesdq_u8(state, key4);
351         state = vaesimcq_u8(state);
352         state = vaesdq_u8(state, key3);
353         state = vaesimcq_u8(state);
354         state = vaesdq_u8(state, key2);
355         /* AddRoundKey */
356         state = veorq_u8(state, key1);
357 
358         state = veorq_u8(state, iv);
359 
360 #if defined(HAVE_UNALIGNED_ACCESS)
361         vst1q_u8(output, state);
362 #else
363         if ((uintptr_t)output & 0x7) {
364             vst1q_u8(__builtin_assume_aligned(buf, 16), state);
365             memcpy(output, buf, 16);
366         } else {
367             vst1q_u8(__builtin_assume_aligned(output, 8), state);
368         }
369 #endif
370         output += 16;
371 
372         iv = old_state;
373     }
374     vst1q_u8(cx->iv, iv);
375 
376     return SECSuccess;
377 }
378 
379 SECStatus
arm_aes_encrypt_ecb_192(AESContext * cx,unsigned char * output,unsigned int * outputLen,unsigned int maxOutputLen,const unsigned char * input,unsigned int inputLen,unsigned int blocksize)380 arm_aes_encrypt_ecb_192(AESContext *cx, unsigned char *output,
381                         unsigned int *outputLen,
382                         unsigned int maxOutputLen,
383                         const unsigned char *input,
384                         unsigned int inputLen,
385                         unsigned int blocksize)
386 {
387 #if !defined(HAVE_UNALIGNED_ACCESS)
388     pre_align unsigned char buf[16] post_align;
389 #endif
390     uint8x16_t key1, key2, key3, key4, key5, key6, key7, key8, key9, key10;
391     uint8x16_t key11, key12, key13;
392     PRUint8 *key = (PRUint8 *)cx->k.expandedKey;
393 
394     if (!inputLen) {
395         return SECSuccess;
396     }
397 
398     key1 = vld1q_u8(key);
399     key2 = vld1q_u8(key + 16);
400     key3 = vld1q_u8(key + 32);
401     key4 = vld1q_u8(key + 48);
402     key5 = vld1q_u8(key + 64);
403     key6 = vld1q_u8(key + 80);
404     key7 = vld1q_u8(key + 96);
405     key8 = vld1q_u8(key + 112);
406     key9 = vld1q_u8(key + 128);
407     key10 = vld1q_u8(key + 144);
408     key11 = vld1q_u8(key + 160);
409     key12 = vld1q_u8(key + 176);
410     key13 = vld1q_u8(key + 192);
411 
412     while (inputLen > 0) {
413         uint8x16_t state;
414 #if defined(HAVE_UNALIGNED_ACCESS)
415         state = vld1q_u8(input);
416 #else
417         if ((uintptr_t)input & 0x7) {
418             memcpy(buf, input, 16);
419             state = vld1q_u8(__builtin_assume_aligned(buf, 16));
420         } else {
421             state = vld1q_u8(__builtin_assume_aligned(input, 8));
422         }
423 #endif
424         input += 16;
425         inputLen -= 16;
426 
427         /* Rounds */
428         state = vaeseq_u8(state, key1);
429         state = vaesmcq_u8(state);
430         state = vaeseq_u8(state, key2);
431         state = vaesmcq_u8(state);
432         state = vaeseq_u8(state, key3);
433         state = vaesmcq_u8(state);
434         state = vaeseq_u8(state, key4);
435         state = vaesmcq_u8(state);
436         state = vaeseq_u8(state, key5);
437         state = vaesmcq_u8(state);
438         state = vaeseq_u8(state, key6);
439         state = vaesmcq_u8(state);
440         state = vaeseq_u8(state, key7);
441         state = vaesmcq_u8(state);
442         state = vaeseq_u8(state, key8);
443         state = vaesmcq_u8(state);
444         state = vaeseq_u8(state, key9);
445         state = vaesmcq_u8(state);
446         state = vaeseq_u8(state, key10);
447         state = vaesmcq_u8(state);
448         state = vaeseq_u8(state, key11);
449         state = vaesmcq_u8(state);
450         state = vaeseq_u8(state, key12);
451         /* AddRoundKey */
452         state = veorq_u8(state, key13);
453 
454 #if defined(HAVE_UNALIGNED_ACCESS)
455         vst1q_u8(output, state);
456 #else
457         if ((uintptr_t)output & 0x7) {
458             vst1q_u8(__builtin_assume_aligned(buf, 16), state);
459             memcpy(output, buf, 16);
460         } else {
461             vst1q_u8(__builtin_assume_aligned(output, 8), state);
462         }
463 #endif
464         output += 16;
465     }
466 
467     return SECSuccess;
468 }
469 
470 SECStatus
arm_aes_decrypt_ecb_192(AESContext * cx,unsigned char * output,unsigned int * outputLen,unsigned int maxOutputLen,const unsigned char * input,unsigned int inputLen,unsigned int blocksize)471 arm_aes_decrypt_ecb_192(AESContext *cx, unsigned char *output,
472                         unsigned int *outputLen,
473                         unsigned int maxOutputLen,
474                         const unsigned char *input,
475                         unsigned int inputLen,
476                         unsigned int blocksize)
477 {
478 #if !defined(HAVE_UNALIGNED_ACCESS)
479     pre_align unsigned char buf[16] post_align;
480 #endif
481     uint8x16_t key1, key2, key3, key4, key5, key6, key7, key8, key9, key10;
482     uint8x16_t key11, key12, key13;
483     const PRUint8 *key = (const PRUint8 *)cx->k.expandedKey;
484 
485     if (!inputLen) {
486         return SECSuccess;
487     }
488 
489     key1 = vld1q_u8(key);
490     key2 = vld1q_u8(key + 16);
491     key3 = vld1q_u8(key + 32);
492     key4 = vld1q_u8(key + 48);
493     key5 = vld1q_u8(key + 64);
494     key6 = vld1q_u8(key + 80);
495     key7 = vld1q_u8(key + 96);
496     key8 = vld1q_u8(key + 112);
497     key9 = vld1q_u8(key + 128);
498     key10 = vld1q_u8(key + 144);
499     key11 = vld1q_u8(key + 160);
500     key12 = vld1q_u8(key + 176);
501     key13 = vld1q_u8(key + 192);
502 
503     while (inputLen > 0) {
504         uint8x16_t state;
505 #if defined(HAVE_UNALIGNED_ACCESS)
506         state = vld1q_u8(input);
507 #else
508         if ((uintptr_t)input & 0x7) {
509             memcpy(buf, input, 16);
510             state = vld1q_u8(__builtin_assume_aligned(buf, 16));
511         } else {
512             state = vld1q_u8(__builtin_assume_aligned(input, 8));
513         }
514 #endif
515         input += 16;
516         inputLen -= 16;
517 
518         /* Rounds */
519         state = vaesdq_u8(state, key13);
520         state = vaesimcq_u8(state);
521         state = vaesdq_u8(state, key12);
522         state = vaesimcq_u8(state);
523         state = vaesdq_u8(state, key11);
524         state = vaesimcq_u8(state);
525         state = vaesdq_u8(state, key10);
526         state = vaesimcq_u8(state);
527         state = vaesdq_u8(state, key9);
528         state = vaesimcq_u8(state);
529         state = vaesdq_u8(state, key8);
530         state = vaesimcq_u8(state);
531         state = vaesdq_u8(state, key7);
532         state = vaesimcq_u8(state);
533         state = vaesdq_u8(state, key6);
534         state = vaesimcq_u8(state);
535         state = vaesdq_u8(state, key5);
536         state = vaesimcq_u8(state);
537         state = vaesdq_u8(state, key4);
538         state = vaesimcq_u8(state);
539         state = vaesdq_u8(state, key3);
540         state = vaesimcq_u8(state);
541         state = vaesdq_u8(state, key2);
542         /* AddRoundKey */
543         state = veorq_u8(state, key1);
544 
545 #if defined(HAVE_UNALIGNED_ACCESS)
546         vst1q_u8(output, state);
547 #else
548         if ((uintptr_t)output & 0x7) {
549             vst1q_u8(__builtin_assume_aligned(buf, 16), state);
550             memcpy(output, buf, 16);
551         } else {
552             vst1q_u8(__builtin_assume_aligned(output, 8), state);
553         }
554 #endif
555         output += 16;
556     }
557 
558     return SECSuccess;
559 }
560 
561 SECStatus
arm_aes_encrypt_cbc_192(AESContext * cx,unsigned char * output,unsigned int * outputLen,unsigned int maxOutputLen,const unsigned char * input,unsigned int inputLen,unsigned int blocksize)562 arm_aes_encrypt_cbc_192(AESContext *cx, unsigned char *output,
563                         unsigned int *outputLen,
564                         unsigned int maxOutputLen,
565                         const unsigned char *input,
566                         unsigned int inputLen,
567                         unsigned int blocksize)
568 {
569 #if !defined(HAVE_UNALIGNED_ACCESS)
570     pre_align unsigned char buf[16] post_align;
571 #endif
572     uint8x16_t key1, key2, key3, key4, key5, key6, key7, key8, key9, key10;
573     uint8x16_t key11, key12, key13;
574     uint8x16_t iv;
575     PRUint8 *key = (PRUint8 *)cx->k.expandedKey;
576 
577     if (!inputLen) {
578         return SECSuccess;
579     }
580 
581     /* iv */
582     iv = vld1q_u8(cx->iv);
583 
584     /* expanedKey */
585     key1 = vld1q_u8(key);
586     key2 = vld1q_u8(key + 16);
587     key3 = vld1q_u8(key + 32);
588     key4 = vld1q_u8(key + 48);
589     key5 = vld1q_u8(key + 64);
590     key6 = vld1q_u8(key + 80);
591     key7 = vld1q_u8(key + 96);
592     key8 = vld1q_u8(key + 112);
593     key9 = vld1q_u8(key + 128);
594     key10 = vld1q_u8(key + 144);
595     key11 = vld1q_u8(key + 160);
596     key12 = vld1q_u8(key + 176);
597     key13 = vld1q_u8(key + 192);
598 
599     while (inputLen > 0) {
600         uint8x16_t state;
601 #if defined(HAVE_UNALIGNED_ACCESS)
602         state = vld1q_u8(input);
603 #else
604         if ((uintptr_t)input & 0x7) {
605             memcpy(buf, input, 16);
606             state = vld1q_u8(__builtin_assume_aligned(buf, 16));
607         } else {
608             state = vld1q_u8(__builtin_assume_aligned(input, 8));
609         }
610 #endif
611         input += 16;
612         inputLen -= 16;
613 
614         state = veorq_u8(state, iv);
615 
616         /* Rounds */
617         state = vaeseq_u8(state, key1);
618         state = vaesmcq_u8(state);
619         state = vaeseq_u8(state, key2);
620         state = vaesmcq_u8(state);
621         state = vaeseq_u8(state, key3);
622         state = vaesmcq_u8(state);
623         state = vaeseq_u8(state, key4);
624         state = vaesmcq_u8(state);
625         state = vaeseq_u8(state, key5);
626         state = vaesmcq_u8(state);
627         state = vaeseq_u8(state, key6);
628         state = vaesmcq_u8(state);
629         state = vaeseq_u8(state, key7);
630         state = vaesmcq_u8(state);
631         state = vaeseq_u8(state, key8);
632         state = vaesmcq_u8(state);
633         state = vaeseq_u8(state, key9);
634         state = vaesmcq_u8(state);
635         state = vaeseq_u8(state, key10);
636         state = vaesmcq_u8(state);
637         state = vaeseq_u8(state, key11);
638         state = vaesmcq_u8(state);
639         state = vaeseq_u8(state, key12);
640         state = veorq_u8(state, key13);
641 
642 #if defined(HAVE_UNALIGNED_ACCESS)
643         vst1q_u8(output, state);
644 #else
645         if ((uintptr_t)output & 0x7) {
646             vst1q_u8(__builtin_assume_aligned(buf, 16), state);
647             memcpy(output, buf, 16);
648         } else {
649             vst1q_u8(__builtin_assume_aligned(output, 8), state);
650         }
651 #endif
652         output += 16;
653         iv = state;
654     }
655     vst1q_u8(cx->iv, iv);
656 
657     return SECSuccess;
658 }
659 
660 SECStatus
arm_aes_decrypt_cbc_192(AESContext * cx,unsigned char * output,unsigned int * outputLen,unsigned int maxOutputLen,const unsigned char * input,unsigned int inputLen,unsigned int blocksize)661 arm_aes_decrypt_cbc_192(AESContext *cx, unsigned char *output,
662                         unsigned int *outputLen,
663                         unsigned int maxOutputLen,
664                         const unsigned char *input,
665                         unsigned int inputLen,
666                         unsigned int blocksize)
667 {
668 #if !defined(HAVE_UNALIGNED_ACCESS)
669     pre_align unsigned char buf[16] post_align;
670 #endif
671     uint8x16_t iv;
672     uint8x16_t key1, key2, key3, key4, key5, key6, key7, key8, key9, key10;
673     uint8x16_t key11, key12, key13;
674     const PRUint8 *key = (const PRUint8 *)cx->k.expandedKey;
675 
676     if (!inputLen) {
677         return SECSuccess;
678     }
679 
680     /* iv */
681     iv = vld1q_u8(cx->iv);
682 
683     /* expanedKey */
684     key1 = vld1q_u8(key);
685     key2 = vld1q_u8(key + 16);
686     key3 = vld1q_u8(key + 32);
687     key4 = vld1q_u8(key + 48);
688     key5 = vld1q_u8(key + 64);
689     key6 = vld1q_u8(key + 80);
690     key7 = vld1q_u8(key + 96);
691     key8 = vld1q_u8(key + 112);
692     key9 = vld1q_u8(key + 128);
693     key10 = vld1q_u8(key + 144);
694     key11 = vld1q_u8(key + 160);
695     key12 = vld1q_u8(key + 176);
696     key13 = vld1q_u8(key + 192);
697 
698     while (inputLen > 0) {
699         uint8x16_t state, old_state;
700 #if defined(HAVE_UNALIGNED_ACCESS)
701         state = vld1q_u8(input);
702 #else
703         if ((uintptr_t)input & 0x7) {
704             memcpy(buf, input, 16);
705             state = vld1q_u8(__builtin_assume_aligned(buf, 16));
706         } else {
707             state = vld1q_u8(__builtin_assume_aligned(input, 8));
708         }
709 #endif
710         old_state = state;
711         input += 16;
712         inputLen -= 16;
713 
714         /* Rounds */
715         state = vaesdq_u8(state, key13);
716         state = vaesimcq_u8(state);
717         state = vaesdq_u8(state, key12);
718         state = vaesimcq_u8(state);
719         state = vaesdq_u8(state, key11);
720         state = vaesimcq_u8(state);
721         state = vaesdq_u8(state, key10);
722         state = vaesimcq_u8(state);
723         state = vaesdq_u8(state, key9);
724         state = vaesimcq_u8(state);
725         state = vaesdq_u8(state, key8);
726         state = vaesimcq_u8(state);
727         state = vaesdq_u8(state, key7);
728         state = vaesimcq_u8(state);
729         state = vaesdq_u8(state, key6);
730         state = vaesimcq_u8(state);
731         state = vaesdq_u8(state, key5);
732         state = vaesimcq_u8(state);
733         state = vaesdq_u8(state, key4);
734         state = vaesimcq_u8(state);
735         state = vaesdq_u8(state, key3);
736         state = vaesimcq_u8(state);
737         state = vaesdq_u8(state, key2);
738         /* AddRoundKey */
739         state = veorq_u8(state, key1);
740 
741         state = veorq_u8(state, iv);
742 
743 #if defined(HAVE_UNALIGNED_ACCESS)
744         vst1q_u8(output, state);
745 #else
746         if ((uintptr_t)output & 0x7) {
747             vst1q_u8(__builtin_assume_aligned(buf, 16), state);
748             memcpy(output, buf, 16);
749         } else {
750             vst1q_u8(__builtin_assume_aligned(output, 8), state);
751         }
752 #endif
753         output += 16;
754 
755         iv = old_state;
756     }
757     vst1q_u8(cx->iv, iv);
758 
759     return SECSuccess;
760 }
761 
762 SECStatus
arm_aes_encrypt_ecb_256(AESContext * cx,unsigned char * output,unsigned int * outputLen,unsigned int maxOutputLen,const unsigned char * input,unsigned int inputLen,unsigned int blocksize)763 arm_aes_encrypt_ecb_256(AESContext *cx, unsigned char *output,
764                         unsigned int *outputLen,
765                         unsigned int maxOutputLen,
766                         const unsigned char *input,
767                         unsigned int inputLen,
768                         unsigned int blocksize)
769 {
770 #if !defined(HAVE_UNALIGNED_ACCESS)
771     pre_align unsigned char buf[16] post_align;
772 #endif
773     uint8x16_t key1, key2, key3, key4, key5, key6, key7, key8, key9, key10;
774     uint8x16_t key11, key12, key13, key14, key15;
775     PRUint8 *key = (PRUint8 *)cx->k.expandedKey;
776 
777     if (inputLen == 0) {
778         return SECSuccess;
779     }
780 
781     key1 = vld1q_u8(key);
782     key2 = vld1q_u8(key + 16);
783     key3 = vld1q_u8(key + 32);
784     key4 = vld1q_u8(key + 48);
785     key5 = vld1q_u8(key + 64);
786     key6 = vld1q_u8(key + 80);
787     key7 = vld1q_u8(key + 96);
788     key8 = vld1q_u8(key + 112);
789     key9 = vld1q_u8(key + 128);
790     key10 = vld1q_u8(key + 144);
791     key11 = vld1q_u8(key + 160);
792     key12 = vld1q_u8(key + 176);
793     key13 = vld1q_u8(key + 192);
794     key14 = vld1q_u8(key + 208);
795     key15 = vld1q_u8(key + 224);
796 
797     while (inputLen > 0) {
798         uint8x16_t state;
799 #if defined(HAVE_UNALIGNED_ACCESS)
800         state = vld1q_u8(input);
801 #else
802         if ((uintptr_t)input & 0x7) {
803             memcpy(buf, input, 16);
804             state = vld1q_u8(__builtin_assume_aligned(buf, 16));
805         } else {
806             state = vld1q_u8(__builtin_assume_aligned(input, 8));
807         }
808 #endif
809         input += 16;
810         inputLen -= 16;
811 
812         /* Rounds */
813         state = vaeseq_u8(state, key1);
814         state = vaesmcq_u8(state);
815         state = vaeseq_u8(state, key2);
816         state = vaesmcq_u8(state);
817         state = vaeseq_u8(state, key3);
818         state = vaesmcq_u8(state);
819         state = vaeseq_u8(state, key4);
820         state = vaesmcq_u8(state);
821         state = vaeseq_u8(state, key5);
822         state = vaesmcq_u8(state);
823         state = vaeseq_u8(state, key6);
824         state = vaesmcq_u8(state);
825         state = vaeseq_u8(state, key7);
826         state = vaesmcq_u8(state);
827         state = vaeseq_u8(state, key8);
828         state = vaesmcq_u8(state);
829         state = vaeseq_u8(state, key9);
830         state = vaesmcq_u8(state);
831         state = vaeseq_u8(state, key10);
832         state = vaesmcq_u8(state);
833         state = vaeseq_u8(state, key11);
834         state = vaesmcq_u8(state);
835         state = vaeseq_u8(state, key12);
836         state = vaesmcq_u8(state);
837         state = vaeseq_u8(state, key13);
838         state = vaesmcq_u8(state);
839         state = vaeseq_u8(state, key14);
840         /* AddRoundKey */
841         state = veorq_u8(state, key15);
842 
843 #if defined(HAVE_UNALIGNED_ACCESS)
844         vst1q_u8(output, state);
845 #else
846         if ((uintptr_t)output & 0x7) {
847             vst1q_u8(__builtin_assume_aligned(buf, 16), state);
848             memcpy(output, buf, 16);
849         } else {
850             vst1q_u8(__builtin_assume_aligned(output, 8), state);
851         }
852 #endif
853         output += 16;
854     }
855     return SECSuccess;
856 }
857 
858 SECStatus
arm_aes_decrypt_ecb_256(AESContext * cx,unsigned char * output,unsigned int * outputLen,unsigned int maxOutputLen,const unsigned char * input,unsigned int inputLen,unsigned int blocksize)859 arm_aes_decrypt_ecb_256(AESContext *cx, unsigned char *output,
860                         unsigned int *outputLen,
861                         unsigned int maxOutputLen,
862                         const unsigned char *input,
863                         unsigned int inputLen,
864                         unsigned int blocksize)
865 {
866 #if !defined(HAVE_UNALIGNED_ACCESS)
867     pre_align unsigned char buf[16] post_align;
868 #endif
869     uint8x16_t key1, key2, key3, key4, key5, key6, key7, key8, key9, key10;
870     uint8x16_t key11, key12, key13, key14, key15;
871     const PRUint8 *key = (const PRUint8 *)cx->k.expandedKey;
872 
873     if (!inputLen) {
874         return SECSuccess;
875     }
876 
877     key1 = vld1q_u8(key);
878     key2 = vld1q_u8(key + 16);
879     key3 = vld1q_u8(key + 32);
880     key4 = vld1q_u8(key + 48);
881     key5 = vld1q_u8(key + 64);
882     key6 = vld1q_u8(key + 80);
883     key7 = vld1q_u8(key + 96);
884     key8 = vld1q_u8(key + 112);
885     key9 = vld1q_u8(key + 128);
886     key10 = vld1q_u8(key + 144);
887     key11 = vld1q_u8(key + 160);
888     key12 = vld1q_u8(key + 176);
889     key13 = vld1q_u8(key + 192);
890     key14 = vld1q_u8(key + 208);
891     key15 = vld1q_u8(key + 224);
892 
893     while (inputLen > 0) {
894         uint8x16_t state;
895 #if defined(HAVE_UNALIGNED_ACCESS)
896         state = vld1q_u8(input);
897 #else
898         if ((uintptr_t)input & 0x7) {
899             memcpy(buf, input, 16);
900             state = vld1q_u8(__builtin_assume_aligned(buf, 16));
901         } else {
902             state = vld1q_u8(__builtin_assume_aligned(input, 8));
903         }
904 #endif
905         input += 16;
906         inputLen -= 16;
907 
908         /* Rounds */
909         state = vaesdq_u8(state, key15);
910         state = vaesimcq_u8(state);
911         state = vaesdq_u8(state, key14);
912         state = vaesimcq_u8(state);
913         state = vaesdq_u8(state, key13);
914         state = vaesimcq_u8(state);
915         state = vaesdq_u8(state, key12);
916         state = vaesimcq_u8(state);
917         state = vaesdq_u8(state, key11);
918         state = vaesimcq_u8(state);
919         state = vaesdq_u8(state, key10);
920         state = vaesimcq_u8(state);
921         state = vaesdq_u8(state, key9);
922         state = vaesimcq_u8(state);
923         state = vaesdq_u8(state, key8);
924         state = vaesimcq_u8(state);
925         state = vaesdq_u8(state, key7);
926         state = vaesimcq_u8(state);
927         state = vaesdq_u8(state, key6);
928         state = vaesimcq_u8(state);
929         state = vaesdq_u8(state, key5);
930         state = vaesimcq_u8(state);
931         state = vaesdq_u8(state, key4);
932         state = vaesimcq_u8(state);
933         state = vaesdq_u8(state, key3);
934         state = vaesimcq_u8(state);
935         state = vaesdq_u8(state, key2);
936         /* AddRoundKey */
937         state = veorq_u8(state, key1);
938 
939 #if defined(HAVE_UNALIGNED_ACCESS)
940         vst1q_u8(output, state);
941 #else
942         if ((uintptr_t)output & 0x7) {
943             vst1q_u8(__builtin_assume_aligned(buf, 16), state);
944             memcpy(output, buf, 16);
945         } else {
946             vst1q_u8(__builtin_assume_aligned(output, 8), state);
947         }
948 #endif
949         output += 16;
950     }
951 
952     return SECSuccess;
953 }
954 
955 SECStatus
arm_aes_encrypt_cbc_256(AESContext * cx,unsigned char * output,unsigned int * outputLen,unsigned int maxOutputLen,const unsigned char * input,unsigned int inputLen,unsigned int blocksize)956 arm_aes_encrypt_cbc_256(AESContext *cx, unsigned char *output,
957                         unsigned int *outputLen,
958                         unsigned int maxOutputLen,
959                         const unsigned char *input,
960                         unsigned int inputLen,
961                         unsigned int blocksize)
962 {
963 #if !defined(HAVE_UNALIGNED_ACCESS)
964     pre_align unsigned char buf[16] post_align;
965 #endif
966     uint8x16_t key1, key2, key3, key4, key5, key6, key7, key8, key9, key10;
967     uint8x16_t key11, key12, key13, key14, key15;
968     uint8x16_t iv;
969     const PRUint8 *key = (const PRUint8 *)cx->k.expandedKey;
970 
971     if (!inputLen) {
972         return SECSuccess;
973     }
974 
975     /* iv */
976     iv = vld1q_u8(cx->iv);
977 
978     /* expanedKey */
979     key1 = vld1q_u8(key);
980     key2 = vld1q_u8(key + 16);
981     key3 = vld1q_u8(key + 32);
982     key4 = vld1q_u8(key + 48);
983     key5 = vld1q_u8(key + 64);
984     key6 = vld1q_u8(key + 80);
985     key7 = vld1q_u8(key + 96);
986     key8 = vld1q_u8(key + 112);
987     key9 = vld1q_u8(key + 128);
988     key10 = vld1q_u8(key + 144);
989     key11 = vld1q_u8(key + 160);
990     key12 = vld1q_u8(key + 176);
991     key13 = vld1q_u8(key + 192);
992     key14 = vld1q_u8(key + 208);
993     key15 = vld1q_u8(key + 224);
994 
995     while (inputLen > 0) {
996         uint8x16_t state;
997 #if defined(HAVE_UNALIGNED_ACCESS)
998         state = vld1q_u8(input);
999 #else
1000         if ((uintptr_t)input & 0x7) {
1001             memcpy(buf, input, 16);
1002             state = vld1q_u8(__builtin_assume_aligned(buf, 16));
1003         } else {
1004             state = vld1q_u8(__builtin_assume_aligned(input, 8));
1005         }
1006 #endif
1007         input += 16;
1008         inputLen -= 16;
1009 
1010         state = veorq_u8(state, iv);
1011 
1012         /* Rounds */
1013         state = vaeseq_u8(state, key1);
1014         state = vaesmcq_u8(state);
1015         state = vaeseq_u8(state, key2);
1016         state = vaesmcq_u8(state);
1017         state = vaeseq_u8(state, key3);
1018         state = vaesmcq_u8(state);
1019         state = vaeseq_u8(state, key4);
1020         state = vaesmcq_u8(state);
1021         state = vaeseq_u8(state, key5);
1022         state = vaesmcq_u8(state);
1023         state = vaeseq_u8(state, key6);
1024         state = vaesmcq_u8(state);
1025         state = vaeseq_u8(state, key7);
1026         state = vaesmcq_u8(state);
1027         state = vaeseq_u8(state, key8);
1028         state = vaesmcq_u8(state);
1029         state = vaeseq_u8(state, key9);
1030         state = vaesmcq_u8(state);
1031         state = vaeseq_u8(state, key10);
1032         state = vaesmcq_u8(state);
1033         state = vaeseq_u8(state, key11);
1034         state = vaesmcq_u8(state);
1035         state = vaeseq_u8(state, key12);
1036         state = vaesmcq_u8(state);
1037         state = vaeseq_u8(state, key13);
1038         state = vaesmcq_u8(state);
1039         state = vaeseq_u8(state, key14);
1040         /* AddRoundKey */
1041         state = veorq_u8(state, key15);
1042 
1043 #if defined(HAVE_UNALIGNED_ACCESS)
1044         vst1q_u8(output, state);
1045 #else
1046         if ((uintptr_t)output & 0x7) {
1047             vst1q_u8(__builtin_assume_aligned(buf, 16), state);
1048             memcpy(output, buf, 16);
1049         } else {
1050             vst1q_u8(__builtin_assume_aligned(output, 8), state);
1051         }
1052 #endif
1053         output += 16;
1054         iv = state;
1055     }
1056     vst1q_u8(cx->iv, iv);
1057 
1058     return SECSuccess;
1059 }
1060 
1061 SECStatus
arm_aes_decrypt_cbc_256(AESContext * cx,unsigned char * output,unsigned int * outputLen,unsigned int maxOutputLen,const unsigned char * input,unsigned int inputLen,unsigned int blocksize)1062 arm_aes_decrypt_cbc_256(AESContext *cx, unsigned char *output,
1063                         unsigned int *outputLen,
1064                         unsigned int maxOutputLen,
1065                         const unsigned char *input,
1066                         unsigned int inputLen,
1067                         unsigned int blocksize)
1068 {
1069 #if !defined(HAVE_UNALIGNED_ACCESS)
1070     pre_align unsigned char buf[16] post_align;
1071 #endif
1072     uint8x16_t iv;
1073     uint8x16_t key1, key2, key3, key4, key5, key6, key7, key8, key9, key10;
1074     uint8x16_t key11, key12, key13, key14, key15;
1075     const PRUint8 *key = (const PRUint8 *)cx->k.expandedKey;
1076 
1077     if (!inputLen) {
1078         return SECSuccess;
1079     }
1080 
1081     /* iv */
1082     iv = vld1q_u8(cx->iv);
1083 
1084     /* expanedKey */
1085     key1 = vld1q_u8(key);
1086     key2 = vld1q_u8(key + 16);
1087     key3 = vld1q_u8(key + 32);
1088     key4 = vld1q_u8(key + 48);
1089     key5 = vld1q_u8(key + 64);
1090     key6 = vld1q_u8(key + 80);
1091     key7 = vld1q_u8(key + 96);
1092     key8 = vld1q_u8(key + 112);
1093     key9 = vld1q_u8(key + 128);
1094     key10 = vld1q_u8(key + 144);
1095     key11 = vld1q_u8(key + 160);
1096     key12 = vld1q_u8(key + 176);
1097     key13 = vld1q_u8(key + 192);
1098     key14 = vld1q_u8(key + 208);
1099     key15 = vld1q_u8(key + 224);
1100 
1101     while (inputLen > 0) {
1102         uint8x16_t state, old_state;
1103 #if defined(HAVE_UNALIGNED_ACCESS)
1104         state = vld1q_u8(input);
1105 #else
1106         if ((uintptr_t)input & 0x7) {
1107             memcpy(buf, input, 16);
1108             state = vld1q_u8(__builtin_assume_aligned(buf, 16));
1109         } else {
1110             state = vld1q_u8(__builtin_assume_aligned(input, 8));
1111         }
1112 #endif
1113         old_state = state;
1114         input += 16;
1115         inputLen -= 16;
1116 
1117         /* Rounds */
1118         state = vaesdq_u8(state, key15);
1119         state = vaesimcq_u8(state);
1120         state = vaesdq_u8(state, key14);
1121         state = vaesimcq_u8(state);
1122         state = vaesdq_u8(state, key13);
1123         state = vaesimcq_u8(state);
1124         state = vaesdq_u8(state, key12);
1125         state = vaesimcq_u8(state);
1126         state = vaesdq_u8(state, key11);
1127         state = vaesimcq_u8(state);
1128         state = vaesdq_u8(state, key10);
1129         state = vaesimcq_u8(state);
1130         state = vaesdq_u8(state, key9);
1131         state = vaesimcq_u8(state);
1132         state = vaesdq_u8(state, key8);
1133         state = vaesimcq_u8(state);
1134         state = vaesdq_u8(state, key7);
1135         state = vaesimcq_u8(state);
1136         state = vaesdq_u8(state, key6);
1137         state = vaesimcq_u8(state);
1138         state = vaesdq_u8(state, key5);
1139         state = vaesimcq_u8(state);
1140         state = vaesdq_u8(state, key4);
1141         state = vaesimcq_u8(state);
1142         state = vaesdq_u8(state, key3);
1143         state = vaesimcq_u8(state);
1144         state = vaesdq_u8(state, key2);
1145         /* AddRoundKey */
1146         state = veorq_u8(state, key1);
1147 
1148         state = veorq_u8(state, iv);
1149 
1150 #if defined(HAVE_UNALIGNED_ACCESS)
1151         vst1q_u8(output, state);
1152 #else
1153         if ((uintptr_t)output & 0x7) {
1154             vst1q_u8(__builtin_assume_aligned(buf, 16), state);
1155             memcpy(output, buf, 16);
1156         } else {
1157             vst1q_u8(__builtin_assume_aligned(output, 8), state);
1158         }
1159 #endif
1160         output += 16;
1161 
1162         iv = old_state;
1163     }
1164     vst1q_u8(cx->iv, iv);
1165 
1166     return SECSuccess;
1167 }
1168 
1169 #endif
1170