; LICENSE:
; This submission to NSS is to be made available under the terms of the
; Mozilla Public License, v. 2.0. You can obtain one at
; http://mozilla.org/MPL/2.0/.
;###############################################################################
; Copyright (c) 2014, Intel Corp.
; Developers and authors:
; Shay Gueron and Vlad Krasnov
; Intel Corporation, Israel Development Centre, Haifa, Israel
; Please send feedback directly to crypto.feedback.alias@intel.com


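; Constants used by the key-expansion routines below.  The Lmask* values are
; pshufb masks that broadcast RotWord() of the relevant key word into every
; doubleword, so a single aesenclast can compute SubWord(RotWord(w)) ^ Rcon.
; Lcon1 holds the initial round constant (doubled each round with pslld);
; Lcon2 supplies the 1bh round constant that cannot be reached by doubling 80h.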
.DATA
ALIGN 16
Lmask dd 0c0f0e0dh,0c0f0e0dh,0c0f0e0dh,0c0f0e0dh
Lmask192 dd 004070605h, 004070605h, 004070605h, 004070605h
Lmask256 dd 00c0f0e0dh, 00c0f0e0dh, 00c0f0e0dh, 00c0f0e0dh
Lcon1 dd 1,1,1,1
Lcon2 dd 1bh,1bh,1bh,1bh

.CODE

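; Aliases for the Microsoft x64 calling convention: the first four integer
; arguments arrive in rcx, rdx, r8 and r9.  The bulk-cipher entry points take
; more than four arguments, so the macros below reload r8/r9 with the fifth
; and sixth arguments from the caller's stack (return address plus the 32-byte
; shadow space puts them at [rsp + 28h] and [rsp + 30h] on entry).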
ctx     textequ <rcx>
output  textequ <rdx>
input   textequ <r8>
inputLen textequ <r9d>


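; aes_rnd: apply round key i of the schedule at ctx to the eight blocks held
; in xmm0-xmm7 (one encryption round each); xmm8 is scratch for the round key.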
aes_rnd MACRO i
    movdqu  xmm8, [i*16 + ctx]
    aesenc  xmm0, xmm8
    aesenc  xmm1, xmm8
    aesenc  xmm2, xmm8
    aesenc  xmm3, xmm8
    aesenc  xmm4, xmm8
    aesenc  xmm5, xmm8
    aesenc  xmm6, xmm8
    aesenc  xmm7, xmm8
    ENDM

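; aes_last_rnd: same as aes_rnd, but performs the final round (no MixColumns).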
aes_last_rnd MACRO i
    movdqu  xmm8, [i*16 + ctx]
    aesenclast  xmm0, xmm8
    aesenclast  xmm1, xmm8
    aesenclast  xmm2, xmm8
    aesenclast  xmm3, xmm8
    aesenclast  xmm4, xmm8
    aesenclast  xmm5, xmm8
    aesenclast  xmm6, xmm8
    aesenclast  xmm7, xmm8
    ENDM

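; aes_dec_rnd: one decryption round (equivalent inverse cipher) for xmm0-xmm7.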
aes_dec_rnd MACRO i
    movdqu  xmm8, [i*16 + ctx]
    aesdec  xmm0, xmm8
    aesdec  xmm1, xmm8
    aesdec  xmm2, xmm8
    aesdec  xmm3, xmm8
    aesdec  xmm4, xmm8
    aesdec  xmm5, xmm8
    aesdec  xmm6, xmm8
    aesdec  xmm7, xmm8
    ENDM

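; aes_dec_last_rnd: final decryption round for xmm0-xmm7.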
aes_dec_last_rnd MACRO i
    movdqu  xmm8, [i*16 + ctx]
    aesdeclast  xmm0, xmm8
    aesdeclast  xmm1, xmm8
    aesdeclast  xmm2, xmm8
    aesdeclast  xmm3, xmm8
    aesdeclast  xmm4, xmm8
    aesdeclast  xmm5, xmm8
    aesdeclast  xmm6, xmm8
    aesdeclast  xmm7, xmm8
    ENDM


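; gen_aes_ecb_func: emits the body of an ECB entry point.  enc selects the
; encrypt (1) or decrypt (0) round macros, rnds is the number of rounds
; (10/12/14).  The fifth and sixth arguments (input pointer and length in
; bytes) are read from the stack, xmm6-xmm8 are saved because they are
; non-volatile under the Windows x64 ABI, then whole 8-block groups are
; processed in parallel and any remaining blocks one at a time; rax is
; zeroed on exit to signal success.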
gen_aes_ecb_func MACRO enc, rnds

LOCAL   loop8
LOCAL   loop1
LOCAL   bail

        xor     inputLen, inputLen
        mov     input,      [rsp + 1*8 + 8*4]
        mov     inputLen,   [rsp + 1*8 + 8*5]

        sub     rsp, 3*16

        movdqu  [rsp + 0*16], xmm6
        movdqu  [rsp + 1*16], xmm7
        movdqu  [rsp + 2*16], xmm8

loop8:
        cmp     inputLen, 8*16
        jb      loop1

        movdqu  xmm0, [0*16 + input]
        movdqu  xmm1, [1*16 + input]
        movdqu  xmm2, [2*16 + input]
        movdqu  xmm3, [3*16 + input]
        movdqu  xmm4, [4*16 + input]
        movdqu  xmm5, [5*16 + input]
        movdqu  xmm6, [6*16 + input]
        movdqu  xmm7, [7*16 + input]

        movdqu  xmm8, [0*16 + ctx]
        pxor    xmm0, xmm8
        pxor    xmm1, xmm8
        pxor    xmm2, xmm8
        pxor    xmm3, xmm8
        pxor    xmm4, xmm8
        pxor    xmm5, xmm8
        pxor    xmm6, xmm8
        pxor    xmm7, xmm8

IF enc eq 1
        rnd textequ <aes_rnd>
        lastrnd textequ <aes_last_rnd>
        aesinst textequ <aesenc>
        aeslastinst textequ <aesenclast>
ELSE
        rnd textequ <aes_dec_rnd>
        lastrnd textequ <aes_dec_last_rnd>
        aesinst textequ <aesdec>
        aeslastinst textequ <aesdeclast>
ENDIF

        i = 1
        WHILE i LT rnds
            rnd i
            i = i+1
            ENDM
        lastrnd rnds

        movdqu  [0*16 + output], xmm0
        movdqu  [1*16 + output], xmm1
        movdqu  [2*16 + output], xmm2
        movdqu  [3*16 + output], xmm3
        movdqu  [4*16 + output], xmm4
        movdqu  [5*16 + output], xmm5
        movdqu  [6*16 + output], xmm6
        movdqu  [7*16 + output], xmm7

        lea input, [8*16 + input]
        lea output, [8*16 + output]
        sub inputLen, 8*16
        jmp loop8

loop1:
        cmp     inputLen, 1*16
        jb      bail

        movdqu  xmm0, [input]
        movdqu  xmm7, [0*16 + ctx]
        pxor    xmm0, xmm7

        i = 1
    WHILE i LT rnds
            movdqu  xmm7, [i*16 + ctx]
            aesinst  xmm0, xmm7
            i = i+1
        ENDM
        movdqu  xmm7, [rnds*16 + ctx]
        aeslastinst xmm0, xmm7

        movdqu  [output], xmm0

        lea input, [1*16 + input]
        lea output, [1*16 + output]
        sub inputLen, 1*16
        jmp loop1

bail:
        xor rax, rax

        movdqu  xmm6, [rsp + 0*16]
        movdqu  xmm7, [rsp + 1*16]
        movdqu  xmm8, [rsp + 2*16]
        add     rsp, 3*16
        ret
ENDM

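; ECB entry points: AES-128/192/256 use 10/12/14 rounds respectively.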
intel_aes_encrypt_ecb_128 PROC
gen_aes_ecb_func 1, 10
intel_aes_encrypt_ecb_128 ENDP

intel_aes_encrypt_ecb_192 PROC
gen_aes_ecb_func 1, 12
intel_aes_encrypt_ecb_192 ENDP

intel_aes_encrypt_ecb_256 PROC
gen_aes_ecb_func 1, 14
intel_aes_encrypt_ecb_256 ENDP

intel_aes_decrypt_ecb_128 PROC
gen_aes_ecb_func 0, 10
intel_aes_decrypt_ecb_128 ENDP

intel_aes_decrypt_ecb_192 PROC
gen_aes_ecb_func 0, 12
intel_aes_decrypt_ecb_192 ENDP

intel_aes_decrypt_ecb_256 PROC
gen_aes_ecb_func 0, 14
intel_aes_decrypt_ecb_256 ENDP


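; Register aliases for the key-schedule routines: KEY is the user key, KS the
; output key schedule, ITR doubles as a loop counter and scratch pointer.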
KEY textequ <rcx>
KS  textequ <rdx>
ITR textequ <r8>

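; AES-128 key expansion.  Each iteration broadcasts RotWord(w3) of the previous
; round key into all four doublewords with pshufb/Lmask; because every column
; is then identical, aesenclast's ShiftRows is a no-op and the instruction
; yields SubWord(RotWord(w3)) ^ Rcon.  The pslldq/pxor chain propagates that
; value across the four words of the new round key.  Rcon is doubled with
; pslld each round; the 1bh constant for the ninth round comes from Lcon2.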
intel_aes_encrypt_init_128  PROC

    movdqu  xmm1, [KEY]
    movdqu  [KS], xmm1
    movdqa  xmm2, xmm1

    lea ITR, Lcon1
    movdqa  xmm0, [ITR]
    lea ITR, Lmask
    movdqa  xmm4, [ITR]

    mov ITR, 8

Lenc_128_ks_loop:
        lea KS, [16 + KS]
        dec ITR

        pshufb  xmm2, xmm4
        aesenclast  xmm2, xmm0
        pslld   xmm0, 1
        movdqa  xmm3, xmm1
        pslldq  xmm3, 4
        pxor    xmm1, xmm3
        pslldq  xmm3, 4
        pxor    xmm1, xmm3
        pslldq  xmm3, 4
        pxor    xmm1, xmm3
        pxor    xmm1, xmm2
        movdqu  [KS], xmm1
        movdqa  xmm2, xmm1

        jne Lenc_128_ks_loop

    lea ITR, Lcon2
    movdqa  xmm0, [ITR]

    pshufb  xmm2, xmm4
    aesenclast  xmm2, xmm0
    pslld   xmm0, 1
    movdqa  xmm3, xmm1
    pslldq  xmm3, 4
    pxor    xmm1, xmm3
    pslldq  xmm3, 4
    pxor    xmm1, xmm3
    pslldq  xmm3, 4
    pxor    xmm1, xmm3
    pxor    xmm1, xmm2
    movdqu  [16 + KS], xmm1
    movdqa  xmm2, xmm1

    pshufb  xmm2, xmm4
    aesenclast  xmm2, xmm0
    movdqa  xmm3, xmm1
    pslldq  xmm3, 4
    pxor    xmm1, xmm3
    pslldq  xmm3, 4
    pxor    xmm1, xmm3
    pslldq  xmm3, 4
    pxor    xmm1, xmm3
    pxor    xmm1, xmm2
    movdqu  [32 + KS], xmm1
    movdqa  xmm2, xmm1

    ret
intel_aes_encrypt_init_128  ENDP


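; AES-128 decryption key schedule: expand the encryption schedule, then swap
; the round keys end-for-end and apply aesimc (InvMixColumns) to the inner
; ones, as required by the equivalent inverse cipher used by aesdec.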
intel_aes_decrypt_init_128  PROC

    push    KS
    push    KEY

    call    intel_aes_encrypt_init_128

    pop     KEY
    pop     KS

    movdqu  xmm0, [0*16 + KS]
    movdqu  xmm1, [10*16 + KS]
    movdqu  [10*16 + KS], xmm0
    movdqu  [0*16 + KS], xmm1

    i = 1
    WHILE i LT 5
        movdqu  xmm0, [i*16 + KS]
        movdqu  xmm1, [(10-i)*16 + KS]

        aesimc  xmm0, xmm0
        aesimc  xmm1, xmm1

        movdqu  [(10-i)*16 + KS], xmm0
        movdqu  [i*16 + KS], xmm1

        i = i+1
    ENDM

    movdqu  xmm0, [5*16 + KS]
    aesimc  xmm0, xmm0
    movdqu  [5*16 + KS], xmm0
    ret
intel_aes_decrypt_init_128  ENDP


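; AES-192 key expansion.  The 24-byte key is loaded into xmm1 (words 0-3) and
; the low half of xmm3 (words 4-5); each loop iteration performs two
; SubWord/RotWord steps (pshufb with Lmask192 + aesenclast) and packs the
; resulting schedule words into 16-byte round keys with shufpd.  xmm6/xmm7
; are saved first because they are non-volatile under the Windows x64 ABI.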
intel_aes_encrypt_init_192  PROC

    sub     rsp, 16*2
    movdqu  [16*0 + rsp], xmm6
    movdqu  [16*1 + rsp], xmm7

    movdqu  xmm1, [KEY]
    mov     ITR, [16 + KEY]
    movd    xmm3, ITR

    movdqu  [KS], xmm1
    movdqa  xmm5, xmm3

    lea ITR, Lcon1
    movdqu  xmm0, [ITR]
    lea ITR, Lmask192
    movdqu  xmm4, [ITR]

    mov ITR, 4

Lenc_192_ks_loop:
        movdqa  xmm2, xmm3
        pshufb  xmm2, xmm4
        aesenclast xmm2, xmm0
        pslld   xmm0, 1

        movdqa  xmm6, xmm1
        movdqa  xmm7, xmm3
        pslldq  xmm6, 4
        pslldq  xmm7, 4
        pxor    xmm1, xmm6
        pxor    xmm3, xmm7
        pslldq  xmm6, 4
        pxor    xmm1, xmm6
        pslldq  xmm6, 4
        pxor    xmm1, xmm6
        pxor    xmm1, xmm2
        pshufd  xmm2, xmm1, 0ffh
        pxor    xmm3, xmm2

        movdqa  xmm6, xmm1
        shufpd  xmm5, xmm1, 00h
        shufpd  xmm6, xmm3, 01h

        movdqu  [16 + KS], xmm5
        movdqu  [32 + KS], xmm6

        movdqa  xmm2, xmm3
        pshufb  xmm2, xmm4
        aesenclast  xmm2, xmm0
        pslld   xmm0, 1

        movdqa  xmm6, xmm1
        movdqa  xmm7, xmm3
        pslldq  xmm6, 4
        pslldq  xmm7, 4
        pxor    xmm1, xmm6
        pxor    xmm3, xmm7
        pslldq  xmm6, 4
        pxor    xmm1, xmm6
        pslldq  xmm6, 4
        pxor    xmm1, xmm6
        pxor    xmm1, xmm2
        pshufd  xmm2, xmm1, 0ffh
        pxor    xmm3, xmm2

        movdqu  [48 + KS], xmm1
        movdqa  xmm5, xmm3

        lea KS, [48 + KS]

        dec ITR
        jnz Lenc_192_ks_loop

    movdqu  [16 + KS], xmm5

    movdqu  xmm7, [16*1 + rsp]
    movdqu  xmm6, [16*0 + rsp]
    add rsp, 16*2
    ret
intel_aes_encrypt_init_192  ENDP

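; AES-192 decryption key schedule: same reversal + aesimc transform as the
; 128-bit case, applied to the 13 round keys of the 12-round schedule.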
intel_aes_decrypt_init_192  PROC
    push    KS
    push    KEY

    call    intel_aes_encrypt_init_192

    pop     KEY
    pop     KS

    movdqu  xmm0, [0*16 + KS]
    movdqu  xmm1, [12*16 + KS]
    movdqu  [12*16 + KS], xmm0
    movdqu  [0*16 + KS], xmm1

    i = 1
    WHILE i LT 6
        movdqu  xmm0, [i*16 + KS]
        movdqu  xmm1, [(12-i)*16 + KS]

        aesimc  xmm0, xmm0
        aesimc  xmm1, xmm1

        movdqu  [(12-i)*16 + KS], xmm0
        movdqu  [i*16 + KS], xmm1

        i = i+1
    ENDM

    movdqu  xmm0, [6*16 + KS]
    aesimc  xmm0, xmm0
    movdqu  [6*16 + KS], xmm0
    ret
intel_aes_decrypt_init_192  ENDP


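; AES-256 key expansion.  The 32-byte key occupies xmm1 and xmm3.  Each loop
; iteration derives two round keys: the even step uses pshufb/Lmask256 and
; aesenclast to get SubWord(RotWord(w)) ^ Rcon, while the odd step broadcasts
; the last word with pshufd and runs aesenclast against an all-zero key (xmm6)
; to get SubWord() with no rotation and no round constant.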
intel_aes_encrypt_init_256  PROC
    sub     rsp, 16*2
    movdqu  [16*0 + rsp], xmm6
    movdqu  [16*1 + rsp], xmm7

    movdqu  xmm1, [16*0 + KEY]
    movdqu  xmm3, [16*1 + KEY]

    movdqu  [16*0 + KS], xmm1
    movdqu  [16*1 + KS], xmm3

    lea ITR, Lcon1
    movdqu  xmm0, [ITR]
    lea ITR, Lmask256
    movdqu  xmm5, [ITR]

    pxor    xmm6, xmm6

    mov ITR, 6

Lenc_256_ks_loop:

        movdqa  xmm2, xmm3
        pshufb  xmm2, xmm5
        aesenclast  xmm2, xmm0
        pslld   xmm0, 1
        movdqa  xmm4, xmm1
        pslldq  xmm4, 4
        pxor    xmm1, xmm4
        pslldq  xmm4, 4
        pxor    xmm1, xmm4
        pslldq  xmm4, 4
        pxor    xmm1, xmm4
        pxor    xmm1, xmm2
        movdqu  [16*2 + KS], xmm1

        pshufd  xmm2, xmm1, 0ffh
        aesenclast  xmm2, xmm6
        movdqa  xmm4, xmm3
        pslldq  xmm4, 4
        pxor    xmm3, xmm4
        pslldq  xmm4, 4
        pxor    xmm3, xmm4
        pslldq  xmm4, 4
        pxor    xmm3, xmm4
        pxor    xmm3, xmm2
        movdqu  [16*3 + KS], xmm3

        lea KS, [32 + KS]
        dec ITR
        jnz Lenc_256_ks_loop

    movdqa  xmm2, xmm3
    pshufb  xmm2, xmm5
    aesenclast  xmm2, xmm0
    movdqa  xmm4, xmm1
    pslldq  xmm4, 4
    pxor    xmm1, xmm4
    pslldq  xmm4, 4
    pxor    xmm1, xmm4
    pslldq  xmm4, 4
    pxor    xmm1, xmm4
    pxor    xmm1, xmm2
    movdqu  [16*2 + KS], xmm1

    movdqu  xmm7, [16*1 + rsp]
    movdqu  xmm6, [16*0 + rsp]
    add rsp, 16*2
    ret

intel_aes_encrypt_init_256  ENDP


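; AES-256 decryption key schedule: reverse the 15 round keys and apply aesimc
; to the 13 inner ones.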
intel_aes_decrypt_init_256  PROC
    push    KS
    push    KEY

    call    intel_aes_encrypt_init_256

    pop     KEY
    pop     KS

    movdqu  xmm0, [0*16 + KS]
    movdqu  xmm1, [14*16 + KS]
    movdqu  [14*16 + KS], xmm0
    movdqu  [0*16 + KS], xmm1

    i = 1
    WHILE i LT 7
        movdqu  xmm0, [i*16 + KS]
        movdqu  xmm1, [(14-i)*16 + KS]

        aesimc  xmm0, xmm0
        aesimc  xmm1, xmm1

        movdqu  [(14-i)*16 + KS], xmm0
        movdqu  [i*16 + KS], xmm1

        i = i+1
    ENDM

    movdqu  xmm0, [7*16 + KS]
    aesimc  xmm0, xmm0
    movdqu  [7*16 + KS], xmm0
    ret
intel_aes_decrypt_init_256  ENDP



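; gen_aes_cbc_enc_func: CBC encryption is inherently serial, so blocks are
; processed one at a time.  The chaining value (the IV, then the previous
; ciphertext) lives at ctx+256 and is kept in xmm0 across iterations; the
; first six round keys are cached in xmm2-xmm7 to cut memory traffic.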
gen_aes_cbc_enc_func MACRO rnds

LOCAL   loop1
LOCAL   bail

        mov     input,      [rsp + 1*8 + 8*4]
        mov     inputLen,   [rsp + 1*8 + 8*5]

        sub     rsp, 3*16

        movdqu  [rsp + 0*16], xmm6
        movdqu  [rsp + 1*16], xmm7
        movdqu  [rsp + 2*16], xmm8

        movdqu  xmm0, [256+ctx]

        movdqu  xmm2, [0*16 + ctx]
        movdqu  xmm3, [1*16 + ctx]
        movdqu  xmm4, [2*16 + ctx]
        movdqu  xmm5, [3*16 + ctx]
        movdqu  xmm6, [4*16 + ctx]
        movdqu  xmm7, [5*16 + ctx]

loop1:
        cmp     inputLen, 1*16
        jb      bail

        movdqu  xmm1, [input]
        pxor    xmm1, xmm2
        pxor    xmm0, xmm1

        aesenc  xmm0, xmm3
        aesenc  xmm0, xmm4
        aesenc  xmm0, xmm5
        aesenc  xmm0, xmm6
        aesenc  xmm0, xmm7

        i = 6
    WHILE i LT rnds
            movdqu  xmm8, [i*16 + ctx]
            aesenc  xmm0, xmm8
            i = i+1
        ENDM
        movdqu  xmm8, [rnds*16 + ctx]
        aesenclast xmm0, xmm8

        movdqu  [output], xmm0

        lea input, [1*16 + input]
        lea output, [1*16 + output]
        sub inputLen, 1*16
        jmp loop1

bail:
        movdqu  [256+ctx], xmm0

        xor rax, rax

        movdqu  xmm6, [rsp + 0*16]
        movdqu  xmm7, [rsp + 1*16]
        movdqu  xmm8, [rsp + 2*16]
        add     rsp, 3*16
        ret

ENDM

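; gen_aes_cbc_dec_func: unlike encryption, CBC decryption parallelises, so
; eight blocks are decrypted at once and each result is then xored with the
; preceding ciphertext block (or with the chaining value stored at ctx+256).
; The last ciphertext block of each group becomes the new chaining value; a
; single-block tail loop handles any remainder.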
gen_aes_cbc_dec_func MACRO rnds

LOCAL   loop8
LOCAL   loop1
LOCAL   dec1
LOCAL   bail

        mov     input,      [rsp + 1*8 + 8*4]
        mov     inputLen,   [rsp + 1*8 + 8*5]

        sub     rsp, 3*16

        movdqu  [rsp + 0*16], xmm6
        movdqu  [rsp + 1*16], xmm7
        movdqu  [rsp + 2*16], xmm8

loop8:
        cmp     inputLen, 8*16
        jb      dec1

        movdqu  xmm0, [0*16 + input]
        movdqu  xmm1, [1*16 + input]
        movdqu  xmm2, [2*16 + input]
        movdqu  xmm3, [3*16 + input]
        movdqu  xmm4, [4*16 + input]
        movdqu  xmm5, [5*16 + input]
        movdqu  xmm6, [6*16 + input]
        movdqu  xmm7, [7*16 + input]

        movdqu  xmm8, [0*16 + ctx]
        pxor    xmm0, xmm8
        pxor    xmm1, xmm8
        pxor    xmm2, xmm8
        pxor    xmm3, xmm8
        pxor    xmm4, xmm8
        pxor    xmm5, xmm8
        pxor    xmm6, xmm8
        pxor    xmm7, xmm8

        i = 1
        WHILE i LT rnds
            aes_dec_rnd i
            i = i+1
            ENDM
        aes_dec_last_rnd rnds

        movdqu  xmm8, [256 + ctx]
        pxor    xmm0, xmm8
        movdqu  xmm8, [0*16 + input]
        pxor    xmm1, xmm8
        movdqu  xmm8, [1*16 + input]
        pxor    xmm2, xmm8
        movdqu  xmm8, [2*16 + input]
        pxor    xmm3, xmm8
        movdqu  xmm8, [3*16 + input]
        pxor    xmm4, xmm8
        movdqu  xmm8, [4*16 + input]
        pxor    xmm5, xmm8
        movdqu  xmm8, [5*16 + input]
        pxor    xmm6, xmm8
        movdqu  xmm8, [6*16 + input]
        pxor    xmm7, xmm8
        movdqu  xmm8, [7*16 + input]

        movdqu  [0*16 + output], xmm0
        movdqu  [1*16 + output], xmm1
        movdqu  [2*16 + output], xmm2
        movdqu  [3*16 + output], xmm3
        movdqu  [4*16 + output], xmm4
        movdqu  [5*16 + output], xmm5
        movdqu  [6*16 + output], xmm6
        movdqu  [7*16 + output], xmm7
        movdqu  [256 + ctx], xmm8

        lea input, [8*16 + input]
        lea output, [8*16 + output]
        sub inputLen, 8*16
        jmp loop8
dec1:

        movdqu  xmm3, [256 + ctx]

loop1:
        cmp     inputLen, 1*16
        jb      bail

        movdqu  xmm0, [input]
        movdqa  xmm4, xmm0
        movdqu  xmm7, [0*16 + ctx]
        pxor    xmm0, xmm7

        i = 1
    WHILE i LT rnds
            movdqu  xmm7, [i*16 + ctx]
            aesdec  xmm0, xmm7
            i = i+1
        ENDM
        movdqu  xmm7, [rnds*16 + ctx]
        aesdeclast xmm0, xmm7
        pxor    xmm3, xmm0

        movdqu  [output], xmm3
        movdqa  xmm3, xmm4

        lea input, [1*16 + input]
        lea output, [1*16 + output]
        sub inputLen, 1*16
        jmp loop1

bail:
        movdqu  [256 + ctx], xmm3
        xor rax, rax

        movdqu  xmm6, [rsp + 0*16]
        movdqu  xmm7, [rsp + 1*16]
        movdqu  xmm8, [rsp + 2*16]
        add     rsp, 3*16
        ret
ENDM

intel_aes_encrypt_cbc_128 PROC
gen_aes_cbc_enc_func  10
intel_aes_encrypt_cbc_128 ENDP

intel_aes_encrypt_cbc_192 PROC
gen_aes_cbc_enc_func  12
intel_aes_encrypt_cbc_192 ENDP

intel_aes_encrypt_cbc_256 PROC
gen_aes_cbc_enc_func  14
intel_aes_encrypt_cbc_256 ENDP

intel_aes_decrypt_cbc_128 PROC
gen_aes_cbc_dec_func  10
intel_aes_decrypt_cbc_128 ENDP

intel_aes_decrypt_cbc_192 PROC
gen_aes_cbc_dec_func  12
intel_aes_decrypt_cbc_192 ENDP

intel_aes_decrypt_cbc_256 PROC
gen_aes_cbc_dec_func  14
intel_aes_decrypt_cbc_256 ENDP



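; CTR mode.  rcx points to the caller's CTR context: the code loads the
; expanded-key context pointer from offset 8 and the 16-byte (big-endian)
; counter block from offset 16.  Eight counter blocks, already xored with
; round key 0, are kept in a 16-byte-aligned buffer on the stack; the counter
; word is incremented with bswap arithmetic and patched directly into that
; buffer (re-xored with the last dword of round key 0), and the updates for
; the next batch are interleaved with the first eight AES rounds of the
; current one to hide latency.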
ctrCtx textequ <r10>
CTR textequ <r11d>
CTRSave textequ <eax>

gen_aes_ctr_func MACRO rnds

LOCAL   loop8
LOCAL   loop1
LOCAL   enc1
LOCAL   bail

        mov     input,      [rsp + 8*1 + 4*8]
        mov     inputLen,   [rsp + 8*1 + 5*8]

        mov     ctrCtx, ctx
        mov     ctx, [8+ctrCtx]

        sub     rsp, 3*16
        movdqu  [rsp + 0*16], xmm6
        movdqu  [rsp + 1*16], xmm7
        movdqu  [rsp + 2*16], xmm8


        push    rbp
        mov     rbp, rsp
        sub     rsp, 8*16
        and     rsp, -16


        movdqu  xmm0, [16+ctrCtx]
        mov     CTRSave, DWORD PTR [ctrCtx + 16 + 3*4]
        bswap   CTRSave
        movdqu  xmm1, [ctx + 0*16]

        pxor    xmm0, xmm1

        movdqa  [rsp + 0*16], xmm0
        movdqa  [rsp + 1*16], xmm0
        movdqa  [rsp + 2*16], xmm0
        movdqa  [rsp + 3*16], xmm0
        movdqa  [rsp + 4*16], xmm0
        movdqa  [rsp + 5*16], xmm0
        movdqa  [rsp + 6*16], xmm0
        movdqa  [rsp + 7*16], xmm0

        inc     CTRSave
        mov     CTR, CTRSave
        bswap   CTR
        xor     CTR, DWORD PTR [ctx + 3*4]
        mov     DWORD PTR [rsp + 1*16 + 3*4], CTR

        inc     CTRSave
        mov     CTR, CTRSave
        bswap   CTR
        xor     CTR, DWORD PTR [ctx + 3*4]
        mov     DWORD PTR [rsp + 2*16 + 3*4], CTR

        inc     CTRSave
        mov     CTR, CTRSave
        bswap   CTR
        xor     CTR, DWORD PTR [ctx + 3*4]
        mov     DWORD PTR [rsp + 3*16 + 3*4], CTR

        inc     CTRSave
        mov     CTR, CTRSave
        bswap   CTR
        xor     CTR, DWORD PTR [ctx + 3*4]
        mov     DWORD PTR [rsp + 4*16 + 3*4], CTR

        inc     CTRSave
        mov     CTR, CTRSave
        bswap   CTR
        xor     CTR, DWORD PTR [ctx + 3*4]
        mov     DWORD PTR [rsp + 5*16 + 3*4], CTR

        inc     CTRSave
        mov     CTR, CTRSave
        bswap   CTR
        xor     CTR, DWORD PTR [ctx + 3*4]
        mov     DWORD PTR [rsp + 6*16 + 3*4], CTR

        inc     CTRSave
        mov     CTR, CTRSave
        bswap   CTR
        xor     CTR, DWORD PTR [ctx + 3*4]
        mov     DWORD PTR [rsp + 7*16 + 3*4], CTR


loop8:
        cmp     inputLen, 8*16
        jb      loop1

        movdqu  xmm0, [0*16 + rsp]
        movdqu  xmm1, [1*16 + rsp]
        movdqu  xmm2, [2*16 + rsp]
        movdqu  xmm3, [3*16 + rsp]
        movdqu  xmm4, [4*16 + rsp]
        movdqu  xmm5, [5*16 + rsp]
        movdqu  xmm6, [6*16 + rsp]
        movdqu  xmm7, [7*16 + rsp]

        i = 1
        WHILE i LE 8
            aes_rnd i

            inc     CTRSave
            mov     CTR, CTRSave
            bswap   CTR
            xor     CTR, DWORD PTR [ctx + 3*4]
            mov     DWORD PTR [rsp + (i-1)*16 + 3*4], CTR

            i = i+1
        ENDM
        WHILE i LT rnds
            aes_rnd i
            i = i+1
            ENDM
        aes_last_rnd rnds

        movdqu  xmm8, [0*16 + input]
        pxor    xmm0, xmm8
        movdqu  xmm8, [1*16 + input]
        pxor    xmm1, xmm8
        movdqu  xmm8, [2*16 + input]
        pxor    xmm2, xmm8
        movdqu  xmm8, [3*16 + input]
        pxor    xmm3, xmm8
        movdqu  xmm8, [4*16 + input]
        pxor    xmm4, xmm8
        movdqu  xmm8, [5*16 + input]
        pxor    xmm5, xmm8
        movdqu  xmm8, [6*16 + input]
        pxor    xmm6, xmm8
        movdqu  xmm8, [7*16 + input]
        pxor    xmm7, xmm8

        movdqu  [0*16 + output], xmm0
        movdqu  [1*16 + output], xmm1
        movdqu  [2*16 + output], xmm2
        movdqu  [3*16 + output], xmm3
        movdqu  [4*16 + output], xmm4
        movdqu  [5*16 + output], xmm5
        movdqu  [6*16 + output], xmm6
        movdqu  [7*16 + output], xmm7

        lea input, [8*16 + input]
        lea output, [8*16 + output]
        sub inputLen, 8*16
        jmp loop8


loop1:
        cmp     inputLen, 1*16
        jb      bail

        movdqu  xmm0, [rsp]
        add     rsp, 16

        i = 1
    WHILE i LT rnds
            movdqu  xmm7, [i*16 + ctx]
            aesenc  xmm0, xmm7
            i = i+1
        ENDM
        movdqu  xmm7, [rnds*16 + ctx]
        aesenclast xmm0, xmm7

        movdqu  xmm7, [input]
        pxor    xmm0, xmm7
        movdqu  [output], xmm0

        lea input, [1*16 + input]
        lea output, [1*16 + output]
        sub inputLen, 1*16
        jmp loop1

bail:

        movdqu  xmm0, [rsp]
        movdqu  xmm1, [ctx + 0*16]
        pxor    xmm0, xmm1
        movdqu  [16+ctrCtx], xmm0


        xor     rax, rax
        mov     rsp, rbp
        pop     rbp

        movdqu  xmm6, [rsp + 0*16]
        movdqu  xmm7, [rsp + 1*16]
        movdqu  xmm8, [rsp + 2*16]
        add     rsp, 3*16

        ret
ENDM


intel_aes_encrypt_ctr_128 PROC
gen_aes_ctr_func  10
intel_aes_encrypt_ctr_128 ENDP

intel_aes_encrypt_ctr_192 PROC
gen_aes_ctr_func  12
intel_aes_encrypt_ctr_192 ENDP

intel_aes_encrypt_ctr_256 PROC
gen_aes_ctr_func  14
intel_aes_encrypt_ctr_256 ENDP


END