1; LICENSE:
2; This submission to NSS is to be made available under the terms of the
; Mozilla Public License, v. 2.0. You can obtain one at
; http://mozilla.org/MPL/2.0/.
5;###############################################################################
6; Copyright(c) 2014, Intel Corp.
7; Developers and authors:
8; Shay Gueron and Vlad Krasnov
9; Intel Corporation, Israel Development Centre, Haifa, Israel
10; Please send feedback directly to crypto.feedback.alias@intel.com
11
12
.DATA
ALIGN 16
; pshufb masks and round-constant vectors used by the key-expansion
; routines below.
; Lmask / Lmask256 broadcast bytes {13,14,15,12} into every dword, i.e.
; RotWord of the last dword of the previous round key.
; Lmask192 broadcasts bytes {5,6,7,4}, i.e. RotWord of dword 1 (the
; AES-192 schedule keeps its newest word there).
Lmask dd 0c0f0e0dh,0c0f0e0dh,0c0f0e0dh,0c0f0e0dh
Lmask192 dd 004070605h, 004070605h, 004070605h, 004070605h
Lmask256 dd 00c0f0e0dh, 00c0f0e0dh, 00c0f0e0dh, 00c0f0e0dh
; AES round constants: Lcon1 seeds the doubling sequence 1,2,4,...
; (advanced with pslld); Lcon2 restarts it at 1bh for the later rounds.
Lcon1 dd 1,1,1,1
Lcon2 dd 1bh,1bh,1bh,1bh
20
.CODE

; Win64 argument-register aliases for the bulk cipher entry points.
; ctx/output arrive in rcx/rdx; the input pointer and length are the
; 5th/6th arguments and are reloaded from the stack into r8/r9d by the
; macro bodies (the original r8/r9 argument values are not read).
ctx     textequ <rcx>
output  textequ <rdx>
input   textequ <r8>
inputLen textequ <r9d>
27
28
;-----------------------------------------------------------------------
; aes_rnd i
; One AES encryption round over the 8 block registers xmm0..xmm7,
; using round key i fetched from the schedule at [ctx].
; Clobbers xmm8.
;-----------------------------------------------------------------------
aes_rnd MACRO i
    movdqu  xmm8, [i*16 + ctx]
    FOR blk, <xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7>
        aesenc  blk, xmm8
    ENDM
    ENDM
40
;-----------------------------------------------------------------------
; aes_last_rnd i
; Final AES encryption round (no MixColumns) over xmm0..xmm7, using
; round key i fetched from the schedule at [ctx].
; Clobbers xmm8.
;-----------------------------------------------------------------------
aes_last_rnd MACRO i
    movdqu  xmm8, [i*16 + ctx]
    FOR blk, <xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7>
        aesenclast  blk, xmm8
    ENDM
    ENDM
52
;-----------------------------------------------------------------------
; aes_dec_rnd i
; One AES decryption round over the 8 block registers xmm0..xmm7,
; using (InvMixColumns-transformed) round key i from [ctx].
; Clobbers xmm8.
;-----------------------------------------------------------------------
aes_dec_rnd MACRO i
    movdqu  xmm8, [i*16 + ctx]
    FOR blk, <xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7>
        aesdec  blk, xmm8
    ENDM
    ENDM
64
;-----------------------------------------------------------------------
; aes_dec_last_rnd i
; Final AES decryption round over xmm0..xmm7, using round key i
; from [ctx].
; Clobbers xmm8.
;-----------------------------------------------------------------------
aes_dec_last_rnd MACRO i
    movdqu  xmm8, [i*16 + ctx]
    FOR blk, <xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7>
        aesdeclast  blk, xmm8
    ENDM
    ENDM
76
77
;-----------------------------------------------------------------------
; gen_aes_ecb_func enc, rnds
;
; Emits the body of a bulk AES-ECB routine (Win64 ABI).
;   enc  = 1 to expand encryption rounds, 0 for decryption
;   rnds = 10/12/14 for AES-128/192/256
;
; In:   ctx (rcx)      context; round keys start at ctx+48
;       output (rdx)   destination buffer
;       input (r8)     reloaded below from the 5th stack argument
;       inputLen (r9d) reloaded below from the 6th stack argument
; Out:  rax = 0
;
; Processes 8 blocks per iteration, then single blocks; any tail of
; fewer than 16 bytes is ignored.  xmm6-xmm8 are callee-saved in the
; Win64 ABI, so they are spilled around the loops.
;-----------------------------------------------------------------------
gen_aes_ecb_func MACRO enc, rnds

LOCAL   loop8
LOCAL   loop1
LOCAL   bail

        ; Stack args sit above the return address and the caller's
        ; 32-byte shadow area.  The 32-bit load of inputLen (r9d)
        ; zero-extends into r9, so no separate zeroing is needed
        ; (the previous "xor inputLen, inputLen" here was dead code).
        mov     input,      [rsp + 1*8 + 8*4]
        mov     inputLen,   [rsp + 1*8 + 8*5]

        sub     rsp, 3*16

        ; spill callee-saved xmm6..xmm8
        movdqu  [rsp + 0*16], xmm6
        movdqu  [rsp + 1*16], xmm7
        movdqu  [rsp + 2*16], xmm8

        lea     ctx, [48+ctx]           ; ctx -> round key 0

loop8:
        cmp     inputLen, 8*16
        jb      loop1

        ; load 8 blocks and whiten them with round key 0
        movdqu  xmm0, [0*16 + input]
        movdqu  xmm1, [1*16 + input]
        movdqu  xmm2, [2*16 + input]
        movdqu  xmm3, [3*16 + input]
        movdqu  xmm4, [4*16 + input]
        movdqu  xmm5, [5*16 + input]
        movdqu  xmm6, [6*16 + input]
        movdqu  xmm7, [7*16 + input]

        movdqu  xmm8, [0*16 + ctx]
        pxor    xmm0, xmm8
        pxor    xmm1, xmm8
        pxor    xmm2, xmm8
        pxor    xmm3, xmm8
        pxor    xmm4, xmm8
        pxor    xmm5, xmm8
        pxor    xmm6, xmm8
        pxor    xmm7, xmm8

; assembly-time selection of the encrypt/decrypt instruction set
IF enc eq 1
        rnd textequ <aes_rnd>
        lastrnd textequ <aes_last_rnd>
        aesinst textequ <aesenc>
        aeslastinst textequ <aesenclast>
ELSE
        rnd textequ <aes_dec_rnd>
        lastrnd textequ <aes_dec_last_rnd>
        aesinst textequ <aesdec>
        aeslastinst textequ <aesdeclast>
ENDIF

        ; rounds 1 .. rnds-1, then the final round, on all 8 blocks
        i = 1
        WHILE i LT rnds
            rnd i
            i = i+1
            ENDM
        lastrnd rnds

        movdqu  [0*16 + output], xmm0
        movdqu  [1*16 + output], xmm1
        movdqu  [2*16 + output], xmm2
        movdqu  [3*16 + output], xmm3
        movdqu  [4*16 + output], xmm4
        movdqu  [5*16 + output], xmm5
        movdqu  [6*16 + output], xmm6
        movdqu  [7*16 + output], xmm7

        lea input, [8*16 + input]
        lea output, [8*16 + output]
        sub inputLen, 8*16
        jmp loop8

loop1:
        cmp     inputLen, 1*16
        jb      bail

        ; single-block path: one register holds the state, xmm7 the key
        movdqu  xmm0, [input]
        movdqu  xmm7, [0*16 + ctx]
        pxor    xmm0, xmm7

        i = 1
    WHILE i LT rnds
            movdqu  xmm7, [i*16 + ctx]
            aesinst  xmm0, xmm7
            i = i+1
        ENDM
        movdqu  xmm7, [rnds*16 + ctx]
        aeslastinst xmm0, xmm7

        movdqu  [output], xmm0

        lea input, [1*16 + input]
        lea output, [1*16 + output]
        sub inputLen, 1*16
        jmp loop1

bail:
        xor rax, rax                    ; return 0 (success)

        ; restore callee-saved xmm6..xmm8
        movdqu  xmm6, [rsp + 0*16]
        movdqu  xmm7, [rsp + 1*16]
        movdqu  xmm8, [rsp + 2*16]
        add     rsp, 3*16
        ret
ENDM
185
;-----------------------------------------------------------------------
; ECB entry points (Win64 ABI).
; rcx = context, rdx = output, [rsp+40] = input, [rsp+48] = inputLen
; (bytes).  Args 3/4 (r8/r9) are not read here -- the macro body
; reuses those registers.  Returns 0 in rax.
;-----------------------------------------------------------------------
intel_aes_encrypt_ecb_128 PROC
gen_aes_ecb_func 1, 10                  ; AES-128: 10 rounds
intel_aes_encrypt_ecb_128 ENDP

intel_aes_encrypt_ecb_192 PROC
gen_aes_ecb_func 1, 12                  ; AES-192: 12 rounds
intel_aes_encrypt_ecb_192 ENDP

intel_aes_encrypt_ecb_256 PROC
gen_aes_ecb_func 1, 14                  ; AES-256: 14 rounds
intel_aes_encrypt_ecb_256 ENDP

intel_aes_decrypt_ecb_128 PROC
gen_aes_ecb_func 0, 10
intel_aes_decrypt_ecb_128 ENDP

intel_aes_decrypt_ecb_192 PROC
gen_aes_ecb_func 0, 12
intel_aes_decrypt_ecb_192 ENDP

intel_aes_decrypt_ecb_256 PROC
gen_aes_ecb_func 0, 14
intel_aes_decrypt_ecb_256 ENDP
209
210
; Register aliases for the key-expansion routines (Win64 ABI):
; KEY = arg 1 (raw cipher key), KS = arg 2 (key-schedule output buffer),
; ITR = arg-3 register reused as a scratch pointer / loop counter.
KEY textequ <rcx>
KS  textequ <rdx>
ITR textequ <r8>
214
;-----------------------------------------------------------------------
; intel_aes_encrypt_init_128(KEY, KS)
; Expands a 16-byte AES key into the 11 round keys (176 bytes) at [KS].
; The "SubWord(RotWord(w)) ^ rcon" step is built from pshufb +
; aesenclast: on a value whose four dwords are identical ShiftRows is a
; no-op, SubBytes acts as SubWord, and the xor-ed "round key" operand
; supplies the round constant.
;-----------------------------------------------------------------------
intel_aes_encrypt_init_128  PROC

    movdqu  xmm1, [KEY]                 ; xmm1 = round key 0 = raw key
    movdqu  [KS], xmm1
    movdqa  xmm2, xmm1

    lea ITR, Lcon1
    movdqa  xmm0, [ITR]                 ; xmm0 = round constant, starts at 1
    lea ITR, Lmask
    movdqa  xmm4, [ITR]                 ; xmm4 = pshufb mask: RotWord of dword 3

    mov ITR, 8                          ; rounds 1..8 use rcon = 1,2,...,80h

Lenc_128_ks_loop:
        lea KS, [16 + KS]
        dec ITR                         ; sets ZF for the jne below; none of
                                        ; the SSE/AES ops in between touch
                                        ; RFLAGS

        pshufb  xmm2, xmm4              ; broadcast RotWord(w3)
        aesenclast  xmm2, xmm0          ; SubWord(...) ^ rcon
        pslld   xmm0, 1                 ; next rcon = 2 * rcon
        movdqa  xmm3, xmm1
        pslldq  xmm3, 4                 ; cascade: w_i ^= w_{i-1} for the
        pxor    xmm1, xmm3              ; four words of the previous key
        pslldq  xmm3, 4
        pxor    xmm1, xmm3
        pslldq  xmm3, 4
        pxor    xmm1, xmm3
        pxor    xmm1, xmm2              ; fold in the transformed word
        movdqu  [KS], xmm1              ; emit next round key
        movdqa  xmm2, xmm1

        jne Lenc_128_ks_loop

    ; rounds 9 and 10 use rcon = 1bh and 36h (Lcon2, then doubled)
    lea ITR, Lcon2
    movdqa  xmm0, [ITR]

    pshufb  xmm2, xmm4
    aesenclast  xmm2, xmm0
    pslld   xmm0, 1                     ; rcon: 1bh -> 36h for the last round
    movdqa  xmm3, xmm1
    pslldq  xmm3, 4
    pxor    xmm1, xmm3
    pslldq  xmm3, 4
    pxor    xmm1, xmm3
    pslldq  xmm3, 4
    pxor    xmm1, xmm3
    pxor    xmm1, xmm2
    movdqu  [16 + KS], xmm1             ; round key 9
    movdqa  xmm2, xmm1

    pshufb  xmm2, xmm4
    aesenclast  xmm2, xmm0
    movdqa  xmm3, xmm1
    pslldq  xmm3, 4
    pxor    xmm1, xmm3
    pslldq  xmm3, 4
    pxor    xmm1, xmm3
    pslldq  xmm3, 4
    pxor    xmm1, xmm3
    pxor    xmm1, xmm2
    movdqu  [32 + KS], xmm1             ; round key 10
    movdqa  xmm2, xmm1                  ; (not used again)

    ret
intel_aes_encrypt_init_128  ENDP
280
281
;-----------------------------------------------------------------------
; intel_aes_decrypt_init_128(KEY, KS)
; Builds the AES-128 decryption schedule: expands the encryption
; schedule, reverses the order of the 11 round keys, and applies
; aesimc (InvMixColumns) to the 9 inner keys so aesdec can be used.
;-----------------------------------------------------------------------
intel_aes_decrypt_init_128  PROC

    push    KS
    push    KEY

    ; Win64 ABI: the callee is owed 32 bytes of shadow space and a
    ; 16-byte-aligned rsp at the call.  After the two pushes rsp is at
    ; 8 mod 16, so reserve 28h = 32 + 8 bytes.
    sub     rsp, 28h
    call    intel_aes_encrypt_init_128
    add     rsp, 28h

    pop     KEY
    pop     KS

    ; swap round keys 0 and 10 (no aesimc on the outer keys)
    movdqu  xmm0, [0*16 + KS]
    movdqu  xmm1, [10*16 + KS]
    movdqu  [10*16 + KS], xmm0
    movdqu  [0*16 + KS], xmm1

    ; swap keys i and 10-i, applying InvMixColumns to both
    i = 1
    WHILE i LT 5
        movdqu  xmm0, [i*16 + KS]
        movdqu  xmm1, [(10-i)*16 + KS]

        aesimc  xmm0, xmm0
        aesimc  xmm1, xmm1

        movdqu  [(10-i)*16 + KS], xmm0
        movdqu  [i*16 + KS], xmm1

        i = i+1
    ENDM

    ; the middle key (5) is transformed in place
    movdqu  xmm0, [5*16 + KS]
    aesimc  xmm0, xmm0
    movdqu  [5*16 + KS], xmm0
    ret
intel_aes_decrypt_init_128  ENDP
316
317
;-----------------------------------------------------------------------
; intel_aes_encrypt_init_192(KEY, KS)
; Expands a 24-byte AES key into the 13 round keys at [KS].
; The schedule advances 6 words at a time; xmm1 holds words 0-3 of the
; current chunk, xmm3 (low qword) words 4-5.  shufpd stitches the
; 6-word chunks into 16-byte round-key stores.
; NOTE(review): the final store lands just past round key 12; this
; assumes the schedule buffer is large enough (sized for AES-256) --
; confirm against the context layout.
;-----------------------------------------------------------------------
intel_aes_encrypt_init_192  PROC

    ; xmm6/xmm7 are callee-saved on Win64
    sub     rsp, 16*2
    movdqu  [16*0 + rsp], xmm6
    movdqu  [16*1 + rsp], xmm7

    movdqu  xmm1, [KEY]                 ; key words 0-3
    mov     ITR, [16 + KEY]             ; key words 4-5 (8-byte load)
    movd    xmm3, ITR                   ; 64-bit movd (movq) into xmm3.low

    movdqu  [KS], xmm1                  ; round key 0
    movdqa  xmm5, xmm3

    lea ITR, Lcon1
    movdqu  xmm0, [ITR]                 ; round constant, starts at 1
    lea ITR, Lmask192
    movdqu  xmm4, [ITR]                 ; pshufb mask: RotWord of dword 1

    mov ITR, 4                          ; 4 iterations x 2 expansion steps

Lenc_192_ks_loop:
        ; first expansion step: T = SubWord(RotWord(w5)) ^ rcon
        movdqa  xmm2, xmm3
        pshufb  xmm2, xmm4
        aesenclast xmm2, xmm0
        pslld   xmm0, 1                 ; next rcon

        ; xor-cascade the previous 6 words and fold in T
        movdqa  xmm6, xmm1
        movdqa  xmm7, xmm3
        pslldq  xmm6, 4
        pslldq  xmm7, 4
        pxor    xmm1, xmm6
        pxor    xmm3, xmm7
        pslldq  xmm6, 4
        pxor    xmm1, xmm6
        pslldq  xmm6, 4
        pxor    xmm1, xmm6
        pxor    xmm1, xmm2
        pshufd  xmm2, xmm1, 0ffh        ; broadcast newest word
        pxor    xmm3, xmm2

        ; repack: xmm5 = {old tail, new w0-w1}, xmm6 = {new w2-w3, w4-w5}
        movdqa  xmm6, xmm1
        shufpd  xmm5, xmm1, 00h
        shufpd  xmm6, xmm3, 01h

        movdqu  [16 + KS], xmm5
        movdqu  [32 + KS], xmm6

        ; second expansion step, same construction
        movdqa  xmm2, xmm3
        pshufb  xmm2, xmm4
        aesenclast  xmm2, xmm0
        pslld   xmm0, 1

        movdqa  xmm6, xmm1
        movdqa  xmm7, xmm3
        pslldq  xmm6, 4
        pslldq  xmm7, 4
        pxor    xmm1, xmm6
        pxor    xmm3, xmm7
        pslldq  xmm6, 4
        pxor    xmm1, xmm6
        pslldq  xmm6, 4
        pxor    xmm1, xmm6
        pxor    xmm1, xmm2
        pshufd  xmm2, xmm1, 0ffh
        pxor    xmm3, xmm2

        movdqu  [48 + KS], xmm1
        movdqa  xmm5, xmm3

        lea KS, [48 + KS]               ; 3 round keys emitted per iteration

        dec ITR
        jnz Lenc_192_ks_loop

    movdqu  [16 + KS], xmm5             ; leftover expansion words (see NOTE)

    ; restore callee-saved xmm regs
    movdqu  xmm7, [16*1 + rsp]
    movdqu  xmm6, [16*0 + rsp]
    add rsp, 16*2
    ret
intel_aes_encrypt_init_192  ENDP
399
;-----------------------------------------------------------------------
; intel_aes_decrypt_init_192(KEY, KS)
; Builds the AES-192 decryption schedule: expands the encryption
; schedule, reverses the order of the 13 round keys, and applies
; aesimc (InvMixColumns) to the 11 inner keys so aesdec can be used.
;-----------------------------------------------------------------------
intel_aes_decrypt_init_192  PROC
    push    KS
    push    KEY

    ; Win64 ABI: provide 32 bytes of shadow space and a 16-byte-aligned
    ; rsp at the call (after two pushes rsp is at 8 mod 16, so reserve
    ; 28h = 32 + 8 bytes).
    sub     rsp, 28h
    call    intel_aes_encrypt_init_192
    add     rsp, 28h

    pop     KEY
    pop     KS

    ; swap round keys 0 and 12 (no aesimc on the outer keys)
    movdqu  xmm0, [0*16 + KS]
    movdqu  xmm1, [12*16 + KS]
    movdqu  [12*16 + KS], xmm0
    movdqu  [0*16 + KS], xmm1

    ; swap keys i and 12-i, applying InvMixColumns to both
    i = 1
    WHILE i LT 6
        movdqu  xmm0, [i*16 + KS]
        movdqu  xmm1, [(12-i)*16 + KS]

        aesimc  xmm0, xmm0
        aesimc  xmm1, xmm1

        movdqu  [(12-i)*16 + KS], xmm0
        movdqu  [i*16 + KS], xmm1

        i = i+1
    ENDM

    ; the middle key (6) is transformed in place
    movdqu  xmm0, [6*16 + KS]
    aesimc  xmm0, xmm0
    movdqu  [6*16 + KS], xmm0
    ret
intel_aes_decrypt_init_192  ENDP
433
434
;-----------------------------------------------------------------------
; intel_aes_encrypt_init_256(KEY, KS)
; Expands a 32-byte AES key into the 15 round keys (240 bytes) at [KS].
; Round keys 0/1 are the raw key halves.  Each loop iteration emits two
; keys: an even key via SubWord(RotWord(w)) ^ rcon, and an odd key via
; SubWord only (pshufd 0ffh broadcasts without rotation; aesenclast
; with a zeroed key register applies SubBytes with no rcon).
;-----------------------------------------------------------------------
intel_aes_encrypt_init_256  PROC
    ; xmm6/xmm7 are callee-saved on Win64
    sub     rsp, 16*2
    movdqu  [16*0 + rsp], xmm6
    movdqu  [16*1 + rsp], xmm7

    movdqu  xmm1, [16*0 + KEY]          ; round key 0 = key words 0-3
    movdqu  xmm3, [16*1 + KEY]          ; round key 1 = key words 4-7

    movdqu  [16*0 + KS], xmm1
    movdqu  [16*1 + KS], xmm3

    lea ITR, Lcon1
    movdqu  xmm0, [ITR]                 ; round constant, starts at 1
    lea ITR, Lmask256
    movdqu  xmm5, [ITR]                 ; pshufb mask: RotWord of dword 3

    pxor    xmm6, xmm6                  ; zero "key" for the SubWord-only step

    mov ITR, 6                          ; 6 iterations x 2 keys = keys 2..13

Lenc_256_ks_loop:

        ; even key: T = SubWord(RotWord(w7)) ^ rcon, cascaded into xmm1
        movdqa  xmm2, xmm3
        pshufb  xmm2, xmm5
        aesenclast  xmm2, xmm0
        pslld   xmm0, 1                 ; next rcon
        movdqa  xmm4, xmm1
        pslldq  xmm4, 4
        pxor    xmm1, xmm4
        pslldq  xmm4, 4
        pxor    xmm1, xmm4
        pslldq  xmm4, 4
        pxor    xmm1, xmm4
        pxor    xmm1, xmm2
        movdqu  [16*2 + KS], xmm1

        ; odd key: T = SubWord(w) only (no rotate, no rcon)
        pshufd  xmm2, xmm1, 0ffh
        aesenclast  xmm2, xmm6
        movdqa  xmm4, xmm3
        pslldq  xmm4, 4
        pxor    xmm3, xmm4
        pslldq  xmm4, 4
        pxor    xmm3, xmm4
        pslldq  xmm4, 4
        pxor    xmm3, xmm4
        pxor    xmm3, xmm2
        movdqu  [16*3 + KS], xmm3

        lea KS, [32 + KS]               ; two round keys per iteration
        dec ITR
        jnz Lenc_256_ks_loop

    ; final (even) key 14
    movdqa  xmm2, xmm3
    pshufb  xmm2, xmm5
    aesenclast  xmm2, xmm0
    movdqa  xmm4, xmm1
    pslldq  xmm4, 4
    pxor    xmm1, xmm4
    pslldq  xmm4, 4
    pxor    xmm1, xmm4
    pslldq  xmm4, 4
    pxor    xmm1, xmm4
    pxor    xmm1, xmm2
    movdqu  [16*2 + KS], xmm1

    ; restore callee-saved xmm regs
    movdqu  xmm7, [16*1 + rsp]
    movdqu  xmm6, [16*0 + rsp]
    add rsp, 16*2
    ret

intel_aes_encrypt_init_256  ENDP
506
507
;-----------------------------------------------------------------------
; intel_aes_decrypt_init_256(KEY, KS)
; Builds the AES-256 decryption schedule: expands the encryption
; schedule, reverses the order of the 15 round keys, and applies
; aesimc (InvMixColumns) to the 13 inner keys so aesdec can be used.
;-----------------------------------------------------------------------
intel_aes_decrypt_init_256  PROC
    push    KS
    push    KEY

    ; Win64 ABI: provide 32 bytes of shadow space and a 16-byte-aligned
    ; rsp at the call (after two pushes rsp is at 8 mod 16, so reserve
    ; 28h = 32 + 8 bytes).
    sub     rsp, 28h
    call    intel_aes_encrypt_init_256
    add     rsp, 28h

    pop     KEY
    pop     KS

    ; swap round keys 0 and 14 (no aesimc on the outer keys)
    movdqu  xmm0, [0*16 + KS]
    movdqu  xmm1, [14*16 + KS]
    movdqu  [14*16 + KS], xmm0
    movdqu  [0*16 + KS], xmm1

    ; swap keys i and 14-i, applying InvMixColumns to both
    i = 1
    WHILE i LT 7
        movdqu  xmm0, [i*16 + KS]
        movdqu  xmm1, [(14-i)*16 + KS]

        aesimc  xmm0, xmm0
        aesimc  xmm1, xmm1

        movdqu  [(14-i)*16 + KS], xmm0
        movdqu  [i*16 + KS], xmm1

        i = i+1
    ENDM

    ; the middle key (7) is transformed in place
    movdqu  xmm0, [7*16 + KS]
    aesimc  xmm0, xmm0
    movdqu  [7*16 + KS], xmm0
    ret
intel_aes_decrypt_init_256  ENDP
541
542
543
;-----------------------------------------------------------------------
; gen_aes_cbc_enc_func rnds
;
; Emits the body of an AES-CBC encryption routine (Win64 ABI).
; rnds = 10/12/14 for AES-128/192/256.
; ctx (rcx): round keys at ctx+48, IV/chaining value at ctx+16
; (addressed as [-32+ctx] after the lea below).
; output (rdx); input / inputLen come from the 5th/6th stack args.
; CBC encryption is inherently serial, so blocks are processed one at
; a time; round keys 0..5 are cached in registers to save loads.
; The updated chaining value is written back to the context on exit.
; Returns 0 in rax; a tail shorter than 16 bytes is ignored.
;-----------------------------------------------------------------------
gen_aes_cbc_enc_func MACRO rnds

LOCAL   loop1
LOCAL   bail

        mov     input,      [rsp + 1*8 + 8*4]
        mov     inputLen,   [rsp + 1*8 + 8*5]

        sub     rsp, 3*16

        ; spill callee-saved xmm6..xmm8 (Win64)
        movdqu  [rsp + 0*16], xmm6
        movdqu  [rsp + 1*16], xmm7
        movdqu  [rsp + 2*16], xmm8

        lea     ctx, [48+ctx]           ; ctx -> round key 0

        movdqu  xmm0, [-32+ctx]         ; xmm0 = IV / chaining value (ctx+16)

        ; cache round keys 0..5 in xmm2..xmm7
        movdqu  xmm2, [0*16 + ctx]
        movdqu  xmm3, [1*16 + ctx]
        movdqu  xmm4, [2*16 + ctx]
        movdqu  xmm5, [3*16 + ctx]
        movdqu  xmm6, [4*16 + ctx]
        movdqu  xmm7, [5*16 + ctx]

loop1:
        cmp     inputLen, 1*16
        jb      bail

        ; state = prev_ciphertext ^ (plaintext ^ round key 0)
        movdqu  xmm1, [input]
        pxor    xmm1, xmm2
        pxor    xmm0, xmm1

        ; rounds 1..5 from the cached keys
        aesenc  xmm0, xmm3
        aesenc  xmm0, xmm4
        aesenc  xmm0, xmm5
        aesenc  xmm0, xmm6
        aesenc  xmm0, xmm7

        ; rounds 6..rnds-1 from memory, then the final round
        i = 6
    WHILE i LT rnds
            movdqu  xmm8, [i*16 + ctx]
            aesenc  xmm0, xmm8
            i = i+1
        ENDM
        movdqu  xmm8, [rnds*16 + ctx]
        aesenclast xmm0, xmm8

        movdqu  [output], xmm0          ; xmm0 stays live: next chaining value

        lea input, [1*16 + input]
        lea output, [1*16 + output]
        sub inputLen, 1*16
        jmp loop1

bail:
        movdqu  [-32+ctx], xmm0         ; persist chaining value

        xor rax, rax                    ; return 0 (success)

        ; restore callee-saved xmm regs
        movdqu  xmm6, [rsp + 0*16]
        movdqu  xmm7, [rsp + 1*16]
        movdqu  xmm8, [rsp + 2*16]
        add     rsp, 3*16
        ret

ENDM
611
;-----------------------------------------------------------------------
; gen_aes_cbc_dec_func rnds
;
; Emits the body of a bulk AES-CBC decryption routine (Win64 ABI).
; rnds = 10/12/14 for AES-128/192/256.
; ctx (rcx): round keys at ctx+48, IV/chaining value at ctx+16
; (addressed as [-32+ctx] after the lea below).
; output (rdx); input / inputLen come from the 5th/6th stack args.
; CBC decryption parallelizes, so 8 blocks are decrypted per pass
; before falling back to single blocks.  Ciphertext is re-read from
; input for the chaining xor, so in-place operation works.
; Returns 0 in rax; a tail shorter than 16 bytes is ignored.
;-----------------------------------------------------------------------
gen_aes_cbc_dec_func MACRO rnds

LOCAL   loop8
LOCAL   loop1
LOCAL   dec1
LOCAL   bail

        mov     input,      [rsp + 1*8 + 8*4]
        mov     inputLen,   [rsp + 1*8 + 8*5]

        sub     rsp, 3*16

        ; spill callee-saved xmm6..xmm8 (Win64)
        movdqu  [rsp + 0*16], xmm6
        movdqu  [rsp + 1*16], xmm7
        movdqu  [rsp + 2*16], xmm8

        lea     ctx, [48+ctx]           ; ctx -> round key 0

loop8:
        cmp     inputLen, 8*16
        jb      dec1

        ; load 8 ciphertext blocks and whiten with round key 0
        movdqu  xmm0, [0*16 + input]
        movdqu  xmm1, [1*16 + input]
        movdqu  xmm2, [2*16 + input]
        movdqu  xmm3, [3*16 + input]
        movdqu  xmm4, [4*16 + input]
        movdqu  xmm5, [5*16 + input]
        movdqu  xmm6, [6*16 + input]
        movdqu  xmm7, [7*16 + input]

        movdqu  xmm8, [0*16 + ctx]
        pxor    xmm0, xmm8
        pxor    xmm1, xmm8
        pxor    xmm2, xmm8
        pxor    xmm3, xmm8
        pxor    xmm4, xmm8
        pxor    xmm5, xmm8
        pxor    xmm6, xmm8
        pxor    xmm7, xmm8

        ; rounds 1..rnds-1, then the final round
        i = 1
        WHILE i LT rnds
            aes_dec_rnd i
            i = i+1
            ENDM
        aes_dec_last_rnd rnds

        ; CBC chaining: block0 ^= IV, block_i ^= ciphertext_{i-1}
        ; (ciphertext re-read from input before output is written)
        movdqu  xmm8, [-32 + ctx]       ; chaining value (ctx+16)
        pxor    xmm0, xmm8
        movdqu  xmm8, [0*16 + input]
        pxor    xmm1, xmm8
        movdqu  xmm8, [1*16 + input]
        pxor    xmm2, xmm8
        movdqu  xmm8, [2*16 + input]
        pxor    xmm3, xmm8
        movdqu  xmm8, [3*16 + input]
        pxor    xmm4, xmm8
        movdqu  xmm8, [4*16 + input]
        pxor    xmm5, xmm8
        movdqu  xmm8, [5*16 + input]
        pxor    xmm6, xmm8
        movdqu  xmm8, [6*16 + input]
        pxor    xmm7, xmm8
        movdqu  xmm8, [7*16 + input]    ; last ciphertext = next chaining value

        movdqu  [0*16 + output], xmm0
        movdqu  [1*16 + output], xmm1
        movdqu  [2*16 + output], xmm2
        movdqu  [3*16 + output], xmm3
        movdqu  [4*16 + output], xmm4
        movdqu  [5*16 + output], xmm5
        movdqu  [6*16 + output], xmm6
        movdqu  [7*16 + output], xmm7
        movdqu  [-32 + ctx], xmm8       ; persist chaining value

        lea input, [8*16 + input]
        lea output, [8*16 + output]
        sub inputLen, 8*16
        jmp loop8
dec1:

        movdqu  xmm3, [-32 + ctx]       ; xmm3 = current chaining value

loop1:
        cmp     inputLen, 1*16
        jb      bail

        movdqu  xmm0, [input]
        movdqa  xmm4, xmm0              ; keep ciphertext: next chaining value
        movdqu  xmm7, [0*16 + ctx]
        pxor    xmm0, xmm7

        ; rounds 1..rnds-1, then the final round
        i = 1
    WHILE i LT rnds
            movdqu  xmm7, [i*16 + ctx]
            aesdec  xmm0, xmm7
            i = i+1
        ENDM
        movdqu  xmm7, [rnds*16 + ctx]
        aesdeclast xmm0, xmm7
        pxor    xmm3, xmm0              ; plaintext = D(c) ^ prev ciphertext

        movdqu  [output], xmm3
        movdqa  xmm3, xmm4              ; advance chaining value

        lea input, [1*16 + input]
        lea output, [1*16 + output]
        sub inputLen, 1*16
        jmp loop1

bail:
        movdqu  [-32 + ctx], xmm3       ; persist chaining value
        xor rax, rax                    ; return 0 (success)

        ; restore callee-saved xmm regs
        movdqu  xmm6, [rsp + 0*16]
        movdqu  xmm7, [rsp + 1*16]
        movdqu  xmm8, [rsp + 2*16]
        add     rsp, 3*16
        ret
ENDM
733
;-----------------------------------------------------------------------
; CBC entry points (Win64 ABI).
; rcx = context, rdx = output, [rsp+40] = input, [rsp+48] = inputLen
; (bytes).  Args 3/4 (r8/r9) are not read here -- the macro body
; reuses those registers.  Returns 0 in rax.
;-----------------------------------------------------------------------
intel_aes_encrypt_cbc_128 PROC
gen_aes_cbc_enc_func  10                ; AES-128: 10 rounds
intel_aes_encrypt_cbc_128 ENDP

intel_aes_encrypt_cbc_192 PROC
gen_aes_cbc_enc_func  12                ; AES-192: 12 rounds
intel_aes_encrypt_cbc_192 ENDP

intel_aes_encrypt_cbc_256 PROC
gen_aes_cbc_enc_func  14                ; AES-256: 14 rounds
intel_aes_encrypt_cbc_256 ENDP

intel_aes_decrypt_cbc_128 PROC
gen_aes_cbc_dec_func  10
intel_aes_decrypt_cbc_128 ENDP

intel_aes_decrypt_cbc_192 PROC
gen_aes_cbc_dec_func  12
intel_aes_decrypt_cbc_192 ENDP

intel_aes_decrypt_cbc_256 PROC
gen_aes_cbc_dec_func  14
intel_aes_decrypt_cbc_256 ENDP
757
758
759
; Register aliases for the CTR-mode routines:
; ctrCtx  = CTR context pointer (copied from rcx)
; CTR     = scratch counter dword (big-endian via bswap when stored)
; CTRSave = running counter value kept in host (little-endian) order
ctrCtx textequ <r10>
CTR textequ <r11d>
CTRSave textequ <eax>
763
;-----------------------------------------------------------------------
; gen_aes_ctr_func rnds
;
; Emits the body of a bulk AES-CTR routine (Win64 ABI).
; rnds = 10/12/14 for AES-128/192/256.
; ctx (rcx) on entry points at the CTR context; the cipher context
; pointer is loaded from offset 8 and the 16-byte counter block from
; offset 16 (both grounded in the loads below).
; output (rdx); input / inputLen come from the 5th/6th stack args.
;
; Strategy: 8 counter blocks, pre-xored ("pre-whitened") with round
; key 0, are kept in 16-byte-aligned stack slots.  In the 8-block loop
; the counter updates for the NEXT batch are interleaved with the
; first 8 AES rounds of the current batch.  On exit the next unused
; counter is un-whitened and written back to the context.
;
; NOTE(review): only the last big-endian dword of the counter is
; incremented; carries never propagate into the upper 96 bits --
; confirm callers limit stream length accordingly.
; Returns 0 in rax; a tail shorter than 16 bytes is ignored.
;-----------------------------------------------------------------------
gen_aes_ctr_func MACRO rnds

LOCAL   loop8
LOCAL   loop1
LOCAL   enc1                            ; (declared but unused below)
LOCAL   bail

        mov     input,      [rsp + 8*1 + 4*8]
        mov     inputLen,   [rsp + 8*1 + 5*8]

        mov     ctrCtx, ctx
        mov     ctx, [8+ctrCtx]         ; cipher context from CTR context
        lea     ctx, [48+ctx]           ; ctx -> round key 0

        sub     rsp, 3*16
        ; spill callee-saved xmm6..xmm8 (Win64)
        movdqu  [rsp + 0*16], xmm6
        movdqu  [rsp + 1*16], xmm7
        movdqu  [rsp + 2*16], xmm8


        ; rbp frame: "and rsp, -16" makes the adjustment dynamic, so
        ; rsp must be restored from rbp before the xmm reloads above
        ; can be undone
        push    rbp
        mov     rbp, rsp
        sub     rsp, 8*16               ; 8 aligned counter-block slots
        and     rsp, -16


        movdqu  xmm0, [16+ctrCtx]       ; current counter block
        mov     CTRSave, DWORD PTR [ctrCtx + 16 + 3*4]
        bswap   CTRSave                 ; counter dword in host byte order
        movdqu  xmm1, [ctx + 0*16]

        pxor    xmm0, xmm1              ; pre-whiten with round key 0

        ; seed all 8 slots with counter+0
        movdqa  [rsp + 0*16], xmm0
        movdqa  [rsp + 1*16], xmm0
        movdqa  [rsp + 2*16], xmm0
        movdqa  [rsp + 3*16], xmm0
        movdqa  [rsp + 4*16], xmm0
        movdqa  [rsp + 5*16], xmm0
        movdqa  [rsp + 6*16], xmm0
        movdqa  [rsp + 7*16], xmm0

        ; patch slots 1..7 to counter+1 .. counter+7: each stored dword
        ; is bswap'ed back to big-endian and re-whitened with the last
        ; dword of round key 0
        inc     CTRSave
        mov     CTR, CTRSave
        bswap   CTR
        xor     CTR, DWORD PTR [ctx + 3*4]
        mov     DWORD PTR [rsp + 1*16 + 3*4], CTR

        inc     CTRSave
        mov     CTR, CTRSave
        bswap   CTR
        xor     CTR, DWORD PTR [ctx + 3*4]
        mov     DWORD PTR [rsp + 2*16 + 3*4], CTR

        inc     CTRSave
        mov     CTR, CTRSave
        bswap   CTR
        xor     CTR, DWORD PTR [ctx + 3*4]
        mov     DWORD PTR [rsp + 3*16 + 3*4], CTR

        inc     CTRSave
        mov     CTR, CTRSave
        bswap   CTR
        xor     CTR, DWORD PTR [ctx + 3*4]
        mov     DWORD PTR [rsp + 4*16 + 3*4], CTR

        inc     CTRSave
        mov     CTR, CTRSave
        bswap   CTR
        xor     CTR, DWORD PTR [ctx + 3*4]
        mov     DWORD PTR [rsp + 5*16 + 3*4], CTR

        inc     CTRSave
        mov     CTR, CTRSave
        bswap   CTR
        xor     CTR, DWORD PTR [ctx + 3*4]
        mov     DWORD PTR [rsp + 6*16 + 3*4], CTR

        inc     CTRSave
        mov     CTR, CTRSave
        bswap   CTR
        xor     CTR, DWORD PTR [ctx + 3*4]
        mov     DWORD PTR [rsp + 7*16 + 3*4], CTR


loop8:
        cmp     inputLen, 8*16
        jb      loop1

        ; load the 8 pre-whitened counter blocks
        movdqu  xmm0, [0*16 + rsp]
        movdqu  xmm1, [1*16 + rsp]
        movdqu  xmm2, [2*16 + rsp]
        movdqu  xmm3, [3*16 + rsp]
        movdqu  xmm4, [4*16 + rsp]
        movdqu  xmm5, [5*16 + rsp]
        movdqu  xmm6, [6*16 + rsp]
        movdqu  xmm7, [7*16 + rsp]

        ; rounds 1..8, refreshing one stack slot per round with a
        ; counter for the next batch
        i = 1
        WHILE i LE 8
            aes_rnd i

            inc     CTRSave
            mov     CTR, CTRSave
            bswap   CTR
            xor     CTR, DWORD PTR [ctx + 3*4]
            mov     DWORD PTR [rsp + (i-1)*16 + 3*4], CTR

            i = i+1
        ENDM
        ; remaining rounds (i = 9 here; rnds >= 10), then the last round
        WHILE i LT rnds
            aes_rnd i
            i = i+1
            ENDM
        aes_last_rnd rnds

        ; output = keystream ^ input
        movdqu  xmm8, [0*16 + input]
        pxor    xmm0, xmm8
        movdqu  xmm8, [1*16 + input]
        pxor    xmm1, xmm8
        movdqu  xmm8, [2*16 + input]
        pxor    xmm2, xmm8
        movdqu  xmm8, [3*16 + input]
        pxor    xmm3, xmm8
        movdqu  xmm8, [4*16 + input]
        pxor    xmm4, xmm8
        movdqu  xmm8, [5*16 + input]
        pxor    xmm5, xmm8
        movdqu  xmm8, [6*16 + input]
        pxor    xmm6, xmm8
        movdqu  xmm8, [7*16 + input]
        pxor    xmm7, xmm8

        movdqu  [0*16 + output], xmm0
        movdqu  [1*16 + output], xmm1
        movdqu  [2*16 + output], xmm2
        movdqu  [3*16 + output], xmm3
        movdqu  [4*16 + output], xmm4
        movdqu  [5*16 + output], xmm5
        movdqu  [6*16 + output], xmm6
        movdqu  [7*16 + output], xmm7

        lea input, [8*16 + input]
        lea output, [8*16 + output]
        sub inputLen, 8*16
        jmp loop8


loop1:
        cmp     inputLen, 1*16
        jb      bail

        ; consume one pre-whitened counter slot (at most 7 single
        ; blocks can follow loop8, so the 8 slots always suffice)
        movdqu  xmm0, [rsp]
        add     rsp, 16

        i = 1
    WHILE i LT rnds
            movdqu  xmm7, [i*16 + ctx]
            aesenc  xmm0, xmm7
            i = i+1
        ENDM
        movdqu  xmm7, [rnds*16 + ctx]
        aesenclast xmm0, xmm7

        movdqu  xmm7, [input]
        pxor    xmm0, xmm7              ; output = keystream ^ input
        movdqu  [output], xmm0

        lea input, [1*16 + input]
        lea output, [1*16 + output]
        sub inputLen, 1*16
        jmp loop1

bail:

        ; write the next (unused) counter back to the context; the slot
        ; is pre-whitened, so xor with round key 0 recovers the
        ; plain counter value
        movdqu  xmm0, [rsp]
        movdqu  xmm1, [ctx + 0*16]
        pxor    xmm0, xmm1
        movdqu  [16+ctrCtx], xmm0


        xor     rax, rax                ; return 0 (success)
        mov     rsp, rbp                ; drop the counter slots
        pop     rbp

        ; restore callee-saved xmm regs
        movdqu  xmm6, [rsp + 0*16]
        movdqu  xmm7, [rsp + 1*16]
        movdqu  xmm8, [rsp + 2*16]
        add     rsp, 3*16

        ret
ENDM
956
957
;-----------------------------------------------------------------------
; CTR entry points (Win64 ABI).
; rcx = CTR context, rdx = output, [rsp+40] = input,
; [rsp+48] = inputLen (bytes).  Args 3/4 (r8/r9) are not read here.
; Returns 0 in rax.  CTR decryption is identical to encryption.
;-----------------------------------------------------------------------
intel_aes_encrypt_ctr_128 PROC
gen_aes_ctr_func  10                    ; AES-128: 10 rounds
intel_aes_encrypt_ctr_128 ENDP

intel_aes_encrypt_ctr_192 PROC
gen_aes_ctr_func  12                    ; AES-192: 12 rounds
intel_aes_encrypt_ctr_192 ENDP

intel_aes_encrypt_ctr_256 PROC
gen_aes_ctr_func  14                    ; AES-256: 14 rounds
intel_aes_encrypt_ctr_256 ENDP
969
970
971END
972