1; LICENSE:
2; This submission to NSS is to be made available under the terms of the
; Mozilla Public License, v. 2.0. You can obtain one at
; http://mozilla.org/MPL/2.0/.
5;###############################################################################
6; Copyright(c) 2014, Intel Corp.
7; Developers and authors:
8; Shay Gueron and Vlad Krasnov
9; Intel Corporation, Israel Development Centre, Haifa, Israel
10; Please send feedback directly to crypto.feedback.alias@intel.com
11
12
.MODEL FLAT, C
.XMM

.DATA
ALIGN 16
; pshufb controls.  Each dword selects source bytes 0Dh,0Eh,0Fh,0Ch, so the
; top dword of the source is rotated by one byte and copied to all four lanes.
Lmask       dd 4 DUP (0c0f0e0dh)
; Same pattern built from bytes 05h,06h,07h,04h (source dword 1).
Lmask192    dd 4 DUP (004070605h)
Lmask256    dd 4 DUP (00c0f0e0dh)
; Round-constant seeds: the key schedules start from Lcon1 and double it with
; pslld; the 128-bit schedule reloads from Lcon2 for its last two round keys.
Lcon1       dd 4 DUP (1)
Lcon2       dd 4 DUP (1bh)
23
.CODE

; Register roles shared by the ECB, CBC and CTR generator macros.
ctx         TEXTEQU <ecx>       ; base address of the round keys
output      TEXTEQU <edx>       ; destination cursor
input       TEXTEQU <eax>       ; source cursor
inputLen    TEXTEQU <edi>       ; bytes still to process
30
31
; aes_rnd idx
; Loads the 16-byte round key at [idx*16 + ctx] into xmm7 and applies one
; aesenc step with it to each block register xmm0..xmm6.  Overwrites xmm7.
aes_rnd MACRO rndIdx
    movdqu  xmm7, [rndIdx*16 + ctx]
    FOR     blkReg, <xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6>
        aesenc  blkReg, xmm7
    ENDM
    ENDM
42
; aes_last_rnd idx
; Loads the 16-byte round key at [idx*16 + ctx] into xmm7 and applies
; aesenclast with it to each block register xmm0..xmm6.  Overwrites xmm7.
aes_last_rnd MACRO rndIdx
    movdqu  xmm7, [rndIdx*16 + ctx]
    FOR     blkReg, <xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6>
        aesenclast  blkReg, xmm7
    ENDM
    ENDM
53
; aes_dec_rnd idx
; Loads the 16-byte round key at [idx*16 + ctx] into xmm7 and applies one
; aesdec step with it to each block register xmm0..xmm6.  Overwrites xmm7.
aes_dec_rnd MACRO rndIdx
    movdqu  xmm7, [rndIdx*16 + ctx]
    FOR     blkReg, <xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6>
        aesdec  blkReg, xmm7
    ENDM
    ENDM
64
; aes_dec_last_rnd idx
; Loads the 16-byte round key at [idx*16 + ctx] into xmm7 and applies
; aesdeclast with it to each block register xmm0..xmm6.  Overwrites xmm7.
aes_dec_last_rnd MACRO rndIdx
    movdqu  xmm7, [rndIdx*16 + ctx]
    FOR     blkReg, <xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6>
        aesdeclast  blkReg, xmm7
    ENDM
    ENDM
75
76
; gen_aes_ecb_func enc, rnds
; Emits a complete ECB routine body, from prologue through ret.
;   enc  - 1 selects aesenc/aesenclast; any other value aesdec/aesdeclast
;   rnds - index of the last round key; keys 0..rnds are read from ctx
; Stack arguments read (dword slots above the return address):
;   slot 0 = round-key base, slot 1 = output, slot 4 = input, slot 5 = length
; Whole 16-byte blocks are handled seven at a time, then one at a time; a
; tail shorter than 16 bytes is not touched.  eax is 0 on return.
; edi is saved and restored; ecx, edx and xmm0-xmm7 are overwritten.
gen_aes_ecb_func MACRO enc, rnds
LOCAL   sevenBlocks
LOCAL   oneBlock
LOCAL   finish

; Pick the instruction family once, at assembly time.
IF enc eq 1
        rnd         textequ <aes_rnd>
        lastrnd     textequ <aes_last_rnd>
        aesinst     textequ <aesenc>
        aeslastinst textequ <aesenclast>
ELSE
        rnd         textequ <aes_dec_rnd>
        lastrnd     textequ <aes_dec_last_rnd>
        aesinst     textequ <aesdec>
        aeslastinst textequ <aesdeclast>
ENDIF

        push    inputLen

        ; offsets skip the saved edi and the return address
        mov     ctx,      [esp + 8]
        mov     output,   [esp + 12]
        mov     input,    [esp + 24]
        mov     inputLen, [esp + 28]

sevenBlocks:
        cmp     inputLen, 7*16
        jb      oneBlock

        ; load seven consecutive blocks
        blkNo = 0
        FOR     blkReg, <xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6>
            movdqu  blkReg, [blkNo*16 + input]
            blkNo = blkNo + 1
        ENDM

        ; initial key addition with round key 0
        movdqu  xmm7, [0*16 + ctx]
        FOR     blkReg, <xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6>
            pxor    blkReg, xmm7
        ENDM

        i = 1
        WHILE i LT rnds
            rnd i
            i = i+1
        ENDM
        lastrnd rnds

        blkNo = 0
        FOR     blkReg, <xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6>
            movdqu  [blkNo*16 + output], blkReg
            blkNo = blkNo + 1
        ENDM

        lea     input,  [7*16 + input]
        lea     output, [7*16 + output]
        sub     inputLen, 7*16
        jmp     sevenBlocks

oneBlock:
        cmp     inputLen, 1*16
        jb      finish

        movdqu  xmm0, [input]
        movdqu  xmm7, [0*16 + ctx]
        pxor    xmm0, xmm7

        i = 1
        WHILE i LT rnds
            movdqu  xmm7, [i*16 + ctx]
            aesinst xmm0, xmm7
            i = i+1
        ENDM
        movdqu  xmm7, [rnds*16 + ctx]
        aeslastinst xmm0, xmm7

        movdqu  [output], xmm0

        lea     input,  [1*16 + input]
        lea     output, [1*16 + output]
        sub     inputLen, 1*16
        jmp     oneBlock

finish:
        xor     eax, eax
        pop     inputLen
        ret

ENDM
173
; ECB entry points.  Each PROC body is a single gen_aes_ecb_func expansion,
; which supplies everything from the prologue to ret.
; Macro arguments: 1 = encrypt / 0 = decrypt, then the last round key index
; (10, 12 or 14 for the 128-, 192- and 256-bit variants).

        ALIGN 16
intel_aes_encrypt_ecb_128 PROC
        gen_aes_ecb_func 1, 10
intel_aes_encrypt_ecb_128 ENDP

        ALIGN 16
intel_aes_encrypt_ecb_192 PROC
        gen_aes_ecb_func 1, 12
intel_aes_encrypt_ecb_192 ENDP

        ALIGN 16
intel_aes_encrypt_ecb_256 PROC
        gen_aes_ecb_func 1, 14
intel_aes_encrypt_ecb_256 ENDP

        ALIGN 16
intel_aes_decrypt_ecb_128 PROC
        gen_aes_ecb_func 0, 10
intel_aes_decrypt_ecb_128 ENDP

        ALIGN 16
intel_aes_decrypt_ecb_192 PROC
        gen_aes_ecb_func 0, 12
intel_aes_decrypt_ecb_192 ENDP

        ALIGN 16
intel_aes_decrypt_ecb_256 PROC
        gen_aes_ecb_func 0, 14
intel_aes_decrypt_ecb_256 ENDP
203
204
; Register roles used by the key-schedule routines.
KEY     TEXTEQU <ecx>       ; pointer to the raw key bytes
KS      TEXTEQU <edx>       ; pointer into the round-key array being written
ITR     TEXTEQU <eax>       ; scratch address register, then loop counter
208
; intel_aes_encrypt_init_128
; Expands a 16-byte key into 11 round keys (176 bytes).
;   [esp+4] = pointer to the key, [esp+8] = pointer to the round-key array
; Overwrites eax, ecx, edx and xmm0-xmm4.
ALIGN 16
intel_aes_encrypt_init_128  PROC

    mov     KEY, [esp + 4]
    mov     KS,  [esp + 8]

    ; round key 0 is the key itself
    movdqu  xmm1, [KEY]
    movdqu  [KS], xmm1
    movdqa  xmm2, xmm1

    lea     ITR, Lcon1
    movdqa  xmm0, [ITR]             ; xmm0 = current round constant
    lea     ITR, Lmask
    movdqa  xmm4, [ITR]             ; xmm4 = byte-rotate/broadcast control

    mov     ITR, 8

Lks128_next:
    lea     KS, [16 + KS]

    ; xmm2 = transformed top dword of the previous key, in every lane
    pshufb      xmm2, xmm4
    aesenclast  xmm2, xmm0
    pslld       xmm0, 1             ; double the round constant

    ; lane k of xmm1 becomes the XOR of lanes 0..k of the previous key
    movdqa  xmm3, xmm1
    REPT 3
        pslldq  xmm3, 4
        pxor    xmm1, xmm3
    ENDM
    pxor    xmm1, xmm2
    movdqu  [KS], xmm1
    movdqa  xmm2, xmm1

    dec     ITR
    jnz     Lks128_next

    ; the last two round keys take their constants from Lcon2 (1bh, then 36h)
    lea     ITR, Lcon2
    movdqa  xmm0, [ITR]

    pshufb      xmm2, xmm4
    aesenclast  xmm2, xmm0
    pslld       xmm0, 1
    movdqa  xmm3, xmm1
    REPT 3
        pslldq  xmm3, 4
        pxor    xmm1, xmm3
    ENDM
    pxor    xmm1, xmm2
    movdqu  [16 + KS], xmm1
    movdqa  xmm2, xmm1

    pshufb      xmm2, xmm4
    aesenclast  xmm2, xmm0
    movdqa  xmm3, xmm1
    REPT 3
        pslldq  xmm3, 4
        pxor    xmm1, xmm3
    ENDM
    pxor    xmm1, xmm2
    movdqu  [32 + KS], xmm1
    movdqa  xmm2, xmm1

    ret
intel_aes_encrypt_init_128  ENDP
279
280
; intel_aes_decrypt_init_128
; Builds the round keys for the aesdec-based routines: runs the encryption
; schedule, reverses the order of the 11 round keys, and applies aesimc to
; every key except the first and the last.
;   [esp+4] = pointer to the key, [esp+8] = pointer to the round-key array
ALIGN 16
intel_aes_decrypt_init_128  PROC

    mov     KEY, [esp + 4]
    mov     KS,  [esp + 8]

    ; The pushes are both the stack arguments of the callee and our saved
    ; copies: the callee advances KS and reuses eax/ecx.
    push    KS
    push    KEY
    call    intel_aes_encrypt_init_128
    pop     KEY
    pop     KS

    ; outermost pair is swapped unchanged
    movdqu  xmm0, [0*16 + KS]
    movdqu  xmm1, [10*16 + KS]
    movdqu  [10*16 + KS], xmm0
    movdqu  [0*16 + KS], xmm1

    ; inner pairs: transform both keys and swap them
    rkLo = 1
    rkHi = 9
    WHILE rkLo LT rkHi
        movdqu  xmm0, [rkLo*16 + KS]
        movdqu  xmm1, [rkHi*16 + KS]

        aesimc  xmm0, xmm0
        aesimc  xmm1, xmm1

        movdqu  [rkHi*16 + KS], xmm0
        movdqu  [rkLo*16 + KS], xmm1

        rkLo = rkLo + 1
        rkHi = rkHi - 1
    ENDM

    ; middle key (index 5) is transformed in place
    movdqu  xmm0, [rkLo*16 + KS]
    aesimc  xmm0, xmm0
    movdqu  [rkLo*16 + KS], xmm0
    ret
intel_aes_decrypt_init_128  ENDP
319
320
; intel_aes_encrypt_init_192
; Expands a 24-byte key into 13 round keys (208 bytes, indices 0..12).
;   [esp+4] = pointer to the key, [esp+8] = pointer to the round-key array
; Overwrites eax, ecx, edx and xmm0-xmm7.
;
; Register use inside the loop:
;   xmm1 = newest four schedule words
;   xmm3 = newest two schedule words in its low qword
;   xmm5 = the two words left over from the previous pass, waiting to be
;          paired with the next two to form a full round key
;   xmm0 = round constant, doubled after every use
;
; Each pass emits three round keys (48 bytes); four passes emit keys 1..12.
; The words left in xmm5 after the last pass belong to no round key of the
; 13-key schedule, so they are not stored: writing them would touch the
; 16 bytes at offset 208, just past the end of the 13 round keys.
ALIGN 16
intel_aes_encrypt_init_192  PROC

    mov     KEY, [esp + 1*4 + 0*4]
    mov     KS,  [esp + 1*4 + 1*4]

    ; xmm1 = key bytes 0..15, xmm3 = key bytes 16..23 in the low qword
    pxor    xmm3, xmm3
    movdqu  xmm1, [KEY]
    pinsrd  xmm3, DWORD PTR [16 + KEY], 0
    pinsrd  xmm3, DWORD PTR [20 + KEY], 1

    movdqu  [KS], xmm1              ; round key 0
    movdqa  xmm5, xmm3

    lea ITR, Lcon1
    movdqu  xmm0, [ITR]
    lea ITR, Lmask192
    movdqu  xmm4, [ITR]

    mov ITR, 4

Lenc_192_ks_loop:
        ; ---- first six new words ----
        movdqa  xmm2, xmm3
        pshufb  xmm2, xmm4
        aesenclast xmm2, xmm0
        pslld   xmm0, 1

        movdqa  xmm6, xmm1
        movdqa  xmm7, xmm3
        pslldq  xmm6, 4
        pslldq  xmm7, 4
        pxor    xmm1, xmm6
        pxor    xmm3, xmm7
        pslldq  xmm6, 4
        pxor    xmm1, xmm6
        pslldq  xmm6, 4
        pxor    xmm1, xmm6
        pxor    xmm1, xmm2
        pshufd  xmm2, xmm1, 0ffh    ; broadcast the newest word of xmm1
        pxor    xmm3, xmm2

        ; assemble two round keys from the pending pair and the new words
        movdqa  xmm6, xmm1
        shufpd  xmm5, xmm1, 00h     ; pending pair : low half of xmm1
        shufpd  xmm6, xmm3, 01h     ; high half of xmm1 : low half of xmm3

        movdqu  [16 + KS], xmm5
        movdqu  [32 + KS], xmm6

        ; ---- next six new words ----
        movdqa  xmm2, xmm3
        pshufb  xmm2, xmm4
        aesenclast  xmm2, xmm0
        pslld   xmm0, 1

        movdqa  xmm6, xmm1
        movdqa  xmm7, xmm3
        pslldq  xmm6, 4
        pslldq  xmm7, 4
        pxor    xmm1, xmm6
        pxor    xmm3, xmm7
        pslldq  xmm6, 4
        pxor    xmm1, xmm6
        pslldq  xmm6, 4
        pxor    xmm1, xmm6
        pxor    xmm1, xmm2
        pshufd  xmm2, xmm1, 0ffh
        pxor    xmm3, xmm2

        movdqu  [48 + KS], xmm1     ; a whole round key on its own
        movdqa  xmm5, xmm3          ; low pair stays pending for the next pass

        lea KS, [48 + KS]

        dec ITR
        jnz Lenc_192_ks_loop

    ret
intel_aes_encrypt_init_192  ENDP
399
; intel_aes_decrypt_init_192
; Builds the round keys for the aesdec-based routines: runs the encryption
; schedule, reverses the order of the 13 round keys, and applies aesimc to
; every key except the first and the last.
;   [esp+4] = pointer to the key, [esp+8] = pointer to the round-key array
ALIGN 16
intel_aes_decrypt_init_192  PROC
    mov     KEY, [esp + 4]
    mov     KS,  [esp + 8]

    ; The pushes are both the stack arguments of the callee and our saved
    ; copies: the callee advances KS and reuses eax/ecx.
    push    KS
    push    KEY
    call    intel_aes_encrypt_init_192
    pop     KEY
    pop     KS

    ; outermost pair is swapped unchanged
    movdqu  xmm0, [0*16 + KS]
    movdqu  xmm1, [12*16 + KS]
    movdqu  [12*16 + KS], xmm0
    movdqu  [0*16 + KS], xmm1

    ; inner pairs: transform both keys and swap them
    rkLo = 1
    rkHi = 11
    WHILE rkLo LT rkHi
        movdqu  xmm0, [rkLo*16 + KS]
        movdqu  xmm1, [rkHi*16 + KS]

        aesimc  xmm0, xmm0
        aesimc  xmm1, xmm1

        movdqu  [rkHi*16 + KS], xmm0
        movdqu  [rkLo*16 + KS], xmm1

        rkLo = rkLo + 1
        rkHi = rkHi - 1
    ENDM

    ; middle key (index 6) is transformed in place
    movdqu  xmm0, [rkLo*16 + KS]
    aesimc  xmm0, xmm0
    movdqu  [rkLo*16 + KS], xmm0
    ret
intel_aes_decrypt_init_192  ENDP
437
; intel_aes_encrypt_init_256
; Expands a 32-byte key into 15 round keys (240 bytes).
;   [esp+4] = pointer to the key, [esp+8] = pointer to the round-key array
; Overwrites eax, ecx, edx and xmm0-xmm6.
; xmm1 / xmm3 always hold the two most recent round keys (even / odd index).
ALIGN 16
intel_aes_encrypt_init_256  PROC

    mov     KEY, [esp + 4]
    mov     KS,  [esp + 8]

    ; round keys 0 and 1 are the two halves of the key
    movdqu  xmm1, [16*0 + KEY]
    movdqu  xmm3, [16*1 + KEY]
    movdqu  [16*0 + KS], xmm1
    movdqu  [16*1 + KS], xmm3

    lea     ITR, Lcon1
    movdqu  xmm0, [ITR]             ; xmm0 = current round constant
    lea     ITR, Lmask256
    movdqu  xmm5, [ITR]             ; xmm5 = byte-rotate/broadcast control

    pxor    xmm6, xmm6              ; zero round key for the plain substitution step

    mov     ITR, 6

Lks256_next:
    ; even-index key: rotated top dword of xmm3 through aesenclast with the
    ; round constant, then the running XOR over the lanes of xmm1
    movdqa      xmm2, xmm3
    pshufb      xmm2, xmm5
    aesenclast  xmm2, xmm0
    pslld       xmm0, 1
    movdqa      xmm4, xmm1
    REPT 3
        pslldq  xmm4, 4
        pxor    xmm1, xmm4
    ENDM
    pxor    xmm1, xmm2
    movdqu  [16*2 + KS], xmm1

    ; odd-index key: top dword of the new xmm1 broadcast (no rotation),
    ; through aesenclast with a zero key, then the same running XOR on xmm3
    pshufd      xmm2, xmm1, 0ffh
    aesenclast  xmm2, xmm6
    movdqa      xmm4, xmm3
    REPT 3
        pslldq  xmm4, 4
        pxor    xmm3, xmm4
    ENDM
    pxor    xmm3, xmm2
    movdqu  [16*3 + KS], xmm3

    lea     KS, [32 + KS]
    dec     ITR
    jnz     Lks256_next

    ; one more even-index key completes the schedule (round key 14)
    movdqa      xmm2, xmm3
    pshufb      xmm2, xmm5
    aesenclast  xmm2, xmm0
    movdqa      xmm4, xmm1
    REPT 3
        pslldq  xmm4, 4
        pxor    xmm1, xmm4
    ENDM
    pxor    xmm1, xmm2
    movdqu  [16*2 + KS], xmm1

    ret
intel_aes_encrypt_init_256  ENDP
505
; intel_aes_decrypt_init_256
; Builds the round keys for the aesdec-based routines: runs the encryption
; schedule, reverses the order of the 15 round keys, and applies aesimc to
; every key except the first and the last.
;   [esp+4] = pointer to the key, [esp+8] = pointer to the round-key array
ALIGN 16
intel_aes_decrypt_init_256  PROC
    mov     KEY, [esp + 4]
    mov     KS,  [esp + 8]

    ; The pushes are both the stack arguments of the callee and our saved
    ; copies: the callee advances KS and reuses eax/ecx.
    push    KS
    push    KEY
    call    intel_aes_encrypt_init_256
    pop     KEY
    pop     KS

    ; outermost pair is swapped unchanged
    movdqu  xmm0, [0*16 + KS]
    movdqu  xmm1, [14*16 + KS]
    movdqu  [14*16 + KS], xmm0
    movdqu  [0*16 + KS], xmm1

    ; inner pairs: transform both keys and swap them
    rkLo = 1
    rkHi = 13
    WHILE rkLo LT rkHi
        movdqu  xmm0, [rkLo*16 + KS]
        movdqu  xmm1, [rkHi*16 + KS]

        aesimc  xmm0, xmm0
        aesimc  xmm1, xmm1

        movdqu  [rkHi*16 + KS], xmm0
        movdqu  [rkLo*16 + KS], xmm1

        rkLo = rkLo + 1
        rkHi = rkHi - 1
    ENDM

    ; middle key (index 7) is transformed in place
    movdqu  xmm0, [rkLo*16 + KS]
    aesimc  xmm0, xmm0
    movdqu  [rkLo*16 + KS], xmm0
    ret
intel_aes_decrypt_init_256  ENDP
543
544
545
; gen_aes_cbc_enc_func rnds
; Emits a complete CBC encryption routine body, from prologue through ret.
;   rnds - index of the last round key; keys 0..rnds are read from ctx
; Stack arguments read (dword slots above the return address):
;   slot 0 = round-key base, slot 1 = output, slot 4 = input, slot 5 = length
; The 16-byte chaining value is read from ctx+252 on entry and the last
; ciphertext block is written back there on exit.  Only whole 16-byte blocks
; are processed.  eax is 0 on return.
; edi is saved and restored; ecx, edx and xmm0-xmm7 are overwritten.
gen_aes_cbc_enc_func MACRO rnds
LOCAL   nextBlock
LOCAL   finish

        push    inputLen

        ; offsets skip the saved edi and the return address
        mov     ctx,      [esp + 8]
        mov     output,   [esp + 12]
        mov     input,    [esp + 24]
        mov     inputLen, [esp + 28]

        movdqu  xmm0, [252+ctx]         ; xmm0 = chaining value

        ; keep round keys 0..4 resident in xmm2..xmm6
        keyNo = 0
        FOR     keyReg, <xmm2,xmm3,xmm4,xmm5,xmm6>
            movdqu  keyReg, [keyNo*16 + ctx]
            keyNo = keyNo + 1
        ENDM

nextBlock:
        cmp     inputLen, 1*16
        jb      finish

        ; chain: plaintext, round key 0 and previous ciphertext XORed together
        movdqu  xmm1, [input]
        pxor    xmm1, xmm2
        pxor    xmm0, xmm1

        ; rounds 1..4 from the resident keys
        FOR     keyReg, <xmm3,xmm4,xmm5,xmm6>
            aesenc  xmm0, keyReg
        ENDM

        ; remaining rounds fetch their keys from memory
        i = 5
        WHILE i LT rnds
            movdqu  xmm7, [i*16 + ctx]
            aesenc  xmm0, xmm7
            i = i+1
        ENDM
        movdqu  xmm7, [rnds*16 + ctx]
        aesenclast xmm0, xmm7

        movdqu  [output], xmm0

        lea     input,  [1*16 + input]
        lea     output, [1*16 + output]
        sub     inputLen, 1*16
        jmp     nextBlock

finish:
        movdqu  [252+ctx], xmm0         ; save chaining value for the next call

        xor     eax, eax
        pop     inputLen
        ret

ENDM
603
; gen_aes_cbc_dec_func rnds
; Emits a complete CBC decryption routine body, from prologue through ret.
;   rnds - index of the last round key; keys 0..rnds are read from ctx
; Stack arguments read (dword slots above the return address):
;   slot 0 = round-key base, slot 1 = output, slot 4 = input, slot 5 = length
; The 16-byte chaining value lives at ctx+252; it is replaced by the last
; ciphertext block consumed.  Whole blocks are handled seven at a time, then
; one at a time.  All ciphertext needed for chaining is read before any
; output is stored.  eax is 0 on return.
; edi is saved and restored; ecx, edx and xmm0-xmm7 are overwritten.
gen_aes_cbc_dec_func MACRO rnds
LOCAL   sevenBlocks
LOCAL   oneBlock
LOCAL   tailSetup
LOCAL   finish

        push    inputLen

        ; offsets skip the saved edi and the return address
        mov     ctx,      [esp + 8]
        mov     output,   [esp + 12]
        mov     input,    [esp + 24]
        mov     inputLen, [esp + 28]

sevenBlocks:
        cmp     inputLen, 7*16
        jb      tailSetup

        ; load seven ciphertext blocks
        blkNo = 0
        FOR     blkReg, <xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6>
            movdqu  blkReg, [blkNo*16 + input]
            blkNo = blkNo + 1
        ENDM

        ; initial key addition with round key 0
        movdqu  xmm7, [0*16 + ctx]
        FOR     blkReg, <xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6>
            pxor    blkReg, xmm7
        ENDM

        i = 1
        WHILE i LT rnds
            aes_dec_rnd i
            i = i+1
        ENDM
        aes_dec_last_rnd rnds

        ; undo the chaining: block 0 with the stored value, block k with
        ; ciphertext block k-1
        movdqu  xmm7, [252 + ctx]
        pxor    xmm0, xmm7
        blkNo = 0
        FOR     blkReg, <xmm1,xmm2,xmm3,xmm4,xmm5,xmm6>
            movdqu  xmm7, [blkNo*16 + input]
            pxor    blkReg, xmm7
            blkNo = blkNo + 1
        ENDM
        movdqu  xmm7, [6*16 + input]    ; becomes the new chaining value

        blkNo = 0
        FOR     blkReg, <xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6>
            movdqu  [blkNo*16 + output], blkReg
            blkNo = blkNo + 1
        ENDM
        movdqu  [252 + ctx], xmm7

        lea     input,  [7*16 + input]
        lea     output, [7*16 + output]
        sub     inputLen, 7*16
        jmp     sevenBlocks

tailSetup:
        movdqu  xmm3, [252 + ctx]       ; xmm3 = chaining value for the tail

oneBlock:
        cmp     inputLen, 1*16
        jb      finish

        movdqu  xmm0, [input]
        movdqa  xmm4, xmm0              ; keep the ciphertext for the next block
        movdqu  xmm7, [0*16 + ctx]
        pxor    xmm0, xmm7

        i = 1
        WHILE i LT rnds
            movdqu  xmm7, [i*16 + ctx]
            aesdec  xmm0, xmm7
            i = i+1
        ENDM
        movdqu  xmm7, [rnds*16 + ctx]
        aesdeclast xmm0, xmm7
        pxor    xmm3, xmm0

        movdqu  [output], xmm3
        movdqa  xmm3, xmm4

        lea     input,  [1*16 + input]
        lea     output, [1*16 + output]
        sub     inputLen, 1*16
        jmp     oneBlock

finish:
        movdqu  [252 + ctx], xmm3
        xor     eax, eax
        pop     inputLen
        ret
ENDM
712
; CBC entry points.  Each PROC body is a single generator-macro expansion,
; which supplies everything from the prologue to ret.  The macro argument is
; the last round key index (10, 12 or 14 for the 128-, 192- and 256-bit
; variants).

        ALIGN 16
intel_aes_encrypt_cbc_128 PROC
        gen_aes_cbc_enc_func  10
intel_aes_encrypt_cbc_128 ENDP

        ALIGN 16
intel_aes_encrypt_cbc_192 PROC
        gen_aes_cbc_enc_func  12
intel_aes_encrypt_cbc_192 ENDP

        ALIGN 16
intel_aes_encrypt_cbc_256 PROC
        gen_aes_cbc_enc_func  14
intel_aes_encrypt_cbc_256 ENDP

        ALIGN 16
intel_aes_decrypt_cbc_128 PROC
        gen_aes_cbc_dec_func  10
intel_aes_decrypt_cbc_128 ENDP

        ALIGN 16
intel_aes_decrypt_cbc_192 PROC
        gen_aes_cbc_dec_func  12
intel_aes_decrypt_cbc_192 ENDP

        ALIGN 16
intel_aes_decrypt_cbc_256 PROC
        gen_aes_cbc_dec_func  14
intel_aes_decrypt_cbc_256 ENDP
742
743
744
; Extra register roles for the CTR generator macro.
ctrCtx  TEXTEQU <esi>       ; counter context pointer, then the running 32-bit counter
CTR     TEXTEQU <ebx>       ; scratch for the byte-swapped, key-masked counter word
747
; gen_aes_ctr_func rnds
; Emits a complete CTR-mode routine body, from prologue through ret.
;   rnds - index of the last round key; keys 0..rnds are read from ctx
; Stack arguments read (dword slots above the return address):
;   slot 0 = counter context, slot 1 = output, slot 4 = input, slot 5 = length
; Counter context fields used: dword at +4 = round-key base, 16 bytes at
; +8 = counter block.  Only the last dword of the counter block is advanced;
; it is treated as a big-endian 32-bit value and incremented once per block,
; with no carry into the other 12 bytes.
;
; Seven counter blocks, already XORed with round key 0, are kept in a
; 16-byte-aligned stack area.  The seven-block loop refreshes one slot per
; round while the AES rounds run; the single-block loop consumes slots by
; advancing esp.  On exit the next unused slot is XORed with round key 0
; again and stored back as the counter block.  eax is 0 on return.
; edi, esi, ebx and ebp are saved and restored; ecx, edx and xmm0-xmm7 are
; overwritten.
gen_aes_ctr_func MACRO rnds
LOCAL   sevenBlocks
LOCAL   oneBlock
LOCAL   finish

        push    inputLen
        push    ctrCtx
        push    CTR
        push    ebp

        ; offsets skip the four saved registers and the return address
        mov     ctrCtx,   [esp + 20]
        mov     output,   [esp + 24]
        mov     input,    [esp + 36]
        mov     inputLen, [esp + 40]

        mov     ctx, [4+ctrCtx]

        ; ebp remembers the frame; esp becomes the aligned counter area
        mov     ebp, esp
        sub     esp, 7*16
        and     esp, -16

        movdqu  xmm0, [8+ctrCtx]
        mov     ctrCtx, [ctrCtx + 8 + 3*4]
        bswap   ctrCtx                      ; running counter in native byte order
        movdqu  xmm1, [ctx + 0*16]

        pxor    xmm0, xmm1                  ; counter block with round key 0 folded in

        ; fill all seven slots with the current block
        blkNo = 0
        WHILE blkNo LT 7
            movdqa  [esp + blkNo*16], xmm0
            blkNo = blkNo + 1
        ENDM

        ; slots 1..6 get the following six counter values
        blkNo = 1
        WHILE blkNo LE 6
            inc     ctrCtx
            mov     CTR, ctrCtx
            bswap   CTR
            xor     CTR, [ctx + 3*4]
            mov     [esp + blkNo*16 + 3*4], CTR

            blkNo = blkNo + 1
        ENDM


sevenBlocks:
        cmp     inputLen, 7*16
        jb      oneBlock

        blkNo = 0
        FOR     blkReg, <xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6>
            movdqu  blkReg, [blkNo*16 + esp]
            blkNo = blkNo + 1
        ENDM

        ; rounds 1..7, each followed by the refresh of one counter slot
        i = 1
        WHILE i LE 7
            aes_rnd i

            inc     ctrCtx
            mov     CTR, ctrCtx
            bswap   CTR
            xor     CTR, [ctx + 3*4]
            mov     [esp + (i-1)*16 + 3*4], CTR

            i = i+1
        ENDM
        WHILE i LT rnds
            aes_rnd i
            i = i+1
        ENDM
        aes_last_rnd rnds

        ; XOR the keystream with the input
        blkNo = 0
        FOR     blkReg, <xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6>
            movdqu  xmm7, [blkNo*16 + input]
            pxor    blkReg, xmm7
            blkNo = blkNo + 1
        ENDM

        blkNo = 0
        FOR     blkReg, <xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6>
            movdqu  [blkNo*16 + output], blkReg
            blkNo = blkNo + 1
        ENDM

        lea     input,  [7*16 + input]
        lea     output, [7*16 + output]
        sub     inputLen, 7*16
        jmp     sevenBlocks


oneBlock:
        cmp     inputLen, 1*16
        jb      finish

        ; take the next prepared counter block and drop its slot
        movdqu  xmm0, [esp]
        add     esp, 16

        i = 1
        WHILE i LT rnds
            movdqu  xmm7, [i*16 + ctx]
            aesenc  xmm0, xmm7
            i = i+1
        ENDM
        movdqu  xmm7, [rnds*16 + ctx]
        aesenclast xmm0, xmm7

        movdqu  xmm7, [input]
        pxor    xmm0, xmm7
        movdqu  [output], xmm0

        lea     input,  [1*16 + input]
        lea     output, [1*16 + output]
        sub     inputLen, 1*16
        jmp     oneBlock

finish:
        ; write the next counter block back, with round key 0 removed
        mov     ctrCtx, [ebp + 20]
        movdqu  xmm0, [esp]
        movdqu  xmm1, [ctx + 0*16]
        pxor    xmm0, xmm1
        movdqu  [8+ctrCtx], xmm0


        xor     eax, eax
        mov     esp, ebp
        pop     ebp
        pop     CTR
        pop     ctrCtx
        pop     inputLen
        ret
ENDM
924
925
; CTR entry points.  Each PROC body is a single gen_aes_ctr_func expansion,
; which supplies everything from the prologue to ret.  The macro argument is
; the last round key index (10, 12 or 14 for the 128-, 192- and 256-bit
; variants).

        ALIGN 16
intel_aes_encrypt_ctr_128 PROC
        gen_aes_ctr_func  10
intel_aes_encrypt_ctr_128 ENDP

        ALIGN 16
intel_aes_encrypt_ctr_192 PROC
        gen_aes_ctr_func  12
intel_aes_encrypt_ctr_192 ENDP

        ALIGN 16
intel_aes_encrypt_ctr_256 PROC
        gen_aes_ctr_func  14
intel_aes_encrypt_ctr_256 ENDP
940
941
942END
943