1########################################################################
2# Copyright (c) 2013, Intel Corporation
3#
4# This software is available to you under a choice of one of two
5# licenses.  You may choose to be licensed under the terms of the GNU
6# General Public License (GPL) Version 2, available from the file
7# COPYING in the main directory of this source tree, or the
8# OpenIB.org BSD license below:
9#
10# Redistribution and use in source and binary forms, with or without
11# modification, are permitted provided that the following conditions are
12# met:
13#
14# * Redistributions of source code must retain the above copyright
15#   notice, this list of conditions and the following disclaimer.
16#
17# * Redistributions in binary form must reproduce the above copyright
18#   notice, this list of conditions and the following disclaimer in the
19#   documentation and/or other materials provided with the
20#   distribution.
21#
22# * Neither the name of the Intel Corporation nor the names of its
23#   contributors may be used to endorse or promote products derived from
24#   this software without specific prior written permission.
25#
26#
27# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
28# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
31# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
32# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
33# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES# LOSS OF USE, DATA, OR
34# PROFITS# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38########################################################################
39##
40## Authors:
41##	Erdinc Ozturk <erdinc.ozturk@intel.com>
42##	Vinodh Gopal <vinodh.gopal@intel.com>
43##	James Guilford <james.guilford@intel.com>
44##	Tim Chen <tim.c.chen@linux.intel.com>
45##
46## References:
47##       This code was derived and highly optimized from the code described in paper:
48##               Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation
49##			on Intel Architecture Processors. August, 2010
50##       The details of the implementation is explained in:
51##               Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode
52##			on Intel Architecture Processors. October, 2012.
53##
54## Assumptions:
55##
56##
57##
58## iv:
59##       0                   1                   2                   3
60##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
61##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
62##       |                             Salt  (From the SA)               |
63##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
64##       |                     Initialization Vector                     |
65##       |         (This is the sequence number from IPSec header)       |
66##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
67##       |                              0x1                              |
68##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
69##
70##
71##
72## AAD:
73##       AAD padded to 128 bits with 0
74##       for example, assume AAD is a u32 vector
75##
76##       if AAD is 8 bytes:
77##       AAD[3] = {A0, A1}#
78##       padded AAD in xmm register = {A1 A0 0 0}
79##
80##       0                   1                   2                   3
81##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
82##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
83##       |                               SPI (A1)                        |
84##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
85##       |                     32-bit Sequence Number (A0)               |
86##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
87##       |                              0x0                              |
88##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
89##
90##                                       AAD Format with 32-bit Sequence Number
91##
92##       if AAD is 12 bytes:
93##       AAD[3] = {A0, A1, A2}#
94##       padded AAD in xmm register = {A2 A1 A0 0}
95##
96##       0                   1                   2                   3
97##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
98##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
99##       |                               SPI (A2)                        |
100##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
101##       |                 64-bit Extended Sequence Number {A1,A0}       |
102##       |                                                               |
103##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
104##       |                              0x0                              |
105##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
106##
107##        AAD Format with 64-bit Extended Sequence Number
108##
109##
110## aadLen:
111##       from the definition of the spec, aadLen can only be 8 or 12 bytes.
112##	 The code additionally supports aadLen of length 16 bytes.
113##
114## TLen:
115##       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
116##
117## poly = x^128 + x^127 + x^126 + x^121 + 1
118## throughout the code, one tab and two tab indentations are used. one tab is
119## for GHASH part, two tabs is for AES part.
120##
121
122#include <linux/linkage.h>
123#include <asm/inst.h>
124
125# constants in mergeable sections, linker can reorder and merge
126.section	.rodata.cst16.POLY, "aM", @progbits, 16
127.align 16
128POLY:            .octa     0xC2000000000000000000000000000001
129
130.section	.rodata.cst16.POLY2, "aM", @progbits, 16
131.align 16
132POLY2:           .octa     0xC20000000000000000000001C2000000
133
134.section	.rodata.cst16.TWOONE, "aM", @progbits, 16
135.align 16
136TWOONE:          .octa     0x00000001000000000000000000000001
137
138.section	.rodata.cst16.SHUF_MASK, "aM", @progbits, 16
139.align 16
140SHUF_MASK:       .octa     0x000102030405060708090A0B0C0D0E0F
141
142.section	.rodata.cst16.ONE, "aM", @progbits, 16
143.align 16
144ONE:             .octa     0x00000000000000000000000000000001
145
146.section	.rodata.cst16.ONEf, "aM", @progbits, 16
147.align 16
148ONEf:            .octa     0x01000000000000000000000000000000
149
150# order of these constants should not change.
151# more specifically, ALL_F should follow SHIFT_MASK, and zero should follow ALL_F
152.section	.rodata, "a", @progbits
153.align 16
154SHIFT_MASK:      .octa     0x0f0e0d0c0b0a09080706050403020100
155ALL_F:           .octa     0xffffffffffffffffffffffffffffffff
156                 .octa     0x00000000000000000000000000000000
157
158.section .rodata
159.align 16
160.type aad_shift_arr, @object
161.size aad_shift_arr, 272
162aad_shift_arr:
163        .octa     0xffffffffffffffffffffffffffffffff
164        .octa     0xffffffffffffffffffffffffffffff0C
165        .octa     0xffffffffffffffffffffffffffff0D0C
166        .octa     0xffffffffffffffffffffffffff0E0D0C
167        .octa     0xffffffffffffffffffffffff0F0E0D0C
168        .octa     0xffffffffffffffffffffff0C0B0A0908
169        .octa     0xffffffffffffffffffff0D0C0B0A0908
170        .octa     0xffffffffffffffffff0E0D0C0B0A0908
171        .octa     0xffffffffffffffff0F0E0D0C0B0A0908
172        .octa     0xffffffffffffff0C0B0A090807060504
173        .octa     0xffffffffffff0D0C0B0A090807060504
174        .octa     0xffffffffff0E0D0C0B0A090807060504
175        .octa     0xffffffff0F0E0D0C0B0A090807060504
176        .octa     0xffffff0C0B0A09080706050403020100
177        .octa     0xffff0D0C0B0A09080706050403020100
178        .octa     0xff0E0D0C0B0A09080706050403020100
179        .octa     0x0F0E0D0C0B0A09080706050403020100
180
181
182.text
183
184
185#define AadHash 16*0
186#define AadLen 16*1
187#define InLen (16*1)+8
188#define PBlockEncKey 16*2
189#define OrigIV 16*3
190#define CurCount 16*4
191#define PBlockLen 16*5
192
193HashKey        = 16*6   # store HashKey <<1 mod poly here
194HashKey_2      = 16*7   # store HashKey^2 <<1 mod poly here
195HashKey_3      = 16*8   # store HashKey^3 <<1 mod poly here
196HashKey_4      = 16*9   # store HashKey^4 <<1 mod poly here
197HashKey_5      = 16*10   # store HashKey^5 <<1 mod poly here
198HashKey_6      = 16*11   # store HashKey^6 <<1 mod poly here
199HashKey_7      = 16*12   # store HashKey^7 <<1 mod poly here
200HashKey_8      = 16*13   # store HashKey^8 <<1 mod poly here
201HashKey_k      = 16*14   # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes)
202HashKey_2_k    = 16*15   # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
203HashKey_3_k    = 16*16   # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
204HashKey_4_k    = 16*17   # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
205HashKey_5_k    = 16*18   # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
206HashKey_6_k    = 16*19   # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
207HashKey_7_k    = 16*20   # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
208HashKey_8_k    = 16*21   # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
209
210#define arg1 %rdi
211#define arg2 %rsi
212#define arg3 %rdx
213#define arg4 %rcx
214#define arg5 %r8
215#define arg6 %r9
216#define arg7 STACK_OFFSET+8*1(%r14)
217#define arg8 STACK_OFFSET+8*2(%r14)
218#define arg9 STACK_OFFSET+8*3(%r14)
219#define arg10 STACK_OFFSET+8*4(%r14)
220#define keysize 2*15*16(arg1)
221
222i = 0
223j = 0
224
225out_order = 0
226in_order = 1
227DEC = 0
228ENC = 1
229
230.macro define_reg r n
231reg_\r = %xmm\n
232.endm
233
234.macro setreg
235.altmacro
236define_reg i %i
237define_reg j %j
238.noaltmacro
239.endm
240
241# need to push 4 registers into stack to maintain
242STACK_OFFSET = 8*4
243
244TMP1 =   16*0    # Temporary storage for AAD
245TMP2 =   16*1    # Temporary storage for AES State 2 (State 1 is stored in an XMM register)
246TMP3 =   16*2    # Temporary storage for AES State 3
247TMP4 =   16*3    # Temporary storage for AES State 4
248TMP5 =   16*4    # Temporary storage for AES State 5
249TMP6 =   16*5    # Temporary storage for AES State 6
250TMP7 =   16*6    # Temporary storage for AES State 7
251TMP8 =   16*7    # Temporary storage for AES State 8
252
253VARIABLE_OFFSET = 16*8
254
255################################
256# Utility Macros
257################################
258
259.macro FUNC_SAVE
260        #the number of pushes must equal STACK_OFFSET
261        push    %r12
262        push    %r13
263        push    %r14
264        push    %r15
265
266        mov     %rsp, %r14
267
268
269
270        sub     $VARIABLE_OFFSET, %rsp
271        and     $~63, %rsp                    # align rsp to 64 bytes
272.endm
273
274.macro FUNC_RESTORE
275        mov     %r14, %rsp
276
277        pop     %r15
278        pop     %r14
279        pop     %r13
280        pop     %r12
281.endm
282
283# Encryption of a single block
284.macro ENCRYPT_SINGLE_BLOCK REP XMM0
285                vpxor    (arg1), \XMM0, \XMM0
286               i = 1
287               setreg
288.rep \REP
289                vaesenc  16*i(arg1), \XMM0, \XMM0
290               i = (i+1)
291               setreg
292.endr
293                vaesenclast 16*i(arg1), \XMM0, \XMM0
294.endm
295
296# combined for GCM encrypt and decrypt functions
297# clobbering all xmm registers
298# clobbering r10, r11, r12, r13, r14, r15
299.macro  GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC REP
300        vmovdqu AadHash(arg2), %xmm8
301        vmovdqu  HashKey(arg2), %xmm13      # xmm13 = HashKey
302        add arg5, InLen(arg2)
303
304        # initialize the data pointer offset as zero
305        xor     %r11d, %r11d
306
307        PARTIAL_BLOCK \GHASH_MUL, arg3, arg4, arg5, %r11, %xmm8, \ENC_DEC
308        sub %r11, arg5
309
310        mov     arg5, %r13                  # save the number of bytes of plaintext/ciphertext
311        and     $-16, %r13                  # r13 = r13 - (r13 mod 16)
312
313        mov     %r13, %r12
314        shr     $4, %r12
315        and     $7, %r12
316        jz      _initial_num_blocks_is_0\@
317
318        cmp     $7, %r12
319        je      _initial_num_blocks_is_7\@
320        cmp     $6, %r12
321        je      _initial_num_blocks_is_6\@
322        cmp     $5, %r12
323        je      _initial_num_blocks_is_5\@
324        cmp     $4, %r12
325        je      _initial_num_blocks_is_4\@
326        cmp     $3, %r12
327        je      _initial_num_blocks_is_3\@
328        cmp     $2, %r12
329        je      _initial_num_blocks_is_2\@
330
331        jmp     _initial_num_blocks_is_1\@
332
333_initial_num_blocks_is_7\@:
334        \INITIAL_BLOCKS  \REP, 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
335        sub     $16*7, %r13
336        jmp     _initial_blocks_encrypted\@
337
338_initial_num_blocks_is_6\@:
339        \INITIAL_BLOCKS  \REP, 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
340        sub     $16*6, %r13
341        jmp     _initial_blocks_encrypted\@
342
343_initial_num_blocks_is_5\@:
344        \INITIAL_BLOCKS  \REP, 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
345        sub     $16*5, %r13
346        jmp     _initial_blocks_encrypted\@
347
348_initial_num_blocks_is_4\@:
349        \INITIAL_BLOCKS  \REP, 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
350        sub     $16*4, %r13
351        jmp     _initial_blocks_encrypted\@
352
353_initial_num_blocks_is_3\@:
354        \INITIAL_BLOCKS  \REP, 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
355        sub     $16*3, %r13
356        jmp     _initial_blocks_encrypted\@
357
358_initial_num_blocks_is_2\@:
359        \INITIAL_BLOCKS  \REP, 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
360        sub     $16*2, %r13
361        jmp     _initial_blocks_encrypted\@
362
363_initial_num_blocks_is_1\@:
364        \INITIAL_BLOCKS  \REP, 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
365        sub     $16*1, %r13
366        jmp     _initial_blocks_encrypted\@
367
368_initial_num_blocks_is_0\@:
369        \INITIAL_BLOCKS  \REP, 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
370
371
372_initial_blocks_encrypted\@:
373        cmp     $0, %r13
374        je      _zero_cipher_left\@
375
376        sub     $128, %r13
377        je      _eight_cipher_left\@
378
379
380
381
382        vmovd   %xmm9, %r15d
383        and     $255, %r15d
384        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
385
386
387_encrypt_by_8_new\@:
388        cmp     $(255-8), %r15d
389        jg      _encrypt_by_8\@
390
391
392
393        add     $8, %r15b
394        \GHASH_8_ENCRYPT_8_PARALLEL      \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
395        add     $128, %r11
396        sub     $128, %r13
397        jne     _encrypt_by_8_new\@
398
399        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
400        jmp     _eight_cipher_left\@
401
402_encrypt_by_8\@:
403        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
404        add     $8, %r15b
405        \GHASH_8_ENCRYPT_8_PARALLEL      \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
406        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
407        add     $128, %r11
408        sub     $128, %r13
409        jne     _encrypt_by_8_new\@
410
411        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
412
413
414
415
416_eight_cipher_left\@:
417        \GHASH_LAST_8    %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
418
419
420_zero_cipher_left\@:
421        vmovdqu %xmm14, AadHash(arg2)
422        vmovdqu %xmm9, CurCount(arg2)
423
424        # check for 0 length
425        mov     arg5, %r13
426        and     $15, %r13                            # r13 = (arg5 mod 16)
427
428        je      _multiple_of_16_bytes\@
429
430        # handle the last <16 Byte block separately
431
432        mov %r13, PBlockLen(arg2)
433
434        vpaddd  ONE(%rip), %xmm9, %xmm9              # INCR CNT to get Yn
435        vmovdqu %xmm9, CurCount(arg2)
436        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
437
438        ENCRYPT_SINGLE_BLOCK    \REP, %xmm9                # E(K, Yn)
439        vmovdqu %xmm9, PBlockEncKey(arg2)
440
441        cmp $16, arg5
442        jge _large_enough_update\@
443
444        lea (arg4,%r11,1), %r10
445        mov %r13, %r12
446
447        READ_PARTIAL_BLOCK %r10 %r12 %xmm1
448
449        lea     SHIFT_MASK+16(%rip), %r12
450        sub     %r13, %r12                           # adjust the shuffle mask pointer to be
451						     # able to shift 16-r13 bytes (r13 is the
452	# number of bytes in plaintext mod 16)
453
454        jmp _final_ghash_mul\@
455
456_large_enough_update\@:
457        sub $16, %r11
458        add %r13, %r11
459
460        # receive the last <16 Byte block
461        vmovdqu	(arg4, %r11, 1), %xmm1
462
463        sub	%r13, %r11
464        add	$16, %r11
465
466        lea	SHIFT_MASK+16(%rip), %r12
467        # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
468        # (r13 is the number of bytes in plaintext mod 16)
469        sub	%r13, %r12
470        # get the appropriate shuffle mask
471        vmovdqu	(%r12), %xmm2
472        # shift right 16-r13 bytes
473        vpshufb  %xmm2, %xmm1, %xmm1
474
475_final_ghash_mul\@:
476        .if  \ENC_DEC ==  DEC
477        vmovdqa %xmm1, %xmm2
478        vpxor   %xmm1, %xmm9, %xmm9                  # Plaintext XOR E(K, Yn)
479        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to
480						     # mask out top 16-r13 bytes of xmm9
481        vpand   %xmm1, %xmm9, %xmm9                  # mask out top 16-r13 bytes of xmm9
482        vpand   %xmm1, %xmm2, %xmm2
483        vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
484        vpxor   %xmm2, %xmm14, %xmm14
485
486        vmovdqu %xmm14, AadHash(arg2)
487        .else
488        vpxor   %xmm1, %xmm9, %xmm9                  # Plaintext XOR E(K, Yn)
489        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to
490						     # mask out top 16-r13 bytes of xmm9
491        vpand   %xmm1, %xmm9, %xmm9                  # mask out top 16-r13 bytes of xmm9
492        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
493        vpxor   %xmm9, %xmm14, %xmm14
494
495        vmovdqu %xmm14, AadHash(arg2)
496        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9        # shuffle xmm9 back to output as ciphertext
497        .endif
498
499
500        #############################
501        # output r13 Bytes
502        vmovq   %xmm9, %rax
503        cmp     $8, %r13
504        jle     _less_than_8_bytes_left\@
505
506        mov     %rax, (arg3 , %r11)
507        add     $8, %r11
508        vpsrldq $8, %xmm9, %xmm9
509        vmovq   %xmm9, %rax
510        sub     $8, %r13
511
512_less_than_8_bytes_left\@:
513        movb    %al, (arg3 , %r11)
514        add     $1, %r11
515        shr     $8, %rax
516        sub     $1, %r13
517        jne     _less_than_8_bytes_left\@
518        #############################
519
520_multiple_of_16_bytes\@:
521.endm
522
523
524# GCM_COMPLETE Finishes update of tag of last partial block
525# Output: Authorization Tag (AUTH_TAG)
526# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
527.macro GCM_COMPLETE GHASH_MUL REP AUTH_TAG AUTH_TAG_LEN
528        vmovdqu AadHash(arg2), %xmm14
529        vmovdqu HashKey(arg2), %xmm13
530
531        mov PBlockLen(arg2), %r12
532        cmp $0, %r12
533        je _partial_done\@
534
535	#GHASH computation for the last <16 Byte block
536        \GHASH_MUL       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
537
538_partial_done\@:
539        mov AadLen(arg2), %r12                          # r12 = aadLen (number of bytes)
540        shl     $3, %r12                             # convert into number of bits
541        vmovd   %r12d, %xmm15                        # len(A) in xmm15
542
543        mov InLen(arg2), %r12
544        shl     $3, %r12                        # len(C) in bits  (*128)
545        vmovq   %r12, %xmm1
546        vpslldq $8, %xmm15, %xmm15                   # xmm15 = len(A)|| 0x0000000000000000
547        vpxor   %xmm1, %xmm15, %xmm15                # xmm15 = len(A)||len(C)
548
549        vpxor   %xmm15, %xmm14, %xmm14
550        \GHASH_MUL       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6    # final GHASH computation
551        vpshufb SHUF_MASK(%rip), %xmm14, %xmm14      # perform a 16Byte swap
552
553        vmovdqu OrigIV(arg2), %xmm9
554
555        ENCRYPT_SINGLE_BLOCK    \REP, %xmm9                # E(K, Y0)
556
557        vpxor   %xmm14, %xmm9, %xmm9
558
559
560
561_return_T\@:
562        mov     \AUTH_TAG, %r10              # r10 = authTag
563        mov     \AUTH_TAG_LEN, %r11              # r11 = auth_tag_len
564
565        cmp     $16, %r11
566        je      _T_16\@
567
568        cmp     $8, %r11
569        jl      _T_4\@
570
571_T_8\@:
572        vmovq   %xmm9, %rax
573        mov     %rax, (%r10)
574        add     $8, %r10
575        sub     $8, %r11
576        vpsrldq $8, %xmm9, %xmm9
577        cmp     $0, %r11
578        je     _return_T_done\@
579_T_4\@:
580        vmovd   %xmm9, %eax
581        mov     %eax, (%r10)
582        add     $4, %r10
583        sub     $4, %r11
584        vpsrldq     $4, %xmm9, %xmm9
585        cmp     $0, %r11
586        je     _return_T_done\@
587_T_123\@:
588        vmovd     %xmm9, %eax
589        cmp     $2, %r11
590        jl     _T_1\@
591        mov     %ax, (%r10)
592        cmp     $2, %r11
593        je     _return_T_done\@
594        add     $2, %r10
595        sar     $16, %eax
596_T_1\@:
597        mov     %al, (%r10)
598        jmp     _return_T_done\@
599
600_T_16\@:
601        vmovdqu %xmm9, (%r10)
602
603_return_T_done\@:
604.endm
605
606.macro CALC_AAD_HASH GHASH_MUL AAD AADLEN T1 T2 T3 T4 T5 T6 T7 T8
607
608	mov     \AAD, %r10                      # r10 = AAD
609	mov     \AADLEN, %r12                      # r12 = aadLen
610
611
612	mov     %r12, %r11
613
614	vpxor   \T8, \T8, \T8
615	vpxor   \T7, \T7, \T7
616	cmp     $16, %r11
617	jl      _get_AAD_rest8\@
618_get_AAD_blocks\@:
619	vmovdqu (%r10), \T7
620	vpshufb SHUF_MASK(%rip), \T7, \T7
621	vpxor   \T7, \T8, \T8
622	\GHASH_MUL       \T8, \T2, \T1, \T3, \T4, \T5, \T6
623	add     $16, %r10
624	sub     $16, %r12
625	sub     $16, %r11
626	cmp     $16, %r11
627	jge     _get_AAD_blocks\@
628	vmovdqu \T8, \T7
629	cmp     $0, %r11
630	je      _get_AAD_done\@
631
632	vpxor   \T7, \T7, \T7
633
634	/* read the last <16B of AAD. since we have at least 4B of
635	data right after the AAD (the ICV, and maybe some CT), we can
636	read 4B/8B blocks safely, and then get rid of the extra stuff */
637_get_AAD_rest8\@:
638	cmp     $4, %r11
639	jle     _get_AAD_rest4\@
640	movq    (%r10), \T1
641	add     $8, %r10
642	sub     $8, %r11
643	vpslldq $8, \T1, \T1
644	vpsrldq $8, \T7, \T7
645	vpxor   \T1, \T7, \T7
646	jmp     _get_AAD_rest8\@
647_get_AAD_rest4\@:
648	cmp     $0, %r11
649	jle      _get_AAD_rest0\@
650	mov     (%r10), %eax
651	movq    %rax, \T1
652	add     $4, %r10
653	sub     $4, %r11
654	vpslldq $12, \T1, \T1
655	vpsrldq $4, \T7, \T7
656	vpxor   \T1, \T7, \T7
657_get_AAD_rest0\@:
658	/* finalize: shift out the extra bytes we read, and align
659	left. since pslldq can only shift by an immediate, we use
660	vpshufb and an array of shuffle masks */
661	movq    %r12, %r11
662	salq    $4, %r11
663	vmovdqu  aad_shift_arr(%r11), \T1
664	vpshufb \T1, \T7, \T7
665_get_AAD_rest_final\@:
666	vpshufb SHUF_MASK(%rip), \T7, \T7
667	vpxor   \T8, \T7, \T7
668	\GHASH_MUL       \T7, \T2, \T1, \T3, \T4, \T5, \T6
669
670_get_AAD_done\@:
671        vmovdqu \T7, AadHash(arg2)
672.endm
673
674.macro INIT GHASH_MUL PRECOMPUTE
675        mov arg6, %r11
676        mov %r11, AadLen(arg2) # ctx_data.aad_length = aad_length
677        xor %r11d, %r11d
678        mov %r11, InLen(arg2) # ctx_data.in_length = 0
679
680        mov %r11, PBlockLen(arg2) # ctx_data.partial_block_length = 0
681        mov %r11, PBlockEncKey(arg2) # ctx_data.partial_block_enc_key = 0
682        mov arg3, %rax
683        movdqu (%rax), %xmm0
684        movdqu %xmm0, OrigIV(arg2) # ctx_data.orig_IV = iv
685
686        vpshufb SHUF_MASK(%rip), %xmm0, %xmm0
687        movdqu %xmm0, CurCount(arg2) # ctx_data.current_counter = iv
688
689        vmovdqu  (arg4), %xmm6              # xmm6 = HashKey
690
691        vpshufb  SHUF_MASK(%rip), %xmm6, %xmm6
692        ###############  PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
693        vmovdqa  %xmm6, %xmm2
694        vpsllq   $1, %xmm6, %xmm6
695        vpsrlq   $63, %xmm2, %xmm2
696        vmovdqa  %xmm2, %xmm1
697        vpslldq  $8, %xmm2, %xmm2
698        vpsrldq  $8, %xmm1, %xmm1
699        vpor     %xmm2, %xmm6, %xmm6
700        #reduction
701        vpshufd  $0b00100100, %xmm1, %xmm2
702        vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
703        vpand    POLY(%rip), %xmm2, %xmm2
704        vpxor    %xmm2, %xmm6, %xmm6        # xmm6 holds the HashKey<<1 mod poly
705        #######################################################################
706        vmovdqu  %xmm6, HashKey(arg2)       # store HashKey<<1 mod poly
707
708        CALC_AAD_HASH \GHASH_MUL, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0
709
710        \PRECOMPUTE  %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
711.endm
712
713
714# Reads DLEN bytes starting at DPTR and stores in XMMDst
715# where 0 < DLEN < 16
716# Clobbers %rax, DLEN
717.macro READ_PARTIAL_BLOCK DPTR DLEN XMMDst
718        vpxor \XMMDst, \XMMDst, \XMMDst
719
720        cmp $8, \DLEN
721        jl _read_lt8_\@
722        mov (\DPTR), %rax
723        vpinsrq $0, %rax, \XMMDst, \XMMDst
724        sub $8, \DLEN
725        jz _done_read_partial_block_\@
726        xor %eax, %eax
727_read_next_byte_\@:
728        shl $8, %rax
729        mov 7(\DPTR, \DLEN, 1), %al
730        dec \DLEN
731        jnz _read_next_byte_\@
732        vpinsrq $1, %rax, \XMMDst, \XMMDst
733        jmp _done_read_partial_block_\@
734_read_lt8_\@:
735        xor %eax, %eax
736_read_next_byte_lt8_\@:
737        shl $8, %rax
738        mov -1(\DPTR, \DLEN, 1), %al
739        dec \DLEN
740        jnz _read_next_byte_lt8_\@
741        vpinsrq $0, %rax, \XMMDst, \XMMDst
742_done_read_partial_block_\@:
743.endm
744
745# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
746# between update calls.
747# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
748# Outputs encrypted bytes, and updates hash and partial info in gcm_data_context
749# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
750.macro PARTIAL_BLOCK GHASH_MUL CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
751        AAD_HASH ENC_DEC
752        mov 	PBlockLen(arg2), %r13
753        cmp	$0, %r13
754        je	_partial_block_done_\@	# Leave Macro if no partial blocks
755        # Read in input data without over reading
756        cmp	$16, \PLAIN_CYPH_LEN
757        jl	_fewer_than_16_bytes_\@
758        vmovdqu	(\PLAIN_CYPH_IN), %xmm1	# If more than 16 bytes, just fill xmm
759        jmp	_data_read_\@
760
761_fewer_than_16_bytes_\@:
762        lea	(\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
763        mov	\PLAIN_CYPH_LEN, %r12
764        READ_PARTIAL_BLOCK %r10 %r12 %xmm1
765
766        mov PBlockLen(arg2), %r13
767
768_data_read_\@:				# Finished reading in data
769
770        vmovdqu	PBlockEncKey(arg2), %xmm9
771        vmovdqu	HashKey(arg2), %xmm13
772
773        lea	SHIFT_MASK(%rip), %r12
774
775        # adjust the shuffle mask pointer to be able to shift r13 bytes
776        # r16-r13 is the number of bytes in plaintext mod 16)
777        add	%r13, %r12
778        vmovdqu	(%r12), %xmm2		# get the appropriate shuffle mask
779        vpshufb %xmm2, %xmm9, %xmm9		# shift right r13 bytes
780
781.if  \ENC_DEC ==  DEC
782        vmovdqa	%xmm1, %xmm3
783        pxor	%xmm1, %xmm9		# Cyphertext XOR E(K, Yn)
784
785        mov	\PLAIN_CYPH_LEN, %r10
786        add	%r13, %r10
787        # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
788        sub	$16, %r10
789        # Determine if if partial block is not being filled and
790        # shift mask accordingly
791        jge	_no_extra_mask_1_\@
792        sub	%r10, %r12
793_no_extra_mask_1_\@:
794
795        vmovdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
796        # get the appropriate mask to mask out bottom r13 bytes of xmm9
797        vpand	%xmm1, %xmm9, %xmm9		# mask out bottom r13 bytes of xmm9
798
799        vpand	%xmm1, %xmm3, %xmm3
800        vmovdqa	SHUF_MASK(%rip), %xmm10
801        vpshufb	%xmm10, %xmm3, %xmm3
802        vpshufb	%xmm2, %xmm3, %xmm3
803        vpxor	%xmm3, \AAD_HASH, \AAD_HASH
804
805        cmp	$0, %r10
806        jl	_partial_incomplete_1_\@
807
808        # GHASH computation for the last <16 Byte block
809        \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
810        xor	%eax,%eax
811
812        mov	%rax, PBlockLen(arg2)
813        jmp	_dec_done_\@
814_partial_incomplete_1_\@:
815        add	\PLAIN_CYPH_LEN, PBlockLen(arg2)
816_dec_done_\@:
817        vmovdqu	\AAD_HASH, AadHash(arg2)
818.else
819        vpxor	%xmm1, %xmm9, %xmm9			# Plaintext XOR E(K, Yn)
820
821        mov	\PLAIN_CYPH_LEN, %r10
822        add	%r13, %r10
823        # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
824        sub	$16, %r10
825        # Determine if if partial block is not being filled and
826        # shift mask accordingly
827        jge	_no_extra_mask_2_\@
828        sub	%r10, %r12
829_no_extra_mask_2_\@:
830
831        vmovdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
832        # get the appropriate mask to mask out bottom r13 bytes of xmm9
833        vpand	%xmm1, %xmm9, %xmm9
834
835        vmovdqa	SHUF_MASK(%rip), %xmm1
836        vpshufb %xmm1, %xmm9, %xmm9
837        vpshufb %xmm2, %xmm9, %xmm9
838        vpxor	%xmm9, \AAD_HASH, \AAD_HASH
839
840        cmp	$0, %r10
841        jl	_partial_incomplete_2_\@
842
843        # GHASH computation for the last <16 Byte block
844        \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
845        xor	%eax,%eax
846
847        mov	%rax, PBlockLen(arg2)
848        jmp	_encode_done_\@
849_partial_incomplete_2_\@:
850        add	\PLAIN_CYPH_LEN, PBlockLen(arg2)
851_encode_done_\@:
852        vmovdqu	\AAD_HASH, AadHash(arg2)
853
854        vmovdqa	SHUF_MASK(%rip), %xmm10
855        # shuffle xmm9 back to output as ciphertext
856        vpshufb	%xmm10, %xmm9, %xmm9
857        vpshufb	%xmm2, %xmm9, %xmm9
858.endif
859        # output encrypted Bytes
860        cmp	$0, %r10
861        jl	_partial_fill_\@
862        mov	%r13, %r12
863        mov	$16, %r13
864        # Set r13 to be the number of bytes to write out
865        sub	%r12, %r13
866        jmp	_count_set_\@
867_partial_fill_\@:
868        mov	\PLAIN_CYPH_LEN, %r13
869_count_set_\@:
870        vmovdqa	%xmm9, %xmm0
871        vmovq	%xmm0, %rax
872        cmp	$8, %r13
873        jle	_less_than_8_bytes_left_\@
874
875        mov	%rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
876        add	$8, \DATA_OFFSET
877        psrldq	$8, %xmm0
878        vmovq	%xmm0, %rax
879        sub	$8, %r13
880_less_than_8_bytes_left_\@:
881        movb	%al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
882        add	$1, \DATA_OFFSET
883        shr	$8, %rax
884        sub	$1, %r13
885        jne	_less_than_8_bytes_left_\@
886_partial_block_done_\@:
887.endm # PARTIAL_BLOCK
888
889#ifdef CONFIG_AS_AVX
890###############################################################################
891# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
892# Input: A and B (128-bits each, bit-reflected)
893# Output: C = A*B*x mod poly, (i.e. >>1 )
894# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
895# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
896###############################################################################
897.macro  GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5
898
899        vpshufd         $0b01001110, \GH, \T2
900        vpshufd         $0b01001110, \HK, \T3
901        vpxor           \GH     , \T2, \T2      # T2 = (a1+a0)
902        vpxor           \HK     , \T3, \T3      # T3 = (b1+b0)
903
904        vpclmulqdq      $0x11, \HK, \GH, \T1    # T1 = a1*b1
905        vpclmulqdq      $0x00, \HK, \GH, \GH    # GH = a0*b0
906        vpclmulqdq      $0x00, \T3, \T2, \T2    # T2 = (a1+a0)*(b1+b0)
907        vpxor           \GH, \T2,\T2
908        vpxor           \T1, \T2,\T2            # T2 = a0*b1+a1*b0
909
910        vpslldq         $8, \T2,\T3             # shift-L T3 2 DWs
911        vpsrldq         $8, \T2,\T2             # shift-R T2 2 DWs
912        vpxor           \T3, \GH, \GH
913        vpxor           \T2, \T1, \T1           # <T1:GH> = GH x HK
914
915        #first phase of the reduction
916        vpslld  $31, \GH, \T2                   # packed right shifting << 31
917        vpslld  $30, \GH, \T3                   # packed right shifting shift << 30
918        vpslld  $25, \GH, \T4                   # packed right shifting shift << 25
919
920        vpxor   \T3, \T2, \T2                   # xor the shifted versions
921        vpxor   \T4, \T2, \T2
922
923        vpsrldq $4, \T2, \T5                    # shift-R T5 1 DW
924
925        vpslldq $12, \T2, \T2                   # shift-L T2 3 DWs
926        vpxor   \T2, \GH, \GH                   # first phase of the reduction complete
927
928        #second phase of the reduction
929
930        vpsrld  $1,\GH, \T2                     # packed left shifting >> 1
931        vpsrld  $2,\GH, \T3                     # packed left shifting >> 2
932        vpsrld  $7,\GH, \T4                     # packed left shifting >> 7
933        vpxor   \T3, \T2, \T2                   # xor the shifted versions
934        vpxor   \T4, \T2, \T2
935
936        vpxor   \T5, \T2, \T2
937        vpxor   \T2, \GH, \GH
938        vpxor   \T1, \GH, \GH                   # the result is in GH
939
940
941.endm
942
943.macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6
944
945        # Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
946        vmovdqa  \HK, \T5
947
948        vpshufd  $0b01001110, \T5, \T1
949        vpxor    \T5, \T1, \T1
950        vmovdqu  \T1, HashKey_k(arg2)
951
952        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^2<<1 mod poly
953        vmovdqu  \T5, HashKey_2(arg2)                    #  [HashKey_2] = HashKey^2<<1 mod poly
954        vpshufd  $0b01001110, \T5, \T1
955        vpxor    \T5, \T1, \T1
956        vmovdqu  \T1, HashKey_2_k(arg2)
957
958        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^3<<1 mod poly
959        vmovdqu  \T5, HashKey_3(arg2)
960        vpshufd  $0b01001110, \T5, \T1
961        vpxor    \T5, \T1, \T1
962        vmovdqu  \T1, HashKey_3_k(arg2)
963
964        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^4<<1 mod poly
965        vmovdqu  \T5, HashKey_4(arg2)
966        vpshufd  $0b01001110, \T5, \T1
967        vpxor    \T5, \T1, \T1
968        vmovdqu  \T1, HashKey_4_k(arg2)
969
970        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^5<<1 mod poly
971        vmovdqu  \T5, HashKey_5(arg2)
972        vpshufd  $0b01001110, \T5, \T1
973        vpxor    \T5, \T1, \T1
974        vmovdqu  \T1, HashKey_5_k(arg2)
975
976        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^6<<1 mod poly
977        vmovdqu  \T5, HashKey_6(arg2)
978        vpshufd  $0b01001110, \T5, \T1
979        vpxor    \T5, \T1, \T1
980        vmovdqu  \T1, HashKey_6_k(arg2)
981
982        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^7<<1 mod poly
983        vmovdqu  \T5, HashKey_7(arg2)
984        vpshufd  $0b01001110, \T5, \T1
985        vpxor    \T5, \T1, \T1
986        vmovdqu  \T1, HashKey_7_k(arg2)
987
988        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^8<<1 mod poly
989        vmovdqu  \T5, HashKey_8(arg2)
990        vpshufd  $0b01001110, \T5, \T1
991        vpxor    \T5, \T1, \T1
992        vmovdqu  \T1, HashKey_8_k(arg2)
993
994.endm
995
996## if a = number of total plaintext bytes
997## b = floor(a/16)
998## num_initial_blocks = b mod 4#
999## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
1000## r10, r11, r12, rax are clobbered
1001## arg1, arg3, arg4, r14 are used as a pointer only, not modified
1002
1003.macro INITIAL_BLOCKS_AVX REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
1004	i = (8-\num_initial_blocks)
1005	setreg
1006        vmovdqu AadHash(arg2), reg_i
1007
1008	# start AES for num_initial_blocks blocks
1009	vmovdqu CurCount(arg2), \CTR
1010
1011	i = (9-\num_initial_blocks)
1012	setreg
1013.rep \num_initial_blocks
1014                vpaddd  ONE(%rip), \CTR, \CTR		# INCR Y0
1015                vmovdqa \CTR, reg_i
1016                vpshufb SHUF_MASK(%rip), reg_i, reg_i   # perform a 16Byte swap
1017	i = (i+1)
1018	setreg
1019.endr
1020
1021	vmovdqa  (arg1), \T_key
1022	i = (9-\num_initial_blocks)
1023	setreg
1024.rep \num_initial_blocks
1025                vpxor   \T_key, reg_i, reg_i
1026	i = (i+1)
1027	setreg
1028.endr
1029
1030       j = 1
1031       setreg
1032.rep \REP
1033       vmovdqa  16*j(arg1), \T_key
1034	i = (9-\num_initial_blocks)
1035	setreg
1036.rep \num_initial_blocks
1037        vaesenc \T_key, reg_i, reg_i
1038	i = (i+1)
1039	setreg
1040.endr
1041
1042       j = (j+1)
1043       setreg
1044.endr
1045
1046	vmovdqa  16*j(arg1), \T_key
1047	i = (9-\num_initial_blocks)
1048	setreg
1049.rep \num_initial_blocks
1050        vaesenclast      \T_key, reg_i, reg_i
1051	i = (i+1)
1052	setreg
1053.endr
1054
1055	i = (9-\num_initial_blocks)
1056	setreg
1057.rep \num_initial_blocks
1058                vmovdqu (arg4, %r11), \T1
1059                vpxor   \T1, reg_i, reg_i
1060                vmovdqu reg_i, (arg3 , %r11)           # write back ciphertext for num_initial_blocks blocks
1061                add     $16, %r11
1062.if  \ENC_DEC == DEC
1063                vmovdqa \T1, reg_i
1064.endif
1065                vpshufb SHUF_MASK(%rip), reg_i, reg_i  # prepare ciphertext for GHASH computations
1066	i = (i+1)
1067	setreg
1068.endr
1069
1070
1071	i = (8-\num_initial_blocks)
1072	j = (9-\num_initial_blocks)
1073	setreg
1074
1075.rep \num_initial_blocks
1076        vpxor    reg_i, reg_j, reg_j
1077        GHASH_MUL_AVX       reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
1078	i = (i+1)
1079	j = (j+1)
1080	setreg
1081.endr
1082        # XMM8 has the combined result here
1083
1084        vmovdqa  \XMM8, TMP1(%rsp)
1085        vmovdqa  \XMM8, \T3
1086
1087        cmp     $128, %r13
1088        jl      _initial_blocks_done\@                  # no need for precomputed constants
1089
1090###############################################################################
1091# Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
1092                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1093                vmovdqa  \CTR, \XMM1
1094                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1  # perform a 16Byte swap
1095
1096                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1097                vmovdqa  \CTR, \XMM2
1098                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2  # perform a 16Byte swap
1099
1100                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1101                vmovdqa  \CTR, \XMM3
1102                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3  # perform a 16Byte swap
1103
1104                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1105                vmovdqa  \CTR, \XMM4
1106                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4  # perform a 16Byte swap
1107
1108                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1109                vmovdqa  \CTR, \XMM5
1110                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5  # perform a 16Byte swap
1111
1112                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1113                vmovdqa  \CTR, \XMM6
1114                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6  # perform a 16Byte swap
1115
1116                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1117                vmovdqa  \CTR, \XMM7
1118                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7  # perform a 16Byte swap
1119
1120                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1121                vmovdqa  \CTR, \XMM8
1122                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8  # perform a 16Byte swap
1123
1124                vmovdqa  (arg1), \T_key
1125                vpxor    \T_key, \XMM1, \XMM1
1126                vpxor    \T_key, \XMM2, \XMM2
1127                vpxor    \T_key, \XMM3, \XMM3
1128                vpxor    \T_key, \XMM4, \XMM4
1129                vpxor    \T_key, \XMM5, \XMM5
1130                vpxor    \T_key, \XMM6, \XMM6
1131                vpxor    \T_key, \XMM7, \XMM7
1132                vpxor    \T_key, \XMM8, \XMM8
1133
1134               i = 1
1135               setreg
1136.rep    \REP       # do REP rounds
1137                vmovdqa  16*i(arg1), \T_key
1138                vaesenc  \T_key, \XMM1, \XMM1
1139                vaesenc  \T_key, \XMM2, \XMM2
1140                vaesenc  \T_key, \XMM3, \XMM3
1141                vaesenc  \T_key, \XMM4, \XMM4
1142                vaesenc  \T_key, \XMM5, \XMM5
1143                vaesenc  \T_key, \XMM6, \XMM6
1144                vaesenc  \T_key, \XMM7, \XMM7
1145                vaesenc  \T_key, \XMM8, \XMM8
1146               i = (i+1)
1147               setreg
1148.endr
1149
1150                vmovdqa  16*i(arg1), \T_key
1151                vaesenclast  \T_key, \XMM1, \XMM1
1152                vaesenclast  \T_key, \XMM2, \XMM2
1153                vaesenclast  \T_key, \XMM3, \XMM3
1154                vaesenclast  \T_key, \XMM4, \XMM4
1155                vaesenclast  \T_key, \XMM5, \XMM5
1156                vaesenclast  \T_key, \XMM6, \XMM6
1157                vaesenclast  \T_key, \XMM7, \XMM7
1158                vaesenclast  \T_key, \XMM8, \XMM8
1159
1160                vmovdqu  (arg4, %r11), \T1
1161                vpxor    \T1, \XMM1, \XMM1
1162                vmovdqu  \XMM1, (arg3 , %r11)
1163                .if   \ENC_DEC == DEC
1164                vmovdqa  \T1, \XMM1
1165                .endif
1166
1167                vmovdqu  16*1(arg4, %r11), \T1
1168                vpxor    \T1, \XMM2, \XMM2
1169                vmovdqu  \XMM2, 16*1(arg3 , %r11)
1170                .if   \ENC_DEC == DEC
1171                vmovdqa  \T1, \XMM2
1172                .endif
1173
1174                vmovdqu  16*2(arg4, %r11), \T1
1175                vpxor    \T1, \XMM3, \XMM3
1176                vmovdqu  \XMM3, 16*2(arg3 , %r11)
1177                .if   \ENC_DEC == DEC
1178                vmovdqa  \T1, \XMM3
1179                .endif
1180
1181                vmovdqu  16*3(arg4, %r11), \T1
1182                vpxor    \T1, \XMM4, \XMM4
1183                vmovdqu  \XMM4, 16*3(arg3 , %r11)
1184                .if   \ENC_DEC == DEC
1185                vmovdqa  \T1, \XMM4
1186                .endif
1187
1188                vmovdqu  16*4(arg4, %r11), \T1
1189                vpxor    \T1, \XMM5, \XMM5
1190                vmovdqu  \XMM5, 16*4(arg3 , %r11)
1191                .if   \ENC_DEC == DEC
1192                vmovdqa  \T1, \XMM5
1193                .endif
1194
1195                vmovdqu  16*5(arg4, %r11), \T1
1196                vpxor    \T1, \XMM6, \XMM6
1197                vmovdqu  \XMM6, 16*5(arg3 , %r11)
1198                .if   \ENC_DEC == DEC
1199                vmovdqa  \T1, \XMM6
1200                .endif
1201
1202                vmovdqu  16*6(arg4, %r11), \T1
1203                vpxor    \T1, \XMM7, \XMM7
1204                vmovdqu  \XMM7, 16*6(arg3 , %r11)
1205                .if   \ENC_DEC == DEC
1206                vmovdqa  \T1, \XMM7
1207                .endif
1208
1209                vmovdqu  16*7(arg4, %r11), \T1
1210                vpxor    \T1, \XMM8, \XMM8
1211                vmovdqu  \XMM8, 16*7(arg3 , %r11)
1212                .if   \ENC_DEC == DEC
1213                vmovdqa  \T1, \XMM8
1214                .endif
1215
1216                add     $128, %r11
1217
1218                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1     # perform a 16Byte swap
1219                vpxor    TMP1(%rsp), \XMM1, \XMM1          # combine GHASHed value with the corresponding ciphertext
1220                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2     # perform a 16Byte swap
1221                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3     # perform a 16Byte swap
1222                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4     # perform a 16Byte swap
1223                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5     # perform a 16Byte swap
1224                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6     # perform a 16Byte swap
1225                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7     # perform a 16Byte swap
1226                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8     # perform a 16Byte swap
1227
1228###############################################################################
1229
1230_initial_blocks_done\@:
1231
1232.endm
1233
1234# encrypt 8 blocks at a time
1235# ghash the 8 previously encrypted ciphertext blocks
1236# arg1, arg3, arg4 are used as pointers only, not modified
1237# r11 is the data offset value
1238.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
1239
1240        vmovdqa \XMM1, \T2
1241        vmovdqa \XMM2, TMP2(%rsp)
1242        vmovdqa \XMM3, TMP3(%rsp)
1243        vmovdqa \XMM4, TMP4(%rsp)
1244        vmovdqa \XMM5, TMP5(%rsp)
1245        vmovdqa \XMM6, TMP6(%rsp)
1246        vmovdqa \XMM7, TMP7(%rsp)
1247        vmovdqa \XMM8, TMP8(%rsp)
1248
1249.if \loop_idx == in_order
1250                vpaddd  ONE(%rip), \CTR, \XMM1           # INCR CNT
1251                vpaddd  ONE(%rip), \XMM1, \XMM2
1252                vpaddd  ONE(%rip), \XMM2, \XMM3
1253                vpaddd  ONE(%rip), \XMM3, \XMM4
1254                vpaddd  ONE(%rip), \XMM4, \XMM5
1255                vpaddd  ONE(%rip), \XMM5, \XMM6
1256                vpaddd  ONE(%rip), \XMM6, \XMM7
1257                vpaddd  ONE(%rip), \XMM7, \XMM8
1258                vmovdqa \XMM8, \CTR
1259
1260                vpshufb SHUF_MASK(%rip), \XMM1, \XMM1    # perform a 16Byte swap
1261                vpshufb SHUF_MASK(%rip), \XMM2, \XMM2    # perform a 16Byte swap
1262                vpshufb SHUF_MASK(%rip), \XMM3, \XMM3    # perform a 16Byte swap
1263                vpshufb SHUF_MASK(%rip), \XMM4, \XMM4    # perform a 16Byte swap
1264                vpshufb SHUF_MASK(%rip), \XMM5, \XMM5    # perform a 16Byte swap
1265                vpshufb SHUF_MASK(%rip), \XMM6, \XMM6    # perform a 16Byte swap
1266                vpshufb SHUF_MASK(%rip), \XMM7, \XMM7    # perform a 16Byte swap
1267                vpshufb SHUF_MASK(%rip), \XMM8, \XMM8    # perform a 16Byte swap
1268.else
1269                vpaddd  ONEf(%rip), \CTR, \XMM1           # INCR CNT
1270                vpaddd  ONEf(%rip), \XMM1, \XMM2
1271                vpaddd  ONEf(%rip), \XMM2, \XMM3
1272                vpaddd  ONEf(%rip), \XMM3, \XMM4
1273                vpaddd  ONEf(%rip), \XMM4, \XMM5
1274                vpaddd  ONEf(%rip), \XMM5, \XMM6
1275                vpaddd  ONEf(%rip), \XMM6, \XMM7
1276                vpaddd  ONEf(%rip), \XMM7, \XMM8
1277                vmovdqa \XMM8, \CTR
1278.endif
1279
1280
1281        #######################################################################
1282
1283                vmovdqu (arg1), \T1
1284                vpxor   \T1, \XMM1, \XMM1
1285                vpxor   \T1, \XMM2, \XMM2
1286                vpxor   \T1, \XMM3, \XMM3
1287                vpxor   \T1, \XMM4, \XMM4
1288                vpxor   \T1, \XMM5, \XMM5
1289                vpxor   \T1, \XMM6, \XMM6
1290                vpxor   \T1, \XMM7, \XMM7
1291                vpxor   \T1, \XMM8, \XMM8
1292
1293        #######################################################################
1294
1295
1296
1297
1298
1299                vmovdqu 16*1(arg1), \T1
1300                vaesenc \T1, \XMM1, \XMM1
1301                vaesenc \T1, \XMM2, \XMM2
1302                vaesenc \T1, \XMM3, \XMM3
1303                vaesenc \T1, \XMM4, \XMM4
1304                vaesenc \T1, \XMM5, \XMM5
1305                vaesenc \T1, \XMM6, \XMM6
1306                vaesenc \T1, \XMM7, \XMM7
1307                vaesenc \T1, \XMM8, \XMM8
1308
1309                vmovdqu 16*2(arg1), \T1
1310                vaesenc \T1, \XMM1, \XMM1
1311                vaesenc \T1, \XMM2, \XMM2
1312                vaesenc \T1, \XMM3, \XMM3
1313                vaesenc \T1, \XMM4, \XMM4
1314                vaesenc \T1, \XMM5, \XMM5
1315                vaesenc \T1, \XMM6, \XMM6
1316                vaesenc \T1, \XMM7, \XMM7
1317                vaesenc \T1, \XMM8, \XMM8
1318
1319
1320        #######################################################################
1321
1322        vmovdqu         HashKey_8(arg2), \T5
1323        vpclmulqdq      $0x11, \T5, \T2, \T4             # T4 = a1*b1
1324        vpclmulqdq      $0x00, \T5, \T2, \T7             # T7 = a0*b0
1325
1326        vpshufd         $0b01001110, \T2, \T6
1327        vpxor           \T2, \T6, \T6
1328
1329        vmovdqu         HashKey_8_k(arg2), \T5
1330        vpclmulqdq      $0x00, \T5, \T6, \T6
1331
1332                vmovdqu 16*3(arg1), \T1
1333                vaesenc \T1, \XMM1, \XMM1
1334                vaesenc \T1, \XMM2, \XMM2
1335                vaesenc \T1, \XMM3, \XMM3
1336                vaesenc \T1, \XMM4, \XMM4
1337                vaesenc \T1, \XMM5, \XMM5
1338                vaesenc \T1, \XMM6, \XMM6
1339                vaesenc \T1, \XMM7, \XMM7
1340                vaesenc \T1, \XMM8, \XMM8
1341
1342        vmovdqa         TMP2(%rsp), \T1
1343        vmovdqu         HashKey_7(arg2), \T5
1344        vpclmulqdq      $0x11, \T5, \T1, \T3
1345        vpxor           \T3, \T4, \T4
1346        vpclmulqdq      $0x00, \T5, \T1, \T3
1347        vpxor           \T3, \T7, \T7
1348
1349        vpshufd         $0b01001110, \T1, \T3
1350        vpxor           \T1, \T3, \T3
1351        vmovdqu         HashKey_7_k(arg2), \T5
1352        vpclmulqdq      $0x10, \T5, \T3, \T3
1353        vpxor           \T3, \T6, \T6
1354
1355                vmovdqu 16*4(arg1), \T1
1356                vaesenc \T1, \XMM1, \XMM1
1357                vaesenc \T1, \XMM2, \XMM2
1358                vaesenc \T1, \XMM3, \XMM3
1359                vaesenc \T1, \XMM4, \XMM4
1360                vaesenc \T1, \XMM5, \XMM5
1361                vaesenc \T1, \XMM6, \XMM6
1362                vaesenc \T1, \XMM7, \XMM7
1363                vaesenc \T1, \XMM8, \XMM8
1364
1365        #######################################################################
1366
1367        vmovdqa         TMP3(%rsp), \T1
1368        vmovdqu         HashKey_6(arg2), \T5
1369        vpclmulqdq      $0x11, \T5, \T1, \T3
1370        vpxor           \T3, \T4, \T4
1371        vpclmulqdq      $0x00, \T5, \T1, \T3
1372        vpxor           \T3, \T7, \T7
1373
1374        vpshufd         $0b01001110, \T1, \T3
1375        vpxor           \T1, \T3, \T3
1376        vmovdqu         HashKey_6_k(arg2), \T5
1377        vpclmulqdq      $0x10, \T5, \T3, \T3
1378        vpxor           \T3, \T6, \T6
1379
1380                vmovdqu 16*5(arg1), \T1
1381                vaesenc \T1, \XMM1, \XMM1
1382                vaesenc \T1, \XMM2, \XMM2
1383                vaesenc \T1, \XMM3, \XMM3
1384                vaesenc \T1, \XMM4, \XMM4
1385                vaesenc \T1, \XMM5, \XMM5
1386                vaesenc \T1, \XMM6, \XMM6
1387                vaesenc \T1, \XMM7, \XMM7
1388                vaesenc \T1, \XMM8, \XMM8
1389
1390        vmovdqa         TMP4(%rsp), \T1
1391        vmovdqu         HashKey_5(arg2), \T5
1392        vpclmulqdq      $0x11, \T5, \T1, \T3
1393        vpxor           \T3, \T4, \T4
1394        vpclmulqdq      $0x00, \T5, \T1, \T3
1395        vpxor           \T3, \T7, \T7
1396
1397        vpshufd         $0b01001110, \T1, \T3
1398        vpxor           \T1, \T3, \T3
1399        vmovdqu         HashKey_5_k(arg2), \T5
1400        vpclmulqdq      $0x10, \T5, \T3, \T3
1401        vpxor           \T3, \T6, \T6
1402
1403                vmovdqu 16*6(arg1), \T1
1404                vaesenc \T1, \XMM1, \XMM1
1405                vaesenc \T1, \XMM2, \XMM2
1406                vaesenc \T1, \XMM3, \XMM3
1407                vaesenc \T1, \XMM4, \XMM4
1408                vaesenc \T1, \XMM5, \XMM5
1409                vaesenc \T1, \XMM6, \XMM6
1410                vaesenc \T1, \XMM7, \XMM7
1411                vaesenc \T1, \XMM8, \XMM8
1412
1413
1414        vmovdqa         TMP5(%rsp), \T1
1415        vmovdqu         HashKey_4(arg2), \T5
1416        vpclmulqdq      $0x11, \T5, \T1, \T3
1417        vpxor           \T3, \T4, \T4
1418        vpclmulqdq      $0x00, \T5, \T1, \T3
1419        vpxor           \T3, \T7, \T7
1420
1421        vpshufd         $0b01001110, \T1, \T3
1422        vpxor           \T1, \T3, \T3
1423        vmovdqu         HashKey_4_k(arg2), \T5
1424        vpclmulqdq      $0x10, \T5, \T3, \T3
1425        vpxor           \T3, \T6, \T6
1426
1427                vmovdqu 16*7(arg1), \T1
1428                vaesenc \T1, \XMM1, \XMM1
1429                vaesenc \T1, \XMM2, \XMM2
1430                vaesenc \T1, \XMM3, \XMM3
1431                vaesenc \T1, \XMM4, \XMM4
1432                vaesenc \T1, \XMM5, \XMM5
1433                vaesenc \T1, \XMM6, \XMM6
1434                vaesenc \T1, \XMM7, \XMM7
1435                vaesenc \T1, \XMM8, \XMM8
1436
1437        vmovdqa         TMP6(%rsp), \T1
1438        vmovdqu         HashKey_3(arg2), \T5
1439        vpclmulqdq      $0x11, \T5, \T1, \T3
1440        vpxor           \T3, \T4, \T4
1441        vpclmulqdq      $0x00, \T5, \T1, \T3
1442        vpxor           \T3, \T7, \T7
1443
1444        vpshufd         $0b01001110, \T1, \T3
1445        vpxor           \T1, \T3, \T3
1446        vmovdqu         HashKey_3_k(arg2), \T5
1447        vpclmulqdq      $0x10, \T5, \T3, \T3
1448        vpxor           \T3, \T6, \T6
1449
1450
1451                vmovdqu 16*8(arg1), \T1
1452                vaesenc \T1, \XMM1, \XMM1
1453                vaesenc \T1, \XMM2, \XMM2
1454                vaesenc \T1, \XMM3, \XMM3
1455                vaesenc \T1, \XMM4, \XMM4
1456                vaesenc \T1, \XMM5, \XMM5
1457                vaesenc \T1, \XMM6, \XMM6
1458                vaesenc \T1, \XMM7, \XMM7
1459                vaesenc \T1, \XMM8, \XMM8
1460
1461        vmovdqa         TMP7(%rsp), \T1
1462        vmovdqu         HashKey_2(arg2), \T5
1463        vpclmulqdq      $0x11, \T5, \T1, \T3
1464        vpxor           \T3, \T4, \T4
1465        vpclmulqdq      $0x00, \T5, \T1, \T3
1466        vpxor           \T3, \T7, \T7
1467
1468        vpshufd         $0b01001110, \T1, \T3
1469        vpxor           \T1, \T3, \T3
1470        vmovdqu         HashKey_2_k(arg2), \T5
1471        vpclmulqdq      $0x10, \T5, \T3, \T3
1472        vpxor           \T3, \T6, \T6
1473
1474        #######################################################################
1475
1476                vmovdqu 16*9(arg1), \T5
1477                vaesenc \T5, \XMM1, \XMM1
1478                vaesenc \T5, \XMM2, \XMM2
1479                vaesenc \T5, \XMM3, \XMM3
1480                vaesenc \T5, \XMM4, \XMM4
1481                vaesenc \T5, \XMM5, \XMM5
1482                vaesenc \T5, \XMM6, \XMM6
1483                vaesenc \T5, \XMM7, \XMM7
1484                vaesenc \T5, \XMM8, \XMM8
1485
1486        vmovdqa         TMP8(%rsp), \T1
1487        vmovdqu         HashKey(arg2), \T5
1488        vpclmulqdq      $0x11, \T5, \T1, \T3
1489        vpxor           \T3, \T4, \T4
1490        vpclmulqdq      $0x00, \T5, \T1, \T3
1491        vpxor           \T3, \T7, \T7
1492
1493        vpshufd         $0b01001110, \T1, \T3
1494        vpxor           \T1, \T3, \T3
1495        vmovdqu         HashKey_k(arg2), \T5
1496        vpclmulqdq      $0x10, \T5, \T3, \T3
1497        vpxor           \T3, \T6, \T6
1498
1499        vpxor           \T4, \T6, \T6
1500        vpxor           \T7, \T6, \T6
1501
1502                vmovdqu 16*10(arg1), \T5
1503
1504        i = 11
1505        setreg
1506.rep (\REP-9)
1507
1508        vaesenc \T5, \XMM1, \XMM1
1509        vaesenc \T5, \XMM2, \XMM2
1510        vaesenc \T5, \XMM3, \XMM3
1511        vaesenc \T5, \XMM4, \XMM4
1512        vaesenc \T5, \XMM5, \XMM5
1513        vaesenc \T5, \XMM6, \XMM6
1514        vaesenc \T5, \XMM7, \XMM7
1515        vaesenc \T5, \XMM8, \XMM8
1516
1517        vmovdqu 16*i(arg1), \T5
1518        i = i + 1
1519        setreg
1520.endr
1521
1522	i = 0
1523	j = 1
1524	setreg
1525.rep 8
1526		vpxor	16*i(arg4, %r11), \T5, \T2
1527                .if \ENC_DEC == ENC
1528                vaesenclast     \T2, reg_j, reg_j
1529                .else
1530                vaesenclast     \T2, reg_j, \T3
1531                vmovdqu 16*i(arg4, %r11), reg_j
1532                vmovdqu \T3, 16*i(arg3, %r11)
1533                .endif
1534	i = (i+1)
1535	j = (j+1)
1536	setreg
1537.endr
1538	#######################################################################
1539
1540
1541	vpslldq	$8, \T6, \T3				# shift-L T3 2 DWs
1542	vpsrldq	$8, \T6, \T6				# shift-R T2 2 DWs
1543	vpxor	\T3, \T7, \T7
1544	vpxor	\T4, \T6, \T6				# accumulate the results in T6:T7
1545
1546
1547
1548	#######################################################################
1549	#first phase of the reduction
1550	#######################################################################
1551        vpslld  $31, \T7, \T2                           # packed right shifting << 31
1552        vpslld  $30, \T7, \T3                           # packed right shifting shift << 30
1553        vpslld  $25, \T7, \T4                           # packed right shifting shift << 25
1554
1555        vpxor   \T3, \T2, \T2                           # xor the shifted versions
1556        vpxor   \T4, \T2, \T2
1557
1558        vpsrldq $4, \T2, \T1                            # shift-R T1 1 DW
1559
1560        vpslldq $12, \T2, \T2                           # shift-L T2 3 DWs
1561        vpxor   \T2, \T7, \T7                           # first phase of the reduction complete
1562	#######################################################################
1563                .if \ENC_DEC == ENC
1564		vmovdqu	 \XMM1,	16*0(arg3,%r11)		# Write to the Ciphertext buffer
1565		vmovdqu	 \XMM2,	16*1(arg3,%r11)		# Write to the Ciphertext buffer
1566		vmovdqu	 \XMM3,	16*2(arg3,%r11)		# Write to the Ciphertext buffer
1567		vmovdqu	 \XMM4,	16*3(arg3,%r11)		# Write to the Ciphertext buffer
1568		vmovdqu	 \XMM5,	16*4(arg3,%r11)		# Write to the Ciphertext buffer
1569		vmovdqu	 \XMM6,	16*5(arg3,%r11)		# Write to the Ciphertext buffer
1570		vmovdqu	 \XMM7,	16*6(arg3,%r11)		# Write to the Ciphertext buffer
1571		vmovdqu	 \XMM8,	16*7(arg3,%r11)		# Write to the Ciphertext buffer
1572                .endif
1573
1574	#######################################################################
1575	#second phase of the reduction
1576        vpsrld  $1, \T7, \T2                            # packed left shifting >> 1
1577        vpsrld  $2, \T7, \T3                            # packed left shifting >> 2
1578        vpsrld  $7, \T7, \T4                            # packed left shifting >> 7
1579        vpxor   \T3, \T2, \T2                           # xor the shifted versions
1580        vpxor   \T4, \T2, \T2
1581
1582        vpxor   \T1, \T2, \T2
1583        vpxor   \T2, \T7, \T7
1584        vpxor   \T7, \T6, \T6                           # the result is in T6
1585	#######################################################################
1586
1587		vpshufb	SHUF_MASK(%rip), \XMM1, \XMM1	# perform a 16Byte swap
1588		vpshufb	SHUF_MASK(%rip), \XMM2, \XMM2	# perform a 16Byte swap
1589		vpshufb	SHUF_MASK(%rip), \XMM3, \XMM3	# perform a 16Byte swap
1590		vpshufb	SHUF_MASK(%rip), \XMM4, \XMM4	# perform a 16Byte swap
1591		vpshufb	SHUF_MASK(%rip), \XMM5, \XMM5	# perform a 16Byte swap
1592		vpshufb	SHUF_MASK(%rip), \XMM6, \XMM6	# perform a 16Byte swap
1593		vpshufb	SHUF_MASK(%rip), \XMM7, \XMM7	# perform a 16Byte swap
1594		vpshufb	SHUF_MASK(%rip), \XMM8, \XMM8	# perform a 16Byte swap
1595
1596
1597	vpxor	\T6, \XMM1, \XMM1
1598
1599
1600
1601.endm
1602
1603
1604# GHASH the last 4 ciphertext blocks.
1605.macro  GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
1606
1607        ## Karatsuba Method
1608
1609
1610        vpshufd         $0b01001110, \XMM1, \T2
1611        vpxor           \XMM1, \T2, \T2
1612        vmovdqu         HashKey_8(arg2), \T5
1613        vpclmulqdq      $0x11, \T5, \XMM1, \T6
1614        vpclmulqdq      $0x00, \T5, \XMM1, \T7
1615
1616        vmovdqu         HashKey_8_k(arg2), \T3
1617        vpclmulqdq      $0x00, \T3, \T2, \XMM1
1618
1619        ######################
1620
1621        vpshufd         $0b01001110, \XMM2, \T2
1622        vpxor           \XMM2, \T2, \T2
1623        vmovdqu         HashKey_7(arg2), \T5
1624        vpclmulqdq      $0x11, \T5, \XMM2, \T4
1625        vpxor           \T4, \T6, \T6
1626
1627        vpclmulqdq      $0x00, \T5, \XMM2, \T4
1628        vpxor           \T4, \T7, \T7
1629
1630        vmovdqu         HashKey_7_k(arg2), \T3
1631        vpclmulqdq      $0x00, \T3, \T2, \T2
1632        vpxor           \T2, \XMM1, \XMM1
1633
1634        ######################
1635
1636        vpshufd         $0b01001110, \XMM3, \T2
1637        vpxor           \XMM3, \T2, \T2
1638        vmovdqu         HashKey_6(arg2), \T5
1639        vpclmulqdq      $0x11, \T5, \XMM3, \T4
1640        vpxor           \T4, \T6, \T6
1641
1642        vpclmulqdq      $0x00, \T5, \XMM3, \T4
1643        vpxor           \T4, \T7, \T7
1644
1645        vmovdqu         HashKey_6_k(arg2), \T3
1646        vpclmulqdq      $0x00, \T3, \T2, \T2
1647        vpxor           \T2, \XMM1, \XMM1
1648
1649        ######################
1650
1651        vpshufd         $0b01001110, \XMM4, \T2
1652        vpxor           \XMM4, \T2, \T2
1653        vmovdqu         HashKey_5(arg2), \T5
1654        vpclmulqdq      $0x11, \T5, \XMM4, \T4
1655        vpxor           \T4, \T6, \T6
1656
1657        vpclmulqdq      $0x00, \T5, \XMM4, \T4
1658        vpxor           \T4, \T7, \T7
1659
1660        vmovdqu         HashKey_5_k(arg2), \T3
1661        vpclmulqdq      $0x00, \T3, \T2, \T2
1662        vpxor           \T2, \XMM1, \XMM1
1663
1664        ######################
1665
1666        vpshufd         $0b01001110, \XMM5, \T2
1667        vpxor           \XMM5, \T2, \T2
1668        vmovdqu         HashKey_4(arg2), \T5
1669        vpclmulqdq      $0x11, \T5, \XMM5, \T4
1670        vpxor           \T4, \T6, \T6
1671
1672        vpclmulqdq      $0x00, \T5, \XMM5, \T4
1673        vpxor           \T4, \T7, \T7
1674
1675        vmovdqu         HashKey_4_k(arg2), \T3
1676        vpclmulqdq      $0x00, \T3, \T2, \T2
1677        vpxor           \T2, \XMM1, \XMM1
1678
1679        ######################
1680
1681        vpshufd         $0b01001110, \XMM6, \T2
1682        vpxor           \XMM6, \T2, \T2
1683        vmovdqu         HashKey_3(arg2), \T5
1684        vpclmulqdq      $0x11, \T5, \XMM6, \T4
1685        vpxor           \T4, \T6, \T6
1686
1687        vpclmulqdq      $0x00, \T5, \XMM6, \T4
1688        vpxor           \T4, \T7, \T7
1689
1690        vmovdqu         HashKey_3_k(arg2), \T3
1691        vpclmulqdq      $0x00, \T3, \T2, \T2
1692        vpxor           \T2, \XMM1, \XMM1
1693
1694        ######################
1695
1696        vpshufd         $0b01001110, \XMM7, \T2
1697        vpxor           \XMM7, \T2, \T2
1698        vmovdqu         HashKey_2(arg2), \T5
1699        vpclmulqdq      $0x11, \T5, \XMM7, \T4
1700        vpxor           \T4, \T6, \T6
1701
1702        vpclmulqdq      $0x00, \T5, \XMM7, \T4
1703        vpxor           \T4, \T7, \T7
1704
1705        vmovdqu         HashKey_2_k(arg2), \T3
1706        vpclmulqdq      $0x00, \T3, \T2, \T2
1707        vpxor           \T2, \XMM1, \XMM1
1708
1709        ######################
1710
1711        vpshufd         $0b01001110, \XMM8, \T2
1712        vpxor           \XMM8, \T2, \T2
1713        vmovdqu         HashKey(arg2), \T5
1714        vpclmulqdq      $0x11, \T5, \XMM8, \T4
1715        vpxor           \T4, \T6, \T6
1716
1717        vpclmulqdq      $0x00, \T5, \XMM8, \T4
1718        vpxor           \T4, \T7, \T7
1719
1720        vmovdqu         HashKey_k(arg2), \T3
1721        vpclmulqdq      $0x00, \T3, \T2, \T2
1722
1723        vpxor           \T2, \XMM1, \XMM1
1724        vpxor           \T6, \XMM1, \XMM1
1725        vpxor           \T7, \XMM1, \T2
1726
1727
1728
1729
1730        vpslldq $8, \T2, \T4
1731        vpsrldq $8, \T2, \T2
1732
1733        vpxor   \T4, \T7, \T7
1734        vpxor   \T2, \T6, \T6   # <T6:T7> holds the result of
1735				# the accumulated carry-less multiplications
1736
1737        #######################################################################
1738        #first phase of the reduction
1739        vpslld  $31, \T7, \T2   # packed right shifting << 31
1740        vpslld  $30, \T7, \T3   # packed right shifting shift << 30
1741        vpslld  $25, \T7, \T4   # packed right shifting shift << 25
1742
1743        vpxor   \T3, \T2, \T2   # xor the shifted versions
1744        vpxor   \T4, \T2, \T2
1745
1746        vpsrldq $4, \T2, \T1    # shift-R T1 1 DW
1747
1748        vpslldq $12, \T2, \T2   # shift-L T2 3 DWs
1749        vpxor   \T2, \T7, \T7   # first phase of the reduction complete
1750        #######################################################################
1751
1752
1753        #second phase of the reduction
1754        vpsrld  $1, \T7, \T2    # packed left shifting >> 1
1755        vpsrld  $2, \T7, \T3    # packed left shifting >> 2
1756        vpsrld  $7, \T7, \T4    # packed left shifting >> 7
1757        vpxor   \T3, \T2, \T2   # xor the shifted versions
1758        vpxor   \T4, \T2, \T2
1759
1760        vpxor   \T1, \T2, \T2
1761        vpxor   \T2, \T7, \T7
1762        vpxor   \T7, \T6, \T6   # the result is in T6
1763
1764.endm
1765
1766#############################################################
1767#void   aesni_gcm_precomp_avx_gen2
1768#        (gcm_data     *my_ctx_data,
1769#         gcm_context_data *data,
1770#        u8     *hash_subkey# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
1771#        u8      *iv, /* Pre-counter block j0: 4 byte salt
1772#			(from Security Association) concatenated with 8 byte
1773#			Initialisation Vector (from IPSec ESP Payload)
1774#			concatenated with 0x00000001. 16-byte aligned pointer. */
1775#        const   u8 *aad, /* Additional Authentication Data (AAD)*/
1776#        u64     aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
1777#############################################################
1778SYM_FUNC_START(aesni_gcm_init_avx_gen2)
1779        FUNC_SAVE
1780        INIT GHASH_MUL_AVX, PRECOMPUTE_AVX
1781        FUNC_RESTORE
1782        ret
1783SYM_FUNC_END(aesni_gcm_init_avx_gen2)
1784
1785###############################################################################
1786#void   aesni_gcm_enc_update_avx_gen2(
1787#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
1788#        gcm_context_data *data,
1789#        u8      *out, /* Ciphertext output. Encrypt in-place is allowed.  */
1790#        const   u8 *in, /* Plaintext input */
1791#        u64     plaintext_len) /* Length of data in Bytes for encryption. */
1792###############################################################################
1793SYM_FUNC_START(aesni_gcm_enc_update_avx_gen2)
1794        FUNC_SAVE
1795        mov     keysize, %eax
1796        cmp     $32, %eax
1797        je      key_256_enc_update
1798        cmp     $16, %eax
1799        je      key_128_enc_update
1800        # must be 192
1801        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 11
1802        FUNC_RESTORE
1803        ret
1804key_128_enc_update:
1805        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 9
1806        FUNC_RESTORE
1807        ret
1808key_256_enc_update:
1809        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 13
1810        FUNC_RESTORE
1811        ret
1812SYM_FUNC_END(aesni_gcm_enc_update_avx_gen2)
1813
1814###############################################################################
1815#void   aesni_gcm_dec_update_avx_gen2(
1816#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
1817#        gcm_context_data *data,
1818#        u8      *out, /* Plaintext output. Decrypt in-place is allowed.  */
1819#        const   u8 *in, /* Ciphertext input */
1820#        u64     plaintext_len) /* Length of data in Bytes for encryption. */
1821###############################################################################
1822SYM_FUNC_START(aesni_gcm_dec_update_avx_gen2)
1823        FUNC_SAVE
1824        mov     keysize,%eax
1825        cmp     $32, %eax
1826        je      key_256_dec_update
1827        cmp     $16, %eax
1828        je      key_128_dec_update
1829        # must be 192
1830        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 11
1831        FUNC_RESTORE
1832        ret
1833key_128_dec_update:
1834        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 9
1835        FUNC_RESTORE
1836        ret
1837key_256_dec_update:
1838        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 13
1839        FUNC_RESTORE
1840        ret
1841SYM_FUNC_END(aesni_gcm_dec_update_avx_gen2)
1842
1843###############################################################################
1844#void   aesni_gcm_finalize_avx_gen2(
1845#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
1846#        gcm_context_data *data,
1847#        u8      *auth_tag, /* Authenticated Tag output. */
1848#        u64     auth_tag_len)# /* Authenticated Tag Length in bytes.
1849#				Valid values are 16 (most likely), 12 or 8. */
1850###############################################################################
1851SYM_FUNC_START(aesni_gcm_finalize_avx_gen2)
1852        FUNC_SAVE
1853        mov	keysize,%eax
1854        cmp     $32, %eax
1855        je      key_256_finalize
1856        cmp     $16, %eax
1857        je      key_128_finalize
1858        # must be 192
1859        GCM_COMPLETE GHASH_MUL_AVX, 11, arg3, arg4
1860        FUNC_RESTORE
1861        ret
1862key_128_finalize:
1863        GCM_COMPLETE GHASH_MUL_AVX, 9, arg3, arg4
1864        FUNC_RESTORE
1865        ret
1866key_256_finalize:
1867        GCM_COMPLETE GHASH_MUL_AVX, 13, arg3, arg4
1868        FUNC_RESTORE
1869        ret
1870SYM_FUNC_END(aesni_gcm_finalize_avx_gen2)
1871
1872#endif /* CONFIG_AS_AVX */
1873
1874#ifdef CONFIG_AS_AVX2
1875###############################################################################
1876# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
1877# Input: A and B (128-bits each, bit-reflected)
1878# Output: C = A*B*x mod poly, (i.e. >>1 )
1879# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
1880# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
1881###############################################################################
1882.macro  GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5
1883
1884        vpclmulqdq      $0x11,\HK,\GH,\T1      # T1 = a1*b1
1885        vpclmulqdq      $0x00,\HK,\GH,\T2      # T2 = a0*b0
1886        vpclmulqdq      $0x01,\HK,\GH,\T3      # T3 = a1*b0
1887        vpclmulqdq      $0x10,\HK,\GH,\GH      # GH = a0*b1
1888        vpxor           \T3, \GH, \GH
1889
1890
1891        vpsrldq         $8 , \GH, \T3          # shift-R GH 2 DWs
1892        vpslldq         $8 , \GH, \GH          # shift-L GH 2 DWs
1893
1894        vpxor           \T3, \T1, \T1
1895        vpxor           \T2, \GH, \GH
1896
1897        #######################################################################
1898        #first phase of the reduction
1899        vmovdqa         POLY2(%rip), \T3
1900
1901        vpclmulqdq      $0x01, \GH, \T3, \T2
1902        vpslldq         $8, \T2, \T2           # shift-L T2 2 DWs
1903
1904        vpxor           \T2, \GH, \GH          # first phase of the reduction complete
1905        #######################################################################
1906        #second phase of the reduction
1907        vpclmulqdq      $0x00, \GH, \T3, \T2
1908        vpsrldq         $4, \T2, \T2           # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
1909
1910        vpclmulqdq      $0x10, \GH, \T3, \GH
1911        vpslldq         $4, \GH, \GH           # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts)
1912
1913        vpxor           \T2, \GH, \GH          # second phase of the reduction complete
1914        #######################################################################
1915        vpxor           \T1, \GH, \GH          # the result is in GH
1916
1917
1918.endm
1919
1920.macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6
1921
1922        # Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
1923        vmovdqa  \HK, \T5
1924        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^2<<1 mod poly
1925        vmovdqu  \T5, HashKey_2(arg2)                       #  [HashKey_2] = HashKey^2<<1 mod poly
1926
1927        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^3<<1 mod poly
1928        vmovdqu  \T5, HashKey_3(arg2)
1929
1930        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^4<<1 mod poly
1931        vmovdqu  \T5, HashKey_4(arg2)
1932
1933        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^5<<1 mod poly
1934        vmovdqu  \T5, HashKey_5(arg2)
1935
1936        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^6<<1 mod poly
1937        vmovdqu  \T5, HashKey_6(arg2)
1938
1939        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^7<<1 mod poly
1940        vmovdqu  \T5, HashKey_7(arg2)
1941
1942        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^8<<1 mod poly
1943        vmovdqu  \T5, HashKey_8(arg2)
1944
1945.endm
1946
1947## if a = number of total plaintext bytes
1948## b = floor(a/16)
1949## num_initial_blocks = b mod 4#
1950## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
1951## r10, r11, r12, rax are clobbered
1952## arg1, arg3, arg4, r14 are used as a pointer only, not modified
1953
1954.macro INITIAL_BLOCKS_AVX2 REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
1955	i = (8-\num_initial_blocks)
1956	setreg
1957	vmovdqu AadHash(arg2), reg_i
1958
1959	# start AES for num_initial_blocks blocks
1960	vmovdqu CurCount(arg2), \CTR
1961
1962	i = (9-\num_initial_blocks)
1963	setreg
1964.rep \num_initial_blocks
1965                vpaddd  ONE(%rip), \CTR, \CTR   # INCR Y0
1966                vmovdqa \CTR, reg_i
1967                vpshufb SHUF_MASK(%rip), reg_i, reg_i     # perform a 16Byte swap
1968	i = (i+1)
1969	setreg
1970.endr
1971
1972	vmovdqa  (arg1), \T_key
1973	i = (9-\num_initial_blocks)
1974	setreg
1975.rep \num_initial_blocks
1976                vpxor   \T_key, reg_i, reg_i
1977	i = (i+1)
1978	setreg
1979.endr
1980
1981	j = 1
1982	setreg
1983.rep \REP
1984	vmovdqa  16*j(arg1), \T_key
1985	i = (9-\num_initial_blocks)
1986	setreg
1987.rep \num_initial_blocks
1988        vaesenc \T_key, reg_i, reg_i
1989	i = (i+1)
1990	setreg
1991.endr
1992
1993	j = (j+1)
1994	setreg
1995.endr
1996
1997
1998	vmovdqa  16*j(arg1), \T_key
1999	i = (9-\num_initial_blocks)
2000	setreg
2001.rep \num_initial_blocks
2002        vaesenclast      \T_key, reg_i, reg_i
2003	i = (i+1)
2004	setreg
2005.endr
2006
2007	i = (9-\num_initial_blocks)
2008	setreg
2009.rep \num_initial_blocks
2010                vmovdqu (arg4, %r11), \T1
2011                vpxor   \T1, reg_i, reg_i
2012                vmovdqu reg_i, (arg3 , %r11)           # write back ciphertext for
2013						       # num_initial_blocks blocks
2014                add     $16, %r11
2015.if  \ENC_DEC == DEC
2016                vmovdqa \T1, reg_i
2017.endif
2018                vpshufb SHUF_MASK(%rip), reg_i, reg_i  # prepare ciphertext for GHASH computations
2019	i = (i+1)
2020	setreg
2021.endr
2022
2023
2024	i = (8-\num_initial_blocks)
2025	j = (9-\num_initial_blocks)
2026	setreg
2027
2028.rep \num_initial_blocks
2029        vpxor    reg_i, reg_j, reg_j
2030        GHASH_MUL_AVX2       reg_j, \T2, \T1, \T3, \T4, \T5, \T6  # apply GHASH on num_initial_blocks blocks
2031	i = (i+1)
2032	j = (j+1)
2033	setreg
2034.endr
2035        # XMM8 has the combined result here
2036
2037        vmovdqa  \XMM8, TMP1(%rsp)
2038        vmovdqa  \XMM8, \T3
2039
2040        cmp     $128, %r13
2041        jl      _initial_blocks_done\@                  # no need for precomputed constants
2042
2043###############################################################################
2044# Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
2045                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
2046                vmovdqa  \CTR, \XMM1
2047                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1  # perform a 16Byte swap
2048
2049                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
2050                vmovdqa  \CTR, \XMM2
2051                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2  # perform a 16Byte swap
2052
2053                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
2054                vmovdqa  \CTR, \XMM3
2055                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3  # perform a 16Byte swap
2056
2057                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
2058                vmovdqa  \CTR, \XMM4
2059                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4  # perform a 16Byte swap
2060
2061                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
2062                vmovdqa  \CTR, \XMM5
2063                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5  # perform a 16Byte swap
2064
2065                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
2066                vmovdqa  \CTR, \XMM6
2067                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6  # perform a 16Byte swap
2068
2069                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
2070                vmovdqa  \CTR, \XMM7
2071                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7  # perform a 16Byte swap
2072
2073                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
2074                vmovdqa  \CTR, \XMM8
2075                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8  # perform a 16Byte swap
2076
2077                vmovdqa  (arg1), \T_key
2078                vpxor    \T_key, \XMM1, \XMM1
2079                vpxor    \T_key, \XMM2, \XMM2
2080                vpxor    \T_key, \XMM3, \XMM3
2081                vpxor    \T_key, \XMM4, \XMM4
2082                vpxor    \T_key, \XMM5, \XMM5
2083                vpxor    \T_key, \XMM6, \XMM6
2084                vpxor    \T_key, \XMM7, \XMM7
2085                vpxor    \T_key, \XMM8, \XMM8
2086
2087		i = 1
2088		setreg
2089.rep    \REP       # do REP rounds
2090                vmovdqa  16*i(arg1), \T_key
2091                vaesenc  \T_key, \XMM1, \XMM1
2092                vaesenc  \T_key, \XMM2, \XMM2
2093                vaesenc  \T_key, \XMM3, \XMM3
2094                vaesenc  \T_key, \XMM4, \XMM4
2095                vaesenc  \T_key, \XMM5, \XMM5
2096                vaesenc  \T_key, \XMM6, \XMM6
2097                vaesenc  \T_key, \XMM7, \XMM7
2098                vaesenc  \T_key, \XMM8, \XMM8
2099		i = (i+1)
2100		setreg
2101.endr
2102
2103
2104                vmovdqa  16*i(arg1), \T_key
2105                vaesenclast  \T_key, \XMM1, \XMM1
2106                vaesenclast  \T_key, \XMM2, \XMM2
2107                vaesenclast  \T_key, \XMM3, \XMM3
2108                vaesenclast  \T_key, \XMM4, \XMM4
2109                vaesenclast  \T_key, \XMM5, \XMM5
2110                vaesenclast  \T_key, \XMM6, \XMM6
2111                vaesenclast  \T_key, \XMM7, \XMM7
2112                vaesenclast  \T_key, \XMM8, \XMM8
2113
2114                vmovdqu  (arg4, %r11), \T1
2115                vpxor    \T1, \XMM1, \XMM1
2116                vmovdqu  \XMM1, (arg3 , %r11)
2117                .if   \ENC_DEC == DEC
2118                vmovdqa  \T1, \XMM1
2119                .endif
2120
2121                vmovdqu  16*1(arg4, %r11), \T1
2122                vpxor    \T1, \XMM2, \XMM2
2123                vmovdqu  \XMM2, 16*1(arg3 , %r11)
2124                .if   \ENC_DEC == DEC
2125                vmovdqa  \T1, \XMM2
2126                .endif
2127
2128                vmovdqu  16*2(arg4, %r11), \T1
2129                vpxor    \T1, \XMM3, \XMM3
2130                vmovdqu  \XMM3, 16*2(arg3 , %r11)
2131                .if   \ENC_DEC == DEC
2132                vmovdqa  \T1, \XMM3
2133                .endif
2134
2135                vmovdqu  16*3(arg4, %r11), \T1
2136                vpxor    \T1, \XMM4, \XMM4
2137                vmovdqu  \XMM4, 16*3(arg3 , %r11)
2138                .if   \ENC_DEC == DEC
2139                vmovdqa  \T1, \XMM4
2140                .endif
2141
2142                vmovdqu  16*4(arg4, %r11), \T1
2143                vpxor    \T1, \XMM5, \XMM5
2144                vmovdqu  \XMM5, 16*4(arg3 , %r11)
2145                .if   \ENC_DEC == DEC
2146                vmovdqa  \T1, \XMM5
2147                .endif
2148
2149                vmovdqu  16*5(arg4, %r11), \T1
2150                vpxor    \T1, \XMM6, \XMM6
2151                vmovdqu  \XMM6, 16*5(arg3 , %r11)
2152                .if   \ENC_DEC == DEC
2153                vmovdqa  \T1, \XMM6
2154                .endif
2155
2156                vmovdqu  16*6(arg4, %r11), \T1
2157                vpxor    \T1, \XMM7, \XMM7
2158                vmovdqu  \XMM7, 16*6(arg3 , %r11)
2159                .if   \ENC_DEC == DEC
2160                vmovdqa  \T1, \XMM7
2161                .endif
2162
2163                vmovdqu  16*7(arg4, %r11), \T1
2164                vpxor    \T1, \XMM8, \XMM8
2165                vmovdqu  \XMM8, 16*7(arg3 , %r11)
2166                .if   \ENC_DEC == DEC
2167                vmovdqa  \T1, \XMM8
2168                .endif
2169
2170                add     $128, %r11
2171
2172                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1     # perform a 16Byte swap
2173                vpxor    TMP1(%rsp), \XMM1, \XMM1          # combine GHASHed value with
2174							   # the corresponding ciphertext
2175                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2     # perform a 16Byte swap
2176                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3     # perform a 16Byte swap
2177                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4     # perform a 16Byte swap
2178                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5     # perform a 16Byte swap
2179                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6     # perform a 16Byte swap
2180                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7     # perform a 16Byte swap
2181                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8     # perform a 16Byte swap
2182
2183###############################################################################
2184
2185_initial_blocks_done\@:
2186
2187
2188.endm
2189
2190
2191
2192# encrypt 8 blocks at a time
2193# ghash the 8 previously encrypted ciphertext blocks
2194# arg1, arg3, arg4 are used as pointers only, not modified
2195# r11 is the data offset value
2196.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
2197
2198        vmovdqa \XMM1, \T2
2199        vmovdqa \XMM2, TMP2(%rsp)
2200        vmovdqa \XMM3, TMP3(%rsp)
2201        vmovdqa \XMM4, TMP4(%rsp)
2202        vmovdqa \XMM5, TMP5(%rsp)
2203        vmovdqa \XMM6, TMP6(%rsp)
2204        vmovdqa \XMM7, TMP7(%rsp)
2205        vmovdqa \XMM8, TMP8(%rsp)
2206
2207.if \loop_idx == in_order
2208                vpaddd  ONE(%rip), \CTR, \XMM1            # INCR CNT
2209                vpaddd  ONE(%rip), \XMM1, \XMM2
2210                vpaddd  ONE(%rip), \XMM2, \XMM3
2211                vpaddd  ONE(%rip), \XMM3, \XMM4
2212                vpaddd  ONE(%rip), \XMM4, \XMM5
2213                vpaddd  ONE(%rip), \XMM5, \XMM6
2214                vpaddd  ONE(%rip), \XMM6, \XMM7
2215                vpaddd  ONE(%rip), \XMM7, \XMM8
2216                vmovdqa \XMM8, \CTR
2217
2218                vpshufb SHUF_MASK(%rip), \XMM1, \XMM1     # perform a 16Byte swap
2219                vpshufb SHUF_MASK(%rip), \XMM2, \XMM2     # perform a 16Byte swap
2220                vpshufb SHUF_MASK(%rip), \XMM3, \XMM3     # perform a 16Byte swap
2221                vpshufb SHUF_MASK(%rip), \XMM4, \XMM4     # perform a 16Byte swap
2222                vpshufb SHUF_MASK(%rip), \XMM5, \XMM5     # perform a 16Byte swap
2223                vpshufb SHUF_MASK(%rip), \XMM6, \XMM6     # perform a 16Byte swap
2224                vpshufb SHUF_MASK(%rip), \XMM7, \XMM7     # perform a 16Byte swap
2225                vpshufb SHUF_MASK(%rip), \XMM8, \XMM8     # perform a 16Byte swap
2226.else
2227                vpaddd  ONEf(%rip), \CTR, \XMM1            # INCR CNT
2228                vpaddd  ONEf(%rip), \XMM1, \XMM2
2229                vpaddd  ONEf(%rip), \XMM2, \XMM3
2230                vpaddd  ONEf(%rip), \XMM3, \XMM4
2231                vpaddd  ONEf(%rip), \XMM4, \XMM5
2232                vpaddd  ONEf(%rip), \XMM5, \XMM6
2233                vpaddd  ONEf(%rip), \XMM6, \XMM7
2234                vpaddd  ONEf(%rip), \XMM7, \XMM8
2235                vmovdqa \XMM8, \CTR
2236.endif
2237
2238
2239        #######################################################################
2240
2241                vmovdqu (arg1), \T1
2242                vpxor   \T1, \XMM1, \XMM1
2243                vpxor   \T1, \XMM2, \XMM2
2244                vpxor   \T1, \XMM3, \XMM3
2245                vpxor   \T1, \XMM4, \XMM4
2246                vpxor   \T1, \XMM5, \XMM5
2247                vpxor   \T1, \XMM6, \XMM6
2248                vpxor   \T1, \XMM7, \XMM7
2249                vpxor   \T1, \XMM8, \XMM8
2250
2251        #######################################################################
2252
2253
2254
2255
2256
2257                vmovdqu 16*1(arg1), \T1
2258                vaesenc \T1, \XMM1, \XMM1
2259                vaesenc \T1, \XMM2, \XMM2
2260                vaesenc \T1, \XMM3, \XMM3
2261                vaesenc \T1, \XMM4, \XMM4
2262                vaesenc \T1, \XMM5, \XMM5
2263                vaesenc \T1, \XMM6, \XMM6
2264                vaesenc \T1, \XMM7, \XMM7
2265                vaesenc \T1, \XMM8, \XMM8
2266
2267                vmovdqu 16*2(arg1), \T1
2268                vaesenc \T1, \XMM1, \XMM1
2269                vaesenc \T1, \XMM2, \XMM2
2270                vaesenc \T1, \XMM3, \XMM3
2271                vaesenc \T1, \XMM4, \XMM4
2272                vaesenc \T1, \XMM5, \XMM5
2273                vaesenc \T1, \XMM6, \XMM6
2274                vaesenc \T1, \XMM7, \XMM7
2275                vaesenc \T1, \XMM8, \XMM8
2276
2277
2278        #######################################################################
2279
2280        vmovdqu         HashKey_8(arg2), \T5
2281        vpclmulqdq      $0x11, \T5, \T2, \T4              # T4 = a1*b1
2282        vpclmulqdq      $0x00, \T5, \T2, \T7              # T7 = a0*b0
2283        vpclmulqdq      $0x01, \T5, \T2, \T6              # T6 = a1*b0
2284        vpclmulqdq      $0x10, \T5, \T2, \T5              # T5 = a0*b1
2285        vpxor           \T5, \T6, \T6
2286
2287                vmovdqu 16*3(arg1), \T1
2288                vaesenc \T1, \XMM1, \XMM1
2289                vaesenc \T1, \XMM2, \XMM2
2290                vaesenc \T1, \XMM3, \XMM3
2291                vaesenc \T1, \XMM4, \XMM4
2292                vaesenc \T1, \XMM5, \XMM5
2293                vaesenc \T1, \XMM6, \XMM6
2294                vaesenc \T1, \XMM7, \XMM7
2295                vaesenc \T1, \XMM8, \XMM8
2296
2297        vmovdqa         TMP2(%rsp), \T1
2298        vmovdqu         HashKey_7(arg2), \T5
2299        vpclmulqdq      $0x11, \T5, \T1, \T3
2300        vpxor           \T3, \T4, \T4
2301
2302        vpclmulqdq      $0x00, \T5, \T1, \T3
2303        vpxor           \T3, \T7, \T7
2304
2305        vpclmulqdq      $0x01, \T5, \T1, \T3
2306        vpxor           \T3, \T6, \T6
2307
2308        vpclmulqdq      $0x10, \T5, \T1, \T3
2309        vpxor           \T3, \T6, \T6
2310
2311                vmovdqu 16*4(arg1), \T1
2312                vaesenc \T1, \XMM1, \XMM1
2313                vaesenc \T1, \XMM2, \XMM2
2314                vaesenc \T1, \XMM3, \XMM3
2315                vaesenc \T1, \XMM4, \XMM4
2316                vaesenc \T1, \XMM5, \XMM5
2317                vaesenc \T1, \XMM6, \XMM6
2318                vaesenc \T1, \XMM7, \XMM7
2319                vaesenc \T1, \XMM8, \XMM8
2320
2321        #######################################################################
2322
2323        vmovdqa         TMP3(%rsp), \T1
2324        vmovdqu         HashKey_6(arg2), \T5
2325        vpclmulqdq      $0x11, \T5, \T1, \T3
2326        vpxor           \T3, \T4, \T4
2327
2328        vpclmulqdq      $0x00, \T5, \T1, \T3
2329        vpxor           \T3, \T7, \T7
2330
2331        vpclmulqdq      $0x01, \T5, \T1, \T3
2332        vpxor           \T3, \T6, \T6
2333
2334        vpclmulqdq      $0x10, \T5, \T1, \T3
2335        vpxor           \T3, \T6, \T6
2336
2337                vmovdqu 16*5(arg1), \T1
2338                vaesenc \T1, \XMM1, \XMM1
2339                vaesenc \T1, \XMM2, \XMM2
2340                vaesenc \T1, \XMM3, \XMM3
2341                vaesenc \T1, \XMM4, \XMM4
2342                vaesenc \T1, \XMM5, \XMM5
2343                vaesenc \T1, \XMM6, \XMM6
2344                vaesenc \T1, \XMM7, \XMM7
2345                vaesenc \T1, \XMM8, \XMM8
2346
2347        vmovdqa         TMP4(%rsp), \T1
2348        vmovdqu         HashKey_5(arg2), \T5
2349        vpclmulqdq      $0x11, \T5, \T1, \T3
2350        vpxor           \T3, \T4, \T4
2351
2352        vpclmulqdq      $0x00, \T5, \T1, \T3
2353        vpxor           \T3, \T7, \T7
2354
2355        vpclmulqdq      $0x01, \T5, \T1, \T3
2356        vpxor           \T3, \T6, \T6
2357
2358        vpclmulqdq      $0x10, \T5, \T1, \T3
2359        vpxor           \T3, \T6, \T6
2360
2361                vmovdqu 16*6(arg1), \T1
2362                vaesenc \T1, \XMM1, \XMM1
2363                vaesenc \T1, \XMM2, \XMM2
2364                vaesenc \T1, \XMM3, \XMM3
2365                vaesenc \T1, \XMM4, \XMM4
2366                vaesenc \T1, \XMM5, \XMM5
2367                vaesenc \T1, \XMM6, \XMM6
2368                vaesenc \T1, \XMM7, \XMM7
2369                vaesenc \T1, \XMM8, \XMM8
2370
2371
2372        vmovdqa         TMP5(%rsp), \T1
2373        vmovdqu         HashKey_4(arg2), \T5
2374        vpclmulqdq      $0x11, \T5, \T1, \T3
2375        vpxor           \T3, \T4, \T4
2376
2377        vpclmulqdq      $0x00, \T5, \T1, \T3
2378        vpxor           \T3, \T7, \T7
2379
2380        vpclmulqdq      $0x01, \T5, \T1, \T3
2381        vpxor           \T3, \T6, \T6
2382
2383        vpclmulqdq      $0x10, \T5, \T1, \T3
2384        vpxor           \T3, \T6, \T6
2385
2386                vmovdqu 16*7(arg1), \T1
2387                vaesenc \T1, \XMM1, \XMM1
2388                vaesenc \T1, \XMM2, \XMM2
2389                vaesenc \T1, \XMM3, \XMM3
2390                vaesenc \T1, \XMM4, \XMM4
2391                vaesenc \T1, \XMM5, \XMM5
2392                vaesenc \T1, \XMM6, \XMM6
2393                vaesenc \T1, \XMM7, \XMM7
2394                vaesenc \T1, \XMM8, \XMM8
2395
2396        vmovdqa         TMP6(%rsp), \T1
2397        vmovdqu         HashKey_3(arg2), \T5
2398        vpclmulqdq      $0x11, \T5, \T1, \T3
2399        vpxor           \T3, \T4, \T4
2400
2401        vpclmulqdq      $0x00, \T5, \T1, \T3
2402        vpxor           \T3, \T7, \T7
2403
2404        vpclmulqdq      $0x01, \T5, \T1, \T3
2405        vpxor           \T3, \T6, \T6
2406
2407        vpclmulqdq      $0x10, \T5, \T1, \T3
2408        vpxor           \T3, \T6, \T6
2409
2410                vmovdqu 16*8(arg1), \T1
2411                vaesenc \T1, \XMM1, \XMM1
2412                vaesenc \T1, \XMM2, \XMM2
2413                vaesenc \T1, \XMM3, \XMM3
2414                vaesenc \T1, \XMM4, \XMM4
2415                vaesenc \T1, \XMM5, \XMM5
2416                vaesenc \T1, \XMM6, \XMM6
2417                vaesenc \T1, \XMM7, \XMM7
2418                vaesenc \T1, \XMM8, \XMM8
2419
2420        vmovdqa         TMP7(%rsp), \T1
2421        vmovdqu         HashKey_2(arg2), \T5
2422        vpclmulqdq      $0x11, \T5, \T1, \T3
2423        vpxor           \T3, \T4, \T4
2424
2425        vpclmulqdq      $0x00, \T5, \T1, \T3
2426        vpxor           \T3, \T7, \T7
2427
2428        vpclmulqdq      $0x01, \T5, \T1, \T3
2429        vpxor           \T3, \T6, \T6
2430
2431        vpclmulqdq      $0x10, \T5, \T1, \T3
2432        vpxor           \T3, \T6, \T6
2433
2434
2435        #######################################################################
2436
2437                vmovdqu 16*9(arg1), \T5
2438                vaesenc \T5, \XMM1, \XMM1
2439                vaesenc \T5, \XMM2, \XMM2
2440                vaesenc \T5, \XMM3, \XMM3
2441                vaesenc \T5, \XMM4, \XMM4
2442                vaesenc \T5, \XMM5, \XMM5
2443                vaesenc \T5, \XMM6, \XMM6
2444                vaesenc \T5, \XMM7, \XMM7
2445                vaesenc \T5, \XMM8, \XMM8
2446
2447        vmovdqa         TMP8(%rsp), \T1
2448        vmovdqu         HashKey(arg2), \T5
2449
2450        vpclmulqdq      $0x00, \T5, \T1, \T3
2451        vpxor           \T3, \T7, \T7
2452
2453        vpclmulqdq      $0x01, \T5, \T1, \T3
2454        vpxor           \T3, \T6, \T6
2455
2456        vpclmulqdq      $0x10, \T5, \T1, \T3
2457        vpxor           \T3, \T6, \T6
2458
2459        vpclmulqdq      $0x11, \T5, \T1, \T3
2460        vpxor           \T3, \T4, \T1
2461
2462
2463                vmovdqu 16*10(arg1), \T5
2464
2465        i = 11
2466        setreg
2467.rep (\REP-9)
2468        vaesenc \T5, \XMM1, \XMM1
2469        vaesenc \T5, \XMM2, \XMM2
2470        vaesenc \T5, \XMM3, \XMM3
2471        vaesenc \T5, \XMM4, \XMM4
2472        vaesenc \T5, \XMM5, \XMM5
2473        vaesenc \T5, \XMM6, \XMM6
2474        vaesenc \T5, \XMM7, \XMM7
2475        vaesenc \T5, \XMM8, \XMM8
2476
2477        vmovdqu 16*i(arg1), \T5
2478        i = i + 1
2479        setreg
2480.endr
2481
2482	i = 0
2483	j = 1
2484	setreg
2485.rep 8
2486		vpxor	16*i(arg4, %r11), \T5, \T2
2487                .if \ENC_DEC == ENC
2488                vaesenclast     \T2, reg_j, reg_j
2489                .else
2490                vaesenclast     \T2, reg_j, \T3
2491                vmovdqu 16*i(arg4, %r11), reg_j
2492                vmovdqu \T3, 16*i(arg3, %r11)
2493                .endif
2494	i = (i+1)
2495	j = (j+1)
2496	setreg
2497.endr
2498	#######################################################################
2499
2500
2501	vpslldq	$8, \T6, \T3				# shift-L T3 2 DWs
2502	vpsrldq	$8, \T6, \T6				# shift-R T2 2 DWs
2503	vpxor	\T3, \T7, \T7
2504	vpxor	\T6, \T1, \T1				# accumulate the results in T1:T7
2505
2506
2507
2508	#######################################################################
2509	#first phase of the reduction
2510	vmovdqa         POLY2(%rip), \T3
2511
2512	vpclmulqdq	$0x01, \T7, \T3, \T2
2513	vpslldq		$8, \T2, \T2			# shift-L xmm2 2 DWs
2514
2515	vpxor		\T2, \T7, \T7			# first phase of the reduction complete
2516	#######################################################################
2517                .if \ENC_DEC == ENC
2518		vmovdqu	 \XMM1,	16*0(arg3,%r11)		# Write to the Ciphertext buffer
2519		vmovdqu	 \XMM2,	16*1(arg3,%r11)		# Write to the Ciphertext buffer
2520		vmovdqu	 \XMM3,	16*2(arg3,%r11)		# Write to the Ciphertext buffer
2521		vmovdqu	 \XMM4,	16*3(arg3,%r11)		# Write to the Ciphertext buffer
2522		vmovdqu	 \XMM5,	16*4(arg3,%r11)		# Write to the Ciphertext buffer
2523		vmovdqu	 \XMM6,	16*5(arg3,%r11)		# Write to the Ciphertext buffer
2524		vmovdqu	 \XMM7,	16*6(arg3,%r11)		# Write to the Ciphertext buffer
2525		vmovdqu	 \XMM8,	16*7(arg3,%r11)		# Write to the Ciphertext buffer
2526                .endif
2527
2528	#######################################################################
2529	#second phase of the reduction
2530	vpclmulqdq	$0x00, \T7, \T3, \T2
2531	vpsrldq		$4, \T2, \T2			# shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
2532
2533	vpclmulqdq	$0x10, \T7, \T3, \T4
2534	vpslldq		$4, \T4, \T4			# shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts)
2535
2536	vpxor		\T2, \T4, \T4			# second phase of the reduction complete
2537	#######################################################################
2538	vpxor		\T4, \T1, \T1			# the result is in T1
2539
2540		vpshufb	SHUF_MASK(%rip), \XMM1, \XMM1	# perform a 16Byte swap
2541		vpshufb	SHUF_MASK(%rip), \XMM2, \XMM2	# perform a 16Byte swap
2542		vpshufb	SHUF_MASK(%rip), \XMM3, \XMM3	# perform a 16Byte swap
2543		vpshufb	SHUF_MASK(%rip), \XMM4, \XMM4	# perform a 16Byte swap
2544		vpshufb	SHUF_MASK(%rip), \XMM5, \XMM5	# perform a 16Byte swap
2545		vpshufb	SHUF_MASK(%rip), \XMM6, \XMM6	# perform a 16Byte swap
2546		vpshufb	SHUF_MASK(%rip), \XMM7, \XMM7	# perform a 16Byte swap
2547		vpshufb	SHUF_MASK(%rip), \XMM8, \XMM8	# perform a 16Byte swap
2548
2549
2550	vpxor	\T1, \XMM1, \XMM1
2551
2552
2553
2554.endm
2555
2556
2557# GHASH the last 4 ciphertext blocks.
2558.macro  GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
2559
2560        ## Karatsuba Method
2561
2562        vmovdqu         HashKey_8(arg2), \T5
2563
2564        vpshufd         $0b01001110, \XMM1, \T2
2565        vpshufd         $0b01001110, \T5, \T3
2566        vpxor           \XMM1, \T2, \T2
2567        vpxor           \T5, \T3, \T3
2568
2569        vpclmulqdq      $0x11, \T5, \XMM1, \T6
2570        vpclmulqdq      $0x00, \T5, \XMM1, \T7
2571
2572        vpclmulqdq      $0x00, \T3, \T2, \XMM1
2573
2574        ######################
2575
2576        vmovdqu         HashKey_7(arg2), \T5
2577        vpshufd         $0b01001110, \XMM2, \T2
2578        vpshufd         $0b01001110, \T5, \T3
2579        vpxor           \XMM2, \T2, \T2
2580        vpxor           \T5, \T3, \T3
2581
2582        vpclmulqdq      $0x11, \T5, \XMM2, \T4
2583        vpxor           \T4, \T6, \T6
2584
2585        vpclmulqdq      $0x00, \T5, \XMM2, \T4
2586        vpxor           \T4, \T7, \T7
2587
2588        vpclmulqdq      $0x00, \T3, \T2, \T2
2589
2590        vpxor           \T2, \XMM1, \XMM1
2591
2592        ######################
2593
2594        vmovdqu         HashKey_6(arg2), \T5
2595        vpshufd         $0b01001110, \XMM3, \T2
2596        vpshufd         $0b01001110, \T5, \T3
2597        vpxor           \XMM3, \T2, \T2
2598        vpxor           \T5, \T3, \T3
2599
2600        vpclmulqdq      $0x11, \T5, \XMM3, \T4
2601        vpxor           \T4, \T6, \T6
2602
2603        vpclmulqdq      $0x00, \T5, \XMM3, \T4
2604        vpxor           \T4, \T7, \T7
2605
2606        vpclmulqdq      $0x00, \T3, \T2, \T2
2607
2608        vpxor           \T2, \XMM1, \XMM1
2609
2610        ######################
2611
2612        vmovdqu         HashKey_5(arg2), \T5
2613        vpshufd         $0b01001110, \XMM4, \T2
2614        vpshufd         $0b01001110, \T5, \T3
2615        vpxor           \XMM4, \T2, \T2
2616        vpxor           \T5, \T3, \T3
2617
2618        vpclmulqdq      $0x11, \T5, \XMM4, \T4
2619        vpxor           \T4, \T6, \T6
2620
2621        vpclmulqdq      $0x00, \T5, \XMM4, \T4
2622        vpxor           \T4, \T7, \T7
2623
2624        vpclmulqdq      $0x00, \T3, \T2, \T2
2625
2626        vpxor           \T2, \XMM1, \XMM1
2627
2628        ######################
2629
2630        vmovdqu         HashKey_4(arg2), \T5
2631        vpshufd         $0b01001110, \XMM5, \T2
2632        vpshufd         $0b01001110, \T5, \T3
2633        vpxor           \XMM5, \T2, \T2
2634        vpxor           \T5, \T3, \T3
2635
2636        vpclmulqdq      $0x11, \T5, \XMM5, \T4
2637        vpxor           \T4, \T6, \T6
2638
2639        vpclmulqdq      $0x00, \T5, \XMM5, \T4
2640        vpxor           \T4, \T7, \T7
2641
2642        vpclmulqdq      $0x00, \T3, \T2, \T2
2643
2644        vpxor           \T2, \XMM1, \XMM1
2645
2646        ######################
2647
2648        vmovdqu         HashKey_3(arg2), \T5
2649        vpshufd         $0b01001110, \XMM6, \T2
2650        vpshufd         $0b01001110, \T5, \T3
2651        vpxor           \XMM6, \T2, \T2
2652        vpxor           \T5, \T3, \T3
2653
2654        vpclmulqdq      $0x11, \T5, \XMM6, \T4
2655        vpxor           \T4, \T6, \T6
2656
2657        vpclmulqdq      $0x00, \T5, \XMM6, \T4
2658        vpxor           \T4, \T7, \T7
2659
2660        vpclmulqdq      $0x00, \T3, \T2, \T2
2661
2662        vpxor           \T2, \XMM1, \XMM1
2663
2664        ######################
2665
2666        vmovdqu         HashKey_2(arg2), \T5
2667        vpshufd         $0b01001110, \XMM7, \T2
2668        vpshufd         $0b01001110, \T5, \T3
2669        vpxor           \XMM7, \T2, \T2
2670        vpxor           \T5, \T3, \T3
2671
2672        vpclmulqdq      $0x11, \T5, \XMM7, \T4
2673        vpxor           \T4, \T6, \T6
2674
2675        vpclmulqdq      $0x00, \T5, \XMM7, \T4
2676        vpxor           \T4, \T7, \T7
2677
2678        vpclmulqdq      $0x00, \T3, \T2, \T2
2679
2680        vpxor           \T2, \XMM1, \XMM1
2681
2682        ######################
2683
2684        vmovdqu         HashKey(arg2), \T5
2685        vpshufd         $0b01001110, \XMM8, \T2
2686        vpshufd         $0b01001110, \T5, \T3
2687        vpxor           \XMM8, \T2, \T2
2688        vpxor           \T5, \T3, \T3
2689
2690        vpclmulqdq      $0x11, \T5, \XMM8, \T4
2691        vpxor           \T4, \T6, \T6
2692
2693        vpclmulqdq      $0x00, \T5, \XMM8, \T4
2694        vpxor           \T4, \T7, \T7
2695
2696        vpclmulqdq      $0x00, \T3, \T2, \T2
2697
2698        vpxor           \T2, \XMM1, \XMM1
2699        vpxor           \T6, \XMM1, \XMM1
2700        vpxor           \T7, \XMM1, \T2
2701
2702
2703
2704
2705        vpslldq $8, \T2, \T4
2706        vpsrldq $8, \T2, \T2
2707
2708        vpxor   \T4, \T7, \T7
2709        vpxor   \T2, \T6, \T6                      # <T6:T7> holds the result of the
2710						   # accumulated carry-less multiplications
2711
2712        #######################################################################
2713        #first phase of the reduction
2714        vmovdqa         POLY2(%rip), \T3
2715
2716        vpclmulqdq      $0x01, \T7, \T3, \T2
2717        vpslldq         $8, \T2, \T2               # shift-L xmm2 2 DWs
2718
2719        vpxor           \T2, \T7, \T7              # first phase of the reduction complete
2720        #######################################################################
2721
2722
2723        #second phase of the reduction
2724        vpclmulqdq      $0x00, \T7, \T3, \T2
2725        vpsrldq         $4, \T2, \T2               # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
2726
2727        vpclmulqdq      $0x10, \T7, \T3, \T4
2728        vpslldq         $4, \T4, \T4               # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
2729
2730        vpxor           \T2, \T4, \T4              # second phase of the reduction complete
2731        #######################################################################
2732        vpxor           \T4, \T6, \T6              # the result is in T6
2733.endm
2734
2735
2736
2737#############################################################
2738#void   aesni_gcm_init_avx_gen4
2739#        (gcm_data     *my_ctx_data,
2740#         gcm_context_data *data,
2741#        u8      *iv, /* Pre-counter block j0: 4 byte salt
2742#			(from Security Association) concatenated with 8 byte
2743#			Initialisation Vector (from IPSec ESP Payload)
2744#			concatenated with 0x00000001. 16-byte aligned pointer. */
2745#        u8     *hash_subkey# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
2746#        const   u8 *aad, /* Additional Authentication Data (AAD)*/
2747#        u64     aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
2748#############################################################
2749SYM_FUNC_START(aesni_gcm_init_avx_gen4)
2750        FUNC_SAVE
2751        INIT GHASH_MUL_AVX2, PRECOMPUTE_AVX2
2752        FUNC_RESTORE
2753        ret
2754SYM_FUNC_END(aesni_gcm_init_avx_gen4)
2755
2756###############################################################################
2757#void   aesni_gcm_enc_avx_gen4(
2758#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
2759#        gcm_context_data *data,
2760#        u8      *out, /* Ciphertext output. Encrypt in-place is allowed.  */
2761#        const   u8 *in, /* Plaintext input */
2762#        u64     plaintext_len) /* Length of data in Bytes for encryption. */
2763###############################################################################
2764SYM_FUNC_START(aesni_gcm_enc_update_avx_gen4)
2765        FUNC_SAVE
2766        mov     keysize,%eax
2767        cmp     $32, %eax
2768        je      key_256_enc_update4
2769        cmp     $16, %eax
2770        je      key_128_enc_update4
2771        # must be 192
2772        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 11
2773        FUNC_RESTORE
2774	ret
2775key_128_enc_update4:
2776        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 9
2777        FUNC_RESTORE
2778	ret
2779key_256_enc_update4:
2780        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 13
2781        FUNC_RESTORE
2782	ret
2783SYM_FUNC_END(aesni_gcm_enc_update_avx_gen4)
2784
2785###############################################################################
2786#void   aesni_gcm_dec_update_avx_gen4(
2787#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
2788#        gcm_context_data *data,
2789#        u8      *out, /* Plaintext output. Decrypt in-place is allowed.  */
2790#        const   u8 *in, /* Ciphertext input */
2791#        u64     plaintext_len) /* Length of data in Bytes for encryption. */
2792###############################################################################
2793SYM_FUNC_START(aesni_gcm_dec_update_avx_gen4)
2794        FUNC_SAVE
2795        mov     keysize,%eax
2796        cmp     $32, %eax
2797        je      key_256_dec_update4
2798        cmp     $16, %eax
2799        je      key_128_dec_update4
2800        # must be 192
2801        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 11
2802        FUNC_RESTORE
2803        ret
2804key_128_dec_update4:
2805        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 9
2806        FUNC_RESTORE
2807        ret
2808key_256_dec_update4:
2809        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 13
2810        FUNC_RESTORE
2811        ret
2812SYM_FUNC_END(aesni_gcm_dec_update_avx_gen4)
2813
2814###############################################################################
2815#void   aesni_gcm_finalize_avx_gen4(
2816#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
2817#        gcm_context_data *data,
2818#        u8      *auth_tag, /* Authenticated Tag output. */
2819#        u64     auth_tag_len)# /* Authenticated Tag Length in bytes.
2820#                              Valid values are 16 (most likely), 12 or 8. */
2821###############################################################################
2822SYM_FUNC_START(aesni_gcm_finalize_avx_gen4)
2823        FUNC_SAVE
2824        mov	keysize,%eax
2825        cmp     $32, %eax
2826        je      key_256_finalize4
2827        cmp     $16, %eax
2828        je      key_128_finalize4
2829        # must be 192
2830        GCM_COMPLETE GHASH_MUL_AVX2, 11, arg3, arg4
2831        FUNC_RESTORE
2832        ret
2833key_128_finalize4:
2834        GCM_COMPLETE GHASH_MUL_AVX2, 9, arg3, arg4
2835        FUNC_RESTORE
2836        ret
2837key_256_finalize4:
2838        GCM_COMPLETE GHASH_MUL_AVX2, 13, arg3, arg4
2839        FUNC_RESTORE
2840        ret
2841SYM_FUNC_END(aesni_gcm_finalize_avx_gen4)
2842
2843#endif /* CONFIG_AS_AVX2 */
2844