1;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2;  Copyright(c) 2011-2020, Intel Corporation All rights reserved.
3;
4;  Redistribution and use in source and binary forms, with or without
5;  modification, are permitted provided that the following conditions
6;  are met:
7;    * Redistributions of source code must retain the above copyright
8;      notice, this list of conditions and the following disclaimer.
9;    * Redistributions in binary form must reproduce the above copyright
10;      notice, this list of conditions and the following disclaimer in
11;      the documentation and/or other materials provided with the
12;      distribution.
13;    * Neither the name of Intel Corporation nor the names of its
14;      contributors may be used to endorse or promote products derived
15;      from this software without specific prior written permission.
16;
17;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
29
30;
31; Authors:
32;       Erdinc Ozturk
33;       Vinodh Gopal
34;       James Guilford
35;
36;
37; References:
38;       This code was derived and highly optimized from the code described in paper:
39;               Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation on Intel Architecture Processors. August, 2010
40;       The details of the implementation is explained in:
41;               Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode on Intel Architecture Processors. October, 2012.
42;
43;
44;
45;
46; Assumptions:
47;
48;
49;
50; iv:
51;       0                   1                   2                   3
52;       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
53;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
54;       |                             Salt  (From the SA)               |
55;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
56;       |                     Initialization Vector                     |
57;       |         (This is the sequence number from IPSec header)       |
58;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
59;       |                              0x1                              |
60;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
61;
62;
63;
64; AAD:
65;       AAD will be padded with 0 to the next 16byte multiple
66;       for example, assume AAD is a u32 vector
67;
68;       if AAD is 8 bytes:
69;       AAD[3] = {A0, A1};
70;       padded AAD in xmm register = {A1 A0 0 0}
71;
72;       0                   1                   2                   3
73;       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
74;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
75;       |                               SPI (A1)                        |
76;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
77;       |                     32-bit Sequence Number (A0)               |
78;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
79;       |                              0x0                              |
80;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
81;
82;                                       AAD Format with 32-bit Sequence Number
83;
84;       if AAD is 12 bytes:
85;       AAD[3] = {A0, A1, A2};
86;       padded AAD in xmm register = {A2 A1 A0 0}
87;
88;       0                   1                   2                   3
89;       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
90;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
91;       |                               SPI (A2)                        |
92;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
93;       |                 64-bit Extended Sequence Number {A1,A0}       |
94;       |                                                               |
95;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
96;       |                              0x0                              |
97;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
98;
99;        AAD Format with 64-bit Extended Sequence Number
100;
101;
102; aadLen:
103;       Must be a multiple of 4 bytes and from the definition of the spec.
104;       The code additionally supports any aadLen length.
105;
106; TLen:
107;       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
108;
109; poly = x^128 + x^127 + x^126 + x^121 + 1
110; throughout the code, one tab and two tab indentations are used. one tab is for GHASH part, two tabs is for AES part.
111;
112
113%include "include/os.asm"
114%include "include/reg_sizes.asm"
115%include "include/clear_regs.asm"
116%include "include/gcm_defines.asm"
117%include "include/gcm_keys_avx2_avx512.asm"
118%include "include/memcpy.asm"
119
120%ifndef GCM128_MODE
121%ifndef GCM192_MODE
122%ifndef GCM256_MODE
123%error "No GCM mode selected for gcm_avx_gen4.asm!"
124%endif
125%endif
126%endif
127
128;; Decide on AES-GCM key size to compile for
129%ifdef GCM128_MODE
130%define NROUNDS 9
131%define FN_NAME(x,y) aes_gcm_ %+ x %+ _128 %+ y %+ avx_gen4
132%define GMAC_FN_NAME(x) imb_aes_gmac_ %+ x %+ _128_ %+ avx_gen4
133%endif
134
135%ifdef GCM192_MODE
136%define NROUNDS 11
137%define FN_NAME(x,y) aes_gcm_ %+ x %+ _192 %+ y %+ avx_gen4
138%define GMAC_FN_NAME(x) imb_aes_gmac_ %+ x %+ _192_ %+ avx_gen4
139%endif
140
141%ifdef GCM256_MODE
142%define NROUNDS 13
143%define FN_NAME(x,y) aes_gcm_ %+ x %+ _256 %+ y %+ avx_gen4
144%define GMAC_FN_NAME(x) imb_aes_gmac_ %+ x %+ _256_ %+ avx_gen4
145%endif
146
147section .text
148default rel
149
150; need to push 4 registers into stack to maintain
151%define STACK_OFFSET 8*4
152
153%define TMP2    16*0    ; Temporary storage for AES State 2 (State 1 is stored in an XMM register)
154%define TMP3    16*1    ; Temporary storage for AES State 3
155%define TMP4    16*2    ; Temporary storage for AES State 4
156%define TMP5    16*3    ; Temporary storage for AES State 5
157%define TMP6    16*4    ; Temporary storage for AES State 6
158%define TMP7    16*5    ; Temporary storage for AES State 7
159%define TMP8    16*6    ; Temporary storage for AES State 8
160
161%define LOCAL_STORAGE   16*7
162
163%ifidn __OUTPUT_FORMAT__, win64
164        %define XMM_STORAGE     16*10
165%else
166        %define XMM_STORAGE     0
167%endif
168
169%define VARIABLE_OFFSET LOCAL_STORAGE + XMM_STORAGE
170
171;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
172; Utility Macros
173;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
174
175;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
176; GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
177; Input: A and B (128-bits each, bit-reflected)
178; Output: C = A*B*x mod poly, (i.e. >>1 )
179; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
180; GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
181;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
182%macro  GHASH_MUL  7
183%define %%GH %1         ; 16 Bytes
184%define %%HK %2         ; 16 Bytes
185%define %%T1 %3
186%define %%T2 %4
187%define %%T3 %5
188%define %%T4 %6
189%define %%T5 %7
190        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
191
192        vpclmulqdq      %%T1, %%GH, %%HK, 0x11          ; %%T1 = a1*b1
193        vpclmulqdq      %%T2, %%GH, %%HK, 0x00          ; %%T2 = a0*b0
194        vpclmulqdq      %%T3, %%GH, %%HK, 0x01          ; %%T3 = a1*b0
195        vpclmulqdq      %%GH, %%GH, %%HK, 0x10          ; %%GH = a0*b1
196        vpxor           %%GH, %%GH, %%T3
197
198
199        vpsrldq         %%T3, %%GH, 8                   ; shift-R %%GH 2 DWs
200        vpslldq         %%GH, %%GH, 8                   ; shift-L %%GH 2 DWs
201
202        vpxor           %%T1, %%T1, %%T3
203        vpxor           %%GH, %%GH, %%T2
204
205        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
206        ;first phase of the reduction
207        vmovdqa         %%T3, [rel POLY2]
208
209        vpclmulqdq      %%T2, %%T3, %%GH, 0x01
210        vpslldq         %%T2, %%T2, 8                    ; shift-L %%T2 2 DWs
211
212        vpxor           %%GH, %%GH, %%T2                 ; first phase of the reduction complete
213        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
214        ;second phase of the reduction
215        vpclmulqdq      %%T2, %%T3, %%GH, 0x00
216        vpsrldq         %%T2, %%T2, 4                    ; shift-R %%T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
217
218        vpclmulqdq      %%GH, %%T3, %%GH, 0x10
219        vpslldq         %%GH, %%GH, 4                    ; shift-L %%GH 1 DW (Shift-L 1-DW to obtain result with no shifts)
220
221        vpxor           %%GH, %%GH, %%T2                 ; second phase of the reduction complete
222        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
223        vpxor           %%GH, %%GH, %%T1                 ; the result is in %%GH
224
225%endmacro
226
227
228; In PRECOMPUTE, the commands filling Hashkey_i_k are not required for avx_gen4
229; functions, but are kept to allow users to switch cpu architectures between calls
230; of pre, init, update, and finalize.
231%macro  PRECOMPUTE 8
232%define %%GDATA %1
233%define %%HK    %2
234%define %%T1    %3
235%define %%T2    %4
236%define %%T3    %5
237%define %%T4    %6
238%define %%T5    %7
239%define %%T6    %8
240
241        ; Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
242        vmovdqa  %%T5, %%HK
243
244        GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2      ;  %%T5 = HashKey^2<<1 mod poly
245        vmovdqu  [%%GDATA + HashKey_2], %%T5                    ;  [HashKey_2] = HashKey^2<<1 mod poly
246
247        GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2      ;  %%T5 = HashKey^3<<1 mod poly
248        vmovdqu  [%%GDATA + HashKey_3], %%T5
249
250        GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2      ;  %%T5 = HashKey^4<<1 mod poly
251        vmovdqu  [%%GDATA + HashKey_4], %%T5
252
253        GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2      ;  %%T5 = HashKey^5<<1 mod poly
254        vmovdqu  [%%GDATA + HashKey_5], %%T5
255
256        GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2      ;  %%T5 = HashKey^6<<1 mod poly
257        vmovdqu  [%%GDATA + HashKey_6], %%T5
258
259        GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2      ;  %%T5 = HashKey^7<<1 mod poly
260        vmovdqu  [%%GDATA + HashKey_7], %%T5
261
262        GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2      ;  %%T5 = HashKey^8<<1 mod poly
263        vmovdqu  [%%GDATA + HashKey_8], %%T5
264%endmacro
265
266
267;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
268; READ_SMALL_DATA_INPUT: Packs xmm register with data when data input is less than 16 bytes.
269; Returns 0 if data has length 0.
270; Input: The input data (INPUT), that data's length (LENGTH).
271; Output: The packed xmm register (OUTPUT).
272;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
273%macro READ_SMALL_DATA_INPUT    6
274%define %%OUTPUT                %1 ; %%OUTPUT is an xmm register
275%define %%INPUT                 %2
276%define %%LENGTH                %3
277%define %%END_READ_LOCATION     %4 ; All this and the lower inputs are temp registers
278%define %%COUNTER               %5
279%define %%TMP1                  %6
280
281        vpxor   %%OUTPUT, %%OUTPUT
282        mov     %%COUNTER, %%LENGTH
283        mov     %%END_READ_LOCATION, %%INPUT
284        add     %%END_READ_LOCATION, %%LENGTH
285        xor     %%TMP1, %%TMP1
286
287
288        cmp     %%COUNTER, 8
289        jl      %%_byte_loop_2
290        vpinsrq %%OUTPUT, [%%INPUT],0           ;Read in 8 bytes if they exists
291        je      %%_done
292
293        sub     %%COUNTER, 8
294
295%%_byte_loop_1:                                 ;Read in data 1 byte at a time while data is left
296        shl     %%TMP1, 8                       ;This loop handles when 8 bytes were already read in
297        dec     %%END_READ_LOCATION
298        mov     BYTE(%%TMP1), BYTE [%%END_READ_LOCATION]
299        dec     %%COUNTER
300        jg      %%_byte_loop_1
301        vpinsrq %%OUTPUT, %%TMP1, 1
302        jmp     %%_done
303
304%%_byte_loop_2:                                 ;Read in data 1 byte at a time while data is left
305	;; NOTE: in current implementation check for zero length is obsolete here.
306        ;;      The adequate checks are done by callers of this macro.
307        ;; cmp     %%COUNTER, 0
308        ;; je      %%_done
309        shl     %%TMP1, 8                       ;This loop handles when no bytes were already read in
310        dec     %%END_READ_LOCATION
311        mov     BYTE(%%TMP1), BYTE [%%END_READ_LOCATION]
312        dec     %%COUNTER
313        jg      %%_byte_loop_2
314        vpinsrq %%OUTPUT, %%TMP1, 0
315%%_done:
316
317%endmacro ; READ_SMALL_DATA_INPUT
318
319
320;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
321; CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
322; Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY).
323; Output: The hash of the data (AAD_HASH).
324;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
325%macro  CALC_AAD_HASH   15
326%define %%A_IN          %1
327%define %%A_LEN         %2
328%define %%AAD_HASH      %3
329%define %%GDATA_KEY     %4
330%define %%XTMP0         %5      ; xmm temp reg 5
331%define %%XTMP1         %6      ; xmm temp reg 5
332%define %%XTMP2         %7
333%define %%XTMP3         %8
334%define %%XTMP4         %9
335%define %%XTMP5         %10     ; xmm temp reg 5
336%define %%T1            %11     ; temp reg 1
337%define %%T2            %12
338%define %%T3            %13
339%define %%T4            %14
340%define %%T5            %15     ; temp reg 5
341
342
343        mov     %%T1, %%A_IN            ; T1 = AAD
344        mov     %%T2, %%A_LEN           ; T2 = aadLen
345
346%%_get_AAD_loop128:
347        cmp     %%T2, 128
348        jl      %%_exit_AAD_loop128
349
350        vmovdqu         %%XTMP0, [%%T1 + 16*0]
351        vpshufb         %%XTMP0, [rel SHUF_MASK]
352
353        vpxor           %%XTMP0, %%AAD_HASH
354
355        vmovdqu         %%XTMP5, [%%GDATA_KEY + HashKey_8]
356        vpclmulqdq      %%XTMP1, %%XTMP0, %%XTMP5, 0x11                 ; %%T1 = a1*b1
357        vpclmulqdq      %%XTMP2, %%XTMP0, %%XTMP5, 0x00                 ; %%T2 = a0*b0
358        vpclmulqdq      %%XTMP3, %%XTMP0, %%XTMP5, 0x01                 ; %%T3 = a1*b0
359        vpclmulqdq      %%XTMP4, %%XTMP0, %%XTMP5, 0x10                 ; %%T4 = a0*b1
360        vpxor           %%XTMP3, %%XTMP3, %%XTMP4                       ; %%T3 = a1*b0 + a0*b1
361
362%assign i 1
363%assign j 7
364%rep 7
365        vmovdqu         %%XTMP0, [%%T1 + 16*i]
366        vpshufb         %%XTMP0, [rel SHUF_MASK]
367
368        vmovdqu         %%XTMP5, [%%GDATA_KEY + HashKey_ %+ j]
369        vpclmulqdq      %%XTMP4, %%XTMP0, %%XTMP5, 0x11                 ; %%T1 = T1 + a1*b1
370        vpxor           %%XTMP1, %%XTMP1, %%XTMP4
371
372        vpclmulqdq      %%XTMP4, %%XTMP0, %%XTMP5, 0x00                 ; %%T2 = T2 + a0*b0
373        vpxor           %%XTMP2, %%XTMP2, %%XTMP4
374
375        vpclmulqdq      %%XTMP4, %%XTMP0, %%XTMP5, 0x01                 ; %%T3 = T3 + a1*b0 + a0*b1
376        vpxor           %%XTMP3, %%XTMP3, %%XTMP4
377        vpclmulqdq      %%XTMP4, %%XTMP0, %%XTMP5, 0x10
378        vpxor           %%XTMP3, %%XTMP3, %%XTMP4
379%assign i (i + 1)
380%assign j (j - 1)
381%endrep
382
383        vpslldq         %%XTMP4, %%XTMP3, 8                             ; shift-L 2 DWs
384        vpsrldq         %%XTMP3, %%XTMP3, 8                             ; shift-R 2 DWs
385        vpxor           %%XTMP2, %%XTMP2, %%XTMP4
386        vpxor           %%XTMP1, %%XTMP1, %%XTMP3                       ; accumulate the results in %%T1(M):%%T2(L)
387
388        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
389        ;first phase of the reduction
390        vmovdqa         %%XTMP5, [rel POLY2]
391        vpclmulqdq      %%XTMP0, %%XTMP5, %%XTMP2, 0x01
392        vpslldq         %%XTMP0, %%XTMP0, 8                             ; shift-L xmm2 2 DWs
393        vpxor           %%XTMP2, %%XTMP2, %%XTMP0                       ; first phase of the reduction complete
394
395        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
396        ;second phase of the reduction
397        vpclmulqdq      %%XTMP3, %%XTMP5, %%XTMP2, 0x00
398        vpsrldq         %%XTMP3, %%XTMP3, 4                             ; shift-R 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
399
400        vpclmulqdq      %%XTMP4, %%XTMP5, %%XTMP2, 0x10
401        vpslldq         %%XTMP4, %%XTMP4, 4                             ; shift-L 1 DW (Shift-L 1-DW to obtain result with no shifts)
402
403        vpxor           %%XTMP4, %%XTMP4, %%XTMP3                       ; second phase of the reduction complete
404        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
405        vpxor           %%AAD_HASH, %%XTMP1, %%XTMP4                    ; the result is in %%T1
406
407        sub     %%T2, 128
408        je      %%_CALC_AAD_done
409
410        add     %%T1, 128
411        jmp     %%_get_AAD_loop128
412
413%%_exit_AAD_loop128:
414        cmp     %%T2, 16
415        jl      %%_get_small_AAD_block
416
417        ;; calculate hash_key position to start with
418        mov     %%T3, %%T2
419        and     %%T3, -16       ; 1 to 7 blocks possible here
420        neg     %%T3
421        add     %%T3, HashKey_1 + 16
422        lea     %%T3, [%%GDATA_KEY + %%T3]
423
424        vmovdqu         %%XTMP0, [%%T1]
425        vpshufb         %%XTMP0, [rel SHUF_MASK]
426
427        vpxor           %%XTMP0, %%AAD_HASH
428
429        vmovdqu         %%XTMP5, [%%T3]
430        vpclmulqdq      %%XTMP1, %%XTMP0, %%XTMP5, 0x11                 ; %%T1 = a1*b1
431        vpclmulqdq      %%XTMP2, %%XTMP0, %%XTMP5, 0x00                 ; %%T2 = a0*b0
432        vpclmulqdq      %%XTMP3, %%XTMP0, %%XTMP5, 0x01                 ; %%T3 = a1*b0
433        vpclmulqdq      %%XTMP4, %%XTMP0, %%XTMP5, 0x10                 ; %%T4 = a0*b1
434        vpxor           %%XTMP3, %%XTMP3, %%XTMP4                       ; %%T3 = a1*b0 + a0*b1
435
436        add     %%T3, 16        ; move to next hashkey
437        add     %%T1, 16        ; move to next data block
438        sub     %%T2, 16
439        cmp     %%T2, 16
440        jl      %%_AAD_reduce
441
442%%_AAD_blocks:
443        vmovdqu         %%XTMP0, [%%T1]
444        vpshufb         %%XTMP0, [rel SHUF_MASK]
445
446        vmovdqu         %%XTMP5, [%%T3]
447        vpclmulqdq      %%XTMP4, %%XTMP0, %%XTMP5, 0x11                 ; %%T1 = T1 + a1*b1
448        vpxor           %%XTMP1, %%XTMP1, %%XTMP4
449
450        vpclmulqdq      %%XTMP4, %%XTMP0, %%XTMP5, 0x00                 ; %%T2 = T2 + a0*b0
451        vpxor           %%XTMP2, %%XTMP2, %%XTMP4
452
453        vpclmulqdq      %%XTMP4, %%XTMP0, %%XTMP5, 0x01                 ; %%T3 = T3 + a1*b0 + a0*b1
454        vpxor           %%XTMP3, %%XTMP3, %%XTMP4
455        vpclmulqdq      %%XTMP4, %%XTMP0, %%XTMP5, 0x10
456        vpxor           %%XTMP3, %%XTMP3, %%XTMP4
457
458        add     %%T3, 16        ; move to next hashkey
459        add     %%T1, 16
460        sub     %%T2, 16
461        cmp     %%T2, 16
462        jl      %%_AAD_reduce
463        jmp     %%_AAD_blocks
464
465%%_AAD_reduce:
466        vpslldq         %%XTMP4, %%XTMP3, 8                             ; shift-L 2 DWs
467        vpsrldq         %%XTMP3, %%XTMP3, 8                             ; shift-R 2 DWs
468        vpxor           %%XTMP2, %%XTMP2, %%XTMP4
469        vpxor           %%XTMP1, %%XTMP1, %%XTMP3                       ; accumulate the results in %%T1(M):%%T2(L)
470
471        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
472        ;first phase of the reduction
473        vmovdqa         %%XTMP5, [rel POLY2]
474        vpclmulqdq      %%XTMP0, %%XTMP5, %%XTMP2, 0x01
475        vpslldq         %%XTMP0, %%XTMP0, 8                             ; shift-L xmm2 2 DWs
476        vpxor           %%XTMP2, %%XTMP2, %%XTMP0                       ; first phase of the reduction complete
477
478        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
479        ;second phase of the reduction
480        vpclmulqdq      %%XTMP3, %%XTMP5, %%XTMP2, 0x00
481        vpsrldq         %%XTMP3, %%XTMP3, 4                             ; shift-R 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
482
483        vpclmulqdq      %%XTMP4, %%XTMP5, %%XTMP2, 0x10
484        vpslldq         %%XTMP4, %%XTMP4, 4                             ; shift-L 1 DW (Shift-L 1-DW to obtain result with no shifts)
485
486        vpxor           %%XTMP4, %%XTMP4, %%XTMP3                       ; second phase of the reduction complete
487        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
488        vpxor           %%AAD_HASH, %%XTMP1, %%XTMP4                    ; the result is in %%T1
489
490        or      %%T2, %%T2
491        je      %%_CALC_AAD_done
492
493%%_get_small_AAD_block:
494        vmovdqu         %%XTMP0, [%%GDATA_KEY + HashKey]
495        READ_SMALL_DATA_INPUT   %%XTMP1, %%T1, %%T2, %%T3, %%T4, %%T5
496        ;byte-reflect the AAD data
497        vpshufb         %%XTMP1, [rel SHUF_MASK]
498        vpxor           %%AAD_HASH, %%XTMP1
499        GHASH_MUL       %%AAD_HASH, %%XTMP0, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5
500
501%%_CALC_AAD_done:
502
503%endmacro ; CALC_AAD_HASH
504
505
506
507;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
508; PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks between update calls.
509; Requires the input data be at least 1 byte long.
510; Input: gcm_key_data * (GDATA_KEY), gcm_context_data *(GDATA_CTX), input text (PLAIN_CYPH_IN),
511; input text length (PLAIN_CYPH_LEN), the current data offset (DATA_OFFSET),
512; the hash subkey (HASH_SUBKEY) and whether encoding or decoding (ENC_DEC)
513; Output: A cypher of the first partial block (CYPH_PLAIN_OUT), and updated GDATA_CTX
514; Clobbers rax, r10, r12, r13, r15, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm9, xmm10, xmm11
515;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
516%macro PARTIAL_BLOCK    8
517%define %%GDATA_CTX             %1
518%define %%CYPH_PLAIN_OUT        %2
519%define %%PLAIN_CYPH_IN         %3
520%define %%PLAIN_CYPH_LEN        %4
521%define %%DATA_OFFSET           %5
522%define %%AAD_HASH              %6
523%define %%HASH_SUBKEY           %7
524%define %%ENC_DEC               %8
525
526        mov     r13, [%%GDATA_CTX + PBlockLen]
527        cmp     r13, 0
528        je      %%_partial_block_done           ;Leave Macro if no partial blocks
529
530        cmp     %%PLAIN_CYPH_LEN, 16            ;Read in input data without over reading
531        jl      %%_fewer_than_16_bytes
532        VXLDR   xmm1, [%%PLAIN_CYPH_IN]         ;If more than 16 bytes of data, just fill the xmm register
533        jmp     %%_data_read
534
535%%_fewer_than_16_bytes:
536        lea     r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
537        READ_SMALL_DATA_INPUT   xmm1, r10, %%PLAIN_CYPH_LEN, rax, r12, r15
538
539%%_data_read:                           ;Finished reading in data
540
541
542        vmovdqu xmm9, [%%GDATA_CTX + PBlockEncKey]  ;xmm9 = my_ctx_data.partial_block_enc_key
543
544        lea     r12, [rel SHIFT_MASK]
545
546        add     r12, r13                        ; adjust the shuffle mask pointer to be able to shift r13 bytes (16-r13 is the number of bytes in plaintext mod 16)
547        vmovdqu xmm2, [r12]                     ; get the appropriate shuffle mask
548        vpshufb xmm9, xmm2                      ;shift right r13 bytes
549
550%ifidn  %%ENC_DEC, DEC
551        vmovdqa xmm3, xmm1
552        vpxor   xmm9, xmm1                      ; Cyphertext XOR E(K, Yn)
553
554        mov     r15, %%PLAIN_CYPH_LEN
555        add     r15, r13
556        sub     r15, 16                         ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block
557        jge     %%_no_extra_mask_1              ;Determine if if partial block is not being filled and shift mask accordingly
558        sub     r12, r15
559%%_no_extra_mask_1:
560
561        vmovdqu xmm1, [r12 + ALL_F - SHIFT_MASK]; get the appropriate mask to mask out bottom r13 bytes of xmm9
562        vpand   xmm9, xmm1                      ; mask out bottom r13 bytes of xmm9
563
564        vpand   xmm3, xmm1
565        vpshufb xmm3, [rel SHUF_MASK]
566        vpshufb xmm3, xmm2
567        vpxor   %%AAD_HASH, xmm3
568
569
570        cmp     r15,0
571        jl      %%_partial_incomplete_1
572
573        GHASH_MUL       %%AAD_HASH, %%HASH_SUBKEY, xmm0, xmm10, xmm11, xmm5, xmm6       ;GHASH computation for the last <16 Byte block
574        xor     rax,rax
575        mov     [%%GDATA_CTX + PBlockLen], rax
576        jmp     %%_dec_done
577%%_partial_incomplete_1:
578%ifidn __OUTPUT_FORMAT__, win64
579        mov     rax, %%PLAIN_CYPH_LEN
580       	add     [%%GDATA_CTX + PBlockLen], rax
581%else
582        add     [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN
583%endif
584%%_dec_done:
585        vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH
586
587%else
588        vpxor   xmm9, xmm1      ; Plaintext XOR E(K, Yn)
589
590        mov     r15, %%PLAIN_CYPH_LEN
591        add     r15, r13
592        sub     r15, 16                         ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block
593        jge     %%_no_extra_mask_2              ;Determine if if partial block is not being filled and shift mask accordingly
594        sub     r12, r15
595%%_no_extra_mask_2:
596
597        vmovdqu xmm1, [r12 + ALL_F-SHIFT_MASK]  ; get the appropriate mask to mask out bottom r13 bytes of xmm9
598        vpand   xmm9, xmm1                      ; mask out bottom r13  bytes of xmm9
599
600        vpshufb xmm9, [rel SHUF_MASK]
601        vpshufb xmm9, xmm2
602        vpxor   %%AAD_HASH, xmm9
603
604        cmp     r15,0
605        jl      %%_partial_incomplete_2
606
607        GHASH_MUL       %%AAD_HASH, %%HASH_SUBKEY, xmm0, xmm10, xmm11, xmm5, xmm6       ;GHASH computation for the last <16 Byte block
608        xor     rax,rax
609        mov     [%%GDATA_CTX + PBlockLen], rax
610        jmp     %%_encode_done
611%%_partial_incomplete_2:
612%ifidn __OUTPUT_FORMAT__, win64
613        mov     rax, %%PLAIN_CYPH_LEN
614       	add     [%%GDATA_CTX + PBlockLen], rax
615%else
616        add     [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN
617%endif
618%%_encode_done:
619        vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH
620
621        vpshufb xmm9, [rel SHUF_MASK]       ; shuffle xmm9 back to output as ciphertext
622        vpshufb xmm9, xmm2
623%endif
624
625
626        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
627        ; output encrypted Bytes
628        cmp     r15,0
629        jl      %%_partial_fill
630        mov     r12, r13
631        mov     r13, 16
632        sub     r13, r12                        ; Set r13 to be the number of bytes to write out
633        jmp     %%_count_set
634%%_partial_fill:
635        mov     r13, %%PLAIN_CYPH_LEN
636%%_count_set:
637        vmovq   rax, xmm9
638        cmp     r13, 8
639        jle     %%_less_than_8_bytes_left
640
641        mov     [%%CYPH_PLAIN_OUT+ %%DATA_OFFSET], rax
642        add     %%DATA_OFFSET, 8
643        vpsrldq xmm9, xmm9, 8
644        vmovq   rax, xmm9
645        sub     r13, 8
646%%_less_than_8_bytes_left:
647        mov     BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al
648        add     %%DATA_OFFSET, 1
649        shr     rax, 8
650        sub     r13, 1
651        jne     %%_less_than_8_bytes_left
652         ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
653
654%%_partial_block_done:
655%endmacro ; PARTIAL_BLOCK
656
657
658%macro GHASH_SINGLE_MUL 9
659%define %%GDATA                 %1
660%define %%HASHKEY               %2
661%define %%CIPHER                %3
662%define %%STATE_11              %4
663%define %%STATE_00              %5
664%define %%STATE_MID             %6
665%define %%T1                    %7
666%define %%T2                    %8
667%define %%FIRST                 %9
668
669        vmovdqu         %%T1, [%%GDATA + %%HASHKEY]
670%ifidn %%FIRST, first
671        vpclmulqdq      %%STATE_11, %%CIPHER, %%T1, 0x11         ; %%T4 = a1*b1
672        vpclmulqdq      %%STATE_00, %%CIPHER, %%T1, 0x00         ; %%T4_2 = a0*b0
673        vpclmulqdq      %%STATE_MID, %%CIPHER, %%T1, 0x01        ; %%T6 = a1*b0
674        vpclmulqdq      %%T2, %%CIPHER, %%T1, 0x10               ; %%T5 = a0*b1
675        vpxor           %%STATE_MID, %%STATE_MID, %%T2
676%else
677        vpclmulqdq      %%T2, %%CIPHER, %%T1, 0x11
678        vpxor           %%STATE_11, %%STATE_11, %%T2
679
680        vpclmulqdq      %%T2, %%CIPHER, %%T1, 0x00
681        vpxor           %%STATE_00, %%STATE_00, %%T2
682
683        vpclmulqdq      %%T2, %%CIPHER, %%T1, 0x01
684        vpxor           %%STATE_MID, %%STATE_MID, %%T2
685
686        vpclmulqdq      %%T2, %%CIPHER, %%T1, 0x10
687        vpxor           %%STATE_MID, %%STATE_MID, %%T2
688%endif
689
690%endmacro
691
692; if a = number of total plaintext bytes
693; b = floor(a/16)
694; %%num_initial_blocks = b mod 8;
695; encrypt the initial %%num_initial_blocks blocks and apply ghash on the ciphertext
696; %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r14 are used as a pointer only, not modified.
697; Updated AAD_HASH is returned in %%T3
698
699%macro INITIAL_BLOCKS 23
700%define %%GDATA_KEY             %1
701%define %%CYPH_PLAIN_OUT        %2
702%define %%PLAIN_CYPH_IN         %3
703%define %%LENGTH                %4
704%define %%DATA_OFFSET           %5
705%define %%num_initial_blocks    %6      ; can be 0, 1, 2, 3, 4, 5, 6 or 7
706%define %%T1                    %7
707%define %%T2                    %8
708%define %%T3                    %9
709%define %%T4                    %10
710%define %%T5                    %11
711%define %%CTR                   %12
712%define %%XMM1                  %13
713%define %%XMM2                  %14
714%define %%XMM3                  %15
715%define %%XMM4                  %16
716%define %%XMM5                  %17
717%define %%XMM6                  %18
718%define %%XMM7                  %19
719%define %%XMM8                  %20
720%define %%T6                    %21
721%define %%T_key                 %22
722%define %%ENC_DEC               %23
723
724%assign i (8-%%num_initial_blocks)
725                ;; Move AAD_HASH to temp reg
726                vmovdqu  %%T2, %%XMM8
727                ;; Start AES for %%num_initial_blocks blocks
728                ;; vmovdqu  %%CTR, [%%GDATA_CTX + CurCount]   ; %%CTR = Y0
729
730%assign i (9-%%num_initial_blocks)
731%rep %%num_initial_blocks
732                vpaddd   %%CTR, %%CTR, [rel ONE]     ; INCR Y0
733                vmovdqa  reg(i), %%CTR
734                vpshufb  reg(i), [rel SHUF_MASK]     ; perform a 16Byte swap
735%assign i (i+1)
736%endrep
737
738%if(%%num_initial_blocks>0)
739vmovdqu  %%T_key, [%%GDATA_KEY+16*0]
740%assign i (9-%%num_initial_blocks)
741%rep %%num_initial_blocks
742                vpxor    reg(i),reg(i),%%T_key
743%assign i (i+1)
744%endrep
745
746%assign j 1
747%rep NROUNDS
748vmovdqu  %%T_key, [%%GDATA_KEY+16*j]
749%assign i (9-%%num_initial_blocks)
750%rep %%num_initial_blocks
751                vaesenc  reg(i),%%T_key
752%assign i (i+1)
753%endrep
754
755%assign j (j+1)
756%endrep
757
758
759vmovdqu  %%T_key, [%%GDATA_KEY+16*j]
760%assign i (9-%%num_initial_blocks)
761%rep %%num_initial_blocks
762                vaesenclast      reg(i),%%T_key
763%assign i (i+1)
764%endrep
765
766%endif ; %if(%%num_initial_blocks>0)
767
768
769
770%assign i (9-%%num_initial_blocks)
771%rep %%num_initial_blocks
772                VXLDR  %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
773                vpxor    reg(i), reg(i), %%T1
774                ;; Write back ciphertext for %%num_initial_blocks blocks
775                VXSTR  [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i)
776                add     %%DATA_OFFSET, 16
777                %ifidn  %%ENC_DEC, DEC
778                    vmovdqa  reg(i), %%T1
779                %endif
780                ;; Prepare ciphertext for GHASH computations
781                vpshufb  reg(i), [rel SHUF_MASK]
782%assign i (i+1)
783%endrep
784
785
786;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
787
788%assign i (9-%%num_initial_blocks)
789%if(%%num_initial_blocks>0)
790        vmovdqa %%T3, reg(i)
791%assign i (i+1)
792%endif
793%if(%%num_initial_blocks>1)
794%rep %%num_initial_blocks-1
795        vmovdqu [rsp + TMP %+ i], reg(i)
796%assign i (i+1)
797%endrep
798%endif
799                ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
800                ;; Prepare 8 counter blocks and perform rounds of AES cipher on
801                ;; them, load plain/cipher text and store cipher/plain text.
802                ;; Stitch GHASH computation in between AES rounds.
803                vpaddd   %%XMM1, %%CTR, [rel ONE]   ; INCR Y0
804                vpaddd   %%XMM2, %%CTR, [rel TWO]   ; INCR Y0
805                vpaddd   %%XMM3, %%XMM1, [rel TWO]  ; INCR Y0
806                vpaddd   %%XMM4, %%XMM2, [rel TWO]  ; INCR Y0
807                vpaddd   %%XMM5, %%XMM3, [rel TWO]  ; INCR Y0
808                vpaddd   %%XMM6, %%XMM4, [rel TWO]  ; INCR Y0
809                vpaddd   %%XMM7, %%XMM5, [rel TWO]  ; INCR Y0
810                vpaddd   %%XMM8, %%XMM6, [rel TWO]  ; INCR Y0
811                vmovdqa  %%CTR, %%XMM8
812
813                vpshufb  %%XMM1, [rel SHUF_MASK]    ; perform a 16Byte swap
814                vpshufb  %%XMM2, [rel SHUF_MASK]    ; perform a 16Byte swap
815                vpshufb  %%XMM3, [rel SHUF_MASK]    ; perform a 16Byte swap
816                vpshufb  %%XMM4, [rel SHUF_MASK]    ; perform a 16Byte swap
817                vpshufb  %%XMM5, [rel SHUF_MASK]    ; perform a 16Byte swap
818                vpshufb  %%XMM6, [rel SHUF_MASK]    ; perform a 16Byte swap
819                vpshufb  %%XMM7, [rel SHUF_MASK]    ; perform a 16Byte swap
820                vpshufb  %%XMM8, [rel SHUF_MASK]    ; perform a 16Byte swap
821
822                vmovdqu  %%T_key, [%%GDATA_KEY+16*0]
823                vpxor    %%XMM1, %%XMM1, %%T_key
824                vpxor    %%XMM2, %%XMM2, %%T_key
825                vpxor    %%XMM3, %%XMM3, %%T_key
826                vpxor    %%XMM4, %%XMM4, %%T_key
827                vpxor    %%XMM5, %%XMM5, %%T_key
828                vpxor    %%XMM6, %%XMM6, %%T_key
829                vpxor    %%XMM7, %%XMM7, %%T_key
830                vpxor    %%XMM8, %%XMM8, %%T_key
831
832%assign i (8-%%num_initial_blocks)
833%assign j (9-%%num_initial_blocks)
834%assign k (%%num_initial_blocks)
835
836%define %%T4_2 %%T4
837%if(%%num_initial_blocks>0)
838        ;; Hash in AES state
839        ;; T2 - incoming AAD hash
840        vpxor %%T2, %%T3
841
842        ;;                 GDATA,       HASHKEY, CIPHER,
843        ;;               STATE_11, STATE_00, STATE_MID, T1, T2
844        GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
845                         %%T1,     %%T4,   %%T6,    %%T5, %%T3, first
846%endif
847
848                vmovdqu  %%T_key, [%%GDATA_KEY+16*1]
849                vaesenc  %%XMM1, %%T_key
850                vaesenc  %%XMM2, %%T_key
851                vaesenc  %%XMM3, %%T_key
852                vaesenc  %%XMM4, %%T_key
853                vaesenc  %%XMM5, %%T_key
854                vaesenc  %%XMM6, %%T_key
855                vaesenc  %%XMM7, %%T_key
856                vaesenc  %%XMM8, %%T_key
857
858                vmovdqu  %%T_key, [%%GDATA_KEY+16*2]
859                vaesenc  %%XMM1, %%T_key
860                vaesenc  %%XMM2, %%T_key
861                vaesenc  %%XMM3, %%T_key
862                vaesenc  %%XMM4, %%T_key
863                vaesenc  %%XMM5, %%T_key
864                vaesenc  %%XMM6, %%T_key
865                vaesenc  %%XMM7, %%T_key
866                vaesenc  %%XMM8, %%T_key
867
868%assign i (i+1)
869%assign j (j+1)
870%assign k (k-1)
871%if(%%num_initial_blocks>1)
872        ;;                 GDATA,       HASHKEY, CIPHER,
873        ;;               STATE_11, STATE_00, STATE_MID, T1, T2
874        vmovdqu         %%T2, [rsp + TMP %+ j]
875        GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
876                         %%T1,     %%T4,   %%T6,    %%T5, %%T3, not_first
877%endif
878
879                vmovdqu  %%T_key, [%%GDATA_KEY+16*3]
880                vaesenc  %%XMM1, %%T_key
881                vaesenc  %%XMM2, %%T_key
882                vaesenc  %%XMM3, %%T_key
883                vaesenc  %%XMM4, %%T_key
884                vaesenc  %%XMM5, %%T_key
885                vaesenc  %%XMM6, %%T_key
886                vaesenc  %%XMM7, %%T_key
887                vaesenc  %%XMM8, %%T_key
888
889                vmovdqu  %%T_key, [%%GDATA_KEY+16*4]
890                vaesenc  %%XMM1, %%T_key
891                vaesenc  %%XMM2, %%T_key
892                vaesenc  %%XMM3, %%T_key
893                vaesenc  %%XMM4, %%T_key
894                vaesenc  %%XMM5, %%T_key
895                vaesenc  %%XMM6, %%T_key
896                vaesenc  %%XMM7, %%T_key
897                vaesenc  %%XMM8, %%T_key
898
899%assign i (i+1)
900%assign j (j+1)
901%assign k (k-1)
902%if(%%num_initial_blocks>2)
903        ;;                 GDATA,       HASHKEY, CIPHER,
904        ;;               STATE_11, STATE_00, STATE_MID, T1, T2
905        vmovdqu         %%T2, [rsp + TMP %+ j]
906        GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
907                         %%T1,     %%T4,   %%T6,    %%T5, %%T3, not_first
908%endif
909
910%assign i (i+1)
911%assign j (j+1)
912%assign k (k-1)
913%if(%%num_initial_blocks>3)
914        ;;                 GDATA,       HASHKEY, CIPHER,
915        ;;               STATE_11, STATE_00, STATE_MID, T1, T2
916        vmovdqu         %%T2, [rsp + TMP %+ j]
917        GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
918                         %%T1,     %%T4,   %%T6,    %%T5, %%T3, not_first
919%endif
920
921                vmovdqu  %%T_key, [%%GDATA_KEY+16*5]
922                vaesenc  %%XMM1, %%T_key
923                vaesenc  %%XMM2, %%T_key
924                vaesenc  %%XMM3, %%T_key
925                vaesenc  %%XMM4, %%T_key
926                vaesenc  %%XMM5, %%T_key
927                vaesenc  %%XMM6, %%T_key
928                vaesenc  %%XMM7, %%T_key
929                vaesenc  %%XMM8, %%T_key
930
931                vmovdqu  %%T_key, [%%GDATA_KEY+16*6]
932                vaesenc  %%XMM1, %%T_key
933                vaesenc  %%XMM2, %%T_key
934                vaesenc  %%XMM3, %%T_key
935                vaesenc  %%XMM4, %%T_key
936                vaesenc  %%XMM5, %%T_key
937                vaesenc  %%XMM6, %%T_key
938                vaesenc  %%XMM7, %%T_key
939                vaesenc  %%XMM8, %%T_key
940
941%assign i (i+1)
942%assign j (j+1)
943%assign k (k-1)
944%if(%%num_initial_blocks>4)
945        ;;                 GDATA,       HASHKEY, CIPHER,
946        ;;               STATE_11, STATE_00, STATE_MID, T1, T2
947        vmovdqu         %%T2, [rsp + TMP %+ j]
948        GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
949                         %%T1,     %%T4,   %%T6,    %%T5, %%T3, not_first
950%endif
951
952                vmovdqu  %%T_key, [%%GDATA_KEY+16*7]
953                vaesenc  %%XMM1, %%T_key
954                vaesenc  %%XMM2, %%T_key
955                vaesenc  %%XMM3, %%T_key
956                vaesenc  %%XMM4, %%T_key
957                vaesenc  %%XMM5, %%T_key
958                vaesenc  %%XMM6, %%T_key
959                vaesenc  %%XMM7, %%T_key
960                vaesenc  %%XMM8, %%T_key
961
962                vmovdqu  %%T_key, [%%GDATA_KEY+16*8]
963                vaesenc  %%XMM1, %%T_key
964                vaesenc  %%XMM2, %%T_key
965                vaesenc  %%XMM3, %%T_key
966                vaesenc  %%XMM4, %%T_key
967                vaesenc  %%XMM5, %%T_key
968                vaesenc  %%XMM6, %%T_key
969                vaesenc  %%XMM7, %%T_key
970                vaesenc  %%XMM8, %%T_key
971
972%assign i (i+1)
973%assign j (j+1)
974%assign k (k-1)
975%if(%%num_initial_blocks>5)
976        ;;                 GDATA,       HASHKEY, CIPHER,
977        ;;               STATE_11, STATE_00, STATE_MID, T1, T2
978        vmovdqu         %%T2, [rsp + TMP %+ j]
979        GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
980                         %%T1,     %%T4,   %%T6,    %%T5, %%T3, not_first
981%endif
982
983                vmovdqu  %%T_key, [%%GDATA_KEY+16*9]
984                vaesenc  %%XMM1, %%T_key
985                vaesenc  %%XMM2, %%T_key
986                vaesenc  %%XMM3, %%T_key
987                vaesenc  %%XMM4, %%T_key
988                vaesenc  %%XMM5, %%T_key
989                vaesenc  %%XMM6, %%T_key
990                vaesenc  %%XMM7, %%T_key
991                vaesenc  %%XMM8, %%T_key
992
993%ifndef GCM128_MODE
994                vmovdqu  %%T_key, [%%GDATA_KEY+16*10]
995                vaesenc  %%XMM1, %%T_key
996                vaesenc  %%XMM2, %%T_key
997                vaesenc  %%XMM3, %%T_key
998                vaesenc  %%XMM4, %%T_key
999                vaesenc  %%XMM5, %%T_key
1000                vaesenc  %%XMM6, %%T_key
1001                vaesenc  %%XMM7, %%T_key
1002                vaesenc  %%XMM8, %%T_key
1003%endif
1004
1005%assign i (i+1)
1006%assign j (j+1)
1007%assign k (k-1)
1008%if(%%num_initial_blocks>6)
1009        ;;                 GDATA,       HASHKEY, CIPHER,
1010        ;;               STATE_11, STATE_00, STATE_MID, T1, T2
1011        vmovdqu         %%T2, [rsp + TMP %+ j]
1012        GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
1013                         %%T1,     %%T4,   %%T6,    %%T5, %%T3, not_first
1014%endif
1015
1016%ifdef GCM128_MODE
1017                vmovdqu  %%T_key, [%%GDATA_KEY+16*10]
1018                vaesenclast  %%XMM1, %%T_key
1019                vaesenclast  %%XMM2, %%T_key
1020                vaesenclast  %%XMM3, %%T_key
1021                vaesenclast  %%XMM4, %%T_key
1022                vaesenclast  %%XMM5, %%T_key
1023                vaesenclast  %%XMM6, %%T_key
1024                vaesenclast  %%XMM7, %%T_key
1025                vaesenclast  %%XMM8, %%T_key
1026%endif
1027
1028%ifdef GCM192_MODE
1029                vmovdqu  %%T_key, [%%GDATA_KEY+16*11]
1030                vaesenc  %%XMM1, %%T_key
1031                vaesenc  %%XMM2, %%T_key
1032                vaesenc  %%XMM3, %%T_key
1033                vaesenc  %%XMM4, %%T_key
1034                vaesenc  %%XMM5, %%T_key
1035                vaesenc  %%XMM6, %%T_key
1036                vaesenc  %%XMM7, %%T_key
1037                vaesenc  %%XMM8, %%T_key
1038
1039                vmovdqu          %%T_key, [%%GDATA_KEY+16*12]
1040                vaesenclast      %%XMM1, %%T_key
1041                vaesenclast      %%XMM2, %%T_key
1042                vaesenclast      %%XMM3, %%T_key
1043                vaesenclast      %%XMM4, %%T_key
1044                vaesenclast      %%XMM5, %%T_key
1045                vaesenclast      %%XMM6, %%T_key
1046                vaesenclast      %%XMM7, %%T_key
1047                vaesenclast      %%XMM8, %%T_key
1048%endif
1049%ifdef GCM256_MODE
1050                vmovdqu  %%T_key, [%%GDATA_KEY+16*11]
1051                vaesenc  %%XMM1, %%T_key
1052                vaesenc  %%XMM2, %%T_key
1053                vaesenc  %%XMM3, %%T_key
1054                vaesenc  %%XMM4, %%T_key
1055                vaesenc  %%XMM5, %%T_key
1056                vaesenc  %%XMM6, %%T_key
1057                vaesenc  %%XMM7, %%T_key
1058                vaesenc  %%XMM8, %%T_key
1059
1060                vmovdqu          %%T_key, [%%GDATA_KEY+16*12]
1061                vaesenc  %%XMM1, %%T_key
1062                vaesenc  %%XMM2, %%T_key
1063                vaesenc  %%XMM3, %%T_key
1064                vaesenc  %%XMM4, %%T_key
1065                vaesenc  %%XMM5, %%T_key
1066                vaesenc  %%XMM6, %%T_key
1067                vaesenc  %%XMM7, %%T_key
1068                vaesenc  %%XMM8, %%T_key
1069%endif
1070
1071%assign i (i+1)
1072%assign j (j+1)
1073%assign k (k-1)
1074%if(%%num_initial_blocks>7)
1075        ;;                 GDATA,       HASHKEY, CIPHER,
1076        ;;               STATE_11, STATE_00, STATE_MID, T1, T2
1077        vmovdqu         %%T2, [rsp + TMP %+ j]
1078        GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
1079                         %%T1,     %%T4,   %%T6,    %%T5, %%T3, not_first
1080%endif
1081
1082%ifdef GCM256_MODE             ; GCM256
1083                vmovdqu  %%T_key, [%%GDATA_KEY+16*13]
1084                vaesenc  %%XMM1, %%T_key
1085                vaesenc  %%XMM2, %%T_key
1086                vaesenc  %%XMM3, %%T_key
1087                vaesenc  %%XMM4, %%T_key
1088                vaesenc  %%XMM5, %%T_key
1089                vaesenc  %%XMM6, %%T_key
1090                vaesenc  %%XMM7, %%T_key
1091                vaesenc  %%XMM8, %%T_key
1092
1093                vmovdqu          %%T_key, [%%GDATA_KEY+16*14]
1094                vaesenclast      %%XMM1, %%T_key
1095                vaesenclast      %%XMM2, %%T_key
1096                vaesenclast      %%XMM3, %%T_key
1097                vaesenclast      %%XMM4, %%T_key
1098                vaesenclast      %%XMM5, %%T_key
1099                vaesenclast      %%XMM6, %%T_key
1100                vaesenclast      %%XMM7, %%T_key
1101                vaesenclast      %%XMM8, %%T_key
1102%endif                          ;  GCM256 mode
1103
1104%if(%%num_initial_blocks>0)
1105        vpsrldq %%T3, %%T6, 8            ; shift-R %%T2 2 DWs
1106        vpslldq %%T6, %%T6, 8            ; shift-L %%T3 2 DWs
1107        vpxor   %%T1, %%T1, %%T3         ; accumulate the results in %%T1:%%T4
1108        vpxor   %%T4, %%T6, %%T4
1109
1110        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1111        ; First phase of the reduction
1112        vmovdqa         %%T3, [rel POLY2]
1113
1114        vpclmulqdq      %%T2, %%T3, %%T4, 0x01
1115        vpslldq         %%T2, %%T2, 8             ; shift-L xmm2 2 DWs
1116
1117        ;; First phase of the reduction complete
1118        vpxor           %%T4, %%T4, %%T2
1119
1120        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1121        ; Second phase of the reduction
1122        vpclmulqdq      %%T2, %%T3, %%T4, 0x00
1123        ;; Shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
1124        vpsrldq         %%T2, %%T2, 4
1125
1126        vpclmulqdq      %%T4, %%T3, %%T4, 0x10
1127        ;; Shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts)
1128        vpslldq         %%T4, %%T4, 4
1129        ;; Second phase of the reduction complete
1130        vpxor           %%T4, %%T4, %%T2
1131        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1132        ; The result is in %%T3
1133        vpxor           %%T3, %%T1, %%T4
1134%else
1135        ;; The hash should end up in T3
1136        vmovdqa  %%T3, %%T2
1137%endif
1138
1139        ;; Final hash is now in T3
1140%if %%num_initial_blocks > 0
1141        ;; NOTE: obsolete in case %%num_initial_blocks = 0
1142        sub     %%LENGTH, 16*%%num_initial_blocks
1143%endif
1144
1145                VXLDR  %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*0]
1146                vpxor    %%XMM1, %%XMM1, %%T1
1147                VXSTR  [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*0], %%XMM1
1148                %ifidn  %%ENC_DEC, DEC
1149                vmovdqa  %%XMM1, %%T1
1150                %endif
1151
1152                VXLDR  %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*1]
1153                vpxor    %%XMM2, %%XMM2, %%T1
1154                VXSTR  [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*1], %%XMM2
1155                %ifidn  %%ENC_DEC, DEC
1156                vmovdqa  %%XMM2, %%T1
1157                %endif
1158
1159                VXLDR  %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*2]
1160                vpxor    %%XMM3, %%XMM3, %%T1
1161                VXSTR  [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*2], %%XMM3
1162                %ifidn  %%ENC_DEC, DEC
1163                vmovdqa  %%XMM3, %%T1
1164                %endif
1165
1166                VXLDR  %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*3]
1167                vpxor    %%XMM4, %%XMM4, %%T1
1168                VXSTR  [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*3], %%XMM4
1169                %ifidn  %%ENC_DEC, DEC
1170                vmovdqa  %%XMM4, %%T1
1171                %endif
1172
1173                VXLDR  %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*4]
1174                vpxor    %%XMM5, %%XMM5, %%T1
1175                VXSTR  [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*4], %%XMM5
1176                %ifidn  %%ENC_DEC, DEC
1177                vmovdqa  %%XMM5, %%T1
1178                %endif
1179
1180                VXLDR  %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*5]
1181                vpxor    %%XMM6, %%XMM6, %%T1
1182                VXSTR  [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*5], %%XMM6
1183                %ifidn  %%ENC_DEC, DEC
1184                vmovdqa  %%XMM6, %%T1
1185                %endif
1186
1187               VXLDR  %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*6]
1188                vpxor    %%XMM7, %%XMM7, %%T1
1189                VXSTR  [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*6], %%XMM7
1190                %ifidn  %%ENC_DEC, DEC
1191                vmovdqa  %%XMM7, %%T1
1192                %endif
1193
1194%if %%num_initial_blocks > 0
1195                ;; NOTE: 'jl' is never taken for %%num_initial_blocks = 0
1196                ;;      This macro is executed for length 128 and up,
1197                ;;      zero length is checked in GCM_ENC_DEC.
1198                ;; If the last block is partial then the xor will be done later
1199                ;; in ENCRYPT_FINAL_PARTIAL_BLOCK.
1200                ;; We know it's partial if LENGTH - 16*num_initial_blocks < 128
1201                cmp %%LENGTH, 128
1202                jl %%_initial_skip_last_word_write
1203%endif
1204                VXLDR  %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*7]
1205                vpxor    %%XMM8, %%XMM8, %%T1
1206                VXSTR  [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*7], %%XMM8
1207                %ifidn  %%ENC_DEC, DEC
1208                vmovdqa  %%XMM8, %%T1
1209                %endif
1210
1211                ;; Update %%LENGTH with the number of blocks processed
1212                sub     %%LENGTH, 16
1213                add     %%DATA_OFFSET, 16
1214%%_initial_skip_last_word_write:
1215                sub     %%LENGTH, 128-16
1216                add     %%DATA_OFFSET, 128-16
1217
1218                vpshufb  %%XMM1, [rel SHUF_MASK]             ; perform a 16Byte swap
1219                ;; Combine GHASHed value with the corresponding ciphertext
1220                vpxor    %%XMM1, %%XMM1, %%T3
1221                vpshufb  %%XMM2, [rel SHUF_MASK]             ; perform a 16Byte swap
1222                vpshufb  %%XMM3, [rel SHUF_MASK]             ; perform a 16Byte swap
1223                vpshufb  %%XMM4, [rel SHUF_MASK]             ; perform a 16Byte swap
1224                vpshufb  %%XMM5, [rel SHUF_MASK]             ; perform a 16Byte swap
1225                vpshufb  %%XMM6, [rel SHUF_MASK]             ; perform a 16Byte swap
1226                vpshufb  %%XMM7, [rel SHUF_MASK]             ; perform a 16Byte swap
1227                vpshufb  %%XMM8, [rel SHUF_MASK]             ; perform a 16Byte swap
1228
1229;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1230
1231%%_initial_blocks_done:
1232
1233
1234%endmacro
1235
1236;;; INITIAL_BLOCKS macro with support for a partial final block.
1237;;; num_initial_blocks is expected to include the partial final block
1238;;;     in the count.
1239%macro INITIAL_BLOCKS_PARTIAL 25
1240%define %%GDATA_KEY             %1
1241%define %%GDATA_CTX             %2
1242%define %%CYPH_PLAIN_OUT        %3
1243%define %%PLAIN_CYPH_IN         %4
1244%define %%LENGTH                %5
1245%define %%DATA_OFFSET           %6
1246%define %%num_initial_blocks    %7  ; can be 1, 2, 3, 4, 5, 6 or 7 (not 0)
1247%define %%T1                    %8
1248%define %%T2                    %9
1249%define %%T3                    %10
1250%define %%T4                    %11
1251%define %%T5                    %12
1252%define %%CTR                   %13
1253%define %%XMM1                  %14
1254%define %%XMM2                  %15
1255%define %%XMM3                  %16
1256%define %%XMM4                  %17
1257%define %%XMM5                  %18
1258%define %%XMM6                  %19
1259%define %%XMM7                  %20
1260%define %%XMM8                  %21
1261%define %%T6                    %22
1262%define %%T_key                 %23
1263%define %%ENC_DEC               %24
1264%define %%INSTANCE_TYPE         %25
1265
1266%assign i (8-%%num_initial_blocks)
1267                ;; Move AAD_HASH to temp reg
1268                vmovdqu  %%T2, %%XMM8
1269                ;; vmovdqu  %%CTR, [%%GDATA_CTX + CurCount]  ; %%CTR = Y0
1270
1271%assign i (9-%%num_initial_blocks)
1272%rep %%num_initial_blocks
1273                ;; Compute AES counters
1274                vpaddd   %%CTR, %%CTR, [rel ONE]     ; INCR Y0
1275                vmovdqa  reg(i), %%CTR
1276                vpshufb  reg(i), [rel SHUF_MASK]     ; perform a 16Byte swap
1277%assign i (i+1)
1278%endrep
1279
1280vmovdqu  %%T_key, [%%GDATA_KEY+16*0]
1281%assign i (9-%%num_initial_blocks)
1282%rep %%num_initial_blocks
1283                ; Start AES for %%num_initial_blocks blocks
1284                vpxor    reg(i),reg(i),%%T_key
1285%assign i (i+1)
1286%endrep
1287
1288%assign j 1
1289%rep NROUNDS
1290vmovdqu  %%T_key, [%%GDATA_KEY+16*j]
1291%assign i (9-%%num_initial_blocks)
1292%rep %%num_initial_blocks
1293                vaesenc  reg(i),%%T_key
1294%assign i (i+1)
1295%endrep
1296
1297%assign j (j+1)
1298%endrep
1299
1300
1301vmovdqu  %%T_key, [%%GDATA_KEY+16*j]
1302%assign i (9-%%num_initial_blocks)
1303%rep %%num_initial_blocks
1304                vaesenclast      reg(i),%%T_key
1305%assign i (i+1)
1306%endrep
1307
1308;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1309;;; Hash all but the last block of data
1310;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1311
1312%assign i (9-%%num_initial_blocks)
1313%rep %%num_initial_blocks-1
1314                ;; Encrypt the message for all but the last block
1315                VXLDR  %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
1316                vpxor    reg(i), reg(i), %%T1
1317                ;; write back ciphertext for %%num_initial_blocks blocks
1318                VXSTR  [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i)
1319                add     %%DATA_OFFSET, 16
1320                %ifidn  %%ENC_DEC, DEC
1321                    vmovdqa  reg(i), %%T1
1322                %endif
1323                ;; Prepare ciphertext for GHASH computations
1324                vpshufb  reg(i), [rel SHUF_MASK]
1325%assign i (i+1)
1326%endrep
1327
1328                ;; The final block of data may be <16B
1329                sub      %%LENGTH, 16*(%%num_initial_blocks-1)
1330
1331%if %%num_initial_blocks < 8
1332                ;; NOTE: the 'jl' is always taken for num_initial_blocks = 8.
1333                ;;      This is run in the context of GCM_ENC_DEC_SMALL for length < 128.
1334                cmp      %%LENGTH, 16
1335                jl       %%_small_initial_partial_block
1336
1337;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1338;;; Handle a full length final block - encrypt and hash all blocks
1339;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1340
1341                sub      %%LENGTH, 16
1342	        mov	[%%GDATA_CTX + PBlockLen], %%LENGTH
1343
1344                ;; Encrypt the message
1345                VXLDR  %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
1346                vpxor    reg(i), reg(i), %%T1
1347                ;; write back ciphertext for %%num_initial_blocks blocks
1348                VXSTR  [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i)
1349                add     %%DATA_OFFSET, 16
1350                %ifidn  %%ENC_DEC, DEC
1351                    vmovdqa  reg(i), %%T1
1352                %endif
1353                ;; Prepare ciphertext for GHASH computations
1354                vpshufb  reg(i), [rel SHUF_MASK]
1355
1356        ;; Hash all of the data
1357%assign i (8-%%num_initial_blocks)
1358%assign j (9-%%num_initial_blocks)
1359%assign k (%%num_initial_blocks)
1360%assign last_block_to_hash 0
1361
1362%if(%%num_initial_blocks>last_block_to_hash)
1363        ;; Hash in AES state
1364        vpxor %%T2, reg(j)
1365
1366        ;; T2 - incoming AAD hash
1367        ;; reg(i) holds ciphertext
1368        ;; T5 - hash key
1369        ;; T6 - updated xor
1370        ;; reg(1)/xmm1 should now be available for tmp use
1371        vmovdqu         %%T5, [%%GDATA_KEY + HashKey_ %+ k]
1372        vpclmulqdq      %%T1, %%T2, %%T5, 0x11             ; %%T4 = a1*b1
1373        vpclmulqdq      %%T4, %%T2, %%T5, 0x00             ; %%T4 = a0*b0
1374        vpclmulqdq      %%T6, %%T2, %%T5, 0x01             ; %%T6 = a1*b0
1375        vpclmulqdq      %%T5, %%T2, %%T5, 0x10             ; %%T5 = a0*b1
1376        vpxor           %%T6, %%T6, %%T5
1377%endif
1378
1379%assign i (i+1)
1380%assign j (j+1)
1381%assign k (k-1)
1382%assign rep_count (%%num_initial_blocks-1)
1383%rep rep_count
1384
1385        vmovdqu         %%T5, [%%GDATA_KEY + HashKey_ %+ k]
1386        vpclmulqdq      %%T3, reg(j), %%T5, 0x11
1387        vpxor           %%T1, %%T1, %%T3
1388
1389        vpclmulqdq      %%T3, reg(j), %%T5, 0x00
1390        vpxor           %%T4, %%T4, %%T3
1391
1392        vpclmulqdq      %%T3, reg(j), %%T5, 0x01
1393        vpxor           %%T6, %%T6, %%T3
1394
1395        vpclmulqdq      %%T3, reg(j), %%T5, 0x10
1396        vpxor           %%T6, %%T6, %%T3
1397
1398%assign i (i+1)
1399%assign j (j+1)
1400%assign k (k-1)
1401%endrep
1402
1403        ;; Record that a reduction is needed
1404        mov            r12, 1
1405
1406        jmp      %%_small_initial_compute_hash
1407
1408
1409%endif                          ; %if %%num_initial_blocks < 8
1410
1411%%_small_initial_partial_block:
1412
1413;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1414;;; Handle ghash for a <16B final block
1415;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1416
1417        ;; In this case if it's a single call to encrypt we can
1418        ;; hash all of the data but if it's an init / update / finalize
1419        ;; series of call we need to leave the last block if it's
1420        ;; less than a full block of data.
1421
1422	mov	[%%GDATA_CTX + PBlockLen], %%LENGTH
1423        vmovdqu [%%GDATA_CTX + PBlockEncKey], reg(i)
1424        ;; Handle a partial final block
1425        ;;                            GDATA,    KEY,   T1,   T2
1426        ;; r13 - length
1427        ;; LT16 - indicates type of read and that the buffer is less than 16 bytes long
1428        ;;      NOTE: could be replaced with %%LENGTH but at this point
1429        ;;      %%LENGTH is always less than 16.
1430        ;;      No PLAIN_CYPH_LEN argument available in this macro.
1431        ENCRYPT_FINAL_PARTIAL_BLOCK reg(i), %%T1, %%T3, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, LT16, %%ENC_DEC, %%DATA_OFFSET
1432        vpshufb  reg(i), [rel SHUF_MASK]
1433
1434%ifidn %%INSTANCE_TYPE, multi_call
1435%assign i (8-%%num_initial_blocks)
1436%assign j (9-%%num_initial_blocks)
1437%assign k (%%num_initial_blocks-1)
1438%assign last_block_to_hash 1
1439%else
1440%assign i (8-%%num_initial_blocks)
1441%assign j (9-%%num_initial_blocks)
1442%assign k (%%num_initial_blocks)
1443%assign last_block_to_hash 0
1444%endif
1445
1446%if(%%num_initial_blocks>last_block_to_hash)
1447        ;; Record that a reduction is needed
1448        mov            r12, 1
1449        ;; Hash in AES state
1450        vpxor          %%T2, reg(j)
1451
1452        ;; T2 - incoming AAD hash
1453        ;; reg(i) holds ciphertext
1454        ;; T5 - hash key
1455        ;; T6 - updated xor
1456        ;; reg(1)/xmm1 should now be available for tmp use
1457        vmovdqu         %%T5, [%%GDATA_KEY + HashKey_ %+ k]
1458        vpclmulqdq      %%T1, %%T2, %%T5, 0x11             ; %%T4 = a1*b1
1459        vpclmulqdq      %%T4, %%T2, %%T5, 0x00             ; %%T4 = a0*b0
1460        vpclmulqdq      %%T6, %%T2, %%T5, 0x01             ; %%T6 = a1*b0
1461        vpclmulqdq      %%T5, %%T2, %%T5, 0x10             ; %%T5 = a0*b1
1462        vpxor           %%T6, %%T6, %%T5
1463%else
1464        ;; Record that a reduction is not needed -
1465        ;; In this case no hashes are computed because there
1466        ;; is only one initial block and it is < 16B in length.
1467        mov            r12, 0
1468%endif
1469
1470%assign i (i+1)
1471%assign j (j+1)
1472%assign k (k-1)
1473%ifidn %%INSTANCE_TYPE, multi_call
1474%assign rep_count (%%num_initial_blocks-2)
1475%%_multi_call_hash:
1476%else
1477%assign rep_count (%%num_initial_blocks-1)
1478%endif
1479
1480%if rep_count < 0
1481        ;; quick fix for negative rep_count (to be investigated)
1482%assign rep_count 0
1483%endif
1484
1485%rep rep_count
1486
1487        vmovdqu         %%T5, [%%GDATA_KEY + HashKey_ %+ k]
1488        vpclmulqdq      %%T3, reg(j), %%T5, 0x11
1489        vpxor           %%T1, %%T1, %%T3
1490
1491        vpclmulqdq      %%T3, reg(j), %%T5, 0x00
1492        vpxor           %%T4, %%T4, %%T3
1493
1494        vpclmulqdq      %%T3, reg(j), %%T5, 0x01
1495        vpxor           %%T6, %%T6, %%T3
1496
1497        vpclmulqdq      %%T3, reg(j), %%T5, 0x10
1498        vpxor           %%T6, %%T6, %%T3
1499
1500%assign i (i+1)
1501%assign j (j+1)
1502%assign k (k-1)
1503%endrep
1504
1505%%_small_initial_compute_hash:
1506
1507;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1508;;; Ghash reduction
1509;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1510
1511%if(%%num_initial_blocks=1)
1512%ifidn %%INSTANCE_TYPE, multi_call
1513        ;; We only need to check if a reduction is needed if
1514        ;; initial_blocks == 1 and init/update/final is being used.
1515        ;; In this case we may just have a partial block, and that
1516        ;; gets hashed in finalize.
1517        cmp     r12, 0
1518        je      %%_no_reduction_needed
1519%endif
1520%endif
1521
1522        vpsrldq %%T3, %%T6, 8          ; shift-R %%T2 2 DWs
1523        vpslldq %%T6, %%T6, 8          ; shift-L %%T3 2 DWs
1524        vpxor   %%T1, %%T1, %%T3       ; accumulate the results in %%T1:%%T4
1525        vpxor   %%T4, %%T6, %%T4
1526
1527        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1528        ;; First phase of the reduction
1529        vmovdqa         %%T3, [rel POLY2]
1530
1531        vpclmulqdq      %%T2, %%T3, %%T4, 0x01
1532        ;; shift-L xmm2 2 DWs
1533        vpslldq         %%T2, %%T2, 8
1534        vpxor           %%T4, %%T4, %%T2
1535
1536        ;; First phase of the reduction complete
1537        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1538        ;; Second phase of the reduction
1539
1540        vpclmulqdq      %%T2, %%T3, %%T4, 0x00
1541        ;; Shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
1542        vpsrldq         %%T2, %%T2, 4
1543
1544        vpclmulqdq      %%T4, %%T3, %%T4, 0x10
1545        ;; Shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts)
1546        vpslldq         %%T4, %%T4, 4
1547
1548        vpxor           %%T4, %%T4, %%T2
1549        ;; Second phase of the reduction complete
1550        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1551        vpxor           %%T3, %%T1, %%T4
1552
1553%ifidn %%INSTANCE_TYPE, multi_call
1554        ;; If using init/update/finalize, we need to xor any partial block data
1555        ;; into the hash.
1556%if %%num_initial_blocks > 1
1557        ;; NOTE: for %%num_initial_blocks = 0 the xor never takes place
1558%if %%num_initial_blocks != 8
1559        ;; NOTE: for %%num_initial_blocks = 8, %%LENGTH, stored in [PBlockLen] is never zero
1560        cmp             qword [%%GDATA_CTX + PBlockLen], 0
1561        je              %%_no_partial_block_xor
1562%endif                          ; %%num_initial_blocks != 8
1563        vpxor           %%T3, %%T3, reg(8)
1564%%_no_partial_block_xor:
1565%endif                          ; %%num_initial_blocks > 1
1566%endif                          ; %%INSTANCE_TYPE, multi_call
1567
1568%if(%%num_initial_blocks=1)
1569%ifidn %%INSTANCE_TYPE, multi_call
1570        ;; NOTE: %%_no_reduction_needed case only valid for
1571        ;;      multi_call with initial_blocks = 1.
1572        ;; Look for comment above around '_no_reduction_needed'
1573        ;; The jmp below is obsolete as the code will fall through.
1574
1575        ;; The result is in %%T3
1576        jmp             %%_after_reduction
1577
1578%%_no_reduction_needed:
1579        ;; The hash should end up in T3. The only way we should get here is if
1580        ;; there is a partial block of data, so xor that into the hash.
1581        vpxor            %%T3, %%T2, reg(8)
1582%endif                          ; %%INSTANCE_TYPE = multi_call
1583%endif                          ; %%num_initial_blocks=1
1584
1585%%_after_reduction:
1586        ;; Final hash is now in T3
1587
1588%endmacro                       ; INITIAL_BLOCKS_PARTIAL
1589
1590
1591
1592; encrypt 8 blocks at a time
1593; ghash the 8 previously encrypted ciphertext blocks
1594; %%GDATA (KEY), %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN are used as pointers only, not modified
1595; %%DATA_OFFSET is the data offset value
1596%macro  GHASH_8_ENCRYPT_8_PARALLEL 23
1597%define %%GDATA                 %1
1598%define %%CYPH_PLAIN_OUT        %2
1599%define %%PLAIN_CYPH_IN         %3
1600%define %%DATA_OFFSET           %4
1601%define %%T1    %5
1602%define %%T2    %6
1603%define %%T3    %7
1604%define %%T4    %8
1605%define %%T5    %9
1606%define %%T6    %10
1607%define %%CTR   %11
1608%define %%XMM1  %12
1609%define %%XMM2  %13
1610%define %%XMM3  %14
1611%define %%XMM4  %15
1612%define %%XMM5  %16
1613%define %%XMM6  %17
1614%define %%XMM7  %18
1615%define %%XMM8  %19
1616%define %%T7    %20
1617%define %%loop_idx      %21
1618%define %%ENC_DEC       %22
1619%define %%FULL_PARTIAL  %23
1620
1621        vmovdqa %%T2, %%XMM1
1622        vmovdqu [rsp + TMP2], %%XMM2
1623        vmovdqu [rsp + TMP3], %%XMM3
1624        vmovdqu [rsp + TMP4], %%XMM4
1625        vmovdqu [rsp + TMP5], %%XMM5
1626        vmovdqu [rsp + TMP6], %%XMM6
1627        vmovdqu [rsp + TMP7], %%XMM7
1628        vmovdqu [rsp + TMP8], %%XMM8
1629
1630%ifidn %%loop_idx, in_order
1631                vpaddd  %%XMM1, %%CTR,  [rel ONE]           ; INCR CNT
1632                vmovdqa %%T5, [rel TWO]
1633                vpaddd  %%XMM2, %%CTR, %%T5
1634                vpaddd  %%XMM3, %%XMM1, %%T5
1635                vpaddd  %%XMM4, %%XMM2, %%T5
1636                vpaddd  %%XMM5, %%XMM3, %%T5
1637                vpaddd  %%XMM6, %%XMM4, %%T5
1638                vpaddd  %%XMM7, %%XMM5, %%T5
1639                vpaddd  %%XMM8, %%XMM6, %%T5
1640                vmovdqa %%CTR, %%XMM8
1641
1642                vmovdqa %%T5, [rel SHUF_MASK]
1643                vpshufb %%XMM1, %%T5             ; perform a 16Byte swap
1644                vpshufb %%XMM2, %%T5             ; perform a 16Byte swap
1645                vpshufb %%XMM3, %%T5             ; perform a 16Byte swap
1646                vpshufb %%XMM4, %%T5             ; perform a 16Byte swap
1647                vpshufb %%XMM5, %%T5             ; perform a 16Byte swap
1648                vpshufb %%XMM6, %%T5             ; perform a 16Byte swap
1649                vpshufb %%XMM7, %%T5             ; perform a 16Byte swap
1650                vpshufb %%XMM8, %%T5             ; perform a 16Byte swap
1651%else
1652                vpaddd  %%XMM1, %%CTR,  [rel ONEf]          ; INCR CNT
1653                vmovdqa %%T5, [rel TWOf]
1654                vpaddd  %%XMM2, %%CTR,  %%T5
1655                vpaddd  %%XMM3, %%XMM1, %%T5
1656                vpaddd  %%XMM4, %%XMM2, %%T5
1657                vpaddd  %%XMM5, %%XMM3, %%T5
1658                vpaddd  %%XMM6, %%XMM4, %%T5
1659                vpaddd  %%XMM7, %%XMM5, %%T5
1660                vpaddd  %%XMM8, %%XMM6, %%T5
1661                vmovdqa %%CTR, %%XMM8
1662%endif
1663
1664
1665
1666        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1667
1668                vmovdqu %%T1, [%%GDATA + 16*0]
1669                vpxor   %%XMM1, %%XMM1, %%T1
1670                vpxor   %%XMM2, %%XMM2, %%T1
1671                vpxor   %%XMM3, %%XMM3, %%T1
1672                vpxor   %%XMM4, %%XMM4, %%T1
1673                vpxor   %%XMM5, %%XMM5, %%T1
1674                vpxor   %%XMM6, %%XMM6, %%T1
1675                vpxor   %%XMM7, %%XMM7, %%T1
1676                vpxor   %%XMM8, %%XMM8, %%T1
1677
1678        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1679
1680                vmovdqu %%T1, [%%GDATA + 16*1]
1681                vaesenc %%XMM1, %%T1
1682                vaesenc %%XMM2, %%T1
1683                vaesenc %%XMM3, %%T1
1684                vaesenc %%XMM4, %%T1
1685                vaesenc %%XMM5, %%T1
1686                vaesenc %%XMM6, %%T1
1687                vaesenc %%XMM7, %%T1
1688                vaesenc %%XMM8, %%T1
1689
1690
1691                vmovdqu %%T1, [%%GDATA + 16*2]
1692                vaesenc %%XMM1, %%T1
1693                vaesenc %%XMM2, %%T1
1694                vaesenc %%XMM3, %%T1
1695                vaesenc %%XMM4, %%T1
1696                vaesenc %%XMM5, %%T1
1697                vaesenc %%XMM6, %%T1
1698                vaesenc %%XMM7, %%T1
1699                vaesenc %%XMM8, %%T1
1700
1701        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1702
1703        vmovdqu         %%T5, [%%GDATA + HashKey_8]
1704        vpclmulqdq      %%T4, %%T2, %%T5, 0x11                  ; %%T4 = a1*b1
1705        vpclmulqdq      %%T7, %%T2, %%T5, 0x00                  ; %%T7 = a0*b0
1706        vpclmulqdq      %%T6, %%T2, %%T5, 0x01                  ; %%T6 = a1*b0
1707        vpclmulqdq      %%T5, %%T2, %%T5, 0x10                  ; %%T5 = a0*b1
1708        vpxor           %%T6, %%T6, %%T5
1709
1710                vmovdqu %%T1, [%%GDATA + 16*3]
1711                vaesenc %%XMM1, %%T1
1712                vaesenc %%XMM2, %%T1
1713                vaesenc %%XMM3, %%T1
1714                vaesenc %%XMM4, %%T1
1715                vaesenc %%XMM5, %%T1
1716                vaesenc %%XMM6, %%T1
1717                vaesenc %%XMM7, %%T1
1718                vaesenc %%XMM8, %%T1
1719
1720        vmovdqu         %%T1, [rsp + TMP2]
1721        vmovdqu         %%T5, [%%GDATA + HashKey_7]
1722        vpclmulqdq      %%T3, %%T1, %%T5, 0x11
1723        vpxor           %%T4, %%T4, %%T3
1724
1725        vpclmulqdq      %%T3, %%T1, %%T5, 0x00
1726        vpxor           %%T7, %%T7, %%T3
1727
1728        vpclmulqdq      %%T3, %%T1, %%T5, 0x01
1729        vpxor           %%T6, %%T6, %%T3
1730
1731        vpclmulqdq      %%T3, %%T1, %%T5, 0x10
1732        vpxor           %%T6, %%T6, %%T3
1733
1734                vmovdqu %%T1, [%%GDATA + 16*4]
1735                vaesenc %%XMM1, %%T1
1736                vaesenc %%XMM2, %%T1
1737                vaesenc %%XMM3, %%T1
1738                vaesenc %%XMM4, %%T1
1739                vaesenc %%XMM5, %%T1
1740                vaesenc %%XMM6, %%T1
1741                vaesenc %%XMM7, %%T1
1742                vaesenc %%XMM8, %%T1
1743
1744        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1745        vmovdqu         %%T1, [rsp + TMP3]
1746        vmovdqu         %%T5, [%%GDATA + HashKey_6]
1747        vpclmulqdq      %%T3, %%T1, %%T5, 0x11
1748        vpxor           %%T4, %%T4, %%T3
1749
1750        vpclmulqdq      %%T3, %%T1, %%T5, 0x00
1751        vpxor           %%T7, %%T7, %%T3
1752
1753        vpclmulqdq      %%T3, %%T1, %%T5, 0x01
1754        vpxor           %%T6, %%T6, %%T3
1755
1756        vpclmulqdq      %%T3, %%T1, %%T5, 0x10
1757        vpxor           %%T6, %%T6, %%T3
1758
1759                vmovdqu %%T1, [%%GDATA + 16*5]
1760                vaesenc %%XMM1, %%T1
1761                vaesenc %%XMM2, %%T1
1762                vaesenc %%XMM3, %%T1
1763                vaesenc %%XMM4, %%T1
1764                vaesenc %%XMM5, %%T1
1765                vaesenc %%XMM6, %%T1
1766                vaesenc %%XMM7, %%T1
1767                vaesenc %%XMM8, %%T1
1768
1769
1770        vmovdqu         %%T1, [rsp + TMP4]
1771        vmovdqu         %%T5, [%%GDATA + HashKey_5]
1772        vpclmulqdq      %%T3, %%T1, %%T5, 0x11
1773        vpxor           %%T4, %%T4, %%T3
1774
1775        vpclmulqdq      %%T3, %%T1, %%T5, 0x00
1776        vpxor           %%T7, %%T7, %%T3
1777
1778        vpclmulqdq      %%T3, %%T1, %%T5, 0x01
1779        vpxor           %%T6, %%T6, %%T3
1780
1781        vpclmulqdq      %%T3, %%T1, %%T5, 0x10
1782        vpxor           %%T6, %%T6, %%T3
1783
1784                vmovdqu %%T1, [%%GDATA + 16*6]
1785                vaesenc %%XMM1, %%T1
1786                vaesenc %%XMM2, %%T1
1787                vaesenc %%XMM3, %%T1
1788                vaesenc %%XMM4, %%T1
1789                vaesenc %%XMM5, %%T1
1790                vaesenc %%XMM6, %%T1
1791                vaesenc %%XMM7, %%T1
1792                vaesenc %%XMM8, %%T1
1793
1794        vmovdqu         %%T1, [rsp + TMP5]
1795        vmovdqu         %%T5, [%%GDATA + HashKey_4]
1796        vpclmulqdq      %%T3, %%T1, %%T5, 0x11
1797        vpxor           %%T4, %%T4, %%T3
1798
1799        vpclmulqdq      %%T3, %%T1, %%T5, 0x00
1800        vpxor           %%T7, %%T7, %%T3
1801
1802        vpclmulqdq      %%T3, %%T1, %%T5, 0x01
1803        vpxor           %%T6, %%T6, %%T3
1804
1805        vpclmulqdq      %%T3, %%T1, %%T5, 0x10
1806        vpxor           %%T6, %%T6, %%T3
1807
1808                vmovdqu %%T1, [%%GDATA + 16*7]
1809                vaesenc %%XMM1, %%T1
1810                vaesenc %%XMM2, %%T1
1811                vaesenc %%XMM3, %%T1
1812                vaesenc %%XMM4, %%T1
1813                vaesenc %%XMM5, %%T1
1814                vaesenc %%XMM6, %%T1
1815                vaesenc %%XMM7, %%T1
1816                vaesenc %%XMM8, %%T1
1817
1818        vmovdqu         %%T1, [rsp + TMP6]
1819        vmovdqu         %%T5, [%%GDATA + HashKey_3]
1820        vpclmulqdq      %%T3, %%T1, %%T5, 0x11
1821        vpxor           %%T4, %%T4, %%T3
1822
1823        vpclmulqdq      %%T3, %%T1, %%T5, 0x00
1824        vpxor           %%T7, %%T7, %%T3
1825
1826        vpclmulqdq      %%T3, %%T1, %%T5, 0x01
1827        vpxor           %%T6, %%T6, %%T3
1828
1829        vpclmulqdq      %%T3, %%T1, %%T5, 0x10
1830        vpxor           %%T6, %%T6, %%T3
1831
1832                vmovdqu %%T1, [%%GDATA + 16*8]
1833                vaesenc %%XMM1, %%T1
1834                vaesenc %%XMM2, %%T1
1835                vaesenc %%XMM3, %%T1
1836                vaesenc %%XMM4, %%T1
1837                vaesenc %%XMM5, %%T1
1838                vaesenc %%XMM6, %%T1
1839                vaesenc %%XMM7, %%T1
1840                vaesenc %%XMM8, %%T1
1841
1842        vmovdqu         %%T1, [rsp + TMP7]
1843        vmovdqu         %%T5, [%%GDATA + HashKey_2]
1844        vpclmulqdq      %%T3, %%T1, %%T5, 0x11
1845        vpxor           %%T4, %%T4, %%T3
1846
1847        vpclmulqdq      %%T3, %%T1, %%T5, 0x00
1848        vpxor           %%T7, %%T7, %%T3
1849
1850        vpclmulqdq      %%T3, %%T1, %%T5, 0x01
1851        vpxor           %%T6, %%T6, %%T3
1852
1853        vpclmulqdq      %%T3, %%T1, %%T5, 0x10
1854        vpxor           %%T6, %%T6, %%T3
1855
1856        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1857
1858                vmovdqu %%T5, [%%GDATA + 16*9]
1859                vaesenc %%XMM1, %%T5
1860                vaesenc %%XMM2, %%T5
1861                vaesenc %%XMM3, %%T5
1862                vaesenc %%XMM4, %%T5
1863                vaesenc %%XMM5, %%T5
1864                vaesenc %%XMM6, %%T5
1865                vaesenc %%XMM7, %%T5
1866                vaesenc %%XMM8, %%T5
1867
1868        vmovdqu         %%T1, [rsp + TMP8]
1869        vmovdqu         %%T5, [%%GDATA + HashKey]
1870
1871
1872        vpclmulqdq      %%T3, %%T1, %%T5, 0x00
1873        vpxor           %%T7, %%T7, %%T3
1874
1875        vpclmulqdq      %%T3, %%T1, %%T5, 0x01
1876        vpxor           %%T6, %%T6, %%T3
1877
1878        vpclmulqdq      %%T3, %%T1, %%T5, 0x10
1879        vpxor           %%T6, %%T6, %%T3
1880
1881        vpclmulqdq      %%T3, %%T1, %%T5, 0x11
1882        vpxor           %%T1, %%T4, %%T3
1883
1884
1885                vmovdqu %%T5, [%%GDATA + 16*10]
1886 %ifndef GCM128_MODE            ; GCM192 or GCM256
1887                vaesenc %%XMM1, %%T5
1888                vaesenc %%XMM2, %%T5
1889                vaesenc %%XMM3, %%T5
1890                vaesenc %%XMM4, %%T5
1891                vaesenc %%XMM5, %%T5
1892                vaesenc %%XMM6, %%T5
1893                vaesenc %%XMM7, %%T5
1894                vaesenc %%XMM8, %%T5
1895
1896                vmovdqu %%T5, [%%GDATA + 16*11]
1897                vaesenc %%XMM1, %%T5
1898                vaesenc %%XMM2, %%T5
1899                vaesenc %%XMM3, %%T5
1900                vaesenc %%XMM4, %%T5
1901                vaesenc %%XMM5, %%T5
1902                vaesenc %%XMM6, %%T5
1903                vaesenc %%XMM7, %%T5
1904                vaesenc %%XMM8, %%T5
1905
1906                vmovdqu %%T5, [%%GDATA + 16*12]
1907%endif
1908%ifdef GCM256_MODE
1909                vaesenc %%XMM1, %%T5
1910                vaesenc %%XMM2, %%T5
1911                vaesenc %%XMM3, %%T5
1912                vaesenc %%XMM4, %%T5
1913                vaesenc %%XMM5, %%T5
1914                vaesenc %%XMM6, %%T5
1915                vaesenc %%XMM7, %%T5
1916                vaesenc %%XMM8, %%T5
1917
1918                vmovdqu %%T5, [%%GDATA + 16*13]
1919                vaesenc %%XMM1, %%T5
1920                vaesenc %%XMM2, %%T5
1921                vaesenc %%XMM3, %%T5
1922                vaesenc %%XMM4, %%T5
1923                vaesenc %%XMM5, %%T5
1924                vaesenc %%XMM6, %%T5
1925                vaesenc %%XMM7, %%T5
1926                vaesenc %%XMM8, %%T5
1927
1928                vmovdqu %%T5, [%%GDATA + 16*14]
1929%endif                          ; GCM256
1930
1931%assign i 0
1932%assign j 1
1933%rep 8
1934
1935        ;; SNP TBD: This is pretty ugly - consider whether just XORing the
1936        ;; data in after vaesenclast is simpler and performant. Would
1937        ;; also have to ripple it through partial block and ghash_mul_8.
1938%ifidn %%FULL_PARTIAL, full
1939    %ifdef  NT_LD
1940        VXLDR   %%T2, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
1941        vpxor   %%T2, %%T2, %%T5
1942    %else
1943        vpxor   %%T2, %%T5, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
1944    %endif
1945
1946    %ifidn %%ENC_DEC, ENC
1947        vaesenclast     reg(j), reg(j), %%T2
1948    %else
1949        vaesenclast     %%T3, reg(j), %%T2
1950        vpxor   reg(j), %%T2, %%T5
1951        VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*i], %%T3
1952    %endif
1953
1954%else
1955    ; Don't read the final data during partial block processing
1956    %ifdef  NT_LD
1957        %if (i<7)
1958            VXLDR   %%T2, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
1959            vpxor   %%T2, %%T2, %%T5
1960        %else
1961            ;; Stage the key directly in T2 rather than hash it with plaintext
1962            vmovdqu %%T2, %%T5
1963        %endif
1964    %else
1965        %if (i<7)
1966            vpxor   %%T2, %%T5, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
1967        %else
1968            ;; Stage the key directly in T2 rather than hash it with plaintext
1969            vmovdqu %%T2, %%T5
1970        %endif
1971    %endif
1972
1973    %ifidn %%ENC_DEC, ENC
1974        vaesenclast     reg(j), reg(j), %%T2
1975    %else
1976        %if (i<7)
1977            vaesenclast     %%T3, reg(j), %%T2
1978            vpxor   reg(j), %%T2, %%T5
1979            ;; Do not read the data since it could fault
1980            VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*i], %%T3
1981        %else
1982            vaesenclast     reg(j), reg(j), %%T2
1983        %endif
1984    %endif
1985%endif
1986
1987%assign i (i+1)
1988%assign j (j+1)
1989%endrep
1990
1991
1992;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1993
1994
1995        vpslldq %%T3, %%T6, 8                                   ; shift-L %%T3 2 DWs
1996        vpsrldq %%T6, %%T6, 8                                   ; shift-R %%T2 2 DWs
1997        vpxor   %%T7, %%T7, %%T3
1998        vpxor   %%T1, %%T1, %%T6                                ; accumulate the results in %%T1:%%T7
1999
2000
2001
2002        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2003        ;first phase of the reduction
2004        vmovdqa         %%T3, [rel POLY2]
2005
2006        vpclmulqdq      %%T2, %%T3, %%T7, 0x01
2007        vpslldq         %%T2, %%T2, 8                           ; shift-L xmm2 2 DWs
2008
2009        vpxor           %%T7, %%T7, %%T2                        ; first phase of the reduction complete
2010        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2011
2012    %ifidn %%ENC_DEC, ENC
2013        ; Write to the Ciphertext buffer
2014        VXSTR   [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*0], %%XMM1
2015        VXSTR   [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*1], %%XMM2
2016        VXSTR   [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*2], %%XMM3
2017        VXSTR   [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*3], %%XMM4
2018        VXSTR   [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*4], %%XMM5
2019        VXSTR   [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*5], %%XMM6
2020        VXSTR   [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*6], %%XMM7
2021        %ifidn %%FULL_PARTIAL, full
2022            ;; Avoid writing past the buffer if handling a partial block
2023            VXSTR   [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*7], %%XMM8
2024        %endif
2025    %endif
2026
2027
2028;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2029        ;second phase of the reduction
2030        vpclmulqdq      %%T2, %%T3, %%T7, 0x00
2031        vpsrldq         %%T2, %%T2, 4                                   ; shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
2032
2033        vpclmulqdq      %%T4, %%T3, %%T7, 0x10
2034        vpslldq         %%T4, %%T4, 4                                   ; shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts)
2035
2036        vpxor           %%T4, %%T4, %%T2                                ; second phase of the reduction complete
2037        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2038        vpxor           %%T1, %%T1, %%T4                                ; the result is in %%T1
2039
2040                vpshufb %%XMM1, [rel SHUF_MASK]             ; perform a 16Byte swap
2041                vpshufb %%XMM2, [rel SHUF_MASK]             ; perform a 16Byte swap
2042                vpshufb %%XMM3, [rel SHUF_MASK]             ; perform a 16Byte swap
2043                vpshufb %%XMM4, [rel SHUF_MASK]             ; perform a 16Byte swap
2044                vpshufb %%XMM5, [rel SHUF_MASK]             ; perform a 16Byte swap
2045                vpshufb %%XMM6, [rel SHUF_MASK]             ; perform a 16Byte swap
2046                vpshufb %%XMM7, [rel SHUF_MASK]             ; perform a 16Byte swap
2047        vpshufb %%XMM8, [rel SHUF_MASK]             ; perform a 16Byte swap
2048
2049
2050        vpxor   %%XMM1, %%T1
2051
2052
2053%endmacro                       ; GHASH_8_ENCRYPT_8_PARALLEL
2054
2055
2056; GHASH the last 4 ciphertext blocks.
2057%macro  GHASH_LAST_8 16
2058%define %%GDATA %1
2059%define %%T1    %2
2060%define %%T2    %3
2061%define %%T3    %4
2062%define %%T4    %5
2063%define %%T5    %6
2064%define %%T6    %7
2065%define %%T7    %8
2066%define %%XMM1  %9
2067%define %%XMM2  %10
2068%define %%XMM3  %11
2069%define %%XMM4  %12
2070%define %%XMM5  %13
2071%define %%XMM6  %14
2072%define %%XMM7  %15
2073%define %%XMM8  %16
2074
2075        ;; Karatsuba Method
2076
2077        vmovdqu         %%T5, [%%GDATA + HashKey_8]
2078
2079        vpshufd         %%T2, %%XMM1, 01001110b
2080        vpshufd         %%T3, %%T5, 01001110b
2081        vpxor           %%T2, %%T2, %%XMM1
2082        vpxor           %%T3, %%T3, %%T5
2083
2084        vpclmulqdq      %%T6, %%XMM1, %%T5, 0x11
2085        vpclmulqdq      %%T7, %%XMM1, %%T5, 0x00
2086
2087        vpclmulqdq      %%XMM1, %%T2, %%T3, 0x00
2088
2089        ;;;;;;;;;;;;;;;;;;;;;;
2090
2091        vmovdqu         %%T5, [%%GDATA + HashKey_7]
2092        vpshufd         %%T2, %%XMM2, 01001110b
2093        vpshufd         %%T3, %%T5, 01001110b
2094        vpxor           %%T2, %%T2, %%XMM2
2095        vpxor           %%T3, %%T3, %%T5
2096
2097        vpclmulqdq      %%T4, %%XMM2, %%T5, 0x11
2098        vpxor           %%T6, %%T6, %%T4
2099
2100        vpclmulqdq      %%T4, %%XMM2, %%T5, 0x00
2101        vpxor           %%T7, %%T7, %%T4
2102
2103        vpclmulqdq      %%T2, %%T2, %%T3, 0x00
2104
2105        vpxor           %%XMM1, %%XMM1, %%T2
2106
2107        ;;;;;;;;;;;;;;;;;;;;;;
2108
2109        vmovdqu         %%T5, [%%GDATA + HashKey_6]
2110        vpshufd         %%T2, %%XMM3, 01001110b
2111        vpshufd         %%T3, %%T5, 01001110b
2112        vpxor           %%T2, %%T2, %%XMM3
2113        vpxor           %%T3, %%T3, %%T5
2114
2115        vpclmulqdq      %%T4, %%XMM3, %%T5, 0x11
2116        vpxor           %%T6, %%T6, %%T4
2117
2118        vpclmulqdq      %%T4, %%XMM3, %%T5, 0x00
2119        vpxor           %%T7, %%T7, %%T4
2120
2121        vpclmulqdq      %%T2, %%T2, %%T3, 0x00
2122
2123        vpxor           %%XMM1, %%XMM1, %%T2
2124
2125        ;;;;;;;;;;;;;;;;;;;;;;
2126
2127        vmovdqu         %%T5, [%%GDATA + HashKey_5]
2128        vpshufd         %%T2, %%XMM4, 01001110b
2129        vpshufd         %%T3, %%T5, 01001110b
2130        vpxor           %%T2, %%T2, %%XMM4
2131        vpxor           %%T3, %%T3, %%T5
2132
2133        vpclmulqdq      %%T4, %%XMM4, %%T5, 0x11
2134        vpxor           %%T6, %%T6, %%T4
2135
2136        vpclmulqdq      %%T4, %%XMM4, %%T5, 0x00
2137        vpxor           %%T7, %%T7, %%T4
2138
2139        vpclmulqdq      %%T2, %%T2, %%T3, 0x00
2140
2141        vpxor           %%XMM1, %%XMM1, %%T2
2142
2143        ;;;;;;;;;;;;;;;;;;;;;;
2144
2145        vmovdqu         %%T5, [%%GDATA + HashKey_4]
2146        vpshufd         %%T2, %%XMM5, 01001110b
2147        vpshufd         %%T3, %%T5, 01001110b
2148        vpxor           %%T2, %%T2, %%XMM5
2149        vpxor           %%T3, %%T3, %%T5
2150
2151        vpclmulqdq      %%T4, %%XMM5, %%T5, 0x11
2152        vpxor           %%T6, %%T6, %%T4
2153
2154        vpclmulqdq      %%T4, %%XMM5, %%T5, 0x00
2155        vpxor           %%T7, %%T7, %%T4
2156
2157        vpclmulqdq      %%T2, %%T2, %%T3, 0x00
2158
2159        vpxor           %%XMM1, %%XMM1, %%T2
2160
2161        ;;;;;;;;;;;;;;;;;;;;;;
2162
2163        vmovdqu         %%T5, [%%GDATA + HashKey_3]
2164        vpshufd         %%T2, %%XMM6, 01001110b
2165        vpshufd         %%T3, %%T5, 01001110b
2166        vpxor           %%T2, %%T2, %%XMM6
2167        vpxor           %%T3, %%T3, %%T5
2168
2169        vpclmulqdq      %%T4, %%XMM6, %%T5, 0x11
2170        vpxor           %%T6, %%T6, %%T4
2171
2172        vpclmulqdq      %%T4, %%XMM6, %%T5, 0x00
2173        vpxor           %%T7, %%T7, %%T4
2174
2175        vpclmulqdq      %%T2, %%T2, %%T3, 0x00
2176
2177        vpxor           %%XMM1, %%XMM1, %%T2
2178
2179        ;;;;;;;;;;;;;;;;;;;;;;
2180
2181        vmovdqu         %%T5, [%%GDATA + HashKey_2]
2182        vpshufd         %%T2, %%XMM7, 01001110b
2183        vpshufd         %%T3, %%T5, 01001110b
2184        vpxor           %%T2, %%T2, %%XMM7
2185        vpxor           %%T3, %%T3, %%T5
2186
2187        vpclmulqdq      %%T4, %%XMM7, %%T5, 0x11
2188        vpxor           %%T6, %%T6, %%T4
2189
2190        vpclmulqdq      %%T4, %%XMM7, %%T5, 0x00
2191        vpxor           %%T7, %%T7, %%T4
2192
2193        vpclmulqdq      %%T2, %%T2, %%T3, 0x00
2194
2195        vpxor           %%XMM1, %%XMM1, %%T2
2196
2197        ;;;;;;;;;;;;;;;;;;;;;;
2198
2199        vmovdqu         %%T5, [%%GDATA + HashKey]
2200        vpshufd         %%T2, %%XMM8, 01001110b
2201        vpshufd         %%T3, %%T5, 01001110b
2202        vpxor           %%T2, %%T2, %%XMM8
2203        vpxor           %%T3, %%T3, %%T5
2204
2205        vpclmulqdq      %%T4, %%XMM8, %%T5, 0x11
2206        vpxor           %%T6, %%T6, %%T4
2207
2208        vpclmulqdq      %%T4, %%XMM8, %%T5, 0x00
2209        vpxor           %%T7, %%T7, %%T4
2210
2211        vpclmulqdq      %%T2, %%T2, %%T3, 0x00
2212
2213        vpxor           %%XMM1, %%XMM1, %%T2
2214        vpxor           %%XMM1, %%XMM1, %%T6
2215        vpxor           %%T2, %%XMM1, %%T7
2216
2217
2218
2219
2220        vpslldq %%T4, %%T2, 8
2221        vpsrldq %%T2, %%T2, 8
2222
2223        vpxor   %%T7, %%T7, %%T4
2224        vpxor   %%T6, %%T6, %%T2                               ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications
2225
2226        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2227        ;first phase of the reduction
2228        vmovdqa         %%T3, [rel POLY2]
2229
2230        vpclmulqdq      %%T2, %%T3, %%T7, 0x01
2231        vpslldq         %%T2, %%T2, 8                           ; shift-L xmm2 2 DWs
2232
2233        vpxor           %%T7, %%T7, %%T2                        ; first phase of the reduction complete
2234        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2235
2236
2237        ;second phase of the reduction
2238        vpclmulqdq      %%T2, %%T3, %%T7, 0x00
2239        vpsrldq         %%T2, %%T2, 4                           ; shift-R %%T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
2240
2241        vpclmulqdq      %%T4, %%T3, %%T7, 0x10
2242        vpslldq         %%T4, %%T4, 4                           ; shift-L %%T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
2243
2244        vpxor           %%T4, %%T4, %%T2                        ; second phase of the reduction complete
2245        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2246        vpxor           %%T6, %%T6, %%T4                        ; the result is in %%T6
2247%endmacro
2248
2249
2250; GHASH the last 4 ciphertext blocks.
2251%macro  GHASH_LAST_7 15
2252%define %%GDATA %1
2253%define %%T1    %2
2254%define %%T2    %3
2255%define %%T3    %4
2256%define %%T4    %5
2257%define %%T5    %6
2258%define %%T6    %7
2259%define %%T7    %8
2260%define %%XMM1  %9
2261%define %%XMM2  %10
2262%define %%XMM3  %11
2263%define %%XMM4  %12
2264%define %%XMM5  %13
2265%define %%XMM6  %14
2266%define %%XMM7  %15
2267
2268        ;; Karatsuba Method
2269
2270        vmovdqu         %%T5, [%%GDATA + HashKey_7]
2271
2272        vpshufd         %%T2, %%XMM1, 01001110b
2273        vpshufd         %%T3, %%T5, 01001110b
2274        vpxor           %%T2, %%T2, %%XMM1
2275        vpxor           %%T3, %%T3, %%T5
2276
2277        vpclmulqdq      %%T6, %%XMM1, %%T5, 0x11
2278        vpclmulqdq      %%T7, %%XMM1, %%T5, 0x00
2279
2280        vpclmulqdq      %%XMM1, %%T2, %%T3, 0x00
2281
2282        ;;;;;;;;;;;;;;;;;;;;;;
2283
2284        vmovdqu         %%T5, [%%GDATA + HashKey_6]
2285        vpshufd         %%T2, %%XMM2, 01001110b
2286        vpshufd         %%T3, %%T5, 01001110b
2287        vpxor           %%T2, %%T2, %%XMM2
2288        vpxor           %%T3, %%T3, %%T5
2289
2290        vpclmulqdq      %%T4, %%XMM2, %%T5, 0x11
2291        vpxor           %%T6, %%T6, %%T4
2292
2293        vpclmulqdq      %%T4, %%XMM2, %%T5, 0x00
2294        vpxor           %%T7, %%T7, %%T4
2295
2296        vpclmulqdq      %%T2, %%T2, %%T3, 0x00
2297
2298        vpxor           %%XMM1, %%XMM1, %%T2
2299
2300        ;;;;;;;;;;;;;;;;;;;;;;
2301
2302        vmovdqu         %%T5, [%%GDATA + HashKey_5]
2303        vpshufd         %%T2, %%XMM3, 01001110b
2304        vpshufd         %%T3, %%T5, 01001110b
2305        vpxor           %%T2, %%T2, %%XMM3
2306        vpxor           %%T3, %%T3, %%T5
2307
2308        vpclmulqdq      %%T4, %%XMM3, %%T5, 0x11
2309        vpxor           %%T6, %%T6, %%T4
2310
2311        vpclmulqdq      %%T4, %%XMM3, %%T5, 0x00
2312        vpxor           %%T7, %%T7, %%T4
2313
2314        vpclmulqdq      %%T2, %%T2, %%T3, 0x00
2315
2316        vpxor           %%XMM1, %%XMM1, %%T2
2317
2318        ;;;;;;;;;;;;;;;;;;;;;;
2319
2320        vmovdqu         %%T5, [%%GDATA + HashKey_4]
2321        vpshufd         %%T2, %%XMM4, 01001110b
2322        vpshufd         %%T3, %%T5, 01001110b
2323        vpxor           %%T2, %%T2, %%XMM4
2324        vpxor           %%T3, %%T3, %%T5
2325
2326        vpclmulqdq      %%T4, %%XMM4, %%T5, 0x11
2327        vpxor           %%T6, %%T6, %%T4
2328
2329        vpclmulqdq      %%T4, %%XMM4, %%T5, 0x00
2330        vpxor           %%T7, %%T7, %%T4
2331
2332        vpclmulqdq      %%T2, %%T2, %%T3, 0x00
2333
2334        vpxor           %%XMM1, %%XMM1, %%T2
2335
2336        ;;;;;;;;;;;;;;;;;;;;;;
2337
2338        vmovdqu         %%T5, [%%GDATA + HashKey_3]
2339        vpshufd         %%T2, %%XMM5, 01001110b
2340        vpshufd         %%T3, %%T5, 01001110b
2341        vpxor           %%T2, %%T2, %%XMM5
2342        vpxor           %%T3, %%T3, %%T5
2343
2344        vpclmulqdq      %%T4, %%XMM5, %%T5, 0x11
2345        vpxor           %%T6, %%T6, %%T4
2346
2347        vpclmulqdq      %%T4, %%XMM5, %%T5, 0x00
2348        vpxor           %%T7, %%T7, %%T4
2349
2350        vpclmulqdq      %%T2, %%T2, %%T3, 0x00
2351
2352        vpxor           %%XMM1, %%XMM1, %%T2
2353
2354        ;;;;;;;;;;;;;;;;;;;;;;
2355
2356        vmovdqu         %%T5, [%%GDATA + HashKey_2]
2357        vpshufd         %%T2, %%XMM6, 01001110b
2358        vpshufd         %%T3, %%T5, 01001110b
2359        vpxor           %%T2, %%T2, %%XMM6
2360        vpxor           %%T3, %%T3, %%T5
2361
2362        vpclmulqdq      %%T4, %%XMM6, %%T5, 0x11
2363        vpxor           %%T6, %%T6, %%T4
2364
2365        vpclmulqdq      %%T4, %%XMM6, %%T5, 0x00
2366        vpxor           %%T7, %%T7, %%T4
2367
2368        vpclmulqdq      %%T2, %%T2, %%T3, 0x00
2369
2370        vpxor           %%XMM1, %%XMM1, %%T2
2371
2372        ;;;;;;;;;;;;;;;;;;;;;;
2373
2374        vmovdqu         %%T5, [%%GDATA + HashKey_1]
2375        vpshufd         %%T2, %%XMM7, 01001110b
2376        vpshufd         %%T3, %%T5, 01001110b
2377        vpxor           %%T2, %%T2, %%XMM7
2378        vpxor           %%T3, %%T3, %%T5
2379
2380        vpclmulqdq      %%T4, %%XMM7, %%T5, 0x11
2381        vpxor           %%T6, %%T6, %%T4
2382
2383        vpclmulqdq      %%T4, %%XMM7, %%T5, 0x00
2384        vpxor           %%T7, %%T7, %%T4
2385
2386        vpclmulqdq      %%T2, %%T2, %%T3, 0x00
2387
2388        vpxor           %%XMM1, %%XMM1, %%T2
2389
2390        ;;;;;;;;;;;;;;;;;;;;;;
2391
2392        vpxor           %%XMM1, %%XMM1, %%T6
2393        vpxor           %%T2, %%XMM1, %%T7
2394
2395
2396
2397
2398        vpslldq %%T4, %%T2, 8
2399        vpsrldq %%T2, %%T2, 8
2400
2401        vpxor   %%T7, %%T7, %%T4
2402        vpxor   %%T6, %%T6, %%T2                               ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications
2403
2404        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2405        ;first phase of the reduction
2406        vmovdqa         %%T3, [rel POLY2]
2407
2408        vpclmulqdq      %%T2, %%T3, %%T7, 0x01
2409        vpslldq         %%T2, %%T2, 8                           ; shift-L xmm2 2 DWs
2410
2411        vpxor           %%T7, %%T7, %%T2                        ; first phase of the reduction complete
2412        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2413
2414
2415        ;second phase of the reduction
2416        vpclmulqdq      %%T2, %%T3, %%T7, 0x00
2417        vpsrldq         %%T2, %%T2, 4                           ; shift-R %%T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
2418
2419        vpclmulqdq      %%T4, %%T3, %%T7, 0x10
2420        vpslldq         %%T4, %%T4, 4                           ; shift-L %%T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
2421
2422        vpxor           %%T4, %%T4, %%T2                        ; second phase of the reduction complete
2423        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2424        vpxor           %%T6, %%T6, %%T4                        ; the result is in %%T6
2425%endmacro
2426
2427
2428
2429;;; Handle encryption of the final partial block
2430;;; IN:
2431;;;   r13  - Number of bytes to read
2432;;; MODIFIES:
2433;;;   KEY  - Key for encrypting the partial block
2434;;;   HASH - Current hash value
2435;;; SMASHES:
2436;;;   r10, r12, r15, rax
2437;;;   T1, T2
2438;;; Note:
2439;;;   PLAIN_CYPH_LEN, %7, is passed only to determine
2440;;;   if buffer is big enough to do a 16 byte read & shift.
2441;;;     'LT16' is passed here only if buffer is known to be smaller
2442;;;     than 16 bytes.
2443;;;     Any other value passed here will result in 16 byte read
2444;;;     code path.
2445;;; TBD: Remove HASH from the instantiation
2446%macro  ENCRYPT_FINAL_PARTIAL_BLOCK 8
2447%define %%KEY             %1
2448%define %%T1              %2
2449%define %%T2              %3
2450%define %%CYPH_PLAIN_OUT  %4
2451%define %%PLAIN_CYPH_IN   %5
2452%define %%PLAIN_CYPH_LEN  %6
2453%define %%ENC_DEC         %7
2454%define %%DATA_OFFSET     %8
2455
2456        ;; NOTE: type of read tuned based %%PLAIN_CYPH_LEN setting
2457%ifidn %%PLAIN_CYPH_LEN, LT16
2458        ;; Handle the case where the message is < 16 bytes
2459        lea      r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
2460
2461        ;; T1            - packed output
2462        ;; r10           - input data address
2463        ;; r13           - input data length
2464        ;; r12, r15, rax - temp registers
2465        READ_SMALL_DATA_INPUT   %%T1, r10, r13, r12, r15, rax
2466
2467        lea      r12, [SHIFT_MASK + 16]
2468        sub      r12, r13
2469%else
2470        ;; Handle the case where the message is >= 16 bytes
2471        sub      %%DATA_OFFSET, 16
2472        add      %%DATA_OFFSET, r13
2473        ;; Receive the last <16 Byte block
2474        vmovdqu  %%T1, [%%PLAIN_CYPH_IN+%%DATA_OFFSET]
2475        sub      %%DATA_OFFSET, r13
2476        add      %%DATA_OFFSET, 16
2477
2478        lea      r12, [SHIFT_MASK + 16]
2479        ;; Adjust the shuffle mask pointer to be able to shift 16-r13 bytes
2480        ;; (r13 is the number of bytes in plaintext mod 16)
2481        sub      r12, r13
2482        ;; Get the appropriate shuffle mask
2483        vmovdqu  %%T2, [r12]
2484        ;; shift right 16-r13 bytes
2485        vpshufb  %%T1, %%T2
2486%endif                          ; %%PLAIN_CYPH_LEN, LT16
2487
2488        ;; At this point T1 contains the partial block data
2489%ifidn  %%ENC_DEC, DEC
2490        ;; Plaintext XOR E(K, Yn)
2491        ;; Set aside the ciphertext
2492        vmovdqa  %%T2, %%T1
2493        vpxor    %%KEY, %%KEY, %%T1
2494        ;; Get the appropriate mask to mask out top 16-r13 bytes of ciphertext
2495        vmovdqu  %%T1, [r12 + ALL_F - SHIFT_MASK]
2496        ;; Mask out top 16-r13 bytes of ciphertext
2497        vpand    %%KEY, %%KEY, %%T1
2498
2499        ;; Prepare the ciphertext for the hash
2500        ;; mask out top 16-r13 bytes of the plaintext
2501        vpand    %%T2, %%T2, %%T1
2502%else
2503        ;; Plaintext XOR E(K, Yn)
2504        vpxor    %%KEY, %%KEY, %%T1
2505        ;; Get the appropriate mask to mask out top 16-r13 bytes of %%KEY
2506        vmovdqu  %%T1, [r12 + ALL_F - SHIFT_MASK]
2507        ;; Mask out top 16-r13 bytes of %%KEY
2508        vpand    %%KEY, %%KEY, %%T1
2509%endif
2510
2511        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2512        ;; Output r13 Bytes
2513        vmovq   rax, %%KEY
2514        cmp     r13, 8
2515        jle     %%_less_than_8_bytes_left
2516
2517        mov     [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], rax
2518        add     %%DATA_OFFSET, 8
2519        vpsrldq %%T1, %%KEY, 8
2520        vmovq   rax, %%T1
2521        sub     r13, 8
2522
2523%%_less_than_8_bytes_left:
2524        mov     BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al
2525        add     %%DATA_OFFSET, 1
2526        shr     rax, 8
2527        sub     r13, 1
2528        jne     %%_less_than_8_bytes_left
2529        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2530
2531%ifidn  %%ENC_DEC, DEC
2532        ;; If decrypt, restore the ciphertext into %%KEY
2533        vmovdqu %%KEY, %%T2
2534%endif
2535%endmacro                       ; ENCRYPT_FINAL_PARTIAL_BLOCK
2536
2537
2538
2539; Encryption of a single block
2540%macro  ENCRYPT_SINGLE_BLOCK 2
2541%define %%GDATA %1
2542%define %%XMM0  %2
2543
2544                vpxor    %%XMM0, %%XMM0, [%%GDATA+16*0]
2545%assign i 1
2546%rep NROUNDS
2547                vaesenc  %%XMM0, [%%GDATA+16*i]
2548%assign i (i+1)
2549%endrep
2550                vaesenclast      %%XMM0, [%%GDATA+16*i]
2551%endmacro
2552
2553
2554;; Start of Stack Setup
2555
2556%macro FUNC_SAVE 0
2557        ;; Required for Update/GMC_ENC
2558        ;the number of pushes must equal STACK_OFFSET
2559        push    r12
2560        push    r13
2561        push    r14
2562        push    r15
2563        mov     r14, rsp
2564
2565        sub     rsp, VARIABLE_OFFSET
2566        and     rsp, ~63
2567
2568%ifidn __OUTPUT_FORMAT__, win64
2569        ; xmm6:xmm15 need to be maintained for Windows
2570        vmovdqu [rsp + LOCAL_STORAGE + 0*16],xmm6
2571        vmovdqu [rsp + LOCAL_STORAGE + 1*16],xmm7
2572        vmovdqu [rsp + LOCAL_STORAGE + 2*16],xmm8
2573        vmovdqu [rsp + LOCAL_STORAGE + 3*16],xmm9
2574        vmovdqu [rsp + LOCAL_STORAGE + 4*16],xmm10
2575        vmovdqu [rsp + LOCAL_STORAGE + 5*16],xmm11
2576        vmovdqu [rsp + LOCAL_STORAGE + 6*16],xmm12
2577        vmovdqu [rsp + LOCAL_STORAGE + 7*16],xmm13
2578        vmovdqu [rsp + LOCAL_STORAGE + 8*16],xmm14
2579        vmovdqu [rsp + LOCAL_STORAGE + 9*16],xmm15
2580%endif
2581%endmacro
2582
2583
2584%macro FUNC_RESTORE 0
2585
2586%ifdef SAFE_DATA
2587        clear_scratch_gps_asm
2588        clear_scratch_ymms_asm
2589%endif
2590%ifidn __OUTPUT_FORMAT__, win64
2591        vmovdqu xmm15, [rsp + LOCAL_STORAGE + 9*16]
2592        vmovdqu xmm14, [rsp + LOCAL_STORAGE + 8*16]
2593        vmovdqu xmm13, [rsp + LOCAL_STORAGE + 7*16]
2594        vmovdqu xmm12, [rsp + LOCAL_STORAGE + 6*16]
2595        vmovdqu xmm11, [rsp + LOCAL_STORAGE + 5*16]
2596        vmovdqu xmm10, [rsp + LOCAL_STORAGE + 4*16]
2597        vmovdqu xmm9, [rsp + LOCAL_STORAGE + 3*16]
2598        vmovdqu xmm8, [rsp + LOCAL_STORAGE + 2*16]
2599        vmovdqu xmm7, [rsp + LOCAL_STORAGE + 1*16]
2600        vmovdqu xmm6, [rsp + LOCAL_STORAGE + 0*16]
2601%endif
2602
2603;; Required for Update/GMC_ENC
2604        mov     rsp, r14
2605        pop     r15
2606        pop     r14
2607        pop     r13
2608        pop     r12
2609%endmacro
2610
2611%macro CALC_J0 15
2612%define %%KEY           %1 ;; [in] Pointer to GCM KEY structure
2613%define %%IV            %2 ;; [in] Pointer to IV
2614%define %%IV_LEN        %3 ;; [in] IV length
2615%define %%J0            %4 ;; [out] XMM reg to contain J0
2616%define %%TMP0          %5 ;; [clobbered] Temporary GP reg
2617%define %%TMP1          %6 ;; [clobbered] Temporary GP reg
2618%define %%TMP2          %7 ;; [clobbered] Temporary GP reg
2619%define %%TMP3          %8 ;; [clobbered] Temporary GP reg
2620%define %%TMP4          %9 ;; [clobbered] Temporary GP reg
2621%define %%XTMP0         %10 ;; [clobbered] Temporary XMM reg
2622%define %%XTMP1         %11 ;; [clobbered] Temporary XMM reg
2623%define %%XTMP2         %12 ;; [clobbered] Temporary XMM reg
2624%define %%XTMP3         %13 ;; [clobbered] Temporary XMM reg
2625%define %%XTMP4         %14 ;; [clobbered] Temporary XMM reg
2626%define %%XTMP5         %15 ;; [clobbered] Temporary XMM reg
2627
2628        ;; J0 = GHASH(IV || 0s+64 || len(IV)64)
2629        ;; s = 16 * RoundUp(len(IV)/16) -  len(IV) */
2630
2631        ;; Calculate GHASH of (IV || 0s)
2632        vpxor   %%J0, %%J0
2633        CALC_AAD_HASH %%IV, %%IV_LEN, %%J0, %%KEY, %%XTMP0, %%XTMP1, %%XTMP2, \
2634                      %%XTMP3, %%XTMP4, %%XTMP5, %%TMP0, %%TMP1, %%TMP2, %%TMP3, %%TMP4
2635
2636        ;; Calculate GHASH of last 16-byte block (0 || len(IV)64)
2637        vmovdqu %%XTMP0, [%%KEY + HashKey]
2638        mov     %%TMP2, %%IV_LEN
2639        shl     %%TMP2, 3 ;; IV length in bits
2640        vmovq   %%XTMP1, %%TMP2
2641        vpxor   %%J0, %%XTMP1
2642        GHASH_MUL %%J0, %%XTMP0, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5
2643
2644        vpshufb %%J0, [rel SHUF_MASK] ; perform a 16Byte swap
2645%endmacro
2646
2647
2648;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2649; GCM_INIT initializes a gcm_context_data struct to prepare for encoding/decoding.
2650; Input: gcm_key_data * (GDATA_KEY), gcm_context_data *(GDATA_CTX), IV, IV_LEN,
2651; Additional Authentication data (A_IN), Additional Data length (A_LEN).
2652; Output: Updated GDATA_CTX with the hash of A_IN (AadHash) and initialized other parts of GDATA.
2653; Clobbers rax, r10-r13 and xmm0-xmm6
2654;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2655%macro  GCM_INIT        5-6
2656%define %%GDATA_KEY     %1 ; [in] Pointer to GCM Key data structure
2657%define %%GDATA_CTX     %2 ; [in/out] Pointer to GCM Context data structure
2658%define %%IV            %3 ; [in] Pointer to IV
2659%define %%A_IN	        %4 ; [in] Pointer to AAD
2660%define %%A_LEN	        %5 ; [in] AAD length
2661%define %%IV_LEN        %6 ; [in] IV length
2662
2663%define %%AAD_HASH      xmm14
2664
2665
2666        mov     r10, %%A_LEN
2667        cmp     r10, 0
2668        je      %%_aad_is_zero
2669
2670        vpxor   %%AAD_HASH, %%AAD_HASH
2671        CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%GDATA_KEY, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, r10, r11, r12, r13, rax
2672        jmp     %%_after_aad
2673
2674%%_aad_is_zero:
2675        vpxor   %%AAD_HASH, %%AAD_HASH
2676
2677%%_after_aad:
2678        mov     r10, %%A_LEN
2679        vpxor   xmm2, xmm3
2680
2681        vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH         ; ctx_data.aad hash = aad_hash
2682        mov     [%%GDATA_CTX + AadLen], r10                 ; ctx_data.aad_length = aad_length
2683        xor     r10, r10
2684        mov     [%%GDATA_CTX + InLen], r10                  ; ctx_data.in_length = 0
2685        mov     [%%GDATA_CTX + PBlockLen], r10              ; ctx_data.partial_block_length = 0
2686        vmovdqu [%%GDATA_CTX + PBlockEncKey], xmm2          ; ctx_data.partial_block_enc_key = 0
2687        mov     r10, %%IV
2688%if %0 == 6 ;; IV is different than 12 bytes
2689        CALC_J0 %%GDATA_KEY, %%IV, %%IV_LEN, xmm2, r10, r11, r12, r13, rax, xmm1, xmm0, \
2690                xmm3, xmm4, xmm5, xmm6
2691%else ;; IV is 12 bytes
2692        vmovdqa xmm2, [rel ONEf]                        ; read 12 IV bytes and pad with 0x00000001
2693        vpinsrq xmm2, [r10], 0
2694        vpinsrd xmm2, [r10+8], 2
2695%endif
2696        vmovdqu [%%GDATA_CTX + OrigIV], xmm2                ; ctx_data.orig_IV = iv
2697
2698        vpshufb xmm2, [rel SHUF_MASK]
2699
2700        vmovdqu [%%GDATA_CTX + CurCount], xmm2              ; ctx_data.current_counter = iv
2701%endmacro
2702
2703%macro  GCM_ENC_DEC_SMALL   12
2704%define %%GDATA_KEY         %1
2705%define %%GDATA_CTX         %2
2706%define %%CYPH_PLAIN_OUT    %3
2707%define %%PLAIN_CYPH_IN     %4
2708%define %%PLAIN_CYPH_LEN    %5
2709%define %%ENC_DEC           %6
2710%define %%DATA_OFFSET       %7
2711%define %%LENGTH            %8
2712%define %%NUM_BLOCKS        %9
2713%define %%CTR               %10
2714%define %%HASH              %11
2715%define %%INSTANCE_TYPE     %12
2716
2717        ;; NOTE: the check below is obsolete in current implementation. The check is already done in GCM_ENC_DEC.
2718        ;; cmp     %%NUM_BLOCKS, 0
2719        ;; je      %%_small_initial_blocks_encrypted
2720        cmp     %%NUM_BLOCKS, 8
2721        je      %%_small_initial_num_blocks_is_8
2722        cmp     %%NUM_BLOCKS, 7
2723        je      %%_small_initial_num_blocks_is_7
2724        cmp     %%NUM_BLOCKS, 6
2725        je      %%_small_initial_num_blocks_is_6
2726        cmp     %%NUM_BLOCKS, 5
2727        je      %%_small_initial_num_blocks_is_5
2728        cmp     %%NUM_BLOCKS, 4
2729        je      %%_small_initial_num_blocks_is_4
2730        cmp     %%NUM_BLOCKS, 3
2731        je      %%_small_initial_num_blocks_is_3
2732        cmp     %%NUM_BLOCKS, 2
2733        je      %%_small_initial_num_blocks_is_2
2734
2735        jmp     %%_small_initial_num_blocks_is_1
2736
2737
2738%%_small_initial_num_blocks_is_8:
2739        INITIAL_BLOCKS_PARTIAL  %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 8, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
2740        jmp     %%_small_initial_blocks_encrypted
2741
2742%%_small_initial_num_blocks_is_7:
2743        ;; r13   - %%LENGTH
2744        ;; xmm12 - T1
2745        ;; xmm13 - T2
2746        ;; xmm14 - T3   - AAD HASH OUT when not producing 8 AES keys
2747        ;; xmm15 - T4
2748        ;; xmm11 - T5
2749        ;; xmm9  - CTR
2750        ;; xmm1  - XMM1 - Cipher + Hash when producing 8 AES keys
2751        ;; xmm2  - XMM2
2752        ;; xmm3  - XMM3
2753        ;; xmm4  - XMM4
2754        ;; xmm5  - XMM5
2755        ;; xmm6  - XMM6
2756        ;; xmm7  - XMM7
2757        ;; xmm8  - XMM8 - AAD HASH IN
2758        ;; xmm10 - T6
2759        ;; xmm0  - T_key
2760        INITIAL_BLOCKS_PARTIAL  %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 7, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
2761        jmp     %%_small_initial_blocks_encrypted
2762
2763%%_small_initial_num_blocks_is_6:
2764        INITIAL_BLOCKS_PARTIAL  %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 6, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
2765        jmp     %%_small_initial_blocks_encrypted
2766
2767%%_small_initial_num_blocks_is_5:
2768        INITIAL_BLOCKS_PARTIAL  %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 5, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
2769        jmp     %%_small_initial_blocks_encrypted
2770
2771%%_small_initial_num_blocks_is_4:
2772        INITIAL_BLOCKS_PARTIAL  %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 4, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
2773        jmp     %%_small_initial_blocks_encrypted
2774
2775%%_small_initial_num_blocks_is_3:
2776        INITIAL_BLOCKS_PARTIAL  %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 3, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
2777        jmp     %%_small_initial_blocks_encrypted
2778
2779%%_small_initial_num_blocks_is_2:
2780        INITIAL_BLOCKS_PARTIAL  %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 2, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
2781        jmp     %%_small_initial_blocks_encrypted
2782
2783%%_small_initial_num_blocks_is_1:
2784        INITIAL_BLOCKS_PARTIAL  %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 1, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
2785
2786        ;; Note: zero initial blocks not allowed.
2787
2788%%_small_initial_blocks_encrypted:
2789
2790%endmacro                       ; GCM_ENC_DEC_SMALL
2791
2792;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2793; GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context_data struct
2794; has been initialized by GCM_INIT
2795; Requires the input data be at least 1 byte long because of READ_SMALL_INPUT_DATA.
2796; Input: gcm_key_data struct* (GDATA_KEY), gcm_context_data *(GDATA_CTX), input text (PLAIN_CYPH_IN),
2797; input text length (PLAIN_CYPH_LEN) and whether encoding or decoding (ENC_DEC).
2798; Output: A cypher of the given plain text (CYPH_PLAIN_OUT), and updated GDATA_CTX
2799; Clobbers rax, r10-r15, and xmm0-xmm15
2800;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2801%macro  GCM_ENC_DEC         7
2802%define %%GDATA_KEY         %1
2803%define %%GDATA_CTX         %2
2804%define %%CYPH_PLAIN_OUT    %3
2805%define %%PLAIN_CYPH_IN     %4
2806%define %%PLAIN_CYPH_LEN    %5
2807%define %%ENC_DEC           %6
2808%define %%INSTANCE_TYPE     %7
2809%define %%DATA_OFFSET       r11
2810
2811; Macro flow:
2812; calculate the number of 16byte blocks in the message
2813; process (number of 16byte blocks) mod 8 '%%_initial_num_blocks_is_# .. %%_initial_blocks_encrypted'
2814; process 8 16 byte blocks at a time until all are done '%%_encrypt_by_8_new .. %%_eight_cipher_left'
2815; if there is a block of less than 16 bytes process it '%%_zero_cipher_left .. %%_multiple_of_16_bytes'
2816
2817        cmp     %%PLAIN_CYPH_LEN, 0
2818        je      %%_enc_dec_done
2819
2820        xor     %%DATA_OFFSET, %%DATA_OFFSET
2821        ;; Update length of data processed
2822%ifidn __OUTPUT_FORMAT__, win64
2823        mov     rax, %%PLAIN_CYPH_LEN
2824       	add     [%%GDATA_CTX + InLen], rax
2825%else
2826        add    [%%GDATA_CTX + InLen], %%PLAIN_CYPH_LEN
2827%endif
2828        vmovdqu xmm13, [%%GDATA_KEY + HashKey]
2829        vmovdqu xmm8, [%%GDATA_CTX + AadHash]
2830
2831%ifidn %%INSTANCE_TYPE, multi_call
2832        ;; NOTE: partial block processing makes only sense for multi_call here.
2833        ;; Used for the update flow - if there was a previous partial
2834        ;; block fill the remaining bytes here.
2835        PARTIAL_BLOCK %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%DATA_OFFSET, xmm8, xmm13, %%ENC_DEC
2836%endif
2837
2838        ;;  lift CTR set from initial_blocks to here
2839%ifidn %%INSTANCE_TYPE, single_call
2840        vmovdqu xmm9, xmm2
2841%else
2842        vmovdqu xmm9, [%%GDATA_CTX + CurCount]
2843%endif
2844
2845        ;; Save the amount of data left to process in r10
2846        mov     r13, %%PLAIN_CYPH_LEN
2847%ifidn %%INSTANCE_TYPE, multi_call
2848        ;; NOTE: %%DATA_OFFSET is zero in single_call case.
2849        ;;      Consequently PLAIN_CYPH_LEN will never be zero after
2850        ;;      %%DATA_OFFSET subtraction below.
2851        sub     r13, %%DATA_OFFSET
2852
2853        ;; There may be no more data if it was consumed in the partial block.
2854        cmp     r13, 0
2855        je      %%_enc_dec_done
2856%endif                          ; %%INSTANCE_TYPE, multi_call
2857        mov     r10, r13
2858
2859        ;; Determine how many blocks to process in INITIAL
2860        mov     r12, r13
2861        shr     r12, 4
2862        and     r12, 7
2863
2864        ;; Process one additional block in INITIAL if there is a partial block
2865        and     r10, 0xf
2866        blsmsk  r10, r10    ; Set CF if zero
2867        cmc                 ; Flip CF
2868        adc     r12, 0x0    ; Process an additional INITIAL block if CF set
2869
2870        ;;      Less than 127B will be handled by the small message code, which
2871        ;;      can process up to 7 16B blocks.
2872        cmp     r13, 128
2873        jge     %%_large_message_path
2874
2875        GCM_ENC_DEC_SMALL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%ENC_DEC, %%DATA_OFFSET, r13, r12, xmm9, xmm14, %%INSTANCE_TYPE
2876        jmp     %%_ghash_done
2877
2878%%_large_message_path:
2879        and     r12, 0x7    ; Still, don't allow 8 INITIAL blocks since this will
2880                            ; can be handled by the x8 partial loop.
2881
2882        cmp     r12, 0
2883        je      %%_initial_num_blocks_is_0
2884        cmp     r12, 7
2885        je      %%_initial_num_blocks_is_7
2886        cmp     r12, 6
2887        je      %%_initial_num_blocks_is_6
2888        cmp     r12, 5
2889        je      %%_initial_num_blocks_is_5
2890        cmp     r12, 4
2891        je      %%_initial_num_blocks_is_4
2892        cmp     r12, 3
2893        je      %%_initial_num_blocks_is_3
2894        cmp     r12, 2
2895        je      %%_initial_num_blocks_is_2
2896
2897        jmp     %%_initial_num_blocks_is_1
2898
2899%%_initial_num_blocks_is_7:
2900        ;; r13   - %%LENGTH
2901        ;; xmm12 - T1
2902        ;; xmm13 - T2
2903        ;; xmm14 - T3   - AAD HASH OUT when not producing 8 AES keys
2904        ;; xmm15 - T4
2905        ;; xmm11 - T5
2906        ;; xmm9  - CTR
2907        ;; xmm1  - XMM1 - Cipher + Hash when producing 8 AES keys
2908        ;; xmm2  - XMM2
2909        ;; xmm3  - XMM3
2910        ;; xmm4  - XMM4
2911        ;; xmm5  - XMM5
2912        ;; xmm6  - XMM6
2913        ;; xmm7  - XMM7
2914        ;; xmm8  - XMM8 - AAD HASH IN
2915        ;; xmm10 - T6
2916        ;; xmm0  - T_key
2917        INITIAL_BLOCKS  %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 7, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
2918        jmp     %%_initial_blocks_encrypted
2919
2920%%_initial_num_blocks_is_6:
2921        INITIAL_BLOCKS  %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 6, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
2922        jmp     %%_initial_blocks_encrypted
2923
2924%%_initial_num_blocks_is_5:
2925        INITIAL_BLOCKS  %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 5, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
2926        jmp     %%_initial_blocks_encrypted
2927
2928%%_initial_num_blocks_is_4:
2929        INITIAL_BLOCKS  %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 4, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
2930        jmp     %%_initial_blocks_encrypted
2931
2932%%_initial_num_blocks_is_3:
2933        INITIAL_BLOCKS  %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 3, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
2934        jmp     %%_initial_blocks_encrypted
2935
2936%%_initial_num_blocks_is_2:
2937        INITIAL_BLOCKS  %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 2, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
2938        jmp     %%_initial_blocks_encrypted
2939
2940%%_initial_num_blocks_is_1:
2941        INITIAL_BLOCKS  %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 1, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
2942        jmp     %%_initial_blocks_encrypted
2943
2944%%_initial_num_blocks_is_0:
2945        INITIAL_BLOCKS  %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 0, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
2946
2947
2948%%_initial_blocks_encrypted:
2949        ;; The entire message was encrypted processed in initial and now need to be hashed
2950        cmp     r13, 0
2951        je      %%_encrypt_done
2952
2953        ;; Encrypt the final <16 byte (partial) block, then hash
2954        cmp     r13, 16
2955        jl      %%_encrypt_final_partial
2956
2957        ;; Process 7 full blocks plus a partial block
2958        cmp     r13, 128
2959        jl      %%_encrypt_by_8_partial
2960
2961
2962%%_encrypt_by_8_parallel:
2963        ;; in_order vs. out_order is an optimization to increment the counter without shuffling
2964        ;; it back into little endian. r15d keeps track of when we need to increent in order so
2965        ;; that the carry is handled correctly.
2966        vmovd   r15d, xmm9
2967        and     r15d, 255
2968        vpshufb xmm9, [rel SHUF_MASK]
2969
2970
2971%%_encrypt_by_8_new:
2972        cmp     r15d, 255-8
2973        jg      %%_encrypt_by_8
2974
2975
2976
2977        ;; xmm0  - T1
2978        ;; xmm10 - T2
2979        ;; xmm11 - T3
2980        ;; xmm12 - T4
2981        ;; xmm13 - T5
2982        ;; xmm14 - T6
2983        ;; xmm9  - CTR
2984        ;; xmm1  - XMM1
2985        ;; xmm2  - XMM2
2986        ;; xmm3  - XMM3
2987        ;; xmm4  - XMM4
2988        ;; xmm5  - XMM5
2989        ;; xmm6  - XMM6
2990        ;; xmm7  - XMM7
2991        ;; xmm8  - XMM8
2992        ;; xmm15 - T7
2993        add     r15b, 8
2994        GHASH_8_ENCRYPT_8_PARALLEL  %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, out_order, %%ENC_DEC, full
2995        add     %%DATA_OFFSET, 128
2996        sub     r13, 128
2997        cmp     r13, 128
2998        jge     %%_encrypt_by_8_new
2999
3000        vpshufb xmm9, [rel SHUF_MASK]
3001        jmp     %%_encrypt_by_8_parallel_done
3002
3003%%_encrypt_by_8:
3004        vpshufb xmm9, [rel SHUF_MASK]
3005        add     r15b, 8
3006        GHASH_8_ENCRYPT_8_PARALLEL  %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC, full
3007        vpshufb  xmm9, [rel SHUF_MASK]
3008        add     %%DATA_OFFSET, 128
3009        sub     r13, 128
3010        cmp     r13, 128
3011        jge     %%_encrypt_by_8_new
3012        vpshufb  xmm9, [rel SHUF_MASK]
3013
3014
3015%%_encrypt_by_8_parallel_done:
3016        ;; Test to see if we need a by 8 with partial block. At this point
3017        ;; bytes remaining should be either zero or between 113-127.
3018        cmp     r13, 0
3019        je      %%_encrypt_done
3020
3021%%_encrypt_by_8_partial:
3022        ;; Shuffle needed to align key for partial block xor. out_order
3023        ;; is a little faster because it avoids extra shuffles.
3024        ;; TBD: Might need to account for when we don't have room to increment the counter.
3025
3026
3027        ;; Process parallel buffers with a final partial block.
3028        GHASH_8_ENCRYPT_8_PARALLEL  %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC, partial
3029
3030
3031        add     %%DATA_OFFSET, 128-16
3032        sub     r13, 128-16
3033
3034%%_encrypt_final_partial:
3035
3036        vpshufb  xmm8, [rel SHUF_MASK]
3037        mov     [%%GDATA_CTX + PBlockLen], r13
3038        vmovdqu [%%GDATA_CTX + PBlockEncKey], xmm8
3039
3040        ;; xmm8  - Final encrypted counter - need to hash with partial or full block ciphertext
3041        ;;                            GDATA,  KEY,   T1,    T2
3042        ENCRYPT_FINAL_PARTIAL_BLOCK xmm8, xmm0, xmm10, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%ENC_DEC, %%DATA_OFFSET
3043
3044        vpshufb  xmm8, [rel SHUF_MASK]
3045
3046
3047%%_encrypt_done:
3048
3049        ;; Mapping to macro parameters
3050        ;; IN:
3051        ;;   xmm9 contains the counter
3052        ;;   xmm1-xmm8 contain the xor'd ciphertext
3053        ;; OUT:
3054        ;;   xmm14 contains the final hash
3055        ;;             GDATA,   T1,    T2,    T3,    T4,    T5,    T6,    T7, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
3056%ifidn %%INSTANCE_TYPE, multi_call
3057        mov     r13, [%%GDATA_CTX + PBlockLen]
3058        cmp     r13, 0
3059        jz      %%_hash_last_8
3060        GHASH_LAST_7 %%GDATA_KEY, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
3061        ;; XOR the partial word into the hash
3062        vpxor   xmm14, xmm14, xmm8
3063        jmp     %%_ghash_done
3064%endif
3065%%_hash_last_8:
3066        GHASH_LAST_8 %%GDATA_KEY, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
3067
3068%%_ghash_done:
3069        vmovdqu [%%GDATA_CTX + CurCount], xmm9  ; my_ctx_data.current_counter = xmm9
3070        vmovdqu [%%GDATA_CTX + AadHash], xmm14      ; my_ctx_data.aad hash = xmm14
3071
3072%%_enc_dec_done:
3073
3074
3075%endmacro
3076
3077
3078;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3079; GCM_COMPLETE Finishes Encryption/Decryption of last partial block after GCM_UPDATE finishes.
3080; Input: A gcm_key_data * (GDATA_KEY), gcm_context_data (GDATA_CTX).
3081; Output: Authorization Tag (AUTH_TAG) and Authorization Tag length (AUTH_TAG_LEN)
3082; Clobbers rax, r10-r12, and xmm0-xmm2, xmm5-xmm6, xmm9-xmm11, xmm13-xmm15
3083;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3084%macro  GCM_COMPLETE            5
3085%define %%GDATA_KEY             %1
3086%define %%GDATA_CTX             %2
3087%define %%AUTH_TAG              %3
3088%define %%AUTH_TAG_LEN          %4
3089%define %%INSTANCE_TYPE         %5
3090%define %%PLAIN_CYPH_LEN        rax
3091
3092        vmovdqu xmm13, [%%GDATA_KEY + HashKey]
3093        ;; Start AES as early as possible
3094        vmovdqu xmm9, [%%GDATA_CTX + OrigIV]    ; xmm9 = Y0
3095        ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, xmm9  ; E(K, Y0)
3096
3097%ifidn %%INSTANCE_TYPE, multi_call
3098        ;; If the GCM function is called as a single function call rather
3099        ;; than invoking the individual parts (init, update, finalize) we
3100        ;; can remove a write to read dependency on AadHash.
3101        vmovdqu xmm14, [%%GDATA_CTX + AadHash]
3102
3103        ;; Encrypt the final partial block. If we did this as a single call then
3104        ;; the partial block was handled in the main GCM_ENC_DEC macro.
3105	mov	r12, [%%GDATA_CTX + PBlockLen]
3106	cmp	r12, 0
3107
3108	je %%_partial_done
3109
3110	GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
3111	vmovdqu [%%GDATA_CTX + AadHash], xmm14
3112
3113%%_partial_done:
3114
3115%endif
3116
3117        mov     r12, [%%GDATA_CTX + AadLen]     ; r12 = aadLen (number of bytes)
3118        mov     %%PLAIN_CYPH_LEN, [%%GDATA_CTX + InLen]
3119
3120        shl     r12, 3                      ; convert into number of bits
3121        vmovq   xmm15, r12                  ; len(A) in xmm15
3122
3123        shl     %%PLAIN_CYPH_LEN, 3         ; len(C) in bits  (*128)
3124        vmovq   xmm1, %%PLAIN_CYPH_LEN
3125        vpslldq xmm15, xmm15, 8             ; xmm15 = len(A)|| 0x0000000000000000
3126        vpxor   xmm15, xmm15, xmm1          ; xmm15 = len(A)||len(C)
3127
3128        vpxor   xmm14, xmm15
3129        GHASH_MUL       xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6
3130        vpshufb  xmm14, [rel SHUF_MASK]         ; perform a 16Byte swap
3131
3132        vpxor   xmm9, xmm9, xmm14
3133
3134
3135%%_return_T:
3136        mov     r10, %%AUTH_TAG             ; r10 = authTag
3137        mov     r11, %%AUTH_TAG_LEN         ; r11 = auth_tag_len
3138
3139        cmp     r11, 16
3140        je      %%_T_16
3141
3142        cmp     r11, 12
3143        je      %%_T_12
3144
3145        cmp     r11, 8
3146        je      %%_T_8
3147
3148        simd_store_avx r10, xmm9, r11, r12, rax
3149        jmp     %%_return_T_done
3150%%_T_8:
3151        vmovq   rax, xmm9
3152        mov     [r10], rax
3153        jmp     %%_return_T_done
3154%%_T_12:
3155        vmovq   rax, xmm9
3156        mov     [r10], rax
3157        vpsrldq xmm9, xmm9, 8
3158        vmovd   eax, xmm9
3159        mov     [r10 + 8], eax
3160        jmp     %%_return_T_done
3161%%_T_16:
3162        vmovdqu  [r10], xmm9
3163
3164%%_return_T_done:
3165
3166%ifdef SAFE_DATA
3167        ;; Clear sensitive data from context structure
3168        vpxor   xmm0, xmm0
3169        vmovdqu [%%GDATA_CTX + AadHash], xmm0
3170        vmovdqu [%%GDATA_CTX + PBlockEncKey], xmm0
3171%endif
3172%endmacro ; GCM_COMPLETE
3173
3174
3175;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3176;void   aes_gcm_precomp_128_avx_gen4 /
3177;       aes_gcm_precomp_192_avx_gen4 /
3178;       aes_gcm_precomp_256_avx_gen4
3179;       (struct gcm_key_data *key_data)
3180;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3181MKGLOBAL(FN_NAME(precomp,_),function,)
3182FN_NAME(precomp,_):
3183
3184%ifdef SAFE_PARAM
3185        ;; Check key_data != NULL
3186        cmp     arg1, 0
3187        jz      exit_precomp
3188%endif
3189
3190        push    r12
3191        push    r13
3192        push    r14
3193        push    r15
3194
3195        mov     r14, rsp
3196
3197
3198
3199        sub     rsp, VARIABLE_OFFSET
3200        and     rsp, ~63                                 ; align rsp to 64 bytes
3201
3202%ifidn __OUTPUT_FORMAT__, win64
3203        ; only xmm6 needs to be maintained
3204        vmovdqu [rsp + LOCAL_STORAGE + 0*16],xmm6
3205%endif
3206
3207        vpxor   xmm6, xmm6
3208        ENCRYPT_SINGLE_BLOCK    arg1, xmm6              ; xmm6 = HashKey
3209
3210        vpshufb  xmm6, [rel SHUF_MASK]
3211        ;;;;;;;;;;;;;;;  PRECOMPUTATION of HashKey<<1 mod poly from the HashKey;;;;;;;;;;;;;;;
3212        vmovdqa  xmm2, xmm6
3213        vpsllq   xmm6, xmm6, 1
3214        vpsrlq   xmm2, xmm2, 63
3215        vmovdqa  xmm1, xmm2
3216        vpslldq  xmm2, xmm2, 8
3217        vpsrldq  xmm1, xmm1, 8
3218        vpor     xmm6, xmm6, xmm2
3219        ;reduction
3220        vpshufd  xmm2, xmm1, 00100100b
3221        vpcmpeqd xmm2, [rel TWOONE]
3222        vpand    xmm2, xmm2, [rel POLY]
3223        vpxor    xmm6, xmm6, xmm2                       ; xmm6 holds the HashKey<<1 mod poly
3224        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3225        vmovdqu  [arg1 + HashKey], xmm6                 ; store HashKey<<1 mod poly
3226
3227
3228        PRECOMPUTE arg1, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
3229
3230%ifidn __OUTPUT_FORMAT__, win64
3231        vmovdqu xmm6, [rsp + LOCAL_STORAGE + 0*16]
3232%endif
3233        mov     rsp, r14
3234
3235        pop     r15
3236        pop     r14
3237        pop     r13
3238        pop     r12
3239
3240%ifdef SAFE_DATA
3241        clear_scratch_gps_asm
3242        clear_scratch_ymms_asm
3243%endif
3244exit_precomp:
3245
3246        ret
3247
3248
3249;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3250;void   aes_gcm_init_128_avx_gen4 / aes_gcm_init_192_avx_gen4 / aes_gcm_init_256_avx_gen4
3251;       (const struct gcm_key_data *key_data,
3252;        struct gcm_context_data *context_data,
3253;        u8       *iv,
3254;        const u8 *aad,
3255;        u64      aad_len);
3256;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3257MKGLOBAL(FN_NAME(init,_),function,)
3258FN_NAME(init,_):
3259        push    r12
3260        push    r13
3261%ifidn __OUTPUT_FORMAT__, win64
3262        push    r14
3263        push    r15
3264        mov     r14, rsp
3265	; xmm6 needs to be maintained for Windows
3266	sub	rsp, 1*16
3267	vmovdqu	[rsp + 0*16], xmm6
3268%endif
3269
3270%ifdef SAFE_PARAM
3271        ;; Check key_data != NULL
3272        cmp     arg1, 0
3273        jz      exit_init
3274
3275        ;; Check context_data != NULL
3276        cmp     arg2, 0
3277        jz      exit_init
3278
3279        ;; Check IV != NULL
3280        cmp     arg3, 0
3281        jz      exit_init
3282
3283        ;; Check if aad_len == 0
3284        cmp     arg5, 0
3285        jz      skip_aad_check_init
3286
3287        ;; Check aad != NULL (aad_len != 0)
3288        cmp     arg4, 0
3289        jz      exit_init
3290
3291skip_aad_check_init:
3292%endif
3293        GCM_INIT arg1, arg2, arg3, arg4, arg5
3294
3295%ifdef SAFE_DATA
3296        clear_scratch_gps_asm
3297        clear_scratch_ymms_asm
3298%endif
3299exit_init:
3300
3301%ifidn __OUTPUT_FORMAT__, win64
3302	vmovdqu	xmm6 , [rsp + 0*16]
3303        mov     rsp, r14
3304        pop     r15
3305        pop     r14
3306%endif
3307        pop     r13
3308        pop     r12
3309        ret
3310
3311;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3312;void   aes_gcm_init_var_iv_128_avx_gen4 / aes_gcm_init_var_iv_192_avx_gen4 /
3313;       aes_gcm_init_var_iv_256_avx_gen4
3314;        const struct gcm_key_data *key_data,
3315;        struct gcm_context_data *context_data,
3316;        u8        *iv,
3317;        const u64 iv_len,
3318;        const u8  *aad,
3319;        const u64 aad_len);
3320;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3321MKGLOBAL(FN_NAME(init_var_iv,_),function,)
3322FN_NAME(init_var_iv,_):
3323	push	r12
3324	push	r13
3325%ifidn __OUTPUT_FORMAT__, win64
3326        push    r14
3327        push    r15
3328        mov     r14, rsp
3329	; xmm6 needs to be maintained for Windows
3330	sub	rsp, 1*16
3331	vmovdqu	[rsp + 0*16], xmm6
3332%endif
3333
3334%ifdef SAFE_PARAM
3335        ;; Check key_data != NULL
3336        cmp     arg1, 0
3337        jz      exit_init_IV
3338
3339        ;; Check context_data != NULL
3340        cmp     arg2, 0
3341        jz      exit_init_IV
3342
3343        ;; Check IV != NULL
3344        cmp     arg3, 0
3345        jz      exit_init_IV
3346
3347        ;; Check iv_len != 0
3348        cmp     arg4, 0
3349        jz      exit_init_IV
3350
3351        ;; Check if aad_len == 0
3352        cmp     arg6, 0
3353        jz      skip_aad_check_init_IV
3354
3355        ;; Check aad != NULL (aad_len != 0)
3356        cmp     arg5, 0
3357        jz      exit_init_IV
3358
3359skip_aad_check_init_IV:
3360%endif
3361        cmp     arg4, 12
3362        je      iv_len_12_init_IV
3363
3364	GCM_INIT arg1, arg2, arg3, arg5, arg6, arg4
3365        jmp     skip_iv_len_12_init_IV
3366
3367iv_len_12_init_IV:
3368	GCM_INIT arg1, arg2, arg3, arg5, arg6
3369
3370skip_iv_len_12_init_IV:
3371%ifdef SAFE_DATA
3372        clear_scratch_gps_asm
3373        clear_scratch_ymms_asm
3374%endif
3375exit_init_IV:
3376
3377%ifidn __OUTPUT_FORMAT__, win64
3378	vmovdqu	xmm6 , [rsp + 0*16]
3379        mov     rsp, r14
3380        pop     r15
3381        pop     r14
3382%endif
3383	pop	r13
3384	pop	r12
3385        ret
3386
3387;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3388;void   aes_gcm_enc_128_update_avx_gen4 / aes_gcm_enc_192_update_avx_gen4 /
3389;       aes_gcm_enc_128_update_avx_gen4
3390;       (const struct gcm_key_data *key_data,
3391;        struct gcm_context_data *context_data,
3392;        u8       *out,
3393;        const u8 *in,
3394;        u64      plaintext_len);
3395;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3396MKGLOBAL(FN_NAME(enc,_update_),function,)
3397FN_NAME(enc,_update_):
3398
3399        FUNC_SAVE
3400
3401%ifdef SAFE_PARAM
3402        ;; Check key_data != NULL
3403        cmp     arg1, 0
3404        jz      exit_update_enc
3405
3406        ;; Check context_data != NULL
3407        cmp     arg2, 0
3408        jz      exit_update_enc
3409
3410        ;; Check if plaintext_len == 0
3411        cmp     arg5, 0
3412        jz      skip_in_out_check_update_enc
3413
3414        ;; Check out != NULL (plaintext_len != 0)
3415        cmp     arg3, 0
3416        jz      exit_update_enc
3417
3418        ;; Check in != NULL (plaintext_len != 0)
3419        cmp     arg4, 0
3420        jz      exit_update_enc
3421
3422skip_in_out_check_update_enc:
3423%endif
3424        GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC, multi_call
3425
3426exit_update_enc:
3427        FUNC_RESTORE
3428
3429        ret
3430
3431
3432;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3433;void   aes_gcm_dec_128_update_avx_gen4 / aes_gcm_dec_192_update_avx_gen4 /
3434;       aes_gcm_dec_256_update_avx_gen4
3435;       (const struct gcm_key_data *key_data,
3436;        struct gcm_context_data *context_data,
3437;        u8       *out,
3438;        const u8 *in,
3439;        u64      plaintext_len);
3440;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3441MKGLOBAL(FN_NAME(dec,_update_),function,)
3442FN_NAME(dec,_update_):
3443
3444        FUNC_SAVE
3445
3446%ifdef SAFE_PARAM
3447        ;; Check key_data != NULL
3448        cmp     arg1, 0
3449        jz      exit_update_dec
3450
3451        ;; Check context_data != NULL
3452        cmp     arg2, 0
3453        jz      exit_update_dec
3454
3455        ;; Check if plaintext_len == 0
3456        cmp     arg5, 0
3457        jz      skip_in_out_check_update_dec
3458
3459        ;; Check out != NULL (plaintext_len != 0)
3460        cmp     arg3, 0
3461        jz      exit_update_dec
3462
3463        ;; Check in != NULL (plaintext_len != 0)
3464        cmp     arg4, 0
3465        jz      exit_update_dec
3466
3467skip_in_out_check_update_dec:
3468%endif
3469
3470        GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC, multi_call
3471
3472exit_update_dec:
3473        FUNC_RESTORE
3474
3475        ret
3476
3477
3478;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3479;void   aes_gcm_enc_128_finalize_avx_gen4 / aes_gcm_enc_192_finalize_avx_gen4 /
3480;	aes_gcm_enc_256_finalize_avx_gen4
3481;       (const struct gcm_key_data *key_data,
3482;        struct gcm_context_data *context_data,
3483;        u8       *auth_tag,
3484;        u64      auth_tag_len);
3485;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3486MKGLOBAL(FN_NAME(enc,_finalize_),function,)
3487FN_NAME(enc,_finalize_):
3488
3489%ifdef SAFE_PARAM
3490        ;; Check key_data != NULL
3491        cmp     arg1, 0
3492        jz      exit_enc_fin
3493
3494        ;; Check context_data != NULL
3495        cmp     arg2, 0
3496        jz      exit_enc_fin
3497
3498        ;; Check auth_tag != NULL
3499        cmp     arg3, 0
3500        jz      exit_enc_fin
3501
3502        ;; Check auth_tag_len == 0 or > 16
3503        cmp     arg4, 0
3504        jz      exit_enc_fin
3505
3506        cmp     arg4, 16
3507        ja      exit_enc_fin
3508%endif
3509        push r12
3510
3511%ifidn __OUTPUT_FORMAT__, win64
3512        ; xmm6:xmm15 need to be maintained for Windows
3513	sub	rsp, 7*16
3514        vmovdqu	[rsp + 0*16], xmm6
3515        vmovdqu	[rsp + 1*16], xmm9
3516        vmovdqu	[rsp + 2*16], xmm10
3517        vmovdqu	[rsp + 3*16], xmm11
3518        vmovdqu	[rsp + 4*16], xmm13
3519        vmovdqu	[rsp + 5*16], xmm14
3520        vmovdqu	[rsp + 6*16], xmm15
3521%endif
3522        GCM_COMPLETE    arg1, arg2, arg3, arg4, multi_call
3523
3524%ifdef SAFE_DATA
3525        clear_scratch_gps_asm
3526        clear_scratch_ymms_asm
3527%endif
3528%ifidn __OUTPUT_FORMAT__, win64
3529        vmovdqu	xmm15, [rsp + 6*16]
3530        vmovdqu	xmm14, [rsp + 5*16]
3531        vmovdqu	xmm13, [rsp + 4*16]
3532        vmovdqu	xmm11, [rsp + 3*16]
3533        vmovdqu	xmm10, [rsp + 2*16]
3534        vmovdqu	xmm9,  [rsp + 1*16]
3535        vmovdqu	xmm6,  [rsp + 0*16]
3536        add     rsp, 7*16
3537%endif
3538        pop r12
3539exit_enc_fin:
3540
3541ret
3542
3543
3544;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3545;void   aes_gcm_dec_128_finalize_avx_gen4 / aes_gcm_dec_192_finalize_avx_gen4
3546;	aes_gcm_dec_256_finalize_avx_gen4
3547;       (const struct gcm_key_data *key_data,
3548;        struct gcm_context_data *context_data,
3549;        u8       *auth_tag,
3550;        u64      auth_tag_len);
3551;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3552MKGLOBAL(FN_NAME(dec,_finalize_),function,)
3553FN_NAME(dec,_finalize_):
3554
3555%ifdef SAFE_PARAM
3556        ;; Check key_data != NULL
3557        cmp     arg1, 0
3558        jz      exit_dec_fin
3559
3560        ;; Check context_data != NULL
3561        cmp     arg2, 0
3562        jz      exit_dec_fin
3563
3564        ;; Check auth_tag != NULL
3565        cmp     arg3, 0
3566        jz      exit_dec_fin
3567
3568        ;; Check auth_tag_len == 0 or > 16
3569        cmp     arg4, 0
3570        jz      exit_dec_fin
3571
3572        cmp     arg4, 16
3573        ja      exit_dec_fin
3574%endif
3575
3576        push r12
3577
3578%ifidn __OUTPUT_FORMAT__, win64
3579        ; xmm6:xmm15 need to be maintained for Windows
3580	sub	rsp, 7*16
3581        vmovdqu	[rsp + 0*16], xmm6
3582        vmovdqu	[rsp + 1*16], xmm9
3583        vmovdqu	[rsp + 2*16], xmm10
3584        vmovdqu	[rsp + 3*16], xmm11
3585        vmovdqu	[rsp + 4*16], xmm13
3586        vmovdqu	[rsp + 5*16], xmm14
3587        vmovdqu	[rsp + 6*16], xmm15
3588%endif
3589        GCM_COMPLETE    arg1, arg2, arg3, arg4, multi_call
3590
3591%ifdef SAFE_DATA
3592        clear_scratch_gps_asm
3593        clear_scratch_ymms_asm
3594%endif
3595%ifidn __OUTPUT_FORMAT__, win64
3596        vmovdqu	xmm15, [rsp + 6*16]
3597        vmovdqu	xmm14, [rsp + 5*16]
3598        vmovdqu	xmm13, [rsp + 4*16]
3599        vmovdqu	xmm11, [rsp + 3*16]
3600        vmovdqu	xmm10, [rsp + 2*16]
3601        vmovdqu	xmm9,  [rsp + 1*16]
3602        vmovdqu	xmm6,  [rsp + 0*16]
3603        add     rsp, 7*16
3604%endif
3605
3606        pop r12
3607
3608exit_dec_fin:
3609        ret
3610
3611
3612;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3613;void   aes_gcm_enc_128_avx_gen4 / aes_gcm_enc_192_avx_gen4 / aes_gcm_enc_256_avx_gen4
3614;       (const struct gcm_key_data *key_data,
3615;        struct gcm_context_data *context_data,
3616;        u8       *out,
3617;        const u8 *in,
3618;        u64      plaintext_len,
3619;        u8       *iv,
3620;        const u8 *aad,
3621;        u64      aad_len,
3622;        u8       *auth_tag,
3623;        u64      auth_tag_len);
3624;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3625MKGLOBAL(FN_NAME(enc,_),function,)
3626FN_NAME(enc,_):
3627
3628        FUNC_SAVE
3629
3630%ifdef SAFE_PARAM
3631        ;; Check key_data != NULL
3632        cmp     arg1, 0
3633        jz      exit_enc
3634
3635        ;; Check context_data != NULL
3636        cmp     arg2, 0
3637        jz      exit_enc
3638
3639        ;; Check IV != NULL
3640        cmp     arg6, 0
3641        jz      exit_enc
3642
3643        ;; Check auth_tag != NULL
3644        cmp     arg9, 0
3645        jz      exit_enc
3646
3647        ;; Check auth_tag_len == 0 or > 16
3648        cmp     arg10, 0
3649        jz      exit_enc
3650
3651        cmp     arg10, 16
3652        ja      exit_enc
3653
3654        ;; Check if plaintext_len == 0
3655        cmp     arg5, 0
3656        jz      skip_in_out_check_enc
3657
3658        ;; Check out != NULL (plaintext_len != 0)
3659        cmp     arg3, 0
3660        jz      exit_enc
3661
3662        ;; Check in != NULL (plaintext_len != 0)
3663        cmp     arg4, 0
3664        jz      exit_enc
3665
3666skip_in_out_check_enc:
3667        ;; Check if aad_len == 0
3668        cmp     arg8, 0
3669        jz      skip_aad_check_enc
3670
3671        ;; Check aad != NULL (aad_len != 0)
3672        cmp     arg7, 0
3673        jz      exit_enc
3674
3675skip_aad_check_enc:
3676%endif
3677        GCM_INIT arg1, arg2, arg6, arg7, arg8
3678
3679        GCM_ENC_DEC  arg1, arg2, arg3, arg4, arg5, ENC, single_call
3680
3681        GCM_COMPLETE arg1, arg2, arg9, arg10, single_call
3682
3683exit_enc:
3684        FUNC_RESTORE
3685
3686        ret
3687
3688;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3689;void   aes_gcm_dec_128_avx_gen4 / aes_gcm_dec_192_avx_gen4 / aes_gcm_dec_256_avx_gen4
3690;       (const struct gcm_key_data *key_data,
3691;        struct gcm_context_data *context_data,
3692;        u8       *out,
3693;        const u8 *in,
3694;        u64      plaintext_len,
3695;        u8       *iv,
3696;        const u8 *aad,
3697;        u64      aad_len,
3698;        u8       *auth_tag,
3699;        u64      auth_tag_len);
3700;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3701MKGLOBAL(FN_NAME(dec,_),function,)
3702FN_NAME(dec,_):
3703
3704        FUNC_SAVE
3705
3706%ifdef SAFE_PARAM
3707        ;; Check key_data != NULL
3708        cmp     arg1, 0
3709        jz      exit_dec
3710
3711        ;; Check context_data != NULL
3712        cmp     arg2, 0
3713        jz      exit_dec
3714
3715        ;; Check IV != NULL
3716        cmp     arg6, 0
3717        jz      exit_dec
3718
3719        ;; Check auth_tag != NULL
3720        cmp     arg9, 0
3721        jz      exit_dec
3722
3723        ;; Check auth_tag_len == 0 or > 16
3724        cmp     arg10, 0
3725        jz      exit_dec
3726
3727        cmp     arg10, 16
3728        ja      exit_dec
3729
3730        ;; Check if plaintext_len == 0
3731        cmp     arg5, 0
3732        jz      skip_in_out_check_dec
3733
3734        ;; Check out != NULL (plaintext_len != 0)
3735        cmp     arg3, 0
3736        jz      exit_dec
3737
3738        ;; Check in != NULL (plaintext_len != 0)
3739        cmp     arg4, 0
3740        jz      exit_dec
3741
3742skip_in_out_check_dec:
3743        ;; Check if aad_len == 0
3744        cmp     arg8, 0
3745        jz      skip_aad_check_dec
3746
3747        ;; Check aad != NULL (aad_len != 0)
3748        cmp     arg7, 0
3749        jz      exit_dec
3750
3751skip_aad_check_dec:
3752%endif
3753        GCM_INIT arg1, arg2, arg6, arg7, arg8
3754
3755        GCM_ENC_DEC  arg1, arg2, arg3, arg4, arg5, DEC, single_call
3756
3757        GCM_COMPLETE arg1, arg2, arg9, arg10, single_call
3758
3759exit_dec:
3760        FUNC_RESTORE
3761
3762        ret
3763
3764
3765;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3766;void   aes_gcm_enc_var_iv_128_avx_gen4 / aes_gcm_enc_var_iv_192_avx_gen4 /
3767;       aes_gcm_enc_var_iv_256_avx_gen4
3768;        const struct gcm_key_data *key_data,
3769;        struct gcm_context_data *context_data,
3770;        u8        *out,
3771;        const u8  *in,
3772;        u64       plaintext_len,
3773;        u8        *iv,
3774;        const u64 iv_len,
3775;        const u8  *aad,
3776;        const u64 aad_len,
3777;        u8        *auth_tag,
3778;        const u64 auth_tag_len);
3779;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3780MKGLOBAL(FN_NAME(enc_var_iv,_),function,)
3781FN_NAME(enc_var_iv,_):
3782
3783	FUNC_SAVE
3784
3785%ifdef SAFE_PARAM
3786        ;; Check key_data != NULL
3787        cmp     arg1, 0
3788        jz      exit_enc_IV
3789
3790        ;; Check context_data != NULL
3791        cmp     arg2, 0
3792        jz      exit_enc_IV
3793
3794        ;; Check IV != NULL
3795        cmp     arg6, 0
3796        jz      exit_enc_IV
3797
3798        ;; Check IV len != 0
3799        cmp     arg7, 0
3800        jz      exit_enc_IV
3801
3802        ;; Check auth_tag != NULL
3803        cmp     arg10, 0
3804        jz      exit_enc_IV
3805
3806        ;; Check auth_tag_len == 0 or > 16
3807        cmp     arg11, 0
3808        jz      exit_enc_IV
3809
3810        cmp     arg11, 16
3811        ja      exit_enc_IV
3812
3813        ;; Check if plaintext_len == 0
3814        cmp     arg5, 0
3815        jz      skip_in_out_check_enc_IV
3816
3817        ;; Check out != NULL (plaintext_len != 0)
3818        cmp     arg3, 0
3819        jz      exit_enc_IV
3820
3821        ;; Check in != NULL (plaintext_len != 0)
3822        cmp     arg4, 0
3823        jz      exit_enc_IV
3824
3825skip_in_out_check_enc_IV:
3826        ;; Check if aad_len == 0
3827        cmp     arg9, 0
3828        jz      skip_aad_check_enc_IV
3829
3830        ;; Check aad != NULL (aad_len != 0)
3831        cmp     arg8, 0
3832        jz      exit_enc_IV
3833
3834skip_aad_check_enc_IV:
3835%endif
3836        cmp     arg7, 12
3837        je      iv_len_12_enc_IV
3838
3839	GCM_INIT arg1, arg2, arg6, arg8, arg9, arg7
3840        jmp     skip_iv_len_12_enc_IV
3841
3842iv_len_12_enc_IV:
3843	GCM_INIT arg1, arg2, arg6, arg8, arg9
3844
3845skip_iv_len_12_enc_IV:
3846        GCM_ENC_DEC  arg1, arg2, arg3, arg4, arg5, ENC, single_call
3847
3848        GCM_COMPLETE arg1, arg2, arg10, arg11, single_call
3849
3850exit_enc_IV:
3851	FUNC_RESTORE
3852
3853	ret
3854
3855;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3856;void   aes_gcm_dec_var_iv_128_avx_gen4 / aes_gcm_dec_var_iv_192_avx_gen4 /
3857;       aes_gcm_dec_var_iv_256_avx_gen4
3858;        const struct gcm_key_data *key_data,
3859;        struct gcm_context_data *context_data,
3860;        u8        *out,
3861;        const u8  *in,
3862;        u64       plaintext_len,
3863;        u8        *iv,
3864;        const u64 iv_len,
3865;        const u8  *aad,
3866;        const u64 aad_len,
3867;        u8        *auth_tag,
3868;        const u64 auth_tag_len);
3869;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3870MKGLOBAL(FN_NAME(dec_var_iv,_),function,)
3871FN_NAME(dec_var_iv,_):
3872
3873	FUNC_SAVE
3874
3875%ifdef SAFE_PARAM
3876        ;; Check key_data != NULL
3877        cmp     arg1, 0
3878        jz      exit_dec_IV
3879
3880        ;; Check context_data != NULL
3881        cmp     arg2, 0
3882        jz      exit_dec_IV
3883
3884        ;; Check IV != NULL
3885        cmp     arg6, 0
3886        jz      exit_dec_IV
3887
3888        ;; Check IV len != 0
3889        cmp     arg7, 0
3890        jz      exit_dec_IV
3891
3892        ;; Check auth_tag != NULL
3893        cmp     arg10, 0
3894        jz      exit_dec_IV
3895
3896        ;; Check auth_tag_len == 0 or > 16
3897        cmp     arg11, 0
3898        jz      exit_dec_IV
3899
3900        cmp     arg11, 16
3901        ja      exit_dec_IV
3902
3903        ;; Check if plaintext_len == 0
3904        cmp     arg5, 0
3905        jz      skip_in_out_check_dec_IV
3906
3907        ;; Check out != NULL (plaintext_len != 0)
3908        cmp     arg3, 0
3909        jz      exit_dec_IV
3910
3911        ;; Check in != NULL (plaintext_len != 0)
3912        cmp     arg4, 0
3913        jz      exit_dec_IV
3914
3915skip_in_out_check_dec_IV:
3916        ;; Check if aad_len == 0
3917        cmp     arg9, 0
3918        jz      skip_aad_check_dec_IV
3919
3920        ;; Check aad != NULL (aad_len != 0)
3921        cmp     arg8, 0
3922        jz      exit_dec_IV
3923
3924skip_aad_check_dec_IV:
3925%endif
3926        cmp     arg7, 12
3927        je      iv_len_12_dec_IV
3928
3929	GCM_INIT arg1, arg2, arg6, arg8, arg9, arg7
3930        jmp     skip_iv_len_12_dec_IV
3931
3932iv_len_12_dec_IV:
3933	GCM_INIT arg1, arg2, arg6, arg8, arg9
3934
3935skip_iv_len_12_dec_IV:
3936        GCM_ENC_DEC  arg1, arg2, arg3, arg4, arg5, DEC, single_call
3937
3938        GCM_COMPLETE arg1, arg2, arg10, arg11, single_call
3939
3940
3941exit_dec_IV:
3942	FUNC_RESTORE
3943
3944	ret
3945
3946%ifdef GCM128_MODE
3947;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3948;void   ghash_avx_gen4
3949;        const struct gcm_key_data *key_data,
3950;        const void   *in,
3951;        const u64    in_len,
3952;        void         *tag,
3953;        const u64    tag_len);
3954;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3955MKGLOBAL(ghash_avx_gen4,function,)
3956ghash_avx_gen4:
3957
3958        FUNC_SAVE
3959
3960%ifdef SAFE_PARAM
3961        ;; Check key_data != NULL
3962        cmp     arg1, 0
3963        jz      exit_ghash
3964
3965        ;; Check in != NULL
3966        cmp     arg2, 0
3967        jz      exit_ghash
3968
3969        ;; Check in_len != 0
3970        cmp     arg3, 0
3971        jz      exit_ghash
3972
3973        ;; Check tag != NULL
3974        cmp     arg4, 0
3975        jz      exit_ghash
3976
3977        ;; Check tag_len != 0
3978        cmp     arg5, 0
3979        jz      exit_ghash
3980%endif
3981
3982        vpxor   xmm0, xmm0
3983        CALC_AAD_HASH arg2, arg3, xmm0, arg1, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, \
3984                      r10, r11, r12, r13, rax
3985
3986        vpshufb xmm0, [rel SHUF_MASK] ; perform a 16Byte swap
3987
3988        simd_store_avx arg4, xmm0, arg5, r12, rax
3989
3990exit_ghash:
3991        FUNC_RESTORE
3992
3993        ret
3994%endif
3995
3996;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3997; PARTIAL_BLOCK_GMAC: Handles the tag partial blocks between update calls.
3998; Requires the input data be at least 1 byte long.
3999; Input: gcm_key_data (GDATA_KEY), gcm_context_data (GDATA_CTX), input text (PLAIN_IN),
4000; input text length (PLAIN_LEN), hash subkey (HASH_SUBKEY).
4001; Output: Updated GDATA_CTX
4002; Clobbers rax, r10, r12, r13, r15, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm9, xmm10, xmm11, xmm13
4003;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4004%macro PARTIAL_BLOCK_GMAC	7
4005%define	%%GDATA_KEY             %1
4006%define	%%GDATA_CTX             %2
4007%define	%%PLAIN_IN              %3
4008%define	%%PLAIN_LEN             %4
4009%define	%%DATA_OFFSET           %5
4010%define	%%AAD_HASH              %6
4011%define	%%HASH_SUBKEY           %7
4012
4013	mov	r13, [%%GDATA_CTX + PBlockLen]
4014	cmp	r13, 0
4015        ; Leave Macro if no partial blocks
4016	je	%%_partial_block_done
4017
4018        ; Read in input data without over reading
4019	cmp	%%PLAIN_LEN, 16
4020	jl	%%_fewer_than_16_bytes
4021        ; If more than 16 bytes of data, just fill the xmm register
4022	VXLDR   xmm1, [%%PLAIN_IN]
4023	jmp	%%_data_read
4024
4025%%_fewer_than_16_bytes:
4026	lea	r10, [%%PLAIN_IN]
4027	READ_SMALL_DATA_INPUT	xmm1, r10, %%PLAIN_LEN, rax, r12, r15
4028
4029        ; Finished reading in data
4030%%_data_read:
4031
4032	lea	r12, [rel SHIFT_MASK]
4033        ; Adjust the shuffle mask pointer to be able to shift r13 bytes
4034        ; (16-r13 is the number of bytes in plaintext mod 16)
4035	add	r12, r13
4036        ; Get the appropriate shuffle mask
4037	vmovdqu	xmm2, [r12]
4038	vmovdqa	xmm3, xmm1
4039
4040	mov	r15, %%PLAIN_LEN
4041	add	r15, r13
4042        ; Set r15 to be the amount of data left in PLAIN_IN after filling the block
4043	sub	r15, 16
4044        ; Determine if partial block is not being filled and shift mask accordingly
4045	jge	%%_no_extra_mask_1
4046	sub	r12, r15
4047%%_no_extra_mask_1:
4048
4049        ; Get the appropriate mask to mask out bottom r13 bytes of xmm3
4050	vmovdqu	xmm1, [r12 + ALL_F-SHIFT_MASK]
4051
4052	vpand	xmm3, xmm1
4053	vpshufb	xmm3, [rel SHUF_MASK]
4054	vpshufb	xmm3, xmm2
4055	vpxor	%%AAD_HASH, xmm3
4056
4057	cmp	r15,0
4058	jl	%%_partial_incomplete_1
4059
4060        ; GHASH computation for the last <16 Byte block
4061	GHASH_MUL	%%AAD_HASH, %%HASH_SUBKEY, xmm0, xmm10, xmm11, xmm5, xmm6
4062	xor	rax, rax
4063	mov	[%%GDATA_CTX + PBlockLen], rax
4064	jmp	%%_ghash_done
4065%%_partial_incomplete_1:
4066%ifidn __OUTPUT_FORMAT__, win64
4067        mov     rax, %%PLAIN_LEN
4068        add     [%%GDATA_CTX + PBlockLen], rax
4069%else
4070        add     [%%GDATA_CTX + PBlockLen], %%PLAIN_LEN
4071%endif
4072%%_ghash_done:
4073	vmovdqu	[%%GDATA_CTX + AadHash], %%AAD_HASH
4074
4075        cmp     r15, 0
4076        jl      %%_partial_fill
4077
4078        mov     r12, 16
4079        ; Set r12 to be the number of bytes to skip after this macro
4080        sub     r12, r13
4081
4082        jmp     %%offset_set
4083%%_partial_fill:
4084        mov     r12, %%PLAIN_LEN
4085%%offset_set:
4086        mov     %%DATA_OFFSET, r12
4087%%_partial_block_done:
4088%endmacro ; PARTIAL_BLOCK_GMAC
4089
4090;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4091;void   imb_aes_gmac_update_128_avx_gen4 / imb_aes_gmac_update_192_avx_gen4 /
4092;       imb_aes_gmac_update_256_avx_gen4
4093;        const struct gcm_key_data *key_data,
4094;        struct gcm_context_data *context_data,
4095;        const   u8 *in,
4096;        const   u64 plaintext_len);
4097;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4098MKGLOBAL(GMAC_FN_NAME(update),function,)
4099GMAC_FN_NAME(update):
4100
4101	FUNC_SAVE
4102
4103        ;; Check if plaintext_len == 0
4104        cmp     arg4, 0
4105        je	exit_gmac_update
4106
4107%ifdef SAFE_PARAM
4108        ;; Check key_data != NULL
4109        cmp     arg1, 0
4110        jz      exit_gmac_update
4111
4112        ;; Check context_data != NULL
4113        cmp     arg2, 0
4114        jz      exit_gmac_update
4115
4116        ;; Check in != NULL (plaintext_len != 0)
4117        cmp     arg3, 0
4118        jz      exit_gmac_update
4119%endif
4120
4121        ; Increment size of "AAD length" for GMAC
4122        add     [arg2 + AadLen], arg4
4123
4124        ;; Deal with previous partial block
4125	xor	r11, r11
4126	vmovdqu	xmm13, [arg1 + HashKey]
4127	vmovdqu	xmm8, [arg2 + AadHash]
4128
4129	PARTIAL_BLOCK_GMAC arg1, arg2, arg3, arg4, r11, xmm8, xmm13
4130
4131        ; CALC_AAD_HASH needs to deal with multiple of 16 bytes
4132        sub     arg4, r11
4133        add     arg3, r11
4134
4135        vmovq   xmm7, arg4 ; Save remaining length
4136        and     arg4, -16 ; Get multiple of 16 bytes
4137
4138        or      arg4, arg4
4139        jz      no_full_blocks
4140
4141        ;; Calculate GHASH of this segment
4142        CALC_AAD_HASH arg3, arg4, xmm8, arg1, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, \
4143                      r10, r11, r12, r13, rax
4144	vmovdqu	[arg2 + AadHash], xmm8	; ctx_data.aad hash = aad_hash
4145
4146no_full_blocks:
4147        add     arg3, arg4 ; Point at partial block
4148
4149        vmovq   arg4, xmm7 ; Restore original remaining length
4150        and     arg4, 15
4151        jz      exit_gmac_update
4152
4153        ; Save next partial block
4154        mov	[arg2 + PBlockLen], arg4
4155        READ_SMALL_DATA_INPUT xmm1, arg3, arg4, r11, r12, r13
4156        vpshufb xmm1, [rel SHUF_MASK]
4157        vpxor   xmm8, xmm1
4158        vmovdqu [arg2 + AadHash], xmm8
4159
4160exit_gmac_update:
4161	FUNC_RESTORE
4162
4163	ret
4164
4165%ifdef LINUX
4166section .note.GNU-stack noalloc noexec nowrite progbits
4167%endif
4168