1;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2;  Copyright(c) 2018-2020, Intel Corporation All rights reserved.
3;
4;  Redistribution and use in source and binary forms, with or without
5;  modification, are permitted provided that the following conditions
6;  are met:
7;    * Redistributions of source code must retain the above copyright
8;      notice, this list of conditions and the following disclaimer.
9;    * Redistributions in binary form must reproduce the above copyright
10;      notice, this list of conditions and the following disclaimer in
11;      the documentation and/or other materials provided with the
12;      distribution.
13;    * Neither the name of Intel Corporation nor the names of its
14;      contributors may be used to endorse or promote products derived
15;      from this software without specific prior written permission.
16;
17;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
29
30;
31; Authors:
32;       Erdinc Ozturk
33;       Vinodh Gopal
34;       James Guilford
35;       Tomasz Kantecki
36;
37;
38; References:
39;       This code was derived and highly optimized from the code described in paper:
40;               Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation on Intel Architecture Processors. August, 2010
41;       The details of the implementation is explained in:
42;               Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode on Intel Architecture Processors. October, 2012.
43;
44;
45;
46;
47; Assumptions:
48;
49;
50;
51; iv:
52;       0                   1                   2                   3
53;       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
54;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
55;       |                             Salt  (From the SA)               |
56;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
57;       |                     Initialization Vector                     |
58;       |         (This is the sequence number from IPSec header)       |
59;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
60;       |                              0x1                              |
61;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
62;
63;
64;
65; AAD:
66;       AAD will be padded with 0 to the next 16byte multiple
67;       for example, assume AAD is a u32 vector
68;
69;       if AAD is 8 bytes:
70;       AAD[3] = {A0, A1};
71;       padded AAD in xmm register = {A1 A0 0 0}
72;
73;       0                   1                   2                   3
74;       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
75;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
76;       |                               SPI (A1)                        |
77;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
78;       |                     32-bit Sequence Number (A0)               |
79;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
80;       |                              0x0                              |
81;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
82;
83;                                       AAD Format with 32-bit Sequence Number
84;
85;       if AAD is 12 bytes:
86;       AAD[3] = {A0, A1, A2};
87;       padded AAD in xmm register = {A2 A1 A0 0}
88;
89;       0                   1                   2                   3
90;       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
91;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
92;       |                               SPI (A2)                        |
93;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
94;       |                 64-bit Extended Sequence Number {A1,A0}       |
95;       |                                                               |
96;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
97;       |                              0x0                              |
98;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
99;
100;        AAD Format with 64-bit Extended Sequence Number
101;
102;
103; aadLen:
104;       Must be a multiple of 4 bytes and from the definition of the spec.
105;       The code additionally supports any aadLen length.
106;
107; TLen:
108;       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
109;
110; poly = x^128 + x^127 + x^126 + x^121 + 1
111; throughout the code, one tab and two tab indentations are used. one tab is for GHASH part, two tabs is for AES part.
112;
113
114%include "include/os.asm"
115%include "include/reg_sizes.asm"
116%include "include/clear_regs.asm"
117%include "include/gcm_defines.asm"
118%include "include/gcm_keys_avx2_avx512.asm"
119
120%include "mb_mgr_datastruct.asm"
121%include "imb_job.asm"
122%include "include/memcpy.asm"
123
124%ifndef GCM128_MODE
125%ifndef GCM192_MODE
126%ifndef GCM256_MODE
127%error "No GCM mode selected for gcm_avx512.asm!"
128%endif
129%endif
130%endif
131
132;; Decide on AES-GCM key size to compile for
133%ifdef GCM128_MODE
134%define NROUNDS 9
135%define FN_NAME(x,y) aes_gcm_ %+ x %+ _128 %+ y %+ avx512
136%define GMAC_FN_NAME(x) imb_aes_gmac_ %+ x %+ _128_ %+ avx512
137%endif
138
139%ifdef GCM192_MODE
140%define NROUNDS 11
141%define FN_NAME(x,y) aes_gcm_ %+ x %+ _192 %+ y %+ avx512
142%define GMAC_FN_NAME(x) imb_aes_gmac_ %+ x %+ _192_ %+ avx512
143%endif
144
145%ifdef GCM256_MODE
146%define NROUNDS 13
147%define FN_NAME(x,y) aes_gcm_ %+ x %+ _256 %+ y %+ avx512
148%define GMAC_FN_NAME(x) imb_aes_gmac_ %+ x %+ _256_ %+ avx512
149%endif
150
151section .text
152default rel
153
154; need to push 4 registers into stack to maintain
155%define STACK_OFFSET   8*4
156
157%ifidn __OUTPUT_FORMAT__, win64
158        %define XMM_STORAGE     16*10
159%else
160        %define XMM_STORAGE     0
161%endif
162
163%define TMP2    16*0    ; Temporary storage for AES State 2 (State 1 is stored in an XMM register)
164%define TMP3    16*1    ; Temporary storage for AES State 3
165%define TMP4    16*2    ; Temporary storage for AES State 4
166%define TMP5    16*3    ; Temporary storage for AES State 5
167%define TMP6    16*4    ; Temporary storage for AES State 6
168%define TMP7    16*5    ; Temporary storage for AES State 7
169%define TMP8    16*6    ; Temporary storage for AES State 8
170%define LOCAL_STORAGE   16*7
171%define VARIABLE_OFFSET LOCAL_STORAGE + XMM_STORAGE
172
173;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
174; Utility Macros
175;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
176
177;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
178; GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
179; Input: A and B (128-bits each, bit-reflected)
180; Output: C = A*B*x mod poly, (i.e. >>1 )
181; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
182; GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
183;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
184%macro  GHASH_MUL  7
185%define %%GH %1         ; 16 Bytes
186%define %%HK %2         ; 16 Bytes
187%define %%T1 %3
188%define %%T2 %4
189%define %%T3 %5
190%define %%T4 %6
191%define %%T5 %7
192        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
193
194        vpclmulqdq      %%T1, %%GH, %%HK, 0x11          ; %%T1 = a1*b1
195        vpclmulqdq      %%T2, %%GH, %%HK, 0x00          ; %%T2 = a0*b0
196        vpclmulqdq      %%T3, %%GH, %%HK, 0x01          ; %%T3 = a1*b0
197        vpclmulqdq      %%GH, %%GH, %%HK, 0x10          ; %%GH = a0*b1
198        vpxor           %%GH, %%GH, %%T3
199
200
201        vpsrldq         %%T3, %%GH, 8                   ; shift-R %%GH 2 DWs
202        vpslldq         %%GH, %%GH, 8                   ; shift-L %%GH 2 DWs
203
204        vpxor           %%T1, %%T1, %%T3
205        vpxor           %%GH, %%GH, %%T2
206
207        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
208        ;first phase of the reduction
209        vmovdqu         %%T3, [rel POLY2]
210
211        vpclmulqdq      %%T2, %%T3, %%GH, 0x01
212        vpslldq         %%T2, %%T2, 8                    ; shift-L %%T2 2 DWs
213
214        vpxor           %%GH, %%GH, %%T2                 ; first phase of the reduction complete
215        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
216        ;second phase of the reduction
217        vpclmulqdq      %%T2, %%T3, %%GH, 0x00
218        vpsrldq         %%T2, %%T2, 4                    ; shift-R %%T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
219
220        vpclmulqdq      %%GH, %%T3, %%GH, 0x10
221        vpslldq         %%GH, %%GH, 4                    ; shift-L %%GH 1 DW (Shift-L 1-DW to obtain result with no shifts)
222
223        vpxor           %%GH, %%GH, %%T2                 ; second phase of the reduction complete
224        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
225        vpxor           %%GH, %%GH, %%T1                 ; the result is in %%GH
226%endmacro
227
228
229; In PRECOMPUTE, the commands filling Hashkey_i_k are not required for avx512
230; functions, but are kept to allow users to switch cpu architectures between calls
231; of pre, init, update, and finalize.
232%macro  PRECOMPUTE 8
233%define %%GDATA %1
234%define %%HK    %2
235%define %%T1    %3
236%define %%T2    %4
237%define %%T3    %5
238%define %%T4    %6
239%define %%T5    %7
240%define %%T6    %8
241
242        ; Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
243        vmovdqa  %%T5, %%HK
244
245        GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2      ;  %%T5 = HashKey^2<<1 mod poly
246        vmovdqu  [%%GDATA + HashKey_2], %%T5                    ;  [HashKey_2] = HashKey^2<<1 mod poly
247
248        GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2      ;  %%T5 = HashKey^3<<1 mod poly
249        vmovdqu  [%%GDATA + HashKey_3], %%T5
250
251        GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2      ;  %%T5 = HashKey^4<<1 mod poly
252        vmovdqu  [%%GDATA + HashKey_4], %%T5
253
254        GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2      ;  %%T5 = HashKey^5<<1 mod poly
255        vmovdqu  [%%GDATA + HashKey_5], %%T5
256
257        GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2      ;  %%T5 = HashKey^6<<1 mod poly
258        vmovdqu  [%%GDATA + HashKey_6], %%T5
259
260        GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2      ;  %%T5 = HashKey^7<<1 mod poly
261        vmovdqu  [%%GDATA + HashKey_7], %%T5
262
263        GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2      ;  %%T5 = HashKey^8<<1 mod poly
264        vmovdqu  [%%GDATA + HashKey_8], %%T5
265%endmacro
266
267
268;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
269; READ_SMALL_DATA_INPUT: Packs xmm register with data when data input is less than 16 bytes.
270; Returns 0 if data has length 0.
271; Input: The input data (INPUT), that data's length (LENGTH).
272; Output: The packed xmm register (OUTPUT).
273;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
274%macro READ_SMALL_DATA_INPUT    4
275%define %%OUTPUT                %1 ; %%OUTPUT is an xmm register
276%define %%INPUT                 %2
277%define %%LENGTH                %3
278%define %%TMP1                  %4
279
280        lea             %%TMP1, [rel byte_len_to_mask_table]
281%ifidn __OUTPUT_FORMAT__, win64
282        add             %%TMP1, %%LENGTH
283        add             %%TMP1, %%LENGTH
284        kmovw           k1, [%%TMP1]
285%else
286        kmovw           k1, [%%TMP1 + %%LENGTH*2]
287%endif
288        vmovdqu8        XWORD(%%OUTPUT){k1}{z}, [%%INPUT]
289
290%endmacro ; READ_SMALL_DATA_INPUT
291
292
293;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
294; CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
295; Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY).
296; Output: The hash of the data (AAD_HASH).
297;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
298%macro  CALC_AAD_HASH   13
299%define %%A_IN          %1
300%define %%A_LEN         %2
301%define %%AAD_HASH      %3
302%define %%GDATA_KEY     %4
303%define %%XTMP0         %5      ; xmm temp reg 5
304%define %%XTMP1         %6      ; xmm temp reg 5
305%define %%XTMP2         %7
306%define %%XTMP3         %8
307%define %%XTMP4         %9
308%define %%XTMP5         %10     ; xmm temp reg 5
309%define %%T1            %11     ; temp reg 1
310%define %%T2            %12
311%define %%T3            %13
312
313
314        mov     %%T1, %%A_IN            ; T1 = AAD
315        mov     %%T2, %%A_LEN           ; T2 = aadLen
316
317%%_get_AAD_loop128:
318        cmp     %%T2, 128
319        jl      %%_exit_AAD_loop128
320
321        vmovdqu         %%XTMP0, [%%T1 + 16*0]
322        vpshufb         %%XTMP0, [rel SHUF_MASK]
323
324        vpxor           %%XTMP0, %%AAD_HASH
325
326        vmovdqu         %%XTMP5, [%%GDATA_KEY + HashKey_8]
327        vpclmulqdq      %%XTMP1, %%XTMP0, %%XTMP5, 0x11                 ; %%T1 = a1*b1
328        vpclmulqdq      %%XTMP2, %%XTMP0, %%XTMP5, 0x00                 ; %%T2 = a0*b0
329        vpclmulqdq      %%XTMP3, %%XTMP0, %%XTMP5, 0x01                 ; %%T3 = a1*b0
330        vpclmulqdq      %%XTMP4, %%XTMP0, %%XTMP5, 0x10                 ; %%T4 = a0*b1
331        vpxor           %%XTMP3, %%XTMP3, %%XTMP4                       ; %%T3 = a1*b0 + a0*b1
332
333%assign i 1
334%assign j 7
335%rep 7
336        vmovdqu         %%XTMP0, [%%T1 + 16*i]
337        vpshufb         %%XTMP0, [rel SHUF_MASK]
338
339        vmovdqu         %%XTMP5, [%%GDATA_KEY + HashKey_ %+ j]
340        vpclmulqdq      %%XTMP4, %%XTMP0, %%XTMP5, 0x11                 ; %%T1 = T1 + a1*b1
341        vpxor           %%XTMP1, %%XTMP1, %%XTMP4
342
343        vpclmulqdq      %%XTMP4, %%XTMP0, %%XTMP5, 0x00                 ; %%T2 = T2 + a0*b0
344        vpxor           %%XTMP2, %%XTMP2, %%XTMP4
345
346        vpclmulqdq      %%XTMP4, %%XTMP0, %%XTMP5, 0x01                 ; %%T3 = T3 + a1*b0 + a0*b1
347        vpxor           %%XTMP3, %%XTMP3, %%XTMP4
348        vpclmulqdq      %%XTMP4, %%XTMP0, %%XTMP5, 0x10
349        vpxor           %%XTMP3, %%XTMP3, %%XTMP4
350%assign i (i + 1)
351%assign j (j - 1)
352%endrep
353
354        vpslldq         %%XTMP4, %%XTMP3, 8                             ; shift-L 2 DWs
355        vpsrldq         %%XTMP3, %%XTMP3, 8                             ; shift-R 2 DWs
356        vpxor           %%XTMP2, %%XTMP2, %%XTMP4
357        vpxor           %%XTMP1, %%XTMP1, %%XTMP3                       ; accumulate the results in %%T1(M):%%T2(L)
358
359        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
360        ;first phase of the reduction
361        vmovdqa         %%XTMP5, [rel POLY2]
362        vpclmulqdq      %%XTMP0, %%XTMP5, %%XTMP2, 0x01
363        vpslldq         %%XTMP0, %%XTMP0, 8                             ; shift-L xmm2 2 DWs
364        vpxor           %%XTMP2, %%XTMP2, %%XTMP0                       ; first phase of the reduction complete
365
366        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
367        ;second phase of the reduction
368        vpclmulqdq      %%XTMP3, %%XTMP5, %%XTMP2, 0x00
369        vpsrldq         %%XTMP3, %%XTMP3, 4                             ; shift-R 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
370
371        vpclmulqdq      %%XTMP4, %%XTMP5, %%XTMP2, 0x10
372        vpslldq         %%XTMP4, %%XTMP4, 4                             ; shift-L 1 DW (Shift-L 1-DW to obtain result with no shifts)
373
374        vpxor           %%XTMP4, %%XTMP4, %%XTMP3                       ; second phase of the reduction complete
375        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
376        vpxor           %%AAD_HASH, %%XTMP1, %%XTMP4                    ; the result is in %%T1
377
378        sub     %%T2, 128
379        je      %%_CALC_AAD_done
380
381        add     %%T1, 128
382        jmp     %%_get_AAD_loop128
383
384%%_exit_AAD_loop128:
385        cmp     %%T2, 16
386        jl      %%_get_small_AAD_block
387
388        ;; calculate hash_key position to start with
389        mov     %%T3, %%T2
390        and     %%T3, -16       ; 1 to 7 blocks possible here
391        neg     %%T3
392        add     %%T3, HashKey_1 + 16
393        lea     %%T3, [%%GDATA_KEY + %%T3]
394
395        vmovdqu         %%XTMP0, [%%T1]
396        vpshufb         %%XTMP0, [rel SHUF_MASK]
397
398        vpxor           %%XTMP0, %%AAD_HASH
399
400        vmovdqu         %%XTMP5, [%%T3]
401        vpclmulqdq      %%XTMP1, %%XTMP0, %%XTMP5, 0x11                 ; %%T1 = a1*b1
402        vpclmulqdq      %%XTMP2, %%XTMP0, %%XTMP5, 0x00                 ; %%T2 = a0*b0
403        vpclmulqdq      %%XTMP3, %%XTMP0, %%XTMP5, 0x01                 ; %%T3 = a1*b0
404        vpclmulqdq      %%XTMP4, %%XTMP0, %%XTMP5, 0x10                 ; %%T4 = a0*b1
405        vpxor           %%XTMP3, %%XTMP3, %%XTMP4                       ; %%T3 = a1*b0 + a0*b1
406
407        add     %%T3, 16        ; move to next hashkey
408        add     %%T1, 16        ; move to next data block
409        sub     %%T2, 16
410        cmp     %%T2, 16
411        jl      %%_AAD_reduce
412
413%%_AAD_blocks:
414        vmovdqu         %%XTMP0, [%%T1]
415        vpshufb         %%XTMP0, [rel SHUF_MASK]
416
417        vmovdqu         %%XTMP5, [%%T3]
418        vpclmulqdq      %%XTMP4, %%XTMP0, %%XTMP5, 0x11                 ; %%T1 = T1 + a1*b1
419        vpxor           %%XTMP1, %%XTMP1, %%XTMP4
420
421        vpclmulqdq      %%XTMP4, %%XTMP0, %%XTMP5, 0x00                 ; %%T2 = T2 + a0*b0
422        vpxor           %%XTMP2, %%XTMP2, %%XTMP4
423
424        vpclmulqdq      %%XTMP4, %%XTMP0, %%XTMP5, 0x01                 ; %%T3 = T3 + a1*b0 + a0*b1
425        vpxor           %%XTMP3, %%XTMP3, %%XTMP4
426        vpclmulqdq      %%XTMP4, %%XTMP0, %%XTMP5, 0x10
427        vpxor           %%XTMP3, %%XTMP3, %%XTMP4
428
429        add     %%T3, 16        ; move to next hashkey
430        add     %%T1, 16
431        sub     %%T2, 16
432        cmp     %%T2, 16
433        jl      %%_AAD_reduce
434        jmp     %%_AAD_blocks
435
436%%_AAD_reduce:
437        vpslldq         %%XTMP4, %%XTMP3, 8                             ; shift-L 2 DWs
438        vpsrldq         %%XTMP3, %%XTMP3, 8                             ; shift-R 2 DWs
439        vpxor           %%XTMP2, %%XTMP2, %%XTMP4
440        vpxor           %%XTMP1, %%XTMP1, %%XTMP3                       ; accumulate the results in %%T1(M):%%T2(L)
441
442        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
443        ;first phase of the reduction
444        vmovdqa         %%XTMP5, [rel POLY2]
445        vpclmulqdq      %%XTMP0, %%XTMP5, %%XTMP2, 0x01
446        vpslldq         %%XTMP0, %%XTMP0, 8                             ; shift-L xmm2 2 DWs
447        vpxor           %%XTMP2, %%XTMP2, %%XTMP0                       ; first phase of the reduction complete
448
449        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
450        ;second phase of the reduction
451        vpclmulqdq      %%XTMP3, %%XTMP5, %%XTMP2, 0x00
452        vpsrldq         %%XTMP3, %%XTMP3, 4                             ; shift-R 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
453
454        vpclmulqdq      %%XTMP4, %%XTMP5, %%XTMP2, 0x10
455        vpslldq         %%XTMP4, %%XTMP4, 4                             ; shift-L 1 DW (Shift-L 1-DW to obtain result with no shifts)
456
457        vpxor           %%XTMP4, %%XTMP4, %%XTMP3                       ; second phase of the reduction complete
458        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
459        vpxor           %%AAD_HASH, %%XTMP1, %%XTMP4                    ; the result is in %%T1
460
461        or      %%T2, %%T2
462        je      %%_CALC_AAD_done
463
464%%_get_small_AAD_block:
465        vmovdqu         %%XTMP0, [%%GDATA_KEY + HashKey]
466        READ_SMALL_DATA_INPUT   %%XTMP1, %%T1, %%T2, %%T3
467        ;byte-reflect the AAD data
468        vpshufb         %%XTMP1, [rel SHUF_MASK]
469        vpxor           %%AAD_HASH, %%XTMP1
470        GHASH_MUL       %%AAD_HASH, %%XTMP0, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5
471
472%%_CALC_AAD_done:
473
474%endmacro ; CALC_AAD_HASH
475
476;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
477; PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks between update calls.
478; Requires the input data be at least 1 byte long.
479; Input: gcm_context_data *(GDATA_CTX), input text (PLAIN_CYPH_IN),
480; input text length (PLAIN_CYPH_LEN), the current data offset (DATA_OFFSET),
481; the hash subkey (HASH_SUBKEY) and whether encoding or decoding (ENC_DEC)
482; Output: A cypher of the first partial block (CYPH_PLAIN_OUT), and updated GDATA_CTX
483; Clobbers rax, r10, r12, r13, r15, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm9, xmm10, xmm11
484;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
485%macro PARTIAL_BLOCK    8
486%define %%GDATA_CTX             %1
487%define %%CYPH_PLAIN_OUT        %2
488%define %%PLAIN_CYPH_IN         %3
489%define %%PLAIN_CYPH_LEN        %4
490%define %%DATA_OFFSET           %5
491%define %%AAD_HASH              %6
492%define %%HASH_SUBKEY           %7
493%define %%ENC_DEC               %8
494
495        mov     r13, [%%GDATA_CTX + PBlockLen]
496        cmp     r13, 0
497        je      %%_partial_block_done           ;Leave Macro if no partial blocks
498
499        cmp     %%PLAIN_CYPH_LEN, 16            ;Read in input data without over reading
500        jl      %%_fewer_than_16_bytes
501        VXLDR   xmm1, [%%PLAIN_CYPH_IN]         ;If more than 16 bytes of data, just fill the xmm register
502        jmp     %%_data_read
503
504%%_fewer_than_16_bytes:
505        lea     r10, [%%PLAIN_CYPH_IN]
506        READ_SMALL_DATA_INPUT   xmm1, r10, %%PLAIN_CYPH_LEN, rax
507
508%%_data_read:                           ;Finished reading in data
509
510        vmovdqu xmm9, [%%GDATA_CTX + PBlockEncKey]  ;xmm9 = my_ctx_data.partial_block_enc_key
511
512        lea     r12, [rel SHIFT_MASK]
513
514        add     r12, r13                        ; adjust the shuffle mask pointer to be able to shift r13 bytes (16-r13 is the number of bytes in plaintext mod 16)
515        vmovdqu xmm2, [r12]                     ; get the appropriate shuffle mask
516        vpshufb xmm9, xmm2                      ;shift right r13 bytes
517
518%ifidn  %%ENC_DEC, DEC
519        vmovdqa xmm3, xmm1
520%endif
521        vpxor   xmm9, xmm1                      ; Cyphertext XOR E(K, Yn)
522
523        mov     r15, %%PLAIN_CYPH_LEN
524        add     r15, r13
525        sub     r15, 16                         ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block
526        jge     %%_no_extra_mask                ;Determine if if partial block is not being filled and shift mask accordingly
527        sub     r12, r15
528%%_no_extra_mask:
529
530        vmovdqu xmm1, [r12 + ALL_F - SHIFT_MASK]; get the appropriate mask to mask out bottom r13 bytes of xmm9
531        vpand   xmm9, xmm1                      ; mask out bottom r13 bytes of xmm9
532
533%ifidn  %%ENC_DEC, DEC
534        vpand   xmm3, xmm1
535        vpshufb xmm3, [rel SHUF_MASK]
536        vpshufb xmm3, xmm2
537        vpxor   %%AAD_HASH, xmm3
538%else
539        vpshufb xmm9, [rel SHUF_MASK]
540        vpshufb xmm9, xmm2
541        vpxor   %%AAD_HASH, xmm9
542%endif
543        cmp     r15,0
544        jl      %%_partial_incomplete
545
546        GHASH_MUL       %%AAD_HASH, %%HASH_SUBKEY, xmm0, xmm10, xmm11, xmm5, xmm6       ;GHASH computation for the last <16 Byte block
547        xor     rax,rax
548        mov     [%%GDATA_CTX + PBlockLen], rax
549        jmp     %%_enc_dec_done
550%%_partial_incomplete:
551%ifidn __OUTPUT_FORMAT__, win64
552        mov     rax, %%PLAIN_CYPH_LEN
553       	add     [%%GDATA_CTX + PBlockLen], rax
554%else
555        add     [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN
556%endif
557%%_enc_dec_done:
558        vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH
559
560%ifidn  %%ENC_DEC, ENC
561        vpshufb xmm9, [rel SHUF_MASK]       ; shuffle xmm9 back to output as ciphertext
562        vpshufb xmm9, xmm2
563%endif
564
565        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
566        ; output encrypted Bytes
567        cmp     r15,0
568        jl      %%_partial_fill
569        mov     r12, r13
570        mov     r13, 16
571        sub     r13, r12                        ; Set r13 to be the number of bytes to write out
572        jmp     %%_count_set
573%%_partial_fill:
574        mov     r13, %%PLAIN_CYPH_LEN
575%%_count_set:
576        lea             rax, [rel byte_len_to_mask_table]
577        kmovw           k1, [rax + r13*2]
578        vmovdqu8        [%%CYPH_PLAIN_OUT + %%DATA_OFFSET]{k1}, xmm9
579        add             %%DATA_OFFSET, r13
580%%_partial_block_done:
581%endmacro ; PARTIAL_BLOCK
582
583
584%macro GHASH_SINGLE_MUL 9
585%define %%GDATA                 %1
586%define %%HASHKEY               %2
587%define %%CIPHER                %3
588%define %%STATE_11              %4
589%define %%STATE_00              %5
590%define %%STATE_MID             %6
591%define %%T1                    %7
592%define %%T2                    %8
593%define %%FIRST                 %9
594
595        vmovdqu         %%T1, [%%GDATA + %%HASHKEY]
596%ifidn %%FIRST, first
597        vpclmulqdq      %%STATE_11, %%CIPHER, %%T1, 0x11         ; %%T4 = a1*b1
598        vpclmulqdq      %%STATE_00, %%CIPHER, %%T1, 0x00         ; %%T4_2 = a0*b0
599        vpclmulqdq      %%STATE_MID, %%CIPHER, %%T1, 0x01        ; %%T6 = a1*b0
600        vpclmulqdq      %%T2, %%CIPHER, %%T1, 0x10               ; %%T5 = a0*b1
601        vpxor           %%STATE_MID, %%STATE_MID, %%T2
602%else
603        vpclmulqdq      %%T2, %%CIPHER, %%T1, 0x11
604        vpxor           %%STATE_11, %%STATE_11, %%T2
605
606        vpclmulqdq      %%T2, %%CIPHER, %%T1, 0x00
607        vpxor           %%STATE_00, %%STATE_00, %%T2
608
609        vpclmulqdq      %%T2, %%CIPHER, %%T1, 0x01
610        vpxor           %%STATE_MID, %%STATE_MID, %%T2
611
612        vpclmulqdq      %%T2, %%CIPHER, %%T1, 0x10
613        vpxor           %%STATE_MID, %%STATE_MID, %%T2
614%endif
615
616%endmacro
617
618; if a = number of total plaintext bytes
619; b = floor(a/16)
620; %%num_initial_blocks = b mod 8;
621; encrypt the initial %%num_initial_blocks blocks and apply ghash on the ciphertext
622; %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r14 are used as a pointer only, not modified.
623; Updated AAD_HASH is returned in %%T3
624
625%macro INITIAL_BLOCKS 23
626%define %%GDATA_KEY             %1
627%define %%CYPH_PLAIN_OUT        %2
628%define %%PLAIN_CYPH_IN         %3
629%define %%LENGTH                %4
630%define %%DATA_OFFSET           %5
631%define %%num_initial_blocks    %6      ; can be 0, 1, 2, 3, 4, 5, 6 or 7
632%define %%T1                    %7
633%define %%T2                    %8
634%define %%T3                    %9
635%define %%T4                    %10
636%define %%T5                    %11
637%define %%CTR                   %12
638%define %%XMM1                  %13
639%define %%XMM2                  %14
640%define %%XMM3                  %15
641%define %%XMM4                  %16
642%define %%XMM5                  %17
643%define %%XMM6                  %18
644%define %%XMM7                  %19
645%define %%XMM8                  %20
646%define %%T6                    %21
647%define %%T_key                 %22
648%define %%ENC_DEC               %23
649
650%assign i (8-%%num_initial_blocks)
651                ;; Move AAD_HASH to temp reg
652                vmovdqu  %%T2, %%XMM8
653                ;; Start AES for %%num_initial_blocks blocks
654                ;; vmovdqu  %%CTR, [%%GDATA_CTX + CurCount]   ; %%CTR = Y0
655
656%assign i (9-%%num_initial_blocks)
657%rep %%num_initial_blocks
658                vpaddd   %%CTR, %%CTR, [rel ONE]     ; INCR Y0
659                vmovdqa  reg(i), %%CTR
660                vpshufb  reg(i), [rel SHUF_MASK]     ; perform a 16Byte swap
661%assign i (i+1)
662%endrep
663
664%if(%%num_initial_blocks>0)
665vmovdqu  %%T_key, [%%GDATA_KEY+16*0]
666%assign i (9-%%num_initial_blocks)
667%rep %%num_initial_blocks
668                vpxor    reg(i),reg(i),%%T_key
669%assign i (i+1)
670%endrep
671
672%assign j 1
673%rep NROUNDS
674vmovdqu  %%T_key, [%%GDATA_KEY+16*j]
675%assign i (9-%%num_initial_blocks)
676%rep %%num_initial_blocks
677                vaesenc  reg(i),%%T_key
678%assign i (i+1)
679%endrep
680
681%assign j (j+1)
682%endrep
683
684
685vmovdqu  %%T_key, [%%GDATA_KEY+16*j]
686%assign i (9-%%num_initial_blocks)
687%rep %%num_initial_blocks
688                vaesenclast      reg(i),%%T_key
689%assign i (i+1)
690%endrep
691
692%endif ; %if(%%num_initial_blocks>0)
693
694
695
696%assign i (9-%%num_initial_blocks)
697%rep %%num_initial_blocks
698                VXLDR  %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
699                vpxor    reg(i), reg(i), %%T1
700                ;; Write back ciphertext for %%num_initial_blocks blocks
701                VXSTR  [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i)
702                add     %%DATA_OFFSET, 16
703                %ifidn  %%ENC_DEC, DEC
704                    vmovdqa  reg(i), %%T1
705                %endif
706                ;; Prepare ciphertext for GHASH computations
707                vpshufb  reg(i), [rel SHUF_MASK]
708%assign i (i+1)
709%endrep
710
711
712;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
713
714%assign i (9-%%num_initial_blocks)
715%if(%%num_initial_blocks>0)
716        vmovdqa %%T3, reg(i)
717%assign i (i+1)
718%endif
719%if %%num_initial_blocks>1
720%rep %%num_initial_blocks-1
721        vmovdqu [rsp + TMP %+ i], reg(i)
722%assign i (i+1)
723%endrep
724%endif
725
726                ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
727                ;; Prepare 8 counter blocks and perform rounds of AES cipher on
728                ;; them, load plain/cipher text and store cipher/plain text.
729                ;; Stitch GHASH computation in between AES rounds.
730                vpaddd   %%XMM1, %%CTR, [rel ONE]   ; INCR Y0
731                vpaddd   %%XMM2, %%CTR, [rel TWO]   ; INCR Y0
732                vpaddd   %%XMM3, %%XMM1, [rel TWO]  ; INCR Y0
733                vpaddd   %%XMM4, %%XMM2, [rel TWO]  ; INCR Y0
734                vpaddd   %%XMM5, %%XMM3, [rel TWO]  ; INCR Y0
735                vpaddd   %%XMM6, %%XMM4, [rel TWO]  ; INCR Y0
736                vpaddd   %%XMM7, %%XMM5, [rel TWO]  ; INCR Y0
737                vpaddd   %%XMM8, %%XMM6, [rel TWO]  ; INCR Y0
738                vmovdqa  %%CTR, %%XMM8
739
740                vpshufb  %%XMM1, [rel SHUF_MASK]    ; perform a 16Byte swap
741                vpshufb  %%XMM2, [rel SHUF_MASK]    ; perform a 16Byte swap
742                vpshufb  %%XMM3, [rel SHUF_MASK]    ; perform a 16Byte swap
743                vpshufb  %%XMM4, [rel SHUF_MASK]    ; perform a 16Byte swap
744                vpshufb  %%XMM5, [rel SHUF_MASK]    ; perform a 16Byte swap
745                vpshufb  %%XMM6, [rel SHUF_MASK]    ; perform a 16Byte swap
746                vpshufb  %%XMM7, [rel SHUF_MASK]    ; perform a 16Byte swap
747                vpshufb  %%XMM8, [rel SHUF_MASK]    ; perform a 16Byte swap
748
749                vmovdqu  %%T_key, [%%GDATA_KEY+16*0]
750                vpxor    %%XMM1, %%XMM1, %%T_key
751                vpxor    %%XMM2, %%XMM2, %%T_key
752                vpxor    %%XMM3, %%XMM3, %%T_key
753                vpxor    %%XMM4, %%XMM4, %%T_key
754                vpxor    %%XMM5, %%XMM5, %%T_key
755                vpxor    %%XMM6, %%XMM6, %%T_key
756                vpxor    %%XMM7, %%XMM7, %%T_key
757                vpxor    %%XMM8, %%XMM8, %%T_key
758
759%assign i (8-%%num_initial_blocks)
760%assign j (9-%%num_initial_blocks)
761%assign k (%%num_initial_blocks)
762
763%define %%T4_2 %%T4
764%if(%%num_initial_blocks>0)
765        ;; Hash in AES state
766        ;; T2 - incoming AAD hash
767        vpxor %%T2, %%T3
768
769        ;;                 GDATA,       HASHKEY, CIPHER,
770        ;;               STATE_11, STATE_00, STATE_MID, T1, T2
771        GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
772                         %%T1,     %%T4,   %%T6,    %%T5, %%T3, first
773%endif
774
775                vmovdqu  %%T_key, [%%GDATA_KEY+16*1]
776                vaesenc  %%XMM1, %%T_key
777                vaesenc  %%XMM2, %%T_key
778                vaesenc  %%XMM3, %%T_key
779                vaesenc  %%XMM4, %%T_key
780                vaesenc  %%XMM5, %%T_key
781                vaesenc  %%XMM6, %%T_key
782                vaesenc  %%XMM7, %%T_key
783                vaesenc  %%XMM8, %%T_key
784
785                vmovdqu  %%T_key, [%%GDATA_KEY+16*2]
786                vaesenc  %%XMM1, %%T_key
787                vaesenc  %%XMM2, %%T_key
788                vaesenc  %%XMM3, %%T_key
789                vaesenc  %%XMM4, %%T_key
790                vaesenc  %%XMM5, %%T_key
791                vaesenc  %%XMM6, %%T_key
792                vaesenc  %%XMM7, %%T_key
793                vaesenc  %%XMM8, %%T_key
794
795%assign i (i+1)
796%assign j (j+1)
797%assign k (k-1)
798%if(%%num_initial_blocks>1)
799        ;;                 GDATA,       HASHKEY, CIPHER,
800        ;;               STATE_11, STATE_00, STATE_MID, T1, T2
801        vmovdqu         %%T2, [rsp + TMP %+ j]
802        GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
803                         %%T1,     %%T4,   %%T6,    %%T5, %%T3, not_first
804%endif
805
806                vmovdqu  %%T_key, [%%GDATA_KEY+16*3]
807                vaesenc  %%XMM1, %%T_key
808                vaesenc  %%XMM2, %%T_key
809                vaesenc  %%XMM3, %%T_key
810                vaesenc  %%XMM4, %%T_key
811                vaesenc  %%XMM5, %%T_key
812                vaesenc  %%XMM6, %%T_key
813                vaesenc  %%XMM7, %%T_key
814                vaesenc  %%XMM8, %%T_key
815
816                vmovdqu  %%T_key, [%%GDATA_KEY+16*4]
817                vaesenc  %%XMM1, %%T_key
818                vaesenc  %%XMM2, %%T_key
819                vaesenc  %%XMM3, %%T_key
820                vaesenc  %%XMM4, %%T_key
821                vaesenc  %%XMM5, %%T_key
822                vaesenc  %%XMM6, %%T_key
823                vaesenc  %%XMM7, %%T_key
824                vaesenc  %%XMM8, %%T_key
825
826%assign i (i+1)
827%assign j (j+1)
828%assign k (k-1)
829%if(%%num_initial_blocks>2)
830        ;;                 GDATA,       HASHKEY, CIPHER,
831        ;;               STATE_11, STATE_00, STATE_MID, T1, T2
832        vmovdqu         %%T2, [rsp + TMP %+ j]
833        GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
834                         %%T1,     %%T4,   %%T6,    %%T5, %%T3, not_first
835%endif
836
837%assign i (i+1)
838%assign j (j+1)
839%assign k (k-1)
840%if(%%num_initial_blocks>3)
841        ;;                 GDATA,       HASHKEY, CIPHER,
842        ;;               STATE_11, STATE_00, STATE_MID, T1, T2
843        vmovdqu         %%T2, [rsp + TMP %+ j]
844        GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
845                         %%T1,     %%T4,   %%T6,    %%T5, %%T3, not_first
846%endif
847
848                vmovdqu  %%T_key, [%%GDATA_KEY+16*5]
849                vaesenc  %%XMM1, %%T_key
850                vaesenc  %%XMM2, %%T_key
851                vaesenc  %%XMM3, %%T_key
852                vaesenc  %%XMM4, %%T_key
853                vaesenc  %%XMM5, %%T_key
854                vaesenc  %%XMM6, %%T_key
855                vaesenc  %%XMM7, %%T_key
856                vaesenc  %%XMM8, %%T_key
857
858                vmovdqu  %%T_key, [%%GDATA_KEY+16*6]
859                vaesenc  %%XMM1, %%T_key
860                vaesenc  %%XMM2, %%T_key
861                vaesenc  %%XMM3, %%T_key
862                vaesenc  %%XMM4, %%T_key
863                vaesenc  %%XMM5, %%T_key
864                vaesenc  %%XMM6, %%T_key
865                vaesenc  %%XMM7, %%T_key
866                vaesenc  %%XMM8, %%T_key
867
868%assign i (i+1)
869%assign j (j+1)
870%assign k (k-1)
871%if(%%num_initial_blocks>4)
872        ;;                 GDATA,       HASHKEY, CIPHER,
873        ;;               STATE_11, STATE_00, STATE_MID, T1, T2
874        vmovdqu         %%T2, [rsp + TMP %+ j]
875        GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
876                         %%T1,     %%T4,   %%T6,    %%T5, %%T3, not_first
877%endif
878
879                vmovdqu  %%T_key, [%%GDATA_KEY+16*7]
880                vaesenc  %%XMM1, %%T_key
881                vaesenc  %%XMM2, %%T_key
882                vaesenc  %%XMM3, %%T_key
883                vaesenc  %%XMM4, %%T_key
884                vaesenc  %%XMM5, %%T_key
885                vaesenc  %%XMM6, %%T_key
886                vaesenc  %%XMM7, %%T_key
887                vaesenc  %%XMM8, %%T_key
888
889                vmovdqu  %%T_key, [%%GDATA_KEY+16*8]
890                vaesenc  %%XMM1, %%T_key
891                vaesenc  %%XMM2, %%T_key
892                vaesenc  %%XMM3, %%T_key
893                vaesenc  %%XMM4, %%T_key
894                vaesenc  %%XMM5, %%T_key
895                vaesenc  %%XMM6, %%T_key
896                vaesenc  %%XMM7, %%T_key
897                vaesenc  %%XMM8, %%T_key
898
899%assign i (i+1)
900%assign j (j+1)
901%assign k (k-1)
902%if(%%num_initial_blocks>5)
903        ;;                 GDATA,       HASHKEY, CIPHER,
904        ;;               STATE_11, STATE_00, STATE_MID, T1, T2
905        vmovdqu         %%T2, [rsp + TMP %+ j]
906        GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
907                         %%T1,     %%T4,   %%T6,    %%T5, %%T3, not_first
908%endif
909
910                vmovdqu  %%T_key, [%%GDATA_KEY+16*9]
911                vaesenc  %%XMM1, %%T_key
912                vaesenc  %%XMM2, %%T_key
913                vaesenc  %%XMM3, %%T_key
914                vaesenc  %%XMM4, %%T_key
915                vaesenc  %%XMM5, %%T_key
916                vaesenc  %%XMM6, %%T_key
917                vaesenc  %%XMM7, %%T_key
918                vaesenc  %%XMM8, %%T_key
919
920%ifndef GCM128_MODE
921                vmovdqu  %%T_key, [%%GDATA_KEY+16*10]
922                vaesenc  %%XMM1, %%T_key
923                vaesenc  %%XMM2, %%T_key
924                vaesenc  %%XMM3, %%T_key
925                vaesenc  %%XMM4, %%T_key
926                vaesenc  %%XMM5, %%T_key
927                vaesenc  %%XMM6, %%T_key
928                vaesenc  %%XMM7, %%T_key
929                vaesenc  %%XMM8, %%T_key
930%endif
931
932%assign i (i+1)
933%assign j (j+1)
934%assign k (k-1)
935%if(%%num_initial_blocks>6)
936        ;;                 GDATA,       HASHKEY, CIPHER,
937        ;;               STATE_11, STATE_00, STATE_MID, T1, T2
938        vmovdqu         %%T2, [rsp + TMP %+ j]
939        GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
940                         %%T1,     %%T4,   %%T6,    %%T5, %%T3, not_first
941%endif
942
943%ifdef GCM128_MODE
944                vmovdqu  %%T_key, [%%GDATA_KEY+16*10]
945                vaesenclast  %%XMM1, %%T_key
946                vaesenclast  %%XMM2, %%T_key
947                vaesenclast  %%XMM3, %%T_key
948                vaesenclast  %%XMM4, %%T_key
949                vaesenclast  %%XMM5, %%T_key
950                vaesenclast  %%XMM6, %%T_key
951                vaesenclast  %%XMM7, %%T_key
952                vaesenclast  %%XMM8, %%T_key
953%endif
954
955%ifdef GCM192_MODE
956                vmovdqu  %%T_key, [%%GDATA_KEY+16*11]
957                vaesenc  %%XMM1, %%T_key
958                vaesenc  %%XMM2, %%T_key
959                vaesenc  %%XMM3, %%T_key
960                vaesenc  %%XMM4, %%T_key
961                vaesenc  %%XMM5, %%T_key
962                vaesenc  %%XMM6, %%T_key
963                vaesenc  %%XMM7, %%T_key
964                vaesenc  %%XMM8, %%T_key
965
966                vmovdqu          %%T_key, [%%GDATA_KEY+16*12]
967                vaesenclast      %%XMM1, %%T_key
968                vaesenclast      %%XMM2, %%T_key
969                vaesenclast      %%XMM3, %%T_key
970                vaesenclast      %%XMM4, %%T_key
971                vaesenclast      %%XMM5, %%T_key
972                vaesenclast      %%XMM6, %%T_key
973                vaesenclast      %%XMM7, %%T_key
974                vaesenclast      %%XMM8, %%T_key
975%endif
976%ifdef GCM256_MODE
977                vmovdqu  %%T_key, [%%GDATA_KEY+16*11]
978                vaesenc  %%XMM1, %%T_key
979                vaesenc  %%XMM2, %%T_key
980                vaesenc  %%XMM3, %%T_key
981                vaesenc  %%XMM4, %%T_key
982                vaesenc  %%XMM5, %%T_key
983                vaesenc  %%XMM6, %%T_key
984                vaesenc  %%XMM7, %%T_key
985                vaesenc  %%XMM8, %%T_key
986
987                vmovdqu          %%T_key, [%%GDATA_KEY+16*12]
988                vaesenc  %%XMM1, %%T_key
989                vaesenc  %%XMM2, %%T_key
990                vaesenc  %%XMM3, %%T_key
991                vaesenc  %%XMM4, %%T_key
992                vaesenc  %%XMM5, %%T_key
993                vaesenc  %%XMM6, %%T_key
994                vaesenc  %%XMM7, %%T_key
995                vaesenc  %%XMM8, %%T_key
996%endif
997
998%assign i (i+1)
999%assign j (j+1)
1000%assign k (k-1)
1001%if(%%num_initial_blocks>7)
1002        ;;                 GDATA,       HASHKEY, CIPHER,
1003        ;;               STATE_11, STATE_00, STATE_MID, T1, T2
1004        vmovdqu         %%T2, [rsp + TMP %+ j]
1005        GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
1006                         %%T1,     %%T4,   %%T6,    %%T5, %%T3, not_first
1007%endif
1008
1009%ifdef GCM256_MODE             ; GCM256
1010                vmovdqu  %%T_key, [%%GDATA_KEY+16*13]
1011                vaesenc  %%XMM1, %%T_key
1012                vaesenc  %%XMM2, %%T_key
1013                vaesenc  %%XMM3, %%T_key
1014                vaesenc  %%XMM4, %%T_key
1015                vaesenc  %%XMM5, %%T_key
1016                vaesenc  %%XMM6, %%T_key
1017                vaesenc  %%XMM7, %%T_key
1018                vaesenc  %%XMM8, %%T_key
1019
1020                vmovdqu          %%T_key, [%%GDATA_KEY+16*14]
1021                vaesenclast      %%XMM1, %%T_key
1022                vaesenclast      %%XMM2, %%T_key
1023                vaesenclast      %%XMM3, %%T_key
1024                vaesenclast      %%XMM4, %%T_key
1025                vaesenclast      %%XMM5, %%T_key
1026                vaesenclast      %%XMM6, %%T_key
1027                vaesenclast      %%XMM7, %%T_key
1028                vaesenclast      %%XMM8, %%T_key
1029%endif                          ;  GCM256 mode
1030
1031%if(%%num_initial_blocks>0)
1032        vpsrldq %%T3, %%T6, 8            ; shift-R %%T2 2 DWs
1033        vpslldq %%T6, %%T6, 8            ; shift-L %%T3 2 DWs
1034        vpxor   %%T1, %%T1, %%T3         ; accumulate the results in %%T1:%%T4
1035        vpxor   %%T4, %%T6, %%T4
1036
1037        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1038        ; First phase of the reduction
1039        vmovdqu         %%T3, [rel POLY2]
1040
1041        vpclmulqdq      %%T2, %%T3, %%T4, 0x01
1042        vpslldq         %%T2, %%T2, 8             ; shift-L xmm2 2 DWs
1043
1044        ;; First phase of the reduction complete
1045        vpxor           %%T4, %%T4, %%T2
1046
1047        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1048        ; Second phase of the reduction
1049        vpclmulqdq      %%T2, %%T3, %%T4, 0x00
1050        ;; Shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
1051        vpsrldq         %%T2, %%T2, 4
1052
1053        vpclmulqdq      %%T4, %%T3, %%T4, 0x10
1054        ;; Shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts)
1055        vpslldq         %%T4, %%T4, 4
1056        ;; Second phase of the reduction complete
1057        vpxor           %%T4, %%T4, %%T2
1058        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1059        ; The result is in %%T3
1060        vpxor           %%T3, %%T1, %%T4
1061%else
1062        ;; The hash should end up in T3
1063        vmovdqa  %%T3, %%T2
1064%endif
1065
1066        ;; Final hash is now in T3
1067%if %%num_initial_blocks > 0
1068        ;; NOTE: obsolete in case %%num_initial_blocks = 0
1069        sub     %%LENGTH, 16*%%num_initial_blocks
1070%endif
1071
1072                VXLDR  %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*0]
1073                vpxor    %%XMM1, %%XMM1, %%T1
1074                VXSTR  [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*0], %%XMM1
1075                %ifidn  %%ENC_DEC, DEC
1076                vmovdqa  %%XMM1, %%T1
1077                %endif
1078
1079                VXLDR  %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*1]
1080                vpxor    %%XMM2, %%XMM2, %%T1
1081                VXSTR  [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*1], %%XMM2
1082                %ifidn  %%ENC_DEC, DEC
1083                vmovdqa  %%XMM2, %%T1
1084                %endif
1085
1086                VXLDR  %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*2]
1087                vpxor    %%XMM3, %%XMM3, %%T1
1088                VXSTR  [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*2], %%XMM3
1089                %ifidn  %%ENC_DEC, DEC
1090                vmovdqa  %%XMM3, %%T1
1091                %endif
1092
1093                VXLDR  %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*3]
1094                vpxor    %%XMM4, %%XMM4, %%T1
1095                VXSTR  [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*3], %%XMM4
1096                %ifidn  %%ENC_DEC, DEC
1097                vmovdqa  %%XMM4, %%T1
1098                %endif
1099
1100                VXLDR  %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*4]
1101                vpxor    %%XMM5, %%XMM5, %%T1
1102                VXSTR  [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*4], %%XMM5
1103                %ifidn  %%ENC_DEC, DEC
1104                vmovdqa  %%XMM5, %%T1
1105                %endif
1106
1107                VXLDR  %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*5]
1108                vpxor    %%XMM6, %%XMM6, %%T1
1109                VXSTR  [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*5], %%XMM6
1110                %ifidn  %%ENC_DEC, DEC
1111                vmovdqa  %%XMM6, %%T1
1112                %endif
1113
1114               VXLDR  %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*6]
1115                vpxor    %%XMM7, %%XMM7, %%T1
1116                VXSTR  [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*6], %%XMM7
1117                %ifidn  %%ENC_DEC, DEC
1118                vmovdqa  %%XMM7, %%T1
1119                %endif
1120
1121%if %%num_initial_blocks > 0
1122                ;; NOTE: 'jl' is never taken for %%num_initial_blocks = 0
1123                ;;      This macro is executed for length 128 and up,
1124                ;;      zero length is checked in GCM_ENC_DEC.
1125                ;; If the last block is partial then the xor will be done later
1126                ;; in ENCRYPT_FINAL_PARTIAL_BLOCK.
1127                ;; We know it's partial if LENGTH - 16*num_initial_blocks < 128
1128                cmp %%LENGTH, 128
1129                jl %%_initial_skip_last_word_write
1130%endif
1131                VXLDR  %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*7]
1132                vpxor    %%XMM8, %%XMM8, %%T1
1133                VXSTR  [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*7], %%XMM8
1134                %ifidn  %%ENC_DEC, DEC
1135                vmovdqa  %%XMM8, %%T1
1136                %endif
1137
1138                ;; Update %%LENGTH with the number of blocks processed
1139                sub     %%LENGTH, 16
1140                add     %%DATA_OFFSET, 16
1141%%_initial_skip_last_word_write:
1142                sub     %%LENGTH, 128-16
1143                add     %%DATA_OFFSET, 128-16
1144
1145                vpshufb  %%XMM1, [rel SHUF_MASK]             ; perform a 16Byte swap
1146                ;; Combine GHASHed value with the corresponding ciphertext
1147                vpxor    %%XMM1, %%XMM1, %%T3
1148                vpshufb  %%XMM2, [rel SHUF_MASK]             ; perform a 16Byte swap
1149                vpshufb  %%XMM3, [rel SHUF_MASK]             ; perform a 16Byte swap
1150                vpshufb  %%XMM4, [rel SHUF_MASK]             ; perform a 16Byte swap
1151                vpshufb  %%XMM5, [rel SHUF_MASK]             ; perform a 16Byte swap
1152                vpshufb  %%XMM6, [rel SHUF_MASK]             ; perform a 16Byte swap
1153                vpshufb  %%XMM7, [rel SHUF_MASK]             ; perform a 16Byte swap
1154                vpshufb  %%XMM8, [rel SHUF_MASK]             ; perform a 16Byte swap
1155
1156;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1157
1158%%_initial_blocks_done:
1159
1160
1161%endmacro
1162
1163;;; INITIAL_BLOCKS macro with support for a partial final block.
1164;;; num_initial_blocks is expected to include the partial final block
1165;;;     in the count.
1166%macro INITIAL_BLOCKS_PARTIAL 25
1167%define %%GDATA_KEY             %1
1168%define %%GDATA_CTX             %2
1169%define %%CYPH_PLAIN_OUT        %3
1170%define %%PLAIN_CYPH_IN         %4
1171%define %%LENGTH                %5
1172%define %%DATA_OFFSET           %6
1173%define %%num_initial_blocks    %7  ; can be 1, 2, 3, 4, 5, 6 or 7 (not 0)
1174%define %%T1                    %8
1175%define %%T2                    %9
1176%define %%T3                    %10 ; [out] hash value
1177%define %%T4                    %11
1178%define %%T5                    %12
1179%define %%CTR                   %13
1180%define %%XMM1                  %14
1181%define %%XMM2                  %15
1182%define %%XMM3                  %16
1183%define %%XMM4                  %17
1184%define %%XMM5                  %18
1185%define %%XMM6                  %19
1186%define %%XMM7                  %20
1187%define %%XMM8                  %21 ; [in] hash value
1188%define %%T6                    %22
1189%define %%T_key                 %23
1190%define %%ENC_DEC               %24
1191%define %%INSTANCE_TYPE         %25
1192
1193                ;; Move AAD_HASH to temp reg
1194                vmovdqu  %%T2, %%XMM8
1195
1196%assign i (9-%%num_initial_blocks)
1197%rep %%num_initial_blocks
1198                ;; Compute AES counters
1199                vpaddd   %%CTR, %%CTR, [rel ONE]     ; INCR Y0
1200                vmovdqa  reg(i), %%CTR
1201                vpshufb  reg(i), [rel SHUF_MASK]     ; perform a 16Byte swap
1202%assign i (i+1)
1203%endrep
1204
1205vmovdqu  %%T_key, [%%GDATA_KEY+16*0]
1206%assign i (9-%%num_initial_blocks)
1207%rep %%num_initial_blocks
1208                ; Start AES for %%num_initial_blocks blocks
1209                vpxor    reg(i),reg(i),%%T_key
1210%assign i (i+1)
1211%endrep
1212
1213%assign j 1
1214%rep NROUNDS
1215vmovdqu  %%T_key, [%%GDATA_KEY+16*j]
1216%assign i (9-%%num_initial_blocks)
1217%rep %%num_initial_blocks
1218                vaesenc  reg(i),%%T_key
1219%assign i (i+1)
1220%endrep
1221
1222%assign j (j+1)
1223%endrep
1224
1225
1226vmovdqu  %%T_key, [%%GDATA_KEY+16*j]
1227%assign i (9-%%num_initial_blocks)
1228%rep %%num_initial_blocks
1229                vaesenclast      reg(i),%%T_key
1230%assign i (i+1)
1231%endrep
1232
1233;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1234;;; Hash all but the last block of data
1235;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1236
1237%assign i (9-%%num_initial_blocks)
1238%rep %%num_initial_blocks-1
1239                ;; Encrypt the message for all but the last block
1240                VXLDR  %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
1241                vpxor    reg(i), reg(i), %%T1
1242                ;; write back ciphertext for %%num_initial_blocks blocks
1243                VXSTR  [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i)
1244                add     %%DATA_OFFSET, 16
1245%ifidn  %%ENC_DEC, DEC
1246                vmovdqa  reg(i), %%T1
1247%endif
1248                ;; Prepare ciphertext for GHASH computations
1249                vpshufb  reg(i), [rel SHUF_MASK]
1250%assign i (i+1)
1251%endrep
1252
1253%if %%num_initial_blocks > 1
1254                ;; The final block of data may be <16B
1255                sub      %%LENGTH, 16*(%%num_initial_blocks-1)
1256%endif
1257
1258%if %%num_initial_blocks < 8
1259                ;; NOTE: the 'jl' is always taken for num_initial_blocks = 8.
1260                ;;      This is run in the context of GCM_ENC_DEC_SMALL for length < 128.
1261                cmp      %%LENGTH, 16
1262                jl       %%_small_initial_partial_block
1263
1264;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1265;;; Handle a full length final block - encrypt and hash all blocks
1266;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1267
1268                sub      %%LENGTH, 16
1269	        mov	[%%GDATA_CTX + PBlockLen], %%LENGTH
1270
1271                ;; Encrypt the message
1272                VXLDR  %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
1273                vpxor    reg(i), reg(i), %%T1
1274                ;; write back ciphertext for %%num_initial_blocks blocks
1275                VXSTR  [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i)
1276                add     %%DATA_OFFSET, 16
1277%ifidn  %%ENC_DEC, DEC
1278                vmovdqa  reg(i), %%T1
1279%endif
1280                ;; Prepare ciphertext for GHASH computations
1281                vpshufb  reg(i), [rel SHUF_MASK]
1282
1283        ;; Hash all of the data
1284%assign i (8-%%num_initial_blocks)
1285%assign j (9-%%num_initial_blocks)
1286%assign k (%%num_initial_blocks)
1287%assign last_block_to_hash 0
1288
1289%if(%%num_initial_blocks>last_block_to_hash)
1290        ;; Hash in AES state
1291        vpxor %%T2, reg(j)
1292
1293        ;; T2 - incoming AAD hash
1294        ;; reg(i) holds ciphertext
1295        ;; T5 - hash key
1296        ;; T6 - updated xor
1297        ;; reg(1)/xmm1 should now be available for tmp use
1298        vmovdqu         %%T5, [%%GDATA_KEY + HashKey_ %+ k]
1299        vpclmulqdq      %%T1, %%T2, %%T5, 0x11             ; %%T4 = a1*b1
1300        vpclmulqdq      %%T4, %%T2, %%T5, 0x00             ; %%T4 = a0*b0
1301        vpclmulqdq      %%T6, %%T2, %%T5, 0x01             ; %%T6 = a1*b0
1302        vpclmulqdq      %%T5, %%T2, %%T5, 0x10             ; %%T5 = a0*b1
1303        vpxor           %%T6, %%T6, %%T5
1304%endif
1305
1306%assign i (i+1)
1307%assign j (j+1)
1308%assign k (k-1)
1309%assign rep_count (%%num_initial_blocks-1)
1310%rep rep_count
1311
1312        vmovdqu         %%T5, [%%GDATA_KEY + HashKey_ %+ k]
1313        vpclmulqdq      %%T3, reg(j), %%T5, 0x11
1314        vpxor           %%T1, %%T1, %%T3
1315
1316        vpclmulqdq      %%T3, reg(j), %%T5, 0x00
1317        vpxor           %%T4, %%T4, %%T3
1318
1319        vpclmulqdq      %%T3, reg(j), %%T5, 0x01
1320        vpxor           %%T6, %%T6, %%T3
1321
1322        vpclmulqdq      %%T3, reg(j), %%T5, 0x10
1323        vpxor           %%T6, %%T6, %%T3
1324
1325%assign i (i+1)
1326%assign j (j+1)
1327%assign k (k-1)
1328%endrep
1329
1330        ;; Record that a reduction is needed
1331        mov      r12, 1
1332
1333        jmp      %%_small_initial_compute_hash
1334
1335
1336%endif                          ; %if %%num_initial_blocks < 8
1337
1338%%_small_initial_partial_block:
1339
1340;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1341;;; Handle ghash for a <16B final block
1342;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1343
        ;; In this case, for a single call to encrypt we can hash all
        ;; of the data. For an init / update / finalize series of calls
        ;; we must leave the last block out of the hash if it is less
        ;; than a full block, since it may be completed by a later update.
1348
        mov     [%%GDATA_CTX + PBlockLen], %%LENGTH
1350        vmovdqu [%%GDATA_CTX + PBlockEncKey], reg(i)
1351        ;; Handle a partial final block
        ;;   args: KEY, T1, CYPH_PLAIN_OUT, PLAIN_CYPH_IN,
        ;;         PLAIN_CYPH_LEN, ENC_DEC, DATA_OFFSET
        ;; r13  - length
        ;; LT16 - tells ENCRYPT_FINAL_PARTIAL_BLOCK that the buffer is
        ;;      known to be less than 16 bytes long
        ;;      NOTE: LT16 could be replaced with %%LENGTH, but at this
        ;;      point %%LENGTH is always less than 16 and no
        ;;      PLAIN_CYPH_LEN argument is available in this macro.
1358        ENCRYPT_FINAL_PARTIAL_BLOCK reg(i), %%T1, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, LT16, %%ENC_DEC, %%DATA_OFFSET
1359        vpshufb  reg(i), [rel SHUF_MASK]
1360
1361%ifidn %%INSTANCE_TYPE, multi_call
1362%assign i (8-%%num_initial_blocks)
1363%assign j (9-%%num_initial_blocks)
1364%assign k (%%num_initial_blocks-1)
1365%assign last_block_to_hash 1
1366%else
1367%assign i (8-%%num_initial_blocks)
1368%assign j (9-%%num_initial_blocks)
1369%assign k (%%num_initial_blocks)
1370%assign last_block_to_hash 0
1371%endif
1372
1373%if(%%num_initial_blocks>last_block_to_hash)
1374        ;; Record that a reduction is needed
1375        mov            r12, 1
1376        ;; Hash in AES state
1377        vpxor          %%T2, reg(j)
1378
1379        ;; T2 - incoming AAD hash
1380        ;; reg(i) holds ciphertext
1381        ;; T5 - hash key
1382        ;; T6 - updated xor
1383        ;; reg(1)/xmm1 should now be available for tmp use
1384        vmovdqu         %%T5, [%%GDATA_KEY + HashKey_ %+ k]
        vpclmulqdq      %%T1, %%T2, %%T5, 0x11             ; %%T1 = a1*b1
1386        vpclmulqdq      %%T4, %%T2, %%T5, 0x00             ; %%T4 = a0*b0
1387        vpclmulqdq      %%T6, %%T2, %%T5, 0x01             ; %%T6 = a1*b0
1388        vpclmulqdq      %%T5, %%T2, %%T5, 0x10             ; %%T5 = a0*b1
1389        vpxor           %%T6, %%T6, %%T5
1390%else
1391        ;; Record that a reduction is not needed -
1392        ;; In this case no hashes are computed because there
1393        ;; is only one initial block and it is < 16B in length.
1394        xor             r12, r12
1395%endif
1396
1397%assign i (i+1)
1398%assign j (j+1)
1399%assign k (k-1)
1400%ifidn %%INSTANCE_TYPE, multi_call
1401%assign rep_count (%%num_initial_blocks-2)
1402%%_multi_call_hash:
1403%else
1404%assign rep_count (%%num_initial_blocks-1)
1405%endif
1406
%if rep_count < 0
        ;; clamp rep_count at zero; it goes negative when there are too
        ;; few initial blocks to hash in this pass
%assign rep_count 0
%endif
1411
1412%rep rep_count
1413
1414        vmovdqu         %%T5, [%%GDATA_KEY + HashKey_ %+ k]
1415        vpclmulqdq      %%T3, reg(j), %%T5, 0x11
1416        vpxor           %%T1, %%T1, %%T3
1417
1418        vpclmulqdq      %%T3, reg(j), %%T5, 0x00
1419        vpxor           %%T4, %%T4, %%T3
1420
1421        vpclmulqdq      %%T3, reg(j), %%T5, 0x01
1422        vpxor           %%T6, %%T6, %%T3
1423
1424        vpclmulqdq      %%T3, reg(j), %%T5, 0x10
1425        vpxor           %%T6, %%T6, %%T3
1426
1427%assign i (i+1)
1428%assign j (j+1)
1429%assign k (k-1)
1430%endrep
1431
1432%%_small_initial_compute_hash:
1433
1434;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1435;;; Ghash reduction
1436;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1437
1438%if(%%num_initial_blocks=1)
1439%ifidn %%INSTANCE_TYPE, multi_call
1440        ;; We only need to check if a reduction is needed if
1441        ;; initial_blocks == 1 and init/update/final is being used.
1442        ;; In this case we may just have a partial block, and that
1443        ;; gets hashed in finalize.
        or      r12, r12        ; same effect as 'cmp r12, 0'
1446        je      %%_no_reduction_needed
1447%endif
1448%endif
1449
        vpsrldq %%T3, %%T6, 8          ; shift-R %%T6 2 DWs (into %%T3)
        vpslldq %%T6, %%T6, 8          ; shift-L %%T6 2 DWs
1452        vpxor   %%T1, %%T1, %%T3       ; accumulate the results in %%T1:%%T4
1453        vpxor   %%T4, %%T6, %%T4
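        ;; The 256-bit product in %%T1:%%T4 must now be reduced modulo
        ;; the GHASH polynomial x^128 + x^7 + x^2 + x + 1. As a sketch
        ;; of the two-phase scheme below (POLY2 holds the pre-computed
        ;; reduction constant): phase 1 folds the lower half of the
        ;; product onto itself, phase 2 folds the remainder into the
        ;; final 128 bits, so two pclmulqdq/shift pairs replace a full
        ;; polynomial division.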
1454
1455        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1456        ;; First phase of the reduction
1457        vmovdqu         %%T3, [rel POLY2]
1458
        vpclmulqdq      %%T2, %%T3, %%T4, 0x01
        ;; shift-L %%T2 2 DWs
        vpslldq         %%T2, %%T2, 8
1462        vpxor           %%T4, %%T4, %%T2
1463
1464        ;; First phase of the reduction complete
1465        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1466        ;; Second phase of the reduction
1467
1468        vpclmulqdq      %%T2, %%T3, %%T4, 0x00
        ;; Shift-R %%T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
1470        vpsrldq         %%T2, %%T2, 4
1471
1472        vpclmulqdq      %%T4, %%T3, %%T4, 0x10
        ;; Shift-L %%T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
1474        vpslldq         %%T4, %%T4, 4
1475
1476        vpxor           %%T4, %%T4, %%T2
1477        ;; Second phase of the reduction complete
1478        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1479        vpxor           %%T3, %%T1, %%T4
1480
1481%ifidn %%INSTANCE_TYPE, multi_call
1482        ;; If using init/update/finalize, we need to xor any partial block data
1483        ;; into the hash.
1484%if %%num_initial_blocks > 1
1485        ;; NOTE: for %%num_initial_blocks = 0 the xor never takes place
1486%if %%num_initial_blocks != 8
        ;; NOTE: for %%num_initial_blocks = 8, %%LENGTH (stored in
        ;;      [PBlockLen]) is never zero
1488        cmp             qword [%%GDATA_CTX + PBlockLen], 0
1489        je              %%_no_partial_block_xor
1490%endif                          ; %%num_initial_blocks != 8
1491        vpxor           %%T3, %%T3, reg(8)
1492%%_no_partial_block_xor:
1493%endif                          ; %%num_initial_blocks > 1
1494%endif                          ; %%INSTANCE_TYPE, multi_call
1495
1496%if(%%num_initial_blocks=1)
1497%ifidn %%INSTANCE_TYPE, multi_call
        ;; NOTE: %%_no_reduction_needed case only valid for
        ;;      multi_call with initial_blocks = 1.
        ;;      See the comment above around '_no_reduction_needed'.
        ;;      The jmp below skips over %%_no_reduction_needed, which
        ;;      would otherwise overwrite the reduced hash in %%T3.
1502
1503        ;; The result is in %%T3
1504        jmp             %%_after_reduction
1505
1506%%_no_reduction_needed:
1507        ;; The hash should end up in T3. The only way we should get here is if
1508        ;; there is a partial block of data, so xor that into the hash.
1509        vpxor            %%T3, %%T2, reg(8)
1510%endif                          ; %%INSTANCE_TYPE = multi_call
1511%endif                          ; %%num_initial_blocks=1
1512
1513%%_after_reduction:
1514        ;; Final hash is now in T3
1515
1516%endmacro                       ; INITIAL_BLOCKS_PARTIAL
1517
1518
1519
1520; encrypt 8 blocks at a time
1521; ghash the 8 previously encrypted ciphertext blocks
1522; %%GDATA (KEY), %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN are used as pointers only, not modified
1523; %%DATA_OFFSET is the data offset value
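; NOTE: throughout this macro the AES rounds (indented an extra level)
; are interleaved with the pclmulqdq/vpxor GHASH work on the previous
; 8 ciphertext blocks, so the AES and carry-less multiply units run in
; parallel and their instruction latencies overlap.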
1524%macro  GHASH_8_ENCRYPT_8_PARALLEL 23
1525%define %%GDATA                 %1
1526%define %%CYPH_PLAIN_OUT        %2
1527%define %%PLAIN_CYPH_IN         %3
1528%define %%DATA_OFFSET           %4
1529%define %%T1    %5
1530%define %%T2    %6
1531%define %%T3    %7
1532%define %%T4    %8
1533%define %%T5    %9
1534%define %%T6    %10
1535%define %%CTR   %11
1536%define %%XMM1  %12
1537%define %%XMM2  %13
1538%define %%XMM3  %14
1539%define %%XMM4  %15
1540%define %%XMM5  %16
1541%define %%XMM6  %17
1542%define %%XMM7  %18
1543%define %%XMM8  %19
1544%define %%T7    %20
1545%define %%loop_idx      %21
1546%define %%ENC_DEC       %22
1547%define %%FULL_PARTIAL  %23
1548
        vmovdqa %%T2, %%XMM1            ; keep ciphertext block 1 in %%T2 (no stack spill needed)
1550        vmovdqu [rsp + TMP2], %%XMM2
1551        vmovdqu [rsp + TMP3], %%XMM3
1552        vmovdqu [rsp + TMP4], %%XMM4
1553        vmovdqu [rsp + TMP5], %%XMM5
1554        vmovdqu [rsp + TMP6], %%XMM6
1555        vmovdqu [rsp + TMP7], %%XMM7
1556        vmovdqu [rsp + TMP8], %%XMM8
1557
1558%ifidn %%loop_idx, in_order
1559                vpaddd  %%XMM1, %%CTR,  [rel ONE]           ; INCR CNT
1560                vmovdqu %%T5, [rel TWO]
1561                vpaddd  %%XMM2, %%CTR, %%T5
1562                vpaddd  %%XMM3, %%XMM1, %%T5
1563                vpaddd  %%XMM4, %%XMM2, %%T5
1564                vpaddd  %%XMM5, %%XMM3, %%T5
1565                vpaddd  %%XMM6, %%XMM4, %%T5
1566                vpaddd  %%XMM7, %%XMM5, %%T5
1567                vpaddd  %%XMM8, %%XMM6, %%T5
1568                vmovdqa %%CTR, %%XMM8
1569
1570                vmovdqu %%T5, [rel SHUF_MASK]
1571                vpshufb %%XMM1, %%T5             ; perform a 16Byte swap
1572                vpshufb %%XMM2, %%T5             ; perform a 16Byte swap
1573                vpshufb %%XMM3, %%T5             ; perform a 16Byte swap
1574                vpshufb %%XMM4, %%T5             ; perform a 16Byte swap
1575                vpshufb %%XMM5, %%T5             ; perform a 16Byte swap
1576                vpshufb %%XMM6, %%T5             ; perform a 16Byte swap
1577                vpshufb %%XMM7, %%T5             ; perform a 16Byte swap
1578                vpshufb %%XMM8, %%T5             ; perform a 16Byte swap
1579%else
1580                vpaddd  %%XMM1, %%CTR,  [rel ONEf]          ; INCR CNT
1581                vmovdqu %%T5, [rel TWOf]
1582                vpaddd  %%XMM2, %%CTR,  %%T5
1583                vpaddd  %%XMM3, %%XMM1, %%T5
1584                vpaddd  %%XMM4, %%XMM2, %%T5
1585                vpaddd  %%XMM5, %%XMM3, %%T5
1586                vpaddd  %%XMM6, %%XMM4, %%T5
1587                vpaddd  %%XMM7, %%XMM5, %%T5
1588                vpaddd  %%XMM8, %%XMM6, %%T5
1589                vmovdqa %%CTR, %%XMM8
1590%endif
1591
1592
1593
1594        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1595
1596                vmovdqu %%T1, [%%GDATA + 16*0]
1597                vpxor   %%XMM1, %%XMM1, %%T1
1598                vpxor   %%XMM2, %%XMM2, %%T1
1599                vpxor   %%XMM3, %%XMM3, %%T1
1600                vpxor   %%XMM4, %%XMM4, %%T1
1601                vpxor   %%XMM5, %%XMM5, %%T1
1602                vpxor   %%XMM6, %%XMM6, %%T1
1603                vpxor   %%XMM7, %%XMM7, %%T1
1604                vpxor   %%XMM8, %%XMM8, %%T1
1605
1606        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1607
1608                vmovdqu %%T1, [%%GDATA + 16*1]
1609                vaesenc %%XMM1, %%T1
1610                vaesenc %%XMM2, %%T1
1611                vaesenc %%XMM3, %%T1
1612                vaesenc %%XMM4, %%T1
1613                vaesenc %%XMM5, %%T1
1614                vaesenc %%XMM6, %%T1
1615                vaesenc %%XMM7, %%T1
1616                vaesenc %%XMM8, %%T1
1617
1618
1619                vmovdqu %%T1, [%%GDATA + 16*2]
1620                vaesenc %%XMM1, %%T1
1621                vaesenc %%XMM2, %%T1
1622                vaesenc %%XMM3, %%T1
1623                vaesenc %%XMM4, %%T1
1624                vaesenc %%XMM5, %%T1
1625                vaesenc %%XMM6, %%T1
1626                vaesenc %%XMM7, %%T1
1627                vaesenc %%XMM8, %%T1
1628
1629        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1630
1631        vmovdqu         %%T5, [%%GDATA + HashKey_8]
1632        vpclmulqdq      %%T4, %%T2, %%T5, 0x11                  ; %%T4 = a1*b1
1633        vpclmulqdq      %%T7, %%T2, %%T5, 0x00                  ; %%T7 = a0*b0
1634        vpclmulqdq      %%T6, %%T2, %%T5, 0x01                  ; %%T6 = a1*b0
1635        vpclmulqdq      %%T5, %%T2, %%T5, 0x10                  ; %%T5 = a0*b1
1636        vpxor           %%T6, %%T6, %%T5
1637
1638                vmovdqu %%T1, [%%GDATA + 16*3]
1639                vaesenc %%XMM1, %%T1
1640                vaesenc %%XMM2, %%T1
1641                vaesenc %%XMM3, %%T1
1642                vaesenc %%XMM4, %%T1
1643                vaesenc %%XMM5, %%T1
1644                vaesenc %%XMM6, %%T1
1645                vaesenc %%XMM7, %%T1
1646                vaesenc %%XMM8, %%T1
1647
1648        vmovdqu         %%T1, [rsp + TMP2]
1649        vmovdqu         %%T5, [%%GDATA + HashKey_7]
1650        vpclmulqdq      %%T3, %%T1, %%T5, 0x11
1651        vpxor           %%T4, %%T4, %%T3
1652
1653        vpclmulqdq      %%T3, %%T1, %%T5, 0x00
1654        vpxor           %%T7, %%T7, %%T3
1655
1656        vpclmulqdq      %%T3, %%T1, %%T5, 0x01
1657        vpxor           %%T6, %%T6, %%T3
1658
1659        vpclmulqdq      %%T3, %%T1, %%T5, 0x10
1660        vpxor           %%T6, %%T6, %%T3
1661
1662                vmovdqu %%T1, [%%GDATA + 16*4]
1663                vaesenc %%XMM1, %%T1
1664                vaesenc %%XMM2, %%T1
1665                vaesenc %%XMM3, %%T1
1666                vaesenc %%XMM4, %%T1
1667                vaesenc %%XMM5, %%T1
1668                vaesenc %%XMM6, %%T1
1669                vaesenc %%XMM7, %%T1
1670                vaesenc %%XMM8, %%T1
1671
1672        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1673        vmovdqu         %%T1, [rsp + TMP3]
1674        vmovdqu         %%T5, [%%GDATA + HashKey_6]
1675        vpclmulqdq      %%T3, %%T1, %%T5, 0x11
1676        vpxor           %%T4, %%T4, %%T3
1677
1678        vpclmulqdq      %%T3, %%T1, %%T5, 0x00
1679        vpxor           %%T7, %%T7, %%T3
1680
1681        vpclmulqdq      %%T3, %%T1, %%T5, 0x01
1682        vpxor           %%T6, %%T6, %%T3
1683
1684        vpclmulqdq      %%T3, %%T1, %%T5, 0x10
1685        vpxor           %%T6, %%T6, %%T3
1686
1687                vmovdqu %%T1, [%%GDATA + 16*5]
1688                vaesenc %%XMM1, %%T1
1689                vaesenc %%XMM2, %%T1
1690                vaesenc %%XMM3, %%T1
1691                vaesenc %%XMM4, %%T1
1692                vaesenc %%XMM5, %%T1
1693                vaesenc %%XMM6, %%T1
1694                vaesenc %%XMM7, %%T1
1695                vaesenc %%XMM8, %%T1
1696
1697
1698        vmovdqu         %%T1, [rsp + TMP4]
1699        vmovdqu         %%T5, [%%GDATA + HashKey_5]
1700        vpclmulqdq      %%T3, %%T1, %%T5, 0x11
1701        vpxor           %%T4, %%T4, %%T3
1702
1703        vpclmulqdq      %%T3, %%T1, %%T5, 0x00
1704        vpxor           %%T7, %%T7, %%T3
1705
1706        vpclmulqdq      %%T3, %%T1, %%T5, 0x01
1707        vpxor           %%T6, %%T6, %%T3
1708
1709        vpclmulqdq      %%T3, %%T1, %%T5, 0x10
1710        vpxor           %%T6, %%T6, %%T3
1711
1712                vmovdqu %%T1, [%%GDATA + 16*6]
1713                vaesenc %%XMM1, %%T1
1714                vaesenc %%XMM2, %%T1
1715                vaesenc %%XMM3, %%T1
1716                vaesenc %%XMM4, %%T1
1717                vaesenc %%XMM5, %%T1
1718                vaesenc %%XMM6, %%T1
1719                vaesenc %%XMM7, %%T1
1720                vaesenc %%XMM8, %%T1
1721
1722        vmovdqu         %%T1, [rsp + TMP5]
1723        vmovdqu         %%T5, [%%GDATA + HashKey_4]
1724        vpclmulqdq      %%T3, %%T1, %%T5, 0x11
1725        vpxor           %%T4, %%T4, %%T3
1726
1727        vpclmulqdq      %%T3, %%T1, %%T5, 0x00
1728        vpxor           %%T7, %%T7, %%T3
1729
1730        vpclmulqdq      %%T3, %%T1, %%T5, 0x01
1731        vpxor           %%T6, %%T6, %%T3
1732
1733        vpclmulqdq      %%T3, %%T1, %%T5, 0x10
1734        vpxor           %%T6, %%T6, %%T3
1735
1736                vmovdqu %%T1, [%%GDATA + 16*7]
1737                vaesenc %%XMM1, %%T1
1738                vaesenc %%XMM2, %%T1
1739                vaesenc %%XMM3, %%T1
1740                vaesenc %%XMM4, %%T1
1741                vaesenc %%XMM5, %%T1
1742                vaesenc %%XMM6, %%T1
1743                vaesenc %%XMM7, %%T1
1744                vaesenc %%XMM8, %%T1
1745
1746        vmovdqu         %%T1, [rsp + TMP6]
1747        vmovdqu         %%T5, [%%GDATA + HashKey_3]
1748        vpclmulqdq      %%T3, %%T1, %%T5, 0x11
1749        vpxor           %%T4, %%T4, %%T3
1750
1751        vpclmulqdq      %%T3, %%T1, %%T5, 0x00
1752        vpxor           %%T7, %%T7, %%T3
1753
1754        vpclmulqdq      %%T3, %%T1, %%T5, 0x01
1755        vpxor           %%T6, %%T6, %%T3
1756
1757        vpclmulqdq      %%T3, %%T1, %%T5, 0x10
1758        vpxor           %%T6, %%T6, %%T3
1759
1760                vmovdqu %%T1, [%%GDATA + 16*8]
1761                vaesenc %%XMM1, %%T1
1762                vaesenc %%XMM2, %%T1
1763                vaesenc %%XMM3, %%T1
1764                vaesenc %%XMM4, %%T1
1765                vaesenc %%XMM5, %%T1
1766                vaesenc %%XMM6, %%T1
1767                vaesenc %%XMM7, %%T1
1768                vaesenc %%XMM8, %%T1
1769
1770        vmovdqu         %%T1, [rsp + TMP7]
1771        vmovdqu         %%T5, [%%GDATA + HashKey_2]
1772        vpclmulqdq      %%T3, %%T1, %%T5, 0x11
1773        vpxor           %%T4, %%T4, %%T3
1774
1775        vpclmulqdq      %%T3, %%T1, %%T5, 0x00
1776        vpxor           %%T7, %%T7, %%T3
1777
1778        vpclmulqdq      %%T3, %%T1, %%T5, 0x01
1779        vpxor           %%T6, %%T6, %%T3
1780
1781        vpclmulqdq      %%T3, %%T1, %%T5, 0x10
1782        vpxor           %%T6, %%T6, %%T3
1783
1784        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1785
1786                vmovdqu %%T5, [%%GDATA + 16*9]
1787                vaesenc %%XMM1, %%T5
1788                vaesenc %%XMM2, %%T5
1789                vaesenc %%XMM3, %%T5
1790                vaesenc %%XMM4, %%T5
1791                vaesenc %%XMM5, %%T5
1792                vaesenc %%XMM6, %%T5
1793                vaesenc %%XMM7, %%T5
1794                vaesenc %%XMM8, %%T5
1795
1796        vmovdqu         %%T1, [rsp + TMP8]
1797        vmovdqu         %%T5, [%%GDATA + HashKey]
1798
1799
1800        vpclmulqdq      %%T3, %%T1, %%T5, 0x00
1801        vpxor           %%T7, %%T7, %%T3
1802
1803        vpclmulqdq      %%T3, %%T1, %%T5, 0x01
1804        vpxor           %%T6, %%T6, %%T3
1805
1806        vpclmulqdq      %%T3, %%T1, %%T5, 0x10
1807        vpxor           %%T6, %%T6, %%T3
1808
1809        vpclmulqdq      %%T3, %%T1, %%T5, 0x11
1810        vpxor           %%T1, %%T4, %%T3
1811
1812
1813                vmovdqu %%T5, [%%GDATA + 16*10]
1814 %ifndef GCM128_MODE            ; GCM192 or GCM256
1815                vaesenc %%XMM1, %%T5
1816                vaesenc %%XMM2, %%T5
1817                vaesenc %%XMM3, %%T5
1818                vaesenc %%XMM4, %%T5
1819                vaesenc %%XMM5, %%T5
1820                vaesenc %%XMM6, %%T5
1821                vaesenc %%XMM7, %%T5
1822                vaesenc %%XMM8, %%T5
1823
1824                vmovdqu %%T5, [%%GDATA + 16*11]
1825                vaesenc %%XMM1, %%T5
1826                vaesenc %%XMM2, %%T5
1827                vaesenc %%XMM3, %%T5
1828                vaesenc %%XMM4, %%T5
1829                vaesenc %%XMM5, %%T5
1830                vaesenc %%XMM6, %%T5
1831                vaesenc %%XMM7, %%T5
1832                vaesenc %%XMM8, %%T5
1833
1834                vmovdqu %%T5, [%%GDATA + 16*12]
1835%endif
1836%ifdef GCM256_MODE
1837                vaesenc %%XMM1, %%T5
1838                vaesenc %%XMM2, %%T5
1839                vaesenc %%XMM3, %%T5
1840                vaesenc %%XMM4, %%T5
1841                vaesenc %%XMM5, %%T5
1842                vaesenc %%XMM6, %%T5
1843                vaesenc %%XMM7, %%T5
1844                vaesenc %%XMM8, %%T5
1845
1846                vmovdqu %%T5, [%%GDATA + 16*13]
1847                vaesenc %%XMM1, %%T5
1848                vaesenc %%XMM2, %%T5
1849                vaesenc %%XMM3, %%T5
1850                vaesenc %%XMM4, %%T5
1851                vaesenc %%XMM5, %%T5
1852                vaesenc %%XMM6, %%T5
1853                vaesenc %%XMM7, %%T5
1854                vaesenc %%XMM8, %%T5
1855
1856                vmovdqu %%T5, [%%GDATA + 16*14]
1857%endif                          ; GCM256
1858
1859%assign i 0
1860%assign j 1
1861%rep 8
1862
        ;; SNP TBD: this is pretty ugly - consider whether just XORing
        ;; the data in after vaesenclast is simpler and as performant.
        ;; That change would also have to ripple through the partial
        ;; block and ghash_mul_8 handling.
1866%ifidn %%FULL_PARTIAL, full
1867    %ifdef  NT_LD
1868        VXLDR   %%T2, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
1869        vpxor   %%T2, %%T2, %%T5
1870    %else
1871        vpxor   %%T2, %%T5, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
1872    %endif
1873
1874    %ifidn %%ENC_DEC, ENC
1875        vaesenclast     reg(j), reg(j), %%T2
1876    %else
1877        vaesenclast     %%T3, reg(j), %%T2
        vpxor   reg(j), %%T2, %%T5      ; recover the ciphertext block for GHASH
1879        VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*i], %%T3
1880    %endif
1881
1882%else
1883    ; Don't read the final data during partial block processing
1884    %ifdef  NT_LD
1885        %if (i<7)
1886            VXLDR   %%T2, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
1887            vpxor   %%T2, %%T2, %%T5
1888        %else
            ;; Stage the key directly in T2 rather than XOR it with
            ;; plaintext that cannot be safely read
1890            vmovdqu %%T2, %%T5
1891        %endif
1892    %else
1893        %if (i<7)
1894            vpxor   %%T2, %%T5, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
1895        %else
            ;; Stage the key directly in T2 rather than XOR it with
            ;; plaintext that cannot be safely read
1897            vmovdqu %%T2, %%T5
1898        %endif
1899    %endif
1900
1901    %ifidn %%ENC_DEC, ENC
1902        vaesenclast     reg(j), reg(j), %%T2
1903    %else
1904        %if (i<7)
1905            vaesenclast     %%T3, reg(j), %%T2
            vpxor   reg(j), %%T2, %%T5  ; recover the ciphertext block for GHASH
1907            ;; Do not read the data since it could fault
1908            VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*i], %%T3
1909        %else
1910            vaesenclast     reg(j), reg(j), %%T2
1911        %endif
1912    %endif
1913%endif
1914
1915%assign i (i+1)
1916%assign j (j+1)
1917%endrep
1918
1919
1920;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1921
1922
        vpslldq %%T3, %%T6, 8                                   ; shift-L %%T6 2 DWs (into %%T3)
        vpsrldq %%T6, %%T6, 8                                   ; shift-R %%T6 2 DWs
1925        vpxor   %%T7, %%T7, %%T3
1926        vpxor   %%T1, %%T1, %%T6                                ; accumulate the results in %%T1:%%T7
1927
1928
1929
1930        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1931        ;first phase of the reduction
1932        vmovdqu         %%T3, [rel POLY2]
1933
1934        vpclmulqdq      %%T2, %%T3, %%T7, 0x01
        vpslldq         %%T2, %%T2, 8                           ; shift-L %%T2 2 DWs
1936
1937        vpxor           %%T7, %%T7, %%T2                        ; first phase of the reduction complete
1938        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1939
1940    %ifidn %%ENC_DEC, ENC
1941        ; Write to the Ciphertext buffer
1942        VXSTR   [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*0], %%XMM1
1943        VXSTR   [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*1], %%XMM2
1944        VXSTR   [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*2], %%XMM3
1945        VXSTR   [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*3], %%XMM4
1946        VXSTR   [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*4], %%XMM5
1947        VXSTR   [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*5], %%XMM6
1948        VXSTR   [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*6], %%XMM7
1949        %ifidn %%FULL_PARTIAL, full
1950            ;; Avoid writing past the buffer if handling a partial block
1951            VXSTR   [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*7], %%XMM8
1952        %endif
1953    %endif
1954
1955
1956;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1957        ;second phase of the reduction
1958        vpclmulqdq      %%T2, %%T3, %%T7, 0x00
        vpsrldq         %%T2, %%T2, 4                                   ; shift-R %%T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
1960
1961        vpclmulqdq      %%T4, %%T3, %%T7, 0x10
        vpslldq         %%T4, %%T4, 4                                   ; shift-L %%T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
1963
1964        vpxor           %%T4, %%T4, %%T2                                ; second phase of the reduction complete
1965        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1966        vpxor           %%T1, %%T1, %%T4                                ; the result is in %%T1
1967
1968                vpshufb %%XMM1, [rel SHUF_MASK]             ; perform a 16Byte swap
1969                vpshufb %%XMM2, [rel SHUF_MASK]             ; perform a 16Byte swap
1970                vpshufb %%XMM3, [rel SHUF_MASK]             ; perform a 16Byte swap
1971                vpshufb %%XMM4, [rel SHUF_MASK]             ; perform a 16Byte swap
1972                vpshufb %%XMM5, [rel SHUF_MASK]             ; perform a 16Byte swap
1973                vpshufb %%XMM6, [rel SHUF_MASK]             ; perform a 16Byte swap
1974                vpshufb %%XMM7, [rel SHUF_MASK]             ; perform a 16Byte swap
                vpshufb %%XMM8, [rel SHUF_MASK]             ; perform a 16Byte swap
1976
1977
        vpxor   %%XMM1, %%T1            ; fold the GHASH result into block 1 for the next pass
1979
1980
1981%endmacro                       ; GHASH_8_ENCRYPT_8_PARALLEL
1982
1983
; GHASH the last 8 ciphertext blocks.
1985%macro  GHASH_LAST_8 16
1986%define %%GDATA %1
1987%define %%T1    %2
1988%define %%T2    %3
1989%define %%T3    %4
1990%define %%T4    %5
1991%define %%T5    %6
1992%define %%T6    %7
1993%define %%T7    %8
1994%define %%XMM1  %9
1995%define %%XMM2  %10
1996%define %%XMM3  %11
1997%define %%XMM4  %12
1998%define %%XMM5  %13
1999%define %%XMM6  %14
2000%define %%XMM7  %15
2001%define %%XMM8  %16
2002
2003        ;; Karatsuba Method
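        ;; A sketch of the method: for a = a1:a0 and b = b1:b0
        ;; (64-bit halves) the middle term of the product satisfies
        ;;   a1*b0 ^ a0*b1 = (a1^a0)*(b1^b0) ^ a1*b1 ^ a0*b0
        ;; so only 3 pclmulqdq ops are needed per block instead of 4.
        ;; %%T6/%%T7 accumulate the high/low products and %%XMM1 the
        ;; (a1^a0)*(b1^b0) terms; they are combined before the reduction.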
2004
2005        vmovdqu         %%T5, [%%GDATA + HashKey_8]
2006
2007        vpshufd         %%T2, %%XMM1, 01001110b
2008        vpshufd         %%T3, %%T5, 01001110b
2009        vpxor           %%T2, %%T2, %%XMM1
2010        vpxor           %%T3, %%T3, %%T5
2011
2012        vpclmulqdq      %%T6, %%XMM1, %%T5, 0x11
2013        vpclmulqdq      %%T7, %%XMM1, %%T5, 0x00
2014
2015        vpclmulqdq      %%XMM1, %%T2, %%T3, 0x00
2016
2017        ;;;;;;;;;;;;;;;;;;;;;;
2018
2019        vmovdqu         %%T5, [%%GDATA + HashKey_7]
2020        vpshufd         %%T2, %%XMM2, 01001110b
2021        vpshufd         %%T3, %%T5, 01001110b
2022        vpxor           %%T2, %%T2, %%XMM2
2023        vpxor           %%T3, %%T3, %%T5
2024
2025        vpclmulqdq      %%T4, %%XMM2, %%T5, 0x11
2026        vpxor           %%T6, %%T6, %%T4
2027
2028        vpclmulqdq      %%T4, %%XMM2, %%T5, 0x00
2029        vpxor           %%T7, %%T7, %%T4
2030
2031        vpclmulqdq      %%T2, %%T2, %%T3, 0x00
2032
2033        vpxor           %%XMM1, %%XMM1, %%T2
2034
2035        ;;;;;;;;;;;;;;;;;;;;;;
2036
2037        vmovdqu         %%T5, [%%GDATA + HashKey_6]
2038        vpshufd         %%T2, %%XMM3, 01001110b
2039        vpshufd         %%T3, %%T5, 01001110b
2040        vpxor           %%T2, %%T2, %%XMM3
2041        vpxor           %%T3, %%T3, %%T5
2042
2043        vpclmulqdq      %%T4, %%XMM3, %%T5, 0x11
2044        vpxor           %%T6, %%T6, %%T4
2045
2046        vpclmulqdq      %%T4, %%XMM3, %%T5, 0x00
2047        vpxor           %%T7, %%T7, %%T4
2048
2049        vpclmulqdq      %%T2, %%T2, %%T3, 0x00
2050
2051        vpxor           %%XMM1, %%XMM1, %%T2
2052
2053        ;;;;;;;;;;;;;;;;;;;;;;
2054
2055        vmovdqu         %%T5, [%%GDATA + HashKey_5]
2056        vpshufd         %%T2, %%XMM4, 01001110b
2057        vpshufd         %%T3, %%T5, 01001110b
2058        vpxor           %%T2, %%T2, %%XMM4
2059        vpxor           %%T3, %%T3, %%T5
2060
2061        vpclmulqdq      %%T4, %%XMM4, %%T5, 0x11
2062        vpxor           %%T6, %%T6, %%T4
2063
2064        vpclmulqdq      %%T4, %%XMM4, %%T5, 0x00
2065        vpxor           %%T7, %%T7, %%T4
2066
2067        vpclmulqdq      %%T2, %%T2, %%T3, 0x00
2068
2069        vpxor           %%XMM1, %%XMM1, %%T2
2070
2071        ;;;;;;;;;;;;;;;;;;;;;;
2072
2073        vmovdqu         %%T5, [%%GDATA + HashKey_4]
2074        vpshufd         %%T2, %%XMM5, 01001110b
2075        vpshufd         %%T3, %%T5, 01001110b
2076        vpxor           %%T2, %%T2, %%XMM5
2077        vpxor           %%T3, %%T3, %%T5
2078
2079        vpclmulqdq      %%T4, %%XMM5, %%T5, 0x11
2080        vpxor           %%T6, %%T6, %%T4
2081
2082        vpclmulqdq      %%T4, %%XMM5, %%T5, 0x00
2083        vpxor           %%T7, %%T7, %%T4
2084
2085        vpclmulqdq      %%T2, %%T2, %%T3, 0x00
2086
2087        vpxor           %%XMM1, %%XMM1, %%T2
2088
2089        ;;;;;;;;;;;;;;;;;;;;;;
2090
2091        vmovdqu         %%T5, [%%GDATA + HashKey_3]
2092        vpshufd         %%T2, %%XMM6, 01001110b
2093        vpshufd         %%T3, %%T5, 01001110b
2094        vpxor           %%T2, %%T2, %%XMM6
2095        vpxor           %%T3, %%T3, %%T5
2096
2097        vpclmulqdq      %%T4, %%XMM6, %%T5, 0x11
2098        vpxor           %%T6, %%T6, %%T4
2099
2100        vpclmulqdq      %%T4, %%XMM6, %%T5, 0x00
2101        vpxor           %%T7, %%T7, %%T4
2102
2103        vpclmulqdq      %%T2, %%T2, %%T3, 0x00
2104
2105        vpxor           %%XMM1, %%XMM1, %%T2
2106
2107        ;;;;;;;;;;;;;;;;;;;;;;
2108
2109        vmovdqu         %%T5, [%%GDATA + HashKey_2]
2110        vpshufd         %%T2, %%XMM7, 01001110b
2111        vpshufd         %%T3, %%T5, 01001110b
2112        vpxor           %%T2, %%T2, %%XMM7
2113        vpxor           %%T3, %%T3, %%T5
2114
2115        vpclmulqdq      %%T4, %%XMM7, %%T5, 0x11
2116        vpxor           %%T6, %%T6, %%T4
2117
2118        vpclmulqdq      %%T4, %%XMM7, %%T5, 0x00
2119        vpxor           %%T7, %%T7, %%T4
2120
2121        vpclmulqdq      %%T2, %%T2, %%T3, 0x00
2122
2123        vpxor           %%XMM1, %%XMM1, %%T2
2124
2125        ;;;;;;;;;;;;;;;;;;;;;;
2126
2127        vmovdqu         %%T5, [%%GDATA + HashKey]
2128        vpshufd         %%T2, %%XMM8, 01001110b
2129        vpshufd         %%T3, %%T5, 01001110b
2130        vpxor           %%T2, %%T2, %%XMM8
2131        vpxor           %%T3, %%T3, %%T5
2132
2133        vpclmulqdq      %%T4, %%XMM8, %%T5, 0x11
2134        vpxor           %%T6, %%T6, %%T4
2135
2136        vpclmulqdq      %%T4, %%XMM8, %%T5, 0x00
2137        vpxor           %%T7, %%T7, %%T4
2138
2139        vpclmulqdq      %%T2, %%T2, %%T3, 0x00
2140
2141        vpxor           %%XMM1, %%XMM1, %%T2
2142        vpxor           %%XMM1, %%XMM1, %%T6
2143        vpxor           %%T2, %%XMM1, %%T7
2144
2145
2146
2147
2148        vpslldq %%T4, %%T2, 8
2149        vpsrldq %%T2, %%T2, 8
2150
2151        vpxor   %%T7, %%T7, %%T4
2152        vpxor   %%T6, %%T6, %%T2                               ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications
2153
2154        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2155        ;first phase of the reduction
2156        vmovdqu         %%T3, [rel POLY2]
2157
2158        vpclmulqdq      %%T2, %%T3, %%T7, 0x01
        vpslldq         %%T2, %%T2, 8                           ; shift-L %%T2 2 DWs
2160
2161        vpxor           %%T7, %%T7, %%T2                        ; first phase of the reduction complete
2162        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2163
2164
2165        ;second phase of the reduction
2166        vpclmulqdq      %%T2, %%T3, %%T7, 0x00
2167        vpsrldq         %%T2, %%T2, 4                           ; shift-R %%T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
2168
2169        vpclmulqdq      %%T4, %%T3, %%T7, 0x10
2170        vpslldq         %%T4, %%T4, 4                           ; shift-L %%T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
2171
2172        vpxor           %%T4, %%T4, %%T2                        ; second phase of the reduction complete
2173        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2174        vpxor           %%T6, %%T6, %%T4                        ; the result is in %%T6
2175%endmacro
2176
2177
; GHASH the last 7 ciphertext blocks.
2179%macro  GHASH_LAST_7 15
2180%define %%GDATA %1
2181%define %%T1    %2
2182%define %%T2    %3
2183%define %%T3    %4
2184%define %%T4    %5
2185%define %%T5    %6
2186%define %%T6    %7
2187%define %%T7    %8
2188%define %%XMM1  %9
2189%define %%XMM2  %10
2190%define %%XMM3  %11
2191%define %%XMM4  %12
2192%define %%XMM5  %13
2193%define %%XMM6  %14
2194%define %%XMM7  %15
2195
2196        ;; Karatsuba Method
2197
2198        vmovdqu         %%T5, [%%GDATA + HashKey_7]
2199
2200        vpshufd         %%T2, %%XMM1, 01001110b
2201        vpshufd         %%T3, %%T5, 01001110b
2202        vpxor           %%T2, %%T2, %%XMM1
2203        vpxor           %%T3, %%T3, %%T5
2204
2205        vpclmulqdq      %%T6, %%XMM1, %%T5, 0x11
2206        vpclmulqdq      %%T7, %%XMM1, %%T5, 0x00
2207
2208        vpclmulqdq      %%XMM1, %%T2, %%T3, 0x00
2209
2210        ;;;;;;;;;;;;;;;;;;;;;;
2211
2212        vmovdqu         %%T5, [%%GDATA + HashKey_6]
2213        vpshufd         %%T2, %%XMM2, 01001110b
2214        vpshufd         %%T3, %%T5, 01001110b
2215        vpxor           %%T2, %%T2, %%XMM2
2216        vpxor           %%T3, %%T3, %%T5
2217
2218        vpclmulqdq      %%T4, %%XMM2, %%T5, 0x11
2219        vpxor           %%T6, %%T6, %%T4
2220
2221        vpclmulqdq      %%T4, %%XMM2, %%T5, 0x00
2222        vpxor           %%T7, %%T7, %%T4
2223
2224        vpclmulqdq      %%T2, %%T2, %%T3, 0x00
2225
2226        vpxor           %%XMM1, %%XMM1, %%T2
2227
2228        ;;;;;;;;;;;;;;;;;;;;;;
2229
2230        vmovdqu         %%T5, [%%GDATA + HashKey_5]
2231        vpshufd         %%T2, %%XMM3, 01001110b
2232        vpshufd         %%T3, %%T5, 01001110b
2233        vpxor           %%T2, %%T2, %%XMM3
2234        vpxor           %%T3, %%T3, %%T5
2235
2236        vpclmulqdq      %%T4, %%XMM3, %%T5, 0x11
2237        vpxor           %%T6, %%T6, %%T4
2238
2239        vpclmulqdq      %%T4, %%XMM3, %%T5, 0x00
2240        vpxor           %%T7, %%T7, %%T4
2241
2242        vpclmulqdq      %%T2, %%T2, %%T3, 0x00
2243
2244        vpxor           %%XMM1, %%XMM1, %%T2
2245
2246        ;;;;;;;;;;;;;;;;;;;;;;
2247
2248        vmovdqu         %%T5, [%%GDATA + HashKey_4]
2249        vpshufd         %%T2, %%XMM4, 01001110b
2250        vpshufd         %%T3, %%T5, 01001110b
2251        vpxor           %%T2, %%T2, %%XMM4
2252        vpxor           %%T3, %%T3, %%T5
2253
2254        vpclmulqdq      %%T4, %%XMM4, %%T5, 0x11
2255        vpxor           %%T6, %%T6, %%T4
2256
2257        vpclmulqdq      %%T4, %%XMM4, %%T5, 0x00
2258        vpxor           %%T7, %%T7, %%T4
2259
2260        vpclmulqdq      %%T2, %%T2, %%T3, 0x00
2261
2262        vpxor           %%XMM1, %%XMM1, %%T2
2263
2264        ;;;;;;;;;;;;;;;;;;;;;;
2265
2266        vmovdqu         %%T5, [%%GDATA + HashKey_3]
2267        vpshufd         %%T2, %%XMM5, 01001110b
2268        vpshufd         %%T3, %%T5, 01001110b
2269        vpxor           %%T2, %%T2, %%XMM5
2270        vpxor           %%T3, %%T3, %%T5
2271
2272        vpclmulqdq      %%T4, %%XMM5, %%T5, 0x11
2273        vpxor           %%T6, %%T6, %%T4
2274
2275        vpclmulqdq      %%T4, %%XMM5, %%T5, 0x00
2276        vpxor           %%T7, %%T7, %%T4
2277
2278        vpclmulqdq      %%T2, %%T2, %%T3, 0x00
2279
2280        vpxor           %%XMM1, %%XMM1, %%T2
2281
2282        ;;;;;;;;;;;;;;;;;;;;;;
2283
2284        vmovdqu         %%T5, [%%GDATA + HashKey_2]
2285        vpshufd         %%T2, %%XMM6, 01001110b
2286        vpshufd         %%T3, %%T5, 01001110b
2287        vpxor           %%T2, %%T2, %%XMM6
2288        vpxor           %%T3, %%T3, %%T5
2289
2290        vpclmulqdq      %%T4, %%XMM6, %%T5, 0x11
2291        vpxor           %%T6, %%T6, %%T4
2292
2293        vpclmulqdq      %%T4, %%XMM6, %%T5, 0x00
2294        vpxor           %%T7, %%T7, %%T4
2295
2296        vpclmulqdq      %%T2, %%T2, %%T3, 0x00
2297
2298        vpxor           %%XMM1, %%XMM1, %%T2
2299
2300        ;;;;;;;;;;;;;;;;;;;;;;
2301
2302        vmovdqu         %%T5, [%%GDATA + HashKey_1]
2303        vpshufd         %%T2, %%XMM7, 01001110b
2304        vpshufd         %%T3, %%T5, 01001110b
2305        vpxor           %%T2, %%T2, %%XMM7
2306        vpxor           %%T3, %%T3, %%T5
2307
2308        vpclmulqdq      %%T4, %%XMM7, %%T5, 0x11
2309        vpxor           %%T6, %%T6, %%T4
2310
2311        vpclmulqdq      %%T4, %%XMM7, %%T5, 0x00
2312        vpxor           %%T7, %%T7, %%T4
2313
2314        vpclmulqdq      %%T2, %%T2, %%T3, 0x00
2315
2316        vpxor           %%XMM1, %%XMM1, %%T2
2317
2318        ;;;;;;;;;;;;;;;;;;;;;;
2319
2320        vpxor           %%XMM1, %%XMM1, %%T6
2321        vpxor           %%T2, %%XMM1, %%T7
2322
2323
2324
2325
2326        vpslldq %%T4, %%T2, 8
2327        vpsrldq %%T2, %%T2, 8
2328
2329        vpxor   %%T7, %%T7, %%T4
2330        vpxor   %%T6, %%T6, %%T2                               ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications
2331
2332        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2333        ;first phase of the reduction
2334        vmovdqu         %%T3, [rel POLY2]
2335
2336        vpclmulqdq      %%T2, %%T3, %%T7, 0x01
        vpslldq         %%T2, %%T2, 8                           ; shift-L %%T2 2 DWs
2338
2339        vpxor           %%T7, %%T7, %%T2                        ; first phase of the reduction complete
2340        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2341
2342
2343        ;second phase of the reduction
2344        vpclmulqdq      %%T2, %%T3, %%T7, 0x00
2345        vpsrldq         %%T2, %%T2, 4                           ; shift-R %%T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
2346
2347        vpclmulqdq      %%T4, %%T3, %%T7, 0x10
2348        vpslldq         %%T4, %%T4, 4                           ; shift-L %%T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
2349
2350        vpxor           %%T4, %%T4, %%T2                        ; second phase of the reduction complete
2351        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2352        vpxor           %%T6, %%T6, %%T4                        ; the result is in %%T6
2353%endmacro
2354
2355
2356
2357;;; Handle encryption of the final partial block
2358;;; IN:
2359;;;   r13  - Number of bytes to read
2360;;; MODIFIES:
2361;;;   KEY  - Key for encrypting the partial block
2362;;; SMASHES:
2363;;;   rax, T1
;;; Note:
;;;   PLAIN_CYPH_LEN is unused at this stage. Previously it was used
;;;   to determine if the buffer was big enough to do a 16 byte read
;;;   & shift.
;;;   'LT16' is passed here only if the buffer is known to be smaller
;;;   than 16 bytes; any other value selects the 16 byte read code
;;;   path.
2372%macro  ENCRYPT_FINAL_PARTIAL_BLOCK 7
2373%define %%KEY             %1
2374%define %%T1              %2
2375%define %%CYPH_PLAIN_OUT  %3
2376%define %%PLAIN_CYPH_IN   %4
2377%define %%PLAIN_CYPH_LEN  %5
2378%define %%ENC_DEC         %6
2379%define %%DATA_OFFSET     %7
2380
2381        ;; %%PLAIN_CYPH_IN + %%DATA_OFFSET
2382        ;;               - input data address
2383        ;; r13           - input data length
        ;; rax           - temp register
2385        ;; out:
2386        ;; T1            - packed output
2387        ;; k1            - valid byte mask
2388        READ_SMALL_DATA_INPUT   %%T1, %%PLAIN_CYPH_IN+%%DATA_OFFSET, r13, rax
2389
2390        ;; At this point T1 contains the partial block data
2391        ;; Plaintext XOR E(K, Yn)
2392        vpxorq          %%KEY, %%KEY, %%T1
2393
2394        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2395        ;; Output r13 Bytes
2396        vmovdqu8        [%%CYPH_PLAIN_OUT + %%DATA_OFFSET]{k1}, %%KEY
2397
2398%ifidn  %%ENC_DEC, DEC
2399        ;; If decrypt, restore the ciphertext into %%KEY
2400        vmovdqa64       %%KEY, %%T1
2401%else
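        ;; On encrypt, zero the bytes above the message end (zero
        ;; masking) so the zero-padded partial ciphertext block can be
        ;; fed to GHASH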
2402        vmovdqu8        %%KEY{k1}{z}, %%KEY
2403%endif
2404%endmacro                       ; ENCRYPT_FINAL_PARTIAL_BLOCK
2405
2406
2407
2408; Encryption of a single block
2409%macro  ENCRYPT_SINGLE_BLOCK 2
2410%define %%GDATA %1
2411%define %%XMM0  %2
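; NOTE: NROUNDS is assumed to be the number of middle AES rounds for
; the selected key size (e.g. 9 for AES-128), so the loop below covers
; round keys 1..NROUNDS and the last round key sits at 16*(NROUNDS+1).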
2412
2413                vpxor    %%XMM0, %%XMM0, [%%GDATA+16*0]
2414%assign i 1
2415%rep NROUNDS
2416                vaesenc  %%XMM0, [%%GDATA+16*i]
2417%assign i (i+1)
2418%endrep
2419                vaesenclast      %%XMM0, [%%GDATA+16*i]
2420%endmacro
2421
2422
2423;; Start of Stack Setup
2424
2425%macro FUNC_SAVE 0
        ;; Required for Update/GCM_ENC
        ;; the number of GP register pushes below must equal STACK_OFFSET
2428        push    r12
2429        push    r13
2430        push    r14
2431        push    r15
2432        mov     r14, rsp
2433
2434        sub     rsp, VARIABLE_OFFSET
        and     rsp, ~63                ; align the stack frame to 64 bytes
2436
2437%ifidn __OUTPUT_FORMAT__, win64
2438        ; xmm6:xmm15 need to be maintained for Windows
2439        vmovdqu [rsp + LOCAL_STORAGE + 0*16],xmm6
2440        vmovdqu [rsp + LOCAL_STORAGE + 1*16],xmm7
2441        vmovdqu [rsp + LOCAL_STORAGE + 2*16],xmm8
2442        vmovdqu [rsp + LOCAL_STORAGE + 3*16],xmm9
2443        vmovdqu [rsp + LOCAL_STORAGE + 4*16],xmm10
2444        vmovdqu [rsp + LOCAL_STORAGE + 5*16],xmm11
2445        vmovdqu [rsp + LOCAL_STORAGE + 6*16],xmm12
2446        vmovdqu [rsp + LOCAL_STORAGE + 7*16],xmm13
2447        vmovdqu [rsp + LOCAL_STORAGE + 8*16],xmm14
2448        vmovdqu [rsp + LOCAL_STORAGE + 9*16],xmm15
2449%endif
2450%endmacro
2451
2452
2453%macro FUNC_RESTORE 0
2454
2455%ifdef SAFE_DATA
2456        clear_scratch_gps_asm
2457        clear_scratch_zmms_asm
2458%endif
2459%ifidn __OUTPUT_FORMAT__, win64
2460        vmovdqu xmm15, [rsp + LOCAL_STORAGE + 9*16]
2461        vmovdqu xmm14, [rsp + LOCAL_STORAGE + 8*16]
2462        vmovdqu xmm13, [rsp + LOCAL_STORAGE + 7*16]
2463        vmovdqu xmm12, [rsp + LOCAL_STORAGE + 6*16]
2464        vmovdqu xmm11, [rsp + LOCAL_STORAGE + 5*16]
2465        vmovdqu xmm10, [rsp + LOCAL_STORAGE + 4*16]
2466        vmovdqu xmm9, [rsp + LOCAL_STORAGE + 3*16]
2467        vmovdqu xmm8, [rsp + LOCAL_STORAGE + 2*16]
2468        vmovdqu xmm7, [rsp + LOCAL_STORAGE + 1*16]
2469        vmovdqu xmm6, [rsp + LOCAL_STORAGE + 0*16]
2470%endif
;; Required for Update/GCM_ENC
2472        mov     rsp, r14
2473        pop     r15
2474        pop     r14
2475        pop     r13
2476        pop     r12
2477%endmacro
2478
2479%macro CALC_J0 13
2480%define %%KEY           %1 ;; [in] Pointer to GCM KEY structure
2481%define %%IV            %2 ;; [in] Pointer to IV
2482%define %%IV_LEN        %3 ;; [in] IV length
2483%define %%J0            %4 ;; [out] XMM reg to contain J0
2484%define %%TMP0          %5 ;; [clobbered] Temporary GP reg
2485%define %%TMP1          %6 ;; [clobbered] Temporary GP reg
2486%define %%TMP2          %7 ;; [clobbered] Temporary GP reg
2487%define %%XTMP0         %8 ;; [clobbered] Temporary XMM reg
2488%define %%XTMP1         %9 ;; [clobbered] Temporary XMM reg
2489%define %%XTMP2         %10 ;; [clobbered] Temporary XMM reg
2490%define %%XTMP3         %11 ;; [clobbered] Temporary XMM reg
2491%define %%XTMP4         %12 ;; [clobbered] Temporary XMM reg
2492%define %%XTMP5         %13 ;; [clobbered] Temporary XMM reg
2493
        ;; J0 = GHASH(IV || 0^(s+64) || [len(IV)]64)
        ;; where s = 16 * RoundUp(len(IV)/16) - len(IV)
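        ;; Illustrative example: for a 20 byte IV, s = 12 and GHASH runs
        ;; over three 16 byte blocks:
        ;;   IV[0:15] | IV[16:19] || 0^12 | 0^64 || [160]64
        ;; where [160]64 is the IV bit length as a 64-bit integer.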
2496
2497        ;; Calculate GHASH of (IV || 0s)
2498        vpxor   %%J0, %%J0
2499        CALC_AAD_HASH %%IV, %%IV_LEN, %%J0, %%KEY, %%XTMP0, %%XTMP1, %%XTMP2, \
2500                      %%XTMP3, %%XTMP4, %%XTMP5, %%TMP0, %%TMP1, %%TMP2
2501
2502        ;; Calculate GHASH of last 16-byte block (0 || len(IV)64)
2503        vmovdqu %%XTMP0, [%%KEY + HashKey]
2504        mov     %%TMP2, %%IV_LEN
2505        shl     %%TMP2, 3 ;; IV length in bits
2506        vmovq   %%XTMP1, %%TMP2
2507        vpxor   %%J0, %%XTMP1
2508        GHASH_MUL %%J0, %%XTMP0, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5
2509
2510        vpshufb %%J0, [rel SHUF_MASK] ; perform a 16Byte swap
2511%endmacro
2512
2513;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; GCM_INIT initializes a gcm_context_data struct to prepare for encoding/decoding.
; Input: gcm_key_data * (GDATA_KEY), gcm_context_data * (GDATA_CTX), IV, IV_LEN,
; Additional Authentication data (A_IN), Additional Data length (A_LEN).
; Output: GDATA_CTX updated with the hash of A_IN (AadHash) and the remaining
; context fields initialized.
; Clobbers rax, r10-r13, and xmm0-xmm6
2519;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2520%macro  GCM_INIT        8-9
2521%define %%GDATA_KEY     %1      ; [in] GCM expanded keys pointer
2522%define %%GDATA_CTX     %2      ; [in] GCM context pointer
2523%define %%IV            %3      ; [in] IV pointer
2524%define %%A_IN          %4      ; [in] AAD pointer
2525%define %%A_LEN         %5      ; [in] AAD length in bytes
2526%define %%GPR1          %6      ; temp GPR
2527%define %%GPR2          %7      ; temp GPR
2528%define %%GPR3          %8      ; temp GPR
2529%define %%IV_LEN        %9      ; [in] IV length
2530
2531%define %%AAD_HASH      xmm14
2532
2533        vpxor   %%AAD_HASH, %%AAD_HASH
2534        CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%GDATA_KEY, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, %%GPR1, %%GPR2, %%GPR3
2535
2536        mov     %%GPR1, %%A_LEN
        vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH         ; ctx_data.aad_hash = aad_hash
2538        mov     [%%GDATA_CTX + AadLen], %%GPR1              ; ctx_data.aad_length = aad_length
2539
2540        xor     %%GPR1, %%GPR1
2541        mov     [%%GDATA_CTX + InLen], %%GPR1               ; ctx_data.in_length = 0
2542        mov     [%%GDATA_CTX + PBlockLen], %%GPR1           ; ctx_data.partial_block_length = 0
2543
%if %0 == 9 ;; IV is different from 12 bytes
2545        CALC_J0 %%GDATA_KEY, %%IV, %%IV_LEN, xmm2, r10, r11, r12, xmm0, xmm1, \
2546                xmm3, xmm4, xmm5, xmm6
2547%else ;; IV is 12 bytes
2548        ;; read 12 IV bytes and pad with 0x00000001
2549        mov     %%GPR2, %%IV
2550        vmovd   xmm3, [%%GPR2 + 8]
2551        vpslldq xmm3, 8
2552        vmovq   xmm2, [%%GPR2]
2553        vmovdqa xmm4, [rel ONEf]
2554        vpternlogq xmm2, xmm3, xmm4, 0xfe     ; xmm2 = xmm2 or xmm3 or xmm4
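        ;; xmm2 now holds J0 = IV[0:11] || 0x00000001, with the ONEf
        ;; constant supplying the initial block counter value of 1 in
        ;; big-endian byte order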
2555%endif
2556        vmovdqu [%%GDATA_CTX + OrigIV], xmm2                ; ctx_data.orig_IV = iv
2557
2558        ;; store IV as counter in LE format
2559        vpshufb xmm2, [rel SHUF_MASK]
2560        vmovdqu [%%GDATA_CTX + CurCount], xmm2              ; ctx_data.current_counter = iv
2561%endmacro
2562
2563%macro  GCM_ENC_DEC_SMALL   12
2564%define %%GDATA_KEY         %1
2565%define %%GDATA_CTX         %2
2566%define %%CYPH_PLAIN_OUT    %3
2567%define %%PLAIN_CYPH_IN     %4
2568%define %%PLAIN_CYPH_LEN    %5
2569%define %%ENC_DEC           %6
2570%define %%DATA_OFFSET       %7
2571%define %%LENGTH            %8  ; assumed r13
2572%define %%NUM_BLOCKS        %9
2573%define %%CTR               %10 ; assumed xmm9
2574%define %%HASH_OUT          %11 ; assumed xmm14
2575%define %%INSTANCE_TYPE     %12
2576
2577        ;; NOTE: the check below is obsolete in current implementation. The check is already done in GCM_ENC_DEC.
2578        ;; cmp     %%NUM_BLOCKS, 0
2579        ;; je      %%_small_initial_blocks_encrypted
2580        cmp     %%NUM_BLOCKS, 8
2581        je      %%_small_initial_num_blocks_is_8
2582        cmp     %%NUM_BLOCKS, 7
2583        je      %%_small_initial_num_blocks_is_7
2584        cmp     %%NUM_BLOCKS, 6
2585        je      %%_small_initial_num_blocks_is_6
2586        cmp     %%NUM_BLOCKS, 5
2587        je      %%_small_initial_num_blocks_is_5
2588        cmp     %%NUM_BLOCKS, 4
2589        je      %%_small_initial_num_blocks_is_4
2590        cmp     %%NUM_BLOCKS, 3
2591        je      %%_small_initial_num_blocks_is_3
2592        cmp     %%NUM_BLOCKS, 2
2593        je      %%_small_initial_num_blocks_is_2
2594
2595        jmp     %%_small_initial_num_blocks_is_1
2596
2597
2598%%_small_initial_num_blocks_is_8:
2599        ;; r13   - %%LENGTH
2600        ;; xmm12 - T1
2601        ;; xmm13 - T2
2602        ;; xmm14 - T3   - AAD HASH OUT when not producing 8 AES keys
2603        ;; xmm15 - T4
2604        ;; xmm11 - T5
2605        ;; xmm9  - CTR
2606        ;; xmm1  - XMM1 - Cipher + Hash when producing 8 AES keys
2607        ;; xmm2  - XMM2
2608        ;; xmm3  - XMM3
2609        ;; xmm4  - XMM4
2610        ;; xmm5  - XMM5
2611        ;; xmm6  - XMM6
2612        ;; xmm7  - XMM7
2613        ;; xmm8  - XMM8 - AAD HASH IN
2614        ;; xmm10 - T6
2615        ;; xmm0  - T_key
2616        INITIAL_BLOCKS_PARTIAL  %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, \
2617                %%PLAIN_CYPH_IN, %%LENGTH, %%DATA_OFFSET, 8, \
2618                xmm12, xmm13, %%HASH_OUT, xmm15, xmm11, %%CTR, \
2619                xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, \
2620                xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
2621        jmp     %%_small_initial_blocks_encrypted
2622
2623%%_small_initial_num_blocks_is_7:
2624        INITIAL_BLOCKS_PARTIAL  %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, \
2625                %%PLAIN_CYPH_IN, %%LENGTH, %%DATA_OFFSET, 7, \
2626                xmm12, xmm13, %%HASH_OUT, xmm15, xmm11, %%CTR, \
2627                xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, \
2628                xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
2629        jmp     %%_small_initial_blocks_encrypted
2630
2631%%_small_initial_num_blocks_is_6:
2632        INITIAL_BLOCKS_PARTIAL  %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, \
2633                %%PLAIN_CYPH_IN, %%LENGTH, %%DATA_OFFSET, 6, \
2634                xmm12, xmm13, %%HASH_OUT, xmm15, xmm11, %%CTR, \
2635                xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, \
2636                xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
2637        jmp     %%_small_initial_blocks_encrypted
2638
2639%%_small_initial_num_blocks_is_5:
2640        INITIAL_BLOCKS_PARTIAL  %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, \
2641                %%PLAIN_CYPH_IN, %%LENGTH, %%DATA_OFFSET, 5, \
2642                xmm12, xmm13, %%HASH_OUT, xmm15, xmm11, %%CTR, \
2643                xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, \
2644                xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
2645        jmp     %%_small_initial_blocks_encrypted
2646
2647%%_small_initial_num_blocks_is_4:
2648        INITIAL_BLOCKS_PARTIAL  %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, \
2649                %%PLAIN_CYPH_IN, %%LENGTH, %%DATA_OFFSET, 4, \
2650                xmm12, xmm13, %%HASH_OUT, xmm15, xmm11, %%CTR, \
2651                xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, \
2652                xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
2653        jmp     %%_small_initial_blocks_encrypted
2654
2655%%_small_initial_num_blocks_is_3:
2656        INITIAL_BLOCKS_PARTIAL  %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, \
2657                %%PLAIN_CYPH_IN, %%LENGTH, %%DATA_OFFSET, 3, \
2658                xmm12, xmm13, %%HASH_OUT, xmm15, xmm11, %%CTR, \
2659                xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, \
2660                xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
2661        jmp     %%_small_initial_blocks_encrypted
2662
2663%%_small_initial_num_blocks_is_2:
2664        INITIAL_BLOCKS_PARTIAL  %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, \
2665                %%PLAIN_CYPH_IN, %%LENGTH, %%DATA_OFFSET, 2, \
2666                xmm12, xmm13, %%HASH_OUT, xmm15, xmm11, %%CTR, \
2667                xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, \
2668                xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
2669        jmp     %%_small_initial_blocks_encrypted
2670
2671%%_small_initial_num_blocks_is_1:
2672        INITIAL_BLOCKS_PARTIAL  %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, \
2673                %%PLAIN_CYPH_IN, %%LENGTH, %%DATA_OFFSET, 1, \
2674                xmm12, xmm13, %%HASH_OUT, xmm15, xmm11, %%CTR, \
2675                xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, \
2676                xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
2677%%_small_initial_blocks_encrypted:
2678
2679%endmacro                       ; GCM_ENC_DEC_SMALL
2680
2681;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2682; GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context_data struct
2683; has been initialized by GCM_INIT
; Requires the input data be at least 1 byte long because of READ_SMALL_DATA_INPUT.
2685; Input: gcm_key_data struct* (GDATA_KEY), gcm_context_data *(GDATA_CTX), input text (PLAIN_CYPH_IN),
2686; input text length (PLAIN_CYPH_LEN) and whether encoding or decoding (ENC_DEC).
; Output: the ciphertext of the given plain text (CYPH_PLAIN_OUT) and updated GDATA_CTX
2688; Clobbers rax, r10-r15, and xmm0-xmm15
2689;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2690%macro  GCM_ENC_DEC         7
2691%define %%GDATA_KEY         %1
2692%define %%GDATA_CTX         %2
2693%define %%CYPH_PLAIN_OUT    %3
2694%define %%PLAIN_CYPH_IN     %4
2695%define %%PLAIN_CYPH_LEN    %5
2696%define %%ENC_DEC           %6
2697%define %%INSTANCE_TYPE     %7
2698%define %%DATA_OFFSET       r11
2699
; Macro flow:
; calculate the number of 16-byte blocks in the message
; process (number of 16-byte blocks) mod 8 '%%_initial_num_blocks_is_# .. %%_initial_blocks_encrypted'
; process eight 16-byte blocks at a time until all are done '%%_encrypt_by_8_new .. %%_eight_cipher_left'
; if there is a block of less than 16 bytes process it '%%_zero_cipher_left .. %%_multiple_of_16_bytes'
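; Illustrative pseudocode of the flow above (comments only, not built):
;   blocks  = len >> 4
;   initial = blocks & 7; if (len & 15) initial++   ; count a partial tail
;   if (len < 128): GCM_ENC_DEC_SMALL               ; 1..8 blocks in total
;   else:           INITIAL_BLOCKS(initial & 7), then the by-8 loop
;                   (GHASH_8_ENCRYPT_8_PARALLEL) and GHASH_LAST_8/7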
2705
2706%ifidn __OUTPUT_FORMAT__, win64
2707        cmp     %%PLAIN_CYPH_LEN, 0
2708%else
2709        or      %%PLAIN_CYPH_LEN, %%PLAIN_CYPH_LEN
2710%endif
2711        je      %%_enc_dec_done
2712
2713        xor     %%DATA_OFFSET, %%DATA_OFFSET
2714        ;; Update length of data processed
2715%ifidn __OUTPUT_FORMAT__, win64
2716        mov     rax, %%PLAIN_CYPH_LEN
        add     [%%GDATA_CTX + InLen], rax
2718%else
2719        add    [%%GDATA_CTX + InLen], %%PLAIN_CYPH_LEN
2720%endif
2721        vmovdqu xmm13, [%%GDATA_KEY + HashKey]
2722        vmovdqu xmm8, [%%GDATA_CTX + AadHash]
2723
2724%ifidn %%INSTANCE_TYPE, multi_call
        ;; NOTE: partial block processing only makes sense for multi_call here.
2726        ;; Used for the update flow - if there was a previous partial
2727        ;; block fill the remaining bytes here.
2728        PARTIAL_BLOCK %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%DATA_OFFSET, xmm8, xmm13, %%ENC_DEC
2729%endif
2730
        ;; set up the counter (CTR setup was lifted out of initial_blocks to here)
2732%ifidn %%INSTANCE_TYPE, single_call
2733        vmovdqu xmm9, xmm2
2734%else
2735        vmovdqu xmm9, [%%GDATA_CTX + CurCount]
2736%endif
2737
        ;; Save the amount of data left to process in r13
2739        mov     r13, %%PLAIN_CYPH_LEN
2740%ifidn %%INSTANCE_TYPE, multi_call
2741        ;; NOTE: %%DATA_OFFSET is zero in single_call case.
2742        ;;      Consequently PLAIN_CYPH_LEN will never be zero after
2743        ;;      %%DATA_OFFSET subtraction below.
2744        sub     r13, %%DATA_OFFSET
2745
2746        ;; There may be no more data if it was consumed in the partial block.
2747        cmp     r13, 0
2748        je      %%_enc_dec_done
2749%endif                          ; %%INSTANCE_TYPE, multi_call
2750        mov     r10, r13
2751
2752        ;; Determine how many blocks to process in INITIAL
2753        mov     r12, r13
2754        shr     r12, 4
2755        and     r12, 7
2756
2757        ;; Process one additional block in INITIAL if there is a partial block
2758        and     r10, 0xf
2759        blsmsk  r10, r10    ; Set CF if zero
2760        cmc                 ; Flip CF
2761        adc     r12, 0x0    ; Process an additional INITIAL block if CF set
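        ;; In C terms the three instructions above compute, branch-free
        ;; (illustrative sketch only):
        ;;     r12 = ((len >> 4) & 7) + ((len & 0xf) != 0);
        ;; blsmsk sets CF when its source is zero, cmc inverts it, and adc
        ;; then adds exactly one when the 16-byte remainder is non-zero.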
2762
        ;;      Messages shorter than 128 bytes are handled by the small message
        ;;      code, which can process up to 7 full 16-byte blocks plus a partial block.
2765        cmp     r13, 128
2766        jge     %%_large_message_path
2767
2768        GCM_ENC_DEC_SMALL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%ENC_DEC, %%DATA_OFFSET, r13, r12, xmm9, xmm14, %%INSTANCE_TYPE
2769        jmp     %%_ghash_done
2770
2771%%_large_message_path:
        and     r12, 0x7    ; Still, don't allow 8 INITIAL blocks since this
                            ; can be handled by the x8 partial loop.
2774
2775        cmp     r12, 0
2776        je      %%_initial_num_blocks_is_0
2777        cmp     r12, 7
2778        je      %%_initial_num_blocks_is_7
2779        cmp     r12, 6
2780        je      %%_initial_num_blocks_is_6
2781        cmp     r12, 5
2782        je      %%_initial_num_blocks_is_5
2783        cmp     r12, 4
2784        je      %%_initial_num_blocks_is_4
2785        cmp     r12, 3
2786        je      %%_initial_num_blocks_is_3
2787        cmp     r12, 2
2788        je      %%_initial_num_blocks_is_2
2789
2790        jmp     %%_initial_num_blocks_is_1
2791
2792%%_initial_num_blocks_is_7:
2793        ;; r13   - %%LENGTH
2794        ;; xmm12 - T1
2795        ;; xmm13 - T2
2796        ;; xmm14 - T3   - AAD HASH OUT when not producing 8 AES keys
2797        ;; xmm15 - T4
2798        ;; xmm11 - T5
2799        ;; xmm9  - CTR
2800        ;; xmm1  - XMM1 - Cipher + Hash when producing 8 AES keys
2801        ;; xmm2  - XMM2
2802        ;; xmm3  - XMM3
2803        ;; xmm4  - XMM4
2804        ;; xmm5  - XMM5
2805        ;; xmm6  - XMM6
2806        ;; xmm7  - XMM7
2807        ;; xmm8  - XMM8 - AAD HASH IN
2808        ;; xmm10 - T6
2809        ;; xmm0  - T_key
2810        INITIAL_BLOCKS  %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 7, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
2811        jmp     %%_initial_blocks_encrypted
2812
2813%%_initial_num_blocks_is_6:
2814        INITIAL_BLOCKS  %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 6, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
2815        jmp     %%_initial_blocks_encrypted
2816
2817%%_initial_num_blocks_is_5:
2818        INITIAL_BLOCKS  %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 5, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
2819        jmp     %%_initial_blocks_encrypted
2820
2821%%_initial_num_blocks_is_4:
2822        INITIAL_BLOCKS  %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 4, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
2823        jmp     %%_initial_blocks_encrypted
2824
2825%%_initial_num_blocks_is_3:
2826        INITIAL_BLOCKS  %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 3, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
2827        jmp     %%_initial_blocks_encrypted
2828
2829%%_initial_num_blocks_is_2:
2830        INITIAL_BLOCKS  %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 2, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
2831        jmp     %%_initial_blocks_encrypted
2832
2833%%_initial_num_blocks_is_1:
2834        INITIAL_BLOCKS  %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 1, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
2835        jmp     %%_initial_blocks_encrypted
2836
2837%%_initial_num_blocks_is_0:
2838        INITIAL_BLOCKS  %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 0, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
2839
2840
2841%%_initial_blocks_encrypted:
        ;; If the entire message was processed in INITIAL, only the final GHASH remains
2843        cmp     r13, 0
2844        je      %%_encrypt_done
2845
2846        ;; Encrypt the final <16 byte (partial) block, then hash
2847        cmp     r13, 16
2848        jl      %%_encrypt_final_partial
2849
2850        ;; Process 7 full blocks plus a partial block
2851        cmp     r13, 128
2852        jl      %%_encrypt_by_8_partial
2853
2854
2855%%_encrypt_by_8_parallel:
        ;; in_order vs. out_order is an optimization to increment the counter without shuffling
        ;; it back into little endian. r15d keeps track of when we need to increment in order so
        ;; that the carry is handled correctly.
2859        vmovd   r15d, xmm9
2860        and     r15d, 255
2861        vpshufb xmm9, [rel SHUF_MASK]
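        ;; In C terms the wrap check below amounts to (illustrative sketch):
        ;;     if (ctr_low_byte > 255 - 8)
        ;;             increment_in_order();   /* byte-swap, full BE add    */
        ;;     else
        ;;             increment_out_order();  /* bump low byte, no shuffle */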
2862
2863
2864%%_encrypt_by_8_new:
2865        cmp     r15d, 255-8
2866        jg      %%_encrypt_by_8
2867
2868
2869
2870        ;; xmm0  - T1
2871        ;; xmm10 - T2
2872        ;; xmm11 - T3
2873        ;; xmm12 - T4
2874        ;; xmm13 - T5
2875        ;; xmm14 - T6
2876        ;; xmm9  - CTR
2877        ;; xmm1  - XMM1
2878        ;; xmm2  - XMM2
2879        ;; xmm3  - XMM3
2880        ;; xmm4  - XMM4
2881        ;; xmm5  - XMM5
2882        ;; xmm6  - XMM6
2883        ;; xmm7  - XMM7
2884        ;; xmm8  - XMM8
2885        ;; xmm15 - T7
2886        add     r15b, 8
2887        GHASH_8_ENCRYPT_8_PARALLEL  %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, out_order, %%ENC_DEC, full
2888        add     %%DATA_OFFSET, 128
2889        sub     r13, 128
2890        cmp     r13, 128
2891        jge     %%_encrypt_by_8_new
2892
2893        vpshufb xmm9, [rel SHUF_MASK]
2894        jmp     %%_encrypt_by_8_parallel_done
2895
2896%%_encrypt_by_8:
2897        vpshufb xmm9, [rel SHUF_MASK]
2898        add     r15b, 8
2899        GHASH_8_ENCRYPT_8_PARALLEL  %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC, full
2900        vpshufb  xmm9, [rel SHUF_MASK]
2901        add     %%DATA_OFFSET, 128
2902        sub     r13, 128
2903        cmp     r13, 128
2904        jge     %%_encrypt_by_8_new
2905        vpshufb  xmm9, [rel SHUF_MASK]
2906
2907
2908%%_encrypt_by_8_parallel_done:
        ;; Check whether a by-8 pass with a partial block is needed. At this point
        ;; the bytes remaining should be either zero or between 113 and 127.
2911        cmp     r13, 0
2912        je      %%_encrypt_done
2913
2914%%_encrypt_by_8_partial:
        ;; Shuffle needed to align key for partial block xor. out_order
        ;; is a little faster because it avoids extra shuffles.
        ;; TBD: may need to account for the case where there is no room to increment the counter.
2918
2919
2920        ;; Process parallel buffers with a final partial block.
2921        GHASH_8_ENCRYPT_8_PARALLEL  %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC, partial
2922
2923
2924        add     %%DATA_OFFSET, 128-16
2925        sub     r13, 128-16
2926
2927%%_encrypt_final_partial:
2928
2929        vpshufb  xmm8, [rel SHUF_MASK]
2930        mov     [%%GDATA_CTX + PBlockLen], r13
2931        vmovdqu [%%GDATA_CTX + PBlockEncKey], xmm8
2932
        ;; xmm8  - Final encrypted counter - need to hash with partial or full block ciphertext
        ;;                          CYPH, TMP,  OUT,              IN,              LEN,              ENC_DEC,  OFFSET
2935        ENCRYPT_FINAL_PARTIAL_BLOCK xmm8, xmm0, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%ENC_DEC, %%DATA_OFFSET
2936
2937        vpshufb  xmm8, [rel SHUF_MASK]
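        ;; Conceptually the partial-block step is plain CTR mode (C sketch,
        ;; illustrative only, not the macro's exact interface):
        ;;     uint8_t ks[16];
        ;;     aes_encrypt(key, counter, ks);          /* keystream block   */
        ;;     for (i = 0; i < remaining; i++)
        ;;             out[i] = in[i] ^ ks[i];         /* partial-block XOR */
        ;;     /* the zero-padded ciphertext bytes then enter GHASH */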
2938
2939
2940%%_encrypt_done:
2941
2942        ;; Mapping to macro parameters
2943        ;; IN:
2944        ;;   xmm9 contains the counter
2945        ;;   xmm1-xmm8 contain the xor'd ciphertext
2946        ;; OUT:
2947        ;;   xmm14 contains the final hash
2948        ;;             GDATA,   T1,    T2,    T3,    T4,    T5,    T6,    T7, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
2949%ifidn %%INSTANCE_TYPE, multi_call
2950        mov     r13, [%%GDATA_CTX + PBlockLen]
2951        cmp     r13, 0
2952        jz      %%_hash_last_8
2953        GHASH_LAST_7 %%GDATA_KEY, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
        ;; XOR the partial block's ciphertext into the hash
2955        vpxor   xmm14, xmm14, xmm8
2956        jmp     %%_ghash_done
2957%endif
2958%%_hash_last_8:
2959        GHASH_LAST_8 %%GDATA_KEY, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
2960
2961%%_ghash_done:
        vmovdqu [%%GDATA_CTX + CurCount], xmm9  ; my_ctx_data.current_counter = xmm9
        vmovdqu [%%GDATA_CTX + AadHash], xmm14  ; my_ctx_data.aad_hash = xmm14
2964
2965%%_enc_dec_done:
2966
2967
2968%endmacro                       ; GCM_ENC_DEC
2969
2970;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; GCM_COMPLETE Finishes Encryption/Decryption of last partial block after GCM_UPDATE finishes.
; Input: gcm_key_data struct* (GDATA_KEY) and gcm_context_data struct* (GDATA_CTX).
; Output: Authentication tag (AUTH_TAG) and authentication tag length (AUTH_TAG_LEN)
; Clobbers rax, r10-r12, and xmm0-xmm2, xmm5-xmm6, xmm9-xmm11, xmm13-xmm15
2975;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2976%macro  GCM_COMPLETE            5
2977%define %%GDATA_KEY             %1
2978%define %%GDATA_CTX             %2
2979%define %%AUTH_TAG              %3
2980%define %%AUTH_TAG_LEN          %4
2981%define %%INSTANCE_TYPE         %5
2982%define %%PLAIN_CYPH_LEN        rax
2983
2984        vmovdqu xmm13, [%%GDATA_KEY + HashKey]
2985        ;; Start AES as early as possible
2986        vmovdqu xmm9, [%%GDATA_CTX + OrigIV]    ; xmm9 = Y0
2987        ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, xmm9  ; E(K, Y0)
2988
%ifidn %%INSTANCE_TYPE, multi_call
        ;; When GCM is invoked as a single function call (rather than through
        ;; the individual init/update/finalize parts) AadHash is still live in
        ;; xmm14, which removes a write-to-read dependency. For multi_call it
        ;; has to be reloaded from the context.
        vmovdqu xmm14, [%%GDATA_CTX + AadHash]

        ;; Hash the final partial block. In the single_call case the
        ;; partial block was already handled in the main GCM_ENC_DEC macro.
        mov     r12, [%%GDATA_CTX + PBlockLen]
        cmp     r12, 0
        je      %%_partial_done

        GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ; GHASH computation for the last <16 byte block
        vmovdqu [%%GDATA_CTX + AadHash], xmm14

%%_partial_done:

%endif
3008
3009        mov     r12, [%%GDATA_CTX + AadLen]     ; r12 = aadLen (number of bytes)
3010        mov     %%PLAIN_CYPH_LEN, [%%GDATA_CTX + InLen]
3011
3012        shl     r12, 3                      ; convert into number of bits
3013        vmovq   xmm15, r12                  ; len(A) in xmm15
3014
        shl     %%PLAIN_CYPH_LEN, 3         ; len(C) in bits (*8)
3016        vmovq   xmm1, %%PLAIN_CYPH_LEN
3017        vpslldq xmm15, xmm15, 8             ; xmm15 = len(A)|| 0x0000000000000000
3018        vpxor   xmm15, xmm15, xmm1          ; xmm15 = len(A)||len(C)
3019
3020        vpxor   xmm14, xmm15
3021        GHASH_MUL       xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6
3022        vpshufb  xmm14, [rel SHUF_MASK]         ; perform a 16Byte swap
3023
3024        vpxor   xmm9, xmm9, xmm14
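        ;; The final tag follows NIST SP 800-38D (C sketch, illustrative only;
        ;; store_be64 and gf128_mul are hypothetical helpers):
        ;;     uint8_t lenblk[16];                    /* len(A) || len(C)  */
        ;;     store_be64(lenblk,     aad_len * 8);   /* lengths in bits   */
        ;;     store_be64(lenblk + 8, msg_len * 8);
        ;;     S   = gf128_mul(S ^ lenblk, H);        /* fold length block */
        ;;     tag = aes_encrypt(key, Y0) ^ S;        /* E(K, Y0) XOR S    */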
3025
3026
3027%%_return_T:
3028        mov     r10, %%AUTH_TAG             ; r10 = authTag
3029        mov     r11, %%AUTH_TAG_LEN         ; r11 = auth_tag_len
3030
3031        cmp     r11, 16
3032        je      %%_T_16
3033
3034        cmp     r11, 12
3035        je      %%_T_12
3036
3037        cmp     r11, 8
3038        je      %%_T_8
3039
3040        simd_store_avx r10, xmm9, r11, r12, rax
3041        jmp     %%_return_T_done
3042%%_T_8:
3043        vmovq    rax, xmm9
3044        mov     [r10], rax
3045        jmp     %%_return_T_done
3046%%_T_12:
3047        vmovq    rax, xmm9
3048        mov     [r10], rax
3049        vpsrldq xmm9, xmm9, 8
3050        vmovd    eax, xmm9
3051        mov     [r10 + 8], eax
3052        jmp     %%_return_T_done
3053%%_T_16:
3054        vmovdqu  [r10], xmm9
3055
3056%%_return_T_done:
3057
3058%ifdef SAFE_DATA
3059        ;; Clear sensitive data from context structure
3060        vpxor   xmm0, xmm0
3061        vmovdqu [%%GDATA_CTX + AadHash], xmm0
3062        vmovdqu [%%GDATA_CTX + PBlockEncKey], xmm0
3063%endif
3064%endmacro ; GCM_COMPLETE
3065
3066
3067;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3068;void   aes_gcm_precomp_128_avx512 /
3069;       aes_gcm_precomp_192_avx512 /
3070;       aes_gcm_precomp_256_avx512
3071;       (struct gcm_key_data *key_data)
3072;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3073MKGLOBAL(FN_NAME(precomp,_),function,)
3074FN_NAME(precomp,_):
3075;; Parameter is passed through register
3076%ifdef SAFE_PARAM
3077        ;; Check key_data != NULL
3078        cmp     arg1, 0
3079        jz      exit_precomp
3080%endif
3081
3082        push    r12
3083        push    r13
3084        push    r14
3085        push    r15
3086
3087        mov     r14, rsp
3088
3089
3090
3091        sub     rsp, VARIABLE_OFFSET
3092        and     rsp, ~63                                 ; align rsp to 64 bytes
3093
3094%ifidn __OUTPUT_FORMAT__, win64
3095        ; only xmm6 needs to be maintained
3096        vmovdqu [rsp + LOCAL_STORAGE + 0*16],xmm6
3097%endif
3098
3099        vpxor   xmm6, xmm6
3100        ENCRYPT_SINGLE_BLOCK    arg1, xmm6              ; xmm6 = HashKey
3101
3102        vpshufb  xmm6, [rel SHUF_MASK]
3103        ;;;;;;;;;;;;;;;  PRECOMPUTATION of HashKey<<1 mod poly from the HashKey;;;;;;;;;;;;;;;
3104        vmovdqa  xmm2, xmm6
3105        vpsllq   xmm6, xmm6, 1
3106        vpsrlq   xmm2, xmm2, 63
3107        vmovdqa  xmm1, xmm2
3108        vpslldq  xmm2, xmm2, 8
3109        vpsrldq  xmm1, xmm1, 8
3110        vpor     xmm6, xmm6, xmm2
3111        ;reduction
3112        vpshufd  xmm2, xmm1, 00100100b
3113        vpcmpeqd xmm2, [rel TWOONE]
3114        vpand    xmm2, xmm2, [rel POLY]
3115        vpxor    xmm6, xmm6, xmm2                       ; xmm6 holds the HashKey<<1 mod poly
3116        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3117        vmovdqu  [arg1 + HashKey], xmm6                 ; store HashKey<<1 mod poly
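        ;; Equivalent GF(2^128) doubling in C (illustrative sketch; POLY is
        ;; the reduction constant used above):
        ;;     carry = H >> 127;           /* bit shifted out on the left */
        ;;     H     = H << 1;
        ;;     if (carry)
        ;;             H ^= POLY;          /* conditional reduction       */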
3118
3119
3120        PRECOMPUTE arg1, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
3121
3122%ifdef SAFE_DATA
3123        clear_scratch_gps_asm
3124        clear_scratch_zmms_asm
3125%endif
3126%ifidn __OUTPUT_FORMAT__, win64
3127        vmovdqu xmm6, [rsp + LOCAL_STORAGE + 0*16]
3128%endif
3129        mov     rsp, r14
3130
3131        pop     r15
3132        pop     r14
3133        pop     r13
3134        pop     r12
3135
3136exit_precomp:
3137        ret
3138
3139
3140;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3141;void   aes_gcm_init_128_avx512 / aes_gcm_init_192_avx512 / aes_gcm_init_256_avx512
3142;       (const struct gcm_key_data *key_data,
3143;        struct gcm_context_data *context_data,
3144;        u8       *iv,
3145;        const u8 *aad,
3146;        u64      aad_len);
3147;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
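; A typical multi-call flow from C might look like this (hypothetical driver
; code; buffer names and the 16-byte tag length are assumptions):
;
;     struct gcm_key_data key;
;     struct gcm_context_data ctx;
;     /* expand the AES round keys into 'key' first, then: */
;     aes_gcm_precomp_128_avx512(&key);
;     aes_gcm_init_128_avx512(&key, &ctx, iv, aad, aad_len);
;     aes_gcm_enc_128_update_avx512(&key, &ctx, out, in, len1);
;     aes_gcm_enc_128_update_avx512(&key, &ctx, out + len1, in + len1, len2);
;     aes_gcm_enc_128_finalize_avx512(&key, &ctx, tag, 16);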
3148MKGLOBAL(FN_NAME(init,_),function,)
3149FN_NAME(init,_):
3150        push    r12
3151        push    r13
3152%ifidn __OUTPUT_FORMAT__, win64
3153        push    r14
3154        push    r15
3155        mov     r14, rsp
        ; xmm6 needs to be maintained for Windows
        sub     rsp, 1*16
        vmovdqu [rsp + 0*16], xmm6
3159%endif
3160
3161%ifdef SAFE_PARAM
3162        ;; Check key_data != NULL
3163        cmp     arg1, 0
3164        jz      exit_init
3165
3166        ;; Check context_data != NULL
3167        cmp     arg2, 0
3168        jz      exit_init
3169
3170        ;; Check IV != NULL
3171        cmp     arg3, 0
3172        jz      exit_init
3173
3174        ;; Check if aad_len == 0
3175        cmp     arg5, 0
3176        jz      skip_aad_check_init
3177
3178        ;; Check aad != NULL (aad_len != 0)
3179        cmp     arg4, 0
3180        jz      exit_init
3181
3182skip_aad_check_init:
3183%endif
3184        GCM_INIT arg1, arg2, arg3, arg4, arg5, r10, r11, r12
3185
3186%ifdef SAFE_DATA
3187        clear_scratch_gps_asm
3188        clear_scratch_zmms_asm
3189%endif
3190exit_init:
3191%ifidn __OUTPUT_FORMAT__, win64
        vmovdqu xmm6, [rsp + 0*16]
3193        mov     rsp, r14
3194        pop     r15
3195        pop     r14
3196%endif
3197        pop     r13
3198        pop     r12
3199        ret
3200
3201
3202;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3203;void   aes_gcm_init_var_iv_128_avx512 / aes_gcm_init_var_iv_192_avx512 /
3204;       aes_gcm_init_var_iv_256_avx512
3205;       (const struct gcm_key_data *key_data,
3206;        struct gcm_context_data *context_data,
3207;        u8        *iv,
3208;        const u64 iv_len,
3209;        const u8  *aad,
3210;        const u64 aad_len);
3211;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3212MKGLOBAL(FN_NAME(init_var_iv,_),function,)
3213FN_NAME(init_var_iv,_):
3214        push    r12
3215        push    r13
3216%ifidn __OUTPUT_FORMAT__, win64
3217        push    r14
3218        push    r15
3219        mov     r14, rsp
        ; xmm6 needs to be maintained for Windows
        sub     rsp, 1*16
        vmovdqu [rsp + 0*16], xmm6
3223%endif
3224
3225%ifdef SAFE_PARAM
3226        ;; Check key_data != NULL
3227        cmp     arg1, 0
3228        jz      exit_init_IV
3229
3230        ;; Check context_data != NULL
3231        cmp     arg2, 0
3232        jz      exit_init_IV
3233
3234        ;; Check IV != NULL
3235        cmp     arg3, 0
3236        jz      exit_init_IV
3237
3238        ;; Check iv_len != 0
3239        cmp     arg4, 0
3240        jz      exit_init_IV
3241
3242        ;; Check if aad_len == 0
3243        cmp     arg6, 0
3244        jz      skip_aad_check_init_IV
3245
3246        ;; Check aad != NULL (aad_len != 0)
3247        cmp     arg5, 0
3248        jz      exit_init_IV
3249
3250skip_aad_check_init_IV:
3251%endif
3252        cmp     arg4, 12
3253        je      iv_len_12_init_IV
3254
        GCM_INIT arg1, arg2, arg3, arg5, arg6, r10, r11, r12, arg4
        jmp     skip_iv_len_12_init_IV

iv_len_12_init_IV:
        GCM_INIT arg1, arg2, arg3, arg5, arg6, r10, r11, r12
3260
3261skip_iv_len_12_init_IV:
3262%ifdef SAFE_DATA
3263        clear_scratch_gps_asm
3264        clear_scratch_zmms_asm
3265%endif
3266exit_init_IV:
3267%ifidn __OUTPUT_FORMAT__, win64
        vmovdqu xmm6, [rsp + 0*16]
3269        mov     rsp, r14
3270        pop     r15
3271        pop     r14
3272%endif
3273        pop     r13
3274        pop     r12
3275        ret
3276
3277
3278;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3279;void   aes_gcm_enc_128_update_avx512 / aes_gcm_enc_192_update_avx512 /
3280;       aes_gcm_enc_256_update_avx512
3281;       (const struct gcm_key_data *key_data,
3282;        struct gcm_context_data *context_data,
3283;        u8       *out,
3284;        const u8 *in,
3285;        u64      plaintext_len);
3286;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3287MKGLOBAL(FN_NAME(enc,_update_),function,)
3288FN_NAME(enc,_update_):
3289
3290        FUNC_SAVE
3291
3292%ifdef SAFE_PARAM
3293        ;; Check key_data != NULL
3294        cmp     arg1, 0
3295        jz      exit_update_enc
3296
3297        ;; Check context_data != NULL
3298        cmp     arg2, 0
3299        jz      exit_update_enc
3300
3301        ;; Check if plaintext_len == 0
3302        cmp     arg5, 0
3303        jz      skip_in_out_check_update_enc
3304
3305        ;; Check out != NULL (plaintext_len != 0)
3306        cmp     arg3, 0
3307        jz      exit_update_enc
3308
3309        ;; Check in != NULL (plaintext_len != 0)
3310        cmp     arg4, 0
3311        jz      exit_update_enc
3312
3313skip_in_out_check_update_enc:
3314%endif
3315        GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC, multi_call
3316
3317exit_update_enc:
3318        FUNC_RESTORE
3319
3320        ret
3321
3322
3323;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3324;void   aes_gcm_dec_128_update_avx512 / aes_gcm_dec_192_update_avx512 /
3325;       aes_gcm_dec_256_update_avx512
3326;       (const struct gcm_key_data *key_data,
3327;        struct gcm_context_data *context_data,
3328;        u8       *out,
3329;        const u8 *in,
3330;        u64      plaintext_len);
3331;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3332MKGLOBAL(FN_NAME(dec,_update_),function,)
3333FN_NAME(dec,_update_):
3334
3335        FUNC_SAVE
3336
3337%ifdef SAFE_PARAM
3338        ;; Check key_data != NULL
3339        cmp     arg1, 0
3340        jz      exit_update_dec
3341
3342        ;; Check context_data != NULL
3343        cmp     arg2, 0
3344        jz      exit_update_dec
3345
3346        ;; Check if plaintext_len == 0
3347        cmp     arg5, 0
3348        jz      skip_in_out_check_update_dec
3349
3350        ;; Check out != NULL (plaintext_len != 0)
3351        cmp     arg3, 0
3352        jz      exit_update_dec
3353
3354        ;; Check in != NULL (plaintext_len != 0)
3355        cmp     arg4, 0
3356        jz      exit_update_dec
3357
3358skip_in_out_check_update_dec:
3359%endif
3360
3361        GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC, multi_call
3362
3363exit_update_dec:
3364        FUNC_RESTORE
3365        ret
3366
3367;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3368;void   aes_gcm_enc_128_finalize_avx512 / aes_gcm_enc_192_finalize_avx512 /
;       aes_gcm_enc_256_finalize_avx512
3370;       (const struct gcm_key_data *key_data,
3371;        struct gcm_context_data *context_data,
3372;        u8       *auth_tag,
3373;        u64      auth_tag_len);
3374;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3375MKGLOBAL(FN_NAME(enc,_finalize_),function,)
3376FN_NAME(enc,_finalize_):
3377
3378;; All parameters are passed through registers
3379%ifdef SAFE_PARAM
3380        ;; Check key_data != NULL
3381        cmp     arg1, 0
3382        jz      exit_enc_fin
3383
3384        ;; Check context_data != NULL
3385        cmp     arg2, 0
3386        jz      exit_enc_fin
3387
3388        ;; Check auth_tag != NULL
3389        cmp     arg3, 0
3390        jz      exit_enc_fin
3391
3392        ;; Check auth_tag_len == 0 or > 16
3393        cmp     arg4, 0
3394        jz      exit_enc_fin
3395
3396        cmp     arg4, 16
3397        ja      exit_enc_fin
3398%endif
3399
3400        push r12
3401
3402%ifidn __OUTPUT_FORMAT__, win64
        ; xmm6:xmm15 need to be maintained for Windows
        sub     rsp, 7*16
        vmovdqu [rsp + 0*16], xmm6
        vmovdqu [rsp + 1*16], xmm9
        vmovdqu [rsp + 2*16], xmm10
        vmovdqu [rsp + 3*16], xmm11
        vmovdqu [rsp + 4*16], xmm13
        vmovdqu [rsp + 5*16], xmm14
        vmovdqu [rsp + 6*16], xmm15
3412%endif
3413        GCM_COMPLETE    arg1, arg2, arg3, arg4, multi_call
3414
3415%ifdef SAFE_DATA
3416        clear_scratch_gps_asm
3417        clear_scratch_zmms_asm
3418%endif
3419%ifidn __OUTPUT_FORMAT__, win64
        vmovdqu xmm15, [rsp + 6*16]
        vmovdqu xmm14, [rsp + 5*16]
        vmovdqu xmm13, [rsp + 4*16]
        vmovdqu xmm11, [rsp + 3*16]
        vmovdqu xmm10, [rsp + 2*16]
        vmovdqu xmm9,  [rsp + 1*16]
        vmovdqu xmm6,  [rsp + 0*16]
3427        add     rsp, 7*16
3428%endif
3429
3430        pop r12
3431
3432exit_enc_fin:
        ret
3434
3435
3436;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;void   aes_gcm_dec_128_finalize_avx512 / aes_gcm_dec_192_finalize_avx512 /
;       aes_gcm_dec_256_finalize_avx512
3439;       (const struct gcm_key_data *key_data,
3440;        struct gcm_context_data *context_data,
3441;        u8       *auth_tag,
3442;        u64      auth_tag_len);
3443;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3444MKGLOBAL(FN_NAME(dec,_finalize_),function,)
3445FN_NAME(dec,_finalize_):
3446
3447;; All parameters are passed through registers
3448%ifdef SAFE_PARAM
3449        ;; Check key_data != NULL
3450        cmp     arg1, 0
3451        jz      exit_dec_fin
3452
3453        ;; Check context_data != NULL
3454        cmp     arg2, 0
3455        jz      exit_dec_fin
3456
3457        ;; Check auth_tag != NULL
3458        cmp     arg3, 0
3459        jz      exit_dec_fin
3460
3461        ;; Check auth_tag_len == 0 or > 16
3462        cmp     arg4, 0
3463        jz      exit_dec_fin
3464
3465        cmp     arg4, 16
3466        ja      exit_dec_fin
3467%endif
3468
3469        push r12
3470
3471%ifidn __OUTPUT_FORMAT__, win64
        ; xmm6:xmm15 need to be maintained for Windows
        sub     rsp, 7*16
        vmovdqu [rsp + 0*16], xmm6
        vmovdqu [rsp + 1*16], xmm9
        vmovdqu [rsp + 2*16], xmm10
        vmovdqu [rsp + 3*16], xmm11
        vmovdqu [rsp + 4*16], xmm13
        vmovdqu [rsp + 5*16], xmm14
        vmovdqu [rsp + 6*16], xmm15
3481%endif
3482        GCM_COMPLETE    arg1, arg2, arg3, arg4, multi_call
3483
3484%ifdef SAFE_DATA
3485        clear_scratch_gps_asm
3486        clear_scratch_zmms_asm
3487%endif
3488%ifidn __OUTPUT_FORMAT__, win64
        vmovdqu xmm15, [rsp + 6*16]
        vmovdqu xmm14, [rsp + 5*16]
        vmovdqu xmm13, [rsp + 4*16]
        vmovdqu xmm11, [rsp + 3*16]
        vmovdqu xmm10, [rsp + 2*16]
        vmovdqu xmm9,  [rsp + 1*16]
        vmovdqu xmm6,  [rsp + 0*16]
3496        add     rsp, 7*16
3497%endif
3498
3499        pop r12
3500exit_dec_fin:
3501
3502        ret
3503
3504
3505;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3506;void   aes_gcm_enc_128_avx512 / aes_gcm_enc_192_avx512 / aes_gcm_enc_256_avx512
3507;       (const struct gcm_key_data *key_data,
3508;        struct gcm_context_data *context_data,
3509;        u8       *out,
3510;        const u8 *in,
3511;        u64      plaintext_len,
3512;        u8       *iv,
3513;        const u8 *aad,
3514;        u64      aad_len,
3515;        u8       *auth_tag,
3516;        u64      auth_tag_len);
3517;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
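; Single-shot usage from C (hypothetical driver code; the 16-byte tag length
; is an assumption):
;
;     aes_gcm_enc_128_avx512(&key, &ctx, out, in, len,
;                            iv, aad, aad_len, tag, 16);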
3518MKGLOBAL(FN_NAME(enc,_),function,)
3519FN_NAME(enc,_):
3520
3521        FUNC_SAVE
3522
3523%ifdef SAFE_PARAM
3524        ;; Check key_data != NULL
3525        cmp     arg1, 0
3526        jz      exit_enc
3527
3528        ;; Check context_data != NULL
3529        cmp     arg2, 0
3530        jz      exit_enc
3531
3532        ;; Check IV != NULL
3533        cmp     arg6, 0
3534        jz      exit_enc
3535
3536        ;; Check auth_tag != NULL
3537        cmp     arg9, 0
3538        jz      exit_enc
3539
3540        ;; Check auth_tag_len == 0 or > 16
3541        cmp     arg10, 0
3542        jz      exit_enc
3543
3544        cmp     arg10, 16
3545        ja      exit_enc
3546
3547        ;; Check if plaintext_len == 0
3548        cmp     arg5, 0
3549        jz      skip_in_out_check_enc
3550
3551        ;; Check out != NULL (plaintext_len != 0)
3552        cmp     arg3, 0
3553        jz      exit_enc
3554
3555        ;; Check in != NULL (plaintext_len != 0)
3556        cmp     arg4, 0
3557        jz      exit_enc
3558
3559skip_in_out_check_enc:
3560        ;; Check if aad_len == 0
3561        cmp     arg8, 0
3562        jz      skip_aad_check_enc
3563
3564        ;; Check aad != NULL (aad_len != 0)
3565        cmp     arg7, 0
3566        jz      exit_enc
3567
3568skip_aad_check_enc:
3569%endif
3570        GCM_INIT arg1, arg2, arg6, arg7, arg8, r10, r11, r12
3571
3572        GCM_ENC_DEC  arg1, arg2, arg3, arg4, arg5, ENC, single_call
3573
3574        GCM_COMPLETE arg1, arg2, arg9, arg10, single_call
3575
3576exit_enc:
3577        FUNC_RESTORE
3578
3579        ret
3580
3581;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3582;void   aes_gcm_dec_128_avx512 / aes_gcm_dec_192_avx512 / aes_gcm_dec_256_avx512
3583;       (const struct gcm_key_data *key_data,
3584;        struct gcm_context_data *context_data,
3585;        u8       *out,
3586;        const u8 *in,
3587;        u64      plaintext_len,
3588;        u8       *iv,
3589;        const u8 *aad,
3590;        u64      aad_len,
3591;        u8       *auth_tag,
3592;        u64      auth_tag_len);
3593;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3594MKGLOBAL(FN_NAME(dec,_),function,)
3595FN_NAME(dec,_):
3596
3597        FUNC_SAVE
3598
3599%ifdef SAFE_PARAM
3600        ;; Check key_data != NULL
3601        cmp     arg1, 0
3602        jz      exit_dec
3603
3604        ;; Check context_data != NULL
3605        cmp     arg2, 0
3606        jz      exit_dec
3607
3608        ;; Check IV != NULL
3609        cmp     arg6, 0
3610        jz      exit_dec
3611
3612        ;; Check auth_tag != NULL
3613        cmp     arg9, 0
3614        jz      exit_dec
3615
3616        ;; Check auth_tag_len == 0 or > 16
3617        cmp     arg10, 0
3618        jz      exit_dec
3619
3620        cmp     arg10, 16
3621        ja      exit_dec
3622
3623        ;; Check if plaintext_len == 0
3624        cmp     arg5, 0
3625        jz      skip_in_out_check_dec
3626
3627        ;; Check out != NULL (plaintext_len != 0)
3628        cmp     arg3, 0
3629        jz      exit_dec
3630
3631        ;; Check in != NULL (plaintext_len != 0)
3632        cmp     arg4, 0
3633        jz      exit_dec
3634
3635skip_in_out_check_dec:
3636        ;; Check if aad_len == 0
3637        cmp     arg8, 0
3638        jz      skip_aad_check_dec
3639
3640        ;; Check aad != NULL (aad_len != 0)
3641        cmp     arg7, 0
3642        jz      exit_dec
3643
3644skip_aad_check_dec:
3645%endif
3646
3647        GCM_INIT arg1, arg2, arg6, arg7, arg8, r10, r11, r12
3648
3649        GCM_ENC_DEC  arg1, arg2, arg3, arg4, arg5, DEC, single_call
3650
3651        GCM_COMPLETE arg1, arg2, arg9, arg10, single_call
3652
3653exit_dec:
3654        FUNC_RESTORE
3655
3656        ret
3657
3658;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3659;void   aes_gcm_enc_var_iv_128_avx512 / aes_gcm_enc_var_iv_192_avx512 /
3660;       aes_gcm_enc_var_iv_256_avx512
3661;       (const struct gcm_key_data *key_data,
3662;        struct gcm_context_data *context_data,
3663;        u8        *out,
3664;        const u8  *in,
3665;        u64       plaintext_len,
3666;        u8        *iv,
3667;        const u64 iv_len,
3668;        const u8  *aad,
3669;        const u64 aad_len,
3670;        u8        *auth_tag,
3671;        const u64 auth_tag_len);
3672;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3673MKGLOBAL(FN_NAME(enc_var_iv,_),function,)
3674FN_NAME(enc_var_iv,_):
3675
3676        FUNC_SAVE
3677
3678%ifdef SAFE_PARAM
3679        ;; Check key_data != NULL
3680        cmp     arg1, 0
3681        jz      exit_enc_IV
3682
3683        ;; Check context_data != NULL
3684        cmp     arg2, 0
3685        jz      exit_enc_IV
3686
3687        ;; Check IV != NULL
3688        cmp     arg6, 0
3689        jz      exit_enc_IV
3690
3691        ;; Check IV len != 0
3692        cmp     arg7, 0
3693        jz      exit_enc_IV
3694
3695        ;; Check auth_tag != NULL
3696        cmp     arg10, 0
3697        jz      exit_enc_IV
3698
3699        ;; Check auth_tag_len == 0 or > 16
3700        cmp     arg11, 0
3701        jz      exit_enc_IV
3702
3703        cmp     arg11, 16
3704        ja      exit_enc_IV
3705
3706        ;; Check if plaintext_len == 0
3707        cmp     arg5, 0
3708        jz      skip_in_out_check_enc_IV
3709
3710        ;; Check out != NULL (plaintext_len != 0)
3711        cmp     arg3, 0
3712        jz      exit_enc_IV
3713
3714        ;; Check in != NULL (plaintext_len != 0)
3715        cmp     arg4, 0
3716        jz      exit_enc_IV
3717
3718skip_in_out_check_enc_IV:
3719        ;; Check if aad_len == 0
3720        cmp     arg9, 0
3721        jz      skip_aad_check_enc_IV
3722
3723        ;; Check aad != NULL (aad_len != 0)
3724        cmp     arg8, 0
3725        jz      exit_enc_IV
3726
3727skip_aad_check_enc_IV:
3728%endif
3729        cmp     arg7, 12
3730        je      iv_len_12_enc_IV
3731
        GCM_INIT arg1, arg2, arg6, arg8, arg9, r10, r11, r12, arg7
        jmp     skip_iv_len_12_enc_IV

iv_len_12_enc_IV:
        GCM_INIT arg1, arg2, arg6, arg8, arg9, r10, r11, r12
3737
3738skip_iv_len_12_enc_IV:
3739        GCM_ENC_DEC  arg1, arg2, arg3, arg4, arg5, ENC, single_call
3740
3741        GCM_COMPLETE arg1, arg2, arg10, arg11, single_call
3742
3743exit_enc_IV:
3744        FUNC_RESTORE
3745
3746        ret
3747
3748;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3749;void   aes_gcm_dec_var_iv_128_avx512 / aes_gcm_dec_var_iv_192_avx512 /
3750;       aes_gcm_dec_var_iv_256_avx512
3751;       (const struct gcm_key_data *key_data,
3752;        struct gcm_context_data *context_data,
3753;        u8        *out,
3754;        const u8  *in,
3755;        u64       plaintext_len,
3756;        u8        *iv,
3757;        const u64 iv_len,
3758;        const u8  *aad,
3759;        const u64 aad_len,
3760;        u8        *auth_tag,
3761;        const u64 auth_tag_len);
3762;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3763MKGLOBAL(FN_NAME(dec_var_iv,_),function,)
3764FN_NAME(dec_var_iv,_):
3765
3766        FUNC_SAVE
3767
3768%ifdef SAFE_PARAM
3769        ;; Check key_data != NULL
3770        cmp     arg1, 0
3771        jz      exit_dec_IV
3772
3773        ;; Check context_data != NULL
3774        cmp     arg2, 0
3775        jz      exit_dec_IV
3776
3777        ;; Check IV != NULL
3778        cmp     arg6, 0
3779        jz      exit_dec_IV
3780
3781        ;; Check IV len != 0
3782        cmp     arg7, 0
3783        jz      exit_dec_IV
3784
3785        ;; Check auth_tag != NULL
3786        cmp     arg10, 0
3787        jz      exit_dec_IV
3788
3789        ;; Check auth_tag_len == 0 or > 16
3790        cmp     arg11, 0
3791        jz      exit_dec_IV
3792
3793        cmp     arg11, 16
3794        ja      exit_dec_IV
3795
3796        ;; Check if plaintext_len == 0
3797        cmp     arg5, 0
3798        jz      skip_in_out_check_dec_IV
3799
3800        ;; Check out != NULL (plaintext_len != 0)
3801        cmp     arg3, 0
3802        jz      exit_dec_IV
3803
3804        ;; Check in != NULL (plaintext_len != 0)
3805        cmp     arg4, 0
3806        jz      exit_dec_IV
3807
3808skip_in_out_check_dec_IV:
3809        ;; Check if aad_len == 0
3810        cmp     arg9, 0
3811        jz      skip_aad_check_dec_IV
3812
3813        ;; Check aad != NULL (aad_len != 0)
3814        cmp     arg8, 0
3815        jz      exit_dec_IV
3816
3817skip_aad_check_dec_IV:
3818%endif
3819        cmp     arg7, 12
3820        je      iv_len_12_dec_IV
3821
        GCM_INIT arg1, arg2, arg6, arg8, arg9, r10, r11, r12, arg7
3823        jmp     skip_iv_len_12_dec_IV
3824
3825iv_len_12_dec_IV:
3826        GCM_INIT arg1, arg2, arg6, arg8, arg9, r10, r11, r12
3827
3828skip_iv_len_12_dec_IV:
3829        GCM_ENC_DEC  arg1, arg2, arg3, arg4, arg5, DEC, single_call
3830        GCM_COMPLETE arg1, arg2, arg10, arg11, single_call
3831
3832exit_dec_IV:
3833        FUNC_RESTORE
3834        ret
3835
3836%ifdef GCM128_MODE
3837;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;void   ghash_avx512
;       (const struct gcm_key_data *key_data,
;        const void   *in,
;        const u64    in_len,
;        void         *tag,
;        const u64    tag_len);
3844;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
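; ghash_avx512 computes a raw GHASH over 'in' (C sketch of the math;
; gf128_mul is a hypothetical helper):
;
;     Y = 0;
;     for (each 16-byte block X_i, last block zero-padded)
;             Y = gf128_mul(Y ^ X_i, H);    /* H taken from key_data     */
;     /* Y, byte-swapped, is stored to 'tag' truncated to tag_len */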
3845MKGLOBAL(ghash_avx512,function,)
3846ghash_avx512:
3847
3848        FUNC_SAVE
3849
3850%ifdef SAFE_PARAM
3851        ;; Check key_data != NULL
3852        cmp     arg1, 0
3853        jz      exit_ghash
3854
3855        ;; Check in != NULL
3856        cmp     arg2, 0
3857        jz      exit_ghash
3858
3859        ;; Check in_len != 0
3860        cmp     arg3, 0
3861        jz      exit_ghash
3862
3863        ;; Check tag != NULL
3864        cmp     arg4, 0
3865        jz      exit_ghash
3866
3867        ;; Check tag_len != 0
3868        cmp     arg5, 0
3869        jz      exit_ghash
3870%endif
3871
3872        vpxor   xmm0, xmm0
3873        CALC_AAD_HASH arg2, arg3, xmm0, arg1, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, \
3874                      r10, r11, r12
3875
3876        vpshufb xmm0, [rel SHUF_MASK] ; perform a 16Byte swap
3877
3878        simd_store_avx arg4, xmm0, arg5, r12, rax
3879
3880exit_ghash:
3881        FUNC_RESTORE
3882
3883        ret
3884%endif
3885
3886;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; PARTIAL_BLOCK_GMAC: Handles a partial tag block between GMAC update calls.
; Requires the input data to be at least 1 byte long.
; Input: gcm_context_data (GDATA_CTX), input text (PLAIN_IN), hash subkey (HASH_SUBKEY),
; input text length (PLAIN_LEN).
; Output: Updated GDATA_CTX and DATA_OFFSET (number of input bytes consumed)
; Clobbers rax, r10, r12, r13, r15, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm9, xmm10, xmm11
3893;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3894%macro PARTIAL_BLOCK_GMAC       6
3895%define %%GDATA_CTX             %1
3896%define %%PLAIN_IN              %2
3897%define %%PLAIN_LEN             %3
3898%define %%DATA_OFFSET           %4
3899%define %%AAD_HASH              %5
3900%define %%HASH_SUBKEY           %6
3901
        ; Leave the macro if there is no partial block
        mov     r13, [%%GDATA_CTX + PBlockLen]
        cmp     r13, 0
        je      %%_partial_block_done
3906
        ; Read the input data without over-reading
        cmp     %%PLAIN_LEN, 16
        jl      %%_fewer_than_16_bytes
        ; If there are at least 16 bytes of data, just fill the xmm register
        VXLDR   xmm1, [%%PLAIN_IN]
        jmp     %%_data_read

%%_fewer_than_16_bytes:
        lea     r10, [%%PLAIN_IN]
        READ_SMALL_DATA_INPUT   xmm1, r10, %%PLAIN_LEN, rax
3917
3918        ; Finished reading in data
3919%%_data_read:
3920
        lea     r12, [rel SHIFT_MASK]
        ; Adjust the shuffle mask pointer so the data can be shifted by r13 bytes
        ; (16-r13 is the number of bytes in the plaintext mod 16)
        add     r12, r13
        ; Get the appropriate shuffle mask
        vmovdqu xmm2, [r12]
        vmovdqa xmm3, xmm1

        ; Set r15 to the amount of data left in PLAIN_IN after filling the block
        mov     r15, %%PLAIN_LEN
        add     r15, r13
        sub     r15, 16
        ; Determine if the partial block is not being filled and shift the mask accordingly
        jge     %%_no_extra_mask_1
        sub     r12, r15
%%_no_extra_mask_1:

        ; Get the appropriate mask to mask out the bottom r13 bytes of xmm3
        vmovdqu xmm1, [r12 + ALL_F-SHIFT_MASK]

        vpand   xmm3, xmm1
        vpshufb xmm3, [rel SHUF_MASK]
        vpshufb xmm3, xmm2
        vpxor   %%AAD_HASH, xmm3
3945
        cmp     r15, 0
        jl      %%_partial_incomplete_1

        ; GHASH computation for the last <16 byte block
        GHASH_MUL       %%AAD_HASH, %%HASH_SUBKEY, xmm0, xmm10, xmm11, xmm5, xmm6
        xor     rax, rax
        mov     [%%GDATA_CTX + PBlockLen], rax
        jmp     %%_ghash_done
3954%%_partial_incomplete_1:
3955%ifidn __OUTPUT_FORMAT__, win64
3956        mov     rax, %%PLAIN_LEN
3957        add     [%%GDATA_CTX + PBlockLen], rax
3958%else
3959        add     [%%GDATA_CTX + PBlockLen], %%PLAIN_LEN
3960%endif
3961%%_ghash_done:
        vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH
3963
3964        cmp     r15, 0
3965        jl      %%_partial_fill
3966
3967        mov     r12, 16
3968        ; Set r12 to be the number of bytes to skip after this macro
3969        sub     r12, r13
3970
        jmp     %%_offset_set
%%_partial_fill:
        mov     r12, %%PLAIN_LEN
%%_offset_set:
        mov     %%DATA_OFFSET, r12
3976%%_partial_block_done:
3977%endmacro ; PARTIAL_BLOCK_GMAC
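; In C terms the macro above implements roughly the following bookkeeping
; (illustrative sketch; note the macro folds the partial bytes directly into
; AadHash rather than keeping a separate byte buffer):
;
;     need = 16 - ctx->pblocklen;          /* bytes to complete the block */
;     take = min(need, plain_len);
;     absorb(ctx, in, take);               /* masked XOR into AadHash     */
;     if (ctx->pblocklen + take == 16) {   /* block complete              */
;             aad_hash = gf128_mul(aad_hash, H);
;             ctx->pblocklen = 0;
;     } else {
;             ctx->pblocklen += take;
;     }
;     offset = take;                       /* bytes consumed from 'in'    */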
3978
3979;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;void   imb_aes_gmac_update_128_avx512 / imb_aes_gmac_update_192_avx512 /
;       imb_aes_gmac_update_256_avx512
;       (const struct gcm_key_data *key_data,
;        struct gcm_context_data *context_data,
;        const u8  *in,
;        const u64 plaintext_len);
3986;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
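; The update entry point below splits the input three ways: fill a previous
; partial block, hash the largest 16-byte multiple via CALC_AAD_HASH, then
; fold the remainder into AadHash and record it in PBlockLen. Roughly, in C
; (illustrative sketch; the helper names are hypothetical):
;
;     consumed = partial_block_fill(ctx, in, len);  /* PARTIAL_BLOCK_GMAC */
;     in += consumed; len -= consumed;
;     full = len & ~15ULL;                          /* multiple of 16     */
;     if (full)
;             hash_blocks(ctx, in, full);           /* CALC_AAD_HASH      */
;     stash_remainder(ctx, in + full, len & 15);    /* next PBlockLen     */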
3987MKGLOBAL(GMAC_FN_NAME(update),function,)
3988GMAC_FN_NAME(update):
3989
        FUNC_SAVE

        ;; Check if plaintext_len == 0
        cmp     arg4, 0
        je      exit_gmac_update
3995
3996%ifdef SAFE_PARAM
3997        ;; Check key_data != NULL
3998        cmp     arg1, 0
3999        jz      exit_gmac_update
4000
4001        ;; Check context_data != NULL
4002        cmp     arg2, 0
4003        jz      exit_gmac_update
4004
4005        ;; Check in != NULL (plaintext_len != 0)
4006        cmp     arg3, 0
4007        jz      exit_gmac_update
4008%endif
4009
        ; Increment the stored AAD length for GMAC
        add     [arg2 + AadLen], arg4
4012
        ;; Deal with a previous partial block
        xor     r11, r11
        vmovdqu xmm13, [arg1 + HashKey]
        vmovdqu xmm8, [arg2 + AadHash]

        PARTIAL_BLOCK_GMAC arg2, arg3, arg4, r11, xmm8, xmm13
4019
        ; CALC_AAD_HASH can only process a multiple of 16 bytes
4021        sub     arg4, r11
4022        add     arg3, r11
4023
4024        vmovq   xmm7, arg4 ; Save remaining length
4025        and     arg4, -16 ; Get multiple of 16 bytes
4026
4027        or      arg4, arg4
4028        jz      no_full_blocks
4029
4030        ;; Calculate GHASH of this segment
4031        CALC_AAD_HASH arg3, arg4, xmm8, arg1, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, \
4032                      r10, r11, r12
        vmovdqu [arg2 + AadHash], xmm8  ; ctx_data.aad_hash = aad_hash
4034
4035no_full_blocks:
4036        add     arg3, arg4 ; Point at partial block
4037
4038        vmovq   arg4, xmm7 ; Restore original remaining length
4039        and     arg4, 15
4040        jz      exit_gmac_update
4041
4042        ; Save next partial block
        mov     [arg2 + PBlockLen], arg4
4044        READ_SMALL_DATA_INPUT xmm1, arg3, arg4, r11
4045        vpshufb xmm1, [rel SHUF_MASK]
4046        vpxor   xmm8, xmm1
4047        vmovdqu [arg2 + AadHash], xmm8
4048
4049exit_gmac_update:
        FUNC_RESTORE

        ret
4053
4054%ifdef LINUX
4055section .note.GNU-stack noalloc noexec nowrite progbits
4056%endif
4057