;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2018-2020, Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
;     * Redistributions of source code must retain the above copyright
;       notice, this list of conditions and the following disclaimer.
;     * Redistributions in binary form must reproduce the above copyright
;       notice, this list of conditions and the following disclaimer in
;       the documentation and/or other materials provided with the
;       distribution.
;     * Neither the name of Intel Corporation nor the names of its
;       contributors may be used to endorse or promote products derived
;       from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;
; Authors:
;       Erdinc Ozturk
;       Vinodh Gopal
;       James Guilford
;       Tomasz Kantecki
;
;
; References:
;       This code was derived and highly optimized from the code described in the paper:
;               Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation on Intel Architecture Processors. August, 2010.
;       The details of the implementation are explained in:
;               Erdinc Ozturk et al. Enabling High-Performance Galois-Counter-Mode on Intel Architecture Processors. October, 2012.
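;
; Background (informational sketch of the overall AES-GCM flow, added for
; orientation; not part of the original notes):
;       The payload is encrypted with AES in counter mode, starting from the
;       counter block Y0 built from the IV layout below.  AAD and ciphertext
;       are authenticated with GHASH over GF(2^128), using H = E(K, 0^128):
;
;               S = A1*H^(m+n+1) + ... + Am*H^(n+2) + C1*H^(n+1) + ... + Cn*H^2 + (len(A)||len(C))*H
;               T = E(K, Y0) XOR S
;
;       where A1..Am are the zero-padded AAD blocks and C1..Cn the zero-padded
;       ciphertext blocks.  The hash key powers H^2..H^8 precomputed further
;       down allow up to eight GHASH multiplications to share one reduction.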
;
;
;
;
; Assumptions:
;
;
;
; iv:
;       0                   1                   2                   3
;       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
;       |                        Salt  (From the SA)                    |
;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
;       |                     Initialization Vector                     |
;       |         (This is the sequence number from IPSec header)       |
;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
;       |                              0x1                              |
;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
;
;
;
; AAD:
;       AAD will be padded with 0 to the next 16-byte multiple
;       for example, assume AAD is a u32 vector
;
;       if AAD is 8 bytes:
;       AAD[3] = {A0, A1};
;       padded AAD in xmm register = {A1 A0 0 0}
;
;       0                   1                   2                   3
;       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
;       |                            SPI (A1)                           |
;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
;       |                   32-bit Sequence Number (A0)                 |
;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
;       |                              0x0                              |
;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
;
;       AAD Format with 32-bit Sequence Number
;
;       if AAD is 12 bytes:
;       AAD[3] = {A0, A1, A2};
;       padded AAD in xmm register = {A2 A1 A0 0}
;
;       0                   1                   2                   3
;       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
;       |                            SPI (A2)                           |
;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
;       |              64-bit Extended Sequence Number {A1,A0}          |
;       |                                                               |
;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
;       |                              0x0                              |
;       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
;
;       AAD Format with 64-bit Extended Sequence Number
;
;
; aadLen:
;       Must be a multiple of 4 bytes, as required by the definition in the spec.
;       The code additionally supports aadLen of any length.
;
; TLen:
;       From the definition of the spec, TLen can only be 8, 12 or 16 bytes.
;
; poly = x^128 + x^127 + x^126 + x^121 + 1
; Throughout the code, one-tab and two-tab indentations are used: one tab is
; for the GHASH part, two tabs are for the AES part.
;

%include "include/os.asm"
%include "include/reg_sizes.asm"
%include "include/clear_regs.asm"
%include "include/gcm_defines.asm"
%include "include/gcm_keys_avx2_avx512.asm"

%include "mb_mgr_datastruct.asm"
%include "imb_job.asm"
%include "include/memcpy.asm"

%ifndef GCM128_MODE
%ifndef GCM192_MODE
%ifndef GCM256_MODE
%error "No GCM mode selected for gcm_avx512.asm!"
%endif
%endif
%endif

;; Decide on AES-GCM key size to compile for
%ifdef GCM128_MODE
%define NROUNDS 9
%define FN_NAME(x,y) aes_gcm_ %+ x %+ _128 %+ y %+ avx512
%define GMAC_FN_NAME(x) imb_aes_gmac_ %+ x %+ _128_ %+ avx512
%endif

%ifdef GCM192_MODE
%define NROUNDS 11
%define FN_NAME(x,y) aes_gcm_ %+ x %+ _192 %+ y %+ avx512
%define GMAC_FN_NAME(x) imb_aes_gmac_ %+ x %+ _192_ %+ avx512
%endif

%ifdef GCM256_MODE
%define NROUNDS 13
%define FN_NAME(x,y) aes_gcm_ %+ x %+ _256 %+ y %+ avx512
%define GMAC_FN_NAME(x) imb_aes_gmac_ %+ x %+ _256_ %+ avx512
%endif

section .text
default rel

; need to push 4 registers into stack to maintain
%define STACK_OFFSET    8*4

%ifidn __OUTPUT_FORMAT__, win64
        %define XMM_STORAGE     16*10
%else
        %define XMM_STORAGE     0
%endif

%define TMP2    16*0    ; Temporary storage for AES State 2 (State 1 is stored in an XMM register)
%define TMP3    16*1    ; Temporary storage for AES State 3
%define TMP4    16*2    ; Temporary storage for AES State 4
%define TMP5    16*3    ; Temporary storage for AES State 5
%define TMP6    16*4    ; Temporary storage for AES State 6
%define TMP7    16*5    ; Temporary storage for AES State 7
%define TMP8    16*6    ; Temporary storage for AES State 8
%define LOCAL_STORAGE   16*7
%define VARIABLE_OFFSET LOCAL_STORAGE + XMM_STORAGE

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Utility Macros
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
; Input: A and B (128-bits each, bit-reflected)
; Output: C = A*B*x mod poly, (i.e. >>1 )
; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
; GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
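;
; Informational sketch of the steps below (an added note, assuming the
; bit-reflected operand representation described above):
;       With A = [a1:a0] and B = [b1:b0] as 64-bit halves, the carry-less
;       product is
;               A*B = a1*b1*x^128 + (a1*b0 + a0*b1)*x^64 + a0*b0
;       The four vpclmulqdq instructions form these partial products, and the
;       vpslldq/vpsrldq pair splits the middle term across the 256-bit result
;       held in %%T1 (high 128 bits) and %%GH (low 128 bits).  The two
;       POLY2-based phases then fold the high half back into the low half
;       modulo x^128 + x^127 + x^126 + x^121 + 1, leaving the result in %%GH.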
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%macro  GHASH_MUL  7
%define %%GH %1         ; 16 Bytes
%define %%HK %2         ; 16 Bytes
%define %%T1 %3
%define %%T2 %4
%define %%T3 %5
%define %%T4 %6
%define %%T5 %7
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

        vpclmulqdq      %%T1, %%GH, %%HK, 0x11  ; %%T1 = a1*b1
        vpclmulqdq      %%T2, %%GH, %%HK, 0x00  ; %%T2 = a0*b0
        vpclmulqdq      %%T3, %%GH, %%HK, 0x01  ; %%T3 = a1*b0
        vpclmulqdq      %%GH, %%GH, %%HK, 0x10  ; %%GH = a0*b1
        vpxor           %%GH, %%GH, %%T3


        vpsrldq         %%T3, %%GH, 8           ; shift-R %%GH 2 DWs
        vpslldq         %%GH, %%GH, 8           ; shift-L %%GH 2 DWs

        vpxor           %%T1, %%T1, %%T3
        vpxor           %%GH, %%GH, %%T2

        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ;first phase of the reduction
        vmovdqu         %%T3, [rel POLY2]

        vpclmulqdq      %%T2, %%T3, %%GH, 0x01
        vpslldq         %%T2, %%T2, 8           ; shift-L %%T2 2 DWs

        vpxor           %%GH, %%GH, %%T2        ; first phase of the reduction complete
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ;second phase of the reduction
        vpclmulqdq      %%T2, %%T3, %%GH, 0x00
        vpsrldq         %%T2, %%T2, 4           ; shift-R %%T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)

        vpclmulqdq      %%GH, %%T3, %%GH, 0x10
        vpslldq         %%GH, %%GH, 4           ; shift-L %%GH 1 DW (Shift-L 1-DW to obtain result with no shifts)

        vpxor           %%GH, %%GH, %%T2        ; second phase of the reduction complete
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        vpxor           %%GH, %%GH, %%T1        ; the result is in %%GH
%endmacro


; In PRECOMPUTE, the commands filling Hashkey_i_k are not required for avx512
; functions, but are kept to allow users to switch cpu architectures between calls
; of pre, init, update, and finalize.
%macro  PRECOMPUTE 8
%define %%GDATA %1
%define %%HK    %2
%define %%T1    %3
%define %%T2    %4
%define %%T3    %5
%define %%T4    %6
%define %%T5    %7
%define %%T6    %8

        ; Hashkey_i_k holds XORed values of the low and high parts of the Hashkey_i
        vmovdqa %%T5, %%HK

        GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2     ;  %%T5 = HashKey^2<<1 mod poly
        vmovdqu  [%%GDATA + HashKey_2], %%T5                   ;  [HashKey_2] = HashKey^2<<1 mod poly

        GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2     ;  %%T5 = HashKey^3<<1 mod poly
        vmovdqu  [%%GDATA + HashKey_3], %%T5

        GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2     ;  %%T5 = HashKey^4<<1 mod poly
        vmovdqu  [%%GDATA + HashKey_4], %%T5

        GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2     ;  %%T5 = HashKey^5<<1 mod poly
        vmovdqu  [%%GDATA + HashKey_5], %%T5

        GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2     ;  %%T5 = HashKey^6<<1 mod poly
        vmovdqu  [%%GDATA + HashKey_6], %%T5

        GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2     ;  %%T5 = HashKey^7<<1 mod poly
        vmovdqu  [%%GDATA + HashKey_7], %%T5

        GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2     ;  %%T5 = HashKey^8<<1 mod poly
        vmovdqu  [%%GDATA + HashKey_8], %%T5
%endmacro


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; READ_SMALL_DATA_INPUT: Packs xmm register with data when data input is less than 16 bytes.
; Returns 0 if data has length 0.
; Input: The input data (INPUT), that data's length (LENGTH).
; Output: The packed xmm register (OUTPUT).
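;
; Illustrative use (an added sketch, kept as a comment so it is not assembled;
; r10/r12/r11 below are hypothetical choices for the INPUT, LENGTH and TMP1
; arguments).  Assuming byte_len_to_mask_table maps a length L to the 16-bit
; mask (1 << L) - 1, as the word indexing below implies, a 5-byte read is:
;
;       mov     r12, 5
;       READ_SMALL_DATA_INPUT xmm2, r10, r12, r11
;
; which loads k1 with 0x001F and zero-extends the 5 bytes at [r10] into xmm2.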
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%macro READ_SMALL_DATA_INPUT    4
%define %%OUTPUT        %1 ; %%OUTPUT is an xmm register
%define %%INPUT         %2
%define %%LENGTH        %3
%define %%TMP1          %4

        lea     %%TMP1, [rel byte_len_to_mask_table]
%ifidn __OUTPUT_FORMAT__, win64
        add     %%TMP1, %%LENGTH
        add     %%TMP1, %%LENGTH
        kmovw   k1, [%%TMP1]
%else
        kmovw   k1, [%%TMP1 + %%LENGTH*2]
%endif
        vmovdqu8        XWORD(%%OUTPUT){k1}{z}, [%%INPUT]

%endmacro ; READ_SMALL_DATA_INPUT


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
; Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY).
; Output: The hash of the data (AAD_HASH).
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%macro  CALC_AAD_HASH   13
%define %%A_IN          %1
%define %%A_LEN         %2
%define %%AAD_HASH      %3
%define %%GDATA_KEY     %4
%define %%XTMP0         %5      ; xmm temp reg 0
%define %%XTMP1         %6      ; xmm temp reg 1
%define %%XTMP2         %7
%define %%XTMP3         %8
%define %%XTMP4         %9
%define %%XTMP5         %10     ; xmm temp reg 5
%define %%T1            %11     ; temp reg 1
%define %%T2            %12
%define %%T3            %13


        mov     %%T1, %%A_IN            ; T1 = AAD
        mov     %%T2, %%A_LEN           ; T2 = aadLen

%%_get_AAD_loop128:
        cmp     %%T2, 128
        jl      %%_exit_AAD_loop128

        vmovdqu         %%XTMP0, [%%T1 + 16*0]
        vpshufb         %%XTMP0, [rel SHUF_MASK]

        vpxor           %%XTMP0, %%AAD_HASH

        vmovdqu         %%XTMP5, [%%GDATA_KEY + HashKey_8]
        vpclmulqdq      %%XTMP1, %%XTMP0, %%XTMP5, 0x11 ; %%T1 = a1*b1
        vpclmulqdq      %%XTMP2, %%XTMP0, %%XTMP5, 0x00 ; %%T2 = a0*b0
        vpclmulqdq      %%XTMP3, %%XTMP0, %%XTMP5, 0x01 ; %%T3 = a1*b0
        vpclmulqdq      %%XTMP4, %%XTMP0, %%XTMP5, 0x10 ; %%T4 = a0*b1
        vpxor           %%XTMP3, %%XTMP3, %%XTMP4       ; %%T3 = a1*b0 + a0*b1

%assign i 1
%assign j 7
%rep 7
        vmovdqu         %%XTMP0, [%%T1 + 16*i]
        vpshufb         %%XTMP0, [rel SHUF_MASK]

        vmovdqu         %%XTMP5, [%%GDATA_KEY + HashKey_ %+ j]
        vpclmulqdq      %%XTMP4, %%XTMP0, %%XTMP5, 0x11 ; %%T1 = T1 + a1*b1
        vpxor           %%XTMP1, %%XTMP1, %%XTMP4

        vpclmulqdq      %%XTMP4, %%XTMP0, %%XTMP5, 0x00 ; %%T2 = T2 + a0*b0
        vpxor           %%XTMP2, %%XTMP2, %%XTMP4

        vpclmulqdq      %%XTMP4, %%XTMP0, %%XTMP5, 0x01 ; %%T3 = T3 + a1*b0 + a0*b1
        vpxor           %%XTMP3, %%XTMP3, %%XTMP4
        vpclmulqdq      %%XTMP4, %%XTMP0, %%XTMP5, 0x10
        vpxor           %%XTMP3, %%XTMP3, %%XTMP4
%assign i (i + 1)
%assign j (j - 1)
%endrep

        vpslldq         %%XTMP4, %%XTMP3, 8     ; shift-L 2 DWs
        vpsrldq         %%XTMP3, %%XTMP3, 8     ; shift-R 2 DWs
        vpxor           %%XTMP2, %%XTMP2, %%XTMP4
        vpxor           %%XTMP1, %%XTMP1, %%XTMP3       ; accumulate the results in %%T1(M):%%T2(L)

        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ;first phase of the reduction
        vmovdqa         %%XTMP5, [rel POLY2]
        vpclmulqdq      %%XTMP0, %%XTMP5, %%XTMP2, 0x01
        vpslldq         %%XTMP0, %%XTMP0, 8     ; shift-L xmm2 2 DWs
        vpxor           %%XTMP2, %%XTMP2, %%XTMP0       ; first phase of the reduction complete

        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ;second phase of the reduction
        vpclmulqdq      %%XTMP3, %%XTMP5, %%XTMP2, 0x00
        vpsrldq         %%XTMP3, %%XTMP3, 4     ; shift-R 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)

        vpclmulqdq      %%XTMP4, %%XTMP5, %%XTMP2, 0x10
        vpslldq         %%XTMP4, %%XTMP4, 4     ; shift-L 1 DW (Shift-L 1-DW to obtain result with no shifts)

        vpxor           %%XTMP4, %%XTMP4, %%XTMP3       ; second phase of the reduction complete
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        vpxor           %%AAD_HASH, %%XTMP1, %%XTMP4    ; the result is in %%T1

        sub     %%T2, 128
        je      %%_CALC_AAD_done

        add     %%T1, 128
        jmp     %%_get_AAD_loop128

%%_exit_AAD_loop128:
        cmp     %%T2, 16
        jl      %%_get_small_AAD_block

        ;; calculate hash_key position to start with
        mov     %%T3, %%T2
        and     %%T3, -16       ; 1 to 7 blocks possible here
        neg     %%T3
        add     %%T3, HashKey_1 + 16
        lea     %%T3, [%%GDATA_KEY + %%T3]

        vmovdqu         %%XTMP0, [%%T1]
        vpshufb         %%XTMP0, [rel SHUF_MASK]

        vpxor           %%XTMP0, %%AAD_HASH

        vmovdqu         %%XTMP5, [%%T3]
        vpclmulqdq      %%XTMP1, %%XTMP0, %%XTMP5, 0x11 ; %%T1 = a1*b1
        vpclmulqdq      %%XTMP2, %%XTMP0, %%XTMP5, 0x00 ; %%T2 = a0*b0
        vpclmulqdq      %%XTMP3, %%XTMP0, %%XTMP5, 0x01 ; %%T3 = a1*b0
        vpclmulqdq      %%XTMP4, %%XTMP0, %%XTMP5, 0x10 ; %%T4 = a0*b1
        vpxor           %%XTMP3, %%XTMP3, %%XTMP4       ; %%T3 = a1*b0 + a0*b1

        add     %%T3, 16        ; move to next hashkey
        add     %%T1, 16        ; move to next data block
        sub     %%T2, 16
        cmp     %%T2, 16
        jl      %%_AAD_reduce

%%_AAD_blocks:
        vmovdqu         %%XTMP0, [%%T1]
        vpshufb         %%XTMP0, [rel SHUF_MASK]

        vmovdqu         %%XTMP5, [%%T3]
        vpclmulqdq      %%XTMP4, %%XTMP0, %%XTMP5, 0x11 ; %%T1 = T1 + a1*b1
        vpxor           %%XTMP1, %%XTMP1, %%XTMP4

        vpclmulqdq      %%XTMP4, %%XTMP0, %%XTMP5, 0x00 ; %%T2 = T2 + a0*b0
        vpxor           %%XTMP2, %%XTMP2, %%XTMP4

        vpclmulqdq      %%XTMP4, %%XTMP0, %%XTMP5, 0x01 ; %%T3 = T3 + a1*b0 + a0*b1
        vpxor           %%XTMP3, %%XTMP3, %%XTMP4
        vpclmulqdq      %%XTMP4, %%XTMP0, %%XTMP5, 0x10
        vpxor           %%XTMP3, %%XTMP3, %%XTMP4

        add     %%T3, 16        ; move to next hashkey
        add     %%T1, 16
        sub     %%T2, 16
        cmp     %%T2, 16
        jl      %%_AAD_reduce
        jmp     %%_AAD_blocks

%%_AAD_reduce:
        vpslldq         %%XTMP4, %%XTMP3, 8     ; shift-L 2 DWs
        vpsrldq         %%XTMP3, %%XTMP3, 8     ; shift-R 2 DWs
        vpxor           %%XTMP2, %%XTMP2, %%XTMP4
        vpxor           %%XTMP1, %%XTMP1, %%XTMP3       ; accumulate the results in %%T1(M):%%T2(L)

        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ;first phase of the reduction
        vmovdqa         %%XTMP5, [rel POLY2]
        vpclmulqdq      %%XTMP0, %%XTMP5, %%XTMP2, 0x01
        vpslldq         %%XTMP0, %%XTMP0, 8     ; shift-L xmm2 2 DWs
        vpxor           %%XTMP2, %%XTMP2, %%XTMP0       ; first phase of the reduction complete

        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ;second phase of the reduction
        vpclmulqdq      %%XTMP3, %%XTMP5, %%XTMP2, 0x00
        vpsrldq         %%XTMP3, %%XTMP3, 4     ; shift-R 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)

        vpclmulqdq      %%XTMP4, %%XTMP5, %%XTMP2, 0x10
        vpslldq         %%XTMP4, %%XTMP4, 4     ; shift-L 1 DW (Shift-L 1-DW to obtain result with no shifts)

        vpxor           %%XTMP4, %%XTMP4, %%XTMP3       ; second phase of the reduction complete
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        vpxor           %%AAD_HASH, %%XTMP1, %%XTMP4    ; the result is in %%T1

        or      %%T2, %%T2
        je      %%_CALC_AAD_done

%%_get_small_AAD_block:
        vmovdqu         %%XTMP0, [%%GDATA_KEY + HashKey]
        READ_SMALL_DATA_INPUT   %%XTMP1, %%T1, %%T2, %%T3
        ;byte-reflect the AAD data
        vpshufb         %%XTMP1, [rel SHUF_MASK]
        vpxor           %%AAD_HASH, %%XTMP1
        GHASH_MUL       %%AAD_HASH, %%XTMP0, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5

%%_CALC_AAD_done:

%endmacro ; CALC_AAD_HASH
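;
; Informational sketch of CALC_AAD_HASH above (an added note): AAD is consumed
; in 128-byte strides, pairing the oldest block with HashKey_8 and the newest
; with HashKey_1 so that eight carry-less multiplications share one reduction:
;
;       HASH' = (HASH + A1)*H^8 + A2*H^7 + ... + A7*H^2 + A8*H^1
;
; which equals the serial ((((HASH + A1)*H + A2)*H + ...) + A8)*H.  Any
; remaining whole 16-byte blocks start at HashKey_N (N = number of blocks
; left), and a final sub-16-byte piece is zero-padded by READ_SMALL_DATA_INPUT
; and multiplied in with GHASH_MUL.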

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks between update calls.
; Requires the input data be at least 1 byte long.
; Input: gcm_context_data *(GDATA_CTX), input text (PLAIN_CYPH_IN),
; input text length (PLAIN_CYPH_LEN), the current data offset (DATA_OFFSET),
; the hash subkey (HASH_SUBKEY) and whether encoding or decoding (ENC_DEC)
; Output: A cypher of the first partial block (CYPH_PLAIN_OUT), and updated GDATA_CTX
; Clobbers rax, r10, r12, r13, r15, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm9, xmm10, xmm11
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%macro PARTIAL_BLOCK    8
%define %%GDATA_CTX             %1
%define %%CYPH_PLAIN_OUT        %2
%define %%PLAIN_CYPH_IN         %3
%define %%PLAIN_CYPH_LEN        %4
%define %%DATA_OFFSET           %5
%define %%AAD_HASH              %6
%define %%HASH_SUBKEY           %7
%define %%ENC_DEC               %8

        mov     r13, [%%GDATA_CTX + PBlockLen]
        cmp     r13, 0
        je      %%_partial_block_done           ;Leave Macro if no partial blocks

        cmp     %%PLAIN_CYPH_LEN, 16            ;Read in input data without over reading
        jl      %%_fewer_than_16_bytes
        VXLDR   xmm1, [%%PLAIN_CYPH_IN]         ;If more than 16 bytes of data, just fill the xmm register
        jmp     %%_data_read

%%_fewer_than_16_bytes:
        lea     r10, [%%PLAIN_CYPH_IN]
        READ_SMALL_DATA_INPUT   xmm1, r10, %%PLAIN_CYPH_LEN, rax

%%_data_read:                                   ;Finished reading in data

        vmovdqu xmm9, [%%GDATA_CTX + PBlockEncKey]      ;xmm9 = my_ctx_data.partial_block_enc_key

        lea     r12, [rel SHIFT_MASK]

        add     r12, r13                        ; adjust the shuffle mask pointer to be able to shift r13 bytes (16-r13 is the number of bytes in plaintext mod 16)
        vmovdqu xmm2, [r12]                     ; get the appropriate shuffle mask
        vpshufb xmm9, xmm2                      ;shift right r13 bytes

%ifidn  %%ENC_DEC, DEC
        vmovdqa xmm3, xmm1
%endif
        vpxor   xmm9, xmm1                      ; Cyphertext XOR E(K, Yn)

        mov     r15, %%PLAIN_CYPH_LEN
        add     r15, r13
        sub     r15, 16                         ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block
        jge     %%_no_extra_mask                ;Determine if partial block is not being filled and shift mask accordingly
        sub     r12, r15
%%_no_extra_mask:

        vmovdqu xmm1, [r12 + ALL_F - SHIFT_MASK]        ; get the appropriate mask to mask out bottom r13 bytes of xmm9
        vpand   xmm9, xmm1                              ; mask out bottom r13 bytes of xmm9

%ifidn  %%ENC_DEC, DEC
        vpand   xmm3, xmm1
        vpshufb xmm3, [rel SHUF_MASK]
        vpshufb xmm3, xmm2
        vpxor   %%AAD_HASH, xmm3
%else
        vpshufb xmm9, [rel SHUF_MASK]
        vpshufb xmm9, xmm2
        vpxor   %%AAD_HASH, xmm9
%endif
        cmp     r15, 0
        jl      %%_partial_incomplete

        GHASH_MUL       %%AAD_HASH, %%HASH_SUBKEY, xmm0, xmm10, xmm11, xmm5, xmm6       ;GHASH computation for the last <16 Byte block
        xor     rax, rax
        mov     [%%GDATA_CTX + PBlockLen], rax
        jmp     %%_enc_dec_done
%%_partial_incomplete:
%ifidn __OUTPUT_FORMAT__, win64
        mov     rax, %%PLAIN_CYPH_LEN
        add     [%%GDATA_CTX + PBlockLen], rax
%else
        add     [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN
%endif
%%_enc_dec_done:
        vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH

%ifidn  %%ENC_DEC, ENC
        vpshufb xmm9, [rel SHUF_MASK]   ; shuffle xmm9 back to output as ciphertext
        vpshufb xmm9, xmm2
%endif

        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ; output encrypted Bytes
        cmp     r15, 0
        jl      %%_partial_fill
        mov     r12, r13
        mov     r13, 16
        sub     r13, r12                        ; Set r13 to be the number of bytes to write out
572 jmp %%_count_set 573%%_partial_fill: 574 mov r13, %%PLAIN_CYPH_LEN 575%%_count_set: 576 lea rax, [rel byte_len_to_mask_table] 577 kmovw k1, [rax + r13*2] 578 vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET]{k1}, xmm9 579 add %%DATA_OFFSET, r13 580%%_partial_block_done: 581%endmacro ; PARTIAL_BLOCK 582 583 584%macro GHASH_SINGLE_MUL 9 585%define %%GDATA %1 586%define %%HASHKEY %2 587%define %%CIPHER %3 588%define %%STATE_11 %4 589%define %%STATE_00 %5 590%define %%STATE_MID %6 591%define %%T1 %7 592%define %%T2 %8 593%define %%FIRST %9 594 595 vmovdqu %%T1, [%%GDATA + %%HASHKEY] 596%ifidn %%FIRST, first 597 vpclmulqdq %%STATE_11, %%CIPHER, %%T1, 0x11 ; %%T4 = a1*b1 598 vpclmulqdq %%STATE_00, %%CIPHER, %%T1, 0x00 ; %%T4_2 = a0*b0 599 vpclmulqdq %%STATE_MID, %%CIPHER, %%T1, 0x01 ; %%T6 = a1*b0 600 vpclmulqdq %%T2, %%CIPHER, %%T1, 0x10 ; %%T5 = a0*b1 601 vpxor %%STATE_MID, %%STATE_MID, %%T2 602%else 603 vpclmulqdq %%T2, %%CIPHER, %%T1, 0x11 604 vpxor %%STATE_11, %%STATE_11, %%T2 605 606 vpclmulqdq %%T2, %%CIPHER, %%T1, 0x00 607 vpxor %%STATE_00, %%STATE_00, %%T2 608 609 vpclmulqdq %%T2, %%CIPHER, %%T1, 0x01 610 vpxor %%STATE_MID, %%STATE_MID, %%T2 611 612 vpclmulqdq %%T2, %%CIPHER, %%T1, 0x10 613 vpxor %%STATE_MID, %%STATE_MID, %%T2 614%endif 615 616%endmacro 617 618; if a = number of total plaintext bytes 619; b = floor(a/16) 620; %%num_initial_blocks = b mod 8; 621; encrypt the initial %%num_initial_blocks blocks and apply ghash on the ciphertext 622; %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r14 are used as a pointer only, not modified. 623; Updated AAD_HASH is returned in %%T3 624 625%macro INITIAL_BLOCKS 23 626%define %%GDATA_KEY %1 627%define %%CYPH_PLAIN_OUT %2 628%define %%PLAIN_CYPH_IN %3 629%define %%LENGTH %4 630%define %%DATA_OFFSET %5 631%define %%num_initial_blocks %6 ; can be 0, 1, 2, 3, 4, 5, 6 or 7 632%define %%T1 %7 633%define %%T2 %8 634%define %%T3 %9 635%define %%T4 %10 636%define %%T5 %11 637%define %%CTR %12 638%define %%XMM1 %13 639%define %%XMM2 %14 640%define %%XMM3 %15 641%define %%XMM4 %16 642%define %%XMM5 %17 643%define %%XMM6 %18 644%define %%XMM7 %19 645%define %%XMM8 %20 646%define %%T6 %21 647%define %%T_key %22 648%define %%ENC_DEC %23 649 650%assign i (8-%%num_initial_blocks) 651 ;; Move AAD_HASH to temp reg 652 vmovdqu %%T2, %%XMM8 653 ;; Start AES for %%num_initial_blocks blocks 654 ;; vmovdqu %%CTR, [%%GDATA_CTX + CurCount] ; %%CTR = Y0 655 656%assign i (9-%%num_initial_blocks) 657%rep %%num_initial_blocks 658 vpaddd %%CTR, %%CTR, [rel ONE] ; INCR Y0 659 vmovdqa reg(i), %%CTR 660 vpshufb reg(i), [rel SHUF_MASK] ; perform a 16Byte swap 661%assign i (i+1) 662%endrep 663 664%if(%%num_initial_blocks>0) 665vmovdqu %%T_key, [%%GDATA_KEY+16*0] 666%assign i (9-%%num_initial_blocks) 667%rep %%num_initial_blocks 668 vpxor reg(i),reg(i),%%T_key 669%assign i (i+1) 670%endrep 671 672%assign j 1 673%rep NROUNDS 674vmovdqu %%T_key, [%%GDATA_KEY+16*j] 675%assign i (9-%%num_initial_blocks) 676%rep %%num_initial_blocks 677 vaesenc reg(i),%%T_key 678%assign i (i+1) 679%endrep 680 681%assign j (j+1) 682%endrep 683 684 685vmovdqu %%T_key, [%%GDATA_KEY+16*j] 686%assign i (9-%%num_initial_blocks) 687%rep %%num_initial_blocks 688 vaesenclast reg(i),%%T_key 689%assign i (i+1) 690%endrep 691 692%endif ; %if(%%num_initial_blocks>0) 693 694 695 696%assign i (9-%%num_initial_blocks) 697%rep %%num_initial_blocks 698 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET] 699 vpxor reg(i), reg(i), %%T1 700 ;; Write back ciphertext for %%num_initial_blocks blocks 701 VXSTR [%%CYPH_PLAIN_OUT 
+ %%DATA_OFFSET], reg(i) 702 add %%DATA_OFFSET, 16 703 %ifidn %%ENC_DEC, DEC 704 vmovdqa reg(i), %%T1 705 %endif 706 ;; Prepare ciphertext for GHASH computations 707 vpshufb reg(i), [rel SHUF_MASK] 708%assign i (i+1) 709%endrep 710 711 712;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 713 714%assign i (9-%%num_initial_blocks) 715%if(%%num_initial_blocks>0) 716 vmovdqa %%T3, reg(i) 717%assign i (i+1) 718%endif 719%if %%num_initial_blocks>1 720%rep %%num_initial_blocks-1 721 vmovdqu [rsp + TMP %+ i], reg(i) 722%assign i (i+1) 723%endrep 724%endif 725 726 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 727 ;; Prepare 8 counter blocks and perform rounds of AES cipher on 728 ;; them, load plain/cipher text and store cipher/plain text. 729 ;; Stitch GHASH computation in between AES rounds. 730 vpaddd %%XMM1, %%CTR, [rel ONE] ; INCR Y0 731 vpaddd %%XMM2, %%CTR, [rel TWO] ; INCR Y0 732 vpaddd %%XMM3, %%XMM1, [rel TWO] ; INCR Y0 733 vpaddd %%XMM4, %%XMM2, [rel TWO] ; INCR Y0 734 vpaddd %%XMM5, %%XMM3, [rel TWO] ; INCR Y0 735 vpaddd %%XMM6, %%XMM4, [rel TWO] ; INCR Y0 736 vpaddd %%XMM7, %%XMM5, [rel TWO] ; INCR Y0 737 vpaddd %%XMM8, %%XMM6, [rel TWO] ; INCR Y0 738 vmovdqa %%CTR, %%XMM8 739 740 vpshufb %%XMM1, [rel SHUF_MASK] ; perform a 16Byte swap 741 vpshufb %%XMM2, [rel SHUF_MASK] ; perform a 16Byte swap 742 vpshufb %%XMM3, [rel SHUF_MASK] ; perform a 16Byte swap 743 vpshufb %%XMM4, [rel SHUF_MASK] ; perform a 16Byte swap 744 vpshufb %%XMM5, [rel SHUF_MASK] ; perform a 16Byte swap 745 vpshufb %%XMM6, [rel SHUF_MASK] ; perform a 16Byte swap 746 vpshufb %%XMM7, [rel SHUF_MASK] ; perform a 16Byte swap 747 vpshufb %%XMM8, [rel SHUF_MASK] ; perform a 16Byte swap 748 749 vmovdqu %%T_key, [%%GDATA_KEY+16*0] 750 vpxor %%XMM1, %%XMM1, %%T_key 751 vpxor %%XMM2, %%XMM2, %%T_key 752 vpxor %%XMM3, %%XMM3, %%T_key 753 vpxor %%XMM4, %%XMM4, %%T_key 754 vpxor %%XMM5, %%XMM5, %%T_key 755 vpxor %%XMM6, %%XMM6, %%T_key 756 vpxor %%XMM7, %%XMM7, %%T_key 757 vpxor %%XMM8, %%XMM8, %%T_key 758 759%assign i (8-%%num_initial_blocks) 760%assign j (9-%%num_initial_blocks) 761%assign k (%%num_initial_blocks) 762 763%define %%T4_2 %%T4 764%if(%%num_initial_blocks>0) 765 ;; Hash in AES state 766 ;; T2 - incoming AAD hash 767 vpxor %%T2, %%T3 768 769 ;; GDATA, HASHKEY, CIPHER, 770 ;; STATE_11, STATE_00, STATE_MID, T1, T2 771 GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \ 772 %%T1, %%T4, %%T6, %%T5, %%T3, first 773%endif 774 775 vmovdqu %%T_key, [%%GDATA_KEY+16*1] 776 vaesenc %%XMM1, %%T_key 777 vaesenc %%XMM2, %%T_key 778 vaesenc %%XMM3, %%T_key 779 vaesenc %%XMM4, %%T_key 780 vaesenc %%XMM5, %%T_key 781 vaesenc %%XMM6, %%T_key 782 vaesenc %%XMM7, %%T_key 783 vaesenc %%XMM8, %%T_key 784 785 vmovdqu %%T_key, [%%GDATA_KEY+16*2] 786 vaesenc %%XMM1, %%T_key 787 vaesenc %%XMM2, %%T_key 788 vaesenc %%XMM3, %%T_key 789 vaesenc %%XMM4, %%T_key 790 vaesenc %%XMM5, %%T_key 791 vaesenc %%XMM6, %%T_key 792 vaesenc %%XMM7, %%T_key 793 vaesenc %%XMM8, %%T_key 794 795%assign i (i+1) 796%assign j (j+1) 797%assign k (k-1) 798%if(%%num_initial_blocks>1) 799 ;; GDATA, HASHKEY, CIPHER, 800 ;; STATE_11, STATE_00, STATE_MID, T1, T2 801 vmovdqu %%T2, [rsp + TMP %+ j] 802 GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \ 803 %%T1, %%T4, %%T6, %%T5, %%T3, not_first 804%endif 805 806 vmovdqu %%T_key, [%%GDATA_KEY+16*3] 807 vaesenc %%XMM1, %%T_key 808 vaesenc %%XMM2, %%T_key 809 vaesenc %%XMM3, %%T_key 810 vaesenc %%XMM4, %%T_key 811 vaesenc %%XMM5, %%T_key 812 vaesenc %%XMM6, %%T_key 813 vaesenc %%XMM7, 
%%T_key 814 vaesenc %%XMM8, %%T_key 815 816 vmovdqu %%T_key, [%%GDATA_KEY+16*4] 817 vaesenc %%XMM1, %%T_key 818 vaesenc %%XMM2, %%T_key 819 vaesenc %%XMM3, %%T_key 820 vaesenc %%XMM4, %%T_key 821 vaesenc %%XMM5, %%T_key 822 vaesenc %%XMM6, %%T_key 823 vaesenc %%XMM7, %%T_key 824 vaesenc %%XMM8, %%T_key 825 826%assign i (i+1) 827%assign j (j+1) 828%assign k (k-1) 829%if(%%num_initial_blocks>2) 830 ;; GDATA, HASHKEY, CIPHER, 831 ;; STATE_11, STATE_00, STATE_MID, T1, T2 832 vmovdqu %%T2, [rsp + TMP %+ j] 833 GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \ 834 %%T1, %%T4, %%T6, %%T5, %%T3, not_first 835%endif 836 837%assign i (i+1) 838%assign j (j+1) 839%assign k (k-1) 840%if(%%num_initial_blocks>3) 841 ;; GDATA, HASHKEY, CIPHER, 842 ;; STATE_11, STATE_00, STATE_MID, T1, T2 843 vmovdqu %%T2, [rsp + TMP %+ j] 844 GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \ 845 %%T1, %%T4, %%T6, %%T5, %%T3, not_first 846%endif 847 848 vmovdqu %%T_key, [%%GDATA_KEY+16*5] 849 vaesenc %%XMM1, %%T_key 850 vaesenc %%XMM2, %%T_key 851 vaesenc %%XMM3, %%T_key 852 vaesenc %%XMM4, %%T_key 853 vaesenc %%XMM5, %%T_key 854 vaesenc %%XMM6, %%T_key 855 vaesenc %%XMM7, %%T_key 856 vaesenc %%XMM8, %%T_key 857 858 vmovdqu %%T_key, [%%GDATA_KEY+16*6] 859 vaesenc %%XMM1, %%T_key 860 vaesenc %%XMM2, %%T_key 861 vaesenc %%XMM3, %%T_key 862 vaesenc %%XMM4, %%T_key 863 vaesenc %%XMM5, %%T_key 864 vaesenc %%XMM6, %%T_key 865 vaesenc %%XMM7, %%T_key 866 vaesenc %%XMM8, %%T_key 867 868%assign i (i+1) 869%assign j (j+1) 870%assign k (k-1) 871%if(%%num_initial_blocks>4) 872 ;; GDATA, HASHKEY, CIPHER, 873 ;; STATE_11, STATE_00, STATE_MID, T1, T2 874 vmovdqu %%T2, [rsp + TMP %+ j] 875 GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \ 876 %%T1, %%T4, %%T6, %%T5, %%T3, not_first 877%endif 878 879 vmovdqu %%T_key, [%%GDATA_KEY+16*7] 880 vaesenc %%XMM1, %%T_key 881 vaesenc %%XMM2, %%T_key 882 vaesenc %%XMM3, %%T_key 883 vaesenc %%XMM4, %%T_key 884 vaesenc %%XMM5, %%T_key 885 vaesenc %%XMM6, %%T_key 886 vaesenc %%XMM7, %%T_key 887 vaesenc %%XMM8, %%T_key 888 889 vmovdqu %%T_key, [%%GDATA_KEY+16*8] 890 vaesenc %%XMM1, %%T_key 891 vaesenc %%XMM2, %%T_key 892 vaesenc %%XMM3, %%T_key 893 vaesenc %%XMM4, %%T_key 894 vaesenc %%XMM5, %%T_key 895 vaesenc %%XMM6, %%T_key 896 vaesenc %%XMM7, %%T_key 897 vaesenc %%XMM8, %%T_key 898 899%assign i (i+1) 900%assign j (j+1) 901%assign k (k-1) 902%if(%%num_initial_blocks>5) 903 ;; GDATA, HASHKEY, CIPHER, 904 ;; STATE_11, STATE_00, STATE_MID, T1, T2 905 vmovdqu %%T2, [rsp + TMP %+ j] 906 GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \ 907 %%T1, %%T4, %%T6, %%T5, %%T3, not_first 908%endif 909 910 vmovdqu %%T_key, [%%GDATA_KEY+16*9] 911 vaesenc %%XMM1, %%T_key 912 vaesenc %%XMM2, %%T_key 913 vaesenc %%XMM3, %%T_key 914 vaesenc %%XMM4, %%T_key 915 vaesenc %%XMM5, %%T_key 916 vaesenc %%XMM6, %%T_key 917 vaesenc %%XMM7, %%T_key 918 vaesenc %%XMM8, %%T_key 919 920%ifndef GCM128_MODE 921 vmovdqu %%T_key, [%%GDATA_KEY+16*10] 922 vaesenc %%XMM1, %%T_key 923 vaesenc %%XMM2, %%T_key 924 vaesenc %%XMM3, %%T_key 925 vaesenc %%XMM4, %%T_key 926 vaesenc %%XMM5, %%T_key 927 vaesenc %%XMM6, %%T_key 928 vaesenc %%XMM7, %%T_key 929 vaesenc %%XMM8, %%T_key 930%endif 931 932%assign i (i+1) 933%assign j (j+1) 934%assign k (k-1) 935%if(%%num_initial_blocks>6) 936 ;; GDATA, HASHKEY, CIPHER, 937 ;; STATE_11, STATE_00, STATE_MID, T1, T2 938 vmovdqu %%T2, [rsp + TMP %+ j] 939 GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \ 940 %%T1, %%T4, %%T6, %%T5, %%T3, not_first 941%endif 942 943%ifdef GCM128_MODE 944 vmovdqu 
%%T_key, [%%GDATA_KEY+16*10] 945 vaesenclast %%XMM1, %%T_key 946 vaesenclast %%XMM2, %%T_key 947 vaesenclast %%XMM3, %%T_key 948 vaesenclast %%XMM4, %%T_key 949 vaesenclast %%XMM5, %%T_key 950 vaesenclast %%XMM6, %%T_key 951 vaesenclast %%XMM7, %%T_key 952 vaesenclast %%XMM8, %%T_key 953%endif 954 955%ifdef GCM192_MODE 956 vmovdqu %%T_key, [%%GDATA_KEY+16*11] 957 vaesenc %%XMM1, %%T_key 958 vaesenc %%XMM2, %%T_key 959 vaesenc %%XMM3, %%T_key 960 vaesenc %%XMM4, %%T_key 961 vaesenc %%XMM5, %%T_key 962 vaesenc %%XMM6, %%T_key 963 vaesenc %%XMM7, %%T_key 964 vaesenc %%XMM8, %%T_key 965 966 vmovdqu %%T_key, [%%GDATA_KEY+16*12] 967 vaesenclast %%XMM1, %%T_key 968 vaesenclast %%XMM2, %%T_key 969 vaesenclast %%XMM3, %%T_key 970 vaesenclast %%XMM4, %%T_key 971 vaesenclast %%XMM5, %%T_key 972 vaesenclast %%XMM6, %%T_key 973 vaesenclast %%XMM7, %%T_key 974 vaesenclast %%XMM8, %%T_key 975%endif 976%ifdef GCM256_MODE 977 vmovdqu %%T_key, [%%GDATA_KEY+16*11] 978 vaesenc %%XMM1, %%T_key 979 vaesenc %%XMM2, %%T_key 980 vaesenc %%XMM3, %%T_key 981 vaesenc %%XMM4, %%T_key 982 vaesenc %%XMM5, %%T_key 983 vaesenc %%XMM6, %%T_key 984 vaesenc %%XMM7, %%T_key 985 vaesenc %%XMM8, %%T_key 986 987 vmovdqu %%T_key, [%%GDATA_KEY+16*12] 988 vaesenc %%XMM1, %%T_key 989 vaesenc %%XMM2, %%T_key 990 vaesenc %%XMM3, %%T_key 991 vaesenc %%XMM4, %%T_key 992 vaesenc %%XMM5, %%T_key 993 vaesenc %%XMM6, %%T_key 994 vaesenc %%XMM7, %%T_key 995 vaesenc %%XMM8, %%T_key 996%endif 997 998%assign i (i+1) 999%assign j (j+1) 1000%assign k (k-1) 1001%if(%%num_initial_blocks>7) 1002 ;; GDATA, HASHKEY, CIPHER, 1003 ;; STATE_11, STATE_00, STATE_MID, T1, T2 1004 vmovdqu %%T2, [rsp + TMP %+ j] 1005 GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \ 1006 %%T1, %%T4, %%T6, %%T5, %%T3, not_first 1007%endif 1008 1009%ifdef GCM256_MODE ; GCM256 1010 vmovdqu %%T_key, [%%GDATA_KEY+16*13] 1011 vaesenc %%XMM1, %%T_key 1012 vaesenc %%XMM2, %%T_key 1013 vaesenc %%XMM3, %%T_key 1014 vaesenc %%XMM4, %%T_key 1015 vaesenc %%XMM5, %%T_key 1016 vaesenc %%XMM6, %%T_key 1017 vaesenc %%XMM7, %%T_key 1018 vaesenc %%XMM8, %%T_key 1019 1020 vmovdqu %%T_key, [%%GDATA_KEY+16*14] 1021 vaesenclast %%XMM1, %%T_key 1022 vaesenclast %%XMM2, %%T_key 1023 vaesenclast %%XMM3, %%T_key 1024 vaesenclast %%XMM4, %%T_key 1025 vaesenclast %%XMM5, %%T_key 1026 vaesenclast %%XMM6, %%T_key 1027 vaesenclast %%XMM7, %%T_key 1028 vaesenclast %%XMM8, %%T_key 1029%endif ; GCM256 mode 1030 1031%if(%%num_initial_blocks>0) 1032 vpsrldq %%T3, %%T6, 8 ; shift-R %%T2 2 DWs 1033 vpslldq %%T6, %%T6, 8 ; shift-L %%T3 2 DWs 1034 vpxor %%T1, %%T1, %%T3 ; accumulate the results in %%T1:%%T4 1035 vpxor %%T4, %%T6, %%T4 1036 1037 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1038 ; First phase of the reduction 1039 vmovdqu %%T3, [rel POLY2] 1040 1041 vpclmulqdq %%T2, %%T3, %%T4, 0x01 1042 vpslldq %%T2, %%T2, 8 ; shift-L xmm2 2 DWs 1043 1044 ;; First phase of the reduction complete 1045 vpxor %%T4, %%T4, %%T2 1046 1047 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1048 ; Second phase of the reduction 1049 vpclmulqdq %%T2, %%T3, %%T4, 0x00 1050 ;; Shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) 1051 vpsrldq %%T2, %%T2, 4 1052 1053 vpclmulqdq %%T4, %%T3, %%T4, 0x10 1054 ;; Shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts) 1055 vpslldq %%T4, %%T4, 4 1056 ;; Second phase of the reduction complete 1057 vpxor %%T4, %%T4, %%T2 1058 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1059 ; The result is in 
%%T3 1060 vpxor %%T3, %%T1, %%T4 1061%else 1062 ;; The hash should end up in T3 1063 vmovdqa %%T3, %%T2 1064%endif 1065 1066 ;; Final hash is now in T3 1067%if %%num_initial_blocks > 0 1068 ;; NOTE: obsolete in case %%num_initial_blocks = 0 1069 sub %%LENGTH, 16*%%num_initial_blocks 1070%endif 1071 1072 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*0] 1073 vpxor %%XMM1, %%XMM1, %%T1 1074 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*0], %%XMM1 1075 %ifidn %%ENC_DEC, DEC 1076 vmovdqa %%XMM1, %%T1 1077 %endif 1078 1079 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*1] 1080 vpxor %%XMM2, %%XMM2, %%T1 1081 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*1], %%XMM2 1082 %ifidn %%ENC_DEC, DEC 1083 vmovdqa %%XMM2, %%T1 1084 %endif 1085 1086 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*2] 1087 vpxor %%XMM3, %%XMM3, %%T1 1088 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*2], %%XMM3 1089 %ifidn %%ENC_DEC, DEC 1090 vmovdqa %%XMM3, %%T1 1091 %endif 1092 1093 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*3] 1094 vpxor %%XMM4, %%XMM4, %%T1 1095 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*3], %%XMM4 1096 %ifidn %%ENC_DEC, DEC 1097 vmovdqa %%XMM4, %%T1 1098 %endif 1099 1100 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*4] 1101 vpxor %%XMM5, %%XMM5, %%T1 1102 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*4], %%XMM5 1103 %ifidn %%ENC_DEC, DEC 1104 vmovdqa %%XMM5, %%T1 1105 %endif 1106 1107 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*5] 1108 vpxor %%XMM6, %%XMM6, %%T1 1109 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*5], %%XMM6 1110 %ifidn %%ENC_DEC, DEC 1111 vmovdqa %%XMM6, %%T1 1112 %endif 1113 1114 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*6] 1115 vpxor %%XMM7, %%XMM7, %%T1 1116 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*6], %%XMM7 1117 %ifidn %%ENC_DEC, DEC 1118 vmovdqa %%XMM7, %%T1 1119 %endif 1120 1121%if %%num_initial_blocks > 0 1122 ;; NOTE: 'jl' is never taken for %%num_initial_blocks = 0 1123 ;; This macro is executed for length 128 and up, 1124 ;; zero length is checked in GCM_ENC_DEC. 1125 ;; If the last block is partial then the xor will be done later 1126 ;; in ENCRYPT_FINAL_PARTIAL_BLOCK. 1127 ;; We know it's partial if LENGTH - 16*num_initial_blocks < 128 1128 cmp %%LENGTH, 128 1129 jl %%_initial_skip_last_word_write 1130%endif 1131 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*7] 1132 vpxor %%XMM8, %%XMM8, %%T1 1133 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*7], %%XMM8 1134 %ifidn %%ENC_DEC, DEC 1135 vmovdqa %%XMM8, %%T1 1136 %endif 1137 1138 ;; Update %%LENGTH with the number of blocks processed 1139 sub %%LENGTH, 16 1140 add %%DATA_OFFSET, 16 1141%%_initial_skip_last_word_write: 1142 sub %%LENGTH, 128-16 1143 add %%DATA_OFFSET, 128-16 1144 1145 vpshufb %%XMM1, [rel SHUF_MASK] ; perform a 16Byte swap 1146 ;; Combine GHASHed value with the corresponding ciphertext 1147 vpxor %%XMM1, %%XMM1, %%T3 1148 vpshufb %%XMM2, [rel SHUF_MASK] ; perform a 16Byte swap 1149 vpshufb %%XMM3, [rel SHUF_MASK] ; perform a 16Byte swap 1150 vpshufb %%XMM4, [rel SHUF_MASK] ; perform a 16Byte swap 1151 vpshufb %%XMM5, [rel SHUF_MASK] ; perform a 16Byte swap 1152 vpshufb %%XMM6, [rel SHUF_MASK] ; perform a 16Byte swap 1153 vpshufb %%XMM7, [rel SHUF_MASK] ; perform a 16Byte swap 1154 vpshufb %%XMM8, [rel SHUF_MASK] ; perform a 16Byte swap 1155 1156;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1157 1158%%_initial_blocks_done: 1159 1160 1161%endmacro 1162 1163;;; INITIAL_BLOCKS macro with support for a partial final block. 
1164;;; num_initial_blocks is expected to include the partial final block 1165;;; in the count. 1166%macro INITIAL_BLOCKS_PARTIAL 25 1167%define %%GDATA_KEY %1 1168%define %%GDATA_CTX %2 1169%define %%CYPH_PLAIN_OUT %3 1170%define %%PLAIN_CYPH_IN %4 1171%define %%LENGTH %5 1172%define %%DATA_OFFSET %6 1173%define %%num_initial_blocks %7 ; can be 1, 2, 3, 4, 5, 6 or 7 (not 0) 1174%define %%T1 %8 1175%define %%T2 %9 1176%define %%T3 %10 ; [out] hash value 1177%define %%T4 %11 1178%define %%T5 %12 1179%define %%CTR %13 1180%define %%XMM1 %14 1181%define %%XMM2 %15 1182%define %%XMM3 %16 1183%define %%XMM4 %17 1184%define %%XMM5 %18 1185%define %%XMM6 %19 1186%define %%XMM7 %20 1187%define %%XMM8 %21 ; [in] hash value 1188%define %%T6 %22 1189%define %%T_key %23 1190%define %%ENC_DEC %24 1191%define %%INSTANCE_TYPE %25 1192 1193 ;; Move AAD_HASH to temp reg 1194 vmovdqu %%T2, %%XMM8 1195 1196%assign i (9-%%num_initial_blocks) 1197%rep %%num_initial_blocks 1198 ;; Compute AES counters 1199 vpaddd %%CTR, %%CTR, [rel ONE] ; INCR Y0 1200 vmovdqa reg(i), %%CTR 1201 vpshufb reg(i), [rel SHUF_MASK] ; perform a 16Byte swap 1202%assign i (i+1) 1203%endrep 1204 1205vmovdqu %%T_key, [%%GDATA_KEY+16*0] 1206%assign i (9-%%num_initial_blocks) 1207%rep %%num_initial_blocks 1208 ; Start AES for %%num_initial_blocks blocks 1209 vpxor reg(i),reg(i),%%T_key 1210%assign i (i+1) 1211%endrep 1212 1213%assign j 1 1214%rep NROUNDS 1215vmovdqu %%T_key, [%%GDATA_KEY+16*j] 1216%assign i (9-%%num_initial_blocks) 1217%rep %%num_initial_blocks 1218 vaesenc reg(i),%%T_key 1219%assign i (i+1) 1220%endrep 1221 1222%assign j (j+1) 1223%endrep 1224 1225 1226vmovdqu %%T_key, [%%GDATA_KEY+16*j] 1227%assign i (9-%%num_initial_blocks) 1228%rep %%num_initial_blocks 1229 vaesenclast reg(i),%%T_key 1230%assign i (i+1) 1231%endrep 1232 1233;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1234;;; Hash all but the last block of data 1235;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1236 1237%assign i (9-%%num_initial_blocks) 1238%rep %%num_initial_blocks-1 1239 ;; Encrypt the message for all but the last block 1240 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET] 1241 vpxor reg(i), reg(i), %%T1 1242 ;; write back ciphertext for %%num_initial_blocks blocks 1243 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i) 1244 add %%DATA_OFFSET, 16 1245%ifidn %%ENC_DEC, DEC 1246 vmovdqa reg(i), %%T1 1247%endif 1248 ;; Prepare ciphertext for GHASH computations 1249 vpshufb reg(i), [rel SHUF_MASK] 1250%assign i (i+1) 1251%endrep 1252 1253%if %%num_initial_blocks > 1 1254 ;; The final block of data may be <16B 1255 sub %%LENGTH, 16*(%%num_initial_blocks-1) 1256%endif 1257 1258%if %%num_initial_blocks < 8 1259 ;; NOTE: the 'jl' is always taken for num_initial_blocks = 8. 1260 ;; This is run in the context of GCM_ENC_DEC_SMALL for length < 128. 
1261 cmp %%LENGTH, 16 1262 jl %%_small_initial_partial_block 1263 1264;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1265;;; Handle a full length final block - encrypt and hash all blocks 1266;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1267 1268 sub %%LENGTH, 16 1269 mov [%%GDATA_CTX + PBlockLen], %%LENGTH 1270 1271 ;; Encrypt the message 1272 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET] 1273 vpxor reg(i), reg(i), %%T1 1274 ;; write back ciphertext for %%num_initial_blocks blocks 1275 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i) 1276 add %%DATA_OFFSET, 16 1277%ifidn %%ENC_DEC, DEC 1278 vmovdqa reg(i), %%T1 1279%endif 1280 ;; Prepare ciphertext for GHASH computations 1281 vpshufb reg(i), [rel SHUF_MASK] 1282 1283 ;; Hash all of the data 1284%assign i (8-%%num_initial_blocks) 1285%assign j (9-%%num_initial_blocks) 1286%assign k (%%num_initial_blocks) 1287%assign last_block_to_hash 0 1288 1289%if(%%num_initial_blocks>last_block_to_hash) 1290 ;; Hash in AES state 1291 vpxor %%T2, reg(j) 1292 1293 ;; T2 - incoming AAD hash 1294 ;; reg(i) holds ciphertext 1295 ;; T5 - hash key 1296 ;; T6 - updated xor 1297 ;; reg(1)/xmm1 should now be available for tmp use 1298 vmovdqu %%T5, [%%GDATA_KEY + HashKey_ %+ k] 1299 vpclmulqdq %%T1, %%T2, %%T5, 0x11 ; %%T4 = a1*b1 1300 vpclmulqdq %%T4, %%T2, %%T5, 0x00 ; %%T4 = a0*b0 1301 vpclmulqdq %%T6, %%T2, %%T5, 0x01 ; %%T6 = a1*b0 1302 vpclmulqdq %%T5, %%T2, %%T5, 0x10 ; %%T5 = a0*b1 1303 vpxor %%T6, %%T6, %%T5 1304%endif 1305 1306%assign i (i+1) 1307%assign j (j+1) 1308%assign k (k-1) 1309%assign rep_count (%%num_initial_blocks-1) 1310%rep rep_count 1311 1312 vmovdqu %%T5, [%%GDATA_KEY + HashKey_ %+ k] 1313 vpclmulqdq %%T3, reg(j), %%T5, 0x11 1314 vpxor %%T1, %%T1, %%T3 1315 1316 vpclmulqdq %%T3, reg(j), %%T5, 0x00 1317 vpxor %%T4, %%T4, %%T3 1318 1319 vpclmulqdq %%T3, reg(j), %%T5, 0x01 1320 vpxor %%T6, %%T6, %%T3 1321 1322 vpclmulqdq %%T3, reg(j), %%T5, 0x10 1323 vpxor %%T6, %%T6, %%T3 1324 1325%assign i (i+1) 1326%assign j (j+1) 1327%assign k (k-1) 1328%endrep 1329 1330 ;; Record that a reduction is needed 1331 mov r12, 1 1332 1333 jmp %%_small_initial_compute_hash 1334 1335 1336%endif ; %if %%num_initial_blocks < 8 1337 1338%%_small_initial_partial_block: 1339 1340;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1341;;; Handle ghash for a <16B final block 1342;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1343 1344 ;; In this case if it's a single call to encrypt we can 1345 ;; hash all of the data but if it's an init / update / finalize 1346 ;; series of call we need to leave the last block if it's 1347 ;; less than a full block of data. 1348 1349 mov [%%GDATA_CTX + PBlockLen], %%LENGTH 1350 vmovdqu [%%GDATA_CTX + PBlockEncKey], reg(i) 1351 ;; Handle a partial final block 1352 ;; GDATA, KEY, T1, T2 1353 ;; r13 - length 1354 ;; LT16 - indicates type of read and that the buffer is less than 16 bytes long 1355 ;; NOTE: could be replaced with %%LENGTH but at this point 1356 ;; %%LENGTH is always less than 16. 1357 ;; No PLAIN_CYPH_LEN argument available in this macro. 
1358 ENCRYPT_FINAL_PARTIAL_BLOCK reg(i), %%T1, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, LT16, %%ENC_DEC, %%DATA_OFFSET 1359 vpshufb reg(i), [rel SHUF_MASK] 1360 1361%ifidn %%INSTANCE_TYPE, multi_call 1362%assign i (8-%%num_initial_blocks) 1363%assign j (9-%%num_initial_blocks) 1364%assign k (%%num_initial_blocks-1) 1365%assign last_block_to_hash 1 1366%else 1367%assign i (8-%%num_initial_blocks) 1368%assign j (9-%%num_initial_blocks) 1369%assign k (%%num_initial_blocks) 1370%assign last_block_to_hash 0 1371%endif 1372 1373%if(%%num_initial_blocks>last_block_to_hash) 1374 ;; Record that a reduction is needed 1375 mov r12, 1 1376 ;; Hash in AES state 1377 vpxor %%T2, reg(j) 1378 1379 ;; T2 - incoming AAD hash 1380 ;; reg(i) holds ciphertext 1381 ;; T5 - hash key 1382 ;; T6 - updated xor 1383 ;; reg(1)/xmm1 should now be available for tmp use 1384 vmovdqu %%T5, [%%GDATA_KEY + HashKey_ %+ k] 1385 vpclmulqdq %%T1, %%T2, %%T5, 0x11 ; %%T4 = a1*b1 1386 vpclmulqdq %%T4, %%T2, %%T5, 0x00 ; %%T4 = a0*b0 1387 vpclmulqdq %%T6, %%T2, %%T5, 0x01 ; %%T6 = a1*b0 1388 vpclmulqdq %%T5, %%T2, %%T5, 0x10 ; %%T5 = a0*b1 1389 vpxor %%T6, %%T6, %%T5 1390%else 1391 ;; Record that a reduction is not needed - 1392 ;; In this case no hashes are computed because there 1393 ;; is only one initial block and it is < 16B in length. 1394 xor r12, r12 1395%endif 1396 1397%assign i (i+1) 1398%assign j (j+1) 1399%assign k (k-1) 1400%ifidn %%INSTANCE_TYPE, multi_call 1401%assign rep_count (%%num_initial_blocks-2) 1402%%_multi_call_hash: 1403%else 1404%assign rep_count (%%num_initial_blocks-1) 1405%endif 1406 1407%if rep_count < 0 1408 ;; fix for negative rep_count 1409%assign rep_count 0 1410%endif 1411 1412%rep rep_count 1413 1414 vmovdqu %%T5, [%%GDATA_KEY + HashKey_ %+ k] 1415 vpclmulqdq %%T3, reg(j), %%T5, 0x11 1416 vpxor %%T1, %%T1, %%T3 1417 1418 vpclmulqdq %%T3, reg(j), %%T5, 0x00 1419 vpxor %%T4, %%T4, %%T3 1420 1421 vpclmulqdq %%T3, reg(j), %%T5, 0x01 1422 vpxor %%T6, %%T6, %%T3 1423 1424 vpclmulqdq %%T3, reg(j), %%T5, 0x10 1425 vpxor %%T6, %%T6, %%T3 1426 1427%assign i (i+1) 1428%assign j (j+1) 1429%assign k (k-1) 1430%endrep 1431 1432%%_small_initial_compute_hash: 1433 1434;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1435;;; Ghash reduction 1436;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1437 1438%if(%%num_initial_blocks=1) 1439%ifidn %%INSTANCE_TYPE, multi_call 1440 ;; We only need to check if a reduction is needed if 1441 ;; initial_blocks == 1 and init/update/final is being used. 1442 ;; In this case we may just have a partial block, and that 1443 ;; gets hashed in finalize. 
1444 ;; cmp r12, 0 1445 or r12, r12 1446 je %%_no_reduction_needed 1447%endif 1448%endif 1449 1450 vpsrldq %%T3, %%T6, 8 ; shift-R %%T2 2 DWs 1451 vpslldq %%T6, %%T6, 8 ; shift-L %%T3 2 DWs 1452 vpxor %%T1, %%T1, %%T3 ; accumulate the results in %%T1:%%T4 1453 vpxor %%T4, %%T6, %%T4 1454 1455 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1456 ;; First phase of the reduction 1457 vmovdqu %%T3, [rel POLY2] 1458 1459 vpclmulqdq %%T2, %%T3, %%T4, 0x01 1460 ;; shift-L xmm2 2 DWs 1461 vpslldq %%T2, %%T2, 8 1462 vpxor %%T4, %%T4, %%T2 1463 1464 ;; First phase of the reduction complete 1465 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1466 ;; Second phase of the reduction 1467 1468 vpclmulqdq %%T2, %%T3, %%T4, 0x00 1469 ;; Shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) 1470 vpsrldq %%T2, %%T2, 4 1471 1472 vpclmulqdq %%T4, %%T3, %%T4, 0x10 1473 ;; Shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts) 1474 vpslldq %%T4, %%T4, 4 1475 1476 vpxor %%T4, %%T4, %%T2 1477 ;; Second phase of the reduction complete 1478 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1479 vpxor %%T3, %%T1, %%T4 1480 1481%ifidn %%INSTANCE_TYPE, multi_call 1482 ;; If using init/update/finalize, we need to xor any partial block data 1483 ;; into the hash. 1484%if %%num_initial_blocks > 1 1485 ;; NOTE: for %%num_initial_blocks = 0 the xor never takes place 1486%if %%num_initial_blocks != 8 1487 ;; NOTE: for %%num_initial_blocks = 8, %%LENGTH, stored in [PBlockLen] is never zero 1488 cmp qword [%%GDATA_CTX + PBlockLen], 0 1489 je %%_no_partial_block_xor 1490%endif ; %%num_initial_blocks != 8 1491 vpxor %%T3, %%T3, reg(8) 1492%%_no_partial_block_xor: 1493%endif ; %%num_initial_blocks > 1 1494%endif ; %%INSTANCE_TYPE, multi_call 1495 1496%if(%%num_initial_blocks=1) 1497%ifidn %%INSTANCE_TYPE, multi_call 1498 ;; NOTE: %%_no_reduction_needed case only valid for 1499 ;; multi_call with initial_blocks = 1. 1500 ;; Look for comment above around '_no_reduction_needed' 1501 ;; The jmp below is obsolete as the code will fall through. 1502 1503 ;; The result is in %%T3 1504 jmp %%_after_reduction 1505 1506%%_no_reduction_needed: 1507 ;; The hash should end up in T3. The only way we should get here is if 1508 ;; there is a partial block of data, so xor that into the hash. 
1509 vpxor %%T3, %%T2, reg(8) 1510%endif ; %%INSTANCE_TYPE = multi_call 1511%endif ; %%num_initial_blocks=1 1512 1513%%_after_reduction: 1514 ;; Final hash is now in T3 1515 1516%endmacro ; INITIAL_BLOCKS_PARTIAL 1517 1518 1519 1520; encrypt 8 blocks at a time 1521; ghash the 8 previously encrypted ciphertext blocks 1522; %%GDATA (KEY), %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN are used as pointers only, not modified 1523; %%DATA_OFFSET is the data offset value 1524%macro GHASH_8_ENCRYPT_8_PARALLEL 23 1525%define %%GDATA %1 1526%define %%CYPH_PLAIN_OUT %2 1527%define %%PLAIN_CYPH_IN %3 1528%define %%DATA_OFFSET %4 1529%define %%T1 %5 1530%define %%T2 %6 1531%define %%T3 %7 1532%define %%T4 %8 1533%define %%T5 %9 1534%define %%T6 %10 1535%define %%CTR %11 1536%define %%XMM1 %12 1537%define %%XMM2 %13 1538%define %%XMM3 %14 1539%define %%XMM4 %15 1540%define %%XMM5 %16 1541%define %%XMM6 %17 1542%define %%XMM7 %18 1543%define %%XMM8 %19 1544%define %%T7 %20 1545%define %%loop_idx %21 1546%define %%ENC_DEC %22 1547%define %%FULL_PARTIAL %23 1548 1549 vmovdqa %%T2, %%XMM1 1550 vmovdqu [rsp + TMP2], %%XMM2 1551 vmovdqu [rsp + TMP3], %%XMM3 1552 vmovdqu [rsp + TMP4], %%XMM4 1553 vmovdqu [rsp + TMP5], %%XMM5 1554 vmovdqu [rsp + TMP6], %%XMM6 1555 vmovdqu [rsp + TMP7], %%XMM7 1556 vmovdqu [rsp + TMP8], %%XMM8 1557 1558%ifidn %%loop_idx, in_order 1559 vpaddd %%XMM1, %%CTR, [rel ONE] ; INCR CNT 1560 vmovdqu %%T5, [rel TWO] 1561 vpaddd %%XMM2, %%CTR, %%T5 1562 vpaddd %%XMM3, %%XMM1, %%T5 1563 vpaddd %%XMM4, %%XMM2, %%T5 1564 vpaddd %%XMM5, %%XMM3, %%T5 1565 vpaddd %%XMM6, %%XMM4, %%T5 1566 vpaddd %%XMM7, %%XMM5, %%T5 1567 vpaddd %%XMM8, %%XMM6, %%T5 1568 vmovdqa %%CTR, %%XMM8 1569 1570 vmovdqu %%T5, [rel SHUF_MASK] 1571 vpshufb %%XMM1, %%T5 ; perform a 16Byte swap 1572 vpshufb %%XMM2, %%T5 ; perform a 16Byte swap 1573 vpshufb %%XMM3, %%T5 ; perform a 16Byte swap 1574 vpshufb %%XMM4, %%T5 ; perform a 16Byte swap 1575 vpshufb %%XMM5, %%T5 ; perform a 16Byte swap 1576 vpshufb %%XMM6, %%T5 ; perform a 16Byte swap 1577 vpshufb %%XMM7, %%T5 ; perform a 16Byte swap 1578 vpshufb %%XMM8, %%T5 ; perform a 16Byte swap 1579%else 1580 vpaddd %%XMM1, %%CTR, [rel ONEf] ; INCR CNT 1581 vmovdqu %%T5, [rel TWOf] 1582 vpaddd %%XMM2, %%CTR, %%T5 1583 vpaddd %%XMM3, %%XMM1, %%T5 1584 vpaddd %%XMM4, %%XMM2, %%T5 1585 vpaddd %%XMM5, %%XMM3, %%T5 1586 vpaddd %%XMM6, %%XMM4, %%T5 1587 vpaddd %%XMM7, %%XMM5, %%T5 1588 vpaddd %%XMM8, %%XMM6, %%T5 1589 vmovdqa %%CTR, %%XMM8 1590%endif 1591 1592 1593 1594 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1595 1596 vmovdqu %%T1, [%%GDATA + 16*0] 1597 vpxor %%XMM1, %%XMM1, %%T1 1598 vpxor %%XMM2, %%XMM2, %%T1 1599 vpxor %%XMM3, %%XMM3, %%T1 1600 vpxor %%XMM4, %%XMM4, %%T1 1601 vpxor %%XMM5, %%XMM5, %%T1 1602 vpxor %%XMM6, %%XMM6, %%T1 1603 vpxor %%XMM7, %%XMM7, %%T1 1604 vpxor %%XMM8, %%XMM8, %%T1 1605 1606 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1607 1608 vmovdqu %%T1, [%%GDATA + 16*1] 1609 vaesenc %%XMM1, %%T1 1610 vaesenc %%XMM2, %%T1 1611 vaesenc %%XMM3, %%T1 1612 vaesenc %%XMM4, %%T1 1613 vaesenc %%XMM5, %%T1 1614 vaesenc %%XMM6, %%T1 1615 vaesenc %%XMM7, %%T1 1616 vaesenc %%XMM8, %%T1 1617 1618 1619 vmovdqu %%T1, [%%GDATA + 16*2] 1620 vaesenc %%XMM1, %%T1 1621 vaesenc %%XMM2, %%T1 1622 vaesenc %%XMM3, %%T1 1623 vaesenc %%XMM4, %%T1 1624 vaesenc %%XMM5, %%T1 1625 vaesenc %%XMM6, %%T1 1626 vaesenc %%XMM7, %%T1 1627 vaesenc %%XMM8, %%T1 1628 1629 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1630 1631 vmovdqu %%T5, [%%GDATA + HashKey_8] 1632 vpclmulqdq %%T4, %%T2, %%T5, 0x11 ; %%T4 = a1*b1 1633 vpclmulqdq %%T7, %%T2, %%T5, 0x00 ; %%T7 = a0*b0 1634 vpclmulqdq %%T6, %%T2, %%T5, 0x01 ; %%T6 = a1*b0 1635 vpclmulqdq %%T5, %%T2, %%T5, 0x10 ; %%T5 = a0*b1 1636 vpxor %%T6, %%T6, %%T5 1637 1638 vmovdqu %%T1, [%%GDATA + 16*3] 1639 vaesenc %%XMM1, %%T1 1640 vaesenc %%XMM2, %%T1 1641 vaesenc %%XMM3, %%T1 1642 vaesenc %%XMM4, %%T1 1643 vaesenc %%XMM5, %%T1 1644 vaesenc %%XMM6, %%T1 1645 vaesenc %%XMM7, %%T1 1646 vaesenc %%XMM8, %%T1 1647 1648 vmovdqu %%T1, [rsp + TMP2] 1649 vmovdqu %%T5, [%%GDATA + HashKey_7] 1650 vpclmulqdq %%T3, %%T1, %%T5, 0x11 1651 vpxor %%T4, %%T4, %%T3 1652 1653 vpclmulqdq %%T3, %%T1, %%T5, 0x00 1654 vpxor %%T7, %%T7, %%T3 1655 1656 vpclmulqdq %%T3, %%T1, %%T5, 0x01 1657 vpxor %%T6, %%T6, %%T3 1658 1659 vpclmulqdq %%T3, %%T1, %%T5, 0x10 1660 vpxor %%T6, %%T6, %%T3 1661 1662 vmovdqu %%T1, [%%GDATA + 16*4] 1663 vaesenc %%XMM1, %%T1 1664 vaesenc %%XMM2, %%T1 1665 vaesenc %%XMM3, %%T1 1666 vaesenc %%XMM4, %%T1 1667 vaesenc %%XMM5, %%T1 1668 vaesenc %%XMM6, %%T1 1669 vaesenc %%XMM7, %%T1 1670 vaesenc %%XMM8, %%T1 1671 1672 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1673 vmovdqu %%T1, [rsp + TMP3] 1674 vmovdqu %%T5, [%%GDATA + HashKey_6] 1675 vpclmulqdq %%T3, %%T1, %%T5, 0x11 1676 vpxor %%T4, %%T4, %%T3 1677 1678 vpclmulqdq %%T3, %%T1, %%T5, 0x00 1679 vpxor %%T7, %%T7, %%T3 1680 1681 vpclmulqdq %%T3, %%T1, %%T5, 0x01 1682 vpxor %%T6, %%T6, %%T3 1683 1684 vpclmulqdq %%T3, %%T1, %%T5, 0x10 1685 vpxor %%T6, %%T6, %%T3 1686 1687 vmovdqu %%T1, [%%GDATA + 16*5] 1688 vaesenc %%XMM1, %%T1 1689 vaesenc %%XMM2, %%T1 1690 vaesenc %%XMM3, %%T1 1691 vaesenc %%XMM4, %%T1 1692 vaesenc %%XMM5, %%T1 1693 vaesenc %%XMM6, %%T1 1694 vaesenc %%XMM7, %%T1 1695 vaesenc %%XMM8, %%T1 1696 1697 1698 vmovdqu %%T1, [rsp + TMP4] 1699 vmovdqu %%T5, [%%GDATA + HashKey_5] 1700 vpclmulqdq %%T3, %%T1, %%T5, 0x11 1701 vpxor %%T4, %%T4, %%T3 1702 1703 vpclmulqdq %%T3, %%T1, %%T5, 0x00 1704 vpxor %%T7, %%T7, %%T3 1705 1706 vpclmulqdq %%T3, %%T1, %%T5, 0x01 1707 vpxor %%T6, %%T6, %%T3 1708 1709 vpclmulqdq %%T3, %%T1, %%T5, 0x10 1710 vpxor %%T6, %%T6, %%T3 1711 1712 vmovdqu %%T1, [%%GDATA + 16*6] 1713 vaesenc %%XMM1, %%T1 1714 vaesenc %%XMM2, %%T1 1715 vaesenc %%XMM3, %%T1 1716 vaesenc %%XMM4, %%T1 1717 vaesenc %%XMM5, %%T1 1718 vaesenc %%XMM6, %%T1 1719 vaesenc %%XMM7, %%T1 1720 vaesenc %%XMM8, %%T1 1721 1722 vmovdqu %%T1, [rsp + TMP5] 1723 vmovdqu %%T5, [%%GDATA + HashKey_4] 1724 vpclmulqdq %%T3, %%T1, %%T5, 0x11 1725 vpxor %%T4, %%T4, %%T3 1726 1727 vpclmulqdq %%T3, %%T1, %%T5, 0x00 1728 vpxor %%T7, %%T7, %%T3 1729 1730 vpclmulqdq %%T3, %%T1, %%T5, 0x01 1731 vpxor %%T6, %%T6, %%T3 1732 1733 vpclmulqdq %%T3, %%T1, %%T5, 0x10 1734 vpxor %%T6, %%T6, %%T3 1735 1736 vmovdqu %%T1, [%%GDATA + 16*7] 1737 vaesenc %%XMM1, %%T1 1738 vaesenc %%XMM2, %%T1 1739 vaesenc %%XMM3, %%T1 1740 vaesenc %%XMM4, %%T1 1741 vaesenc %%XMM5, %%T1 1742 vaesenc %%XMM6, %%T1 1743 vaesenc %%XMM7, %%T1 1744 vaesenc %%XMM8, %%T1 1745 1746 vmovdqu %%T1, [rsp + TMP6] 1747 vmovdqu %%T5, [%%GDATA + HashKey_3] 1748 vpclmulqdq %%T3, %%T1, %%T5, 0x11 1749 vpxor %%T4, %%T4, %%T3 1750 1751 vpclmulqdq %%T3, %%T1, %%T5, 0x00 1752 vpxor %%T7, %%T7, %%T3 1753 1754 vpclmulqdq %%T3, %%T1, %%T5, 0x01 1755 vpxor %%T6, %%T6, %%T3 1756 1757 vpclmulqdq %%T3, %%T1, %%T5, 0x10 1758 vpxor %%T6, %%T6, %%T3 1759 1760 vmovdqu %%T1, [%%GDATA 
+ 16*8] 1761 vaesenc %%XMM1, %%T1 1762 vaesenc %%XMM2, %%T1 1763 vaesenc %%XMM3, %%T1 1764 vaesenc %%XMM4, %%T1 1765 vaesenc %%XMM5, %%T1 1766 vaesenc %%XMM6, %%T1 1767 vaesenc %%XMM7, %%T1 1768 vaesenc %%XMM8, %%T1 1769 1770 vmovdqu %%T1, [rsp + TMP7] 1771 vmovdqu %%T5, [%%GDATA + HashKey_2] 1772 vpclmulqdq %%T3, %%T1, %%T5, 0x11 1773 vpxor %%T4, %%T4, %%T3 1774 1775 vpclmulqdq %%T3, %%T1, %%T5, 0x00 1776 vpxor %%T7, %%T7, %%T3 1777 1778 vpclmulqdq %%T3, %%T1, %%T5, 0x01 1779 vpxor %%T6, %%T6, %%T3 1780 1781 vpclmulqdq %%T3, %%T1, %%T5, 0x10 1782 vpxor %%T6, %%T6, %%T3 1783 1784 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1785 1786 vmovdqu %%T5, [%%GDATA + 16*9] 1787 vaesenc %%XMM1, %%T5 1788 vaesenc %%XMM2, %%T5 1789 vaesenc %%XMM3, %%T5 1790 vaesenc %%XMM4, %%T5 1791 vaesenc %%XMM5, %%T5 1792 vaesenc %%XMM6, %%T5 1793 vaesenc %%XMM7, %%T5 1794 vaesenc %%XMM8, %%T5 1795 1796 vmovdqu %%T1, [rsp + TMP8] 1797 vmovdqu %%T5, [%%GDATA + HashKey] 1798 1799 1800 vpclmulqdq %%T3, %%T1, %%T5, 0x00 1801 vpxor %%T7, %%T7, %%T3 1802 1803 vpclmulqdq %%T3, %%T1, %%T5, 0x01 1804 vpxor %%T6, %%T6, %%T3 1805 1806 vpclmulqdq %%T3, %%T1, %%T5, 0x10 1807 vpxor %%T6, %%T6, %%T3 1808 1809 vpclmulqdq %%T3, %%T1, %%T5, 0x11 1810 vpxor %%T1, %%T4, %%T3 1811 1812 1813 vmovdqu %%T5, [%%GDATA + 16*10] 1814 %ifndef GCM128_MODE ; GCM192 or GCM256 1815 vaesenc %%XMM1, %%T5 1816 vaesenc %%XMM2, %%T5 1817 vaesenc %%XMM3, %%T5 1818 vaesenc %%XMM4, %%T5 1819 vaesenc %%XMM5, %%T5 1820 vaesenc %%XMM6, %%T5 1821 vaesenc %%XMM7, %%T5 1822 vaesenc %%XMM8, %%T5 1823 1824 vmovdqu %%T5, [%%GDATA + 16*11] 1825 vaesenc %%XMM1, %%T5 1826 vaesenc %%XMM2, %%T5 1827 vaesenc %%XMM3, %%T5 1828 vaesenc %%XMM4, %%T5 1829 vaesenc %%XMM5, %%T5 1830 vaesenc %%XMM6, %%T5 1831 vaesenc %%XMM7, %%T5 1832 vaesenc %%XMM8, %%T5 1833 1834 vmovdqu %%T5, [%%GDATA + 16*12] 1835%endif 1836%ifdef GCM256_MODE 1837 vaesenc %%XMM1, %%T5 1838 vaesenc %%XMM2, %%T5 1839 vaesenc %%XMM3, %%T5 1840 vaesenc %%XMM4, %%T5 1841 vaesenc %%XMM5, %%T5 1842 vaesenc %%XMM6, %%T5 1843 vaesenc %%XMM7, %%T5 1844 vaesenc %%XMM8, %%T5 1845 1846 vmovdqu %%T5, [%%GDATA + 16*13] 1847 vaesenc %%XMM1, %%T5 1848 vaesenc %%XMM2, %%T5 1849 vaesenc %%XMM3, %%T5 1850 vaesenc %%XMM4, %%T5 1851 vaesenc %%XMM5, %%T5 1852 vaesenc %%XMM6, %%T5 1853 vaesenc %%XMM7, %%T5 1854 vaesenc %%XMM8, %%T5 1855 1856 vmovdqu %%T5, [%%GDATA + 16*14] 1857%endif ; GCM256 1858 1859%assign i 0 1860%assign j 1 1861%rep 8 1862 1863 ;; SNP TBD: This is pretty ugly - consider whether just XORing the 1864 ;; data in after vaesenclast is simpler and performant. Would 1865 ;; also have to ripple it through partial block and ghash_mul_8. 
1866%ifidn %%FULL_PARTIAL, full 1867 %ifdef NT_LD 1868 VXLDR %%T2, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i] 1869 vpxor %%T2, %%T2, %%T5 1870 %else 1871 vpxor %%T2, %%T5, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i] 1872 %endif 1873 1874 %ifidn %%ENC_DEC, ENC 1875 vaesenclast reg(j), reg(j), %%T2 1876 %else 1877 vaesenclast %%T3, reg(j), %%T2 1878 vpxor reg(j), %%T2, %%T5 1879 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*i], %%T3 1880 %endif 1881 1882%else 1883 ; Don't read the final data during partial block processing 1884 %ifdef NT_LD 1885 %if (i<7) 1886 VXLDR %%T2, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i] 1887 vpxor %%T2, %%T2, %%T5 1888 %else 1889 ;; Stage the key directly in T2 rather than hash it with plaintext 1890 vmovdqu %%T2, %%T5 1891 %endif 1892 %else 1893 %if (i<7) 1894 vpxor %%T2, %%T5, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i] 1895 %else 1896 ;; Stage the key directly in T2 rather than hash it with plaintext 1897 vmovdqu %%T2, %%T5 1898 %endif 1899 %endif 1900 1901 %ifidn %%ENC_DEC, ENC 1902 vaesenclast reg(j), reg(j), %%T2 1903 %else 1904 %if (i<7) 1905 vaesenclast %%T3, reg(j), %%T2 1906 vpxor reg(j), %%T2, %%T5 1907 ;; Do not read the data since it could fault 1908 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*i], %%T3 1909 %else 1910 vaesenclast reg(j), reg(j), %%T2 1911 %endif 1912 %endif 1913%endif 1914 1915%assign i (i+1) 1916%assign j (j+1) 1917%endrep 1918 1919 1920;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1921 1922 1923 vpslldq %%T3, %%T6, 8 ; shift-L %%T3 2 DWs 1924 vpsrldq %%T6, %%T6, 8 ; shift-R %%T2 2 DWs 1925 vpxor %%T7, %%T7, %%T3 1926 vpxor %%T1, %%T1, %%T6 ; accumulate the results in %%T1:%%T7 1927 1928 1929 1930 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1931 ;first phase of the reduction 1932 vmovdqu %%T3, [rel POLY2] 1933 1934 vpclmulqdq %%T2, %%T3, %%T7, 0x01 1935 vpslldq %%T2, %%T2, 8 ; shift-L xmm2 2 DWs 1936 1937 vpxor %%T7, %%T7, %%T2 ; first phase of the reduction complete 1938 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1939 1940 %ifidn %%ENC_DEC, ENC 1941 ; Write to the Ciphertext buffer 1942 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*0], %%XMM1 1943 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*1], %%XMM2 1944 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*2], %%XMM3 1945 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*3], %%XMM4 1946 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*4], %%XMM5 1947 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*5], %%XMM6 1948 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*6], %%XMM7 1949 %ifidn %%FULL_PARTIAL, full 1950 ;; Avoid writing past the buffer if handling a partial block 1951 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*7], %%XMM8 1952 %endif 1953 %endif 1954 1955 1956;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1957 ;second phase of the reduction 1958 vpclmulqdq %%T2, %%T3, %%T7, 0x00 1959 vpsrldq %%T2, %%T2, 4 ; shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) 1960 1961 vpclmulqdq %%T4, %%T3, %%T7, 0x10 1962 vpslldq %%T4, %%T4, 4 ; shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts) 1963 1964 vpxor %%T4, %%T4, %%T2 ; second phase of the reduction complete 1965 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1966 vpxor %%T1, %%T1, %%T4 ; the result is in %%T1 1967 1968 vpshufb %%XMM1, [rel SHUF_MASK] ; perform a 
16Byte swap 1969 vpshufb %%XMM2, [rel SHUF_MASK] ; perform a 16Byte swap 1970 vpshufb %%XMM3, [rel SHUF_MASK] ; perform a 16Byte swap 1971 vpshufb %%XMM4, [rel SHUF_MASK] ; perform a 16Byte swap 1972 vpshufb %%XMM5, [rel SHUF_MASK] ; perform a 16Byte swap 1973 vpshufb %%XMM6, [rel SHUF_MASK] ; perform a 16Byte swap 1974 vpshufb %%XMM7, [rel SHUF_MASK] ; perform a 16Byte swap 1975 vpshufb %%XMM8, [rel SHUF_MASK] ; perform a 16Byte swap 1976 1977 1978 vpxor %%XMM1, %%T1 1979 1980 1981%endmacro ; GHASH_8_ENCRYPT_8_PARALLEL 1982 1983 1984; GHASH the last 4 ciphertext blocks. 1985%macro GHASH_LAST_8 16 1986%define %%GDATA %1 1987%define %%T1 %2 1988%define %%T2 %3 1989%define %%T3 %4 1990%define %%T4 %5 1991%define %%T5 %6 1992%define %%T6 %7 1993%define %%T7 %8 1994%define %%XMM1 %9 1995%define %%XMM2 %10 1996%define %%XMM3 %11 1997%define %%XMM4 %12 1998%define %%XMM5 %13 1999%define %%XMM6 %14 2000%define %%XMM7 %15 2001%define %%XMM8 %16 2002 2003 ;; Karatsuba Method 2004 2005 vmovdqu %%T5, [%%GDATA + HashKey_8] 2006 2007 vpshufd %%T2, %%XMM1, 01001110b 2008 vpshufd %%T3, %%T5, 01001110b 2009 vpxor %%T2, %%T2, %%XMM1 2010 vpxor %%T3, %%T3, %%T5 2011 2012 vpclmulqdq %%T6, %%XMM1, %%T5, 0x11 2013 vpclmulqdq %%T7, %%XMM1, %%T5, 0x00 2014 2015 vpclmulqdq %%XMM1, %%T2, %%T3, 0x00 2016 2017 ;;;;;;;;;;;;;;;;;;;;;; 2018 2019 vmovdqu %%T5, [%%GDATA + HashKey_7] 2020 vpshufd %%T2, %%XMM2, 01001110b 2021 vpshufd %%T3, %%T5, 01001110b 2022 vpxor %%T2, %%T2, %%XMM2 2023 vpxor %%T3, %%T3, %%T5 2024 2025 vpclmulqdq %%T4, %%XMM2, %%T5, 0x11 2026 vpxor %%T6, %%T6, %%T4 2027 2028 vpclmulqdq %%T4, %%XMM2, %%T5, 0x00 2029 vpxor %%T7, %%T7, %%T4 2030 2031 vpclmulqdq %%T2, %%T2, %%T3, 0x00 2032 2033 vpxor %%XMM1, %%XMM1, %%T2 2034 2035 ;;;;;;;;;;;;;;;;;;;;;; 2036 2037 vmovdqu %%T5, [%%GDATA + HashKey_6] 2038 vpshufd %%T2, %%XMM3, 01001110b 2039 vpshufd %%T3, %%T5, 01001110b 2040 vpxor %%T2, %%T2, %%XMM3 2041 vpxor %%T3, %%T3, %%T5 2042 2043 vpclmulqdq %%T4, %%XMM3, %%T5, 0x11 2044 vpxor %%T6, %%T6, %%T4 2045 2046 vpclmulqdq %%T4, %%XMM3, %%T5, 0x00 2047 vpxor %%T7, %%T7, %%T4 2048 2049 vpclmulqdq %%T2, %%T2, %%T3, 0x00 2050 2051 vpxor %%XMM1, %%XMM1, %%T2 2052 2053 ;;;;;;;;;;;;;;;;;;;;;; 2054 2055 vmovdqu %%T5, [%%GDATA + HashKey_5] 2056 vpshufd %%T2, %%XMM4, 01001110b 2057 vpshufd %%T3, %%T5, 01001110b 2058 vpxor %%T2, %%T2, %%XMM4 2059 vpxor %%T3, %%T3, %%T5 2060 2061 vpclmulqdq %%T4, %%XMM4, %%T5, 0x11 2062 vpxor %%T6, %%T6, %%T4 2063 2064 vpclmulqdq %%T4, %%XMM4, %%T5, 0x00 2065 vpxor %%T7, %%T7, %%T4 2066 2067 vpclmulqdq %%T2, %%T2, %%T3, 0x00 2068 2069 vpxor %%XMM1, %%XMM1, %%T2 2070 2071 ;;;;;;;;;;;;;;;;;;;;;; 2072 2073 vmovdqu %%T5, [%%GDATA + HashKey_4] 2074 vpshufd %%T2, %%XMM5, 01001110b 2075 vpshufd %%T3, %%T5, 01001110b 2076 vpxor %%T2, %%T2, %%XMM5 2077 vpxor %%T3, %%T3, %%T5 2078 2079 vpclmulqdq %%T4, %%XMM5, %%T5, 0x11 2080 vpxor %%T6, %%T6, %%T4 2081 2082 vpclmulqdq %%T4, %%XMM5, %%T5, 0x00 2083 vpxor %%T7, %%T7, %%T4 2084 2085 vpclmulqdq %%T2, %%T2, %%T3, 0x00 2086 2087 vpxor %%XMM1, %%XMM1, %%T2 2088 2089 ;;;;;;;;;;;;;;;;;;;;;; 2090 2091 vmovdqu %%T5, [%%GDATA + HashKey_3] 2092 vpshufd %%T2, %%XMM6, 01001110b 2093 vpshufd %%T3, %%T5, 01001110b 2094 vpxor %%T2, %%T2, %%XMM6 2095 vpxor %%T3, %%T3, %%T5 2096 2097 vpclmulqdq %%T4, %%XMM6, %%T5, 0x11 2098 vpxor %%T6, %%T6, %%T4 2099 2100 vpclmulqdq %%T4, %%XMM6, %%T5, 0x00 2101 vpxor %%T7, %%T7, %%T4 2102 2103 vpclmulqdq %%T2, %%T2, %%T3, 0x00 2104 2105 vpxor %%XMM1, %%XMM1, %%T2 2106 2107 ;;;;;;;;;;;;;;;;;;;;;; 2108 2109 vmovdqu %%T5, [%%GDATA + 
HashKey_2] 2110 vpshufd %%T2, %%XMM7, 01001110b 2111 vpshufd %%T3, %%T5, 01001110b 2112 vpxor %%T2, %%T2, %%XMM7 2113 vpxor %%T3, %%T3, %%T5 2114 2115 vpclmulqdq %%T4, %%XMM7, %%T5, 0x11 2116 vpxor %%T6, %%T6, %%T4 2117 2118 vpclmulqdq %%T4, %%XMM7, %%T5, 0x00 2119 vpxor %%T7, %%T7, %%T4 2120 2121 vpclmulqdq %%T2, %%T2, %%T3, 0x00 2122 2123 vpxor %%XMM1, %%XMM1, %%T2 2124 2125 ;;;;;;;;;;;;;;;;;;;;;; 2126 2127 vmovdqu %%T5, [%%GDATA + HashKey] 2128 vpshufd %%T2, %%XMM8, 01001110b 2129 vpshufd %%T3, %%T5, 01001110b 2130 vpxor %%T2, %%T2, %%XMM8 2131 vpxor %%T3, %%T3, %%T5 2132 2133 vpclmulqdq %%T4, %%XMM8, %%T5, 0x11 2134 vpxor %%T6, %%T6, %%T4 2135 2136 vpclmulqdq %%T4, %%XMM8, %%T5, 0x00 2137 vpxor %%T7, %%T7, %%T4 2138 2139 vpclmulqdq %%T2, %%T2, %%T3, 0x00 2140 2141 vpxor %%XMM1, %%XMM1, %%T2 2142 vpxor %%XMM1, %%XMM1, %%T6 2143 vpxor %%T2, %%XMM1, %%T7 2144 2145 2146 2147 2148 vpslldq %%T4, %%T2, 8 2149 vpsrldq %%T2, %%T2, 8 2150 2151 vpxor %%T7, %%T7, %%T4 2152 vpxor %%T6, %%T6, %%T2 ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications 2153 2154 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2155 ;first phase of the reduction 2156 vmovdqu %%T3, [rel POLY2] 2157 2158 vpclmulqdq %%T2, %%T3, %%T7, 0x01 2159 vpslldq %%T2, %%T2, 8 ; shift-L xmm2 2 DWs 2160 2161 vpxor %%T7, %%T7, %%T2 ; first phase of the reduction complete 2162 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2163 2164 2165 ;second phase of the reduction 2166 vpclmulqdq %%T2, %%T3, %%T7, 0x00 2167 vpsrldq %%T2, %%T2, 4 ; shift-R %%T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) 2168 2169 vpclmulqdq %%T4, %%T3, %%T7, 0x10 2170 vpslldq %%T4, %%T4, 4 ; shift-L %%T4 1 DW (Shift-L 1-DW to obtain result with no shifts) 2171 2172 vpxor %%T4, %%T4, %%T2 ; second phase of the reduction complete 2173 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2174 vpxor %%T6, %%T6, %%T4 ; the result is in %%T6 2175%endmacro 2176 2177 2178; GHASH the last 4 ciphertext blocks. 
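;; Editorial note (added): GHASH_LAST_8 above and GHASH_LAST_7 below fold the
;; last 8 (respectively 7) ciphertext blocks into the hash using the Karatsuba
;; method: three 64x64 carry-less multiplies per block, with the high, low and
;; middle partial products accumulated separately and reduced once at the end.
;; A hedged C sketch of the per-block step with PCLMULQDQ intrinsics (variable
;; names are illustrative only):
;;
;;     #include <wmmintrin.h>
;;
;;     static void ghash_karatsuba_step(__m128i a, __m128i b, __m128i *acc_hi,
;;                                      __m128i *acc_lo, __m128i *acc_mid)
;;     {
;;         __m128i hi  = _mm_clmulepi64_si128(a, b, 0x11);             /* a1*b1 */
;;         __m128i lo  = _mm_clmulepi64_si128(a, b, 0x00);             /* a0*b0 */
;;         __m128i ta  = _mm_xor_si128(a, _mm_shuffle_epi32(a, 0x4E)); /* a0^a1 */
;;         __m128i tb  = _mm_xor_si128(b, _mm_shuffle_epi32(b, 0x4E)); /* b0^b1 */
;;         __m128i mid = _mm_clmulepi64_si128(ta, tb, 0x00);  /* (a0^a1)*(b0^b1) */
;;         *acc_hi  = _mm_xor_si128(*acc_hi,  hi);
;;         *acc_lo  = _mm_xor_si128(*acc_lo,  lo);
;;         *acc_mid = _mm_xor_si128(*acc_mid, mid);
;;     }
;;
;; At the end the macros compute mid ^= hi ^ lo and split it across the
;; high/low halves (vpslldq/vpsrldq by 8) before the two-phase POLY2 reduction.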
2179%macro GHASH_LAST_7 15 2180%define %%GDATA %1 2181%define %%T1 %2 2182%define %%T2 %3 2183%define %%T3 %4 2184%define %%T4 %5 2185%define %%T5 %6 2186%define %%T6 %7 2187%define %%T7 %8 2188%define %%XMM1 %9 2189%define %%XMM2 %10 2190%define %%XMM3 %11 2191%define %%XMM4 %12 2192%define %%XMM5 %13 2193%define %%XMM6 %14 2194%define %%XMM7 %15 2195 2196 ;; Karatsuba Method 2197 2198 vmovdqu %%T5, [%%GDATA + HashKey_7] 2199 2200 vpshufd %%T2, %%XMM1, 01001110b 2201 vpshufd %%T3, %%T5, 01001110b 2202 vpxor %%T2, %%T2, %%XMM1 2203 vpxor %%T3, %%T3, %%T5 2204 2205 vpclmulqdq %%T6, %%XMM1, %%T5, 0x11 2206 vpclmulqdq %%T7, %%XMM1, %%T5, 0x00 2207 2208 vpclmulqdq %%XMM1, %%T2, %%T3, 0x00 2209 2210 ;;;;;;;;;;;;;;;;;;;;;; 2211 2212 vmovdqu %%T5, [%%GDATA + HashKey_6] 2213 vpshufd %%T2, %%XMM2, 01001110b 2214 vpshufd %%T3, %%T5, 01001110b 2215 vpxor %%T2, %%T2, %%XMM2 2216 vpxor %%T3, %%T3, %%T5 2217 2218 vpclmulqdq %%T4, %%XMM2, %%T5, 0x11 2219 vpxor %%T6, %%T6, %%T4 2220 2221 vpclmulqdq %%T4, %%XMM2, %%T5, 0x00 2222 vpxor %%T7, %%T7, %%T4 2223 2224 vpclmulqdq %%T2, %%T2, %%T3, 0x00 2225 2226 vpxor %%XMM1, %%XMM1, %%T2 2227 2228 ;;;;;;;;;;;;;;;;;;;;;; 2229 2230 vmovdqu %%T5, [%%GDATA + HashKey_5] 2231 vpshufd %%T2, %%XMM3, 01001110b 2232 vpshufd %%T3, %%T5, 01001110b 2233 vpxor %%T2, %%T2, %%XMM3 2234 vpxor %%T3, %%T3, %%T5 2235 2236 vpclmulqdq %%T4, %%XMM3, %%T5, 0x11 2237 vpxor %%T6, %%T6, %%T4 2238 2239 vpclmulqdq %%T4, %%XMM3, %%T5, 0x00 2240 vpxor %%T7, %%T7, %%T4 2241 2242 vpclmulqdq %%T2, %%T2, %%T3, 0x00 2243 2244 vpxor %%XMM1, %%XMM1, %%T2 2245 2246 ;;;;;;;;;;;;;;;;;;;;;; 2247 2248 vmovdqu %%T5, [%%GDATA + HashKey_4] 2249 vpshufd %%T2, %%XMM4, 01001110b 2250 vpshufd %%T3, %%T5, 01001110b 2251 vpxor %%T2, %%T2, %%XMM4 2252 vpxor %%T3, %%T3, %%T5 2253 2254 vpclmulqdq %%T4, %%XMM4, %%T5, 0x11 2255 vpxor %%T6, %%T6, %%T4 2256 2257 vpclmulqdq %%T4, %%XMM4, %%T5, 0x00 2258 vpxor %%T7, %%T7, %%T4 2259 2260 vpclmulqdq %%T2, %%T2, %%T3, 0x00 2261 2262 vpxor %%XMM1, %%XMM1, %%T2 2263 2264 ;;;;;;;;;;;;;;;;;;;;;; 2265 2266 vmovdqu %%T5, [%%GDATA + HashKey_3] 2267 vpshufd %%T2, %%XMM5, 01001110b 2268 vpshufd %%T3, %%T5, 01001110b 2269 vpxor %%T2, %%T2, %%XMM5 2270 vpxor %%T3, %%T3, %%T5 2271 2272 vpclmulqdq %%T4, %%XMM5, %%T5, 0x11 2273 vpxor %%T6, %%T6, %%T4 2274 2275 vpclmulqdq %%T4, %%XMM5, %%T5, 0x00 2276 vpxor %%T7, %%T7, %%T4 2277 2278 vpclmulqdq %%T2, %%T2, %%T3, 0x00 2279 2280 vpxor %%XMM1, %%XMM1, %%T2 2281 2282 ;;;;;;;;;;;;;;;;;;;;;; 2283 2284 vmovdqu %%T5, [%%GDATA + HashKey_2] 2285 vpshufd %%T2, %%XMM6, 01001110b 2286 vpshufd %%T3, %%T5, 01001110b 2287 vpxor %%T2, %%T2, %%XMM6 2288 vpxor %%T3, %%T3, %%T5 2289 2290 vpclmulqdq %%T4, %%XMM6, %%T5, 0x11 2291 vpxor %%T6, %%T6, %%T4 2292 2293 vpclmulqdq %%T4, %%XMM6, %%T5, 0x00 2294 vpxor %%T7, %%T7, %%T4 2295 2296 vpclmulqdq %%T2, %%T2, %%T3, 0x00 2297 2298 vpxor %%XMM1, %%XMM1, %%T2 2299 2300 ;;;;;;;;;;;;;;;;;;;;;; 2301 2302 vmovdqu %%T5, [%%GDATA + HashKey_1] 2303 vpshufd %%T2, %%XMM7, 01001110b 2304 vpshufd %%T3, %%T5, 01001110b 2305 vpxor %%T2, %%T2, %%XMM7 2306 vpxor %%T3, %%T3, %%T5 2307 2308 vpclmulqdq %%T4, %%XMM7, %%T5, 0x11 2309 vpxor %%T6, %%T6, %%T4 2310 2311 vpclmulqdq %%T4, %%XMM7, %%T5, 0x00 2312 vpxor %%T7, %%T7, %%T4 2313 2314 vpclmulqdq %%T2, %%T2, %%T3, 0x00 2315 2316 vpxor %%XMM1, %%XMM1, %%T2 2317 2318 ;;;;;;;;;;;;;;;;;;;;;; 2319 2320 vpxor %%XMM1, %%XMM1, %%T6 2321 vpxor %%T2, %%XMM1, %%T7 2322 2323 2324 2325 2326 vpslldq %%T4, %%T2, 8 2327 vpsrldq %%T2, %%T2, 8 2328 2329 vpxor %%T7, %%T7, %%T4 2330 vpxor %%T6, %%T6, 
%%T2 ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications 2331 2332 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2333 ;first phase of the reduction 2334 vmovdqu %%T3, [rel POLY2] 2335 2336 vpclmulqdq %%T2, %%T3, %%T7, 0x01 2337 vpslldq %%T2, %%T2, 8 ; shift-L xmm2 2 DWs 2338 2339 vpxor %%T7, %%T7, %%T2 ; first phase of the reduction complete 2340 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2341 2342 2343 ;second phase of the reduction 2344 vpclmulqdq %%T2, %%T3, %%T7, 0x00 2345 vpsrldq %%T2, %%T2, 4 ; shift-R %%T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) 2346 2347 vpclmulqdq %%T4, %%T3, %%T7, 0x10 2348 vpslldq %%T4, %%T4, 4 ; shift-L %%T4 1 DW (Shift-L 1-DW to obtain result with no shifts) 2349 2350 vpxor %%T4, %%T4, %%T2 ; second phase of the reduction complete 2351 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2352 vpxor %%T6, %%T6, %%T4 ; the result is in %%T6 2353%endmacro 2354 2355 2356 2357;;; Handle encryption of the final partial block 2358;;; IN: 2359;;; r13 - Number of bytes to read 2360;;; MODIFIES: 2361;;; KEY - Key for encrypting the partial block 2362;;; SMASHES: 2363;;; rax, T1 2364;;; Note: 2365;;; PLAIN_CYPH_LEN is unused at this stage. Previously: 2366;;; it was used to determine if buffer is big enough to do 2367;;; a 16 byte read & shift. 2368;;; 'LT16' is passed here only if buffer is known to be smaller 2369;;; than 16 bytes. 2370;;; Any other value passed here will result in 16 byte read 2371;;; code path. 2372%macro ENCRYPT_FINAL_PARTIAL_BLOCK 7 2373%define %%KEY %1 2374%define %%T1 %2 2375%define %%CYPH_PLAIN_OUT %3 2376%define %%PLAIN_CYPH_IN %4 2377%define %%PLAIN_CYPH_LEN %5 2378%define %%ENC_DEC %6 2379%define %%DATA_OFFSET %7 2380 2381 ;; %%PLAIN_CYPH_IN + %%DATA_OFFSET 2382 ;; - input data address 2383 ;; r13 - input data length 2384 ;; rax - temp registers 2385 ;; out: 2386 ;; T1 - packed output 2387 ;; k1 - valid byte mask 2388 READ_SMALL_DATA_INPUT %%T1, %%PLAIN_CYPH_IN+%%DATA_OFFSET, r13, rax 2389 2390 ;; At this point T1 contains the partial block data 2391 ;; Plaintext XOR E(K, Yn) 2392 vpxorq %%KEY, %%KEY, %%T1 2393 2394 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2395 ;; Output r13 Bytes 2396 vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET]{k1}, %%KEY 2397 2398%ifidn %%ENC_DEC, DEC 2399 ;; If decrypt, restore the ciphertext into %%KEY 2400 vmovdqa64 %%KEY, %%T1 2401%else 2402 vmovdqu8 %%KEY{k1}{z}, %%KEY 2403%endif 2404%endmacro ; ENCRYPT_FINAL_PARTIAL_BLOCK 2405 2406 2407 2408; Encryption of a single block 2409%macro ENCRYPT_SINGLE_BLOCK 2 2410%define %%GDATA %1 2411%define %%XMM0 %2 2412 2413 vpxor %%XMM0, %%XMM0, [%%GDATA+16*0] 2414%assign i 1 2415%rep NROUNDS 2416 vaesenc %%XMM0, [%%GDATA+16*i] 2417%assign i (i+1) 2418%endrep 2419 vaesenclast %%XMM0, [%%GDATA+16*i] 2420%endmacro 2421 2422 2423;; Start of Stack Setup 2424 2425%macro FUNC_SAVE 0 2426 ;; Required for Update/GMC_ENC 2427 ;the number of pushes must equal STACK_OFFSET 2428 push r12 2429 push r13 2430 push r14 2431 push r15 2432 mov r14, rsp 2433 2434 sub rsp, VARIABLE_OFFSET 2435 and rsp, ~63 2436 2437%ifidn __OUTPUT_FORMAT__, win64 2438 ; xmm6:xmm15 need to be maintained for Windows 2439 vmovdqu [rsp + LOCAL_STORAGE + 0*16],xmm6 2440 vmovdqu [rsp + LOCAL_STORAGE + 1*16],xmm7 2441 vmovdqu [rsp + LOCAL_STORAGE + 2*16],xmm8 2442 vmovdqu [rsp + LOCAL_STORAGE + 3*16],xmm9 2443 vmovdqu [rsp + LOCAL_STORAGE + 4*16],xmm10 2444 vmovdqu [rsp + LOCAL_STORAGE + 
5*16],xmm11 2445 vmovdqu [rsp + LOCAL_STORAGE + 6*16],xmm12 2446 vmovdqu [rsp + LOCAL_STORAGE + 7*16],xmm13 2447 vmovdqu [rsp + LOCAL_STORAGE + 8*16],xmm14 2448 vmovdqu [rsp + LOCAL_STORAGE + 9*16],xmm15 2449%endif 2450%endmacro 2451 2452 2453%macro FUNC_RESTORE 0 2454 2455%ifdef SAFE_DATA 2456 clear_scratch_gps_asm 2457 clear_scratch_zmms_asm 2458%endif 2459%ifidn __OUTPUT_FORMAT__, win64 2460 vmovdqu xmm15, [rsp + LOCAL_STORAGE + 9*16] 2461 vmovdqu xmm14, [rsp + LOCAL_STORAGE + 8*16] 2462 vmovdqu xmm13, [rsp + LOCAL_STORAGE + 7*16] 2463 vmovdqu xmm12, [rsp + LOCAL_STORAGE + 6*16] 2464 vmovdqu xmm11, [rsp + LOCAL_STORAGE + 5*16] 2465 vmovdqu xmm10, [rsp + LOCAL_STORAGE + 4*16] 2466 vmovdqu xmm9, [rsp + LOCAL_STORAGE + 3*16] 2467 vmovdqu xmm8, [rsp + LOCAL_STORAGE + 2*16] 2468 vmovdqu xmm7, [rsp + LOCAL_STORAGE + 1*16] 2469 vmovdqu xmm6, [rsp + LOCAL_STORAGE + 0*16] 2470%endif 2471;; Required for Update/GMC_ENC 2472 mov rsp, r14 2473 pop r15 2474 pop r14 2475 pop r13 2476 pop r12 2477%endmacro 2478 2479%macro CALC_J0 13 2480%define %%KEY %1 ;; [in] Pointer to GCM KEY structure 2481%define %%IV %2 ;; [in] Pointer to IV 2482%define %%IV_LEN %3 ;; [in] IV length 2483%define %%J0 %4 ;; [out] XMM reg to contain J0 2484%define %%TMP0 %5 ;; [clobbered] Temporary GP reg 2485%define %%TMP1 %6 ;; [clobbered] Temporary GP reg 2486%define %%TMP2 %7 ;; [clobbered] Temporary GP reg 2487%define %%XTMP0 %8 ;; [clobbered] Temporary XMM reg 2488%define %%XTMP1 %9 ;; [clobbered] Temporary XMM reg 2489%define %%XTMP2 %10 ;; [clobbered] Temporary XMM reg 2490%define %%XTMP3 %11 ;; [clobbered] Temporary XMM reg 2491%define %%XTMP4 %12 ;; [clobbered] Temporary XMM reg 2492%define %%XTMP5 %13 ;; [clobbered] Temporary XMM reg 2493 2494 ;; J0 = GHASH(IV || 0s+64 || len(IV)64) 2495 ;; s = 16 * RoundUp(len(IV)/16) - len(IV) */ 2496 2497 ;; Calculate GHASH of (IV || 0s) 2498 vpxor %%J0, %%J0 2499 CALC_AAD_HASH %%IV, %%IV_LEN, %%J0, %%KEY, %%XTMP0, %%XTMP1, %%XTMP2, \ 2500 %%XTMP3, %%XTMP4, %%XTMP5, %%TMP0, %%TMP1, %%TMP2 2501 2502 ;; Calculate GHASH of last 16-byte block (0 || len(IV)64) 2503 vmovdqu %%XTMP0, [%%KEY + HashKey] 2504 mov %%TMP2, %%IV_LEN 2505 shl %%TMP2, 3 ;; IV length in bits 2506 vmovq %%XTMP1, %%TMP2 2507 vpxor %%J0, %%XTMP1 2508 GHASH_MUL %%J0, %%XTMP0, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5 2509 2510 vpshufb %%J0, [rel SHUF_MASK] ; perform a 16Byte swap 2511%endmacro 2512 2513;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2514; GCM_INIT initializes a gcm_context_data struct to prepare for encoding/decoding. 2515; Input: gcm_key_data * (GDATA_KEY), gcm_context_data *(GDATA_CTX), IV, IV_LEN 2516; Additional Authentication data (A_IN), Additional Data length (A_LEN). 2517; Output: Updated GDATA_CTX with the hash of A_IN (AadHash) and initialized other parts of GDATA_CTX. 
2518; Clobbers rax, r10-r13, and xmm0-xmm6 2519;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2520%macro GCM_INIT 8-9 2521%define %%GDATA_KEY %1 ; [in] GCM expanded keys pointer 2522%define %%GDATA_CTX %2 ; [in] GCM context pointer 2523%define %%IV %3 ; [in] IV pointer 2524%define %%A_IN %4 ; [in] AAD pointer 2525%define %%A_LEN %5 ; [in] AAD length in bytes 2526%define %%GPR1 %6 ; temp GPR 2527%define %%GPR2 %7 ; temp GPR 2528%define %%GPR3 %8 ; temp GPR 2529%define %%IV_LEN %9 ; [in] IV length 2530 2531%define %%AAD_HASH xmm14 2532 2533 vpxor %%AAD_HASH, %%AAD_HASH 2534 CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%GDATA_KEY, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, %%GPR1, %%GPR2, %%GPR3 2535 2536 mov %%GPR1, %%A_LEN 2537 vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH ; ctx_data.aad hash = aad_hash 2538 mov [%%GDATA_CTX + AadLen], %%GPR1 ; ctx_data.aad_length = aad_length 2539 2540 xor %%GPR1, %%GPR1 2541 mov [%%GDATA_CTX + InLen], %%GPR1 ; ctx_data.in_length = 0 2542 mov [%%GDATA_CTX + PBlockLen], %%GPR1 ; ctx_data.partial_block_length = 0 2543 2544%if %0 == 9 ;; IV is different than 12 bytes 2545 CALC_J0 %%GDATA_KEY, %%IV, %%IV_LEN, xmm2, r10, r11, r12, xmm0, xmm1, \ 2546 xmm3, xmm4, xmm5, xmm6 2547%else ;; IV is 12 bytes 2548 ;; read 12 IV bytes and pad with 0x00000001 2549 mov %%GPR2, %%IV 2550 vmovd xmm3, [%%GPR2 + 8] 2551 vpslldq xmm3, 8 2552 vmovq xmm2, [%%GPR2] 2553 vmovdqa xmm4, [rel ONEf] 2554 vpternlogq xmm2, xmm3, xmm4, 0xfe ; xmm2 = xmm2 or xmm3 or xmm4 2555%endif 2556 vmovdqu [%%GDATA_CTX + OrigIV], xmm2 ; ctx_data.orig_IV = iv 2557 2558 ;; store IV as counter in LE format 2559 vpshufb xmm2, [rel SHUF_MASK] 2560 vmovdqu [%%GDATA_CTX + CurCount], xmm2 ; ctx_data.current_counter = iv 2561%endmacro 2562 2563%macro GCM_ENC_DEC_SMALL 12 2564%define %%GDATA_KEY %1 2565%define %%GDATA_CTX %2 2566%define %%CYPH_PLAIN_OUT %3 2567%define %%PLAIN_CYPH_IN %4 2568%define %%PLAIN_CYPH_LEN %5 2569%define %%ENC_DEC %6 2570%define %%DATA_OFFSET %7 2571%define %%LENGTH %8 ; assumed r13 2572%define %%NUM_BLOCKS %9 2573%define %%CTR %10 ; assumed xmm9 2574%define %%HASH_OUT %11 ; assumed xmm14 2575%define %%INSTANCE_TYPE %12 2576 2577 ;; NOTE: the check below is obsolete in current implementation. The check is already done in GCM_ENC_DEC. 
2578 ;; cmp %%NUM_BLOCKS, 0 2579 ;; je %%_small_initial_blocks_encrypted 2580 cmp %%NUM_BLOCKS, 8 2581 je %%_small_initial_num_blocks_is_8 2582 cmp %%NUM_BLOCKS, 7 2583 je %%_small_initial_num_blocks_is_7 2584 cmp %%NUM_BLOCKS, 6 2585 je %%_small_initial_num_blocks_is_6 2586 cmp %%NUM_BLOCKS, 5 2587 je %%_small_initial_num_blocks_is_5 2588 cmp %%NUM_BLOCKS, 4 2589 je %%_small_initial_num_blocks_is_4 2590 cmp %%NUM_BLOCKS, 3 2591 je %%_small_initial_num_blocks_is_3 2592 cmp %%NUM_BLOCKS, 2 2593 je %%_small_initial_num_blocks_is_2 2594 2595 jmp %%_small_initial_num_blocks_is_1 2596 2597 2598%%_small_initial_num_blocks_is_8: 2599 ;; r13 - %%LENGTH 2600 ;; xmm12 - T1 2601 ;; xmm13 - T2 2602 ;; xmm14 - T3 - AAD HASH OUT when not producing 8 AES keys 2603 ;; xmm15 - T4 2604 ;; xmm11 - T5 2605 ;; xmm9 - CTR 2606 ;; xmm1 - XMM1 - Cipher + Hash when producing 8 AES keys 2607 ;; xmm2 - XMM2 2608 ;; xmm3 - XMM3 2609 ;; xmm4 - XMM4 2610 ;; xmm5 - XMM5 2611 ;; xmm6 - XMM6 2612 ;; xmm7 - XMM7 2613 ;; xmm8 - XMM8 - AAD HASH IN 2614 ;; xmm10 - T6 2615 ;; xmm0 - T_key 2616 INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, \ 2617 %%PLAIN_CYPH_IN, %%LENGTH, %%DATA_OFFSET, 8, \ 2618 xmm12, xmm13, %%HASH_OUT, xmm15, xmm11, %%CTR, \ 2619 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, \ 2620 xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE 2621 jmp %%_small_initial_blocks_encrypted 2622 2623%%_small_initial_num_blocks_is_7: 2624 INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, \ 2625 %%PLAIN_CYPH_IN, %%LENGTH, %%DATA_OFFSET, 7, \ 2626 xmm12, xmm13, %%HASH_OUT, xmm15, xmm11, %%CTR, \ 2627 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, \ 2628 xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE 2629 jmp %%_small_initial_blocks_encrypted 2630 2631%%_small_initial_num_blocks_is_6: 2632 INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, \ 2633 %%PLAIN_CYPH_IN, %%LENGTH, %%DATA_OFFSET, 6, \ 2634 xmm12, xmm13, %%HASH_OUT, xmm15, xmm11, %%CTR, \ 2635 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, \ 2636 xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE 2637 jmp %%_small_initial_blocks_encrypted 2638 2639%%_small_initial_num_blocks_is_5: 2640 INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, \ 2641 %%PLAIN_CYPH_IN, %%LENGTH, %%DATA_OFFSET, 5, \ 2642 xmm12, xmm13, %%HASH_OUT, xmm15, xmm11, %%CTR, \ 2643 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, \ 2644 xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE 2645 jmp %%_small_initial_blocks_encrypted 2646 2647%%_small_initial_num_blocks_is_4: 2648 INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, \ 2649 %%PLAIN_CYPH_IN, %%LENGTH, %%DATA_OFFSET, 4, \ 2650 xmm12, xmm13, %%HASH_OUT, xmm15, xmm11, %%CTR, \ 2651 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, \ 2652 xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE 2653 jmp %%_small_initial_blocks_encrypted 2654 2655%%_small_initial_num_blocks_is_3: 2656 INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, \ 2657 %%PLAIN_CYPH_IN, %%LENGTH, %%DATA_OFFSET, 3, \ 2658 xmm12, xmm13, %%HASH_OUT, xmm15, xmm11, %%CTR, \ 2659 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, \ 2660 xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE 2661 jmp %%_small_initial_blocks_encrypted 2662 2663%%_small_initial_num_blocks_is_2: 2664 INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, \ 2665 %%PLAIN_CYPH_IN, %%LENGTH, %%DATA_OFFSET, 2, \ 2666 xmm12, xmm13, %%HASH_OUT, xmm15, xmm11, %%CTR, \ 2667 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, \ 2668 xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE 2669 jmp 
%%_small_initial_blocks_encrypted 2670 2671%%_small_initial_num_blocks_is_1: 2672 INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, \ 2673 %%PLAIN_CYPH_IN, %%LENGTH, %%DATA_OFFSET, 1, \ 2674 xmm12, xmm13, %%HASH_OUT, xmm15, xmm11, %%CTR, \ 2675 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, \ 2676 xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE 2677%%_small_initial_blocks_encrypted: 2678 2679%endmacro ; GCM_ENC_DEC_SMALL 2680 2681;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2682; GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context_data struct 2683; has been initialized by GCM_INIT 2684; Requires the input data be at least 1 byte long because of READ_SMALL_INPUT_DATA. 2685; Input: gcm_key_data struct* (GDATA_KEY), gcm_context_data *(GDATA_CTX), input text (PLAIN_CYPH_IN), 2686; input text length (PLAIN_CYPH_LEN) and whether encoding or decoding (ENC_DEC). 2687; Output: A cypher of the given plain text (CYPH_PLAIN_OUT), and updated GDATA_CTX 2688; Clobbers rax, r10-r15, and xmm0-xmm15 2689;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2690%macro GCM_ENC_DEC 7 2691%define %%GDATA_KEY %1 2692%define %%GDATA_CTX %2 2693%define %%CYPH_PLAIN_OUT %3 2694%define %%PLAIN_CYPH_IN %4 2695%define %%PLAIN_CYPH_LEN %5 2696%define %%ENC_DEC %6 2697%define %%INSTANCE_TYPE %7 2698%define %%DATA_OFFSET r11 2699 2700; Macro flow: 2701; calculate the number of 16byte blocks in the message 2702; process (number of 16byte blocks) mod 8 '%%_initial_num_blocks_is_# .. %%_initial_blocks_encrypted' 2703; process 8 16 byte blocks at a time until all are done '%%_encrypt_by_8_new .. %%_eight_cipher_left' 2704; if there is a block of less than 16 bytes process it '%%_zero_cipher_left .. %%_multiple_of_16_bytes' 2705 2706%ifidn __OUTPUT_FORMAT__, win64 2707 cmp %%PLAIN_CYPH_LEN, 0 2708%else 2709 or %%PLAIN_CYPH_LEN, %%PLAIN_CYPH_LEN 2710%endif 2711 je %%_enc_dec_done 2712 2713 xor %%DATA_OFFSET, %%DATA_OFFSET 2714 ;; Update length of data processed 2715%ifidn __OUTPUT_FORMAT__, win64 2716 mov rax, %%PLAIN_CYPH_LEN 2717 add [%%GDATA_CTX + InLen], rax 2718%else 2719 add [%%GDATA_CTX + InLen], %%PLAIN_CYPH_LEN 2720%endif 2721 vmovdqu xmm13, [%%GDATA_KEY + HashKey] 2722 vmovdqu xmm8, [%%GDATA_CTX + AadHash] 2723 2724%ifidn %%INSTANCE_TYPE, multi_call 2725 ;; NOTE: partial block processing makes only sense for multi_call here. 2726 ;; Used for the update flow - if there was a previous partial 2727 ;; block fill the remaining bytes here. 2728 PARTIAL_BLOCK %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%DATA_OFFSET, xmm8, xmm13, %%ENC_DEC 2729%endif 2730 2731 ;; lift CTR set from initial_blocks to here 2732%ifidn %%INSTANCE_TYPE, single_call 2733 vmovdqu xmm9, xmm2 2734%else 2735 vmovdqu xmm9, [%%GDATA_CTX + CurCount] 2736%endif 2737 2738 ;; Save the amount of data left to process in r10 2739 mov r13, %%PLAIN_CYPH_LEN 2740%ifidn %%INSTANCE_TYPE, multi_call 2741 ;; NOTE: %%DATA_OFFSET is zero in single_call case. 2742 ;; Consequently PLAIN_CYPH_LEN will never be zero after 2743 ;; %%DATA_OFFSET subtraction below. 2744 sub r13, %%DATA_OFFSET 2745 2746 ;; There may be no more data if it was consumed in the partial block. 
        cmp r13, 0
        je %%_enc_dec_done
%endif ; %%INSTANCE_TYPE, multi_call
        mov r10, r13

        ;; Determine how many blocks to process in INITIAL
        mov r12, r13
        shr r12, 4
        and r12, 7

        ;; Process one additional block in INITIAL if there is a partial block
        and r10, 0xf
        blsmsk r10, r10         ; Set CF if zero
        cmc                     ; Flip CF
        adc r12, 0x0            ; Process an additional INITIAL block if CF set

        ;; Messages of less than 128 bytes are handled by the small message code,
        ;; which can process up to 7 full 16B blocks plus a final partial block.
        cmp r13, 128
        jge %%_large_message_path

        GCM_ENC_DEC_SMALL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%ENC_DEC, %%DATA_OFFSET, r13, r12, xmm9, xmm14, %%INSTANCE_TYPE
        jmp %%_ghash_done

%%_large_message_path:
        and r12, 0x7            ; Still, don't allow 8 INITIAL blocks since this case
                                ; can be handled by the x8 partial loop.

        cmp r12, 0
        je %%_initial_num_blocks_is_0
        cmp r12, 7
        je %%_initial_num_blocks_is_7
        cmp r12, 6
        je %%_initial_num_blocks_is_6
        cmp r12, 5
        je %%_initial_num_blocks_is_5
        cmp r12, 4
        je %%_initial_num_blocks_is_4
        cmp r12, 3
        je %%_initial_num_blocks_is_3
        cmp r12, 2
        je %%_initial_num_blocks_is_2

        jmp %%_initial_num_blocks_is_1

%%_initial_num_blocks_is_7:
        ;; r13   - %%LENGTH
        ;; xmm12 - T1
        ;; xmm13 - T2
        ;; xmm14 - T3 - AAD HASH OUT when not producing 8 AES keys
        ;; xmm15 - T4
        ;; xmm11 - T5
        ;; xmm9  - CTR
        ;; xmm1  - XMM1 - Cipher + Hash when producing 8 AES keys
        ;; xmm2  - XMM2
        ;; xmm3  - XMM3
        ;; xmm4  - XMM4
        ;; xmm5  - XMM5
        ;; xmm6  - XMM6
        ;; xmm7  - XMM7
        ;; xmm8  - XMM8 - AAD HASH IN
        ;; xmm10 - T6
        ;; xmm0  - T_key
        INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 7, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
        jmp %%_initial_blocks_encrypted

%%_initial_num_blocks_is_6:
        INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 6, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
        jmp %%_initial_blocks_encrypted

%%_initial_num_blocks_is_5:
        INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 5, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
        jmp %%_initial_blocks_encrypted

%%_initial_num_blocks_is_4:
        INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 4, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
        jmp %%_initial_blocks_encrypted

%%_initial_num_blocks_is_3:
        INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 3, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
        jmp %%_initial_blocks_encrypted

%%_initial_num_blocks_is_2:
        INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 2, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
        jmp %%_initial_blocks_encrypted

%%_initial_num_blocks_is_1:
        INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 1, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
        jmp %%_initial_blocks_encrypted

%%_initial_num_blocks_is_0:
        INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 0, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC

%%_initial_blocks_encrypted:
        ;; The entire message may already have been processed in INITIAL;
        ;; if so, it now only needs to be hashed
        cmp r13, 0
        je %%_encrypt_done

        ;; Encrypt the final <16 byte (partial) block, then hash
        cmp r13, 16
        jl %%_encrypt_final_partial

        ;; Process 7 full blocks plus a partial block
        cmp r13, 128
        jl %%_encrypt_by_8_partial

%%_encrypt_by_8_parallel:
        ;; in_order vs. out_order is an optimization to increment the counter
        ;; without shuffling it back into little endian. r15d keeps track of
        ;; when we need to increment in order so that the carry is handled correctly.
        vmovd r15d, xmm9
        and r15d, 255
        vpshufb xmm9, [rel SHUF_MASK]

%%_encrypt_by_8_new:
        cmp r15d, 255-8
        jg %%_encrypt_by_8

        ;; xmm0  - T1
        ;; xmm10 - T2
        ;; xmm11 - T3
        ;; xmm12 - T4
        ;; xmm13 - T5
        ;; xmm14 - T6
        ;; xmm9  - CTR
        ;; xmm1  - XMM1
        ;; xmm2  - XMM2
        ;; xmm3  - XMM3
        ;; xmm4  - XMM4
        ;; xmm5  - XMM5
        ;; xmm6  - XMM6
        ;; xmm7  - XMM7
        ;; xmm8  - XMM8
        ;; xmm15 - T7
        add r15b, 8
        GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, out_order, %%ENC_DEC, full
        add %%DATA_OFFSET, 128
        sub r13, 128
        cmp r13, 128
        jge %%_encrypt_by_8_new

        vpshufb xmm9, [rel SHUF_MASK]
        jmp %%_encrypt_by_8_parallel_done

%%_encrypt_by_8:
        vpshufb xmm9, [rel SHUF_MASK]
        add r15b, 8
        GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC, full
        vpshufb xmm9, [rel SHUF_MASK]
        add %%DATA_OFFSET, 128
        sub r13, 128
        cmp r13, 128
        jge %%_encrypt_by_8_new
        vpshufb xmm9, [rel SHUF_MASK]

%%_encrypt_by_8_parallel_done:
        ;; Test to see if we need a by-8 pass with a partial block. At this point
        ;; bytes remaining should be either zero or between 113-127.
        cmp r13, 0
        je %%_encrypt_done

%%_encrypt_by_8_partial:
        ;; Shuffle needed to align key for partial block xor. out_order
        ;; is a little faster because it avoids extra shuffles.
        ;; TBD: Might need to account for when we don't have room to increment the counter.

        ;; Process parallel buffers with a final partial block.
2921 GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC, partial 2922 2923 2924 add %%DATA_OFFSET, 128-16 2925 sub r13, 128-16 2926 2927%%_encrypt_final_partial: 2928 2929 vpshufb xmm8, [rel SHUF_MASK] 2930 mov [%%GDATA_CTX + PBlockLen], r13 2931 vmovdqu [%%GDATA_CTX + PBlockEncKey], xmm8 2932 2933 ;; xmm8 - Final encrypted counter - need to hash with partial or full block ciphertext 2934 ;; GDATA, KEY, T1, T2 2935 ENCRYPT_FINAL_PARTIAL_BLOCK xmm8, xmm0, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%ENC_DEC, %%DATA_OFFSET 2936 2937 vpshufb xmm8, [rel SHUF_MASK] 2938 2939 2940%%_encrypt_done: 2941 2942 ;; Mapping to macro parameters 2943 ;; IN: 2944 ;; xmm9 contains the counter 2945 ;; xmm1-xmm8 contain the xor'd ciphertext 2946 ;; OUT: 2947 ;; xmm14 contains the final hash 2948 ;; GDATA, T1, T2, T3, T4, T5, T6, T7, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8 2949%ifidn %%INSTANCE_TYPE, multi_call 2950 mov r13, [%%GDATA_CTX + PBlockLen] 2951 cmp r13, 0 2952 jz %%_hash_last_8 2953 GHASH_LAST_7 %%GDATA_KEY, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7 2954 ;; XOR the partial word into the hash 2955 vpxor xmm14, xmm14, xmm8 2956 jmp %%_ghash_done 2957%endif 2958%%_hash_last_8: 2959 GHASH_LAST_8 %%GDATA_KEY, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8 2960 2961%%_ghash_done: 2962 vmovdqu [%%GDATA_CTX + CurCount], xmm9 ; my_ctx_data.current_counter = xmm9 2963 vmovdqu [%%GDATA_CTX + AadHash], xmm14 ; my_ctx_data.aad hash = xmm14 2964 2965%%_enc_dec_done: 2966 2967 2968%endmacro ; GCM_ENC_DEC 2969 2970;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2971; GCM_COMPLETE Finishes Encryption/Decryption of last partial block after GCM_UPDATE finishes. 2972; Input: A gcm_key_data * (GDATA_KEY), gcm_context_data (GDATA_CTX). 2973; Output: Authorization Tag (AUTH_TAG) and Authorization Tag length (AUTH_TAG_LEN) 2974; Clobbers rax, r10-r12, and xmm0-xmm2, xmm5-xmm6, xmm9-xmm11, xmm13-xmm15 2975;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2976%macro GCM_COMPLETE 5 2977%define %%GDATA_KEY %1 2978%define %%GDATA_CTX %2 2979%define %%AUTH_TAG %3 2980%define %%AUTH_TAG_LEN %4 2981%define %%INSTANCE_TYPE %5 2982%define %%PLAIN_CYPH_LEN rax 2983 2984 vmovdqu xmm13, [%%GDATA_KEY + HashKey] 2985 ;; Start AES as early as possible 2986 vmovdqu xmm9, [%%GDATA_CTX + OrigIV] ; xmm9 = Y0 2987 ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, xmm9 ; E(K, Y0) 2988 2989%ifidn %%INSTANCE_TYPE, multi_call 2990 ;; If the GCM function is called as a single function call rather 2991 ;; than invoking the individual parts (init, update, finalize) we 2992 ;; can remove a write to read dependency on AadHash. 2993 vmovdqu xmm14, [%%GDATA_CTX + AadHash] 2994 2995 ;; Encrypt the final partial block. If we did this as a single call then 2996 ;; the partial block was handled in the main GCM_ENC_DEC macro. 
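;; Editorial note (added): for reference, the tag produced via GCM_COMPLETE
;; follows the standard GCM definition (NIST SP 800-38D):
;;
;;     S = GHASH_H( A || 0^v || C || 0^u || [len(A)]64 || [len(C)]64 )
;;     T = MSB_auth_tag_len( E(K, J0) XOR S )
;;
;; In this file the running GHASH value S lives in AadHash/xmm14, E(K, J0) is
;; computed from OrigIV into xmm9 above, and the length block is built from
;; AadLen and InLen below before the final GHASH_MUL.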
2997 mov r12, [%%GDATA_CTX + PBlockLen] 2998 cmp r12, 0 2999 3000 je %%_partial_done 3001 3002 GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block 3003 vmovdqu [%%GDATA_CTX + AadHash], xmm14 3004 3005%%_partial_done: 3006 3007%endif 3008 3009 mov r12, [%%GDATA_CTX + AadLen] ; r12 = aadLen (number of bytes) 3010 mov %%PLAIN_CYPH_LEN, [%%GDATA_CTX + InLen] 3011 3012 shl r12, 3 ; convert into number of bits 3013 vmovq xmm15, r12 ; len(A) in xmm15 3014 3015 shl %%PLAIN_CYPH_LEN, 3 ; len(C) in bits (*128) 3016 vmovq xmm1, %%PLAIN_CYPH_LEN 3017 vpslldq xmm15, xmm15, 8 ; xmm15 = len(A)|| 0x0000000000000000 3018 vpxor xmm15, xmm15, xmm1 ; xmm15 = len(A)||len(C) 3019 3020 vpxor xmm14, xmm15 3021 GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 3022 vpshufb xmm14, [rel SHUF_MASK] ; perform a 16Byte swap 3023 3024 vpxor xmm9, xmm9, xmm14 3025 3026 3027%%_return_T: 3028 mov r10, %%AUTH_TAG ; r10 = authTag 3029 mov r11, %%AUTH_TAG_LEN ; r11 = auth_tag_len 3030 3031 cmp r11, 16 3032 je %%_T_16 3033 3034 cmp r11, 12 3035 je %%_T_12 3036 3037 cmp r11, 8 3038 je %%_T_8 3039 3040 simd_store_avx r10, xmm9, r11, r12, rax 3041 jmp %%_return_T_done 3042%%_T_8: 3043 vmovq rax, xmm9 3044 mov [r10], rax 3045 jmp %%_return_T_done 3046%%_T_12: 3047 vmovq rax, xmm9 3048 mov [r10], rax 3049 vpsrldq xmm9, xmm9, 8 3050 vmovd eax, xmm9 3051 mov [r10 + 8], eax 3052 jmp %%_return_T_done 3053%%_T_16: 3054 vmovdqu [r10], xmm9 3055 3056%%_return_T_done: 3057 3058%ifdef SAFE_DATA 3059 ;; Clear sensitive data from context structure 3060 vpxor xmm0, xmm0 3061 vmovdqu [%%GDATA_CTX + AadHash], xmm0 3062 vmovdqu [%%GDATA_CTX + PBlockEncKey], xmm0 3063%endif 3064%endmacro ; GCM_COMPLETE 3065 3066 3067;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3068;void aes_gcm_precomp_128_avx512 / 3069; aes_gcm_precomp_192_avx512 / 3070; aes_gcm_precomp_256_avx512 3071; (struct gcm_key_data *key_data) 3072;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3073MKGLOBAL(FN_NAME(precomp,_),function,) 3074FN_NAME(precomp,_): 3075;; Parameter is passed through register 3076%ifdef SAFE_PARAM 3077 ;; Check key_data != NULL 3078 cmp arg1, 0 3079 jz exit_precomp 3080%endif 3081 3082 push r12 3083 push r13 3084 push r14 3085 push r15 3086 3087 mov r14, rsp 3088 3089 3090 3091 sub rsp, VARIABLE_OFFSET 3092 and rsp, ~63 ; align rsp to 64 bytes 3093 3094%ifidn __OUTPUT_FORMAT__, win64 3095 ; only xmm6 needs to be maintained 3096 vmovdqu [rsp + LOCAL_STORAGE + 0*16],xmm6 3097%endif 3098 3099 vpxor xmm6, xmm6 3100 ENCRYPT_SINGLE_BLOCK arg1, xmm6 ; xmm6 = HashKey 3101 3102 vpshufb xmm6, [rel SHUF_MASK] 3103 ;;;;;;;;;;;;;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey;;;;;;;;;;;;;;; 3104 vmovdqa xmm2, xmm6 3105 vpsllq xmm6, xmm6, 1 3106 vpsrlq xmm2, xmm2, 63 3107 vmovdqa xmm1, xmm2 3108 vpslldq xmm2, xmm2, 8 3109 vpsrldq xmm1, xmm1, 8 3110 vpor xmm6, xmm6, xmm2 3111 ;reduction 3112 vpshufd xmm2, xmm1, 00100100b 3113 vpcmpeqd xmm2, [rel TWOONE] 3114 vpand xmm2, xmm2, [rel POLY] 3115 vpxor xmm6, xmm6, xmm2 ; xmm6 holds the HashKey<<1 mod poly 3116 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3117 vmovdqu [arg1 + HashKey], xmm6 ; store HashKey<<1 mod poly 3118 3119 3120 PRECOMPUTE arg1, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 3121 3122%ifdef SAFE_DATA 3123 clear_scratch_gps_asm 3124 clear_scratch_zmms_asm 3125%endif 3126%ifidn __OUTPUT_FORMAT__, win64 3127 vmovdqu xmm6, [rsp + LOCAL_STORAGE + 0*16] 3128%endif 3129 mov rsp, r14 3130 3131 
pop r15 3132 pop r14 3133 pop r13 3134 pop r12 3135 3136exit_precomp: 3137 ret 3138 3139 3140;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3141;void aes_gcm_init_128_avx512 / aes_gcm_init_192_avx512 / aes_gcm_init_256_avx512 3142; (const struct gcm_key_data *key_data, 3143; struct gcm_context_data *context_data, 3144; u8 *iv, 3145; const u8 *aad, 3146; u64 aad_len); 3147;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3148MKGLOBAL(FN_NAME(init,_),function,) 3149FN_NAME(init,_): 3150 push r12 3151 push r13 3152%ifidn __OUTPUT_FORMAT__, win64 3153 push r14 3154 push r15 3155 mov r14, rsp 3156 ; xmm6 needs to be maintained for Windows 3157 sub rsp, 1*16 3158 vmovdqu [rsp + 0*16], xmm6 3159%endif 3160 3161%ifdef SAFE_PARAM 3162 ;; Check key_data != NULL 3163 cmp arg1, 0 3164 jz exit_init 3165 3166 ;; Check context_data != NULL 3167 cmp arg2, 0 3168 jz exit_init 3169 3170 ;; Check IV != NULL 3171 cmp arg3, 0 3172 jz exit_init 3173 3174 ;; Check if aad_len == 0 3175 cmp arg5, 0 3176 jz skip_aad_check_init 3177 3178 ;; Check aad != NULL (aad_len != 0) 3179 cmp arg4, 0 3180 jz exit_init 3181 3182skip_aad_check_init: 3183%endif 3184 GCM_INIT arg1, arg2, arg3, arg4, arg5, r10, r11, r12 3185 3186%ifdef SAFE_DATA 3187 clear_scratch_gps_asm 3188 clear_scratch_zmms_asm 3189%endif 3190exit_init: 3191%ifidn __OUTPUT_FORMAT__, win64 3192 vmovdqu xmm6 , [rsp + 0*16] 3193 mov rsp, r14 3194 pop r15 3195 pop r14 3196%endif 3197 pop r13 3198 pop r12 3199 ret 3200 3201 3202;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3203;void aes_gcm_init_var_iv_128_avx512 / aes_gcm_init_var_iv_192_avx512 / 3204; aes_gcm_init_var_iv_256_avx512 3205; (const struct gcm_key_data *key_data, 3206; struct gcm_context_data *context_data, 3207; u8 *iv, 3208; const u64 iv_len, 3209; const u8 *aad, 3210; const u64 aad_len); 3211;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3212MKGLOBAL(FN_NAME(init_var_iv,_),function,) 3213FN_NAME(init_var_iv,_): 3214 push r12 3215 push r13 3216%ifidn __OUTPUT_FORMAT__, win64 3217 push r14 3218 push r15 3219 mov r14, rsp 3220 ; xmm6 needs to be maintained for Windows 3221 sub rsp, 1*16 3222 vmovdqu [rsp + 0*16], xmm6 3223%endif 3224 3225%ifdef SAFE_PARAM 3226 ;; Check key_data != NULL 3227 cmp arg1, 0 3228 jz exit_init_IV 3229 3230 ;; Check context_data != NULL 3231 cmp arg2, 0 3232 jz exit_init_IV 3233 3234 ;; Check IV != NULL 3235 cmp arg3, 0 3236 jz exit_init_IV 3237 3238 ;; Check iv_len != 0 3239 cmp arg4, 0 3240 jz exit_init_IV 3241 3242 ;; Check if aad_len == 0 3243 cmp arg6, 0 3244 jz skip_aad_check_init_IV 3245 3246 ;; Check aad != NULL (aad_len != 0) 3247 cmp arg5, 0 3248 jz exit_init_IV 3249 3250skip_aad_check_init_IV: 3251%endif 3252 cmp arg4, 12 3253 je iv_len_12_init_IV 3254 3255 GCM_INIT arg1, arg2, arg3, arg5, arg6, r10, r11, r12, arg4 3256 jmp skip_iv_len_12_init_IV 3257 3258iv_len_12_init_IV: 3259 GCM_INIT arg1, arg2, arg3, arg5, arg6, r10, r11, r12 3260 3261skip_iv_len_12_init_IV: 3262%ifdef SAFE_DATA 3263 clear_scratch_gps_asm 3264 clear_scratch_zmms_asm 3265%endif 3266exit_init_IV: 3267%ifidn __OUTPUT_FORMAT__, win64 3268 vmovdqu xmm6 , [rsp + 0*16] 3269 mov rsp, r14 3270 pop r15 3271 pop r14 3272%endif 3273 pop r13 3274 pop r12 3275 ret 3276 3277 3278;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3279;void 
aes_gcm_enc_128_update_avx512 / aes_gcm_enc_192_update_avx512 / 3280; aes_gcm_enc_256_update_avx512 3281; (const struct gcm_key_data *key_data, 3282; struct gcm_context_data *context_data, 3283; u8 *out, 3284; const u8 *in, 3285; u64 plaintext_len); 3286;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3287MKGLOBAL(FN_NAME(enc,_update_),function,) 3288FN_NAME(enc,_update_): 3289 3290 FUNC_SAVE 3291 3292%ifdef SAFE_PARAM 3293 ;; Check key_data != NULL 3294 cmp arg1, 0 3295 jz exit_update_enc 3296 3297 ;; Check context_data != NULL 3298 cmp arg2, 0 3299 jz exit_update_enc 3300 3301 ;; Check if plaintext_len == 0 3302 cmp arg5, 0 3303 jz skip_in_out_check_update_enc 3304 3305 ;; Check out != NULL (plaintext_len != 0) 3306 cmp arg3, 0 3307 jz exit_update_enc 3308 3309 ;; Check in != NULL (plaintext_len != 0) 3310 cmp arg4, 0 3311 jz exit_update_enc 3312 3313skip_in_out_check_update_enc: 3314%endif 3315 GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC, multi_call 3316 3317exit_update_enc: 3318 FUNC_RESTORE 3319 3320 ret 3321 3322 3323;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3324;void aes_gcm_dec_128_update_avx512 / aes_gcm_dec_192_update_avx512 / 3325; aes_gcm_dec_256_update_avx512 3326; (const struct gcm_key_data *key_data, 3327; struct gcm_context_data *context_data, 3328; u8 *out, 3329; const u8 *in, 3330; u64 plaintext_len); 3331;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3332MKGLOBAL(FN_NAME(dec,_update_),function,) 3333FN_NAME(dec,_update_): 3334 3335 FUNC_SAVE 3336 3337%ifdef SAFE_PARAM 3338 ;; Check key_data != NULL 3339 cmp arg1, 0 3340 jz exit_update_dec 3341 3342 ;; Check context_data != NULL 3343 cmp arg2, 0 3344 jz exit_update_dec 3345 3346 ;; Check if plaintext_len == 0 3347 cmp arg5, 0 3348 jz skip_in_out_check_update_dec 3349 3350 ;; Check out != NULL (plaintext_len != 0) 3351 cmp arg3, 0 3352 jz exit_update_dec 3353 3354 ;; Check in != NULL (plaintext_len != 0) 3355 cmp arg4, 0 3356 jz exit_update_dec 3357 3358skip_in_out_check_update_dec: 3359%endif 3360 3361 GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC, multi_call 3362 3363exit_update_dec: 3364 FUNC_RESTORE 3365 ret 3366 3367;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3368;void aes_gcm_enc_128_finalize_avx512 / aes_gcm_enc_192_finalize_avx512 / 3369; aes_gcm_enc_256_finalize_avx512 3370; (const struct gcm_key_data *key_data, 3371; struct gcm_context_data *context_data, 3372; u8 *auth_tag, 3373; u64 auth_tag_len); 3374;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3375MKGLOBAL(FN_NAME(enc,_finalize_),function,) 3376FN_NAME(enc,_finalize_): 3377 3378;; All parameters are passed through registers 3379%ifdef SAFE_PARAM 3380 ;; Check key_data != NULL 3381 cmp arg1, 0 3382 jz exit_enc_fin 3383 3384 ;; Check context_data != NULL 3385 cmp arg2, 0 3386 jz exit_enc_fin 3387 3388 ;; Check auth_tag != NULL 3389 cmp arg3, 0 3390 jz exit_enc_fin 3391 3392 ;; Check auth_tag_len == 0 or > 16 3393 cmp arg4, 0 3394 jz exit_enc_fin 3395 3396 cmp arg4, 16 3397 ja exit_enc_fin 3398%endif 3399 3400 push r12 3401 3402%ifidn __OUTPUT_FORMAT__, win64 3403 ; xmm6:xmm15 need to be maintained for Windows 3404 sub rsp, 7*16 3405 vmovdqu [rsp + 0*16], xmm6 3406 vmovdqu [rsp + 1*16], xmm9 3407 vmovdqu [rsp + 2*16], xmm10 3408 vmovdqu [rsp + 3*16], xmm11 3409 vmovdqu [rsp + 4*16], xmm13 
3410 vmovdqu [rsp + 5*16], xmm14 3411 vmovdqu [rsp + 6*16], xmm15 3412%endif 3413 GCM_COMPLETE arg1, arg2, arg3, arg4, multi_call 3414 3415%ifdef SAFE_DATA 3416 clear_scratch_gps_asm 3417 clear_scratch_zmms_asm 3418%endif 3419%ifidn __OUTPUT_FORMAT__, win64 3420 vmovdqu xmm15, [rsp + 6*16] 3421 vmovdqu xmm14, [rsp + 5*16] 3422 vmovdqu xmm13, [rsp + 4*16] 3423 vmovdqu xmm11, [rsp + 3*16] 3424 vmovdqu xmm10, [rsp + 2*16] 3425 vmovdqu xmm9, [rsp + 1*16] 3426 vmovdqu xmm6, [rsp + 0*16] 3427 add rsp, 7*16 3428%endif 3429 3430 pop r12 3431 3432exit_enc_fin: 3433 ret 3434 3435 3436;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3437;void aes_gcm_dec_128_finalize_avx512 / aes_gcm_dec_192_finalize_avx512 3438; aes_gcm_dec_256_finalize_avx512 3439; (const struct gcm_key_data *key_data, 3440; struct gcm_context_data *context_data, 3441; u8 *auth_tag, 3442; u64 auth_tag_len); 3443;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3444MKGLOBAL(FN_NAME(dec,_finalize_),function,) 3445FN_NAME(dec,_finalize_): 3446 3447;; All parameters are passed through registers 3448%ifdef SAFE_PARAM 3449 ;; Check key_data != NULL 3450 cmp arg1, 0 3451 jz exit_dec_fin 3452 3453 ;; Check context_data != NULL 3454 cmp arg2, 0 3455 jz exit_dec_fin 3456 3457 ;; Check auth_tag != NULL 3458 cmp arg3, 0 3459 jz exit_dec_fin 3460 3461 ;; Check auth_tag_len == 0 or > 16 3462 cmp arg4, 0 3463 jz exit_dec_fin 3464 3465 cmp arg4, 16 3466 ja exit_dec_fin 3467%endif 3468 3469 push r12 3470 3471%ifidn __OUTPUT_FORMAT__, win64 3472 ; xmm6:xmm15 need to be maintained for Windows 3473 sub rsp, 7*16 3474 vmovdqu [rsp + 0*16], xmm6 3475 vmovdqu [rsp + 1*16], xmm9 3476 vmovdqu [rsp + 2*16], xmm10 3477 vmovdqu [rsp + 3*16], xmm11 3478 vmovdqu [rsp + 4*16], xmm13 3479 vmovdqu [rsp + 5*16], xmm14 3480 vmovdqu [rsp + 6*16], xmm15 3481%endif 3482 GCM_COMPLETE arg1, arg2, arg3, arg4, multi_call 3483 3484%ifdef SAFE_DATA 3485 clear_scratch_gps_asm 3486 clear_scratch_zmms_asm 3487%endif 3488%ifidn __OUTPUT_FORMAT__, win64 3489 vmovdqu xmm15, [rsp + 6*16] 3490 vmovdqu xmm14, [rsp + 5*16] 3491 vmovdqu xmm13, [rsp + 4*16] 3492 vmovdqu xmm11, [rsp + 3*16] 3493 vmovdqu xmm10, [rsp + 2*16] 3494 vmovdqu xmm9, [rsp + 1*16] 3495 vmovdqu xmm6, [rsp + 0*16] 3496 add rsp, 7*16 3497%endif 3498 3499 pop r12 3500exit_dec_fin: 3501 3502 ret 3503 3504 3505;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3506;void aes_gcm_enc_128_avx512 / aes_gcm_enc_192_avx512 / aes_gcm_enc_256_avx512 3507; (const struct gcm_key_data *key_data, 3508; struct gcm_context_data *context_data, 3509; u8 *out, 3510; const u8 *in, 3511; u64 plaintext_len, 3512; u8 *iv, 3513; const u8 *aad, 3514; u64 aad_len, 3515; u8 *auth_tag, 3516; u64 auth_tag_len); 3517;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3518MKGLOBAL(FN_NAME(enc,_),function,) 3519FN_NAME(enc,_): 3520 3521 FUNC_SAVE 3522 3523%ifdef SAFE_PARAM 3524 ;; Check key_data != NULL 3525 cmp arg1, 0 3526 jz exit_enc 3527 3528 ;; Check context_data != NULL 3529 cmp arg2, 0 3530 jz exit_enc 3531 3532 ;; Check IV != NULL 3533 cmp arg6, 0 3534 jz exit_enc 3535 3536 ;; Check auth_tag != NULL 3537 cmp arg9, 0 3538 jz exit_enc 3539 3540 ;; Check auth_tag_len == 0 or > 16 3541 cmp arg10, 0 3542 jz exit_enc 3543 3544 cmp arg10, 16 3545 ja exit_enc 3546 3547 ;; Check if plaintext_len == 0 3548 cmp arg5, 0 3549 jz skip_in_out_check_enc 3550 3551 
;; Check out != NULL (plaintext_len != 0) 3552 cmp arg3, 0 3553 jz exit_enc 3554 3555 ;; Check in != NULL (plaintext_len != 0) 3556 cmp arg4, 0 3557 jz exit_enc 3558 3559skip_in_out_check_enc: 3560 ;; Check if aad_len == 0 3561 cmp arg8, 0 3562 jz skip_aad_check_enc 3563 3564 ;; Check aad != NULL (aad_len != 0) 3565 cmp arg7, 0 3566 jz exit_enc 3567 3568skip_aad_check_enc: 3569%endif 3570 GCM_INIT arg1, arg2, arg6, arg7, arg8, r10, r11, r12 3571 3572 GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC, single_call 3573 3574 GCM_COMPLETE arg1, arg2, arg9, arg10, single_call 3575 3576exit_enc: 3577 FUNC_RESTORE 3578 3579 ret 3580 3581;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3582;void aes_gcm_dec_128_avx512 / aes_gcm_dec_192_avx512 / aes_gcm_dec_256_avx512 3583; (const struct gcm_key_data *key_data, 3584; struct gcm_context_data *context_data, 3585; u8 *out, 3586; const u8 *in, 3587; u64 plaintext_len, 3588; u8 *iv, 3589; const u8 *aad, 3590; u64 aad_len, 3591; u8 *auth_tag, 3592; u64 auth_tag_len); 3593;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3594MKGLOBAL(FN_NAME(dec,_),function,) 3595FN_NAME(dec,_): 3596 3597 FUNC_SAVE 3598 3599%ifdef SAFE_PARAM 3600 ;; Check key_data != NULL 3601 cmp arg1, 0 3602 jz exit_dec 3603 3604 ;; Check context_data != NULL 3605 cmp arg2, 0 3606 jz exit_dec 3607 3608 ;; Check IV != NULL 3609 cmp arg6, 0 3610 jz exit_dec 3611 3612 ;; Check auth_tag != NULL 3613 cmp arg9, 0 3614 jz exit_dec 3615 3616 ;; Check auth_tag_len == 0 or > 16 3617 cmp arg10, 0 3618 jz exit_dec 3619 3620 cmp arg10, 16 3621 ja exit_dec 3622 3623 ;; Check if plaintext_len == 0 3624 cmp arg5, 0 3625 jz skip_in_out_check_dec 3626 3627 ;; Check out != NULL (plaintext_len != 0) 3628 cmp arg3, 0 3629 jz exit_dec 3630 3631 ;; Check in != NULL (plaintext_len != 0) 3632 cmp arg4, 0 3633 jz exit_dec 3634 3635skip_in_out_check_dec: 3636 ;; Check if aad_len == 0 3637 cmp arg8, 0 3638 jz skip_aad_check_dec 3639 3640 ;; Check aad != NULL (aad_len != 0) 3641 cmp arg7, 0 3642 jz exit_dec 3643 3644skip_aad_check_dec: 3645%endif 3646 3647 GCM_INIT arg1, arg2, arg6, arg7, arg8, r10, r11, r12 3648 3649 GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC, single_call 3650 3651 GCM_COMPLETE arg1, arg2, arg9, arg10, single_call 3652 3653exit_dec: 3654 FUNC_RESTORE 3655 3656 ret 3657 3658;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3659;void aes_gcm_enc_var_iv_128_avx512 / aes_gcm_enc_var_iv_192_avx512 / 3660; aes_gcm_enc_var_iv_256_avx512 3661; (const struct gcm_key_data *key_data, 3662; struct gcm_context_data *context_data, 3663; u8 *out, 3664; const u8 *in, 3665; u64 plaintext_len, 3666; u8 *iv, 3667; const u64 iv_len, 3668; const u8 *aad, 3669; const u64 aad_len, 3670; u8 *auth_tag, 3671; const u64 auth_tag_len); 3672;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3673MKGLOBAL(FN_NAME(enc_var_iv,_),function,) 3674FN_NAME(enc_var_iv,_): 3675 3676 FUNC_SAVE 3677 3678%ifdef SAFE_PARAM 3679 ;; Check key_data != NULL 3680 cmp arg1, 0 3681 jz exit_enc_IV 3682 3683 ;; Check context_data != NULL 3684 cmp arg2, 0 3685 jz exit_enc_IV 3686 3687 ;; Check IV != NULL 3688 cmp arg6, 0 3689 jz exit_enc_IV 3690 3691 ;; Check IV len != 0 3692 cmp arg7, 0 3693 jz exit_enc_IV 3694 3695 ;; Check auth_tag != NULL 3696 cmp arg10, 0 3697 jz exit_enc_IV 3698 3699 ;; Check auth_tag_len == 0 or > 16 3700 cmp arg11, 0 3701 jz 
exit_enc_IV 3702 3703 cmp arg11, 16 3704 ja exit_enc_IV 3705 3706 ;; Check if plaintext_len == 0 3707 cmp arg5, 0 3708 jz skip_in_out_check_enc_IV 3709 3710 ;; Check out != NULL (plaintext_len != 0) 3711 cmp arg3, 0 3712 jz exit_enc_IV 3713 3714 ;; Check in != NULL (plaintext_len != 0) 3715 cmp arg4, 0 3716 jz exit_enc_IV 3717 3718skip_in_out_check_enc_IV: 3719 ;; Check if aad_len == 0 3720 cmp arg9, 0 3721 jz skip_aad_check_enc_IV 3722 3723 ;; Check aad != NULL (aad_len != 0) 3724 cmp arg8, 0 3725 jz exit_enc_IV 3726 3727skip_aad_check_enc_IV: 3728%endif 3729 cmp arg7, 12 3730 je iv_len_12_enc_IV 3731 3732 GCM_INIT arg1, arg2, arg6, arg8, arg9, r10, r11, r12, arg7 3733 jmp skip_iv_len_12_enc_IV 3734 3735iv_len_12_enc_IV: 3736 GCM_INIT arg1, arg2, arg6, arg8, arg9, r10, r11, r12 3737 3738skip_iv_len_12_enc_IV: 3739 GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC, single_call 3740 3741 GCM_COMPLETE arg1, arg2, arg10, arg11, single_call 3742 3743exit_enc_IV: 3744 FUNC_RESTORE 3745 3746 ret 3747 3748;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3749;void aes_gcm_dec_var_iv_128_avx512 / aes_gcm_dec_var_iv_192_avx512 / 3750; aes_gcm_dec_var_iv_256_avx512 3751; (const struct gcm_key_data *key_data, 3752; struct gcm_context_data *context_data, 3753; u8 *out, 3754; const u8 *in, 3755; u64 plaintext_len, 3756; u8 *iv, 3757; const u64 iv_len, 3758; const u8 *aad, 3759; const u64 aad_len, 3760; u8 *auth_tag, 3761; const u64 auth_tag_len); 3762;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3763MKGLOBAL(FN_NAME(dec_var_iv,_),function,) 3764FN_NAME(dec_var_iv,_): 3765 3766 FUNC_SAVE 3767 3768%ifdef SAFE_PARAM 3769 ;; Check key_data != NULL 3770 cmp arg1, 0 3771 jz exit_dec_IV 3772 3773 ;; Check context_data != NULL 3774 cmp arg2, 0 3775 jz exit_dec_IV 3776 3777 ;; Check IV != NULL 3778 cmp arg6, 0 3779 jz exit_dec_IV 3780 3781 ;; Check IV len != 0 3782 cmp arg7, 0 3783 jz exit_dec_IV 3784 3785 ;; Check auth_tag != NULL 3786 cmp arg10, 0 3787 jz exit_dec_IV 3788 3789 ;; Check auth_tag_len == 0 or > 16 3790 cmp arg11, 0 3791 jz exit_dec_IV 3792 3793 cmp arg11, 16 3794 ja exit_dec_IV 3795 3796 ;; Check if plaintext_len == 0 3797 cmp arg5, 0 3798 jz skip_in_out_check_dec_IV 3799 3800 ;; Check out != NULL (plaintext_len != 0) 3801 cmp arg3, 0 3802 jz exit_dec_IV 3803 3804 ;; Check in != NULL (plaintext_len != 0) 3805 cmp arg4, 0 3806 jz exit_dec_IV 3807 3808skip_in_out_check_dec_IV: 3809 ;; Check if aad_len == 0 3810 cmp arg9, 0 3811 jz skip_aad_check_dec_IV 3812 3813 ;; Check aad != NULL (aad_len != 0) 3814 cmp arg8, 0 3815 jz exit_dec_IV 3816 3817skip_aad_check_dec_IV: 3818%endif 3819 cmp arg7, 12 3820 je iv_len_12_dec_IV 3821 3822 GCM_INIT arg1, arg2, arg6, arg8, arg9, r10, r11, r12, arg7 3823 jmp skip_iv_len_12_dec_IV 3824 3825iv_len_12_dec_IV: 3826 GCM_INIT arg1, arg2, arg6, arg8, arg9, r10, r11, r12 3827 3828skip_iv_len_12_dec_IV: 3829 GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC, single_call 3830 GCM_COMPLETE arg1, arg2, arg10, arg11, single_call 3831 3832exit_dec_IV: 3833 FUNC_RESTORE 3834 ret 3835 3836%ifdef GCM128_MODE 3837;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3838;void ghash_avx512 3839; const struct gcm_key_data *key_data, 3840; const void *in, 3841; const u64 in_len, 3842; void *tag, 3843; const u64 tag_len); 3844;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 
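;; Editorial note (added): hedged usage sketch for this entry point, based on
;; the prototypes declared in this file; the AES round keys are assumed to have
;; been expanded into gcm_key_data beforehand by the corresponding key-expansion
;; routine (not shown here):
;;
;;     struct gcm_key_data kd;               /* round keys already expanded */
;;     uint8_t tag[16];
;;     aes_gcm_precomp_128_avx512(&kd);      /* derive HashKey and its powers */
;;     ghash_avx512(&kd, msg, msg_len, tag, sizeof(tag));
;;     /* tag = GHASH(HashKey, msg zero-padded to a 16-byte multiple),
;;        byte-swapped and stored truncated to tag_len via simd_store_avx */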
%ifdef GCM128_MODE
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;void   ghash_avx512
;       const struct gcm_key_data *key_data,
;       const void *in,
;       const u64 in_len,
;       void *tag,
;       const u64 tag_len);
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
MKGLOBAL(ghash_avx512,function,)
ghash_avx512:

        FUNC_SAVE

%ifdef SAFE_PARAM
        ;; Check key_data != NULL
        cmp     arg1, 0
        jz      exit_ghash

        ;; Check in != NULL
        cmp     arg2, 0
        jz      exit_ghash

        ;; Check in_len != 0
        cmp     arg3, 0
        jz      exit_ghash

        ;; Check tag != NULL
        cmp     arg4, 0
        jz      exit_ghash

        ;; Check tag_len != 0
        cmp     arg5, 0
        jz      exit_ghash
%endif

        vpxor   xmm0, xmm0
        CALC_AAD_HASH arg2, arg3, xmm0, arg1, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, \
                      r10, r11, r12

        vpshufb xmm0, [rel SHUF_MASK] ; perform a 16Byte swap

        simd_store_avx arg4, xmm0, arg5, r12, rax

exit_ghash:
        FUNC_RESTORE

        ret
%endif

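; ghash_avx512() exposes the raw GHASH computation: the input is run through
; the same CALC_AAD_HASH helper used for AAD processing (a trailing partial
; block is zero padded, no length block is appended), the result is byte
; swapped and up to 16 bytes of it (tag_len) are stored to tag. Illustrative
; C call with placeholder names:
;
;   uint8_t hash[16];
;   ghash_avx512(&key, msg, msg_len, hash, sizeof(hash));
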
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; PARTIAL_BLOCK_GMAC: Handles the tag partial blocks between update calls.
; Requires the input data be at least 1 byte long.
; Input: gcm_context_data (GDATA_CTX), input text (PLAIN_IN), hash subkey (HASH_SUBKEY)
;        input text length (PLAIN_LEN).
; Output: Updated GDATA_CTX
; Clobbers rax, r10, r12, r13, r15, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm9, xmm10, xmm11
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%macro PARTIAL_BLOCK_GMAC       6
%define %%GDATA_CTX             %1
%define %%PLAIN_IN              %2
%define %%PLAIN_LEN             %3
%define %%DATA_OFFSET           %4
%define %%AAD_HASH              %5
%define %%HASH_SUBKEY           %6

        mov     r13, [%%GDATA_CTX + PBlockLen]
        cmp     r13, 0
        ; Leave Macro if no partial blocks
        je      %%_partial_block_done

        ; Read in input data without over reading
        cmp     %%PLAIN_LEN, 16
        jl      %%_fewer_than_16_bytes
        ; If more than 16 bytes of data, just fill the xmm register
        VXLDR   xmm1, [%%PLAIN_IN]
        jmp     %%_data_read

%%_fewer_than_16_bytes:
        lea     r10, [%%PLAIN_IN]
        READ_SMALL_DATA_INPUT xmm1, r10, %%PLAIN_LEN, rax

        ; Finished reading in data
%%_data_read:

        lea     r12, [rel SHIFT_MASK]
        ; Adjust the shuffle mask pointer to be able to shift r13 bytes
        ; (16-r13 is the number of bytes in plaintext mod 16)
        add     r12, r13
        ; Get the appropriate shuffle mask
        vmovdqu xmm2, [r12]
        vmovdqa xmm3, xmm1

        mov     r15, %%PLAIN_LEN
        add     r15, r13
        ; Set r15 to be the amount of data left in PLAIN_IN after filling the block
        sub     r15, 16
        ; Determine if partial block is not being filled and shift mask accordingly
        jge     %%_no_extra_mask_1
        sub     r12, r15
%%_no_extra_mask_1:

        ; Get the appropriate mask to mask out bottom r13 bytes of xmm3
        vmovdqu xmm1, [r12 + ALL_F - SHIFT_MASK]

        vpand   xmm3, xmm1
        vpshufb xmm3, [rel SHUF_MASK]
        vpshufb xmm3, xmm2
        vpxor   %%AAD_HASH, xmm3

        cmp     r15, 0
        jl      %%_partial_incomplete_1

        ; GHASH computation for the last <16 Byte block
        GHASH_MUL %%AAD_HASH, %%HASH_SUBKEY, xmm0, xmm10, xmm11, xmm5, xmm6
        xor     rax, rax
        mov     [%%GDATA_CTX + PBlockLen], rax
        jmp     %%_ghash_done
%%_partial_incomplete_1:
%ifidn __OUTPUT_FORMAT__, win64
        mov     rax, %%PLAIN_LEN
        add     [%%GDATA_CTX + PBlockLen], rax
%else
        add     [%%GDATA_CTX + PBlockLen], %%PLAIN_LEN
%endif
%%_ghash_done:
        vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH

        cmp     r15, 0
        jl      %%_partial_fill

        mov     r12, 16
        ; Set r12 to be the number of bytes to skip after this macro
        sub     r12, r13

        jmp     %%offset_set
%%_partial_fill:
        mov     r12, %%PLAIN_LEN
%%offset_set:
        mov     %%DATA_OFFSET, r12
%%_partial_block_done:
%endmacro ; PARTIAL_BLOCK_GMAC

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;void   imb_aes_gmac_update_128_avx512 / imb_aes_gmac_update_192_avx512 /
;       imb_aes_gmac_update_256_avx512
;       const struct gcm_key_data *key_data,
;       struct gcm_context_data *context_data,
;       const u8 *in,
;       const u64 plaintext_len);
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
MKGLOBAL(GMAC_FN_NAME(update),function,)
GMAC_FN_NAME(update):

        FUNC_SAVE

        ;; Check if plaintext_len == 0
        cmp     arg4, 0
        je      exit_gmac_update

%ifdef SAFE_PARAM
        ;; Check key_data != NULL
        cmp     arg1, 0
        jz      exit_gmac_update

        ;; Check context_data != NULL
        cmp     arg2, 0
        jz      exit_gmac_update

        ;; Check in != NULL (plaintext_len != 0)
        cmp     arg3, 0
        jz      exit_gmac_update
%endif

        ; Increment size of "AAD length" for GMAC
        add     [arg2 + AadLen], arg4

        ;; Deal with previous partial block
        xor     r11, r11
        vmovdqu xmm13, [arg1 + HashKey]
        vmovdqu xmm8, [arg2 + AadHash]

        PARTIAL_BLOCK_GMAC arg2, arg3, arg4, r11, xmm8, xmm13

        ; CALC_AAD_HASH needs to deal with multiple of 16 bytes
        sub     arg4, r11
        add     arg3, r11

        vmovq   xmm7, arg4      ; Save remaining length
        and     arg4, -16       ; Get multiple of 16 bytes

        or      arg4, arg4
        jz      no_full_blocks

        ;; Calculate GHASH of this segment
        CALC_AAD_HASH arg3, arg4, xmm8, arg1, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, \
                      r10, r11, r12
        vmovdqu [arg2 + AadHash], xmm8  ; ctx_data.aad hash = aad_hash

no_full_blocks:
        add     arg3, arg4      ; Point at partial block

        vmovq   arg4, xmm7      ; Restore original remaining length
        and     arg4, 15
        jz      exit_gmac_update

        ; Save next partial block
        mov     [arg2 + PBlockLen], arg4
        READ_SMALL_DATA_INPUT xmm1, arg3, arg4, r11
        vpshufb xmm1, [rel SHUF_MASK]
        vpxor   xmm8, xmm1
        vmovdqu [arg2 + AadHash], xmm8

exit_gmac_update:
        FUNC_RESTORE

        ret

%ifdef LINUX
section .note.GNU-stack noalloc noexec nowrite progbits
%endif
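
; The GMAC update routine above is intended for incremental use: the context is
; set up with the corresponding imb_aes_gmac_init_*_avx512() routine (defined
; elsewhere in the library), fed any number of update calls, and closed with
; imb_aes_gmac_finalize_*_avx512() to produce the tag. Illustrative C sequence,
; assuming those companion routines and placeholder buffers:
;
;   imb_aes_gmac_init_128_avx512(&key, &ctx, iv, iv_len);
;   imb_aes_gmac_update_128_avx512(&key, &ctx, msg1, msg1_len);
;   imb_aes_gmac_update_128_avx512(&key, &ctx, msg2, msg2_len);
;   imb_aes_gmac_finalize_128_avx512(&key, &ctx, tag, tag_len);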