1;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2; Copyright(c) 2011-2020, Intel Corporation All rights reserved. 3; 4; Redistribution and use in source and binary forms, with or without 5; modification, are permitted provided that the following conditions 6; are met: 7; * Redistributions of source code must retain the above copyright 8; notice, this list of conditions and the following disclaimer. 9; * Redistributions in binary form must reproduce the above copyright 10; notice, this list of conditions and the following disclaimer in 11; the documentation and/or other materials provided with the 12; distribution. 13; * Neither the name of Intel Corporation nor the names of its 14; contributors may be used to endorse or promote products derived 15; from this software without specific prior written permission. 16; 17; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 21; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 29 30; 31; Authors: 32; Erdinc Ozturk 33; Vinodh Gopal 34; James Guilford 35; 36; 37; References: 38; This code was derived and highly optimized from the code described in paper: 39; Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation on Intel Architecture Processors. August, 2010 40; The details of the implementation is explained in: 41; Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode on Intel Architecture Processors. October, 2012. 42; 43; 44; 45; 46; Assumptions: 47; 48; 49; 50; iv: 51; 0 1 2 3 52; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 53; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 54; | Salt (From the SA) | 55; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 56; | Initialization Vector | 57; | (This is the sequence number from IPSec header) | 58; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 59; | 0x1 | 60; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 61; 62; 63; 64; AAD: 65; AAD will be padded with 0 to the next 16byte multiple 66; for example, assume AAD is a u32 vector 67; 68; if AAD is 8 bytes: 69; AAD[3] = {A0, A1}; 70; padded AAD in xmm register = {A1 A0 0 0} 71; 72; 0 1 2 3 73; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 74; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 75; | SPI (A1) | 76; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 77; | 32-bit Sequence Number (A0) | 78; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 79; | 0x0 | 80; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 81; 82; AAD Format with 32-bit Sequence Number 83; 84; if AAD is 12 bytes: 85; AAD[3] = {A0, A1, A2}; 86; padded AAD in xmm register = {A2 A1 A0 0} 87; 88; 0 1 2 3 89; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 90; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 91; | SPI (A2) | 92; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 93; | 64-bit Extended Sequence Number {A1,A0} | 94; | | 95; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 96; | 0x0 | 97; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 98; 99; AAD Format with 64-bit Extended Sequence Number 100; 101; 102; aadLen: 103; Must be a multiple of 4 bytes and from the definition of the spec. 104; The code additionally supports any aadLen length. 105; 106; TLen: 107; from the definition of the spec, TLen can only be 8, 12 or 16 bytes. 108; 109; poly = x^128 + x^127 + x^126 + x^121 + 1 110; throughout the code, one tab and two tab indentations are used. one tab is for GHASH part, two tabs is for AES part. 111; 112 113%include "include/os.asm" 114%include "include/reg_sizes.asm" 115%include "include/clear_regs.asm" 116%include "include/gcm_defines.asm" 117%include "include/gcm_keys_avx2_avx512.asm" 118%include "include/memcpy.asm" 119 120%ifndef GCM128_MODE 121%ifndef GCM192_MODE 122%ifndef GCM256_MODE 123%error "No GCM mode selected for gcm_avx_gen4.asm!" 124%endif 125%endif 126%endif 127 128;; Decide on AES-GCM key size to compile for 129%ifdef GCM128_MODE 130%define NROUNDS 9 131%define FN_NAME(x,y) aes_gcm_ %+ x %+ _128 %+ y %+ avx_gen4 132%define GMAC_FN_NAME(x) imb_aes_gmac_ %+ x %+ _128_ %+ avx_gen4 133%endif 134 135%ifdef GCM192_MODE 136%define NROUNDS 11 137%define FN_NAME(x,y) aes_gcm_ %+ x %+ _192 %+ y %+ avx_gen4 138%define GMAC_FN_NAME(x) imb_aes_gmac_ %+ x %+ _192_ %+ avx_gen4 139%endif 140 141%ifdef GCM256_MODE 142%define NROUNDS 13 143%define FN_NAME(x,y) aes_gcm_ %+ x %+ _256 %+ y %+ avx_gen4 144%define GMAC_FN_NAME(x) imb_aes_gmac_ %+ x %+ _256_ %+ avx_gen4 145%endif 146 147section .text 148default rel 149 150; need to push 4 registers into stack to maintain 151%define STACK_OFFSET 8*4 152 153%define TMP2 16*0 ; Temporary storage for AES State 2 (State 1 is stored in an XMM register) 154%define TMP3 16*1 ; Temporary storage for AES State 3 155%define TMP4 16*2 ; Temporary storage for AES State 4 156%define TMP5 16*3 ; Temporary storage for AES State 5 157%define TMP6 16*4 ; Temporary storage for AES State 6 158%define TMP7 16*5 ; Temporary storage for AES State 7 159%define TMP8 16*6 ; Temporary storage for AES State 8 160 161%define LOCAL_STORAGE 16*7 162 163%ifidn __OUTPUT_FORMAT__, win64 164 %define XMM_STORAGE 16*10 165%else 166 %define XMM_STORAGE 0 167%endif 168 169%define VARIABLE_OFFSET LOCAL_STORAGE + XMM_STORAGE 170 171;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 172; Utility Macros 173;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 174 175;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 176; GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) 177; Input: A and B (128-bits each, bit-reflected) 178; Output: C = A*B*x mod poly, (i.e. >>1 ) 179; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input 180; GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. 181;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 182%macro GHASH_MUL 7 183%define %%GH %1 ; 16 Bytes 184%define %%HK %2 ; 16 Bytes 185%define %%T1 %3 186%define %%T2 %4 187%define %%T3 %5 188%define %%T4 %6 189%define %%T5 %7 190 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 191 192 vpclmulqdq %%T1, %%GH, %%HK, 0x11 ; %%T1 = a1*b1 193 vpclmulqdq %%T2, %%GH, %%HK, 0x00 ; %%T2 = a0*b0 194 vpclmulqdq %%T3, %%GH, %%HK, 0x01 ; %%T3 = a1*b0 195 vpclmulqdq %%GH, %%GH, %%HK, 0x10 ; %%GH = a0*b1 196 vpxor %%GH, %%GH, %%T3 197 198 199 vpsrldq %%T3, %%GH, 8 ; shift-R %%GH 2 DWs 200 vpslldq %%GH, %%GH, 8 ; shift-L %%GH 2 DWs 201 202 vpxor %%T1, %%T1, %%T3 203 vpxor %%GH, %%GH, %%T2 204 205 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 206 ;first phase of the reduction 207 vmovdqa %%T3, [rel POLY2] 208 209 vpclmulqdq %%T2, %%T3, %%GH, 0x01 210 vpslldq %%T2, %%T2, 8 ; shift-L %%T2 2 DWs 211 212 vpxor %%GH, %%GH, %%T2 ; first phase of the reduction complete 213 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 214 ;second phase of the reduction 215 vpclmulqdq %%T2, %%T3, %%GH, 0x00 216 vpsrldq %%T2, %%T2, 4 ; shift-R %%T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) 217 218 vpclmulqdq %%GH, %%T3, %%GH, 0x10 219 vpslldq %%GH, %%GH, 4 ; shift-L %%GH 1 DW (Shift-L 1-DW to obtain result with no shifts) 220 221 vpxor %%GH, %%GH, %%T2 ; second phase of the reduction complete 222 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 223 vpxor %%GH, %%GH, %%T1 ; the result is in %%GH 224 225%endmacro 226 227 228; In PRECOMPUTE, the commands filling Hashkey_i_k are not required for avx_gen4 229; functions, but are kept to allow users to switch cpu architectures between calls 230; of pre, init, update, and finalize. 231%macro PRECOMPUTE 8 232%define %%GDATA %1 233%define %%HK %2 234%define %%T1 %3 235%define %%T2 %4 236%define %%T3 %5 237%define %%T4 %6 238%define %%T5 %7 239%define %%T6 %8 240 241 ; Haskey_i_k holds XORed values of the low and high parts of the Haskey_i 242 vmovdqa %%T5, %%HK 243 244 GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^2<<1 mod poly 245 vmovdqu [%%GDATA + HashKey_2], %%T5 ; [HashKey_2] = HashKey^2<<1 mod poly 246 247 GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^3<<1 mod poly 248 vmovdqu [%%GDATA + HashKey_3], %%T5 249 250 GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^4<<1 mod poly 251 vmovdqu [%%GDATA + HashKey_4], %%T5 252 253 GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^5<<1 mod poly 254 vmovdqu [%%GDATA + HashKey_5], %%T5 255 256 GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^6<<1 mod poly 257 vmovdqu [%%GDATA + HashKey_6], %%T5 258 259 GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^7<<1 mod poly 260 vmovdqu [%%GDATA + HashKey_7], %%T5 261 262 GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^8<<1 mod poly 263 vmovdqu [%%GDATA + HashKey_8], %%T5 264%endmacro 265 266 267;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 268; READ_SMALL_DATA_INPUT: Packs xmm register with data when data input is less than 16 bytes. 269; Returns 0 if data has length 0. 270; Input: The input data (INPUT), that data's length (LENGTH). 271; Output: The packed xmm register (OUTPUT). 272;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 273%macro READ_SMALL_DATA_INPUT 6 274%define %%OUTPUT %1 ; %%OUTPUT is an xmm register 275%define %%INPUT %2 276%define %%LENGTH %3 277%define %%END_READ_LOCATION %4 ; All this and the lower inputs are temp registers 278%define %%COUNTER %5 279%define %%TMP1 %6 280 281 vpxor %%OUTPUT, %%OUTPUT 282 mov %%COUNTER, %%LENGTH 283 mov %%END_READ_LOCATION, %%INPUT 284 add %%END_READ_LOCATION, %%LENGTH 285 xor %%TMP1, %%TMP1 286 287 288 cmp %%COUNTER, 8 289 jl %%_byte_loop_2 290 vpinsrq %%OUTPUT, [%%INPUT],0 ;Read in 8 bytes if they exists 291 je %%_done 292 293 sub %%COUNTER, 8 294 295%%_byte_loop_1: ;Read in data 1 byte at a time while data is left 296 shl %%TMP1, 8 ;This loop handles when 8 bytes were already read in 297 dec %%END_READ_LOCATION 298 mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION] 299 dec %%COUNTER 300 jg %%_byte_loop_1 301 vpinsrq %%OUTPUT, %%TMP1, 1 302 jmp %%_done 303 304%%_byte_loop_2: ;Read in data 1 byte at a time while data is left 305 ;; NOTE: in current implementation check for zero length is obsolete here. 306 ;; The adequate checks are done by callers of this macro. 307 ;; cmp %%COUNTER, 0 308 ;; je %%_done 309 shl %%TMP1, 8 ;This loop handles when no bytes were already read in 310 dec %%END_READ_LOCATION 311 mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION] 312 dec %%COUNTER 313 jg %%_byte_loop_2 314 vpinsrq %%OUTPUT, %%TMP1, 0 315%%_done: 316 317%endmacro ; READ_SMALL_DATA_INPUT 318 319 320;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 321; CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted. 322; Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY). 323; Output: The hash of the data (AAD_HASH). 324;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 325%macro CALC_AAD_HASH 15 326%define %%A_IN %1 327%define %%A_LEN %2 328%define %%AAD_HASH %3 329%define %%GDATA_KEY %4 330%define %%XTMP0 %5 ; xmm temp reg 5 331%define %%XTMP1 %6 ; xmm temp reg 5 332%define %%XTMP2 %7 333%define %%XTMP3 %8 334%define %%XTMP4 %9 335%define %%XTMP5 %10 ; xmm temp reg 5 336%define %%T1 %11 ; temp reg 1 337%define %%T2 %12 338%define %%T3 %13 339%define %%T4 %14 340%define %%T5 %15 ; temp reg 5 341 342 343 mov %%T1, %%A_IN ; T1 = AAD 344 mov %%T2, %%A_LEN ; T2 = aadLen 345 346%%_get_AAD_loop128: 347 cmp %%T2, 128 348 jl %%_exit_AAD_loop128 349 350 vmovdqu %%XTMP0, [%%T1 + 16*0] 351 vpshufb %%XTMP0, [rel SHUF_MASK] 352 353 vpxor %%XTMP0, %%AAD_HASH 354 355 vmovdqu %%XTMP5, [%%GDATA_KEY + HashKey_8] 356 vpclmulqdq %%XTMP1, %%XTMP0, %%XTMP5, 0x11 ; %%T1 = a1*b1 357 vpclmulqdq %%XTMP2, %%XTMP0, %%XTMP5, 0x00 ; %%T2 = a0*b0 358 vpclmulqdq %%XTMP3, %%XTMP0, %%XTMP5, 0x01 ; %%T3 = a1*b0 359 vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x10 ; %%T4 = a0*b1 360 vpxor %%XTMP3, %%XTMP3, %%XTMP4 ; %%T3 = a1*b0 + a0*b1 361 362%assign i 1 363%assign j 7 364%rep 7 365 vmovdqu %%XTMP0, [%%T1 + 16*i] 366 vpshufb %%XTMP0, [rel SHUF_MASK] 367 368 vmovdqu %%XTMP5, [%%GDATA_KEY + HashKey_ %+ j] 369 vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x11 ; %%T1 = T1 + a1*b1 370 vpxor %%XTMP1, %%XTMP1, %%XTMP4 371 372 vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x00 ; %%T2 = T2 + a0*b0 373 vpxor %%XTMP2, %%XTMP2, %%XTMP4 374 375 vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x01 ; %%T3 = T3 + a1*b0 + a0*b1 376 vpxor %%XTMP3, %%XTMP3, %%XTMP4 377 vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x10 378 vpxor %%XTMP3, %%XTMP3, %%XTMP4 379%assign i (i + 1) 380%assign j (j - 1) 381%endrep 382 383 vpslldq %%XTMP4, %%XTMP3, 8 ; shift-L 2 DWs 384 vpsrldq %%XTMP3, %%XTMP3, 8 ; shift-R 2 DWs 385 vpxor %%XTMP2, %%XTMP2, %%XTMP4 386 vpxor %%XTMP1, %%XTMP1, %%XTMP3 ; accumulate the results in %%T1(M):%%T2(L) 387 388 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 389 ;first phase of the reduction 390 vmovdqa %%XTMP5, [rel POLY2] 391 vpclmulqdq %%XTMP0, %%XTMP5, %%XTMP2, 0x01 392 vpslldq %%XTMP0, %%XTMP0, 8 ; shift-L xmm2 2 DWs 393 vpxor %%XTMP2, %%XTMP2, %%XTMP0 ; first phase of the reduction complete 394 395 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 396 ;second phase of the reduction 397 vpclmulqdq %%XTMP3, %%XTMP5, %%XTMP2, 0x00 398 vpsrldq %%XTMP3, %%XTMP3, 4 ; shift-R 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) 399 400 vpclmulqdq %%XTMP4, %%XTMP5, %%XTMP2, 0x10 401 vpslldq %%XTMP4, %%XTMP4, 4 ; shift-L 1 DW (Shift-L 1-DW to obtain result with no shifts) 402 403 vpxor %%XTMP4, %%XTMP4, %%XTMP3 ; second phase of the reduction complete 404 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 405 vpxor %%AAD_HASH, %%XTMP1, %%XTMP4 ; the result is in %%T1 406 407 sub %%T2, 128 408 je %%_CALC_AAD_done 409 410 add %%T1, 128 411 jmp %%_get_AAD_loop128 412 413%%_exit_AAD_loop128: 414 cmp %%T2, 16 415 jl %%_get_small_AAD_block 416 417 ;; calculate hash_key position to start with 418 mov %%T3, %%T2 419 and %%T3, -16 ; 1 to 7 blocks possible here 420 neg %%T3 421 add %%T3, HashKey_1 + 16 422 lea %%T3, [%%GDATA_KEY + %%T3] 423 424 vmovdqu %%XTMP0, [%%T1] 425 vpshufb %%XTMP0, [rel SHUF_MASK] 426 427 vpxor %%XTMP0, %%AAD_HASH 428 429 vmovdqu %%XTMP5, [%%T3] 430 vpclmulqdq %%XTMP1, %%XTMP0, %%XTMP5, 0x11 ; %%T1 = a1*b1 431 vpclmulqdq %%XTMP2, %%XTMP0, %%XTMP5, 0x00 ; %%T2 = a0*b0 432 vpclmulqdq %%XTMP3, %%XTMP0, %%XTMP5, 0x01 ; %%T3 = a1*b0 433 vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x10 ; %%T4 = a0*b1 434 vpxor %%XTMP3, %%XTMP3, %%XTMP4 ; %%T3 = a1*b0 + a0*b1 435 436 add %%T3, 16 ; move to next hashkey 437 add %%T1, 16 ; move to next data block 438 sub %%T2, 16 439 cmp %%T2, 16 440 jl %%_AAD_reduce 441 442%%_AAD_blocks: 443 vmovdqu %%XTMP0, [%%T1] 444 vpshufb %%XTMP0, [rel SHUF_MASK] 445 446 vmovdqu %%XTMP5, [%%T3] 447 vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x11 ; %%T1 = T1 + a1*b1 448 vpxor %%XTMP1, %%XTMP1, %%XTMP4 449 450 vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x00 ; %%T2 = T2 + a0*b0 451 vpxor %%XTMP2, %%XTMP2, %%XTMP4 452 453 vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x01 ; %%T3 = T3 + a1*b0 + a0*b1 454 vpxor %%XTMP3, %%XTMP3, %%XTMP4 455 vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x10 456 vpxor %%XTMP3, %%XTMP3, %%XTMP4 457 458 add %%T3, 16 ; move to next hashkey 459 add %%T1, 16 460 sub %%T2, 16 461 cmp %%T2, 16 462 jl %%_AAD_reduce 463 jmp %%_AAD_blocks 464 465%%_AAD_reduce: 466 vpslldq %%XTMP4, %%XTMP3, 8 ; shift-L 2 DWs 467 vpsrldq %%XTMP3, %%XTMP3, 8 ; shift-R 2 DWs 468 vpxor %%XTMP2, %%XTMP2, %%XTMP4 469 vpxor %%XTMP1, %%XTMP1, %%XTMP3 ; accumulate the results in %%T1(M):%%T2(L) 470 471 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 472 ;first phase of the reduction 473 vmovdqa %%XTMP5, [rel POLY2] 474 vpclmulqdq %%XTMP0, %%XTMP5, %%XTMP2, 0x01 475 vpslldq %%XTMP0, %%XTMP0, 8 ; shift-L xmm2 2 DWs 476 vpxor %%XTMP2, %%XTMP2, %%XTMP0 ; first phase of the reduction complete 477 478 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 479 ;second phase of the reduction 480 vpclmulqdq %%XTMP3, %%XTMP5, %%XTMP2, 0x00 481 vpsrldq %%XTMP3, %%XTMP3, 4 ; shift-R 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) 482 483 vpclmulqdq %%XTMP4, %%XTMP5, %%XTMP2, 0x10 484 vpslldq %%XTMP4, %%XTMP4, 4 ; shift-L 1 DW (Shift-L 1-DW to obtain result with no shifts) 485 486 vpxor %%XTMP4, %%XTMP4, %%XTMP3 ; second phase of the reduction complete 487 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 488 vpxor %%AAD_HASH, %%XTMP1, %%XTMP4 ; the result is in %%T1 489 490 or %%T2, %%T2 491 je %%_CALC_AAD_done 492 493%%_get_small_AAD_block: 494 vmovdqu %%XTMP0, [%%GDATA_KEY + HashKey] 495 READ_SMALL_DATA_INPUT %%XTMP1, %%T1, %%T2, %%T3, %%T4, %%T5 496 ;byte-reflect the AAD data 497 vpshufb %%XTMP1, [rel SHUF_MASK] 498 vpxor %%AAD_HASH, %%XTMP1 499 GHASH_MUL %%AAD_HASH, %%XTMP0, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5 500 501%%_CALC_AAD_done: 502 503%endmacro ; CALC_AAD_HASH 504 505 506 507;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 508; PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks between update calls. 509; Requires the input data be at least 1 byte long. 510; Input: gcm_key_data * (GDATA_KEY), gcm_context_data *(GDATA_CTX), input text (PLAIN_CYPH_IN), 511; input text length (PLAIN_CYPH_LEN), the current data offset (DATA_OFFSET), 512; the hash subkey (HASH_SUBKEY) and whether encoding or decoding (ENC_DEC) 513; Output: A cypher of the first partial block (CYPH_PLAIN_OUT), and updated GDATA_CTX 514; Clobbers rax, r10, r12, r13, r15, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm9, xmm10, xmm11 515;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 516%macro PARTIAL_BLOCK 8 517%define %%GDATA_CTX %1 518%define %%CYPH_PLAIN_OUT %2 519%define %%PLAIN_CYPH_IN %3 520%define %%PLAIN_CYPH_LEN %4 521%define %%DATA_OFFSET %5 522%define %%AAD_HASH %6 523%define %%HASH_SUBKEY %7 524%define %%ENC_DEC %8 525 526 mov r13, [%%GDATA_CTX + PBlockLen] 527 cmp r13, 0 528 je %%_partial_block_done ;Leave Macro if no partial blocks 529 530 cmp %%PLAIN_CYPH_LEN, 16 ;Read in input data without over reading 531 jl %%_fewer_than_16_bytes 532 VXLDR xmm1, [%%PLAIN_CYPH_IN] ;If more than 16 bytes of data, just fill the xmm register 533 jmp %%_data_read 534 535%%_fewer_than_16_bytes: 536 lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET] 537 READ_SMALL_DATA_INPUT xmm1, r10, %%PLAIN_CYPH_LEN, rax, r12, r15 538 539%%_data_read: ;Finished reading in data 540 541 542 vmovdqu xmm9, [%%GDATA_CTX + PBlockEncKey] ;xmm9 = my_ctx_data.partial_block_enc_key 543 544 lea r12, [rel SHIFT_MASK] 545 546 add r12, r13 ; adjust the shuffle mask pointer to be able to shift r13 bytes (16-r13 is the number of bytes in plaintext mod 16) 547 vmovdqu xmm2, [r12] ; get the appropriate shuffle mask 548 vpshufb xmm9, xmm2 ;shift right r13 bytes 549 550%ifidn %%ENC_DEC, DEC 551 vmovdqa xmm3, xmm1 552 vpxor xmm9, xmm1 ; Cyphertext XOR E(K, Yn) 553 554 mov r15, %%PLAIN_CYPH_LEN 555 add r15, r13 556 sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block 557 jge %%_no_extra_mask_1 ;Determine if if partial block is not being filled and shift mask accordingly 558 sub r12, r15 559%%_no_extra_mask_1: 560 561 vmovdqu xmm1, [r12 + ALL_F - SHIFT_MASK]; get the appropriate mask to mask out bottom r13 bytes of xmm9 562 vpand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9 563 564 vpand xmm3, xmm1 565 vpshufb xmm3, [rel SHUF_MASK] 566 vpshufb xmm3, xmm2 567 vpxor %%AAD_HASH, xmm3 568 569 570 cmp r15,0 571 jl %%_partial_incomplete_1 572 573 GHASH_MUL %%AAD_HASH, %%HASH_SUBKEY, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block 574 xor rax,rax 575 mov [%%GDATA_CTX + PBlockLen], rax 576 jmp %%_dec_done 577%%_partial_incomplete_1: 578%ifidn __OUTPUT_FORMAT__, win64 579 mov rax, %%PLAIN_CYPH_LEN 580 add [%%GDATA_CTX + PBlockLen], rax 581%else 582 add [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN 583%endif 584%%_dec_done: 585 vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH 586 587%else 588 vpxor xmm9, xmm1 ; Plaintext XOR E(K, Yn) 589 590 mov r15, %%PLAIN_CYPH_LEN 591 add r15, r13 592 sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block 593 jge %%_no_extra_mask_2 ;Determine if if partial block is not being filled and shift mask accordingly 594 sub r12, r15 595%%_no_extra_mask_2: 596 597 vmovdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9 598 vpand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9 599 600 vpshufb xmm9, [rel SHUF_MASK] 601 vpshufb xmm9, xmm2 602 vpxor %%AAD_HASH, xmm9 603 604 cmp r15,0 605 jl %%_partial_incomplete_2 606 607 GHASH_MUL %%AAD_HASH, %%HASH_SUBKEY, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block 608 xor rax,rax 609 mov [%%GDATA_CTX + PBlockLen], rax 610 jmp %%_encode_done 611%%_partial_incomplete_2: 612%ifidn __OUTPUT_FORMAT__, win64 613 mov rax, %%PLAIN_CYPH_LEN 614 add [%%GDATA_CTX + PBlockLen], rax 615%else 616 add [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN 617%endif 618%%_encode_done: 619 vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH 620 621 vpshufb xmm9, [rel SHUF_MASK] ; shuffle xmm9 back to output as ciphertext 622 vpshufb xmm9, xmm2 623%endif 624 625 626 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 627 ; output encrypted Bytes 628 cmp r15,0 629 jl %%_partial_fill 630 mov r12, r13 631 mov r13, 16 632 sub r13, r12 ; Set r13 to be the number of bytes to write out 633 jmp %%_count_set 634%%_partial_fill: 635 mov r13, %%PLAIN_CYPH_LEN 636%%_count_set: 637 vmovq rax, xmm9 638 cmp r13, 8 639 jle %%_less_than_8_bytes_left 640 641 mov [%%CYPH_PLAIN_OUT+ %%DATA_OFFSET], rax 642 add %%DATA_OFFSET, 8 643 vpsrldq xmm9, xmm9, 8 644 vmovq rax, xmm9 645 sub r13, 8 646%%_less_than_8_bytes_left: 647 mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al 648 add %%DATA_OFFSET, 1 649 shr rax, 8 650 sub r13, 1 651 jne %%_less_than_8_bytes_left 652 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 653 654%%_partial_block_done: 655%endmacro ; PARTIAL_BLOCK 656 657 658%macro GHASH_SINGLE_MUL 9 659%define %%GDATA %1 660%define %%HASHKEY %2 661%define %%CIPHER %3 662%define %%STATE_11 %4 663%define %%STATE_00 %5 664%define %%STATE_MID %6 665%define %%T1 %7 666%define %%T2 %8 667%define %%FIRST %9 668 669 vmovdqu %%T1, [%%GDATA + %%HASHKEY] 670%ifidn %%FIRST, first 671 vpclmulqdq %%STATE_11, %%CIPHER, %%T1, 0x11 ; %%T4 = a1*b1 672 vpclmulqdq %%STATE_00, %%CIPHER, %%T1, 0x00 ; %%T4_2 = a0*b0 673 vpclmulqdq %%STATE_MID, %%CIPHER, %%T1, 0x01 ; %%T6 = a1*b0 674 vpclmulqdq %%T2, %%CIPHER, %%T1, 0x10 ; %%T5 = a0*b1 675 vpxor %%STATE_MID, %%STATE_MID, %%T2 676%else 677 vpclmulqdq %%T2, %%CIPHER, %%T1, 0x11 678 vpxor %%STATE_11, %%STATE_11, %%T2 679 680 vpclmulqdq %%T2, %%CIPHER, %%T1, 0x00 681 vpxor %%STATE_00, %%STATE_00, %%T2 682 683 vpclmulqdq %%T2, %%CIPHER, %%T1, 0x01 684 vpxor %%STATE_MID, %%STATE_MID, %%T2 685 686 vpclmulqdq %%T2, %%CIPHER, %%T1, 0x10 687 vpxor %%STATE_MID, %%STATE_MID, %%T2 688%endif 689 690%endmacro 691 692; if a = number of total plaintext bytes 693; b = floor(a/16) 694; %%num_initial_blocks = b mod 8; 695; encrypt the initial %%num_initial_blocks blocks and apply ghash on the ciphertext 696; %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r14 are used as a pointer only, not modified. 697; Updated AAD_HASH is returned in %%T3 698 699%macro INITIAL_BLOCKS 23 700%define %%GDATA_KEY %1 701%define %%CYPH_PLAIN_OUT %2 702%define %%PLAIN_CYPH_IN %3 703%define %%LENGTH %4 704%define %%DATA_OFFSET %5 705%define %%num_initial_blocks %6 ; can be 0, 1, 2, 3, 4, 5, 6 or 7 706%define %%T1 %7 707%define %%T2 %8 708%define %%T3 %9 709%define %%T4 %10 710%define %%T5 %11 711%define %%CTR %12 712%define %%XMM1 %13 713%define %%XMM2 %14 714%define %%XMM3 %15 715%define %%XMM4 %16 716%define %%XMM5 %17 717%define %%XMM6 %18 718%define %%XMM7 %19 719%define %%XMM8 %20 720%define %%T6 %21 721%define %%T_key %22 722%define %%ENC_DEC %23 723 724%assign i (8-%%num_initial_blocks) 725 ;; Move AAD_HASH to temp reg 726 vmovdqu %%T2, %%XMM8 727 ;; Start AES for %%num_initial_blocks blocks 728 ;; vmovdqu %%CTR, [%%GDATA_CTX + CurCount] ; %%CTR = Y0 729 730%assign i (9-%%num_initial_blocks) 731%rep %%num_initial_blocks 732 vpaddd %%CTR, %%CTR, [rel ONE] ; INCR Y0 733 vmovdqa reg(i), %%CTR 734 vpshufb reg(i), [rel SHUF_MASK] ; perform a 16Byte swap 735%assign i (i+1) 736%endrep 737 738%if(%%num_initial_blocks>0) 739vmovdqu %%T_key, [%%GDATA_KEY+16*0] 740%assign i (9-%%num_initial_blocks) 741%rep %%num_initial_blocks 742 vpxor reg(i),reg(i),%%T_key 743%assign i (i+1) 744%endrep 745 746%assign j 1 747%rep NROUNDS 748vmovdqu %%T_key, [%%GDATA_KEY+16*j] 749%assign i (9-%%num_initial_blocks) 750%rep %%num_initial_blocks 751 vaesenc reg(i),%%T_key 752%assign i (i+1) 753%endrep 754 755%assign j (j+1) 756%endrep 757 758 759vmovdqu %%T_key, [%%GDATA_KEY+16*j] 760%assign i (9-%%num_initial_blocks) 761%rep %%num_initial_blocks 762 vaesenclast reg(i),%%T_key 763%assign i (i+1) 764%endrep 765 766%endif ; %if(%%num_initial_blocks>0) 767 768 769 770%assign i (9-%%num_initial_blocks) 771%rep %%num_initial_blocks 772 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET] 773 vpxor reg(i), reg(i), %%T1 774 ;; Write back ciphertext for %%num_initial_blocks blocks 775 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i) 776 add %%DATA_OFFSET, 16 777 %ifidn %%ENC_DEC, DEC 778 vmovdqa reg(i), %%T1 779 %endif 780 ;; Prepare ciphertext for GHASH computations 781 vpshufb reg(i), [rel SHUF_MASK] 782%assign i (i+1) 783%endrep 784 785 786;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 787 788%assign i (9-%%num_initial_blocks) 789%if(%%num_initial_blocks>0) 790 vmovdqa %%T3, reg(i) 791%assign i (i+1) 792%endif 793%if(%%num_initial_blocks>1) 794%rep %%num_initial_blocks-1 795 vmovdqu [rsp + TMP %+ i], reg(i) 796%assign i (i+1) 797%endrep 798%endif 799 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 800 ;; Prepare 8 counter blocks and perform rounds of AES cipher on 801 ;; them, load plain/cipher text and store cipher/plain text. 802 ;; Stitch GHASH computation in between AES rounds. 803 vpaddd %%XMM1, %%CTR, [rel ONE] ; INCR Y0 804 vpaddd %%XMM2, %%CTR, [rel TWO] ; INCR Y0 805 vpaddd %%XMM3, %%XMM1, [rel TWO] ; INCR Y0 806 vpaddd %%XMM4, %%XMM2, [rel TWO] ; INCR Y0 807 vpaddd %%XMM5, %%XMM3, [rel TWO] ; INCR Y0 808 vpaddd %%XMM6, %%XMM4, [rel TWO] ; INCR Y0 809 vpaddd %%XMM7, %%XMM5, [rel TWO] ; INCR Y0 810 vpaddd %%XMM8, %%XMM6, [rel TWO] ; INCR Y0 811 vmovdqa %%CTR, %%XMM8 812 813 vpshufb %%XMM1, [rel SHUF_MASK] ; perform a 16Byte swap 814 vpshufb %%XMM2, [rel SHUF_MASK] ; perform a 16Byte swap 815 vpshufb %%XMM3, [rel SHUF_MASK] ; perform a 16Byte swap 816 vpshufb %%XMM4, [rel SHUF_MASK] ; perform a 16Byte swap 817 vpshufb %%XMM5, [rel SHUF_MASK] ; perform a 16Byte swap 818 vpshufb %%XMM6, [rel SHUF_MASK] ; perform a 16Byte swap 819 vpshufb %%XMM7, [rel SHUF_MASK] ; perform a 16Byte swap 820 vpshufb %%XMM8, [rel SHUF_MASK] ; perform a 16Byte swap 821 822 vmovdqu %%T_key, [%%GDATA_KEY+16*0] 823 vpxor %%XMM1, %%XMM1, %%T_key 824 vpxor %%XMM2, %%XMM2, %%T_key 825 vpxor %%XMM3, %%XMM3, %%T_key 826 vpxor %%XMM4, %%XMM4, %%T_key 827 vpxor %%XMM5, %%XMM5, %%T_key 828 vpxor %%XMM6, %%XMM6, %%T_key 829 vpxor %%XMM7, %%XMM7, %%T_key 830 vpxor %%XMM8, %%XMM8, %%T_key 831 832%assign i (8-%%num_initial_blocks) 833%assign j (9-%%num_initial_blocks) 834%assign k (%%num_initial_blocks) 835 836%define %%T4_2 %%T4 837%if(%%num_initial_blocks>0) 838 ;; Hash in AES state 839 ;; T2 - incoming AAD hash 840 vpxor %%T2, %%T3 841 842 ;; GDATA, HASHKEY, CIPHER, 843 ;; STATE_11, STATE_00, STATE_MID, T1, T2 844 GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \ 845 %%T1, %%T4, %%T6, %%T5, %%T3, first 846%endif 847 848 vmovdqu %%T_key, [%%GDATA_KEY+16*1] 849 vaesenc %%XMM1, %%T_key 850 vaesenc %%XMM2, %%T_key 851 vaesenc %%XMM3, %%T_key 852 vaesenc %%XMM4, %%T_key 853 vaesenc %%XMM5, %%T_key 854 vaesenc %%XMM6, %%T_key 855 vaesenc %%XMM7, %%T_key 856 vaesenc %%XMM8, %%T_key 857 858 vmovdqu %%T_key, [%%GDATA_KEY+16*2] 859 vaesenc %%XMM1, %%T_key 860 vaesenc %%XMM2, %%T_key 861 vaesenc %%XMM3, %%T_key 862 vaesenc %%XMM4, %%T_key 863 vaesenc %%XMM5, %%T_key 864 vaesenc %%XMM6, %%T_key 865 vaesenc %%XMM7, %%T_key 866 vaesenc %%XMM8, %%T_key 867 868%assign i (i+1) 869%assign j (j+1) 870%assign k (k-1) 871%if(%%num_initial_blocks>1) 872 ;; GDATA, HASHKEY, CIPHER, 873 ;; STATE_11, STATE_00, STATE_MID, T1, T2 874 vmovdqu %%T2, [rsp + TMP %+ j] 875 GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \ 876 %%T1, %%T4, %%T6, %%T5, %%T3, not_first 877%endif 878 879 vmovdqu %%T_key, [%%GDATA_KEY+16*3] 880 vaesenc %%XMM1, %%T_key 881 vaesenc %%XMM2, %%T_key 882 vaesenc %%XMM3, %%T_key 883 vaesenc %%XMM4, %%T_key 884 vaesenc %%XMM5, %%T_key 885 vaesenc %%XMM6, %%T_key 886 vaesenc %%XMM7, %%T_key 887 vaesenc %%XMM8, %%T_key 888 889 vmovdqu %%T_key, [%%GDATA_KEY+16*4] 890 vaesenc %%XMM1, %%T_key 891 vaesenc %%XMM2, %%T_key 892 vaesenc %%XMM3, %%T_key 893 vaesenc %%XMM4, %%T_key 894 vaesenc %%XMM5, %%T_key 895 vaesenc %%XMM6, %%T_key 896 vaesenc %%XMM7, %%T_key 897 vaesenc %%XMM8, %%T_key 898 899%assign i (i+1) 900%assign j (j+1) 901%assign k (k-1) 902%if(%%num_initial_blocks>2) 903 ;; GDATA, HASHKEY, CIPHER, 904 ;; STATE_11, STATE_00, STATE_MID, T1, T2 905 vmovdqu %%T2, [rsp + TMP %+ j] 906 GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \ 907 %%T1, %%T4, %%T6, %%T5, %%T3, not_first 908%endif 909 910%assign i (i+1) 911%assign j (j+1) 912%assign k (k-1) 913%if(%%num_initial_blocks>3) 914 ;; GDATA, HASHKEY, CIPHER, 915 ;; STATE_11, STATE_00, STATE_MID, T1, T2 916 vmovdqu %%T2, [rsp + TMP %+ j] 917 GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \ 918 %%T1, %%T4, %%T6, %%T5, %%T3, not_first 919%endif 920 921 vmovdqu %%T_key, [%%GDATA_KEY+16*5] 922 vaesenc %%XMM1, %%T_key 923 vaesenc %%XMM2, %%T_key 924 vaesenc %%XMM3, %%T_key 925 vaesenc %%XMM4, %%T_key 926 vaesenc %%XMM5, %%T_key 927 vaesenc %%XMM6, %%T_key 928 vaesenc %%XMM7, %%T_key 929 vaesenc %%XMM8, %%T_key 930 931 vmovdqu %%T_key, [%%GDATA_KEY+16*6] 932 vaesenc %%XMM1, %%T_key 933 vaesenc %%XMM2, %%T_key 934 vaesenc %%XMM3, %%T_key 935 vaesenc %%XMM4, %%T_key 936 vaesenc %%XMM5, %%T_key 937 vaesenc %%XMM6, %%T_key 938 vaesenc %%XMM7, %%T_key 939 vaesenc %%XMM8, %%T_key 940 941%assign i (i+1) 942%assign j (j+1) 943%assign k (k-1) 944%if(%%num_initial_blocks>4) 945 ;; GDATA, HASHKEY, CIPHER, 946 ;; STATE_11, STATE_00, STATE_MID, T1, T2 947 vmovdqu %%T2, [rsp + TMP %+ j] 948 GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \ 949 %%T1, %%T4, %%T6, %%T5, %%T3, not_first 950%endif 951 952 vmovdqu %%T_key, [%%GDATA_KEY+16*7] 953 vaesenc %%XMM1, %%T_key 954 vaesenc %%XMM2, %%T_key 955 vaesenc %%XMM3, %%T_key 956 vaesenc %%XMM4, %%T_key 957 vaesenc %%XMM5, %%T_key 958 vaesenc %%XMM6, %%T_key 959 vaesenc %%XMM7, %%T_key 960 vaesenc %%XMM8, %%T_key 961 962 vmovdqu %%T_key, [%%GDATA_KEY+16*8] 963 vaesenc %%XMM1, %%T_key 964 vaesenc %%XMM2, %%T_key 965 vaesenc %%XMM3, %%T_key 966 vaesenc %%XMM4, %%T_key 967 vaesenc %%XMM5, %%T_key 968 vaesenc %%XMM6, %%T_key 969 vaesenc %%XMM7, %%T_key 970 vaesenc %%XMM8, %%T_key 971 972%assign i (i+1) 973%assign j (j+1) 974%assign k (k-1) 975%if(%%num_initial_blocks>5) 976 ;; GDATA, HASHKEY, CIPHER, 977 ;; STATE_11, STATE_00, STATE_MID, T1, T2 978 vmovdqu %%T2, [rsp + TMP %+ j] 979 GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \ 980 %%T1, %%T4, %%T6, %%T5, %%T3, not_first 981%endif 982 983 vmovdqu %%T_key, [%%GDATA_KEY+16*9] 984 vaesenc %%XMM1, %%T_key 985 vaesenc %%XMM2, %%T_key 986 vaesenc %%XMM3, %%T_key 987 vaesenc %%XMM4, %%T_key 988 vaesenc %%XMM5, %%T_key 989 vaesenc %%XMM6, %%T_key 990 vaesenc %%XMM7, %%T_key 991 vaesenc %%XMM8, %%T_key 992 993%ifndef GCM128_MODE 994 vmovdqu %%T_key, [%%GDATA_KEY+16*10] 995 vaesenc %%XMM1, %%T_key 996 vaesenc %%XMM2, %%T_key 997 vaesenc %%XMM3, %%T_key 998 vaesenc %%XMM4, %%T_key 999 vaesenc %%XMM5, %%T_key 1000 vaesenc %%XMM6, %%T_key 1001 vaesenc %%XMM7, %%T_key 1002 vaesenc %%XMM8, %%T_key 1003%endif 1004 1005%assign i (i+1) 1006%assign j (j+1) 1007%assign k (k-1) 1008%if(%%num_initial_blocks>6) 1009 ;; GDATA, HASHKEY, CIPHER, 1010 ;; STATE_11, STATE_00, STATE_MID, T1, T2 1011 vmovdqu %%T2, [rsp + TMP %+ j] 1012 GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \ 1013 %%T1, %%T4, %%T6, %%T5, %%T3, not_first 1014%endif 1015 1016%ifdef GCM128_MODE 1017 vmovdqu %%T_key, [%%GDATA_KEY+16*10] 1018 vaesenclast %%XMM1, %%T_key 1019 vaesenclast %%XMM2, %%T_key 1020 vaesenclast %%XMM3, %%T_key 1021 vaesenclast %%XMM4, %%T_key 1022 vaesenclast %%XMM5, %%T_key 1023 vaesenclast %%XMM6, %%T_key 1024 vaesenclast %%XMM7, %%T_key 1025 vaesenclast %%XMM8, %%T_key 1026%endif 1027 1028%ifdef GCM192_MODE 1029 vmovdqu %%T_key, [%%GDATA_KEY+16*11] 1030 vaesenc %%XMM1, %%T_key 1031 vaesenc %%XMM2, %%T_key 1032 vaesenc %%XMM3, %%T_key 1033 vaesenc %%XMM4, %%T_key 1034 vaesenc %%XMM5, %%T_key 1035 vaesenc %%XMM6, %%T_key 1036 vaesenc %%XMM7, %%T_key 1037 vaesenc %%XMM8, %%T_key 1038 1039 vmovdqu %%T_key, [%%GDATA_KEY+16*12] 1040 vaesenclast %%XMM1, %%T_key 1041 vaesenclast %%XMM2, %%T_key 1042 vaesenclast %%XMM3, %%T_key 1043 vaesenclast %%XMM4, %%T_key 1044 vaesenclast %%XMM5, %%T_key 1045 vaesenclast %%XMM6, %%T_key 1046 vaesenclast %%XMM7, %%T_key 1047 vaesenclast %%XMM8, %%T_key 1048%endif 1049%ifdef GCM256_MODE 1050 vmovdqu %%T_key, [%%GDATA_KEY+16*11] 1051 vaesenc %%XMM1, %%T_key 1052 vaesenc %%XMM2, %%T_key 1053 vaesenc %%XMM3, %%T_key 1054 vaesenc %%XMM4, %%T_key 1055 vaesenc %%XMM5, %%T_key 1056 vaesenc %%XMM6, %%T_key 1057 vaesenc %%XMM7, %%T_key 1058 vaesenc %%XMM8, %%T_key 1059 1060 vmovdqu %%T_key, [%%GDATA_KEY+16*12] 1061 vaesenc %%XMM1, %%T_key 1062 vaesenc %%XMM2, %%T_key 1063 vaesenc %%XMM3, %%T_key 1064 vaesenc %%XMM4, %%T_key 1065 vaesenc %%XMM5, %%T_key 1066 vaesenc %%XMM6, %%T_key 1067 vaesenc %%XMM7, %%T_key 1068 vaesenc %%XMM8, %%T_key 1069%endif 1070 1071%assign i (i+1) 1072%assign j (j+1) 1073%assign k (k-1) 1074%if(%%num_initial_blocks>7) 1075 ;; GDATA, HASHKEY, CIPHER, 1076 ;; STATE_11, STATE_00, STATE_MID, T1, T2 1077 vmovdqu %%T2, [rsp + TMP %+ j] 1078 GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \ 1079 %%T1, %%T4, %%T6, %%T5, %%T3, not_first 1080%endif 1081 1082%ifdef GCM256_MODE ; GCM256 1083 vmovdqu %%T_key, [%%GDATA_KEY+16*13] 1084 vaesenc %%XMM1, %%T_key 1085 vaesenc %%XMM2, %%T_key 1086 vaesenc %%XMM3, %%T_key 1087 vaesenc %%XMM4, %%T_key 1088 vaesenc %%XMM5, %%T_key 1089 vaesenc %%XMM6, %%T_key 1090 vaesenc %%XMM7, %%T_key 1091 vaesenc %%XMM8, %%T_key 1092 1093 vmovdqu %%T_key, [%%GDATA_KEY+16*14] 1094 vaesenclast %%XMM1, %%T_key 1095 vaesenclast %%XMM2, %%T_key 1096 vaesenclast %%XMM3, %%T_key 1097 vaesenclast %%XMM4, %%T_key 1098 vaesenclast %%XMM5, %%T_key 1099 vaesenclast %%XMM6, %%T_key 1100 vaesenclast %%XMM7, %%T_key 1101 vaesenclast %%XMM8, %%T_key 1102%endif ; GCM256 mode 1103 1104%if(%%num_initial_blocks>0) 1105 vpsrldq %%T3, %%T6, 8 ; shift-R %%T2 2 DWs 1106 vpslldq %%T6, %%T6, 8 ; shift-L %%T3 2 DWs 1107 vpxor %%T1, %%T1, %%T3 ; accumulate the results in %%T1:%%T4 1108 vpxor %%T4, %%T6, %%T4 1109 1110 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1111 ; First phase of the reduction 1112 vmovdqa %%T3, [rel POLY2] 1113 1114 vpclmulqdq %%T2, %%T3, %%T4, 0x01 1115 vpslldq %%T2, %%T2, 8 ; shift-L xmm2 2 DWs 1116 1117 ;; First phase of the reduction complete 1118 vpxor %%T4, %%T4, %%T2 1119 1120 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1121 ; Second phase of the reduction 1122 vpclmulqdq %%T2, %%T3, %%T4, 0x00 1123 ;; Shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) 1124 vpsrldq %%T2, %%T2, 4 1125 1126 vpclmulqdq %%T4, %%T3, %%T4, 0x10 1127 ;; Shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts) 1128 vpslldq %%T4, %%T4, 4 1129 ;; Second phase of the reduction complete 1130 vpxor %%T4, %%T4, %%T2 1131 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1132 ; The result is in %%T3 1133 vpxor %%T3, %%T1, %%T4 1134%else 1135 ;; The hash should end up in T3 1136 vmovdqa %%T3, %%T2 1137%endif 1138 1139 ;; Final hash is now in T3 1140%if %%num_initial_blocks > 0 1141 ;; NOTE: obsolete in case %%num_initial_blocks = 0 1142 sub %%LENGTH, 16*%%num_initial_blocks 1143%endif 1144 1145 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*0] 1146 vpxor %%XMM1, %%XMM1, %%T1 1147 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*0], %%XMM1 1148 %ifidn %%ENC_DEC, DEC 1149 vmovdqa %%XMM1, %%T1 1150 %endif 1151 1152 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*1] 1153 vpxor %%XMM2, %%XMM2, %%T1 1154 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*1], %%XMM2 1155 %ifidn %%ENC_DEC, DEC 1156 vmovdqa %%XMM2, %%T1 1157 %endif 1158 1159 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*2] 1160 vpxor %%XMM3, %%XMM3, %%T1 1161 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*2], %%XMM3 1162 %ifidn %%ENC_DEC, DEC 1163 vmovdqa %%XMM3, %%T1 1164 %endif 1165 1166 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*3] 1167 vpxor %%XMM4, %%XMM4, %%T1 1168 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*3], %%XMM4 1169 %ifidn %%ENC_DEC, DEC 1170 vmovdqa %%XMM4, %%T1 1171 %endif 1172 1173 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*4] 1174 vpxor %%XMM5, %%XMM5, %%T1 1175 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*4], %%XMM5 1176 %ifidn %%ENC_DEC, DEC 1177 vmovdqa %%XMM5, %%T1 1178 %endif 1179 1180 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*5] 1181 vpxor %%XMM6, %%XMM6, %%T1 1182 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*5], %%XMM6 1183 %ifidn %%ENC_DEC, DEC 1184 vmovdqa %%XMM6, %%T1 1185 %endif 1186 1187 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*6] 1188 vpxor %%XMM7, %%XMM7, %%T1 1189 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*6], %%XMM7 1190 %ifidn %%ENC_DEC, DEC 1191 vmovdqa %%XMM7, %%T1 1192 %endif 1193 1194%if %%num_initial_blocks > 0 1195 ;; NOTE: 'jl' is never taken for %%num_initial_blocks = 0 1196 ;; This macro is executed for length 128 and up, 1197 ;; zero length is checked in GCM_ENC_DEC. 1198 ;; If the last block is partial then the xor will be done later 1199 ;; in ENCRYPT_FINAL_PARTIAL_BLOCK. 1200 ;; We know it's partial if LENGTH - 16*num_initial_blocks < 128 1201 cmp %%LENGTH, 128 1202 jl %%_initial_skip_last_word_write 1203%endif 1204 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*7] 1205 vpxor %%XMM8, %%XMM8, %%T1 1206 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*7], %%XMM8 1207 %ifidn %%ENC_DEC, DEC 1208 vmovdqa %%XMM8, %%T1 1209 %endif 1210 1211 ;; Update %%LENGTH with the number of blocks processed 1212 sub %%LENGTH, 16 1213 add %%DATA_OFFSET, 16 1214%%_initial_skip_last_word_write: 1215 sub %%LENGTH, 128-16 1216 add %%DATA_OFFSET, 128-16 1217 1218 vpshufb %%XMM1, [rel SHUF_MASK] ; perform a 16Byte swap 1219 ;; Combine GHASHed value with the corresponding ciphertext 1220 vpxor %%XMM1, %%XMM1, %%T3 1221 vpshufb %%XMM2, [rel SHUF_MASK] ; perform a 16Byte swap 1222 vpshufb %%XMM3, [rel SHUF_MASK] ; perform a 16Byte swap 1223 vpshufb %%XMM4, [rel SHUF_MASK] ; perform a 16Byte swap 1224 vpshufb %%XMM5, [rel SHUF_MASK] ; perform a 16Byte swap 1225 vpshufb %%XMM6, [rel SHUF_MASK] ; perform a 16Byte swap 1226 vpshufb %%XMM7, [rel SHUF_MASK] ; perform a 16Byte swap 1227 vpshufb %%XMM8, [rel SHUF_MASK] ; perform a 16Byte swap 1228 1229;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1230 1231%%_initial_blocks_done: 1232 1233 1234%endmacro 1235 1236;;; INITIAL_BLOCKS macro with support for a partial final block. 1237;;; num_initial_blocks is expected to include the partial final block 1238;;; in the count. 1239%macro INITIAL_BLOCKS_PARTIAL 25 1240%define %%GDATA_KEY %1 1241%define %%GDATA_CTX %2 1242%define %%CYPH_PLAIN_OUT %3 1243%define %%PLAIN_CYPH_IN %4 1244%define %%LENGTH %5 1245%define %%DATA_OFFSET %6 1246%define %%num_initial_blocks %7 ; can be 1, 2, 3, 4, 5, 6 or 7 (not 0) 1247%define %%T1 %8 1248%define %%T2 %9 1249%define %%T3 %10 1250%define %%T4 %11 1251%define %%T5 %12 1252%define %%CTR %13 1253%define %%XMM1 %14 1254%define %%XMM2 %15 1255%define %%XMM3 %16 1256%define %%XMM4 %17 1257%define %%XMM5 %18 1258%define %%XMM6 %19 1259%define %%XMM7 %20 1260%define %%XMM8 %21 1261%define %%T6 %22 1262%define %%T_key %23 1263%define %%ENC_DEC %24 1264%define %%INSTANCE_TYPE %25 1265 1266%assign i (8-%%num_initial_blocks) 1267 ;; Move AAD_HASH to temp reg 1268 vmovdqu %%T2, %%XMM8 1269 ;; vmovdqu %%CTR, [%%GDATA_CTX + CurCount] ; %%CTR = Y0 1270 1271%assign i (9-%%num_initial_blocks) 1272%rep %%num_initial_blocks 1273 ;; Compute AES counters 1274 vpaddd %%CTR, %%CTR, [rel ONE] ; INCR Y0 1275 vmovdqa reg(i), %%CTR 1276 vpshufb reg(i), [rel SHUF_MASK] ; perform a 16Byte swap 1277%assign i (i+1) 1278%endrep 1279 1280vmovdqu %%T_key, [%%GDATA_KEY+16*0] 1281%assign i (9-%%num_initial_blocks) 1282%rep %%num_initial_blocks 1283 ; Start AES for %%num_initial_blocks blocks 1284 vpxor reg(i),reg(i),%%T_key 1285%assign i (i+1) 1286%endrep 1287 1288%assign j 1 1289%rep NROUNDS 1290vmovdqu %%T_key, [%%GDATA_KEY+16*j] 1291%assign i (9-%%num_initial_blocks) 1292%rep %%num_initial_blocks 1293 vaesenc reg(i),%%T_key 1294%assign i (i+1) 1295%endrep 1296 1297%assign j (j+1) 1298%endrep 1299 1300 1301vmovdqu %%T_key, [%%GDATA_KEY+16*j] 1302%assign i (9-%%num_initial_blocks) 1303%rep %%num_initial_blocks 1304 vaesenclast reg(i),%%T_key 1305%assign i (i+1) 1306%endrep 1307 1308;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1309;;; Hash all but the last block of data 1310;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1311 1312%assign i (9-%%num_initial_blocks) 1313%rep %%num_initial_blocks-1 1314 ;; Encrypt the message for all but the last block 1315 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET] 1316 vpxor reg(i), reg(i), %%T1 1317 ;; write back ciphertext for %%num_initial_blocks blocks 1318 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i) 1319 add %%DATA_OFFSET, 16 1320 %ifidn %%ENC_DEC, DEC 1321 vmovdqa reg(i), %%T1 1322 %endif 1323 ;; Prepare ciphertext for GHASH computations 1324 vpshufb reg(i), [rel SHUF_MASK] 1325%assign i (i+1) 1326%endrep 1327 1328 ;; The final block of data may be <16B 1329 sub %%LENGTH, 16*(%%num_initial_blocks-1) 1330 1331%if %%num_initial_blocks < 8 1332 ;; NOTE: the 'jl' is always taken for num_initial_blocks = 8. 1333 ;; This is run in the context of GCM_ENC_DEC_SMALL for length < 128. 1334 cmp %%LENGTH, 16 1335 jl %%_small_initial_partial_block 1336 1337;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1338;;; Handle a full length final block - encrypt and hash all blocks 1339;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1340 1341 sub %%LENGTH, 16 1342 mov [%%GDATA_CTX + PBlockLen], %%LENGTH 1343 1344 ;; Encrypt the message 1345 VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET] 1346 vpxor reg(i), reg(i), %%T1 1347 ;; write back ciphertext for %%num_initial_blocks blocks 1348 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i) 1349 add %%DATA_OFFSET, 16 1350 %ifidn %%ENC_DEC, DEC 1351 vmovdqa reg(i), %%T1 1352 %endif 1353 ;; Prepare ciphertext for GHASH computations 1354 vpshufb reg(i), [rel SHUF_MASK] 1355 1356 ;; Hash all of the data 1357%assign i (8-%%num_initial_blocks) 1358%assign j (9-%%num_initial_blocks) 1359%assign k (%%num_initial_blocks) 1360%assign last_block_to_hash 0 1361 1362%if(%%num_initial_blocks>last_block_to_hash) 1363 ;; Hash in AES state 1364 vpxor %%T2, reg(j) 1365 1366 ;; T2 - incoming AAD hash 1367 ;; reg(i) holds ciphertext 1368 ;; T5 - hash key 1369 ;; T6 - updated xor 1370 ;; reg(1)/xmm1 should now be available for tmp use 1371 vmovdqu %%T5, [%%GDATA_KEY + HashKey_ %+ k] 1372 vpclmulqdq %%T1, %%T2, %%T5, 0x11 ; %%T4 = a1*b1 1373 vpclmulqdq %%T4, %%T2, %%T5, 0x00 ; %%T4 = a0*b0 1374 vpclmulqdq %%T6, %%T2, %%T5, 0x01 ; %%T6 = a1*b0 1375 vpclmulqdq %%T5, %%T2, %%T5, 0x10 ; %%T5 = a0*b1 1376 vpxor %%T6, %%T6, %%T5 1377%endif 1378 1379%assign i (i+1) 1380%assign j (j+1) 1381%assign k (k-1) 1382%assign rep_count (%%num_initial_blocks-1) 1383%rep rep_count 1384 1385 vmovdqu %%T5, [%%GDATA_KEY + HashKey_ %+ k] 1386 vpclmulqdq %%T3, reg(j), %%T5, 0x11 1387 vpxor %%T1, %%T1, %%T3 1388 1389 vpclmulqdq %%T3, reg(j), %%T5, 0x00 1390 vpxor %%T4, %%T4, %%T3 1391 1392 vpclmulqdq %%T3, reg(j), %%T5, 0x01 1393 vpxor %%T6, %%T6, %%T3 1394 1395 vpclmulqdq %%T3, reg(j), %%T5, 0x10 1396 vpxor %%T6, %%T6, %%T3 1397 1398%assign i (i+1) 1399%assign j (j+1) 1400%assign k (k-1) 1401%endrep 1402 1403 ;; Record that a reduction is needed 1404 mov r12, 1 1405 1406 jmp %%_small_initial_compute_hash 1407 1408 1409%endif ; %if %%num_initial_blocks < 8 1410 1411%%_small_initial_partial_block: 1412 1413;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1414;;; Handle ghash for a <16B final block 1415;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1416 1417 ;; In this case if it's a single call to encrypt we can 1418 ;; hash all of the data but if it's an init / update / finalize 1419 ;; series of call we need to leave the last block if it's 1420 ;; less than a full block of data. 1421 1422 mov [%%GDATA_CTX + PBlockLen], %%LENGTH 1423 vmovdqu [%%GDATA_CTX + PBlockEncKey], reg(i) 1424 ;; Handle a partial final block 1425 ;; GDATA, KEY, T1, T2 1426 ;; r13 - length 1427 ;; LT16 - indicates type of read and that the buffer is less than 16 bytes long 1428 ;; NOTE: could be replaced with %%LENGTH but at this point 1429 ;; %%LENGTH is always less than 16. 1430 ;; No PLAIN_CYPH_LEN argument available in this macro. 1431 ENCRYPT_FINAL_PARTIAL_BLOCK reg(i), %%T1, %%T3, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, LT16, %%ENC_DEC, %%DATA_OFFSET 1432 vpshufb reg(i), [rel SHUF_MASK] 1433 1434%ifidn %%INSTANCE_TYPE, multi_call 1435%assign i (8-%%num_initial_blocks) 1436%assign j (9-%%num_initial_blocks) 1437%assign k (%%num_initial_blocks-1) 1438%assign last_block_to_hash 1 1439%else 1440%assign i (8-%%num_initial_blocks) 1441%assign j (9-%%num_initial_blocks) 1442%assign k (%%num_initial_blocks) 1443%assign last_block_to_hash 0 1444%endif 1445 1446%if(%%num_initial_blocks>last_block_to_hash) 1447 ;; Record that a reduction is needed 1448 mov r12, 1 1449 ;; Hash in AES state 1450 vpxor %%T2, reg(j) 1451 1452 ;; T2 - incoming AAD hash 1453 ;; reg(i) holds ciphertext 1454 ;; T5 - hash key 1455 ;; T6 - updated xor 1456 ;; reg(1)/xmm1 should now be available for tmp use 1457 vmovdqu %%T5, [%%GDATA_KEY + HashKey_ %+ k] 1458 vpclmulqdq %%T1, %%T2, %%T5, 0x11 ; %%T4 = a1*b1 1459 vpclmulqdq %%T4, %%T2, %%T5, 0x00 ; %%T4 = a0*b0 1460 vpclmulqdq %%T6, %%T2, %%T5, 0x01 ; %%T6 = a1*b0 1461 vpclmulqdq %%T5, %%T2, %%T5, 0x10 ; %%T5 = a0*b1 1462 vpxor %%T6, %%T6, %%T5 1463%else 1464 ;; Record that a reduction is not needed - 1465 ;; In this case no hashes are computed because there 1466 ;; is only one initial block and it is < 16B in length. 1467 mov r12, 0 1468%endif 1469 1470%assign i (i+1) 1471%assign j (j+1) 1472%assign k (k-1) 1473%ifidn %%INSTANCE_TYPE, multi_call 1474%assign rep_count (%%num_initial_blocks-2) 1475%%_multi_call_hash: 1476%else 1477%assign rep_count (%%num_initial_blocks-1) 1478%endif 1479 1480%if rep_count < 0 1481 ;; quick fix for negative rep_count (to be investigated) 1482%assign rep_count 0 1483%endif 1484 1485%rep rep_count 1486 1487 vmovdqu %%T5, [%%GDATA_KEY + HashKey_ %+ k] 1488 vpclmulqdq %%T3, reg(j), %%T5, 0x11 1489 vpxor %%T1, %%T1, %%T3 1490 1491 vpclmulqdq %%T3, reg(j), %%T5, 0x00 1492 vpxor %%T4, %%T4, %%T3 1493 1494 vpclmulqdq %%T3, reg(j), %%T5, 0x01 1495 vpxor %%T6, %%T6, %%T3 1496 1497 vpclmulqdq %%T3, reg(j), %%T5, 0x10 1498 vpxor %%T6, %%T6, %%T3 1499 1500%assign i (i+1) 1501%assign j (j+1) 1502%assign k (k-1) 1503%endrep 1504 1505%%_small_initial_compute_hash: 1506 1507;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1508;;; Ghash reduction 1509;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1510 1511%if(%%num_initial_blocks=1) 1512%ifidn %%INSTANCE_TYPE, multi_call 1513 ;; We only need to check if a reduction is needed if 1514 ;; initial_blocks == 1 and init/update/final is being used. 1515 ;; In this case we may just have a partial block, and that 1516 ;; gets hashed in finalize. 1517 cmp r12, 0 1518 je %%_no_reduction_needed 1519%endif 1520%endif 1521 1522 vpsrldq %%T3, %%T6, 8 ; shift-R %%T2 2 DWs 1523 vpslldq %%T6, %%T6, 8 ; shift-L %%T3 2 DWs 1524 vpxor %%T1, %%T1, %%T3 ; accumulate the results in %%T1:%%T4 1525 vpxor %%T4, %%T6, %%T4 1526 1527 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1528 ;; First phase of the reduction 1529 vmovdqa %%T3, [rel POLY2] 1530 1531 vpclmulqdq %%T2, %%T3, %%T4, 0x01 1532 ;; shift-L xmm2 2 DWs 1533 vpslldq %%T2, %%T2, 8 1534 vpxor %%T4, %%T4, %%T2 1535 1536 ;; First phase of the reduction complete 1537 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1538 ;; Second phase of the reduction 1539 1540 vpclmulqdq %%T2, %%T3, %%T4, 0x00 1541 ;; Shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) 1542 vpsrldq %%T2, %%T2, 4 1543 1544 vpclmulqdq %%T4, %%T3, %%T4, 0x10 1545 ;; Shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts) 1546 vpslldq %%T4, %%T4, 4 1547 1548 vpxor %%T4, %%T4, %%T2 1549 ;; Second phase of the reduction complete 1550 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1551 vpxor %%T3, %%T1, %%T4 1552 1553%ifidn %%INSTANCE_TYPE, multi_call 1554 ;; If using init/update/finalize, we need to xor any partial block data 1555 ;; into the hash. 1556%if %%num_initial_blocks > 1 1557 ;; NOTE: for %%num_initial_blocks = 0 the xor never takes place 1558%if %%num_initial_blocks != 8 1559 ;; NOTE: for %%num_initial_blocks = 8, %%LENGTH, stored in [PBlockLen] is never zero 1560 cmp qword [%%GDATA_CTX + PBlockLen], 0 1561 je %%_no_partial_block_xor 1562%endif ; %%num_initial_blocks != 8 1563 vpxor %%T3, %%T3, reg(8) 1564%%_no_partial_block_xor: 1565%endif ; %%num_initial_blocks > 1 1566%endif ; %%INSTANCE_TYPE, multi_call 1567 1568%if(%%num_initial_blocks=1) 1569%ifidn %%INSTANCE_TYPE, multi_call 1570 ;; NOTE: %%_no_reduction_needed case only valid for 1571 ;; multi_call with initial_blocks = 1. 1572 ;; Look for comment above around '_no_reduction_needed' 1573 ;; The jmp below is obsolete as the code will fall through. 1574 1575 ;; The result is in %%T3 1576 jmp %%_after_reduction 1577 1578%%_no_reduction_needed: 1579 ;; The hash should end up in T3. The only way we should get here is if 1580 ;; there is a partial block of data, so xor that into the hash. 1581 vpxor %%T3, %%T2, reg(8) 1582%endif ; %%INSTANCE_TYPE = multi_call 1583%endif ; %%num_initial_blocks=1 1584 1585%%_after_reduction: 1586 ;; Final hash is now in T3 1587 1588%endmacro ; INITIAL_BLOCKS_PARTIAL 1589 1590 1591 1592; encrypt 8 blocks at a time 1593; ghash the 8 previously encrypted ciphertext blocks 1594; %%GDATA (KEY), %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN are used as pointers only, not modified 1595; %%DATA_OFFSET is the data offset value 1596%macro GHASH_8_ENCRYPT_8_PARALLEL 23 1597%define %%GDATA %1 1598%define %%CYPH_PLAIN_OUT %2 1599%define %%PLAIN_CYPH_IN %3 1600%define %%DATA_OFFSET %4 1601%define %%T1 %5 1602%define %%T2 %6 1603%define %%T3 %7 1604%define %%T4 %8 1605%define %%T5 %9 1606%define %%T6 %10 1607%define %%CTR %11 1608%define %%XMM1 %12 1609%define %%XMM2 %13 1610%define %%XMM3 %14 1611%define %%XMM4 %15 1612%define %%XMM5 %16 1613%define %%XMM6 %17 1614%define %%XMM7 %18 1615%define %%XMM8 %19 1616%define %%T7 %20 1617%define %%loop_idx %21 1618%define %%ENC_DEC %22 1619%define %%FULL_PARTIAL %23 1620 1621 vmovdqa %%T2, %%XMM1 1622 vmovdqu [rsp + TMP2], %%XMM2 1623 vmovdqu [rsp + TMP3], %%XMM3 1624 vmovdqu [rsp + TMP4], %%XMM4 1625 vmovdqu [rsp + TMP5], %%XMM5 1626 vmovdqu [rsp + TMP6], %%XMM6 1627 vmovdqu [rsp + TMP7], %%XMM7 1628 vmovdqu [rsp + TMP8], %%XMM8 1629 1630%ifidn %%loop_idx, in_order 1631 vpaddd %%XMM1, %%CTR, [rel ONE] ; INCR CNT 1632 vmovdqa %%T5, [rel TWO] 1633 vpaddd %%XMM2, %%CTR, %%T5 1634 vpaddd %%XMM3, %%XMM1, %%T5 1635 vpaddd %%XMM4, %%XMM2, %%T5 1636 vpaddd %%XMM5, %%XMM3, %%T5 1637 vpaddd %%XMM6, %%XMM4, %%T5 1638 vpaddd %%XMM7, %%XMM5, %%T5 1639 vpaddd %%XMM8, %%XMM6, %%T5 1640 vmovdqa %%CTR, %%XMM8 1641 1642 vmovdqa %%T5, [rel SHUF_MASK] 1643 vpshufb %%XMM1, %%T5 ; perform a 16Byte swap 1644 vpshufb %%XMM2, %%T5 ; perform a 16Byte swap 1645 vpshufb %%XMM3, %%T5 ; perform a 16Byte swap 1646 vpshufb %%XMM4, %%T5 ; perform a 16Byte swap 1647 vpshufb %%XMM5, %%T5 ; perform a 16Byte swap 1648 vpshufb %%XMM6, %%T5 ; perform a 16Byte swap 1649 vpshufb %%XMM7, %%T5 ; perform a 16Byte swap 1650 vpshufb %%XMM8, %%T5 ; perform a 16Byte swap 1651%else 1652 vpaddd %%XMM1, %%CTR, [rel ONEf] ; INCR CNT 1653 vmovdqa %%T5, [rel TWOf] 1654 vpaddd %%XMM2, %%CTR, %%T5 1655 vpaddd %%XMM3, %%XMM1, %%T5 1656 vpaddd %%XMM4, %%XMM2, %%T5 1657 vpaddd %%XMM5, %%XMM3, %%T5 1658 vpaddd %%XMM6, %%XMM4, %%T5 1659 vpaddd %%XMM7, %%XMM5, %%T5 1660 vpaddd %%XMM8, %%XMM6, %%T5 1661 vmovdqa %%CTR, %%XMM8 1662%endif 1663 1664 1665 1666 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1667 1668 vmovdqu %%T1, [%%GDATA + 16*0] 1669 vpxor %%XMM1, %%XMM1, %%T1 1670 vpxor %%XMM2, %%XMM2, %%T1 1671 vpxor %%XMM3, %%XMM3, %%T1 1672 vpxor %%XMM4, %%XMM4, %%T1 1673 vpxor %%XMM5, %%XMM5, %%T1 1674 vpxor %%XMM6, %%XMM6, %%T1 1675 vpxor %%XMM7, %%XMM7, %%T1 1676 vpxor %%XMM8, %%XMM8, %%T1 1677 1678 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1679 1680 vmovdqu %%T1, [%%GDATA + 16*1] 1681 vaesenc %%XMM1, %%T1 1682 vaesenc %%XMM2, %%T1 1683 vaesenc %%XMM3, %%T1 1684 vaesenc %%XMM4, %%T1 1685 vaesenc %%XMM5, %%T1 1686 vaesenc %%XMM6, %%T1 1687 vaesenc %%XMM7, %%T1 1688 vaesenc %%XMM8, %%T1 1689 1690 1691 vmovdqu %%T1, [%%GDATA + 16*2] 1692 vaesenc %%XMM1, %%T1 1693 vaesenc %%XMM2, %%T1 1694 vaesenc %%XMM3, %%T1 1695 vaesenc %%XMM4, %%T1 1696 vaesenc %%XMM5, %%T1 1697 vaesenc %%XMM6, %%T1 1698 vaesenc %%XMM7, %%T1 1699 vaesenc %%XMM8, %%T1 1700 1701 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1702 1703 vmovdqu %%T5, [%%GDATA + HashKey_8] 1704 vpclmulqdq %%T4, %%T2, %%T5, 0x11 ; %%T4 = a1*b1 1705 vpclmulqdq %%T7, %%T2, %%T5, 0x00 ; %%T7 = a0*b0 1706 vpclmulqdq %%T6, %%T2, %%T5, 0x01 ; %%T6 = a1*b0 1707 vpclmulqdq %%T5, %%T2, %%T5, 0x10 ; %%T5 = a0*b1 1708 vpxor %%T6, %%T6, %%T5 1709 1710 vmovdqu %%T1, [%%GDATA + 16*3] 1711 vaesenc %%XMM1, %%T1 1712 vaesenc %%XMM2, %%T1 1713 vaesenc %%XMM3, %%T1 1714 vaesenc %%XMM4, %%T1 1715 vaesenc %%XMM5, %%T1 1716 vaesenc %%XMM6, %%T1 1717 vaesenc %%XMM7, %%T1 1718 vaesenc %%XMM8, %%T1 1719 1720 vmovdqu %%T1, [rsp + TMP2] 1721 vmovdqu %%T5, [%%GDATA + HashKey_7] 1722 vpclmulqdq %%T3, %%T1, %%T5, 0x11 1723 vpxor %%T4, %%T4, %%T3 1724 1725 vpclmulqdq %%T3, %%T1, %%T5, 0x00 1726 vpxor %%T7, %%T7, %%T3 1727 1728 vpclmulqdq %%T3, %%T1, %%T5, 0x01 1729 vpxor %%T6, %%T6, %%T3 1730 1731 vpclmulqdq %%T3, %%T1, %%T5, 0x10 1732 vpxor %%T6, %%T6, %%T3 1733 1734 vmovdqu %%T1, [%%GDATA + 16*4] 1735 vaesenc %%XMM1, %%T1 1736 vaesenc %%XMM2, %%T1 1737 vaesenc %%XMM3, %%T1 1738 vaesenc %%XMM4, %%T1 1739 vaesenc %%XMM5, %%T1 1740 vaesenc %%XMM6, %%T1 1741 vaesenc %%XMM7, %%T1 1742 vaesenc %%XMM8, %%T1 1743 1744 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1745 vmovdqu %%T1, [rsp + TMP3] 1746 vmovdqu %%T5, [%%GDATA + HashKey_6] 1747 vpclmulqdq %%T3, %%T1, %%T5, 0x11 1748 vpxor %%T4, %%T4, %%T3 1749 1750 vpclmulqdq %%T3, %%T1, %%T5, 0x00 1751 vpxor %%T7, %%T7, %%T3 1752 1753 vpclmulqdq %%T3, %%T1, %%T5, 0x01 1754 vpxor %%T6, %%T6, %%T3 1755 1756 vpclmulqdq %%T3, %%T1, %%T5, 0x10 1757 vpxor %%T6, %%T6, %%T3 1758 1759 vmovdqu %%T1, [%%GDATA + 16*5] 1760 vaesenc %%XMM1, %%T1 1761 vaesenc %%XMM2, %%T1 1762 vaesenc %%XMM3, %%T1 1763 vaesenc %%XMM4, %%T1 1764 vaesenc %%XMM5, %%T1 1765 vaesenc %%XMM6, %%T1 1766 vaesenc %%XMM7, %%T1 1767 vaesenc %%XMM8, %%T1 1768 1769 1770 vmovdqu %%T1, [rsp + TMP4] 1771 vmovdqu %%T5, [%%GDATA + HashKey_5] 1772 vpclmulqdq %%T3, %%T1, %%T5, 0x11 1773 vpxor %%T4, %%T4, %%T3 1774 1775 vpclmulqdq %%T3, %%T1, %%T5, 0x00 1776 vpxor %%T7, %%T7, %%T3 1777 1778 vpclmulqdq %%T3, %%T1, %%T5, 0x01 1779 vpxor %%T6, %%T6, %%T3 1780 1781 vpclmulqdq %%T3, %%T1, %%T5, 0x10 1782 vpxor %%T6, %%T6, %%T3 1783 1784 vmovdqu %%T1, [%%GDATA + 16*6] 1785 vaesenc %%XMM1, %%T1 1786 vaesenc %%XMM2, %%T1 1787 vaesenc %%XMM3, %%T1 1788 vaesenc %%XMM4, %%T1 1789 vaesenc %%XMM5, %%T1 1790 vaesenc %%XMM6, %%T1 1791 vaesenc %%XMM7, %%T1 1792 vaesenc %%XMM8, %%T1 1793 1794 vmovdqu %%T1, [rsp + TMP5] 1795 vmovdqu %%T5, [%%GDATA + HashKey_4] 1796 vpclmulqdq %%T3, %%T1, %%T5, 0x11 1797 vpxor %%T4, %%T4, %%T3 1798 1799 vpclmulqdq %%T3, %%T1, %%T5, 0x00 1800 vpxor %%T7, %%T7, %%T3 1801 1802 vpclmulqdq %%T3, %%T1, %%T5, 0x01 1803 vpxor %%T6, %%T6, %%T3 1804 1805 vpclmulqdq %%T3, %%T1, %%T5, 0x10 1806 vpxor %%T6, %%T6, %%T3 1807 1808 vmovdqu %%T1, [%%GDATA + 16*7] 1809 vaesenc %%XMM1, %%T1 1810 vaesenc %%XMM2, %%T1 1811 vaesenc %%XMM3, %%T1 1812 vaesenc %%XMM4, %%T1 1813 vaesenc %%XMM5, %%T1 1814 vaesenc %%XMM6, %%T1 1815 vaesenc %%XMM7, %%T1 1816 vaesenc %%XMM8, %%T1 1817 1818 vmovdqu %%T1, [rsp + TMP6] 1819 vmovdqu %%T5, [%%GDATA + HashKey_3] 1820 vpclmulqdq %%T3, %%T1, %%T5, 0x11 1821 vpxor %%T4, %%T4, %%T3 1822 1823 vpclmulqdq %%T3, %%T1, %%T5, 0x00 1824 vpxor %%T7, %%T7, %%T3 1825 1826 vpclmulqdq %%T3, %%T1, %%T5, 0x01 1827 vpxor %%T6, %%T6, %%T3 1828 1829 vpclmulqdq %%T3, %%T1, %%T5, 0x10 1830 vpxor %%T6, %%T6, %%T3 1831 1832 vmovdqu %%T1, [%%GDATA + 16*8] 1833 vaesenc %%XMM1, %%T1 1834 vaesenc %%XMM2, %%T1 1835 vaesenc %%XMM3, %%T1 1836 vaesenc %%XMM4, %%T1 1837 vaesenc %%XMM5, %%T1 1838 vaesenc %%XMM6, %%T1 1839 vaesenc %%XMM7, %%T1 1840 vaesenc %%XMM8, %%T1 1841 1842 vmovdqu %%T1, [rsp + TMP7] 1843 vmovdqu %%T5, [%%GDATA + HashKey_2] 1844 vpclmulqdq %%T3, %%T1, %%T5, 0x11 1845 vpxor %%T4, %%T4, %%T3 1846 1847 vpclmulqdq %%T3, %%T1, %%T5, 0x00 1848 vpxor %%T7, %%T7, %%T3 1849 1850 vpclmulqdq %%T3, %%T1, %%T5, 0x01 1851 vpxor %%T6, %%T6, %%T3 1852 1853 vpclmulqdq %%T3, %%T1, %%T5, 0x10 1854 vpxor %%T6, %%T6, %%T3 1855 1856 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1857 1858 vmovdqu %%T5, [%%GDATA + 16*9] 1859 vaesenc %%XMM1, %%T5 1860 vaesenc %%XMM2, %%T5 1861 vaesenc %%XMM3, %%T5 1862 vaesenc %%XMM4, %%T5 1863 vaesenc %%XMM5, %%T5 1864 vaesenc %%XMM6, %%T5 1865 vaesenc %%XMM7, %%T5 1866 vaesenc %%XMM8, %%T5 1867 1868 vmovdqu %%T1, [rsp + TMP8] 1869 vmovdqu %%T5, [%%GDATA + HashKey] 1870 1871 1872 vpclmulqdq %%T3, %%T1, %%T5, 0x00 1873 vpxor %%T7, %%T7, %%T3 1874 1875 vpclmulqdq %%T3, %%T1, %%T5, 0x01 1876 vpxor %%T6, %%T6, %%T3 1877 1878 vpclmulqdq %%T3, %%T1, %%T5, 0x10 1879 vpxor %%T6, %%T6, %%T3 1880 1881 vpclmulqdq %%T3, %%T1, %%T5, 0x11 1882 vpxor %%T1, %%T4, %%T3 1883 1884 1885 vmovdqu %%T5, [%%GDATA + 16*10] 1886 %ifndef GCM128_MODE ; GCM192 or GCM256 1887 vaesenc %%XMM1, %%T5 1888 vaesenc %%XMM2, %%T5 1889 vaesenc %%XMM3, %%T5 1890 vaesenc %%XMM4, %%T5 1891 vaesenc %%XMM5, %%T5 1892 vaesenc %%XMM6, %%T5 1893 vaesenc %%XMM7, %%T5 1894 vaesenc %%XMM8, %%T5 1895 1896 vmovdqu %%T5, [%%GDATA + 16*11] 1897 vaesenc %%XMM1, %%T5 1898 vaesenc %%XMM2, %%T5 1899 vaesenc %%XMM3, %%T5 1900 vaesenc %%XMM4, %%T5 1901 vaesenc %%XMM5, %%T5 1902 vaesenc %%XMM6, %%T5 1903 vaesenc %%XMM7, %%T5 1904 vaesenc %%XMM8, %%T5 1905 1906 vmovdqu %%T5, [%%GDATA + 16*12] 1907%endif 1908%ifdef GCM256_MODE 1909 vaesenc %%XMM1, %%T5 1910 vaesenc %%XMM2, %%T5 1911 vaesenc %%XMM3, %%T5 1912 vaesenc %%XMM4, %%T5 1913 vaesenc %%XMM5, %%T5 1914 vaesenc %%XMM6, %%T5 1915 vaesenc %%XMM7, %%T5 1916 vaesenc %%XMM8, %%T5 1917 1918 vmovdqu %%T5, [%%GDATA + 16*13] 1919 vaesenc %%XMM1, %%T5 1920 vaesenc %%XMM2, %%T5 1921 vaesenc %%XMM3, %%T5 1922 vaesenc %%XMM4, %%T5 1923 vaesenc %%XMM5, %%T5 1924 vaesenc %%XMM6, %%T5 1925 vaesenc %%XMM7, %%T5 1926 vaesenc %%XMM8, %%T5 1927 1928 vmovdqu %%T5, [%%GDATA + 16*14] 1929%endif ; GCM256 1930 1931%assign i 0 1932%assign j 1 1933%rep 8 1934 1935 ;; SNP TBD: This is pretty ugly - consider whether just XORing the 1936 ;; data in after vaesenclast is simpler and performant. Would 1937 ;; also have to ripple it through partial block and ghash_mul_8. 1938%ifidn %%FULL_PARTIAL, full 1939 %ifdef NT_LD 1940 VXLDR %%T2, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i] 1941 vpxor %%T2, %%T2, %%T5 1942 %else 1943 vpxor %%T2, %%T5, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i] 1944 %endif 1945 1946 %ifidn %%ENC_DEC, ENC 1947 vaesenclast reg(j), reg(j), %%T2 1948 %else 1949 vaesenclast %%T3, reg(j), %%T2 1950 vpxor reg(j), %%T2, %%T5 1951 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*i], %%T3 1952 %endif 1953 1954%else 1955 ; Don't read the final data during partial block processing 1956 %ifdef NT_LD 1957 %if (i<7) 1958 VXLDR %%T2, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i] 1959 vpxor %%T2, %%T2, %%T5 1960 %else 1961 ;; Stage the key directly in T2 rather than hash it with plaintext 1962 vmovdqu %%T2, %%T5 1963 %endif 1964 %else 1965 %if (i<7) 1966 vpxor %%T2, %%T5, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i] 1967 %else 1968 ;; Stage the key directly in T2 rather than hash it with plaintext 1969 vmovdqu %%T2, %%T5 1970 %endif 1971 %endif 1972 1973 %ifidn %%ENC_DEC, ENC 1974 vaesenclast reg(j), reg(j), %%T2 1975 %else 1976 %if (i<7) 1977 vaesenclast %%T3, reg(j), %%T2 1978 vpxor reg(j), %%T2, %%T5 1979 ;; Do not read the data since it could fault 1980 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*i], %%T3 1981 %else 1982 vaesenclast reg(j), reg(j), %%T2 1983 %endif 1984 %endif 1985%endif 1986 1987%assign i (i+1) 1988%assign j (j+1) 1989%endrep 1990 1991 1992;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1993 1994 1995 vpslldq %%T3, %%T6, 8 ; shift-L %%T3 2 DWs 1996 vpsrldq %%T6, %%T6, 8 ; shift-R %%T2 2 DWs 1997 vpxor %%T7, %%T7, %%T3 1998 vpxor %%T1, %%T1, %%T6 ; accumulate the results in %%T1:%%T7 1999 2000 2001 2002 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2003 ;first phase of the reduction 2004 vmovdqa %%T3, [rel POLY2] 2005 2006 vpclmulqdq %%T2, %%T3, %%T7, 0x01 2007 vpslldq %%T2, %%T2, 8 ; shift-L xmm2 2 DWs 2008 2009 vpxor %%T7, %%T7, %%T2 ; first phase of the reduction complete 2010 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2011 2012 %ifidn %%ENC_DEC, ENC 2013 ; Write to the Ciphertext buffer 2014 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*0], %%XMM1 2015 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*1], %%XMM2 2016 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*2], %%XMM3 2017 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*3], %%XMM4 2018 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*4], %%XMM5 2019 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*5], %%XMM6 2020 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*6], %%XMM7 2021 %ifidn %%FULL_PARTIAL, full 2022 ;; Avoid writing past the buffer if handling a partial block 2023 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*7], %%XMM8 2024 %endif 2025 %endif 2026 2027 2028;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2029 ;second phase of the reduction 2030 vpclmulqdq %%T2, %%T3, %%T7, 0x00 2031 vpsrldq %%T2, %%T2, 4 ; shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) 2032 2033 vpclmulqdq %%T4, %%T3, %%T7, 0x10 2034 vpslldq %%T4, %%T4, 4 ; shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts) 2035 2036 vpxor %%T4, %%T4, %%T2 ; second phase of the reduction complete 2037 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2038 vpxor %%T1, %%T1, %%T4 ; the result is in %%T1 2039 2040 vpshufb %%XMM1, [rel SHUF_MASK] ; perform a 16Byte swap 2041 vpshufb %%XMM2, [rel SHUF_MASK] ; perform a 16Byte swap 2042 vpshufb %%XMM3, [rel SHUF_MASK] ; perform a 16Byte swap 2043 vpshufb %%XMM4, [rel SHUF_MASK] ; perform a 16Byte swap 2044 vpshufb %%XMM5, [rel SHUF_MASK] ; perform a 16Byte swap 2045 vpshufb %%XMM6, [rel SHUF_MASK] ; perform a 16Byte swap 2046 vpshufb %%XMM7, [rel SHUF_MASK] ; perform a 16Byte swap 2047 vpshufb %%XMM8, [rel SHUF_MASK] ; perform a 16Byte swap 2048 2049 2050 vpxor %%XMM1, %%T1 2051 2052 2053%endmacro ; GHASH_8_ENCRYPT_8_PARALLEL 2054 2055 2056; GHASH the last 4 ciphertext blocks. 2057%macro GHASH_LAST_8 16 2058%define %%GDATA %1 2059%define %%T1 %2 2060%define %%T2 %3 2061%define %%T3 %4 2062%define %%T4 %5 2063%define %%T5 %6 2064%define %%T6 %7 2065%define %%T7 %8 2066%define %%XMM1 %9 2067%define %%XMM2 %10 2068%define %%XMM3 %11 2069%define %%XMM4 %12 2070%define %%XMM5 %13 2071%define %%XMM6 %14 2072%define %%XMM7 %15 2073%define %%XMM8 %16 2074 2075 ;; Karatsuba Method 2076 2077 vmovdqu %%T5, [%%GDATA + HashKey_8] 2078 2079 vpshufd %%T2, %%XMM1, 01001110b 2080 vpshufd %%T3, %%T5, 01001110b 2081 vpxor %%T2, %%T2, %%XMM1 2082 vpxor %%T3, %%T3, %%T5 2083 2084 vpclmulqdq %%T6, %%XMM1, %%T5, 0x11 2085 vpclmulqdq %%T7, %%XMM1, %%T5, 0x00 2086 2087 vpclmulqdq %%XMM1, %%T2, %%T3, 0x00 2088 2089 ;;;;;;;;;;;;;;;;;;;;;; 2090 2091 vmovdqu %%T5, [%%GDATA + HashKey_7] 2092 vpshufd %%T2, %%XMM2, 01001110b 2093 vpshufd %%T3, %%T5, 01001110b 2094 vpxor %%T2, %%T2, %%XMM2 2095 vpxor %%T3, %%T3, %%T5 2096 2097 vpclmulqdq %%T4, %%XMM2, %%T5, 0x11 2098 vpxor %%T6, %%T6, %%T4 2099 2100 vpclmulqdq %%T4, %%XMM2, %%T5, 0x00 2101 vpxor %%T7, %%T7, %%T4 2102 2103 vpclmulqdq %%T2, %%T2, %%T3, 0x00 2104 2105 vpxor %%XMM1, %%XMM1, %%T2 2106 2107 ;;;;;;;;;;;;;;;;;;;;;; 2108 2109 vmovdqu %%T5, [%%GDATA + HashKey_6] 2110 vpshufd %%T2, %%XMM3, 01001110b 2111 vpshufd %%T3, %%T5, 01001110b 2112 vpxor %%T2, %%T2, %%XMM3 2113 vpxor %%T3, %%T3, %%T5 2114 2115 vpclmulqdq %%T4, %%XMM3, %%T5, 0x11 2116 vpxor %%T6, %%T6, %%T4 2117 2118 vpclmulqdq %%T4, %%XMM3, %%T5, 0x00 2119 vpxor %%T7, %%T7, %%T4 2120 2121 vpclmulqdq %%T2, %%T2, %%T3, 0x00 2122 2123 vpxor %%XMM1, %%XMM1, %%T2 2124 2125 ;;;;;;;;;;;;;;;;;;;;;; 2126 2127 vmovdqu %%T5, [%%GDATA + HashKey_5] 2128 vpshufd %%T2, %%XMM4, 01001110b 2129 vpshufd %%T3, %%T5, 01001110b 2130 vpxor %%T2, %%T2, %%XMM4 2131 vpxor %%T3, %%T3, %%T5 2132 2133 vpclmulqdq %%T4, %%XMM4, %%T5, 0x11 2134 vpxor %%T6, %%T6, %%T4 2135 2136 vpclmulqdq %%T4, %%XMM4, %%T5, 0x00 2137 vpxor %%T7, %%T7, %%T4 2138 2139 vpclmulqdq %%T2, %%T2, %%T3, 0x00 2140 2141 vpxor %%XMM1, %%XMM1, %%T2 2142 2143 ;;;;;;;;;;;;;;;;;;;;;; 2144 2145 vmovdqu %%T5, [%%GDATA + HashKey_4] 2146 vpshufd %%T2, %%XMM5, 01001110b 2147 vpshufd %%T3, %%T5, 01001110b 2148 vpxor %%T2, %%T2, %%XMM5 2149 vpxor %%T3, %%T3, %%T5 2150 2151 vpclmulqdq %%T4, %%XMM5, %%T5, 0x11 2152 vpxor %%T6, %%T6, %%T4 2153 2154 vpclmulqdq %%T4, %%XMM5, %%T5, 0x00 2155 vpxor %%T7, %%T7, %%T4 2156 2157 vpclmulqdq %%T2, %%T2, %%T3, 0x00 2158 2159 vpxor %%XMM1, %%XMM1, %%T2 2160 2161 ;;;;;;;;;;;;;;;;;;;;;; 2162 2163 vmovdqu %%T5, [%%GDATA + HashKey_3] 2164 vpshufd %%T2, %%XMM6, 01001110b 2165 vpshufd %%T3, %%T5, 01001110b 2166 vpxor %%T2, %%T2, %%XMM6 2167 vpxor %%T3, %%T3, %%T5 2168 2169 vpclmulqdq %%T4, %%XMM6, %%T5, 0x11 2170 vpxor %%T6, %%T6, %%T4 2171 2172 vpclmulqdq %%T4, %%XMM6, %%T5, 0x00 2173 vpxor %%T7, %%T7, %%T4 2174 2175 vpclmulqdq %%T2, %%T2, %%T3, 0x00 2176 2177 vpxor %%XMM1, %%XMM1, %%T2 2178 2179 ;;;;;;;;;;;;;;;;;;;;;; 2180 2181 vmovdqu %%T5, [%%GDATA + HashKey_2] 2182 vpshufd %%T2, %%XMM7, 01001110b 2183 vpshufd %%T3, %%T5, 01001110b 2184 vpxor %%T2, %%T2, %%XMM7 2185 vpxor %%T3, %%T3, %%T5 2186 2187 vpclmulqdq %%T4, %%XMM7, %%T5, 0x11 2188 vpxor %%T6, %%T6, %%T4 2189 2190 vpclmulqdq %%T4, %%XMM7, %%T5, 0x00 2191 vpxor %%T7, %%T7, %%T4 2192 2193 vpclmulqdq %%T2, %%T2, %%T3, 0x00 2194 2195 vpxor %%XMM1, %%XMM1, %%T2 2196 2197 ;;;;;;;;;;;;;;;;;;;;;; 2198 2199 vmovdqu %%T5, [%%GDATA + HashKey] 2200 vpshufd %%T2, %%XMM8, 01001110b 2201 vpshufd %%T3, %%T5, 01001110b 2202 vpxor %%T2, %%T2, %%XMM8 2203 vpxor %%T3, %%T3, %%T5 2204 2205 vpclmulqdq %%T4, %%XMM8, %%T5, 0x11 2206 vpxor %%T6, %%T6, %%T4 2207 2208 vpclmulqdq %%T4, %%XMM8, %%T5, 0x00 2209 vpxor %%T7, %%T7, %%T4 2210 2211 vpclmulqdq %%T2, %%T2, %%T3, 0x00 2212 2213 vpxor %%XMM1, %%XMM1, %%T2 2214 vpxor %%XMM1, %%XMM1, %%T6 2215 vpxor %%T2, %%XMM1, %%T7 2216 2217 2218 2219 2220 vpslldq %%T4, %%T2, 8 2221 vpsrldq %%T2, %%T2, 8 2222 2223 vpxor %%T7, %%T7, %%T4 2224 vpxor %%T6, %%T6, %%T2 ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications 2225 2226 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2227 ;first phase of the reduction 2228 vmovdqa %%T3, [rel POLY2] 2229 2230 vpclmulqdq %%T2, %%T3, %%T7, 0x01 2231 vpslldq %%T2, %%T2, 8 ; shift-L xmm2 2 DWs 2232 2233 vpxor %%T7, %%T7, %%T2 ; first phase of the reduction complete 2234 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2235 2236 2237 ;second phase of the reduction 2238 vpclmulqdq %%T2, %%T3, %%T7, 0x00 2239 vpsrldq %%T2, %%T2, 4 ; shift-R %%T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) 2240 2241 vpclmulqdq %%T4, %%T3, %%T7, 0x10 2242 vpslldq %%T4, %%T4, 4 ; shift-L %%T4 1 DW (Shift-L 1-DW to obtain result with no shifts) 2243 2244 vpxor %%T4, %%T4, %%T2 ; second phase of the reduction complete 2245 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2246 vpxor %%T6, %%T6, %%T4 ; the result is in %%T6 2247%endmacro 2248 2249 2250; GHASH the last 4 ciphertext blocks. 2251%macro GHASH_LAST_7 15 2252%define %%GDATA %1 2253%define %%T1 %2 2254%define %%T2 %3 2255%define %%T3 %4 2256%define %%T4 %5 2257%define %%T5 %6 2258%define %%T6 %7 2259%define %%T7 %8 2260%define %%XMM1 %9 2261%define %%XMM2 %10 2262%define %%XMM3 %11 2263%define %%XMM4 %12 2264%define %%XMM5 %13 2265%define %%XMM6 %14 2266%define %%XMM7 %15 2267 2268 ;; Karatsuba Method 2269 2270 vmovdqu %%T5, [%%GDATA + HashKey_7] 2271 2272 vpshufd %%T2, %%XMM1, 01001110b 2273 vpshufd %%T3, %%T5, 01001110b 2274 vpxor %%T2, %%T2, %%XMM1 2275 vpxor %%T3, %%T3, %%T5 2276 2277 vpclmulqdq %%T6, %%XMM1, %%T5, 0x11 2278 vpclmulqdq %%T7, %%XMM1, %%T5, 0x00 2279 2280 vpclmulqdq %%XMM1, %%T2, %%T3, 0x00 2281 2282 ;;;;;;;;;;;;;;;;;;;;;; 2283 2284 vmovdqu %%T5, [%%GDATA + HashKey_6] 2285 vpshufd %%T2, %%XMM2, 01001110b 2286 vpshufd %%T3, %%T5, 01001110b 2287 vpxor %%T2, %%T2, %%XMM2 2288 vpxor %%T3, %%T3, %%T5 2289 2290 vpclmulqdq %%T4, %%XMM2, %%T5, 0x11 2291 vpxor %%T6, %%T6, %%T4 2292 2293 vpclmulqdq %%T4, %%XMM2, %%T5, 0x00 2294 vpxor %%T7, %%T7, %%T4 2295 2296 vpclmulqdq %%T2, %%T2, %%T3, 0x00 2297 2298 vpxor %%XMM1, %%XMM1, %%T2 2299 2300 ;;;;;;;;;;;;;;;;;;;;;; 2301 2302 vmovdqu %%T5, [%%GDATA + HashKey_5] 2303 vpshufd %%T2, %%XMM3, 01001110b 2304 vpshufd %%T3, %%T5, 01001110b 2305 vpxor %%T2, %%T2, %%XMM3 2306 vpxor %%T3, %%T3, %%T5 2307 2308 vpclmulqdq %%T4, %%XMM3, %%T5, 0x11 2309 vpxor %%T6, %%T6, %%T4 2310 2311 vpclmulqdq %%T4, %%XMM3, %%T5, 0x00 2312 vpxor %%T7, %%T7, %%T4 2313 2314 vpclmulqdq %%T2, %%T2, %%T3, 0x00 2315 2316 vpxor %%XMM1, %%XMM1, %%T2 2317 2318 ;;;;;;;;;;;;;;;;;;;;;; 2319 2320 vmovdqu %%T5, [%%GDATA + HashKey_4] 2321 vpshufd %%T2, %%XMM4, 01001110b 2322 vpshufd %%T3, %%T5, 01001110b 2323 vpxor %%T2, %%T2, %%XMM4 2324 vpxor %%T3, %%T3, %%T5 2325 2326 vpclmulqdq %%T4, %%XMM4, %%T5, 0x11 2327 vpxor %%T6, %%T6, %%T4 2328 2329 vpclmulqdq %%T4, %%XMM4, %%T5, 0x00 2330 vpxor %%T7, %%T7, %%T4 2331 2332 vpclmulqdq %%T2, %%T2, %%T3, 0x00 2333 2334 vpxor %%XMM1, %%XMM1, %%T2 2335 2336 ;;;;;;;;;;;;;;;;;;;;;; 2337 2338 vmovdqu %%T5, [%%GDATA + HashKey_3] 2339 vpshufd %%T2, %%XMM5, 01001110b 2340 vpshufd %%T3, %%T5, 01001110b 2341 vpxor %%T2, %%T2, %%XMM5 2342 vpxor %%T3, %%T3, %%T5 2343 2344 vpclmulqdq %%T4, %%XMM5, %%T5, 0x11 2345 vpxor %%T6, %%T6, %%T4 2346 2347 vpclmulqdq %%T4, %%XMM5, %%T5, 0x00 2348 vpxor %%T7, %%T7, %%T4 2349 2350 vpclmulqdq %%T2, %%T2, %%T3, 0x00 2351 2352 vpxor %%XMM1, %%XMM1, %%T2 2353 2354 ;;;;;;;;;;;;;;;;;;;;;; 2355 2356 vmovdqu %%T5, [%%GDATA + HashKey_2] 2357 vpshufd %%T2, %%XMM6, 01001110b 2358 vpshufd %%T3, %%T5, 01001110b 2359 vpxor %%T2, %%T2, %%XMM6 2360 vpxor %%T3, %%T3, %%T5 2361 2362 vpclmulqdq %%T4, %%XMM6, %%T5, 0x11 2363 vpxor %%T6, %%T6, %%T4 2364 2365 vpclmulqdq %%T4, %%XMM6, %%T5, 0x00 2366 vpxor %%T7, %%T7, %%T4 2367 2368 vpclmulqdq %%T2, %%T2, %%T3, 0x00 2369 2370 vpxor %%XMM1, %%XMM1, %%T2 2371 2372 ;;;;;;;;;;;;;;;;;;;;;; 2373 2374 vmovdqu %%T5, [%%GDATA + HashKey_1] 2375 vpshufd %%T2, %%XMM7, 01001110b 2376 vpshufd %%T3, %%T5, 01001110b 2377 vpxor %%T2, %%T2, %%XMM7 2378 vpxor %%T3, %%T3, %%T5 2379 2380 vpclmulqdq %%T4, %%XMM7, %%T5, 0x11 2381 vpxor %%T6, %%T6, %%T4 2382 2383 vpclmulqdq %%T4, %%XMM7, %%T5, 0x00 2384 vpxor %%T7, %%T7, %%T4 2385 2386 vpclmulqdq %%T2, %%T2, %%T3, 0x00 2387 2388 vpxor %%XMM1, %%XMM1, %%T2 2389 2390 ;;;;;;;;;;;;;;;;;;;;;; 2391 2392 vpxor %%XMM1, %%XMM1, %%T6 2393 vpxor %%T2, %%XMM1, %%T7 2394 2395 2396 2397 2398 vpslldq %%T4, %%T2, 8 2399 vpsrldq %%T2, %%T2, 8 2400 2401 vpxor %%T7, %%T7, %%T4 2402 vpxor %%T6, %%T6, %%T2 ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications 2403 2404 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2405 ;first phase of the reduction 2406 vmovdqa %%T3, [rel POLY2] 2407 2408 vpclmulqdq %%T2, %%T3, %%T7, 0x01 2409 vpslldq %%T2, %%T2, 8 ; shift-L xmm2 2 DWs 2410 2411 vpxor %%T7, %%T7, %%T2 ; first phase of the reduction complete 2412 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2413 2414 2415 ;second phase of the reduction 2416 vpclmulqdq %%T2, %%T3, %%T7, 0x00 2417 vpsrldq %%T2, %%T2, 4 ; shift-R %%T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) 2418 2419 vpclmulqdq %%T4, %%T3, %%T7, 0x10 2420 vpslldq %%T4, %%T4, 4 ; shift-L %%T4 1 DW (Shift-L 1-DW to obtain result with no shifts) 2421 2422 vpxor %%T4, %%T4, %%T2 ; second phase of the reduction complete 2423 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2424 vpxor %%T6, %%T6, %%T4 ; the result is in %%T6 2425%endmacro 2426 2427 2428 2429;;; Handle encryption of the final partial block 2430;;; IN: 2431;;; r13 - Number of bytes to read 2432;;; MODIFIES: 2433;;; KEY - Key for encrypting the partial block 2434;;; HASH - Current hash value 2435;;; SMASHES: 2436;;; r10, r12, r15, rax 2437;;; T1, T2 2438;;; Note: 2439;;; PLAIN_CYPH_LEN, %7, is passed only to determine 2440;;; if buffer is big enough to do a 16 byte read & shift. 2441;;; 'LT16' is passed here only if buffer is known to be smaller 2442;;; than 16 bytes. 2443;;; Any other value passed here will result in 16 byte read 2444;;; code path. 2445;;; TBD: Remove HASH from the instantiation 2446%macro ENCRYPT_FINAL_PARTIAL_BLOCK 8 2447%define %%KEY %1 2448%define %%T1 %2 2449%define %%T2 %3 2450%define %%CYPH_PLAIN_OUT %4 2451%define %%PLAIN_CYPH_IN %5 2452%define %%PLAIN_CYPH_LEN %6 2453%define %%ENC_DEC %7 2454%define %%DATA_OFFSET %8 2455 2456 ;; NOTE: type of read tuned based %%PLAIN_CYPH_LEN setting 2457%ifidn %%PLAIN_CYPH_LEN, LT16 2458 ;; Handle the case where the message is < 16 bytes 2459 lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET] 2460 2461 ;; T1 - packed output 2462 ;; r10 - input data address 2463 ;; r13 - input data length 2464 ;; r12, r15, rax - temp registers 2465 READ_SMALL_DATA_INPUT %%T1, r10, r13, r12, r15, rax 2466 2467 lea r12, [SHIFT_MASK + 16] 2468 sub r12, r13 2469%else 2470 ;; Handle the case where the message is >= 16 bytes 2471 sub %%DATA_OFFSET, 16 2472 add %%DATA_OFFSET, r13 2473 ;; Receive the last <16 Byte block 2474 vmovdqu %%T1, [%%PLAIN_CYPH_IN+%%DATA_OFFSET] 2475 sub %%DATA_OFFSET, r13 2476 add %%DATA_OFFSET, 16 2477 2478 lea r12, [SHIFT_MASK + 16] 2479 ;; Adjust the shuffle mask pointer to be able to shift 16-r13 bytes 2480 ;; (r13 is the number of bytes in plaintext mod 16) 2481 sub r12, r13 2482 ;; Get the appropriate shuffle mask 2483 vmovdqu %%T2, [r12] 2484 ;; shift right 16-r13 bytes 2485 vpshufb %%T1, %%T2 2486%endif ; %%PLAIN_CYPH_LEN, LT16 2487 2488 ;; At this point T1 contains the partial block data 2489%ifidn %%ENC_DEC, DEC 2490 ;; Plaintext XOR E(K, Yn) 2491 ;; Set aside the ciphertext 2492 vmovdqa %%T2, %%T1 2493 vpxor %%KEY, %%KEY, %%T1 2494 ;; Get the appropriate mask to mask out top 16-r13 bytes of ciphertext 2495 vmovdqu %%T1, [r12 + ALL_F - SHIFT_MASK] 2496 ;; Mask out top 16-r13 bytes of ciphertext 2497 vpand %%KEY, %%KEY, %%T1 2498 2499 ;; Prepare the ciphertext for the hash 2500 ;; mask out top 16-r13 bytes of the plaintext 2501 vpand %%T2, %%T2, %%T1 2502%else 2503 ;; Plaintext XOR E(K, Yn) 2504 vpxor %%KEY, %%KEY, %%T1 2505 ;; Get the appropriate mask to mask out top 16-r13 bytes of %%KEY 2506 vmovdqu %%T1, [r12 + ALL_F - SHIFT_MASK] 2507 ;; Mask out top 16-r13 bytes of %%KEY 2508 vpand %%KEY, %%KEY, %%T1 2509%endif 2510 2511 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2512 ;; Output r13 Bytes 2513 vmovq rax, %%KEY 2514 cmp r13, 8 2515 jle %%_less_than_8_bytes_left 2516 2517 mov [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], rax 2518 add %%DATA_OFFSET, 8 2519 vpsrldq %%T1, %%KEY, 8 2520 vmovq rax, %%T1 2521 sub r13, 8 2522 2523%%_less_than_8_bytes_left: 2524 mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al 2525 add %%DATA_OFFSET, 1 2526 shr rax, 8 2527 sub r13, 1 2528 jne %%_less_than_8_bytes_left 2529 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2530 2531%ifidn %%ENC_DEC, DEC 2532 ;; If decrypt, restore the ciphertext into %%KEY 2533 vmovdqu %%KEY, %%T2 2534%endif 2535%endmacro ; ENCRYPT_FINAL_PARTIAL_BLOCK 2536 2537 2538 2539; Encryption of a single block 2540%macro ENCRYPT_SINGLE_BLOCK 2 2541%define %%GDATA %1 2542%define %%XMM0 %2 2543 2544 vpxor %%XMM0, %%XMM0, [%%GDATA+16*0] 2545%assign i 1 2546%rep NROUNDS 2547 vaesenc %%XMM0, [%%GDATA+16*i] 2548%assign i (i+1) 2549%endrep 2550 vaesenclast %%XMM0, [%%GDATA+16*i] 2551%endmacro 2552 2553 2554;; Start of Stack Setup 2555 2556%macro FUNC_SAVE 0 2557 ;; Required for Update/GMC_ENC 2558 ;the number of pushes must equal STACK_OFFSET 2559 push r12 2560 push r13 2561 push r14 2562 push r15 2563 mov r14, rsp 2564 2565 sub rsp, VARIABLE_OFFSET 2566 and rsp, ~63 2567 2568%ifidn __OUTPUT_FORMAT__, win64 2569 ; xmm6:xmm15 need to be maintained for Windows 2570 vmovdqu [rsp + LOCAL_STORAGE + 0*16],xmm6 2571 vmovdqu [rsp + LOCAL_STORAGE + 1*16],xmm7 2572 vmovdqu [rsp + LOCAL_STORAGE + 2*16],xmm8 2573 vmovdqu [rsp + LOCAL_STORAGE + 3*16],xmm9 2574 vmovdqu [rsp + LOCAL_STORAGE + 4*16],xmm10 2575 vmovdqu [rsp + LOCAL_STORAGE + 5*16],xmm11 2576 vmovdqu [rsp + LOCAL_STORAGE + 6*16],xmm12 2577 vmovdqu [rsp + LOCAL_STORAGE + 7*16],xmm13 2578 vmovdqu [rsp + LOCAL_STORAGE + 8*16],xmm14 2579 vmovdqu [rsp + LOCAL_STORAGE + 9*16],xmm15 2580%endif 2581%endmacro 2582 2583 2584%macro FUNC_RESTORE 0 2585 2586%ifdef SAFE_DATA 2587 clear_scratch_gps_asm 2588 clear_scratch_ymms_asm 2589%endif 2590%ifidn __OUTPUT_FORMAT__, win64 2591 vmovdqu xmm15, [rsp + LOCAL_STORAGE + 9*16] 2592 vmovdqu xmm14, [rsp + LOCAL_STORAGE + 8*16] 2593 vmovdqu xmm13, [rsp + LOCAL_STORAGE + 7*16] 2594 vmovdqu xmm12, [rsp + LOCAL_STORAGE + 6*16] 2595 vmovdqu xmm11, [rsp + LOCAL_STORAGE + 5*16] 2596 vmovdqu xmm10, [rsp + LOCAL_STORAGE + 4*16] 2597 vmovdqu xmm9, [rsp + LOCAL_STORAGE + 3*16] 2598 vmovdqu xmm8, [rsp + LOCAL_STORAGE + 2*16] 2599 vmovdqu xmm7, [rsp + LOCAL_STORAGE + 1*16] 2600 vmovdqu xmm6, [rsp + LOCAL_STORAGE + 0*16] 2601%endif 2602 2603;; Required for Update/GMC_ENC 2604 mov rsp, r14 2605 pop r15 2606 pop r14 2607 pop r13 2608 pop r12 2609%endmacro 2610 2611%macro CALC_J0 15 2612%define %%KEY %1 ;; [in] Pointer to GCM KEY structure 2613%define %%IV %2 ;; [in] Pointer to IV 2614%define %%IV_LEN %3 ;; [in] IV length 2615%define %%J0 %4 ;; [out] XMM reg to contain J0 2616%define %%TMP0 %5 ;; [clobbered] Temporary GP reg 2617%define %%TMP1 %6 ;; [clobbered] Temporary GP reg 2618%define %%TMP2 %7 ;; [clobbered] Temporary GP reg 2619%define %%TMP3 %8 ;; [clobbered] Temporary GP reg 2620%define %%TMP4 %9 ;; [clobbered] Temporary GP reg 2621%define %%XTMP0 %10 ;; [clobbered] Temporary XMM reg 2622%define %%XTMP1 %11 ;; [clobbered] Temporary XMM reg 2623%define %%XTMP2 %12 ;; [clobbered] Temporary XMM reg 2624%define %%XTMP3 %13 ;; [clobbered] Temporary XMM reg 2625%define %%XTMP4 %14 ;; [clobbered] Temporary XMM reg 2626%define %%XTMP5 %15 ;; [clobbered] Temporary XMM reg 2627 2628 ;; J0 = GHASH(IV || 0s+64 || len(IV)64) 2629 ;; s = 16 * RoundUp(len(IV)/16) - len(IV) */ 2630 2631 ;; Calculate GHASH of (IV || 0s) 2632 vpxor %%J0, %%J0 2633 CALC_AAD_HASH %%IV, %%IV_LEN, %%J0, %%KEY, %%XTMP0, %%XTMP1, %%XTMP2, \ 2634 %%XTMP3, %%XTMP4, %%XTMP5, %%TMP0, %%TMP1, %%TMP2, %%TMP3, %%TMP4 2635 2636 ;; Calculate GHASH of last 16-byte block (0 || len(IV)64) 2637 vmovdqu %%XTMP0, [%%KEY + HashKey] 2638 mov %%TMP2, %%IV_LEN 2639 shl %%TMP2, 3 ;; IV length in bits 2640 vmovq %%XTMP1, %%TMP2 2641 vpxor %%J0, %%XTMP1 2642 GHASH_MUL %%J0, %%XTMP0, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5 2643 2644 vpshufb %%J0, [rel SHUF_MASK] ; perform a 16Byte swap 2645%endmacro 2646 2647 2648;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2649; GCM_INIT initializes a gcm_context_data struct to prepare for encoding/decoding. 2650; Input: gcm_key_data * (GDATA_KEY), gcm_context_data *(GDATA_CTX), IV, IV_LEN, 2651; Additional Authentication data (A_IN), Additional Data length (A_LEN). 2652; Output: Updated GDATA_CTX with the hash of A_IN (AadHash) and initialized other parts of GDATA. 2653; Clobbers rax, r10-r13 and xmm0-xmm6 2654;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2655%macro GCM_INIT 5-6 2656%define %%GDATA_KEY %1 ; [in] Pointer to GCM Key data structure 2657%define %%GDATA_CTX %2 ; [in/out] Pointer to GCM Context data structure 2658%define %%IV %3 ; [in] Pointer to IV 2659%define %%A_IN %4 ; [in] Pointer to AAD 2660%define %%A_LEN %5 ; [in] AAD length 2661%define %%IV_LEN %6 ; [in] IV length 2662 2663%define %%AAD_HASH xmm14 2664 2665 2666 mov r10, %%A_LEN 2667 cmp r10, 0 2668 je %%_aad_is_zero 2669 2670 vpxor %%AAD_HASH, %%AAD_HASH 2671 CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%GDATA_KEY, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, r10, r11, r12, r13, rax 2672 jmp %%_after_aad 2673 2674%%_aad_is_zero: 2675 vpxor %%AAD_HASH, %%AAD_HASH 2676 2677%%_after_aad: 2678 mov r10, %%A_LEN 2679 vpxor xmm2, xmm3 2680 2681 vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH ; ctx_data.aad hash = aad_hash 2682 mov [%%GDATA_CTX + AadLen], r10 ; ctx_data.aad_length = aad_length 2683 xor r10, r10 2684 mov [%%GDATA_CTX + InLen], r10 ; ctx_data.in_length = 0 2685 mov [%%GDATA_CTX + PBlockLen], r10 ; ctx_data.partial_block_length = 0 2686 vmovdqu [%%GDATA_CTX + PBlockEncKey], xmm2 ; ctx_data.partial_block_enc_key = 0 2687 mov r10, %%IV 2688%if %0 == 6 ;; IV is different than 12 bytes 2689 CALC_J0 %%GDATA_KEY, %%IV, %%IV_LEN, xmm2, r10, r11, r12, r13, rax, xmm1, xmm0, \ 2690 xmm3, xmm4, xmm5, xmm6 2691%else ;; IV is 12 bytes 2692 vmovdqa xmm2, [rel ONEf] ; read 12 IV bytes and pad with 0x00000001 2693 vpinsrq xmm2, [r10], 0 2694 vpinsrd xmm2, [r10+8], 2 2695%endif 2696 vmovdqu [%%GDATA_CTX + OrigIV], xmm2 ; ctx_data.orig_IV = iv 2697 2698 vpshufb xmm2, [rel SHUF_MASK] 2699 2700 vmovdqu [%%GDATA_CTX + CurCount], xmm2 ; ctx_data.current_counter = iv 2701%endmacro 2702 2703%macro GCM_ENC_DEC_SMALL 12 2704%define %%GDATA_KEY %1 2705%define %%GDATA_CTX %2 2706%define %%CYPH_PLAIN_OUT %3 2707%define %%PLAIN_CYPH_IN %4 2708%define %%PLAIN_CYPH_LEN %5 2709%define %%ENC_DEC %6 2710%define %%DATA_OFFSET %7 2711%define %%LENGTH %8 2712%define %%NUM_BLOCKS %9 2713%define %%CTR %10 2714%define %%HASH %11 2715%define %%INSTANCE_TYPE %12 2716 2717 ;; NOTE: the check below is obsolete in current implementation. The check is already done in GCM_ENC_DEC. 2718 ;; cmp %%NUM_BLOCKS, 0 2719 ;; je %%_small_initial_blocks_encrypted 2720 cmp %%NUM_BLOCKS, 8 2721 je %%_small_initial_num_blocks_is_8 2722 cmp %%NUM_BLOCKS, 7 2723 je %%_small_initial_num_blocks_is_7 2724 cmp %%NUM_BLOCKS, 6 2725 je %%_small_initial_num_blocks_is_6 2726 cmp %%NUM_BLOCKS, 5 2727 je %%_small_initial_num_blocks_is_5 2728 cmp %%NUM_BLOCKS, 4 2729 je %%_small_initial_num_blocks_is_4 2730 cmp %%NUM_BLOCKS, 3 2731 je %%_small_initial_num_blocks_is_3 2732 cmp %%NUM_BLOCKS, 2 2733 je %%_small_initial_num_blocks_is_2 2734 2735 jmp %%_small_initial_num_blocks_is_1 2736 2737 2738%%_small_initial_num_blocks_is_8: 2739 INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 8, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE 2740 jmp %%_small_initial_blocks_encrypted 2741 2742%%_small_initial_num_blocks_is_7: 2743 ;; r13 - %%LENGTH 2744 ;; xmm12 - T1 2745 ;; xmm13 - T2 2746 ;; xmm14 - T3 - AAD HASH OUT when not producing 8 AES keys 2747 ;; xmm15 - T4 2748 ;; xmm11 - T5 2749 ;; xmm9 - CTR 2750 ;; xmm1 - XMM1 - Cipher + Hash when producing 8 AES keys 2751 ;; xmm2 - XMM2 2752 ;; xmm3 - XMM3 2753 ;; xmm4 - XMM4 2754 ;; xmm5 - XMM5 2755 ;; xmm6 - XMM6 2756 ;; xmm7 - XMM7 2757 ;; xmm8 - XMM8 - AAD HASH IN 2758 ;; xmm10 - T6 2759 ;; xmm0 - T_key 2760 INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 7, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE 2761 jmp %%_small_initial_blocks_encrypted 2762 2763%%_small_initial_num_blocks_is_6: 2764 INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 6, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE 2765 jmp %%_small_initial_blocks_encrypted 2766 2767%%_small_initial_num_blocks_is_5: 2768 INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 5, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE 2769 jmp %%_small_initial_blocks_encrypted 2770 2771%%_small_initial_num_blocks_is_4: 2772 INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 4, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE 2773 jmp %%_small_initial_blocks_encrypted 2774 2775%%_small_initial_num_blocks_is_3: 2776 INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 3, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE 2777 jmp %%_small_initial_blocks_encrypted 2778 2779%%_small_initial_num_blocks_is_2: 2780 INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 2, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE 2781 jmp %%_small_initial_blocks_encrypted 2782 2783%%_small_initial_num_blocks_is_1: 2784 INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 1, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE 2785 2786 ;; Note: zero initial blocks not allowed. 2787 2788%%_small_initial_blocks_encrypted: 2789 2790%endmacro ; GCM_ENC_DEC_SMALL 2791 2792;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2793; GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context_data struct 2794; has been initialized by GCM_INIT 2795; Requires the input data be at least 1 byte long because of READ_SMALL_INPUT_DATA. 2796; Input: gcm_key_data struct* (GDATA_KEY), gcm_context_data *(GDATA_CTX), input text (PLAIN_CYPH_IN), 2797; input text length (PLAIN_CYPH_LEN) and whether encoding or decoding (ENC_DEC). 2798; Output: A cypher of the given plain text (CYPH_PLAIN_OUT), and updated GDATA_CTX 2799; Clobbers rax, r10-r15, and xmm0-xmm15 2800;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2801%macro GCM_ENC_DEC 7 2802%define %%GDATA_KEY %1 2803%define %%GDATA_CTX %2 2804%define %%CYPH_PLAIN_OUT %3 2805%define %%PLAIN_CYPH_IN %4 2806%define %%PLAIN_CYPH_LEN %5 2807%define %%ENC_DEC %6 2808%define %%INSTANCE_TYPE %7 2809%define %%DATA_OFFSET r11 2810 2811; Macro flow: 2812; calculate the number of 16byte blocks in the message 2813; process (number of 16byte blocks) mod 8 '%%_initial_num_blocks_is_# .. %%_initial_blocks_encrypted' 2814; process 8 16 byte blocks at a time until all are done '%%_encrypt_by_8_new .. %%_eight_cipher_left' 2815; if there is a block of less than 16 bytes process it '%%_zero_cipher_left .. %%_multiple_of_16_bytes' 2816 2817 cmp %%PLAIN_CYPH_LEN, 0 2818 je %%_enc_dec_done 2819 2820 xor %%DATA_OFFSET, %%DATA_OFFSET 2821 ;; Update length of data processed 2822%ifidn __OUTPUT_FORMAT__, win64 2823 mov rax, %%PLAIN_CYPH_LEN 2824 add [%%GDATA_CTX + InLen], rax 2825%else 2826 add [%%GDATA_CTX + InLen], %%PLAIN_CYPH_LEN 2827%endif 2828 vmovdqu xmm13, [%%GDATA_KEY + HashKey] 2829 vmovdqu xmm8, [%%GDATA_CTX + AadHash] 2830 2831%ifidn %%INSTANCE_TYPE, multi_call 2832 ;; NOTE: partial block processing makes only sense for multi_call here. 2833 ;; Used for the update flow - if there was a previous partial 2834 ;; block fill the remaining bytes here. 2835 PARTIAL_BLOCK %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%DATA_OFFSET, xmm8, xmm13, %%ENC_DEC 2836%endif 2837 2838 ;; lift CTR set from initial_blocks to here 2839%ifidn %%INSTANCE_TYPE, single_call 2840 vmovdqu xmm9, xmm2 2841%else 2842 vmovdqu xmm9, [%%GDATA_CTX + CurCount] 2843%endif 2844 2845 ;; Save the amount of data left to process in r10 2846 mov r13, %%PLAIN_CYPH_LEN 2847%ifidn %%INSTANCE_TYPE, multi_call 2848 ;; NOTE: %%DATA_OFFSET is zero in single_call case. 2849 ;; Consequently PLAIN_CYPH_LEN will never be zero after 2850 ;; %%DATA_OFFSET subtraction below. 2851 sub r13, %%DATA_OFFSET 2852 2853 ;; There may be no more data if it was consumed in the partial block. 2854 cmp r13, 0 2855 je %%_enc_dec_done 2856%endif ; %%INSTANCE_TYPE, multi_call 2857 mov r10, r13 2858 2859 ;; Determine how many blocks to process in INITIAL 2860 mov r12, r13 2861 shr r12, 4 2862 and r12, 7 2863 2864 ;; Process one additional block in INITIAL if there is a partial block 2865 and r10, 0xf 2866 blsmsk r10, r10 ; Set CF if zero 2867 cmc ; Flip CF 2868 adc r12, 0x0 ; Process an additional INITIAL block if CF set 2869 2870 ;; Less than 127B will be handled by the small message code, which 2871 ;; can process up to 7 16B blocks. 2872 cmp r13, 128 2873 jge %%_large_message_path 2874 2875 GCM_ENC_DEC_SMALL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%ENC_DEC, %%DATA_OFFSET, r13, r12, xmm9, xmm14, %%INSTANCE_TYPE 2876 jmp %%_ghash_done 2877 2878%%_large_message_path: 2879 and r12, 0x7 ; Still, don't allow 8 INITIAL blocks since this will 2880 ; can be handled by the x8 partial loop. 2881 2882 cmp r12, 0 2883 je %%_initial_num_blocks_is_0 2884 cmp r12, 7 2885 je %%_initial_num_blocks_is_7 2886 cmp r12, 6 2887 je %%_initial_num_blocks_is_6 2888 cmp r12, 5 2889 je %%_initial_num_blocks_is_5 2890 cmp r12, 4 2891 je %%_initial_num_blocks_is_4 2892 cmp r12, 3 2893 je %%_initial_num_blocks_is_3 2894 cmp r12, 2 2895 je %%_initial_num_blocks_is_2 2896 2897 jmp %%_initial_num_blocks_is_1 2898 2899%%_initial_num_blocks_is_7: 2900 ;; r13 - %%LENGTH 2901 ;; xmm12 - T1 2902 ;; xmm13 - T2 2903 ;; xmm14 - T3 - AAD HASH OUT when not producing 8 AES keys 2904 ;; xmm15 - T4 2905 ;; xmm11 - T5 2906 ;; xmm9 - CTR 2907 ;; xmm1 - XMM1 - Cipher + Hash when producing 8 AES keys 2908 ;; xmm2 - XMM2 2909 ;; xmm3 - XMM3 2910 ;; xmm4 - XMM4 2911 ;; xmm5 - XMM5 2912 ;; xmm6 - XMM6 2913 ;; xmm7 - XMM7 2914 ;; xmm8 - XMM8 - AAD HASH IN 2915 ;; xmm10 - T6 2916 ;; xmm0 - T_key 2917 INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 7, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC 2918 jmp %%_initial_blocks_encrypted 2919 2920%%_initial_num_blocks_is_6: 2921 INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 6, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC 2922 jmp %%_initial_blocks_encrypted 2923 2924%%_initial_num_blocks_is_5: 2925 INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 5, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC 2926 jmp %%_initial_blocks_encrypted 2927 2928%%_initial_num_blocks_is_4: 2929 INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 4, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC 2930 jmp %%_initial_blocks_encrypted 2931 2932%%_initial_num_blocks_is_3: 2933 INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 3, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC 2934 jmp %%_initial_blocks_encrypted 2935 2936%%_initial_num_blocks_is_2: 2937 INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 2, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC 2938 jmp %%_initial_blocks_encrypted 2939 2940%%_initial_num_blocks_is_1: 2941 INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 1, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC 2942 jmp %%_initial_blocks_encrypted 2943 2944%%_initial_num_blocks_is_0: 2945 INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 0, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC 2946 2947 2948%%_initial_blocks_encrypted: 2949 ;; The entire message was encrypted processed in initial and now need to be hashed 2950 cmp r13, 0 2951 je %%_encrypt_done 2952 2953 ;; Encrypt the final <16 byte (partial) block, then hash 2954 cmp r13, 16 2955 jl %%_encrypt_final_partial 2956 2957 ;; Process 7 full blocks plus a partial block 2958 cmp r13, 128 2959 jl %%_encrypt_by_8_partial 2960 2961 2962%%_encrypt_by_8_parallel: 2963 ;; in_order vs. out_order is an optimization to increment the counter without shuffling 2964 ;; it back into little endian. r15d keeps track of when we need to increent in order so 2965 ;; that the carry is handled correctly. 2966 vmovd r15d, xmm9 2967 and r15d, 255 2968 vpshufb xmm9, [rel SHUF_MASK] 2969 2970 2971%%_encrypt_by_8_new: 2972 cmp r15d, 255-8 2973 jg %%_encrypt_by_8 2974 2975 2976 2977 ;; xmm0 - T1 2978 ;; xmm10 - T2 2979 ;; xmm11 - T3 2980 ;; xmm12 - T4 2981 ;; xmm13 - T5 2982 ;; xmm14 - T6 2983 ;; xmm9 - CTR 2984 ;; xmm1 - XMM1 2985 ;; xmm2 - XMM2 2986 ;; xmm3 - XMM3 2987 ;; xmm4 - XMM4 2988 ;; xmm5 - XMM5 2989 ;; xmm6 - XMM6 2990 ;; xmm7 - XMM7 2991 ;; xmm8 - XMM8 2992 ;; xmm15 - T7 2993 add r15b, 8 2994 GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, out_order, %%ENC_DEC, full 2995 add %%DATA_OFFSET, 128 2996 sub r13, 128 2997 cmp r13, 128 2998 jge %%_encrypt_by_8_new 2999 3000 vpshufb xmm9, [rel SHUF_MASK] 3001 jmp %%_encrypt_by_8_parallel_done 3002 3003%%_encrypt_by_8: 3004 vpshufb xmm9, [rel SHUF_MASK] 3005 add r15b, 8 3006 GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC, full 3007 vpshufb xmm9, [rel SHUF_MASK] 3008 add %%DATA_OFFSET, 128 3009 sub r13, 128 3010 cmp r13, 128 3011 jge %%_encrypt_by_8_new 3012 vpshufb xmm9, [rel SHUF_MASK] 3013 3014 3015%%_encrypt_by_8_parallel_done: 3016 ;; Test to see if we need a by 8 with partial block. At this point 3017 ;; bytes remaining should be either zero or between 113-127. 3018 cmp r13, 0 3019 je %%_encrypt_done 3020 3021%%_encrypt_by_8_partial: 3022 ;; Shuffle needed to align key for partial block xor. out_order 3023 ;; is a little faster because it avoids extra shuffles. 3024 ;; TBD: Might need to account for when we don't have room to increment the counter. 3025 3026 3027 ;; Process parallel buffers with a final partial block. 3028 GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC, partial 3029 3030 3031 add %%DATA_OFFSET, 128-16 3032 sub r13, 128-16 3033 3034%%_encrypt_final_partial: 3035 3036 vpshufb xmm8, [rel SHUF_MASK] 3037 mov [%%GDATA_CTX + PBlockLen], r13 3038 vmovdqu [%%GDATA_CTX + PBlockEncKey], xmm8 3039 3040 ;; xmm8 - Final encrypted counter - need to hash with partial or full block ciphertext 3041 ;; GDATA, KEY, T1, T2 3042 ENCRYPT_FINAL_PARTIAL_BLOCK xmm8, xmm0, xmm10, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%ENC_DEC, %%DATA_OFFSET 3043 3044 vpshufb xmm8, [rel SHUF_MASK] 3045 3046 3047%%_encrypt_done: 3048 3049 ;; Mapping to macro parameters 3050 ;; IN: 3051 ;; xmm9 contains the counter 3052 ;; xmm1-xmm8 contain the xor'd ciphertext 3053 ;; OUT: 3054 ;; xmm14 contains the final hash 3055 ;; GDATA, T1, T2, T3, T4, T5, T6, T7, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8 3056%ifidn %%INSTANCE_TYPE, multi_call 3057 mov r13, [%%GDATA_CTX + PBlockLen] 3058 cmp r13, 0 3059 jz %%_hash_last_8 3060 GHASH_LAST_7 %%GDATA_KEY, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7 3061 ;; XOR the partial word into the hash 3062 vpxor xmm14, xmm14, xmm8 3063 jmp %%_ghash_done 3064%endif 3065%%_hash_last_8: 3066 GHASH_LAST_8 %%GDATA_KEY, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8 3067 3068%%_ghash_done: 3069 vmovdqu [%%GDATA_CTX + CurCount], xmm9 ; my_ctx_data.current_counter = xmm9 3070 vmovdqu [%%GDATA_CTX + AadHash], xmm14 ; my_ctx_data.aad hash = xmm14 3071 3072%%_enc_dec_done: 3073 3074 3075%endmacro 3076 3077 3078;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3079; GCM_COMPLETE Finishes Encryption/Decryption of last partial block after GCM_UPDATE finishes. 3080; Input: A gcm_key_data * (GDATA_KEY), gcm_context_data (GDATA_CTX). 3081; Output: Authorization Tag (AUTH_TAG) and Authorization Tag length (AUTH_TAG_LEN) 3082; Clobbers rax, r10-r12, and xmm0-xmm2, xmm5-xmm6, xmm9-xmm11, xmm13-xmm15 3083;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3084%macro GCM_COMPLETE 5 3085%define %%GDATA_KEY %1 3086%define %%GDATA_CTX %2 3087%define %%AUTH_TAG %3 3088%define %%AUTH_TAG_LEN %4 3089%define %%INSTANCE_TYPE %5 3090%define %%PLAIN_CYPH_LEN rax 3091 3092 vmovdqu xmm13, [%%GDATA_KEY + HashKey] 3093 ;; Start AES as early as possible 3094 vmovdqu xmm9, [%%GDATA_CTX + OrigIV] ; xmm9 = Y0 3095 ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, xmm9 ; E(K, Y0) 3096 3097%ifidn %%INSTANCE_TYPE, multi_call 3098 ;; If the GCM function is called as a single function call rather 3099 ;; than invoking the individual parts (init, update, finalize) we 3100 ;; can remove a write to read dependency on AadHash. 3101 vmovdqu xmm14, [%%GDATA_CTX + AadHash] 3102 3103 ;; Encrypt the final partial block. If we did this as a single call then 3104 ;; the partial block was handled in the main GCM_ENC_DEC macro. 3105 mov r12, [%%GDATA_CTX + PBlockLen] 3106 cmp r12, 0 3107 3108 je %%_partial_done 3109 3110 GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block 3111 vmovdqu [%%GDATA_CTX + AadHash], xmm14 3112 3113%%_partial_done: 3114 3115%endif 3116 3117 mov r12, [%%GDATA_CTX + AadLen] ; r12 = aadLen (number of bytes) 3118 mov %%PLAIN_CYPH_LEN, [%%GDATA_CTX + InLen] 3119 3120 shl r12, 3 ; convert into number of bits 3121 vmovq xmm15, r12 ; len(A) in xmm15 3122 3123 shl %%PLAIN_CYPH_LEN, 3 ; len(C) in bits (*128) 3124 vmovq xmm1, %%PLAIN_CYPH_LEN 3125 vpslldq xmm15, xmm15, 8 ; xmm15 = len(A)|| 0x0000000000000000 3126 vpxor xmm15, xmm15, xmm1 ; xmm15 = len(A)||len(C) 3127 3128 vpxor xmm14, xmm15 3129 GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 3130 vpshufb xmm14, [rel SHUF_MASK] ; perform a 16Byte swap 3131 3132 vpxor xmm9, xmm9, xmm14 3133 3134 3135%%_return_T: 3136 mov r10, %%AUTH_TAG ; r10 = authTag 3137 mov r11, %%AUTH_TAG_LEN ; r11 = auth_tag_len 3138 3139 cmp r11, 16 3140 je %%_T_16 3141 3142 cmp r11, 12 3143 je %%_T_12 3144 3145 cmp r11, 8 3146 je %%_T_8 3147 3148 simd_store_avx r10, xmm9, r11, r12, rax 3149 jmp %%_return_T_done 3150%%_T_8: 3151 vmovq rax, xmm9 3152 mov [r10], rax 3153 jmp %%_return_T_done 3154%%_T_12: 3155 vmovq rax, xmm9 3156 mov [r10], rax 3157 vpsrldq xmm9, xmm9, 8 3158 vmovd eax, xmm9 3159 mov [r10 + 8], eax 3160 jmp %%_return_T_done 3161%%_T_16: 3162 vmovdqu [r10], xmm9 3163 3164%%_return_T_done: 3165 3166%ifdef SAFE_DATA 3167 ;; Clear sensitive data from context structure 3168 vpxor xmm0, xmm0 3169 vmovdqu [%%GDATA_CTX + AadHash], xmm0 3170 vmovdqu [%%GDATA_CTX + PBlockEncKey], xmm0 3171%endif 3172%endmacro ; GCM_COMPLETE 3173 3174 3175;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3176;void aes_gcm_precomp_128_avx_gen4 / 3177; aes_gcm_precomp_192_avx_gen4 / 3178; aes_gcm_precomp_256_avx_gen4 3179; (struct gcm_key_data *key_data) 3180;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3181MKGLOBAL(FN_NAME(precomp,_),function,) 3182FN_NAME(precomp,_): 3183 3184%ifdef SAFE_PARAM 3185 ;; Check key_data != NULL 3186 cmp arg1, 0 3187 jz exit_precomp 3188%endif 3189 3190 push r12 3191 push r13 3192 push r14 3193 push r15 3194 3195 mov r14, rsp 3196 3197 3198 3199 sub rsp, VARIABLE_OFFSET 3200 and rsp, ~63 ; align rsp to 64 bytes 3201 3202%ifidn __OUTPUT_FORMAT__, win64 3203 ; only xmm6 needs to be maintained 3204 vmovdqu [rsp + LOCAL_STORAGE + 0*16],xmm6 3205%endif 3206 3207 vpxor xmm6, xmm6 3208 ENCRYPT_SINGLE_BLOCK arg1, xmm6 ; xmm6 = HashKey 3209 3210 vpshufb xmm6, [rel SHUF_MASK] 3211 ;;;;;;;;;;;;;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey;;;;;;;;;;;;;;; 3212 vmovdqa xmm2, xmm6 3213 vpsllq xmm6, xmm6, 1 3214 vpsrlq xmm2, xmm2, 63 3215 vmovdqa xmm1, xmm2 3216 vpslldq xmm2, xmm2, 8 3217 vpsrldq xmm1, xmm1, 8 3218 vpor xmm6, xmm6, xmm2 3219 ;reduction 3220 vpshufd xmm2, xmm1, 00100100b 3221 vpcmpeqd xmm2, [rel TWOONE] 3222 vpand xmm2, xmm2, [rel POLY] 3223 vpxor xmm6, xmm6, xmm2 ; xmm6 holds the HashKey<<1 mod poly 3224 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3225 vmovdqu [arg1 + HashKey], xmm6 ; store HashKey<<1 mod poly 3226 3227 3228 PRECOMPUTE arg1, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 3229 3230%ifidn __OUTPUT_FORMAT__, win64 3231 vmovdqu xmm6, [rsp + LOCAL_STORAGE + 0*16] 3232%endif 3233 mov rsp, r14 3234 3235 pop r15 3236 pop r14 3237 pop r13 3238 pop r12 3239 3240%ifdef SAFE_DATA 3241 clear_scratch_gps_asm 3242 clear_scratch_ymms_asm 3243%endif 3244exit_precomp: 3245 3246 ret 3247 3248 3249;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3250;void aes_gcm_init_128_avx_gen4 / aes_gcm_init_192_avx_gen4 / aes_gcm_init_256_avx_gen4 3251; (const struct gcm_key_data *key_data, 3252; struct gcm_context_data *context_data, 3253; u8 *iv, 3254; const u8 *aad, 3255; u64 aad_len); 3256;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3257MKGLOBAL(FN_NAME(init,_),function,) 3258FN_NAME(init,_): 3259 push r12 3260 push r13 3261%ifidn __OUTPUT_FORMAT__, win64 3262 push r14 3263 push r15 3264 mov r14, rsp 3265 ; xmm6 needs to be maintained for Windows 3266 sub rsp, 1*16 3267 vmovdqu [rsp + 0*16], xmm6 3268%endif 3269 3270%ifdef SAFE_PARAM 3271 ;; Check key_data != NULL 3272 cmp arg1, 0 3273 jz exit_init 3274 3275 ;; Check context_data != NULL 3276 cmp arg2, 0 3277 jz exit_init 3278 3279 ;; Check IV != NULL 3280 cmp arg3, 0 3281 jz exit_init 3282 3283 ;; Check if aad_len == 0 3284 cmp arg5, 0 3285 jz skip_aad_check_init 3286 3287 ;; Check aad != NULL (aad_len != 0) 3288 cmp arg4, 0 3289 jz exit_init 3290 3291skip_aad_check_init: 3292%endif 3293 GCM_INIT arg1, arg2, arg3, arg4, arg5 3294 3295%ifdef SAFE_DATA 3296 clear_scratch_gps_asm 3297 clear_scratch_ymms_asm 3298%endif 3299exit_init: 3300 3301%ifidn __OUTPUT_FORMAT__, win64 3302 vmovdqu xmm6 , [rsp + 0*16] 3303 mov rsp, r14 3304 pop r15 3305 pop r14 3306%endif 3307 pop r13 3308 pop r12 3309 ret 3310 3311;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3312;void aes_gcm_init_var_iv_128_avx_gen4 / aes_gcm_init_var_iv_192_avx_gen4 / 3313; aes_gcm_init_var_iv_256_avx_gen4 3314; const struct gcm_key_data *key_data, 3315; struct gcm_context_data *context_data, 3316; u8 *iv, 3317; const u64 iv_len, 3318; const u8 *aad, 3319; const u64 aad_len); 3320;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3321MKGLOBAL(FN_NAME(init_var_iv,_),function,) 3322FN_NAME(init_var_iv,_): 3323 push r12 3324 push r13 3325%ifidn __OUTPUT_FORMAT__, win64 3326 push r14 3327 push r15 3328 mov r14, rsp 3329 ; xmm6 needs to be maintained for Windows 3330 sub rsp, 1*16 3331 vmovdqu [rsp + 0*16], xmm6 3332%endif 3333 3334%ifdef SAFE_PARAM 3335 ;; Check key_data != NULL 3336 cmp arg1, 0 3337 jz exit_init_IV 3338 3339 ;; Check context_data != NULL 3340 cmp arg2, 0 3341 jz exit_init_IV 3342 3343 ;; Check IV != NULL 3344 cmp arg3, 0 3345 jz exit_init_IV 3346 3347 ;; Check iv_len != 0 3348 cmp arg4, 0 3349 jz exit_init_IV 3350 3351 ;; Check if aad_len == 0 3352 cmp arg6, 0 3353 jz skip_aad_check_init_IV 3354 3355 ;; Check aad != NULL (aad_len != 0) 3356 cmp arg5, 0 3357 jz exit_init_IV 3358 3359skip_aad_check_init_IV: 3360%endif 3361 cmp arg4, 12 3362 je iv_len_12_init_IV 3363 3364 GCM_INIT arg1, arg2, arg3, arg5, arg6, arg4 3365 jmp skip_iv_len_12_init_IV 3366 3367iv_len_12_init_IV: 3368 GCM_INIT arg1, arg2, arg3, arg5, arg6 3369 3370skip_iv_len_12_init_IV: 3371%ifdef SAFE_DATA 3372 clear_scratch_gps_asm 3373 clear_scratch_ymms_asm 3374%endif 3375exit_init_IV: 3376 3377%ifidn __OUTPUT_FORMAT__, win64 3378 vmovdqu xmm6 , [rsp + 0*16] 3379 mov rsp, r14 3380 pop r15 3381 pop r14 3382%endif 3383 pop r13 3384 pop r12 3385 ret 3386 3387;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3388;void aes_gcm_enc_128_update_avx_gen4 / aes_gcm_enc_192_update_avx_gen4 / 3389; aes_gcm_enc_128_update_avx_gen4 3390; (const struct gcm_key_data *key_data, 3391; struct gcm_context_data *context_data, 3392; u8 *out, 3393; const u8 *in, 3394; u64 plaintext_len); 3395;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3396MKGLOBAL(FN_NAME(enc,_update_),function,) 3397FN_NAME(enc,_update_): 3398 3399 FUNC_SAVE 3400 3401%ifdef SAFE_PARAM 3402 ;; Check key_data != NULL 3403 cmp arg1, 0 3404 jz exit_update_enc 3405 3406 ;; Check context_data != NULL 3407 cmp arg2, 0 3408 jz exit_update_enc 3409 3410 ;; Check if plaintext_len == 0 3411 cmp arg5, 0 3412 jz skip_in_out_check_update_enc 3413 3414 ;; Check out != NULL (plaintext_len != 0) 3415 cmp arg3, 0 3416 jz exit_update_enc 3417 3418 ;; Check in != NULL (plaintext_len != 0) 3419 cmp arg4, 0 3420 jz exit_update_enc 3421 3422skip_in_out_check_update_enc: 3423%endif 3424 GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC, multi_call 3425 3426exit_update_enc: 3427 FUNC_RESTORE 3428 3429 ret 3430 3431 3432;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3433;void aes_gcm_dec_128_update_avx_gen4 / aes_gcm_dec_192_update_avx_gen4 / 3434; aes_gcm_dec_256_update_avx_gen4 3435; (const struct gcm_key_data *key_data, 3436; struct gcm_context_data *context_data, 3437; u8 *out, 3438; const u8 *in, 3439; u64 plaintext_len); 3440;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3441MKGLOBAL(FN_NAME(dec,_update_),function,) 3442FN_NAME(dec,_update_): 3443 3444 FUNC_SAVE 3445 3446%ifdef SAFE_PARAM 3447 ;; Check key_data != NULL 3448 cmp arg1, 0 3449 jz exit_update_dec 3450 3451 ;; Check context_data != NULL 3452 cmp arg2, 0 3453 jz exit_update_dec 3454 3455 ;; Check if plaintext_len == 0 3456 cmp arg5, 0 3457 jz skip_in_out_check_update_dec 3458 3459 ;; Check out != NULL (plaintext_len != 0) 3460 cmp arg3, 0 3461 jz exit_update_dec 3462 3463 ;; Check in != NULL (plaintext_len != 0) 3464 cmp arg4, 0 3465 jz exit_update_dec 3466 3467skip_in_out_check_update_dec: 3468%endif 3469 3470 GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC, multi_call 3471 3472exit_update_dec: 3473 FUNC_RESTORE 3474 3475 ret 3476 3477 3478;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3479;void aes_gcm_enc_128_finalize_avx_gen4 / aes_gcm_enc_192_finalize_avx_gen4 / 3480; aes_gcm_enc_256_finalize_avx_gen4 3481; (const struct gcm_key_data *key_data, 3482; struct gcm_context_data *context_data, 3483; u8 *auth_tag, 3484; u64 auth_tag_len); 3485;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3486MKGLOBAL(FN_NAME(enc,_finalize_),function,) 3487FN_NAME(enc,_finalize_): 3488 3489%ifdef SAFE_PARAM 3490 ;; Check key_data != NULL 3491 cmp arg1, 0 3492 jz exit_enc_fin 3493 3494 ;; Check context_data != NULL 3495 cmp arg2, 0 3496 jz exit_enc_fin 3497 3498 ;; Check auth_tag != NULL 3499 cmp arg3, 0 3500 jz exit_enc_fin 3501 3502 ;; Check auth_tag_len == 0 or > 16 3503 cmp arg4, 0 3504 jz exit_enc_fin 3505 3506 cmp arg4, 16 3507 ja exit_enc_fin 3508%endif 3509 push r12 3510 3511%ifidn __OUTPUT_FORMAT__, win64 3512 ; xmm6:xmm15 need to be maintained for Windows 3513 sub rsp, 7*16 3514 vmovdqu [rsp + 0*16], xmm6 3515 vmovdqu [rsp + 1*16], xmm9 3516 vmovdqu [rsp + 2*16], xmm10 3517 vmovdqu [rsp + 3*16], xmm11 3518 vmovdqu [rsp + 4*16], xmm13 3519 vmovdqu [rsp + 5*16], xmm14 3520 vmovdqu [rsp + 6*16], xmm15 3521%endif 3522 GCM_COMPLETE arg1, arg2, arg3, arg4, multi_call 3523 3524%ifdef SAFE_DATA 3525 clear_scratch_gps_asm 3526 clear_scratch_ymms_asm 3527%endif 3528%ifidn __OUTPUT_FORMAT__, win64 3529 vmovdqu xmm15, [rsp + 6*16] 3530 vmovdqu xmm14, [rsp + 5*16] 3531 vmovdqu xmm13, [rsp + 4*16] 3532 vmovdqu xmm11, [rsp + 3*16] 3533 vmovdqu xmm10, [rsp + 2*16] 3534 vmovdqu xmm9, [rsp + 1*16] 3535 vmovdqu xmm6, [rsp + 0*16] 3536 add rsp, 7*16 3537%endif 3538 pop r12 3539exit_enc_fin: 3540 3541ret 3542 3543 3544;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3545;void aes_gcm_dec_128_finalize_avx_gen4 / aes_gcm_dec_192_finalize_avx_gen4 3546; aes_gcm_dec_256_finalize_avx_gen4 3547; (const struct gcm_key_data *key_data, 3548; struct gcm_context_data *context_data, 3549; u8 *auth_tag, 3550; u64 auth_tag_len); 3551;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3552MKGLOBAL(FN_NAME(dec,_finalize_),function,) 3553FN_NAME(dec,_finalize_): 3554 3555%ifdef SAFE_PARAM 3556 ;; Check key_data != NULL 3557 cmp arg1, 0 3558 jz exit_dec_fin 3559 3560 ;; Check context_data != NULL 3561 cmp arg2, 0 3562 jz exit_dec_fin 3563 3564 ;; Check auth_tag != NULL 3565 cmp arg3, 0 3566 jz exit_dec_fin 3567 3568 ;; Check auth_tag_len == 0 or > 16 3569 cmp arg4, 0 3570 jz exit_dec_fin 3571 3572 cmp arg4, 16 3573 ja exit_dec_fin 3574%endif 3575 3576 push r12 3577 3578%ifidn __OUTPUT_FORMAT__, win64 3579 ; xmm6:xmm15 need to be maintained for Windows 3580 sub rsp, 7*16 3581 vmovdqu [rsp + 0*16], xmm6 3582 vmovdqu [rsp + 1*16], xmm9 3583 vmovdqu [rsp + 2*16], xmm10 3584 vmovdqu [rsp + 3*16], xmm11 3585 vmovdqu [rsp + 4*16], xmm13 3586 vmovdqu [rsp + 5*16], xmm14 3587 vmovdqu [rsp + 6*16], xmm15 3588%endif 3589 GCM_COMPLETE arg1, arg2, arg3, arg4, multi_call 3590 3591%ifdef SAFE_DATA 3592 clear_scratch_gps_asm 3593 clear_scratch_ymms_asm 3594%endif 3595%ifidn __OUTPUT_FORMAT__, win64 3596 vmovdqu xmm15, [rsp + 6*16] 3597 vmovdqu xmm14, [rsp + 5*16] 3598 vmovdqu xmm13, [rsp + 4*16] 3599 vmovdqu xmm11, [rsp + 3*16] 3600 vmovdqu xmm10, [rsp + 2*16] 3601 vmovdqu xmm9, [rsp + 1*16] 3602 vmovdqu xmm6, [rsp + 0*16] 3603 add rsp, 7*16 3604%endif 3605 3606 pop r12 3607 3608exit_dec_fin: 3609 ret 3610 3611 3612;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3613;void aes_gcm_enc_128_avx_gen4 / aes_gcm_enc_192_avx_gen4 / aes_gcm_enc_256_avx_gen4 3614; (const struct gcm_key_data *key_data, 3615; struct gcm_context_data *context_data, 3616; u8 *out, 3617; const u8 *in, 3618; u64 plaintext_len, 3619; u8 *iv, 3620; const u8 *aad, 3621; u64 aad_len, 3622; u8 *auth_tag, 3623; u64 auth_tag_len); 3624;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3625MKGLOBAL(FN_NAME(enc,_),function,) 3626FN_NAME(enc,_): 3627 3628 FUNC_SAVE 3629 3630%ifdef SAFE_PARAM 3631 ;; Check key_data != NULL 3632 cmp arg1, 0 3633 jz exit_enc 3634 3635 ;; Check context_data != NULL 3636 cmp arg2, 0 3637 jz exit_enc 3638 3639 ;; Check IV != NULL 3640 cmp arg6, 0 3641 jz exit_enc 3642 3643 ;; Check auth_tag != NULL 3644 cmp arg9, 0 3645 jz exit_enc 3646 3647 ;; Check auth_tag_len == 0 or > 16 3648 cmp arg10, 0 3649 jz exit_enc 3650 3651 cmp arg10, 16 3652 ja exit_enc 3653 3654 ;; Check if plaintext_len == 0 3655 cmp arg5, 0 3656 jz skip_in_out_check_enc 3657 3658 ;; Check out != NULL (plaintext_len != 0) 3659 cmp arg3, 0 3660 jz exit_enc 3661 3662 ;; Check in != NULL (plaintext_len != 0) 3663 cmp arg4, 0 3664 jz exit_enc 3665 3666skip_in_out_check_enc: 3667 ;; Check if aad_len == 0 3668 cmp arg8, 0 3669 jz skip_aad_check_enc 3670 3671 ;; Check aad != NULL (aad_len != 0) 3672 cmp arg7, 0 3673 jz exit_enc 3674 3675skip_aad_check_enc: 3676%endif 3677 GCM_INIT arg1, arg2, arg6, arg7, arg8 3678 3679 GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC, single_call 3680 3681 GCM_COMPLETE arg1, arg2, arg9, arg10, single_call 3682 3683exit_enc: 3684 FUNC_RESTORE 3685 3686 ret 3687 3688;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3689;void aes_gcm_dec_128_avx_gen4 / aes_gcm_dec_192_avx_gen4 / aes_gcm_dec_256_avx_gen4 3690; (const struct gcm_key_data *key_data, 3691; struct gcm_context_data *context_data, 3692; u8 *out, 3693; const u8 *in, 3694; u64 plaintext_len, 3695; u8 *iv, 3696; const u8 *aad, 3697; u64 aad_len, 3698; u8 *auth_tag, 3699; u64 auth_tag_len); 3700;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3701MKGLOBAL(FN_NAME(dec,_),function,) 3702FN_NAME(dec,_): 3703 3704 FUNC_SAVE 3705 3706%ifdef SAFE_PARAM 3707 ;; Check key_data != NULL 3708 cmp arg1, 0 3709 jz exit_dec 3710 3711 ;; Check context_data != NULL 3712 cmp arg2, 0 3713 jz exit_dec 3714 3715 ;; Check IV != NULL 3716 cmp arg6, 0 3717 jz exit_dec 3718 3719 ;; Check auth_tag != NULL 3720 cmp arg9, 0 3721 jz exit_dec 3722 3723 ;; Check auth_tag_len == 0 or > 16 3724 cmp arg10, 0 3725 jz exit_dec 3726 3727 cmp arg10, 16 3728 ja exit_dec 3729 3730 ;; Check if plaintext_len == 0 3731 cmp arg5, 0 3732 jz skip_in_out_check_dec 3733 3734 ;; Check out != NULL (plaintext_len != 0) 3735 cmp arg3, 0 3736 jz exit_dec 3737 3738 ;; Check in != NULL (plaintext_len != 0) 3739 cmp arg4, 0 3740 jz exit_dec 3741 3742skip_in_out_check_dec: 3743 ;; Check if aad_len == 0 3744 cmp arg8, 0 3745 jz skip_aad_check_dec 3746 3747 ;; Check aad != NULL (aad_len != 0) 3748 cmp arg7, 0 3749 jz exit_dec 3750 3751skip_aad_check_dec: 3752%endif 3753 GCM_INIT arg1, arg2, arg6, arg7, arg8 3754 3755 GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC, single_call 3756 3757 GCM_COMPLETE arg1, arg2, arg9, arg10, single_call 3758 3759exit_dec: 3760 FUNC_RESTORE 3761 3762 ret 3763 3764 3765;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3766;void aes_gcm_enc_var_iv_128_avx_gen4 / aes_gcm_enc_var_iv_192_avx_gen4 / 3767; aes_gcm_enc_var_iv_256_avx_gen4 3768; const struct gcm_key_data *key_data, 3769; struct gcm_context_data *context_data, 3770; u8 *out, 3771; const u8 *in, 3772; u64 plaintext_len, 3773; u8 *iv, 3774; const u64 iv_len, 3775; const u8 *aad, 3776; const u64 aad_len, 3777; u8 *auth_tag, 3778; const u64 auth_tag_len); 3779;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3780MKGLOBAL(FN_NAME(enc_var_iv,_),function,) 3781FN_NAME(enc_var_iv,_): 3782 3783 FUNC_SAVE 3784 3785%ifdef SAFE_PARAM 3786 ;; Check key_data != NULL 3787 cmp arg1, 0 3788 jz exit_enc_IV 3789 3790 ;; Check context_data != NULL 3791 cmp arg2, 0 3792 jz exit_enc_IV 3793 3794 ;; Check IV != NULL 3795 cmp arg6, 0 3796 jz exit_enc_IV 3797 3798 ;; Check IV len != 0 3799 cmp arg7, 0 3800 jz exit_enc_IV 3801 3802 ;; Check auth_tag != NULL 3803 cmp arg10, 0 3804 jz exit_enc_IV 3805 3806 ;; Check auth_tag_len == 0 or > 16 3807 cmp arg11, 0 3808 jz exit_enc_IV 3809 3810 cmp arg11, 16 3811 ja exit_enc_IV 3812 3813 ;; Check if plaintext_len == 0 3814 cmp arg5, 0 3815 jz skip_in_out_check_enc_IV 3816 3817 ;; Check out != NULL (plaintext_len != 0) 3818 cmp arg3, 0 3819 jz exit_enc_IV 3820 3821 ;; Check in != NULL (plaintext_len != 0) 3822 cmp arg4, 0 3823 jz exit_enc_IV 3824 3825skip_in_out_check_enc_IV: 3826 ;; Check if aad_len == 0 3827 cmp arg9, 0 3828 jz skip_aad_check_enc_IV 3829 3830 ;; Check aad != NULL (aad_len != 0) 3831 cmp arg8, 0 3832 jz exit_enc_IV 3833 3834skip_aad_check_enc_IV: 3835%endif 3836 cmp arg7, 12 3837 je iv_len_12_enc_IV 3838 3839 GCM_INIT arg1, arg2, arg6, arg8, arg9, arg7 3840 jmp skip_iv_len_12_enc_IV 3841 3842iv_len_12_enc_IV: 3843 GCM_INIT arg1, arg2, arg6, arg8, arg9 3844 3845skip_iv_len_12_enc_IV: 3846 GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC, single_call 3847 3848 GCM_COMPLETE arg1, arg2, arg10, arg11, single_call 3849 3850exit_enc_IV: 3851 FUNC_RESTORE 3852 3853 ret 3854 3855;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3856;void aes_gcm_dec_var_iv_128_avx_gen4 / aes_gcm_dec_var_iv_192_avx_gen4 / 3857; aes_gcm_dec_var_iv_256_avx_gen4 3858; const struct gcm_key_data *key_data, 3859; struct gcm_context_data *context_data, 3860; u8 *out, 3861; const u8 *in, 3862; u64 plaintext_len, 3863; u8 *iv, 3864; const u64 iv_len, 3865; const u8 *aad, 3866; const u64 aad_len, 3867; u8 *auth_tag, 3868; const u64 auth_tag_len); 3869;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3870MKGLOBAL(FN_NAME(dec_var_iv,_),function,) 3871FN_NAME(dec_var_iv,_): 3872 3873 FUNC_SAVE 3874 3875%ifdef SAFE_PARAM 3876 ;; Check key_data != NULL 3877 cmp arg1, 0 3878 jz exit_dec_IV 3879 3880 ;; Check context_data != NULL 3881 cmp arg2, 0 3882 jz exit_dec_IV 3883 3884 ;; Check IV != NULL 3885 cmp arg6, 0 3886 jz exit_dec_IV 3887 3888 ;; Check IV len != 0 3889 cmp arg7, 0 3890 jz exit_dec_IV 3891 3892 ;; Check auth_tag != NULL 3893 cmp arg10, 0 3894 jz exit_dec_IV 3895 3896 ;; Check auth_tag_len == 0 or > 16 3897 cmp arg11, 0 3898 jz exit_dec_IV 3899 3900 cmp arg11, 16 3901 ja exit_dec_IV 3902 3903 ;; Check if plaintext_len == 0 3904 cmp arg5, 0 3905 jz skip_in_out_check_dec_IV 3906 3907 ;; Check out != NULL (plaintext_len != 0) 3908 cmp arg3, 0 3909 jz exit_dec_IV 3910 3911 ;; Check in != NULL (plaintext_len != 0) 3912 cmp arg4, 0 3913 jz exit_dec_IV 3914 3915skip_in_out_check_dec_IV: 3916 ;; Check if aad_len == 0 3917 cmp arg9, 0 3918 jz skip_aad_check_dec_IV 3919 3920 ;; Check aad != NULL (aad_len != 0) 3921 cmp arg8, 0 3922 jz exit_dec_IV 3923 3924skip_aad_check_dec_IV: 3925%endif 3926 cmp arg7, 12 3927 je iv_len_12_dec_IV 3928 3929 GCM_INIT arg1, arg2, arg6, arg8, arg9, arg7 3930 jmp skip_iv_len_12_dec_IV 3931 3932iv_len_12_dec_IV: 3933 GCM_INIT arg1, arg2, arg6, arg8, arg9 3934 3935skip_iv_len_12_dec_IV: 3936 GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC, single_call 3937 3938 GCM_COMPLETE arg1, arg2, arg10, arg11, single_call 3939 3940 3941exit_dec_IV: 3942 FUNC_RESTORE 3943 3944 ret 3945 3946%ifdef GCM128_MODE 3947;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3948;void ghash_avx_gen4 3949; const struct gcm_key_data *key_data, 3950; const void *in, 3951; const u64 in_len, 3952; void *tag, 3953; const u64 tag_len); 3954;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3955MKGLOBAL(ghash_avx_gen4,function,) 3956ghash_avx_gen4: 3957 3958 FUNC_SAVE 3959 3960%ifdef SAFE_PARAM 3961 ;; Check key_data != NULL 3962 cmp arg1, 0 3963 jz exit_ghash 3964 3965 ;; Check in != NULL 3966 cmp arg2, 0 3967 jz exit_ghash 3968 3969 ;; Check in_len != 0 3970 cmp arg3, 0 3971 jz exit_ghash 3972 3973 ;; Check tag != NULL 3974 cmp arg4, 0 3975 jz exit_ghash 3976 3977 ;; Check tag_len != 0 3978 cmp arg5, 0 3979 jz exit_ghash 3980%endif 3981 3982 vpxor xmm0, xmm0 3983 CALC_AAD_HASH arg2, arg3, xmm0, arg1, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, \ 3984 r10, r11, r12, r13, rax 3985 3986 vpshufb xmm0, [rel SHUF_MASK] ; perform a 16Byte swap 3987 3988 simd_store_avx arg4, xmm0, arg5, r12, rax 3989 3990exit_ghash: 3991 FUNC_RESTORE 3992 3993 ret 3994%endif 3995 3996;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3997; PARTIAL_BLOCK_GMAC: Handles the tag partial blocks between update calls. 3998; Requires the input data be at least 1 byte long. 3999; Input: gcm_key_data (GDATA_KEY), gcm_context_data (GDATA_CTX), input text (PLAIN_IN), 4000; input text length (PLAIN_LEN), hash subkey (HASH_SUBKEY). 4001; Output: Updated GDATA_CTX 4002; Clobbers rax, r10, r12, r13, r15, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm9, xmm10, xmm11, xmm13 4003;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 4004%macro PARTIAL_BLOCK_GMAC 7 4005%define %%GDATA_KEY %1 4006%define %%GDATA_CTX %2 4007%define %%PLAIN_IN %3 4008%define %%PLAIN_LEN %4 4009%define %%DATA_OFFSET %5 4010%define %%AAD_HASH %6 4011%define %%HASH_SUBKEY %7 4012 4013 mov r13, [%%GDATA_CTX + PBlockLen] 4014 cmp r13, 0 4015 ; Leave Macro if no partial blocks 4016 je %%_partial_block_done 4017 4018 ; Read in input data without over reading 4019 cmp %%PLAIN_LEN, 16 4020 jl %%_fewer_than_16_bytes 4021 ; If more than 16 bytes of data, just fill the xmm register 4022 VXLDR xmm1, [%%PLAIN_IN] 4023 jmp %%_data_read 4024 4025%%_fewer_than_16_bytes: 4026 lea r10, [%%PLAIN_IN] 4027 READ_SMALL_DATA_INPUT xmm1, r10, %%PLAIN_LEN, rax, r12, r15 4028 4029 ; Finished reading in data 4030%%_data_read: 4031 4032 lea r12, [rel SHIFT_MASK] 4033 ; Adjust the shuffle mask pointer to be able to shift r13 bytes 4034 ; (16-r13 is the number of bytes in plaintext mod 16) 4035 add r12, r13 4036 ; Get the appropriate shuffle mask 4037 vmovdqu xmm2, [r12] 4038 vmovdqa xmm3, xmm1 4039 4040 mov r15, %%PLAIN_LEN 4041 add r15, r13 4042 ; Set r15 to be the amount of data left in PLAIN_IN after filling the block 4043 sub r15, 16 4044 ; Determine if partial block is not being filled and shift mask accordingly 4045 jge %%_no_extra_mask_1 4046 sub r12, r15 4047%%_no_extra_mask_1: 4048 4049 ; Get the appropriate mask to mask out bottom r13 bytes of xmm3 4050 vmovdqu xmm1, [r12 + ALL_F-SHIFT_MASK] 4051 4052 vpand xmm3, xmm1 4053 vpshufb xmm3, [rel SHUF_MASK] 4054 vpshufb xmm3, xmm2 4055 vpxor %%AAD_HASH, xmm3 4056 4057 cmp r15,0 4058 jl %%_partial_incomplete_1 4059 4060 ; GHASH computation for the last <16 Byte block 4061 GHASH_MUL %%AAD_HASH, %%HASH_SUBKEY, xmm0, xmm10, xmm11, xmm5, xmm6 4062 xor rax, rax 4063 mov [%%GDATA_CTX + PBlockLen], rax 4064 jmp %%_ghash_done 4065%%_partial_incomplete_1: 4066%ifidn __OUTPUT_FORMAT__, win64 4067 mov rax, %%PLAIN_LEN 4068 add [%%GDATA_CTX + PBlockLen], rax 4069%else 4070 add [%%GDATA_CTX + PBlockLen], %%PLAIN_LEN 4071%endif 4072%%_ghash_done: 4073 vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH 4074 4075 cmp r15, 0 4076 jl %%_partial_fill 4077 4078 mov r12, 16 4079 ; Set r12 to be the number of bytes to skip after this macro 4080 sub r12, r13 4081 4082 jmp %%offset_set 4083%%_partial_fill: 4084 mov r12, %%PLAIN_LEN 4085%%offset_set: 4086 mov %%DATA_OFFSET, r12 4087%%_partial_block_done: 4088%endmacro ; PARTIAL_BLOCK_GMAC 4089 4090;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 4091;void imb_aes_gmac_update_128_avx_gen4 / imb_aes_gmac_update_192_avx_gen4 / 4092; imb_aes_gmac_update_256_avx_gen4 4093; const struct gcm_key_data *key_data, 4094; struct gcm_context_data *context_data, 4095; const u8 *in, 4096; const u64 plaintext_len); 4097;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 4098MKGLOBAL(GMAC_FN_NAME(update),function,) 4099GMAC_FN_NAME(update): 4100 4101 FUNC_SAVE 4102 4103 ;; Check if plaintext_len == 0 4104 cmp arg4, 0 4105 je exit_gmac_update 4106 4107%ifdef SAFE_PARAM 4108 ;; Check key_data != NULL 4109 cmp arg1, 0 4110 jz exit_gmac_update 4111 4112 ;; Check context_data != NULL 4113 cmp arg2, 0 4114 jz exit_gmac_update 4115 4116 ;; Check in != NULL (plaintext_len != 0) 4117 cmp arg3, 0 4118 jz exit_gmac_update 4119%endif 4120 4121 ; Increment size of "AAD length" for GMAC 4122 add [arg2 + AadLen], arg4 4123 4124 ;; Deal with previous partial block 4125 xor r11, r11 4126 vmovdqu xmm13, [arg1 + HashKey] 4127 vmovdqu xmm8, [arg2 + AadHash] 4128 4129 PARTIAL_BLOCK_GMAC arg1, arg2, arg3, arg4, r11, xmm8, xmm13 4130 4131 ; CALC_AAD_HASH needs to deal with multiple of 16 bytes 4132 sub arg4, r11 4133 add arg3, r11 4134 4135 vmovq xmm7, arg4 ; Save remaining length 4136 and arg4, -16 ; Get multiple of 16 bytes 4137 4138 or arg4, arg4 4139 jz no_full_blocks 4140 4141 ;; Calculate GHASH of this segment 4142 CALC_AAD_HASH arg3, arg4, xmm8, arg1, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, \ 4143 r10, r11, r12, r13, rax 4144 vmovdqu [arg2 + AadHash], xmm8 ; ctx_data.aad hash = aad_hash 4145 4146no_full_blocks: 4147 add arg3, arg4 ; Point at partial block 4148 4149 vmovq arg4, xmm7 ; Restore original remaining length 4150 and arg4, 15 4151 jz exit_gmac_update 4152 4153 ; Save next partial block 4154 mov [arg2 + PBlockLen], arg4 4155 READ_SMALL_DATA_INPUT xmm1, arg3, arg4, r11, r12, r13 4156 vpshufb xmm1, [rel SHUF_MASK] 4157 vpxor xmm8, xmm1 4158 vmovdqu [arg2 + AadHash], xmm8 4159 4160exit_gmac_update: 4161 FUNC_RESTORE 4162 4163 ret 4164 4165%ifdef LINUX 4166section .note.GNU-stack noalloc noexec nowrite progbits 4167%endif 4168