;;
;; Copyright (c) 2020, Intel Corporation
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are met:
;;
;;     * Redistributions of source code must retain the above copyright notice,
;;       this list of conditions and the following disclaimer.
;;     * Redistributions in binary form must reproduce the above copyright
;;       notice, this list of conditions and the following disclaimer in the
;;       documentation and/or other materials provided with the distribution.
;;     * Neither the name of Intel Corporation nor the names of its contributors
;;       may be used to endorse or promote products derived from this software
;;       without specific prior written permission.
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;

%include "include/os.asm"
%include "include/reg_sizes.asm"
%include "include/zuc_sbox.inc"
%include "include/transpose_avx2.asm"
%include "include/memcpy.asm"
%include "mb_mgr_datastruct.asm"

%define APPEND(a,b) a %+ b

section .data
default rel

align 32
Ek_d:
dd 0x0044D700, 0x0026BC00, 0x00626B00, 0x00135E00, 0x00578900, 0x0035E200, 0x00713500, 0x0009AF00
dd 0x004D7800, 0x002F1300, 0x006BC400, 0x001AF100, 0x005E2600, 0x003C4D00, 0x00789A00, 0x0047AC00

align 32
shuf_mask_key:
dd 0x00FFFFFF, 0x01FFFFFF, 0x02FFFFFF, 0x03FFFFFF, 0x04FFFFFF, 0x05FFFFFF, 0x06FFFFFF, 0x07FFFFFF
dd 0x08FFFFFF, 0x09FFFFFF, 0x0AFFFFFF, 0x0BFFFFFF, 0x0CFFFFFF, 0x0DFFFFFF, 0x0EFFFFFF, 0x0FFFFFFF

align 32
shuf_mask_iv:
dd 0xFFFFFF00, 0xFFFFFF01, 0xFFFFFF02, 0xFFFFFF03, 0xFFFFFF04, 0xFFFFFF05, 0xFFFFFF06, 0xFFFFFF07
dd 0xFFFFFF08, 0xFFFFFF09, 0xFFFFFF0A, 0xFFFFFF0B, 0xFFFFFF0C, 0xFFFFFF0D, 0xFFFFFF0E, 0xFFFFFF0F

align 32
mask31:
dd 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF
dd 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF

align 32
swap_mask:
db 0x03, 0x02, 0x01, 0x00, 0x07, 0x06, 0x05, 0x04
db 0x0b, 0x0a, 0x09, 0x08, 0x0f, 0x0e, 0x0d, 0x0c
db 0x03, 0x02, 0x01, 0x00, 0x07, 0x06, 0x05, 0x04
db 0x0b, 0x0a, 0x09, 0x08, 0x0f, 0x0e, 0x0d, 0x0c

align 32
S1_S0_shuf:
db 0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E, 0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F
db 0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E, 0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F

align 32
S0_S1_shuf:
db 0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F, 0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E
db 0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F, 0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E

align 32
rev_S1_S0_shuf:
db 0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B, 0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F
db 0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B, 0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F

align 32
rev_S0_S1_shuf:
db 0x08, 0x00, 0x09, 0x01, 0x0A, 0x02, 0x0B, 0x03, 0x0C, 0x04, 0x0D, 0x05, 0x0E, 0x06, 0x0F, 0x07
db 0x08, 0x00, 0x09, 0x01, 0x0A, 0x02, 0x0B, 0x03, 0x0C, 0x04, 0x0D, 0x05, 0x0E, 0x06, 0x0F, 0x07
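
; Note on the S-box shuffle constants above: ZUC's 32-bit S-box applies S0 to
; two bytes and S1 to the other two bytes of every word. S1_S0_shuf/S0_S1_shuf
; compress the even and odd bytes of each 128-bit lane into separate 8-byte
; halves, so that all S0 inputs can be gathered into one YMM register and all
; S1 inputs into another (see nonlin_fun8 below); rev_S1_S0_shuf/rev_S0_S1_shuf
; restore the original byte order after the S-box computation.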

align 32
rot8_mod32:
db 0x03, 0x00, 0x01, 0x02, 0x07, 0x04, 0x05, 0x06
db 0x0B, 0x08, 0x09, 0x0A, 0x0F, 0x0C, 0x0D, 0x0E
db 0x03, 0x00, 0x01, 0x02, 0x07, 0x04, 0x05, 0x06
db 0x0B, 0x08, 0x09, 0x0A, 0x0F, 0x0C, 0x0D, 0x0E

align 32
rot16_mod32:
db 0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05
db 0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D
db 0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05
db 0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D

align 32
rot24_mod32:
db 0x01, 0x02, 0x03, 0x00, 0x05, 0x06, 0x07, 0x04
db 0x09, 0x0A, 0x0B, 0x08, 0x0D, 0x0E, 0x0F, 0x0C
db 0x01, 0x02, 0x03, 0x00, 0x05, 0x06, 0x07, 0x04
db 0x09, 0x0A, 0x0B, 0x08, 0x0D, 0x0E, 0x0F, 0x0C

align 16
broadcast_word:
db 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01
db 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01

align 16
all_ffs:
dw 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff

align 16
all_threes:
dw 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003

align 16
all_fffcs:
dw 0xfffc, 0xfffc, 0xfffc, 0xfffc, 0xfffc, 0xfffc, 0xfffc, 0xfffc

align 16
all_1fs:
dw 0x001f, 0x001f, 0x001f, 0x001f, 0x001f, 0x001f, 0x001f, 0x001f

align 16
all_20s:
dw 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020

section .text
align 64

%define MASK31 ymm12

%define OFS_R1 (16*(2*16))
%define OFS_R2 (OFS_R1 + (2*16))
%define OFS_X0 (OFS_R2 + (2*16))
%define OFS_X1 (OFS_X0 + (2*16))
%define OFS_X2 (OFS_X1 + (2*16))

%ifidn __OUTPUT_FORMAT__, win64
        %define XMM_STORAGE 16*10
        %define GP_STORAGE  8*8
%else
        %define XMM_STORAGE 0
        %define GP_STORAGE  6*8
%endif

%define VARIABLE_OFFSET XMM_STORAGE + GP_STORAGE
%define GP_OFFSET XMM_STORAGE

%macro FUNC_SAVE 0
        mov     r11, rsp
        sub     rsp, VARIABLE_OFFSET
        and     rsp, ~15

%ifidn __OUTPUT_FORMAT__, win64
        ; xmm6:xmm15 need to be maintained for Windows
        vmovdqa [rsp + 0*16], xmm6
        vmovdqa [rsp + 1*16], xmm7
        vmovdqa [rsp + 2*16], xmm8
        vmovdqa [rsp + 3*16], xmm9
        vmovdqa [rsp + 4*16], xmm10
        vmovdqa [rsp + 5*16], xmm11
        vmovdqa [rsp + 6*16], xmm12
        vmovdqa [rsp + 7*16], xmm13
        vmovdqa [rsp + 8*16], xmm14
        vmovdqa [rsp + 9*16], xmm15
        mov     [rsp + GP_OFFSET + 48], rdi
        mov     [rsp + GP_OFFSET + 56], rsi
%endif
        mov     [rsp + GP_OFFSET], r12
        mov     [rsp + GP_OFFSET + 8], r13
        mov     [rsp + GP_OFFSET + 16], r14
        mov     [rsp + GP_OFFSET + 24], r15
        mov     [rsp + GP_OFFSET + 32], rbx
        mov     [rsp + GP_OFFSET + 40], r11 ;; rsp pointer
%endmacro

%macro FUNC_RESTORE 0

%ifidn __OUTPUT_FORMAT__, win64
        vmovdqa xmm6,  [rsp + 0*16]
        vmovdqa xmm7,  [rsp + 1*16]
        vmovdqa xmm8,  [rsp + 2*16]
        vmovdqa xmm9,  [rsp + 3*16]
        vmovdqa xmm10, [rsp + 4*16]
        vmovdqa xmm11, [rsp + 5*16]
        vmovdqa xmm12, [rsp + 6*16]
        vmovdqa xmm13, [rsp + 7*16]
        vmovdqa xmm14, [rsp + 8*16]
        vmovdqa xmm15, [rsp + 9*16]
        mov     rdi, [rsp + GP_OFFSET + 48]
        mov     rsi, [rsp + GP_OFFSET + 56]
%endif
        mov     r12, [rsp + GP_OFFSET]
        mov     r13, [rsp + GP_OFFSET + 8]
        mov     r14, [rsp + GP_OFFSET + 16]
        mov     r15, [rsp + GP_OFFSET + 24]
        mov     rbx, [rsp + GP_OFFSET + 32]
        mov     rsp, [rsp + GP_OFFSET + 40]
%endmacro
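
; Stack frame used by FUNC_SAVE/FUNC_RESTORE (offsets from the 16-byte
; aligned rsp):
;       [rsp + 0*16 .. 9*16]      xmm6-xmm15 (win64 only)
;       [rsp + GP_OFFSET + 0]     r12
;       [rsp + GP_OFFSET + 8]     r13
;       [rsp + GP_OFFSET + 16]    r14
;       [rsp + GP_OFFSET + 24]    r15
;       [rsp + GP_OFFSET + 32]    rbx
;       [rsp + GP_OFFSET + 40]    rsp value before FUNC_SAVE
;       [rsp + GP_OFFSET + 48]    rdi (win64 only)
;       [rsp + GP_OFFSET + 56]    rsi (win64 only)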

; This macro reorders the LFSR registers
; after N rounds (1 <= N <= 15), since the registers
; are shifted every round
;
; The macro clobbers YMM0-15
;
%macro REORDER_LFSR 2
%define %%STATE      %1
%define %%NUM_ROUNDS %2

%if %%NUM_ROUNDS != 16
%assign i 0
%rep 16
        vmovdqa APPEND(ymm,i), [%%STATE + 32*i]
%assign i (i+1)
%endrep

%assign i 0
%assign j %%NUM_ROUNDS
%rep 16
        vmovdqa [%%STATE + 32*i], APPEND(ymm,j)
%assign i (i+1)
%assign j ((j+1) % 16)
%endrep
%endif ;; %%NUM_ROUNDS != 16

%endmacro

;;
;; make_u31()
;;
%macro make_u31 4

%define %%Rt %1
%define %%Ke %2
%define %%Ek %3
%define %%Iv %4
        xor     %%Rt, %%Rt
        shrd    %%Rt, %%Iv, 8
        shrd    %%Rt, %%Ek, 15
        shrd    %%Rt, %%Ke, 9
%endmacro

;
; bits_reorg8()
;
%macro bits_reorg8 2-3
%define %%STATE     %1 ; [in] ZUC state
%define %%ROUND_NUM %2 ; [in] Round number
%define %%X3        %3 ; [out] YMM register containing X3 of all lanes
        ;
        ; ymm15 = LFSR_S15
        ; ymm14 = LFSR_S14
        ; ymm11 = LFSR_S11
        ; ymm9  = LFSR_S9
        ; ymm7  = LFSR_S7
        ; ymm5  = LFSR_S5
        ; ymm2  = LFSR_S2
        ; ymm0  = LFSR_S0
        ;
        vmovdqa ymm15, [%%STATE + ((15 + %%ROUND_NUM) % 16)*32]
        vmovdqa ymm14, [%%STATE + ((14 + %%ROUND_NUM) % 16)*32]
        vmovdqa ymm11, [%%STATE + ((11 + %%ROUND_NUM) % 16)*32]
        vmovdqa ymm9,  [%%STATE + (( 9 + %%ROUND_NUM) % 16)*32]
        vmovdqa ymm7,  [%%STATE + (( 7 + %%ROUND_NUM) % 16)*32]
        vmovdqa ymm5,  [%%STATE + (( 5 + %%ROUND_NUM) % 16)*32]
        vmovdqa ymm2,  [%%STATE + (( 2 + %%ROUND_NUM) % 16)*32]
        vmovdqa ymm0,  [%%STATE + (( 0 + %%ROUND_NUM) % 16)*32]

        vpxor   ymm1, ymm1
        vpslld  ymm15, 1
        vpblendw ymm3, ymm14, ymm1, 0xAA
        vpblendw ymm15, ymm3, ymm15, 0xAA

        vmovdqa [%%STATE + OFS_X0], ymm15 ; BRC_X0
        vpslld  ymm11, 16
        vpsrld  ymm9, 15
        vpor    ymm11, ymm9
        vmovdqa [%%STATE + OFS_X1], ymm11 ; BRC_X1
        vpslld  ymm7, 16
        vpsrld  ymm5, 15
        vpor    ymm7, ymm5
        vmovdqa [%%STATE + OFS_X2], ymm7 ; BRC_X2
%if (%0 == 3)
        vpslld  ymm2, 16
        vpsrld  ymm0, 15
        vpor    %%X3, ymm2, ymm0 ; Store BRC_X3 in YMM register
%endif
%endmacro

;
; rot_mod32()
;
; uses ymm7
;
%macro rot_mod32 3
%if (%3 == 8)
        vpshufb %1, %2, [rel rot8_mod32]
%elif (%3 == 16)
        vpshufb %1, %2, [rel rot16_mod32]
%elif (%3 == 24)
        vpshufb %1, %2, [rel rot24_mod32]
%else
        vpslld  %1, %2, %3
        vpsrld  ymm7, %2, (32 - %3)

        vpor    %1, ymm7
%endif
%endmacro
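
; For reference, nonlin_fun8 below evaluates the ZUC nonlinear function F
; for 8 lanes at once. Per lane (all operations on 32-bit words):
;       W  = (BRC_X0 ^ F_R1) + F_R2
;       W1 = F_R1 + BRC_X1
;       W2 = F_R2 ^ BRC_X2
;       F_R1 = S(L1(W1_low16 || W2_high16))
;       F_R2 = S(L2(W2_low16 || W1_high16))
; where L1(X) = X ^ rot(X,2) ^ rot(X,10) ^ rot(X,18) ^ rot(X,24),
;       L2(X) = X ^ rot(X,8) ^ rot(X,14) ^ rot(X,22) ^ rot(X,30),
; and S applies the S0/S1 S-boxes to alternating bytes of its input.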

;
; nonlin_fun8()
;
; return
;   W value, updates F_R1[] / F_R2[]
;
%macro nonlin_fun8 1-2
%define %%STATE %1 ; [in] ZUC state
%define %%W     %2 ; [out] YMM register to contain W for all lanes

%if (%0 == 2)
        vmovdqa %%W, [%%STATE + OFS_X0]
        vpxor   %%W, [%%STATE + OFS_R1]
        vpaddd  %%W, [%%STATE + OFS_R2] ; W = (BRC_X0 ^ F_R1) + F_R2
%endif

        vmovdqa ymm1, [%%STATE + OFS_R1]
        vmovdqa ymm2, [%%STATE + OFS_R2]
        vpaddd  ymm1, [%%STATE + OFS_X1] ; W1 = F_R1 + BRC_X1
        vpxor   ymm2, [%%STATE + OFS_X2] ; W2 = F_R2 ^ BRC_X2

        vpslld  ymm3, ymm1, 16
        vpsrld  ymm4, ymm1, 16
        vpslld  ymm5, ymm2, 16
        vpsrld  ymm6, ymm2, 16
        vpor    ymm1, ymm3, ymm6
        vpor    ymm2, ymm4, ymm5

        rot_mod32 ymm3, ymm1, 2
        rot_mod32 ymm4, ymm1, 10
        rot_mod32 ymm5, ymm1, 18
        rot_mod32 ymm6, ymm1, 24
        vpxor   ymm1, ymm3
        vpxor   ymm1, ymm4
        vpxor   ymm1, ymm5
        vpxor   ymm1, ymm6 ; YMM1 = U = L1(P)

        rot_mod32 ymm3, ymm2, 8
        rot_mod32 ymm4, ymm2, 14
        rot_mod32 ymm5, ymm2, 22
        rot_mod32 ymm6, ymm2, 30
        vpxor   ymm2, ymm3
        vpxor   ymm2, ymm4
        vpxor   ymm2, ymm5
        vpxor   ymm2, ymm6 ; YMM2 = V = L2(Q)

        ; Shuffle U and V to have all S0 lookups in YMM1 and all S1 lookups in YMM2

        ; Compress all S0 and S1 input values in each register
        vpshufb ymm1, [rel S0_S1_shuf] ; S0: Bytes 0-7,16-23 S1: Bytes 8-15,24-31
        vpshufb ymm2, [rel S1_S0_shuf] ; S1: Bytes 0-7,16-23 S0: Bytes 8-15,24-31

        vshufpd ymm3, ymm1, ymm2, 0xA ; All S0 input values
        vshufpd ymm4, ymm2, ymm1, 0xA ; All S1 input values

        ; Compute S0 and S1 values
        S0_comput_AVX2 ymm3, ymm1, ymm2
        S1_comput_AVX2 ymm4, ymm1, ymm2, ymm5

        ; Need to shuffle back ymm1 & ymm2 before storing output
        ; (revert what was done before S0 and S1 computations)
        vshufpd ymm1, ymm3, ymm4, 0xA
        vshufpd ymm2, ymm4, ymm3, 0xA

        vpshufb ymm1, [rel rev_S0_S1_shuf]
        vpshufb ymm2, [rel rev_S1_S0_shuf]

        vmovdqa [%%STATE + OFS_R1], ymm1
        vmovdqa [%%STATE + OFS_R2], ymm2
%endmacro

;
; store32B_kstr8()
;
%macro store32B_kstr8 8
%define %%DATA32B_L0 %1 ; [in] 32 bytes of keystream for lane 0
%define %%DATA32B_L1 %2 ; [in] 32 bytes of keystream for lane 1
%define %%DATA32B_L2 %3 ; [in] 32 bytes of keystream for lane 2
%define %%DATA32B_L3 %4 ; [in] 32 bytes of keystream for lane 3
%define %%DATA32B_L4 %5 ; [in] 32 bytes of keystream for lane 4
%define %%DATA32B_L5 %6 ; [in] 32 bytes of keystream for lane 5
%define %%DATA32B_L6 %7 ; [in] 32 bytes of keystream for lane 6
%define %%DATA32B_L7 %8 ; [in] 32 bytes of keystream for lane 7

        mov     rcx, [rsp]
        mov     rdx, [rsp + 8]
        mov     r8,  [rsp + 16]
        mov     r9,  [rsp + 24]
        vmovdqu [rcx], %%DATA32B_L0
        vmovdqu [rdx], %%DATA32B_L1
        vmovdqu [r8],  %%DATA32B_L2
        vmovdqu [r9],  %%DATA32B_L3

        mov     rcx, [rsp + 32]
        mov     rdx, [rsp + 40]
        mov     r8,  [rsp + 48]
        mov     r9,  [rsp + 56]
        vmovdqu [rcx], %%DATA32B_L4
        vmovdqu [rdx], %%DATA32B_L5
        vmovdqu [r8],  %%DATA32B_L6
        vmovdqu [r9],  %%DATA32B_L7

%endmacro

;
; store4B_kstr8()
;
; params
;
; %1 - YMM register with OFS_X3
; return
;
%macro store4B_kstr8 1
        mov     rcx, [rsp]
        mov     rdx, [rsp + 8]
        mov     r8,  [rsp + 16]
        mov     r9,  [rsp + 24]
        vpextrd [r9],  XWORD(%1), 3
        vpextrd [r8],  XWORD(%1), 2
        vpextrd [rdx], XWORD(%1), 1
        vmovd   [rcx], XWORD(%1)
        add     rcx, 4
        add     rdx, 4
        add     r8,  4
        add     r9,  4
        mov     [rsp],      rcx
        mov     [rsp + 8],  rdx
        mov     [rsp + 16], r8
        mov     [rsp + 24], r9

        vextracti128 XWORD(%1), %1, 1
        mov     rcx, [rsp + 32]
        mov     rdx, [rsp + 40]
        mov     r8,  [rsp + 48]
        mov     r9,  [rsp + 56]
        vpextrd [r9],  XWORD(%1), 3
        vpextrd [r8],  XWORD(%1), 2
        vpextrd [rdx], XWORD(%1), 1
        vmovd   [rcx], XWORD(%1)
        add     rcx, 4
        add     rdx, 4
        add     r8,  4
        add     r9,  4
        mov     [rsp + 32], rcx
        mov     [rsp + 40], rdx
        mov     [rsp + 48], r8
        mov     [rsp + 56], r9

%endmacro

;
; add_mod31()
;    add two 32-bit args and reduce mod (2^31-1)
; params
;    %1 - arg1/res
;    %2 - arg2
; uses
;    ymm2
; return
;    %1
%macro add_mod31 2
        vpaddd  %1, %2
        vpsrld  ymm2, %1, 31
        vpand   %1, MASK31
        vpaddd  %1, ymm2
%endmacro
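
; Roughly equivalent C for add_mod31, per 32-bit lane (both inputs < 2^31):
;       t = a + b;
;       t = (t & 0x7FFFFFFF) + (t >> 31); /* fold the carry back in */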

;
; rot_mod31()
;    rotate (mult by pow of 2) 32-bit arg and reduce mod (2^31-1)
; params
;    %1 - arg
;    %2 - # of bits
; uses
;    ymm2
; return
;    %1
%macro rot_mod31 2

        vpslld  ymm2, %1, %2
        vpsrld  %1, %1, (31 - %2)

        vpor    %1, ymm2
        vpand   %1, MASK31
%endmacro

;
; lfsr_updt8()
;
;
%macro lfsr_updt8 3
%define %%STATE     %1 ; [in] ZUC state
%define %%ROUND_NUM %2 ; [in] Round number
%define %%W         %3 ; [in/clobbered] YMM register to contain W for all lanes
        ;
        ; ymm1  = LFSR_S0
        ; ymm4  = LFSR_S4
        ; ymm10 = LFSR_S10
        ; ymm13 = LFSR_S13
        ; ymm15 = LFSR_S15
        ;
        vmovdqa ymm1,  [%%STATE + (( 0 + %%ROUND_NUM) % 16)*32]
        vmovdqa ymm4,  [%%STATE + (( 4 + %%ROUND_NUM) % 16)*32]
        vmovdqa ymm10, [%%STATE + ((10 + %%ROUND_NUM) % 16)*32]
        vmovdqa ymm13, [%%STATE + ((13 + %%ROUND_NUM) % 16)*32]
        vmovdqa ymm15, [%%STATE + ((15 + %%ROUND_NUM) % 16)*32]

        ; Calculate LFSR feedback
        add_mod31 %%W, ymm1
        rot_mod31 ymm1, 8
        add_mod31 %%W, ymm1
        rot_mod31 ymm4, 20
        add_mod31 %%W, ymm4
        rot_mod31 ymm10, 21
        add_mod31 %%W, ymm10
        rot_mod31 ymm13, 17
        add_mod31 %%W, ymm13
        rot_mod31 ymm15, 15
        add_mod31 %%W, ymm15

        vmovdqa [%%STATE + (( 0 + %%ROUND_NUM) % 16)*32], %%W

        ; The new LFSR_S16 (which becomes LFSR_S15 after the shift) is written
        ; into the slot of the expired LFSR_S0, as the registers are handled
        ; as a circular buffer
%endmacro

;
; Initialize LFSR registers for a single lane
;
; This macro initializes 8 LFSR registers at a time,
; so it needs to be called twice.
;
; From the spec, the s_i (LFSR) registers need to be loaded as follows:
;
; For 0 <= i <= 15, let s_i = k_i || d_i || iv_i,
; where k_i is each byte of the key, d_i is a 15-bit constant
; and iv_i is each byte of the IV.
;
%macro INIT_LFSR 7
%define %%KEY      %1 ;; [in] Key pointer
%define %%IV       %2 ;; [in] IV pointer
%define %%SHUF_KEY %3 ;; [in] Shuffle mask for the key bytes
%define %%SHUF_IV  %4 ;; [in] Shuffle mask for the IV bytes
%define %%EKD_MASK %5 ;; [in] Ek_d constants (d_i)
%define %%LFSR     %6 ;; [out] YMM register to contain initialized LFSR regs
%define %%YTMP     %7 ;; [clobbered] YMM temporary register

        vbroadcastf128 %%LFSR, [%%KEY]
        vbroadcastf128 %%YTMP, [%%IV]
        vpshufb %%LFSR, %%SHUF_KEY
        vpsrld  %%LFSR, 1
        vpshufb %%YTMP, %%SHUF_IV
        vpor    %%LFSR, %%YTMP
        vpor    %%LFSR, %%EKD_MASK

%endmacro

MKGLOBAL(asm_ZucInitialization_8_avx2,function,internal)
asm_ZucInitialization_8_avx2:

%ifdef LINUX
        %define pKe    rdi
        %define pIv    rsi
        %define pState rdx
%else
        %define pKe    rcx
        %define pIv    rdx
        %define pState r8
%endif

        FUNC_SAVE

        ; Zero out R1/R2 (only lower half is used)
        vpxor   ymm0, ymm0
%assign I 0
%rep 2
        vmovdqa [pState + OFS_R1 + I*32], ymm0
%assign I (I + 1)
%endrep

        ;;; Initialize all LFSR registers in two steps:
        ;;; first, registers 0-7, then registers 8-15

%assign off 0
%rep 2
        ; Set read-only registers for shuffle masks for key, IV and Ek_d for 8 registers
        vmovdqa ymm13, [rel shuf_mask_key + off]
        vmovdqa ymm14, [rel shuf_mask_iv + off]
        vmovdqa ymm15, [rel Ek_d + off]

        ; Set 8xLFSR registers for all packets
%assign idx 0
%rep 8
        mov     r9,  [pKe + 8*idx] ; Load Key N pointer
        mov     r10, [pIv + 8*idx] ; Load IV N pointer
        INIT_LFSR r9, r10, ymm13, ymm14, ymm15, APPEND(ymm, idx), ymm12
%assign idx (idx + 1)
%endrep

        ; Store 8xLFSR registers in memory (reordering first,
        ; so all SX registers are together)
        TRANSPOSE8_U32 ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm8, ymm9

%assign i 0
%rep 8
        vmovdqa [pState + 8*off + 32*i], APPEND(ymm, i)
%assign i (i+1)
%endrep

%assign off (off + 32)
%endrep
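
        ; At this point the LFSR state is stored transposed: each of the
        ; 16 LFSR registers occupies 32 bytes (one 32-bit word per lane),
        ; so LFSR_S(i) for all 8 lanes lives at [pState + 32*i]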
        ; Load read-only registers
        vmovdqa ymm12, [rel mask31]

        mov     rax, pState

        ; Shift LFSR 32-times, update state variables
%assign N 0
%rep 32
        bits_reorg8 rax, N
        nonlin_fun8 rax, ymm0
        vpsrld  ymm0, 1         ; Shift out LSB of W
        lfsr_updt8 rax, N, ymm0 ; W (ymm0) used in LFSR update - not set to zero
%assign N N+1
%endrep

        ; And once more, the 33rd round (first round of the keygen phase),
        ; with the nonlinear function output discarded
        bits_reorg8 rax, 0
        nonlin_fun8 rax

        vpxor   ymm0, ymm0
        lfsr_updt8 rax, 0, ymm0

        FUNC_RESTORE

        ret

;
; Generate N*4 bytes of keystream
; for 8 buffers (where N is number of rounds)
;
%macro KEYGEN_8_AVX2 1
%define %%NUM_ROUNDS %1 ; [in] Number of 4-byte rounds

%ifdef LINUX
        %define pState rdi
        %define pKS    rsi
%else
        %define pState rcx
        %define pKS    rdx
%endif

        FUNC_SAVE

        ; Store 8 keystream pointers on the stack
        ; and reserve memory for storing keystreams for all 8 buffers
        mov     r10, rsp
        sub     rsp, (8*8 + %%NUM_ROUNDS * 32)
        and     rsp, -32

%assign i 0
%rep 2
        vmovdqa ymm0, [pKS + 32*i]
        vmovdqa [rsp + 32*i], ymm0
%assign i (i+1)
%endrep

        ; Load state pointer in RAX
        mov     rax, pState

        ; Load read-only registers
        vmovdqa ymm12, [rel mask31]

        ; Generate N*4B of keystream in N rounds
%assign N 1
%rep %%NUM_ROUNDS
        bits_reorg8 rax, N, ymm10
        nonlin_fun8 rax, ymm0
        ; OFS_X3 XOR W (ymm0) and store in stack
        vpxor   ymm10, ymm0
        vmovdqa [rsp + 64 + (N-1)*32], ymm10
        vpxor   ymm0, ymm0
        lfsr_updt8 rax, N, ymm0
%assign N N+1
%endrep

%if (%%NUM_ROUNDS == 8)
        ;; Load all OFS_X3
        vmovdqa xmm0, [rsp + 64]
        vmovdqa xmm1, [rsp + 64 + 32*1]
        vmovdqa xmm2, [rsp + 64 + 32*2]
        vmovdqa xmm3, [rsp + 64 + 32*3]
        vmovdqa xmm4, [rsp + 64 + 16]
        vmovdqa xmm5, [rsp + 64 + 32*1 + 16]
        vmovdqa xmm6, [rsp + 64 + 32*2 + 16]
        vmovdqa xmm7, [rsp + 64 + 32*3 + 16]

        vinserti128 ymm0, ymm0, [rsp + 64 + 32*4], 0x01
        vinserti128 ymm1, ymm1, [rsp + 64 + 32*5], 0x01
        vinserti128 ymm2, ymm2, [rsp + 64 + 32*6], 0x01
        vinserti128 ymm3, ymm3, [rsp + 64 + 32*7], 0x01
        vinserti128 ymm4, ymm4, [rsp + 64 + 32*4 + 16], 0x01
        vinserti128 ymm5, ymm5, [rsp + 64 + 32*5 + 16], 0x01
        vinserti128 ymm6, ymm6, [rsp + 64 + 32*6 + 16], 0x01
        vinserti128 ymm7, ymm7, [rsp + 64 + 32*7 + 16], 0x01

        TRANSPOSE8_U32_PRELOADED ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm8, ymm9

        store32B_kstr8 ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7

        ;; Reorder LFSR registers, as not all 16 rounds have been completed
        ;; (no need to do this if NUM_ROUNDS != 8, as that indicates
        ;; these are the final rounds)
        REORDER_LFSR rax, 8

%else ;; NUM_ROUNDS == 8
%assign idx 0
%rep %%NUM_ROUNDS
        vmovdqa APPEND(ymm, idx), [rsp + 64 + idx*32]
        store4B_kstr8 APPEND(ymm, idx)
%assign idx (idx + 1)
%endrep
%endif ;; NUM_ROUNDS == 8

        ;; Clear stack frame containing keystream information
%ifdef SAFE_DATA
        vpxor   ymm0, ymm0
%assign i 0
%rep (2 + %%NUM_ROUNDS)
        vmovdqa [rsp + i*32], ymm0
%assign i (i+1)
%endrep
%endif

        ;; Restore rsp pointer
        mov     rsp, r10

        FUNC_RESTORE

%endmacro
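
; Each keystream round in KEYGEN_8_AVX2 above produces one 32-bit keystream
; word per lane, Z = BRC_X3 ^ W; the words are accumulated on the stack and
; then written out through the 8 keystream pointers.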

;;
;; void asm_ZucGenKeystream32B_8_avx2(state8_t *pSta, u32* pKeyStr[8])
;;
;; WIN64
;;      RCX - pSta
;;      RDX - pKeyStr
;;
;; LIN64
;;      RDI - pSta
;;      RSI - pKeyStr
;;
MKGLOBAL(asm_ZucGenKeystream32B_8_avx2,function,internal)
asm_ZucGenKeystream32B_8_avx2:

        KEYGEN_8_AVX2 8

        ret

;;
;; void asm_ZucGenKeystream8B_8_avx2(state8_t *pSta, u32* pKeyStr[8])
;;
;; WIN64
;;      RCX - pSta
;;      RDX - pKeyStr
;;
;; LIN64
;;      RDI - pSta
;;      RSI - pKeyStr
;;
MKGLOBAL(asm_ZucGenKeystream8B_8_avx2,function,internal)
asm_ZucGenKeystream8B_8_avx2:

        KEYGEN_8_AVX2 2

        ret

;;
;; Encrypt N*4 bytes on all 8 buffers,
;; where N is the number of rounds (up to 8).
;; In the final call, the number of remaining bytes for each lane is read
;; from the stack and only those final bytes of plaintext are read and XOR'ed.
;;
%macro CIPHERNx4B_8 4
%define %%NROUNDS       %1
%define %%INITIAL_ROUND %2
%define %%OFFSET        %3
%define %%LAST_CALL     %4

%ifdef LINUX
%define %%TMP1 r8
%define %%TMP2 r9
%else
%define %%TMP1 rdi
%define %%TMP2 rsi
%endif
        ; Load read-only registers
        vmovdqa ymm12, [rel mask31]

        ; Generate N*4B of keystream in N rounds
%assign N 1
%assign round (%%INITIAL_ROUND + N)
%rep %%NROUNDS
        bits_reorg8 rax, round, ymm10
        nonlin_fun8 rax, ymm0
        ; OFS_X3 XOR W (ymm0)
        vpxor   ymm10, ymm0
        vmovdqa [rsp + (N-1)*32], ymm10
        vpxor   ymm0, ymm0
        lfsr_updt8 rax, round, ymm0
%assign N N+1
%assign round (round + 1)
%endrep

%assign N 0
%assign idx 8
%rep %%NROUNDS
        vmovdqa APPEND(ymm, idx), [rsp + N*32]
%assign N N+1
%assign idx (idx+1)
%endrep

        TRANSPOSE8_U32 ymm8, ymm9, ymm10, ymm11, ymm12, ymm13, ymm14, \
                       ymm15, ymm0, ymm1
        ;; XOR Input buffer with keystream in rounds of 32B

        mov     r12, [pIn]
        mov     r13, [pIn + 8]
        mov     r14, [pIn + 16]
        mov     r15, [pIn + 24]
%if (%%LAST_CALL == 1)
        ;; Save GP registers
        mov     [rsp + 32*8 + 16 + 8],  %%TMP1
        mov     [rsp + 32*8 + 16 + 16], %%TMP2

        ;; Read in r10 the word containing the number of final bytes to read for each lane
        movzx   r10d, word [rsp + 8*32]
        simd_load_avx2 ymm0, r12 + %%OFFSET, r10, %%TMP1, %%TMP2
        movzx   r10d, word [rsp + 8*32 + 2]
        simd_load_avx2 ymm1, r13 + %%OFFSET, r10, %%TMP1, %%TMP2
        movzx   r10d, word [rsp + 8*32 + 4]
        simd_load_avx2 ymm2, r14 + %%OFFSET, r10, %%TMP1, %%TMP2
        movzx   r10d, word [rsp + 8*32 + 6]
        simd_load_avx2 ymm3, r15 + %%OFFSET, r10, %%TMP1, %%TMP2
%else
        vmovdqu ymm0, [r12 + %%OFFSET]
        vmovdqu ymm1, [r13 + %%OFFSET]
        vmovdqu ymm2, [r14 + %%OFFSET]
        vmovdqu ymm3, [r15 + %%OFFSET]
%endif

        mov     r12, [pIn + 32]
        mov     r13, [pIn + 40]
        mov     r14, [pIn + 48]
        mov     r15, [pIn + 56]
%if (%%LAST_CALL == 1)
        movzx   r10d, word [rsp + 8*32 + 8]
        simd_load_avx2 ymm4, r12 + %%OFFSET, r10, %%TMP1, %%TMP2
        movzx   r10d, word [rsp + 8*32 + 10]
        simd_load_avx2 ymm5, r13 + %%OFFSET, r10, %%TMP1, %%TMP2
        movzx   r10d, word [rsp + 8*32 + 12]
        simd_load_avx2 ymm6, r14 + %%OFFSET, r10, %%TMP1, %%TMP2
        movzx   r10d, word [rsp + 8*32 + 14]
        simd_load_avx2 ymm7, r15 + %%OFFSET, r10, %%TMP1, %%TMP2
%else
        vmovdqu ymm4, [r12 + %%OFFSET]
        vmovdqu ymm5, [r13 + %%OFFSET]
        vmovdqu ymm6, [r14 + %%OFFSET]
        vmovdqu ymm7, [r15 + %%OFFSET]
%endif
        ; Shuffle all keystreams and XOR with plaintext
%assign %%I 0
%assign %%J 8
%rep 8
        vpshufb ymm %+ %%J, [rel swap_mask]
        vpxor   ymm %+ %%J, ymm %+ %%I
%assign %%I (%%I + 1)
%assign %%J (%%J + 1)
%endrep
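
        ; Note: the generator produces each 32-bit keystream word with its
        ; bytes in big-endian order, so swap_mask byte-reverses every dword
        ; to line the keystream up with the plaintext bytes loaded above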
        ;; Write output
        mov     r12, [pOut]
        mov     r13, [pOut + 8]
        mov     r14, [pOut + 16]
        mov     r15, [pOut + 24]

%if (%%LAST_CALL == 1)
        add     r12, %%OFFSET
        add     r13, %%OFFSET
        add     r14, %%OFFSET
        add     r15, %%OFFSET
        ;; Read in r10 the word containing the number of final bytes to write for each lane
        movzx   r10d, word [rsp + 8*32]
        simd_store_avx2 r12, ymm8, r10, %%TMP1, %%TMP2
        movzx   r10d, word [rsp + 8*32 + 2]
        simd_store_avx2 r13, ymm9, r10, %%TMP1, %%TMP2
        movzx   r10d, word [rsp + 8*32 + 4]
        simd_store_avx2 r14, ymm10, r10, %%TMP1, %%TMP2
        movzx   r10d, word [rsp + 8*32 + 6]
        simd_store_avx2 r15, ymm11, r10, %%TMP1, %%TMP2
%else
        vmovdqu [r12 + %%OFFSET], ymm8
        vmovdqu [r13 + %%OFFSET], ymm9
        vmovdqu [r14 + %%OFFSET], ymm10
        vmovdqu [r15 + %%OFFSET], ymm11
%endif

        mov     r12, [pOut + 32]
        mov     r13, [pOut + 40]
        mov     r14, [pOut + 48]
        mov     r15, [pOut + 56]

%if (%%LAST_CALL == 1)
        add     r12, %%OFFSET
        add     r13, %%OFFSET
        add     r14, %%OFFSET
        add     r15, %%OFFSET
        movzx   r10d, word [rsp + 8*32 + 8]
        simd_store_avx2 r12, ymm12, r10, %%TMP1, %%TMP2
        movzx   r10d, word [rsp + 8*32 + 10]
        simd_store_avx2 r13, ymm13, r10, %%TMP1, %%TMP2
        movzx   r10d, word [rsp + 8*32 + 12]
        simd_store_avx2 r14, ymm14, r10, %%TMP1, %%TMP2
        movzx   r10d, word [rsp + 8*32 + 14]
        simd_store_avx2 r15, ymm15, r10, %%TMP1, %%TMP2

        ; Restore GP registers
        mov     %%TMP1, [rsp + 32*8 + 16 + 8]
        mov     %%TMP2, [rsp + 32*8 + 16 + 16]
%else
        vmovdqu [r12 + %%OFFSET], ymm12
        vmovdqu [r13 + %%OFFSET], ymm13
        vmovdqu [r14 + %%OFFSET], ymm14
        vmovdqu [r15 + %%OFFSET], ymm15
%endif

%endmacro

;;
;; void asm_ZucCipher_8_avx2(state8_t *pSta, u64 *pIn[8],
;;                           u64 *pOut[8], u16 lengths[8], u64 min_length);
;;
;; WIN64
;;      RCX - pSta
;;      RDX - pIn
;;      R8  - pOut
;;      R9  - lengths
;;      rsp + 40 - min_length
;;
;; LIN64
;;      RDI - pSta
;;      RSI - pIn
;;      RDX - pOut
;;      RCX - lengths
;;      R8  - min_length
;;
MKGLOBAL(asm_ZucCipher_8_avx2,function,internal)
asm_ZucCipher_8_avx2:

%ifdef LINUX
        %define pState  rdi
        %define pIn     rsi
        %define pOut    rdx
        %define lengths rcx
        %define arg5    r8
%else
        %define pState  rcx
        %define pIn     rdx
        %define pOut    r8
        %define lengths r9
        %define arg5    [rsp + 40]
%endif

%define min_length r10
%define buf_idx    r11

        mov     min_length, arg5

        or      min_length, min_length
        jz      exit_cipher32

        FUNC_SAVE

        ;; Convert all lengths from UINT16_MAX (indicating that lane is not valid) to min length
        vmovd   xmm0, DWORD(min_length)
        vpshufb xmm0, xmm0, [rel broadcast_word]
        vmovdqa xmm1, [lengths]
        vpcmpeqw xmm2, xmm2             ;; Get all ff's in XMM register
        vpcmpeqw xmm3, xmm1, xmm2       ;; Mask with FFFF in NULL jobs

        vpand   xmm4, xmm3, xmm0        ;; Length of valid job in all NULL jobs
        vpxor   xmm2, xmm3              ;; Mask with 0000 in NULL jobs
        vpand   xmm1, xmm2              ;; Zero out lengths of NULL jobs
        vpor    xmm1, xmm4              ;; XMM1 contains updated lengths

        ; Round up to nearest multiple of 4 bytes
        vpaddw  xmm0, [rel all_threes]
        vpand   xmm0, [rel all_fffcs]

        ; Calculate remaining bytes to encrypt after function call
        vpsubw  xmm2, xmm1, xmm0
        vpxor   xmm3, xmm3
        vpcmpgtw xmm4, xmm2, xmm3       ;; Mask with FFFF in lengths > 0
        ; Zero out the lengths of the lanes which are going to be completed (length <= 0)
        vpand   xmm2, xmm4
        vmovdqa [lengths], xmm2         ; Update the final remaining lengths in memory

        ; Calculate number of bytes to encrypt after the rounds of 32 bytes (up to 31 bytes),
        ; for each lane, and store it on the stack to be used in the last round
        vpsubw  xmm1, xmm2              ; Bytes to encrypt in all lanes
        vpand   xmm1, [rel all_1fs]     ; Number of final bytes (up to 31 bytes) for each lane
        vpcmpeqw xmm2, xmm1, xmm3       ;; Mask with FFFF in lengths == 0
        vpand   xmm2, [rel all_20s]     ;; 32 in positions where lengths was 0
        vpor    xmm1, xmm2              ;; Number of final bytes (up to 32 bytes) for each lane
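
        ; Summary of the length arithmetic above, per lane (conceptually):
        ;       step      = min_length rounded up to a multiple of 4
        ;       remaining = max(length - step, 0)     -> written back to [lengths]
        ;       final     = (length - remaining) % 32, with 0 mapped to 32
        ; 'final' (in XMM1) is kept on the stack and used by the last
        ; CIPHERNx4B_8 call to load/store only the valid tail bytes per buffer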

        ; Allocate stack frame to store keystreams (32*8 bytes), number of final bytes (16 bytes),
        ; space for rsp (8 bytes) and 2 GP registers (16 bytes) that will be clobbered later
        mov     rax, rsp
        sub     rsp, (32*8 + 16 + 16 + 8)
        and     rsp, -32
        xor     buf_idx, buf_idx
        vmovdqu [rsp + 32*8], xmm1
        mov     [rsp + 32*8 + 16], rax

        ; Load state pointer in RAX
        mov     rax, pState

loop_cipher64:
        cmp     min_length, 64
        jl      exit_loop_cipher64

        CIPHERNx4B_8 8, 0, buf_idx, 0

        add     buf_idx, 32
        sub     min_length, 32

        CIPHERNx4B_8 8, 8, buf_idx, 0

        add     buf_idx, 32
        sub     min_length, 32

        jmp     loop_cipher64
exit_loop_cipher64:

        ; Check if at least 32 bytes are left to encrypt
        cmp     min_length, 32
        jl      less_than_32

        CIPHERNx4B_8 8, 0, buf_idx, 0
        REORDER_LFSR rax, 8

        add     buf_idx, 32
        sub     min_length, 32

        ; Check if there are more bytes left to encrypt
less_than_32:

        mov     r15, min_length
        add     r15, 3
        shr     r15, 2          ;; number of rounds left (round up length to nearest multiple of 4B)
        jz      exit_final_rounds

_final_rounds_is_1_8:
        cmp     r15, 4
        je      _num_final_rounds_is_4
        jl      _final_rounds_is_1_3

        ; Final rounds 5-8
        cmp     r15, 8
        je      _num_final_rounds_is_8
        cmp     r15, 7
        je      _num_final_rounds_is_7
        cmp     r15, 6
        je      _num_final_rounds_is_6
        cmp     r15, 5
        je      _num_final_rounds_is_5

_final_rounds_is_1_3:
        cmp     r15, 3
        je      _num_final_rounds_is_3
        cmp     r15, 2
        je      _num_final_rounds_is_2

        jmp     _num_final_rounds_is_1

        ; Perform encryption of last bytes (<= 31 bytes) and reorder LFSR registers
%assign I 1
%rep 8
APPEND(_num_final_rounds_is_,I):
        CIPHERNx4B_8 I, 0, buf_idx, 1
        REORDER_LFSR rax, I
        add     buf_idx, (I*4)
        jmp     exit_final_rounds
%assign I (I + 1)
%endrep

exit_final_rounds:
        ;; update in/out pointers

        ; Broadcast buf_idx in all qwords of ymm0
        vmovq   xmm0, buf_idx
        vpshufd xmm0, xmm0, 0x44
        vperm2f128 ymm0, ymm0, 0x0
        vpaddq  ymm1, ymm0, [pIn]
        vpaddq  ymm2, ymm0, [pIn + 32]
        vmovdqa [pIn], ymm1
        vmovdqa [pIn + 32], ymm2
        vpaddq  ymm1, ymm0, [pOut]
        vpaddq  ymm2, ymm0, [pOut + 32]
        vmovdqa [pOut], ymm1
        vmovdqa [pOut + 32], ymm2

        ;; Clear stack frame containing keystream information
%ifdef SAFE_DATA
        vpxor   ymm0, ymm0
%assign i 0
%rep 8
        vmovdqa [rsp + i*32], ymm0
%assign i (i+1)
%endrep
%endif
        ; Restore rsp
        mov     rsp, [rsp + 32*8 + 16]

        FUNC_RESTORE

exit_cipher32:

        ret

;----------------------------------------------------------------------------------------
;----------------------------------------------------------------------------------------

%ifdef LINUX
section .note.GNU-stack noalloc noexec nowrite progbits
%endif