1/* camellia-avx-aesni-amd64.S - AES-NI/AVX implementation of Camellia cipher 2 * 3 * Copyright (C) 2013-2015,2020 Jussi Kivilinna <jussi.kivilinna@iki.fi> 4 * 5 * This file is part of Libgcrypt. 6 * 7 * Libgcrypt is free software; you can redistribute it and/or modify 8 * it under the terms of the GNU Lesser General Public License as 9 * published by the Free Software Foundation; either version 2.1 of 10 * the License, or (at your option) any later version. 11 * 12 * Libgcrypt is distributed in the hope that it will be useful, 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 * GNU Lesser General Public License for more details. 16 * 17 * You should have received a copy of the GNU Lesser General Public 18 * License along with this program; if not, see <http://www.gnu.org/licenses/>. 19 */ 20 21#include <config.h> 22 23#ifdef __x86_64 24#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ 25 defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ 26 defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX_SUPPORT) 27 28#include "asm-common-amd64.h" 29 30#define CAMELLIA_TABLE_BYTE_LEN 272 31 32/* struct CAMELLIA_context: */ 33#define key_table 0 34#define key_bitlength CAMELLIA_TABLE_BYTE_LEN 35 36/* register macros */ 37#define CTX %rdi 38 39/********************************************************************** 40 helper macros 41 **********************************************************************/ 42#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \ 43 vpand x, mask4bit, tmp0; \ 44 vpandn x, mask4bit, x; \ 45 vpsrld $4, x, x; \ 46 \ 47 vpshufb tmp0, lo_t, tmp0; \ 48 vpshufb x, hi_t, x; \ 49 vpxor tmp0, x, x; 50 51/********************************************************************** 52 16-way camellia 53 **********************************************************************/ 54 55/* 56 * IN: 57 * x0..x7: byte-sliced AB state 58 * mem_cd: register pointer storing CD state 59 * key: index for key material 60 * OUT: 61 * x0..x7: new byte-sliced CD state 62 */ 63#define roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \ 64 t7, mem_cd, key) \ 65 /* \ 66 * S-function with AES subbytes \ 67 */ \ 68 vmovdqa .Linv_shift_row rRIP, t4; \ 69 vbroadcastss .L0f0f0f0f rRIP, t7; \ 70 vmovdqa .Lpre_tf_lo_s1 rRIP, t0; \ 71 vmovdqa .Lpre_tf_hi_s1 rRIP, t1; \ 72 \ 73 /* AES inverse shift rows */ \ 74 vpshufb t4, x0, x0; \ 75 vpshufb t4, x7, x7; \ 76 vpshufb t4, x1, x1; \ 77 vpshufb t4, x4, x4; \ 78 vpshufb t4, x2, x2; \ 79 vpshufb t4, x5, x5; \ 80 vpshufb t4, x3, x3; \ 81 vpshufb t4, x6, x6; \ 82 \ 83 /* prefilter sboxes 1, 2 and 3 */ \ 84 vmovdqa .Lpre_tf_lo_s4 rRIP, t2; \ 85 vmovdqa .Lpre_tf_hi_s4 rRIP, t3; \ 86 filter_8bit(x0, t0, t1, t7, t6); \ 87 filter_8bit(x7, t0, t1, t7, t6); \ 88 filter_8bit(x1, t0, t1, t7, t6); \ 89 filter_8bit(x4, t0, t1, t7, t6); \ 90 filter_8bit(x2, t0, t1, t7, t6); \ 91 filter_8bit(x5, t0, t1, t7, t6); \ 92 \ 93 /* prefilter sbox 4 */ \ 94 vpxor t4, t4, t4; \ 95 filter_8bit(x3, t2, t3, t7, t6); \ 96 filter_8bit(x6, t2, t3, t7, t6); \ 97 \ 98 /* AES subbytes + AES shift rows */ \ 99 vmovdqa .Lpost_tf_lo_s1 rRIP, t0; \ 100 vmovdqa .Lpost_tf_hi_s1 rRIP, t1; \ 101 vaesenclast t4, x0, x0; \ 102 vaesenclast t4, x7, x7; \ 103 vaesenclast t4, x1, x1; \ 104 vaesenclast t4, x4, x4; \ 105 vaesenclast t4, x2, x2; \ 106 vaesenclast t4, x5, x5; \ 107 vaesenclast t4, x3, x3; \ 108 vaesenclast t4, x6, x6; \ 109 \ 110 /* postfilter sboxes 1 and 4 */ \ 111 vmovdqa .Lpost_tf_lo_s3 rRIP, 
t2; \ 112 vmovdqa .Lpost_tf_hi_s3 rRIP, t3; \ 113 filter_8bit(x0, t0, t1, t7, t6); \ 114 filter_8bit(x7, t0, t1, t7, t6); \ 115 filter_8bit(x3, t0, t1, t7, t6); \ 116 filter_8bit(x6, t0, t1, t7, t6); \ 117 \ 118 /* postfilter sbox 3 */ \ 119 vmovdqa .Lpost_tf_lo_s2 rRIP, t4; \ 120 vmovdqa .Lpost_tf_hi_s2 rRIP, t5; \ 121 filter_8bit(x2, t2, t3, t7, t6); \ 122 filter_8bit(x5, t2, t3, t7, t6); \ 123 \ 124 vpxor t6, t6, t6; \ 125 vmovq key, t0; \ 126 \ 127 /* postfilter sbox 2 */ \ 128 filter_8bit(x1, t4, t5, t7, t2); \ 129 filter_8bit(x4, t4, t5, t7, t2); \ 130 \ 131 vpsrldq $5, t0, t5; \ 132 vpsrldq $1, t0, t1; \ 133 vpsrldq $2, t0, t2; \ 134 vpsrldq $3, t0, t3; \ 135 vpsrldq $4, t0, t4; \ 136 vpshufb t6, t0, t0; \ 137 vpshufb t6, t1, t1; \ 138 vpshufb t6, t2, t2; \ 139 vpshufb t6, t3, t3; \ 140 vpshufb t6, t4, t4; \ 141 vpsrldq $2, t5, t7; \ 142 vpshufb t6, t7, t7; \ 143 \ 144 /* P-function */ \ 145 vpxor x5, x0, x0; \ 146 vpxor x6, x1, x1; \ 147 vpxor x7, x2, x2; \ 148 vpxor x4, x3, x3; \ 149 \ 150 vpxor x2, x4, x4; \ 151 vpxor x3, x5, x5; \ 152 vpxor x0, x6, x6; \ 153 vpxor x1, x7, x7; \ 154 \ 155 vpxor x7, x0, x0; \ 156 vpxor x4, x1, x1; \ 157 vpxor x5, x2, x2; \ 158 vpxor x6, x3, x3; \ 159 \ 160 vpxor x3, x4, x4; \ 161 vpxor x0, x5, x5; \ 162 vpxor x1, x6, x6; \ 163 vpxor x2, x7, x7; /* note: high and low parts swapped */ \ 164 \ 165 /* Add key material and result to CD (x becomes new CD) */ \ 166 \ 167 vpxor t3, x4, x4; \ 168 vpxor 0 * 16(mem_cd), x4, x4; \ 169 \ 170 vpxor t2, x5, x5; \ 171 vpxor 1 * 16(mem_cd), x5, x5; \ 172 \ 173 vpsrldq $1, t5, t3; \ 174 vpshufb t6, t5, t5; \ 175 vpshufb t6, t3, t6; \ 176 \ 177 vpxor t1, x6, x6; \ 178 vpxor 2 * 16(mem_cd), x6, x6; \ 179 \ 180 vpxor t0, x7, x7; \ 181 vpxor 3 * 16(mem_cd), x7, x7; \ 182 \ 183 vpxor t7, x0, x0; \ 184 vpxor 4 * 16(mem_cd), x0, x0; \ 185 \ 186 vpxor t6, x1, x1; \ 187 vpxor 5 * 16(mem_cd), x1, x1; \ 188 \ 189 vpxor t5, x2, x2; \ 190 vpxor 6 * 16(mem_cd), x2, x2; \ 191 \ 192 vpxor t4, x3, x3; \ 193 vpxor 7 * 16(mem_cd), x3, x3; 194 195/* 196 * IN/OUT: 197 * x0..x7: byte-sliced AB state preloaded 198 * mem_ab: byte-sliced AB state in memory 199 * mem_cb: byte-sliced CD state in memory 200 */ 201#define two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ 202 y6, y7, mem_ab, mem_cd, i, dir, store_ab) \ 203 roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ 204 y6, y7, mem_cd, (key_table + (i) * 8)(CTX)); \ 205 \ 206 vmovdqu x4, 0 * 16(mem_cd); \ 207 vmovdqu x5, 1 * 16(mem_cd); \ 208 vmovdqu x6, 2 * 16(mem_cd); \ 209 vmovdqu x7, 3 * 16(mem_cd); \ 210 vmovdqu x0, 4 * 16(mem_cd); \ 211 vmovdqu x1, 5 * 16(mem_cd); \ 212 vmovdqu x2, 6 * 16(mem_cd); \ 213 vmovdqu x3, 7 * 16(mem_cd); \ 214 \ 215 roundsm16(x4, x5, x6, x7, x0, x1, x2, x3, y0, y1, y2, y3, y4, y5, \ 216 y6, y7, mem_ab, (key_table + ((i) + (dir)) * 8)(CTX)); \ 217 \ 218 store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab); 219 220#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */ 221 222#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \ 223 /* Store new AB state */ \ 224 vmovdqu x0, 0 * 16(mem_ab); \ 225 vmovdqu x1, 1 * 16(mem_ab); \ 226 vmovdqu x2, 2 * 16(mem_ab); \ 227 vmovdqu x3, 3 * 16(mem_ab); \ 228 vmovdqu x4, 4 * 16(mem_ab); \ 229 vmovdqu x5, 5 * 16(mem_ab); \ 230 vmovdqu x6, 6 * 16(mem_ab); \ 231 vmovdqu x7, 7 * 16(mem_ab); 232 233#define enc_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ 234 y6, y7, mem_ab, mem_cd, i) \ 235 two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, 
y0, y1, y2, y3, y4, y5, \ 236 y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \ 237 two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ 238 y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \ 239 two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ 240 y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store); 241 242#define dec_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ 243 y6, y7, mem_ab, mem_cd, i) \ 244 two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ 245 y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \ 246 two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ 247 y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \ 248 two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ 249 y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store); 250 251/* 252 * IN: 253 * v0..3: byte-sliced 32-bit integers 254 * OUT: 255 * v0..3: (IN <<< 1) 256 */ 257#define rol32_1_16(v0, v1, v2, v3, t0, t1, t2, zero) \ 258 vpcmpgtb v0, zero, t0; \ 259 vpaddb v0, v0, v0; \ 260 vpabsb t0, t0; \ 261 \ 262 vpcmpgtb v1, zero, t1; \ 263 vpaddb v1, v1, v1; \ 264 vpabsb t1, t1; \ 265 \ 266 vpcmpgtb v2, zero, t2; \ 267 vpaddb v2, v2, v2; \ 268 vpabsb t2, t2; \ 269 \ 270 vpor t0, v1, v1; \ 271 \ 272 vpcmpgtb v3, zero, t0; \ 273 vpaddb v3, v3, v3; \ 274 vpabsb t0, t0; \ 275 \ 276 vpor t1, v2, v2; \ 277 vpor t2, v3, v3; \ 278 vpor t0, v0, v0; 279 280/* 281 * IN: 282 * r: byte-sliced AB state in memory 283 * l: byte-sliced CD state in memory 284 * OUT: 285 * x0..x7: new byte-sliced CD state 286 */ 287#define fls16(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \ 288 tt1, tt2, tt3, kll, klr, krl, krr) \ 289 /* \ 290 * t0 = kll; \ 291 * t0 &= ll; \ 292 * lr ^= rol32(t0, 1); \ 293 */ \ 294 vpxor tt0, tt0, tt0; \ 295 vmovd kll, t0; \ 296 vpshufb tt0, t0, t3; \ 297 vpsrldq $1, t0, t0; \ 298 vpshufb tt0, t0, t2; \ 299 vpsrldq $1, t0, t0; \ 300 vpshufb tt0, t0, t1; \ 301 vpsrldq $1, t0, t0; \ 302 vpshufb tt0, t0, t0; \ 303 \ 304 vpand l0, t0, t0; \ 305 vpand l1, t1, t1; \ 306 vpand l2, t2, t2; \ 307 vpand l3, t3, t3; \ 308 \ 309 rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \ 310 \ 311 vpxor l4, t0, l4; \ 312 vmovdqu l4, 4 * 16(l); \ 313 vpxor l5, t1, l5; \ 314 vmovdqu l5, 5 * 16(l); \ 315 vpxor l6, t2, l6; \ 316 vmovdqu l6, 6 * 16(l); \ 317 vpxor l7, t3, l7; \ 318 vmovdqu l7, 7 * 16(l); \ 319 \ 320 /* \ 321 * t2 = krr; \ 322 * t2 |= rr; \ 323 * rl ^= t2; \ 324 */ \ 325 \ 326 vmovd krr, t0; \ 327 vpshufb tt0, t0, t3; \ 328 vpsrldq $1, t0, t0; \ 329 vpshufb tt0, t0, t2; \ 330 vpsrldq $1, t0, t0; \ 331 vpshufb tt0, t0, t1; \ 332 vpsrldq $1, t0, t0; \ 333 vpshufb tt0, t0, t0; \ 334 \ 335 vpor 4 * 16(r), t0, t0; \ 336 vpor 5 * 16(r), t1, t1; \ 337 vpor 6 * 16(r), t2, t2; \ 338 vpor 7 * 16(r), t3, t3; \ 339 \ 340 vpxor 0 * 16(r), t0, t0; \ 341 vpxor 1 * 16(r), t1, t1; \ 342 vpxor 2 * 16(r), t2, t2; \ 343 vpxor 3 * 16(r), t3, t3; \ 344 vmovdqu t0, 0 * 16(r); \ 345 vmovdqu t1, 1 * 16(r); \ 346 vmovdqu t2, 2 * 16(r); \ 347 vmovdqu t3, 3 * 16(r); \ 348 \ 349 /* \ 350 * t2 = krl; \ 351 * t2 &= rl; \ 352 * rr ^= rol32(t2, 1); \ 353 */ \ 354 vmovd krl, t0; \ 355 vpshufb tt0, t0, t3; \ 356 vpsrldq $1, t0, t0; \ 357 vpshufb tt0, t0, t2; \ 358 vpsrldq $1, t0, t0; \ 359 vpshufb tt0, t0, t1; \ 360 vpsrldq $1, t0, t0; \ 361 vpshufb tt0, t0, t0; \ 362 \ 363 vpand 0 * 16(r), t0, t0; \ 364 vpand 1 * 16(r), t1, t1; \ 365 vpand 2 * 16(r), t2, t2; \ 366 vpand 3 * 16(r), t3, t3; \ 367 \ 368 rol32_1_16(t3, t2, t1, t0, tt1, 
tt2, tt3, tt0); \ 369 \ 370 vpxor 4 * 16(r), t0, t0; \ 371 vpxor 5 * 16(r), t1, t1; \ 372 vpxor 6 * 16(r), t2, t2; \ 373 vpxor 7 * 16(r), t3, t3; \ 374 vmovdqu t0, 4 * 16(r); \ 375 vmovdqu t1, 5 * 16(r); \ 376 vmovdqu t2, 6 * 16(r); \ 377 vmovdqu t3, 7 * 16(r); \ 378 \ 379 /* \ 380 * t0 = klr; \ 381 * t0 |= lr; \ 382 * ll ^= t0; \ 383 */ \ 384 \ 385 vmovd klr, t0; \ 386 vpshufb tt0, t0, t3; \ 387 vpsrldq $1, t0, t0; \ 388 vpshufb tt0, t0, t2; \ 389 vpsrldq $1, t0, t0; \ 390 vpshufb tt0, t0, t1; \ 391 vpsrldq $1, t0, t0; \ 392 vpshufb tt0, t0, t0; \ 393 \ 394 vpor l4, t0, t0; \ 395 vpor l5, t1, t1; \ 396 vpor l6, t2, t2; \ 397 vpor l7, t3, t3; \ 398 \ 399 vpxor l0, t0, l0; \ 400 vmovdqu l0, 0 * 16(l); \ 401 vpxor l1, t1, l1; \ 402 vmovdqu l1, 1 * 16(l); \ 403 vpxor l2, t2, l2; \ 404 vmovdqu l2, 2 * 16(l); \ 405 vpxor l3, t3, l3; \ 406 vmovdqu l3, 3 * 16(l); 407 408#define transpose_4x4(x0, x1, x2, x3, t1, t2) \ 409 vpunpckhdq x1, x0, t2; \ 410 vpunpckldq x1, x0, x0; \ 411 \ 412 vpunpckldq x3, x2, t1; \ 413 vpunpckhdq x3, x2, x2; \ 414 \ 415 vpunpckhqdq t1, x0, x1; \ 416 vpunpcklqdq t1, x0, x0; \ 417 \ 418 vpunpckhqdq x2, t2, x3; \ 419 vpunpcklqdq x2, t2, x2; 420 421#define byteslice_16x16b_fast(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, \ 422 a3, b3, c3, d3, st0, st1) \ 423 vmovdqu d2, st0; \ 424 vmovdqu d3, st1; \ 425 transpose_4x4(a0, a1, a2, a3, d2, d3); \ 426 transpose_4x4(b0, b1, b2, b3, d2, d3); \ 427 vmovdqu st0, d2; \ 428 vmovdqu st1, d3; \ 429 \ 430 vmovdqu a0, st0; \ 431 vmovdqu a1, st1; \ 432 transpose_4x4(c0, c1, c2, c3, a0, a1); \ 433 transpose_4x4(d0, d1, d2, d3, a0, a1); \ 434 \ 435 vmovdqu .Lshufb_16x16b rRIP, a0; \ 436 vmovdqu st1, a1; \ 437 vpshufb a0, a2, a2; \ 438 vpshufb a0, a3, a3; \ 439 vpshufb a0, b0, b0; \ 440 vpshufb a0, b1, b1; \ 441 vpshufb a0, b2, b2; \ 442 vpshufb a0, b3, b3; \ 443 vpshufb a0, a1, a1; \ 444 vpshufb a0, c0, c0; \ 445 vpshufb a0, c1, c1; \ 446 vpshufb a0, c2, c2; \ 447 vpshufb a0, c3, c3; \ 448 vpshufb a0, d0, d0; \ 449 vpshufb a0, d1, d1; \ 450 vpshufb a0, d2, d2; \ 451 vpshufb a0, d3, d3; \ 452 vmovdqu d3, st1; \ 453 vmovdqu st0, d3; \ 454 vpshufb a0, d3, a0; \ 455 vmovdqu d2, st0; \ 456 \ 457 transpose_4x4(a0, b0, c0, d0, d2, d3); \ 458 transpose_4x4(a1, b1, c1, d1, d2, d3); \ 459 vmovdqu st0, d2; \ 460 vmovdqu st1, d3; \ 461 \ 462 vmovdqu b0, st0; \ 463 vmovdqu b1, st1; \ 464 transpose_4x4(a2, b2, c2, d2, b0, b1); \ 465 transpose_4x4(a3, b3, c3, d3, b0, b1); \ 466 vmovdqu st0, b0; \ 467 vmovdqu st1, b1; \ 468 /* does not adjust output bytes inside vectors */ 469 470#define transpose_8x8b(a, b, c, d, e, f, g, h, t0, t1, t2, t3, t4) \ 471 vpunpcklbw a, b, t0; \ 472 vpunpckhbw a, b, b; \ 473 \ 474 vpunpcklbw c, d, t1; \ 475 vpunpckhbw c, d, d; \ 476 \ 477 vpunpcklbw e, f, t2; \ 478 vpunpckhbw e, f, f; \ 479 \ 480 vpunpcklbw g, h, t3; \ 481 vpunpckhbw g, h, h; \ 482 \ 483 vpunpcklwd t0, t1, g; \ 484 vpunpckhwd t0, t1, t0; \ 485 \ 486 vpunpcklwd b, d, t1; \ 487 vpunpckhwd b, d, e; \ 488 \ 489 vpunpcklwd t2, t3, c; \ 490 vpunpckhwd t2, t3, t2; \ 491 \ 492 vpunpcklwd f, h, t3; \ 493 vpunpckhwd f, h, b; \ 494 \ 495 vpunpcklwd e, b, t4; \ 496 vpunpckhwd e, b, b; \ 497 \ 498 vpunpcklwd t1, t3, e; \ 499 vpunpckhwd t1, t3, f; \ 500 \ 501 vmovdqa .Ltranspose_8x8_shuf rRIP, t3; \ 502 \ 503 vpunpcklwd g, c, d; \ 504 vpunpckhwd g, c, c; \ 505 \ 506 vpunpcklwd t0, t2, t1; \ 507 vpunpckhwd t0, t2, h; \ 508 \ 509 vpunpckhqdq b, h, a; \ 510 vpshufb t3, a, a; \ 511 vpunpcklqdq b, h, b; \ 512 vpshufb t3, b, b; \ 513 \ 514 vpunpckhqdq e, d, g; \ 515 vpshufb 
t3, g, g; \ 516 vpunpcklqdq e, d, h; \ 517 vpshufb t3, h, h; \ 518 \ 519 vpunpckhqdq f, c, e; \ 520 vpshufb t3, e, e; \ 521 vpunpcklqdq f, c, f; \ 522 vpshufb t3, f, f; \ 523 \ 524 vpunpckhqdq t4, t1, c; \ 525 vpshufb t3, c, c; \ 526 vpunpcklqdq t4, t1, d; \ 527 vpshufb t3, d, d; 528 529/* load blocks to registers and apply pre-whitening */ 530#define inpack16_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ 531 y6, y7, rio, key) \ 532 vmovq key, x0; \ 533 vpshufb .Lpack_bswap rRIP, x0, x0; \ 534 \ 535 vpxor 0 * 16(rio), x0, y7; \ 536 vpxor 1 * 16(rio), x0, y6; \ 537 vpxor 2 * 16(rio), x0, y5; \ 538 vpxor 3 * 16(rio), x0, y4; \ 539 vpxor 4 * 16(rio), x0, y3; \ 540 vpxor 5 * 16(rio), x0, y2; \ 541 vpxor 6 * 16(rio), x0, y1; \ 542 vpxor 7 * 16(rio), x0, y0; \ 543 vpxor 8 * 16(rio), x0, x7; \ 544 vpxor 9 * 16(rio), x0, x6; \ 545 vpxor 10 * 16(rio), x0, x5; \ 546 vpxor 11 * 16(rio), x0, x4; \ 547 vpxor 12 * 16(rio), x0, x3; \ 548 vpxor 13 * 16(rio), x0, x2; \ 549 vpxor 14 * 16(rio), x0, x1; \ 550 vpxor 15 * 16(rio), x0, x0; 551 552/* byteslice pre-whitened blocks and store to temporary memory */ 553#define inpack16_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ 554 y6, y7, mem_ab, mem_cd) \ 555 byteslice_16x16b_fast(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, \ 556 y4, y5, y6, y7, (mem_ab), (mem_cd)); \ 557 \ 558 vmovdqu x0, 0 * 16(mem_ab); \ 559 vmovdqu x1, 1 * 16(mem_ab); \ 560 vmovdqu x2, 2 * 16(mem_ab); \ 561 vmovdqu x3, 3 * 16(mem_ab); \ 562 vmovdqu x4, 4 * 16(mem_ab); \ 563 vmovdqu x5, 5 * 16(mem_ab); \ 564 vmovdqu x6, 6 * 16(mem_ab); \ 565 vmovdqu x7, 7 * 16(mem_ab); \ 566 vmovdqu y0, 0 * 16(mem_cd); \ 567 vmovdqu y1, 1 * 16(mem_cd); \ 568 vmovdqu y2, 2 * 16(mem_cd); \ 569 vmovdqu y3, 3 * 16(mem_cd); \ 570 vmovdqu y4, 4 * 16(mem_cd); \ 571 vmovdqu y5, 5 * 16(mem_cd); \ 572 vmovdqu y6, 6 * 16(mem_cd); \ 573 vmovdqu y7, 7 * 16(mem_cd); 574 575/* de-byteslice, apply post-whitening and store blocks */ 576#define outunpack16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \ 577 y5, y6, y7, key, stack_tmp0, stack_tmp1) \ 578 byteslice_16x16b_fast(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, \ 579 y3, y7, x3, x7, stack_tmp0, stack_tmp1); \ 580 \ 581 vmovdqu x0, stack_tmp0; \ 582 \ 583 vmovq key, x0; \ 584 vpshufb .Lpack_bswap rRIP, x0, x0; \ 585 \ 586 vpxor x0, y7, y7; \ 587 vpxor x0, y6, y6; \ 588 vpxor x0, y5, y5; \ 589 vpxor x0, y4, y4; \ 590 vpxor x0, y3, y3; \ 591 vpxor x0, y2, y2; \ 592 vpxor x0, y1, y1; \ 593 vpxor x0, y0, y0; \ 594 vpxor x0, x7, x7; \ 595 vpxor x0, x6, x6; \ 596 vpxor x0, x5, x5; \ 597 vpxor x0, x4, x4; \ 598 vpxor x0, x3, x3; \ 599 vpxor x0, x2, x2; \ 600 vpxor x0, x1, x1; \ 601 vpxor stack_tmp0, x0, x0; 602 603#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ 604 y6, y7, rio) \ 605 vmovdqu x0, 0 * 16(rio); \ 606 vmovdqu x1, 1 * 16(rio); \ 607 vmovdqu x2, 2 * 16(rio); \ 608 vmovdqu x3, 3 * 16(rio); \ 609 vmovdqu x4, 4 * 16(rio); \ 610 vmovdqu x5, 5 * 16(rio); \ 611 vmovdqu x6, 6 * 16(rio); \ 612 vmovdqu x7, 7 * 16(rio); \ 613 vmovdqu y0, 8 * 16(rio); \ 614 vmovdqu y1, 9 * 16(rio); \ 615 vmovdqu y2, 10 * 16(rio); \ 616 vmovdqu y3, 11 * 16(rio); \ 617 vmovdqu y4, 12 * 16(rio); \ 618 vmovdqu y5, 13 * 16(rio); \ 619 vmovdqu y6, 14 * 16(rio); \ 620 vmovdqu y7, 15 * 16(rio); 621 622.text 623.align 16 624 625#define SHUFB_BYTES(idx) \ 626 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx) 627 628.Lshufb_16x16b: 629 .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3); 630 631.Lpack_bswap: 632 .long 
0x00010203 633 .long 0x04050607 634 .long 0x80808080 635 .long 0x80808080 636 637/* For CTR-mode IV byteswap */ 638.Lbswap128_mask: 639 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 640 641/* 642 * pre-SubByte transform 643 * 644 * pre-lookup for sbox1, sbox2, sbox3: 645 * swap_bitendianness( 646 * isom_map_camellia_to_aes( 647 * camellia_f( 648 * swap_bitendianess(in) 649 * ) 650 * ) 651 * ) 652 * 653 * (note: '⊕ 0xc5' inside camellia_f()) 654 */ 655.Lpre_tf_lo_s1: 656 .byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86 657 .byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88 658.Lpre_tf_hi_s1: 659 .byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a 660 .byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23 661 662/* 663 * pre-SubByte transform 664 * 665 * pre-lookup for sbox4: 666 * swap_bitendianness( 667 * isom_map_camellia_to_aes( 668 * camellia_f( 669 * swap_bitendianess(in <<< 1) 670 * ) 671 * ) 672 * ) 673 * 674 * (note: '⊕ 0xc5' inside camellia_f()) 675 */ 676.Lpre_tf_lo_s4: 677 .byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25 678 .byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74 679.Lpre_tf_hi_s4: 680 .byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72 681 .byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf 682 683/* 684 * post-SubByte transform 685 * 686 * post-lookup for sbox1, sbox4: 687 * swap_bitendianness( 688 * camellia_h( 689 * isom_map_aes_to_camellia( 690 * swap_bitendianness( 691 * aes_inverse_affine_transform(in) 692 * ) 693 * ) 694 * ) 695 * ) 696 * 697 * (note: '⊕ 0x6e' inside camellia_h()) 698 */ 699.Lpost_tf_lo_s1: 700 .byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31 701 .byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1 702.Lpost_tf_hi_s1: 703 .byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8 704 .byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c 705 706/* 707 * post-SubByte transform 708 * 709 * post-lookup for sbox2: 710 * swap_bitendianness( 711 * camellia_h( 712 * isom_map_aes_to_camellia( 713 * swap_bitendianness( 714 * aes_inverse_affine_transform(in) 715 * ) 716 * ) 717 * ) 718 * ) <<< 1 719 * 720 * (note: '⊕ 0x6e' inside camellia_h()) 721 */ 722.Lpost_tf_lo_s2: 723 .byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62 724 .byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3 725.Lpost_tf_hi_s2: 726 .byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51 727 .byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18 728 729/* 730 * post-SubByte transform 731 * 732 * post-lookup for sbox3: 733 * swap_bitendianness( 734 * camellia_h( 735 * isom_map_aes_to_camellia( 736 * swap_bitendianness( 737 * aes_inverse_affine_transform(in) 738 * ) 739 * ) 740 * ) 741 * ) >>> 1 742 * 743 * (note: '⊕ 0x6e' inside camellia_h()) 744 */ 745.Lpost_tf_lo_s3: 746 .byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98 747 .byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8 748.Lpost_tf_hi_s3: 749 .byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54 750 .byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06 751 752/* For isolating SubBytes from AESENCLAST, inverse shift row */ 753.Linv_shift_row: 754 .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b 755 .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03 756 757/* shuffle mask for 8x8 byte transpose */ 758.Ltranspose_8x8_shuf: 759 .byte 0, 1, 4, 5, 2, 3, 6, 7, 8+0, 8+1, 8+4, 8+5, 8+2, 8+3, 8+6, 8+7 760 761.align 4 762/* 4-bit mask */ 763.L0f0f0f0f: 764 .long 0x0f0f0f0f 765 766 767.align 8 768ELF(.type __camellia_enc_blk16,@function;) 769 770__camellia_enc_blk16: 771 /* input: 772 * %rdi: ctx, CTX 773 * 
%rax: temporary storage, 256 bytes 774 * %r8d: 24 for 16 byte key, 32 for larger 775 * %xmm0..%xmm15: 16 plaintext blocks 776 * output: 777 * %xmm0..%xmm15: 16 encrypted blocks, order swapped: 778 * 7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 779 */ 780 CFI_STARTPROC(); 781 782 leaq 8 * 16(%rax), %rcx; 783 784 leaq (-8 * 8)(CTX, %r8, 8), %r8; 785 786 inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, 787 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, 788 %xmm15, %rax, %rcx); 789 790.align 8 791.Lenc_loop: 792 enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, 793 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, 794 %xmm15, %rax, %rcx, 0); 795 796 cmpq %r8, CTX; 797 je .Lenc_done; 798 leaq (8 * 8)(CTX), CTX; 799 800 fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, 801 %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, 802 %xmm15, 803 ((key_table) + 0)(CTX), 804 ((key_table) + 4)(CTX), 805 ((key_table) + 8)(CTX), 806 ((key_table) + 12)(CTX)); 807 jmp .Lenc_loop; 808 809.align 8 810.Lenc_done: 811 /* load CD for output */ 812 vmovdqu 0 * 16(%rcx), %xmm8; 813 vmovdqu 1 * 16(%rcx), %xmm9; 814 vmovdqu 2 * 16(%rcx), %xmm10; 815 vmovdqu 3 * 16(%rcx), %xmm11; 816 vmovdqu 4 * 16(%rcx), %xmm12; 817 vmovdqu 5 * 16(%rcx), %xmm13; 818 vmovdqu 6 * 16(%rcx), %xmm14; 819 vmovdqu 7 * 16(%rcx), %xmm15; 820 821 outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, 822 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, 823 %xmm15, ((key_table) + 8 * 8)(%r8), (%rax), 1 * 16(%rax)); 824 825 ret; 826 CFI_ENDPROC(); 827ELF(.size __camellia_enc_blk16,.-__camellia_enc_blk16;) 828 829.align 8 830ELF(.type __camellia_dec_blk16,@function;) 831 832__camellia_dec_blk16: 833 /* input: 834 * %rdi: ctx, CTX 835 * %rax: temporary storage, 256 bytes 836 * %r8d: 24 for 16 byte key, 32 for larger 837 * %xmm0..%xmm15: 16 encrypted blocks 838 * output: 839 * %xmm0..%xmm15: 16 plaintext blocks, order swapped: 840 * 7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 841 */ 842 CFI_STARTPROC(); 843 844 movq %r8, %rcx; 845 movq CTX, %r8 846 leaq (-8 * 8)(CTX, %rcx, 8), CTX; 847 848 leaq 8 * 16(%rax), %rcx; 849 850 inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, 851 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, 852 %xmm15, %rax, %rcx); 853 854.align 8 855.Ldec_loop: 856 dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, 857 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, 858 %xmm15, %rax, %rcx, 0); 859 860 cmpq %r8, CTX; 861 je .Ldec_done; 862 863 fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, 864 %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, 865 %xmm15, 866 ((key_table) + 8)(CTX), 867 ((key_table) + 12)(CTX), 868 ((key_table) + 0)(CTX), 869 ((key_table) + 4)(CTX)); 870 871 leaq (-8 * 8)(CTX), CTX; 872 jmp .Ldec_loop; 873 874.align 8 875.Ldec_done: 876 /* load CD for output */ 877 vmovdqu 0 * 16(%rcx), %xmm8; 878 vmovdqu 1 * 16(%rcx), %xmm9; 879 vmovdqu 2 * 16(%rcx), %xmm10; 880 vmovdqu 3 * 16(%rcx), %xmm11; 881 vmovdqu 4 * 16(%rcx), %xmm12; 882 vmovdqu 5 * 16(%rcx), %xmm13; 883 vmovdqu 6 * 16(%rcx), %xmm14; 884 vmovdqu 7 * 16(%rcx), %xmm15; 885 886 outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, 887 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, 888 %xmm15, (key_table)(CTX), (%rax), 1 * 16(%rax)); 889 890 ret; 891 CFI_ENDPROC(); 892ELF(.size __camellia_dec_blk16,.-__camellia_dec_blk16;) 893 894#define 
inc_le128(x, minus_one, tmp) \ 895 vpcmpeqq minus_one, x, tmp; \ 896 vpsubq minus_one, x, x; \ 897 vpslldq $8, tmp, tmp; \ 898 vpsubq tmp, x, x; 899 900.align 8 901.globl _gcry_camellia_aesni_avx_ctr_enc 902ELF(.type _gcry_camellia_aesni_avx_ctr_enc,@function;) 903 904_gcry_camellia_aesni_avx_ctr_enc: 905 /* input: 906 * %rdi: ctx, CTX 907 * %rsi: dst (16 blocks) 908 * %rdx: src (16 blocks) 909 * %rcx: iv (big endian, 128bit) 910 */ 911 CFI_STARTPROC(); 912 913 pushq %rbp; 914 CFI_PUSH(%rbp); 915 movq %rsp, %rbp; 916 CFI_DEF_CFA_REGISTER(%rbp); 917 918 vzeroupper; 919 920 cmpl $128, key_bitlength(CTX); 921 movl $32, %r8d; 922 movl $24, %eax; 923 cmovel %eax, %r8d; /* max */ 924 925 subq $(16 * 16), %rsp; 926 andq $~31, %rsp; 927 movq %rsp, %rax; 928 929 vmovdqa .Lbswap128_mask rRIP, %xmm14; 930 931 /* load IV and byteswap */ 932 vmovdqu (%rcx), %xmm15; 933 vmovdqu %xmm15, 15 * 16(%rax); 934 vpshufb %xmm14, %xmm15, %xmm0; /* be => le */ 935 936 vpcmpeqd %xmm15, %xmm15, %xmm15; 937 vpsrldq $8, %xmm15, %xmm15; /* low: -1, high: 0 */ 938 939 /* construct IVs */ 940 inc_le128(%xmm0, %xmm15, %xmm13); 941 vpshufb %xmm14, %xmm0, %xmm13; 942 vmovdqu %xmm13, 14 * 16(%rax); 943 inc_le128(%xmm0, %xmm15, %xmm13); 944 vpshufb %xmm14, %xmm0, %xmm13; 945 vmovdqu %xmm13, 13 * 16(%rax); 946 inc_le128(%xmm0, %xmm15, %xmm13); 947 vpshufb %xmm14, %xmm0, %xmm12; 948 inc_le128(%xmm0, %xmm15, %xmm13); 949 vpshufb %xmm14, %xmm0, %xmm11; 950 inc_le128(%xmm0, %xmm15, %xmm13); 951 vpshufb %xmm14, %xmm0, %xmm10; 952 inc_le128(%xmm0, %xmm15, %xmm13); 953 vpshufb %xmm14, %xmm0, %xmm9; 954 inc_le128(%xmm0, %xmm15, %xmm13); 955 vpshufb %xmm14, %xmm0, %xmm8; 956 inc_le128(%xmm0, %xmm15, %xmm13); 957 vpshufb %xmm14, %xmm0, %xmm7; 958 inc_le128(%xmm0, %xmm15, %xmm13); 959 vpshufb %xmm14, %xmm0, %xmm6; 960 inc_le128(%xmm0, %xmm15, %xmm13); 961 vpshufb %xmm14, %xmm0, %xmm5; 962 inc_le128(%xmm0, %xmm15, %xmm13); 963 vpshufb %xmm14, %xmm0, %xmm4; 964 inc_le128(%xmm0, %xmm15, %xmm13); 965 vpshufb %xmm14, %xmm0, %xmm3; 966 inc_le128(%xmm0, %xmm15, %xmm13); 967 vpshufb %xmm14, %xmm0, %xmm2; 968 inc_le128(%xmm0, %xmm15, %xmm13); 969 vpshufb %xmm14, %xmm0, %xmm1; 970 inc_le128(%xmm0, %xmm15, %xmm13); 971 vmovdqa %xmm0, %xmm13; 972 vpshufb %xmm14, %xmm0, %xmm0; 973 inc_le128(%xmm13, %xmm15, %xmm14); 974 vpshufb .Lbswap128_mask rRIP, %xmm13, %xmm13; /* le => be */ 975 vmovdqu %xmm13, (%rcx); 976 977 /* inpack16_pre: */ 978 vmovq (key_table)(CTX), %xmm15; 979 vpshufb .Lpack_bswap rRIP, %xmm15, %xmm15; 980 vpxor %xmm0, %xmm15, %xmm0; 981 vpxor %xmm1, %xmm15, %xmm1; 982 vpxor %xmm2, %xmm15, %xmm2; 983 vpxor %xmm3, %xmm15, %xmm3; 984 vpxor %xmm4, %xmm15, %xmm4; 985 vpxor %xmm5, %xmm15, %xmm5; 986 vpxor %xmm6, %xmm15, %xmm6; 987 vpxor %xmm7, %xmm15, %xmm7; 988 vpxor %xmm8, %xmm15, %xmm8; 989 vpxor %xmm9, %xmm15, %xmm9; 990 vpxor %xmm10, %xmm15, %xmm10; 991 vpxor %xmm11, %xmm15, %xmm11; 992 vpxor %xmm12, %xmm15, %xmm12; 993 vpxor 13 * 16(%rax), %xmm15, %xmm13; 994 vpxor 14 * 16(%rax), %xmm15, %xmm14; 995 vpxor 15 * 16(%rax), %xmm15, %xmm15; 996 997 call __camellia_enc_blk16; 998 999 vpxor 0 * 16(%rdx), %xmm7, %xmm7; 1000 vpxor 1 * 16(%rdx), %xmm6, %xmm6; 1001 vpxor 2 * 16(%rdx), %xmm5, %xmm5; 1002 vpxor 3 * 16(%rdx), %xmm4, %xmm4; 1003 vpxor 4 * 16(%rdx), %xmm3, %xmm3; 1004 vpxor 5 * 16(%rdx), %xmm2, %xmm2; 1005 vpxor 6 * 16(%rdx), %xmm1, %xmm1; 1006 vpxor 7 * 16(%rdx), %xmm0, %xmm0; 1007 vpxor 8 * 16(%rdx), %xmm15, %xmm15; 1008 vpxor 9 * 16(%rdx), %xmm14, %xmm14; 1009 vpxor 10 * 16(%rdx), %xmm13, %xmm13; 1010 vpxor 11 * 16(%rdx), %xmm12, 
%xmm12; 1011 vpxor 12 * 16(%rdx), %xmm11, %xmm11; 1012 vpxor 13 * 16(%rdx), %xmm10, %xmm10; 1013 vpxor 14 * 16(%rdx), %xmm9, %xmm9; 1014 vpxor 15 * 16(%rdx), %xmm8, %xmm8; 1015 1016 write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0, 1017 %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9, 1018 %xmm8, %rsi); 1019 1020 vzeroall; 1021 1022 leave; 1023 CFI_LEAVE(); 1024 ret; 1025 CFI_ENDPROC(); 1026ELF(.size _gcry_camellia_aesni_avx_ctr_enc,.-_gcry_camellia_aesni_avx_ctr_enc;) 1027 1028.align 8 1029.globl _gcry_camellia_aesni_avx_cbc_dec 1030ELF(.type _gcry_camellia_aesni_avx_cbc_dec,@function;) 1031 1032_gcry_camellia_aesni_avx_cbc_dec: 1033 /* input: 1034 * %rdi: ctx, CTX 1035 * %rsi: dst (16 blocks) 1036 * %rdx: src (16 blocks) 1037 * %rcx: iv 1038 */ 1039 CFI_STARTPROC(); 1040 1041 pushq %rbp; 1042 CFI_PUSH(%rbp); 1043 movq %rsp, %rbp; 1044 CFI_DEF_CFA_REGISTER(%rbp); 1045 1046 vzeroupper; 1047 1048 movq %rcx, %r9; 1049 1050 cmpl $128, key_bitlength(CTX); 1051 movl $32, %r8d; 1052 movl $24, %eax; 1053 cmovel %eax, %r8d; /* max */ 1054 1055 inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, 1056 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, 1057 %xmm15, %rdx, (key_table)(CTX, %r8, 8)); 1058 1059 subq $(16 * 16), %rsp; 1060 andq $~31, %rsp; 1061 movq %rsp, %rax; 1062 1063 call __camellia_dec_blk16; 1064 1065 /* XOR output with IV */ 1066 vpxor (%r9), %xmm7, %xmm7; 1067 vpxor (0 * 16)(%rdx), %xmm6, %xmm6; 1068 vpxor (1 * 16)(%rdx), %xmm5, %xmm5; 1069 vpxor (2 * 16)(%rdx), %xmm4, %xmm4; 1070 vpxor (3 * 16)(%rdx), %xmm3, %xmm3; 1071 vpxor (4 * 16)(%rdx), %xmm2, %xmm2; 1072 vpxor (5 * 16)(%rdx), %xmm1, %xmm1; 1073 vpxor (6 * 16)(%rdx), %xmm0, %xmm0; 1074 vpxor (7 * 16)(%rdx), %xmm15, %xmm15; 1075 vpxor (8 * 16)(%rdx), %xmm14, %xmm14; 1076 vpxor (9 * 16)(%rdx), %xmm13, %xmm13; 1077 vpxor (10 * 16)(%rdx), %xmm12, %xmm12; 1078 vpxor (11 * 16)(%rdx), %xmm11, %xmm11; 1079 vpxor (12 * 16)(%rdx), %xmm10, %xmm10; 1080 vpxor (13 * 16)(%rdx), %xmm9, %xmm9; 1081 vpxor (14 * 16)(%rdx), %xmm8, %xmm8; 1082 movq (15 * 16 + 0)(%rdx), %r10; 1083 movq (15 * 16 + 8)(%rdx), %r11; 1084 1085 write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0, 1086 %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9, 1087 %xmm8, %rsi); 1088 1089 /* store new IV */ 1090 movq %r10, (0)(%r9); 1091 movq %r11, (8)(%r9); 1092 1093 vzeroall; 1094 1095 leave; 1096 CFI_LEAVE(); 1097 ret; 1098 CFI_ENDPROC(); 1099ELF(.size _gcry_camellia_aesni_avx_cbc_dec,.-_gcry_camellia_aesni_avx_cbc_dec;) 1100 1101.align 8 1102.globl _gcry_camellia_aesni_avx_cfb_dec 1103ELF(.type _gcry_camellia_aesni_avx_cfb_dec,@function;) 1104 1105_gcry_camellia_aesni_avx_cfb_dec: 1106 /* input: 1107 * %rdi: ctx, CTX 1108 * %rsi: dst (16 blocks) 1109 * %rdx: src (16 blocks) 1110 * %rcx: iv 1111 */ 1112 CFI_STARTPROC(); 1113 1114 pushq %rbp; 1115 CFI_PUSH(%rbp); 1116 movq %rsp, %rbp; 1117 CFI_DEF_CFA_REGISTER(%rbp); 1118 1119 vzeroupper; 1120 1121 cmpl $128, key_bitlength(CTX); 1122 movl $32, %r8d; 1123 movl $24, %eax; 1124 cmovel %eax, %r8d; /* max */ 1125 1126 subq $(16 * 16), %rsp; 1127 andq $~31, %rsp; 1128 movq %rsp, %rax; 1129 1130 /* inpack16_pre: */ 1131 vmovq (key_table)(CTX), %xmm0; 1132 vpshufb .Lpack_bswap rRIP, %xmm0, %xmm0; 1133 vpxor (%rcx), %xmm0, %xmm15; 1134 vmovdqu 15 * 16(%rdx), %xmm1; 1135 vmovdqu %xmm1, (%rcx); /* store new IV */ 1136 vpxor 0 * 16(%rdx), %xmm0, %xmm14; 1137 vpxor 1 * 16(%rdx), %xmm0, %xmm13; 1138 vpxor 2 * 16(%rdx), %xmm0, %xmm12; 1139 vpxor 3 * 16(%rdx), %xmm0, %xmm11; 
1140 vpxor 4 * 16(%rdx), %xmm0, %xmm10; 1141 vpxor 5 * 16(%rdx), %xmm0, %xmm9; 1142 vpxor 6 * 16(%rdx), %xmm0, %xmm8; 1143 vpxor 7 * 16(%rdx), %xmm0, %xmm7; 1144 vpxor 8 * 16(%rdx), %xmm0, %xmm6; 1145 vpxor 9 * 16(%rdx), %xmm0, %xmm5; 1146 vpxor 10 * 16(%rdx), %xmm0, %xmm4; 1147 vpxor 11 * 16(%rdx), %xmm0, %xmm3; 1148 vpxor 12 * 16(%rdx), %xmm0, %xmm2; 1149 vpxor 13 * 16(%rdx), %xmm0, %xmm1; 1150 vpxor 14 * 16(%rdx), %xmm0, %xmm0; 1151 1152 call __camellia_enc_blk16; 1153 1154 vpxor 0 * 16(%rdx), %xmm7, %xmm7; 1155 vpxor 1 * 16(%rdx), %xmm6, %xmm6; 1156 vpxor 2 * 16(%rdx), %xmm5, %xmm5; 1157 vpxor 3 * 16(%rdx), %xmm4, %xmm4; 1158 vpxor 4 * 16(%rdx), %xmm3, %xmm3; 1159 vpxor 5 * 16(%rdx), %xmm2, %xmm2; 1160 vpxor 6 * 16(%rdx), %xmm1, %xmm1; 1161 vpxor 7 * 16(%rdx), %xmm0, %xmm0; 1162 vpxor 8 * 16(%rdx), %xmm15, %xmm15; 1163 vpxor 9 * 16(%rdx), %xmm14, %xmm14; 1164 vpxor 10 * 16(%rdx), %xmm13, %xmm13; 1165 vpxor 11 * 16(%rdx), %xmm12, %xmm12; 1166 vpxor 12 * 16(%rdx), %xmm11, %xmm11; 1167 vpxor 13 * 16(%rdx), %xmm10, %xmm10; 1168 vpxor 14 * 16(%rdx), %xmm9, %xmm9; 1169 vpxor 15 * 16(%rdx), %xmm8, %xmm8; 1170 1171 write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0, 1172 %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9, 1173 %xmm8, %rsi); 1174 1175 vzeroall; 1176 1177 leave; 1178 CFI_LEAVE(); 1179 ret; 1180 CFI_ENDPROC(); 1181ELF(.size _gcry_camellia_aesni_avx_cfb_dec,.-_gcry_camellia_aesni_avx_cfb_dec;) 1182 1183.align 8 1184.globl _gcry_camellia_aesni_avx_ocb_enc 1185ELF(.type _gcry_camellia_aesni_avx_ocb_enc,@function;) 1186 1187_gcry_camellia_aesni_avx_ocb_enc: 1188 /* input: 1189 * %rdi: ctx, CTX 1190 * %rsi: dst (16 blocks) 1191 * %rdx: src (16 blocks) 1192 * %rcx: offset 1193 * %r8 : checksum 1194 * %r9 : L pointers (void *L[16]) 1195 */ 1196 CFI_STARTPROC(); 1197 1198 pushq %rbp; 1199 CFI_PUSH(%rbp); 1200 movq %rsp, %rbp; 1201 CFI_DEF_CFA_REGISTER(%rbp); 1202 1203 vzeroupper; 1204 1205 subq $(16 * 16 + 4 * 8), %rsp; 1206 andq $~31, %rsp; 1207 movq %rsp, %rax; 1208 1209 movq %r10, (16 * 16 + 0 * 8)(%rsp); 1210 movq %r11, (16 * 16 + 1 * 8)(%rsp); 1211 movq %r12, (16 * 16 + 2 * 8)(%rsp); 1212 movq %r13, (16 * 16 + 3 * 8)(%rsp); 1213 CFI_REG_ON_STACK(r10, 16 * 16 + 0 * 8); 1214 CFI_REG_ON_STACK(r11, 16 * 16 + 1 * 8); 1215 CFI_REG_ON_STACK(r12, 16 * 16 + 2 * 8); 1216 CFI_REG_ON_STACK(r13, 16 * 16 + 3 * 8); 1217 1218 vmovdqu (%rcx), %xmm14; 1219 vmovdqu (%r8), %xmm15; 1220 1221 /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ 1222 /* Checksum_i = Checksum_{i-1} xor P_i */ 1223 /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ 1224 1225#define OCB_INPUT(n, lreg, xreg) \ 1226 vmovdqu (n * 16)(%rdx), xreg; \ 1227 vpxor (lreg), %xmm14, %xmm14; \ 1228 vpxor xreg, %xmm15, %xmm15; \ 1229 vpxor xreg, %xmm14, xreg; \ 1230 vmovdqu %xmm14, (n * 16)(%rsi); 1231 movq (0 * 8)(%r9), %r10; 1232 movq (1 * 8)(%r9), %r11; 1233 movq (2 * 8)(%r9), %r12; 1234 movq (3 * 8)(%r9), %r13; 1235 OCB_INPUT(0, %r10, %xmm0); 1236 vmovdqu %xmm0, (15 * 16)(%rax); 1237 OCB_INPUT(1, %r11, %xmm0); 1238 vmovdqu %xmm0, (14 * 16)(%rax); 1239 OCB_INPUT(2, %r12, %xmm13); 1240 OCB_INPUT(3, %r13, %xmm12); 1241 movq (4 * 8)(%r9), %r10; 1242 movq (5 * 8)(%r9), %r11; 1243 movq (6 * 8)(%r9), %r12; 1244 movq (7 * 8)(%r9), %r13; 1245 OCB_INPUT(4, %r10, %xmm11); 1246 OCB_INPUT(5, %r11, %xmm10); 1247 OCB_INPUT(6, %r12, %xmm9); 1248 OCB_INPUT(7, %r13, %xmm8); 1249 movq (8 * 8)(%r9), %r10; 1250 movq (9 * 8)(%r9), %r11; 1251 movq (10 * 8)(%r9), %r12; 1252 movq (11 * 8)(%r9), %r13; 1253 OCB_INPUT(8, %r10, %xmm7); 1254 
OCB_INPUT(9, %r11, %xmm6); 1255 OCB_INPUT(10, %r12, %xmm5); 1256 OCB_INPUT(11, %r13, %xmm4); 1257 movq (12 * 8)(%r9), %r10; 1258 movq (13 * 8)(%r9), %r11; 1259 movq (14 * 8)(%r9), %r12; 1260 movq (15 * 8)(%r9), %r13; 1261 OCB_INPUT(12, %r10, %xmm3); 1262 OCB_INPUT(13, %r11, %xmm2); 1263 OCB_INPUT(14, %r12, %xmm1); 1264 OCB_INPUT(15, %r13, %xmm0); 1265#undef OCB_INPUT 1266 1267 vmovdqu %xmm14, (%rcx); 1268 vmovdqu %xmm15, (%r8); 1269 1270 cmpl $128, key_bitlength(CTX); 1271 movl $32, %r8d; 1272 movl $24, %r10d; 1273 cmovel %r10d, %r8d; /* max */ 1274 1275 /* inpack16_pre: */ 1276 vmovq (key_table)(CTX), %xmm15; 1277 vpshufb .Lpack_bswap rRIP, %xmm15, %xmm15; 1278 vpxor %xmm0, %xmm15, %xmm0; 1279 vpxor %xmm1, %xmm15, %xmm1; 1280 vpxor %xmm2, %xmm15, %xmm2; 1281 vpxor %xmm3, %xmm15, %xmm3; 1282 vpxor %xmm4, %xmm15, %xmm4; 1283 vpxor %xmm5, %xmm15, %xmm5; 1284 vpxor %xmm6, %xmm15, %xmm6; 1285 vpxor %xmm7, %xmm15, %xmm7; 1286 vpxor %xmm8, %xmm15, %xmm8; 1287 vpxor %xmm9, %xmm15, %xmm9; 1288 vpxor %xmm10, %xmm15, %xmm10; 1289 vpxor %xmm11, %xmm15, %xmm11; 1290 vpxor %xmm12, %xmm15, %xmm12; 1291 vpxor %xmm13, %xmm15, %xmm13; 1292 vpxor 14 * 16(%rax), %xmm15, %xmm14; 1293 vpxor 15 * 16(%rax), %xmm15, %xmm15; 1294 1295 call __camellia_enc_blk16; 1296 1297 vpxor 0 * 16(%rsi), %xmm7, %xmm7; 1298 vpxor 1 * 16(%rsi), %xmm6, %xmm6; 1299 vpxor 2 * 16(%rsi), %xmm5, %xmm5; 1300 vpxor 3 * 16(%rsi), %xmm4, %xmm4; 1301 vpxor 4 * 16(%rsi), %xmm3, %xmm3; 1302 vpxor 5 * 16(%rsi), %xmm2, %xmm2; 1303 vpxor 6 * 16(%rsi), %xmm1, %xmm1; 1304 vpxor 7 * 16(%rsi), %xmm0, %xmm0; 1305 vpxor 8 * 16(%rsi), %xmm15, %xmm15; 1306 vpxor 9 * 16(%rsi), %xmm14, %xmm14; 1307 vpxor 10 * 16(%rsi), %xmm13, %xmm13; 1308 vpxor 11 * 16(%rsi), %xmm12, %xmm12; 1309 vpxor 12 * 16(%rsi), %xmm11, %xmm11; 1310 vpxor 13 * 16(%rsi), %xmm10, %xmm10; 1311 vpxor 14 * 16(%rsi), %xmm9, %xmm9; 1312 vpxor 15 * 16(%rsi), %xmm8, %xmm8; 1313 1314 write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0, 1315 %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9, 1316 %xmm8, %rsi); 1317 1318 vzeroall; 1319 1320 movq (16 * 16 + 0 * 8)(%rsp), %r10; 1321 movq (16 * 16 + 1 * 8)(%rsp), %r11; 1322 movq (16 * 16 + 2 * 8)(%rsp), %r12; 1323 movq (16 * 16 + 3 * 8)(%rsp), %r13; 1324 CFI_RESTORE(%r10); 1325 CFI_RESTORE(%r11); 1326 CFI_RESTORE(%r12); 1327 CFI_RESTORE(%r13); 1328 1329 leave; 1330 CFI_LEAVE(); 1331 ret; 1332 CFI_ENDPROC(); 1333ELF(.size _gcry_camellia_aesni_avx_ocb_enc,.-_gcry_camellia_aesni_avx_ocb_enc;) 1334 1335.align 8 1336.globl _gcry_camellia_aesni_avx_ocb_dec 1337ELF(.type _gcry_camellia_aesni_avx_ocb_dec,@function;) 1338 1339_gcry_camellia_aesni_avx_ocb_dec: 1340 /* input: 1341 * %rdi: ctx, CTX 1342 * %rsi: dst (16 blocks) 1343 * %rdx: src (16 blocks) 1344 * %rcx: offset 1345 * %r8 : checksum 1346 * %r9 : L pointers (void *L[16]) 1347 */ 1348 CFI_STARTPROC(); 1349 1350 pushq %rbp; 1351 CFI_PUSH(%rbp); 1352 movq %rsp, %rbp; 1353 CFI_DEF_CFA_REGISTER(%rbp); 1354 1355 vzeroupper; 1356 1357 subq $(16 * 16 + 4 * 8), %rsp; 1358 andq $~31, %rsp; 1359 movq %rsp, %rax; 1360 1361 movq %r10, (16 * 16 + 0 * 8)(%rsp); 1362 movq %r11, (16 * 16 + 1 * 8)(%rsp); 1363 movq %r12, (16 * 16 + 2 * 8)(%rsp); 1364 movq %r13, (16 * 16 + 3 * 8)(%rsp); 1365 CFI_REG_ON_STACK(r10, 16 * 16 + 0 * 8); 1366 CFI_REG_ON_STACK(r11, 16 * 16 + 1 * 8); 1367 CFI_REG_ON_STACK(r12, 16 * 16 + 2 * 8); 1368 CFI_REG_ON_STACK(r13, 16 * 16 + 3 * 8); 1369 1370 vmovdqu (%rcx), %xmm15; 1371 1372 /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ 1373 /* P_i = Offset_i xor DECIPHER(K, C_i xor 
Offset_i) */ 1374 1375#define OCB_INPUT(n, lreg, xreg) \ 1376 vmovdqu (n * 16)(%rdx), xreg; \ 1377 vpxor (lreg), %xmm15, %xmm15; \ 1378 vpxor xreg, %xmm15, xreg; \ 1379 vmovdqu %xmm15, (n * 16)(%rsi); 1380 movq (0 * 8)(%r9), %r10; 1381 movq (1 * 8)(%r9), %r11; 1382 movq (2 * 8)(%r9), %r12; 1383 movq (3 * 8)(%r9), %r13; 1384 OCB_INPUT(0, %r10, %xmm0); 1385 vmovdqu %xmm0, (15 * 16)(%rax); 1386 OCB_INPUT(1, %r11, %xmm14); 1387 OCB_INPUT(2, %r12, %xmm13); 1388 OCB_INPUT(3, %r13, %xmm12); 1389 movq (4 * 8)(%r9), %r10; 1390 movq (5 * 8)(%r9), %r11; 1391 movq (6 * 8)(%r9), %r12; 1392 movq (7 * 8)(%r9), %r13; 1393 OCB_INPUT(4, %r10, %xmm11); 1394 OCB_INPUT(5, %r11, %xmm10); 1395 OCB_INPUT(6, %r12, %xmm9); 1396 OCB_INPUT(7, %r13, %xmm8); 1397 movq (8 * 8)(%r9), %r10; 1398 movq (9 * 8)(%r9), %r11; 1399 movq (10 * 8)(%r9), %r12; 1400 movq (11 * 8)(%r9), %r13; 1401 OCB_INPUT(8, %r10, %xmm7); 1402 OCB_INPUT(9, %r11, %xmm6); 1403 OCB_INPUT(10, %r12, %xmm5); 1404 OCB_INPUT(11, %r13, %xmm4); 1405 movq (12 * 8)(%r9), %r10; 1406 movq (13 * 8)(%r9), %r11; 1407 movq (14 * 8)(%r9), %r12; 1408 movq (15 * 8)(%r9), %r13; 1409 OCB_INPUT(12, %r10, %xmm3); 1410 OCB_INPUT(13, %r11, %xmm2); 1411 OCB_INPUT(14, %r12, %xmm1); 1412 OCB_INPUT(15, %r13, %xmm0); 1413#undef OCB_INPUT 1414 1415 vmovdqu %xmm15, (%rcx); 1416 1417 movq %r8, %r10; 1418 1419 cmpl $128, key_bitlength(CTX); 1420 movl $32, %r8d; 1421 movl $24, %r9d; 1422 cmovel %r9d, %r8d; /* max */ 1423 1424 /* inpack16_pre: */ 1425 vmovq (key_table)(CTX, %r8, 8), %xmm15; 1426 vpshufb .Lpack_bswap rRIP, %xmm15, %xmm15; 1427 vpxor %xmm0, %xmm15, %xmm0; 1428 vpxor %xmm1, %xmm15, %xmm1; 1429 vpxor %xmm2, %xmm15, %xmm2; 1430 vpxor %xmm3, %xmm15, %xmm3; 1431 vpxor %xmm4, %xmm15, %xmm4; 1432 vpxor %xmm5, %xmm15, %xmm5; 1433 vpxor %xmm6, %xmm15, %xmm6; 1434 vpxor %xmm7, %xmm15, %xmm7; 1435 vpxor %xmm8, %xmm15, %xmm8; 1436 vpxor %xmm9, %xmm15, %xmm9; 1437 vpxor %xmm10, %xmm15, %xmm10; 1438 vpxor %xmm11, %xmm15, %xmm11; 1439 vpxor %xmm12, %xmm15, %xmm12; 1440 vpxor %xmm13, %xmm15, %xmm13; 1441 vpxor %xmm14, %xmm15, %xmm14; 1442 vpxor 15 * 16(%rax), %xmm15, %xmm15; 1443 1444 call __camellia_dec_blk16; 1445 1446 vpxor 0 * 16(%rsi), %xmm7, %xmm7; 1447 vpxor 1 * 16(%rsi), %xmm6, %xmm6; 1448 vpxor 2 * 16(%rsi), %xmm5, %xmm5; 1449 vpxor 3 * 16(%rsi), %xmm4, %xmm4; 1450 vpxor 4 * 16(%rsi), %xmm3, %xmm3; 1451 vpxor 5 * 16(%rsi), %xmm2, %xmm2; 1452 vpxor 6 * 16(%rsi), %xmm1, %xmm1; 1453 vpxor 7 * 16(%rsi), %xmm0, %xmm0; 1454 vmovdqu %xmm7, (7 * 16)(%rax); 1455 vpxor 8 * 16(%rsi), %xmm15, %xmm15; 1456 vpxor 9 * 16(%rsi), %xmm14, %xmm14; 1457 vpxor 10 * 16(%rsi), %xmm13, %xmm13; 1458 vpxor 11 * 16(%rsi), %xmm12, %xmm12; 1459 vpxor 12 * 16(%rsi), %xmm11, %xmm11; 1460 vpxor 13 * 16(%rsi), %xmm10, %xmm10; 1461 vpxor 14 * 16(%rsi), %xmm9, %xmm9; 1462 vpxor 15 * 16(%rsi), %xmm8, %xmm8; 1463 1464 /* Checksum_i = Checksum_{i-1} xor P_i */ 1465 1466 vpxor (%r10), %xmm7, %xmm7; 1467 vpxor %xmm6, %xmm7, %xmm7; 1468 vpxor %xmm5, %xmm7, %xmm7; 1469 vpxor %xmm4, %xmm7, %xmm7; 1470 vpxor %xmm3, %xmm7, %xmm7; 1471 vpxor %xmm2, %xmm7, %xmm7; 1472 vpxor %xmm1, %xmm7, %xmm7; 1473 vpxor %xmm0, %xmm7, %xmm7; 1474 vpxor %xmm15, %xmm7, %xmm7; 1475 vpxor %xmm14, %xmm7, %xmm7; 1476 vpxor %xmm13, %xmm7, %xmm7; 1477 vpxor %xmm12, %xmm7, %xmm7; 1478 vpxor %xmm11, %xmm7, %xmm7; 1479 vpxor %xmm10, %xmm7, %xmm7; 1480 vpxor %xmm9, %xmm7, %xmm7; 1481 vpxor %xmm8, %xmm7, %xmm7; 1482 vmovdqu %xmm7, (%r10); 1483 vmovdqu (7 * 16)(%rax), %xmm7; 1484 1485 write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0, 
1486 %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9, 1487 %xmm8, %rsi); 1488 1489 vzeroall; 1490 1491 movq (16 * 16 + 0 * 8)(%rsp), %r10; 1492 movq (16 * 16 + 1 * 8)(%rsp), %r11; 1493 movq (16 * 16 + 2 * 8)(%rsp), %r12; 1494 movq (16 * 16 + 3 * 8)(%rsp), %r13; 1495 CFI_RESTORE(%r10); 1496 CFI_RESTORE(%r11); 1497 CFI_RESTORE(%r12); 1498 CFI_RESTORE(%r13); 1499 1500 leave; 1501 CFI_LEAVE(); 1502 ret; 1503 CFI_ENDPROC(); 1504ELF(.size _gcry_camellia_aesni_avx_ocb_dec,.-_gcry_camellia_aesni_avx_ocb_dec;) 1505 1506.align 8 1507.globl _gcry_camellia_aesni_avx_ocb_auth 1508ELF(.type _gcry_camellia_aesni_avx_ocb_auth,@function;) 1509 1510_gcry_camellia_aesni_avx_ocb_auth: 1511 /* input: 1512 * %rdi: ctx, CTX 1513 * %rsi: abuf (16 blocks) 1514 * %rdx: offset 1515 * %rcx: checksum 1516 * %r8 : L pointers (void *L[16]) 1517 */ 1518 CFI_STARTPROC(); 1519 1520 pushq %rbp; 1521 CFI_PUSH(%rbp); 1522 movq %rsp, %rbp; 1523 CFI_DEF_CFA_REGISTER(%rbp); 1524 1525 vzeroupper; 1526 1527 subq $(16 * 16 + 4 * 8), %rsp; 1528 andq $~31, %rsp; 1529 movq %rsp, %rax; 1530 1531 movq %r10, (16 * 16 + 0 * 8)(%rsp); 1532 movq %r11, (16 * 16 + 1 * 8)(%rsp); 1533 movq %r12, (16 * 16 + 2 * 8)(%rsp); 1534 movq %r13, (16 * 16 + 3 * 8)(%rsp); 1535 CFI_REG_ON_STACK(r10, 16 * 16 + 0 * 8); 1536 CFI_REG_ON_STACK(r11, 16 * 16 + 1 * 8); 1537 CFI_REG_ON_STACK(r12, 16 * 16 + 2 * 8); 1538 CFI_REG_ON_STACK(r13, 16 * 16 + 3 * 8); 1539 1540 vmovdqu (%rdx), %xmm15; 1541 1542 /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ 1543 /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ 1544 1545#define OCB_INPUT(n, lreg, xreg) \ 1546 vmovdqu (n * 16)(%rsi), xreg; \ 1547 vpxor (lreg), %xmm15, %xmm15; \ 1548 vpxor xreg, %xmm15, xreg; 1549 1550 movq (0 * 8)(%r8), %r10; 1551 movq (1 * 8)(%r8), %r11; 1552 movq (2 * 8)(%r8), %r12; 1553 movq (3 * 8)(%r8), %r13; 1554 OCB_INPUT(0, %r10, %xmm0); 1555 vmovdqu %xmm0, (15 * 16)(%rax); 1556 OCB_INPUT(1, %r11, %xmm14); 1557 OCB_INPUT(2, %r12, %xmm13); 1558 OCB_INPUT(3, %r13, %xmm12); 1559 movq (4 * 8)(%r8), %r10; 1560 movq (5 * 8)(%r8), %r11; 1561 movq (6 * 8)(%r8), %r12; 1562 movq (7 * 8)(%r8), %r13; 1563 OCB_INPUT(4, %r10, %xmm11); 1564 OCB_INPUT(5, %r11, %xmm10); 1565 OCB_INPUT(6, %r12, %xmm9); 1566 OCB_INPUT(7, %r13, %xmm8); 1567 movq (8 * 8)(%r8), %r10; 1568 movq (9 * 8)(%r8), %r11; 1569 movq (10 * 8)(%r8), %r12; 1570 movq (11 * 8)(%r8), %r13; 1571 OCB_INPUT(8, %r10, %xmm7); 1572 OCB_INPUT(9, %r11, %xmm6); 1573 OCB_INPUT(10, %r12, %xmm5); 1574 OCB_INPUT(11, %r13, %xmm4); 1575 movq (12 * 8)(%r8), %r10; 1576 movq (13 * 8)(%r8), %r11; 1577 movq (14 * 8)(%r8), %r12; 1578 movq (15 * 8)(%r8), %r13; 1579 OCB_INPUT(12, %r10, %xmm3); 1580 OCB_INPUT(13, %r11, %xmm2); 1581 OCB_INPUT(14, %r12, %xmm1); 1582 OCB_INPUT(15, %r13, %xmm0); 1583#undef OCB_INPUT 1584 1585 cmpl $128, key_bitlength(CTX); 1586 movl $32, %r8d; 1587 movl $24, %r10d; 1588 cmovel %r10d, %r8d; /* max */ 1589 1590 vmovdqu %xmm15, (%rdx); 1591 1592 movq %rcx, %r10; 1593 1594 /* inpack16_pre: */ 1595 vmovq (key_table)(CTX), %xmm15; 1596 vpshufb .Lpack_bswap rRIP, %xmm15, %xmm15; 1597 vpxor %xmm0, %xmm15, %xmm0; 1598 vpxor %xmm1, %xmm15, %xmm1; 1599 vpxor %xmm2, %xmm15, %xmm2; 1600 vpxor %xmm3, %xmm15, %xmm3; 1601 vpxor %xmm4, %xmm15, %xmm4; 1602 vpxor %xmm5, %xmm15, %xmm5; 1603 vpxor %xmm6, %xmm15, %xmm6; 1604 vpxor %xmm7, %xmm15, %xmm7; 1605 vpxor %xmm8, %xmm15, %xmm8; 1606 vpxor %xmm9, %xmm15, %xmm9; 1607 vpxor %xmm10, %xmm15, %xmm10; 1608 vpxor %xmm11, %xmm15, %xmm11; 1609 vpxor %xmm12, %xmm15, %xmm12; 1610 vpxor %xmm13, %xmm15, %xmm13; 1611 
vpxor %xmm14, %xmm15, %xmm14; 1612 vpxor 15 * 16(%rax), %xmm15, %xmm15; 1613 1614 call __camellia_enc_blk16; 1615 1616 vpxor %xmm7, %xmm6, %xmm6; 1617 vpxor %xmm5, %xmm4, %xmm4; 1618 vpxor %xmm3, %xmm2, %xmm2; 1619 vpxor %xmm1, %xmm0, %xmm0; 1620 vpxor %xmm15, %xmm14, %xmm14; 1621 vpxor %xmm13, %xmm12, %xmm12; 1622 vpxor %xmm11, %xmm10, %xmm10; 1623 vpxor %xmm9, %xmm8, %xmm8; 1624 1625 vpxor %xmm6, %xmm4, %xmm4; 1626 vpxor %xmm2, %xmm0, %xmm0; 1627 vpxor %xmm14, %xmm12, %xmm12; 1628 vpxor %xmm10, %xmm8, %xmm8; 1629 1630 vpxor %xmm4, %xmm0, %xmm0; 1631 vpxor %xmm12, %xmm8, %xmm8; 1632 1633 vpxor %xmm0, %xmm8, %xmm0; 1634 vpxor (%r10), %xmm0, %xmm0; 1635 vmovdqu %xmm0, (%r10); 1636 1637 vzeroall; 1638 1639 movq (16 * 16 + 0 * 8)(%rsp), %r10; 1640 movq (16 * 16 + 1 * 8)(%rsp), %r11; 1641 movq (16 * 16 + 2 * 8)(%rsp), %r12; 1642 movq (16 * 16 + 3 * 8)(%rsp), %r13; 1643 CFI_RESTORE(%r10); 1644 CFI_RESTORE(%r11); 1645 CFI_RESTORE(%r12); 1646 CFI_RESTORE(%r13); 1647 1648 leave; 1649 CFI_LEAVE(); 1650 ret; 1651 CFI_ENDPROC(); 1652ELF(.size _gcry_camellia_aesni_avx_ocb_auth,.-_gcry_camellia_aesni_avx_ocb_auth;) 1653 1654/* 1655 * IN: 1656 * ab: 64-bit AB state 1657 * cd: 64-bit CD state 1658 */ 1659#define camellia_f(ab, x, t0, t1, t2, t3, t4, inv_shift_row, sbox4mask, \ 1660 _0f0f0f0fmask, pre_s1lo_mask, pre_s1hi_mask, key) \ 1661 vmovq key, t0; \ 1662 vpxor x, x, t3; \ 1663 \ 1664 vpxor ab, t0, x; \ 1665 \ 1666 /* \ 1667 * S-function with AES subbytes \ 1668 */ \ 1669 \ 1670 /* input rotation for sbox4 (<<< 1) */ \ 1671 vpand x, sbox4mask, t0; \ 1672 vpandn x, sbox4mask, x; \ 1673 vpaddw t0, t0, t1; \ 1674 vpsrlw $7, t0, t0; \ 1675 vpor t0, t1, t0; \ 1676 vpand sbox4mask, t0, t0; \ 1677 vpor t0, x, x; \ 1678 \ 1679 vmovdqa .Lpost_tf_lo_s1 rRIP, t0; \ 1680 vmovdqa .Lpost_tf_hi_s1 rRIP, t1; \ 1681 \ 1682 /* prefilter sboxes */ \ 1683 filter_8bit(x, pre_s1lo_mask, pre_s1hi_mask, _0f0f0f0fmask, t2); \ 1684 \ 1685 /* AES subbytes + AES shift rows + AES inv shift rows */ \ 1686 vaesenclast t3, x, x; \ 1687 \ 1688 /* postfilter sboxes */ \ 1689 filter_8bit(x, t0, t1, _0f0f0f0fmask, t2); \ 1690 \ 1691 /* output rotation for sbox2 (<<< 1) */ \ 1692 /* output rotation for sbox3 (>>> 1) */ \ 1693 vpshufb inv_shift_row, x, t1; \ 1694 vpshufb .Lsp0044440444044404mask rRIP, x, t4; \ 1695 vpshufb .Lsp1110111010011110mask rRIP, x, x; \ 1696 vpaddb t1, t1, t2; \ 1697 vpsrlw $7, t1, t0; \ 1698 vpsllw $7, t1, t3; \ 1699 vpor t0, t2, t0; \ 1700 vpsrlw $1, t1, t1; \ 1701 vpshufb .Lsp0222022222000222mask rRIP, t0, t0; \ 1702 vpor t1, t3, t1; \ 1703 \ 1704 vpxor x, t4, t4; \ 1705 vpshufb .Lsp3033303303303033mask rRIP, t1, t1; \ 1706 vpxor t4, t0, t0; \ 1707 vpxor t1, t0, t0; \ 1708 vpsrldq $8, t0, x; \ 1709 vpxor t0, x, x; 1710 1711#define vec_rol128(in, out, nrol, t0) \ 1712 vpshufd $0x4e, in, out; \ 1713 vpsllq $(nrol), in, t0; \ 1714 vpsrlq $(64-(nrol)), out, out; \ 1715 vpaddd t0, out, out; 1716 1717#define vec_ror128(in, out, nror, t0) \ 1718 vpshufd $0x4e, in, out; \ 1719 vpsrlq $(nror), in, t0; \ 1720 vpsllq $(64-(nror)), out, out; \ 1721 vpaddd t0, out, out; 1722 1723 1724.align 16 1725.Linv_shift_row_and_unpcklbw: 1726 .byte 0x00, 0xff, 0x0d, 0xff, 0x0a, 0xff, 0x07, 0xff 1727 .byte 0x04, 0xff, 0x01, 0xff, 0x0e, 0xff, 0x0b, 0xff 1728.Lsp0044440444044404mask: 1729 .long 0xffff0404, 0x0404ff04; 1730 .long 0x0d0dff0d, 0x0d0dff0d; 1731.Lsp1110111010011110mask: 1732 .long 0x000000ff, 0x000000ff; 1733 .long 0x0bffff0b, 0x0b0b0bff; 1734.Lsp0222022222000222mask: 1735 .long 0xff060606, 0xff060606; 1736 .long 0x0c0cffff, 
0xff0c0c0c; 1737.Lsp3033303303303033mask: 1738 .long 0x04ff0404, 0x04ff0404; 1739 .long 0xff0a0aff, 0x0aff0a0a; 1740.Lsbox4_input_mask: 1741 .byte 0x00, 0xff, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00; 1742.Lsigma1: 1743 .long 0x3BCC908B, 0xA09E667F; 1744.Lsigma2: 1745 .long 0x4CAA73B2, 0xB67AE858; 1746.Lsigma3: 1747 .long 0xE94F82BE, 0xC6EF372F; 1748.Lsigma4: 1749 .long 0xF1D36F1C, 0x54FF53A5; 1750.Lsigma5: 1751 .long 0xDE682D1D, 0x10E527FA; 1752.Lsigma6: 1753 .long 0xB3E6C1FD, 0xB05688C2; 1754 1755 1756.align 8 1757ELF(.type __camellia_avx_setup128,@function;) 1758__camellia_avx_setup128: 1759 /* input: 1760 * %rdi: ctx, CTX; subkey storage at key_table(CTX) 1761 * %xmm0: key 1762 */ 1763 CFI_STARTPROC(); 1764 1765#define cmll_sub(n, ctx) (key_table+((n)*8))(ctx) 1766#define KL128 %xmm0 1767#define KA128 %xmm2 1768 1769 vpshufb .Lbswap128_mask rRIP, KL128, KL128; 1770 1771 vmovdqa .Linv_shift_row_and_unpcklbw rRIP, %xmm11; 1772 vmovq .Lsbox4_input_mask rRIP, %xmm12; 1773 vbroadcastss .L0f0f0f0f rRIP, %xmm13; 1774 vmovdqa .Lpre_tf_lo_s1 rRIP, %xmm14; 1775 vmovdqa .Lpre_tf_hi_s1 rRIP, %xmm15; 1776 1777 /* 1778 * Generate KA 1779 */ 1780 vpsrldq $8, KL128, %xmm2; 1781 vmovdqa KL128, %xmm3; 1782 vpslldq $8, %xmm3, %xmm3; 1783 vpsrldq $8, %xmm3, %xmm3; 1784 1785 camellia_f(%xmm2, %xmm4, %xmm1, 1786 %xmm5, %xmm6, %xmm7, %xmm8, 1787 %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma1 rRIP); 1788 vpxor %xmm4, %xmm3, %xmm3; 1789 camellia_f(%xmm3, %xmm2, %xmm1, 1790 %xmm5, %xmm6, %xmm7, %xmm8, 1791 %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma2 rRIP); 1792 camellia_f(%xmm2, %xmm3, %xmm1, 1793 %xmm5, %xmm6, %xmm7, %xmm8, 1794 %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma3 rRIP); 1795 vpxor %xmm4, %xmm3, %xmm3; 1796 camellia_f(%xmm3, %xmm4, %xmm1, 1797 %xmm5, %xmm6, %xmm7, %xmm8, 1798 %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma4 rRIP); 1799 1800 vpslldq $8, %xmm3, %xmm3; 1801 vpxor %xmm4, %xmm2, %xmm2; 1802 vpsrldq $8, %xmm3, %xmm3; 1803 vpslldq $8, %xmm2, KA128; 1804 vpor %xmm3, KA128, KA128; 1805 1806 /* 1807 * Generate subkeys 1808 */ 1809 vmovdqu KA128, cmll_sub(24, CTX); 1810 vec_rol128(KL128, %xmm3, 15, %xmm15); 1811 vec_rol128(KA128, %xmm4, 15, %xmm15); 1812 vec_rol128(KA128, %xmm5, 30, %xmm15); 1813 vec_rol128(KL128, %xmm6, 45, %xmm15); 1814 vec_rol128(KA128, %xmm7, 45, %xmm15); 1815 vec_rol128(KL128, %xmm8, 60, %xmm15); 1816 vec_rol128(KA128, %xmm9, 60, %xmm15); 1817 vec_ror128(KL128, %xmm10, 128-77, %xmm15); 1818 1819 /* absorb kw2 to other subkeys */ 1820 vpslldq $8, KL128, %xmm15; 1821 vpsrldq $8, %xmm15, %xmm15; 1822 vpxor %xmm15, KA128, KA128; 1823 vpxor %xmm15, %xmm3, %xmm3; 1824 vpxor %xmm15, %xmm4, %xmm4; 1825 1826 /* subl(1) ^= subr(1) & ~subr(9); */ 1827 vpandn %xmm15, %xmm5, %xmm13; 1828 vpslldq $12, %xmm13, %xmm13; 1829 vpsrldq $8, %xmm13, %xmm13; 1830 vpxor %xmm13, %xmm15, %xmm15; 1831 /* dw = subl(1) & subl(9), subr(1) ^= CAMELLIA_RL1(dw); */ 1832 vpand %xmm15, %xmm5, %xmm14; 1833 vpslld $1, %xmm14, %xmm11; 1834 vpsrld $31, %xmm14, %xmm14; 1835 vpaddd %xmm11, %xmm14, %xmm14; 1836 vpslldq $8, %xmm14, %xmm14; 1837 vpsrldq $12, %xmm14, %xmm14; 1838 vpxor %xmm14, %xmm15, %xmm15; 1839 1840 vpxor %xmm15, %xmm6, %xmm6; 1841 vpxor %xmm15, %xmm8, %xmm8; 1842 vpxor %xmm15, %xmm9, %xmm9; 1843 1844 /* subl(1) ^= subr(1) & ~subr(17); */ 1845 vpandn %xmm15, %xmm10, %xmm13; 1846 vpslldq $12, %xmm13, %xmm13; 1847 vpsrldq $8, %xmm13, %xmm13; 1848 vpxor %xmm13, %xmm15, %xmm15; 1849 /* dw = subl(1) & subl(17), subr(1) ^= CAMELLIA_RL1(dw); */ 1850 vpand %xmm15, %xmm10, %xmm14; 1851 vpslld $1, 
%xmm14, %xmm11; 1852 vpsrld $31, %xmm14, %xmm14; 1853 vpaddd %xmm11, %xmm14, %xmm14; 1854 vpslldq $8, %xmm14, %xmm14; 1855 vpsrldq $12, %xmm14, %xmm14; 1856 vpxor %xmm14, %xmm15, %xmm15; 1857 1858 vpshufd $0x1b, KL128, KL128; 1859 vpshufd $0x1b, KA128, KA128; 1860 vpshufd $0x1b, %xmm3, %xmm3; 1861 vpshufd $0x1b, %xmm4, %xmm4; 1862 vpshufd $0x1b, %xmm5, %xmm5; 1863 vpshufd $0x1b, %xmm6, %xmm6; 1864 vpshufd $0x1b, %xmm7, %xmm7; 1865 vpshufd $0x1b, %xmm8, %xmm8; 1866 vpshufd $0x1b, %xmm9, %xmm9; 1867 vpshufd $0x1b, %xmm10, %xmm10; 1868 1869 vmovdqu KL128, cmll_sub(0, CTX); 1870 vpshufd $0x1b, KL128, KL128; 1871 vmovdqu KA128, cmll_sub(2, CTX); 1872 vmovdqu %xmm3, cmll_sub(4, CTX); 1873 vmovdqu %xmm4, cmll_sub(6, CTX); 1874 vmovdqu %xmm5, cmll_sub(8, CTX); 1875 vmovdqu %xmm6, cmll_sub(10, CTX); 1876 vpsrldq $8, %xmm8, %xmm8; 1877 vmovq %xmm7, cmll_sub(12, CTX); 1878 vmovq %xmm8, cmll_sub(13, CTX); 1879 vmovdqu %xmm9, cmll_sub(14, CTX); 1880 vmovdqu %xmm10, cmll_sub(16, CTX); 1881 1882 vmovdqu cmll_sub(24, CTX), KA128; 1883 1884 vec_ror128(KL128, %xmm3, 128 - 94, %xmm7); 1885 vec_ror128(KA128, %xmm4, 128 - 94, %xmm7); 1886 vec_ror128(KL128, %xmm5, 128 - 111, %xmm7); 1887 vec_ror128(KA128, %xmm6, 128 - 111, %xmm7); 1888 1889 vpxor %xmm15, %xmm3, %xmm3; 1890 vpxor %xmm15, %xmm4, %xmm4; 1891 vpxor %xmm15, %xmm5, %xmm5; 1892 vpslldq $8, %xmm15, %xmm15; 1893 vpxor %xmm15, %xmm6, %xmm6; 1894 1895 /* absorb kw4 to other subkeys */ 1896 vpslldq $8, %xmm6, %xmm15; 1897 vpxor %xmm15, %xmm5, %xmm5; 1898 vpxor %xmm15, %xmm4, %xmm4; 1899 vpxor %xmm15, %xmm3, %xmm3; 1900 1901 /* subl(25) ^= subr(25) & ~subr(16); */ 1902 vpshufd $0x1b, cmll_sub(16, CTX), %xmm10; 1903 vpandn %xmm15, %xmm10, %xmm13; 1904 vpslldq $4, %xmm13, %xmm13; 1905 vpxor %xmm13, %xmm15, %xmm15; 1906 /* dw = subl(25) & subl(16), subr(25) ^= CAMELLIA_RL1(dw); */ 1907 vpand %xmm15, %xmm10, %xmm14; 1908 vpslld $1, %xmm14, %xmm11; 1909 vpsrld $31, %xmm14, %xmm14; 1910 vpaddd %xmm11, %xmm14, %xmm14; 1911 vpsrldq $12, %xmm14, %xmm14; 1912 vpslldq $8, %xmm14, %xmm14; 1913 vpxor %xmm14, %xmm15, %xmm15; 1914 1915 vpshufd $0x1b, %xmm3, %xmm3; 1916 vpshufd $0x1b, %xmm4, %xmm4; 1917 vpshufd $0x1b, %xmm5, %xmm5; 1918 vpshufd $0x1b, %xmm6, %xmm6; 1919 1920 vmovdqu %xmm3, cmll_sub(18, CTX); 1921 vmovdqu %xmm4, cmll_sub(20, CTX); 1922 vmovdqu %xmm5, cmll_sub(22, CTX); 1923 vmovdqu %xmm6, cmll_sub(24, CTX); 1924 1925 vpshufd $0x1b, cmll_sub(14, CTX), %xmm3; 1926 vpshufd $0x1b, cmll_sub(12, CTX), %xmm4; 1927 vpshufd $0x1b, cmll_sub(10, CTX), %xmm5; 1928 vpshufd $0x1b, cmll_sub(8, CTX), %xmm6; 1929 1930 vpxor %xmm15, %xmm3, %xmm3; 1931 vpxor %xmm15, %xmm4, %xmm4; 1932 vpxor %xmm15, %xmm5, %xmm5; 1933 1934 /* subl(25) ^= subr(25) & ~subr(8); */ 1935 vpandn %xmm15, %xmm6, %xmm13; 1936 vpslldq $4, %xmm13, %xmm13; 1937 vpxor %xmm13, %xmm15, %xmm15; 1938 /* dw = subl(25) & subl(8), subr(25) ^= CAMELLIA_RL1(dw); */ 1939 vpand %xmm15, %xmm6, %xmm14; 1940 vpslld $1, %xmm14, %xmm11; 1941 vpsrld $31, %xmm14, %xmm14; 1942 vpaddd %xmm11, %xmm14, %xmm14; 1943 vpsrldq $12, %xmm14, %xmm14; 1944 vpslldq $8, %xmm14, %xmm14; 1945 vpxor %xmm14, %xmm15, %xmm15; 1946 1947 vpshufd $0x1b, %xmm3, %xmm3; 1948 vpshufd $0x1b, %xmm4, %xmm4; 1949 vpshufd $0x1b, %xmm5, %xmm5; 1950 1951 vmovdqu %xmm3, cmll_sub(14, CTX); 1952 vmovdqu %xmm4, cmll_sub(12, CTX); 1953 vmovdqu %xmm5, cmll_sub(10, CTX); 1954 1955 vpshufd $0x1b, cmll_sub(6, CTX), %xmm6; 1956 vpshufd $0x1b, cmll_sub(4, CTX), %xmm4; 1957 vpshufd $0x1b, cmll_sub(2, CTX), %xmm2; 1958 vpshufd $0x1b, cmll_sub(0, CTX), %xmm0; 1959 1960 
vpxor %xmm15, %xmm6, %xmm6; 1961 vpxor %xmm15, %xmm4, %xmm4; 1962 vpxor %xmm15, %xmm2, %xmm2; 1963 vpxor %xmm15, %xmm0, %xmm0; 1964 1965 vpshufd $0x1b, %xmm6, %xmm6; 1966 vpshufd $0x1b, %xmm4, %xmm4; 1967 vpshufd $0x1b, %xmm2, %xmm2; 1968 vpshufd $0x1b, %xmm0, %xmm0; 1969 1970 vpsrldq $8, %xmm2, %xmm3; 1971 vpsrldq $8, %xmm4, %xmm5; 1972 vpsrldq $8, %xmm6, %xmm7; 1973 1974 /* 1975 * key XOR is end of F-function. 1976 */ 1977 vpxor %xmm2, %xmm0, %xmm0; 1978 vpxor %xmm4, %xmm2, %xmm2; 1979 1980 vmovq %xmm0, cmll_sub(0, CTX); 1981 vmovq %xmm3, cmll_sub(2, CTX); 1982 vpxor %xmm5, %xmm3, %xmm3; 1983 vpxor %xmm6, %xmm4, %xmm4; 1984 vpxor %xmm7, %xmm5, %xmm5; 1985 vmovq %xmm2, cmll_sub(3, CTX); 1986 vmovq %xmm3, cmll_sub(4, CTX); 1987 vmovq %xmm4, cmll_sub(5, CTX); 1988 vmovq %xmm5, cmll_sub(6, CTX); 1989 1990 vmovq cmll_sub(7, CTX), %xmm7; 1991 vmovq cmll_sub(8, CTX), %xmm8; 1992 vmovq cmll_sub(9, CTX), %xmm9; 1993 vmovq cmll_sub(10, CTX), %xmm10; 1994 /* tl = subl(10) ^ (subr(10) & ~subr(8)); */ 1995 vpandn %xmm10, %xmm8, %xmm15; 1996 vpsrldq $4, %xmm15, %xmm15; 1997 vpxor %xmm15, %xmm10, %xmm0; 1998 /* dw = tl & subl(8), tr = subr(10) ^ CAMELLIA_RL1(dw); */ 1999 vpand %xmm8, %xmm0, %xmm15; 2000 vpslld $1, %xmm15, %xmm14; 2001 vpsrld $31, %xmm15, %xmm15; 2002 vpaddd %xmm14, %xmm15, %xmm15; 2003 vpslldq $12, %xmm15, %xmm15; 2004 vpsrldq $8, %xmm15, %xmm15; 2005 vpxor %xmm15, %xmm0, %xmm0; 2006 2007 vpxor %xmm0, %xmm6, %xmm6; 2008 vmovq %xmm6, cmll_sub(7, CTX); 2009 2010 vmovq cmll_sub(11, CTX), %xmm11; 2011 vmovq cmll_sub(12, CTX), %xmm12; 2012 vmovq cmll_sub(13, CTX), %xmm13; 2013 vmovq cmll_sub(14, CTX), %xmm14; 2014 vmovq cmll_sub(15, CTX), %xmm15; 2015 /* tl = subl(7) ^ (subr(7) & ~subr(9)); */ 2016 vpandn %xmm7, %xmm9, %xmm1; 2017 vpsrldq $4, %xmm1, %xmm1; 2018 vpxor %xmm1, %xmm7, %xmm0; 2019 /* dw = tl & subl(9), tr = subr(7) ^ CAMELLIA_RL1(dw); */ 2020 vpand %xmm9, %xmm0, %xmm1; 2021 vpslld $1, %xmm1, %xmm2; 2022 vpsrld $31, %xmm1, %xmm1; 2023 vpaddd %xmm2, %xmm1, %xmm1; 2024 vpslldq $12, %xmm1, %xmm1; 2025 vpsrldq $8, %xmm1, %xmm1; 2026 vpxor %xmm1, %xmm0, %xmm0; 2027 2028 vpxor %xmm11, %xmm0, %xmm0; 2029 vpxor %xmm12, %xmm10, %xmm10; 2030 vpxor %xmm13, %xmm11, %xmm11; 2031 vpxor %xmm14, %xmm12, %xmm12; 2032 vpxor %xmm15, %xmm13, %xmm13; 2033 vmovq %xmm0, cmll_sub(10, CTX); 2034 vmovq %xmm10, cmll_sub(11, CTX); 2035 vmovq %xmm11, cmll_sub(12, CTX); 2036 vmovq %xmm12, cmll_sub(13, CTX); 2037 vmovq %xmm13, cmll_sub(14, CTX); 2038 2039 vmovq cmll_sub(16, CTX), %xmm6; 2040 vmovq cmll_sub(17, CTX), %xmm7; 2041 vmovq cmll_sub(18, CTX), %xmm8; 2042 vmovq cmll_sub(19, CTX), %xmm9; 2043 vmovq cmll_sub(20, CTX), %xmm10; 2044 /* tl = subl(18) ^ (subr(18) & ~subr(16)); */ 2045 vpandn %xmm8, %xmm6, %xmm1; 2046 vpsrldq $4, %xmm1, %xmm1; 2047 vpxor %xmm1, %xmm8, %xmm0; 2048 /* dw = tl & subl(16), tr = subr(18) ^ CAMELLIA_RL1(dw); */ 2049 vpand %xmm6, %xmm0, %xmm1; 2050 vpslld $1, %xmm1, %xmm2; 2051 vpsrld $31, %xmm1, %xmm1; 2052 vpaddd %xmm2, %xmm1, %xmm1; 2053 vpslldq $12, %xmm1, %xmm1; 2054 vpsrldq $8, %xmm1, %xmm1; 2055 vpxor %xmm1, %xmm0, %xmm0; 2056 2057 vpxor %xmm14, %xmm0, %xmm0; 2058 vmovq %xmm0, cmll_sub(15, CTX); 2059 2060 /* tl = subl(15) ^ (subr(15) & ~subr(17)); */ 2061 vpandn %xmm15, %xmm7, %xmm1; 2062 vpsrldq $4, %xmm1, %xmm1; 2063 vpxor %xmm1, %xmm15, %xmm0; 2064 /* dw = tl & subl(17), tr = subr(15) ^ CAMELLIA_RL1(dw); */ 2065 vpand %xmm7, %xmm0, %xmm1; 2066 vpslld $1, %xmm1, %xmm2; 2067 vpsrld $31, %xmm1, %xmm1; 2068 vpaddd %xmm2, %xmm1, %xmm1; 2069 vpslldq $12, %xmm1, %xmm1; 2070 
        vpsrldq $8, %xmm1, %xmm1;
        vpxor %xmm1, %xmm0, %xmm0;

        vmovq cmll_sub(21, CTX), %xmm1;
        vmovq cmll_sub(22, CTX), %xmm2;
        vmovq cmll_sub(23, CTX), %xmm3;
        vmovq cmll_sub(24, CTX), %xmm4;

        vpxor %xmm9, %xmm0, %xmm0;
        vpxor %xmm10, %xmm8, %xmm8;
        vpxor %xmm1, %xmm9, %xmm9;
        vpxor %xmm2, %xmm10, %xmm10;
        vpxor %xmm3, %xmm1, %xmm1;
        vpxor %xmm4, %xmm3, %xmm3;

        vmovq %xmm0, cmll_sub(18, CTX);
        vmovq %xmm8, cmll_sub(19, CTX);
        vmovq %xmm9, cmll_sub(20, CTX);
        vmovq %xmm10, cmll_sub(21, CTX);
        vmovq %xmm1, cmll_sub(22, CTX);
        vmovq %xmm2, cmll_sub(23, CTX);
        vmovq %xmm3, cmll_sub(24, CTX);

        /* kw2 and kw4 are unused now. */
        movq $0, cmll_sub(1, CTX);
        movq $0, cmll_sub(25, CTX);

        vzeroall;

        ret;
        CFI_ENDPROC();
ELF(.size __camellia_avx_setup128,.-__camellia_avx_setup128;)

.align 8
ELF(.type __camellia_avx_setup256,@function;)

__camellia_avx_setup256:
        /* input:
         * %rdi: ctx, CTX; subkey storage at key_table(CTX)
         * %xmm0 & %xmm1: key
         */
        CFI_STARTPROC();

#define KL128 %xmm0
#define KR128 %xmm1
#define KA128 %xmm2
#define KB128 %xmm3

        vpshufb .Lbswap128_mask rRIP, KL128, KL128;
        vpshufb .Lbswap128_mask rRIP, KR128, KR128;

        vmovdqa .Linv_shift_row_and_unpcklbw rRIP, %xmm11;
        vmovq .Lsbox4_input_mask rRIP, %xmm12;
        vbroadcastss .L0f0f0f0f rRIP, %xmm13;
        vmovdqa .Lpre_tf_lo_s1 rRIP, %xmm14;
        vmovdqa .Lpre_tf_hi_s1 rRIP, %xmm15;

        /*
         * Generate KA
         */
        vpxor KL128, KR128, %xmm3;
        vpsrldq $8, KR128, %xmm6;
        vpsrldq $8, %xmm3, %xmm2;
        vpslldq $8, %xmm3, %xmm3;
        vpsrldq $8, %xmm3, %xmm3;

        camellia_f(%xmm2, %xmm4, %xmm5,
                   %xmm7, %xmm8, %xmm9, %xmm10,
                   %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma1 rRIP);
        vpxor %xmm4, %xmm3, %xmm3;
        camellia_f(%xmm3, %xmm2, %xmm5,
                   %xmm7, %xmm8, %xmm9, %xmm10,
                   %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma2 rRIP);
        vpxor %xmm6, %xmm2, %xmm2;
        camellia_f(%xmm2, %xmm3, %xmm5,
                   %xmm7, %xmm8, %xmm9, %xmm10,
                   %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma3 rRIP);
        vpxor %xmm4, %xmm3, %xmm3;
        vpxor KR128, %xmm3, %xmm3;
        camellia_f(%xmm3, %xmm4, %xmm5,
                   %xmm7, %xmm8, %xmm9, %xmm10,
                   %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma4 rRIP);

        vpslldq $8, %xmm3, %xmm3;
        vpxor %xmm4, %xmm2, %xmm2;
        vpsrldq $8, %xmm3, %xmm3;
        vpslldq $8, %xmm2, KA128;
        vpor %xmm3, KA128, KA128;

        /*
         * Generate KB
         */
        vpxor KA128, KR128, %xmm3;
        vpsrldq $8, %xmm3, %xmm4;
        vpslldq $8, %xmm3, %xmm3;
        vpsrldq $8, %xmm3, %xmm3;

        camellia_f(%xmm4, %xmm5, %xmm6,
                   %xmm7, %xmm8, %xmm9, %xmm10,
                   %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma5 rRIP);
        vpxor %xmm5, %xmm3, %xmm3;

        camellia_f(%xmm3, %xmm5, %xmm6,
                   %xmm7, %xmm8, %xmm9, %xmm10,
                   %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma6 rRIP);
        vpslldq $8, %xmm3, %xmm3;
        vpxor %xmm5, %xmm4, %xmm4;
        vpsrldq $8, %xmm3, %xmm3;
        vpslldq $8, %xmm4, %xmm4;
        vpor %xmm3, %xmm4, KB128;

        /*
         * Generate subkeys
         */
        vmovdqu KB128, cmll_sub(32, CTX);
        vec_rol128(KR128, %xmm4, 15, %xmm15);
        vec_rol128(KA128, %xmm5, 15, %xmm15);
        vec_rol128(KR128, %xmm6, 30, %xmm15);
        vec_rol128(KB128, %xmm7, 30, %xmm15);
        vec_rol128(KL128, %xmm8, 45, %xmm15);
        vec_rol128(KA128, %xmm9, 45, %xmm15);
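        /* Added note: the 60-bit rotations of KL, KR and KB follow; the
         * remaining 77/94/111-bit amounts used by the 256-bit key schedule
         * are produced further below as vec_ror128(..., 128 - n, ...). */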
        vec_rol128(KL128, %xmm10, 60, %xmm15);
        vec_rol128(KR128, %xmm11, 60, %xmm15);
        vec_rol128(KB128, %xmm12, 60, %xmm15);

        /* absorb kw2 to other subkeys */
        vpslldq $8, KL128, %xmm15;
        vpsrldq $8, %xmm15, %xmm15;
        vpxor %xmm15, KB128, KB128;
        vpxor %xmm15, %xmm4, %xmm4;
        vpxor %xmm15, %xmm5, %xmm5;

        /* subl(1) ^= subr(1) & ~subr(9); */
        vpandn %xmm15, %xmm6, %xmm13;
        vpslldq $12, %xmm13, %xmm13;
        vpsrldq $8, %xmm13, %xmm13;
        vpxor %xmm13, %xmm15, %xmm15;
        /* dw = subl(1) & subl(9), subr(1) ^= CAMELLIA_RL1(dw); */
        vpand %xmm15, %xmm6, %xmm14;
        vpslld $1, %xmm14, %xmm13;
        vpsrld $31, %xmm14, %xmm14;
        vpaddd %xmm13, %xmm14, %xmm14;
        vpslldq $8, %xmm14, %xmm14;
        vpsrldq $12, %xmm14, %xmm14;
        vpxor %xmm14, %xmm15, %xmm15;

        vpxor %xmm15, %xmm7, %xmm7;
        vpxor %xmm15, %xmm8, %xmm8;
        vpxor %xmm15, %xmm9, %xmm9;

        vpshufd $0x1b, KL128, KL128;
        vpshufd $0x1b, KB128, KB128;
        vpshufd $0x1b, %xmm4, %xmm4;
        vpshufd $0x1b, %xmm5, %xmm5;
        vpshufd $0x1b, %xmm6, %xmm6;
        vpshufd $0x1b, %xmm7, %xmm7;
        vpshufd $0x1b, %xmm8, %xmm8;
        vpshufd $0x1b, %xmm9, %xmm9;

        vmovdqu KL128, cmll_sub(0, CTX);
        vpshufd $0x1b, KL128, KL128;
        vmovdqu KB128, cmll_sub(2, CTX);
        vmovdqu %xmm4, cmll_sub(4, CTX);
        vmovdqu %xmm5, cmll_sub(6, CTX);
        vmovdqu %xmm6, cmll_sub(8, CTX);
        vmovdqu %xmm7, cmll_sub(10, CTX);
        vmovdqu %xmm8, cmll_sub(12, CTX);
        vmovdqu %xmm9, cmll_sub(14, CTX);

        vmovdqu cmll_sub(32, CTX), KB128;

        /* subl(1) ^= subr(1) & ~subr(17); */
        vpandn %xmm15, %xmm10, %xmm13;
        vpslldq $12, %xmm13, %xmm13;
        vpsrldq $8, %xmm13, %xmm13;
        vpxor %xmm13, %xmm15, %xmm15;
        /* dw = subl(1) & subl(17), subr(1) ^= CAMELLIA_RL1(dw); */
        vpand %xmm15, %xmm10, %xmm14;
        vpslld $1, %xmm14, %xmm13;
        vpsrld $31, %xmm14, %xmm14;
        vpaddd %xmm13, %xmm14, %xmm14;
        vpslldq $8, %xmm14, %xmm14;
        vpsrldq $12, %xmm14, %xmm14;
        vpxor %xmm14, %xmm15, %xmm15;

        vpxor %xmm15, %xmm11, %xmm11;
        vpxor %xmm15, %xmm12, %xmm12;

        vec_ror128(KL128, %xmm4, 128-77, %xmm14);
        vec_ror128(KA128, %xmm5, 128-77, %xmm14);
        vec_ror128(KR128, %xmm6, 128-94, %xmm14);
        vec_ror128(KA128, %xmm7, 128-94, %xmm14);
        vec_ror128(KL128, %xmm8, 128-111, %xmm14);
        vec_ror128(KB128, %xmm9, 128-111, %xmm14);

        vpxor %xmm15, %xmm4, %xmm4;

        vpshufd $0x1b, %xmm10, %xmm10;
        vpshufd $0x1b, %xmm11, %xmm11;
        vpshufd $0x1b, %xmm12, %xmm12;
        vpshufd $0x1b, %xmm4, %xmm4;

        vmovdqu %xmm10, cmll_sub(16, CTX);
        vmovdqu %xmm11, cmll_sub(18, CTX);
        vmovdqu %xmm12, cmll_sub(20, CTX);
        vmovdqu %xmm4, cmll_sub(22, CTX);

        /* subl(1) ^= subr(1) & ~subr(25); */
        vpandn %xmm15, %xmm5, %xmm13;
        vpslldq $12, %xmm13, %xmm13;
        vpsrldq $8, %xmm13, %xmm13;
        vpxor %xmm13, %xmm15, %xmm15;
        /* dw = subl(1) & subl(25), subr(1) ^= CAMELLIA_RL1(dw); */
        vpand %xmm15, %xmm5, %xmm14;
        vpslld $1, %xmm14, %xmm13;
        vpsrld $31, %xmm14, %xmm14;
        vpaddd %xmm13, %xmm14, %xmm14;
        vpslldq $8, %xmm14, %xmm14;
        vpsrldq $12, %xmm14, %xmm14;
        vpxor %xmm14, %xmm15, %xmm15;

        vpxor %xmm15, %xmm6, %xmm6;
        vpxor %xmm15, %xmm7, %xmm7;
        vpxor %xmm15, %xmm8, %xmm8;
        vpslldq $8, %xmm15, %xmm15;
        vpxor %xmm15, %xmm9, %xmm9;

        /* absorb kw4 to other subkeys */
        vpslldq $8, %xmm9, %xmm15;
        vpxor %xmm15, %xmm8, %xmm8;
        vpxor %xmm15, %xmm7, %xmm7;
        vpxor %xmm15, %xmm6, %xmm6;

        /* subl(33) ^= subr(33) & ~subr(24); */
        vpandn %xmm15, %xmm5, %xmm14;
        vpslldq $4, %xmm14, %xmm14;
        vpxor %xmm14, %xmm15, %xmm15;
        /* dw = subl(33) & subl(24), subr(33) ^= CAMELLIA_RL1(dw); */
        vpand %xmm15, %xmm5, %xmm14;
        vpslld $1, %xmm14, %xmm13;
        vpsrld $31, %xmm14, %xmm14;
        vpaddd %xmm13, %xmm14, %xmm14;
        vpsrldq $12, %xmm14, %xmm14;
        vpslldq $8, %xmm14, %xmm14;
        vpxor %xmm14, %xmm15, %xmm15;

        vpshufd $0x1b, %xmm5, %xmm5;
        vpshufd $0x1b, %xmm6, %xmm6;
        vpshufd $0x1b, %xmm7, %xmm7;
        vpshufd $0x1b, %xmm8, %xmm8;
        vpshufd $0x1b, %xmm9, %xmm9;

        vmovdqu %xmm5, cmll_sub(24, CTX);
        vmovdqu %xmm6, cmll_sub(26, CTX);
        vmovdqu %xmm7, cmll_sub(28, CTX);
        vmovdqu %xmm8, cmll_sub(30, CTX);
        vmovdqu %xmm9, cmll_sub(32, CTX);

        vpshufd $0x1b, cmll_sub(22, CTX), %xmm0;
        vpshufd $0x1b, cmll_sub(20, CTX), %xmm1;
        vpshufd $0x1b, cmll_sub(18, CTX), %xmm2;
        vpshufd $0x1b, cmll_sub(16, CTX), %xmm3;
        vpshufd $0x1b, cmll_sub(14, CTX), %xmm4;
        vpshufd $0x1b, cmll_sub(12, CTX), %xmm5;
        vpshufd $0x1b, cmll_sub(10, CTX), %xmm6;
        vpshufd $0x1b, cmll_sub(8, CTX), %xmm7;

        vpxor %xmm15, %xmm0, %xmm0;
        vpxor %xmm15, %xmm1, %xmm1;
        vpxor %xmm15, %xmm2, %xmm2;

        /* subl(33) ^= subr(33) & ~subr(16); */
        vpandn %xmm15, %xmm3, %xmm14;
        vpslldq $4, %xmm14, %xmm14;
        vpxor %xmm14, %xmm15, %xmm15;
        /* dw = subl(33) & subl(16), subr(33) ^= CAMELLIA_RL1(dw); */
        vpand %xmm15, %xmm3, %xmm14;
        vpslld $1, %xmm14, %xmm13;
        vpsrld $31, %xmm14, %xmm14;
        vpaddd %xmm13, %xmm14, %xmm14;
        vpsrldq $12, %xmm14, %xmm14;
        vpslldq $8, %xmm14, %xmm14;
        vpxor %xmm14, %xmm15, %xmm15;

        vpxor %xmm15, %xmm4, %xmm4;
        vpxor %xmm15, %xmm5, %xmm5;
        vpxor %xmm15, %xmm6, %xmm6;

        vpshufd $0x1b, %xmm0, %xmm0;
        vpshufd $0x1b, %xmm1, %xmm1;
        vpshufd $0x1b, %xmm2, %xmm2;
        vpshufd $0x1b, %xmm4, %xmm4;
        vpshufd $0x1b, %xmm5, %xmm5;
        vpshufd $0x1b, %xmm6, %xmm6;

        vmovdqu %xmm0, cmll_sub(22, CTX);
        vmovdqu %xmm1, cmll_sub(20, CTX);
        vmovdqu %xmm2, cmll_sub(18, CTX);
        vmovdqu %xmm4, cmll_sub(14, CTX);
        vmovdqu %xmm5, cmll_sub(12, CTX);
        vmovdqu %xmm6, cmll_sub(10, CTX);

        vpshufd $0x1b, cmll_sub(6, CTX), %xmm6;
        vpshufd $0x1b, cmll_sub(4, CTX), %xmm4;
        vpshufd $0x1b, cmll_sub(2, CTX), %xmm2;
        vpshufd $0x1b, cmll_sub(0, CTX), %xmm0;

        /* subl(33) ^= subr(33) & ~subr(8); */
        vpandn %xmm15, %xmm7, %xmm14;
        vpslldq $4, %xmm14, %xmm14;
        vpxor %xmm14, %xmm15, %xmm15;
        /* dw = subl(33) & subl(8), subr(33) ^= CAMELLIA_RL1(dw); */
        vpand %xmm15, %xmm7, %xmm14;
        vpslld $1, %xmm14, %xmm13;
        vpsrld $31, %xmm14, %xmm14;
        vpaddd %xmm13, %xmm14, %xmm14;
        vpsrldq $12, %xmm14, %xmm14;
        vpslldq $8, %xmm14, %xmm14;
        vpxor %xmm14, %xmm15, %xmm15;

        vpxor %xmm15, %xmm6, %xmm6;
        vpxor %xmm15, %xmm4, %xmm4;
        vpxor %xmm15, %xmm2, %xmm2;
        vpxor %xmm15, %xmm0, %xmm0;

        vpshufd $0x1b, %xmm6, %xmm6;
        vpshufd $0x1b, %xmm4, %xmm4;
        vpshufd $0x1b, %xmm2, %xmm2;
        vpshufd $0x1b, %xmm0, %xmm0;

        vpsrldq $8, %xmm2, %xmm3;
        vpsrldq $8, %xmm4, %xmm5;
        vpsrldq $8, %xmm6, %xmm7;

        /*
         * key XOR is end of F-function.
         */
        vpxor %xmm2, %xmm0, %xmm0;
        vpxor %xmm4, %xmm2, %xmm2;

        vmovq %xmm0, cmll_sub(0, CTX);
        vmovq %xmm3, cmll_sub(2, CTX);
        vpxor %xmm5, %xmm3, %xmm3;
        vpxor %xmm6, %xmm4, %xmm4;
        vpxor %xmm7, %xmm5, %xmm5;
        vmovq %xmm2, cmll_sub(3, CTX);
        vmovq %xmm3, cmll_sub(4, CTX);
        vmovq %xmm4, cmll_sub(5, CTX);
        vmovq %xmm5, cmll_sub(6, CTX);

        vmovq cmll_sub(7, CTX), %xmm7;
        vmovq cmll_sub(8, CTX), %xmm8;
        vmovq cmll_sub(9, CTX), %xmm9;
        vmovq cmll_sub(10, CTX), %xmm10;
        /* tl = subl(10) ^ (subr(10) & ~subr(8)); */
        vpandn %xmm10, %xmm8, %xmm15;
        vpsrldq $4, %xmm15, %xmm15;
        vpxor %xmm15, %xmm10, %xmm0;
        /* dw = tl & subl(8), tr = subr(10) ^ CAMELLIA_RL1(dw); */
        vpand %xmm8, %xmm0, %xmm15;
        vpslld $1, %xmm15, %xmm14;
        vpsrld $31, %xmm15, %xmm15;
        vpaddd %xmm14, %xmm15, %xmm15;
        vpslldq $12, %xmm15, %xmm15;
        vpsrldq $8, %xmm15, %xmm15;
        vpxor %xmm15, %xmm0, %xmm0;

        vpxor %xmm0, %xmm6, %xmm6;
        vmovq %xmm6, cmll_sub(7, CTX);

        vmovq cmll_sub(11, CTX), %xmm11;
        vmovq cmll_sub(12, CTX), %xmm12;
        vmovq cmll_sub(13, CTX), %xmm13;
        vmovq cmll_sub(14, CTX), %xmm14;
        vmovq cmll_sub(15, CTX), %xmm15;
        /* tl = subl(7) ^ (subr(7) & ~subr(9)); */
        vpandn %xmm7, %xmm9, %xmm1;
        vpsrldq $4, %xmm1, %xmm1;
        vpxor %xmm1, %xmm7, %xmm0;
        /* dw = tl & subl(9), tr = subr(7) ^ CAMELLIA_RL1(dw); */
        vpand %xmm9, %xmm0, %xmm1;
        vpslld $1, %xmm1, %xmm2;
        vpsrld $31, %xmm1, %xmm1;
        vpaddd %xmm2, %xmm1, %xmm1;
        vpslldq $12, %xmm1, %xmm1;
        vpsrldq $8, %xmm1, %xmm1;
        vpxor %xmm1, %xmm0, %xmm0;

        vpxor %xmm11, %xmm0, %xmm0;
        vpxor %xmm12, %xmm10, %xmm10;
        vpxor %xmm13, %xmm11, %xmm11;
        vpxor %xmm14, %xmm12, %xmm12;
        vpxor %xmm15, %xmm13, %xmm13;
        vmovq %xmm0, cmll_sub(10, CTX);
        vmovq %xmm10, cmll_sub(11, CTX);
        vmovq %xmm11, cmll_sub(12, CTX);
        vmovq %xmm12, cmll_sub(13, CTX);
        vmovq %xmm13, cmll_sub(14, CTX);

        vmovq cmll_sub(16, CTX), %xmm6;
        vmovq cmll_sub(17, CTX), %xmm7;
        vmovq cmll_sub(18, CTX), %xmm8;
        vmovq cmll_sub(19, CTX), %xmm9;
        vmovq cmll_sub(20, CTX), %xmm10;
        /* tl = subl(18) ^ (subr(18) & ~subr(16)); */
        vpandn %xmm8, %xmm6, %xmm1;
        vpsrldq $4, %xmm1, %xmm1;
        vpxor %xmm1, %xmm8, %xmm0;
        /* dw = tl & subl(16), tr = subr(18) ^ CAMELLIA_RL1(dw); */
        vpand %xmm6, %xmm0, %xmm1;
        vpslld $1, %xmm1, %xmm2;
        vpsrld $31, %xmm1, %xmm1;
        vpaddd %xmm2, %xmm1, %xmm1;
        vpslldq $12, %xmm1, %xmm1;
        vpsrldq $8, %xmm1, %xmm1;
        vpxor %xmm1, %xmm0, %xmm0;

        vpxor %xmm14, %xmm0, %xmm0;
        vmovq %xmm0, cmll_sub(15, CTX);

        /* tl = subl(15) ^ (subr(15) & ~subr(17)); */
        vpandn %xmm15, %xmm7, %xmm1;
        vpsrldq $4, %xmm1, %xmm1;
        vpxor %xmm1, %xmm15, %xmm0;
        /* dw = tl & subl(17), tr = subr(15) ^ CAMELLIA_RL1(dw); */
        vpand %xmm7, %xmm0, %xmm1;
        vpslld $1, %xmm1, %xmm2;
        vpsrld $31, %xmm1, %xmm1;
        vpaddd %xmm2, %xmm1, %xmm1;
        vpslldq $12, %xmm1, %xmm1;
        vpsrldq $8, %xmm1, %xmm1;
        vpxor %xmm1, %xmm0, %xmm0;

        vmovq cmll_sub(21, CTX), %xmm1;
        vmovq cmll_sub(22, CTX), %xmm2;
        vmovq cmll_sub(23, CTX), %xmm3;
        vmovq cmll_sub(24, CTX), %xmm4;

        vpxor %xmm9, %xmm0, %xmm0;
        vpxor %xmm10, %xmm8, %xmm8;
        vpxor %xmm1, %xmm9, %xmm9;
        vpxor %xmm2, %xmm10, %xmm10;
        vpxor %xmm3, %xmm1, %xmm1;

        vmovq %xmm0, cmll_sub(18, CTX);
        vmovq %xmm8, cmll_sub(19, CTX);
        vmovq %xmm9, cmll_sub(20, CTX);
        vmovq %xmm10, cmll_sub(21, CTX);
        vmovq %xmm1, cmll_sub(22, CTX);

        vmovq cmll_sub(25, CTX), %xmm5;
        vmovq cmll_sub(26, CTX), %xmm6;
        vmovq cmll_sub(27, CTX), %xmm7;
        vmovq cmll_sub(28, CTX), %xmm8;
        vmovq cmll_sub(29, CTX), %xmm9;
        vmovq cmll_sub(30, CTX), %xmm10;
        vmovq cmll_sub(31, CTX), %xmm11;
        vmovq cmll_sub(32, CTX), %xmm12;

        /* tl = subl(26) ^ (subr(26) & ~subr(24)); */
        vpandn %xmm6, %xmm4, %xmm15;
        vpsrldq $4, %xmm15, %xmm15;
        vpxor %xmm15, %xmm6, %xmm0;
        /* dw = tl & subl(24), tr = subr(26) ^ CAMELLIA_RL1(dw); */
        vpand %xmm4, %xmm0, %xmm15;
        vpslld $1, %xmm15, %xmm14;
        vpsrld $31, %xmm15, %xmm15;
        vpaddd %xmm14, %xmm15, %xmm15;
        vpslldq $12, %xmm15, %xmm15;
        vpsrldq $8, %xmm15, %xmm15;
        vpxor %xmm15, %xmm0, %xmm0;

        vpxor %xmm0, %xmm2, %xmm2;
        vmovq %xmm2, cmll_sub(23, CTX);

        /* tl = subl(23) ^ (subr(23) & ~subr(25)); */
        vpandn %xmm3, %xmm5, %xmm15;
        vpsrldq $4, %xmm15, %xmm15;
        vpxor %xmm15, %xmm3, %xmm0;
        /* dw = tl & subl(25), tr = subr(23) ^ CAMELLIA_RL1(dw); */
        vpand %xmm5, %xmm0, %xmm15;
        vpslld $1, %xmm15, %xmm14;
        vpsrld $31, %xmm15, %xmm15;
        vpaddd %xmm14, %xmm15, %xmm15;
        vpslldq $12, %xmm15, %xmm15;
        vpsrldq $8, %xmm15, %xmm15;
        vpxor %xmm15, %xmm0, %xmm0;

        vpxor %xmm7, %xmm0, %xmm0;
        vpxor %xmm8, %xmm6, %xmm6;
        vpxor %xmm9, %xmm7, %xmm7;
        vpxor %xmm10, %xmm8, %xmm8;
        vpxor %xmm11, %xmm9, %xmm9;
        vpxor %xmm12, %xmm11, %xmm11;

        vmovq %xmm0, cmll_sub(26, CTX);
        vmovq %xmm6, cmll_sub(27, CTX);
        vmovq %xmm7, cmll_sub(28, CTX);
        vmovq %xmm8, cmll_sub(29, CTX);
        vmovq %xmm9, cmll_sub(30, CTX);
        vmovq %xmm10, cmll_sub(31, CTX);
        vmovq %xmm11, cmll_sub(32, CTX);

        /* kw2 and kw4 are unused now. */
        movq $0, cmll_sub(1, CTX);
        movq $0, cmll_sub(33, CTX);

        vzeroall;

        ret;
        CFI_ENDPROC();
ELF(.size __camellia_avx_setup256,.-__camellia_avx_setup256;)

.align 8
.globl _gcry_camellia_aesni_avx_keygen
ELF(.type _gcry_camellia_aesni_avx_keygen,@function;)

_gcry_camellia_aesni_avx_keygen:
        /* input:
         * %rdi: ctx, CTX
         * %rsi: key
         * %rdx: keylen
         */
        CFI_STARTPROC();

        vzeroupper;

        vmovdqu (%rsi), %xmm0;
        cmpl $24, %edx;
        jb __camellia_avx_setup128;
        je .Lprepare_key192;

        vmovdqu 16(%rsi), %xmm1;
        jmp __camellia_avx_setup256;

.Lprepare_key192:
        vpcmpeqd %xmm2, %xmm2, %xmm2;
        vmovq 16(%rsi), %xmm1;

        vpxor %xmm1, %xmm2, %xmm2;
        vpslldq $8, %xmm2, %xmm2;
        vpor %xmm2, %xmm1, %xmm1;

        jmp __camellia_avx_setup256;
        CFI_ENDPROC();
ELF(.size _gcry_camellia_aesni_avx_keygen,.-_gcry_camellia_aesni_avx_keygen;)

#endif /*defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX_SUPPORT)*/
#endif /*__x86_64*/