/* rijndael-amd64.S - AMD64 assembly implementation of AES cipher
 *
 * Copyright (C) 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This file is part of Libgcrypt.
 *
 * Libgcrypt is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * Libgcrypt is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
 */

#ifdef __x86_64
#include <config.h>
#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && defined(USE_AES)

#include "asm-common-amd64.h"

.text

/* table macros */
#define E0	(0)
#define Es0	(1)
#define Esize	4
#define Essize	4

#define D0	(0)
#define Ds0	(4 * 256)
#define Dsize	4
#define Dssize	1

/* register macros */
#define CTX	%rdi
#define RTAB	%r12

#define RA	%rax
#define RB	%rbx
#define RC	%rcx
#define RD	%rdx

#define RAd	%eax
#define RBd	%ebx
#define RCd	%ecx
#define RDd	%edx

#define RAbl	%al
#define RBbl	%bl
#define RCbl	%cl
#define RDbl	%dl

#define RAbh	%ah
#define RBbh	%bh
#define RCbh	%ch
#define RDbh	%dh

#define RNA	%r8
#define RNB	%r9
#define RNC	%r10
#define RND	%r11

#define RNAd	%r8d
#define RNBd	%r9d
#define RNCd	%r10d
#define RNDd	%r11d

#define RT0	%rbp
#define RT1	%rsi

#define RT0d	%ebp
#define RT1d	%esi

/* helper macros */
#define do16bit(op, source, tablemul, table1, dest1, table2, dest2, t0, t1) \
	movzbl source ## bl,			t0 ## d; \
	movzbl source ## bh,			t1 ## d; \
	op ## l table1(RTAB,t0,tablemul),	dest1 ## d; \
	op ## l table2(RTAB,t1,tablemul),	dest2 ## d;

#define do16bit_shr(shf, op, source, tablemul, table1, dest1, table2, dest2, t0, t1) \
	movzbl source ## bl,			t0 ## d; \
	movzbl source ## bh,			t1 ## d; \
	shrl $(shf),				source ## d; \
	op ## l table1(RTAB,t0,tablemul),	dest1 ## d; \
	op ## l table2(RTAB,t1,tablemul),	dest2 ## d;

#define last_do16bit(op, source, tablemul, table1, dest1, table2, dest2, t0, t1) \
	movzbl source ## bl,			t0 ## d; \
	movzbl source ## bh,			t1 ## d; \
	movzbl table1(RTAB,t0,tablemul),	t0 ## d; \
	movzbl table2(RTAB,t1,tablemul),	t1 ## d; \
	op ## l t0 ## d,			dest1 ## d; \
	op ## l t1 ## d,			dest2 ## d;

#define last_do16bit_shr(shf, op, source, tablemul, table1, dest1, table2, dest2, t0, t1) \
	movzbl source ## bl,			t0 ## d; \
	movzbl source ## bh,			t1 ## d; \
	shrl $(shf),				source ## d; \
	movzbl table1(RTAB,t0,tablemul),	t0 ## d; \
	movzbl table2(RTAB,t1,tablemul),	t1 ## d; \
	op ## l t0 ## d,			dest1 ## d; \
	op ## l t1 ## d,			dest2 ## d;

/***********************************************************************
 * AMD64 assembly implementation of the AES cipher
 ***********************************************************************/
#define addroundkey(round, ra, rb, rc, rd) \
	xorl (((round) * 16) + 0 * 4)(CTX), ra ## d; \
	xorl (((round) * 16) + 1 * 4)(CTX), rb ## d; \
	xorl (((round) * 16) + 2 * 4)(CTX), rc ## d; \
	xorl (((round) * 16) + 3 * 4)(CTX), rd ## d;
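
/*
 * One do_encround() computes a full AES round for the four state words
 * RA..RD using 32-bit T-table lookups: SubBytes, ShiftRows and
 * MixColumns are folded into the 256-entry table at offset E0 (4-byte
 * entries, stride Esize), and the byte rotations that would otherwise
 * require four separate tables are done in registers with 'roll'.
 * AddRoundKey is the movl/xorl of the next round key from CTX.  The
 * final round omits MixColumns, so do_lastencround() instead fetches
 * the plain S-box byte, which sits at offset Es0 (= 1) inside each
 * 4-byte table entry (Essize keeps the 4-byte indexing stride).
 */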

#define do_encround(next_r) \
	do16bit_shr(16, mov, RA, Esize, E0, RNA, E0, RND, RT0, RT1); \
	do16bit(        mov, RA, Esize, E0, RNC, E0, RNB, RT0, RT1); \
	movl (((next_r) * 16) + 0 * 4)(CTX), RAd; \
	roll $8, RNDd; \
	xorl RNAd, RAd; \
	roll $8, RNCd; \
	roll $8, RNBd; \
	roll $8, RAd; \
	\
	do16bit_shr(16, xor, RD, Esize, E0, RND, E0, RNC, RT0, RT1); \
	do16bit(        xor, RD, Esize, E0, RNB, E0, RA,  RT0, RT1); \
	movl (((next_r) * 16) + 3 * 4)(CTX), RDd; \
	roll $8, RNCd; \
	xorl RNDd, RDd; \
	roll $8, RNBd; \
	roll $8, RAd; \
	roll $8, RDd; \
	\
	do16bit_shr(16, xor, RC, Esize, E0, RNC, E0, RNB, RT0, RT1); \
	do16bit(        xor, RC, Esize, E0, RA,  E0, RD,  RT0, RT1); \
	movl (((next_r) * 16) + 2 * 4)(CTX), RCd; \
	roll $8, RNBd; \
	xorl RNCd, RCd; \
	roll $8, RAd; \
	roll $8, RDd; \
	roll $8, RCd; \
	\
	do16bit_shr(16, xor, RB, Esize, E0, RNB, E0, RA, RT0, RT1); \
	do16bit(        xor, RB, Esize, E0, RD, E0, RC,  RT0, RT1); \
	movl (((next_r) * 16) + 1 * 4)(CTX), RBd; \
	roll $8, RAd; \
	xorl RNBd, RBd; \
	roll $16, RDd; \
	roll $24, RCd;

#define do_lastencround(next_r) \
	do16bit_shr(16, movzb, RA, Essize, Es0, RNA, Es0, RND, RT0, RT1); \
	do16bit(        movzb, RA, Essize, Es0, RNC, Es0, RNB, RT0, RT1); \
	movl (((next_r) * 16) + 0 * 4)(CTX), RAd; \
	roll $8, RNDd; \
	xorl RNAd, RAd; \
	roll $8, RNCd; \
	roll $8, RNBd; \
	roll $8, RAd; \
	\
	last_do16bit_shr(16, xor, RD, Essize, Es0, RND, Es0, RNC, RT0, RT1); \
	last_do16bit(        xor, RD, Essize, Es0, RNB, Es0, RA,  RT0, RT1); \
	movl (((next_r) * 16) + 3 * 4)(CTX), RDd; \
	roll $8, RNCd; \
	xorl RNDd, RDd; \
	roll $8, RNBd; \
	roll $8, RAd; \
	roll $8, RDd; \
	\
	last_do16bit_shr(16, xor, RC, Essize, Es0, RNC, Es0, RNB, RT0, RT1); \
	last_do16bit(        xor, RC, Essize, Es0, RA,  Es0, RD,  RT0, RT1); \
	movl (((next_r) * 16) + 2 * 4)(CTX), RCd; \
	roll $8, RNBd; \
	xorl RNCd, RCd; \
	roll $8, RAd; \
	roll $8, RDd; \
	roll $8, RCd; \
	\
	last_do16bit_shr(16, xor, RB, Essize, Es0, RNB, Es0, RA, RT0, RT1); \
	last_do16bit(        xor, RB, Essize, Es0, RD, Es0, RC,  RT0, RT1); \
	movl (((next_r) * 16) + 1 * 4)(CTX), RBd; \
	roll $8, RAd; \
	xorl RNBd, RBd; \
	roll $16, RDd; \
	roll $24, RCd;

#define firstencround(round) \
	addroundkey(round, RA, RB, RC, RD); \
	do_encround((round) + 1);

#define encround(round) \
	do_encround((round) + 1);

#define lastencround(round) \
	do_lastencround((round) + 1);
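
/*
 * C-side prototype sketch for the entry point below (the authoritative
 * declaration lives in the C glue code; parameter order follows the
 * "input:" comment, and the parameter names here are illustrative):
 *
 *   unsigned int _gcry_aes_amd64_encrypt_block(const void *keysched,
 *                                              unsigned char *dst,
 *                                              const unsigned char *src,
 *                                              int rounds,
 *                                              const void *enc_tables);
 *
 * The value returned in %eax (6 * 8 = 48) reports the stack bytes
 * touched, so the caller can wipe ("burn") that much stack.
 */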

.align 8
.globl _gcry_aes_amd64_encrypt_block
ELF(.type _gcry_aes_amd64_encrypt_block,@function;)

_gcry_aes_amd64_encrypt_block:
	/* input:
	 *	%rdi: keysched, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%ecx: number of rounds: 10, 12 or 14
	 *	%r8:  encryption tables
	 */
	CFI_STARTPROC();
	ENTER_SYSV_FUNC_PARAMS_5

	subq $(5 * 8), %rsp;
	CFI_ADJUST_CFA_OFFSET(5 * 8);
	movq %rsi, (0 * 8)(%rsp);
	movl %ecx, (1 * 8)(%rsp);
	movq %rbp, (2 * 8)(%rsp);
	movq %rbx, (3 * 8)(%rsp);
	movq %r12, (4 * 8)(%rsp);
	CFI_REL_OFFSET(%rbp, 2 * 8);
	CFI_REL_OFFSET(%rbx, 3 * 8);
	CFI_REL_OFFSET(%r12, 4 * 8);

	leaq (%r8), RTAB;

	/* read input block */
	movl 0 * 4(%rdx), RAd;
	movl 1 * 4(%rdx), RBd;
	movl 2 * 4(%rdx), RCd;
	movl 3 * 4(%rdx), RDd;

	firstencround(0);
	encround(1);
	encround(2);
	encround(3);
	encround(4);
	encround(5);
	encround(6);
	encround(7);
	encround(8);
	cmpl $12, (1 * 8)(%rsp);
	jnb .Lenc_not_128;
	lastencround(9);

.align 4
.Lenc_done:
	/* write output block */
	movq (0 * 8)(%rsp), %rsi;
	movl RAd, 0 * 4(%rsi);
	movl RBd, 1 * 4(%rsi);
	movl RCd, 2 * 4(%rsi);
	movl RDd, 3 * 4(%rsi);

	CFI_REMEMBER_STATE();

	movq (4 * 8)(%rsp), %r12;
	movq (3 * 8)(%rsp), %rbx;
	movq (2 * 8)(%rsp), %rbp;
	CFI_RESTORE(%r12);
	CFI_RESTORE(%rbx);
	CFI_RESTORE(%rbp);
	addq $(5 * 8), %rsp;
	CFI_ADJUST_CFA_OFFSET(-5 * 8);

	movl $(6 * 8), %eax;

	EXIT_SYSV_FUNC
	ret;

	CFI_RESTORE_STATE();
.align 4
.Lenc_not_128:
	je .Lenc_192;

	encround(9);
	encround(10);
	encround(11);
	encround(12);
	lastencround(13);

	jmp .Lenc_done;

.align 4
.Lenc_192:
	encround(9);
	encround(10);
	lastencround(11);

	jmp .Lenc_done;
	CFI_ENDPROC();
ELF(.size _gcry_aes_amd64_encrypt_block,.-_gcry_aes_amd64_encrypt_block;)

#define do_decround(next_r) \
	do16bit_shr(16, mov, RA, Dsize, D0, RNA, D0, RNB, RT0, RT1); \
	do16bit(        mov, RA, Dsize, D0, RNC, D0, RND, RT0, RT1); \
	movl (((next_r) * 16) + 0 * 4)(CTX), RAd; \
	roll $8, RNBd; \
	xorl RNAd, RAd; \
	roll $8, RNCd; \
	roll $8, RNDd; \
	roll $8, RAd; \
	\
	do16bit_shr(16, xor, RB, Dsize, D0, RNB, D0, RNC, RT0, RT1); \
	do16bit(        xor, RB, Dsize, D0, RND, D0, RA,  RT0, RT1); \
	movl (((next_r) * 16) + 1 * 4)(CTX), RBd; \
	roll $8, RNCd; \
	xorl RNBd, RBd; \
	roll $8, RNDd; \
	roll $8, RAd; \
	roll $8, RBd; \
	\
	do16bit_shr(16, xor, RC, Dsize, D0, RNC, D0, RND, RT0, RT1); \
	do16bit(        xor, RC, Dsize, D0, RA,  D0, RB,  RT0, RT1); \
	movl (((next_r) * 16) + 2 * 4)(CTX), RCd; \
	roll $8, RNDd; \
	xorl RNCd, RCd; \
	roll $8, RAd; \
	roll $8, RBd; \
	roll $8, RCd; \
	\
	do16bit_shr(16, xor, RD, Dsize, D0, RND, D0, RA, RT0, RT1); \
	do16bit(        xor, RD, Dsize, D0, RB, D0, RC,  RT0, RT1); \
	movl (((next_r) * 16) + 3 * 4)(CTX), RDd; \
	roll $8, RAd; \
	xorl RNDd, RDd; \
	roll $16, RBd; \
	roll $24, RCd;
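
/*
 * The last decryption round omits InvMixColumns, so instead of the
 * 4-byte D-table words it reads the plain inverse S-box, stored as a
 * separate 256-byte array directly after the D-table: hence
 * Ds0 = (4 * 256) with single-byte indexing (Dssize = 1).
 */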

#define do_lastdecround(next_r) \
	do16bit_shr(16, movzb, RA, Dssize, Ds0, RNA, Ds0, RNB, RT0, RT1); \
	do16bit(        movzb, RA, Dssize, Ds0, RNC, Ds0, RND, RT0, RT1); \
	movl (((next_r) * 16) + 0 * 4)(CTX), RAd; \
	roll $8, RNBd; \
	xorl RNAd, RAd; \
	roll $8, RNCd; \
	roll $8, RNDd; \
	roll $8, RAd; \
	\
	last_do16bit_shr(16, xor, RB, Dssize, Ds0, RNB, Ds0, RNC, RT0, RT1); \
	last_do16bit(        xor, RB, Dssize, Ds0, RND, Ds0, RA,  RT0, RT1); \
	movl (((next_r) * 16) + 1 * 4)(CTX), RBd; \
	roll $8, RNCd; \
	xorl RNBd, RBd; \
	roll $8, RNDd; \
	roll $8, RAd; \
	roll $8, RBd; \
	\
	last_do16bit_shr(16, xor, RC, Dssize, Ds0, RNC, Ds0, RND, RT0, RT1); \
	last_do16bit(        xor, RC, Dssize, Ds0, RA,  Ds0, RB,  RT0, RT1); \
	movl (((next_r) * 16) + 2 * 4)(CTX), RCd; \
	roll $8, RNDd; \
	xorl RNCd, RCd; \
	roll $8, RAd; \
	roll $8, RBd; \
	roll $8, RCd; \
	\
	last_do16bit_shr(16, xor, RD, Dssize, Ds0, RND, Ds0, RA, RT0, RT1); \
	last_do16bit(        xor, RD, Dssize, Ds0, RB, Ds0, RC,  RT0, RT1); \
	movl (((next_r) * 16) + 3 * 4)(CTX), RDd; \
	roll $8, RAd; \
	xorl RNDd, RDd; \
	roll $16, RBd; \
	roll $24, RCd;

#define firstdecround(round) \
	addroundkey((round + 1), RA, RB, RC, RD); \
	do_decround(round);

#define decround(round) \
	do_decround(round);

#define lastdecround(round) \
	do_lastdecround(round);

.align 8
.globl _gcry_aes_amd64_decrypt_block
ELF(.type _gcry_aes_amd64_decrypt_block,@function;)

_gcry_aes_amd64_decrypt_block:
	/* input:
	 *	%rdi: keysched, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%ecx: number of rounds: 10, 12 or 14
	 *	%r8:  decryption tables
	 */
	CFI_STARTPROC();
	ENTER_SYSV_FUNC_PARAMS_5

	subq $(5 * 8), %rsp;
	CFI_ADJUST_CFA_OFFSET(5 * 8);
	movq %rsi, (0 * 8)(%rsp);
	movl %ecx, (1 * 8)(%rsp);
	movq %rbp, (2 * 8)(%rsp);
	movq %rbx, (3 * 8)(%rsp);
	movq %r12, (4 * 8)(%rsp);
	CFI_REL_OFFSET(%rbp, 2 * 8);
	CFI_REL_OFFSET(%rbx, 3 * 8);
	CFI_REL_OFFSET(%r12, 4 * 8);

	leaq (%r8), RTAB;

	/* read input block */
	movl 0 * 4(%rdx), RAd;
	movl 1 * 4(%rdx), RBd;
	movl 2 * 4(%rdx), RCd;
	movl 3 * 4(%rdx), RDd;

	cmpl $12, (1 * 8)(%rsp);
	jnb .Ldec_256;

	firstdecround(9);
.align 4
.Ldec_tail:
	decround(8);
	decround(7);
	decround(6);
	decround(5);
	decround(4);
	decround(3);
	decround(2);
	decround(1);
	lastdecround(0);

	/* write output block */
	movq (0 * 8)(%rsp), %rsi;
	movl RAd, 0 * 4(%rsi);
	movl RBd, 1 * 4(%rsi);
	movl RCd, 2 * 4(%rsi);
	movl RDd, 3 * 4(%rsi);

	CFI_REMEMBER_STATE();

	movq (4 * 8)(%rsp), %r12;
	movq (3 * 8)(%rsp), %rbx;
	movq (2 * 8)(%rsp), %rbp;
	CFI_RESTORE(%r12);
	CFI_RESTORE(%rbx);
	CFI_RESTORE(%rbp);
	addq $(5 * 8), %rsp;
	CFI_ADJUST_CFA_OFFSET(-5 * 8);

	movl $(6 * 8), %eax;

	EXIT_SYSV_FUNC
	ret;

	CFI_RESTORE_STATE();
.align 4
.Ldec_256:
	je .Ldec_192;

	firstdecround(13);
	decround(12);
	decround(11);
	decround(10);
	decround(9);

	jmp .Ldec_tail;

.align 4
.Ldec_192:
	firstdecround(11);
	decround(10);
	decround(9);

	jmp .Ldec_tail;
	CFI_ENDPROC();
ELF(.size _gcry_aes_amd64_decrypt_block,.-_gcry_aes_amd64_decrypt_block;)

#endif /*USE_AES*/
#endif /*__x86_64*/