/*
 *  AES-NI support functions
 *
 *  Copyright (C) 2006-2015, ARM Limited, All Rights Reserved
 *  SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
 *
 *  This file is provided under the Apache License 2.0, or the
 *  GNU General Public License v2.0 or later.
 *
 *  **********
 *  Apache License 2.0:
 *
 *  Licensed under the Apache License, Version 2.0 (the "License"); you may
 *  not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 *  WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 *
 *  **********
 *
 *  **********
 *  GNU General Public License v2.0 or later:
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License along
 *  with this program; if not, write to the Free Software Foundation, Inc.,
 *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 *  **********
 *
 *  This file is part of mbed TLS (https://tls.mbed.org)
 */

/*
 * Reference papers cited throughout the asm comments below:
 * [AES-WP]   http://software.intel.com/en-us/articles/intel-advanced-encryption-standard-aes-instructions-set
 * [CLMUL-WP] http://software.intel.com/en-us/articles/intel-carry-less-multiplication-instruction-and-its-usage-for-computing-the-gcm-mode/
 */

#if !defined(MBEDTLS_CONFIG_FILE)
#include "mbedtls/config.h"
#else
#include MBEDTLS_CONFIG_FILE
#endif

#if defined(MBEDTLS_AESNI_C)

#include "mbedtls/aesni.h"

#include <string.h>

/* Some toolchains spell the GNU inline-asm keyword `__asm`. */
#ifndef asm
#define asm __asm
#endif

#if defined(MBEDTLS_HAVE_X86_64)

/*
 * AES-NI support detection routine.
 *
 * \param what  a MBEDTLS_AESNI_* capability bit mask to test against the
 *              CPUID leaf-1 ECX feature flags
 *
 * \return 1 if the CPU advertises all the requested features, 0 otherwise.
 *
 * The CPUID result is cached in function-local statics on first call.
 * NOTE(review): the `done`/`c` statics make this lazily initialised but not
 * thread-safe on first use — presumably acceptable here; confirm against the
 * library's threading expectations.
 */
int mbedtls_aesni_has_support( unsigned int what )
{
    static int done = 0;
    static unsigned int c = 0;

    if( ! done )
    {
        /* CPUID with EAX=1: feature bits are returned in ECX. */
        asm( "movl  $1, %%eax   \n\t"
             "cpuid             \n\t"
             : "=c" (c)
             :
             : "eax", "ebx", "edx" );
        done = 1;
    }

    return( ( c & what ) != 0 );
}

/*
 * Binutils needs to be at least 2.19 to support AES-NI instructions.
 * Unfortunately, a lot of users have a lower version now (2014-04).
 * Emit bytecode directly in order to support "old" version of gas.
 *
 * Opcodes from the Intel architecture reference manual, vol. 3.
 * We always use registers, so we don't need prefixes for memory operands.
 * Operand macros are in gas order (src, dst) as opposed to Intel order
 * (dst, src) in order to blend better into the surrounding assembly code.
 */
#define AESDEC      ".byte 0x66,0x0F,0x38,0xDE,"
#define AESDECLAST  ".byte 0x66,0x0F,0x38,0xDF,"
#define AESENC      ".byte 0x66,0x0F,0x38,0xDC,"
#define AESENCLAST  ".byte 0x66,0x0F,0x38,0xDD,"
#define AESIMC      ".byte 0x66,0x0F,0x38,0xDB,"
#define AESKEYGENA  ".byte 0x66,0x0F,0x3A,0xDF,"
#define PCLMULQDQ   ".byte 0x66,0x0F,0x3A,0x44,"

/* ModR/M bytes selecting the register pair, named src_dst (gas order). */
#define xmm0_xmm0   "0xC0"
#define xmm0_xmm1   "0xC8"
#define xmm0_xmm2   "0xD0"
#define xmm0_xmm3   "0xD8"
#define xmm0_xmm4   "0xE0"
#define xmm1_xmm0   "0xC1"
#define xmm1_xmm2   "0xD1"

/*
 * AES-NI AES-ECB block en(de)cryption.
 *
 * \param ctx     AES context holding the round count (ctx->nr) and the
 *                expanded round keys (ctx->rk)
 * \param mode    nonzero = encrypt, 0 = decrypt (per the jz below)
 * \param input   16-byte input block
 * \param output  16-byte output block
 *
 * \return 0 (always succeeds)
 *
 * NOTE(review): the asm modifies registers bound to input-only operands
 * (`add $16, %1` advances the round-key pointer, `subl $1, %0` counts down
 * the rounds). GCC's extended-asm contract says input operands must not be
 * clobbered — this appears to rely on the values being dead afterwards;
 * confirm against the documented constraint semantics before touching.
 */
int mbedtls_aesni_crypt_ecb( mbedtls_aes_context *ctx,
                             int mode,
                             const unsigned char input[16],
                             unsigned char output[16] )
{
    asm( "movdqu    (%3), %%xmm0    \n\t" // load input
         "movdqu    (%1), %%xmm1    \n\t" // load round key 0
         "pxor      %%xmm1, %%xmm0  \n\t" // round 0
         "add       $16, %1         \n\t" // point to next round key
         "subl      $1, %0          \n\t" // normal rounds = nr - 1
         "test      %2, %2          \n\t" // mode?
         "jz        2f              \n\t" // 0 = decrypt

         "1:                        \n\t" // encryption loop
         "movdqu    (%1), %%xmm1    \n\t" // load round key
         AESENC     xmm1_xmm0      "\n\t" // do round
         "add       $16, %1         \n\t" // point to next round key
         "subl      $1, %0          \n\t" // loop
         "jnz       1b              \n\t"
         "movdqu    (%1), %%xmm1    \n\t" // load round key
         AESENCLAST xmm1_xmm0      "\n\t" // last round
         "jmp       3f              \n\t"

         "2:                        \n\t" // decryption loop
         "movdqu    (%1), %%xmm1    \n\t"
         AESDEC     xmm1_xmm0      "\n\t" // do round
         "add       $16, %1         \n\t"
         "subl      $1, %0          \n\t"
         "jnz       2b              \n\t"
         "movdqu    (%1), %%xmm1    \n\t" // load round key
         AESDECLAST xmm1_xmm0      "\n\t" // last round

         "3:                        \n\t"
         "movdqu    %%xmm0, (%4)    \n\t" // export output
         :
         : "r" (ctx->nr), "r" (ctx->rk), "r" (mode), "r" (input), "r" (output)
         : "memory", "cc", "xmm0", "xmm1" );


    return( 0 );
}

/*
 * GCM multiplication: c = a times b in GF(2^128).
 * Based on [CLMUL-WP] algorithms 1 (with equation 27) and 5.
 *
 * \param c  output, 16 bytes, big-endian GF(2^128) element
 * \param a  first operand, 16 bytes, big-endian
 * \param b  second operand, 16 bytes, big-endian
 *
 * The PCLMULQDQ-based math works on little-endian limbs, so the inputs are
 * byte-reversed into local buffers first and the result reversed back.
 */
void mbedtls_aesni_gcm_mult( unsigned char c[16],
                             const unsigned char a[16],
                             const unsigned char b[16] )
{
    unsigned char aa[16], bb[16], cc[16];
    size_t i;

    /* The inputs are in big-endian order, so byte-reverse them */
    for( i = 0; i < 16; i++ )
    {
        aa[i] = a[15 - i];
        bb[i] = b[15 - i];
    }

    asm( "movdqu (%0), %%xmm0               \n\t" // a1:a0
         "movdqu (%1), %%xmm1               \n\t" // b1:b0

         /*
          * Carryless multiplication xmm2:xmm1 = xmm0 * xmm1
          * using [CLMUL-WP] algorithm 1 (p. 13).
          */
         "movdqa %%xmm1, %%xmm2             \n\t" // copy of b1:b0
         "movdqa %%xmm1, %%xmm3             \n\t" // same
         "movdqa %%xmm1, %%xmm4             \n\t" // same
         PCLMULQDQ xmm0_xmm1 ",0x00         \n\t" // a0*b0 = c1:c0
         PCLMULQDQ xmm0_xmm2 ",0x11         \n\t" // a1*b1 = d1:d0
         PCLMULQDQ xmm0_xmm3 ",0x10         \n\t" // a0*b1 = e1:e0
         PCLMULQDQ xmm0_xmm4 ",0x01         \n\t" // a1*b0 = f1:f0
         "pxor %%xmm3, %%xmm4               \n\t" // e1+f1:e0+f0
         "movdqa %%xmm4, %%xmm3             \n\t" // same
         "psrldq $8, %%xmm4                 \n\t" // 0:e1+f1
         "pslldq $8, %%xmm3                 \n\t" // e0+f0:0
         "pxor %%xmm4, %%xmm2               \n\t" // d1:d0+e1+f1
         "pxor %%xmm3, %%xmm1               \n\t" // c1+e0+f1:c0

         /*
          * Now shift the result one bit to the left,
          * taking advantage of [CLMUL-WP] eq 27 (p. 20)
          */
         "movdqa %%xmm1, %%xmm3             \n\t" // r1:r0
         "movdqa %%xmm2, %%xmm4             \n\t" // r3:r2
         "psllq $1, %%xmm1                  \n\t" // r1<<1:r0<<1
         "psllq $1, %%xmm2                  \n\t" // r3<<1:r2<<1
         "psrlq $63, %%xmm3                 \n\t" // r1>>63:r0>>63
         "psrlq $63, %%xmm4                 \n\t" // r3>>63:r2>>63
         "movdqa %%xmm3, %%xmm5             \n\t" // r1>>63:r0>>63
         "pslldq $8, %%xmm3                 \n\t" // r0>>63:0
         "pslldq $8, %%xmm4                 \n\t" // r2>>63:0
         "psrldq $8, %%xmm5                 \n\t" // 0:r1>>63
         "por %%xmm3, %%xmm1                \n\t" // r1<<1|r0>>63:r0<<1
         "por %%xmm4, %%xmm2                \n\t" // r3<<1|r2>>63:r2<<1
         "por %%xmm5, %%xmm2                \n\t" // r3<<1|r2>>63:r2<<1|r1>>63

         /*
          * Now reduce modulo the GCM polynomial x^128 + x^7 + x^2 + x + 1
          * using [CLMUL-WP] algorithm 5 (p. 20).
          * Currently xmm2:xmm1 holds x3:x2:x1:x0 (already shifted).
          */
         /* Step 2 (1) */
         "movdqa %%xmm1, %%xmm3             \n\t" // x1:x0
         "movdqa %%xmm1, %%xmm4             \n\t" // same
         "movdqa %%xmm1, %%xmm5             \n\t" // same
         "psllq $63, %%xmm3                 \n\t" // x1<<63:x0<<63 = stuff:a
         "psllq $62, %%xmm4                 \n\t" // x1<<62:x0<<62 = stuff:b
         "psllq $57, %%xmm5                 \n\t" // x1<<57:x0<<57 = stuff:c

         /* Step 2 (2) */
         "pxor %%xmm4, %%xmm3               \n\t" // stuff:a+b
         "pxor %%xmm5, %%xmm3               \n\t" // stuff:a+b+c
         "pslldq $8, %%xmm3                 \n\t" // a+b+c:0
         "pxor %%xmm3, %%xmm1               \n\t" // x1+a+b+c:x0 = d:x0

         /* Steps 3 and 4 */
         "movdqa %%xmm1,%%xmm0              \n\t" // d:x0
         "movdqa %%xmm1,%%xmm4              \n\t" // same
         "movdqa %%xmm1,%%xmm5              \n\t" // same
         "psrlq $1, %%xmm0                  \n\t" // e1:x0>>1 = e1:e0'
         "psrlq $2, %%xmm4                  \n\t" // f1:x0>>2 = f1:f0'
         "psrlq $7, %%xmm5                  \n\t" // g1:x0>>7 = g1:g0'
         "pxor %%xmm4, %%xmm0               \n\t" // e1+f1:e0'+f0'
         "pxor %%xmm5, %%xmm0               \n\t" // e1+f1+g1:e0'+f0'+g0'
         // e0'+f0'+g0' is almost e0+f0+g0, except for some missing
         // bits carried from d. Now get those bits back in.
         "movdqa %%xmm1,%%xmm3              \n\t" // d:x0
         "movdqa %%xmm1,%%xmm4              \n\t" // same
         "movdqa %%xmm1,%%xmm5              \n\t" // same
         "psllq $63, %%xmm3                 \n\t" // d<<63:stuff
         "psllq $62, %%xmm4                 \n\t" // d<<62:stuff
         "psllq $57, %%xmm5                 \n\t" // d<<57:stuff
         "pxor %%xmm4, %%xmm3               \n\t" // d<<63+d<<62:stuff
         "pxor %%xmm5, %%xmm3               \n\t" // missing bits of d:stuff
         "psrldq $8, %%xmm3                 \n\t" // 0:missing bits of d
         "pxor %%xmm3, %%xmm0               \n\t" // e1+f1+g1:e0+f0+g0
         "pxor %%xmm1, %%xmm0               \n\t" // h1:h0
         "pxor %%xmm2, %%xmm0               \n\t" // x3+h1:x2+h0

         "movdqu %%xmm0, (%2)               \n\t" // done
         :
         : "r" (aa), "r" (bb), "r" (cc)
         : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" );

    /* Now byte-reverse the outputs */
    for( i = 0; i < 16; i++ )
        c[i] = cc[15 - i];

    return;
}

/*
 * Compute decryption round keys from encryption round keys.
 *
 * \param invkey  output buffer for the inverse (decryption) key schedule
 * \param fwdkey  forward (encryption) key schedule
 * \param nr      number of rounds; fwdkey holds nr+1 16-byte round keys
 *
 * First and last round keys are copied as-is; the nr-1 middle ones are run
 * through AESIMC (InvMixColumns), walking fwdkey backwards so invkey comes
 * out in decryption order.
 */
void mbedtls_aesni_inverse_key( unsigned char *invkey,
                                const unsigned char *fwdkey, int nr )
{
    unsigned char *ik = invkey;
    const unsigned char *fk = fwdkey + 16 * nr;

    memcpy( ik, fk, 16 );

    for( fk -= 16, ik += 16; fk > fwdkey; fk -= 16, ik += 16 )
        asm( "movdqu (%0), %%xmm0       \n\t"
             AESIMC  xmm0_xmm0         "\n\t"
             "movdqu %%xmm0, (%1)       \n\t"
             :
             : "r" (fk), "r" (ik)
             : "memory", "xmm0" );

    memcpy( ik, fk, 16 );
}

/*
 * Key expansion, 128-bit case.
 *
 * \param rk   output round-key buffer (11 x 16 bytes written)
 * \param key  16-byte cipher key
 *
 * Uses an asm-local subroutine at label 1 (reached via call/ret) to expand
 * each round key from the AESKEYGENASSIST result; the "jmp 2f" skips over
 * that subroutine on entry.
 */
static void aesni_setkey_enc_128( unsigned char *rk,
                                  const unsigned char *key )
{
    asm( "movdqu (%1), %%xmm0               \n\t" // copy the original key
         "movdqu %%xmm0, (%0)               \n\t" // as round key 0
         "jmp 2f                            \n\t" // skip auxiliary routine

         /*
          * Finish generating the next round key.
          *
          * On entry xmm0 is r3:r2:r1:r0 and xmm1 is X:stuff:stuff:stuff
          * with X = rot( sub( r3 ) ) ^ RCON.
          *
          * On exit, xmm0 is r7:r6:r5:r4
          * with r4 = X + r0, r5 = r4 + r1, r6 = r5 + r2, r7 = r6 + r3
          * and those are written to the round key buffer.
          */
         "1:                                \n\t"
         "pshufd $0xff, %%xmm1, %%xmm1      \n\t" // X:X:X:X
         "pxor %%xmm0, %%xmm1               \n\t" // X+r3:X+r2:X+r1:r4
         "pslldq $4, %%xmm0                 \n\t" // r2:r1:r0:0
         "pxor %%xmm0, %%xmm1               \n\t" // X+r3+r2:X+r2+r1:r5:r4
         "pslldq $4, %%xmm0                 \n\t" // etc
         "pxor %%xmm0, %%xmm1               \n\t"
         "pslldq $4, %%xmm0                 \n\t"
         "pxor %%xmm1, %%xmm0               \n\t" // update xmm0 for next time!
         "add $16, %0                       \n\t" // point to next round key
         "movdqu %%xmm0, (%0)               \n\t" // write it
         "ret                               \n\t"

         /* Main "loop" */
         "2:                                \n\t"
         AESKEYGENA xmm0_xmm1 ",0x01        \n\tcall 1b \n\t"
         AESKEYGENA xmm0_xmm1 ",0x02        \n\tcall 1b \n\t"
         AESKEYGENA xmm0_xmm1 ",0x04        \n\tcall 1b \n\t"
         AESKEYGENA xmm0_xmm1 ",0x08        \n\tcall 1b \n\t"
         AESKEYGENA xmm0_xmm1 ",0x10        \n\tcall 1b \n\t"
         AESKEYGENA xmm0_xmm1 ",0x20        \n\tcall 1b \n\t"
         AESKEYGENA xmm0_xmm1 ",0x40        \n\tcall 1b \n\t"
         AESKEYGENA xmm0_xmm1 ",0x80        \n\tcall 1b \n\t"
         AESKEYGENA xmm0_xmm1 ",0x1B        \n\tcall 1b \n\t"
         AESKEYGENA xmm0_xmm1 ",0x36        \n\tcall 1b \n\t"
         :
         : "r" (rk), "r" (key)
         : "memory", "cc", "0" );
}

/*
 * Key expansion, 192-bit case.
 *
 * \param rk   output round-key buffer
 * \param key  24-byte cipher key
 *
 * Same call/ret structure as the 128-bit case, but each subroutine pass
 * produces six 32-bit quarter-keys (1.5 round keys), so output advances by
 * 16 then 8 bytes per pass.
 */
static void aesni_setkey_enc_192( unsigned char *rk,
                                  const unsigned char *key )
{
    asm( "movdqu (%1), %%xmm0   \n\t" // copy original round key
         "movdqu %%xmm0, (%0)   \n\t"
         "add $16, %0           \n\t"
         "movq 16(%1), %%xmm1   \n\t"
         "movq %%xmm1, (%0)     \n\t"
         "add $8, %0            \n\t"
         "jmp 2f                \n\t" // skip auxiliary routine

         /*
          * Finish generating the next 6 quarter-keys.
          *
          * On entry xmm0 is r3:r2:r1:r0, xmm1 is stuff:stuff:r5:r4
          * and xmm2 is stuff:stuff:X:stuff with X = rot( sub( r3 ) ) ^ RCON.
          *
          * On exit, xmm0 is r9:r8:r7:r6 and xmm1 is stuff:stuff:r11:r10
          * and those are written to the round key buffer.
          */
         "1:                            \n\t"
         "pshufd $0x55, %%xmm2, %%xmm2  \n\t" // X:X:X:X
         "pxor %%xmm0, %%xmm2           \n\t" // X+r3:X+r2:X+r1:r4
         "pslldq $4, %%xmm0             \n\t" // etc
         "pxor %%xmm0, %%xmm2           \n\t"
         "pslldq $4, %%xmm0             \n\t"
         "pxor %%xmm0, %%xmm2           \n\t"
         "pslldq $4, %%xmm0             \n\t"
         "pxor %%xmm2, %%xmm0           \n\t" // update xmm0 = r9:r8:r7:r6
         "movdqu %%xmm0, (%0)           \n\t"
         "add $16, %0                   \n\t"
         "pshufd $0xff, %%xmm0, %%xmm2  \n\t" // r9:r9:r9:r9
         "pxor %%xmm1, %%xmm2           \n\t" // stuff:stuff:r9+r5:r10
         "pslldq $4, %%xmm1             \n\t" // r2:r1:r0:0
         "pxor %%xmm2, %%xmm1           \n\t" // xmm1 = stuff:stuff:r11:r10
         "movq %%xmm1, (%0)             \n\t"
         "add $8, %0                    \n\t"
         "ret                           \n\t"

         "2:                            \n\t"
         AESKEYGENA xmm1_xmm2 ",0x01    \n\tcall 1b \n\t"
         AESKEYGENA xmm1_xmm2 ",0x02    \n\tcall 1b \n\t"
         AESKEYGENA xmm1_xmm2 ",0x04    \n\tcall 1b \n\t"
         AESKEYGENA xmm1_xmm2 ",0x08    \n\tcall 1b \n\t"
         AESKEYGENA xmm1_xmm2 ",0x10    \n\tcall 1b \n\t"
         AESKEYGENA xmm1_xmm2 ",0x20    \n\tcall 1b \n\t"
         AESKEYGENA xmm1_xmm2 ",0x40    \n\tcall 1b \n\t"
         AESKEYGENA xmm1_xmm2 ",0x80    \n\tcall 1b \n\t"

         :
         : "r" (rk), "r" (key)
         : "memory", "cc", "0" );
}

/*
 * Key expansion, 256-bit case.
 *
 * \param rk   output round-key buffer
 * \param key  32-byte cipher key
 *
 * Each subroutine pass produces two full round keys: the first via the
 * usual RCON schedule, the second via AESKEYGENASSIST with RCON 0x00
 * (SubWord only).
 */
static void aesni_setkey_enc_256( unsigned char *rk,
                                  const unsigned char *key )
{
    asm( "movdqu (%1), %%xmm0           \n\t"
         "movdqu %%xmm0, (%0)           \n\t"
         "add $16, %0                   \n\t"
         "movdqu 16(%1), %%xmm1         \n\t"
         "movdqu %%xmm1, (%0)           \n\t"
         "jmp 2f                        \n\t" // skip auxiliary routine

         /*
          * Finish generating the next two round keys.
          *
          * On entry xmm0 is r3:r2:r1:r0, xmm1 is r7:r6:r5:r4 and
          * xmm2 is X:stuff:stuff:stuff with X = rot( sub( r7 )) ^ RCON
          *
          * On exit, xmm0 is r11:r10:r9:r8 and xmm1 is r15:r14:r13:r12
          * and those have been written to the output buffer.
          */
         "1:                                \n\t"
         "pshufd $0xff, %%xmm2, %%xmm2      \n\t"
         "pxor %%xmm0, %%xmm2               \n\t"
         "pslldq $4, %%xmm0                 \n\t"
         "pxor %%xmm0, %%xmm2               \n\t"
         "pslldq $4, %%xmm0                 \n\t"
         "pxor %%xmm0, %%xmm2               \n\t"
         "pslldq $4, %%xmm0                 \n\t"
         "pxor %%xmm2, %%xmm0               \n\t"
         "add $16, %0                       \n\t"
         "movdqu %%xmm0, (%0)               \n\t"

         /* Set xmm2 to stuff:Y:stuff:stuff with Y = subword( r11 )
          * and proceed to generate next round key from there */
         AESKEYGENA xmm0_xmm2 ",0x00        \n\t"
         "pshufd $0xaa, %%xmm2, %%xmm2      \n\t"
         "pxor %%xmm1, %%xmm2               \n\t"
         "pslldq $4, %%xmm1                 \n\t"
         "pxor %%xmm1, %%xmm2               \n\t"
         "pslldq $4, %%xmm1                 \n\t"
         "pxor %%xmm1, %%xmm2               \n\t"
         "pslldq $4, %%xmm1                 \n\t"
         "pxor %%xmm2, %%xmm1               \n\t"
         "add $16, %0                       \n\t"
         "movdqu %%xmm1, (%0)               \n\t"
         "ret                               \n\t"

         /*
          * Main "loop" - Generating one more key than necessary,
          * see definition of mbedtls_aes_context.buf
          */
         "2:                                \n\t"
         AESKEYGENA xmm1_xmm2 ",0x01        \n\tcall 1b \n\t"
         AESKEYGENA xmm1_xmm2 ",0x02        \n\tcall 1b \n\t"
         AESKEYGENA xmm1_xmm2 ",0x04        \n\tcall 1b \n\t"
         AESKEYGENA xmm1_xmm2 ",0x08        \n\tcall 1b \n\t"
         AESKEYGENA xmm1_xmm2 ",0x10        \n\tcall 1b \n\t"
         AESKEYGENA xmm1_xmm2 ",0x20        \n\tcall 1b \n\t"
         AESKEYGENA xmm1_xmm2 ",0x40        \n\tcall 1b \n\t"
         :
         : "r" (rk), "r" (key)
         : "memory", "cc", "0" );
}

/*
 * Key expansion, wrapper: dispatch on key size in bits.
 *
 * \param rk    output round-key buffer
 * \param key   cipher key, bits/8 bytes long
 * \param bits  key length: 128, 192 or 256
 *
 * \return 0 on success, MBEDTLS_ERR_AES_INVALID_KEY_LENGTH otherwise.
 */
int mbedtls_aesni_setkey_enc( unsigned char *rk,
                              const unsigned char *key,
                              size_t bits )
{
    switch( bits )
    {
        case 128: aesni_setkey_enc_128( rk, key ); break;
        case 192: aesni_setkey_enc_192( rk, key ); break;
        case 256: aesni_setkey_enc_256( rk, key ); break;
        default : return( MBEDTLS_ERR_AES_INVALID_KEY_LENGTH );
    }

    return( 0 );
}

#endif /* MBEDTLS_HAVE_X86_64 */

#endif /* MBEDTLS_AESNI_C */