/* * Copyright (c) 2017 Thomas Pornin * * Permission is hereby granted, free of charge, to any person obtaining * a copy of this software and associated documentation files (the * "Software"), to deal in the Software without restriction, including * without limitation the rights to use, copy, modify, merge, publish, * distribute, sublicense, and/or sell copies of the Software, and to * permit persons to whom the Software is furnished to do so, subject to * the following conditions: * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #define BR_POWER_ASM_MACROS 1 #include "inner.h" /* * This code contains the AES key schedule implementation using the * POWER8 opcodes. */ #if BR_POWER8 static void key_schedule_128(unsigned char *sk, const unsigned char *key) { long cc; static const uint32_t fmod[] = { 0x11B, 0x11B, 0x11B, 0x11B }; #if BR_POWER8_LE static const uint32_t idx2be[] = { 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C }; #endif cc = 0; /* * We use the VSX instructions for loading and storing the * key/subkeys, since they support unaligned accesses. The rest * of the computation is VMX only. VMX register 0 is VSX * register 32. */ asm volatile ( /* * v0 = all-zero word * v1 = constant -8 / +8, copied into four words * v2 = current subkey * v3 = Rcon (x4 words) * v6 = constant 8, copied into four words * v7 = constant 0x11B, copied into four words * v8 = constant for byteswapping words */ vspltisw(0, 0) #if BR_POWER8_LE vspltisw(1, -8) #else vspltisw(1, 8) #endif lxvw4x(34, 0, %[key]) vspltisw(3, 1) vspltisw(6, 8) lxvw4x(39, 0, %[fmod]) #if BR_POWER8_LE lxvw4x(40, 0, %[idx2be]) #endif /* * First subkey is a copy of the key itself. */ #if BR_POWER8_LE vperm(4, 2, 2, 8) stxvw4x(36, 0, %[sk]) #else stxvw4x(34, 0, %[sk]) #endif /* * Loop must run 10 times. */ li(%[cc], 10) mtctr(%[cc]) label(loop) /* Increment subkey address */ addi(%[sk], %[sk], 16) /* Compute SubWord(RotWord(temp)) xor Rcon (into v4, splat) */ vrlw(4, 2, 1) vsbox(4, 4) #if BR_POWER8_LE vxor(4, 4, 3) #else vsldoi(5, 3, 0, 3) vxor(4, 4, 5) #endif vspltw(4, 4, 3) /* XOR words for next subkey */ vsldoi(5, 0, 2, 12) vxor(2, 2, 5) vsldoi(5, 0, 2, 12) vxor(2, 2, 5) vsldoi(5, 0, 2, 12) vxor(2, 2, 5) vxor(2, 2, 4) /* Store next subkey */ #if BR_POWER8_LE vperm(4, 2, 2, 8) stxvw4x(36, 0, %[sk]) #else stxvw4x(34, 0, %[sk]) #endif /* Update Rcon */ vadduwm(3, 3, 3) vsrw(4, 3, 6) vsubuwm(4, 0, 4) vand(4, 4, 7) vxor(3, 3, 4) bdnz(loop) : [sk] "+b" (sk), [cc] "+b" (cc) : [key] "b" (key), [fmod] "b" (fmod) #if BR_POWER8_LE , [idx2be] "b" (idx2be) #endif : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "ctr", "memory" ); } static void key_schedule_192(unsigned char *sk, const unsigned char *key) { long cc; #if BR_POWER8_LE static const uint32_t idx2be[] = { 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C }; #endif cc = 0; /* * We use the VSX instructions for loading and storing the * key/subkeys, since they support unaligned accesses. The rest * of the computation is VMX only. VMX register 0 is VSX * register 32. */ asm volatile ( /* * v0 = all-zero word * v1 = constant -8 / +8, copied into four words * v2, v3 = current subkey * v5 = Rcon (x4 words) (already shifted on big-endian) * v6 = constant 8, copied into four words * v8 = constant for byteswapping words * * The left two words of v3 are ignored. */ vspltisw(0, 0) #if BR_POWER8_LE vspltisw(1, -8) #else vspltisw(1, 8) #endif li(%[cc], 8) lxvw4x(34, 0, %[key]) lxvw4x(35, %[cc], %[key]) vsldoi(3, 3, 0, 8) vspltisw(5, 1) #if !BR_POWER8_LE vsldoi(5, 5, 0, 3) #endif vspltisw(6, 8) #if BR_POWER8_LE lxvw4x(40, 0, %[idx2be]) #endif /* * Loop must run 8 times. Each iteration produces 256 * bits of subkeys, with a 64-bit overlap. */ li(%[cc], 8) mtctr(%[cc]) li(%[cc], 16) label(loop) /* * Last 6 words in v2:v3l. Compute next 6 words into * v3r:v4. */ vrlw(10, 3, 1) vsbox(10, 10) vxor(10, 10, 5) vspltw(10, 10, 1) vsldoi(11, 0, 10, 8) vsldoi(12, 0, 2, 12) vxor(12, 2, 12) vsldoi(13, 0, 12, 12) vxor(12, 12, 13) vsldoi(13, 0, 12, 12) vxor(12, 12, 13) vspltw(13, 12, 3) vxor(13, 13, 3) vsldoi(14, 0, 3, 12) vxor(13, 13, 14) vsldoi(4, 12, 13, 8) vsldoi(14, 0, 3, 8) vsldoi(3, 14, 12, 8) vxor(3, 3, 11) vxor(4, 4, 10) /* * Update Rcon. Since for a 192-bit key, we use only 8 * such constants, we will not hit the field modulus, * so a simple shift (addition) works well. */ vadduwm(5, 5, 5) /* * Write out the two left 128-bit words */ #if BR_POWER8_LE vperm(10, 2, 2, 8) vperm(11, 3, 3, 8) stxvw4x(42, 0, %[sk]) stxvw4x(43, %[cc], %[sk]) #else stxvw4x(34, 0, %[sk]) stxvw4x(35, %[cc], %[sk]) #endif addi(%[sk], %[sk], 24) /* * Shift words for next iteration. */ vsldoi(2, 3, 4, 8) vsldoi(3, 4, 0, 8) bdnz(loop) /* * The loop wrote the first 50 subkey words, but we need * to produce 52, so we must do one last write. */ #if BR_POWER8_LE vperm(10, 2, 2, 8) stxvw4x(42, 0, %[sk]) #else stxvw4x(34, 0, %[sk]) #endif : [sk] "+b" (sk), [cc] "+b" (cc) : [key] "b" (key) #if BR_POWER8_LE , [idx2be] "b" (idx2be) #endif : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "ctr", "memory" ); } static void key_schedule_256(unsigned char *sk, const unsigned char *key) { long cc; #if BR_POWER8_LE static const uint32_t idx2be[] = { 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C }; #endif cc = 0; /* * We use the VSX instructions for loading and storing the * key/subkeys, since they support unaligned accesses. The rest * of the computation is VMX only. VMX register 0 is VSX * register 32. */ asm volatile ( /* * v0 = all-zero word * v1 = constant -8 / +8, copied into four words * v2, v3 = current subkey * v6 = Rcon (x4 words) (already shifted on big-endian) * v7 = constant 8, copied into four words * v8 = constant for byteswapping words * * The left two words of v3 are ignored. */ vspltisw(0, 0) #if BR_POWER8_LE vspltisw(1, -8) #else vspltisw(1, 8) #endif li(%[cc], 16) lxvw4x(34, 0, %[key]) lxvw4x(35, %[cc], %[key]) vspltisw(6, 1) #if !BR_POWER8_LE vsldoi(6, 6, 0, 3) #endif vspltisw(7, 8) #if BR_POWER8_LE lxvw4x(40, 0, %[idx2be]) #endif /* * Loop must run 7 times. Each iteration produces two * subkeys. */ li(%[cc], 7) mtctr(%[cc]) li(%[cc], 16) label(loop) /* * Current words are in v2:v3. Compute next word in v4. */ vrlw(10, 3, 1) vsbox(10, 10) vxor(10, 10, 6) vspltw(10, 10, 3) vsldoi(4, 0, 2, 12) vxor(4, 2, 4) vsldoi(5, 0, 4, 12) vxor(4, 4, 5) vsldoi(5, 0, 4, 12) vxor(4, 4, 5) vxor(4, 4, 10) /* * Then other word in v5. */ vsbox(10, 4) vspltw(10, 10, 3) vsldoi(5, 0, 3, 12) vxor(5, 3, 5) vsldoi(11, 0, 5, 12) vxor(5, 5, 11) vsldoi(11, 0, 5, 12) vxor(5, 5, 11) vxor(5, 5, 10) /* * Update Rcon. Since for a 256-bit key, we use only 7 * such constants, we will not hit the field modulus, * so a simple shift (addition) works well. */ vadduwm(6, 6, 6) /* * Write out the two left 128-bit words */ #if BR_POWER8_LE vperm(10, 2, 2, 8) vperm(11, 3, 3, 8) stxvw4x(42, 0, %[sk]) stxvw4x(43, %[cc], %[sk]) #else stxvw4x(34, 0, %[sk]) stxvw4x(35, %[cc], %[sk]) #endif addi(%[sk], %[sk], 32) /* * Replace v2:v3 with v4:v5. */ vxor(2, 0, 4) vxor(3, 0, 5) bdnz(loop) /* * The loop wrote the first 14 subkeys, but we need 15, * so we must do an extra write. */ #if BR_POWER8_LE vperm(10, 2, 2, 8) stxvw4x(42, 0, %[sk]) #else stxvw4x(34, 0, %[sk]) #endif : [sk] "+b" (sk), [cc] "+b" (cc) : [key] "b" (key) #if BR_POWER8_LE , [idx2be] "b" (idx2be) #endif : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "ctr", "memory" ); } /* see inner.h */ int br_aes_pwr8_supported(void) { return 1; } /* see inner.h */ unsigned br_aes_pwr8_keysched(unsigned char *sk, const void *key, size_t len) { switch (len) { case 16: key_schedule_128(sk, key); return 10; case 24: key_schedule_192(sk, key); return 12; default: key_schedule_256(sk, key); return 14; } } #endif