1/* $OpenBSD: sha512_amd64_generic.S,v 1.1 2024/11/16 14:56:39 jsing Exp $ */ 2/* 3 * Copyright (c) 2024 Joel Sing <jsing@openbsd.org> 4 * 5 * Permission to use, copy, modify, and distribute this software for any 6 * purpose with or without fee is hereby granted, provided that the above 7 * copyright notice and this permission notice appear in all copies. 8 * 9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 16 */ 17 18#ifdef __CET__ 19#include <cet.h> 20#else 21#define _CET_ENDBR 22#endif 23 24#define ctx %rdi 25#define in %rsi 26#define num %rdx 27 28#define round %rdi 29 30#define hs0 %r8 31#define hs1 %r9 32#define hs2 %r10 33#define hs3 %r11 34#define hs4 %r12 35#define hs5 %r13 36#define hs6 %r14 37#define hs7 %r15 38 39#define k512 %rbp 40 41#define tmp0 %rax 42#define tmp1 %rbx 43#define tmp2 %rcx 44#define tmp3 %rdx 45 46/* 47 * Load message into wt, storing a copy in the message schedule: 48 * 49 * Wt = Mt 50 */ 51#define sha512_message_schedule_load(idx, m, w, wt) \ 52 movq (m, round, 8), wt; \ 53 bswapq wt; \ 54 movq wt, ((idx&0xf)*8)(w); 55 56/* 57 * Update message schedule and return current value in wt: 58 * 59 * Wt = sigma1(W(t-2)) + W(t-7) + sigma0(W(t-15)) + W(t-16) 60 * 61 * sigma0(x) = ror(x, 1) ^ ror(x, 8) ^ (x >> 7) 62 * sigma1(x) = ror(x, 19) ^ ror(x, 61) ^ (x >> 6) 63 * 64 */ 65#define sha512_message_schedule_update(idx, w, wt) \ 66 movq (((idx-2)&0xf)*8)(w), wt; /* sigma1 */ \ 67 movq wt, tmp1; /* sigma1 */ \ 68 rorq $(61-19), tmp1; /* sigma1 */ \ 69 xorq wt, tmp1; /* sigma1 */ \ 70 rorq $19, tmp1; /* sigma1 */ \ 71 shrq $6, wt; /* sigma1 */ \ 72 xorq tmp1, wt; /* sigma1 */ \ 73 \ 74 addq (((idx-7)&0xf)*8)(w), wt; /* Wt-7 */ \ 75 addq (((idx-16)&0xf)*8)(w), wt; /* Wt-16 */ \ 76 \ 77 movq (((idx-15)&0xf)*8)(w), tmp2; /* sigma0 */ \ 78 movq tmp2, tmp3; /* sigma0 */ \ 79 rorq $(8-1), tmp2; /* sigma0 */ \ 80 xorq tmp3, tmp2; /* sigma0 */ \ 81 rorq $1, tmp2; /* sigma0 */ \ 82 shrq $7, tmp3; /* sigma0 */ \ 83 xorq tmp3, tmp2; /* sigma0 */ \ 84 addq tmp2, wt; /* sigma0 */ \ 85 \ 86 movq wt, ((idx&0xf)*8)(w); 87 88/* 89 * Compute a SHA-512 round: 90 * 91 * T1 = h + Sigma1(e) + Ch(e, f, g) + Kt + Wt 92 * T2 = Sigma0(a) + Maj(a, b, c) 93 * 94 * Sigma0(x) = ror(x, 28) ^ ror(x, 34) ^ ror(x, 39) 95 * Sigma1(x) = ror(x, 14) ^ ror(x, 18) ^ ror(x, 41) 96 * Ch(x, y, z) = (x & y) ^ (~x & z) = ((y ^ z) & x) ^ z 97 * Maj(x, y, z) = (x & y) ^ (x & z) ^ (y & z) = ((y ^ z) & x) ^ (y & z) 98 * 99 * Upon completion d = d + T1, h = T1 + T2, pending rotation. 100 */ 101#define sha512_round(idx, a, b, c, d, e, f, g, h, k, w, wt) \ 102 addq wt, h; /* T1 Wt */ \ 103 addq (k512, round, 8), h; /* T1 Kt */ \ 104 \ 105 movq e, tmp1; /* T1 Sigma1 */ \ 106 rorq $(41-18), tmp1; /* T1 Sigma1 */ \ 107 xorq e, tmp1; /* T1 Sigma1 */ \ 108 rorq $(18-14), tmp1; /* T1 Sigma1 */ \ 109 xorq e, tmp1; /* T1 Sigma1 */ \ 110 rorq $14, tmp1; /* T1 Sigma1 */ \ 111 addq tmp1, h; /* T1 Sigma1 */ \ 112 \ 113 movq f, tmp2; /* T1 Ch */ \ 114 xorq g, tmp2; /* T1 Ch */ \ 115 andq e, tmp2; /* T1 Ch */ \ 116 xorq g, tmp2; /* T1 Ch */ \ 117 addq tmp2, h; /* T1 Ch */ \ 118 \ 119 addq h, d; /* d += T1 */ \ 120 \ 121 movq a, tmp1; /* T2 Sigma0 */ \ 122 rorq $(39-34), tmp1; /* T2 Sigma0 */ \ 123 xorq a, tmp1; /* T2 Sigma0 */ \ 124 rorq $(34-28), tmp1; /* T2 Sigma0 */ \ 125 xorq a, tmp1; /* T2 Sigma0 */ \ 126 rorq $28, tmp1; /* T2 Sigma0 */ \ 127 addq tmp1, h; /* T2 Sigma0 */ \ 128 \ 129 movq b, tmp2; /* T2 Maj */ \ 130 xorq c, tmp2; /* T2 Maj */ \ 131 andq a, tmp2; /* T2 Maj */ \ 132 movq b, tmp3; /* T2 Maj */ \ 133 andq c, tmp3; /* T2 Maj */ \ 134 xorq tmp2, tmp3; /* T2 Maj */ \ 135 addq tmp3, h; /* T2 Maj */ \ 136 \ 137 addq $1, round; 138 139#define sha512_round_load(idx, a, b, c, d, e, f, g, h) \ 140 sha512_message_schedule_load(idx, in, %rsp, tmp0) \ 141 sha512_round(idx, a, b, c, d, e, f, g, h, k512, %rsp, tmp0) 142 143#define sha512_round_update(idx, a, b, c, d, e, f, g, h) \ 144 sha512_message_schedule_update(idx, %rsp, tmp0) \ 145 sha512_round(idx, a, b, c, d, e, f, g, h, k512, %rsp, tmp0) 146 147.text 148 149/* 150 * void sha512_block_generic(SHA512_CTX *ctx, const void *in, size_t num); 151 * 152 * Standard x86-64 ABI: rdi = ctx, rsi = in, rdx = num 153 */ 154.align 16 155.globl sha512_block_generic 156.type sha512_block_generic,@function 157sha512_block_generic: 158 _CET_ENDBR 159 160 /* Save callee save registers. */ 161 pushq %rbx 162 pushq %rbp 163 pushq %r12 164 pushq %r13 165 pushq %r14 166 pushq %r15 167 168 /* Allocate space for message schedule and context pointer. */ 169 movq %rsp, %rax 170 subq $(128+3*8), %rsp 171 andq $~63, %rsp 172 movq %rax, (128+2*8)(%rsp) 173 movq ctx, (128+1*8)(%rsp) 174 175 /* Compute and store end of message. */ 176 shlq $7, num 177 leaq (in, num, 1), %rbx 178 movq %rbx, (128+0*8)(%rsp) 179 180 /* Address of SHA-512 constants. */ 181 leaq K512(%rip), k512 182 183 /* Load current hash state from context. */ 184 movq (0*8)(ctx), hs0 185 movq (1*8)(ctx), hs1 186 movq (2*8)(ctx), hs2 187 movq (3*8)(ctx), hs3 188 movq (4*8)(ctx), hs4 189 movq (5*8)(ctx), hs5 190 movq (6*8)(ctx), hs6 191 movq (7*8)(ctx), hs7 192 193 jmp .Lblock_loop0 194 195.align 16 196.Lblock_loop0: 197 mov $0, round 198 199 /* Round 0 through 15. */ 200 sha512_round_load(0, hs0, hs1, hs2, hs3, hs4, hs5, hs6, hs7) 201 sha512_round_load(1, hs7, hs0, hs1, hs2, hs3, hs4, hs5, hs6) 202 sha512_round_load(2, hs6, hs7, hs0, hs1, hs2, hs3, hs4, hs5) 203 sha512_round_load(3, hs5, hs6, hs7, hs0, hs1, hs2, hs3, hs4) 204 sha512_round_load(4, hs4, hs5, hs6, hs7, hs0, hs1, hs2, hs3) 205 sha512_round_load(5, hs3, hs4, hs5, hs6, hs7, hs0, hs1, hs2) 206 sha512_round_load(6, hs2, hs3, hs4, hs5, hs6, hs7, hs0, hs1) 207 sha512_round_load(7, hs1, hs2, hs3, hs4, hs5, hs6, hs7, hs0) 208 sha512_round_load(8, hs0, hs1, hs2, hs3, hs4, hs5, hs6, hs7) 209 sha512_round_load(9, hs7, hs0, hs1, hs2, hs3, hs4, hs5, hs6) 210 sha512_round_load(10, hs6, hs7, hs0, hs1, hs2, hs3, hs4, hs5) 211 sha512_round_load(11, hs5, hs6, hs7, hs0, hs1, hs2, hs3, hs4) 212 sha512_round_load(12, hs4, hs5, hs6, hs7, hs0, hs1, hs2, hs3) 213 sha512_round_load(13, hs3, hs4, hs5, hs6, hs7, hs0, hs1, hs2) 214 sha512_round_load(14, hs2, hs3, hs4, hs5, hs6, hs7, hs0, hs1) 215 sha512_round_load(15, hs1, hs2, hs3, hs4, hs5, hs6, hs7, hs0) 216 217 jmp .Lblock_loop16 218 219.align 16 220.Lblock_loop16: 221 /* Round 16 through 79. */ 222 sha512_round_update(16, hs0, hs1, hs2, hs3, hs4, hs5, hs6, hs7) 223 sha512_round_update(17, hs7, hs0, hs1, hs2, hs3, hs4, hs5, hs6) 224 sha512_round_update(18, hs6, hs7, hs0, hs1, hs2, hs3, hs4, hs5) 225 sha512_round_update(19, hs5, hs6, hs7, hs0, hs1, hs2, hs3, hs4) 226 sha512_round_update(20, hs4, hs5, hs6, hs7, hs0, hs1, hs2, hs3) 227 sha512_round_update(21, hs3, hs4, hs5, hs6, hs7, hs0, hs1, hs2) 228 sha512_round_update(22, hs2, hs3, hs4, hs5, hs6, hs7, hs0, hs1) 229 sha512_round_update(23, hs1, hs2, hs3, hs4, hs5, hs6, hs7, hs0) 230 sha512_round_update(24, hs0, hs1, hs2, hs3, hs4, hs5, hs6, hs7) 231 sha512_round_update(25, hs7, hs0, hs1, hs2, hs3, hs4, hs5, hs6) 232 sha512_round_update(26, hs6, hs7, hs0, hs1, hs2, hs3, hs4, hs5) 233 sha512_round_update(27, hs5, hs6, hs7, hs0, hs1, hs2, hs3, hs4) 234 sha512_round_update(28, hs4, hs5, hs6, hs7, hs0, hs1, hs2, hs3) 235 sha512_round_update(29, hs3, hs4, hs5, hs6, hs7, hs0, hs1, hs2) 236 sha512_round_update(30, hs2, hs3, hs4, hs5, hs6, hs7, hs0, hs1) 237 sha512_round_update(31, hs1, hs2, hs3, hs4, hs5, hs6, hs7, hs0) 238 239 cmp $80, round 240 jb .Lblock_loop16 241 242 movq (128+1*8)(%rsp), ctx 243 244 /* Add intermediate state to hash state. */ 245 addq (0*8)(ctx), hs0 246 addq (1*8)(ctx), hs1 247 addq (2*8)(ctx), hs2 248 addq (3*8)(ctx), hs3 249 addq (4*8)(ctx), hs4 250 addq (5*8)(ctx), hs5 251 addq (6*8)(ctx), hs6 252 addq (7*8)(ctx), hs7 253 254 /* Store new hash state to context. */ 255 movq hs0, (0*8)(ctx) 256 movq hs1, (1*8)(ctx) 257 movq hs2, (2*8)(ctx) 258 movq hs3, (3*8)(ctx) 259 movq hs4, (4*8)(ctx) 260 movq hs5, (5*8)(ctx) 261 movq hs6, (6*8)(ctx) 262 movq hs7, (7*8)(ctx) 263 264 addq $128, in 265 cmpq (128+0*8)(%rsp), in 266 jb .Lblock_loop0 267 268 movq (128+2*8)(%rsp), %rsp 269 270 /* Restore callee save registers. */ 271 popq %r15 272 popq %r14 273 popq %r13 274 popq %r12 275 popq %rbp 276 popq %rbx 277 278 ret 279 280/* 281 * SHA-512 constants - see FIPS 180-4 section 4.2.3. 282 */ 283.rodata 284.align 64 285.type K512,@object 286K512: 287.quad 0x428a2f98d728ae22, 0x7137449123ef65cd, 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc 288.quad 0x3956c25bf348b538, 0x59f111f1b605d019, 0x923f82a4af194f9b, 0xab1c5ed5da6d8118 289.quad 0xd807aa98a3030242, 0x12835b0145706fbe, 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2 290.quad 0x72be5d74f27b896f, 0x80deb1fe3b1696b1, 0x9bdc06a725c71235, 0xc19bf174cf692694 291.quad 0xe49b69c19ef14ad2, 0xefbe4786384f25e3, 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65 292.quad 0x2de92c6f592b0275, 0x4a7484aa6ea6e483, 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5 293.quad 0x983e5152ee66dfab, 0xa831c66d2db43210, 0xb00327c898fb213f, 0xbf597fc7beef0ee4 294.quad 0xc6e00bf33da88fc2, 0xd5a79147930aa725, 0x06ca6351e003826f, 0x142929670a0e6e70 295.quad 0x27b70a8546d22ffc, 0x2e1b21385c26c926, 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df 296.quad 0x650a73548baf63de, 0x766a0abb3c77b2a8, 0x81c2c92e47edaee6, 0x92722c851482353b 297.quad 0xa2bfe8a14cf10364, 0xa81a664bbc423001, 0xc24b8b70d0f89791, 0xc76c51a30654be30 298.quad 0xd192e819d6ef5218, 0xd69906245565a910, 0xf40e35855771202a, 0x106aa07032bbd1b8 299.quad 0x19a4c116b8d2d0c8, 0x1e376c085141ab53, 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8 300.quad 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb, 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3 301.quad 0x748f82ee5defb2fc, 0x78a5636f43172f60, 0x84c87814a1f0ab72, 0x8cc702081a6439ec 302.quad 0x90befffa23631e28, 0xa4506cebde82bde9, 0xbef9a3f7b2c67915, 0xc67178f2e372532b 303.quad 0xca273eceea26619c, 0xd186b8c721c0c207, 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178 304.quad 0x06f067aa72176fba, 0x0a637dc5a2c898a6, 0x113f9804bef90dae, 0x1b710b35131c471b 305.quad 0x28db77f523047d84, 0x32caab7b40c72493, 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c 306.quad 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a, 0x5fcb6fab3ad6faec, 0x6c44198c4a475817 307.size K512,.-K512 308