1*b077aed3SPierre Pronchery /* 2*b077aed3SPierre Pronchery * Copyright 2020-2023 The OpenSSL Project Authors. All Rights Reserved. 3*b077aed3SPierre Pronchery * Copyright (c) 2020, Intel Corporation. All Rights Reserved. 4*b077aed3SPierre Pronchery * 5*b077aed3SPierre Pronchery * Licensed under the Apache License 2.0 (the "License"). You may not use 6*b077aed3SPierre Pronchery * this file except in compliance with the License. You can obtain a copy 7*b077aed3SPierre Pronchery * in the file LICENSE in the source distribution or at 8*b077aed3SPierre Pronchery * https://www.openssl.org/source/license.html 9*b077aed3SPierre Pronchery * 10*b077aed3SPierre Pronchery * 11*b077aed3SPierre Pronchery * Originally written by Ilya Albrekht, Sergey Kirillov and Andrey Matyukov 12*b077aed3SPierre Pronchery * Intel Corporation 13*b077aed3SPierre Pronchery * 14*b077aed3SPierre Pronchery */ 15*b077aed3SPierre Pronchery 16*b077aed3SPierre Pronchery #include <openssl/opensslconf.h> 17*b077aed3SPierre Pronchery #include <openssl/crypto.h> 18*b077aed3SPierre Pronchery #include "rsaz_exp.h" 19*b077aed3SPierre Pronchery 20*b077aed3SPierre Pronchery #ifndef RSAZ_ENABLED 21*b077aed3SPierre Pronchery NON_EMPTY_TRANSLATION_UNIT 22*b077aed3SPierre Pronchery #else 23*b077aed3SPierre Pronchery # include <assert.h> 24*b077aed3SPierre Pronchery # include <string.h> 25*b077aed3SPierre Pronchery 26*b077aed3SPierre Pronchery # if defined(__GNUC__) 27*b077aed3SPierre Pronchery # define ALIGN64 __attribute__((aligned(64))) 28*b077aed3SPierre Pronchery # elif defined(_MSC_VER) 29*b077aed3SPierre Pronchery # define ALIGN64 __declspec(align(64)) 30*b077aed3SPierre Pronchery # else 31*b077aed3SPierre Pronchery # define ALIGN64 32*b077aed3SPierre Pronchery # endif 33*b077aed3SPierre Pronchery 34*b077aed3SPierre Pronchery # define ALIGN_OF(ptr, boundary) \ 35*b077aed3SPierre Pronchery ((unsigned char *)(ptr) + (boundary - (((size_t)(ptr)) & (boundary - 1)))) 36*b077aed3SPierre Pronchery 37*b077aed3SPierre Pronchery /* Internal radix */ 38*b077aed3SPierre Pronchery # define DIGIT_SIZE (52) 39*b077aed3SPierre Pronchery /* 52-bit mask */ 40*b077aed3SPierre Pronchery # define DIGIT_MASK ((uint64_t)0xFFFFFFFFFFFFF) 41*b077aed3SPierre Pronchery 42*b077aed3SPierre Pronchery # define BITS2WORD8_SIZE(x) (((x) + 7) >> 3) 43*b077aed3SPierre Pronchery # define BITS2WORD64_SIZE(x) (((x) + 63) >> 6) 44*b077aed3SPierre Pronchery 45*b077aed3SPierre Pronchery static ossl_inline uint64_t get_digit52(const uint8_t *in, int in_len); 46*b077aed3SPierre Pronchery static ossl_inline void put_digit52(uint8_t *out, int out_len, uint64_t digit); 47*b077aed3SPierre Pronchery static void to_words52(BN_ULONG *out, int out_len, const BN_ULONG *in, 48*b077aed3SPierre Pronchery int in_bitsize); 49*b077aed3SPierre Pronchery static void from_words52(BN_ULONG *bn_out, int out_bitsize, const BN_ULONG *in); 50*b077aed3SPierre Pronchery static ossl_inline void set_bit(BN_ULONG *a, int idx); 51*b077aed3SPierre Pronchery 52*b077aed3SPierre Pronchery /* Number of |digit_size|-bit digits in |bitsize|-bit value */ 53*b077aed3SPierre Pronchery static ossl_inline int number_of_digits(int bitsize, int digit_size) 54*b077aed3SPierre Pronchery { 55*b077aed3SPierre Pronchery return (bitsize + digit_size - 1) / digit_size; 56*b077aed3SPierre Pronchery } 57*b077aed3SPierre Pronchery 58*b077aed3SPierre Pronchery typedef void (*AMM52)(BN_ULONG *res, const BN_ULONG *base, 59*b077aed3SPierre Pronchery const BN_ULONG *exp, const BN_ULONG *m, BN_ULONG k0); 60*b077aed3SPierre Pronchery typedef void (*EXP52_x2)(BN_ULONG *res, const BN_ULONG *base, 61*b077aed3SPierre Pronchery const BN_ULONG *exp[2], const BN_ULONG *m, 62*b077aed3SPierre Pronchery const BN_ULONG *rr, const BN_ULONG k0[2]); 63*b077aed3SPierre Pronchery 64*b077aed3SPierre Pronchery /* 65*b077aed3SPierre Pronchery * For details of the methods declared below please refer to 66*b077aed3SPierre Pronchery * crypto/bn/asm/rsaz-avx512.pl 67*b077aed3SPierre Pronchery * 68*b077aed3SPierre Pronchery * Naming notes: 69*b077aed3SPierre Pronchery * amm = Almost Montgomery Multiplication 70*b077aed3SPierre Pronchery * ams = Almost Montgomery Squaring 71*b077aed3SPierre Pronchery * 52x20 - data represented as array of 20 digits in 52-bit radix 72*b077aed3SPierre Pronchery * _x1_/_x2_ - 1 or 2 independent inputs/outputs 73*b077aed3SPierre Pronchery * _256 suffix - uses 256-bit (AVX512VL) registers 74*b077aed3SPierre Pronchery */ 75*b077aed3SPierre Pronchery 76*b077aed3SPierre Pronchery /*AMM = Almost Montgomery Multiplication. */ 77*b077aed3SPierre Pronchery void ossl_rsaz_amm52x20_x1_256(BN_ULONG *res, const BN_ULONG *base, 78*b077aed3SPierre Pronchery const BN_ULONG *exp, const BN_ULONG *m, 79*b077aed3SPierre Pronchery BN_ULONG k0); 80*b077aed3SPierre Pronchery static void RSAZ_exp52x20_x2_256(BN_ULONG *res, const BN_ULONG *base, 81*b077aed3SPierre Pronchery const BN_ULONG *exp[2], const BN_ULONG *m, 82*b077aed3SPierre Pronchery const BN_ULONG *rr, const BN_ULONG k0[2]); 83*b077aed3SPierre Pronchery void ossl_rsaz_amm52x20_x2_256(BN_ULONG *out, const BN_ULONG *a, 84*b077aed3SPierre Pronchery const BN_ULONG *b, const BN_ULONG *m, 85*b077aed3SPierre Pronchery const BN_ULONG k0[2]); 86*b077aed3SPierre Pronchery void ossl_extract_multiplier_2x20_win5(BN_ULONG *red_Y, 87*b077aed3SPierre Pronchery const BN_ULONG *red_table, 88*b077aed3SPierre Pronchery int red_table_idx, int tbl_idx); 89*b077aed3SPierre Pronchery 90*b077aed3SPierre Pronchery /* 91*b077aed3SPierre Pronchery * Dual Montgomery modular exponentiation using prime moduli of the 92*b077aed3SPierre Pronchery * same bit size, optimized with AVX512 ISA. 93*b077aed3SPierre Pronchery * 94*b077aed3SPierre Pronchery * Input and output parameters for each exponentiation are independent and 95*b077aed3SPierre Pronchery * denoted here by index |i|, i = 1..2. 96*b077aed3SPierre Pronchery * 97*b077aed3SPierre Pronchery * Input and output are all in regular 2^64 radix. 98*b077aed3SPierre Pronchery * 99*b077aed3SPierre Pronchery * Each moduli shall be |factor_size| bit size. 100*b077aed3SPierre Pronchery * 101*b077aed3SPierre Pronchery * NOTE: currently only 2x1024 case is supported. 102*b077aed3SPierre Pronchery * 103*b077aed3SPierre Pronchery * [out] res|i| - result of modular exponentiation: array of qword values 104*b077aed3SPierre Pronchery * in regular (2^64) radix. Size of array shall be enough 105*b077aed3SPierre Pronchery * to hold |factor_size| bits. 106*b077aed3SPierre Pronchery * [in] base|i| - base 107*b077aed3SPierre Pronchery * [in] exp|i| - exponent 108*b077aed3SPierre Pronchery * [in] m|i| - moduli 109*b077aed3SPierre Pronchery * [in] rr|i| - Montgomery parameter RR = R^2 mod m|i| 110*b077aed3SPierre Pronchery * [in] k0_|i| - Montgomery parameter k0 = -1/m|i| mod 2^64 111*b077aed3SPierre Pronchery * [in] factor_size - moduli bit size 112*b077aed3SPierre Pronchery * 113*b077aed3SPierre Pronchery * \return 0 in case of failure, 114*b077aed3SPierre Pronchery * 1 in case of success. 115*b077aed3SPierre Pronchery */ 116*b077aed3SPierre Pronchery int ossl_rsaz_mod_exp_avx512_x2(BN_ULONG *res1, 117*b077aed3SPierre Pronchery const BN_ULONG *base1, 118*b077aed3SPierre Pronchery const BN_ULONG *exp1, 119*b077aed3SPierre Pronchery const BN_ULONG *m1, 120*b077aed3SPierre Pronchery const BN_ULONG *rr1, 121*b077aed3SPierre Pronchery BN_ULONG k0_1, 122*b077aed3SPierre Pronchery BN_ULONG *res2, 123*b077aed3SPierre Pronchery const BN_ULONG *base2, 124*b077aed3SPierre Pronchery const BN_ULONG *exp2, 125*b077aed3SPierre Pronchery const BN_ULONG *m2, 126*b077aed3SPierre Pronchery const BN_ULONG *rr2, 127*b077aed3SPierre Pronchery BN_ULONG k0_2, 128*b077aed3SPierre Pronchery int factor_size) 129*b077aed3SPierre Pronchery { 130*b077aed3SPierre Pronchery int ret = 0; 131*b077aed3SPierre Pronchery 132*b077aed3SPierre Pronchery /* 133*b077aed3SPierre Pronchery * Number of word-size (BN_ULONG) digits to store exponent in redundant 134*b077aed3SPierre Pronchery * representation. 135*b077aed3SPierre Pronchery */ 136*b077aed3SPierre Pronchery int exp_digits = number_of_digits(factor_size + 2, DIGIT_SIZE); 137*b077aed3SPierre Pronchery int coeff_pow = 4 * (DIGIT_SIZE * exp_digits - factor_size); 138*b077aed3SPierre Pronchery BN_ULONG *base1_red, *m1_red, *rr1_red; 139*b077aed3SPierre Pronchery BN_ULONG *base2_red, *m2_red, *rr2_red; 140*b077aed3SPierre Pronchery BN_ULONG *coeff_red; 141*b077aed3SPierre Pronchery BN_ULONG *storage = NULL; 142*b077aed3SPierre Pronchery BN_ULONG *storage_aligned = NULL; 143*b077aed3SPierre Pronchery BN_ULONG storage_len_bytes = 7 * exp_digits * sizeof(BN_ULONG); 144*b077aed3SPierre Pronchery 145*b077aed3SPierre Pronchery /* AMM = Almost Montgomery Multiplication */ 146*b077aed3SPierre Pronchery AMM52 amm = NULL; 147*b077aed3SPierre Pronchery /* Dual (2-exps in parallel) exponentiation */ 148*b077aed3SPierre Pronchery EXP52_x2 exp_x2 = NULL; 149*b077aed3SPierre Pronchery 150*b077aed3SPierre Pronchery const BN_ULONG *exp[2] = {0}; 151*b077aed3SPierre Pronchery BN_ULONG k0[2] = {0}; 152*b077aed3SPierre Pronchery 153*b077aed3SPierre Pronchery /* Only 1024-bit factor size is supported now */ 154*b077aed3SPierre Pronchery switch (factor_size) { 155*b077aed3SPierre Pronchery case 1024: 156*b077aed3SPierre Pronchery amm = ossl_rsaz_amm52x20_x1_256; 157*b077aed3SPierre Pronchery exp_x2 = RSAZ_exp52x20_x2_256; 158*b077aed3SPierre Pronchery break; 159*b077aed3SPierre Pronchery default: 160*b077aed3SPierre Pronchery goto err; 161*b077aed3SPierre Pronchery } 162*b077aed3SPierre Pronchery 163*b077aed3SPierre Pronchery storage = (BN_ULONG *)OPENSSL_malloc(storage_len_bytes + 64); 164*b077aed3SPierre Pronchery if (storage == NULL) 165*b077aed3SPierre Pronchery goto err; 166*b077aed3SPierre Pronchery storage_aligned = (BN_ULONG *)ALIGN_OF(storage, 64); 167*b077aed3SPierre Pronchery 168*b077aed3SPierre Pronchery /* Memory layout for red(undant) representations */ 169*b077aed3SPierre Pronchery base1_red = storage_aligned; 170*b077aed3SPierre Pronchery base2_red = storage_aligned + 1 * exp_digits; 171*b077aed3SPierre Pronchery m1_red = storage_aligned + 2 * exp_digits; 172*b077aed3SPierre Pronchery m2_red = storage_aligned + 3 * exp_digits; 173*b077aed3SPierre Pronchery rr1_red = storage_aligned + 4 * exp_digits; 174*b077aed3SPierre Pronchery rr2_red = storage_aligned + 5 * exp_digits; 175*b077aed3SPierre Pronchery coeff_red = storage_aligned + 6 * exp_digits; 176*b077aed3SPierre Pronchery 177*b077aed3SPierre Pronchery /* Convert base_i, m_i, rr_i, from regular to 52-bit radix */ 178*b077aed3SPierre Pronchery to_words52(base1_red, exp_digits, base1, factor_size); 179*b077aed3SPierre Pronchery to_words52(base2_red, exp_digits, base2, factor_size); 180*b077aed3SPierre Pronchery to_words52(m1_red, exp_digits, m1, factor_size); 181*b077aed3SPierre Pronchery to_words52(m2_red, exp_digits, m2, factor_size); 182*b077aed3SPierre Pronchery to_words52(rr1_red, exp_digits, rr1, factor_size); 183*b077aed3SPierre Pronchery to_words52(rr2_red, exp_digits, rr2, factor_size); 184*b077aed3SPierre Pronchery 185*b077aed3SPierre Pronchery /* 186*b077aed3SPierre Pronchery * Compute target domain Montgomery converters RR' for each modulus 187*b077aed3SPierre Pronchery * based on precomputed original domain's RR. 188*b077aed3SPierre Pronchery * 189*b077aed3SPierre Pronchery * RR -> RR' transformation steps: 190*b077aed3SPierre Pronchery * (1) coeff = 2^k 191*b077aed3SPierre Pronchery * (2) t = AMM(RR,RR) = RR^2 / R' mod m 192*b077aed3SPierre Pronchery * (3) RR' = AMM(t, coeff) = RR^2 * 2^k / R'^2 mod m 193*b077aed3SPierre Pronchery * where 194*b077aed3SPierre Pronchery * k = 4 * (52 * digits52 - modlen) 195*b077aed3SPierre Pronchery * R = 2^(64 * ceil(modlen/64)) mod m 196*b077aed3SPierre Pronchery * RR = R^2 mod M 197*b077aed3SPierre Pronchery * R' = 2^(52 * ceil(modlen/52)) mod m 198*b077aed3SPierre Pronchery * 199*b077aed3SPierre Pronchery * modlen = 1024: k = 64, RR = 2^2048 mod m, RR' = 2^2080 mod m 200*b077aed3SPierre Pronchery */ 201*b077aed3SPierre Pronchery memset(coeff_red, 0, exp_digits * sizeof(BN_ULONG)); 202*b077aed3SPierre Pronchery /* (1) in reduced domain representation */ 203*b077aed3SPierre Pronchery set_bit(coeff_red, 64 * (int)(coeff_pow / 52) + coeff_pow % 52); 204*b077aed3SPierre Pronchery 205*b077aed3SPierre Pronchery amm(rr1_red, rr1_red, rr1_red, m1_red, k0_1); /* (2) for m1 */ 206*b077aed3SPierre Pronchery amm(rr1_red, rr1_red, coeff_red, m1_red, k0_1); /* (3) for m1 */ 207*b077aed3SPierre Pronchery 208*b077aed3SPierre Pronchery amm(rr2_red, rr2_red, rr2_red, m2_red, k0_2); /* (2) for m2 */ 209*b077aed3SPierre Pronchery amm(rr2_red, rr2_red, coeff_red, m2_red, k0_2); /* (3) for m2 */ 210*b077aed3SPierre Pronchery 211*b077aed3SPierre Pronchery exp[0] = exp1; 212*b077aed3SPierre Pronchery exp[1] = exp2; 213*b077aed3SPierre Pronchery 214*b077aed3SPierre Pronchery k0[0] = k0_1; 215*b077aed3SPierre Pronchery k0[1] = k0_2; 216*b077aed3SPierre Pronchery 217*b077aed3SPierre Pronchery exp_x2(rr1_red, base1_red, exp, m1_red, rr1_red, k0); 218*b077aed3SPierre Pronchery 219*b077aed3SPierre Pronchery /* Convert rr_i back to regular radix */ 220*b077aed3SPierre Pronchery from_words52(res1, factor_size, rr1_red); 221*b077aed3SPierre Pronchery from_words52(res2, factor_size, rr2_red); 222*b077aed3SPierre Pronchery 223*b077aed3SPierre Pronchery /* bn_reduce_once_in_place expects number of BN_ULONG, not bit size */ 224*b077aed3SPierre Pronchery factor_size /= sizeof(BN_ULONG) * 8; 225*b077aed3SPierre Pronchery 226*b077aed3SPierre Pronchery bn_reduce_once_in_place(res1, /*carry=*/0, m1, storage, factor_size); 227*b077aed3SPierre Pronchery bn_reduce_once_in_place(res2, /*carry=*/0, m2, storage, factor_size); 228*b077aed3SPierre Pronchery 229*b077aed3SPierre Pronchery ret = 1; 230*b077aed3SPierre Pronchery err: 231*b077aed3SPierre Pronchery if (storage != NULL) { 232*b077aed3SPierre Pronchery OPENSSL_cleanse(storage, storage_len_bytes); 233*b077aed3SPierre Pronchery OPENSSL_free(storage); 234*b077aed3SPierre Pronchery } 235*b077aed3SPierre Pronchery return ret; 236*b077aed3SPierre Pronchery } 237*b077aed3SPierre Pronchery 238*b077aed3SPierre Pronchery /* 239*b077aed3SPierre Pronchery * Dual 1024-bit w-ary modular exponentiation using prime moduli of the same 240*b077aed3SPierre Pronchery * bit size using Almost Montgomery Multiplication, optimized with AVX512_IFMA 241*b077aed3SPierre Pronchery * ISA. 242*b077aed3SPierre Pronchery * 243*b077aed3SPierre Pronchery * The parameter w (window size) = 5. 244*b077aed3SPierre Pronchery * 245*b077aed3SPierre Pronchery * [out] res - result of modular exponentiation: 2x20 qword 246*b077aed3SPierre Pronchery * values in 2^52 radix. 247*b077aed3SPierre Pronchery * [in] base - base (2x20 qword values in 2^52 radix) 248*b077aed3SPierre Pronchery * [in] exp - array of 2 pointers to 16 qword values in 2^64 radix. 249*b077aed3SPierre Pronchery * Exponent is not converted to redundant representation. 250*b077aed3SPierre Pronchery * [in] m - moduli (2x20 qword values in 2^52 radix) 251*b077aed3SPierre Pronchery * [in] rr - Montgomery parameter for 2 moduli: RR = 2^2080 mod m. 252*b077aed3SPierre Pronchery * (2x20 qword values in 2^52 radix) 253*b077aed3SPierre Pronchery * [in] k0 - Montgomery parameter for 2 moduli: k0 = -1/m mod 2^64 254*b077aed3SPierre Pronchery * 255*b077aed3SPierre Pronchery * \return (void). 256*b077aed3SPierre Pronchery */ 257*b077aed3SPierre Pronchery static void RSAZ_exp52x20_x2_256(BN_ULONG *out, /* [2][20] */ 258*b077aed3SPierre Pronchery const BN_ULONG *base, /* [2][20] */ 259*b077aed3SPierre Pronchery const BN_ULONG *exp[2], /* 2x16 */ 260*b077aed3SPierre Pronchery const BN_ULONG *m, /* [2][20] */ 261*b077aed3SPierre Pronchery const BN_ULONG *rr, /* [2][20] */ 262*b077aed3SPierre Pronchery const BN_ULONG k0[2]) 263*b077aed3SPierre Pronchery { 264*b077aed3SPierre Pronchery # define BITSIZE_MODULUS (1024) 265*b077aed3SPierre Pronchery # define EXP_WIN_SIZE (5) 266*b077aed3SPierre Pronchery # define EXP_WIN_MASK ((1U << EXP_WIN_SIZE) - 1) 267*b077aed3SPierre Pronchery /* 268*b077aed3SPierre Pronchery * Number of digits (64-bit words) in redundant representation to handle 269*b077aed3SPierre Pronchery * modulus bits 270*b077aed3SPierre Pronchery */ 271*b077aed3SPierre Pronchery # define RED_DIGITS (20) 272*b077aed3SPierre Pronchery # define EXP_DIGITS (16) 273*b077aed3SPierre Pronchery # define DAMM ossl_rsaz_amm52x20_x2_256 274*b077aed3SPierre Pronchery /* 275*b077aed3SPierre Pronchery * Squaring is done using multiplication now. That can be a subject of 276*b077aed3SPierre Pronchery * optimization in future. 277*b077aed3SPierre Pronchery */ 278*b077aed3SPierre Pronchery # define DAMS(r,a,m,k0) \ 279*b077aed3SPierre Pronchery ossl_rsaz_amm52x20_x2_256((r),(a),(a),(m),(k0)) 280*b077aed3SPierre Pronchery 281*b077aed3SPierre Pronchery /* Allocate stack for red(undant) result Y and multiplier X */ 282*b077aed3SPierre Pronchery ALIGN64 BN_ULONG red_Y[2][RED_DIGITS]; 283*b077aed3SPierre Pronchery ALIGN64 BN_ULONG red_X[2][RED_DIGITS]; 284*b077aed3SPierre Pronchery 285*b077aed3SPierre Pronchery /* Allocate expanded exponent */ 286*b077aed3SPierre Pronchery ALIGN64 BN_ULONG expz[2][EXP_DIGITS + 1]; 287*b077aed3SPierre Pronchery 288*b077aed3SPierre Pronchery /* Pre-computed table of base powers */ 289*b077aed3SPierre Pronchery ALIGN64 BN_ULONG red_table[1U << EXP_WIN_SIZE][2][RED_DIGITS]; 290*b077aed3SPierre Pronchery 291*b077aed3SPierre Pronchery int idx; 292*b077aed3SPierre Pronchery 293*b077aed3SPierre Pronchery memset(red_Y, 0, sizeof(red_Y)); 294*b077aed3SPierre Pronchery memset(red_table, 0, sizeof(red_table)); 295*b077aed3SPierre Pronchery memset(red_X, 0, sizeof(red_X)); 296*b077aed3SPierre Pronchery 297*b077aed3SPierre Pronchery /* 298*b077aed3SPierre Pronchery * Compute table of powers base^i, i = 0, ..., (2^EXP_WIN_SIZE) - 1 299*b077aed3SPierre Pronchery * table[0] = mont(x^0) = mont(1) 300*b077aed3SPierre Pronchery * table[1] = mont(x^1) = mont(x) 301*b077aed3SPierre Pronchery */ 302*b077aed3SPierre Pronchery red_X[0][0] = 1; 303*b077aed3SPierre Pronchery red_X[1][0] = 1; 304*b077aed3SPierre Pronchery DAMM(red_table[0][0], (const BN_ULONG*)red_X, rr, m, k0); 305*b077aed3SPierre Pronchery DAMM(red_table[1][0], base, rr, m, k0); 306*b077aed3SPierre Pronchery 307*b077aed3SPierre Pronchery for (idx = 1; idx < (int)((1U << EXP_WIN_SIZE) / 2); idx++) { 308*b077aed3SPierre Pronchery DAMS(red_table[2 * idx + 0][0], red_table[1 * idx][0], m, k0); 309*b077aed3SPierre Pronchery DAMM(red_table[2 * idx + 1][0], red_table[2 * idx][0], red_table[1][0], m, k0); 310*b077aed3SPierre Pronchery } 311*b077aed3SPierre Pronchery 312*b077aed3SPierre Pronchery /* Copy and expand exponents */ 313*b077aed3SPierre Pronchery memcpy(expz[0], exp[0], EXP_DIGITS * sizeof(BN_ULONG)); 314*b077aed3SPierre Pronchery expz[0][EXP_DIGITS] = 0; 315*b077aed3SPierre Pronchery memcpy(expz[1], exp[1], EXP_DIGITS * sizeof(BN_ULONG)); 316*b077aed3SPierre Pronchery expz[1][EXP_DIGITS] = 0; 317*b077aed3SPierre Pronchery 318*b077aed3SPierre Pronchery /* Exponentiation */ 319*b077aed3SPierre Pronchery { 320*b077aed3SPierre Pronchery const int rem = BITSIZE_MODULUS % EXP_WIN_SIZE; 321*b077aed3SPierre Pronchery BN_ULONG table_idx_mask = EXP_WIN_MASK; 322*b077aed3SPierre Pronchery 323*b077aed3SPierre Pronchery int exp_bit_no = BITSIZE_MODULUS - rem; 324*b077aed3SPierre Pronchery int exp_chunk_no = exp_bit_no / 64; 325*b077aed3SPierre Pronchery int exp_chunk_shift = exp_bit_no % 64; 326*b077aed3SPierre Pronchery 327*b077aed3SPierre Pronchery BN_ULONG red_table_idx_0, red_table_idx_1; 328*b077aed3SPierre Pronchery 329*b077aed3SPierre Pronchery /* 330*b077aed3SPierre Pronchery * If rem == 0, then 331*b077aed3SPierre Pronchery * exp_bit_no = modulus_bitsize - exp_win_size 332*b077aed3SPierre Pronchery * However, this isn't possible because rem is { 1024, 1536, 2048 } % 5 333*b077aed3SPierre Pronchery * which is { 4, 1, 3 } respectively. 334*b077aed3SPierre Pronchery * 335*b077aed3SPierre Pronchery * If this assertion ever fails the fix above is easy. 336*b077aed3SPierre Pronchery */ 337*b077aed3SPierre Pronchery OPENSSL_assert(rem != 0); 338*b077aed3SPierre Pronchery 339*b077aed3SPierre Pronchery /* Process 1-st exp window - just init result */ 340*b077aed3SPierre Pronchery red_table_idx_0 = expz[0][exp_chunk_no]; 341*b077aed3SPierre Pronchery red_table_idx_1 = expz[1][exp_chunk_no]; 342*b077aed3SPierre Pronchery /* 343*b077aed3SPierre Pronchery * The function operates with fixed moduli sizes divisible by 64, 344*b077aed3SPierre Pronchery * thus table index here is always in supported range [0, EXP_WIN_SIZE). 345*b077aed3SPierre Pronchery */ 346*b077aed3SPierre Pronchery red_table_idx_0 >>= exp_chunk_shift; 347*b077aed3SPierre Pronchery red_table_idx_1 >>= exp_chunk_shift; 348*b077aed3SPierre Pronchery 349*b077aed3SPierre Pronchery ossl_extract_multiplier_2x20_win5(red_Y[0], (const BN_ULONG*)red_table, 350*b077aed3SPierre Pronchery (int)red_table_idx_0, 0); 351*b077aed3SPierre Pronchery ossl_extract_multiplier_2x20_win5(red_Y[1], (const BN_ULONG*)red_table, 352*b077aed3SPierre Pronchery (int)red_table_idx_1, 1); 353*b077aed3SPierre Pronchery 354*b077aed3SPierre Pronchery /* Process other exp windows */ 355*b077aed3SPierre Pronchery for (exp_bit_no -= EXP_WIN_SIZE; exp_bit_no >= 0; exp_bit_no -= EXP_WIN_SIZE) { 356*b077aed3SPierre Pronchery /* Extract pre-computed multiplier from the table */ 357*b077aed3SPierre Pronchery { 358*b077aed3SPierre Pronchery BN_ULONG T; 359*b077aed3SPierre Pronchery 360*b077aed3SPierre Pronchery exp_chunk_no = exp_bit_no / 64; 361*b077aed3SPierre Pronchery exp_chunk_shift = exp_bit_no % 64; 362*b077aed3SPierre Pronchery { 363*b077aed3SPierre Pronchery red_table_idx_0 = expz[0][exp_chunk_no]; 364*b077aed3SPierre Pronchery T = expz[0][exp_chunk_no + 1]; 365*b077aed3SPierre Pronchery 366*b077aed3SPierre Pronchery red_table_idx_0 >>= exp_chunk_shift; 367*b077aed3SPierre Pronchery /* 368*b077aed3SPierre Pronchery * Get additional bits from then next quadword 369*b077aed3SPierre Pronchery * when 64-bit boundaries are crossed. 370*b077aed3SPierre Pronchery */ 371*b077aed3SPierre Pronchery if (exp_chunk_shift > 64 - EXP_WIN_SIZE) { 372*b077aed3SPierre Pronchery T <<= (64 - exp_chunk_shift); 373*b077aed3SPierre Pronchery red_table_idx_0 ^= T; 374*b077aed3SPierre Pronchery } 375*b077aed3SPierre Pronchery red_table_idx_0 &= table_idx_mask; 376*b077aed3SPierre Pronchery 377*b077aed3SPierre Pronchery ossl_extract_multiplier_2x20_win5(red_X[0], 378*b077aed3SPierre Pronchery (const BN_ULONG*)red_table, 379*b077aed3SPierre Pronchery (int)red_table_idx_0, 0); 380*b077aed3SPierre Pronchery } 381*b077aed3SPierre Pronchery { 382*b077aed3SPierre Pronchery red_table_idx_1 = expz[1][exp_chunk_no]; 383*b077aed3SPierre Pronchery T = expz[1][exp_chunk_no + 1]; 384*b077aed3SPierre Pronchery 385*b077aed3SPierre Pronchery red_table_idx_1 >>= exp_chunk_shift; 386*b077aed3SPierre Pronchery /* 387*b077aed3SPierre Pronchery * Get additional bits from then next quadword 388*b077aed3SPierre Pronchery * when 64-bit boundaries are crossed. 389*b077aed3SPierre Pronchery */ 390*b077aed3SPierre Pronchery if (exp_chunk_shift > 64 - EXP_WIN_SIZE) { 391*b077aed3SPierre Pronchery T <<= (64 - exp_chunk_shift); 392*b077aed3SPierre Pronchery red_table_idx_1 ^= T; 393*b077aed3SPierre Pronchery } 394*b077aed3SPierre Pronchery red_table_idx_1 &= table_idx_mask; 395*b077aed3SPierre Pronchery 396*b077aed3SPierre Pronchery ossl_extract_multiplier_2x20_win5(red_X[1], 397*b077aed3SPierre Pronchery (const BN_ULONG*)red_table, 398*b077aed3SPierre Pronchery (int)red_table_idx_1, 1); 399*b077aed3SPierre Pronchery } 400*b077aed3SPierre Pronchery } 401*b077aed3SPierre Pronchery 402*b077aed3SPierre Pronchery /* Series of squaring */ 403*b077aed3SPierre Pronchery DAMS((BN_ULONG*)red_Y, (const BN_ULONG*)red_Y, m, k0); 404*b077aed3SPierre Pronchery DAMS((BN_ULONG*)red_Y, (const BN_ULONG*)red_Y, m, k0); 405*b077aed3SPierre Pronchery DAMS((BN_ULONG*)red_Y, (const BN_ULONG*)red_Y, m, k0); 406*b077aed3SPierre Pronchery DAMS((BN_ULONG*)red_Y, (const BN_ULONG*)red_Y, m, k0); 407*b077aed3SPierre Pronchery DAMS((BN_ULONG*)red_Y, (const BN_ULONG*)red_Y, m, k0); 408*b077aed3SPierre Pronchery 409*b077aed3SPierre Pronchery DAMM((BN_ULONG*)red_Y, (const BN_ULONG*)red_Y, (const BN_ULONG*)red_X, m, k0); 410*b077aed3SPierre Pronchery } 411*b077aed3SPierre Pronchery } 412*b077aed3SPierre Pronchery 413*b077aed3SPierre Pronchery /* 414*b077aed3SPierre Pronchery * 415*b077aed3SPierre Pronchery * NB: After the last AMM of exponentiation in Montgomery domain, the result 416*b077aed3SPierre Pronchery * may be 1025-bit, but the conversion out of Montgomery domain performs an 417*b077aed3SPierre Pronchery * AMM(x,1) which guarantees that the final result is less than |m|, so no 418*b077aed3SPierre Pronchery * conditional subtraction is needed here. See "Efficient Software 419*b077aed3SPierre Pronchery * Implementations of Modular Exponentiation" (by Shay Gueron) paper for details. 420*b077aed3SPierre Pronchery */ 421*b077aed3SPierre Pronchery 422*b077aed3SPierre Pronchery /* Convert result back in regular 2^52 domain */ 423*b077aed3SPierre Pronchery memset(red_X, 0, sizeof(red_X)); 424*b077aed3SPierre Pronchery red_X[0][0] = 1; 425*b077aed3SPierre Pronchery red_X[1][0] = 1; 426*b077aed3SPierre Pronchery DAMM(out, (const BN_ULONG*)red_Y, (const BN_ULONG*)red_X, m, k0); 427*b077aed3SPierre Pronchery 428*b077aed3SPierre Pronchery /* Clear exponents */ 429*b077aed3SPierre Pronchery OPENSSL_cleanse(expz, sizeof(expz)); 430*b077aed3SPierre Pronchery OPENSSL_cleanse(red_Y, sizeof(red_Y)); 431*b077aed3SPierre Pronchery 432*b077aed3SPierre Pronchery # undef DAMS 433*b077aed3SPierre Pronchery # undef DAMM 434*b077aed3SPierre Pronchery # undef EXP_DIGITS 435*b077aed3SPierre Pronchery # undef RED_DIGITS 436*b077aed3SPierre Pronchery # undef EXP_WIN_MASK 437*b077aed3SPierre Pronchery # undef EXP_WIN_SIZE 438*b077aed3SPierre Pronchery # undef BITSIZE_MODULUS 439*b077aed3SPierre Pronchery } 440*b077aed3SPierre Pronchery 441*b077aed3SPierre Pronchery static ossl_inline uint64_t get_digit52(const uint8_t *in, int in_len) 442*b077aed3SPierre Pronchery { 443*b077aed3SPierre Pronchery uint64_t digit = 0; 444*b077aed3SPierre Pronchery 445*b077aed3SPierre Pronchery assert(in != NULL); 446*b077aed3SPierre Pronchery 447*b077aed3SPierre Pronchery for (; in_len > 0; in_len--) { 448*b077aed3SPierre Pronchery digit <<= 8; 449*b077aed3SPierre Pronchery digit += (uint64_t)(in[in_len - 1]); 450*b077aed3SPierre Pronchery } 451*b077aed3SPierre Pronchery return digit; 452*b077aed3SPierre Pronchery } 453*b077aed3SPierre Pronchery 454*b077aed3SPierre Pronchery /* 455*b077aed3SPierre Pronchery * Convert array of words in regular (base=2^64) representation to array of 456*b077aed3SPierre Pronchery * words in redundant (base=2^52) one. 457*b077aed3SPierre Pronchery */ 458*b077aed3SPierre Pronchery static void to_words52(BN_ULONG *out, int out_len, 459*b077aed3SPierre Pronchery const BN_ULONG *in, int in_bitsize) 460*b077aed3SPierre Pronchery { 461*b077aed3SPierre Pronchery uint8_t *in_str = NULL; 462*b077aed3SPierre Pronchery 463*b077aed3SPierre Pronchery assert(out != NULL); 464*b077aed3SPierre Pronchery assert(in != NULL); 465*b077aed3SPierre Pronchery /* Check destination buffer capacity */ 466*b077aed3SPierre Pronchery assert(out_len >= number_of_digits(in_bitsize, DIGIT_SIZE)); 467*b077aed3SPierre Pronchery 468*b077aed3SPierre Pronchery in_str = (uint8_t *)in; 469*b077aed3SPierre Pronchery 470*b077aed3SPierre Pronchery for (; in_bitsize >= (2 * DIGIT_SIZE); in_bitsize -= (2 * DIGIT_SIZE), out += 2) { 471*b077aed3SPierre Pronchery uint64_t digit; 472*b077aed3SPierre Pronchery 473*b077aed3SPierre Pronchery memcpy(&digit, in_str, sizeof(digit)); 474*b077aed3SPierre Pronchery out[0] = digit & DIGIT_MASK; 475*b077aed3SPierre Pronchery in_str += 6; 476*b077aed3SPierre Pronchery memcpy(&digit, in_str, sizeof(digit)); 477*b077aed3SPierre Pronchery out[1] = (digit >> 4) & DIGIT_MASK; 478*b077aed3SPierre Pronchery in_str += 7; 479*b077aed3SPierre Pronchery out_len -= 2; 480*b077aed3SPierre Pronchery } 481*b077aed3SPierre Pronchery 482*b077aed3SPierre Pronchery if (in_bitsize > DIGIT_SIZE) { 483*b077aed3SPierre Pronchery uint64_t digit = get_digit52(in_str, 7); 484*b077aed3SPierre Pronchery 485*b077aed3SPierre Pronchery out[0] = digit & DIGIT_MASK; 486*b077aed3SPierre Pronchery in_str += 6; 487*b077aed3SPierre Pronchery in_bitsize -= DIGIT_SIZE; 488*b077aed3SPierre Pronchery digit = get_digit52(in_str, BITS2WORD8_SIZE(in_bitsize)); 489*b077aed3SPierre Pronchery out[1] = digit >> 4; 490*b077aed3SPierre Pronchery out += 2; 491*b077aed3SPierre Pronchery out_len -= 2; 492*b077aed3SPierre Pronchery } else if (in_bitsize > 0) { 493*b077aed3SPierre Pronchery out[0] = get_digit52(in_str, BITS2WORD8_SIZE(in_bitsize)); 494*b077aed3SPierre Pronchery out++; 495*b077aed3SPierre Pronchery out_len--; 496*b077aed3SPierre Pronchery } 497*b077aed3SPierre Pronchery 498*b077aed3SPierre Pronchery while (out_len > 0) { 499*b077aed3SPierre Pronchery *out = 0; 500*b077aed3SPierre Pronchery out_len--; 501*b077aed3SPierre Pronchery out++; 502*b077aed3SPierre Pronchery } 503*b077aed3SPierre Pronchery } 504*b077aed3SPierre Pronchery 505*b077aed3SPierre Pronchery static ossl_inline void put_digit52(uint8_t *pStr, int strLen, uint64_t digit) 506*b077aed3SPierre Pronchery { 507*b077aed3SPierre Pronchery assert(pStr != NULL); 508*b077aed3SPierre Pronchery 509*b077aed3SPierre Pronchery for (; strLen > 0; strLen--) { 510*b077aed3SPierre Pronchery *pStr++ = (uint8_t)(digit & 0xFF); 511*b077aed3SPierre Pronchery digit >>= 8; 512*b077aed3SPierre Pronchery } 513*b077aed3SPierre Pronchery } 514*b077aed3SPierre Pronchery 515*b077aed3SPierre Pronchery /* 516*b077aed3SPierre Pronchery * Convert array of words in redundant (base=2^52) representation to array of 517*b077aed3SPierre Pronchery * words in regular (base=2^64) one. 518*b077aed3SPierre Pronchery */ 519*b077aed3SPierre Pronchery static void from_words52(BN_ULONG *out, int out_bitsize, const BN_ULONG *in) 520*b077aed3SPierre Pronchery { 521*b077aed3SPierre Pronchery int i; 522*b077aed3SPierre Pronchery int out_len = BITS2WORD64_SIZE(out_bitsize); 523*b077aed3SPierre Pronchery 524*b077aed3SPierre Pronchery assert(out != NULL); 525*b077aed3SPierre Pronchery assert(in != NULL); 526*b077aed3SPierre Pronchery 527*b077aed3SPierre Pronchery for (i = 0; i < out_len; i++) 528*b077aed3SPierre Pronchery out[i] = 0; 529*b077aed3SPierre Pronchery 530*b077aed3SPierre Pronchery { 531*b077aed3SPierre Pronchery uint8_t *out_str = (uint8_t *)out; 532*b077aed3SPierre Pronchery 533*b077aed3SPierre Pronchery for (; out_bitsize >= (2 * DIGIT_SIZE); 534*b077aed3SPierre Pronchery out_bitsize -= (2 * DIGIT_SIZE), in += 2) { 535*b077aed3SPierre Pronchery uint64_t digit; 536*b077aed3SPierre Pronchery 537*b077aed3SPierre Pronchery digit = in[0]; 538*b077aed3SPierre Pronchery memcpy(out_str, &digit, sizeof(digit)); 539*b077aed3SPierre Pronchery out_str += 6; 540*b077aed3SPierre Pronchery digit = digit >> 48 | in[1] << 4; 541*b077aed3SPierre Pronchery memcpy(out_str, &digit, sizeof(digit)); 542*b077aed3SPierre Pronchery out_str += 7; 543*b077aed3SPierre Pronchery } 544*b077aed3SPierre Pronchery 545*b077aed3SPierre Pronchery if (out_bitsize > DIGIT_SIZE) { 546*b077aed3SPierre Pronchery put_digit52(out_str, 7, in[0]); 547*b077aed3SPierre Pronchery out_str += 6; 548*b077aed3SPierre Pronchery out_bitsize -= DIGIT_SIZE; 549*b077aed3SPierre Pronchery put_digit52(out_str, BITS2WORD8_SIZE(out_bitsize), 550*b077aed3SPierre Pronchery (in[1] << 4 | in[0] >> 48)); 551*b077aed3SPierre Pronchery } else if (out_bitsize) { 552*b077aed3SPierre Pronchery put_digit52(out_str, BITS2WORD8_SIZE(out_bitsize), in[0]); 553*b077aed3SPierre Pronchery } 554*b077aed3SPierre Pronchery } 555*b077aed3SPierre Pronchery } 556*b077aed3SPierre Pronchery 557*b077aed3SPierre Pronchery /* 558*b077aed3SPierre Pronchery * Set bit at index |idx| in the words array |a|. 559*b077aed3SPierre Pronchery * It does not do any boundaries checks, make sure the index is valid before 560*b077aed3SPierre Pronchery * calling the function. 561*b077aed3SPierre Pronchery */ 562*b077aed3SPierre Pronchery static ossl_inline void set_bit(BN_ULONG *a, int idx) 563*b077aed3SPierre Pronchery { 564*b077aed3SPierre Pronchery assert(a != NULL); 565*b077aed3SPierre Pronchery 566*b077aed3SPierre Pronchery { 567*b077aed3SPierre Pronchery int i, j; 568*b077aed3SPierre Pronchery 569*b077aed3SPierre Pronchery i = idx / BN_BITS2; 570*b077aed3SPierre Pronchery j = idx % BN_BITS2; 571*b077aed3SPierre Pronchery a[i] |= (((BN_ULONG)1) << j); 572*b077aed3SPierre Pronchery } 573*b077aed3SPierre Pronchery } 574*b077aed3SPierre Pronchery 575*b077aed3SPierre Pronchery #endif 576