1 /*
2  * Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
3  * Copyright (c) 2012, Intel Corporation. All Rights Reserved.
4  *
5  * Licensed under the OpenSSL license (the "License").  You may not use
6  * this file except in compliance with the License.  You can obtain a copy
7  * in the file LICENSE in the source distribution or at
8  * https://www.openssl.org/source/license.html
9  *
10  * Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1)
11  * (1) Intel Corporation, Israel Development Center, Haifa, Israel
12  * (2) University of Haifa, Israel
13  */
14 
15 #include "rsaz_exp.h"
16 
17 #if defined(RSAZ_ENABLED)
18 
19 #include <openssl/mem.h>
20 
21 #include "internal.h"
22 #include "../../internal.h"
23 
24 
25 // one is 1 in RSAZ's representation.
26 alignas(64) static const BN_ULONG one[40] = {
27     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
28     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
29 // two80 is 2^80 in RSAZ's representation. Note RSAZ uses base 2^29, so this is
30 // 2^(29*2 + 22) = 2^80, not 2^(64*2 + 22).
31 alignas(64) static const BN_ULONG two80[40] = {
32     0, 0, 1 << 22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
33     0, 0, 0,       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
34 
// RSAZ_1024_mod_exp_avx2 performs a 1024-bit, fixed-window (5-bit) modular
// exponentiation using the rsaz_1024_*_avx2 primitives and writes the result
// to |result_norm|.
//
// |base_norm|, |exponent| and |m_norm| are the base, exponent and modulus as
// 16-word arrays. |RR| is the Montgomery R^2 value for the usual R = 2^1024;
// it is converted below to RSAZ's radix. |k0| is passed unchanged to every
// multiplication and squaring.
//
// |storage| is scratch space and must be 64-byte aligned. Its first 120 words
// hold three 40-word temporaries (|m|, |result|, |a_inv|) and the words after
// that hold the 32-entry table of powers. All of |storage| is cleansed before
// returning.
void RSAZ_1024_mod_exp_avx2(BN_ULONG result_norm[16],
                            const BN_ULONG base_norm[16],
                            const BN_ULONG exponent[16],
                            const BN_ULONG m_norm[16], const BN_ULONG RR[16],
                            BN_ULONG k0,
                            BN_ULONG storage[MOD_EXP_CTIME_STORAGE_LEN]) {
  OPENSSL_STATIC_ASSERT(MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH % 64 == 0,
                        "MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH is too small");
  assert((uintptr_t)storage % 64 == 0);

  BN_ULONG *a_inv, *m, *result, *table_s = storage + 40 * 3, *R2 = table_s;
  // Note |R2| aliases |table_s|. This is safe because |R2| is last read before
  // the first scatter into |table_s|.
  //
  // If the first 320 bytes of |storage| reach the end of a 4096-byte page, use
  // the third slot for |m| rather than the first.
  if (((((uintptr_t)storage & 4095) + 320) >> 12) != 0) {
    result = storage;
    a_inv = storage + 40;
    m = storage + 40 * 2;  // should not cross page
  } else {
    m = storage;  // should not cross page
    result = storage + 40;
    a_inv = storage + 40 * 2;
  }

  // Convert the inputs to RSAZ's redundant representation.
  rsaz_1024_norm2red_avx2(m, m_norm);
  rsaz_1024_norm2red_avx2(a_inv, base_norm);
  rsaz_1024_norm2red_avx2(R2, RR);

  // Convert |R2| from the usual radix, giving R = 2^1024, to RSAZ's radix,
  // giving R = 2^(36*29) = 2^1044.
  rsaz_1024_mul_avx2(R2, R2, R2, m, k0);
  // R2 = 2^2048 * 2^2048 / 2^1044 = 2^3052
  rsaz_1024_mul_avx2(R2, R2, two80, m, k0);
  // R2 = 2^3052 * 2^80 / 2^1044 = 2^2088 = (2^1044)^2

  // table[0] = 1
  rsaz_1024_mul_avx2(result, R2, one, m, k0);
  // table[1] = a_inv^1
  rsaz_1024_mul_avx2(a_inv, a_inv, R2, m, k0);

  rsaz_1024_scatter5_avx2(table_s, result, 0);
  rsaz_1024_scatter5_avx2(table_s, a_inv, 1);

  // table[2] = a_inv^2
  rsaz_1024_sqr_avx2(result, a_inv, m, k0, 1);
  rsaz_1024_scatter5_avx2(table_s, result, 2);
#if 0
  // This is almost 2x smaller and less than 1% slower.
  for (int index = 3; index < 32; index++) {
    rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
    rsaz_1024_scatter5_avx2(table_s, result, index);
  }
#else
  // Fill the remaining entries with chains of squarings and multiplications
  // by |a_inv|. Each chain after the first restarts from an entry that was
  // already stored, fetched back with a gather.

  // table[4] = a_inv^4
  rsaz_1024_sqr_avx2(result, result, m, k0, 1);
  rsaz_1024_scatter5_avx2(table_s, result, 4);
  // table[8] = a_inv^8
  rsaz_1024_sqr_avx2(result, result, m, k0, 1);
  rsaz_1024_scatter5_avx2(table_s, result, 8);
  // table[16] = a_inv^16
  rsaz_1024_sqr_avx2(result, result, m, k0, 1);
  rsaz_1024_scatter5_avx2(table_s, result, 16);
  // table[17] = a_inv^17
  rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
  rsaz_1024_scatter5_avx2(table_s, result, 17);

  // table[3]
  rsaz_1024_gather5_avx2(result, table_s, 2);
  rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
  rsaz_1024_scatter5_avx2(table_s, result, 3);
  // table[6]
  rsaz_1024_sqr_avx2(result, result, m, k0, 1);
  rsaz_1024_scatter5_avx2(table_s, result, 6);
  // table[12]
  rsaz_1024_sqr_avx2(result, result, m, k0, 1);
  rsaz_1024_scatter5_avx2(table_s, result, 12);
  // table[24]
  rsaz_1024_sqr_avx2(result, result, m, k0, 1);
  rsaz_1024_scatter5_avx2(table_s, result, 24);
  // table[25]
  rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
  rsaz_1024_scatter5_avx2(table_s, result, 25);

  // table[5]
  rsaz_1024_gather5_avx2(result, table_s, 4);
  rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
  rsaz_1024_scatter5_avx2(table_s, result, 5);
  // table[10]
  rsaz_1024_sqr_avx2(result, result, m, k0, 1);
  rsaz_1024_scatter5_avx2(table_s, result, 10);
  // table[20]
  rsaz_1024_sqr_avx2(result, result, m, k0, 1);
  rsaz_1024_scatter5_avx2(table_s, result, 20);
  // table[21]
  rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
  rsaz_1024_scatter5_avx2(table_s, result, 21);

  // table[7]
  rsaz_1024_gather5_avx2(result, table_s, 6);
  rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
  rsaz_1024_scatter5_avx2(table_s, result, 7);
  // table[14]
  rsaz_1024_sqr_avx2(result, result, m, k0, 1);
  rsaz_1024_scatter5_avx2(table_s, result, 14);
  // table[28]
  rsaz_1024_sqr_avx2(result, result, m, k0, 1);
  rsaz_1024_scatter5_avx2(table_s, result, 28);
  // table[29]
  rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
  rsaz_1024_scatter5_avx2(table_s, result, 29);

  // table[9]
  rsaz_1024_gather5_avx2(result, table_s, 8);
  rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
  rsaz_1024_scatter5_avx2(table_s, result, 9);
  // table[18]
  rsaz_1024_sqr_avx2(result, result, m, k0, 1);
  rsaz_1024_scatter5_avx2(table_s, result, 18);
  // table[19]
  rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
  rsaz_1024_scatter5_avx2(table_s, result, 19);

  // table[11]
  rsaz_1024_gather5_avx2(result, table_s, 10);
  rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
  rsaz_1024_scatter5_avx2(table_s, result, 11);
  // table[22]
  rsaz_1024_sqr_avx2(result, result, m, k0, 1);
  rsaz_1024_scatter5_avx2(table_s, result, 22);
  // table[23]
  rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
  rsaz_1024_scatter5_avx2(table_s, result, 23);

  // table[13]
  rsaz_1024_gather5_avx2(result, table_s, 12);
  rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
  rsaz_1024_scatter5_avx2(table_s, result, 13);
  // table[26]
  rsaz_1024_sqr_avx2(result, result, m, k0, 1);
  rsaz_1024_scatter5_avx2(table_s, result, 26);
  // table[27]
  rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
  rsaz_1024_scatter5_avx2(table_s, result, 27);

  // table[15]
  rsaz_1024_gather5_avx2(result, table_s, 14);
  rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
  rsaz_1024_scatter5_avx2(table_s, result, 15);
  // table[30]
  rsaz_1024_sqr_avx2(result, result, m, k0, 1);
  rsaz_1024_scatter5_avx2(table_s, result, 30);
  // table[31]
  rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
  rsaz_1024_scatter5_avx2(table_s, result, 31);
#endif

  // The exponent is consumed as 128 bytes, most significant bits first, with
  // |p_str[127]| holding the top bits.
  const uint8_t *p_str = (const uint8_t *)exponent;

  // Load the first window: the top five bits (bits 1019..1023).
  int wvalue = p_str[127] >> 3;
  rsaz_1024_gather5_avx2(result, table_s, wvalue);

  // Loop over the next 203 five-bit windows. |index| is the lowest bit of the
  // current window and takes the values 1014, 1009, ..., 4.
  int index = 1014;
  while (index > -1) {
    rsaz_1024_sqr_avx2(result, result, m, k0, 5);

    // A window may straddle a byte boundary, so load two bytes and shift.
    // NOTE(review): the two-byte load is interpreted in host byte order, so
    // this presumably relies on a little-endian target - confirm.
    uint16_t wvalue_16;
    memcpy(&wvalue_16, &p_str[index / 8], sizeof(wvalue_16));
    wvalue = wvalue_16;
    wvalue = (wvalue >> (index % 8)) & 31;
    index -= 5;

    rsaz_1024_gather5_avx2(a_inv, table_s, wvalue);  // Borrow |a_inv|.
    rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
  }

  // The final window is the four low bits (bits 0..3), so square four times.
  rsaz_1024_sqr_avx2(result, result, m, k0, 4);

  wvalue = p_str[0] & 15;

  rsaz_1024_gather5_avx2(a_inv, table_s, wvalue);  // Borrow |a_inv|.
  rsaz_1024_mul_avx2(result, result, a_inv, m, k0);

  // Convert from Montgomery.
  rsaz_1024_mul_avx2(result, result, one, m, k0);

  rsaz_1024_red2norm_avx2(result_norm, result);

  // Wipe the temporaries and the table of powers.
  OPENSSL_cleanse(storage, MOD_EXP_CTIME_STORAGE_LEN * sizeof(BN_ULONG));
}
225 
226 #endif  // RSAZ_ENABLED
227