1 #include "inner.h"
2 
3 /*
4  * Support functions for signatures (hash-to-point, norm).
5  *
6  * ==========================(LICENSE BEGIN)============================
7  *
8  * Copyright (c) 2017-2019  Falcon Project
9  *
10  * Permission is hereby granted, free of charge, to any person obtaining
11  * a copy of this software and associated documentation files (the
12  * "Software"), to deal in the Software without restriction, including
13  * without limitation the rights to use, copy, modify, merge, publish,
14  * distribute, sublicense, and/or sell copies of the Software, and to
15  * permit persons to whom the Software is furnished to do so, subject to
16  * the following conditions:
17  *
18  * The above copyright notice and this permission notice shall be
19  * included in all copies or substantial portions of the Software.
20  *
21  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
22  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
23  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
24  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
25  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
26  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
27  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
28  *
29  * ===========================(LICENSE END)=============================
30  *
31  * @author   Thomas Pornin <thomas.pornin@nccgroup.com>
32  */
33 
34 
35 /* see inner.h */
36 void
PQCLEAN_FALCON512_AVX2_hash_to_point_vartime(inner_shake256_context * sc,uint16_t * x,unsigned logn)37 PQCLEAN_FALCON512_AVX2_hash_to_point_vartime(
38     inner_shake256_context *sc,
39     uint16_t *x, unsigned logn) {
40     /*
41      * This is the straightforward per-the-spec implementation. It
42      * is not constant-time, thus it might reveal information on the
43      * plaintext (at least, enough to check the plaintext against a
44      * list of potential plaintexts) in a scenario where the
45      * attacker does not have access to the signature value or to
46      * the public key, but knows the nonce (without knowledge of the
47      * nonce, the hashed output cannot be matched against potential
48      * plaintexts).
49      */
50     size_t n;
51 
52     n = (size_t)1 << logn;
53     while (n > 0) {
54         uint8_t buf[2];
55         uint32_t w;
56 
57         inner_shake256_extract(sc, (void *)buf, sizeof buf);
58         w = ((unsigned)buf[0] << 8) | (unsigned)buf[1];
59         if (w < 61445) {
60             while (w >= 12289) {
61                 w -= 12289;
62             }
63             *x ++ = (uint16_t)w;
64             n --;
65         }
66     }
67 }
68 
69 /* see inner.h */
70 void
PQCLEAN_FALCON512_AVX2_hash_to_point_ct(inner_shake256_context * sc,uint16_t * x,unsigned logn,uint8_t * tmp)71 PQCLEAN_FALCON512_AVX2_hash_to_point_ct(
72     inner_shake256_context *sc,
73     uint16_t *x, unsigned logn, uint8_t *tmp) {
74     /*
75      * Each 16-bit sample is a value in 0..65535. The value is
76      * kept if it falls in 0..61444 (because 61445 = 5*12289)
77      * and rejected otherwise; thus, each sample has probability
78      * about 0.93758 of being selected.
79      *
80      * We want to oversample enough to be sure that we will
81      * have enough values with probability at least 1 - 2^(-256).
82      * Depending on degree N, this leads to the following
83      * required oversampling:
84      *
85      *   logn     n  oversampling
86      *     1      2     65
87      *     2      4     67
88      *     3      8     71
89      *     4     16     77
90      *     5     32     86
91      *     6     64    100
92      *     7    128    122
93      *     8    256    154
94      *     9    512    205
95      *    10   1024    287
96      *
97      * If logn >= 7, then the provided temporary buffer is large
98      * enough. Otherwise, we use a stack buffer of 63 entries
99      * (i.e. 126 bytes) for the values that do not fit in tmp[].
100      */
101 
102     static const uint16_t overtab[] = {
103         0, /* unused */
104         65,
105         67,
106         71,
107         77,
108         86,
109         100,
110         122,
111         154,
112         205,
113         287
114     };
115 
116     unsigned n, n2, u, m, p, over;
117     uint16_t *tt1, tt2[63];
118 
119     /*
120      * We first generate m 16-bit value. Values 0..n-1 go to x[].
121      * Values n..2*n-1 go to tt1[]. Values 2*n and later go to tt2[].
122      * We also reduce modulo q the values; rejected values are set
123      * to 0xFFFF.
124      */
125     n = 1U << logn;
126     n2 = n << 1;
127     over = overtab[logn];
128     m = n + over;
129     tt1 = (uint16_t *)tmp;
130     for (u = 0; u < m; u ++) {
131         uint8_t buf[2];
132         uint32_t w, wr;
133 
134         inner_shake256_extract(sc, buf, sizeof buf);
135         w = ((uint32_t)buf[0] << 8) | (uint32_t)buf[1];
136         wr = w - ((uint32_t)24578 & (((w - 24578) >> 31) - 1));
137         wr = wr - ((uint32_t)24578 & (((wr - 24578) >> 31) - 1));
138         wr = wr - ((uint32_t)12289 & (((wr - 12289) >> 31) - 1));
139         wr |= ((w - 61445) >> 31) - 1;
140         if (u < n) {
141             x[u] = (uint16_t)wr;
142         } else if (u < n2) {
143             tt1[u - n] = (uint16_t)wr;
144         } else {
145             tt2[u - n2] = (uint16_t)wr;
146         }
147     }
148 
149     /*
150      * Now we must "squeeze out" the invalid values. We do this in
151      * a logarithmic sequence of passes; each pass computes where a
152      * value should go, and moves it down by 'p' slots if necessary,
153      * where 'p' uses an increasing powers-of-two scale. It can be
154      * shown that in all cases where the loop decides that a value
155      * has to be moved down by p slots, the destination slot is
156      * "free" (i.e. contains an invalid value).
157      */
158     for (p = 1; p <= over; p <<= 1) {
159         unsigned v;
160 
161         /*
162          * In the loop below:
163          *
164          *   - v contains the index of the final destination of
165          *     the value; it is recomputed dynamically based on
166          *     whether values are valid or not.
167          *
168          *   - u is the index of the value we consider ("source");
169          *     its address is s.
170          *
171          *   - The loop may swap the value with the one at index
172          *     u-p. The address of the swap destination is d.
173          */
174         v = 0;
175         for (u = 0; u < m; u ++) {
176             uint16_t *s, *d;
177             unsigned j, sv, dv, mk;
178 
179             if (u < n) {
180                 s = &x[u];
181             } else if (u < n2) {
182                 s = &tt1[u - n];
183             } else {
184                 s = &tt2[u - n2];
185             }
186             sv = *s;
187 
188             /*
189              * The value in sv should ultimately go to
190              * address v, i.e. jump back by u-v slots.
191              */
192             j = u - v;
193 
194             /*
195              * We increment v for the next iteration, but
196              * only if the source value is valid. The mask
197              * 'mk' is -1 if the value is valid, 0 otherwise,
198              * so we _subtract_ mk.
199              */
200             mk = (sv >> 15) - 1U;
201             v -= mk;
202 
203             /*
204              * In this loop we consider jumps by p slots; if
205              * u < p then there is nothing more to do.
206              */
207             if (u < p) {
208                 continue;
209             }
210 
211             /*
212              * Destination for the swap: value at address u-p.
213              */
214             if ((u - p) < n) {
215                 d = &x[u - p];
216             } else if ((u - p) < n2) {
217                 d = &tt1[(u - p) - n];
218             } else {
219                 d = &tt2[(u - p) - n2];
220             }
221             dv = *d;
222 
223             /*
224              * The swap should be performed only if the source
225              * is valid AND the jump j has its 'p' bit set.
226              */
227             mk &= -(((j & p) + 0x1FF) >> 9);
228 
229             *s = (uint16_t)(sv ^ (mk & (sv ^ dv)));
230             *d = (uint16_t)(dv ^ (mk & (sv ^ dv)));
231         }
232     }
233 }
234 
235 /* see inner.h */
236 int
PQCLEAN_FALCON512_AVX2_is_short(const int16_t * s1,const int16_t * s2,unsigned logn)237 PQCLEAN_FALCON512_AVX2_is_short(
238     const int16_t *s1, const int16_t *s2, unsigned logn) {
239     /*
240      * We use the l2-norm. Code below uses only 32-bit operations to
241      * compute the square of the norm with saturation to 2^32-1 if
242      * the value exceeds 2^31-1.
243      */
244     size_t n, u;
245     uint32_t s, ng;
246 
247     n = (size_t)1 << logn;
248     s = 0;
249     ng = 0;
250     for (u = 0; u < n; u ++) {
251         int32_t z;
252 
253         z = s1[u];
254         s += (uint32_t)(z * z);
255         ng |= s;
256         z = s2[u];
257         s += (uint32_t)(z * z);
258         ng |= s;
259     }
260     s |= -(ng >> 31);
261 
262     /*
263      * Acceptance bound on the l2-norm is:
264      *   1.2*1.55*sqrt(q)*sqrt(2*N)
265      * Value 7085 is floor((1.2^2)*(1.55^2)*2*1024).
266      */
267     return s < (((uint32_t)7085 * (uint32_t)12289) >> (10 - logn));
268 }
269 
270 /* see inner.h */
271 int
PQCLEAN_FALCON512_AVX2_is_short_half(uint32_t sqn,const int16_t * s2,unsigned logn)272 PQCLEAN_FALCON512_AVX2_is_short_half(
273     uint32_t sqn, const int16_t *s2, unsigned logn) {
274     size_t n, u;
275     uint32_t ng;
276 
277     n = (size_t)1 << logn;
278     ng = -(sqn >> 31);
279     for (u = 0; u < n; u ++) {
280         int32_t z;
281 
282         z = s2[u];
283         sqn += (uint32_t)(z * z);
284         ng |= sqn;
285     }
286     sqn |= -(ng >> 31);
287 
288     /*
289      * Acceptance bound on the l2-norm is:
290      *   1.2*1.55*sqrt(q)*sqrt(2*N)
291      * Value 7085 is floor((1.2^2)*(1.55^2)*2*1024).
292      */
293     return sqn < (((uint32_t)7085 * (uint32_t)12289) >> (10 - logn));
294 }
295