1 #include "inner.h"
2 #include <assert.h>
3 /*
4  * PRNG and interface to the system RNG.
5  *
6  * ==========================(LICENSE BEGIN)============================
7  *
8  * Copyright (c) 2017-2019  Falcon Project
9  *
10  * Permission is hereby granted, free of charge, to any person obtaining
11  * a copy of this software and associated documentation files (the
12  * "Software"), to deal in the Software without restriction, including
13  * without limitation the rights to use, copy, modify, merge, publish,
14  * distribute, sublicense, and/or sell copies of the Software, and to
15  * permit persons to whom the Software is furnished to do so, subject to
16  * the following conditions:
17  *
18  * The above copyright notice and this permission notice shall be
19  * included in all copies or substantial portions of the Software.
20  *
21  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
22  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
23  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
24  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
25  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
26  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
27  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
28  *
29  * ===========================(LICENSE END)=============================
30  *
31  * @author   Thomas Pornin <thomas.pornin@nccgroup.com>
32  */
33 
34 
35 
36 /*
37  * Include relevant system header files. For Win32, this will also need
38  * linking with advapi32.dll, which we trigger with an appropriate #pragma.
39  */
40 
/*
 * Seed-request stub; see inner.h for the declared contract.
 *
 * No random source is queried here and the `seed` buffer is never
 * written. The return value is 1 when `len` is zero (nothing was
 * requested) and 0 for any non-zero `len`.
 */
int
PQCLEAN_FALCON1024_AVX2_get_seed(void *seed, size_t len) {
    /* The destination is intentionally unused. */
    (void)seed;

    /* A comparison yields exactly 1 or 0 as an int. */
    return len == 0;
}
50 
51 /* see inner.h */
52 void
PQCLEAN_FALCON1024_AVX2_prng_init(prng * p,inner_shake256_context * src)53 PQCLEAN_FALCON1024_AVX2_prng_init(prng *p, inner_shake256_context *src) {
54     inner_shake256_extract(src, p->state.d, 56);
55     PQCLEAN_FALCON1024_AVX2_prng_refill(p);
56 }
57 
58 /*
59  * PRNG based on ChaCha20.
60  *
61  * State consists in key (32 bytes) then IV (16 bytes) and block counter
62  * (8 bytes). Normally, we should not care about local endianness (this
63  * is for a PRNG), but for the NIST competition we need reproducible KAT
64  * vectors that work across architectures, so we enforce little-endian
65  * interpretation where applicable. Moreover, output words are "spread
66  * out" over the output buffer with the interleaving pattern that is
67  * naturally obtained from the AVX2 implementation that runs eight
68  * ChaCha20 instances in parallel.
69  *
70  * The block counter is XORed into the first 8 bytes of the IV.
71  */
72 void
PQCLEAN_FALCON1024_AVX2_prng_refill(prng * p)73 PQCLEAN_FALCON1024_AVX2_prng_refill(prng *p) {
74 
75     static const uint32_t CW[] = {
76         0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
77     };
78 
79     uint64_t cc;
80     size_t u;
81     int i;
82     uint32_t *sw;
83     union {
84         uint32_t w[16];
85         __m256i y[2];  /* for alignment */
86     } t;
87     __m256i state[16], init[16];
88 
89     sw = (uint32_t *)p->state.d;
90 
91     /*
92      * XOR next counter values into state.
93      */
94     cc = *(uint64_t *)(p->state.d + 48);
95     for (u = 0; u < 8; u ++) {
96         t.w[u] = (uint32_t)(cc + u);
97         t.w[u + 8] = (uint32_t)((cc + u) >> 32);
98     }
99     *(uint64_t *)(p->state.d + 48) = cc + 8;
100 
101     /*
102      * Load state.
103      */
104     for (u = 0; u < 4; u ++) {
105         state[u] = init[u] =
106                        _mm256_broadcastd_epi32(_mm_cvtsi32_si128((int32_t)CW[u]));
107     }
108     for (u = 0; u < 10; u ++) {
109         state[u + 4] = init[u + 4] =
110                            _mm256_broadcastd_epi32(_mm_cvtsi32_si128((int32_t)sw[u]));
111     }
112     state[14] = init[14] = _mm256_xor_si256(
113                                _mm256_broadcastd_epi32(_mm_cvtsi32_si128((int32_t)sw[10])),
114                                _mm256_loadu_si256((__m256i *)&t.w[0]));
115     state[15] = init[15] = _mm256_xor_si256(
116                                _mm256_broadcastd_epi32(_mm_cvtsi32_si128((int32_t)sw[11])),
117                                _mm256_loadu_si256((__m256i *)&t.w[8]));
118 
119     /*
120      * Do all rounds.
121      */
122     for (i = 0; i < 10; i ++) {
123 
124 #define QROUND(a, b, c, d)   do { \
125         state[a] = _mm256_add_epi32(state[a], state[b]); \
126         state[d] = _mm256_xor_si256(state[d], state[a]); \
127         state[d] = _mm256_or_si256( \
128                                     _mm256_slli_epi32(state[d], 16), \
129                                     _mm256_srli_epi32(state[d], 16)); \
130         state[c] = _mm256_add_epi32(state[c], state[d]); \
131         state[b] = _mm256_xor_si256(state[b], state[c]); \
132         state[b] = _mm256_or_si256( \
133                                     _mm256_slli_epi32(state[b], 12), \
134                                     _mm256_srli_epi32(state[b], 20)); \
135         state[a] = _mm256_add_epi32(state[a], state[b]); \
136         state[d] = _mm256_xor_si256(state[d], state[a]); \
137         state[d] = _mm256_or_si256( \
138                                     _mm256_slli_epi32(state[d],  8), \
139                                     _mm256_srli_epi32(state[d], 24)); \
140         state[c] = _mm256_add_epi32(state[c], state[d]); \
141         state[b] = _mm256_xor_si256(state[b], state[c]); \
142         state[b] = _mm256_or_si256( \
143                                     _mm256_slli_epi32(state[b], 7), \
144                                     _mm256_srli_epi32(state[b], 25)); \
145     } while (0)
146 
147         QROUND( 0,  4,  8, 12);
148         QROUND( 1,  5,  9, 13);
149         QROUND( 2,  6, 10, 14);
150         QROUND( 3,  7, 11, 15);
151         QROUND( 0,  5, 10, 15);
152         QROUND( 1,  6, 11, 12);
153         QROUND( 2,  7,  8, 13);
154         QROUND( 3,  4,  9, 14);
155 
156 #undef QROUND
157 
158     }
159 
160     /*
161      * Add initial state back and encode the result in the destination
162      * buffer. We can dump the AVX2 values "as is" because the non-AVX2
163      * code uses a compatible order of values.
164      */
165     for (u = 0; u < 16; u ++) {
166         _mm256_storeu_si256((__m256i *)&p->buf.d[u << 5],
167                             _mm256_add_epi32(state[u], init[u]));
168     }
169 
170 
171     p->ptr = 0;
172 }
173 
174 /* see inner.h */
175 void
PQCLEAN_FALCON1024_AVX2_prng_get_bytes(prng * p,void * dst,size_t len)176 PQCLEAN_FALCON1024_AVX2_prng_get_bytes(prng *p, void *dst, size_t len) {
177     uint8_t *buf;
178 
179     buf = dst;
180     while (len > 0) {
181         size_t clen;
182 
183         clen = (sizeof p->buf.d) - p->ptr;
184         if (clen > len) {
185             clen = len;
186         }
187         memcpy(buf, p->buf.d, clen);
188         buf += clen;
189         len -= clen;
190         p->ptr += clen;
191         if (p->ptr == sizeof p->buf.d) {
192             PQCLEAN_FALCON1024_AVX2_prng_refill(p);
193         }
194     }
195 }
196