1 /*
2 * (C) 2018 Jack Lloyd
3 *
4 * Botan is released under the Simplified BSD License (see license.txt)
5 */
6 
7 #include <botan/chacha.h>
8 #include <botan/internal/simd_avx2.h>
9 
10 namespace Botan {
11 
12 //static
13 BOTAN_FUNC_ISA("avx2")
chacha_avx2_x8(uint8_t output[64* 8],uint32_t state[16],size_t rounds)14 void ChaCha::chacha_avx2_x8(uint8_t output[64*8], uint32_t state[16], size_t rounds)
15    {
16    SIMD_8x32::reset_registers();
17 
18    BOTAN_ASSERT(rounds % 2 == 0, "Valid rounds");
19    const SIMD_8x32 CTR0 = SIMD_8x32(0, 1, 2, 3, 4, 5, 6, 7);
20 
21    const uint32_t C = 0xFFFFFFFF - state[12];
22    const SIMD_8x32 CTR1 = SIMD_8x32(0, C < 1, C < 2, C < 3, C < 4, C < 5, C < 6, C < 7);
23 
24    SIMD_8x32 R00 = SIMD_8x32::splat(state[ 0]);
25    SIMD_8x32 R01 = SIMD_8x32::splat(state[ 1]);
26    SIMD_8x32 R02 = SIMD_8x32::splat(state[ 2]);
27    SIMD_8x32 R03 = SIMD_8x32::splat(state[ 3]);
28    SIMD_8x32 R04 = SIMD_8x32::splat(state[ 4]);
29    SIMD_8x32 R05 = SIMD_8x32::splat(state[ 5]);
30    SIMD_8x32 R06 = SIMD_8x32::splat(state[ 6]);
31    SIMD_8x32 R07 = SIMD_8x32::splat(state[ 7]);
32    SIMD_8x32 R08 = SIMD_8x32::splat(state[ 8]);
33    SIMD_8x32 R09 = SIMD_8x32::splat(state[ 9]);
34    SIMD_8x32 R10 = SIMD_8x32::splat(state[10]);
35    SIMD_8x32 R11 = SIMD_8x32::splat(state[11]);
36    SIMD_8x32 R12 = SIMD_8x32::splat(state[12]) + CTR0;
37    SIMD_8x32 R13 = SIMD_8x32::splat(state[13]) + CTR1;
38    SIMD_8x32 R14 = SIMD_8x32::splat(state[14]);
39    SIMD_8x32 R15 = SIMD_8x32::splat(state[15]);
40 
41    for(size_t r = 0; r != rounds / 2; ++r)
42       {
43       R00 += R04;
44       R01 += R05;
45       R02 += R06;
46       R03 += R07;
47 
48       R12 ^= R00;
49       R13 ^= R01;
50       R14 ^= R02;
51       R15 ^= R03;
52 
53       R12 = R12.rotl<16>();
54       R13 = R13.rotl<16>();
55       R14 = R14.rotl<16>();
56       R15 = R15.rotl<16>();
57 
58       R08 += R12;
59       R09 += R13;
60       R10 += R14;
61       R11 += R15;
62 
63       R04 ^= R08;
64       R05 ^= R09;
65       R06 ^= R10;
66       R07 ^= R11;
67 
68       R04 = R04.rotl<12>();
69       R05 = R05.rotl<12>();
70       R06 = R06.rotl<12>();
71       R07 = R07.rotl<12>();
72 
73       R00 += R04;
74       R01 += R05;
75       R02 += R06;
76       R03 += R07;
77 
78       R12 ^= R00;
79       R13 ^= R01;
80       R14 ^= R02;
81       R15 ^= R03;
82 
83       R12 = R12.rotl<8>();
84       R13 = R13.rotl<8>();
85       R14 = R14.rotl<8>();
86       R15 = R15.rotl<8>();
87 
88       R08 += R12;
89       R09 += R13;
90       R10 += R14;
91       R11 += R15;
92 
93       R04 ^= R08;
94       R05 ^= R09;
95       R06 ^= R10;
96       R07 ^= R11;
97 
98       R04 = R04.rotl<7>();
99       R05 = R05.rotl<7>();
100       R06 = R06.rotl<7>();
101       R07 = R07.rotl<7>();
102 
103       R00 += R05;
104       R01 += R06;
105       R02 += R07;
106       R03 += R04;
107 
108       R15 ^= R00;
109       R12 ^= R01;
110       R13 ^= R02;
111       R14 ^= R03;
112 
113       R15 = R15.rotl<16>();
114       R12 = R12.rotl<16>();
115       R13 = R13.rotl<16>();
116       R14 = R14.rotl<16>();
117 
118       R10 += R15;
119       R11 += R12;
120       R08 += R13;
121       R09 += R14;
122 
123       R05 ^= R10;
124       R06 ^= R11;
125       R07 ^= R08;
126       R04 ^= R09;
127 
128       R05 = R05.rotl<12>();
129       R06 = R06.rotl<12>();
130       R07 = R07.rotl<12>();
131       R04 = R04.rotl<12>();
132 
133       R00 += R05;
134       R01 += R06;
135       R02 += R07;
136       R03 += R04;
137 
138       R15 ^= R00;
139       R12 ^= R01;
140       R13 ^= R02;
141       R14 ^= R03;
142 
143       R15 = R15.rotl<8>();
144       R12 = R12.rotl<8>();
145       R13 = R13.rotl<8>();
146       R14 = R14.rotl<8>();
147 
148       R10 += R15;
149       R11 += R12;
150       R08 += R13;
151       R09 += R14;
152 
153       R05 ^= R10;
154       R06 ^= R11;
155       R07 ^= R08;
156       R04 ^= R09;
157 
158       R05 = R05.rotl<7>();
159       R06 = R06.rotl<7>();
160       R07 = R07.rotl<7>();
161       R04 = R04.rotl<7>();
162       }
163 
164    R00 += SIMD_8x32::splat(state[0]);
165    R01 += SIMD_8x32::splat(state[1]);
166    R02 += SIMD_8x32::splat(state[2]);
167    R03 += SIMD_8x32::splat(state[3]);
168    R04 += SIMD_8x32::splat(state[4]);
169    R05 += SIMD_8x32::splat(state[5]);
170    R06 += SIMD_8x32::splat(state[6]);
171    R07 += SIMD_8x32::splat(state[7]);
172    R08 += SIMD_8x32::splat(state[8]);
173    R09 += SIMD_8x32::splat(state[9]);
174    R10 += SIMD_8x32::splat(state[10]);
175    R11 += SIMD_8x32::splat(state[11]);
176    R12 += SIMD_8x32::splat(state[12]) + CTR0;
177    R13 += SIMD_8x32::splat(state[13]) + CTR1;
178    R14 += SIMD_8x32::splat(state[14]);
179    R15 += SIMD_8x32::splat(state[15]);
180 
181    SIMD_8x32::transpose(R00, R01, R02, R03, R04, R05, R06, R07);
182    SIMD_8x32::transpose(R08, R09, R10, R11, R12, R13, R14, R15);
183 
184    R00.store_le(output);
185    R08.store_le(output + 32*1);
186    R01.store_le(output + 32*2);
187    R09.store_le(output + 32*3);
188    R02.store_le(output + 32*4);
189    R10.store_le(output + 32*5);
190    R03.store_le(output + 32*6);
191    R11.store_le(output + 32*7);
192    R04.store_le(output + 32*8);
193    R12.store_le(output + 32*9);
194    R05.store_le(output + 32*10);
195    R13.store_le(output + 32*11);
196    R06.store_le(output + 32*12);
197    R14.store_le(output + 32*13);
198    R07.store_le(output + 32*14);
199    R15.store_le(output + 32*15);
200 
201    SIMD_8x32::zero_registers();
202 
203    state[12] += 8;
204    if(state[12] < 8)
205       state[13]++;
206    }
207 }
208