1 /*
2 * (C) 2018 Jack Lloyd
3 *
4 * Botan is released under the Simplified BSD License (see license.txt)
5 */
6
7 #include <botan/chacha.h>
8 #include <botan/internal/simd_avx2.h>
9
10 namespace Botan {
11
12 //static
13 BOTAN_FUNC_ISA("avx2")
chacha_avx2_x8(uint8_t output[64* 8],uint32_t state[16],size_t rounds)14 void ChaCha::chacha_avx2_x8(uint8_t output[64*8], uint32_t state[16], size_t rounds)
15 {
16 SIMD_8x32::reset_registers();
17
18 BOTAN_ASSERT(rounds % 2 == 0, "Valid rounds");
19 const SIMD_8x32 CTR0 = SIMD_8x32(0, 1, 2, 3, 4, 5, 6, 7);
20
21 const uint32_t C = 0xFFFFFFFF - state[12];
22 const SIMD_8x32 CTR1 = SIMD_8x32(0, C < 1, C < 2, C < 3, C < 4, C < 5, C < 6, C < 7);
23
24 SIMD_8x32 R00 = SIMD_8x32::splat(state[ 0]);
25 SIMD_8x32 R01 = SIMD_8x32::splat(state[ 1]);
26 SIMD_8x32 R02 = SIMD_8x32::splat(state[ 2]);
27 SIMD_8x32 R03 = SIMD_8x32::splat(state[ 3]);
28 SIMD_8x32 R04 = SIMD_8x32::splat(state[ 4]);
29 SIMD_8x32 R05 = SIMD_8x32::splat(state[ 5]);
30 SIMD_8x32 R06 = SIMD_8x32::splat(state[ 6]);
31 SIMD_8x32 R07 = SIMD_8x32::splat(state[ 7]);
32 SIMD_8x32 R08 = SIMD_8x32::splat(state[ 8]);
33 SIMD_8x32 R09 = SIMD_8x32::splat(state[ 9]);
34 SIMD_8x32 R10 = SIMD_8x32::splat(state[10]);
35 SIMD_8x32 R11 = SIMD_8x32::splat(state[11]);
36 SIMD_8x32 R12 = SIMD_8x32::splat(state[12]) + CTR0;
37 SIMD_8x32 R13 = SIMD_8x32::splat(state[13]) + CTR1;
38 SIMD_8x32 R14 = SIMD_8x32::splat(state[14]);
39 SIMD_8x32 R15 = SIMD_8x32::splat(state[15]);
40
41 for(size_t r = 0; r != rounds / 2; ++r)
42 {
43 R00 += R04;
44 R01 += R05;
45 R02 += R06;
46 R03 += R07;
47
48 R12 ^= R00;
49 R13 ^= R01;
50 R14 ^= R02;
51 R15 ^= R03;
52
53 R12 = R12.rotl<16>();
54 R13 = R13.rotl<16>();
55 R14 = R14.rotl<16>();
56 R15 = R15.rotl<16>();
57
58 R08 += R12;
59 R09 += R13;
60 R10 += R14;
61 R11 += R15;
62
63 R04 ^= R08;
64 R05 ^= R09;
65 R06 ^= R10;
66 R07 ^= R11;
67
68 R04 = R04.rotl<12>();
69 R05 = R05.rotl<12>();
70 R06 = R06.rotl<12>();
71 R07 = R07.rotl<12>();
72
73 R00 += R04;
74 R01 += R05;
75 R02 += R06;
76 R03 += R07;
77
78 R12 ^= R00;
79 R13 ^= R01;
80 R14 ^= R02;
81 R15 ^= R03;
82
83 R12 = R12.rotl<8>();
84 R13 = R13.rotl<8>();
85 R14 = R14.rotl<8>();
86 R15 = R15.rotl<8>();
87
88 R08 += R12;
89 R09 += R13;
90 R10 += R14;
91 R11 += R15;
92
93 R04 ^= R08;
94 R05 ^= R09;
95 R06 ^= R10;
96 R07 ^= R11;
97
98 R04 = R04.rotl<7>();
99 R05 = R05.rotl<7>();
100 R06 = R06.rotl<7>();
101 R07 = R07.rotl<7>();
102
103 R00 += R05;
104 R01 += R06;
105 R02 += R07;
106 R03 += R04;
107
108 R15 ^= R00;
109 R12 ^= R01;
110 R13 ^= R02;
111 R14 ^= R03;
112
113 R15 = R15.rotl<16>();
114 R12 = R12.rotl<16>();
115 R13 = R13.rotl<16>();
116 R14 = R14.rotl<16>();
117
118 R10 += R15;
119 R11 += R12;
120 R08 += R13;
121 R09 += R14;
122
123 R05 ^= R10;
124 R06 ^= R11;
125 R07 ^= R08;
126 R04 ^= R09;
127
128 R05 = R05.rotl<12>();
129 R06 = R06.rotl<12>();
130 R07 = R07.rotl<12>();
131 R04 = R04.rotl<12>();
132
133 R00 += R05;
134 R01 += R06;
135 R02 += R07;
136 R03 += R04;
137
138 R15 ^= R00;
139 R12 ^= R01;
140 R13 ^= R02;
141 R14 ^= R03;
142
143 R15 = R15.rotl<8>();
144 R12 = R12.rotl<8>();
145 R13 = R13.rotl<8>();
146 R14 = R14.rotl<8>();
147
148 R10 += R15;
149 R11 += R12;
150 R08 += R13;
151 R09 += R14;
152
153 R05 ^= R10;
154 R06 ^= R11;
155 R07 ^= R08;
156 R04 ^= R09;
157
158 R05 = R05.rotl<7>();
159 R06 = R06.rotl<7>();
160 R07 = R07.rotl<7>();
161 R04 = R04.rotl<7>();
162 }
163
164 R00 += SIMD_8x32::splat(state[0]);
165 R01 += SIMD_8x32::splat(state[1]);
166 R02 += SIMD_8x32::splat(state[2]);
167 R03 += SIMD_8x32::splat(state[3]);
168 R04 += SIMD_8x32::splat(state[4]);
169 R05 += SIMD_8x32::splat(state[5]);
170 R06 += SIMD_8x32::splat(state[6]);
171 R07 += SIMD_8x32::splat(state[7]);
172 R08 += SIMD_8x32::splat(state[8]);
173 R09 += SIMD_8x32::splat(state[9]);
174 R10 += SIMD_8x32::splat(state[10]);
175 R11 += SIMD_8x32::splat(state[11]);
176 R12 += SIMD_8x32::splat(state[12]) + CTR0;
177 R13 += SIMD_8x32::splat(state[13]) + CTR1;
178 R14 += SIMD_8x32::splat(state[14]);
179 R15 += SIMD_8x32::splat(state[15]);
180
181 SIMD_8x32::transpose(R00, R01, R02, R03, R04, R05, R06, R07);
182 SIMD_8x32::transpose(R08, R09, R10, R11, R12, R13, R14, R15);
183
184 R00.store_le(output);
185 R08.store_le(output + 32*1);
186 R01.store_le(output + 32*2);
187 R09.store_le(output + 32*3);
188 R02.store_le(output + 32*4);
189 R10.store_le(output + 32*5);
190 R03.store_le(output + 32*6);
191 R11.store_le(output + 32*7);
192 R04.store_le(output + 32*8);
193 R12.store_le(output + 32*9);
194 R05.store_le(output + 32*10);
195 R13.store_le(output + 32*11);
196 R06.store_le(output + 32*12);
197 R14.store_le(output + 32*13);
198 R07.store_le(output + 32*14);
199 R15.store_le(output + 32*15);
200
201 SIMD_8x32::zero_registers();
202
203 state[12] += 8;
204 if(state[12] < 8)
205 state[13]++;
206 }
207 }
208