#include <stdint.h>
#include <stdio.h>
#include <string.h>

#include "sha256avx.h"

// Transpose 8 vectors containing 32-bit values
void PQCLEAN_SPHINCSSHA256192SROBUST_AVX2_transpose(u256 s[8]) {
    u256 tmp0[8];
    u256 tmp1[8];
    tmp0[0] = _mm256_unpacklo_epi32(s[0], s[1]);
    tmp0[1] = _mm256_unpackhi_epi32(s[0], s[1]);
    tmp0[2] = _mm256_unpacklo_epi32(s[2], s[3]);
    tmp0[3] = _mm256_unpackhi_epi32(s[2], s[3]);
    tmp0[4] = _mm256_unpacklo_epi32(s[4], s[5]);
    tmp0[5] = _mm256_unpackhi_epi32(s[4], s[5]);
    tmp0[6] = _mm256_unpacklo_epi32(s[6], s[7]);
    tmp0[7] = _mm256_unpackhi_epi32(s[6], s[7]);
    tmp1[0] = _mm256_unpacklo_epi64(tmp0[0], tmp0[2]);
    tmp1[1] = _mm256_unpackhi_epi64(tmp0[0], tmp0[2]);
    tmp1[2] = _mm256_unpacklo_epi64(tmp0[1], tmp0[3]);
    tmp1[3] = _mm256_unpackhi_epi64(tmp0[1], tmp0[3]);
    tmp1[4] = _mm256_unpacklo_epi64(tmp0[4], tmp0[6]);
    tmp1[5] = _mm256_unpackhi_epi64(tmp0[4], tmp0[6]);
    tmp1[6] = _mm256_unpacklo_epi64(tmp0[5], tmp0[7]);
    tmp1[7] = _mm256_unpackhi_epi64(tmp0[5], tmp0[7]);
    s[0] = _mm256_permute2x128_si256(tmp1[0], tmp1[4], 0x20);
    s[1] = _mm256_permute2x128_si256(tmp1[1], tmp1[5], 0x20);
    s[2] = _mm256_permute2x128_si256(tmp1[2], tmp1[6], 0x20);
    s[3] = _mm256_permute2x128_si256(tmp1[3], tmp1[7], 0x20);
    s[4] = _mm256_permute2x128_si256(tmp1[0], tmp1[4], 0x31);
    s[5] = _mm256_permute2x128_si256(tmp1[1], tmp1[5], 0x31);
    s[6] = _mm256_permute2x128_si256(tmp1[2], tmp1[6], 0x31);
    s[7] = _mm256_permute2x128_si256(tmp1[3], tmp1[7], 0x31);
}
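
/*
 * Explanatory note on the transpose above: the eight u256 registers are
 * treated as an 8x8 matrix of 32-bit words, with s[r] holding row r.
 * Three stages rearrange it into column order:
 *   1. _mm256_unpacklo/hi_epi32 interleaves the 32-bit words of row pairs
 *      (0,1), (2,3), (4,5), (6,7) within each 128-bit lane;
 *   2. _mm256_unpacklo/hi_epi64 interleaves 64-bit chunks of those results;
 *   3. _mm256_permute2x128_si256 with immediates 0x20/0x31 recombines the
 *      low/high 128-bit lanes, so s[c] ends up holding column c.
 * This switches between "one register per message" and "one register per
 * word index across all 8 messages", the layout the vectorized rounds use.
 */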

void PQCLEAN_SPHINCSSHA256192SROBUST_AVX2_sha256_clone_statex8(sha256ctxx8 *outctx, const sha256ctxx8 *inctx) {
    memcpy(outctx, inctx, sizeof(sha256ctxx8));
}

void PQCLEAN_SPHINCSSHA256192SROBUST_AVX2_sha256_init8x(sha256ctxx8 *ctx) {
    ctx->s[0] = _mm256_set_epi32((int)0x6a09e667, (int)0x6a09e667, (int)0x6a09e667, (int)0x6a09e667, (int)0x6a09e667, (int)0x6a09e667, (int)0x6a09e667, (int)0x6a09e667);
    ctx->s[1] = _mm256_set_epi32((int)0xbb67ae85, (int)0xbb67ae85, (int)0xbb67ae85, (int)0xbb67ae85, (int)0xbb67ae85, (int)0xbb67ae85, (int)0xbb67ae85, (int)0xbb67ae85);
    ctx->s[2] = _mm256_set_epi32((int)0x3c6ef372, (int)0x3c6ef372, (int)0x3c6ef372, (int)0x3c6ef372, (int)0x3c6ef372, (int)0x3c6ef372, (int)0x3c6ef372, (int)0x3c6ef372);
    ctx->s[3] = _mm256_set_epi32((int)0xa54ff53a, (int)0xa54ff53a, (int)0xa54ff53a, (int)0xa54ff53a, (int)0xa54ff53a, (int)0xa54ff53a, (int)0xa54ff53a, (int)0xa54ff53a);
    ctx->s[4] = _mm256_set_epi32((int)0x510e527f, (int)0x510e527f, (int)0x510e527f, (int)0x510e527f, (int)0x510e527f, (int)0x510e527f, (int)0x510e527f, (int)0x510e527f);
    ctx->s[5] = _mm256_set_epi32((int)0x9b05688c, (int)0x9b05688c, (int)0x9b05688c, (int)0x9b05688c, (int)0x9b05688c, (int)0x9b05688c, (int)0x9b05688c, (int)0x9b05688c);
    ctx->s[6] = _mm256_set_epi32((int)0x1f83d9ab, (int)0x1f83d9ab, (int)0x1f83d9ab, (int)0x1f83d9ab, (int)0x1f83d9ab, (int)0x1f83d9ab, (int)0x1f83d9ab, (int)0x1f83d9ab);
    ctx->s[7] = _mm256_set_epi32((int)0x5be0cd19, (int)0x5be0cd19, (int)0x5be0cd19, (int)0x5be0cd19, (int)0x5be0cd19, (int)0x5be0cd19, (int)0x5be0cd19, (int)0x5be0cd19);

    ctx->datalen = 0;
    ctx->msglen = 0;
}
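
/*
 * Note: the eight constants broadcast above are the standard SHA-256 initial
 * hash values H0..H7 from FIPS 180-4. After init, ctx->s[j] holds H_j
 * replicated across all eight 32-bit lanes, so lane k of every state
 * register tracks the hash state of input stream k. Each broadcast is
 * equivalent to _mm256_set1_epi32 of the same constant.
 */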

void PQCLEAN_SPHINCSSHA256192SROBUST_AVX2_sha256_update8x(sha256ctxx8 *ctx,
        const unsigned char *d0,
        const unsigned char *d1,
        const unsigned char *d2,
        const unsigned char *d3,
        const unsigned char *d4,
        const unsigned char *d5,
        const unsigned char *d6,
        const unsigned char *d7,
        unsigned long long len) {
    size_t i = 0;
    size_t bytes_to_copy;

    while (i < len) {
        bytes_to_copy = (size_t)len - i;
        if (bytes_to_copy > 64) {
            bytes_to_copy = 64;
        }
        memcpy(&ctx->msgblocks[64 * 0], d0 + i, bytes_to_copy);
        memcpy(&ctx->msgblocks[64 * 1], d1 + i, bytes_to_copy);
        memcpy(&ctx->msgblocks[64 * 2], d2 + i, bytes_to_copy);
        memcpy(&ctx->msgblocks[64 * 3], d3 + i, bytes_to_copy);
        memcpy(&ctx->msgblocks[64 * 4], d4 + i, bytes_to_copy);
        memcpy(&ctx->msgblocks[64 * 5], d5 + i, bytes_to_copy);
        memcpy(&ctx->msgblocks[64 * 6], d6 + i, bytes_to_copy);
        memcpy(&ctx->msgblocks[64 * 7], d7 + i, bytes_to_copy);
        ctx->datalen += (unsigned int)bytes_to_copy;
        i += bytes_to_copy;
        if (ctx->datalen == 64) {
            PQCLEAN_SPHINCSSHA256192SROBUST_AVX2_sha256_transform8x(ctx, ctx->msgblocks);
            ctx->msglen += 512;
            ctx->datalen = 0;
        }
    }
}
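
/*
 * Usage note (an observation, not enforced by the code): each iteration
 * copies data to the start of every per-lane 64-byte block buffer without
 * offsetting by ctx->datalen. The routine therefore appears to assume that
 * ctx->datalen is 0 on entry, i.e. that earlier update calls supplied whole
 * 64-byte blocks; a trailing partial block is only left behind by the last
 * update before final8x. All eight inputs are consumed in lock-step and must
 * share the same length `len`.
 */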

void PQCLEAN_SPHINCSSHA256192SROBUST_AVX2_sha256_final8x(sha256ctxx8 *ctx,
        unsigned char *out0,
        unsigned char *out1,
        unsigned char *out2,
        unsigned char *out3,
        unsigned char *out4,
        unsigned char *out5,
        unsigned char *out6,
        unsigned char *out7) {
    unsigned int i, curlen;

    // Padding: append 0x80, then zero-fill to the end of each lane's 64-byte block
    if (ctx->datalen < 56) {
        for (i = 0; i < 8; ++i) {
            curlen = ctx->datalen;
            ctx->msgblocks[64 * i + curlen++] = 0x80;
            while (curlen < 64) {
                ctx->msgblocks[64 * i + curlen++] = 0x00;
            }
        }
    } else {
        // Not enough room for the 8-byte length field: pad and process this
        // block, then build an extra all-zero block to carry the length.
        for (i = 0; i < 8; ++i) {
            curlen = ctx->datalen;
            ctx->msgblocks[64 * i + curlen++] = 0x80;
            while (curlen < 64) {
                ctx->msgblocks[64 * i + curlen++] = 0x00;
            }
        }
        PQCLEAN_SPHINCSSHA256192SROBUST_AVX2_sha256_transform8x(ctx, ctx->msgblocks);
        memset(ctx->msgblocks, 0, 8 * 64);
    }

    // Append the total message length in bits as a big-endian 64-bit value
    ctx->msglen += ctx->datalen * 8;
    for (i = 0; i < 8; i++) {
        ctx->msgblocks[64 * i + 63] = (unsigned char)ctx->msglen;
        ctx->msgblocks[64 * i + 62] = (unsigned char)(ctx->msglen >> 8);
        ctx->msgblocks[64 * i + 61] = (unsigned char)(ctx->msglen >> 16);
        ctx->msgblocks[64 * i + 60] = (unsigned char)(ctx->msglen >> 24);
        ctx->msgblocks[64 * i + 59] = (unsigned char)(ctx->msglen >> 32);
        ctx->msgblocks[64 * i + 58] = (unsigned char)(ctx->msglen >> 40);
        ctx->msgblocks[64 * i + 57] = (unsigned char)(ctx->msglen >> 48);
        ctx->msgblocks[64 * i + 56] = (unsigned char)(ctx->msglen >> 56);
    }
    PQCLEAN_SPHINCSSHA256192SROBUST_AVX2_sha256_transform8x(ctx, ctx->msgblocks);

    // Transpose the state back to one register per message
    PQCLEAN_SPHINCSSHA256192SROBUST_AVX2_transpose(ctx->s);

    // Store the hash values (32 bytes each, big-endian words)
    STORE(out0, BYTESWAP(ctx->s[0]));
    STORE(out1, BYTESWAP(ctx->s[1]));
    STORE(out2, BYTESWAP(ctx->s[2]));
    STORE(out3, BYTESWAP(ctx->s[3]));
    STORE(out4, BYTESWAP(ctx->s[4]));
    STORE(out5, BYTESWAP(ctx->s[5]));
    STORE(out6, BYTESWAP(ctx->s[6]));
    STORE(out7, BYTESWAP(ctx->s[7]));
}
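
/*
 * Minimal usage sketch (illustrative only; msg0..msg7 and msg_len are
 * hypothetical caller-provided buffers of equal length):
 *
 *   sha256ctxx8 ctx;
 *   unsigned char h0[32], h1[32], h2[32], h3[32], h4[32], h5[32], h6[32], h7[32];
 *   PQCLEAN_SPHINCSSHA256192SROBUST_AVX2_sha256_init8x(&ctx);
 *   PQCLEAN_SPHINCSSHA256192SROBUST_AVX2_sha256_update8x(&ctx, msg0, msg1, msg2, msg3,
 *           msg4, msg5, msg6, msg7, msg_len);
 *   PQCLEAN_SPHINCSSHA256192SROBUST_AVX2_sha256_final8x(&ctx, h0, h1, h2, h3, h4, h5, h6, h7);
 *
 * Each 32-byte output buffer then holds the full SHA-256 digest of the
 * corresponding input stream.
 */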

void PQCLEAN_SPHINCSSHA256192SROBUST_AVX2_sha256_transform8x(sha256ctxx8 *ctx, const unsigned char *data) {
    u256 s[8], w[64], T0, T1;
    int i;

    // Load the 16 message words of each block, byte-swapping each 32-bit word from big-endian
    for (i = 0; i < 8; i++) {
        w[i] = BYTESWAP(LOAD(data + 64 * i));
        w[i + 8] = BYTESWAP(LOAD(data + 32 + 64 * i));
    }

    // Transpose so that w[t] holds message word t of all 8 blocks
    PQCLEAN_SPHINCSSHA256192SROBUST_AVX2_transpose(w);
    PQCLEAN_SPHINCSSHA256192SROBUST_AVX2_transpose(w + 8);

    // Initial State
    s[0] = ctx->s[0];
    s[1] = ctx->s[1];
    s[2] = ctx->s[2];
    s[3] = ctx->s[3];
    s[4] = ctx->s[4];
    s[5] = ctx->s[5];
    s[6] = ctx->s[6];
    s[7] = ctx->s[7];

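    /*
     * 64 SHA-256 rounds, fully unrolled, with the working variables rotated
     * through s[0..7] by argument order instead of being shuffled in memory.
     * Rounds 0-15 consume the message words directly; from round 16 on the
     * schedule is extended on the fly as
     *   w[t] = sigma1(w[t-2]) + w[t-7] + sigma0(w[t-15]) + w[t-16],
     * which is what each ADD4_32(WSIGMA1_AVX(...), ...) line below computes,
     * one 32-bit word per lane for all eight blocks at once.
     */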
    SHA256ROUND_AVX(s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7], 0, w[0]);
    SHA256ROUND_AVX(s[7], s[0], s[1], s[2], s[3], s[4], s[5], s[6], 1, w[1]);
    SHA256ROUND_AVX(s[6], s[7], s[0], s[1], s[2], s[3], s[4], s[5], 2, w[2]);
    SHA256ROUND_AVX(s[5], s[6], s[7], s[0], s[1], s[2], s[3], s[4], 3, w[3]);
    SHA256ROUND_AVX(s[4], s[5], s[6], s[7], s[0], s[1], s[2], s[3], 4, w[4]);
    SHA256ROUND_AVX(s[3], s[4], s[5], s[6], s[7], s[0], s[1], s[2], 5, w[5]);
    SHA256ROUND_AVX(s[2], s[3], s[4], s[5], s[6], s[7], s[0], s[1], 6, w[6]);
    SHA256ROUND_AVX(s[1], s[2], s[3], s[4], s[5], s[6], s[7], s[0], 7, w[7]);
    SHA256ROUND_AVX(s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7], 8, w[8]);
    SHA256ROUND_AVX(s[7], s[0], s[1], s[2], s[3], s[4], s[5], s[6], 9, w[9]);
    SHA256ROUND_AVX(s[6], s[7], s[0], s[1], s[2], s[3], s[4], s[5], 10, w[10]);
    SHA256ROUND_AVX(s[5], s[6], s[7], s[0], s[1], s[2], s[3], s[4], 11, w[11]);
    SHA256ROUND_AVX(s[4], s[5], s[6], s[7], s[0], s[1], s[2], s[3], 12, w[12]);
    SHA256ROUND_AVX(s[3], s[4], s[5], s[6], s[7], s[0], s[1], s[2], 13, w[13]);
    SHA256ROUND_AVX(s[2], s[3], s[4], s[5], s[6], s[7], s[0], s[1], 14, w[14]);
    SHA256ROUND_AVX(s[1], s[2], s[3], s[4], s[5], s[6], s[7], s[0], 15, w[15]);
    w[16] = ADD4_32(WSIGMA1_AVX(w[14]), w[0], w[9], WSIGMA0_AVX(w[1]));
    SHA256ROUND_AVX(s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7], 16, w[16]);
    w[17] = ADD4_32(WSIGMA1_AVX(w[15]), w[1], w[10], WSIGMA0_AVX(w[2]));
    SHA256ROUND_AVX(s[7], s[0], s[1], s[2], s[3], s[4], s[5], s[6], 17, w[17]);
    w[18] = ADD4_32(WSIGMA1_AVX(w[16]), w[2], w[11], WSIGMA0_AVX(w[3]));
    SHA256ROUND_AVX(s[6], s[7], s[0], s[1], s[2], s[3], s[4], s[5], 18, w[18]);
    w[19] = ADD4_32(WSIGMA1_AVX(w[17]), w[3], w[12], WSIGMA0_AVX(w[4]));
    SHA256ROUND_AVX(s[5], s[6], s[7], s[0], s[1], s[2], s[3], s[4], 19, w[19]);
    w[20] = ADD4_32(WSIGMA1_AVX(w[18]), w[4], w[13], WSIGMA0_AVX(w[5]));
    SHA256ROUND_AVX(s[4], s[5], s[6], s[7], s[0], s[1], s[2], s[3], 20, w[20]);
    w[21] = ADD4_32(WSIGMA1_AVX(w[19]), w[5], w[14], WSIGMA0_AVX(w[6]));
    SHA256ROUND_AVX(s[3], s[4], s[5], s[6], s[7], s[0], s[1], s[2], 21, w[21]);
    w[22] = ADD4_32(WSIGMA1_AVX(w[20]), w[6], w[15], WSIGMA0_AVX(w[7]));
    SHA256ROUND_AVX(s[2], s[3], s[4], s[5], s[6], s[7], s[0], s[1], 22, w[22]);
    w[23] = ADD4_32(WSIGMA1_AVX(w[21]), w[7], w[16], WSIGMA0_AVX(w[8]));
    SHA256ROUND_AVX(s[1], s[2], s[3], s[4], s[5], s[6], s[7], s[0], 23, w[23]);
    w[24] = ADD4_32(WSIGMA1_AVX(w[22]), w[8], w[17], WSIGMA0_AVX(w[9]));
    SHA256ROUND_AVX(s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7], 24, w[24]);
    w[25] = ADD4_32(WSIGMA1_AVX(w[23]), w[9], w[18], WSIGMA0_AVX(w[10]));
    SHA256ROUND_AVX(s[7], s[0], s[1], s[2], s[3], s[4], s[5], s[6], 25, w[25]);
    w[26] = ADD4_32(WSIGMA1_AVX(w[24]), w[10], w[19], WSIGMA0_AVX(w[11]));
    SHA256ROUND_AVX(s[6], s[7], s[0], s[1], s[2], s[3], s[4], s[5], 26, w[26]);
    w[27] = ADD4_32(WSIGMA1_AVX(w[25]), w[11], w[20], WSIGMA0_AVX(w[12]));
    SHA256ROUND_AVX(s[5], s[6], s[7], s[0], s[1], s[2], s[3], s[4], 27, w[27]);
    w[28] = ADD4_32(WSIGMA1_AVX(w[26]), w[12], w[21], WSIGMA0_AVX(w[13]));
    SHA256ROUND_AVX(s[4], s[5], s[6], s[7], s[0], s[1], s[2], s[3], 28, w[28]);
    w[29] = ADD4_32(WSIGMA1_AVX(w[27]), w[13], w[22], WSIGMA0_AVX(w[14]));
    SHA256ROUND_AVX(s[3], s[4], s[5], s[6], s[7], s[0], s[1], s[2], 29, w[29]);
    w[30] = ADD4_32(WSIGMA1_AVX(w[28]), w[14], w[23], WSIGMA0_AVX(w[15]));
    SHA256ROUND_AVX(s[2], s[3], s[4], s[5], s[6], s[7], s[0], s[1], 30, w[30]);
    w[31] = ADD4_32(WSIGMA1_AVX(w[29]), w[15], w[24], WSIGMA0_AVX(w[16]));
    SHA256ROUND_AVX(s[1], s[2], s[3], s[4], s[5], s[6], s[7], s[0], 31, w[31]);
    w[32] = ADD4_32(WSIGMA1_AVX(w[30]), w[16], w[25], WSIGMA0_AVX(w[17]));
    SHA256ROUND_AVX(s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7], 32, w[32]);
    w[33] = ADD4_32(WSIGMA1_AVX(w[31]), w[17], w[26], WSIGMA0_AVX(w[18]));
    SHA256ROUND_AVX(s[7], s[0], s[1], s[2], s[3], s[4], s[5], s[6], 33, w[33]);
    w[34] = ADD4_32(WSIGMA1_AVX(w[32]), w[18], w[27], WSIGMA0_AVX(w[19]));
    SHA256ROUND_AVX(s[6], s[7], s[0], s[1], s[2], s[3], s[4], s[5], 34, w[34]);
    w[35] = ADD4_32(WSIGMA1_AVX(w[33]), w[19], w[28], WSIGMA0_AVX(w[20]));
    SHA256ROUND_AVX(s[5], s[6], s[7], s[0], s[1], s[2], s[3], s[4], 35, w[35]);
    w[36] = ADD4_32(WSIGMA1_AVX(w[34]), w[20], w[29], WSIGMA0_AVX(w[21]));
    SHA256ROUND_AVX(s[4], s[5], s[6], s[7], s[0], s[1], s[2], s[3], 36, w[36]);
    w[37] = ADD4_32(WSIGMA1_AVX(w[35]), w[21], w[30], WSIGMA0_AVX(w[22]));
    SHA256ROUND_AVX(s[3], s[4], s[5], s[6], s[7], s[0], s[1], s[2], 37, w[37]);
    w[38] = ADD4_32(WSIGMA1_AVX(w[36]), w[22], w[31], WSIGMA0_AVX(w[23]));
    SHA256ROUND_AVX(s[2], s[3], s[4], s[5], s[6], s[7], s[0], s[1], 38, w[38]);
    w[39] = ADD4_32(WSIGMA1_AVX(w[37]), w[23], w[32], WSIGMA0_AVX(w[24]));
    SHA256ROUND_AVX(s[1], s[2], s[3], s[4], s[5], s[6], s[7], s[0], 39, w[39]);
    w[40] = ADD4_32(WSIGMA1_AVX(w[38]), w[24], w[33], WSIGMA0_AVX(w[25]));
    SHA256ROUND_AVX(s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7], 40, w[40]);
    w[41] = ADD4_32(WSIGMA1_AVX(w[39]), w[25], w[34], WSIGMA0_AVX(w[26]));
    SHA256ROUND_AVX(s[7], s[0], s[1], s[2], s[3], s[4], s[5], s[6], 41, w[41]);
    w[42] = ADD4_32(WSIGMA1_AVX(w[40]), w[26], w[35], WSIGMA0_AVX(w[27]));
    SHA256ROUND_AVX(s[6], s[7], s[0], s[1], s[2], s[3], s[4], s[5], 42, w[42]);
    w[43] = ADD4_32(WSIGMA1_AVX(w[41]), w[27], w[36], WSIGMA0_AVX(w[28]));
    SHA256ROUND_AVX(s[5], s[6], s[7], s[0], s[1], s[2], s[3], s[4], 43, w[43]);
    w[44] = ADD4_32(WSIGMA1_AVX(w[42]), w[28], w[37], WSIGMA0_AVX(w[29]));
    SHA256ROUND_AVX(s[4], s[5], s[6], s[7], s[0], s[1], s[2], s[3], 44, w[44]);
    w[45] = ADD4_32(WSIGMA1_AVX(w[43]), w[29], w[38], WSIGMA0_AVX(w[30]));
    SHA256ROUND_AVX(s[3], s[4], s[5], s[6], s[7], s[0], s[1], s[2], 45, w[45]);
    w[46] = ADD4_32(WSIGMA1_AVX(w[44]), w[30], w[39], WSIGMA0_AVX(w[31]));
    SHA256ROUND_AVX(s[2], s[3], s[4], s[5], s[6], s[7], s[0], s[1], 46, w[46]);
    w[47] = ADD4_32(WSIGMA1_AVX(w[45]), w[31], w[40], WSIGMA0_AVX(w[32]));
    SHA256ROUND_AVX(s[1], s[2], s[3], s[4], s[5], s[6], s[7], s[0], 47, w[47]);
    w[48] = ADD4_32(WSIGMA1_AVX(w[46]), w[32], w[41], WSIGMA0_AVX(w[33]));
    SHA256ROUND_AVX(s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7], 48, w[48]);
    w[49] = ADD4_32(WSIGMA1_AVX(w[47]), w[33], w[42], WSIGMA0_AVX(w[34]));
    SHA256ROUND_AVX(s[7], s[0], s[1], s[2], s[3], s[4], s[5], s[6], 49, w[49]);
    w[50] = ADD4_32(WSIGMA1_AVX(w[48]), w[34], w[43], WSIGMA0_AVX(w[35]));
    SHA256ROUND_AVX(s[6], s[7], s[0], s[1], s[2], s[3], s[4], s[5], 50, w[50]);
    w[51] = ADD4_32(WSIGMA1_AVX(w[49]), w[35], w[44], WSIGMA0_AVX(w[36]));
    SHA256ROUND_AVX(s[5], s[6], s[7], s[0], s[1], s[2], s[3], s[4], 51, w[51]);
    w[52] = ADD4_32(WSIGMA1_AVX(w[50]), w[36], w[45], WSIGMA0_AVX(w[37]));
    SHA256ROUND_AVX(s[4], s[5], s[6], s[7], s[0], s[1], s[2], s[3], 52, w[52]);
    w[53] = ADD4_32(WSIGMA1_AVX(w[51]), w[37], w[46], WSIGMA0_AVX(w[38]));
    SHA256ROUND_AVX(s[3], s[4], s[5], s[6], s[7], s[0], s[1], s[2], 53, w[53]);
    w[54] = ADD4_32(WSIGMA1_AVX(w[52]), w[38], w[47], WSIGMA0_AVX(w[39]));
    SHA256ROUND_AVX(s[2], s[3], s[4], s[5], s[6], s[7], s[0], s[1], 54, w[54]);
    w[55] = ADD4_32(WSIGMA1_AVX(w[53]), w[39], w[48], WSIGMA0_AVX(w[40]));
    SHA256ROUND_AVX(s[1], s[2], s[3], s[4], s[5], s[6], s[7], s[0], 55, w[55]);
    w[56] = ADD4_32(WSIGMA1_AVX(w[54]), w[40], w[49], WSIGMA0_AVX(w[41]));
    SHA256ROUND_AVX(s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7], 56, w[56]);
    w[57] = ADD4_32(WSIGMA1_AVX(w[55]), w[41], w[50], WSIGMA0_AVX(w[42]));
    SHA256ROUND_AVX(s[7], s[0], s[1], s[2], s[3], s[4], s[5], s[6], 57, w[57]);
    w[58] = ADD4_32(WSIGMA1_AVX(w[56]), w[42], w[51], WSIGMA0_AVX(w[43]));
    SHA256ROUND_AVX(s[6], s[7], s[0], s[1], s[2], s[3], s[4], s[5], 58, w[58]);
    w[59] = ADD4_32(WSIGMA1_AVX(w[57]), w[43], w[52], WSIGMA0_AVX(w[44]));
    SHA256ROUND_AVX(s[5], s[6], s[7], s[0], s[1], s[2], s[3], s[4], 59, w[59]);
    w[60] = ADD4_32(WSIGMA1_AVX(w[58]), w[44], w[53], WSIGMA0_AVX(w[45]));
    SHA256ROUND_AVX(s[4], s[5], s[6], s[7], s[0], s[1], s[2], s[3], 60, w[60]);
    w[61] = ADD4_32(WSIGMA1_AVX(w[59]), w[45], w[54], WSIGMA0_AVX(w[46]));
    SHA256ROUND_AVX(s[3], s[4], s[5], s[6], s[7], s[0], s[1], s[2], 61, w[61]);
    w[62] = ADD4_32(WSIGMA1_AVX(w[60]), w[46], w[55], WSIGMA0_AVX(w[47]));
    SHA256ROUND_AVX(s[2], s[3], s[4], s[5], s[6], s[7], s[0], s[1], 62, w[62]);
    w[63] = ADD4_32(WSIGMA1_AVX(w[61]), w[47], w[56], WSIGMA0_AVX(w[48]));
    SHA256ROUND_AVX(s[1], s[2], s[3], s[4], s[5], s[6], s[7], s[0], 63, w[63]);

    // Feed Forward
    ctx->s[0] = ADD32(s[0], ctx->s[0]);
    ctx->s[1] = ADD32(s[1], ctx->s[1]);
    ctx->s[2] = ADD32(s[2], ctx->s[2]);
    ctx->s[3] = ADD32(s[3], ctx->s[3]);
    ctx->s[4] = ADD32(s[4], ctx->s[4]);
    ctx->s[5] = ADD32(s[5], ctx->s[5]);
    ctx->s[6] = ADD32(s[6], ctx->s[6]);
    ctx->s[7] = ADD32(s[7], ctx->s[7]);
}