1 /* Sha1Opt.c -- SHA-1 optimized code for SHA-1 hardware instructions
2 2021-04-01 : Igor Pavlov : Public domain */
3 
4 #include "Precomp.h"
5 
6 #if defined(_MSC_VER)
7 #if (_MSC_VER < 1900) && (_MSC_VER >= 1200)
8 // #define USE_MY_MM
9 #endif
10 #endif
11 
12 #include "CpuArch.h"
13 
14 #ifdef MY_CPU_X86_OR_AMD64
15   #if defined(__clang__)
16     #if (__clang_major__ >= 8) // fix that check
17       #define USE_HW_SHA
18       #ifndef __SHA__
19         #define ATTRIB_SHA __attribute__((__target__("sha,ssse3")))
20         #if defined(_MSC_VER)
21           // SSSE3: for clang-cl:
22           #include <tmmintrin.h>
23           #define __SHA__
24         #endif
25       #endif
26       #pragma clang diagnostic ignored "-Wvector-conversion"
27     #endif
28   #elif defined(__GNUC__)
29     #if (__GNUC__ >= 8) // fix that check
30       #define USE_HW_SHA
31       #ifndef __SHA__
32         #define ATTRIB_SHA __attribute__((__target__("sha,ssse3")))
33         // #pragma GCC target("sha,ssse3")
34       #endif
35     #endif
36   #elif defined(__INTEL_COMPILER)
37     #if (__INTEL_COMPILER >= 1800) // fix that check
38       #define USE_HW_SHA
39     #endif
40   #elif defined(_MSC_VER)
41     #ifdef USE_MY_MM
42       #define USE_VER_MIN 1300
43     #else
44       #define USE_VER_MIN 1910
45     #endif
46     #if _MSC_VER >= USE_VER_MIN
47       #define USE_HW_SHA
48     #endif
49   #endif
// No #endif here: the MY_CPU_X86_OR_AMD64 branch stays open until the "#elif defined(MY_CPU_ARM_OR_ARM64)" further down.
51 
52 #ifdef USE_HW_SHA
53 
54 // #pragma message("Sha1 HW")
55 // #include <wmmintrin.h>
56 
57 #if !defined(_MSC_VER) || (_MSC_VER >= 1900)
58 #include <immintrin.h>
59 #else
60 #include <emmintrin.h>
61 
62 #if defined(_MSC_VER) && (_MSC_VER >= 1600)
63 // #include <intrin.h>
64 #endif
65 
66 #ifdef USE_MY_MM
67 #include "My_mm.h"
68 #endif
69 
70 #endif
71 
72 /*
73 SHA1 uses:
74 SSE2:
75   _mm_loadu_si128
76   _mm_storeu_si128
77   _mm_set_epi32
78   _mm_add_epi32
79   _mm_shuffle_epi32 / pshufd
80   _mm_xor_si128
81   _mm_cvtsi128_si32
82   _mm_cvtsi32_si128
83 SSSE3:
84   _mm_shuffle_epi8 / pshufb
85 
86 SHA:
87   _mm_sha1*
88 */
89 
/*
  Statement-style wrappers over the SSE2 / SSSE3 intrinsics used below.
  Each one writes its result back into its first argument and already ends
  with ';', so a call site may be written with or without a semicolon.
*/
#define ADD_EPI32(v, x)        v = _mm_add_epi32(v, x);
#define XOR_SI128(v, x)        v = _mm_xor_si128(v, x);
#define SHUFFLE_EPI8(v, ctl)   v = _mm_shuffle_epi8(v, ctl);
#define SHUFFLE_EPI32(v, imm)  v = _mm_shuffle_epi32(v, imm);

/* Same convention for the SHA extension intrinsics. */
#define SHA1_RND4(s, e, fn)    s = _mm_sha1rnds4_epu32(s, e, fn);
#define SHA1_NEXTE(e, w)       e = _mm_sha1nexte_epu32(e, w);
#define SHA1_MSG1(w, x)        w = _mm_sha1msg1_epu32(w, x);
#define SHA1_MSG2(w, x)        w = _mm_sha1msg2_epu32(w, x);

/*
  Load 16 bytes of the current block (block-relative vector index k) into m
  and permute them with the byte-shuffle control.
  Uses the enclosing function's `data` and `mask` variables.
*/
#define LOAD_SHUFFLE(m, k) \
    m = _mm_loadu_si128((const __m128i *)(const void *)(data + (k) * 16)); \
    SHUFFLE_EPI8(m, mask);

/*
  Message-vector update steps that R4 can run after its rounds.
  All four take the same (w0, w1, w2, w3) list so they are interchangeable
  as the OP argument:
    SM1 : MSG1 only
    SM2 : XOR + MSG2 only
    SM3 : XOR, MSG1, MSG2
    NNN : nothing
*/
#define SM1(w0, w1, w2, w3) \
    SHA1_MSG1(w0, w1);

#define SM2(w0, w1, w2, w3) \
    XOR_SI128(w3, w1); \
    SHA1_MSG2(w3, w2);

#define SM3(w0, w1, w2, w3) \
    XOR_SI128(w3, w1); \
    SHA1_MSG1(w0, w1); \
    SHA1_MSG2(w3, w2);

#define NNN(w0, w1, w2, w3)

/*
  One group of four rounds.
    k        : group index; (k) / 5 is the function selector passed to
               the rnds4 instruction
    ea       : E operand consumed by this group
    eb       : receives a copy of abcd taken before the rounds, then is
               combined with w1 via SHA1_NEXTE for use by the next group
    w0..w3   : message vectors handed to OP
    OP       : one of SM1 / SM2 / SM3 / NNN
  Reads and writes the enclosing function's `abcd`.
*/
#define R4(k, ea, eb, w0, w1, w2, w3, OP) \
    eb = abcd; \
    SHA1_RND4(abcd, ea, (k) / 5); \
    SHA1_NEXTE(eb, w1); \
    OP(w0, w1, w2, w3);

/*
  Four R4 groups (16 rounds). The roles of e0/e1 alternate and the message
  vectors rotate by one position per group. `mx` is the vector given to
  SHA1_NEXTE in the fourth group.
  Uses the enclosing function's e0, e1, m0, m1, m2, m3.
*/
#define R16(k, mx, OP0, OP1, OP2, OP3) \
    R4 ( (k)*4+0, e0,e1, m0,m1,m2,m3, OP0 ) \
    R4 ( (k)*4+1, e1,e0, m1,m2,m3,m0, OP1 ) \
    R4 ( (k)*4+2, e0,e1, m2,m3,m0,m1, OP2 ) \
    R4 ( (k)*4+3, e1,e0, m3,mx,m1,m2, OP3 )

/* Reverse the order of the four 32-bit lanes in both abcd and e0 (0x1B = 0,1,2,3 -> 3,2,1,0). */
#define PREPARE_STATE \
    SHUFFLE_EPI32 (abcd, 0x1B); \
    SHUFFLE_EPI32 (e0,   0x1B);
155 
156 
157 
158 
159 
160 void MY_FAST_CALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t numBlocks);
161 #ifdef ATTRIB_SHA
162 ATTRIB_SHA
163 #endif
Sha1_UpdateBlocks_HW(UInt32 state[5],const Byte * data,size_t numBlocks)164 void MY_FAST_CALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t numBlocks)
165 {
166   const __m128i mask = _mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f);
167 
168   __m128i abcd, e0;
169 
170   if (numBlocks == 0)
171     return;
172 
173   abcd = _mm_loadu_si128((const __m128i *) (const void *) &state[0]); // dbca
174   e0 = _mm_cvtsi32_si128((int)state[4]); // 000e
175 
176   PREPARE_STATE
177 
178   do
179   {
180     __m128i abcd_save, e2;
181     __m128i m0, m1, m2, m3;
182     __m128i e1;
183 
184 
185     abcd_save = abcd;
186     e2 = e0;
187 
188     LOAD_SHUFFLE (m0, 0)
189     LOAD_SHUFFLE (m1, 1)
190     LOAD_SHUFFLE (m2, 2)
191     LOAD_SHUFFLE (m3, 3)
192 
193     ADD_EPI32(e0, m0);
194 
195     R16 ( 0, m0, SM1, SM3, SM3, SM3 );
196     R16 ( 1, m0, SM3, SM3, SM3, SM3 );
197     R16 ( 2, m0, SM3, SM3, SM3, SM3 );
198     R16 ( 3, m0, SM3, SM3, SM3, SM3 );
199     R16 ( 4, e2, SM2, NNN, NNN, NNN );
200 
201     ADD_EPI32(abcd, abcd_save);
202 
203     data += 64;
204   }
205   while (--numBlocks);
206 
207   PREPARE_STATE
208 
209   _mm_storeu_si128((__m128i *) (void *) state, abcd);
210   *(state+4) = (UInt32)_mm_cvtsi128_si32(e0);
211 }
212 
213 #endif // USE_HW_SHA
214 
215 #elif defined(MY_CPU_ARM_OR_ARM64)
216 
217   #if defined(__clang__)
218     #if (__clang_major__ >= 8) // fix that check
219       #define USE_HW_SHA
220     #endif
221   #elif defined(__GNUC__)
222     #if (__GNUC__ >= 6) // fix that check
223       #define USE_HW_SHA
224     #endif
225   #elif defined(_MSC_VER)
226     #if _MSC_VER >= 1910
227       #define USE_HW_SHA
228     #endif
229   #endif
230 
231 #ifdef USE_HW_SHA
232 
233 // #pragma message("=== Sha1 HW === ")
234 
235 #if defined(__clang__) || defined(__GNUC__)
236   #ifdef MY_CPU_ARM64
237     #define ATTRIB_SHA __attribute__((__target__("+crypto")))
238   #else
239     #define ATTRIB_SHA __attribute__((__target__("fpu=crypto-neon-fp-armv8")))
240   #endif
241 #else
242   // _MSC_VER
243   // for arm32
244   #define _ARM_USE_NEW_NEON_INTRINSICS
245 #endif
246 
247 #if defined(_MSC_VER) && defined(MY_CPU_ARM64)
248 #include <arm64_neon.h>
249 #else
250 #include <arm_neon.h>
251 #endif
252 
253 typedef uint32x4_t v128;
254 // typedef __n128 v128; // MSVC
255 
256 #ifdef MY_CPU_BE
257   #define MY_rev32_for_LE(x)
258 #else
259   #define MY_rev32_for_LE(x) x = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x)))
260 #endif
261 
262 #define LOAD_128(_p)      (*(const v128 *)(const void *)(_p))
263 #define STORE_128(_p, _v) *(v128 *)(void *)(_p) = (_v)
264 
265 #define LOAD_SHUFFLE(m, k) \
266     m = LOAD_128((data + (k) * 16)); \
267     MY_rev32_for_LE(m); \
268 
269 #define SU0(dest, src2, src3) dest = vsha1su0q_u32(dest, src2, src3);
270 #define SU1(dest, src)        dest = vsha1su1q_u32(dest, src);
271 #define C(e)                  abcd = vsha1cq_u32(abcd, e, t);
272 #define P(e)                  abcd = vsha1pq_u32(abcd, e, t);
273 #define M(e)                  abcd = vsha1mq_u32(abcd, e, t);
274 #define H(e)                  e = vsha1h_u32(vgetq_lane_u32(abcd, 0))
275 #define T(m, c)               t = vaddq_u32(m, c)
276 
277 void MY_FAST_CALL Sha1_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks);
278 #ifdef ATTRIB_SHA
279 ATTRIB_SHA
280 #endif
Sha1_UpdateBlocks_HW(UInt32 state[8],const Byte * data,size_t numBlocks)281 void MY_FAST_CALL Sha1_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks)
282 {
283   v128 abcd;
284   v128 c0, c1, c2, c3;
285   uint32_t e0;
286 
287   if (numBlocks == 0)
288     return;
289 
290   c0 = vdupq_n_u32(0x5a827999);
291   c1 = vdupq_n_u32(0x6ed9eba1);
292   c2 = vdupq_n_u32(0x8f1bbcdc);
293   c3 = vdupq_n_u32(0xca62c1d6);
294 
295   abcd = LOAD_128(&state[0]);
296   e0 = state[4];
297 
298   do
299   {
300     v128 abcd_save;
301     v128 m0, m1, m2, m3;
302     v128 t;
303     uint32_t e0_save, e1;
304 
305     abcd_save = abcd;
306     e0_save = e0;
307 
308     LOAD_SHUFFLE (m0, 0)
309     LOAD_SHUFFLE (m1, 1)
310     LOAD_SHUFFLE (m2, 2)
311     LOAD_SHUFFLE (m3, 3)
312 
313     T(m0, c0);                                  H(e1); C(e0);
314     T(m1, c0);  SU0(m0, m1, m2);                H(e0); C(e1);
315     T(m2, c0);  SU0(m1, m2, m3);  SU1(m0, m3);  H(e1); C(e0);
316     T(m3, c0);  SU0(m2, m3, m0);  SU1(m1, m0);  H(e0); C(e1);
317     T(m0, c0);  SU0(m3, m0, m1);  SU1(m2, m1);  H(e1); C(e0);
318     T(m1, c1);  SU0(m0, m1, m2);  SU1(m3, m2);  H(e0); P(e1);
319     T(m2, c1);  SU0(m1, m2, m3);  SU1(m0, m3);  H(e1); P(e0);
320     T(m3, c1);  SU0(m2, m3, m0);  SU1(m1, m0);  H(e0); P(e1);
321     T(m0, c1);  SU0(m3, m0, m1);  SU1(m2, m1);  H(e1); P(e0);
322     T(m1, c1);  SU0(m0, m1, m2);  SU1(m3, m2);  H(e0); P(e1);
323     T(m2, c2);  SU0(m1, m2, m3);  SU1(m0, m3);  H(e1); M(e0);
324     T(m3, c2);  SU0(m2, m3, m0);  SU1(m1, m0);  H(e0); M(e1);
325     T(m0, c2);  SU0(m3, m0, m1);  SU1(m2, m1);  H(e1); M(e0);
326     T(m1, c2);  SU0(m0, m1, m2);  SU1(m3, m2);  H(e0); M(e1);
327     T(m2, c2);  SU0(m1, m2, m3);  SU1(m0, m3);  H(e1); M(e0);
328     T(m3, c3);  SU0(m2, m3, m0);  SU1(m1, m0);  H(e0); P(e1);
329     T(m0, c3);  SU0(m3, m0, m1);  SU1(m2, m1);  H(e1); P(e0);
330     T(m1, c3);                    SU1(m3, m2);  H(e0); P(e1);
331     T(m2, c3);                                  H(e1); P(e0);
332     T(m3, c3);                                  H(e0); P(e1);
333 
334     abcd = vaddq_u32(abcd, abcd_save);
335     e0 += e0_save;
336 
337     data += 64;
338   }
339   while (--numBlocks);
340 
341   STORE_128(&state[0], abcd);
342   state[4] = e0;
343 }
344 
345 #endif // USE_HW_SHA
346 
347 #endif // MY_CPU_ARM_OR_ARM64
348 
349 
350 #ifndef USE_HW_SHA
351 
352 // #error Stop_Compiling_UNSUPPORTED_SHA
353 // #include <stdlib.h>
354 
355 // #include "Sha1.h"
356 void MY_FAST_CALL Sha1_UpdateBlocks(UInt32 state[5], const Byte *data, size_t numBlocks);
357 
358 #pragma message("Sha1   HW-SW stub was used")
359 
360 void MY_FAST_CALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t numBlocks);
Sha1_UpdateBlocks_HW(UInt32 state[5],const Byte * data,size_t numBlocks)361 void MY_FAST_CALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t numBlocks)
362 {
363   Sha1_UpdateBlocks(state, data, numBlocks);
364   /*
365   UNUSED_VAR(state);
366   UNUSED_VAR(data);
367   UNUSED_VAR(numBlocks);
368   exit(1);
369   return;
370   */
371 }
372 
373 #endif
374