/*******************************************************************************
* Copyright (c) 2013, Intel Corporation
*
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* * Redistributions of source code must retain the above copyright
*   notice, this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright
*   notice, this list of conditions and the following disclaimer in the
*   documentation and/or other materials provided with the
*   distribution.
*
* * Neither the name of the Intel Corporation nor the names of its
*   contributors may be used to endorse or promote products derived from
*   this software without specific prior written permission.
*
*
* THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
********************************************************************************
*
* Intel SHA Extensions optimized implementation of a SHA-1 update function
*
* The function takes a pointer to the current hash values, a pointer to the
* input data, and a number of 64 byte blocks to process.  Once all blocks have
* been processed, the digest pointer is updated with the resulting hash value.
* The function only processes complete blocks; there is no functionality to
* store partial blocks.  All message padding and hash value initialization must
* be done outside the update function.
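*
* Example caller (illustrative, not part of the original header; digest must
* hold the five 32-bit SHA-1 state words and data must point to complete,
* already padded 64-byte blocks):
*
*   uint32_t digest[5] = { 0x67452301, 0xefcdab89, 0x98badcfe,
*                          0x10325476, 0xc3d2e1f0 };
*   intel_sha1_step(digest, data, num_blocks);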
*
* The indented lines in the loop are instructions related to rounds processing.
* The non-indented lines are instructions related to the message schedule.
*
* Author: Sean Gulley <sean.m.gulley@intel.com>
* Date:   July 2013
*
********************************************************************************
*
* Example compiler command line:
* icc intel_sha_extensions_sha1_intrinsic.c
* gcc -msha -msse4 intel_sha_extensions_sha1_intrinsic.c
*
*******************************************************************************/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/types.h>
#include <crypto/aesni/aesni_os.h>
#include <crypto/aesni/sha_sse.h>

#include <immintrin.h>

void intel_sha1_step(uint32_t *digest, const char *data, uint32_t num_blks) {
   __m128i abcd, e0, e1;             // working state: A,B,C,D plus two E slots
   __m128i abcd_save, e_save;        // copies for the final feed-forward add
   __m128i msg0, msg1, msg2, msg3;   // message schedule, four W words apiece
   __m128i shuf_mask, e_mask;

#if 0
   /* Disabled variant: keep only the top lane of E via an explicit mask. */
   e_mask    = _mm_set_epi64x(0xFFFFFFFF00000000ull, 0x0000000000000000ull);
#else
   /* Zeroing the register up front makes the mask (and the masking below)
    * unnecessary; the cast silences the unused-variable warning. */
   (void)e_mask;
   e0        = _mm_set_epi64x(0, 0);
#endif
   // Byte-swap mask: message words are big-endian in memory
   shuf_mask = _mm_set_epi64x(0x0001020304050607ull, 0x08090a0b0c0d0e0full);

   // Load initial hash values
   abcd      = _mm_loadu_si128((__m128i*) digest);
   e0        = _mm_insert_epi32(e0, *(digest+4), 3);  // E lives in the top dword
   abcd      = _mm_shuffle_epi32(abcd, 0x1B);         // reverse lanes: A into the top dword
#if 0
   e0        = _mm_and_si128(e0, e_mask);
#endif

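   /*
    * Per-block round structure: SHA1RNDS4 executes four rounds at a time,
    * its immediate (0-3) selecting the round function and constant for
    * rounds 0-19, 20-39, 40-59, and 60-79.  SHA1NEXTE rotates the prior
    * group's working variable left by 30 to form the next E and adds it
    * into the upcoming schedule words.  SHA1MSG1 and SHA1MSG2, together
    * with an XOR, compute the message schedule
    * W[t] = ROL1(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16]).
    */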
   while (num_blks > 0) {
      // Save hash values for addition after rounds
      abcd_save = abcd;
      e_save    = e0;

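      /*
       * Rounds 0-15 consume the raw input block: each 16-byte load is
       * byte-swapped into word order, while SHA1MSG1 and the XORs begin
       * precomputing the schedule words needed from round 16 onward.
       */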
      // Rounds 0-3
      msg0 = _mm_loadu_si128((const __m128i*) data);
      msg0 = _mm_shuffle_epi8(msg0, shuf_mask);
         e0   = _mm_add_epi32(e0, msg0);
         e1   = abcd;
         abcd = _mm_sha1rnds4_epu32(abcd, e0, 0);

      // Rounds 4-7
      msg1 = _mm_loadu_si128((const __m128i*) (data+16));
      msg1 = _mm_shuffle_epi8(msg1, shuf_mask);
         e1   = _mm_sha1nexte_epu32(e1, msg1);
         e0   = abcd;
         abcd = _mm_sha1rnds4_epu32(abcd, e1, 0);
      msg0 = _mm_sha1msg1_epu32(msg0, msg1);

      // Rounds 8-11
      msg2 = _mm_loadu_si128((const __m128i*) (data+32));
      msg2 = _mm_shuffle_epi8(msg2, shuf_mask);
         e0   = _mm_sha1nexte_epu32(e0, msg2);
         e1   = abcd;
         abcd = _mm_sha1rnds4_epu32(abcd, e0, 0);
      msg1 = _mm_sha1msg1_epu32(msg1, msg2);
      msg0 = _mm_xor_si128(msg0, msg2);

      // Rounds 12-15
      msg3 = _mm_loadu_si128((const __m128i*) (data+48));
      msg3 = _mm_shuffle_epi8(msg3, shuf_mask);
         e1   = _mm_sha1nexte_epu32(e1, msg3);
         e0   = abcd;
      msg0 = _mm_sha1msg2_epu32(msg0, msg3);
         abcd = _mm_sha1rnds4_epu32(abcd, e1, 0);
      msg2 = _mm_sha1msg1_epu32(msg2, msg3);
      msg1 = _mm_xor_si128(msg1, msg3);

      // Rounds 16-19
         e0   = _mm_sha1nexte_epu32(e0, msg0);
         e1   = abcd;
      msg1 = _mm_sha1msg2_epu32(msg1, msg0);
         abcd = _mm_sha1rnds4_epu32(abcd, e0, 0);
      msg3 = _mm_sha1msg1_epu32(msg3, msg0);
      msg2 = _mm_xor_si128(msg2, msg0);

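      /*
       * From rounds 20 onward the code repeats the same six-instruction
       * pattern, changing only the SHA1RNDS4 immediate as the round
       * constants advance.
       */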
      // Rounds 20-23
         e1   = _mm_sha1nexte_epu32(e1, msg1);
         e0   = abcd;
      msg2 = _mm_sha1msg2_epu32(msg2, msg1);
         abcd = _mm_sha1rnds4_epu32(abcd, e1, 1);
      msg0 = _mm_sha1msg1_epu32(msg0, msg1);
      msg3 = _mm_xor_si128(msg3, msg1);

      // Rounds 24-27
         e0   = _mm_sha1nexte_epu32(e0, msg2);
         e1   = abcd;
      msg3 = _mm_sha1msg2_epu32(msg3, msg2);
         abcd = _mm_sha1rnds4_epu32(abcd, e0, 1);
      msg1 = _mm_sha1msg1_epu32(msg1, msg2);
      msg0 = _mm_xor_si128(msg0, msg2);

      // Rounds 28-31
         e1   = _mm_sha1nexte_epu32(e1, msg3);
         e0   = abcd;
      msg0 = _mm_sha1msg2_epu32(msg0, msg3);
         abcd = _mm_sha1rnds4_epu32(abcd, e1, 1);
      msg2 = _mm_sha1msg1_epu32(msg2, msg3);
      msg1 = _mm_xor_si128(msg1, msg3);

      // Rounds 32-35
         e0   = _mm_sha1nexte_epu32(e0, msg0);
         e1   = abcd;
      msg1 = _mm_sha1msg2_epu32(msg1, msg0);
         abcd = _mm_sha1rnds4_epu32(abcd, e0, 1);
      msg3 = _mm_sha1msg1_epu32(msg3, msg0);
      msg2 = _mm_xor_si128(msg2, msg0);

      // Rounds 36-39
         e1   = _mm_sha1nexte_epu32(e1, msg1);
         e0   = abcd;
      msg2 = _mm_sha1msg2_epu32(msg2, msg1);
         abcd = _mm_sha1rnds4_epu32(abcd, e1, 1);
      msg0 = _mm_sha1msg1_epu32(msg0, msg1);
      msg3 = _mm_xor_si128(msg3, msg1);

      // Rounds 40-43
         e0   = _mm_sha1nexte_epu32(e0, msg2);
         e1   = abcd;
      msg3 = _mm_sha1msg2_epu32(msg3, msg2);
         abcd = _mm_sha1rnds4_epu32(abcd, e0, 2);
      msg1 = _mm_sha1msg1_epu32(msg1, msg2);
      msg0 = _mm_xor_si128(msg0, msg2);

      // Rounds 44-47
         e1   = _mm_sha1nexte_epu32(e1, msg3);
         e0   = abcd;
      msg0 = _mm_sha1msg2_epu32(msg0, msg3);
         abcd = _mm_sha1rnds4_epu32(abcd, e1, 2);
      msg2 = _mm_sha1msg1_epu32(msg2, msg3);
      msg1 = _mm_xor_si128(msg1, msg3);

      // Rounds 48-51
         e0   = _mm_sha1nexte_epu32(e0, msg0);
         e1   = abcd;
      msg1 = _mm_sha1msg2_epu32(msg1, msg0);
         abcd = _mm_sha1rnds4_epu32(abcd, e0, 2);
      msg3 = _mm_sha1msg1_epu32(msg3, msg0);
      msg2 = _mm_xor_si128(msg2, msg0);

      // Rounds 52-55
         e1   = _mm_sha1nexte_epu32(e1, msg1);
         e0   = abcd;
      msg2 = _mm_sha1msg2_epu32(msg2, msg1);
         abcd = _mm_sha1rnds4_epu32(abcd, e1, 2);
      msg0 = _mm_sha1msg1_epu32(msg0, msg1);
      msg3 = _mm_xor_si128(msg3, msg1);

      // Rounds 56-59
         e0   = _mm_sha1nexte_epu32(e0, msg2);
         e1   = abcd;
      msg3 = _mm_sha1msg2_epu32(msg3, msg2);
         abcd = _mm_sha1rnds4_epu32(abcd, e0, 2);
      msg1 = _mm_sha1msg1_epu32(msg1, msg2);
      msg0 = _mm_xor_si128(msg0, msg2);

      // Rounds 60-63
         e1   = _mm_sha1nexte_epu32(e1, msg3);
         e0   = abcd;
      msg0 = _mm_sha1msg2_epu32(msg0, msg3);
         abcd = _mm_sha1rnds4_epu32(abcd, e1, 3);
      msg2 = _mm_sha1msg1_epu32(msg2, msg3);
      msg1 = _mm_xor_si128(msg1, msg3);

      // Rounds 64-67
         e0   = _mm_sha1nexte_epu32(e0, msg0);
         e1   = abcd;
      msg1 = _mm_sha1msg2_epu32(msg1, msg0);
         abcd = _mm_sha1rnds4_epu32(abcd, e0, 3);
      msg3 = _mm_sha1msg1_epu32(msg3, msg0);
      msg2 = _mm_xor_si128(msg2, msg0);

      // Rounds 68-71
         e1   = _mm_sha1nexte_epu32(e1, msg1);
         e0   = abcd;
      msg2 = _mm_sha1msg2_epu32(msg2, msg1);
         abcd = _mm_sha1rnds4_epu32(abcd, e1, 3);
      msg3 = _mm_xor_si128(msg3, msg1);

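      /*
       * Schedule wind-down: W[76..79] are the last words needed, so the
       * remaining groups omit the SHA1MSG1 and XOR prework.
       */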
      // Rounds 72-75
         e0   = _mm_sha1nexte_epu32(e0, msg2);
         e1   = abcd;
      msg3 = _mm_sha1msg2_epu32(msg3, msg2);
         abcd = _mm_sha1rnds4_epu32(abcd, e0, 3);

      // Rounds 76-79
         e1   = _mm_sha1nexte_epu32(e1, msg3);
         e0   = abcd;
         abcd = _mm_sha1rnds4_epu32(abcd, e1, 3);

      // Add current hash values to those saved before the rounds.
      // SHA1NEXTE doubles as the final-E computation here: it rotates the
      // A value from before the last four rounds and adds the saved E.
      e0   = _mm_sha1nexte_epu32(e0, e_save);
      abcd = _mm_add_epi32(abcd, abcd_save);

      data += 64;
      num_blks--;
   }

   // Shuffle A,B,C,D back into digest order and write out the state
   abcd = _mm_shuffle_epi32(abcd, 0x1B);
   _mm_storeu_si128((__m128i*) digest, abcd);  // unaligned store, matching the loadu above
   *(digest+4) = _mm_extract_epi32(e0, 3);
}
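
#if 0
/*
 * Illustrative caller sketch, not part of the original file: the padding
 * and initialization that, per the header comment, must happen outside the
 * update function.  The helper name and its single-block limit
 * (len <= 55 bytes) are assumptions made for this example only.
 */
static void sha1_one_block_example(uint32_t digest[5], const char *msg,
    size_t len) {
   char block[64] = { 0 };
   uint64_t bits = (uint64_t)len * 8;
   size_t i;

   // Standard SHA-1 initial state (FIPS 180-4)
   digest[0] = 0x67452301; digest[1] = 0xefcdab89;
   digest[2] = 0x98badcfe; digest[3] = 0x10325476;
   digest[4] = 0xc3d2e1f0;

   for (i = 0; i < len; i++)
      block[i] = msg[i];
   block[len] = (char)0x80;         // mandatory 0x80 terminator byte
   for (i = 0; i < 8; i++)          // 64-bit big-endian bit count
      block[56 + i] = (char)(bits >> (56 - 8 * i));

   intel_sha1_step(digest, block, 1);
   // digest[] now holds the five state words; serialize each word
   // most-significant byte first to obtain the canonical SHA-1 digest.
}
#endif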