xref: /freebsd/sys/crypto/aesni/intel_sha1.c (revision fdafd315)
/*******************************************************************************
* Copyright (c) 2013, Intel Corporation
*
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* * Redistributions of source code must retain the above copyright
*   notice, this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright
*   notice, this list of conditions and the following disclaimer in the
*   documentation and/or other materials provided with the
*   distribution.
*
* * Neither the name of the Intel Corporation nor the names of its
*   contributors may be used to endorse or promote products derived from
*   this software without specific prior written permission.
*
*
* THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
********************************************************************************
*
* Intel SHA Extensions optimized implementation of a SHA-1 update function
*
* The function takes a pointer to the current hash values, a pointer to the
* input data, and a number of 64 byte blocks to process.  Once all blocks have
* been processed, the digest pointer is updated with the resulting hash value.
* The function only processes complete blocks, there is no functionality to
* store partial blocks.  All message padding and hash value initialization must
* be done outside the update function.
*
* The indented lines in the loop are instructions related to rounds processing.
* The non-indented lines are instructions related to the message schedule.
*
* Author: Sean Gulley <sean.m.gulley@intel.com>
* Date:   July 2013
*
********************************************************************************
*
* Example compiler command line:
* icc intel_sha_extensions_sha1_intrinsic.c
* gcc -msha -msse4 intel_sha_extensions_sha1_intrinsic.c
*
*******************************************************************************/
58fdafd315SWarner Losh 
59fe182ba1SConrad Meyer #include <sys/types.h>
6050cf4f89SConrad Meyer #include <crypto/aesni/aesni_os.h>
61fe182ba1SConrad Meyer #include <crypto/aesni/sha_sse.h>
62fe182ba1SConrad Meyer 
6350cf4f89SConrad Meyer #include <immintrin.h>
6450cf4f89SConrad Meyer 
/*
 * Run the SHA-1 compression function over num_blks consecutive 64-byte
 * blocks using the Intel SHA extensions.
 *
 * digest   - five 32-bit state words (a, b, c, d, e) in host word order;
 *            read on entry and overwritten with the updated state on return.
 *            No particular alignment is required: the state is accessed with
 *            unaligned vector loads/stores plus a scalar access for word 4.
 * data     - num_blks * 64 bytes of message data; only read, any alignment.
 * num_blks - number of complete 64-byte blocks to consume.  With 0 the
 *            digest is written back unchanged.
 *
 * Padding, length encoding and state initialization are the caller's job.
 *
 * Inside the loop, the extra-indented lines belong to round processing and
 * the less-indented lines to the message schedule; the interleaving is kept
 * exactly as in Intel's reference code.
 */
void intel_sha1_step(uint32_t *digest, const char *data, uint32_t num_blks) {
   __m128i abcd, e0, e1;
   __m128i abcd_save, e_save;
   __m128i msg0, msg1, msg2, msg3;
   __m128i shuf_mask;

   // Byte-reversal mask: turns the 16 message bytes of each load into the
   // word order the SHA instructions consume.
   shuf_mask = _mm_set_epi64x(0x0001020304050607ull, 0x08090a0b0c0d0e0full);

   // Load initial hash values: a..d share one vector (dword order reversed
   // by the 0x1B shuffle), e sits alone in the top dword of a zeroed vector.
   abcd      = _mm_loadu_si128((const __m128i *) digest);
   e0        = _mm_insert_epi32(_mm_setzero_si128(), *(digest+4), 3);
   abcd      = _mm_shuffle_epi32(abcd, 0x1B);

   while (num_blks > 0) {
      // Save hash values for addition after rounds
      abcd_save = abcd;
      e_save    = e0;

      // Rounds 0-3
      msg0 = _mm_loadu_si128((const __m128i *) data);
      msg0 = _mm_shuffle_epi8(msg0, shuf_mask);
         e0   = _mm_add_epi32(e0, msg0);
         e1   = abcd;
         abcd = _mm_sha1rnds4_epu32(abcd, e0, 0);

      // Rounds 4-7
      msg1 = _mm_loadu_si128((const __m128i *) (data+16));
      msg1 = _mm_shuffle_epi8(msg1, shuf_mask);
         e1   = _mm_sha1nexte_epu32(e1, msg1);
         e0   = abcd;
         abcd = _mm_sha1rnds4_epu32(abcd, e1, 0);
      msg0 = _mm_sha1msg1_epu32(msg0, msg1);

      // Rounds 8-11
      msg2 = _mm_loadu_si128((const __m128i *) (data+32));
      msg2 = _mm_shuffle_epi8(msg2, shuf_mask);
         e0   = _mm_sha1nexte_epu32(e0, msg2);
         e1   = abcd;
         abcd = _mm_sha1rnds4_epu32(abcd, e0, 0);
      msg1 = _mm_sha1msg1_epu32(msg1, msg2);
      msg0 = _mm_xor_si128(msg0, msg2);

      // Rounds 12-15
      msg3 = _mm_loadu_si128((const __m128i *) (data+48));
      msg3 = _mm_shuffle_epi8(msg3, shuf_mask);
         e1   = _mm_sha1nexte_epu32(e1, msg3);
         e0   = abcd;
      msg0 = _mm_sha1msg2_epu32(msg0, msg3);
         abcd = _mm_sha1rnds4_epu32(abcd, e1, 0);
      msg2 = _mm_sha1msg1_epu32(msg2, msg3);
      msg1 = _mm_xor_si128(msg1, msg3);

      // Rounds 16-19
         e0   = _mm_sha1nexte_epu32(e0, msg0);
         e1   = abcd;
      msg1 = _mm_sha1msg2_epu32(msg1, msg0);
         abcd = _mm_sha1rnds4_epu32(abcd, e0, 0);
      msg3 = _mm_sha1msg1_epu32(msg3, msg0);
      msg2 = _mm_xor_si128(msg2, msg0);

      // Rounds 20-23 (round-function selector switches to 1)
         e1   = _mm_sha1nexte_epu32(e1, msg1);
         e0   = abcd;
      msg2 = _mm_sha1msg2_epu32(msg2, msg1);
         abcd = _mm_sha1rnds4_epu32(abcd, e1, 1);
      msg0 = _mm_sha1msg1_epu32(msg0, msg1);
      msg3 = _mm_xor_si128(msg3, msg1);

      // Rounds 24-27
         e0   = _mm_sha1nexte_epu32(e0, msg2);
         e1   = abcd;
      msg3 = _mm_sha1msg2_epu32(msg3, msg2);
         abcd = _mm_sha1rnds4_epu32(abcd, e0, 1);
      msg1 = _mm_sha1msg1_epu32(msg1, msg2);
      msg0 = _mm_xor_si128(msg0, msg2);

      // Rounds 28-31
         e1   = _mm_sha1nexte_epu32(e1, msg3);
         e0   = abcd;
      msg0 = _mm_sha1msg2_epu32(msg0, msg3);
         abcd = _mm_sha1rnds4_epu32(abcd, e1, 1);
      msg2 = _mm_sha1msg1_epu32(msg2, msg3);
      msg1 = _mm_xor_si128(msg1, msg3);

      // Rounds 32-35
         e0   = _mm_sha1nexte_epu32(e0, msg0);
         e1   = abcd;
      msg1 = _mm_sha1msg2_epu32(msg1, msg0);
         abcd = _mm_sha1rnds4_epu32(abcd, e0, 1);
      msg3 = _mm_sha1msg1_epu32(msg3, msg0);
      msg2 = _mm_xor_si128(msg2, msg0);

      // Rounds 36-39
         e1   = _mm_sha1nexte_epu32(e1, msg1);
         e0   = abcd;
      msg2 = _mm_sha1msg2_epu32(msg2, msg1);
         abcd = _mm_sha1rnds4_epu32(abcd, e1, 1);
      msg0 = _mm_sha1msg1_epu32(msg0, msg1);
      msg3 = _mm_xor_si128(msg3, msg1);

      // Rounds 40-43 (round-function selector switches to 2)
         e0   = _mm_sha1nexte_epu32(e0, msg2);
         e1   = abcd;
      msg3 = _mm_sha1msg2_epu32(msg3, msg2);
         abcd = _mm_sha1rnds4_epu32(abcd, e0, 2);
      msg1 = _mm_sha1msg1_epu32(msg1, msg2);
      msg0 = _mm_xor_si128(msg0, msg2);

      // Rounds 44-47
         e1   = _mm_sha1nexte_epu32(e1, msg3);
         e0   = abcd;
      msg0 = _mm_sha1msg2_epu32(msg0, msg3);
         abcd = _mm_sha1rnds4_epu32(abcd, e1, 2);
      msg2 = _mm_sha1msg1_epu32(msg2, msg3);
      msg1 = _mm_xor_si128(msg1, msg3);

      // Rounds 48-51
         e0   = _mm_sha1nexte_epu32(e0, msg0);
         e1   = abcd;
      msg1 = _mm_sha1msg2_epu32(msg1, msg0);
         abcd = _mm_sha1rnds4_epu32(abcd, e0, 2);
      msg3 = _mm_sha1msg1_epu32(msg3, msg0);
      msg2 = _mm_xor_si128(msg2, msg0);

      // Rounds 52-55
         e1   = _mm_sha1nexte_epu32(e1, msg1);
         e0   = abcd;
      msg2 = _mm_sha1msg2_epu32(msg2, msg1);
         abcd = _mm_sha1rnds4_epu32(abcd, e1, 2);
      msg0 = _mm_sha1msg1_epu32(msg0, msg1);
      msg3 = _mm_xor_si128(msg3, msg1);

      // Rounds 56-59
         e0   = _mm_sha1nexte_epu32(e0, msg2);
         e1   = abcd;
      msg3 = _mm_sha1msg2_epu32(msg3, msg2);
         abcd = _mm_sha1rnds4_epu32(abcd, e0, 2);
      msg1 = _mm_sha1msg1_epu32(msg1, msg2);
      msg0 = _mm_xor_si128(msg0, msg2);

      // Rounds 60-63 (round-function selector switches to 3)
         e1   = _mm_sha1nexte_epu32(e1, msg3);
         e0   = abcd;
      msg0 = _mm_sha1msg2_epu32(msg0, msg3);
         abcd = _mm_sha1rnds4_epu32(abcd, e1, 3);
      msg2 = _mm_sha1msg1_epu32(msg2, msg3);
      msg1 = _mm_xor_si128(msg1, msg3);

      // Rounds 64-67
         e0   = _mm_sha1nexte_epu32(e0, msg0);
         e1   = abcd;
      msg1 = _mm_sha1msg2_epu32(msg1, msg0);
         abcd = _mm_sha1rnds4_epu32(abcd, e0, 3);
      msg3 = _mm_sha1msg1_epu32(msg3, msg0);
      msg2 = _mm_xor_si128(msg2, msg0);

      // Rounds 68-71 (schedule winds down: no further sha1msg1 needed)
         e1   = _mm_sha1nexte_epu32(e1, msg1);
         e0   = abcd;
      msg2 = _mm_sha1msg2_epu32(msg2, msg1);
         abcd = _mm_sha1rnds4_epu32(abcd, e1, 3);
      msg3 = _mm_xor_si128(msg3, msg1);

      // Rounds 72-75
         e0   = _mm_sha1nexte_epu32(e0, msg2);
         e1   = abcd;
      msg3 = _mm_sha1msg2_epu32(msg3, msg2);
         abcd = _mm_sha1rnds4_epu32(abcd, e0, 3);

      // Rounds 76-79
         e1   = _mm_sha1nexte_epu32(e1, msg3);
         e0   = abcd;
         abcd = _mm_sha1rnds4_epu32(abcd, e1, 3);

      // Add current hash values with previously saved
      e0   = _mm_sha1nexte_epu32(e0, e_save);
      abcd = _mm_add_epi32(abcd, abcd_save);

      data += 64;
      num_blks--;
   }

   // Undo the dword reversal and write the state back.  The store must be
   // the unaligned form: digest is a plain uint32_t pointer (it was read
   // with an unaligned load above), so an aligned store could fault.
   abcd = _mm_shuffle_epi32(abcd, 0x1B);
   _mm_storeu_si128((__m128i *) digest, abcd);
   *(digest+4) = _mm_extract_epi32(e0, 3);
}
260fe182ba1SConrad Meyer 
261