/*
 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#define BR_ENABLE_INTRINSICS   1
#include "inner.h"

#if BR_SSE2

/*
 * This file contains a ChaCha20 implementation that leverages SSE2
 * opcodes for better performance.
 */

/* see bearssl_block.h */
br_chacha20_run
br_chacha20_sse2_get(void)
{
	/*
	 * In 64-bit mode, SSE2 opcodes are always available, since they
	 * are part of the ABI.
	 *
	 * In 32-bit mode, we use CPUID to detect the SSE2 feature.
	 */

#if BR_amd64
	return &br_chacha20_sse2_run;
#else

	/*
	 * SSE2 support is reported by CPUID function 0x00000001, as
	 * bit 26 of EDX.
	 */
	if (br_cpuid(0, 0, 0, 0x04000000)) {
		return &br_chacha20_sse2_run;
	} else {
		return 0;
	}
#endif
}
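
/*
 * Usage sketch (illustrative only, not part of this file): a caller
 * would typically request the SSE2 implementation at runtime and fall
 * back to the portable constant-time version when it is not
 * available. The names used below are declared in bearssl_block.h.
 *
 *	br_chacha20_run bc;
 *
 *	bc = br_chacha20_sse2_get();
 *	if (bc == 0) {
 *		bc = &br_chacha20_ct_run;
 *	}
 *	cc = bc(key, iv, cc, data, len);
 */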

BR_TARGETS_X86_UP

/* see bearssl_block.h */
BR_TARGET("sse2")
uint32_t
br_chacha20_sse2_run(const void *key,
	const void *iv, uint32_t cc, void *data, size_t len)
{
	unsigned char *buf;
	uint32_t ivtmp[4];
	__m128i kw0, kw1;
	__m128i iw, cw;
	__m128i one;

	static const uint32_t CW[] = {
		0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
	};

	buf = data;
	kw0 = _mm_loadu_si128(key);
	kw1 = _mm_loadu_si128((const void *)((const unsigned char *)key + 16));
	ivtmp[0] = cc;
	memcpy(ivtmp + 1, iv, 12);
	iw = _mm_loadu_si128((const void *)ivtmp);
	cw = _mm_loadu_si128((const void *)CW);
	one = _mm_set_epi32(0, 0, 0, 1);
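
	/*
	 * The four 128-bit registers hold the ChaCha20 state matrix,
	 * one row per register:
	 *   cw  = the four constant words ("expa", "nd 3", "2-by", "te k")
	 *   kw0 = key words 0..3
	 *   kw1 = key words 4..7
	 *   iw  = block counter (low lane) followed by the 96-bit IV
	 * 'one' is used to increment the block counter between blocks.
	 */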

	while (len > 0) {
		/*
		 * sj contains state words 4*j to 4*j+3.
		 */
		__m128i s0, s1, s2, s3;
		int i;

		s0 = cw;
		s1 = kw0;
		s2 = kw1;
		s3 = iw;
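
		/*
		 * Reference: the scalar ChaCha20 quarter-round (RFC 8439)
		 * on words (a, b, c, d) is:
		 *
		 *	a += b; d ^= a; d = ROTL32(d, 16);
		 *	c += d; b ^= c; b = ROTL32(b, 12);
		 *	a += b; d ^= a; d = ROTL32(d,  8);
		 *	c += d; b ^= c; b = ROTL32(b,  7);
		 *
		 * Each group of SSE2 operations below performs four such
		 * quarter-rounds in parallel, one per 32-bit lane.
		 */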
		for (i = 0; i < 10; i ++) {
			/*
			 * The even round (column round) applies the
			 * quarter-rounds directly to the state words as
			 * they sit in the registers.
			 */
			s0 = _mm_add_epi32(s0, s1);
			s3 = _mm_xor_si128(s3, s0);
			s3 = _mm_or_si128(
				_mm_slli_epi32(s3, 16),
				_mm_srli_epi32(s3, 16));

			s2 = _mm_add_epi32(s2, s3);
			s1 = _mm_xor_si128(s1, s2);
			s1 = _mm_or_si128(
				_mm_slli_epi32(s1, 12),
				_mm_srli_epi32(s1, 20));

			s0 = _mm_add_epi32(s0, s1);
			s3 = _mm_xor_si128(s3, s0);
			s3 = _mm_or_si128(
				_mm_slli_epi32(s3, 8),
				_mm_srli_epi32(s3, 24));

			s2 = _mm_add_epi32(s2, s3);
			s1 = _mm_xor_si128(s1, s2);
			s1 = _mm_or_si128(
				_mm_slli_epi32(s1, 7),
				_mm_srli_epi32(s1, 25));

			/*
			 * For the odd round (diagonal round), we must
			 * rotate some state words so that the computations
			 * apply to the right combinations of words.
			 */
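			/*
			 * The shuffle immediates rotate the 32-bit lanes
			 * within each register (0x39 by one position,
			 * 0x4E by two, 0x93 by three), so that lane i of
			 * the four registers then forms a diagonal of the
			 * state matrix.
			 */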
			s1 = _mm_shuffle_epi32(s1, 0x39);
			s2 = _mm_shuffle_epi32(s2, 0x4E);
			s3 = _mm_shuffle_epi32(s3, 0x93);

			s0 = _mm_add_epi32(s0, s1);
			s3 = _mm_xor_si128(s3, s0);
			s3 = _mm_or_si128(
				_mm_slli_epi32(s3, 16),
				_mm_srli_epi32(s3, 16));

			s2 = _mm_add_epi32(s2, s3);
			s1 = _mm_xor_si128(s1, s2);
			s1 = _mm_or_si128(
				_mm_slli_epi32(s1, 12),
				_mm_srli_epi32(s1, 20));

			s0 = _mm_add_epi32(s0, s1);
			s3 = _mm_xor_si128(s3, s0);
			s3 = _mm_or_si128(
				_mm_slli_epi32(s3, 8),
				_mm_srli_epi32(s3, 24));

			s2 = _mm_add_epi32(s2, s3);
			s1 = _mm_xor_si128(s1, s2);
			s1 = _mm_or_si128(
				_mm_slli_epi32(s1, 7),
				_mm_srli_epi32(s1, 25));

			/*
			 * After the odd round, we shuffle the words back
			 * to their original positions.
			 */
			s1 = _mm_shuffle_epi32(s1, 0x93);
			s2 = _mm_shuffle_epi32(s2, 0x4E);
			s3 = _mm_shuffle_epi32(s3, 0x39);
		}

		/*
		 * Addition with the initial state.
		 */
		s0 = _mm_add_epi32(s0, cw);
		s1 = _mm_add_epi32(s1, kw0);
		s2 = _mm_add_epi32(s2, kw1);
		s3 = _mm_add_epi32(s3, iw);

		/*
		 * Increment the block counter. Only the low 32-bit lane
		 * of 'one' is non-zero, so the three IV words in 'iw'
		 * are left unchanged.
		 */
		iw = _mm_add_epi32(iw, one);

		/*
		 * XOR the final state with the data. A partial final
		 * block (len < 64) goes through a stack buffer and is
		 * XORed byte by byte; full blocks use unaligned 128-bit
		 * loads and stores.
		 */
		if (len < 64) {
			unsigned char tmp[64];
			size_t u;

			_mm_storeu_si128((void *)(tmp +  0), s0);
			_mm_storeu_si128((void *)(tmp + 16), s1);
			_mm_storeu_si128((void *)(tmp + 32), s2);
			_mm_storeu_si128((void *)(tmp + 48), s3);
			for (u = 0; u < len; u ++) {
				buf[u] ^= tmp[u];
			}
			break;
		} else {
			__m128i b0, b1, b2, b3;

			b0 = _mm_loadu_si128((const void *)(buf +  0));
			b1 = _mm_loadu_si128((const void *)(buf + 16));
			b2 = _mm_loadu_si128((const void *)(buf + 32));
			b3 = _mm_loadu_si128((const void *)(buf + 48));
			b0 = _mm_xor_si128(b0, s0);
			b1 = _mm_xor_si128(b1, s1);
			b2 = _mm_xor_si128(b2, s2);
			b3 = _mm_xor_si128(b3, s3);
			_mm_storeu_si128((void *)(buf +  0), b0);
			_mm_storeu_si128((void *)(buf + 16), b1);
			_mm_storeu_si128((void *)(buf + 32), b2);
			_mm_storeu_si128((void *)(buf + 48), b3);
			buf += 64;
			len -= 64;
		}
	}

	/*
	 * _mm_extract_epi32() requires SSE4.1. We prefer to stick to
	 * raw SSE2, thus we use _mm_extract_epi16().
	 */
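	/*
	 * The value rebuilt from the low lane of 'iw' is the updated
	 * block counter, i.e. the 'cc' value to pass to a subsequent
	 * call that continues the same stream.
	 */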
	return (uint32_t)_mm_extract_epi16(iw, 0)
		| ((uint32_t)_mm_extract_epi16(iw, 1) << 16);
}

BR_TARGETS_X86_DOWN

#else

/* see bearssl_block.h */
br_chacha20_run
br_chacha20_sse2_get(void)
{
	return 0;
}

#endif