// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Hash code using AES intrinsics.

#include "runtime.h"

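// aeshashbody is written in C but exposed to Go as runtime.aeshashbody:
// the __asm__ directive below gives it that symbol name, and
// no_split_stack suppresses gccgo's split-stack prologue.  The
// aeskeysched argument carries the runtime's per-process random key
// material, passed in as a Go slice.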
uintptr aeshashbody(void*, uintptr, uintptr, Slice)
	__asm__(GOSYM_PREFIX "runtime.aeshashbody");

uintptr aeshashbody(void*, uintptr, uintptr, Slice)
	__attribute__((no_split_stack));

#if (defined(__i386__) || defined(__x86_64__)) && defined(HAVE_AS_X86_AES)

#include <emmintrin.h>
#include <tmmintrin.h>
#include <wmmintrin.h>

// Force appropriate CPU level.  We won't call here unless the CPU
// supports it.

#pragma GCC target("ssse3", "aes")

#ifdef __x86_64__

// aeshashbody implements a hash function using AES instructions
// available in recent x86 processors. Note this is not encryption,
// just hashing.
//
// This is written to produce exactly the same results as the gc
// implementation, not because that matters, but just to ensure that
// this does something reasonable.
uintptr aeshashbody(void* p, uintptr seed, uintptr size, Slice aeskeysched) {
	__m128i mseed, mseed2, mseed3, mseed4, mseed5, mseed6, mseed7, mseed8;
	__m128i mval, mval2, mval3, mval4, mval5, mval6, mval7, mval8;

	// Start with hash seed.
	mseed = _mm_cvtsi64_si128(seed);
	// Get 16 bits of length.
	mseed = _mm_insert_epi16(mseed, size, 4);
	// Repeat length 4 times total.
	mseed = _mm_shufflehi_epi16(mseed, 0);
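	// The low 64 bits of mseed now hold the caller's seed and the high
	// 64 bits hold four copies of the low 16 bits of size (inserted
	// into 16-bit lane 4 and broadcast across lanes 4-7 above).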
	// Save unscrambled seed.
	mseed2 = mseed;
	// XOR in per-process seed.
	mseed ^= _mm_loadu_si128(aeskeysched.__values);
	// Scramble seed.
	mseed = _mm_aesenc_si128(mseed, mseed);

	if (size <= 16) {
		if (size == 0) {
			// Return scrambled input seed.
			return _mm_cvtsi128_si64(_mm_aesenc_si128(mseed, mseed));
		} else if (size < 16) {
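			// Fewer than 16 bytes: we must not read past the end
			// of the buffer, since the following page may be
			// unmapped.  If a 16-byte load at p stays within one
			// (assumed 4 kB or larger) page, load it and mask off
			// the bytes we don't want; otherwise load the 16 bytes
			// ending at the last input byte and shift the data
			// down with pshufb.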
			if ((((uintptr)(p) + 16) & 0xff0) != 0) {
				static const uint64 masks[32]
				  __attribute__ ((aligned(16))) =
				  {
				    0x0000000000000000, 0x0000000000000000,
				    0x00000000000000ff, 0x0000000000000000,
				    0x000000000000ffff, 0x0000000000000000,
				    0x0000000000ffffff, 0x0000000000000000,
				    0x00000000ffffffff, 0x0000000000000000,
				    0x000000ffffffffff, 0x0000000000000000,
				    0x0000ffffffffffff, 0x0000000000000000,
				    0x00ffffffffffffff, 0x0000000000000000,
				    0xffffffffffffffff, 0x0000000000000000,
				    0xffffffffffffffff, 0x00000000000000ff,
				    0xffffffffffffffff, 0x000000000000ffff,
				    0xffffffffffffffff, 0x0000000000ffffff,
				    0xffffffffffffffff, 0x00000000ffffffff,
				    0xffffffffffffffff, 0x000000ffffffffff,
				    0xffffffffffffffff, 0x0000ffffffffffff,
				    0xffffffffffffffff, 0x00ffffffffffffff
				  };

				// 16 bytes loaded at p won't cross a page
				// boundary, so we can load directly.
				mval = _mm_loadu_si128(p);
				mval &= *(const __m128i*)(&masks[size*2]);
			} else {
				static const uint64 shifts[32]
				  __attribute__ ((aligned(16))) =
				  {
				    0x0000000000000000, 0x0000000000000000,
				    0xffffffffffffff0f, 0xffffffffffffffff,
				    0xffffffffffff0f0e, 0xffffffffffffffff,
				    0xffffffffff0f0e0d, 0xffffffffffffffff,
				    0xffffffff0f0e0d0c, 0xffffffffffffffff,
				    0xffffff0f0e0d0c0b, 0xffffffffffffffff,
				    0xffff0f0e0d0c0b0a, 0xffffffffffffffff,
				    0xff0f0e0d0c0b0a09, 0xffffffffffffffff,
				    0x0f0e0d0c0b0a0908, 0xffffffffffffffff,
				    0x0e0d0c0b0a090807, 0xffffffffffffff0f,
				    0x0d0c0b0a09080706, 0xffffffffffff0f0e,
				    0x0c0b0a0908070605, 0xffffffffff0f0e0d,
				    0x0b0a090807060504, 0xffffffff0f0e0d0c,
				    0x0a09080706050403, 0xffffff0f0e0d0c0b,
				    0x0908070605040302, 0xffff0f0e0d0c0b0a,
				    0x0807060504030201, 0xff0f0e0d0c0b0a09,
				  };

				// address ends in 1111xxxx. Might be
				// up against a page boundary, so load
				// ending at last byte.  Then shift
				// bytes down using pshufb.
				mval = _mm_loadu_si128((void*)((char*)p - 16 + size));
				mval = _mm_shuffle_epi8(mval, *(const __m128i*)(&shifts[size*2]));
			}
		} else {
			mval = _mm_loadu_si128(p);
		}

		// XOR data with seed.
		mval ^= mseed;
		// Scramble combo 3 times.
		mval = _mm_aesenc_si128(mval, mval);
		mval = _mm_aesenc_si128(mval, mval);
		mval = _mm_aesenc_si128(mval, mval);
		return _mm_cvtsi128_si64(mval);
	} else if (size <= 32) {
		// Make second starting seed.
		mseed2 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 16));
		mseed2 = _mm_aesenc_si128(mseed2, mseed2);
		// Load data to be hashed.
		mval = _mm_loadu_si128(p);
		mval2 = _mm_loadu_si128((void*)((char*)p + size - 16));
		// XOR with seed.
		mval ^= mseed;
		mval2 ^= mseed2;
		// Scramble 3 times.
		mval = _mm_aesenc_si128(mval, mval);
		mval2 = _mm_aesenc_si128(mval2, mval2);
		mval = _mm_aesenc_si128(mval, mval);
		mval2 = _mm_aesenc_si128(mval2, mval2);
		mval = _mm_aesenc_si128(mval, mval);
		mval2 = _mm_aesenc_si128(mval2, mval2);
		// Combine results.
		mval ^= mval2;
		return _mm_cvtsi128_si64(mval);
	} else if (size <= 64) {
		// Make 3 more starting seeds.
		mseed3 = mseed2;
		mseed4 = mseed2;
		mseed2 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 16));
		mseed3 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 32));
		mseed4 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 48));
		mseed2 = _mm_aesenc_si128(mseed2, mseed2);
		mseed3 = _mm_aesenc_si128(mseed3, mseed3);
		mseed4 = _mm_aesenc_si128(mseed4, mseed4);

		mval = _mm_loadu_si128(p);
		mval2 = _mm_loadu_si128((void*)((char*)p + 16));
		mval3 = _mm_loadu_si128((void*)((char*)p + size - 32));
		mval4 = _mm_loadu_si128((void*)((char*)p + size - 16));

		mval ^= mseed;
		mval2 ^= mseed2;
		mval3 ^= mseed3;
		mval4 ^= mseed4;

		mval = _mm_aesenc_si128(mval, mval);
		mval2 = _mm_aesenc_si128(mval2, mval2);
		mval3 = _mm_aesenc_si128(mval3, mval3);
		mval4 = _mm_aesenc_si128(mval4, mval4);

		mval = _mm_aesenc_si128(mval, mval);
		mval2 = _mm_aesenc_si128(mval2, mval2);
		mval3 = _mm_aesenc_si128(mval3, mval3);
		mval4 = _mm_aesenc_si128(mval4, mval4);

		mval = _mm_aesenc_si128(mval, mval);
		mval2 = _mm_aesenc_si128(mval2, mval2);
		mval3 = _mm_aesenc_si128(mval3, mval3);
		mval4 = _mm_aesenc_si128(mval4, mval4);

		mval ^= mval3;
		mval2 ^= mval4;
		mval ^= mval2;
		return _mm_cvtsi128_si64(mval);
	} else if (size <= 128) {
		// Make 7 more starting seeds.
		mseed3 = mseed2;
		mseed4 = mseed2;
		mseed5 = mseed2;
		mseed6 = mseed2;
		mseed7 = mseed2;
		mseed8 = mseed2;
		mseed2 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 16));
		mseed3 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 32));
		mseed4 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 48));
		mseed5 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 64));
		mseed6 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 80));
		mseed7 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 96));
		mseed8 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 112));
		mseed2 = _mm_aesenc_si128(mseed2, mseed2);
		mseed3 = _mm_aesenc_si128(mseed3, mseed3);
		mseed4 = _mm_aesenc_si128(mseed4, mseed4);
		mseed5 = _mm_aesenc_si128(mseed5, mseed5);
		mseed6 = _mm_aesenc_si128(mseed6, mseed6);
		mseed7 = _mm_aesenc_si128(mseed7, mseed7);
		mseed8 = _mm_aesenc_si128(mseed8, mseed8);

		// Load data.
		mval = _mm_loadu_si128(p);
		mval2 = _mm_loadu_si128((void*)((char*)p + 16));
		mval3 = _mm_loadu_si128((void*)((char*)p + 32));
		mval4 = _mm_loadu_si128((void*)((char*)p + 48));
		mval5 = _mm_loadu_si128((void*)((char*)p + size - 64));
		mval6 = _mm_loadu_si128((void*)((char*)p + size - 48));
		mval7 = _mm_loadu_si128((void*)((char*)p + size - 32));
		mval8 = _mm_loadu_si128((void*)((char*)p + size - 16));

		// XOR with seed.
		mval ^= mseed;
		mval2 ^= mseed2;
		mval3 ^= mseed3;
		mval4 ^= mseed4;
		mval5 ^= mseed5;
		mval6 ^= mseed6;
		mval7 ^= mseed7;
		mval8 ^= mseed8;

		// Scramble 3 times.
		mval = _mm_aesenc_si128(mval, mval);
		mval2 = _mm_aesenc_si128(mval2, mval2);
		mval3 = _mm_aesenc_si128(mval3, mval3);
		mval4 = _mm_aesenc_si128(mval4, mval4);
		mval5 = _mm_aesenc_si128(mval5, mval5);
		mval6 = _mm_aesenc_si128(mval6, mval6);
		mval7 = _mm_aesenc_si128(mval7, mval7);
		mval8 = _mm_aesenc_si128(mval8, mval8);

		mval = _mm_aesenc_si128(mval, mval);
		mval2 = _mm_aesenc_si128(mval2, mval2);
		mval3 = _mm_aesenc_si128(mval3, mval3);
		mval4 = _mm_aesenc_si128(mval4, mval4);
		mval5 = _mm_aesenc_si128(mval5, mval5);
		mval6 = _mm_aesenc_si128(mval6, mval6);
		mval7 = _mm_aesenc_si128(mval7, mval7);
		mval8 = _mm_aesenc_si128(mval8, mval8);

		mval = _mm_aesenc_si128(mval, mval);
		mval2 = _mm_aesenc_si128(mval2, mval2);
		mval3 = _mm_aesenc_si128(mval3, mval3);
		mval4 = _mm_aesenc_si128(mval4, mval4);
		mval5 = _mm_aesenc_si128(mval5, mval5);
		mval6 = _mm_aesenc_si128(mval6, mval6);
		mval7 = _mm_aesenc_si128(mval7, mval7);
		mval8 = _mm_aesenc_si128(mval8, mval8);

		// Combine results.
		mval ^= mval5;
		mval2 ^= mval6;
		mval3 ^= mval7;
		mval4 ^= mval8;
		mval ^= mval3;
		mval2 ^= mval4;
		mval ^= mval2;
		return _mm_cvtsi128_si64(mval);
	} else {
		// Make 7 more starting seeds.
		mseed3 = mseed2;
		mseed4 = mseed2;
		mseed5 = mseed2;
		mseed6 = mseed2;
		mseed7 = mseed2;
		mseed8 = mseed2;
		mseed2 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 16));
		mseed3 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 32));
		mseed4 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 48));
		mseed5 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 64));
		mseed6 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 80));
		mseed7 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 96));
		mseed8 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 112));
		mseed2 = _mm_aesenc_si128(mseed2, mseed2);
		mseed3 = _mm_aesenc_si128(mseed3, mseed3);
		mseed4 = _mm_aesenc_si128(mseed4, mseed4);
		mseed5 = _mm_aesenc_si128(mseed5, mseed5);
		mseed6 = _mm_aesenc_si128(mseed6, mseed6);
		mseed7 = _mm_aesenc_si128(mseed7, mseed7);
		mseed8 = _mm_aesenc_si128(mseed8, mseed8);

		// Start with last (possibly overlapping) block.
		mval = _mm_loadu_si128((void*)((char*)p + size - 128));
		mval2 = _mm_loadu_si128((void*)((char*)p + size - 112));
		mval3 = _mm_loadu_si128((void*)((char*)p + size - 96));
		mval4 = _mm_loadu_si128((void*)((char*)p + size - 80));
		mval5 = _mm_loadu_si128((void*)((char*)p + size - 64));
		mval6 = _mm_loadu_si128((void*)((char*)p + size - 48));
		mval7 = _mm_loadu_si128((void*)((char*)p + size - 32));
		mval8 = _mm_loadu_si128((void*)((char*)p + size - 16));

		// XOR in seed.
		mval ^= mseed;
		mval2 ^= mseed2;
		mval3 ^= mseed3;
		mval4 ^= mseed4;
		mval5 ^= mseed5;
		mval6 ^= mseed6;
		mval7 ^= mseed7;
		mval8 ^= mseed8;

		// Compute number of remaining 128-byte blocks.
		size--;
		size >>= 7;
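		// size now holds the number of full 128-byte blocks to mix in
		// starting at p; together with the (possibly overlapping) tail
		// block loaded above they cover the entire input.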
		do {
			// Scramble state.
			mval = _mm_aesenc_si128(mval, mval);
			mval2 = _mm_aesenc_si128(mval2, mval2);
			mval3 = _mm_aesenc_si128(mval3, mval3);
			mval4 = _mm_aesenc_si128(mval4, mval4);
			mval5 = _mm_aesenc_si128(mval5, mval5);
			mval6 = _mm_aesenc_si128(mval6, mval6);
			mval7 = _mm_aesenc_si128(mval7, mval7);
			mval8 = _mm_aesenc_si128(mval8, mval8);

			// Scramble state, XOR in a block.
			mval = _mm_aesenc_si128(mval, _mm_loadu_si128(p));
			mval2 = _mm_aesenc_si128(mval2, _mm_loadu_si128((void*)((char*)p + 16)));
			mval3 = _mm_aesenc_si128(mval3, _mm_loadu_si128((void*)((char*)p + 32)));
			mval4 = _mm_aesenc_si128(mval4, _mm_loadu_si128((void*)((char*)p + 48)));
			mval5 = _mm_aesenc_si128(mval5, _mm_loadu_si128((void*)((char*)p + 64)));
			mval6 = _mm_aesenc_si128(mval6, _mm_loadu_si128((void*)((char*)p + 80)));
			mval7 = _mm_aesenc_si128(mval7, _mm_loadu_si128((void*)((char*)p + 96)));
			mval8 = _mm_aesenc_si128(mval8, _mm_loadu_si128((void*)((char*)p + 112)));

			p = (void*)((char*)p + 128);
		} while (--size > 0);

		// 3 more scrambles to finish.
		mval = _mm_aesenc_si128(mval, mval);
		mval2 = _mm_aesenc_si128(mval2, mval2);
		mval3 = _mm_aesenc_si128(mval3, mval3);
		mval4 = _mm_aesenc_si128(mval4, mval4);
		mval5 = _mm_aesenc_si128(mval5, mval5);
		mval6 = _mm_aesenc_si128(mval6, mval6);
		mval7 = _mm_aesenc_si128(mval7, mval7);
		mval8 = _mm_aesenc_si128(mval8, mval8);
		mval = _mm_aesenc_si128(mval, mval);
		mval2 = _mm_aesenc_si128(mval2, mval2);
		mval3 = _mm_aesenc_si128(mval3, mval3);
		mval4 = _mm_aesenc_si128(mval4, mval4);
		mval5 = _mm_aesenc_si128(mval5, mval5);
		mval6 = _mm_aesenc_si128(mval6, mval6);
		mval7 = _mm_aesenc_si128(mval7, mval7);
		mval8 = _mm_aesenc_si128(mval8, mval8);
		mval = _mm_aesenc_si128(mval, mval);
		mval2 = _mm_aesenc_si128(mval2, mval2);
		mval3 = _mm_aesenc_si128(mval3, mval3);
		mval4 = _mm_aesenc_si128(mval4, mval4);
		mval5 = _mm_aesenc_si128(mval5, mval5);
		mval6 = _mm_aesenc_si128(mval6, mval6);
		mval7 = _mm_aesenc_si128(mval7, mval7);
		mval8 = _mm_aesenc_si128(mval8, mval8);

		mval ^= mval5;
		mval2 ^= mval6;
		mval3 ^= mval7;
		mval4 ^= mval8;
		mval ^= mval3;
		mval2 ^= mval4;
		mval ^= mval2;
		return _mm_cvtsi128_si64(mval);
	}
}

#else // !defined(__x86_64__)

// The 32-bit version of aeshashbody.

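// It mirrors the 64-bit code above, but returns a 32-bit hash, uses at
// most four parallel 16-byte streams, and folds the seed XOR into the
// first aesenc round rather than doing a separate XOR.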
uintptr aeshashbody(void* p, uintptr seed, uintptr size, Slice aeskeysched) {
	__m128i mseed, mseed2, mseed3, mseed4;
	__m128i mval, mval2, mval3, mval4;

	// Start with hash seed.
	mseed = _mm_cvtsi32_si128(seed);
	// Get 16 bits of length.
	mseed = _mm_insert_epi16(mseed, size, 4);
	// Replace size with its low 2 bytes repeated 4 times.
	mseed = _mm_shufflehi_epi16(mseed, 0);
	// Save unscrambled seed.
	mseed2 = mseed;
	// XOR in per-process seed.
	mseed ^= _mm_loadu_si128(aeskeysched.__values);
	// Scramble seed.
	mseed = _mm_aesenc_si128(mseed, mseed);

	if (size <= 16) {
		if (size == 0) {
			// Return scrambled input seed.
			return _mm_cvtsi128_si32(_mm_aesenc_si128(mseed, mseed));
		} else if (size < 16) {
			if ((((uintptr)(p) + 16) & 0xff0) != 0) {
				static const uint64 masks[32]
				  __attribute__ ((aligned(16))) =
				  {
				    0x0000000000000000, 0x0000000000000000,
				    0x00000000000000ff, 0x0000000000000000,
				    0x000000000000ffff, 0x0000000000000000,
				    0x0000000000ffffff, 0x0000000000000000,
				    0x00000000ffffffff, 0x0000000000000000,
				    0x000000ffffffffff, 0x0000000000000000,
				    0x0000ffffffffffff, 0x0000000000000000,
				    0x00ffffffffffffff, 0x0000000000000000,
				    0xffffffffffffffff, 0x0000000000000000,
				    0xffffffffffffffff, 0x00000000000000ff,
				    0xffffffffffffffff, 0x000000000000ffff,
				    0xffffffffffffffff, 0x0000000000ffffff,
				    0xffffffffffffffff, 0x00000000ffffffff,
				    0xffffffffffffffff, 0x000000ffffffffff,
				    0xffffffffffffffff, 0x0000ffffffffffff,
				    0xffffffffffffffff, 0x00ffffffffffffff
				  };

				// 16 bytes loaded at p won't cross a page
				// boundary, so we can load it directly.
				mval = _mm_loadu_si128(p);
				mval &= *(const __m128i*)(&masks[size*2]);
			} else {
				static const uint64 shifts[32]
				  __attribute__ ((aligned(16))) =
				  {
				    0x0000000000000000, 0x0000000000000000,
				    0xffffffffffffff0f, 0xffffffffffffffff,
				    0xffffffffffff0f0e, 0xffffffffffffffff,
				    0xffffffffff0f0e0d, 0xffffffffffffffff,
				    0xffffffff0f0e0d0c, 0xffffffffffffffff,
				    0xffffff0f0e0d0c0b, 0xffffffffffffffff,
				    0xffff0f0e0d0c0b0a, 0xffffffffffffffff,
				    0xff0f0e0d0c0b0a09, 0xffffffffffffffff,
				    0x0f0e0d0c0b0a0908, 0xffffffffffffffff,
				    0x0e0d0c0b0a090807, 0xffffffffffffff0f,
				    0x0d0c0b0a09080706, 0xffffffffffff0f0e,
				    0x0c0b0a0908070605, 0xffffffffff0f0e0d,
				    0x0b0a090807060504, 0xffffffff0f0e0d0c,
				    0x0a09080706050403, 0xffffff0f0e0d0c0b,
				    0x0908070605040302, 0xffff0f0e0d0c0b0a,
				    0x0807060504030201, 0xff0f0e0d0c0b0a09,
				  };

				// address ends in 1111xxxx. Might be
				// up against a page boundary, so load
				// ending at last byte.  Then shift
				// bytes down using pshufb.
				mval = _mm_loadu_si128((void*)((char*)p - 16 + size));
				mval = _mm_shuffle_epi8(mval, *(const __m128i*)(&shifts[size*2]));
			}
		} else {
			mval = _mm_loadu_si128(p);
		}

		// Scramble input, XOR in seed.
		mval = _mm_aesenc_si128(mval, mseed);
		mval = _mm_aesenc_si128(mval, mval);
		mval = _mm_aesenc_si128(mval, mval);
		return _mm_cvtsi128_si32(mval);
	} else if (size <= 32) {
		// Make second starting seed.
		mseed2 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 16));
		mseed2 = _mm_aesenc_si128(mseed2, mseed2);
		// Load data to be hashed.
		mval = _mm_loadu_si128(p);
		mval2 = _mm_loadu_si128((void*)((char*)p + size - 16));

		// Scramble 3 times.
		mval = _mm_aesenc_si128(mval, mseed);
		mval2 = _mm_aesenc_si128(mval2, mseed2);
		mval = _mm_aesenc_si128(mval, mval);
		mval2 = _mm_aesenc_si128(mval2, mval2);
		mval = _mm_aesenc_si128(mval, mval);
		mval2 = _mm_aesenc_si128(mval2, mval2);

		// Combine results.
		mval ^= mval2;
		return _mm_cvtsi128_si32(mval);
	} else if (size <= 64) {
		// Make 3 more starting seeds.
		mseed3 = mseed2;
		mseed4 = mseed2;
		mseed2 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 16));
		mseed3 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 32));
		mseed4 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 48));
		mseed2 = _mm_aesenc_si128(mseed2, mseed2);
		mseed3 = _mm_aesenc_si128(mseed3, mseed3);
		mseed4 = _mm_aesenc_si128(mseed4, mseed4);

		mval = _mm_loadu_si128(p);
		mval2 = _mm_loadu_si128((void*)((char*)p + 16));
		mval3 = _mm_loadu_si128((void*)((char*)p + size - 32));
		mval4 = _mm_loadu_si128((void*)((char*)p + size - 16));

		mval = _mm_aesenc_si128(mval, mseed);
		mval2 = _mm_aesenc_si128(mval2, mseed2);
		mval3 = _mm_aesenc_si128(mval3, mseed3);
		mval4 = _mm_aesenc_si128(mval4, mseed4);

		mval = _mm_aesenc_si128(mval, mval);
		mval2 = _mm_aesenc_si128(mval2, mval2);
		mval3 = _mm_aesenc_si128(mval3, mval3);
		mval4 = _mm_aesenc_si128(mval4, mval4);

		mval = _mm_aesenc_si128(mval, mval);
		mval2 = _mm_aesenc_si128(mval2, mval2);
		mval3 = _mm_aesenc_si128(mval3, mval3);
		mval4 = _mm_aesenc_si128(mval4, mval4);

		mval ^= mval3;
		mval2 ^= mval4;
		mval ^= mval2;
		return _mm_cvtsi128_si32(mval);
	} else {
		// Make 3 more starting seeds.
		mseed3 = mseed2;
		mseed4 = mseed2;
		mseed2 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 16));
		mseed3 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 32));
		mseed4 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 48));
		mseed2 = _mm_aesenc_si128(mseed2, mseed2);
		mseed3 = _mm_aesenc_si128(mseed3, mseed3);
		mseed4 = _mm_aesenc_si128(mseed4, mseed4);

		// Start with last (possibly overlapping) block.
		mval = _mm_loadu_si128((void*)((char*)p + size - 64));
		mval2 = _mm_loadu_si128((void*)((char*)p + size - 48));
		mval3 = _mm_loadu_si128((void*)((char*)p + size - 32));
		mval4 = _mm_loadu_si128((void*)((char*)p + size - 16));

		// Scramble state once.
		mval = _mm_aesenc_si128(mval, mseed);
		mval2 = _mm_aesenc_si128(mval2, mseed2);
		mval3 = _mm_aesenc_si128(mval3, mseed3);
		mval4 = _mm_aesenc_si128(mval4, mseed4);

		// Compute number of remaining 64-byte blocks.
		size--;
		size >>= 6;
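		// size now holds the number of full 64-byte blocks to mix in
		// starting at p; together with the (possibly overlapping) tail
		// block loaded above they cover the entire input.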
		do {
			// Scramble state, XOR in a block.
			mval = _mm_aesenc_si128(mval, _mm_loadu_si128(p));
			mval2 = _mm_aesenc_si128(mval2, _mm_loadu_si128((void*)((char*)p + 16)));
			mval3 = _mm_aesenc_si128(mval3, _mm_loadu_si128((void*)((char*)p + 32)));
			mval4 = _mm_aesenc_si128(mval4, _mm_loadu_si128((void*)((char*)p + 48)));

			// Scramble state.
			mval = _mm_aesenc_si128(mval, mval);
			mval2 = _mm_aesenc_si128(mval2, mval2);
			mval3 = _mm_aesenc_si128(mval3, mval3);
			mval4 = _mm_aesenc_si128(mval4, mval4);

			p = (void*)((char*)p + 64);
		} while (--size > 0);

		// 2 more scrambles to finish.
		mval = _mm_aesenc_si128(mval, mval);
		mval2 = _mm_aesenc_si128(mval2, mval2);
		mval3 = _mm_aesenc_si128(mval3, mval3);
		mval4 = _mm_aesenc_si128(mval4, mval4);

		mval = _mm_aesenc_si128(mval, mval);
		mval2 = _mm_aesenc_si128(mval2, mval2);
		mval3 = _mm_aesenc_si128(mval3, mval3);
		mval4 = _mm_aesenc_si128(mval4, mval4);

		mval ^= mval3;
		mval2 ^= mval4;
		mval ^= mval2;
		return _mm_cvtsi128_si32(mval);
	}
}

#endif // !defined(__x86_64__)

#elif defined(__aarch64__)

// Undefine some identifiers that we pick up from the Go runtime package that
// are used in arm_neon.h.

#undef t1
#undef tx
#undef t2
#undef t3
#undef t4
#undef t5

#include <arm_neon.h>

// Force appropriate CPU level.  We won't call here unless the CPU
// supports it.

#pragma GCC target("+crypto")

// The arm64 version of aeshashbody.

uintptr aeshashbody(void* p, uintptr seed, uintptr size, Slice aeskeysched) {
	uint8x16_t *pseed;
	uint64x2_t vinit64;
	uint8x16_t vinit;
	uint8x16_t vseed, vseed2, vseed3, vseed4;
	uint8x16_t vseed5, vseed6, vseed7, vseed8;
	uint8x16_t vval, vval2, vval3, vval4;
	uint8x16_t vval5, vval6, vval7, vval8;
	uint8x16_t vvalLoop, vvalLoop2, vvalLoop3, vvalLoop4;
	uint8x16_t vvalLoop5, vvalLoop6, vvalLoop7, vvalLoop8;
	uint8x16x2_t avval2;
	uint8x16x3_t avseed3;

	pseed = (uint8x16_t*)(aeskeysched.__values);

	// Combined hash seed and length.
	vinit64 = vdupq_n_u64(0);
	vinit64[0] = (uint64)seed;
	vinit64[1] = (uint64)size;
	vinit = vreinterpretq_u8_u64(vinit64);

	// Mix in per-process seed.
	vseed = vaeseq_u8(*pseed, vinit);
	++pseed;
	// Scramble seed.
	vseed = vaesmcq_u8(vseed);
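	// On arm64 one AES round is split across two instructions:
	// vaeseq_u8 XORs its operands and applies SubBytes/ShiftRows,
	// and vaesmcq_u8 applies MixColumns, so a vaeseq/vaesmcq pair
	// plays the role of the aesenc used in the x86 code above.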

	if (size <= 16) {
		if (size == 0) {
			// Return 64 bits of scrambled input seed.
			return vreinterpretq_u64_u8(vseed)[0];
		} else if (size < 16) {
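			// Fewer than 16 bytes: gather the input a lane at a
			// time (8, 4, 2, then 1 bytes) so we never read past
			// the end of the buffer.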
			vval = vreinterpretq_u8_u64(vdupq_n_u64(0));
			if ((size & 8) != 0) {
				vval = vreinterpretq_u8_u64(vld1q_lane_u64((uint64_t*)(p), vreinterpretq_u64_u8(vval), 0));
				p = (void*)((uint64_t*)(p) + 1);
			}
			if ((size & 4) != 0) {
				vval = vreinterpretq_u8_u32(vld1q_lane_u32((uint32_t*)(p), vreinterpretq_u32_u8(vval), 2));
				p = (void*)((uint32_t*)(p) + 1);
			}
			if ((size & 2) != 0) {
				vval = vreinterpretq_u8_u16(vld1q_lane_u16((uint16_t*)(p), vreinterpretq_u16_u8(vval), 6));
				p = (void*)((uint16_t*)(p) + 1);
			}
			if ((size & 1) != 0) {
				vval = vld1q_lane_u8((uint8*)(p), vval, 14);
			}
		} else {
			vval = *(uint8x16_t*)(p);
		}
		vval = vaeseq_u8(vval, vseed);
		vval = vaesmcq_u8(vval);
		vval = vaeseq_u8(vval, vseed);
		vval = vaesmcq_u8(vval);
		vval = vaeseq_u8(vval, vseed);
		return vreinterpretq_u64_u8(vval)[0];
	} else if (size <= 32) {
		// Make a second seed.
		vseed2 = vaeseq_u8(*pseed, vinit);
		vseed2 = vaesmcq_u8(vseed2);
		vval = *(uint8x16_t*)(p);
		vval2 = *(uint8x16_t*)((char*)(p) + (size - 16));

		vval = vaeseq_u8(vval, vseed);
		vval = vaesmcq_u8(vval);
		vval2 = vaeseq_u8(vval2, vseed2);
		vval2 = vaesmcq_u8(vval2);

		vval = vaeseq_u8(vval, vseed);
		vval = vaesmcq_u8(vval);
		vval2 = vaeseq_u8(vval2, vseed2);
		vval2 = vaesmcq_u8(vval2);

		vval = vaeseq_u8(vval, vseed);
		vval2 = vaeseq_u8(vval2, vseed2);

		vval ^= vval2;

		return vreinterpretq_u64_u8(vval)[0];
	} else if (size <= 64) {
		avseed3 = vld1q_u8_x3((uint8*)(pseed));
		vseed2 = avseed3.val[0];
		vseed3 = avseed3.val[1];
		vseed4 = avseed3.val[2];

		vseed2 = vaeseq_u8(vseed2, vinit);
		vseed2 = vaesmcq_u8(vseed2);
		vseed3 = vaeseq_u8(vseed3, vinit);
		vseed3 = vaesmcq_u8(vseed3);
		vseed4 = vaeseq_u8(vseed4, vinit);
		vseed4 = vaesmcq_u8(vseed4);

		avval2 = vld1q_u8_x2((uint8*)(p));
		vval = avval2.val[0];
		vval2 = avval2.val[1];
		avval2 = vld1q_u8_x2((uint8*)(p) + (size - 32));
		vval3 = avval2.val[0];
		vval4 = avval2.val[1];

		vval = vaeseq_u8(vval, vseed);
		vval = vaesmcq_u8(vval);
		vval2 = vaeseq_u8(vval2, vseed2);
		vval2 = vaesmcq_u8(vval2);
		vval3 = vaeseq_u8(vval3, vseed3);
		vval3 = vaesmcq_u8(vval3);
		vval4 = vaeseq_u8(vval4, vseed4);
		vval4 = vaesmcq_u8(vval4);

		vval = vaeseq_u8(vval, vseed);
		vval = vaesmcq_u8(vval);
		vval2 = vaeseq_u8(vval2, vseed2);
		vval2 = vaesmcq_u8(vval2);
		vval3 = vaeseq_u8(vval3, vseed3);
		vval3 = vaesmcq_u8(vval3);
		vval4 = vaeseq_u8(vval4, vseed4);
		vval4 = vaesmcq_u8(vval4);

		vval = vaeseq_u8(vval, vseed);
		vval2 = vaeseq_u8(vval2, vseed2);
		vval3 = vaeseq_u8(vval3, vseed3);
		vval4 = vaeseq_u8(vval4, vseed4);

		vval ^= vval3;
		vval2 ^= vval4;
		vval ^= vval2;

		return vreinterpretq_u64_u8(vval)[0];
	} else if (size <= 128) {
		// For some reason vld1q_u8_x4 is missing.
		avseed3 = vld1q_u8_x3((uint8*)(pseed));
		vseed2 = avseed3.val[0];
		vseed3 = avseed3.val[1];
		vseed4 = avseed3.val[2];
		avseed3 = vld1q_u8_x3((uint8*)(pseed + 3));
		vseed5 = avseed3.val[0];
		vseed6 = avseed3.val[1];
		vseed7 = avseed3.val[2];
		vseed8 = *(pseed + 6);

		vseed2 = vaeseq_u8(vseed2, vinit);
		vseed2 = vaesmcq_u8(vseed2);
		vseed3 = vaeseq_u8(vseed3, vinit);
		vseed3 = vaesmcq_u8(vseed3);
		vseed4 = vaeseq_u8(vseed4, vinit);
		vseed4 = vaesmcq_u8(vseed4);
		vseed5 = vaeseq_u8(vseed5, vinit);
		vseed5 = vaesmcq_u8(vseed5);
		vseed6 = vaeseq_u8(vseed6, vinit);
		vseed6 = vaesmcq_u8(vseed6);
		vseed7 = vaeseq_u8(vseed7, vinit);
		vseed7 = vaesmcq_u8(vseed7);
		vseed8 = vaeseq_u8(vseed8, vinit);
		vseed8 = vaesmcq_u8(vseed8);

		avval2 = vld1q_u8_x2((uint8*)(p));
		vval = avval2.val[0];
		vval2 = avval2.val[1];
		avval2 = vld1q_u8_x2((uint8*)(p) + 32);
		vval3 = avval2.val[0];
		vval4 = avval2.val[1];
		avval2 = vld1q_u8_x2((uint8*)(p) + (size - 64));
		vval5 = avval2.val[0];
		vval6 = avval2.val[1];
		avval2 = vld1q_u8_x2((uint8*)(p) + (size - 32));
		vval7 = avval2.val[0];
		vval8 = avval2.val[1];

		vval = vaeseq_u8(vval, vseed);
		vval = vaesmcq_u8(vval);
		vval2 = vaeseq_u8(vval2, vseed2);
		vval2 = vaesmcq_u8(vval2);
		vval3 = vaeseq_u8(vval3, vseed3);
		vval3 = vaesmcq_u8(vval3);
		vval4 = vaeseq_u8(vval4, vseed4);
		vval4 = vaesmcq_u8(vval4);
		vval5 = vaeseq_u8(vval5, vseed5);
		vval5 = vaesmcq_u8(vval5);
		vval6 = vaeseq_u8(vval6, vseed6);
		vval6 = vaesmcq_u8(vval6);
		vval7 = vaeseq_u8(vval7, vseed7);
		vval7 = vaesmcq_u8(vval7);
		vval8 = vaeseq_u8(vval8, vseed8);
		vval8 = vaesmcq_u8(vval8);

		vval = vaeseq_u8(vval, vseed);
		vval = vaesmcq_u8(vval);
		vval2 = vaeseq_u8(vval2, vseed2);
		vval2 = vaesmcq_u8(vval2);
		vval3 = vaeseq_u8(vval3, vseed3);
		vval3 = vaesmcq_u8(vval3);
		vval4 = vaeseq_u8(vval4, vseed4);
		vval4 = vaesmcq_u8(vval4);
		vval5 = vaeseq_u8(vval5, vseed5);
		vval5 = vaesmcq_u8(vval5);
		vval6 = vaeseq_u8(vval6, vseed6);
		vval6 = vaesmcq_u8(vval6);
		vval7 = vaeseq_u8(vval7, vseed7);
		vval7 = vaesmcq_u8(vval7);
		vval8 = vaeseq_u8(vval8, vseed8);
		vval8 = vaesmcq_u8(vval8);

		vval = vaeseq_u8(vval, vseed);
		vval2 = vaeseq_u8(vval2, vseed2);
		vval3 = vaeseq_u8(vval3, vseed3);
		vval4 = vaeseq_u8(vval4, vseed4);
		vval5 = vaeseq_u8(vval5, vseed5);
		vval6 = vaeseq_u8(vval6, vseed6);
		vval7 = vaeseq_u8(vval7, vseed7);
		vval8 = vaeseq_u8(vval8, vseed8);

		vval ^= vval5;
		vval2 ^= vval6;
		vval3 ^= vval7;
		vval4 ^= vval8;
		vval ^= vval3;
		vval2 ^= vval4;
		vval ^= vval2;

		return vreinterpretq_u64_u8(vval)[0];
	} else {
		// For some reason vld1q_u8_x4 is missing.
		avseed3 = vld1q_u8_x3((uint8*)(pseed));
		vseed2 = avseed3.val[0];
		vseed3 = avseed3.val[1];
		vseed4 = avseed3.val[2];
		avseed3 = vld1q_u8_x3((uint8*)(pseed + 3));
		vseed5 = avseed3.val[0];
		vseed6 = avseed3.val[1];
		vseed7 = avseed3.val[2];
		vseed8 = *(pseed + 6);

		vseed2 = vaeseq_u8(vseed2, vinit);
		vseed2 = vaesmcq_u8(vseed2);
		vseed3 = vaeseq_u8(vseed3, vinit);
		vseed3 = vaesmcq_u8(vseed3);
		vseed4 = vaeseq_u8(vseed4, vinit);
		vseed4 = vaesmcq_u8(vseed4);
		vseed5 = vaeseq_u8(vseed5, vinit);
		vseed5 = vaesmcq_u8(vseed5);
		vseed6 = vaeseq_u8(vseed6, vinit);
		vseed6 = vaesmcq_u8(vseed6);
		vseed7 = vaeseq_u8(vseed7, vinit);
		vseed7 = vaesmcq_u8(vseed7);
		vseed8 = vaeseq_u8(vseed8, vinit);
		vseed8 = vaesmcq_u8(vseed8);

		avval2 = vld1q_u8_x2((uint8*)(p) + (size - 128));
		vval = avval2.val[0];
		vval2 = avval2.val[1];
		avval2 = vld1q_u8_x2((uint8*)(p) + (size - 96));
		vval3 = avval2.val[0];
		vval4 = avval2.val[1];
		avval2 = vld1q_u8_x2((uint8*)(p) + (size - 64));
		vval5 = avval2.val[0];
		vval6 = avval2.val[1];
		avval2 = vld1q_u8_x2((uint8*)(p) + (size - 32));
		vval7 = avval2.val[0];
		vval8 = avval2.val[1];

		vvalLoop = vseed;
		vvalLoop2 = vseed2;
		vvalLoop3 = vseed3;
		vvalLoop4 = vseed4;
		vvalLoop5 = vseed5;
		vvalLoop6 = vseed6;
		vvalLoop7 = vseed7;
		vvalLoop8 = vseed8;

		size--;
		size >>= 7;
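		// size now holds the number of 128-byte blocks to process from
		// the start of the input.  Each loop iteration first mixes in
		// the data gathered on the previous iteration (the seeds on the
		// first pass), then loads the next block into the vvalLoop
		// registers and mixes that in too; three more rounds after the
		// loop (the last without MixColumns) finish absorbing the final
		// block.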
		do {
			vval = vaeseq_u8(vval, vvalLoop);
			vval = vaesmcq_u8(vval);
			vval2 = vaeseq_u8(vval2, vvalLoop2);
			vval2 = vaesmcq_u8(vval2);
			vval3 = vaeseq_u8(vval3, vvalLoop3);
			vval3 = vaesmcq_u8(vval3);
			vval4 = vaeseq_u8(vval4, vvalLoop4);
			vval4 = vaesmcq_u8(vval4);
			vval5 = vaeseq_u8(vval5, vvalLoop5);
			vval5 = vaesmcq_u8(vval5);
			vval6 = vaeseq_u8(vval6, vvalLoop6);
			vval6 = vaesmcq_u8(vval6);
			vval7 = vaeseq_u8(vval7, vvalLoop7);
			vval7 = vaesmcq_u8(vval7);
			vval8 = vaeseq_u8(vval8, vvalLoop8);
			vval8 = vaesmcq_u8(vval8);

			avval2 = vld1q_u8_x2((uint8*)(p));
			vvalLoop = avval2.val[0];
			vvalLoop2 = avval2.val[1];
			avval2 = vld1q_u8_x2((uint8*)(p) + 32);
			vvalLoop3 = avval2.val[0];
			vvalLoop4 = avval2.val[1];
			avval2 = vld1q_u8_x2((uint8*)(p) + 64);
			vvalLoop5 = avval2.val[0];
			vvalLoop6 = avval2.val[1];
			avval2 = vld1q_u8_x2((uint8*)(p) + 96);
			vvalLoop7 = avval2.val[0];
			vvalLoop8 = avval2.val[1];

			p = (void *)((uint8*)(p) + 128);

			vval = vaeseq_u8(vval, vvalLoop);
			vval = vaesmcq_u8(vval);
			vval2 = vaeseq_u8(vval2, vvalLoop2);
			vval2 = vaesmcq_u8(vval2);
			vval3 = vaeseq_u8(vval3, vvalLoop3);
			vval3 = vaesmcq_u8(vval3);
			vval4 = vaeseq_u8(vval4, vvalLoop4);
			vval4 = vaesmcq_u8(vval4);
			vval5 = vaeseq_u8(vval5, vvalLoop5);
			vval5 = vaesmcq_u8(vval5);
			vval6 = vaeseq_u8(vval6, vvalLoop6);
			vval6 = vaesmcq_u8(vval6);
			vval7 = vaeseq_u8(vval7, vvalLoop7);
			vval7 = vaesmcq_u8(vval7);
			vval8 = vaeseq_u8(vval8, vvalLoop8);
			vval8 = vaesmcq_u8(vval8);
		} while (--size > 0);

		vval = vaeseq_u8(vval, vvalLoop);
		vval = vaesmcq_u8(vval);
		vval2 = vaeseq_u8(vval2, vvalLoop2);
		vval2 = vaesmcq_u8(vval2);
		vval3 = vaeseq_u8(vval3, vvalLoop3);
		vval3 = vaesmcq_u8(vval3);
		vval4 = vaeseq_u8(vval4, vvalLoop4);
		vval4 = vaesmcq_u8(vval4);
		vval5 = vaeseq_u8(vval5, vvalLoop5);
		vval5 = vaesmcq_u8(vval5);
		vval6 = vaeseq_u8(vval6, vvalLoop6);
		vval6 = vaesmcq_u8(vval6);
		vval7 = vaeseq_u8(vval7, vvalLoop7);
		vval7 = vaesmcq_u8(vval7);
		vval8 = vaeseq_u8(vval8, vvalLoop8);
		vval8 = vaesmcq_u8(vval8);

		vval = vaeseq_u8(vval, vvalLoop);
		vval = vaesmcq_u8(vval);
		vval2 = vaeseq_u8(vval2, vvalLoop2);
		vval2 = vaesmcq_u8(vval2);
		vval3 = vaeseq_u8(vval3, vvalLoop3);
		vval3 = vaesmcq_u8(vval3);
		vval4 = vaeseq_u8(vval4, vvalLoop4);
		vval4 = vaesmcq_u8(vval4);
		vval5 = vaeseq_u8(vval5, vvalLoop5);
		vval5 = vaesmcq_u8(vval5);
		vval6 = vaeseq_u8(vval6, vvalLoop6);
		vval6 = vaesmcq_u8(vval6);
		vval7 = vaeseq_u8(vval7, vvalLoop7);
		vval7 = vaesmcq_u8(vval7);
		vval8 = vaeseq_u8(vval8, vvalLoop8);
		vval8 = vaesmcq_u8(vval8);

		vval = vaeseq_u8(vval, vvalLoop);
		vval2 = vaeseq_u8(vval2, vvalLoop2);
		vval3 = vaeseq_u8(vval3, vvalLoop3);
		vval4 = vaeseq_u8(vval4, vvalLoop4);
		vval5 = vaeseq_u8(vval5, vvalLoop5);
		vval6 = vaeseq_u8(vval6, vvalLoop6);
		vval7 = vaeseq_u8(vval7, vvalLoop7);
		vval8 = vaeseq_u8(vval8, vvalLoop8);

		vval ^= vval5;
		vval2 ^= vval6;
		vval3 ^= vval7;
		vval4 ^= vval8;
		vval ^= vval3;
		vval2 ^= vval4;
		vval ^= vval2;

		return vreinterpretq_u64_u8(vval)[0];
	}
}

#else // (!defined(__i386__) && !defined(__x86_64__) || !defined(HAVE_AS_X86_AES)) && !defined(__aarch64__)

uintptr aeshashbody(void* p __attribute__((unused)),
		    uintptr seed __attribute__((unused)),
		    uintptr size __attribute__((unused)),
		    Slice aeskeysched __attribute__((unused))) {
	// We should never get here on a non-x86, non-arm64 system.
	runtime_throw("impossible call to aeshashbody");
}

#endif // !defined(__i386__) && !defined(__x86_64__) || !defined(HAVE_AS_X86_AES)