1 /*
2  Copyright (c) 2011 Apple Inc.
3  https://bulletphysics.org
4 
5  This software is provided 'as-is', without any express or implied warranty.
6  In no event will the authors be held liable for any damages arising from the use of this software.
7  Permission is granted to anyone to use this software for any purpose,
8  including commercial applications, and to alter it and redistribute it freely,
9  subject to the following restrictions:
10 
11  1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
12  2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
13  3. This notice may not be removed or altered from any source distribution.
14 
15  This source version has been altered.
16  */
17 
18 #if defined(_WIN32) || defined(__i386__)
19 #define BT_USE_SSE_IN_API
20 #endif
21 
22 #include "btVector3.h"
23 
24 #if defined BT_USE_SIMD_VECTOR3
25 
26 #if DEBUG
27 #include <string.h>  //for memset
28 #endif
29 
30 #ifdef __APPLE__
31 #include <stdint.h>
32 typedef float float4 __attribute__((vector_size(16)));
33 #else
34 #define float4 __m128
35 #endif
36 //typedef  uint32_t uint4 __attribute__ ((vector_size(16)));
37 
38 #if defined BT_USE_SSE || defined _WIN32
39 
40 #define LOG2_ARRAY_SIZE 6
41 #define STACK_ARRAY_COUNT (1UL << LOG2_ARRAY_SIZE)
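// Dot products are computed in tiles of STACK_ARRAY_COUNT float4 results (four dots per float4).
// Each tile's results are kept in a stack buffer so the index of the best value only has to be
// re-derived for tiles that actually improved on the running max/min.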
42 
43 #include <emmintrin.h>
44 
45 long _maxdot_large(const float *vv, const float *vec, unsigned long count, float *dotResult);
46 long _maxdot_large(const float *vv, const float *vec, unsigned long count, float *dotResult)
47 {
48 	const float4 *vertices = (const float4 *)vv;
49 	static const unsigned char indexTable[16] = {(unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0};
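	// indexTable maps a nonzero _mm_movemask_ps result (a 4-bit lane mask) to the lane number of
	// its lowest set bit; entry 0 is unused because the table is only consulted after a lane matched.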
50 	float4 dotMax = btAssign128(-BT_INFINITY, -BT_INFINITY, -BT_INFINITY, -BT_INFINITY);
51 	float4 vvec = _mm_loadu_ps(vec);
52 	float4 vHi = btCastiTo128f(_mm_shuffle_epi32(btCastfTo128i(vvec), 0xaa));  /// zzzz
53 	float4 vLo = _mm_movelh_ps(vvec, vvec);                                    /// xyxy
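	// Each dot product below is assembled as x*vec.x + y*vec.y (via vLo) plus z*vec.z (via vHi),
	// so the unused w component of each vertex never contributes to the result.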
54 
55 	long maxIndex = -1L;
56 
57 	size_t segment = 0;
58 	float4 stack_array[STACK_ARRAY_COUNT];
59 
60 #if DEBUG
61 	//memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );
62 #endif
63 
64 	size_t index;
65 	float4 max;
66 	// Faster loop without cleanup code for full tiles
67 	for (segment = 0; segment + STACK_ARRAY_COUNT * 4 <= count; segment += STACK_ARRAY_COUNT * 4)
68 	{
69 		max = dotMax;
70 
71 		for (index = 0; index < STACK_ARRAY_COUNT; index += 4)
72 		{  // do four dot products at a time. Carefully avoid touching the w element.
73 			float4 v0 = vertices[0];
74 			float4 v1 = vertices[1];
75 			float4 v2 = vertices[2];
76 			float4 v3 = vertices[3];
77 			vertices += 4;
78 
79 			float4 lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
80 			float4 hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
81 			float4 lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
82 			float4 hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
83 
84 			lo0 = lo0 * vLo;
85 			lo1 = lo1 * vLo;
86 			float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
87 			float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
88 			float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
89 			z = z * vHi;
90 			x = x + y;
91 			x = x + z;
92 			stack_array[index] = x;
93 			max = _mm_max_ps(x, max);  // control the order here so that max is never NaN even if x is nan
94 
95 			v0 = vertices[0];
96 			v1 = vertices[1];
97 			v2 = vertices[2];
98 			v3 = vertices[3];
99 			vertices += 4;
100 
101 			lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
102 			hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
103 			lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
104 			hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
105 
106 			lo0 = lo0 * vLo;
107 			lo1 = lo1 * vLo;
108 			z = _mm_shuffle_ps(hi0, hi1, 0x88);
109 			x = _mm_shuffle_ps(lo0, lo1, 0x88);
110 			y = _mm_shuffle_ps(lo0, lo1, 0xdd);
111 			z = z * vHi;
112 			x = x + y;
113 			x = x + z;
114 			stack_array[index + 1] = x;
115 			max = _mm_max_ps(x, max);  // control the order here so that max is never NaN even if x is nan
116 
117 			v0 = vertices[0];
118 			v1 = vertices[1];
119 			v2 = vertices[2];
120 			v3 = vertices[3];
121 			vertices += 4;
122 
123 			lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
124 			hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
125 			lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
126 			hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
127 
128 			lo0 = lo0 * vLo;
129 			lo1 = lo1 * vLo;
130 			z = _mm_shuffle_ps(hi0, hi1, 0x88);
131 			x = _mm_shuffle_ps(lo0, lo1, 0x88);
132 			y = _mm_shuffle_ps(lo0, lo1, 0xdd);
133 			z = z * vHi;
134 			x = x + y;
135 			x = x + z;
136 			stack_array[index + 2] = x;
137 			max = _mm_max_ps(x, max);  // control the order here so that max is never NaN even if x is nan
138 
139 			v0 = vertices[0];
140 			v1 = vertices[1];
141 			v2 = vertices[2];
142 			v3 = vertices[3];
143 			vertices += 4;
144 
145 			lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
146 			hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
147 			lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
148 			hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
149 
150 			lo0 = lo0 * vLo;
151 			lo1 = lo1 * vLo;
152 			z = _mm_shuffle_ps(hi0, hi1, 0x88);
153 			x = _mm_shuffle_ps(lo0, lo1, 0x88);
154 			y = _mm_shuffle_ps(lo0, lo1, 0xdd);
155 			z = z * vHi;
156 			x = x + y;
157 			x = x + z;
158 			stack_array[index + 3] = x;
159 			max = _mm_max_ps(x, max);  // control the order here so that max is never NaN even if x is nan
160 
161 			// It is too costly to keep the index of the max here. We will look for it again later.  We save a lot of work this way.
162 		}
163 
164 		// If we found a new max
165 		if (0xf != _mm_movemask_ps((float4)_mm_cmpeq_ps(max, dotMax)))
166 		{
167 			// copy the new max across all lanes of our max accumulator
168 			max = _mm_max_ps(max, (float4)_mm_shuffle_ps(max, max, 0x4e));
169 			max = _mm_max_ps(max, (float4)_mm_shuffle_ps(max, max, 0xb1));
170 
171 			dotMax = max;
172 
173 			// find first occurrence of that max
174 			size_t test;
175 			for (index = 0; 0 == (test = _mm_movemask_ps(_mm_cmpeq_ps(stack_array[index], max))); index++)  // local_count must be a multiple of 4
176 			{
177 			}
178 			// record where it is.
179 			maxIndex = 4 * index + segment + indexTable[test];
180 		}
181 	}
182 
183 	// account for work we've already done
184 	count -= segment;
185 
186 	// Deal with the last < STACK_ARRAY_COUNT vectors
187 	max = dotMax;
188 	index = 0;
189 
190 	if (btUnlikely(count > 16))
191 	{
192 		for (; index + 4 <= count / 4; index += 4)
193 		{  // do four dot products at a time. Carefully avoid touching the w element.
194 			float4 v0 = vertices[0];
195 			float4 v1 = vertices[1];
196 			float4 v2 = vertices[2];
197 			float4 v3 = vertices[3];
198 			vertices += 4;
199 
200 			float4 lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
201 			float4 hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
202 			float4 lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
203 			float4 hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
204 
205 			lo0 = lo0 * vLo;
206 			lo1 = lo1 * vLo;
207 			float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
208 			float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
209 			float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
210 			z = z * vHi;
211 			x = x + y;
212 			x = x + z;
213 			stack_array[index] = x;
214 			max = _mm_max_ps(x, max);  // control the order here so that max is never NaN even if x is nan
215 
216 			v0 = vertices[0];
217 			v1 = vertices[1];
218 			v2 = vertices[2];
219 			v3 = vertices[3];
220 			vertices += 4;
221 
222 			lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
223 			hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
224 			lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
225 			hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
226 
227 			lo0 = lo0 * vLo;
228 			lo1 = lo1 * vLo;
229 			z = _mm_shuffle_ps(hi0, hi1, 0x88);
230 			x = _mm_shuffle_ps(lo0, lo1, 0x88);
231 			y = _mm_shuffle_ps(lo0, lo1, 0xdd);
232 			z = z * vHi;
233 			x = x + y;
234 			x = x + z;
235 			stack_array[index + 1] = x;
236 			max = _mm_max_ps(x, max);  // control the order here so that max is never NaN even if x is nan
237 
238 			v0 = vertices[0];
239 			v1 = vertices[1];
240 			v2 = vertices[2];
241 			v3 = vertices[3];
242 			vertices += 4;
243 
244 			lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
245 			hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
246 			lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
247 			hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
248 
249 			lo0 = lo0 * vLo;
250 			lo1 = lo1 * vLo;
251 			z = _mm_shuffle_ps(hi0, hi1, 0x88);
252 			x = _mm_shuffle_ps(lo0, lo1, 0x88);
253 			y = _mm_shuffle_ps(lo0, lo1, 0xdd);
254 			z = z * vHi;
255 			x = x + y;
256 			x = x + z;
257 			stack_array[index + 2] = x;
258 			max = _mm_max_ps(x, max);  // control the order here so that max is never NaN even if x is nan
259 
260 			v0 = vertices[0];
261 			v1 = vertices[1];
262 			v2 = vertices[2];
263 			v3 = vertices[3];
264 			vertices += 4;
265 
266 			lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
267 			hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
268 			lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
269 			hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
270 
271 			lo0 = lo0 * vLo;
272 			lo1 = lo1 * vLo;
273 			z = _mm_shuffle_ps(hi0, hi1, 0x88);
274 			x = _mm_shuffle_ps(lo0, lo1, 0x88);
275 			y = _mm_shuffle_ps(lo0, lo1, 0xdd);
276 			z = z * vHi;
277 			x = x + y;
278 			x = x + z;
279 			stack_array[index + 3] = x;
280 			max = _mm_max_ps(x, max);  // control the order here so that max is never NaN even if x is nan
281 
282 			// It is too costly to keep the index of the max here. We will look for it again later.  We save a lot of work this way.
283 		}
284 	}
285 
286 	size_t localCount = (count & -4L) - 4 * index;
287 	if (localCount)
288 	{
289 #ifdef __APPLE__
290 		float4 t0, t1, t2, t3, t4;
291 		float4 *sap = &stack_array[index + localCount / 4];
292 		vertices += localCount;  // counter the offset
293 		size_t byteIndex = -(localCount) * sizeof(float);
294 		//AT&T Code style assembly
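		// byteIndex starts at -(localCount * sizeof(float)) and climbs to zero: scaled by 4 it walks
		// the 16-byte vertices backwards from the pre-advanced pointer, and unscaled it addresses the
		// one float4 of results per four vertices stored into stack_array.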
295 		asm volatile(
296 			".align 4                                                                   \n\
297              0: movaps  %[max], %[t2]                            // move max out of the way to avoid propagating NaNs in max \n\
298           movaps  (%[vertices], %[byteIndex], 4),    %[t0]    // vertices[0]      \n\
299           movaps  16(%[vertices], %[byteIndex], 4),  %[t1]    // vertices[1]      \n\
300           movaps  %[t0], %[max]                               // vertices[0]      \n\
301           movlhps %[t1], %[max]                               // x0y0x1y1         \n\
302          movaps  32(%[vertices], %[byteIndex], 4),  %[t3]    // vertices[2]      \n\
303          movaps  48(%[vertices], %[byteIndex], 4),  %[t4]    // vertices[3]      \n\
304           mulps   %[vLo], %[max]                              // x0y0x1y1 * vLo   \n\
305          movhlps %[t0], %[t1]                                // z0w0z1w1         \n\
306          movaps  %[t3], %[t0]                                // vertices[2]      \n\
307          movlhps %[t4], %[t0]                                // x2y2x3y3         \n\
308          mulps   %[vLo], %[t0]                               // x2y2x3y3 * vLo   \n\
309           movhlps %[t3], %[t4]                                // z2w2z3w3         \n\
310           shufps  $0x88, %[t4], %[t1]                         // z0z1z2z3         \n\
311           mulps   %[vHi], %[t1]                               // z0z1z2z3 * vHi   \n\
312          movaps  %[max], %[t3]                               // x0y0x1y1 * vLo   \n\
313          shufps  $0x88, %[t0], %[max]                        // x0x1x2x3 * vLo.x \n\
314          shufps  $0xdd, %[t0], %[t3]                         // y0y1y2y3 * vLo.y \n\
315          addps   %[t3], %[max]                               // x + y            \n\
316          addps   %[t1], %[max]                               // x + y + z        \n\
317          movaps  %[max], (%[sap], %[byteIndex])              // record result for later scrutiny \n\
318          maxps   %[t2], %[max]                               // record max, restore max   \n\
319          add     $16, %[byteIndex]                           // advance loop counter\n\
320          jnz     0b                                          \n\
321      "
322 			: [max] "+x"(max), [t0] "=&x"(t0), [t1] "=&x"(t1), [t2] "=&x"(t2), [t3] "=&x"(t3), [t4] "=&x"(t4), [byteIndex] "+r"(byteIndex)
323 			: [vLo] "x"(vLo), [vHi] "x"(vHi), [vertices] "r"(vertices), [sap] "r"(sap)
324 			: "memory", "cc");
325 		index += localCount / 4;
326 #else
327 		{
328 			for (unsigned int i = 0; i < localCount / 4; i++, index++)
329 			{  // do four dot products at a time. Carefully avoid touching the w element.
330 				float4 v0 = vertices[0];
331 				float4 v1 = vertices[1];
332 				float4 v2 = vertices[2];
333 				float4 v3 = vertices[3];
334 				vertices += 4;
335 
336 				float4 lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
337 				float4 hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
338 				float4 lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
339 				float4 hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
340 
341 				lo0 = lo0 * vLo;
342 				lo1 = lo1 * vLo;
343 				float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
344 				float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
345 				float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
346 				z = z * vHi;
347 				x = x + y;
348 				x = x + z;
349 				stack_array[index] = x;
350 				max = _mm_max_ps(x, max);  // control the order here so that max is never NaN even if x is nan
351 			}
352 		}
353 #endif  //__APPLE__
354 	}
355 
356 	// process the last few points
357 	if (count & 3)
358 	{
359 		float4 v0, v1, v2, x, y, z;
360 		switch (count & 3)
361 		{
362 			case 3:
363 			{
364 				v0 = vertices[0];
365 				v1 = vertices[1];
366 				v2 = vertices[2];
367 
368 				// Calculate 3 dot products, transpose, duplicate v2
369 				float4 lo0 = _mm_movelh_ps(v0, v1);  // xyxy.lo
370 				float4 hi0 = _mm_movehl_ps(v1, v0);  // z?z?.lo
371 				lo0 = lo0 * vLo;
372 				z = _mm_shuffle_ps(hi0, v2, 0xa8);  // z0z1z2z2
373 				z = z * vHi;
374 				float4 lo1 = _mm_movelh_ps(v2, v2);  // xyxy
375 				lo1 = lo1 * vLo;
376 				x = _mm_shuffle_ps(lo0, lo1, 0x88);
377 				y = _mm_shuffle_ps(lo0, lo1, 0xdd);
378 			}
379 			break;
380 			case 2:
381 			{
382 				v0 = vertices[0];
383 				v1 = vertices[1];
384 				float4 xy = _mm_movelh_ps(v0, v1);
385 				z = _mm_movehl_ps(v1, v0);
386 				xy = xy * vLo;
387 				z = _mm_shuffle_ps(z, z, 0xa8);
388 				x = _mm_shuffle_ps(xy, xy, 0xa8);
389 				y = _mm_shuffle_ps(xy, xy, 0xfd);
390 				z = z * vHi;
391 			}
392 			break;
393 			case 1:
394 			{
395 				float4 xy = vertices[0];
396 				z = _mm_shuffle_ps(xy, xy, 0xaa);
397 				xy = xy * vLo;
398 				z = z * vHi;
399 				x = _mm_shuffle_ps(xy, xy, 0);
400 				y = _mm_shuffle_ps(xy, xy, 0x55);
401 			}
402 			break;
403 		}
404 		x = x + y;
405 		x = x + z;
406 		stack_array[index] = x;
407 		max = _mm_max_ps(x, max);  // control the order here so that max is never NaN even if x is nan
408 		index++;
409 	}
410 
411 	// if we found a new max.
412 	if (0 == segment || 0xf != _mm_movemask_ps((float4)_mm_cmpeq_ps(max, dotMax)))
413 	{  // we found a new max. Search for it
414 		// find max across the max vector, place in all elements of max -- big latency hit here
415 		max = _mm_max_ps(max, (float4)_mm_shuffle_ps(max, max, 0x4e));
416 		max = _mm_max_ps(max, (float4)_mm_shuffle_ps(max, max, 0xb1));
417 
418 		// It is slightly faster to do this part in scalar code when count < 8. However, the common case for
419 		// this where it actually makes a difference is handled in the early out at the top of the function,
420 		// so it is less than a 1% difference here. I opted for improved code size, fewer branches and reduced
421 		// complexity, and removed it.
422 
423 		dotMax = max;
424 
425 		// scan for the first occurrence of max in the array
426 		size_t test;
427 		for (index = 0; 0 == (test = _mm_movemask_ps(_mm_cmpeq_ps(stack_array[index], max))); index++)  // local_count must be a multiple of 4
428 		{
429 		}
430 		maxIndex = 4 * index + segment + indexTable[test];
431 	}
432 
433 	_mm_store_ss(dotResult, dotMax);
434 	return maxIndex;
435 }
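/*
 Illustrative usage sketch (not part of the original Bullet sources): _maxdot_large expects 'vv'
 to point at 16-byte float4 vertices laid out as x,y,z,w (w is ignored), returns the index of the
 vertex whose dot product with 'vec' is largest, and writes that dot product through 'dotResult'.
 The caller below is hypothetical; btVector3 stores four floats, so the casts match this layout.
*/
#if 0
static long exampleFindSupportIndex(const btVector3 *verts, unsigned long numVerts, const btVector3 &dir)
{
	float bestDot = 0.0f;
	long bestIndex = _maxdot_large((const float *)verts, (const float *)&dir, numVerts, &bestDot);
	return bestIndex;  // index of the supporting vertex along 'dir'
}
#endif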
436 
437 long _mindot_large(const float *vv, const float *vec, unsigned long count, float *dotResult);
438 
439 long _mindot_large(const float *vv, const float *vec, unsigned long count, float *dotResult)
440 {
441 	const float4 *vertices = (const float4 *)vv;
442 	static const unsigned char indexTable[16] = {(unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0};
443 	float4 dotmin = btAssign128(BT_INFINITY, BT_INFINITY, BT_INFINITY, BT_INFINITY);
444 	float4 vvec = _mm_loadu_ps(vec);
445 	float4 vHi = btCastiTo128f(_mm_shuffle_epi32(btCastfTo128i(vvec), 0xaa));  /// zzzz
446 	float4 vLo = _mm_movelh_ps(vvec, vvec);                                    /// xyxy
447 
448 	long minIndex = -1L;
449 
450 	size_t segment = 0;
451 	float4 stack_array[STACK_ARRAY_COUNT];
452 
453 #if DEBUG
454 	//memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );
455 #endif
456 
457 	size_t index;
458 	float4 min;
459 	// Faster loop without cleanup code for full tiles
460 	for (segment = 0; segment + STACK_ARRAY_COUNT * 4 <= count; segment += STACK_ARRAY_COUNT * 4)
461 	{
462 		min = dotmin;
463 
464 		for (index = 0; index < STACK_ARRAY_COUNT; index += 4)
465 		{  // do four dot products at a time. Carefully avoid touching the w element.
466 			float4 v0 = vertices[0];
467 			float4 v1 = vertices[1];
468 			float4 v2 = vertices[2];
469 			float4 v3 = vertices[3];
470 			vertices += 4;
471 
472 			float4 lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
473 			float4 hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
474 			float4 lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
475 			float4 hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
476 
477 			lo0 = lo0 * vLo;
478 			lo1 = lo1 * vLo;
479 			float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
480 			float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
481 			float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
482 			z = z * vHi;
483 			x = x + y;
484 			x = x + z;
485 			stack_array[index] = x;
486 			min = _mm_min_ps(x, min);  // control the order here so that min is never NaN even if x is nan
487 
488 			v0 = vertices[0];
489 			v1 = vertices[1];
490 			v2 = vertices[2];
491 			v3 = vertices[3];
492 			vertices += 4;
493 
494 			lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
495 			hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
496 			lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
497 			hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
498 
499 			lo0 = lo0 * vLo;
500 			lo1 = lo1 * vLo;
501 			z = _mm_shuffle_ps(hi0, hi1, 0x88);
502 			x = _mm_shuffle_ps(lo0, lo1, 0x88);
503 			y = _mm_shuffle_ps(lo0, lo1, 0xdd);
504 			z = z * vHi;
505 			x = x + y;
506 			x = x + z;
507 			stack_array[index + 1] = x;
508 			min = _mm_min_ps(x, min);  // control the order here so that min is never NaN even if x is nan
509 
510 			v0 = vertices[0];
511 			v1 = vertices[1];
512 			v2 = vertices[2];
513 			v3 = vertices[3];
514 			vertices += 4;
515 
516 			lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
517 			hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
518 			lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
519 			hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
520 
521 			lo0 = lo0 * vLo;
522 			lo1 = lo1 * vLo;
523 			z = _mm_shuffle_ps(hi0, hi1, 0x88);
524 			x = _mm_shuffle_ps(lo0, lo1, 0x88);
525 			y = _mm_shuffle_ps(lo0, lo1, 0xdd);
526 			z = z * vHi;
527 			x = x + y;
528 			x = x + z;
529 			stack_array[index + 2] = x;
530 			min = _mm_min_ps(x, min);  // control the order here so that min is never NaN even if x is nan
531 
532 			v0 = vertices[0];
533 			v1 = vertices[1];
534 			v2 = vertices[2];
535 			v3 = vertices[3];
536 			vertices += 4;
537 
538 			lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
539 			hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
540 			lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
541 			hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
542 
543 			lo0 = lo0 * vLo;
544 			lo1 = lo1 * vLo;
545 			z = _mm_shuffle_ps(hi0, hi1, 0x88);
546 			x = _mm_shuffle_ps(lo0, lo1, 0x88);
547 			y = _mm_shuffle_ps(lo0, lo1, 0xdd);
548 			z = z * vHi;
549 			x = x + y;
550 			x = x + z;
551 			stack_array[index + 3] = x;
552 			min = _mm_min_ps(x, min);  // control the order here so that min is never NaN even if x is nan
553 
554 			// It is too costly to keep the index of the min here. We will look for it again later.  We save a lot of work this way.
555 		}
556 
557 		// If we found a new min
558 		if (0xf != _mm_movemask_ps((float4)_mm_cmpeq_ps(min, dotmin)))
559 		{
560 			// copy the new min across all lanes of our min accumulator
561 			min = _mm_min_ps(min, (float4)_mm_shuffle_ps(min, min, 0x4e));
562 			min = _mm_min_ps(min, (float4)_mm_shuffle_ps(min, min, 0xb1));
563 
564 			dotmin = min;
565 
566 			// find first occurrence of that min
567 			size_t test;
568 			for (index = 0; 0 == (test = _mm_movemask_ps(_mm_cmpeq_ps(stack_array[index], min))); index++)  // local_count must be a multiple of 4
569 			{
570 			}
571 			// record where it is.
572 			minIndex = 4 * index + segment + indexTable[test];
573 		}
574 	}
575 
576 	// account for work we've already done
577 	count -= segment;
578 
579 	// Deal with the last < STACK_ARRAY_COUNT vectors
580 	min = dotmin;
581 	index = 0;
582 
583 	if (btUnlikely(count > 16))
584 	{
585 		for (; index + 4 <= count / 4; index += 4)
586 		{  // do four dot products at a time. Carefully avoid touching the w element.
587 			float4 v0 = vertices[0];
588 			float4 v1 = vertices[1];
589 			float4 v2 = vertices[2];
590 			float4 v3 = vertices[3];
591 			vertices += 4;
592 
593 			float4 lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
594 			float4 hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
595 			float4 lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
596 			float4 hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
597 
598 			lo0 = lo0 * vLo;
599 			lo1 = lo1 * vLo;
600 			float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
601 			float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
602 			float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
603 			z = z * vHi;
604 			x = x + y;
605 			x = x + z;
606 			stack_array[index] = x;
607 			min = _mm_min_ps(x, min);  // control the order here so that min is never NaN even if x is nan
608 
609 			v0 = vertices[0];
610 			v1 = vertices[1];
611 			v2 = vertices[2];
612 			v3 = vertices[3];
613 			vertices += 4;
614 
615 			lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
616 			hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
617 			lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
618 			hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
619 
620 			lo0 = lo0 * vLo;
621 			lo1 = lo1 * vLo;
622 			z = _mm_shuffle_ps(hi0, hi1, 0x88);
623 			x = _mm_shuffle_ps(lo0, lo1, 0x88);
624 			y = _mm_shuffle_ps(lo0, lo1, 0xdd);
625 			z = z * vHi;
626 			x = x + y;
627 			x = x + z;
628 			stack_array[index + 1] = x;
629 			min = _mm_min_ps(x, min);  // control the order here so that min is never NaN even if x is nan
630 
631 			v0 = vertices[0];
632 			v1 = vertices[1];
633 			v2 = vertices[2];
634 			v3 = vertices[3];
635 			vertices += 4;
636 
637 			lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
638 			hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
639 			lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
640 			hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
641 
642 			lo0 = lo0 * vLo;
643 			lo1 = lo1 * vLo;
644 			z = _mm_shuffle_ps(hi0, hi1, 0x88);
645 			x = _mm_shuffle_ps(lo0, lo1, 0x88);
646 			y = _mm_shuffle_ps(lo0, lo1, 0xdd);
647 			z = z * vHi;
648 			x = x + y;
649 			x = x + z;
650 			stack_array[index + 2] = x;
651 			min = _mm_min_ps(x, min);  // control the order here so that min is never NaN even if x is nan
652 
653 			v0 = vertices[0];
654 			v1 = vertices[1];
655 			v2 = vertices[2];
656 			v3 = vertices[3];
657 			vertices += 4;
658 
659 			lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
660 			hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
661 			lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
662 			hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
663 
664 			lo0 = lo0 * vLo;
665 			lo1 = lo1 * vLo;
666 			z = _mm_shuffle_ps(hi0, hi1, 0x88);
667 			x = _mm_shuffle_ps(lo0, lo1, 0x88);
668 			y = _mm_shuffle_ps(lo0, lo1, 0xdd);
669 			z = z * vHi;
670 			x = x + y;
671 			x = x + z;
672 			stack_array[index + 3] = x;
673 			min = _mm_min_ps(x, min);  // control the order here so that min is never NaN even if x is nan
674 
675 			// It is too costly to keep the index of the min here. We will look for it again later.  We save a lot of work this way.
676 		}
677 	}
678 
679 	size_t localCount = (count & -4L) - 4 * index;
680 	if (localCount)
681 	{
682 #ifdef __APPLE__
683 		vertices += localCount;  // counter the offset
684 		float4 t0, t1, t2, t3, t4;
685 		size_t byteIndex = -(localCount) * sizeof(float);
686 		float4 *sap = &stack_array[index + localCount / 4];
687 
688 		asm volatile(
689 			".align 4                                                                   \n\
690              0: movaps  %[min], %[t2]                            // move min out of the way to avoid propagating NaNs in min \n\
691              movaps  (%[vertices], %[byteIndex], 4),    %[t0]    // vertices[0]      \n\
692              movaps  16(%[vertices], %[byteIndex], 4),  %[t1]    // vertices[1]      \n\
693              movaps  %[t0], %[min]                               // vertices[0]      \n\
694              movlhps %[t1], %[min]                               // x0y0x1y1         \n\
695              movaps  32(%[vertices], %[byteIndex], 4),  %[t3]    // vertices[2]      \n\
696              movaps  48(%[vertices], %[byteIndex], 4),  %[t4]    // vertices[3]      \n\
697              mulps   %[vLo], %[min]                              // x0y0x1y1 * vLo   \n\
698              movhlps %[t0], %[t1]                                // z0w0z1w1         \n\
699              movaps  %[t3], %[t0]                                // vertices[2]      \n\
700              movlhps %[t4], %[t0]                                // x2y2x3y3         \n\
701              movhlps %[t3], %[t4]                                // z2w2z3w3         \n\
702              mulps   %[vLo], %[t0]                               // x2y2x3y3 * vLo   \n\
703              shufps  $0x88, %[t4], %[t1]                         // z0z1z2z3         \n\
704              mulps   %[vHi], %[t1]                               // z0z1z2z3 * vHi   \n\
705              movaps  %[min], %[t3]                               // x0y0x1y1 * vLo   \n\
706              shufps  $0x88, %[t0], %[min]                        // x0x1x2x3 * vLo.x \n\
707              shufps  $0xdd, %[t0], %[t3]                         // y0y1y2y3 * vLo.y \n\
708              addps   %[t3], %[min]                               // x + y            \n\
709              addps   %[t1], %[min]                               // x + y + z        \n\
710              movaps  %[min], (%[sap], %[byteIndex])              // record result for later scrutiny \n\
711              minps   %[t2], %[min]                               // record min, restore min   \n\
712              add     $16, %[byteIndex]                           // advance loop counter\n\
713              jnz     0b                                          \n\
714              "
715 			: [min] "+x"(min), [t0] "=&x"(t0), [t1] "=&x"(t1), [t2] "=&x"(t2), [t3] "=&x"(t3), [t4] "=&x"(t4), [byteIndex] "+r"(byteIndex)
716 			: [vLo] "x"(vLo), [vHi] "x"(vHi), [vertices] "r"(vertices), [sap] "r"(sap)
717 			: "memory", "cc");
718 		index += localCount / 4;
719 #else
720 		{
721 			for (unsigned int i = 0; i < localCount / 4; i++, index++)
722 			{  // do four dot products at a time. Carefully avoid touching the w element.
723 				float4 v0 = vertices[0];
724 				float4 v1 = vertices[1];
725 				float4 v2 = vertices[2];
726 				float4 v3 = vertices[3];
727 				vertices += 4;
728 
729 				float4 lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
730 				float4 hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
731 				float4 lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
732 				float4 hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
733 
734 				lo0 = lo0 * vLo;
735 				lo1 = lo1 * vLo;
736 				float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
737 				float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
738 				float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
739 				z = z * vHi;
740 				x = x + y;
741 				x = x + z;
742 				stack_array[index] = x;
743 				min = _mm_min_ps(x, min);  // control the order here so that min is never NaN even if x is nan
744 			}
745 		}
746 
747 #endif
748 	}
749 
750 	// process the last few points
751 	if (count & 3)
752 	{
753 		float4 v0, v1, v2, x, y, z;
754 		switch (count & 3)
755 		{
756 			case 3:
757 			{
758 				v0 = vertices[0];
759 				v1 = vertices[1];
760 				v2 = vertices[2];
761 
762 				// Calculate 3 dot products, transpose, duplicate v2
763 				float4 lo0 = _mm_movelh_ps(v0, v1);  // xyxy.lo
764 				float4 hi0 = _mm_movehl_ps(v1, v0);  // z?z?.lo
765 				lo0 = lo0 * vLo;
766 				z = _mm_shuffle_ps(hi0, v2, 0xa8);  // z0z1z2z2
767 				z = z * vHi;
768 				float4 lo1 = _mm_movelh_ps(v2, v2);  // xyxy
769 				lo1 = lo1 * vLo;
770 				x = _mm_shuffle_ps(lo0, lo1, 0x88);
771 				y = _mm_shuffle_ps(lo0, lo1, 0xdd);
772 			}
773 			break;
774 			case 2:
775 			{
776 				v0 = vertices[0];
777 				v1 = vertices[1];
778 				float4 xy = _mm_movelh_ps(v0, v1);
779 				z = _mm_movehl_ps(v1, v0);
780 				xy = xy * vLo;
781 				z = _mm_shuffle_ps(z, z, 0xa8);
782 				x = _mm_shuffle_ps(xy, xy, 0xa8);
783 				y = _mm_shuffle_ps(xy, xy, 0xfd);
784 				z = z * vHi;
785 			}
786 			break;
787 			case 1:
788 			{
789 				float4 xy = vertices[0];
790 				z = _mm_shuffle_ps(xy, xy, 0xaa);
791 				xy = xy * vLo;
792 				z = z * vHi;
793 				x = _mm_shuffle_ps(xy, xy, 0);
794 				y = _mm_shuffle_ps(xy, xy, 0x55);
795 			}
796 			break;
797 		}
798 		x = x + y;
799 		x = x + z;
800 		stack_array[index] = x;
801 		min = _mm_min_ps(x, min);  // control the order here so that min is never NaN even if x is nan
802 		index++;
803 	}
804 
805 	// if we found a new min.
806 	if (0 == segment || 0xf != _mm_movemask_ps((float4)_mm_cmpeq_ps(min, dotmin)))
807 	{  // we found a new min. Search for it
808 		// find min across the min vector, place in all elements of min -- big latency hit here
809 		min = _mm_min_ps(min, (float4)_mm_shuffle_ps(min, min, 0x4e));
810 		min = _mm_min_ps(min, (float4)_mm_shuffle_ps(min, min, 0xb1));
811 
812 		// It is slightly faster to do this part in scalar code when count < 8. However, the common case for
813 		// this where it actually makes a difference is handled in the early out at the top of the function,
814 		// so it is less than a 1% difference here. I opted for improved code size, fewer branches and reduced
815 		// complexity, and removed it.
816 
817 		dotmin = min;
818 
819 		// scan for the first occurrence of min in the array
820 		size_t test;
821 		for (index = 0; 0 == (test = _mm_movemask_ps(_mm_cmpeq_ps(stack_array[index], min))); index++)  // local_count must be a multiple of 4
822 		{
823 		}
824 		minIndex = 4 * index + segment + indexTable[test];
825 	}
826 
827 	_mm_store_ss(dotResult, dotmin);
828 	return minIndex;
829 }
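// _mindot_large above is the mirror image of _maxdot_large: the accumulator is seeded with
// +BT_INFINITY and _mm_min_ps/minps replace _mm_max_ps/maxps; everything else is identical.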
830 
831 #elif defined BT_USE_NEON
832 
833 #define ARM_NEON_GCC_COMPATIBILITY 1
834 #include <arm_neon.h>
835 #include <sys/types.h>
836 #include <sys/sysctl.h>  //for sysctlbyname
837 
838 static long _maxdot_large_v0(const float *vv, const float *vec, unsigned long count, float *dotResult);
839 static long _maxdot_large_v1(const float *vv, const float *vec, unsigned long count, float *dotResult);
840 static long _maxdot_large_sel(const float *vv, const float *vec, unsigned long count, float *dotResult);
841 static long _mindot_large_v0(const float *vv, const float *vec, unsigned long count, float *dotResult);
842 static long _mindot_large_v1(const float *vv, const float *vec, unsigned long count, float *dotResult);
843 static long _mindot_large_sel(const float *vv, const float *vec, unsigned long count, float *dotResult);
844 
845 long (*_maxdot_large)(const float *vv, const float *vec, unsigned long count, float *dotResult) = _maxdot_large_sel;
846 long (*_mindot_large)(const float *vv, const float *vec, unsigned long count, float *dotResult) = _mindot_large_sel;
847 
848 static inline uint32_t btGetCpuCapabilities(void)
849 {
850 	static uint32_t capabilities = 0;
851 	static bool testedCapabilities = false;
852 
853 	if (0 == testedCapabilities)
854 	{
855 		uint32_t hasFeature = 0;
856 		size_t featureSize = sizeof(hasFeature);
857 		int err = sysctlbyname("hw.optional.neon_hpfp", &hasFeature, &featureSize, NULL, 0);
858 
859 		if (0 == err && hasFeature)
860 			capabilities |= 0x2000;
861 
862 		testedCapabilities = true;
863 	}
864 
865 	return capabilities;
866 }
867 
868 static long _maxdot_large_sel(const float *vv, const float *vec, unsigned long count, float *dotResult)
869 {
870 	if (btGetCpuCapabilities() & 0x2000)
871 		_maxdot_large = _maxdot_large_v1;
872 	else
873 		_maxdot_large = _maxdot_large_v0;
874 
875 	return _maxdot_large(vv, vec, count, dotResult);
876 }
877 
878 static long _mindot_large_sel(const float *vv, const float *vec, unsigned long count, float *dotResult)
879 {
880 	if (btGetCpuCapabilities() & 0x2000)
881 		_mindot_large = _mindot_large_v1;
882 	else
883 		_mindot_large = _mindot_large_v0;
884 
885 	return _mindot_large(vv, vec, count, dotResult);
886 }
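/*
 _maxdot_large and _mindot_large are function pointers that start out aimed at the _sel variants
 above. On first use, the _sel function queries the CPU once via btGetCpuCapabilities, rebinds the
 pointer to the _v0 or _v1 kernel, and forwards the call, so subsequent calls dispatch directly.
*/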
887 
888 #if defined __arm__
889 #define vld1q_f32_aligned_postincrement(_ptr) ({ float32x4_t _r; asm( "vld1.f32 {%0}, [%1, :128]!\n" : "=w" (_r), "+r" (_ptr) ); /*return*/ _r; })
890 #else
891 // support 64-bit ARM
892 #define vld1q_f32_aligned_postincrement(_ptr) ({ float32x4_t _r = ((float32x4_t*)(_ptr))[0]; (_ptr) = (const float*) ((const char*)(_ptr) + 16L); /*return*/ _r; })
893 #endif
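// vld1q_f32_aligned_postincrement loads one 16-byte-aligned float32x4_t from _ptr and advances
// _ptr by 16 bytes: the 32-bit ARM version uses the vld1.f32 post-increment form, while the
// 64-bit version is a plain load followed by a pointer bump.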
894 
895 long _maxdot_large_v0(const float *vv, const float *vec, unsigned long count, float *dotResult)
896 {
897 	unsigned long i = 0;
898 	float32x4_t vvec = vld1q_f32_aligned_postincrement(vec);
899 	float32x2_t vLo = vget_low_f32(vvec);
900 	float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0);
901 	float32x2_t dotMaxLo = (float32x2_t){-BT_INFINITY, -BT_INFINITY};
902 	float32x2_t dotMaxHi = (float32x2_t){-BT_INFINITY, -BT_INFINITY};
903 	uint32x2_t indexLo = (uint32x2_t){0, 1};
904 	uint32x2_t indexHi = (uint32x2_t){2, 3};
905 	uint32x2_t iLo = (uint32x2_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
906 	uint32x2_t iHi = (uint32x2_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
907 	const uint32x2_t four = (uint32x2_t){4, 4};
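	// Four running maxima are tracked in two 2-lane accumulators (dotMaxLo/dotMaxHi) together with
	// the vertex indices that produced them (iLo/iHi); the lanes are reduced to a single winner
	// after the loops below.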
908 
909 	for (; i + 8 <= count; i += 8)
910 	{
911 		float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
912 		float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
913 		float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
914 		float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
915 
916 		float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
917 		float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
918 		float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
919 		float32x2_t xy3 = vmul_f32(vget_low_f32(v3), vLo);
920 
921 		float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
922 		float32x2x2_t z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
923 		float32x2_t zLo = vmul_f32(z0.val[0], vHi);
924 		float32x2_t zHi = vmul_f32(z1.val[0], vHi);
925 
926 		float32x2_t rLo = vpadd_f32(xy0, xy1);
927 		float32x2_t rHi = vpadd_f32(xy2, xy3);
928 		rLo = vadd_f32(rLo, zLo);
929 		rHi = vadd_f32(rHi, zHi);
930 
931 		uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
932 		uint32x2_t maskHi = vcgt_f32(rHi, dotMaxHi);
933 		dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
934 		dotMaxHi = vbsl_f32(maskHi, rHi, dotMaxHi);
935 		iLo = vbsl_u32(maskLo, indexLo, iLo);
936 		iHi = vbsl_u32(maskHi, indexHi, iHi);
937 		indexLo = vadd_u32(indexLo, four);
938 		indexHi = vadd_u32(indexHi, four);
939 
940 		v0 = vld1q_f32_aligned_postincrement(vv);
941 		v1 = vld1q_f32_aligned_postincrement(vv);
942 		v2 = vld1q_f32_aligned_postincrement(vv);
943 		v3 = vld1q_f32_aligned_postincrement(vv);
944 
945 		xy0 = vmul_f32(vget_low_f32(v0), vLo);
946 		xy1 = vmul_f32(vget_low_f32(v1), vLo);
947 		xy2 = vmul_f32(vget_low_f32(v2), vLo);
948 		xy3 = vmul_f32(vget_low_f32(v3), vLo);
949 
950 		z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
951 		z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
952 		zLo = vmul_f32(z0.val[0], vHi);
953 		zHi = vmul_f32(z1.val[0], vHi);
954 
955 		rLo = vpadd_f32(xy0, xy1);
956 		rHi = vpadd_f32(xy2, xy3);
957 		rLo = vadd_f32(rLo, zLo);
958 		rHi = vadd_f32(rHi, zHi);
959 
960 		maskLo = vcgt_f32(rLo, dotMaxLo);
961 		maskHi = vcgt_f32(rHi, dotMaxHi);
962 		dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
963 		dotMaxHi = vbsl_f32(maskHi, rHi, dotMaxHi);
964 		iLo = vbsl_u32(maskLo, indexLo, iLo);
965 		iHi = vbsl_u32(maskHi, indexHi, iHi);
966 		indexLo = vadd_u32(indexLo, four);
967 		indexHi = vadd_u32(indexHi, four);
968 	}
969 
970 	for (; i + 4 <= count; i += 4)
971 	{
972 		float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
973 		float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
974 		float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
975 		float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
976 
977 		float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
978 		float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
979 		float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
980 		float32x2_t xy3 = vmul_f32(vget_low_f32(v3), vLo);
981 
982 		float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
983 		float32x2x2_t z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
984 		float32x2_t zLo = vmul_f32(z0.val[0], vHi);
985 		float32x2_t zHi = vmul_f32(z1.val[0], vHi);
986 
987 		float32x2_t rLo = vpadd_f32(xy0, xy1);
988 		float32x2_t rHi = vpadd_f32(xy2, xy3);
989 		rLo = vadd_f32(rLo, zLo);
990 		rHi = vadd_f32(rHi, zHi);
991 
992 		uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
993 		uint32x2_t maskHi = vcgt_f32(rHi, dotMaxHi);
994 		dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
995 		dotMaxHi = vbsl_f32(maskHi, rHi, dotMaxHi);
996 		iLo = vbsl_u32(maskLo, indexLo, iLo);
997 		iHi = vbsl_u32(maskHi, indexHi, iHi);
998 		indexLo = vadd_u32(indexLo, four);
999 		indexHi = vadd_u32(indexHi, four);
1000 	}
1001 
1002 	switch (count & 3)
1003 	{
1004 		case 3:
1005 		{
1006 			float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1007 			float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1008 			float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
1009 
1010 			float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
1011 			float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
1012 			float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
1013 
1014 			float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
1015 			float32x2_t zLo = vmul_f32(z0.val[0], vHi);
1016 			float32x2_t zHi = vmul_f32(vdup_lane_f32(vget_high_f32(v2), 0), vHi);
1017 
1018 			float32x2_t rLo = vpadd_f32(xy0, xy1);
1019 			float32x2_t rHi = vpadd_f32(xy2, xy2);
1020 			rLo = vadd_f32(rLo, zLo);
1021 			rHi = vadd_f32(rHi, zHi);
1022 
1023 			uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
1024 			uint32x2_t maskHi = vcgt_f32(rHi, dotMaxHi);
1025 			dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
1026 			dotMaxHi = vbsl_f32(maskHi, rHi, dotMaxHi);
1027 			iLo = vbsl_u32(maskLo, indexLo, iLo);
1028 			iHi = vbsl_u32(maskHi, indexHi, iHi);
1029 		}
1030 		break;
1031 		case 2:
1032 		{
1033 			float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1034 			float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1035 
1036 			float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
1037 			float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
1038 
1039 			float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
1040 			float32x2_t zLo = vmul_f32(z0.val[0], vHi);
1041 
1042 			float32x2_t rLo = vpadd_f32(xy0, xy1);
1043 			rLo = vadd_f32(rLo, zLo);
1044 
1045 			uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
1046 			dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
1047 			iLo = vbsl_u32(maskLo, indexLo, iLo);
1048 		}
1049 		break;
1050 		case 1:
1051 		{
1052 			float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1053 			float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
1054 			float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
1055 			float32x2_t zLo = vmul_f32(z0, vHi);
1056 			float32x2_t rLo = vpadd_f32(xy0, xy0);
1057 			rLo = vadd_f32(rLo, zLo);
1058 			uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
1059 			dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
1060 			iLo = vbsl_u32(maskLo, indexLo, iLo);
1061 		}
1062 		break;
1063 
1064 		default:
1065 			break;
1066 	}
1067 
1068 	// select best answer between hi and lo results
1069 	uint32x2_t mask = vcgt_f32(dotMaxHi, dotMaxLo);
1070 	dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
1071 	iLo = vbsl_u32(mask, iHi, iLo);
1072 
1073 	// select best answer between even and odd results
1074 	dotMaxHi = vdup_lane_f32(dotMaxLo, 1);
1075 	iHi = vdup_lane_u32(iLo, 1);
1076 	mask = vcgt_f32(dotMaxHi, dotMaxLo);
1077 	dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
1078 	iLo = vbsl_u32(mask, iHi, iLo);
1079 
1080 	*dotResult = vget_lane_f32(dotMaxLo, 0);
1081 	return vget_lane_u32(iLo, 0);
1082 }
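/*
 _maxdot_large_v1 below does the same work as _maxdot_large_v0 but keeps the arithmetic in full
 128-bit q registers rather than 64-bit d-register halves; it is selected when the CPU reports
 hw.optional.neon_hpfp (see btGetCpuCapabilities above).
*/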
1083 
1084 long _maxdot_large_v1(const float *vv, const float *vec, unsigned long count, float *dotResult)
1085 {
1086 	float32x4_t vvec = vld1q_f32_aligned_postincrement(vec);
1087 	float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec));
1088 	float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);
1089 	const uint32x4_t four = (uint32x4_t){4, 4, 4, 4};
1090 	uint32x4_t local_index = (uint32x4_t){0, 1, 2, 3};
1091 	uint32x4_t index = (uint32x4_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
1092 	float32x4_t maxDot = (float32x4_t){-BT_INFINITY, -BT_INFINITY, -BT_INFINITY, -BT_INFINITY};
1093 
1094 	unsigned long i = 0;
1095 	for (; i + 8 <= count; i += 8)
1096 	{
1097 		float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1098 		float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1099 		float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
1100 		float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
1101 
1102 		// the next two lines should resolve to a single vswp d, d
1103 		float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
1104 		float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
1105 		// the next two lines should resolve to a single vswp d, d
1106 		float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
1107 		float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));
1108 
1109 		xy0 = vmulq_f32(xy0, vLo);
1110 		xy1 = vmulq_f32(xy1, vLo);
1111 
1112 		float32x4x2_t zb = vuzpq_f32(z0, z1);
1113 		float32x4_t z = vmulq_f32(zb.val[0], vHi);
1114 		float32x4x2_t xy = vuzpq_f32(xy0, xy1);
1115 		float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1116 		x = vaddq_f32(x, z);
1117 
1118 		uint32x4_t mask = vcgtq_f32(x, maxDot);
1119 		maxDot = vbslq_f32(mask, x, maxDot);
1120 		index = vbslq_u32(mask, local_index, index);
1121 		local_index = vaddq_u32(local_index, four);
1122 
1123 		v0 = vld1q_f32_aligned_postincrement(vv);
1124 		v1 = vld1q_f32_aligned_postincrement(vv);
1125 		v2 = vld1q_f32_aligned_postincrement(vv);
1126 		v3 = vld1q_f32_aligned_postincrement(vv);
1127 
1128 		// the next two lines should resolve to a single vswp d, d
1129 		xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
1130 		xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
1131 		// the next two lines should resolve to a single vswp d, d
1132 		z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
1133 		z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));
1134 
1135 		xy0 = vmulq_f32(xy0, vLo);
1136 		xy1 = vmulq_f32(xy1, vLo);
1137 
1138 		zb = vuzpq_f32(z0, z1);
1139 		z = vmulq_f32(zb.val[0], vHi);
1140 		xy = vuzpq_f32(xy0, xy1);
1141 		x = vaddq_f32(xy.val[0], xy.val[1]);
1142 		x = vaddq_f32(x, z);
1143 
1144 		mask = vcgtq_f32(x, maxDot);
1145 		maxDot = vbslq_f32(mask, x, maxDot);
1146 		index = vbslq_u32(mask, local_index, index);
1147 		local_index = vaddq_u32(local_index, four);
1148 	}
1149 
1150 	for (; i + 4 <= count; i += 4)
1151 	{
1152 		float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1153 		float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1154 		float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
1155 		float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
1156 
1157 		// the next two lines should resolve to a single vswp d, d
1158 		float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
1159 		float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
1160 		// the next two lines should resolve to a single vswp d, d
1161 		float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
1162 		float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));
1163 
1164 		xy0 = vmulq_f32(xy0, vLo);
1165 		xy1 = vmulq_f32(xy1, vLo);
1166 
1167 		float32x4x2_t zb = vuzpq_f32(z0, z1);
1168 		float32x4_t z = vmulq_f32(zb.val[0], vHi);
1169 		float32x4x2_t xy = vuzpq_f32(xy0, xy1);
1170 		float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1171 		x = vaddq_f32(x, z);
1172 
1173 		uint32x4_t mask = vcgtq_f32(x, maxDot);
1174 		maxDot = vbslq_f32(mask, x, maxDot);
1175 		index = vbslq_u32(mask, local_index, index);
1176 		local_index = vaddq_u32(local_index, four);
1177 	}
1178 
1179 	switch (count & 3)
1180 	{
1181 		case 3:
1182 		{
1183 			float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1184 			float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1185 			float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
1186 
1187 			// the next two lines should resolve to a single vswp d, d
1188 			float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
1189 			float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v2));
1190 			// the next two lines should resolve to a single vswp d, d
1191 			float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
1192 			float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v2));
1193 
1194 			xy0 = vmulq_f32(xy0, vLo);
1195 			xy1 = vmulq_f32(xy1, vLo);
1196 
1197 			float32x4x2_t zb = vuzpq_f32(z0, z1);
1198 			float32x4_t z = vmulq_f32(zb.val[0], vHi);
1199 			float32x4x2_t xy = vuzpq_f32(xy0, xy1);
1200 			float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1201 			x = vaddq_f32(x, z);
1202 
1203 			uint32x4_t mask = vcgtq_f32(x, maxDot);
1204 			maxDot = vbslq_f32(mask, x, maxDot);
1205 			index = vbslq_u32(mask, local_index, index);
1206 			local_index = vaddq_u32(local_index, four);
1207 		}
1208 		break;
1209 
1210 		case 2:
1211 		{
1212 			float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1213 			float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1214 
1215 			// the next two lines should resolve to a single vswp d, d
1216 			float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
1217 			// the next two lines should resolve to a single vswp d, d
1218 			float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
1219 
1220 			xy0 = vmulq_f32(xy0, vLo);
1221 
1222 			float32x4x2_t zb = vuzpq_f32(z0, z0);
1223 			float32x4_t z = vmulq_f32(zb.val[0], vHi);
1224 			float32x4x2_t xy = vuzpq_f32(xy0, xy0);
1225 			float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1226 			x = vaddq_f32(x, z);
1227 
1228 			uint32x4_t mask = vcgtq_f32(x, maxDot);
1229 			maxDot = vbslq_f32(mask, x, maxDot);
1230 			index = vbslq_u32(mask, local_index, index);
1231 			local_index = vaddq_u32(local_index, four);
1232 		}
1233 		break;
1234 
1235 		case 1:
1236 		{
1237 			float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1238 
1239 			// the next two lines should resolve to a single vswp d, d
1240 			float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v0));
1241 			// the next two lines should resolve to a single vswp d, d
1242 			float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0);
1243 
1244 			xy0 = vmulq_f32(xy0, vLo);
1245 
1246 			z = vmulq_f32(z, vHi);
1247 			float32x4x2_t xy = vuzpq_f32(xy0, xy0);
1248 			float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1249 			x = vaddq_f32(x, z);
1250 
1251 			uint32x4_t mask = vcgtq_f32(x, maxDot);
1252 			maxDot = vbslq_f32(mask, x, maxDot);
1253 			index = vbslq_u32(mask, local_index, index);
1254 			local_index = vaddq_u32(local_index, four);
1255 		}
1256 		break;
1257 
1258 		default:
1259 			break;
1260 	}
1261 
1262 	// select best answer between hi and lo results
1263 	uint32x2_t mask = vcgt_f32(vget_high_f32(maxDot), vget_low_f32(maxDot));
1264 	float32x2_t maxDot2 = vbsl_f32(mask, vget_high_f32(maxDot), vget_low_f32(maxDot));
1265 	uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));
1266 
1267 	// select best answer between even and odd results
1268 	float32x2_t maxDotO = vdup_lane_f32(maxDot2, 1);
1269 	uint32x2_t indexHi = vdup_lane_u32(index2, 1);
1270 	mask = vcgt_f32(maxDotO, maxDot2);
1271 	maxDot2 = vbsl_f32(mask, maxDotO, maxDot2);
1272 	index2 = vbsl_u32(mask, indexHi, index2);
1273 
1274 	*dotResult = vget_lane_f32(maxDot2, 0);
1275 	return vget_lane_u32(index2, 0);
1276 }
1277 
1278 long _mindot_large_v0(const float *vv, const float *vec, unsigned long count, float *dotResult)
1279 {
1280 	unsigned long i = 0;
1281 	float32x4_t vvec = vld1q_f32_aligned_postincrement(vec);
1282 	float32x2_t vLo = vget_low_f32(vvec);
1283 	float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0);
1284 	float32x2_t dotMinLo = (float32x2_t){BT_INFINITY, BT_INFINITY};
1285 	float32x2_t dotMinHi = (float32x2_t){BT_INFINITY, BT_INFINITY};
1286 	uint32x2_t indexLo = (uint32x2_t){0, 1};
1287 	uint32x2_t indexHi = (uint32x2_t){2, 3};
1288 	uint32x2_t iLo = (uint32x2_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
1289 	uint32x2_t iHi = (uint32x2_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
1290 	const uint32x2_t four = (uint32x2_t){4, 4};
1291 
1292 	for (; i + 8 <= count; i += 8)
1293 	{
1294 		float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1295 		float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1296 		float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
1297 		float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
1298 
1299 		float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
1300 		float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
1301 		float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
1302 		float32x2_t xy3 = vmul_f32(vget_low_f32(v3), vLo);
1303 
1304 		float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
1305 		float32x2x2_t z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
1306 		float32x2_t zLo = vmul_f32(z0.val[0], vHi);
1307 		float32x2_t zHi = vmul_f32(z1.val[0], vHi);
1308 
1309 		float32x2_t rLo = vpadd_f32(xy0, xy1);
1310 		float32x2_t rHi = vpadd_f32(xy2, xy3);
1311 		rLo = vadd_f32(rLo, zLo);
1312 		rHi = vadd_f32(rHi, zHi);
1313 
1314 		uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
1315 		uint32x2_t maskHi = vclt_f32(rHi, dotMinHi);
1316 		dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
1317 		dotMinHi = vbsl_f32(maskHi, rHi, dotMinHi);
1318 		iLo = vbsl_u32(maskLo, indexLo, iLo);
1319 		iHi = vbsl_u32(maskHi, indexHi, iHi);
1320 		indexLo = vadd_u32(indexLo, four);
1321 		indexHi = vadd_u32(indexHi, four);
1322 
1323 		v0 = vld1q_f32_aligned_postincrement(vv);
1324 		v1 = vld1q_f32_aligned_postincrement(vv);
1325 		v2 = vld1q_f32_aligned_postincrement(vv);
1326 		v3 = vld1q_f32_aligned_postincrement(vv);
1327 
1328 		xy0 = vmul_f32(vget_low_f32(v0), vLo);
1329 		xy1 = vmul_f32(vget_low_f32(v1), vLo);
1330 		xy2 = vmul_f32(vget_low_f32(v2), vLo);
1331 		xy3 = vmul_f32(vget_low_f32(v3), vLo);
1332 
1333 		z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
1334 		z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
1335 		zLo = vmul_f32(z0.val[0], vHi);
1336 		zHi = vmul_f32(z1.val[0], vHi);
1337 
1338 		rLo = vpadd_f32(xy0, xy1);
1339 		rHi = vpadd_f32(xy2, xy3);
1340 		rLo = vadd_f32(rLo, zLo);
1341 		rHi = vadd_f32(rHi, zHi);
1342 
1343 		maskLo = vclt_f32(rLo, dotMinLo);
1344 		maskHi = vclt_f32(rHi, dotMinHi);
1345 		dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
1346 		dotMinHi = vbsl_f32(maskHi, rHi, dotMinHi);
1347 		iLo = vbsl_u32(maskLo, indexLo, iLo);
1348 		iHi = vbsl_u32(maskHi, indexHi, iHi);
1349 		indexLo = vadd_u32(indexLo, four);
1350 		indexHi = vadd_u32(indexHi, four);
1351 	}
1352 
1353 	for (; i + 4 <= count; i += 4)
1354 	{
1355 		float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1356 		float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1357 		float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
1358 		float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
1359 
1360 		float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
1361 		float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
1362 		float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
1363 		float32x2_t xy3 = vmul_f32(vget_low_f32(v3), vLo);
1364 
1365 		float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
1366 		float32x2x2_t z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
1367 		float32x2_t zLo = vmul_f32(z0.val[0], vHi);
1368 		float32x2_t zHi = vmul_f32(z1.val[0], vHi);
1369 
1370 		float32x2_t rLo = vpadd_f32(xy0, xy1);
1371 		float32x2_t rHi = vpadd_f32(xy2, xy3);
1372 		rLo = vadd_f32(rLo, zLo);
1373 		rHi = vadd_f32(rHi, zHi);
1374 
1375 		uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
1376 		uint32x2_t maskHi = vclt_f32(rHi, dotMinHi);
1377 		dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
1378 		dotMinHi = vbsl_f32(maskHi, rHi, dotMinHi);
1379 		iLo = vbsl_u32(maskLo, indexLo, iLo);
1380 		iHi = vbsl_u32(maskHi, indexHi, iHi);
1381 		indexLo = vadd_u32(indexLo, four);
1382 		indexHi = vadd_u32(indexHi, four);
1383 	}
	switch (count & 3)
	{
		case 3:
		{
			float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
			float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
			float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);

			float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
			float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
			float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);

			float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
			float32x2_t zLo = vmul_f32(z0.val[0], vHi);
			float32x2_t zHi = vmul_f32(vdup_lane_f32(vget_high_f32(v2), 0), vHi);

			float32x2_t rLo = vpadd_f32(xy0, xy1);
			float32x2_t rHi = vpadd_f32(xy2, xy2);
			rLo = vadd_f32(rLo, zLo);
			rHi = vadd_f32(rHi, zHi);

			uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
			uint32x2_t maskHi = vclt_f32(rHi, dotMinHi);
			dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
			dotMinHi = vbsl_f32(maskHi, rHi, dotMinHi);
			iLo = vbsl_u32(maskLo, indexLo, iLo);
			iHi = vbsl_u32(maskHi, indexHi, iHi);
		}
		break;
		case 2:
		{
			float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
			float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);

			float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
			float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);

			float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
			float32x2_t zLo = vmul_f32(z0.val[0], vHi);

			float32x2_t rLo = vpadd_f32(xy0, xy1);
			rLo = vadd_f32(rLo, zLo);

			uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
			dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
			iLo = vbsl_u32(maskLo, indexLo, iLo);
		}
		break;
		case 1:
		{
			float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
			float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
			float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
			float32x2_t zLo = vmul_f32(z0, vHi);
			float32x2_t rLo = vpadd_f32(xy0, xy0);
			rLo = vadd_f32(rLo, zLo);
			uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
			dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
			iLo = vbsl_u32(maskLo, indexLo, iLo);
		}
		break;

		default:
			break;
	}

	// select best answer between hi and lo results
	uint32x2_t mask = vclt_f32(dotMinHi, dotMinLo);
	dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
	iLo = vbsl_u32(mask, iHi, iLo);

	// select best answer between even and odd results
	dotMinHi = vdup_lane_f32(dotMinLo, 1);
	iHi = vdup_lane_u32(iLo, 1);
	mask = vclt_f32(dotMinHi, dotMinLo);
	dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
	iLo = vbsl_u32(mask, iHi, iLo);

	*dotResult = vget_lane_f32(dotMinLo, 0);
	return vget_lane_u32(iLo, 0);
}

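// Scans count vertices (stored as aligned xyzw quadruples in vv) and returns the
// index of the vertex with the smallest dot product against vec, writing that
// minimum value to *dotResult.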
long _mindot_large_v1(const float *vv, const float *vec, unsigned long count, float *dotResult)
{
	float32x4_t vvec = vld1q_f32_aligned_postincrement(vec);
	float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec));
	float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);
	const uint32x4_t four = (uint32x4_t){4, 4, 4, 4};
	uint32x4_t local_index = (uint32x4_t){0, 1, 2, 3};
	uint32x4_t index = (uint32x4_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
	float32x4_t minDot = (float32x4_t){BT_INFINITY, BT_INFINITY, BT_INFINITY, BT_INFINITY};

	unsigned long i = 0;
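	// minDot holds four running minima (one per lane), index holds the vertex
	// index that produced each lane's minimum, and local_index numbers the four
	// candidates currently in flight.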
	for (; i + 8 <= count; i += 8)
	{
		float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
		float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
		float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
		float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);

		// the next two lines should resolve to a single vswp d, d
		float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
		float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
		// the next two lines should resolve to a single vswp d, d
		float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
		float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));

		xy0 = vmulq_f32(xy0, vLo);
		xy1 = vmulq_f32(xy1, vLo);

		float32x4x2_t zb = vuzpq_f32(z0, z1);
		float32x4_t z = vmulq_f32(zb.val[0], vHi);
		float32x4x2_t xy = vuzpq_f32(xy0, xy1);
		float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
		x = vaddq_f32(x, z);
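		// x now holds the four dot products of v0..v3 with vec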

		uint32x4_t mask = vcltq_f32(x, minDot);
		minDot = vbslq_f32(mask, x, minDot);
		index = vbslq_u32(mask, local_index, index);
		local_index = vaddq_u32(local_index, four);

		v0 = vld1q_f32_aligned_postincrement(vv);
		v1 = vld1q_f32_aligned_postincrement(vv);
		v2 = vld1q_f32_aligned_postincrement(vv);
		v3 = vld1q_f32_aligned_postincrement(vv);

		// the next two lines should resolve to a single vswp d, d
		xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
		xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
		// the next two lines should resolve to a single vswp d, d
		z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
		z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));

		xy0 = vmulq_f32(xy0, vLo);
		xy1 = vmulq_f32(xy1, vLo);

		zb = vuzpq_f32(z0, z1);
		z = vmulq_f32(zb.val[0], vHi);
		xy = vuzpq_f32(xy0, xy1);
		x = vaddq_f32(xy.val[0], xy.val[1]);
		x = vaddq_f32(x, z);

		mask = vcltq_f32(x, minDot);
		minDot = vbslq_f32(mask, x, minDot);
		index = vbslq_u32(mask, local_index, index);
		local_index = vaddq_u32(local_index, four);
	}

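	// Mop up one remaining group of four vertices, if any, with the same per-lane
	// compare-and-select update.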
	for (; i + 4 <= count; i += 4)
	{
		float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
		float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
		float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
		float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);

		// the next two lines should resolve to a single vswp d, d
		float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
		float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
		// the next two lines should resolve to a single vswp d, d
		float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
		float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));

		xy0 = vmulq_f32(xy0, vLo);
		xy1 = vmulq_f32(xy1, vLo);

		float32x4x2_t zb = vuzpq_f32(z0, z1);
		float32x4_t z = vmulq_f32(zb.val[0], vHi);
		float32x4x2_t xy = vuzpq_f32(xy0, xy1);
		float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
		x = vaddq_f32(x, z);

		uint32x4_t mask = vcltq_f32(x, minDot);
		minDot = vbslq_f32(mask, x, minDot);
		index = vbslq_u32(mask, local_index, index);
		local_index = vaddq_u32(local_index, four);
	}

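	// Handle the remaining (count & 3) vertices; as before, the partial cases pad
	// with duplicated vertices, which the strict < reductions resolve to the
	// lower, valid index.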
	switch (count & 3)
	{
		case 3:
		{
			float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
			float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
			float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);

			// the next two lines should resolve to a single vswp d, d
			float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
			float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v2));
			// the next two lines should resolve to a single vswp d, d
			float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
			float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v2));

			xy0 = vmulq_f32(xy0, vLo);
			xy1 = vmulq_f32(xy1, vLo);

			float32x4x2_t zb = vuzpq_f32(z0, z1);
			float32x4_t z = vmulq_f32(zb.val[0], vHi);
			float32x4x2_t xy = vuzpq_f32(xy0, xy1);
			float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
			x = vaddq_f32(x, z);

			uint32x4_t mask = vcltq_f32(x, minDot);
			minDot = vbslq_f32(mask, x, minDot);
			index = vbslq_u32(mask, local_index, index);
			local_index = vaddq_u32(local_index, four);
		}
		break;

		case 2:
		{
			float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
			float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);

			// gather the xy (low) halves of v0 and v1 into one register
			float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
			// gather the z (high) halves of v0 and v1 into one register
			float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));

			xy0 = vmulq_f32(xy0, vLo);

			float32x4x2_t zb = vuzpq_f32(z0, z0);
			float32x4_t z = vmulq_f32(zb.val[0], vHi);
			float32x4x2_t xy = vuzpq_f32(xy0, xy0);
			float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
			x = vaddq_f32(x, z);

			uint32x4_t mask = vcltq_f32(x, minDot);
			minDot = vbslq_f32(mask, x, minDot);
			index = vbslq_u32(mask, local_index, index);
			local_index = vaddq_u32(local_index, four);
		}
		break;

		case 1:
		{
			float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);

			// duplicate the xy (low) half of v0 across the register
			float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v0));
			// broadcast v0's z component to all four lanes
			float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0);

			xy0 = vmulq_f32(xy0, vLo);

			z = vmulq_f32(z, vHi);
			float32x4x2_t xy = vuzpq_f32(xy0, xy0);
			float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
			x = vaddq_f32(x, z);

			uint32x4_t mask = vcltq_f32(x, minDot);
			minDot = vbslq_f32(mask, x, minDot);
			index = vbslq_u32(mask, local_index, index);
			local_index = vaddq_u32(local_index, four);
		}
		break;

		default:
			break;
	}

	// select best answer between hi and lo results
	uint32x2_t mask = vclt_f32(vget_high_f32(minDot), vget_low_f32(minDot));
	float32x2_t minDot2 = vbsl_f32(mask, vget_high_f32(minDot), vget_low_f32(minDot));
	uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));

	// select best answer between even and odd results
	float32x2_t minDotO = vdup_lane_f32(minDot2, 1);
	uint32x2_t indexHi = vdup_lane_u32(index2, 1);
	mask = vclt_f32(minDotO, minDot2);
	minDot2 = vbsl_f32(mask, minDotO, minDot2);
	index2 = vbsl_u32(mask, indexHi, index2);

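	// lane 0 of minDot2 / index2 now holds the overall minimum and its vertex index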
	*dotResult = vget_lane_f32(minDot2, 0);
	return vget_lane_u32(index2, 0);
}

#else
#error Unhandled __APPLE__ arch
#endif

#endif /* __APPLE__ */