Bullet Collision Detection & Physics Library
btVector3.cpp
1 /*
2  Copyright (c) 2011 Apple Inc.
3  http://continuousphysics.com/Bullet/
4 
5  This software is provided 'as-is', without any express or implied warranty.
6  In no event will the authors be held liable for any damages arising from the use of this software.
7  Permission is granted to anyone to use this software for any purpose,
8  including commercial applications, and to alter it and redistribute it freely,
9  subject to the following restrictions:
10 
11  1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
12  2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
13  3. This notice may not be removed or altered from any source distribution.
14 
15  This source version has been altered.
16  */
17 
18 #if defined (_WIN32) || defined (__i386__)
19 #define BT_USE_SSE_IN_API
20 #endif
21 
22 
23 #include "btVector3.h"
24 
25 
26 
27 #if defined BT_USE_SIMD_VECTOR3
28 
29 #if DEBUG
30 #include <string.h>//for memset
31 #endif
32 
33 
34 #ifdef __APPLE__
35 #include <stdint.h>
36 typedef float float4 __attribute__ ((vector_size(16)));
37 #else
38 #define float4 __m128
39 #endif
40 //typedef uint32_t uint4 __attribute__ ((vector_size(16)));
41 
42 
43 #if defined BT_USE_SSE || defined _WIN32
44 
45 #define LOG2_ARRAY_SIZE 6
46 #define STACK_ARRAY_COUNT (1UL << LOG2_ARRAY_SIZE)
47 
48 #include <emmintrin.h>
49 
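/*
 _maxdot_large scans `count` vertices, each stored as a 16-byte float4 whose w
 component is ignored, and returns the index of the vertex with the largest dot
 product against the 3-component direction `vec`; the winning value is written
 to *dotResult. A minimal scalar sketch of that contract (illustrative only,
 not part of the library) would be:

     long maxdot_scalar( const float *vv, const float *vec, unsigned long count, float *dotResult )
     {
         long bestIndex = -1L;
         float bestDot = -BT_INFINITY;
         for( unsigned long i = 0; i < count; i++ )
         {
             const float *v = vv + 4*i;   // x, y, z, (ignored padding)
             float d = v[0]*vec[0] + v[1]*vec[1] + v[2]*vec[2];
             if( d > bestDot )
             {
                 bestDot = d;
                 bestIndex = (long) i;
             }
         }
         *dotResult = bestDot;
         return bestIndex;
     }

 The SSE code below reaches the same answer four dot products at a time and
 defers the index search: results are parked in stack_array and only rescanned
 when the running maximum actually changes. _mindot_large further down is the
 mirror image with min in place of max.
*/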
50 long _maxdot_large( const float *vv, const float *vec, unsigned long count, float *dotResult );
51 long _maxdot_large( const float *vv, const float *vec, unsigned long count, float *dotResult )
52 {
53  const float4 *vertices = (const float4*) vv;
54  static const unsigned char indexTable[16] = {(unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 };
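 // indexTable above maps the 4-bit result of _mm_movemask_ps to the position of
 // its lowest set lane (entry 0 is never used: the rescans below only index it
 // with a non-zero mask). For example, a mask of 0b0100 yields lane 2.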
55  float4 dotMax = btAssign128( -BT_INFINITY, -BT_INFINITY, -BT_INFINITY, -BT_INFINITY );
56  float4 vvec = _mm_loadu_ps( vec );
57  float4 vHi = btCastiTo128f(_mm_shuffle_epi32( btCastfTo128i( vvec), 0xaa ));
58  float4 vLo = _mm_movelh_ps( vvec, vvec );
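 // vLo holds (vec.x, vec.y, vec.x, vec.y) and vHi broadcasts vec.z, so a group
 // of four vertices needs no per-vertex horizontal add: the 0x88/0xdd shuffles
 // of lo0/lo1 produce the x*vec.x and y*vec.y terms for all four, and the 0x88
 // shuffle of hi0/hi1 times vHi supplies the z*vec.z term.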
59 
60  long maxIndex = -1L;
61 
62  size_t segment = 0;
63  float4 stack_array[ STACK_ARRAY_COUNT ];
64 
65 #if DEBUG
66  memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );
67 #endif
68 
69  size_t index;
70  float4 max;
71  // Faster loop without cleanup code for full tiles
72  for ( segment = 0; segment + STACK_ARRAY_COUNT*4 <= count; segment += STACK_ARRAY_COUNT*4 )
73  {
74  max = dotMax;
75 
76  for( index = 0; index < STACK_ARRAY_COUNT; index+= 4 )
77  { // do four dot products at a time. Carefully avoid touching the w element.
78  float4 v0 = vertices[0];
79  float4 v1 = vertices[1];
80  float4 v2 = vertices[2];
81  float4 v3 = vertices[3]; vertices += 4;
82 
83  float4 lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
84  float4 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
85  float4 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
86  float4 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
87 
88  lo0 = lo0*vLo;
89  lo1 = lo1*vLo;
90  float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
91  float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
92  float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
93  z = z*vHi;
94  x = x+y;
95  x = x+z;
96  stack_array[index] = x;
97  max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan
98 
99  v0 = vertices[0];
100  v1 = vertices[1];
101  v2 = vertices[2];
102  v3 = vertices[3]; vertices += 4;
103 
104  lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
105  hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
106  lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
107  hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
108 
109  lo0 = lo0*vLo;
110  lo1 = lo1*vLo;
111  z = _mm_shuffle_ps(hi0, hi1, 0x88);
112  x = _mm_shuffle_ps(lo0, lo1, 0x88);
113  y = _mm_shuffle_ps(lo0, lo1, 0xdd);
114  z = z*vHi;
115  x = x+y;
116  x = x+z;
117  stack_array[index+1] = x;
118  max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan
119 
120  v0 = vertices[0];
121  v1 = vertices[1];
122  v2 = vertices[2];
123  v3 = vertices[3]; vertices += 4;
124 
125  lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
126  hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
127  lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
128  hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
129 
130  lo0 = lo0*vLo;
131  lo1 = lo1*vLo;
132  z = _mm_shuffle_ps(hi0, hi1, 0x88);
133  x = _mm_shuffle_ps(lo0, lo1, 0x88);
134  y = _mm_shuffle_ps(lo0, lo1, 0xdd);
135  z = z*vHi;
136  x = x+y;
137  x = x+z;
138  stack_array[index+2] = x;
139  max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan
140 
141  v0 = vertices[0];
142  v1 = vertices[1];
143  v2 = vertices[2];
144  v3 = vertices[3]; vertices += 4;
145 
146  lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
147  hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
148  lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
149  hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
150 
151  lo0 = lo0*vLo;
152  lo1 = lo1*vLo;
153  z = _mm_shuffle_ps(hi0, hi1, 0x88);
154  x = _mm_shuffle_ps(lo0, lo1, 0x88);
155  y = _mm_shuffle_ps(lo0, lo1, 0xdd);
156  z = z*vHi;
157  x = x+y;
158  x = x+z;
159  stack_array[index+3] = x;
160  max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan
161 
162  // It is too costly to keep the index of the max here. We will look for it again later. We save a lot of work this way.
163  }
164 
165  // If we found a new max
166  if( 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(max, dotMax)))
167  {
168  // copy the new max across all lanes of our max accumulator
169  max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0x4e));
170  max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0xb1));
171 
172  dotMax = max;
173 
174  // find first occurrence of that max
175  size_t test;
176  for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], max))); index++ ) // local_count must be a multiple of 4
177  {}
178  // record where it is.
179  maxIndex = 4*index + segment + indexTable[test];
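 // 4*index converts float4 slots back to vertex counts, segment accounts for
 // vertices retired in earlier full tiles, and indexTable picks the matching
 // lane out of the compare mask.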
180  }
181  }
182 
183  // account for work we've already done
184  count -= segment;
185 
186  // Deal with the last < STACK_ARRAY_COUNT*4 vertices
187  max = dotMax;
188  index = 0;
189 
190 
191  if( btUnlikely( count > 16) )
192  {
193  for( ; index + 4 <= count / 4; index+=4 )
194  { // do four dot products at a time. Carefully avoid touching the w element.
195  float4 v0 = vertices[0];
196  float4 v1 = vertices[1];
197  float4 v2 = vertices[2];
198  float4 v3 = vertices[3]; vertices += 4;
199 
200  float4 lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
201  float4 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
202  float4 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
203  float4 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
204 
205  lo0 = lo0*vLo;
206  lo1 = lo1*vLo;
207  float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
208  float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
209  float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
210  z = z*vHi;
211  x = x+y;
212  x = x+z;
213  stack_array[index] = x;
214  max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan
215 
216  v0 = vertices[0];
217  v1 = vertices[1];
218  v2 = vertices[2];
219  v3 = vertices[3]; vertices += 4;
220 
221  lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
222  hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
223  lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
224  hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
225 
226  lo0 = lo0*vLo;
227  lo1 = lo1*vLo;
228  z = _mm_shuffle_ps(hi0, hi1, 0x88);
229  x = _mm_shuffle_ps(lo0, lo1, 0x88);
230  y = _mm_shuffle_ps(lo0, lo1, 0xdd);
231  z = z*vHi;
232  x = x+y;
233  x = x+z;
234  stack_array[index+1] = x;
235  max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan
236 
237  v0 = vertices[0];
238  v1 = vertices[1];
239  v2 = vertices[2];
240  v3 = vertices[3]; vertices += 4;
241 
242  lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
243  hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
244  lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
245  hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
246 
247  lo0 = lo0*vLo;
248  lo1 = lo1*vLo;
249  z = _mm_shuffle_ps(hi0, hi1, 0x88);
250  x = _mm_shuffle_ps(lo0, lo1, 0x88);
251  y = _mm_shuffle_ps(lo0, lo1, 0xdd);
252  z = z*vHi;
253  x = x+y;
254  x = x+z;
255  stack_array[index+2] = x;
256  max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan
257 
258  v0 = vertices[0];
259  v1 = vertices[1];
260  v2 = vertices[2];
261  v3 = vertices[3]; vertices += 4;
262 
263  lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
264  hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
265  lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
266  hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
267 
268  lo0 = lo0*vLo;
269  lo1 = lo1*vLo;
270  z = _mm_shuffle_ps(hi0, hi1, 0x88);
271  x = _mm_shuffle_ps(lo0, lo1, 0x88);
272  y = _mm_shuffle_ps(lo0, lo1, 0xdd);
273  z = z*vHi;
274  x = x+y;
275  x = x+z;
276  stack_array[index+3] = x;
277  max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan
278 
279  // It is too costly to keep the index of the max here. We will look for it again later. We save a lot of work this way.
280  }
281  }
282 
283  size_t localCount = (count & -4L) - 4*index;
284  if( localCount )
285  {
286 #ifdef __APPLE__
287  float4 t0, t1, t2, t3, t4;
288  float4 * sap = &stack_array[index + localCount / 4];
289  vertices += localCount; // counter the offset
290  size_t byteIndex = -(localCount) * sizeof(float);
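 // The assembly loop below counts byteIndex up from -localCount*4 to zero in
 // steps of 16: with `vertices` already advanced past the block and a scale of
 // 4 in the addressing, (vertices, byteIndex, 4) walks the original float4s in
 // order, the unscaled (sap, byteIndex) stores land in the matching stack_array
 // slots, and the jnz falls through exactly when byteIndex reaches zero.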
291  //AT&T Code style assembly
292  asm volatile
293  ( ".align 4 \n\
294  0: movaps %[max], %[t2] // move max out of the way to avoid propagating NaNs in max \n\
295  movaps (%[vertices], %[byteIndex], 4), %[t0] // vertices[0] \n\
296  movaps 16(%[vertices], %[byteIndex], 4), %[t1] // vertices[1] \n\
297  movaps %[t0], %[max] // vertices[0] \n\
298  movlhps %[t1], %[max] // x0y0x1y1 \n\
299  movaps 32(%[vertices], %[byteIndex], 4), %[t3] // vertices[2] \n\
300  movaps 48(%[vertices], %[byteIndex], 4), %[t4] // vertices[3] \n\
301  mulps %[vLo], %[max] // x0y0x1y1 * vLo \n\
302  movhlps %[t0], %[t1] // z0w0z1w1 \n\
303  movaps %[t3], %[t0] // vertices[2] \n\
304  movlhps %[t4], %[t0] // x2y2x3y3 \n\
305  mulps %[vLo], %[t0] // x2y2x3y3 * vLo \n\
306  movhlps %[t3], %[t4] // z2w2z3w3 \n\
307  shufps $0x88, %[t4], %[t1] // z0z1z2z3 \n\
308  mulps %[vHi], %[t1] // z0z1z2z3 * vHi \n\
309  movaps %[max], %[t3] // x0y0x1y1 * vLo \n\
310  shufps $0x88, %[t0], %[max] // x0x1x2x3 * vLo.x \n\
311  shufps $0xdd, %[t0], %[t3] // y0y1y2y3 * vLo.y \n\
312  addps %[t3], %[max] // x + y \n\
313  addps %[t1], %[max] // x + y + z \n\
314  movaps %[max], (%[sap], %[byteIndex]) // record result for later scrutiny \n\
315  maxps %[t2], %[max] // record max, restore max \n\
316  add $16, %[byteIndex] // advance loop counter\n\
317  jnz 0b \n\
318  "
319  : [max] "+x" (max), [t0] "=&x" (t0), [t1] "=&x" (t1), [t2] "=&x" (t2), [t3] "=&x" (t3), [t4] "=&x" (t4), [byteIndex] "+r" (byteIndex)
320  : [vLo] "x" (vLo), [vHi] "x" (vHi), [vertices] "r" (vertices), [sap] "r" (sap)
321  : "memory", "cc"
322  );
323  index += localCount/4;
324 #else
325  {
326  for( unsigned int i=0; i<localCount/4; i++,index++)
327  { // do four dot products at a time. Carefully avoid touching the w element.
328  float4 v0 = vertices[0];
329  float4 v1 = vertices[1];
330  float4 v2 = vertices[2];
331  float4 v3 = vertices[3];
332  vertices += 4;
333 
334  float4 lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
335  float4 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
336  float4 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
337  float4 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
338 
339  lo0 = lo0*vLo;
340  lo1 = lo1*vLo;
341  float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
342  float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
343  float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
344  z = z*vHi;
345  x = x+y;
346  x = x+z;
347  stack_array[index] = x;
348  max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan
349  }
350  }
351 #endif //__APPLE__
352  }
353 
354  // process the last few points
355  if( count & 3 )
356  {
357  float4 v0, v1, v2, x, y, z;
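 // Each case pads the 1-3 leftover vertices up to a full lane group by
 // duplicating the last one; the duplicate lanes can only tie, never beat, a
 // real vertex, and on a tie indexTable picks the lowest (real) lane.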
358  switch( count & 3 )
359  {
360  case 3:
361  {
362  v0 = vertices[0];
363  v1 = vertices[1];
364  v2 = vertices[2];
365 
366  // Calculate 3 dot products, transpose, duplicate v2
367  float4 lo0 = _mm_movelh_ps( v0, v1); // xyxy.lo
368  float4 hi0 = _mm_movehl_ps( v1, v0); // z?z?.lo
369  lo0 = lo0*vLo;
370  z = _mm_shuffle_ps(hi0, v2, 0xa8 ); // z0z1z2z2
371  z = z*vHi;
372  float4 lo1 = _mm_movelh_ps(v2, v2); // xyxy
373  lo1 = lo1*vLo;
374  x = _mm_shuffle_ps(lo0, lo1, 0x88);
375  y = _mm_shuffle_ps(lo0, lo1, 0xdd);
376  }
377  break;
378  case 2:
379  {
380  v0 = vertices[0];
381  v1 = vertices[1];
382  float4 xy = _mm_movelh_ps(v0, v1);
383  z = _mm_movehl_ps(v1, v0);
384  xy = xy*vLo;
385  z = _mm_shuffle_ps( z, z, 0xa8);
386  x = _mm_shuffle_ps( xy, xy, 0xa8);
387  y = _mm_shuffle_ps( xy, xy, 0xfd);
388  z = z*vHi;
389  }
390  break;
391  case 1:
392  {
393  float4 xy = vertices[0];
394  z = _mm_shuffle_ps( xy, xy, 0xaa);
395  xy = xy*vLo;
396  z = z*vHi;
397  x = _mm_shuffle_ps(xy, xy, 0);
398  y = _mm_shuffle_ps(xy, xy, 0x55);
399  }
400  break;
401  }
402  x = x+y;
403  x = x+z;
404  stack_array[index] = x;
405  max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan
406  index++;
407  }
408 
409  // if we found a new max.
410  if( 0 == segment || 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(max, dotMax)))
411  { // we found a new max. Search for it
412  // find max across the max vector, place in all elements of max -- big latency hit here
413  max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0x4e));
414  max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0xb1));
415 
416  // It is slightly faster to do this part in scalar code when count < 8. However, the common case for
417  // this where it actually makes a difference is handled in the early out at the top of the function,
418  // so it is less than a 1% difference here. I opted for improved code size, fewer branches and reduced
419  // complexity, and removed it.
420 
421  dotMax = max;
422 
423  // scan for the first occurrence of max in the array
424  size_t test;
425  for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], max))); index++ ) // local_count must be a multiple of 4
426  {}
427  maxIndex = 4*index + segment + indexTable[test];
428  }
429 
430  _mm_store_ss( dotResult, dotMax);
431  return maxIndex;
432 }
433 
434 long _mindot_large( const float *vv, const float *vec, unsigned long count, float *dotResult );
435 
436 long _mindot_large( const float *vv, const float *vec, unsigned long count, float *dotResult )
437 {
438  const float4 *vertices = (const float4*) vv;
439  static const unsigned char indexTable[16] = {(unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 };
440  float4 dotmin = btAssign128( BT_INFINITY, BT_INFINITY, BT_INFINITY, BT_INFINITY );
441  float4 vvec = _mm_loadu_ps( vec );
442  float4 vHi = btCastiTo128f(_mm_shuffle_epi32( btCastfTo128i( vvec), 0xaa ));
443  float4 vLo = _mm_movelh_ps( vvec, vvec );
444 
445  long minIndex = -1L;
446 
447  size_t segment = 0;
448  float4 stack_array[ STACK_ARRAY_COUNT ];
449 
450 #if DEBUG
451  memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );
452 #endif
453 
454  size_t index;
455  float4 min;
456  // Faster loop without cleanup code for full tiles
457  for ( segment = 0; segment + STACK_ARRAY_COUNT*4 <= count; segment += STACK_ARRAY_COUNT*4 )
458  {
459  min = dotmin;
460 
461  for( index = 0; index < STACK_ARRAY_COUNT; index+= 4 )
462  { // do four dot products at a time. Carefully avoid touching the w element.
463  float4 v0 = vertices[0];
464  float4 v1 = vertices[1];
465  float4 v2 = vertices[2];
466  float4 v3 = vertices[3]; vertices += 4;
467 
468  float4 lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
469  float4 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
470  float4 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
471  float4 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
472 
473  lo0 = lo0*vLo;
474  lo1 = lo1*vLo;
475  float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
476  float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
477  float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
478  z = z*vHi;
479  x = x+y;
480  x = x+z;
481  stack_array[index] = x;
482  min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan
483 
484  v0 = vertices[0];
485  v1 = vertices[1];
486  v2 = vertices[2];
487  v3 = vertices[3]; vertices += 4;
488 
489  lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
490  hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
491  lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
492  hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
493 
494  lo0 = lo0*vLo;
495  lo1 = lo1*vLo;
496  z = _mm_shuffle_ps(hi0, hi1, 0x88);
497  x = _mm_shuffle_ps(lo0, lo1, 0x88);
498  y = _mm_shuffle_ps(lo0, lo1, 0xdd);
499  z = z*vHi;
500  x = x+y;
501  x = x+z;
502  stack_array[index+1] = x;
503  min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan
504 
505  v0 = vertices[0];
506  v1 = vertices[1];
507  v2 = vertices[2];
508  v3 = vertices[3]; vertices += 4;
509 
510  lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
511  hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
512  lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
513  hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
514 
515  lo0 = lo0*vLo;
516  lo1 = lo1*vLo;
517  z = _mm_shuffle_ps(hi0, hi1, 0x88);
518  x = _mm_shuffle_ps(lo0, lo1, 0x88);
519  y = _mm_shuffle_ps(lo0, lo1, 0xdd);
520  z = z*vHi;
521  x = x+y;
522  x = x+z;
523  stack_array[index+2] = x;
524  min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan
525 
526  v0 = vertices[0];
527  v1 = vertices[1];
528  v2 = vertices[2];
529  v3 = vertices[3]; vertices += 4;
530 
531  lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
532  hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
533  lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
534  hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
535 
536  lo0 = lo0*vLo;
537  lo1 = lo1*vLo;
538  z = _mm_shuffle_ps(hi0, hi1, 0x88);
539  x = _mm_shuffle_ps(lo0, lo1, 0x88);
540  y = _mm_shuffle_ps(lo0, lo1, 0xdd);
541  z = z*vHi;
542  x = x+y;
543  x = x+z;
544  stack_array[index+3] = x;
545  min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan
546 
547  // It is too costly to keep the index of the min here. We will look for it again later. We save a lot of work this way.
548  }
549 
550  // If we found a new min
551  if( 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(min, dotmin)))
552  {
553  // copy the new min across all lanes of our min accumulator
554  min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0x4e));
555  min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0xb1));
556 
557  dotmin = min;
558 
559  // find first occurrence of that min
560  size_t test;
561  for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], min))); index++ ) // local_count must be a multiple of 4
562  {}
563  // record where it is.
564  minIndex = 4*index + segment + indexTable[test];
565  }
566  }
567 
568  // account for work we've already done
569  count -= segment;
570 
571  // Deal with the last < STACK_ARRAY_COUNT*4 vertices
572  min = dotmin;
573  index = 0;
574 
575 
576  if(btUnlikely( count > 16) )
577  {
578  for( ; index + 4 <= count / 4; index+=4 )
579  { // do four dot products at a time. Carefully avoid touching the w element.
580  float4 v0 = vertices[0];
581  float4 v1 = vertices[1];
582  float4 v2 = vertices[2];
583  float4 v3 = vertices[3]; vertices += 4;
584 
585  float4 lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
586  float4 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
587  float4 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
588  float4 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
589 
590  lo0 = lo0*vLo;
591  lo1 = lo1*vLo;
592  float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
593  float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
594  float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
595  z = z*vHi;
596  x = x+y;
597  x = x+z;
598  stack_array[index] = x;
599  min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan
600 
601  v0 = vertices[0];
602  v1 = vertices[1];
603  v2 = vertices[2];
604  v3 = vertices[3]; vertices += 4;
605 
606  lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
607  hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
608  lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
609  hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
610 
611  lo0 = lo0*vLo;
612  lo1 = lo1*vLo;
613  z = _mm_shuffle_ps(hi0, hi1, 0x88);
614  x = _mm_shuffle_ps(lo0, lo1, 0x88);
615  y = _mm_shuffle_ps(lo0, lo1, 0xdd);
616  z = z*vHi;
617  x = x+y;
618  x = x+z;
619  stack_array[index+1] = x;
620  min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan
621 
622  v0 = vertices[0];
623  v1 = vertices[1];
624  v2 = vertices[2];
625  v3 = vertices[3]; vertices += 4;
626 
627  lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
628  hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
629  lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
630  hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
631 
632  lo0 = lo0*vLo;
633  lo1 = lo1*vLo;
634  z = _mm_shuffle_ps(hi0, hi1, 0x88);
635  x = _mm_shuffle_ps(lo0, lo1, 0x88);
636  y = _mm_shuffle_ps(lo0, lo1, 0xdd);
637  z = z*vHi;
638  x = x+y;
639  x = x+z;
640  stack_array[index+2] = x;
641  min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan
642 
643  v0 = vertices[0];
644  v1 = vertices[1];
645  v2 = vertices[2];
646  v3 = vertices[3]; vertices += 4;
647 
648  lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
649  hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
650  lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
651  hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
652 
653  lo0 = lo0*vLo;
654  lo1 = lo1*vLo;
655  z = _mm_shuffle_ps(hi0, hi1, 0x88);
656  x = _mm_shuffle_ps(lo0, lo1, 0x88);
657  y = _mm_shuffle_ps(lo0, lo1, 0xdd);
658  z = z*vHi;
659  x = x+y;
660  x = x+z;
661  stack_array[index+3] = x;
662  min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan
663 
664  // It is too costly to keep the index of the min here. We will look for it again later. We save a lot of work this way.
665  }
666  }
667 
668  size_t localCount = (count & -4L) - 4*index;
669  if( localCount )
670  {
671 
672 
673 #ifdef __APPLE__
674  vertices += localCount; // counter the offset
675  float4 t0, t1, t2, t3, t4;
676  size_t byteIndex = -(localCount) * sizeof(float);
677  float4 * sap = &stack_array[index + localCount / 4];
678 
679  asm volatile
680  ( ".align 4 \n\
681  0: movaps %[min], %[t2] // move min out of the way to avoid propagating NaNs in min \n\
682  movaps (%[vertices], %[byteIndex], 4), %[t0] // vertices[0] \n\
683  movaps 16(%[vertices], %[byteIndex], 4), %[t1] // vertices[1] \n\
684  movaps %[t0], %[min] // vertices[0] \n\
685  movlhps %[t1], %[min] // x0y0x1y1 \n\
686  movaps 32(%[vertices], %[byteIndex], 4), %[t3] // vertices[2] \n\
687  movaps 48(%[vertices], %[byteIndex], 4), %[t4] // vertices[3] \n\
688  mulps %[vLo], %[min] // x0y0x1y1 * vLo \n\
689  movhlps %[t0], %[t1] // z0w0z1w1 \n\
690  movaps %[t3], %[t0] // vertices[2] \n\
691  movlhps %[t4], %[t0] // x2y2x3y3 \n\
692  movhlps %[t3], %[t4] // z2w2z3w3 \n\
693  mulps %[vLo], %[t0] // x2y2x3y3 * vLo \n\
694  shufps $0x88, %[t4], %[t1] // z0z1z2z3 \n\
695  mulps %[vHi], %[t1] // z0z1z2z3 * vHi \n\
696  movaps %[min], %[t3] // x0y0x1y1 * vLo \n\
697  shufps $0x88, %[t0], %[min] // x0x1x2x3 * vLo.x \n\
698  shufps $0xdd, %[t0], %[t3] // y0y1y2y3 * vLo.y \n\
699  addps %[t3], %[min] // x + y \n\
700  addps %[t1], %[min] // x + y + z \n\
701  movaps %[min], (%[sap], %[byteIndex]) // record result for later scrutiny \n\
702  minps %[t2], %[min] // record min, restore min \n\
703  add $16, %[byteIndex] // advance loop counter\n\
704  jnz 0b \n\
705  "
706  : [min] "+x" (min), [t0] "=&x" (t0), [t1] "=&x" (t1), [t2] "=&x" (t2), [t3] "=&x" (t3), [t4] "=&x" (t4), [byteIndex] "+r" (byteIndex)
707  : [vLo] "x" (vLo), [vHi] "x" (vHi), [vertices] "r" (vertices), [sap] "r" (sap)
708  : "memory", "cc"
709  );
710  index += localCount/4;
711 #else
712  {
713  for( unsigned int i=0; i<localCount/4; i++,index++)
714  { // do four dot products at a time. Carefully avoid touching the w element.
715  float4 v0 = vertices[0];
716  float4 v1 = vertices[1];
717  float4 v2 = vertices[2];
718  float4 v3 = vertices[3];
719  vertices += 4;
720 
721  float4 lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
722  float4 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
723  float4 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
724  float4 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
725 
726  lo0 = lo0*vLo;
727  lo1 = lo1*vLo;
728  float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
729  float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
730  float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
731  z = z*vHi;
732  x = x+y;
733  x = x+z;
734  stack_array[index] = x;
735  min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan
736  }
737  }
738 
739 #endif
740  }
741 
742  // process the last few points
743  if( count & 3 )
744  {
745  float4 v0, v1, v2, x, y, z;
746  switch( count & 3 )
747  {
748  case 3:
749  {
750  v0 = vertices[0];
751  v1 = vertices[1];
752  v2 = vertices[2];
753 
754  // Calculate 3 dot products, transpose, duplicate v2
755  float4 lo0 = _mm_movelh_ps( v0, v1); // xyxy.lo
756  float4 hi0 = _mm_movehl_ps( v1, v0); // z?z?.lo
757  lo0 = lo0*vLo;
758  z = _mm_shuffle_ps(hi0, v2, 0xa8 ); // z0z1z2z2
759  z = z*vHi;
760  float4 lo1 = _mm_movelh_ps(v2, v2); // xyxy
761  lo1 = lo1*vLo;
762  x = _mm_shuffle_ps(lo0, lo1, 0x88);
763  y = _mm_shuffle_ps(lo0, lo1, 0xdd);
764  }
765  break;
766  case 2:
767  {
768  v0 = vertices[0];
769  v1 = vertices[1];
770  float4 xy = _mm_movelh_ps(v0, v1);
771  z = _mm_movehl_ps(v1, v0);
772  xy = xy*vLo;
773  z = _mm_shuffle_ps( z, z, 0xa8);
774  x = _mm_shuffle_ps( xy, xy, 0xa8);
775  y = _mm_shuffle_ps( xy, xy, 0xfd);
776  z = z*vHi;
777  }
778  break;
779  case 1:
780  {
781  float4 xy = vertices[0];
782  z = _mm_shuffle_ps( xy, xy, 0xaa);
783  xy = xy*vLo;
784  z = z*vHi;
785  x = _mm_shuffle_ps(xy, xy, 0);
786  y = _mm_shuffle_ps(xy, xy, 0x55);
787  }
788  break;
789  }
790  x = x+y;
791  x = x+z;
792  stack_array[index] = x;
793  min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan
794  index++;
795  }
796 
797  // if we found a new min.
798  if( 0 == segment || 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(min, dotmin)))
799  { // we found a new min. Search for it
800  // find min across the min vector, place in all elements of min -- big latency hit here
801  min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0x4e));
802  min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0xb1));
803 
804  // It is slightly faster to do this part in scalar code when count < 8. However, the common case for
805  // this where it actually makes a difference is handled in the early out at the top of the function,
806  // so it is less than a 1% difference here. I opted for improved code size, fewer branches and reduced
807  // complexity, and removed it.
808 
809  dotmin = min;
810 
811  // scan for the first occurrence of min in the array
812  size_t test;
813  for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], min))); index++ ) // local_count must be a multiple of 4
814  {}
815  minIndex = 4*index + segment + indexTable[test];
816  }
817 
818  _mm_store_ss( dotResult, dotmin);
819  return minIndex;
820 }
821 
822 
823 #elif defined BT_USE_NEON
824 #define ARM_NEON_GCC_COMPATIBILITY 1
825 #include <arm_neon.h>
826 #include <sys/types.h>
827 #include <sys/sysctl.h> //for sysctlbyname
828 
829 static long _maxdot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult );
830 static long _maxdot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult );
831 static long _maxdot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult );
832 static long _mindot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult );
833 static long _mindot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult );
834 static long _mindot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult );
835 
836 long (*_maxdot_large)( const float *vv, const float *vec, unsigned long count, float *dotResult ) = _maxdot_large_sel;
837 long (*_mindot_large)( const float *vv, const float *vec, unsigned long count, float *dotResult ) = _mindot_large_sel;
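// _maxdot_large and _mindot_large start out pointing at the *_sel stubs. On
// the first call the stub queries the CPU once via sysctlbyname and rebinds
// the pointer to the v0 or v1 implementation, so subsequent calls dispatch
// directly with no further capability check.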
838 
839 
840 static inline uint32_t btGetCpuCapabilities( void )
841 {
842  static uint32_t capabilities = 0;
843  static bool testedCapabilities = false;
844 
845  if( 0 == testedCapabilities)
846  {
847  uint32_t hasFeature = 0;
848  size_t featureSize = sizeof( hasFeature );
849  int err = sysctlbyname( "hw.optional.neon_hpfp", &hasFeature, &featureSize, NULL, 0 );
850 
851  if( 0 == err && hasFeature)
852  capabilities |= 0x2000;
853 
854  testedCapabilities = true;
855  }
856 
857  return capabilities;
858 }
859 
860 
861 
862 
863 static long _maxdot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult )
864 {
865 
866  if( btGetCpuCapabilities() & 0x2000 )
867  _maxdot_large = _maxdot_large_v1;
868  else
869  _maxdot_large = _maxdot_large_v0;
870 
871  return _maxdot_large(vv, vec, count, dotResult);
872 }
873 
874 static long _mindot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult )
875 {
876 
877  if( btGetCpuCapabilities() & 0x2000 )
878  _mindot_large = _mindot_large_v1;
879  else
880  _mindot_large = _mindot_large_v0;
881 
882  return _mindot_large(vv, vec, count, dotResult);
883 }
884 
885 
886 
887 #define vld1q_f32_aligned_postincrement( _ptr ) ({ float32x4_t _r; asm( "vld1.f32 {%0}, [%1, :128]!\n" : "=w" (_r), "+r" (_ptr) ); /*return*/ _r; })
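// The macro above is a GCC statement expression: it issues a single vld1.32
// load with a :128 alignment hint and post-increments the pointer operand.
// With plain intrinsics (and no alignment hint) the equivalent would roughly
// be: float32x4_t _r = vld1q_f32( _ptr ); _ptr += 4;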
888 
889 
890 long _maxdot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult )
891 {
892  unsigned long i = 0;
893  float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
894  float32x2_t vLo = vget_low_f32(vvec);
895  float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0);
896  float32x2_t dotMaxLo = (float32x2_t) { -BT_INFINITY, -BT_INFINITY };
897  float32x2_t dotMaxHi = (float32x2_t) { -BT_INFINITY, -BT_INFINITY };
898  uint32x2_t indexLo = (uint32x2_t) {0, 1};
899  uint32x2_t indexHi = (uint32x2_t) {2, 3};
900  uint32x2_t iLo = (uint32x2_t) {static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
901  uint32x2_t iHi = (uint32x2_t) {static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
902  const uint32x2_t four = (uint32x2_t) {4,4};
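 // v0 keeps two running 2-lane maxima: dotMaxLo for vertices 0 and 1 of each
 // group of four, dotMaxHi for vertices 2 and 3, each paired with candidate
 // indices in iLo/iHi and updated branch-free with a compare plus vbsl. The
 // four lanes are reduced to a single winner after the loops.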
903 
904  for( ; i+8 <= count; i+= 8 )
905  {
906  float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
907  float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
908  float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
909  float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
910 
911  float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
912  float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
913  float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
914  float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);
915 
916  float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
917  float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
918  float32x2_t zLo = vmul_f32( z0.val[0], vHi);
919  float32x2_t zHi = vmul_f32( z1.val[0], vHi);
920 
921  float32x2_t rLo = vpadd_f32( xy0, xy1);
922  float32x2_t rHi = vpadd_f32( xy2, xy3);
923  rLo = vadd_f32(rLo, zLo);
924  rHi = vadd_f32(rHi, zHi);
925 
926  uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
927  uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi );
928  dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
929  dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
930  iLo = vbsl_u32(maskLo, indexLo, iLo);
931  iHi = vbsl_u32(maskHi, indexHi, iHi);
932  indexLo = vadd_u32(indexLo, four);
933  indexHi = vadd_u32(indexHi, four);
934 
935  v0 = vld1q_f32_aligned_postincrement( vv );
936  v1 = vld1q_f32_aligned_postincrement( vv );
937  v2 = vld1q_f32_aligned_postincrement( vv );
938  v3 = vld1q_f32_aligned_postincrement( vv );
939 
940  xy0 = vmul_f32( vget_low_f32(v0), vLo);
941  xy1 = vmul_f32( vget_low_f32(v1), vLo);
942  xy2 = vmul_f32( vget_low_f32(v2), vLo);
943  xy3 = vmul_f32( vget_low_f32(v3), vLo);
944 
945  z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
946  z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
947  zLo = vmul_f32( z0.val[0], vHi);
948  zHi = vmul_f32( z1.val[0], vHi);
949 
950  rLo = vpadd_f32( xy0, xy1);
951  rHi = vpadd_f32( xy2, xy3);
952  rLo = vadd_f32(rLo, zLo);
953  rHi = vadd_f32(rHi, zHi);
954 
955  maskLo = vcgt_f32( rLo, dotMaxLo );
956  maskHi = vcgt_f32( rHi, dotMaxHi );
957  dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
958  dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
959  iLo = vbsl_u32(maskLo, indexLo, iLo);
960  iHi = vbsl_u32(maskHi, indexHi, iHi);
961  indexLo = vadd_u32(indexLo, four);
962  indexHi = vadd_u32(indexHi, four);
963  }
964 
965  for( ; i+4 <= count; i+= 4 )
966  {
967  float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
968  float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
969  float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
970  float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
971 
972  float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
973  float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
974  float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
975  float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);
976 
977  float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
978  float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
979  float32x2_t zLo = vmul_f32( z0.val[0], vHi);
980  float32x2_t zHi = vmul_f32( z1.val[0], vHi);
981 
982  float32x2_t rLo = vpadd_f32( xy0, xy1);
983  float32x2_t rHi = vpadd_f32( xy2, xy3);
984  rLo = vadd_f32(rLo, zLo);
985  rHi = vadd_f32(rHi, zHi);
986 
987  uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
988  uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi );
989  dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
990  dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
991  iLo = vbsl_u32(maskLo, indexLo, iLo);
992  iHi = vbsl_u32(maskHi, indexHi, iHi);
993  indexLo = vadd_u32(indexLo, four);
994  indexHi = vadd_u32(indexHi, four);
995  }
996 
997  switch( count & 3 )
998  {
999  case 3:
1000  {
1001  float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1002  float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
1003  float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
1004 
1005  float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
1006  float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
1007  float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
1008 
1009  float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
1010  float32x2_t zLo = vmul_f32( z0.val[0], vHi);
1011  float32x2_t zHi = vmul_f32( vdup_lane_f32(vget_high_f32(v2), 0), vHi);
1012 
1013  float32x2_t rLo = vpadd_f32( xy0, xy1);
1014  float32x2_t rHi = vpadd_f32( xy2, xy2);
1015  rLo = vadd_f32(rLo, zLo);
1016  rHi = vadd_f32(rHi, zHi);
1017 
1018  uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
1019  uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi );
1020  dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
1021  dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
1022  iLo = vbsl_u32(maskLo, indexLo, iLo);
1023  iHi = vbsl_u32(maskHi, indexHi, iHi);
1024  }
1025  break;
1026  case 2:
1027  {
1028  float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1029  float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
1030 
1031  float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
1032  float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
1033 
1034  float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
1035  float32x2_t zLo = vmul_f32( z0.val[0], vHi);
1036 
1037  float32x2_t rLo = vpadd_f32( xy0, xy1);
1038  rLo = vadd_f32(rLo, zLo);
1039 
1040  uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
1041  dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
1042  iLo = vbsl_u32(maskLo, indexLo, iLo);
1043  }
1044  break;
1045  case 1:
1046  {
1047  float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1048  float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
1049  float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
1050  float32x2_t zLo = vmul_f32( z0, vHi);
1051  float32x2_t rLo = vpadd_f32( xy0, xy0);
1052  rLo = vadd_f32(rLo, zLo);
1053  uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
1054  dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
1055  iLo = vbsl_u32(maskLo, indexLo, iLo);
1056  }
1057  break;
1058 
1059  default:
1060  break;
1061  }
1062 
1063  // select best answer between hi and lo results
1064  uint32x2_t mask = vcgt_f32( dotMaxHi, dotMaxLo );
1065  dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
1066  iLo = vbsl_u32(mask, iHi, iLo);
1067 
1068  // select best answer between even and odd results
1069  dotMaxHi = vdup_lane_f32(dotMaxLo, 1);
1070  iHi = vdup_lane_u32(iLo, 1);
1071  mask = vcgt_f32( dotMaxHi, dotMaxLo );
1072  dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
1073  iLo = vbsl_u32(mask, iHi, iLo);
1074 
1075  *dotResult = vget_lane_f32( dotMaxLo, 0);
1076  return vget_lane_u32(iLo, 0);
1077 }
1078 
1079 
1080 long _maxdot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult )
1081 {
1082  float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
1083  float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec));
1084  float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);
1085  const uint32x4_t four = (uint32x4_t){ 4, 4, 4, 4 };
1086  uint32x4_t local_index = (uint32x4_t) {0, 1, 2, 3};
1087  uint32x4_t index = (uint32x4_t) { static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1) };
1088  float32x4_t maxDot = (float32x4_t) { -BT_INFINITY, -BT_INFINITY, -BT_INFINITY, -BT_INFINITY };
1089 
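 // v1 (selected by _maxdot_large_sel when hw.optional.neon_hpfp is reported)
 // works four dot products wide: vuzpq gathers the x, y and z columns of four
 // vertices, a single compare plus vbslq tracks both the running maxima and
 // their indices, and a 4-to-1 lane reduction at the end picks the winner.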
1090  unsigned long i = 0;
1091  for( ; i + 8 <= count; i += 8 )
1092  {
1093  float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1094  float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
1095  float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
1096  float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
1097 
1098  // the next two lines should resolve to a single vswp d, d
1099  float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
1100  float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
1101  // the next two lines should resolve to a single vswp d, d
1102  float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
1103  float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
1104 
1105  xy0 = vmulq_f32(xy0, vLo);
1106  xy1 = vmulq_f32(xy1, vLo);
1107 
1108  float32x4x2_t zb = vuzpq_f32( z0, z1);
1109  float32x4_t z = vmulq_f32( zb.val[0], vHi);
1110  float32x4x2_t xy = vuzpq_f32( xy0, xy1);
1111  float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1112  x = vaddq_f32(x, z);
1113 
1114  uint32x4_t mask = vcgtq_f32(x, maxDot);
1115  maxDot = vbslq_f32( mask, x, maxDot);
1116  index = vbslq_u32(mask, local_index, index);
1117  local_index = vaddq_u32(local_index, four);
1118 
1119  v0 = vld1q_f32_aligned_postincrement( vv );
1120  v1 = vld1q_f32_aligned_postincrement( vv );
1121  v2 = vld1q_f32_aligned_postincrement( vv );
1122  v3 = vld1q_f32_aligned_postincrement( vv );
1123 
1124  // the next two lines should resolve to a single vswp d, d
1125  xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
1126  xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
1127  // the next two lines should resolve to a single vswp d, d
1128  z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
1129  z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
1130 
1131  xy0 = vmulq_f32(xy0, vLo);
1132  xy1 = vmulq_f32(xy1, vLo);
1133 
1134  zb = vuzpq_f32( z0, z1);
1135  z = vmulq_f32( zb.val[0], vHi);
1136  xy = vuzpq_f32( xy0, xy1);
1137  x = vaddq_f32(xy.val[0], xy.val[1]);
1138  x = vaddq_f32(x, z);
1139 
1140  mask = vcgtq_f32(x, maxDot);
1141  maxDot = vbslq_f32( mask, x, maxDot);
1142  index = vbslq_u32(mask, local_index, index);
1143  local_index = vaddq_u32(local_index, four);
1144  }
1145 
1146  for( ; i + 4 <= count; i += 4 )
1147  {
1148  float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1149  float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
1150  float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
1151  float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
1152 
1153  // the next two lines should resolve to a single vswp d, d
1154  float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
1155  float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
1156  // the next two lines should resolve to a single vswp d, d
1157  float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
1158  float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
1159 
1160  xy0 = vmulq_f32(xy0, vLo);
1161  xy1 = vmulq_f32(xy1, vLo);
1162 
1163  float32x4x2_t zb = vuzpq_f32( z0, z1);
1164  float32x4_t z = vmulq_f32( zb.val[0], vHi);
1165  float32x4x2_t xy = vuzpq_f32( xy0, xy1);
1166  float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1167  x = vaddq_f32(x, z);
1168 
1169  uint32x4_t mask = vcgtq_f32(x, maxDot);
1170  maxDot = vbslq_f32( mask, x, maxDot);
1171  index = vbslq_u32(mask, local_index, index);
1172  local_index = vaddq_u32(local_index, four);
1173  }
1174 
1175  switch (count & 3) {
1176  case 3:
1177  {
1178  float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1179  float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
1180  float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
1181 
1182  // the next two lines should resolve to a single vswp d, d
1183  float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
1184  float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v2));
1185  // the next two lines should resolve to a single vswp d, d
1186  float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
1187  float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v2));
1188 
1189  xy0 = vmulq_f32(xy0, vLo);
1190  xy1 = vmulq_f32(xy1, vLo);
1191 
1192  float32x4x2_t zb = vuzpq_f32( z0, z1);
1193  float32x4_t z = vmulq_f32( zb.val[0], vHi);
1194  float32x4x2_t xy = vuzpq_f32( xy0, xy1);
1195  float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1196  x = vaddq_f32(x, z);
1197 
1198  uint32x4_t mask = vcgtq_f32(x, maxDot);
1199  maxDot = vbslq_f32( mask, x, maxDot);
1200  index = vbslq_u32(mask, local_index, index);
1201  local_index = vaddq_u32(local_index, four);
1202  }
1203  break;
1204 
1205  case 2:
1206  {
1207  float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1208  float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
1209 
1210  // the next two lines should resolve to a single vswp d, d
1211  float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
1212  // the next two lines should resolve to a single vswp d, d
1213  float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
1214 
1215  xy0 = vmulq_f32(xy0, vLo);
1216 
1217  float32x4x2_t zb = vuzpq_f32( z0, z0);
1218  float32x4_t z = vmulq_f32( zb.val[0], vHi);
1219  float32x4x2_t xy = vuzpq_f32( xy0, xy0);
1220  float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1221  x = vaddq_f32(x, z);
1222 
1223  uint32x4_t mask = vcgtq_f32(x, maxDot);
1224  maxDot = vbslq_f32( mask, x, maxDot);
1225  index = vbslq_u32(mask, local_index, index);
1226  local_index = vaddq_u32(local_index, four);
1227  }
1228  break;
1229 
1230  case 1:
1231  {
1232  float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1233 
1234  // the next two lines should resolve to a single vswp d, d
1235  float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v0));
1236  // the next two lines should resolve to a single vswp d, d
1237  float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0);
1238 
1239  xy0 = vmulq_f32(xy0, vLo);
1240 
1241  z = vmulq_f32( z, vHi);
1242  float32x4x2_t xy = vuzpq_f32( xy0, xy0);
1243  float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1244  x = vaddq_f32(x, z);
1245 
1246  uint32x4_t mask = vcgtq_f32(x, maxDot);
1247  maxDot = vbslq_f32( mask, x, maxDot);
1248  index = vbslq_u32(mask, local_index, index);
1249  local_index = vaddq_u32(local_index, four);
1250  }
1251  break;
1252 
1253  default:
1254  break;
1255  }
1256 
1257 
1258  // select best answer between hi and lo results
1259  uint32x2_t mask = vcgt_f32( vget_high_f32(maxDot), vget_low_f32(maxDot));
1260  float32x2_t maxDot2 = vbsl_f32(mask, vget_high_f32(maxDot), vget_low_f32(maxDot));
1261  uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));
1262 
1263  // select best answer between even and odd results
1264  float32x2_t maxDotO = vdup_lane_f32(maxDot2, 1);
1265  uint32x2_t indexHi = vdup_lane_u32(index2, 1);
1266  mask = vcgt_f32( maxDotO, maxDot2 );
1267  maxDot2 = vbsl_f32(mask, maxDotO, maxDot2);
1268  index2 = vbsl_u32(mask, indexHi, index2);
1269 
1270  *dotResult = vget_lane_f32( maxDot2, 0);
1271  return vget_lane_u32(index2, 0);
1272 
1273 }
1274 
1275 long _mindot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult )
1276 {
1277  unsigned long i = 0;
1278  float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
1279  float32x2_t vLo = vget_low_f32(vvec);
1280  float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0);
1281  float32x2_t dotMinLo = (float32x2_t) { BT_INFINITY, BT_INFINITY };
1282  float32x2_t dotMinHi = (float32x2_t) { BT_INFINITY, BT_INFINITY };
1283  uint32x2_t indexLo = (uint32x2_t) {0, 1};
1284  uint32x2_t indexHi = (uint32x2_t) {2, 3};
1285  uint32x2_t iLo = (uint32x2_t) {static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
1286  uint32x2_t iHi = (uint32x2_t) {static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
1287  const uint32x2_t four = (uint32x2_t) {4,4};
1288 
1289  for( ; i+8 <= count; i+= 8 )
1290  {
1291  float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1292  float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
1293  float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
1294  float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
1295 
1296  float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
1297  float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
1298  float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
1299  float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);
1300 
1301  float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
1302  float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
1303  float32x2_t zLo = vmul_f32( z0.val[0], vHi);
1304  float32x2_t zHi = vmul_f32( z1.val[0], vHi);
1305 
1306  float32x2_t rLo = vpadd_f32( xy0, xy1);
1307  float32x2_t rHi = vpadd_f32( xy2, xy3);
1308  rLo = vadd_f32(rLo, zLo);
1309  rHi = vadd_f32(rHi, zHi);
1310 
1311  uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
1312  uint32x2_t maskHi = vclt_f32( rHi, dotMinHi );
1313  dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
1314  dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
1315  iLo = vbsl_u32(maskLo, indexLo, iLo);
1316  iHi = vbsl_u32(maskHi, indexHi, iHi);
1317  indexLo = vadd_u32(indexLo, four);
1318  indexHi = vadd_u32(indexHi, four);
1319 
1320  v0 = vld1q_f32_aligned_postincrement( vv );
1321  v1 = vld1q_f32_aligned_postincrement( vv );
1322  v2 = vld1q_f32_aligned_postincrement( vv );
1323  v3 = vld1q_f32_aligned_postincrement( vv );
1324 
1325  xy0 = vmul_f32( vget_low_f32(v0), vLo);
1326  xy1 = vmul_f32( vget_low_f32(v1), vLo);
1327  xy2 = vmul_f32( vget_low_f32(v2), vLo);
1328  xy3 = vmul_f32( vget_low_f32(v3), vLo);
1329 
1330  z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
1331  z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
1332  zLo = vmul_f32( z0.val[0], vHi);
1333  zHi = vmul_f32( z1.val[0], vHi);
1334 
1335  rLo = vpadd_f32( xy0, xy1);
1336  rHi = vpadd_f32( xy2, xy3);
1337  rLo = vadd_f32(rLo, zLo);
1338  rHi = vadd_f32(rHi, zHi);
1339 
1340  maskLo = vclt_f32( rLo, dotMinLo );
1341  maskHi = vclt_f32( rHi, dotMinHi );
1342  dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
1343  dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
1344  iLo = vbsl_u32(maskLo, indexLo, iLo);
1345  iHi = vbsl_u32(maskHi, indexHi, iHi);
1346  indexLo = vadd_u32(indexLo, four);
1347  indexHi = vadd_u32(indexHi, four);
1348  }
1349 
1350  for( ; i+4 <= count; i+= 4 )
1351  {
1352  float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1353  float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
1354  float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
1355  float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
1356 
1357  float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
1358  float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
1359  float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
1360  float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);
1361 
1362  float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
1363  float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
1364  float32x2_t zLo = vmul_f32( z0.val[0], vHi);
1365  float32x2_t zHi = vmul_f32( z1.val[0], vHi);
1366 
1367  float32x2_t rLo = vpadd_f32( xy0, xy1);
1368  float32x2_t rHi = vpadd_f32( xy2, xy3);
1369  rLo = vadd_f32(rLo, zLo);
1370  rHi = vadd_f32(rHi, zHi);
1371 
1372  uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
1373  uint32x2_t maskHi = vclt_f32( rHi, dotMinHi );
1374  dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
1375  dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
1376  iLo = vbsl_u32(maskLo, indexLo, iLo);
1377  iHi = vbsl_u32(maskHi, indexHi, iHi);
1378  indexLo = vadd_u32(indexLo, four);
1379  indexHi = vadd_u32(indexHi, four);
1380  }
1381  switch( count & 3 )
1382  {
1383  case 3:
1384  {
1385  float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1386  float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
1387  float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
1388 
1389  float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
1390  float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
1391  float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
1392 
1393  float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
1394  float32x2_t zLo = vmul_f32( z0.val[0], vHi);
1395  float32x2_t zHi = vmul_f32( vdup_lane_f32(vget_high_f32(v2), 0), vHi);
1396 
1397  float32x2_t rLo = vpadd_f32( xy0, xy1);
1398  float32x2_t rHi = vpadd_f32( xy2, xy2);
1399  rLo = vadd_f32(rLo, zLo);
1400  rHi = vadd_f32(rHi, zHi);
1401 
1402  uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
1403  uint32x2_t maskHi = vclt_f32( rHi, dotMinHi );
1404  dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
1405  dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
1406  iLo = vbsl_u32(maskLo, indexLo, iLo);
1407  iHi = vbsl_u32(maskHi, indexHi, iHi);
1408  }
1409  break;
1410  case 2:
1411  {
1412  float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1413  float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
1414 
1415  float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
1416  float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
1417 
1418  float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
1419  float32x2_t zLo = vmul_f32( z0.val[0], vHi);
1420 
1421  float32x2_t rLo = vpadd_f32( xy0, xy1);
1422  rLo = vadd_f32(rLo, zLo);
1423 
1424  uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
1425  dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
1426  iLo = vbsl_u32(maskLo, indexLo, iLo);
1427  }
1428  break;
1429  case 1:
1430  {
1431  float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1432  float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
1433  float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
1434  float32x2_t zLo = vmul_f32( z0, vHi);
1435  float32x2_t rLo = vpadd_f32( xy0, xy0);
1436  rLo = vadd_f32(rLo, zLo);
1437  uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
1438  dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
1439  iLo = vbsl_u32(maskLo, indexLo, iLo);
1440  }
1441  break;
1442 
1443  default:
1444  break;
1445  }
1446 
1447  // select best answer between hi and lo results
1448  uint32x2_t mask = vclt_f32( dotMinHi, dotMinLo );
1449  dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
1450  iLo = vbsl_u32(mask, iHi, iLo);
1451 
1452  // select best answer between even and odd results
1453  dotMinHi = vdup_lane_f32(dotMinLo, 1);
1454  iHi = vdup_lane_u32(iLo, 1);
1455  mask = vclt_f32( dotMinHi, dotMinLo );
1456  dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
1457  iLo = vbsl_u32(mask, iHi, iLo);
1458 
1459  *dotResult = vget_lane_f32( dotMinLo, 0);
1460  return vget_lane_u32(iLo, 0);
1461 }
1462 
1463 long _mindot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult )
1464 {
1465  float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
1466  float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec));
1467  float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);
1468  const uint32x4_t four = (uint32x4_t){ 4, 4, 4, 4 };
1469  uint32x4_t local_index = (uint32x4_t) {0, 1, 2, 3};
1470  uint32x4_t index = (uint32x4_t) { static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1) };
1471  float32x4_t minDot = (float32x4_t) { BT_INFINITY, BT_INFINITY, BT_INFINITY, BT_INFINITY };
1472 
1473  unsigned long i = 0;
1474  for( ; i + 8 <= count; i += 8 )
1475  {
1476  float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1477  float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
1478  float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
1479  float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
1480 
1481  // the next two lines should resolve to a single vswp d, d
1482  float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
1483  float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
1484  // the next two lines should resolve to a single vswp d, d
1485  float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
1486  float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
1487 
1488  xy0 = vmulq_f32(xy0, vLo);
1489  xy1 = vmulq_f32(xy1, vLo);
1490 
1491  float32x4x2_t zb = vuzpq_f32( z0, z1);
1492  float32x4_t z = vmulq_f32( zb.val[0], vHi);
1493  float32x4x2_t xy = vuzpq_f32( xy0, xy1);
1494  float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1495  x = vaddq_f32(x, z);
1496 
1497  uint32x4_t mask = vcltq_f32(x, minDot);
1498  minDot = vbslq_f32( mask, x, minDot);
1499  index = vbslq_u32(mask, local_index, index);
1500  local_index = vaddq_u32(local_index, four);
1501 
1502  v0 = vld1q_f32_aligned_postincrement( vv );
1503  v1 = vld1q_f32_aligned_postincrement( vv );
1504  v2 = vld1q_f32_aligned_postincrement( vv );
1505  v3 = vld1q_f32_aligned_postincrement( vv );
1506 
1507  // the next two lines should resolve to a single vswp d, d
1508  xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
1509  xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
1510  // the next two lines should resolve to a single vswp d, d
1511  z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
1512  z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
1513 
1514  xy0 = vmulq_f32(xy0, vLo);
1515  xy1 = vmulq_f32(xy1, vLo);
1516 
1517  zb = vuzpq_f32( z0, z1);
1518  z = vmulq_f32( zb.val[0], vHi);
1519  xy = vuzpq_f32( xy0, xy1);
1520  x = vaddq_f32(xy.val[0], xy.val[1]);
1521  x = vaddq_f32(x, z);
1522 
1523  mask = vcltq_f32(x, minDot);
1524  minDot = vbslq_f32( mask, x, minDot);
1525  index = vbslq_u32(mask, local_index, index);
1526  local_index = vaddq_u32(local_index, four);
1527  }
1528 
1529  for( ; i + 4 <= count; i += 4 )
1530  {
1531  float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1532  float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
1533  float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
1534  float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
1535 
1536  // the next two lines should resolve to a single vswp d, d
1537  float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
1538  float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
1539  // the next two lines should resolve to a single vswp d, d
1540  float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
1541  float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
1542 
1543  xy0 = vmulq_f32(xy0, vLo);
1544  xy1 = vmulq_f32(xy1, vLo);
1545 
1546  float32x4x2_t zb = vuzpq_f32( z0, z1);
1547  float32x4_t z = vmulq_f32( zb.val[0], vHi);
1548  float32x4x2_t xy = vuzpq_f32( xy0, xy1);
1549  float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1550  x = vaddq_f32(x, z);
1551 
1552  uint32x4_t mask = vcltq_f32(x, minDot);
1553  minDot = vbslq_f32( mask, x, minDot);
1554  index = vbslq_u32(mask, local_index, index);
1555  local_index = vaddq_u32(local_index, four);
1556  }
1557 
1558  switch (count & 3) {
1559  case 3:
1560  {
1561  float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1562  float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
1563  float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
1564 
1565  // the next two lines should resolve to a single vswp d, d
1566  float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
1567  float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v2));
1568  // the next two lines should resolve to a single vswp d, d
1569  float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
1570  float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v2));
1571 
1572  xy0 = vmulq_f32(xy0, vLo);
1573  xy1 = vmulq_f32(xy1, vLo);
1574 
1575  float32x4x2_t zb = vuzpq_f32( z0, z1);
1576  float32x4_t z = vmulq_f32( zb.val[0], vHi);
1577  float32x4x2_t xy = vuzpq_f32( xy0, xy1);
1578  float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1579  x = vaddq_f32(x, z);
1580 
1581  uint32x4_t mask = vcltq_f32(x, minDot);
1582  minDot = vbslq_f32( mask, x, minDot);
1583  index = vbslq_u32(mask, local_index, index);
1584  local_index = vaddq_u32(local_index, four);
1585  }
1586  break;
1587 
1588  case 2:
1589  {
1590  float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1591  float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
1592 
1593  // the next two lines should resolve to a single vswp d, d
1594  float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
1595  // the next two lines should resolve to a single vswp d, d
1596  float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
1597 
1598  xy0 = vmulq_f32(xy0, vLo);
1599 
1600  float32x4x2_t zb = vuzpq_f32( z0, z0);
1601  float32x4_t z = vmulq_f32( zb.val[0], vHi);
1602  float32x4x2_t xy = vuzpq_f32( xy0, xy0);
1603  float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1604  x = vaddq_f32(x, z);
1605 
1606  uint32x4_t mask = vcltq_f32(x, minDot);
1607  minDot = vbslq_f32( mask, x, minDot);
1608  index = vbslq_u32(mask, local_index, index);
1609  local_index = vaddq_u32(local_index, four);
1610  }
1611  break;
1612 
1613  case 1:
1614  {
1615  float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1616 
1617  // the next two lines should resolve to a single vswp d, d
1618  float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v0));
1619  // the next two lines should resolve to a single vswp d, d
1620  float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0);
1621 
1622  xy0 = vmulq_f32(xy0, vLo);
1623 
1624  z = vmulq_f32( z, vHi);
1625  float32x4x2_t xy = vuzpq_f32( xy0, xy0);
1626  float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1627  x = vaddq_f32(x, z);
1628 
1629  uint32x4_t mask = vcltq_f32(x, minDot);
1630  minDot = vbslq_f32( mask, x, minDot);
1631  index = vbslq_u32(mask, local_index, index);
1632  local_index = vaddq_u32(local_index, four);
1633  }
1634  break;
1635 
1636  default:
1637  break;
1638  }
1639 
1640 
1641  // select best answer between hi and lo results
1642  uint32x2_t mask = vclt_f32( vget_high_f32(minDot), vget_low_f32(minDot));
1643  float32x2_t minDot2 = vbsl_f32(mask, vget_high_f32(minDot), vget_low_f32(minDot));
1644  uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));
1645 
1646  // select best answer between even and odd results
1647  float32x2_t minDotO = vdup_lane_f32(minDot2, 1);
1648  uint32x2_t indexHi = vdup_lane_u32(index2, 1);
1649  mask = vclt_f32( minDotO, minDot2 );
1650  minDot2 = vbsl_f32(mask, minDotO, minDot2);
1651  index2 = vbsl_u32(mask, indexHi, index2);
1652 
1653  *dotResult = vget_lane_f32( minDot2, 0);
1654  return vget_lane_u32(index2, 0);
1655 
1656 }
1657 
1658 #else
1659  #error Unhandled __APPLE__ arch
1660 #endif
1661 
1662 #endif /* BT_USE_SIMD_VECTOR3 */
1663 
1664 