#if defined (_WIN32) || defined (__i386__)
#define BT_USE_SSE_IN_API
#if defined BT_USE_SIMD_VECTOR3
#if defined BT_USE_SSE || defined _WIN32
#define LOG2_ARRAY_SIZE 6
#define STACK_ARRAY_COUNT (1UL << LOG2_ARRAY_SIZE)
#include <emmintrin.h>
long _maxdot_large( const float *vv, const float *vec, unsigned long count, float *dotResult );
long _maxdot_large( const float *vv, const float *vec, unsigned long count, float *dotResult )
static const unsigned char indexTable[16] = { (unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 };
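/* Note: indexTable maps a 4-bit _mm_movemask_ps result to the lowest lane whose
   comparison bit is set (entry 0, the impossible all-zero mask, is -1). It is used
   below to turn a cmpeq/movemask hit back into a vertex index. */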
float4 vvec = _mm_loadu_ps( vec );
float4 vHi = btCastiTo128f(_mm_shuffle_epi32( btCastfTo128i( vvec), 0xaa ));
float4 vLo = _mm_movelh_ps( vvec, vvec );
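/* The direction vector is splatted into two registers: vLo = (x, y, x, y) and
   vHi = (z, z, z, z), so each multiply handles two xy pairs and the z terms are
   folded in separately when four dot products are assembled per step. */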
float4 stack_array[ STACK_ARRAY_COUNT ];
memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );
for ( segment = 0; segment + STACK_ARRAY_COUNT*4 <= count; segment += STACK_ARRAY_COUNT*4 )
for( index = 0; index < STACK_ARRAY_COUNT; index+= 4 )
float4 v3 = vertices[3]; vertices += 4;
float4 lo0 = _mm_movelh_ps( v0, v1);
float4 hi0 = _mm_movehl_ps( v1, v0);
float4 lo1 = _mm_movelh_ps( v2, v3);
float4 hi1 = _mm_movehl_ps( v3, v2);
float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
stack_array[index] = x;
max = _mm_max_ps( x, max );
v3 = vertices[3]; vertices += 4;
lo0 = _mm_movelh_ps( v0, v1);
hi0 = _mm_movehl_ps( v1, v0);
lo1 = _mm_movelh_ps( v2, v3);
hi1 = _mm_movehl_ps( v3, v2);
z = _mm_shuffle_ps(hi0, hi1, 0x88);
x = _mm_shuffle_ps(lo0, lo1, 0x88);
y = _mm_shuffle_ps(lo0, lo1, 0xdd);
stack_array[index+1] = x;
max = _mm_max_ps( x, max );
v3 = vertices[3]; vertices += 4;
lo0 = _mm_movelh_ps( v0, v1);
hi0 = _mm_movehl_ps( v1, v0);
lo1 = _mm_movelh_ps( v2, v3);
hi1 = _mm_movehl_ps( v3, v2);
z = _mm_shuffle_ps(hi0, hi1, 0x88);
x = _mm_shuffle_ps(lo0, lo1, 0x88);
y = _mm_shuffle_ps(lo0, lo1, 0xdd);
stack_array[index+2] = x;
max = _mm_max_ps( x, max );
v3 = vertices[3]; vertices += 4;
lo0 = _mm_movelh_ps( v0, v1);
hi0 = _mm_movehl_ps( v1, v0);
lo1 = _mm_movelh_ps( v2, v3);
hi1 = _mm_movehl_ps( v3, v2);
z = _mm_shuffle_ps(hi0, hi1, 0x88);
x = _mm_shuffle_ps(lo0, lo1, 0x88);
y = _mm_shuffle_ps(lo0, lo1, 0xdd);
stack_array[index+3] = x;
max = _mm_max_ps( x, max );
if( 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(max, dotMax)))
max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0x4e));
max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0xb1));
for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], max))); index++ )
maxIndex = 4*index + segment + indexTable[test];
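/* Note: when the running vector max changes, the scalar max and its index are
   recovered here: the horizontal max is broadcast across the register, then the
   stack_array scratch buffer is rescanned for the matching lane, and the vertex
   index is rebuilt as segment offset + 4*stack index + lane (via indexTable). */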
for( ; index + 4 <= count / 4; index+=4 )
float4 v3 = vertices[3]; vertices += 4;
float4 lo0 = _mm_movelh_ps( v0, v1);
float4 hi0 = _mm_movehl_ps( v1, v0);
float4 lo1 = _mm_movelh_ps( v2, v3);
float4 hi1 = _mm_movehl_ps( v3, v2);
float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
stack_array[index] = x;
max = _mm_max_ps( x, max );
v3 = vertices[3]; vertices += 4;
lo0 = _mm_movelh_ps( v0, v1);
hi0 = _mm_movehl_ps( v1, v0);
lo1 = _mm_movelh_ps( v2, v3);
hi1 = _mm_movehl_ps( v3, v2);
z = _mm_shuffle_ps(hi0, hi1, 0x88);
x = _mm_shuffle_ps(lo0, lo1, 0x88);
y = _mm_shuffle_ps(lo0, lo1, 0xdd);
stack_array[index+1] = x;
max = _mm_max_ps( x, max );
v3 = vertices[3]; vertices += 4;
lo0 = _mm_movelh_ps( v0, v1);
hi0 = _mm_movehl_ps( v1, v0);
lo1 = _mm_movelh_ps( v2, v3);
hi1 = _mm_movehl_ps( v3, v2);
z = _mm_shuffle_ps(hi0, hi1, 0x88);
x = _mm_shuffle_ps(lo0, lo1, 0x88);
y = _mm_shuffle_ps(lo0, lo1, 0xdd);
stack_array[index+2] = x;
max = _mm_max_ps( x, max );
v3 = vertices[3]; vertices += 4;
lo0 = _mm_movelh_ps( v0, v1);
hi0 = _mm_movehl_ps( v1, v0);
lo1 = _mm_movelh_ps( v2, v3);
hi1 = _mm_movehl_ps( v3, v2);
z = _mm_shuffle_ps(hi0, hi1, 0x88);
x = _mm_shuffle_ps(lo0, lo1, 0x88);
y = _mm_shuffle_ps(lo0, lo1, 0xdd);
stack_array[index+3] = x;
max = _mm_max_ps( x, max );
size_t localCount = (count & -4L) - 4*index;
float4 t0, t1, t2, t3, t4;
float4 * sap = &stack_array[index + localCount / 4];
vertices += localCount;
size_t byteIndex = -(localCount) * sizeof( float );
0: movaps %[max], %[t2] // move max out of the way to avoid propagating NaNs in max \n\
movaps (%[vertices], %[byteIndex], 4), %[t0] // vertices[0] \n\
movaps 16(%[vertices], %[byteIndex], 4), %[t1] // vertices[1] \n\
movaps %[t0], %[max] // vertices[0] \n\
movlhps %[t1], %[max] // x0y0x1y1 \n\
movaps 32(%[vertices], %[byteIndex], 4), %[t3] // vertices[2] \n\
movaps 48(%[vertices], %[byteIndex], 4), %[t4] // vertices[3] \n\
mulps %[vLo], %[max] // x0y0x1y1 * vLo \n\
movhlps %[t0], %[t1] // z0w0z1w1 \n\
movaps %[t3], %[t0] // vertices[2] \n\
movlhps %[t4], %[t0] // x2y2x3y3 \n\
mulps %[vLo], %[t0] // x2y2x3y3 * vLo \n\
movhlps %[t3], %[t4] // z2w2z3w3 \n\
shufps $0x88, %[t4], %[t1] // z0z1z2z3 \n\
mulps %[vHi], %[t1] // z0z1z2z3 * vHi \n\
movaps %[max], %[t3] // x0y0x1y1 * vLo \n\
shufps $0x88, %[t0], %[max] // x0x1x2x3 * vLo.x \n\
shufps $0xdd, %[t0], %[t3] // y0y1y2y3 * vLo.y \n\
addps %[t3], %[max] // x + y \n\
addps %[t1], %[max] // x + y + z \n\
movaps %[max], (%[sap], %[byteIndex]) // record result for later scrutiny \n\
maxps %[t2], %[max] // record max, restore max \n\
add $16, %[byteIndex] // advance loop counter\n\
: [max] "+x" (max), [t0] "=&x" (t0), [t1] "=&x" (t1), [t2] "=&x" (t2), [t3] "=&x" (t3), [t4] "=&x" (t4), [byteIndex] "+r" (byteIndex)
: [vLo] "x" (vLo), [vHi] "x" (vHi), [vertices] "r" (vertices), [sap] "r" (sap)
index += localCount/4;
for( unsigned int i=0; i<localCount/4; i++,index++)
float4 lo0 = _mm_movelh_ps( v0, v1);
float4 hi0 = _mm_movehl_ps( v1, v0);
float4 lo1 = _mm_movelh_ps( v2, v3);
float4 hi1 = _mm_movehl_ps( v3, v2);
float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
stack_array[index] = x;
max = _mm_max_ps( x, max );
float4 v0, v1, v2, x, y, z;
float4 lo0 = _mm_movelh_ps( v0, v1);
float4 hi0 = _mm_movehl_ps( v1, v0);
z = _mm_shuffle_ps(hi0, v2, 0xa8 );
float4 lo1 = _mm_movelh_ps(v2, v2);
x = _mm_shuffle_ps(lo0, lo1, 0x88);
y = _mm_shuffle_ps(lo0, lo1, 0xdd);
float4 xy = _mm_movelh_ps(v0, v1);
z = _mm_movehl_ps(v1, v0);
z = _mm_shuffle_ps( z, z, 0xa8);
x = _mm_shuffle_ps( xy, xy, 0xa8);
y = _mm_shuffle_ps( xy, xy, 0xfd);
z = _mm_shuffle_ps( xy, xy, 0xaa);
x = _mm_shuffle_ps(xy, xy, 0);
y = _mm_shuffle_ps(xy, xy, 0x55);
stack_array[index] = x;
max = _mm_max_ps( x, max );
if( 0 == segment || 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(max, dotMax)))
max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0x4e));
max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0xb1));
for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], max))); index++ )
maxIndex = 4*index + segment + indexTable[test];
_mm_store_ss( dotResult, dotMax);
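/* For reference, a minimal scalar sketch (disabled) of the contract implemented above,
   under the assumption that vv holds `count` vertices of 4 floats each (the 4th being
   padding) and vec is the direction: the largest dot product is written to *dotResult
   and the index of the winning vertex is returned. The helper name is illustrative
   only, not part of the original file. */
#if 0
static long maxdot_scalar_reference( const float *vv, const float *vec, unsigned long count, float *dotResult )
{
    // assumes count >= 1, as the SIMD paths above appear to
    long bestIndex = 0;
    float bestDot = vv[0]*vec[0] + vv[1]*vec[1] + vv[2]*vec[2];
    for( unsigned long i = 1; i < count; i++ )
    {
        const float *v = vv + 4*i;                            // 16-byte stride per vertex
        float dot = v[0]*vec[0] + v[1]*vec[1] + v[2]*vec[2];
        if( dot > bestDot )                                   // keep first occurrence on ties
        {
            bestDot = dot;
            bestIndex = (long)i;
        }
    }
    *dotResult = bestDot;
    return bestIndex;
}
#endif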
long _mindot_large( const float *vv, const float *vec, unsigned long count, float *dotResult );
long _mindot_large( const float *vv, const float *vec, unsigned long count, float *dotResult )
static const unsigned char indexTable[16] = { (unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 };
float4 vvec = _mm_loadu_ps( vec );
float4 vHi = btCastiTo128f(_mm_shuffle_epi32( btCastfTo128i( vvec), 0xaa ));
float4 vLo = _mm_movelh_ps( vvec, vvec );
float4 stack_array[ STACK_ARRAY_COUNT ];
memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );
for ( segment = 0; segment + STACK_ARRAY_COUNT*4 <= count; segment += STACK_ARRAY_COUNT*4 )
for( index = 0; index < STACK_ARRAY_COUNT; index+= 4 )
float4 v3 = vertices[3]; vertices += 4;
float4 lo0 = _mm_movelh_ps( v0, v1);
float4 hi0 = _mm_movehl_ps( v1, v0);
float4 lo1 = _mm_movelh_ps( v2, v3);
float4 hi1 = _mm_movehl_ps( v3, v2);
float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
stack_array[index] = x;
min = _mm_min_ps( x, min );
v3 = vertices[3]; vertices += 4;
lo0 = _mm_movelh_ps( v0, v1);
hi0 = _mm_movehl_ps( v1, v0);
lo1 = _mm_movelh_ps( v2, v3);
hi1 = _mm_movehl_ps( v3, v2);
z = _mm_shuffle_ps(hi0, hi1, 0x88);
x = _mm_shuffle_ps(lo0, lo1, 0x88);
y = _mm_shuffle_ps(lo0, lo1, 0xdd);
stack_array[index+1] = x;
min = _mm_min_ps( x, min );
v3 = vertices[3]; vertices += 4;
lo0 = _mm_movelh_ps( v0, v1);
hi0 = _mm_movehl_ps( v1, v0);
lo1 = _mm_movelh_ps( v2, v3);
hi1 = _mm_movehl_ps( v3, v2);
z = _mm_shuffle_ps(hi0, hi1, 0x88);
x = _mm_shuffle_ps(lo0, lo1, 0x88);
y = _mm_shuffle_ps(lo0, lo1, 0xdd);
stack_array[index+2] = x;
min = _mm_min_ps( x, min );
v3 = vertices[3]; vertices += 4;
lo0 = _mm_movelh_ps( v0, v1);
hi0 = _mm_movehl_ps( v1, v0);
lo1 = _mm_movelh_ps( v2, v3);
hi1 = _mm_movehl_ps( v3, v2);
z = _mm_shuffle_ps(hi0, hi1, 0x88);
x = _mm_shuffle_ps(lo0, lo1, 0x88);
y = _mm_shuffle_ps(lo0, lo1, 0xdd);
stack_array[index+3] = x;
min = _mm_min_ps( x, min );
if( 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(min, dotmin)))
min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0x4e));
min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0xb1));
for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], min))); index++ )
minIndex = 4*index + segment + indexTable[test];
for( ; index + 4 <= count / 4; index+=4 )
float4 v3 = vertices[3]; vertices += 4;
float4 lo0 = _mm_movelh_ps( v0, v1);
float4 hi0 = _mm_movehl_ps( v1, v0);
float4 lo1 = _mm_movelh_ps( v2, v3);
float4 hi1 = _mm_movehl_ps( v3, v2);
float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
stack_array[index] = x;
min = _mm_min_ps( x, min );
v3 = vertices[3]; vertices += 4;
lo0 = _mm_movelh_ps( v0, v1);
hi0 = _mm_movehl_ps( v1, v0);
lo1 = _mm_movelh_ps( v2, v3);
hi1 = _mm_movehl_ps( v3, v2);
z = _mm_shuffle_ps(hi0, hi1, 0x88);
x = _mm_shuffle_ps(lo0, lo1, 0x88);
y = _mm_shuffle_ps(lo0, lo1, 0xdd);
stack_array[index+1] = x;
min = _mm_min_ps( x, min );
v3 = vertices[3]; vertices += 4;
lo0 = _mm_movelh_ps( v0, v1);
hi0 = _mm_movehl_ps( v1, v0);
lo1 = _mm_movelh_ps( v2, v3);
hi1 = _mm_movehl_ps( v3, v2);
z = _mm_shuffle_ps(hi0, hi1, 0x88);
x = _mm_shuffle_ps(lo0, lo1, 0x88);
y = _mm_shuffle_ps(lo0, lo1, 0xdd);
stack_array[index+2] = x;
min = _mm_min_ps( x, min );
v3 = vertices[3]; vertices += 4;
lo0 = _mm_movelh_ps( v0, v1);
hi0 = _mm_movehl_ps( v1, v0);
lo1 = _mm_movelh_ps( v2, v3);
hi1 = _mm_movehl_ps( v3, v2);
z = _mm_shuffle_ps(hi0, hi1, 0x88);
x = _mm_shuffle_ps(lo0, lo1, 0x88);
y = _mm_shuffle_ps(lo0, lo1, 0xdd);
stack_array[index+3] = x;
min = _mm_min_ps( x, min );
size_t localCount = (count & -4L) - 4*index;
vertices += localCount;
float4 t0, t1, t2, t3, t4;
size_t byteIndex = -(localCount) * sizeof( float );
float4 * sap = &stack_array[index + localCount / 4];
0: movaps %[min], %[t2] // move min out of the way to avoid propagating NaNs in min \n\
movaps (%[vertices], %[byteIndex], 4), %[t0] // vertices[0] \n\
movaps 16(%[vertices], %[byteIndex], 4), %[t1] // vertices[1] \n\
movaps %[t0], %[min] // vertices[0] \n\
movlhps %[t1], %[min] // x0y0x1y1 \n\
movaps 32(%[vertices], %[byteIndex], 4), %[t3] // vertices[2] \n\
movaps 48(%[vertices], %[byteIndex], 4), %[t4] // vertices[3] \n\
mulps %[vLo], %[min] // x0y0x1y1 * vLo \n\
movhlps %[t0], %[t1] // z0w0z1w1 \n\
movaps %[t3], %[t0] // vertices[2] \n\
movlhps %[t4], %[t0] // x2y2x3y3 \n\
movhlps %[t3], %[t4] // z2w2z3w3 \n\
mulps %[vLo], %[t0] // x2y2x3y3 * vLo \n\
shufps $0x88, %[t4], %[t1] // z0z1z2z3 \n\
mulps %[vHi], %[t1] // z0z1z2z3 * vHi \n\
movaps %[min], %[t3] // x0y0x1y1 * vLo \n\
shufps $0x88, %[t0], %[min] // x0x1x2x3 * vLo.x \n\
shufps $0xdd, %[t0], %[t3] // y0y1y2y3 * vLo.y \n\
addps %[t3], %[min] // x + y \n\
addps %[t1], %[min] // x + y + z \n\
movaps %[min], (%[sap], %[byteIndex]) // record result for later scrutiny \n\
minps %[t2], %[min] // record min, restore min \n\
add $16, %[byteIndex] // advance loop counter\n\
: [min] "+x" (min), [t0] "=&x" (t0), [t1] "=&x" (t1), [t2] "=&x" (t2), [t3] "=&x" (t3), [t4] "=&x" (t4), [byteIndex] "+r" (byteIndex)
: [vLo] "x" (vLo), [vHi] "x" (vHi), [vertices] "r" (vertices), [sap] "r" (sap)
index += localCount/4;
for( unsigned int i=0; i<localCount/4; i++,index++)
float4 lo0 = _mm_movelh_ps( v0, v1);
float4 hi0 = _mm_movehl_ps( v1, v0);
float4 lo1 = _mm_movelh_ps( v2, v3);
float4 hi1 = _mm_movehl_ps( v3, v2);
float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
stack_array[index] = x;
min = _mm_min_ps( x, min );
float4 v0, v1, v2, x, y, z;
float4 lo0 = _mm_movelh_ps( v0, v1);
float4 hi0 = _mm_movehl_ps( v1, v0);
z = _mm_shuffle_ps(hi0, v2, 0xa8 );
float4 lo1 = _mm_movelh_ps(v2, v2);
x = _mm_shuffle_ps(lo0, lo1, 0x88);
y = _mm_shuffle_ps(lo0, lo1, 0xdd);
float4 xy = _mm_movelh_ps(v0, v1);
z = _mm_movehl_ps(v1, v0);
z = _mm_shuffle_ps( z, z, 0xa8);
x = _mm_shuffle_ps( xy, xy, 0xa8);
y = _mm_shuffle_ps( xy, xy, 0xfd);
z = _mm_shuffle_ps( xy, xy, 0xaa);
x = _mm_shuffle_ps(xy, xy, 0);
y = _mm_shuffle_ps(xy, xy, 0x55);
stack_array[index] = x;
min = _mm_min_ps( x, min );
if( 0 == segment || 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(min, dotmin)))
min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0x4e));
min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0xb1));
for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], min))); index++ )
minIndex = 4*index + segment + indexTable[test];
_mm_store_ss( dotResult, dotmin);
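/* _mindot_large above mirrors _maxdot_large: the same transpose and dot-product
   pipeline, with _mm_min_ps/minps in place of _mm_max_ps/maxps and the final rescan
   looking for the smallest value instead of the largest. */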
#elif defined BT_USE_NEON
#define ARM_NEON_GCC_COMPATIBILITY 1
#include <arm_neon.h>
#include <sys/types.h>
#include <sys/sysctl.h>
static long _maxdot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult );
static long _maxdot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult );
static long _maxdot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult );
static long _mindot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult );
static long _mindot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult );
static long _mindot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult );
long (*_maxdot_large)( const float *vv, const float *vec, unsigned long count, float *dotResult ) = _maxdot_large_sel;
long (*_mindot_large)( const float *vv, const float *vec, unsigned long count, float *dotResult ) = _mindot_large_sel;
static inline uint32_t btGetCpuCapabilities( void )
static bool testedCapabilities = false;
if( 0 == testedCapabilities)
size_t featureSize = sizeof( hasFeature );
int err = sysctlbyname( "hw.optional.neon_hpfp", &hasFeature, &featureSize, NULL, 0 );
if( 0 == err && hasFeature)
capabilities |= 0x2000;
testedCapabilities = true;
static long _maxdot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult )
if( btGetCpuCapabilities() & 0x2000 )
_maxdot_large = _maxdot_large_v1;
_maxdot_large = _maxdot_large_v0;
return _maxdot_large(vv, vec, count, dotResult);
static long _mindot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult )
if( btGetCpuCapabilities() & 0x2000 )
_mindot_large = _mindot_large_v1;
_mindot_large = _mindot_large_v0;
return _mindot_large(vv, vec, count, dotResult);
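/* Dispatch pattern: _maxdot_large/_mindot_large are function pointers that start out
   bound to the _sel stubs. On first use the stub probes the CPU once (the sysctlbyname
   "hw.optional.neon_hpfp" query above, cached via testedCapabilities), rebinds the
   pointer to the _v0 or _v1 variant depending on the reported feature bit, and forwards
   the call, so subsequent calls go straight to the chosen implementation. */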
#define vld1q_f32_aligned_postincrement( _ptr ) ({ float32x4_t _r; asm( "vld1.f32 {%0}, [%1, :128]!\n" : "=w" (_r), "+r" (_ptr) ); _r; })
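/* vld1q_f32_aligned_postincrement is a GCC statement-expression: it emits a single
   VLD1.F32 with the :128 alignment hint and writeback, i.e. roughly "load a
   float32x4_t from _ptr (which must be 16-byte aligned) and advance _ptr by 16 bytes",
   something a plain vld1q_f32 intrinsic would not express in one step. */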
long _maxdot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult )
float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
float32x2_t vLo = vget_low_f32(vvec);
float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0);
uint32x2_t indexLo = (uint32x2_t) {0, 1};
uint32x2_t indexHi = (uint32x2_t) {2, 3};
uint32x2_t iLo = (uint32x2_t) { static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
uint32x2_t iHi = (uint32x2_t) { static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
const uint32x2_t four = (uint32x2_t) {4,4};
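/* Vectorized argmax bookkeeping: indexLo/indexHi hold the candidate indices of the
   current pair of dot products ({i, i+1} and {i+2, i+3}), iLo/iHi hold the indices of
   the best values seen so far (initialized to -1), and each comparison mask feeds vbsl
   so value and index are updated together before the candidates advance by four. */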
for( ; i+8 <= count; i+= 8 )
float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);
float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
float32x2_t zLo = vmul_f32( z0.val[0], vHi);
float32x2_t zHi = vmul_f32( z1.val[0], vHi);
float32x2_t rLo = vpadd_f32( xy0, xy1);
float32x2_t rHi = vpadd_f32( xy2, xy3);
rLo = vadd_f32(rLo, zLo);
rHi = vadd_f32(rHi, zHi);
uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi );
dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
iLo = vbsl_u32(maskLo, indexLo, iLo);
iHi = vbsl_u32(maskHi, indexHi, iHi);
indexLo = vadd_u32(indexLo, four);
indexHi = vadd_u32(indexHi, four);
v0 = vld1q_f32_aligned_postincrement( vv );
v1 = vld1q_f32_aligned_postincrement( vv );
v2 = vld1q_f32_aligned_postincrement( vv );
v3 = vld1q_f32_aligned_postincrement( vv );
xy0 = vmul_f32( vget_low_f32(v0), vLo);
xy1 = vmul_f32( vget_low_f32(v1), vLo);
xy2 = vmul_f32( vget_low_f32(v2), vLo);
xy3 = vmul_f32( vget_low_f32(v3), vLo);
z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
zLo = vmul_f32( z0.val[0], vHi);
zHi = vmul_f32( z1.val[0], vHi);
rLo = vpadd_f32( xy0, xy1);
rHi = vpadd_f32( xy2, xy3);
rLo = vadd_f32(rLo, zLo);
rHi = vadd_f32(rHi, zHi);
maskLo = vcgt_f32( rLo, dotMaxLo );
maskHi = vcgt_f32( rHi, dotMaxHi );
dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
iLo = vbsl_u32(maskLo, indexLo, iLo);
iHi = vbsl_u32(maskHi, indexHi, iHi);
indexLo = vadd_u32(indexLo, four);
indexHi = vadd_u32(indexHi, four);
for( ; i+4 <= count; i+= 4 )
float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);
float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
float32x2_t zLo = vmul_f32( z0.val[0], vHi);
float32x2_t zHi = vmul_f32( z1.val[0], vHi);
float32x2_t rLo = vpadd_f32( xy0, xy1);
float32x2_t rHi = vpadd_f32( xy2, xy3);
rLo = vadd_f32(rLo, zLo);
rHi = vadd_f32(rHi, zHi);
uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi );
dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
iLo = vbsl_u32(maskLo, indexLo, iLo);
iHi = vbsl_u32(maskHi, indexHi, iHi);
indexLo = vadd_u32(indexLo, four);
indexHi = vadd_u32(indexHi, four);
float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
float32x2_t zLo = vmul_f32( z0.val[0], vHi);
float32x2_t zHi = vmul_f32( vdup_lane_f32(vget_high_f32(v2), 0), vHi);
float32x2_t rLo = vpadd_f32( xy0, xy1);
float32x2_t rHi = vpadd_f32( xy2, xy2);
rLo = vadd_f32(rLo, zLo);
rHi = vadd_f32(rHi, zHi);
uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi );
dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
iLo = vbsl_u32(maskLo, indexLo, iLo);
iHi = vbsl_u32(maskHi, indexHi, iHi);
float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
float32x2_t zLo = vmul_f32( z0.val[0], vHi);
float32x2_t rLo = vpadd_f32( xy0, xy1);
rLo = vadd_f32(rLo, zLo);
uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
iLo = vbsl_u32(maskLo, indexLo, iLo);
float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
float32x2_t zLo = vmul_f32( z0, vHi);
float32x2_t rLo = vpadd_f32( xy0, xy0);
rLo = vadd_f32(rLo, zLo);
uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
iLo = vbsl_u32(maskLo, indexLo, iLo);
uint32x2_t mask = vcgt_f32( dotMaxHi, dotMaxLo );
dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
iLo = vbsl_u32(mask, iHi, iLo);
dotMaxHi = vdup_lane_f32(dotMaxLo, 1);
iHi = vdup_lane_u32(iLo, 1);
mask = vcgt_f32( dotMaxHi, dotMaxLo );
dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
iLo = vbsl_u32(mask, iHi, iLo);
*dotResult = vget_lane_f32( dotMaxLo, 0);
return vget_lane_u32(iLo, 0);
long _maxdot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult )
float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec));
float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);
const uint32x4_t four = (uint32x4_t){ 4, 4, 4, 4 };
uint32x4_t local_index = (uint32x4_t) {0, 1, 2, 3};
uint32x4_t index = (uint32x4_t) { static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1) };
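/* _maxdot_large_v1 is the q-register variant: it gathers four xy pairs and four z
   values into 128-bit vectors, deinterleaves them with vuzpq_f32, and forms four dot
   products per step, keeping a single 4-lane running maximum and index vector instead
   of the paired 2-lane ones used by _maxdot_large_v0. */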
unsigned long i = 0;
for( ; i + 8 <= count; i += 8 )
float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
xy0 = vmulq_f32(xy0, vLo);
xy1 = vmulq_f32(xy1, vLo);
float32x4x2_t zb = vuzpq_f32( z0, z1);
float32x4_t z = vmulq_f32( zb.val[0], vHi);
float32x4x2_t xy = vuzpq_f32( xy0, xy1);
float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
x = vaddq_f32(x, z);
uint32x4_t mask = vcgtq_f32(x, maxDot);
maxDot = vbslq_f32( mask, x, maxDot);
index = vbslq_u32(mask, local_index, index);
local_index = vaddq_u32(local_index, four);
v0 = vld1q_f32_aligned_postincrement( vv );
v1 = vld1q_f32_aligned_postincrement( vv );
v2 = vld1q_f32_aligned_postincrement( vv );
v3 = vld1q_f32_aligned_postincrement( vv );
xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
xy0 = vmulq_f32(xy0, vLo);
xy1 = vmulq_f32(xy1, vLo);
zb = vuzpq_f32( z0, z1);
z = vmulq_f32( zb.val[0], vHi);
xy = vuzpq_f32( xy0, xy1);
x = vaddq_f32(xy.val[0], xy.val[1]);
x = vaddq_f32(x, z);
mask = vcgtq_f32(x, maxDot);
maxDot = vbslq_f32( mask, x, maxDot);
index = vbslq_u32(mask, local_index, index);
local_index = vaddq_u32(local_index, four);
for( ; i + 4 <= count; i += 4 )
float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
xy0 = vmulq_f32(xy0, vLo);
xy1 = vmulq_f32(xy1, vLo);
float32x4x2_t zb = vuzpq_f32( z0, z1);
float32x4_t z = vmulq_f32( zb.val[0], vHi);
float32x4x2_t xy = vuzpq_f32( xy0, xy1);
float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
x = vaddq_f32(x, z);
uint32x4_t mask = vcgtq_f32(x, maxDot);
maxDot = vbslq_f32( mask, x, maxDot);
index = vbslq_u32(mask, local_index, index);
local_index = vaddq_u32(local_index, four);
switch (count & 3) {
float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v2));
float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v2));
xy0 = vmulq_f32(xy0, vLo);
xy1 = vmulq_f32(xy1, vLo);
float32x4x2_t zb = vuzpq_f32( z0, z1);
float32x4_t z = vmulq_f32( zb.val[0], vHi);
float32x4x2_t xy = vuzpq_f32( xy0, xy1);
float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
x = vaddq_f32(x, z);
uint32x4_t mask = vcgtq_f32(x, maxDot);
maxDot = vbslq_f32( mask, x, maxDot);
index = vbslq_u32(mask, local_index, index);
local_index = vaddq_u32(local_index, four);
float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
xy0 = vmulq_f32(xy0, vLo);
float32x4x2_t zb = vuzpq_f32( z0, z0);
float32x4_t z = vmulq_f32( zb.val[0], vHi);
float32x4x2_t xy = vuzpq_f32( xy0, xy0);
float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
x = vaddq_f32(x, z);
uint32x4_t mask = vcgtq_f32(x, maxDot);
maxDot = vbslq_f32( mask, x, maxDot);
index = vbslq_u32(mask, local_index, index);
local_index = vaddq_u32(local_index, four);
float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v0));
float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0);
xy0 = vmulq_f32(xy0, vLo);
z = vmulq_f32( z, vHi);
float32x4x2_t xy = vuzpq_f32( xy0, xy0);
float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
x = vaddq_f32(x, z);
uint32x4_t mask = vcgtq_f32(x, maxDot);
maxDot = vbslq_f32( mask, x, maxDot);
index = vbslq_u32(mask, local_index, index);
local_index = vaddq_u32(local_index, four);
uint32x2_t mask = vcgt_f32( vget_high_f32(maxDot), vget_low_f32(maxDot));
float32x2_t maxDot2 = vbsl_f32(mask, vget_high_f32(maxDot), vget_low_f32(maxDot));
uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));
float32x2_t maxDotO = vdup_lane_f32(maxDot2, 1);
uint32x2_t indexHi = vdup_lane_u32(index2, 1);
mask = vcgt_f32( maxDotO, maxDot2 );
maxDot2 = vbsl_f32(mask, maxDotO, maxDot2);
index2 = vbsl_u32(mask, indexHi, index2);
*dotResult = vget_lane_f32( maxDot2, 0);
return vget_lane_u32(index2, 0);
long _mindot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult )
unsigned long i = 0;
float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
float32x2_t vLo = vget_low_f32(vvec);
float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0);
uint32x2_t indexLo = (uint32x2_t) {0, 1};
uint32x2_t indexHi = (uint32x2_t) {2, 3};
uint32x2_t iLo = (uint32x2_t) { static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
uint32x2_t iHi = (uint32x2_t) { static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
const uint32x2_t four = (uint32x2_t) {4,4};
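/* _mindot_large_v0 and _mindot_large_v1 below repeat the two maxdot variants with the
   comparison direction flipped (vclt/vcltq instead of vcgt/vcgtq) and the running
   value tracked as a minimum. */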
for( ; i+8 <= count; i+= 8 )
float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);
float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
float32x2_t zLo = vmul_f32( z0.val[0], vHi);
float32x2_t zHi = vmul_f32( z1.val[0], vHi);
float32x2_t rLo = vpadd_f32( xy0, xy1);
float32x2_t rHi = vpadd_f32( xy2, xy3);
rLo = vadd_f32(rLo, zLo);
rHi = vadd_f32(rHi, zHi);
uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
uint32x2_t maskHi = vclt_f32( rHi, dotMinHi );
dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
iLo = vbsl_u32(maskLo, indexLo, iLo);
iHi = vbsl_u32(maskHi, indexHi, iHi);
indexLo = vadd_u32(indexLo, four);
indexHi = vadd_u32(indexHi, four);
v0 = vld1q_f32_aligned_postincrement( vv );
v1 = vld1q_f32_aligned_postincrement( vv );
v2 = vld1q_f32_aligned_postincrement( vv );
v3 = vld1q_f32_aligned_postincrement( vv );
xy0 = vmul_f32( vget_low_f32(v0), vLo);
xy1 = vmul_f32( vget_low_f32(v1), vLo);
xy2 = vmul_f32( vget_low_f32(v2), vLo);
xy3 = vmul_f32( vget_low_f32(v3), vLo);
z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
zLo = vmul_f32( z0.val[0], vHi);
zHi = vmul_f32( z1.val[0], vHi);
rLo = vpadd_f32( xy0, xy1);
rHi = vpadd_f32( xy2, xy3);
rLo = vadd_f32(rLo, zLo);
rHi = vadd_f32(rHi, zHi);
maskLo = vclt_f32( rLo, dotMinLo );
maskHi = vclt_f32( rHi, dotMinHi );
dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
iLo = vbsl_u32(maskLo, indexLo, iLo);
iHi = vbsl_u32(maskHi, indexHi, iHi);
indexLo = vadd_u32(indexLo, four);
indexHi = vadd_u32(indexHi, four);
for( ; i+4 <= count; i+= 4 )
float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);
float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
float32x2_t zLo = vmul_f32( z0.val[0], vHi);
float32x2_t zHi = vmul_f32( z1.val[0], vHi);
float32x2_t rLo = vpadd_f32( xy0, xy1);
float32x2_t rHi = vpadd_f32( xy2, xy3);
rLo = vadd_f32(rLo, zLo);
rHi = vadd_f32(rHi, zHi);
uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
uint32x2_t maskHi = vclt_f32( rHi, dotMinHi );
dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
iLo = vbsl_u32(maskLo, indexLo, iLo);
iHi = vbsl_u32(maskHi, indexHi, iHi);
indexLo = vadd_u32(indexLo, four);
indexHi = vadd_u32(indexHi, four);
float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
float32x2_t zLo = vmul_f32( z0.val[0], vHi);
float32x2_t zHi = vmul_f32( vdup_lane_f32(vget_high_f32(v2), 0), vHi);
float32x2_t rLo = vpadd_f32( xy0, xy1);
float32x2_t rHi = vpadd_f32( xy2, xy2);
rLo = vadd_f32(rLo, zLo);
rHi = vadd_f32(rHi, zHi);
uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
uint32x2_t maskHi = vclt_f32( rHi, dotMinHi );
dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
iLo = vbsl_u32(maskLo, indexLo, iLo);
iHi = vbsl_u32(maskHi, indexHi, iHi);
float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
float32x2_t zLo = vmul_f32( z0.val[0], vHi);
float32x2_t rLo = vpadd_f32( xy0, xy1);
rLo = vadd_f32(rLo, zLo);
uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
iLo = vbsl_u32(maskLo, indexLo, iLo);
float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
float32x2_t zLo = vmul_f32( z0, vHi);
float32x2_t rLo = vpadd_f32( xy0, xy0);
rLo = vadd_f32(rLo, zLo);
uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
iLo = vbsl_u32(maskLo, indexLo, iLo);
uint32x2_t mask = vclt_f32( dotMinHi, dotMinLo );
dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
iLo = vbsl_u32(mask, iHi, iLo);
dotMinHi = vdup_lane_f32(dotMinLo, 1);
iHi = vdup_lane_u32(iLo, 1);
mask = vclt_f32( dotMinHi, dotMinLo );
dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
iLo = vbsl_u32(mask, iHi, iLo);
*dotResult = vget_lane_f32( dotMinLo, 0);
return vget_lane_u32(iLo, 0);
long _mindot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult )
float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec));
float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);
const uint32x4_t four = (uint32x4_t){ 4, 4, 4, 4 };
uint32x4_t local_index = (uint32x4_t) {0, 1, 2, 3};
uint32x4_t index = (uint32x4_t) { static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1) };
unsigned long i = 0;
for( ; i + 8 <= count; i += 8 )
float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
xy0 = vmulq_f32(xy0, vLo);
xy1 = vmulq_f32(xy1, vLo);
float32x4x2_t zb = vuzpq_f32( z0, z1);
float32x4_t z = vmulq_f32( zb.val[0], vHi);
float32x4x2_t xy = vuzpq_f32( xy0, xy1);
float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
x = vaddq_f32(x, z);
uint32x4_t mask = vcltq_f32(x, minDot);
minDot = vbslq_f32( mask, x, minDot);
index = vbslq_u32(mask, local_index, index);
local_index = vaddq_u32(local_index, four);
v0 = vld1q_f32_aligned_postincrement( vv );
v1 = vld1q_f32_aligned_postincrement( vv );
v2 = vld1q_f32_aligned_postincrement( vv );
v3 = vld1q_f32_aligned_postincrement( vv );
xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
xy0 = vmulq_f32(xy0, vLo);
xy1 = vmulq_f32(xy1, vLo);
zb = vuzpq_f32( z0, z1);
z = vmulq_f32( zb.val[0], vHi);
xy = vuzpq_f32( xy0, xy1);
x = vaddq_f32(xy.val[0], xy.val[1]);
x = vaddq_f32(x, z);
mask = vcltq_f32(x, minDot);
minDot = vbslq_f32( mask, x, minDot);
index = vbslq_u32(mask, local_index, index);
local_index = vaddq_u32(local_index, four);
for( ; i + 4 <= count; i += 4 )
float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
xy0 = vmulq_f32(xy0, vLo);
xy1 = vmulq_f32(xy1, vLo);
float32x4x2_t zb = vuzpq_f32( z0, z1);
float32x4_t z = vmulq_f32( zb.val[0], vHi);
float32x4x2_t xy = vuzpq_f32( xy0, xy1);
float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
x = vaddq_f32(x, z);
uint32x4_t mask = vcltq_f32(x, minDot);
minDot = vbslq_f32( mask, x, minDot);
index = vbslq_u32(mask, local_index, index);
local_index = vaddq_u32(local_index, four);
switch (count & 3) {
float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v2));
float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v2));
xy0 = vmulq_f32(xy0, vLo);
xy1 = vmulq_f32(xy1, vLo);
float32x4x2_t zb = vuzpq_f32( z0, z1);
float32x4_t z = vmulq_f32( zb.val[0], vHi);
float32x4x2_t xy = vuzpq_f32( xy0, xy1);
float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
x = vaddq_f32(x, z);
uint32x4_t mask = vcltq_f32(x, minDot);
minDot = vbslq_f32( mask, x, minDot);
index = vbslq_u32(mask, local_index, index);
local_index = vaddq_u32(local_index, four);
float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
xy0 = vmulq_f32(xy0, vLo);
float32x4x2_t zb = vuzpq_f32( z0, z0);
float32x4_t z = vmulq_f32( zb.val[0], vHi);
float32x4x2_t xy = vuzpq_f32( xy0, xy0);
float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
x = vaddq_f32(x, z);
uint32x4_t mask = vcltq_f32(x, minDot);
minDot = vbslq_f32( mask, x, minDot);
index = vbslq_u32(mask, local_index, index);
local_index = vaddq_u32(local_index, four);
float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v0));
float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0);
xy0 = vmulq_f32(xy0, vLo);
z = vmulq_f32( z, vHi);
float32x4x2_t xy = vuzpq_f32( xy0, xy0);
float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
x = vaddq_f32(x, z);
uint32x4_t mask = vcltq_f32(x, minDot);
minDot = vbslq_f32( mask, x, minDot);
index = vbslq_u32(mask, local_index, index);
local_index = vaddq_u32(local_index, four);
uint32x2_t mask = vclt_f32( vget_high_f32(minDot), vget_low_f32(minDot));
float32x2_t minDot2 = vbsl_f32(mask, vget_high_f32(minDot), vget_low_f32(minDot));
uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));
float32x2_t minDotO = vdup_lane_f32(minDot2, 1);
uint32x2_t indexHi = vdup_lane_u32(index2, 1);
mask = vclt_f32( minDotO, minDot2 );
minDot2 = vbsl_f32(mask, minDotO, minDot2);
index2 = vbsl_u32(mask, indexHi, index2);
*dotResult = vget_lane_f32( minDot2, 0);
return vget_lane_u32(index2, 0);
#error Unhandled __APPLE__ arch
static float max(float a, float b) { return (a > b) ? a : b; }
static float min(float a, float b) { return (a < b) ? a : b; }