30 #ifndef _VECTORMATH_VEC_AOS_CPP_H
31 #define _VECTORMATH_VEC_AOS_CPP_H
// Word-sized byte-index selectors. Each constant packs four consecutive byte
// indices into one 32-bit value: X, Y, Z, W cover bytes 0x00-0x0f and
// A, B, C, D cover bytes 0x10-0x1f.
// NOTE(review): presumably X..W address the first operand of a byte permute and
// A..D the second -- the permute routine itself is not in view, confirm there.
37 #define _VECTORMATH_PERM_X 0x00010203
38 #define _VECTORMATH_PERM_Y 0x04050607
39 #define _VECTORMATH_PERM_Z 0x08090a0b
40 #define _VECTORMATH_PERM_W 0x0c0d0e0f
41 #define _VECTORMATH_PERM_A 0x10111213
42 #define _VECTORMATH_PERM_B 0x14151617
43 #define _VECTORMATH_PERM_C 0x18191a1b
44 #define _VECTORMATH_PERM_D 0x1c1d1e1f
// Four-word permute patterns assembled from the selectors above. The four
// letters in each name are, in order, the selector placed in each of the four
// 32-bit slots. The literal is built as a vec_uint4 and cast to vec_uchar16;
// both types are declared outside this view.
45 #define _VECTORMATH_PERM_XYZA (vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_X, _VECTORMATH_PERM_Y, _VECTORMATH_PERM_Z, _VECTORMATH_PERM_A }
46 #define _VECTORMATH_PERM_ZXYW (vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_Z, _VECTORMATH_PERM_X, _VECTORMATH_PERM_Y, _VECTORMATH_PERM_W }
47 #define _VECTORMATH_PERM_YZXW (vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_Y, _VECTORMATH_PERM_Z, _VECTORMATH_PERM_X, _VECTORMATH_PERM_W }
48 #define _VECTORMATH_PERM_YZAB (vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_Y, _VECTORMATH_PERM_Z, _VECTORMATH_PERM_A, _VECTORMATH_PERM_B }
49 #define _VECTORMATH_PERM_ZABC (vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_Z, _VECTORMATH_PERM_A, _VECTORMATH_PERM_B, _VECTORMATH_PERM_C }
50 #define _VECTORMATH_PERM_XYAW (vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_X, _VECTORMATH_PERM_Y, _VECTORMATH_PERM_A, _VECTORMATH_PERM_W }
51 #define _VECTORMATH_PERM_XAZW (vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_X, _VECTORMATH_PERM_A, _VECTORMATH_PERM_Z, _VECTORMATH_PERM_W }
// Single-lane bit masks: exactly one 32-bit lane is all ones, the other three
// are zero. The position of the 'F' in the name matches the lane that is set
// (0xF000 -> first lane, 0x000F -> last lane).
52 #define _VECTORMATH_MASK_0xF000 (vec_uint4){ 0xffffffff, 0, 0, 0 }
53 #define _VECTORMATH_MASK_0x0F00 (vec_uint4){ 0, 0xffffffff, 0, 0 }
54 #define _VECTORMATH_MASK_0x00F0 (vec_uint4){ 0, 0, 0xffffffff, 0 }
55 #define _VECTORMATH_MASK_0x000F (vec_uint4){ 0, 0, 0, 0xffffffff }
// Unit basis vectors built with _mm_setr_ps: one lane is 1.0f and the rest are
// 0.0f. The digits in the name give the lane values in argument order. The
// trailing comment on each line keeps the brace-initializer spelling of the
// same value for reference.
56 #define _VECTORMATH_UNIT_1000 _mm_setr_ps(1.0f,0.0f,0.0f,0.0f) // (__m128){ 1.0f, 0.0f, 0.0f, 0.0f }
57 #define _VECTORMATH_UNIT_0100 _mm_setr_ps(0.0f,1.0f,0.0f,0.0f) // (__m128){ 0.0f, 1.0f, 0.0f, 0.0f }
58 #define _VECTORMATH_UNIT_0010 _mm_setr_ps(0.0f,0.0f,1.0f,0.0f) // (__m128){ 0.0f, 0.0f, 1.0f, 0.0f }
59 #define _VECTORMATH_UNIT_0001 _mm_setr_ps(0.0f,0.0f,0.0f,1.0f) // (__m128){ 0.0f, 0.0f, 0.0f, 1.0f }
// Tolerance constant for slerp.
// NOTE(review): presumably the cos(angle) threshold above which slerp switches
// to plain linear weights -- the comparison that uses it is not in view, confirm.
60 #define _VECTORMATH_SLERP_TOL 0.999f
66 #ifndef _VECTORMATH_INTERNAL_FUNCTIONS
67 #define _VECTORMATH_INTERNAL_FUNCTIONS
// Convenience wrapper around _mm_shuffle_ps that takes the four lane selectors
// in x, y, z, w order. _MM_SHUFFLE expects them reversed (w, z, y, x), so the
// macro swaps the order when forwarding. Per the Intel intrinsic's
// documentation, the two low result lanes are picked from `a` and the two high
// result lanes from `b`. The selectors are forwarded to _MM_SHUFFLE unchanged.
69 #define _vmath_shufps(a, b, immx, immy, immz, immw) _mm_shuffle_ps(a, b, _MM_SHUFFLE(immw, immz, immy, immx))
72 __m128 result = _mm_mul_ps( vec0, vec1);
78 __m128 result = _mm_mul_ps(vec0, vec1);
79 return _mm_add_ps(_mm_shuffle_ps(result, result, _MM_SHUFFLE(0,0,0,0)),
80 _mm_add_ps(_mm_shuffle_ps(result, result, _MM_SHUFFLE(1,1,1,1)),
81 _mm_add_ps(_mm_shuffle_ps(result, result, _MM_SHUFFLE(2,2,2,2)), _mm_shuffle_ps(result, result, _MM_SHUFFLE(3,3,3,3)))));
86 __m128 tmp0, tmp1, tmp2, tmp3, result;
87 tmp0 = _mm_shuffle_ps( vec0, vec0, _MM_SHUFFLE(3,0,2,1) );
88 tmp1 = _mm_shuffle_ps( vec1, vec1, _MM_SHUFFLE(3,1,0,2) );
89 tmp2 = _mm_shuffle_ps( vec0, vec0, _MM_SHUFFLE(3,1,0,2) );
90 tmp3 = _mm_shuffle_ps( vec1, vec1, _MM_SHUFFLE(3,0,2,1) );
149 d.
f[slot] = s.
f[slot];
// Stores `scalar` into element `slot` of `vec` by taking the address of `vec`,
// reinterpreting it as float*, and indexing. `vec` must therefore be an lvalue.
// No bounds check is performed on `slot`. The expansion is a bare assignment
// expression with no enclosing parentheses and no trailing semicolon, so the
// caller supplies the semicolon and should use it as a statement only.
153 #define _vmathVfSetElement(vec, scalar, slot) ((float *)&(vec))[slot] = scalar
157 return _mm_set1_ps(scalar);
162 namespace Vectormath {
166 #ifdef _VECTORMATH_NO_SCALAR_CAST
169 return floatInVec(ref, i);
177 return ((
float *)&ref)[i];
194 return *
this = floatInVec(scalar.ref, scalar.i);
199 return *
this *= floatInVec(scalar);
204 return *
this = floatInVec(ref, i) * scalar;
209 return *
this /= floatInVec(scalar);
212 inline floatInVec VecIdx::operator /=(
const floatInVec &scalar )
214 return *
this = floatInVec(ref, i) / scalar;
219 return *
this += floatInVec(scalar);
224 return *
this = floatInVec(ref, i) + scalar;
229 return *
this -= floatInVec(scalar);
234 return *
this = floatInVec(ref, i) - scalar;
250 mVec128 = _mm_setr_ps(_x, _y, _z, 0.0f);
255 __m128 xz = _mm_unpacklo_ps( _x.
get128(), _z.
get128() );
296 return lerp( floatInVec(t), vec0, vec1 );
301 return ( vec0 + ( ( vec1 - vec0 ) * t ) );
311 __m128 scales, scale0, scale1, cosAngle, angle, tttt, oneMinusT, angles, sines;
314 angle =
acosf4( cosAngle );
316 oneMinusT = _mm_sub_ps( _mm_set1_ps(1.0f), tttt );
317 angles = _mm_unpacklo_ps( _mm_set1_ps(1.0f), tttt );
318 angles = _mm_unpacklo_ps( angles, oneMinusT );
319 angles = _mm_mul_ps( angles, angle );
320 sines =
sinf4( angles );
321 scales = _mm_div_ps( sines,
vec_splat( sines, 0 ) );
334 #ifdef USE_SSE3_LDDQU
335 vec =
Point3(
SSEFloat(_mm_lddqu_si128((
const __m128i*)((
float*)(fptr)))).m128 );
343 #endif //USE_SSE3_LDDQU
351 #ifdef USE_SSE3_LDDQU
352 vec = Vector3(
SSEFloat(_mm_lddqu_si128((
const __m128i*)((
float*)(fptr)))).m128 );
359 vec = Vector3( fl.
m128);
360 #endif //USE_SSE3_LDDQU
366 __m128 dstVec = *quad;
374 fptr[0] = vec.getX();
375 fptr[1] = vec.getY();
376 fptr[2] = vec.getZ();
381 fptr[0] = vec.getX();
382 fptr[1] = vec.getY();
383 fptr[2] = vec.getZ();
389 const float *quads = (
float *)threeQuads;
390 vec0 =
Vector3( _mm_load_ps(quads) );
391 vec1 =
Vector3( _mm_loadu_ps(quads + 3) );
392 vec2 =
Vector3( _mm_loadu_ps(quads + 6) );
393 vec3 =
Vector3( _mm_loadu_ps(quads + 9) );
398 __m128 xxxx = _mm_shuffle_ps( vec1.
get128(), vec1.
get128(), _MM_SHUFFLE(0, 0, 0, 0) );
399 __m128 zzzz = _mm_shuffle_ps( vec2.
get128(), vec2.
get128(), _MM_SHUFFLE(2, 2, 2, 2) );
403 threeQuads[1] = _mm_shuffle_ps( vec1.
get128(), vec2.
get128(), _MM_SHUFFLE(1, 0, 2, 1) );
404 threeQuads[2] =
vec_sel( _mm_shuffle_ps( vec3.
get128(), vec3.
get128(), _MM_SHUFFLE(2, 1, 0, 3) ), zzzz, zsw );
502 return floatInVec(
mVec128, idx );
517 return Point3( _mm_add_ps(
mVec128, pnt.get128() ) );
522 return *
this * floatInVec(scalar);
544 *
this = *
this * scalar;
550 *
this = *
this * scalar;
566 *
this = *
this / scalar;
572 *
this = *
this / scalar;
580 VM_ATTRIBUTE_ALIGN16 static const int array[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
587 return floatInVec(scalar) * vec;
597 return Vector3( _mm_mul_ps( vec0.get128(), vec1.get128() ) );
602 return Vector3( _mm_div_ps( vec0.get128(), vec1.get128() ) );
607 return Vector3( _mm_rcp_ps( vec.get128() ) );
612 return Vector3(
fabsf4( vec.get128() ) );
617 __m128 vmask =
toM128(0x7fffffff);
618 return Vector3( _mm_or_ps(
619 _mm_and_ps ( vmask, vec0.get128() ),
620 _mm_andnot_ps( vmask, vec1.get128() ) ) );
625 return Vector3( _mm_max_ps( vec0.get128(), vec1.get128() ) );
630 return floatInVec( _mm_max_ps( _mm_max_ps(
vec_splat( vec.get128(), 0 ),
vec_splat( vec.get128(), 1 ) ),
vec_splat( vec.get128(), 2 ) ) );
635 return Vector3( _mm_min_ps( vec0.get128(), vec1.get128() ) );
640 return floatInVec( _mm_min_ps( _mm_min_ps(
vec_splat( vec.get128(), 0 ),
vec_splat( vec.get128(), 1 ) ),
vec_splat( vec.get128(), 2 ) ) );
645 return floatInVec( _mm_add_ps( _mm_add_ps(
vec_splat( vec.get128(), 0 ),
vec_splat( vec.get128(), 1 ) ),
vec_splat( vec.get128(), 2 ) ) );
650 return floatInVec(
_vmathVfDot3( vec0.get128(), vec1.get128() ), 0 );
655 return floatInVec(
_vmathVfDot3( vec.get128(), vec.get128() ), 0 );
660 return floatInVec( _mm_sqrt_ps(
_vmathVfDot3( vec.get128(), vec.get128() )), 0 );
676 return Vector3(
_vmathVfCross( vec0.get128(), vec1.get128() ) );
681 return select( vec0, vec1, boolInVec(select1) );
690 #ifdef _VECTORMATH_DEBUG
694 union { __m128 v;
float s[4]; } tmp;
695 tmp.v = vec.get128();
696 printf(
"( %f %f %f )\n", tmp.s[0], tmp.s[1], tmp.s[2] );
701 union { __m128 v;
float s[4]; } tmp;
702 tmp.v = vec.get128();
703 printf(
"%s: ( %f %f %f )\n", name, tmp.s[0], tmp.s[1], tmp.s[2] );
710 mVec128 = _mm_setr_ps(_x, _y, _z, _w);
751 mVec128 = floatInVec(scalar).get128();
786 return lerp( floatInVec(t), vec0, vec1 );
791 return ( vec0 + ( ( vec1 - vec0 ) * t ) );
801 __m128 scales, scale0, scale1, cosAngle, angle, tttt, oneMinusT, angles, sines;
804 angle =
acosf4( cosAngle );
806 oneMinusT = _mm_sub_ps( _mm_set1_ps(1.0f), tttt );
807 angles = _mm_unpacklo_ps( _mm_set1_ps(1.0f), tttt );
808 angles = _mm_unpacklo_ps( angles, oneMinusT );
809 angles = _mm_mul_ps( angles, angle );
810 sines =
sinf4( angles );
811 scales = _mm_div_ps( sines,
vec_splat( sines, 0 ) );
938 return floatInVec(
mVec128, idx );
953 return *
this * floatInVec(scalar);
975 *
this = *
this * scalar;
981 *
this = *
this * scalar;
997 *
this = *
this / scalar;
1003 *
this = *
this / scalar;
1019 return vec * scalar;
1024 return Vector4( _mm_mul_ps( vec0.get128(), vec1.get128() ) );
1029 return Vector4( _mm_div_ps( vec0.get128(), vec1.get128() ) );
1034 return Vector4( _mm_rcp_ps( vec.get128() ) );
1039 return Vector4(
fabsf4( vec.get128() ) );
1044 __m128 vmask =
toM128(0x7fffffff);
1045 return Vector4( _mm_or_ps(
1046 _mm_and_ps ( vmask, vec0.get128() ),
1047 _mm_andnot_ps( vmask, vec1.get128() ) ) );
1052 return Vector4( _mm_max_ps( vec0.get128(), vec1.get128() ) );
1057 return floatInVec( _mm_max_ps(
1064 return Vector4( _mm_min_ps( vec0.get128(), vec1.get128() ) );
1069 return floatInVec( _mm_min_ps(
1076 return floatInVec( _mm_add_ps(
1083 return floatInVec(
_vmathVfDot4( vec0.get128(), vec1.get128() ), 0 );
1088 return floatInVec(
_vmathVfDot4( vec.get128(), vec.get128() ), 0 );
1093 return floatInVec( _mm_sqrt_ps(
_vmathVfDot4( vec.get128(), vec.get128() )), 0 );
1108 return select( vec0, vec1, boolInVec(select1) );
1112 #ifdef _VECTORMATH_DEBUG
1116 union { __m128 v;
float s[4]; } tmp;
1117 tmp.v = vec.get128();
1118 printf(
"( %f %f %f %f )\n", tmp.s[0], tmp.s[1], tmp.s[2], tmp.s[3] );
1123 union { __m128 v;
float s[4]; } tmp;
1124 tmp.v = vec.get128();
1125 printf(
"%s: ( %f %f %f %f )\n", name, tmp.s[0], tmp.s[1], tmp.s[2], tmp.s[3] );
1132 mVec128 = _mm_setr_ps(_x, _y, _z, 0.0f);
1167 return ( pnt0 + ( ( pnt1 - pnt0 ) * t ) );
1177 __m128 dstVec = *quad;
1185 const float *quads = (
float *)threeQuads;
1186 pnt0 =
Point3( _mm_load_ps(quads) );
1187 pnt1 =
Point3( _mm_loadu_ps(quads + 3) );
1188 pnt2 =
Point3( _mm_loadu_ps(quads + 6) );
1189 pnt3 =
Point3( _mm_loadu_ps(quads + 9) );
1194 __m128 xxxx = _mm_shuffle_ps( pnt1.
get128(), pnt1.
get128(), _MM_SHUFFLE(0, 0, 0, 0) );
1195 __m128 zzzz = _mm_shuffle_ps( pnt2.
get128(), pnt2.
get128(), _MM_SHUFFLE(2, 2, 2, 2) );
1199 threeQuads[1] = _mm_shuffle_ps( pnt1.
get128(), pnt2.
get128(), _MM_SHUFFLE(1, 0, 2, 1) );
1200 threeQuads[2] =
vec_sel( _mm_shuffle_ps( pnt3.
get128(), pnt3.
get128(), _MM_SHUFFLE(2, 1, 0, 3) ), zzzz, zsw );
1294 return VecIdx(
mVec128, idx );
1299 return floatInVec(
mVec128, idx );
1304 return Vector3( _mm_sub_ps(
mVec128, pnt.mVec128 ) );
1319 *
this = *
this + vec;
1325 *
this = *
this - vec;
1331 return Point3( _mm_mul_ps( pnt0.get128(), pnt1.get128() ) );
1336 return Point3( _mm_div_ps( pnt0.get128(), pnt1.get128() ) );
1341 return Point3( _mm_rcp_ps( pnt.get128() ) );
1346 return Point3(
fabsf4( pnt.get128() ) );
1351 __m128 vmask =
toM128(0x7fffffff);
1352 return Point3( _mm_or_ps(
1353 _mm_and_ps ( vmask, pnt0.get128() ),
1354 _mm_andnot_ps( vmask, pnt1.get128() ) ) );
1359 return Point3( _mm_max_ps( pnt0.get128(), pnt1.get128() ) );
1364 return floatInVec( _mm_max_ps( _mm_max_ps(
vec_splat( pnt.get128(), 0 ),
vec_splat( pnt.get128(), 1 ) ),
vec_splat( pnt.get128(), 2 ) ) );
1369 return Point3( _mm_min_ps( pnt0.get128(), pnt1.get128() ) );
1374 return floatInVec( _mm_min_ps( _mm_min_ps(
vec_splat( pnt.get128(), 0 ),
vec_splat( pnt.get128(), 1 ) ),
vec_splat( pnt.get128(), 2 ) ) );
1379 return floatInVec( _mm_add_ps( _mm_add_ps(
vec_splat( pnt.get128(), 0 ),
vec_splat( pnt.get128(), 1 ) ),
vec_splat( pnt.get128(), 2 ) ) );
1384 return scale( pnt, floatInVec( scaleVal ) );
1394 return mulPerElem( pnt, Point3( scaleVec ) );
1399 return floatInVec(
_vmathVfDot3( pnt.get128(), unitVec.get128() ), 0 );
1409 return length( Vector3( pnt ) );
1419 return length( ( pnt1 - pnt0 ) );
1424 return select( pnt0, pnt1, boolInVec(select1) );
1434 #ifdef _VECTORMATH_DEBUG
1438 union { __m128 v;
float s[4]; } tmp;
1439 tmp.v = pnt.get128();
1440 printf(
"( %f %f %f )\n", tmp.s[0], tmp.s[1], tmp.s[2] );
1445 union { __m128 v;
float s[4]; } tmp;
1446 tmp.v = pnt.get128();
1447 printf(
"%s: ( %f %f %f )\n", name, tmp.s[0], tmp.s[1], tmp.s[2] );
const Vector4 operator*(float scalar) const
const Quat normalize(const Quat &quat)
#define _VECTORMATH_UNIT_1000
float & operator[](int idx)
const Vector3 getXYZ() const
const Vector3 recipPerElem(const Vector3 &vec)
void loadXYZArray(Vector3 &vec0, Vector3 &vec1, Vector3 &vec2, Vector3 &vec3, const __m128 *threeQuads)
static const Vector4 yAxis()
float minElem(const Vector3 &vec)
#define _VECTORMATH_UNIT_0100
Vector4 & setXYZ(const Vector3 &vec)
Vector4 & operator+=(const Vector4 &vec)
#define _VECTORMATH_UNIT_0010
static const Vector3 xAxis()
#define _VECTORMATH_SLERP_TOL
static const Vector3 zAxis()
float dist(const Point3 &pnt0, const Point3 &pnt1)
const Vector3 minPerElem(const Vector3 &vec0, const Vector3 &vec1)
Point3 & operator=(const Point3 &pnt)
Point3 & setElem(int idx, float value)
static const Vector4 xAxis()
#define vec_madd(a, b, c)
static __m128 sinf4(vec_float4 x)
Vector3 & operator=(const Vector3 &vec)
Point3 & operator-=(const Vector3 &vec)
float distFromOrigin(const Point3 &pnt)
const Vector4 operator-() const
Vector3 & operator/=(float scalar)
#define VECTORMATH_FORCE_INLINE
static __m128 acosf4(__m128 x)
static __m128 _vmathVfCross(__m128 vec0, __m128 vec1)
static const Vector3 yAxis()
float3 & operator-=(float3 &a, const float3 &b)
Vector3 & operator-=(const Vector3 &vec)
static const Vector4 wAxis()
const Vector3 maxPerElem(const Vector3 &vec0, const Vector3 &vec1)
#define _vmathVfSetElement(vec, scalar, slot)
float lengthSqr(const Vector3 &vec)
float distSqrFromOrigin(const Point3 &pnt)
const Point3 operator+(const Vector3 &vec) const
float & operator[](int idx)
const Point3 scale(const Point3 &pnt, float scaleVal)
const Vector3 operator-(const Point3 &pnt) const
float projection(const Point3 &pnt, const Vector3 &unitVec)
const Vector3 copySignPerElem(const Vector3 &vec0, const Vector3 &vec1)
float4 & operator+=(float4 &a, const float4 &b)
const Vector3 operator-() const
const Vector3 operator+(const Vector3 &vec) const
static __m128 toM128(unsigned int x)
#define _VECTORMATH_UNIT_0001
static __m128 _vmathVfDot4(__m128 vec0, __m128 vec1)
Vector3 & setElem(int idx, float value)
#define VM_ATTRIBUTE_ALIGN16
void storeXYZArray(const Vector3 &vec0, const Vector3 &vec1, const Vector3 &vec2, const Vector3 &vec3, __m128 *threeQuads)
#define vec_nmsub(a, b, c)
const Vector3 operator/(float scalar) const
const Vector3 normalizeApprox(const Vector3 &vec)
void set128(vec_float4 vec)
float4 & operator*=(float4 &a, float fact)
Point3 & operator+=(const Vector3 &vec)
const Vector4 operator+(const Vector4 &vec) const
float distSqr(const Point3 &pnt0, const Point3 &pnt1)
static __m128 newtonrapson_rsqrt4(const __m128 v)
float getElem(int idx) const
static __m128 _vmathVfInsert(__m128 dst, __m128 src, int slot)
const Quat lerp(float t, const Quat &quat0, const Quat &quat1)
const Matrix3 select(const Matrix3 &mat0, const Matrix3 &mat1, bool select1)
const Vector4 operator/(float scalar) const
Vector3 & operator*=(float scalar)
float getElem(int idx) const
Vector3 & operator+=(const Vector3 &vec)
float getElem(int idx) const
float maxElem(const Vector3 &vec)
void storeXYZ(const Vector3 &vec, float *fptr)
float & operator[](int idx)
static __m128 _vmathVfSplatScalar(float scalar)
float sum(const Vector3 &vec)
static const Vector4 zAxis()
float dot(const Quat &quat0, const Quat &quat1)
static __m128 fabsf4(__m128 x)
const Vector3 cross(const Vector3 &vec0, const Vector3 &vec1)
const Matrix3 mulPerElem(const Matrix3 &mat0, const Matrix3 &mat1)
Vector4 & operator/=(float scalar)
const Vector3 operator*(float scalar) const
float length(const Quat &quat)
Vector4 & operator*=(float scalar)
const Vector3 divPerElem(const Vector3 &vec0, const Vector3 &vec1)
static __m128 vec_sel(__m128 a, __m128 b, __m128 mask)
static __m128 _vmathVfDot3(__m128 vec0, __m128 vec1)
const Matrix3 absPerElem(const Matrix3 &mat)
Vector4 & setElem(int idx, float value)
const Matrix3 operator*(float scalar, const Matrix3 &mat)
Vector4 & operator=(const Vector4 &vec)
Vector4 & operator-=(const Vector4 &vec)
void loadXYZ(Vector3 &vec, const float *fptr)
const Quat slerp(float t, const Quat &unitQuat0, const Quat &unitQuat1)