// UnrealMathNeon.h (text extracted from UnrealMathNeon_8h_source.html)

// Copyright Epic Games, Inc. All Rights Reserved.


#pragma once


#include "HAL/Platform.h"


// HEADER_UNIT_SKIP - Not included directly


PRAGMA_DISABLE_SHADOW_VARIABLE_WARNINGS


#if PLATFORM_ENABLE_VECTORINTRINSICS_NEON


// Empty tag type. Passing it as the trailing argument selects the constexpr
// VectorRegister4Double(XY, ZW, VectorRegisterConstInit) constructor overload.
struct VectorRegisterConstInit {};


// Include the intrinsic functions header

#if (PLATFORM_WINDOWS && PLATFORM_64BITS && !PLATFORM_COMPILER_CLANG)

#include <arm64_neon.h>

#else

#include <arm_neon.h>

#endif


#include "Math/Float16.h"


/*=============================================================================

 *  Helpers:

 *============================================================================*/


#if PLATFORM_COMPILER_MSVC


// MSVC NEON headers typedef float32x4_t and int32x4_t both to __n128

// This wrapper type allows VectorRegister4Float and VectorRegister4Int to be

// discriminated for template specialization (e.g. FConstantHandler)

//

// This comes at the cost of having to define constructors for some

// anonymous unions, because VectorRegister4Float/VectorRegister4Int are no

// longer trivially constructible. The optimizer should eliminate the

// redundant zero initialization in these cases for non-MSVC (e.g. V()

// is called now where it wasn't before)

template<typename T, typename BASE_TYPE>
struct alignas(alignof(T)) VectorRegisterWrapper
{
    // The wrapped native register value (sole data member).
    T m_vec;

    FORCEINLINE VectorRegisterWrapper() = default;

    // Implicit wrap: lets results of intrinsics convert straight into the wrapper.
    FORCEINLINE constexpr VectorRegisterWrapper(T vec)
        : m_vec(vec)
    {
    }

    // Implicit unwrap (mutable): lets the wrapper be passed where the native type is expected.
    FORCEINLINE operator T&()
    {
        return m_vec;
    }

    // Implicit unwrap (read-only).
    FORCEINLINE operator const T&() const
    {
        return m_vec;
    }

    // Lane read; declared here and defined by the explicit specializations.
    FORCEINLINE BASE_TYPE operator[](int Index) const;
};


// Lane read for the float wrapper: returns element Index of the n128_f32 view.
template<>
FORCEINLINE float VectorRegisterWrapper<float32x4_t, float>::operator[](int Index) const
{
    const float Lane = m_vec.n128_f32[Index];
    return Lane;
}


// Lane read for the double wrapper: returns element Index of the n128_f64 view.
template<>
FORCEINLINE double VectorRegisterWrapper<float64x2_t, double>::operator[](int Index) const
{
    const double Lane = m_vec.n128_f64[Index];
    return Lane;
}


// Lane read for the 32-bit integer wrapper: returns element Index of the n128_i32 view.
template<>
FORCEINLINE int VectorRegisterWrapper<int32x4_t, int>::operator[](int Index) const
{
    const int Lane = m_vec.n128_i32[Index];
    return Lane;
}


// Lane read for the 64-bit integer wrapper: returns element Index of the n128_i64 view.
template<>
FORCEINLINE int64 VectorRegisterWrapper<int64x2_t, int64>::operator[](int Index) const
{
    const int64 Lane = m_vec.n128_i64[Index];
    return Lane;
}


// MSVC register types: each native NEON type is wrapped so the aliases are distinct C++ types.
typedef VectorRegisterWrapper<float32x4_t, float> VectorRegister4Float;

typedef VectorRegisterWrapper<float64x2_t, double> VectorRegister2Double;

typedef VectorRegisterWrapper<int32x4_t, int> VectorRegister4Int;

typedef VectorRegisterWrapper<int64x2_t, int64> VectorRegister2Int64;


// Four 4-float registers; used directly, without the wrapper.
typedef float32x4x4_t VectorRegister4x4Float;


/** MSVC path: builds a 4 x int32 register constant by writing each lane of the n128_i32 view. */
FORCEINLINE constexpr VectorRegister4Int MakeVectorRegisterIntConstant(int32 X, int32 Y, int32 Z, int32 W)
{
    int32x4_t Lanes = {};
    Lanes.n128_i32[0] = X;
    Lanes.n128_i32[1] = Y;
    Lanes.n128_i32[2] = Z;
    Lanes.n128_i32[3] = W;
    return Lanes;
}


/** MSVC path: builds a 4 x float register constant by writing each lane of the n128_f32 view. */
FORCEINLINE constexpr VectorRegister4Float MakeVectorRegisterFloatConstant(float X, float Y, float Z, float W)
{
    float32x4_t Lanes = {};
    Lanes.n128_f32[0] = X;
    Lanes.n128_f32[1] = Y;
    Lanes.n128_f32[2] = Z;
    Lanes.n128_f32[3] = W;
    return Lanes;
}


/** MSVC path: builds a 2 x double register constant by writing each lane of the n128_f64 view. */
FORCEINLINE constexpr VectorRegister2Double MakeVectorRegister2DoubleConstant(double X, double Y)
{
    float64x2_t Lanes = {};
    Lanes.n128_f64[0] = X;
    Lanes.n128_f64[1] = Y;
    return Lanes;
}


#else


// Non-MSVC register types: direct aliases of the native NEON vector types, tagged with GCC_ALIGN(16).
typedef float32x4_t GCC_ALIGN(16) VectorRegister4Float;

typedef float64x2_t GCC_ALIGN(16) VectorRegister2Double;

typedef int32x4_t  GCC_ALIGN(16) VectorRegister4Int;

typedef int64x2_t GCC_ALIGN(16) VectorRegister2Int64;

typedef float32x4x4_t GCC_ALIGN(16) VectorRegister4x4Float;


/** Non-MSVC path: builds a 4 x int32 register constant with lanes (X, Y, Z, W) via brace initialization. */
FORCEINLINE constexpr VectorRegister4Int MakeVectorRegisterIntConstant(int32 X, int32 Y, int32 Z, int32 W)

{

    return VectorRegister4Int { X, Y, Z, W };

}


/** Non-MSVC path: builds a 4 x float register constant with lanes (X, Y, Z, W) via brace initialization. */
FORCEINLINE constexpr VectorRegister4Float MakeVectorRegisterFloatConstant(float X, float Y, float Z, float W)

{

    return VectorRegister4Float { X, Y, Z, W };

}


/** Non-MSVC path: builds a 2 x double register constant with lanes (X, Y) via brace initialization. */
FORCEINLINE constexpr VectorRegister2Double MakeVectorRegister2DoubleConstant(double X, double Y)

{

    return VectorRegister2Double { X, Y };

}


#endif


// Builds a vector register from four components by forwarding to the MakeVectorRegister overload set.
#define DECLARE_VECTOR_REGISTER(X, Y, Z, W) MakeVectorRegister( X, Y, Z, W )


/**
 * Four doubles held as two 2-double NEON registers: XY carries the first pair, ZW the second.
 */
struct alignas(16) VectorRegister4Double
{
    struct
    {
        VectorRegister2Double XY;
        VectorRegister2Double ZW;
    };

    FORCEINLINE VectorRegister4Double() = default;

    /** Builds the register from its two halves. */
    FORCEINLINE VectorRegister4Double(VectorRegister2Double xy, VectorRegister2Double zw) : XY(xy), ZW(zw) {}

    /** constexpr variant of the two-half constructor, selected by the trailing tag argument. */
    FORCEINLINE constexpr VectorRegister4Double(VectorRegister2Double xy, VectorRegister2Double zw, VectorRegisterConstInit) : XY(xy), ZW(zw) {}

    /** Widens a 4-float register to four doubles. */
    FORCEINLINE VectorRegister4Double(VectorRegister4Float From)
    {
        // vget_low_f32 extracts lanes 0-1 directly instead of reinterpreting the
        // register's storage through a pointer cast.
        XY = vcvt_f64_f32(vget_low_f32(From));
        // vcvt_high_f64_f32 converts lanes 2-3.
        ZW = vcvt_high_f64_f32(From);
    }

    /** Constructing four doubles from a single 2-double register is explicitly disallowed. */
    VectorRegister4Double(VectorRegister2Double From) = delete;

    /** Assigns from a 4-float register by widening it first. */
    FORCEINLINE VectorRegister4Double& operator=(VectorRegister4Float From)
    {
        *this = VectorRegister4Double(From);
        return *this;
    }
};


// The default VectorRegister is the 4-double register.
typedef VectorRegister4Double VectorRegister;

// Zero/one constants for the default VectorRegister map to the double variants.
#define VectorZeroVectorRegister() VectorZeroDouble()

#define VectorOneVectorRegister() VectorOneDouble()


// Forward declarations: the AlignedFloat4/AlignedDouble4 helpers below call these
// before their definitions appear later in this file.

VectorRegister4Float VectorLoadAligned(const float* Ptr);

VectorRegister4Double VectorLoadAligned(const double* Ptr);

void VectorStoreAligned(VectorRegister4Float Vec, float* Ptr);

void VectorStoreAligned(VectorRegister4Double Vec, double* Dst);


// Register-aligned array of four floats, used to read or edit the individual
// lanes of a VectorRegister4Float from scalar code.
struct alignas(alignof(VectorRegister4Float)) AlignedFloat4
{
    float V[4];

    // Stores the four lanes of Vec into V.
    FORCEINLINE AlignedFloat4(VectorRegister4Float Vec)
    {
        VectorStoreAligned(Vec, V);
    }

    // Read-only lane access (no bounds check).
    FORCEINLINE float operator[](int32 Index) const
    {
        return V[Index];
    }

    // Mutable lane access (no bounds check).
    FORCEINLINE float& operator[](int32 Index)
    {
        return V[Index];
    }

    // Loads the current contents of V back into a register.
    FORCEINLINE VectorRegister4Float ToVectorRegister() const
    {
        const VectorRegister4Float Reloaded = VectorLoadAligned(V);
        return Reloaded;
    }
};


// Register-aligned array of four doubles, used to read or edit the individual
// lanes of a VectorRegister4Double from scalar code.
struct alignas(alignof(VectorRegister4Double)) AlignedDouble4
{
    double V[4];

    // Stores the four lanes of Vec into V.
    FORCEINLINE AlignedDouble4(VectorRegister4Double Vec)
    {
        VectorStoreAligned(Vec, V);
    }

    // Read-only lane access (no bounds check).
    FORCEINLINE double operator[](int32 Index) const
    {
        return V[Index];
    }

    // Mutable lane access (no bounds check).
    FORCEINLINE double& operator[](int32 Index)
    {
        return V[Index];
    }

    // Loads the current contents of V back into a register.
    FORCEINLINE VectorRegister4Double ToVectorRegister() const
    {
        const VectorRegister4Double Reloaded = VectorLoadAligned(V);
        return Reloaded;
    }
};


// The default aligned scratch type is the double variant.
typedef AlignedDouble4 AlignedRegister4;


// Short aliases: the suffix names the element type (i = int32, f = float, d = double).

typedef VectorRegister4Int VectorRegister4i;

typedef VectorRegister4Float VectorRegister4f;

typedef VectorRegister4Double VectorRegister4d;

typedef VectorRegister2Double VectorRegister2d;


// Backwards compatibility: older unsuffixed names map to the double / int32 registers.

typedef VectorRegister4Double VectorRegister4;

typedef VectorRegister4 VectorRegister;

typedef VectorRegister4Int VectorRegisterInt;


/** Places four raw 32-bit patterns into the lanes of a float register (bits are copied, not converted). */
FORCEINLINE VectorRegister4Float MakeVectorRegister(uint32 X, uint32 Y, uint32 Z, uint32 W)
{
    union FLanePun
    {
        VectorRegister4Float Register;
        uint32 Bits[4];

        FORCEINLINE FLanePun() : Register() {}
    } Pun;

    Pun.Bits[0] = X;
    Pun.Bits[1] = Y;
    Pun.Bits[2] = Z;
    Pun.Bits[3] = W;

    return Pun.Register;
}


/** Explicitly-named form of MakeVectorRegister for four raw 32-bit lane patterns. */
FORCEINLINE VectorRegister4Float MakeVectorRegisterFloat(uint32 X, uint32 Y, uint32 Z, uint32 W)
{
    const VectorRegister4Float Made = MakeVectorRegister(X, Y, Z, W);
    return Made;
}


// Alias of MakeVectorRegisterFloat(uint32...) whose name states that the lanes are bit masks.
FORCEINLINE VectorRegister4Float MakeVectorRegisterFloatMask(uint32 X, uint32 Y, uint32 Z, uint32 W)
{
    const VectorRegister4Float Mask = MakeVectorRegisterFloat(X, Y, Z, W);
    return Mask;
}


/** Builds a float register whose lanes are (X, Y, Z, W). */
FORCEINLINE VectorRegister4Float MakeVectorRegister(float X, float Y, float Z, float W)
{
    union FLanePun
    {
        VectorRegister4Float Register;
        float Lanes[4];

        FORCEINLINE FLanePun() : Register() {}
    } Pun;

    Pun.Lanes[0] = X;
    Pun.Lanes[1] = Y;
    Pun.Lanes[2] = Z;
    Pun.Lanes[3] = W;

    return Pun.Register;
}


/** Explicitly-named form of MakeVectorRegister for four float lanes. */
FORCEINLINE VectorRegister4Float MakeVectorRegisterFloat(float X, float Y, float Z, float W)
{
    const VectorRegister4Float Made = MakeVectorRegister(X, Y, Z, W);
    return Made;
}


/** Builds a double register whose lanes are (X, Y, Z, W). */
FORCEINLINE VectorRegister4Double MakeVectorRegister(double X, double Y, double Z, double W)
{
    union FLanePun
    {
        VectorRegister4Double Register;
        double Lanes[4];

        FORCEINLINE FLanePun() : Register() {}
    } Pun;

    Pun.Lanes[0] = X;
    Pun.Lanes[1] = Y;
    Pun.Lanes[2] = Z;
    Pun.Lanes[3] = W;

    return Pun.Register;
}


/** Explicitly-named form of MakeVectorRegister for four double lanes. */
FORCEINLINE VectorRegister4Double MakeVectorRegisterDouble(double X, double Y, double Z, double W)
{
    const VectorRegister4Double Made = MakeVectorRegister(X, Y, Z, W);
    return Made;
}


/** Joins two 2-double halves into one 4-double register. */
FORCEINLINE VectorRegister4Double MakeVectorRegisterDouble(VectorRegister2Double XY, VectorRegister2Double ZW)
{
    const VectorRegister4Double Joined(XY, ZW);
    return Joined;
}


/** Places four raw 64-bit patterns into the lanes of a double register (bits are copied, not converted). */
FORCEINLINE VectorRegister4Double MakeVectorRegisterDouble(uint64 X, uint64 Y, uint64 Z, uint64 W)
{
    union FLanePun
    {
        VectorRegister4Double Register;
        uint64_t Bits[4];

        FORCEINLINE FLanePun() : Register() {}
    } Pun;

    Pun.Bits[0] = X;
    Pun.Bits[1] = Y;
    Pun.Bits[2] = Z;
    Pun.Bits[3] = W;

    return Pun.Register;
}


// Alias of MakeVectorRegisterDouble(uint64...) whose name states that the lanes are bit masks.
FORCEINLINE VectorRegister4Double MakeVectorRegisterDoubleMask(uint64 X, uint64 Y, uint64 Z, uint64 W)
{
    const VectorRegister4Double Mask = MakeVectorRegisterDouble(X, Y, Z, W);
    return Mask;
}


/** Builds a 2-double register whose lanes are (X, Y). */
FORCEINLINE VectorRegister2Double MakeVectorRegister2Double(double X, double Y)
{
    union FLanePun
    {
        VectorRegister2Double Register;
        double Lanes[2];

        FORCEINLINE FLanePun() : Register() {}
    } Pun;

    Pun.Lanes[0] = X;
    Pun.Lanes[1] = Y;

    return Pun.Register;
}


/** Places two raw 64-bit patterns into the lanes of a 2-double register (bits are copied, not converted). */
FORCEINLINE VectorRegister2Double MakeVectorRegister2Double(uint64 X, uint64 Y)
{
    union FLanePun
    {
        VectorRegister2Double Register;
        uint64_t Bits[2];

        FORCEINLINE FLanePun() : Register() {}
    } Pun;

    Pun.Bits[0] = X;
    Pun.Bits[1] = Y;

    return Pun.Register;
}


/** Builds a 4 x int32 register whose lanes are (X, Y, Z, W). */
FORCEINLINE VectorRegister4Int MakeVectorRegisterInt(int32 X, int32 Y, int32 Z, int32 W)
{
    union FLanePun
    {
        VectorRegister4Int Register;
        int32 Lanes[4];

        FORCEINLINE FLanePun() : Register() {}
    } Pun;

    Pun.Lanes[0] = X;
    Pun.Lanes[1] = Y;
    Pun.Lanes[2] = Z;
    Pun.Lanes[3] = W;

    return Pun.Register;
}


/**
 * Writes two 64-bit integers over the storage of a VectorRegister4Int:
 * X occupies the first 8 bytes and Y the last 8 bytes.
 */
FORCEINLINE VectorRegister4Int MakeVectorRegisterInt64(int64 X, int64 Y)
{
    union FLanePun
    {
        VectorRegister4Int Register;
        int64 Wide[2];

        FORCEINLINE FLanePun() : Register() {}
    } Pun;

    Pun.Wide[0] = X;
    Pun.Wide[1] = Y;

    return Pun.Register;
}


// Widens a 4-float register to a 4-double register via the converting constructor.
FORCEINLINE VectorRegister4Double MakeVectorRegisterDouble(VectorRegister4Float From)
{
    const VectorRegister4Double Widened(From);
    return Widened;
}


// Lossy narrowing: converts each half of a 4-double register to floats and joins the halves.
FORCEINLINE VectorRegister4Float MakeVectorRegisterFloatFromDouble(VectorRegister4Double Vec)
{
    const float32x2_t NarrowXY = vcvt_f32_f64(Vec.XY);
    const float32x2_t NarrowZW = vcvt_f32_f64(Vec.ZW);
    return vcombine_f32(NarrowXY, NarrowZW);
}


/*

#define VectorPermute(Vec1, Vec2, Mask) my_perm(Vec1, Vec2, Mask)


/ ** Reads NumBytesMinusOne+1 bytes from the address pointed to by Ptr, always reading the aligned 16 bytes containing the start of Ptr, but only reading the next 16 bytes if the data straddles the boundary * /

FORCEINLINE VectorRegister4Float VectorLoadNPlusOneUnalignedBytes(const void* Ptr, int NumBytesMinusOne)

{

    return VectorPermute( my_ld (0, (float*)Ptr), my_ld(NumBytesMinusOne, (float*)Ptr), my_lvsl(0, (float*)Ptr) );

}

*/


/*=============================================================================

 *  Constants:

 *============================================================================*/


#include "Math/UnrealMathVectorConstants.h.inl"


/*=============================================================================

 *  Intrinsics:

 *============================================================================*/


/** Returns a float register with all four lanes set to 0.0f. */
FORCEINLINE VectorRegister4Float VectorZeroFloat()
{
    const VectorRegister4Float Zeros = vdupq_n_f32(0.0f);
    return Zeros;
}


/** Returns a double register with all four lanes set to 0.0. */
FORCEINLINE VectorRegister4Double VectorZeroDouble()
{
    VectorRegister4Double Zeros;
    Zeros.XY = vdupq_n_f64(0.0);
    Zeros.ZW = Zeros.XY;
    return Zeros;
}


/** Returns a float register with all four lanes set to 1.0f. */
FORCEINLINE VectorRegister4Float VectorOneFloat()
{
    const VectorRegister4Float Ones = vdupq_n_f32(1.0f);
    return Ones;
}


/** Returns a double register with all four lanes set to 1.0. */
FORCEINLINE VectorRegister4Double VectorOneDouble()
{
    // Double literal: vdupq_n_f64 takes a double, so no float->double promotion is needed.
    const VectorRegister2Double Ones = vdupq_n_f64(1.0);
    return VectorRegister4Double(Ones, Ones);
}


/**
 * Loads four consecutive floats starting at Ptr into a register.
 *
 * @param Ptr  Source address; four floats are read.
 * @return     Register holding (Ptr[0], Ptr[1], Ptr[2], Ptr[3]).
 */
FORCEINLINE VectorRegister4Float VectorLoad(const float* Ptr)
{
    // Keep the pointee const: the load never writes through Ptr.
    return vld1q_f32( (const float32_t*)Ptr );
}


/**
 * Loads four consecutive doubles starting at Ptr into a register.
 *
 * @param Ptr  Source address; four doubles are read.
 * @return     Register holding (Ptr[0], Ptr[1]) in XY and (Ptr[2], Ptr[3]) in ZW.
 */
FORCEINLINE VectorRegister4Double VectorLoad(const double* Ptr)
{
    const float64x2x2_t Pair = vld1q_f64_x2(Ptr);
    // Copy the two halves by value rather than reinterpreting the pair struct through a pointer cast.
    return VectorRegister4Double(Pair.val[0], Pair.val[1]);
}


/** Loads sixteen consecutive floats starting at Ptr into four 4-float registers. */
FORCEINLINE VectorRegister4x4Float VectorLoad16(const float* Ptr)
{
    const VectorRegister4x4Float Rows = vld1q_f32_x4(Ptr);
    return Rows;
}


/** Reads two floats from Ptr and returns them repeated as (Ptr[0], Ptr[1], Ptr[0], Ptr[1]). */
FORCEINLINE VectorRegister4Float VectorLoadFloat2(const float* Ptr)
{
    const float First = Ptr[0];
    const float Second = Ptr[1];
    return MakeVectorRegister(First, Second, First, Second);
}


/** Reads three doubles from Ptr and returns (Ptr[0], Ptr[1], Ptr[2], 0.0). */
FORCEINLINE VectorRegister4Double VectorLoadFloat3(const double* Ptr)
{
    const VectorRegister2Double FirstPair = vld1q_f64(Ptr);
    // Third element in the low lane, 0.0 in the high lane.
    const VectorRegister2Double ThirdAndZero = vcombine_f64(vld1_f64(&Ptr[2]), vdup_n_f64(0.0));
    return VectorRegister4Double(FirstPair, ThirdAndZero);
}


/** Reads three doubles from Ptr and returns (Ptr[0], Ptr[1], Ptr[2], 1.0). */
FORCEINLINE VectorRegister4Double VectorLoadFloat3_W1(const double* Ptr)
{
    const VectorRegister2Double FirstPair = vld1q_f64(Ptr);
    // Third element in the low lane, 1.0 in the high lane.
    const VectorRegister2Double ThirdAndOne = vcombine_f64(vld1_f64(&Ptr[2]), vdup_n_f64(1.0));
    return VectorRegister4Double(FirstPair, ThirdAndOne);
}


/** Returns a copy of Vec with lane ElementIndex replaced by Scalar. */
template <int ElementIndex>
FORCEINLINE VectorRegister4Float VectorSetComponentImpl(VectorRegister4Float Vec, float Scalar)
{
    const VectorRegister4Float Updated = vsetq_lane_f32(Scalar, Vec, ElementIndex);
    return Updated;
}


/** Returns a copy of the 2-double register Vec with lane ElementIndex replaced by Scalar. */
template <int ElementIndex>
FORCEINLINE VectorRegister2Double VectorSetComponentImpl(VectorRegister2Double Vec, double Scalar)
{
    const VectorRegister2Double Updated = vsetq_lane_f64(Scalar, Vec, ElementIndex);
    return Updated;
}


/**
 * Returns a copy of the 4-double register Vec with lane ElementIndex replaced by Scalar.
 * Lanes 0-1 live in the XY half and lanes 2-3 in the ZW half; the untouched half is copied as is.
 */
template <int ElementIndex>
FORCEINLINE VectorRegister4Double VectorSetComponentImpl(VectorRegister4Double Vec, double Scalar)
{
    if constexpr (ElementIndex <= 1)
    {
        return VectorRegister4Double(VectorSetComponentImpl<ElementIndex>(Vec.XY, Scalar), Vec.ZW);
    }
    else
    {
        // Re-base the lane index onto the ZW half.
        return VectorRegister4Double(Vec.XY, VectorSetComponentImpl<ElementIndex - 2>(Vec.ZW, Scalar));
    }
}


// Sets one lane of Vec to Scalar; ElementIndex becomes a template argument, so it must be a compile-time constant.
#define VectorSetComponent( Vec, ElementIndex, Scalar ) VectorSetComponentImpl<ElementIndex>(Vec, Scalar)


/** Loads four floats from Ptr; on this platform it simply forwards to VectorLoad. */
FORCEINLINE VectorRegister4Float VectorLoadAligned(const float* Ptr)
{
    const VectorRegister4Float Loaded = VectorLoad(Ptr);
    return Loaded;
}


/** Loads four doubles from Ptr; on this platform it simply forwards to VectorLoad. */
FORCEINLINE VectorRegister4Double VectorLoadAligned(const double* Ptr)
{
    const VectorRegister4Double Loaded = VectorLoad(Ptr);
    return Loaded;
}


/** Reads one float from Ptr and broadcasts it to all four lanes. */
FORCEINLINE VectorRegister4Float VectorLoadFloat1(const float *Ptr)
{
    const float Value = Ptr[0];
    return vdupq_n_f32(Value);
}


/** Reads one double from Ptr and broadcasts it to all four lanes. */
FORCEINLINE VectorRegister4Double VectorLoadDouble1(const double* Ptr)
{
    const VectorRegister2Double Splat = vdupq_n_f64(Ptr[0]);
    return VectorRegister4Double(Splat, Splat);
}


/** Reads 64 bits from Ptr into the low half of the register and zeroes the high half. */
FORCEINLINE VectorRegister4i VectorLoad64Bits(const void *Ptr)
{
    const int64x1_t LowHalf = vld1_s64((const int64_t *)Ptr);
    const int64x1_t ZeroHalf = vdup_n_s64(0);
    return vcombine_s64(LowHalf, ZeroHalf);
}


/** Returns (Ptr1[0], Ptr1[1], Ptr2[0], Ptr2[1]): two floats from each source pointer. */
FORCEINLINE VectorRegister4Float VectorLoadTwoPairsFloat(const float* Ptr1, const float* Ptr2)
{
    return vcombine_f32(vld1_f32(Ptr1), vld1_f32(Ptr2));
}


/** Returns (Ptr1[0], Ptr1[1], Ptr2[0], Ptr2[1]): two doubles from each source pointer. */
FORCEINLINE VectorRegister4Double VectorLoadTwoPairsFloat(const double* Ptr1, const double* Ptr2)
{
    const VectorRegister2Double FirstPair = vld1q_f64(Ptr1);
    const VectorRegister2Double SecondPair = vld1q_f64(Ptr2);
    return VectorRegister4Double(FirstPair, SecondPair);
}


/** Broadcasts X to all four float lanes. */
FORCEINLINE VectorRegister4Float VectorSetFloat1(float X)
{
    const VectorRegister4Float Splat = vdupq_n_f32(X);
    return Splat;
}


/** Broadcasts X to all four double lanes. */
FORCEINLINE VectorRegister4Double VectorSetFloat1(double X)
{
    const VectorRegister2Double Splat = vdupq_n_f64(X);
    return VectorRegister4Double(Splat, Splat);
}


/** Writes the four lanes of Vec to Ptr[0..3]. */
FORCEINLINE void VectorStoreAligned(VectorRegister4Float Vec, float* Ptr)
{
    float* const Destination = Ptr;
    vst1q_f32(Destination, Vec);
}


/**
 * Writes the four lanes of Vec to Ptr[0..3].
 *
 * @param Vec  Register to store.
 * @param Ptr  Destination; four doubles are written.
 */
FORCEINLINE void VectorStoreAligned(VectorRegister4Double Vec, double* Ptr)
{
    // Build the register pair by value instead of reinterpreting Vec through a pointer cast.
    float64x2x2_t Pair;
    Pair.val[0] = Vec.XY;
    Pair.val[1] = Vec.ZW;
    vst1q_f64_x2(Ptr, Pair);
}


//TODO: LWC VectorVM.cpp calls it on a line 3294, case EVectorVMOp::outputdata_half: Context.WriteExecFunction(CopyConstantToOutput<float, FFloat16, 2>); break;

/** Stores the four lanes of Vec into Ptr[0..3], assigning each float lane to an FFloat16 element. */
FORCEINLINE void VectorStoreAligned(VectorRegister4Float Vec, FFloat16* Ptr)
{
    // Spill to a float array first so lanes can be assigned one at a time.
    const AlignedFloat4 Lanes(Vec);
    for (int32 Lane = 0; Lane != 4; ++Lane)
    {
        Ptr[Lane] = Lanes[Lane];
    }
}


// The "streamed" store has no separate implementation here; it maps directly to VectorStoreAligned.
#define VectorStoreAlignedStreamed(Vec, Ptr)    VectorStoreAligned(Vec, Ptr)


/** Writes the four lanes of Vec to Ptr[0..3]. */
FORCEINLINE void VectorStore(VectorRegister4Float Vec, float* Ptr)
{
    float* const Destination = Ptr;
    vst1q_f32(Destination, Vec);
}


/**
 * Writes the four lanes of Vec to Ptr[0..3].
 *
 * @param Vec  Register to store.
 * @param Ptr  Destination; four doubles are written.
 */
FORCEINLINE void VectorStore(VectorRegister4Double Vec, double* Ptr)
{
    // Build the register pair by value instead of reinterpreting Vec through a pointer cast.
    float64x2x2_t Pair;
    Pair.val[0] = Vec.XY;
    Pair.val[1] = Vec.ZW;
    vst1q_f64_x2(Ptr, Pair);
}


/** Writes all sixteen floats held in the four registers of Vec to Ptr[0..15]. */
FORCEINLINE void VectorStore16(VectorRegister4x4Float Vec, float* Ptr)
{
    float* const Destination = Ptr;
    vst1q_f32_x4(Destination, Vec);
}


/**
 * Writes lanes 0-2 of Vec to Ptr[0..2]; lane 3 is not stored.
 *
 * @param Vec  Register to store.
 * @param Ptr  Destination; three floats are written.
 */
FORCEINLINE void VectorStoreFloat3(VectorRegister4Float Vec, float* Ptr)
{
    // Lanes 0-1: extract the low half with the intrinsic rather than a pointer cast of Vec.
    vst1_f32(Ptr, vget_low_f32(Vec));
    // Lane 2 is written on its own so Ptr[3] is left untouched.
    vst1q_lane_f32(((float32_t*)Ptr) + 2, Vec, 2);
}


/** Writes lanes 0-2 of Vec to Ptr[0..2]; lane 3 is not stored. */
FORCEINLINE void VectorStoreFloat3(VectorRegister4Double Vec, double* Ptr)
{
    float64_t* const Out = (float64_t*)Ptr;
    // XY covers the first two elements; only lane 0 of ZW is written after them.
    vst1q_f64(Ptr, Vec.XY);
    vst1q_lane_f64(Out + 2, Vec.ZW, 0);
}


/** Writes lane 0 of Vec to *Ptr. */
FORCEINLINE void VectorStoreFloat1(VectorRegister4Float Vec, float* Ptr)
{
    float* const Destination = Ptr;
    vst1q_lane_f32(Destination, Vec, 0);
}


/** Writes lane 0 of Vec (the first element of its XY half) to *Ptr. */
FORCEINLINE void VectorStoreFloat1(VectorRegister4Double Vec, double* Ptr)
{
    double* const Destination = Ptr;
    vst1q_lane_f64(Destination, Vec.XY, 0);
}


/** Broadcasts lane ElementIndex of Vec to all four lanes. */
template <int ElementIndex>
FORCEINLINE VectorRegister4Float VectorReplicateImpl(VectorRegister4Float Vec)
{
    const float Lane = vgetq_lane_f32(Vec, ElementIndex);
    return vdupq_n_f32(Lane);
}


/** Broadcasts lane ElementIndex of the 2-double register Vec to both lanes. */
template <int ElementIndex>
FORCEINLINE VectorRegister2Double VectorReplicateImpl(VectorRegister2Double Vec)
{
    const double Lane = vgetq_lane_f64(Vec, ElementIndex);
    return vdupq_n_f64(Lane);
}


/**
 * Broadcasts lane ElementIndex of the 4-double register Vec to all four lanes.
 * Lanes 0-1 are taken from the XY half, lanes 2-3 from the ZW half.
 */
template <int ElementIndex>
FORCEINLINE VectorRegister4Double VectorReplicateImpl(VectorRegister4Double Vec)
{
    if constexpr (ElementIndex > 1)
    {
        // Re-base the lane index onto the ZW half.
        const VectorRegister2Double Splat = VectorReplicateImpl<ElementIndex - 2>(Vec.ZW);
        return VectorRegister4Double(Splat, Splat);
    }
    else
    {
        const VectorRegister2Double Splat = VectorReplicateImpl<ElementIndex>(Vec.XY);
        return VectorRegister4Double(Splat, Splat);
    }
}


// Broadcasts one lane of Vec to every lane; ElementIndex becomes a template argument, so it must be a compile-time constant.
#define VectorReplicate( Vec, ElementIndex ) VectorReplicateImpl<ElementIndex>(Vec)


/** Per-lane absolute value of a 4-float register. */
FORCEINLINE VectorRegister4Float VectorAbs(VectorRegister4Float Vec)
{
    const VectorRegister4Float Magnitude = vabsq_f32(Vec);
    return Magnitude;
}


/** Per-lane absolute value of a 4-double register, computed on each half. */
FORCEINLINE VectorRegister4Double VectorAbs(VectorRegister4Double Vec)
{
    return VectorRegister4Double(vabsq_f64(Vec.XY), vabsq_f64(Vec.ZW));
}


/** Per-lane negation of a 4-float register. */
FORCEINLINE VectorRegister4Float VectorNegate(VectorRegister4Float Vec)
{
    const VectorRegister4Float Negated = vnegq_f32(Vec);
    return Negated;
}


/** Per-lane negation of a 4-double register, computed on each half. */
FORCEINLINE VectorRegister4Double VectorNegate(VectorRegister4Double Vec)
{
    return VectorRegister4Double(vnegq_f64(Vec.XY), vnegq_f64(Vec.ZW));
}


/** Per-lane sum Vec1 + Vec2 for 4-float registers. */
FORCEINLINE VectorRegister4Float VectorAdd(VectorRegister4Float Vec1, VectorRegister4Float Vec2)
{
    const VectorRegister4Float Sum = vaddq_f32(Vec1, Vec2);
    return Sum;
}


/** Per-lane sum Vec1 + Vec2 for 4-double registers, computed on each half. */
FORCEINLINE VectorRegister4Double VectorAdd(VectorRegister4Double Vec1, VectorRegister4Double Vec2)
{
    return VectorRegister4Double(
        vaddq_f64(Vec1.XY, Vec2.XY),
        vaddq_f64(Vec1.ZW, Vec2.ZW));
}


/** Per-lane difference Vec1 - Vec2 for 4-float registers. */
FORCEINLINE VectorRegister4Float VectorSubtract(VectorRegister4Float Vec1, VectorRegister4Float Vec2)
{
    const VectorRegister4Float Difference = vsubq_f32(Vec1, Vec2);
    return Difference;
}


/** Per-lane difference Vec1 - Vec2 for 4-double registers, computed on each half. */
FORCEINLINE VectorRegister4Double VectorSubtract(VectorRegister4Double Vec1, VectorRegister4Double Vec2)
{
    return VectorRegister4Double(
        vsubq_f64(Vec1.XY, Vec2.XY),
        vsubq_f64(Vec1.ZW, Vec2.ZW));
}


/** Per-lane product Vec1 * Vec2 for 4-float registers. */
FORCEINLINE VectorRegister4Float VectorMultiply(VectorRegister4Float Vec1, VectorRegister4Float Vec2)
{
    const VectorRegister4Float Product = vmulq_f32(Vec1, Vec2);
    return Product;
}


/** Per-lane product Vec1 * Vec2 for 2-double registers. */
FORCEINLINE VectorRegister2Double VectorMultiply(VectorRegister2Double Vec1, VectorRegister2Double Vec2)
{
    const VectorRegister2Double Product = vmulq_f64(Vec1, Vec2);
    return Product;
}


/** Per-lane product Vec1 * Vec2 for 4-double registers, computed on each half. */
FORCEINLINE VectorRegister4Double VectorMultiply(VectorRegister4Double Vec1, VectorRegister4Double Vec2)
{
    return VectorRegister4Double(
        vmulq_f64(Vec1.XY, Vec2.XY),
        vmulq_f64(Vec1.ZW, Vec2.ZW));
}


/** Per-lane quotient Vec1 / Vec2 for 4-float registers. */
FORCEINLINE VectorRegister4Float VectorDivide(VectorRegister4Float Vec1, VectorRegister4Float Vec2)
{
    const VectorRegister4Float Quotient = vdivq_f32(Vec1, Vec2);
    return Quotient;
}


/** Per-lane quotient Vec1 / Vec2 for 4-double registers, computed on each half. */
FORCEINLINE VectorRegister4Double VectorDivide(VectorRegister4Double Vec1, VectorRegister4Double Vec2)
{
    return VectorRegister4Double(
        vdivq_f64(Vec1.XY, Vec2.XY),
        vdivq_f64(Vec1.ZW, Vec2.ZW));
}


/** Per-lane fused multiply-add: Acc + Vec1 * Vec2. */
FORCEINLINE VectorRegister4Float VectorMultiplyAdd(VectorRegister4Float Vec1, VectorRegister4Float Vec2, VectorRegister4Float Acc)
{
    const VectorRegister4Float Accumulated = vfmaq_f32(Acc, Vec1, Vec2);
    return Accumulated;
}


/** Per-lane fused multiply-add Acc + Vec1 * Vec2 for 4-double registers, computed on each half. */
FORCEINLINE VectorRegister4Double VectorMultiplyAdd(VectorRegister4Double Vec1, VectorRegister4Double Vec2, VectorRegister4Double Acc)
{
    return VectorRegister4Double(
        vfmaq_f64(Acc.XY, Vec1.XY, Vec2.XY),
        vfmaq_f64(Acc.ZW, Vec1.ZW, Vec2.ZW));
}


/** Per-lane fused multiply-subtract: Sub - Vec1 * Vec2. */
FORCEINLINE VectorRegister4Float VectorNegateMultiplyAdd(VectorRegister4Float Vec1, VectorRegister4Float Vec2, VectorRegister4Float Sub)
{
    const VectorRegister4Float Remainder = vfmsq_f32(Sub, Vec1, Vec2);
    return Remainder;
}


/** Per-lane fused multiply-subtract Sub - Vec1 * Vec2 for 4-double registers, computed on each half. */
FORCEINLINE VectorRegister4Double VectorNegateMultiplyAdd(VectorRegister4Double Vec1, VectorRegister4Double Vec2, VectorRegister4Double Sub)
{
    return VectorRegister4Double(
        vfmsq_f64(Sub.XY, Vec1.XY, Vec2.XY),
        vfmsq_f64(Sub.ZW, Vec1.ZW, Vec2.ZW));
}


/** Dot product of lanes 0-2 of Vec1 and Vec2, broadcast to all four lanes of the result. */
FORCEINLINE VectorRegister4Float VectorDot3(VectorRegister4Float Vec1, VectorRegister4Float Vec2)
{
    VectorRegister4Float Products = VectorMultiply(Vec1, Vec2);
    // Zero lane 3 so it does not contribute to the horizontal sum.
    Products = vsetq_lane_f32(0.0f, Products, 3);

    // Two pairwise adds reduce the four lanes to a single total.
    const float32x2_t PairSums = vpadd_f32(vget_low_f32(Products), vget_high_f32(Products));
    const float32x2_t Total = vpadd_f32(PairSums, PairSums);
    return vdupq_lane_f32(Total, 0);
}


/** Dot product of lanes 0-2 of Vec1 and Vec2, broadcast to all four lanes of the result. */
FORCEINLINE VectorRegister4Double VectorDot3(VectorRegister4Double Vec1, VectorRegister4Double Vec2)
{
    // XYProducts = (x1*x2, y1*y2); Folded = XYProducts + (z1*z2, w1*w2).
    VectorRegister2Double XYProducts = vmulq_f64(Vec1.XY, Vec2.XY);
    VectorRegister2Double Folded = vfmaq_f64(XYProducts, Vec1.ZW, Vec2.ZW);

    // low(Folded) = x*x + z*z and high(XYProducts) = y*y, so the w term is left out.
    const float64x1_t Dot = vadd_f64(vget_low_f64(Folded), vget_high_f64(XYProducts));

    const VectorRegister2Double Splat = vdupq_lane_f64(Dot, 0);
    return VectorRegister4Double(Splat, Splat);
}


/** Dot product of lanes 0-2 of Vec1 and Vec2, returned as a scalar float. */
FORCEINLINE float VectorDot3Scalar(VectorRegister4Float Vec1, VectorRegister4Float Vec2)
{
    VectorRegister4Float Broadcast = VectorDot3(Vec1, Vec2);
    return vgetq_lane_f32(Broadcast, 0);
}


/**
 * Dot product of lanes 0-2 of Vec1 and Vec2, returned as a scalar double.
 *
 * @param Vec1  First operand (lane 3 is ignored).
 * @param Vec2  Second operand (lane 3 is ignored).
 * @return      x1*x2 + y1*y2 + z1*z2.
 */
FORCEINLINE double VectorDot3Scalar(VectorRegister4Double Vec1, VectorRegister4Double Vec2)
{
    VectorRegister2Double A, B;
    // A = (x1*x2, y1*y2); B = A + (z1*z2, w1*w2).
    A = vmulq_f64(Vec1.XY, Vec2.XY);
    B = vfmaq_f64(A, Vec1.ZW, Vec2.ZW);
    // low(B) = x*x + z*z and high(A) = y*y, so the w term is left out.
    float64x1_t Sum = vadd_f64(vget_low_f64(B), vget_high_f64(A));
    // Read the lane with the intrinsic instead of reinterpreting the vector through a pointer cast.
    return vget_lane_f64(Sum, 0);
}


/** Dot product of all four lanes of Vec1 and Vec2, broadcast to all four lanes of the result. */
FORCEINLINE VectorRegister4Float VectorDot4(VectorRegister4Float Vec1, VectorRegister4Float Vec2)
{
    VectorRegister4Float Products = VectorMultiply(Vec1, Vec2);

    // Two pairwise adds reduce the four lanes to a single total.
    const float32x2_t PairSums = vpadd_f32(vget_low_f32(Products), vget_high_f32(Products));
    const float32x2_t Total = vpadd_f32(PairSums, PairSums);
    return vdupq_lane_f32(Total, 0);
}


/** Dot product of all four lanes of Vec1 and Vec2, broadcast to all four lanes of the result. */
FORCEINLINE VectorRegister4Double VectorDot4(VectorRegister4Double Vec1, VectorRegister4Double Vec2)
{
    // Folded = (x1*x2 + z1*z2, y1*y2 + w1*w2).
    VectorRegister2Double XYProducts = vmulq_f64(Vec1.XY, Vec2.XY);
    VectorRegister2Double Folded = vfmaq_f64(XYProducts, Vec1.ZW, Vec2.ZW);

    // Swap the two lanes and add, so both lanes hold the full sum.
    VectorRegister2Double Swapped = vextq_f64(Folded, Folded, 1);
    const VectorRegister2Double Total = vaddq_f64(Swapped, Folded);

    return VectorRegister4Double(Total, Total);
}


/** Per-lane Vec1 == Vec2; each result lane is an all-ones or all-zeros bit mask. */
FORCEINLINE VectorRegister4Float VectorCompareEQ(VectorRegister4Float Vec1, VectorRegister4Float Vec2)
{
    const VectorRegister4Float Mask = (VectorRegister4Float)vceqq_f32(Vec1, Vec2);
    return Mask;
}


/** Per-lane Vec1 == Vec2 for 4-double registers; each result lane is an all-ones or all-zeros bit mask. */
FORCEINLINE VectorRegister4Double VectorCompareEQ(VectorRegister4Double Vec1, VectorRegister4Double Vec2)
{
    const VectorRegister2Double MaskXY = (VectorRegister2Double)vceqq_f64(Vec1.XY, Vec2.XY);
    const VectorRegister2Double MaskZW = (VectorRegister2Double)vceqq_f64(Vec1.ZW, Vec2.ZW);
    return VectorRegister4Double(MaskXY, MaskZW);
}


/** Per-lane Vec1 != Vec2, formed as the bitwise NOT of the equality mask. */
FORCEINLINE VectorRegister4Float VectorCompareNE(VectorRegister4Float Vec1, VectorRegister4Float Vec2)
{
    const VectorRegister4Float Mask = (VectorRegister4Float)vmvnq_u32(vceqq_f32(Vec1, Vec2));
    return Mask;
}


/** Per-lane Vec1 != Vec2 for 4-double registers, formed as the bitwise NOT of each half's equality mask. */
FORCEINLINE VectorRegister4Double VectorCompareNE(VectorRegister4Double Vec1, VectorRegister4Double Vec2)
{
    const VectorRegister2Double MaskXY = (VectorRegister2Double)vmvnq_u32(vceqq_f64(Vec1.XY, Vec2.XY));
    const VectorRegister2Double MaskZW = (VectorRegister2Double)vmvnq_u32(vceqq_f64(Vec1.ZW, Vec2.ZW));
    return VectorRegister4Double(MaskXY, MaskZW);
}


/** Per-lane Vec1 > Vec2; each result lane is an all-ones or all-zeros bit mask. */
FORCEINLINE VectorRegister4Float VectorCompareGT(VectorRegister4Float Vec1, VectorRegister4Float Vec2)
{
    const VectorRegister4Float Mask = (VectorRegister4Float)vcgtq_f32(Vec1, Vec2);
    return Mask;
}


/** Per-lane Vec1 > Vec2 for 4-double registers; each result lane is an all-ones or all-zeros bit mask. */
FORCEINLINE VectorRegister4Double VectorCompareGT(VectorRegister4Double Vec1, VectorRegister4Double Vec2)
{
    const VectorRegister2Double MaskXY = (VectorRegister2Double)vcgtq_f64(Vec1.XY, Vec2.XY);
    const VectorRegister2Double MaskZW = (VectorRegister2Double)vcgtq_f64(Vec1.ZW, Vec2.ZW);
    return VectorRegister4Double(MaskXY, MaskZW);
}


/** Per-lane Vec1 >= Vec2; each result lane is an all-ones or all-zeros bit mask. */
FORCEINLINE VectorRegister4Float VectorCompareGE(VectorRegister4Float Vec1, VectorRegister4Float Vec2)
{
    const VectorRegister4Float Mask = (VectorRegister4Float)vcgeq_f32(Vec1, Vec2);
    return Mask;
}


/** Per-lane Vec1 >= Vec2 for 4-double registers; each result lane is an all-ones or all-zeros bit mask. */
FORCEINLINE VectorRegister4Double VectorCompareGE(VectorRegister4Double Vec1, VectorRegister4Double Vec2)
{
    const VectorRegister2Double MaskXY = (VectorRegister2Double)vcgeq_f64(Vec1.XY, Vec2.XY);
    const VectorRegister2Double MaskZW = (VectorRegister2Double)vcgeq_f64(Vec1.ZW, Vec2.ZW);
    return VectorRegister4Double(MaskXY, MaskZW);
}


/** Per-lane Vec1 < Vec2; each result lane is an all-ones or all-zeros bit mask. */
FORCEINLINE VectorRegister4Float VectorCompareLT(VectorRegister4Float Vec1, VectorRegister4Float Vec2)
{
    const VectorRegister4Float Mask = (VectorRegister4Float)vcltq_f32(Vec1, Vec2);
    return Mask;
}


/** Per-lane Vec1 < Vec2 for 4-double registers; each result lane is an all-ones or all-zeros bit mask. */
FORCEINLINE VectorRegister4Double VectorCompareLT(VectorRegister4Double Vec1, VectorRegister4Double Vec2)
{
    const VectorRegister2Double MaskXY = (VectorRegister2Double)vcltq_f64(Vec1.XY, Vec2.XY);
    const VectorRegister2Double MaskZW = (VectorRegister2Double)vcltq_f64(Vec1.ZW, Vec2.ZW);
    return VectorRegister4Double(MaskXY, MaskZW);
}


/** Per-lane Vec1 <= Vec2; each result lane is an all-ones or all-zeros bit mask. */
FORCEINLINE VectorRegister4Float VectorCompareLE(VectorRegister4Float Vec1, VectorRegister4Float Vec2)
{
    const VectorRegister4Float Mask = (VectorRegister4Float)vcleq_f32(Vec1, Vec2);
    return Mask;
}


/** Per-lane Vec1 <= Vec2 for 4-double registers; each result lane is an all-ones or all-zeros bit mask. */
FORCEINLINE VectorRegister4Double VectorCompareLE(VectorRegister4Double Vec1, VectorRegister4Double Vec2)
{
    const VectorRegister2Double MaskXY = (VectorRegister2Double)vcleq_f64(Vec1.XY, Vec2.XY);
    const VectorRegister2Double MaskZW = (VectorRegister2Double)vcleq_f64(Vec1.ZW, Vec2.ZW);
    return VectorRegister4Double(MaskXY, MaskZW);
}


/** Bitwise select: takes bits from Vec1 where Mask has a 1 bit and from Vec2 where it has a 0 bit. */
FORCEINLINE VectorRegister4Float VectorSelect(VectorRegister4Float Mask, VectorRegister4Float Vec1, VectorRegister4Float Vec2)
{
    VectorRegister4Int MaskBits = (VectorRegister4Int)Mask;
    return vbslq_f32(MaskBits, Vec1, Vec2);
}


/** Bitwise select for 4-double registers: bits from Vec1 where Mask is 1, from Vec2 where it is 0. */
FORCEINLINE VectorRegister4Double VectorSelect(VectorRegister4Double Mask, VectorRegister4Double Vec1, VectorRegister4Double Vec2)
{
    const VectorRegister2Double PickedXY = vbslq_f64((VectorRegister2Int64)Mask.XY, Vec1.XY, Vec2.XY);
    const VectorRegister2Double PickedZW = vbslq_f64((VectorRegister2Int64)Mask.ZW, Vec1.ZW, Vec2.ZW);
    return VectorRegister4Double(PickedXY, PickedZW);
}


/** Bitwise OR of the raw bits of Vec1 and Vec2. */
FORCEINLINE VectorRegister4Float VectorBitwiseOr(VectorRegister4Float Vec1, VectorRegister4Float Vec2)
{
    VectorRegister4Int LeftBits = (VectorRegister4Int)Vec1;
    VectorRegister4Int RightBits = (VectorRegister4Int)Vec2;
    return (VectorRegister4Float)vorrq_u32(LeftBits, RightBits);
}


/** Bitwise OR of the raw bits of Vec1 and Vec2 for 4-double registers, computed on each half. */
FORCEINLINE VectorRegister4Double VectorBitwiseOr(VectorRegister4Double Vec1, VectorRegister4Double Vec2)
{
    const VectorRegister2Double BitsXY = (VectorRegister2Double)vorrq_u64((VectorRegister2Int64)Vec1.XY, (VectorRegister2Int64)Vec2.XY);
    const VectorRegister2Double BitsZW = (VectorRegister2Double)vorrq_u64((VectorRegister2Int64)Vec1.ZW, (VectorRegister2Int64)Vec2.ZW);
    return VectorRegister4Double(BitsXY, BitsZW);
}


/** Bitwise AND of the raw bits of Vec1 and Vec2. */
FORCEINLINE VectorRegister4Float VectorBitwiseAnd(VectorRegister4Float Vec1, VectorRegister4Float Vec2)
{
    VectorRegister4Int LeftBits = (VectorRegister4Int)Vec1;
    VectorRegister4Int RightBits = (VectorRegister4Int)Vec2;
    return (VectorRegister4Float)vandq_u32(LeftBits, RightBits);
}


/** Bitwise AND of the raw bits of Vec1 and Vec2 for 4-double registers, computed on each half. */
FORCEINLINE VectorRegister4Double VectorBitwiseAnd(VectorRegister4Double Vec1, VectorRegister4Double Vec2)
{
    const VectorRegister2Double BitsXY = (VectorRegister2Double)vandq_u64((VectorRegister2Int64)Vec1.XY, (VectorRegister2Int64)Vec2.XY);
    const VectorRegister2Double BitsZW = (VectorRegister2Double)vandq_u64((VectorRegister2Int64)Vec1.ZW, (VectorRegister2Int64)Vec2.ZW);
    return VectorRegister4Double(BitsXY, BitsZW);
}


/** Bitwise XOR of the raw bits of Vec1 and Vec2. */
FORCEINLINE VectorRegister4Float VectorBitwiseXor(VectorRegister4Float Vec1, VectorRegister4Float Vec2)
{
    VectorRegister4Int LeftBits = (VectorRegister4Int)Vec1;
    VectorRegister4Int RightBits = (VectorRegister4Int)Vec2;
    return (VectorRegister4Float)veorq_u32(LeftBits, RightBits);
}


/** Bitwise XOR of the raw bits of Vec1 and Vec2 for 4-double registers, computed on each half. */
FORCEINLINE VectorRegister4Double VectorBitwiseXor(VectorRegister4Double Vec1, VectorRegister4Double Vec2)
{
    const VectorRegister2Double BitsXY = (VectorRegister2Double)veorq_u64((VectorRegister2Int64)Vec1.XY, (VectorRegister2Int64)Vec2.XY);
    const VectorRegister2Double BitsZW = (VectorRegister2Double)veorq_u64((VectorRegister2Int64)Vec1.ZW, (VectorRegister2Int64)Vec2.ZW);
    return VectorRegister4Double(BitsXY, BitsZW);
}


#ifndef __clang__

/**
 * Non-clang path: returns (V[E0], V[E1], V[E2], V[E3]) using byte table lookups.
 * Every index must be below 4 (enforced with check()).
 */
FORCEINLINE VectorRegister4Float VectorSwizzle
(
    VectorRegister4Float V,
    uint32 E0,
    uint32 E1,
    uint32 E2,
    uint32 E3
)
{
    check((E0 < 4) && (E1 < 4) && (E2 < 4) && (E3 < 4));

    // Entry i packs the four byte offsets that make up 32-bit lane i of the source.
    static constexpr uint32_t LaneByteOffsets[4] =
    {
        0x03020100, // lane X: bytes 0-3
        0x07060504, // lane Y: bytes 4-7
        0x0B0A0908, // lane Z: bytes 8-11
        0x0F0E0D0C, // lane W: bytes 12-15
    };

    // The 16 source bytes form a two-entry lookup table.
    uint8x8x2_t SourceBytes;
    SourceBytes.val[0] = vget_low_f32(V);
    SourceBytes.val[1] = vget_high_f32(V);

    // Each output half is produced by one lookup with two packed lane selectors.
    const uint64 LowSelector = static_cast<uint64>(LaneByteOffsets[E0]) | (static_cast<uint64>(LaneByteOffsets[E1]) << 32);
    const uint64 HighSelector = static_cast<uint64>(LaneByteOffsets[E2]) | (static_cast<uint64>(LaneByteOffsets[E3]) << 32);

    const uint32x2_t LowIndices = vcreate_u32(LowSelector);
    const uint8x8_t LowHalf = vtbl2_u8(SourceBytes, LowIndices);

    const uint32x2_t HighIndices = vcreate_u32(HighSelector);
    const uint8x8_t HighHalf = vtbl2_u8(SourceBytes, HighIndices);

    return vcombine_f32(LowHalf, HighHalf);
}


/**
 * Non-clang path: returns (V[E0], V[E1], V[E2], V[E3]) for a 4-double register using byte table lookups.
 * Every index must be below 4 (enforced with check()).
 */
FORCEINLINE VectorRegister4Double VectorSwizzle
(
    VectorRegister4Double V,
    uint32 E0,
    uint32 E1,
    uint32 E2,
    uint32 E3
)
{
    check((E0 < 4) && (E1 < 4) && (E2 < 4) && (E3 < 4));

    // Entry i packs the eight byte offsets that make up 64-bit lane i of the 32-byte source.
    static constexpr uint64_t LaneByteOffsets[4] =
    {
        0x0706050403020100ULL, // lane X: bytes 0-7
        0x0F0E0D0C0B0A0908ULL, // lane Y: bytes 8-15
        0x1716151413121110ULL, // lane Z: bytes 16-23
        0x1F1E1D1C1B1A1918ULL, // lane W: bytes 24-31
    };

    // Both halves of V form a two-entry lookup table.
    uint8x16x2_t SourceBytes;
    SourceBytes.val[0] = V.XY;
    SourceBytes.val[1] = V.ZW;

    VectorRegister4Double Swizzled;

    const uint32x4_t LowIndices = vcombine_u64(vcreate_u64(LaneByteOffsets[E0]), vcreate_u64(LaneByteOffsets[E1]));
    Swizzled.XY = vqtbl2q_u8(SourceBytes, LowIndices);

    const uint32x4_t HighIndices = vcombine_u64(vcreate_u64(LaneByteOffsets[E2]), vcreate_u64(LaneByteOffsets[E3]));
    Swizzled.ZW = vqtbl2q_u8(SourceBytes, HighIndices);

    return Swizzled;
}

#else

/** Clang path: returns (Vec[X], Vec[Y], Vec[Z], Vec[W]) with compile-time lane indices. */
template <int X, int Y, int Z, int W>
FORCEINLINE VectorRegister4Float VectorSwizzleImpl(VectorRegister4Float Vec)
{
    const VectorRegister4Float Permuted = __builtin_shufflevector(Vec, Vec, X, Y, Z, W);
    return Permuted;
}


/**
 * Clang path: returns the 2-double register (Vec[X], Vec[Y]), where X and Y index the four
 * lanes of Vec (0-1 in the XY half, 2-3 in the ZW half).
 *
 * __builtin_shufflevector numbers the lanes of its first operand 0-1 and of its second
 * operand 2-3, so each case orders the operands and re-bases the indices accordingly.
 */
template <int X, int Y>
FORCEINLINE VectorRegister2Double VectorSwizzleImpl2(VectorRegister4Double Vec)
{
    if constexpr (X <= 1 && Y <= 1)
    {
        // Both lanes come from XY.
        return __builtin_shufflevector(Vec.XY, Vec.XY, X, Y);
    }
    else if constexpr (X <= 1)
    {
        // First lane from XY, second from ZW: Y (2 or 3) already addresses the second operand.
        return __builtin_shufflevector(Vec.XY, Vec.ZW, X, Y);
    }
    else if constexpr (Y <= 1)
    {
        // First lane from ZW, second from XY: shift both indices into the swapped operand order.
        return __builtin_shufflevector(Vec.ZW, Vec.XY, X - 2, Y + 2);
    }
    else
    {
        // Both lanes come from ZW.
        return __builtin_shufflevector(Vec.ZW, Vec.ZW, X - 2, Y);
    }
}


/** Clang path: returns (Vec[X], Vec[Y], Vec[Z], Vec[W]) for a 4-double register, one output half at a time. */
template <int X, int Y, int Z, int W>
FORCEINLINE VectorRegister4Double VectorSwizzleImpl(VectorRegister4Double Vec)
{
    const VectorRegister2Double LowHalf = VectorSwizzleImpl2<X, Y>(Vec);
    const VectorRegister2Double HighHalf = VectorSwizzleImpl2<Z, W>(Vec);
    return VectorRegister4Double(LowHalf, HighHalf);
}


// Clang path: the lane indices become template arguments, so X, Y, Z and W must be compile-time constants.
#define VectorSwizzle( Vec, X, Y, Z, W ) VectorSwizzleImpl<X, Y, Z, W>(Vec)

#endif // __clang__


#ifndef __clang__

// Runtime shuffle of two float vectors (non-clang path).
// Returns { V1[PermuteX], V1[PermuteY], V2[PermuteZ], V2[PermuteW] }; every index must be in 0..3.
FORCEINLINE VectorRegister4Float VectorShuffle
(
    VectorRegister4Float V1,
    VectorRegister4Float V2,
    uint32 PermuteX,
    uint32 PermuteY,
    uint32 PermuteZ,
    uint32 PermuteW
)
{
    check(PermuteX <= 3 && PermuteY <= 3 && PermuteZ <= 3 && PermuteW <= 3);

    // Each entry holds the four table byte indices of one 32-bit lane.
    // Entries 0..3 address V1 (table bytes 0..15), entries 4..7 address V2 (table bytes 16..31).
    static constexpr uint32 ControlElement[8] =
    {
        0x03020100, // XM_PERMUTE_0X
        0x07060504, // XM_PERMUTE_0Y
        0x0B0A0908, // XM_PERMUTE_0Z
        0x0F0E0D0C, // XM_PERMUTE_0W
        0x13121110, // XM_PERMUTE_1X
        0x17161514, // XM_PERMUTE_1Y
        0x1B1A1918, // XM_PERMUTE_1Z
        0x1F1E1D1C, // XM_PERMUTE_1W
    };

    // 32-byte lookup table: V1 followed by V2.
    uint8x8x4_t Table;
    Table.val[0] = vget_low_f32(V1);
    Table.val[1] = vget_high_f32(V1);
    Table.val[2] = vget_low_f32(V2);
    Table.val[3] = vget_high_f32(V2);

    // Lower two output lanes are looked up through the V1 entries.
    const uint64 LowerSelector = static_cast<uint64>(ControlElement[PermuteX]) | (static_cast<uint64>(ControlElement[PermuteY]) << 32);
    const uint32x2_t LowerIdx = vcreate_u32(LowerSelector);
    const uint8x8_t LowerHalf = vtbl4_u8(Table, LowerIdx);

    // Upper two output lanes are looked up through the V2 entries (offset by 4).
    const uint64 UpperSelector = static_cast<uint64>(ControlElement[PermuteZ + 4]) | (static_cast<uint64>(ControlElement[PermuteW + 4]) << 32);
    const uint32x2_t UpperIdx = vcreate_u32(UpperSelector);
    const uint8x8_t UpperHalf = vtbl4_u8(Table, UpperIdx);

    return vcombine_f32(LowerHalf, UpperHalf);
}


// Runtime shuffle of two double vectors (non-clang path).
// Returns { V1[PermuteX], V1[PermuteY], V2[PermuteZ], V2[PermuteW] }; every index must be in 0..3.
FORCEINLINE VectorRegister4Double VectorShuffle
(
    VectorRegister4Double V1,
    VectorRegister4Double V2,
    uint32 PermuteX,
    uint32 PermuteY,
    uint32 PermuteZ,
    uint32 PermuteW
)
{
    check(PermuteX <= 3 && PermuteY <= 3 && PermuteZ <= 3 && PermuteW <= 3);

    // Each entry holds the eight table byte indices of one 64-bit lane.
    // Entries 0..3 address V1 (table bytes 0..31), entries 4..7 address V2 (table bytes 32..63).
    static constexpr uint64 ControlElement[8] =
    {
        0x0706050403020100ULL, // XM_PERMUTE_0X
        0x0F0E0D0C0B0A0908ULL, // XM_PERMUTE_0Y
        0x1716151413121110ULL, // XM_PERMUTE_0Z
        0x1F1E1D1C1B1A1918ULL, // XM_PERMUTE_0W

        0x2726252423222120ULL, // XM_PERMUTE_1X
        0x2F2E2D2C2B2A2928ULL, // XM_PERMUTE_1Y
        0x3736353433323130ULL, // XM_PERMUTE_1Z
        0x3F3E3D3C3B3A3938ULL, // XM_PERMUTE_1W
    };

    // 64-byte lookup table: V1.XY, V1.ZW, V2.XY, V2.ZW.
    uint8x16x4_t Table;
    Table.val[0] = V1.XY;
    Table.val[1] = V1.ZW;
    Table.val[2] = V2.XY;
    Table.val[3] = V2.ZW;

    VectorRegister4Double Shuffled;

    // XY half is looked up through the V1 entries.
    const uint32x4_t LowerIdx = vcombine_u64(vcreate_u64(ControlElement[PermuteX]), vcreate_u64(ControlElement[PermuteY]));
    Shuffled.XY = vqtbl4q_u8(Table, LowerIdx);

    // ZW half is looked up through the V2 entries (offset by 4).
    const uint32x4_t UpperIdx = vcombine_u64(vcreate_u64(ControlElement[PermuteZ + 4]), vcreate_u64(ControlElement[PermuteW + 4]));
    Shuffled.ZW = vqtbl4q_u8(Table, UpperIdx);

    return Shuffled;
}

#else


// Compile-time shuffle of two float vectors: returns { Vec1[X], Vec1[Y], Vec2[Z], Vec2[W] }.
template <int X, int Y, int Z, int W>
FORCEINLINE VectorRegister4Float VectorShuffleImpl(VectorRegister4Float Vec1, VectorRegister4Float Vec2)
{
    // Shuffle indices 4..7 address the lanes of the second operand.
    constexpr int SecondOperandZ = Z + 4;
    constexpr int SecondOperandW = W + 4;
    return __builtin_shufflevector(Vec1, Vec2, X, Y, SecondOperandZ, SecondOperandW);
}


// Compile-time shuffle of two double vectors: returns { Vec1[X], Vec1[Y], Vec2[Z], Vec2[W] }.
template <int X, int Y, int Z, int W>
FORCEINLINE VectorRegister4Double VectorShuffleImpl(VectorRegister4Double Vec1, VectorRegister4Double Vec2)
{
    VectorRegister4Double Shuffled;
    // Lower half is gathered from Vec1, upper half from Vec2.
    Shuffled.XY = VectorSwizzleImpl2<X, Y>(Vec1);
    Shuffled.ZW = VectorSwizzleImpl2<Z, W>(Vec2);
    return Shuffled;
}


// Shuffle entry point for this branch: forwards X, Y, Z, W as template arguments to
// VectorShuffleImpl, so they must be compile-time constants here.
#define VectorShuffle( Vec1, Vec2, X, Y, Z, W ) VectorShuffleImpl<X, Y, Z, W>(Vec1, Vec2)

#endif // __clang__


// Packs the sign bit of each float lane into the low four bits of the result
// (lane 0 -> bit 0, lane 1 -> bit 1, lane 2 -> bit 2, lane 3 -> bit 3).
FORCEINLINE uint32 VectorMaskBits(VectorRegister4Float VecMask)
{
    // One distinct bit per lane position.
    const int32x4_t LaneBits = MakeVectorRegisterInt(0x1, 0x2, 0x4, 0x8);
    // Arithmetic shift replicates each lane's sign bit across all 32 bits.
    const int32x4_t SignFill = vshrq_n_s32(vreinterpretq_s32_f32(VecMask), 31);
    // Keep the lane bit only where the sign was set, then sum the lanes into one value.
    const int32x4_t Selected = vandq_s32(SignFill, LaneBits);
    return uint32(vaddvq_s32(Selected));
}


// Packs the sign bit of each double lane into the low four bits of the result
// (X -> bit 0, Y -> bit 1, Z -> bit 2, W -> bit 3).
FORCEINLINE uint32 VectorMaskBits(VectorRegister4Double VecMask)
{
    // The halves are float64x2_t, so reinterpret with the f64 variant (the f32 variant only
    // compiled through lax vector conversions).
    int64x2_t Signs0 = vshrq_n_s64(vreinterpretq_s64_f64(VecMask.XY), 63); // sign bit of each lane replicated 64x
    int64x2_t Signs1 = vshrq_n_s64(vreinterpretq_s64_f64(VecMask.ZW), 63); // sign bit of each lane replicated 64x

    // Keep the even 32-bit elements of each 64-bit mask: one 32-bit mask per original lane.
    // Explicit reinterprets make the s64 -> s32 view change visible and type-correct.
    int32x4_t Signs = vuzp1q_s32(vreinterpretq_s32_s64(Signs0), vreinterpretq_s32_s64(Signs1)); // 32-bit masks

    int32x4_t Masked = vandq_s32(Signs, MakeVectorRegisterInt(0x1, 0x2, 0x4, 0x8)); // pick bit for lane position
    return uint32(vaddvq_s32(Masked)); // reduce via add
}


// Returns { Vec1.Z, Vec1.W, Vec2.Z, Vec2.W }.
FORCEINLINE VectorRegister4Float VectorCombineHigh(VectorRegister4Float Vec1, VectorRegister4Float Vec2)
{
    const float32x2_t UpperOfFirst = vget_high_f32(Vec1);
    const float32x2_t UpperOfSecond = vget_high_f32(Vec2);
    return vcombine_f32(UpperOfFirst, UpperOfSecond);
}


// Returns { Vec1.Z, Vec1.W, Vec2.Z, Vec2.W }.
FORCEINLINE VectorRegister4Double VectorCombineHigh(VectorRegister4Double Vec1, VectorRegister4Double Vec2)
{
    VectorRegister4Double Combined;
    Combined.XY = Vec1.ZW; // upper half of the first input
    Combined.ZW = Vec2.ZW; // upper half of the second input
    return Combined;
}


// Returns { Vec1.X, Vec1.Y, Vec2.X, Vec2.Y }.
FORCEINLINE VectorRegister4Float VectorCombineLow(VectorRegister4Float Vec1, VectorRegister4Float Vec2)
{
    const float32x2_t LowerOfFirst = vget_low_f32(Vec1);
    const float32x2_t LowerOfSecond = vget_low_f32(Vec2);
    return vcombine_f32(LowerOfFirst, LowerOfSecond);
}


// Returns { Vec1.X, Vec1.Y, Vec2.X, Vec2.Y }.
FORCEINLINE VectorRegister4Double VectorCombineLow(VectorRegister4Double Vec1, VectorRegister4Double Vec2)
{
    VectorRegister4Double Combined;
    Combined.XY = Vec1.XY; // lower half of the first input
    Combined.ZW = Vec2.XY; // lower half of the second input
    return Combined;
}


// Splits the eight floats of Lo followed by Hi into even-indexed and odd-indexed elements.
// OutEvens = { Lo[0], Lo[2], Hi[0], Hi[2] }, OutOdds = { Lo[1], Lo[3], Hi[1], Hi[3] }.
FORCEINLINE void VectorDeinterleave(VectorRegister4Float& RESTRICT OutEvens, VectorRegister4Float& RESTRICT OutOdds, VectorRegister4Float Lo, VectorRegister4Float Hi)
{
    // vuzpq produces both halves of the unzip in a single call.
    const float32x4x2_t Unzipped = vuzpq_f32(Lo, Hi);
    OutEvens = Unzipped.val[0];
    OutOdds = Unzipped.val[1];
}


// Splits the eight doubles of Lo followed by Hi into even-indexed and odd-indexed elements.
// OutEvens = { Lo[0], Lo[2], Hi[0], Hi[2] }, OutOdds = { Lo[1], Lo[3], Hi[1], Hi[3] }.
FORCEINLINE void VectorDeinterleave(VectorRegister4Double& RESTRICT OutEvens, VectorRegister4Double& RESTRICT OutOdds, VectorRegister4Double Lo, VectorRegister4Double Hi)
{
    // VectorShuffle takes its first two lanes from Lo and its last two from Hi.
    const VectorRegister4Double EvenLanes = VectorShuffle(Lo, Hi, 0, 2, 0, 2);
    const VectorRegister4Double OddLanes = VectorShuffle(Lo, Hi, 1, 3, 1, 3);
    OutEvens = EvenLanes;
    OutOdds = OddLanes;
}


// Cross product of the XYZ parts of two float vectors.
// Uses the "multiply against YZX-rotated operands, subtract, rotate once more" formulation.
FORCEINLINE VectorRegister4Float VectorCross(VectorRegister4Float Vec1, VectorRegister4Float Vec2)
{
    // Vec1 * Vec2.yzxw
    const VectorRegister4Float Rotated2 = VectorSwizzle(Vec2, 1, 2, 0, 3);
    const VectorRegister4Float FirstProduct = VectorMultiply(Vec1, Rotated2);

    // Combine with Vec1.yzxw * Vec2 via VectorNegateMultiplyAdd (presumably FirstProduct - Vec1.yzxw * Vec2).
    const VectorRegister4Float Rotated1 = VectorSwizzle(Vec1, 1, 2, 0, 3);
    const VectorRegister4Float Difference = VectorNegateMultiplyAdd(Rotated1, Vec2, FirstProduct);

    // Rotate the lanes back into XYZ order.
    return VectorSwizzle(Difference, 1, 2, 0, 3);
}


// Cross product of the XYZ parts of two double vectors.
// Uses the "multiply against YZX-rotated operands, subtract, rotate once more" formulation.
FORCEINLINE VectorRegister4Double VectorCross(VectorRegister4Double Vec1, VectorRegister4Double Vec2)
{
    // Vec1 * Vec2.yzxw
    const VectorRegister4Double Rotated2 = VectorSwizzle(Vec2, 1, 2, 0, 3);
    const VectorRegister4Double FirstProduct = VectorMultiply(Vec1, Rotated2);

    // Combine with Vec1.yzxw * Vec2 via VectorNegateMultiplyAdd (presumably FirstProduct - Vec1.yzxw * Vec2).
    const VectorRegister4Double Rotated1 = VectorSwizzle(Vec1, 1, 2, 0, 3);
    const VectorRegister4Double Difference = VectorNegateMultiplyAdd(Rotated1, Vec2, FirstProduct);

    // Rotate the lanes back into XYZ order.
    return VectorSwizzle(Difference, 1, 2, 0, 3);
}


// Per-lane power: lane i of the result is powf(Base[i], Exponent[i]).
FORCEINLINE VectorRegister4Float VectorPow(VectorRegister4Float Base, VectorRegister4Float Exponent)
{
    //@TODO: Optimize this
    // Scalar fallback: view each register as four floats through a union.
    union FLaneView
    {
        VectorRegister4Float V;
        float F[4];
        FORCEINLINE FLaneView() : V() {}
    };

    FLaneView BaseLanes;
    FLaneView ExponentLanes;
    BaseLanes.V = Base;
    ExponentLanes.V = Exponent;

    const float PowX = powf(BaseLanes.F[0], ExponentLanes.F[0]);
    const float PowY = powf(BaseLanes.F[1], ExponentLanes.F[1]);
    const float PowZ = powf(BaseLanes.F[2], ExponentLanes.F[2]);
    const float PowW = powf(BaseLanes.F[3], ExponentLanes.F[3]);
    return MakeVectorRegister(PowX, PowY, PowZ, PowW);
}


// Per-lane power: lane i of the result is FMath::Pow(Base[i], Exponent[i]).
FORCEINLINE VectorRegister4Double VectorPow(VectorRegister4Double Base, VectorRegister4Double Exponent)
{
    //@TODO: Optimize this
    // Scalar fallback: spill both registers to arrays and raise each lane in place.
    AlignedDouble4 Values(Base);
    AlignedDouble4 Exponents(Exponent);

    for (int Lane = 0; Lane < 4; ++Lane)
    {
        Values[Lane] = FMath::Pow(Values[Lane], Exponents[Lane]);
    }
    return Values.ToVectorRegister();
}


// Hardware reciprocal estimate (1 / Vec) per float lane; low precision, no refinement.
FORCEINLINE VectorRegister4Float VectorReciprocalEstimate(VectorRegister4Float Vec)
{
    const VectorRegister4Float Estimate = vrecpeq_f32(Vec);
    return Estimate;
}


// Hardware reciprocal estimate (1 / Vec) per double lane; low precision, no refinement.
FORCEINLINE VectorRegister4Double VectorReciprocalEstimate(VectorRegister4Double Vec)
{
    // The 4-lane double register is two 2-lane halves; estimate each half.
    VectorRegister4Double Estimate;
    Estimate.XY = vrecpeq_f64(Vec.XY);
    Estimate.ZW = vrecpeq_f64(Vec.ZW);
    return Estimate;
}


// Reciprocal (1 / Vec) per float lane: hardware estimate refined by two Newton-Raphson passes.
FORCEINLINE VectorRegister4Float VectorReciprocal(VectorRegister4Float Vec)
{
    // The built-in refinement instruction (VRECPS) is not as accurate, so the step is done by hand.
    // One pass maps an estimate E to VectorNegateMultiplyAdd(Vec, E * E, E + E)
    // (i.e. 2E - Vec * E^2, assuming VectorNegateMultiplyAdd(A, B, C) computes C - A * B).
    const auto RefineOnce = [Vec](VectorRegister4Float Estimate) -> VectorRegister4Float
    {
        const VectorRegister4Float EstimateSquared = VectorMultiply(Estimate, Estimate);
        const VectorRegister4Float TwiceEstimate = VectorAdd(Estimate, Estimate);
        return VectorNegateMultiplyAdd(Vec, EstimateSquared, TwiceEstimate);
    };

    const VectorRegister4Float Initial = VectorReciprocalEstimate(Vec);
    const VectorRegister4Float AfterFirstPass = RefineOnce(Initial);
    return RefineOnce(AfterFirstPass);
}


// Reciprocal (1 / Vec) per double lane, computed with a full division rather than an estimate.
FORCEINLINE VectorRegister4Double VectorReciprocal(VectorRegister4Double Vec)
{
    const VectorRegister4Double Inverse = VectorDivide(GlobalVectorConstants::DoubleOne, Vec);
    return Inverse;
}


// Square root per float lane.
FORCEINLINE VectorRegister4Float VectorSqrt(VectorRegister4Float Vec)
{
    const VectorRegister4Float Root = vsqrtq_f32(Vec);
    return Root;
}


// Square root per double lane.
FORCEINLINE VectorRegister4Double VectorSqrt(VectorRegister4Double Vec)
{
    // Process the two 2-lane halves independently.
    VectorRegister4Double Root;
    Root.XY = vsqrtq_f64(Vec.XY);
    Root.ZW = vsqrtq_f64(Vec.ZW);
    return Root;
}


// Hardware reciprocal square root estimate (1 / sqrt(Vec)) per float lane; no refinement.
FORCEINLINE VectorRegister4Float VectorReciprocalSqrtEstimate(VectorRegister4Float Vec)
{
    const VectorRegister4Float Estimate = vrsqrteq_f32(Vec);
    return Estimate;
}


// Hardware reciprocal square root estimate (1 / sqrt(Vec)) per double lane; no refinement.
FORCEINLINE VectorRegister4Double VectorReciprocalSqrtEstimate(VectorRegister4Double Vec)
{
    // Process the two 2-lane halves independently.
    VectorRegister4Double Estimate;
    Estimate.XY = vrsqrteq_f64(Vec.XY);
    Estimate.ZW = vrsqrteq_f64(Vec.ZW);
    return Estimate;
}


// Reciprocal square root (1 / sqrt(Vec)) per float lane: hardware estimate plus two refinement steps.
FORCEINLINE VectorRegister4Float VectorReciprocalSqrt(VectorRegister4Float Vec)
{
    // Start from the hardware estimate.
    VectorRegister4Float Estimate = VectorReciprocalSqrtEstimate(Vec);

    // Two refinements: vrsqrtsq yields the correction factor for the current estimate.
    for (int Pass = 0; Pass < 2; ++Pass)
    {
        const VectorRegister4Float EstimateSquared = VectorMultiply(Estimate, Estimate);
        const VectorRegister4Float Correction = vrsqrtsq_f32(Vec, EstimateSquared);
        Estimate = VectorMultiply(Correction, Estimate);
    }
    return Estimate;
}


// Reciprocal square root (1 / sqrt(Vec)) per double lane: hardware estimate plus two refinement steps.
FORCEINLINE VectorRegister4Double VectorReciprocalSqrt(VectorRegister4Double Vec)
{
    // Start from the hardware estimate.
    VectorRegister4Double Estimate = VectorReciprocalSqrtEstimate(Vec);

    // Two refinements: vrsqrtsq yields the correction factor for the current estimate,
    // computed separately for each 2-lane half.
    for (int Pass = 0; Pass < 2; ++Pass)
    {
        VectorRegister4Double Correction;
        Correction.XY = vrsqrtsq_f64(Vec.XY, VectorMultiply(Estimate.XY, Estimate.XY));
        Correction.ZW = vrsqrtsq_f64(Vec.ZW, VectorMultiply(Estimate.ZW, Estimate.ZW));
        Estimate = VectorMultiply(Correction, Estimate);
    }
    return Estimate;
}


// Reciprocal of the 4-component length: VectorReciprocalSqrt applied to VectorDot4(Vector, Vector).
FORCEINLINE VectorRegister4Float VectorReciprocalLen(VectorRegister4Float Vector)
{
    const VectorRegister4Float SelfDot = VectorDot4(Vector, Vector);
    return VectorReciprocalSqrt(SelfDot);
}


// Reciprocal of the 4-component length: VectorReciprocalSqrt applied to VectorDot4(Vector, Vector).
FORCEINLINE VectorRegister4Double VectorReciprocalLen(VectorRegister4Double Vector)
{
    const VectorRegister4Double SelfDot = VectorDot4(Vector, Vector);
    return VectorReciprocalSqrt(SelfDot);
}


// Estimated reciprocal of the 4-component length: VectorReciprocalSqrtEstimate applied to
// VectorDot4(Vector, Vector). Faster but less precise than VectorReciprocalLen.
FORCEINLINE VectorRegister4Float VectorReciprocalLenEstimate(VectorRegister4Float Vector)
{
    const VectorRegister4Float SelfDot = VectorDot4(Vector, Vector);
    return VectorReciprocalSqrtEstimate(SelfDot);
}


// Estimated reciprocal of the 4-component length: VectorReciprocalSqrtEstimate applied to
// VectorDot4(Vector, Vector). Faster but less precise than VectorReciprocalLen.
FORCEINLINE VectorRegister4Double VectorReciprocalLenEstimate(VectorRegister4Double Vector)
{
    const VectorRegister4Double SelfDot = VectorDot4(Vector, Vector);
    return VectorReciprocalSqrtEstimate(SelfDot);
}


// Returns a copy of Vec with component 3 (W) replaced by 0.
FORCEINLINE VectorRegister4Float VectorSet_W0(VectorRegister4Float Vec)
{
    const VectorRegister4Float WithZeroW = VectorSetComponent(Vec, 3, 0.0f);
    return WithZeroW;
}


// Returns a copy of Vec with component 3 (W) replaced by 0.
FORCEINLINE VectorRegister4Double VectorSet_W0(VectorRegister4Double Vec)
{
    const VectorRegister4Double WithZeroW = VectorSetComponent(Vec, 3, 0.0);
    return WithZeroW;
}


// Returns a copy of Vec with component 3 (W) replaced by 1.
FORCEINLINE VectorRegister4Float VectorSet_W1(VectorRegister4Float Vec)
{
    const VectorRegister4Float WithUnitW = VectorSetComponent(Vec, 3, 1.0f);
    return WithUnitW;
}


// Returns a copy of Vec with component 3 (W) replaced by 1.
FORCEINLINE VectorRegister4Double VectorSet_W1(VectorRegister4Double Vec)
{
    const VectorRegister4Double WithUnitW = VectorSetComponent(Vec, 3, 1.0);
    return WithUnitW;
}


// Extracts lane ElementIndex (compile-time constant) from a float vector.
template <uint32 ElementIndex>
FORCEINLINE float VectorGetComponentImpl(VectorRegister4Float Vec)
{
    const float Lane = vgetq_lane_f32(Vec, ElementIndex);
    return Lane;
}


// Extracts lane ElementIndex (compile-time constant) from a 2-lane double vector.
template <int ElementIndex>
FORCEINLINE double VectorGetComponentImpl(VectorRegister2Double Vec)
{
    const double Lane = vgetq_lane_f64(Vec, ElementIndex);
    return Lane;
}


// Extracts lane ElementIndex (compile-time constant) from a 4-lane double vector.
// Lanes 0 and 1 live in Vec.XY; lanes 2 and 3 live in Vec.ZW.
template <int ElementIndex>
FORCEINLINE double VectorGetComponentImpl(VectorRegister4Double Vec)
{
    if constexpr (ElementIndex <= 1)
    {
        // Lower half: the index is used as-is.
        return VectorGetComponentImpl<ElementIndex>(Vec.XY);
    }
    else
    {
        // Upper half: rebase the index to 0..1.
        return VectorGetComponentImpl<ElementIndex - 2>(Vec.ZW);
    }
}


// Component read with a compile-time index: ElementIndex is forwarded as a template argument
// to the VectorGetComponentImpl overload matching the type of Vec.
#define VectorGetComponent(Vec, ElementIndex) VectorGetComponentImpl<ElementIndex>(Vec)


// Component read with a runtime index: copies the register into an AlignedFloat4 and indexes it.
FORCEINLINE float VectorGetComponentDynamic(VectorRegister4Float Vec, uint32 ElementIndex)
{
    AlignedFloat4 Lanes(Vec);
    const float Selected = Lanes[ElementIndex];
    return Selected;
}


// Component read with a runtime index: copies the register into an AlignedDouble4 and indexes it.
FORCEINLINE double VectorGetComponentDynamic(VectorRegister4Double Vec, uint32 ElementIndex)
{
    AlignedDouble4 Lanes(Vec);
    const double Selected = Lanes[ElementIndex];
    return Selected;
}


// Multiplies two 4x4 float matrices: *Result = (*Matrix1) * (*Matrix2).
// Both inputs are fully loaded before anything is stored.
FORCEINLINE void VectorMatrixMultiply(FMatrix44f* Result, const FMatrix44f* Matrix1, const FMatrix44f* Matrix2)
{
    const float32x4x4_t Left = vld1q_f32_x4((const float*)Matrix1);
    const float32x4x4_t Right = vld1q_f32_x4((const float*)Matrix2);

    // One result row = Row.x * Right[0] + Row.y * Right[1] + Row.z * Right[2] + Row.w * Right[3].
    const auto RowTimesRight = [&Right](float32x4_t Row) -> float32x4_t
    {
        float32x4_t Accum = vmulq_lane_f32(Right.val[0], vget_low_f32(Row), 0);
        Accum = vfmaq_lane_f32(Accum, Right.val[1], vget_low_f32(Row), 1);
        Accum = vfmaq_lane_f32(Accum, Right.val[2], vget_high_f32(Row), 0);
        Accum = vfmaq_lane_f32(Accum, Right.val[3], vget_high_f32(Row), 1);
        return Accum;
    };

    float32x4x4_t Product;
    Product.val[0] = RowTimesRight(Left.val[0]); // First row of result (Matrix1[0] * Matrix2).
    Product.val[1] = RowTimesRight(Left.val[1]); // Second row of result (Matrix1[1] * Matrix2).
    Product.val[2] = RowTimesRight(Left.val[2]); // Third row of result (Matrix1[2] * Matrix2).
    Product.val[3] = RowTimesRight(Left.val[3]); // Fourth row of result (Matrix1[3] * Matrix2).

    vst1q_f32_x4((float*)Result, Product);
}


// Multiplies two 4x4 double matrices: *Result = (*Matrix1) * (*Matrix2).
//
// Each matrix is 16 doubles, so it takes two 4-register loads (8 doubles = 2 rows each).
// Matrix2 is loaded completely up front (B1 = rows 0-1, B2 = rows 2-3). Matrix1 is loaded two rows
// at a time, and the first two result rows are stored before the last two rows of Matrix1 are loaded.
FORCEINLINE void VectorMatrixMultiply(FMatrix44d* Result, const FMatrix44d* Matrix1, const FMatrix44d* Matrix2)
{
    float64x2x4_t A = vld1q_f64_x4((const double*)Matrix1);        // Matrix1 rows 0-1
    float64x2x4_t B1 = vld1q_f64_x4((const double*)Matrix2);       // Matrix2 rows 0-1: val[0]/val[1] = row 0 XY/ZW, val[2]/val[3] = row 1 XY/ZW
    float64x2x4_t B2 = vld1q_f64_x4((const double*)Matrix2 + 8);   // Matrix2 rows 2-3, same layout
    float64_t* V = (float64_t*)&A;                                 // scalar view of the 8 doubles currently held in A
    float64x2x4_t R;                                               // two result rows: val[0]/val[1] = first row XY/ZW, val[2]/val[3] = second row XY/ZW

    // First row of result (Matrix1[0] * Matrix2).
    // XY half
    R.val[0] = vmulq_n_f64(B1.val[0], V[0]);
    R.val[0] = vfmaq_n_f64(R.val[0], B1.val[2], V[1]);
    R.val[0] = vfmaq_n_f64(R.val[0], B2.val[0], V[2]);
    R.val[0] = vfmaq_n_f64(R.val[0], B2.val[2], V[3]);

    // ZW half
    R.val[1] = vmulq_n_f64(B1.val[1], V[0]);
    R.val[1] = vfmaq_n_f64(R.val[1], B1.val[3], V[1]);
    R.val[1] = vfmaq_n_f64(R.val[1], B2.val[1], V[2]);
    R.val[1] = vfmaq_n_f64(R.val[1], B2.val[3], V[3]);

    // Second row of result (Matrix1[1] * Matrix2).
    // XY half
    R.val[2] = vmulq_n_f64(B1.val[0], V[4]);
    R.val[2] = vfmaq_n_f64(R.val[2], B1.val[2], V[5]);
    R.val[2] = vfmaq_n_f64(R.val[2], B2.val[0], V[6]);
    R.val[2] = vfmaq_n_f64(R.val[2], B2.val[2], V[7]);

    // ZW half
    R.val[3] = vmulq_n_f64(B1.val[1], V[4]);
    R.val[3] = vfmaq_n_f64(R.val[3], B1.val[3], V[5]);
    R.val[3] = vfmaq_n_f64(R.val[3], B2.val[1], V[6]);
    R.val[3] = vfmaq_n_f64(R.val[3], B2.val[3], V[7]);

    // Store result rows 0-1, then reuse A for Matrix1 rows 2-3.
    // The store only writes the first 8 doubles of Result, so the second half of Matrix1 is still
    // intact when it is loaded below even if Result and Matrix1 are the same matrix.
    vst1q_f64_x4((double*)Result, R);
    A = vld1q_f64_x4((const double*)Matrix1 + 8);
    V = (float64_t*)&A;

    // Third row of result (Matrix1[2] * Matrix2).
    // XY half
    R.val[0] = vmulq_n_f64(B1.val[0], V[0]);
    R.val[0] = vfmaq_n_f64(R.val[0], B1.val[2], V[1]);
    R.val[0] = vfmaq_n_f64(R.val[0], B2.val[0], V[2]);
    R.val[0] = vfmaq_n_f64(R.val[0], B2.val[2], V[3]);

    // ZW half
    R.val[1] = vmulq_n_f64(B1.val[1], V[0]);
    R.val[1] = vfmaq_n_f64(R.val[1], B1.val[3], V[1]);
    R.val[1] = vfmaq_n_f64(R.val[1], B2.val[1], V[2]);
    R.val[1] = vfmaq_n_f64(R.val[1], B2.val[3], V[3]);

    // Fourth row of result (Matrix1[3] * Matrix2).
    // XY half
    R.val[2] = vmulq_n_f64(B1.val[0], V[4]);
    R.val[2] = vfmaq_n_f64(R.val[2], B1.val[2], V[5]);
    R.val[2] = vfmaq_n_f64(R.val[2], B2.val[0], V[6]);
    R.val[2] = vfmaq_n_f64(R.val[2], B2.val[2], V[7]);

    // ZW half
    R.val[3] = vmulq_n_f64(B1.val[1], V[4]);
    R.val[3] = vfmaq_n_f64(R.val[3], B1.val[3], V[5]);
    R.val[3] = vfmaq_n_f64(R.val[3], B2.val[1], V[6]);
    R.val[3] = vfmaq_n_f64(R.val[3], B2.val[3], V[7]);

    // Store result rows 2-3.
    vst1q_f64_x4((double*)Result + 8, R);
}


// Inverts a 4x4 double matrix by delegating to FMath::MatrixInverse and returning its result.
FORCEINLINE bool VectorMatrixInverse(FMatrix44d* DstMatrix, const FMatrix44d* SrcMatrix)
{
    const bool bResult = FMath::MatrixInverse(DstMatrix, SrcMatrix);
    return bResult;
}

// Inverts a 4x4 float matrix by delegating to FMath::MatrixInverse and returning its result.
FORCEINLINE bool VectorMatrixInverse(FMatrix44f* DstMatrix, const FMatrix44f* SrcMatrix)
{
    const bool bResult = FMath::MatrixInverse(DstMatrix, SrcMatrix);
    return bResult;
}


// Transforms a float vector by a 4x4 float matrix:
// result = VecP.x * Row0 + VecP.y * Row1 + VecP.z * Row2 + VecP.w * Row3.
FORCEINLINE VectorRegister4Float VectorTransformVector(VectorRegister4Float VecP, const FMatrix44f* MatrixM )
{
    const float32x4x4_t Rows = vld1q_f32_x4((const float*)MatrixM);

    // Scale each matrix row by the matching vector component and accumulate with fused multiply-adds.
    VectorRegister4Float Accum = vmulq_n_f32(Rows.val[0], VecP[0]);
    Accum = vfmaq_n_f32(Accum, Rows.val[1], VecP[1]);
    Accum = vfmaq_n_f32(Accum, Rows.val[2], VecP[2]);
    Accum = vfmaq_n_f32(Accum, Rows.val[3], VecP[3]);
    return Accum;
}


// Transforms a float vector by a 4x4 double matrix. The vector is widened to double, the
// transform is accumulated in double precision, and the result is converted back to float.
FORCEINLINE VectorRegister4Float VectorTransformVector(VectorRegister4Float VecP, const FMatrix44d* MatrixM)
{
    // Each load covers two matrix rows: val[0]/val[1] = first row XY/ZW, val[2]/val[3] = second row XY/ZW.
    const float64x2x4_t Rows01 = vld1q_f64_x4((const double*)MatrixM);
    const float64x2x4_t Rows23 = vld1q_f64_x4(((const double*)MatrixM) + 8);

    VectorRegister4Double Wide(VecP);
    VectorRegister4Double Accum;

    // XY half of the result: sum of component * row.XY
    Accum.XY = vmulq_n_f64(Rows01.val[0], Wide.XY[0]);
    Accum.XY = vfmaq_n_f64(Accum.XY, Rows01.val[2], Wide.XY[1]);
    Accum.XY = vfmaq_n_f64(Accum.XY, Rows23.val[0], Wide.ZW[0]);
    Accum.XY = vfmaq_n_f64(Accum.XY, Rows23.val[2], Wide.ZW[1]);

    // ZW half of the result: sum of component * row.ZW
    Accum.ZW = vmulq_n_f64(Rows01.val[1], Wide.XY[0]);
    Accum.ZW = vfmaq_n_f64(Accum.ZW, Rows01.val[3], Wide.XY[1]);
    Accum.ZW = vfmaq_n_f64(Accum.ZW, Rows23.val[1], Wide.ZW[0]);
    Accum.ZW = vfmaq_n_f64(Accum.ZW, Rows23.val[3], Wide.ZW[1]);

    return MakeVectorRegisterFloatFromDouble(Accum);
}


// Transforms a double vector by a 4x4 double matrix:
// result = VecP.x * Row0 + VecP.y * Row1 + VecP.z * Row2 + VecP.w * Row3.
FORCEINLINE VectorRegister4Double VectorTransformVector(VectorRegister4Double VecP, const FMatrix44d* MatrixM)
{
    // Each load covers two matrix rows: val[0]/val[1] = first row XY/ZW, val[2]/val[3] = second row XY/ZW.
    const float64x2x4_t Rows01 = vld1q_f64_x4((const double*)MatrixM);
    const float64x2x4_t Rows23 = vld1q_f64_x4(((const double*)MatrixM) + 8);

    VectorRegister4Double Accum;

    //TODO: this can be rewritten to avoid using the second load, saves some registers
    // XY half of the result: sum of component * row.XY
    Accum.XY = vmulq_n_f64(Rows01.val[0], VecP.XY[0]);
    Accum.XY = vfmaq_n_f64(Accum.XY, Rows01.val[2], VecP.XY[1]);
    Accum.XY = vfmaq_n_f64(Accum.XY, Rows23.val[0], VecP.ZW[0]);
    Accum.XY = vfmaq_n_f64(Accum.XY, Rows23.val[2], VecP.ZW[1]);

    // ZW half of the result: sum of component * row.ZW
    Accum.ZW = vmulq_n_f64(Rows01.val[1], VecP.XY[0]);
    Accum.ZW = vfmaq_n_f64(Accum.ZW, Rows01.val[3], VecP.XY[1]);
    Accum.ZW = vfmaq_n_f64(Accum.ZW, Rows23.val[1], VecP.ZW[0]);
    Accum.ZW = vfmaq_n_f64(Accum.ZW, Rows23.val[3], VecP.ZW[1]);

    return Accum;
}


// Component-wise minimum of two float vectors.
FORCEINLINE VectorRegister4Float VectorMin(VectorRegister4Float Vec1, VectorRegister4Float Vec2)
{
    const VectorRegister4Float Smaller = vminq_f32(Vec1, Vec2);
    return Smaller;
}


// Component-wise minimum of two double vectors.
FORCEINLINE VectorRegister4Double VectorMin(VectorRegister4Double Vec1, VectorRegister4Double Vec2)
{
    // Process the two 2-lane halves independently.
    VectorRegister4Double Smaller;
    Smaller.XY = vminq_f64(Vec1.XY, Vec2.XY);
    Smaller.ZW = vminq_f64(Vec1.ZW, Vec2.ZW);
    return Smaller;
}


// Component-wise maximum of two float vectors.
FORCEINLINE VectorRegister4Float VectorMax(VectorRegister4Float Vec1, VectorRegister4Float Vec2)
{
    const VectorRegister4Float Larger = vmaxq_f32(Vec1, Vec2);
    return Larger;
}


// Component-wise maximum of two double vectors.
FORCEINLINE VectorRegister4Double VectorMax(VectorRegister4Double Vec1, VectorRegister4Double Vec2)
{
    // Process the two 2-lane halves independently.
    VectorRegister4Double Larger;
    Larger.XY = vmaxq_f64(Vec1.XY, Vec2.XY);
    Larger.ZW = vmaxq_f64(Vec1.ZW, Vec2.ZW);
    return Larger;
}


// Returns { VecXYZ.X, VecXYZ.Y, VecXYZ.Z, VecW.W }.
FORCEINLINE VectorRegister4Float VectorMergeVecXYZ_VecW(VectorRegister4Float VecXYZ, VectorRegister4Float VecW)
{
    // Read lane 3 of VecW and write it into lane 3 of VecXYZ.
    const float NewW = vgetq_lane_f32(VecW, 3);
    return vsetq_lane_f32(NewW, VecXYZ, 3);
}


// Returns { VecXYZ.X, VecXYZ.Y, VecXYZ.Z, VecW.W }.
FORCEINLINE VectorRegister4Double VectorMergeVecXYZ_VecW(VectorRegister4Double VecXYZ, VectorRegister4Double VecW)
{
    // W is lane 1 of the ZW half; the XY half is taken from VecXYZ unchanged.
    const double NewW = vgetq_lane_f64(VecW.ZW, 1);

    VectorRegister4Double Merged;
    Merged.XY = VecXYZ.XY;
    Merged.ZW = vsetq_lane_f64(NewW, VecXYZ.ZW, 1);
    return Merged;
}


// Loads four unsigned bytes from Ptr and converts each to float (lane i = float(byte i)).
// The four bytes are read with a single 32-bit load.
FORCEINLINE VectorRegister4Float VectorLoadByte4(const void* Ptr)
{
    // Read the 4 bytes as one 32-bit value, then widen u8 -> u16 -> u32.
    const uint32x2_t PackedBytes = vld1_dup_u32((const uint32*)Ptr);
    const uint16x8_t Widened16 = vmovl_u8(vreinterpret_u8_u32(PackedBytes));
    // Only the low four 16-bit values correspond to the four source bytes.
    const uint32x4_t Widened32 = vmovl_u16(vget_low_u16(Widened16));
    return vcvtq_f32_u32(Widened32);
}


// Loads four signed bytes from Ptr and converts each to float (lane i = float(int8 i)).
// The four bytes are read with a single 32-bit load.
FORCEINLINE VectorRegister4Float VectorLoadSignedByte4(const void* Ptr)
{
    // Read the 4 bytes as one 32-bit value and view them as signed bytes.
    int8x8_t AsInt8 = vreinterpret_s8_u32(vld1_dup_u32((const uint32*)Ptr));
    // Sign-extend s8 -> s16.
    int16x8_t AsInt16 = vmovl_s8(AsInt8);
    // Sign-extend the low four s16 -> s32. Use the signed half-extract so the operand types match
    // (the unsigned variant only compiled through lax vector conversions).
    int32x4_t AsInt32 = vmovl_s16(vget_low_s16(AsInt16));
    return vcvtq_f32_s32(AsInt32);
}


// Loads four unsigned bytes from Ptr in reverse order and converts each to float
// (lane 0 = float(byte 3), ..., lane 3 = float(byte 0)).
FORCEINLINE VectorRegister4Float VectorLoadByte4Reverse(const uint8* Ptr)
{
    // Read the 4 bytes as one 32-bit value and reverse the byte order within it.
    const uint32x2_t PackedBytes = vld1_dup_u32((const uint32*)Ptr);
    const uint8x8_t ReversedBytes = vrev32_u8(vreinterpret_u8_u32(PackedBytes));
    // Widen u8 -> u16 -> u32; only the low four values are needed.
    const uint16x8_t Widened16 = vmovl_u8(ReversedBytes);
    const uint32x4_t Widened32 = vmovl_u16(vget_low_u16(Widened16));
    return vcvtq_f32_u32(Widened32);
}


// Converts the four float lanes to unsigned bytes with saturation to [0,255] and writes
// them to Ptr as a single 32-bit store.
FORCEINLINE void VectorStoreByte4(VectorRegister4Float Vec, void* Ptr)
{
    const uint32x4_t Wide32 = vcvtq_u32_f32(Vec);       // Saturates (clamps) to [0,2^32 - 1]
    const uint16x4_t Narrow16 = vqmovn_u32(Wide32);     // Saturates further to [0,2^16 - 1]
    // The upper half is zero padding; only the low four bytes are stored.
    const uint16x8_t Padded16 = vcombine_u16(Narrow16, vdup_n_u16(0));
    const uint8x8_t Narrow8 = vqmovn_u16(Padded16);     // Saturates to [0,255]
    vst1_lane_u32((uint32_t*)Ptr, Narrow8, 0);
}


// Converts the four float lanes to signed bytes with saturation to [-128,127] and writes
// them to Ptr as a single 32-bit store.
FORCEINLINE void VectorStoreSignedByte4(VectorRegister4Float Vec, void* Ptr)
{
    const int32x4_t Wide32 = vcvtq_s32_f32(Vec);        // Saturates (clamps) to [-2^31,2^31 - 1]
    const int16x4_t Narrow16 = vqmovn_s32(Wide32);      // Saturates further to [-32768,32767]
    // The upper half is zero padding; only the low four bytes are stored.
    const int16x8_t Padded16 = vcombine_s16(Narrow16, vdup_n_s16(0));
    const int8x8_t Narrow8 = vqmovn_s16(Padded16);      // Saturates to [-128,127]
    vst1_lane_u32((uint32_t*)Ptr, Narrow8, 0);
}


// Converts the four float lanes to half precision and writes the resulting 8 bytes to Ptr.
// The bAligned template argument is not used by this implementation.
template <bool bAligned>
FORCEINLINE void VectorStoreHalf4(VectorRegister4Float Vec, void* Ptr)
{
    const float16x4_t Halves = vcvt_f16_f32(Vec);
    uint8_t* const Destination = (uint8_t*)Ptr;
    // Stored through a byte pointer, so no alignment is required of Ptr.
    vst1_u8(Destination, Halves);
}


// Unpacks a 32-bit 10:10:10:2 value from Ptr into four normalized floats.
// Bits 0-9, 10-19 and 20-29 are divided by 1023; bits 30-31 are divided by 3.
FORCEINLINE VectorRegister4Float VectorLoadURGB10A2N(void* Ptr)
{
    const uint32 Packed = *(uint32*)Ptr;

    // Extract each field as an integer and convert it to float.
    const float Field0 = float((Packed >> 00) & 0x3FF);
    const float Field1 = float((Packed >> 10) & 0x3FF);
    const float Field2 = float((Packed >> 20) & 0x3FF);
    const float Field3 = float((Packed >> 30) & 0x3);

    // Scale by the reciprocal of each field's maximum value.
    const VectorRegister4Float Scale = MakeVectorRegister(1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 3.0f);
    const VectorRegister4Float Unpacked = MakeVectorRegister(Field0, Field1, Field2, Field3);
    return VectorMultiply(Unpacked, Scale);
}


// Packs four floats into a 32-bit 10:10:10:2 value written to Ptr.
// Each lane is clamped to [0,1], scaled to its field's maximum (1023 or 3) and truncated.
FORCEINLINE void VectorStoreURGB10A2N(VectorRegister4Float Vec, void* Ptr)
{
    // Union gives scalar access to the individual lanes.
    union FLaneView
    {
        VectorRegister4Float V;
        float F[4];
        FORCEINLINE FLaneView() : V() {}
    };

    FLaneView Scaled;
    Scaled.V = VectorMax(Vec, VectorZeroFloat());
    Scaled.V = VectorMin(Scaled.V, VectorOneFloat());
    Scaled.V = VectorMultiply(Scaled.V, MakeVectorRegister(1023.0f, 1023.0f, 1023.0f, 3.0f));

    // Mask each field to its bit width and shift it into position.
    const uint32 Field0 = (uint32(Scaled.F[0]) & 0x3FF) << 00;
    const uint32 Field1 = (uint32(Scaled.F[1]) & 0x3FF) << 10;
    const uint32 Field2 = (uint32(Scaled.F[2]) & 0x3FF) << 20;
    const uint32 Field3 = (uint32(Scaled.F[3]) & 0x003) << 30;

    uint32* const Destination = (uint32*)Ptr;
    *Destination = Field0 | Field1 | Field2 | Field3;
}


// Returns non-zero if any lane of Vec1 is greater than the corresponding lane of Vec2.
FORCEINLINE int32 VectorAnyGreaterThan(VectorRegister4Float Vec1, VectorRegister4Float Vec2)
{
    // Take the largest lane of the comparison mask: it is non-zero if any lane's mask is non-zero.
    const uint32x4_t GreaterMask = (uint32x4_t)VectorCompareGT(Vec1, Vec2);
    return vmaxvq_u32(GreaterMask);
}


// Returns non-zero if any lane of Vec1 is greater than the corresponding lane of Vec2.
FORCEINLINE int32 VectorAnyGreaterThan(VectorRegister4Double Vec1, VectorRegister4Double Vec2)
{
    // Compare each half, merge the masks, and take the largest 32-bit element.
    const uint32x4_t LowerMask = (uint32x4_t)vcgtq_f64(Vec1.XY, Vec2.XY);
    const uint32x4_t UpperMask = (uint32x4_t)vcgtq_f64(Vec1.ZW, Vec2.ZW);
    const uint32x4_t AnyMask = vorrq_u32(LowerMask, UpperMask);
    return vmaxvq_u32(AnyMask);
}


// Expands to nothing in this NEON implementation.
#define VectorResetFloatRegisters()


// On ARM64EC, suppress the MSVC warnings raised by the FPCR accessors defined below.
#if PLATFORM_WINDOWS_ARM64EC
    #pragma warning(push)
    #pragma warning(disable:5076) // warning C5076: read from FPCR
    #pragma warning(disable:5077) // warning C5077: write to FPCR
#endif


// Reads the floating-point control register (FPCR) and returns its low 32 bits.
FORCEINLINE uint32_t VectorGetControlRegister()
{
#if PLATFORM_WINDOWS && !PLATFORM_COMPILER_CLANG
    // MSVC: use the status-register intrinsic instead of inline assembly.
    return (uint32_t)_ReadStatusReg(ARM64_FPCR);
#else
    uint64_t Value;
    // The system register read/write instructions use 64-bit registers,
    // so read into a 64-bit temporary and truncate on return.
    __asm__ volatile("mrs %0, fpcr" : "=r"(Value));
    return (uint32_t)Value;
#endif
}


// Writes ControlStatus to the floating-point control register (FPCR).
FORCEINLINE void VectorSetControlRegister(uint32_t ControlStatus)
{
#if PLATFORM_WINDOWS && !PLATFORM_COMPILER_CLANG
    // MSVC: use the status-register intrinsic instead of inline assembly.
    _WriteStatusReg(ARM64_FPCR, ControlStatus);
#else
    uint64_t State64 = ControlStatus; // instruction needs a 64b reg, but all control bits fit in the lower 32b
    __asm__ volatile("msr fpcr, %0" : : "r"(State64));
#endif
}


#if PLATFORM_WINDOWS_ARM64EC

    #pragma warning(pop)

#endif


// Control-register bit masks, presumably intended for VectorGetControlRegister / VectorSetControlRegister.
// NOTE(review): bit positions look like the FPCR RMode (bits 23:22) and FZ (bit 24) fields - confirm against the Arm ARM.

// Sets bits 22 and 23.
#define VECTOR_ROUND_TOWARD_ZERO        (3 << 22)

// Sets bit 24.
#define VECTOR_DENORMALS_FLUSH_TO_ZERO  (1 << 24)


// Multiplies two float quaternions (XYZW layout) and returns the product Quat1 * Quat2.
// The product is accumulated as four terms, one per component of Quat1, each paired with a
// permutation of Quat2 and a per-lane sign mask.
FORCEINLINE VectorRegister4Float VectorQuaternionMultiply2(VectorRegister4Float Quat1, VectorRegister4Float Quat2)
{
    // Quat1.W * Quat2
    const VectorRegister4Float TermW = VectorMultiply(VectorReplicate(Quat1, 3), Quat2);
    // Quat1.X * Quat2.wzyx
    const VectorRegister4Float TermX = VectorMultiply(VectorReplicate(Quat1, 0), VectorSwizzle(Quat2, 3, 2, 1, 0));
    VectorRegister4Float Product = VectorMultiplyAdd(TermX, GlobalVectorConstants::QMULTI_SIGN_MASK0, TermW);
    // Quat1.Y * Quat2.zwxy
    const VectorRegister4Float TermY = VectorMultiply(VectorReplicate(Quat1, 1), VectorSwizzle(Quat2, 2, 3, 0, 1));
    Product = VectorMultiplyAdd(TermY, GlobalVectorConstants::QMULTI_SIGN_MASK1, Product);
    // Quat1.Z * Quat2.yxwz
    const VectorRegister4Float TermZ = VectorMultiply(VectorReplicate(Quat1, 2), VectorSwizzle(Quat2, 1, 0, 3, 2));
    Product = VectorMultiplyAdd(TermZ, GlobalVectorConstants::QMULTI_SIGN_MASK2, Product);

    return Product;
}


// Multiplies two double quaternions (XYZW layout) and returns the product Quat1 * Quat2.
// The product is accumulated as four terms, one per component of Quat1, each paired with a
// permutation of Quat2 and a per-lane sign mask.
FORCEINLINE VectorRegister4Double VectorQuaternionMultiply2(VectorRegister4Double Quat1, VectorRegister4Double Quat2)
{
    // Quat1.W * Quat2
    const VectorRegister4Double TermW = VectorMultiply(VectorReplicate(Quat1, 3), Quat2);
    // Quat1.X * Quat2.wzyx
    const VectorRegister4Double TermX = VectorMultiply(VectorReplicate(Quat1, 0), VectorSwizzle(Quat2, 3, 2, 1, 0));
    VectorRegister4Double Product = VectorMultiplyAdd(TermX, GlobalVectorConstants::DOUBLE_QMULTI_SIGN_MASK0, TermW);
    // Quat1.Y * Quat2.zwxy
    const VectorRegister4Double TermY = VectorMultiply(VectorReplicate(Quat1, 1), VectorSwizzle(Quat2, 2, 3, 0, 1));
    Product = VectorMultiplyAdd(TermY, GlobalVectorConstants::DOUBLE_QMULTI_SIGN_MASK1, Product);
    // Quat1.Z * Quat2.yxwz
    const VectorRegister4Double TermZ = VectorMultiply(VectorReplicate(Quat1, 2), VectorSwizzle(Quat2, 1, 0, 3, 2));
    Product = VectorMultiplyAdd(TermZ, GlobalVectorConstants::DOUBLE_QMULTI_SIGN_MASK2, Product);

    return Product;
}


// Pointer-based quaternion multiply: writes (*Quat1) * (*Quat2) to *Result.
// The pointers are declared RESTRICT, so they must not alias each other.
FORCEINLINE void VectorQuaternionMultiply(VectorRegister4Float* RESTRICT Result, const VectorRegister4Float* RESTRICT Quat1, const VectorRegister4Float* RESTRICT Quat2)
{
    const VectorRegister4Float Product = VectorQuaternionMultiply2(*Quat1, *Quat2);
    *Result = Product;
}


// Pointer-based quaternion multiply: writes (*Quat1) * (*Quat2) to *Result.
// The pointers are declared RESTRICT, so they must not alias each other.
FORCEINLINE void VectorQuaternionMultiply(VectorRegister4Double* RESTRICT Result, const VectorRegister4Double* RESTRICT Quat1, const VectorRegister4Double* RESTRICT Quat2)
{
    const VectorRegister4Double Product = VectorQuaternionMultiply2(*Quat1, *Quat2);
    *Result = Product;
}


// Computes sine and cosine of four angles (radians) at once using minimax polynomials.
// VSinAngles / VCosAngles receive the results; VAngles supplies the inputs. The pointers are
// declared RESTRICT, so they must not alias each other.
FORCEINLINE void VectorSinCos(VectorRegister4Float* RESTRICT VSinAngles, VectorRegister4Float* RESTRICT VCosAngles, const VectorRegister4Float* RESTRICT VAngles)
{
    const VectorRegister4Float Angles = *VAngles;

    // Range reduction to [-pi, pi]: Wrapped = A - 2pi * round(A / 2pi).
    // round(), not truncate(); halfway cases may round to nearest or to nearest-even.
    VectorRegister4Float Turns = VectorMultiply(Angles, GlobalVectorConstants::OneOverTwoPi);
    Turns = vrndnq_f32(Turns); // round to nearest even is the default rounding mode but that's fine here.
    const VectorRegister4Float Wrapped = VectorNegateMultiplyAdd(GlobalVectorConstants::TwoPi, Turns, Angles);

    // Fold into [-pi/2, pi/2]: where |Wrapped| > pi/2, reflect it about +/-pi/2 (sin is unchanged,
    // cos changes sign, which is tracked in CosSign).
    const VectorRegister4Float WrappedSignBit = VectorBitwiseAnd(Wrapped, GlobalVectorConstants::SignBit());
    const VectorRegister4Float SignedPi = VectorBitwiseOr(GlobalVectorConstants::Pi, WrappedSignBit);  // pi when x >= 0, -pi when x < 0
    const VectorRegister4Float Magnitude = VectorAbs(Wrapped);
    const VectorRegister4Float Reflected = VectorSubtract(SignedPi, Wrapped);
    const VectorRegister4Float NeedsReflection = VectorCompareGT(Magnitude, GlobalVectorConstants::PiByTwo);
    const VectorRegister4Float Y = VectorSelect(NeedsReflection, Reflected, Wrapped);
    const VectorRegister4Float CosSign = VectorSelect(NeedsReflection, GlobalVectorConstants::FloatMinusOne, GlobalVectorConstants::FloatOne);

    const VectorRegister4Float Y2 = VectorMultiply(Y, Y);

    // Horner evaluation in Y2 of a 6-coefficient polynomial.
    // Low holds coefficients c0..c3 in lanes 0..3, High holds c4, c5 in lanes 0..1.
    const auto EvaluatePolynomial = [Y2](VectorRegister4Float Low, VectorRegister4Float High) -> VectorRegister4Float
    {
        VectorRegister4Float Poly = VectorReplicate(High, 1);
        Poly = VectorMultiplyAdd(Y2, Poly, VectorReplicate(High, 0));
        Poly = VectorMultiplyAdd(Y2, Poly, VectorReplicate(Low, 3));
        Poly = VectorMultiplyAdd(Y2, Poly, VectorReplicate(Low, 2));
        Poly = VectorMultiplyAdd(Y2, Poly, VectorReplicate(Low, 1));
        Poly = VectorMultiplyAdd(Y2, Poly, VectorReplicate(Low, 0));
        return Poly;
    };

    // 11-degree minimax approximation
    //*ScalarSin = (((((-2.3889859e-08f * y2 + 2.7525562e-06f) * y2 - 0.00019840874f) * y2 + 0.0083333310f) * y2 - 0.16666667f) * y2 + 1.0f) * y;
    const VectorRegister4Float SinLow = MakeVectorRegister(1.0f, -0.16666667f, 0.0083333310f, -0.00019840874f);
    const VectorRegister4Float SinHigh = MakeVectorRegister(2.7525562e-06f, -2.3889859e-08f, /*unused*/ 0.f, /*unused*/ 0.f);
    const VectorRegister4Float SinPoly = EvaluatePolynomial(SinLow, SinHigh);
    *VSinAngles = VectorMultiply(SinPoly, Y);

    // 10-degree minimax approximation
    //*ScalarCos = sign * (((((-2.6051615e-07f * y2 + 2.4760495e-05f) * y2 - 0.0013888378f) * y2 + 0.041666638f) * y2 - 0.5f) * y2 + 1.0f);
    const VectorRegister4Float CosLow = MakeVectorRegister(1.0f, -0.5f, 0.041666638f, -0.0013888378f);
    const VectorRegister4Float CosHigh = MakeVectorRegister(2.4760495e-05f, -2.6051615e-07f, /*unused*/ 0.f, /*unused*/ 0.f);
    const VectorRegister4Float CosPoly = EvaluatePolynomial(CosLow, CosHigh);
    *VCosAngles = VectorMultiply(CosPoly, CosSign);
}


// Returns true if the vector contains a component that is either NAN or +/-infinite.
inline bool VectorContainsNaNOrInfinite(VectorRegister4Float Vec)
{
    // https://en.wikipedia.org/wiki/IEEE_754-1985
    // Infinity has every exponent bit set and a zero fraction; NaN has every exponent bit set
    // and a non-zero fraction. A finite value never has all exponent bits set, so testing the
    // exponent field alone is sufficient.

    // Build +infinity from its bit pattern; it doubles as the exponent mask.
    union { float F; uint32 U; } InfBits;
    InfBits.U = 0x7F800000;
    const float Inf = InfBits.F;
    const VectorRegister4Float ExponentMask = MakeVectorRegister(Inf, Inf, Inf, Inf);

    // Isolate the exponent bits of every lane.
    const VectorRegister4Float ExponentOnly = VectorBitwiseAnd(Vec, ExponentMask);

    // Lanes whose exponent is all ones compare equal to the mask.
    uint8x16_t NonFiniteMask = (uint8x16_t)VectorCompareEQ(ExponentOnly, ExponentMask);

    // Gather byte 0 of each 32-bit lane (bytes 0, 4, 8, 12) into lane 0 of the lookup result.
    const int32x4_t GatherIndices = MakeVectorRegisterIntConstant(0x0C080400, 0, 0, 0);
    const uint32x4_t Gathered = (uint32x4_t)vqtbx1q_u8(NonFiniteMask, NonFiniteMask, GatherIndices);

    // If lane 0 is all zeros, every element is finite.
    return vgetq_lane_u32(Gathered, 0) != 0;
}


// Returns true if the vector contains a component that is either NAN or +/-infinite.
inline bool VectorContainsNaNOrInfinite(VectorRegister4Double Vec)
{
    // https://en.wikipedia.org/wiki/IEEE_754-1985
    // Infinity has every exponent bit set and a zero fraction; NaN has every exponent bit set
    // and a non-zero fraction. A finite value never has all exponent bits set, so testing the
    // exponent field alone is sufficient.

    // Build +infinity from its bit pattern; it doubles as the exponent mask.
    union { double F; uint64 U; } InfBits;
    InfBits.U = 0x7FF0000000000000ULL;
    const double Inf = InfBits.F;
    const VectorRegister4Double ExponentMask = MakeVectorRegister(Inf, Inf, Inf, Inf);

    // Isolate the exponent bits of every lane.
    VectorRegister4Double ExponentOnly = VectorBitwiseAnd(Vec, ExponentMask);

    // Lanes whose exponent is all ones compare equal to the mask.
    VectorRegister4Double NonFiniteMask = VectorCompareEQ(ExponentOnly, ExponentMask);

    // Gather byte 0 of each 64-bit lane (bytes 0, 8, 16, 24 of the 32-byte mask) into lane 0.
    const int32x4_t GatherIndices = MakeVectorRegisterIntConstant(0x18100800, 0, 0, 0);
    uint8x16_t Fallback = vdupq_n_u8(0);

    //TODO: there must be a better instruction to just get the top bits or smth
    // The two halves of the mask are reinterpreted in place as a two-register lookup table.
    const uint32x4_t Gathered = (uint32x4_t)vqtbx2q_u8(Fallback, *(uint8x16x2_t*)&NonFiniteMask, GatherIndices);

    // If lane 0 is all zeros, every element is finite.
    return vgetq_lane_u32(Gathered, 0) != 0;
}


//TODO: Vectorize

/**
 * Applies FMath::Exp to each of the four float lanes (scalar fallback, not vectorized).
 *
 * @param X  Input vector.
 * @return   Vector whose lane i holds FMath::Exp of lane i of X.
 */
FORCEINLINE VectorRegister4Float VectorExp(VectorRegister4Float X)
{
    // Copy the lanes out, transform each one in place, then reload them as a register.
    AlignedFloat4 Lanes(X);
    for (int Lane = 0; Lane < 4; ++Lane)
    {
        Lanes[Lane] = FMath::Exp(Lanes[Lane]);
    }
    return Lanes.ToVectorRegister();
}


/**
 * Applies FMath::Exp to each of the four double lanes (scalar fallback, not vectorized).
 *
 * @param X  Input vector.
 * @return   Vector whose lane i holds FMath::Exp of lane i of X.
 */
FORCEINLINE VectorRegister4Double VectorExp(VectorRegister4Double X)
{
    // Copy the lanes out, transform each one in place, then reload them as a register.
    AlignedDouble4 Lanes(X);
    for (int Lane = 0; Lane < 4; ++Lane)
    {
        Lanes[Lane] = FMath::Exp(Lanes[Lane]);
    }
    return Lanes.ToVectorRegister();
}


//TODO: Vectorize

/**
 * Applies FMath::Exp2 to each of the four float lanes (scalar fallback, not vectorized).
 *
 * @param X  Input vector.
 * @return   Vector whose lane i holds FMath::Exp2 of lane i of X.
 */
FORCEINLINE VectorRegister4Float VectorExp2(VectorRegister4Float X)
{
    AlignedFloat4 Lanes(X);
    for (int Lane = 0; Lane < 4; ++Lane)
    {
        Lanes[Lane] = FMath::Exp2(Lanes[Lane]);
    }
    return Lanes.ToVectorRegister();
}


/**
 * Applies FMath::Exp2 to each of the four double lanes (scalar fallback, not vectorized).
 *
 * @param X  Input vector.
 * @return   Vector whose lane i holds FMath::Exp2 of lane i of X.
 */
FORCEINLINE VectorRegister4Double VectorExp2(VectorRegister4Double X)
{
    AlignedDouble4 Lanes(X);
    for (int Lane = 0; Lane < 4; ++Lane)
    {
        Lanes[Lane] = FMath::Exp2(Lanes[Lane]);
    }
    return Lanes.ToVectorRegister();
}


//TODO: Vectorize

/**
 * Applies FMath::Loge to each of the four float lanes (scalar fallback, not vectorized).
 *
 * @param X  Input vector.
 * @return   Vector whose lane i holds FMath::Loge of lane i of X.
 */
FORCEINLINE VectorRegister4Float VectorLog(VectorRegister4Float X)
{
    AlignedFloat4 Lanes(X);
    for (int Lane = 0; Lane < 4; ++Lane)
    {
        Lanes[Lane] = FMath::Loge(Lanes[Lane]);
    }
    return Lanes.ToVectorRegister();
}


/**
 * Applies FMath::Loge to each of the four double lanes (scalar fallback, not vectorized).
 *
 * @param X  Input vector.
 * @return   Vector whose lane i holds FMath::Loge of lane i of X.
 */
FORCEINLINE VectorRegister4Double VectorLog(VectorRegister4Double X)
{
    AlignedDouble4 Lanes(X);
    for (int Lane = 0; Lane < 4; ++Lane)
    {
        Lanes[Lane] = FMath::Loge(Lanes[Lane]);
    }
    return Lanes.ToVectorRegister();
}


//TODO: Vectorize

/**
 * Applies FMath::Log2 to each of the four float lanes (scalar fallback, not vectorized).
 *
 * @param X  Input vector.
 * @return   Vector whose lane i holds FMath::Log2 of lane i of X.
 */
FORCEINLINE VectorRegister4Float VectorLog2(VectorRegister4Float X)
{
    AlignedFloat4 Lanes(X);
    for (int Lane = 0; Lane < 4; ++Lane)
    {
        Lanes[Lane] = FMath::Log2(Lanes[Lane]);
    }
    return Lanes.ToVectorRegister();
}


/**
 * Applies FMath::Log2 to each of the four double lanes (scalar fallback, not vectorized).
 *
 * @param X  Input vector.
 * @return   Vector whose lane i holds FMath::Log2 of lane i of X.
 */
FORCEINLINE VectorRegister4Double VectorLog2(VectorRegister4Double X)
{
    AlignedDouble4 Lanes(X);
    for (int Lane = 0; Lane < 4; ++Lane)
    {
        Lanes[Lane] = FMath::Log2(Lanes[Lane]);
    }
    return Lanes.ToVectorRegister();
}


//TODO: Vectorize

/**
 * Applies FMath::Tan to each of the four float lanes (scalar fallback, not vectorized).
 *
 * @param X  Input vector.
 * @return   Vector whose lane i holds FMath::Tan of lane i of X.
 */
FORCEINLINE VectorRegister4Float VectorTan(VectorRegister4Float X)
{
    AlignedFloat4 Lanes(X);
    for (int Lane = 0; Lane < 4; ++Lane)
    {
        Lanes[Lane] = FMath::Tan(Lanes[Lane]);
    }
    return Lanes.ToVectorRegister();
}


/**
 * Applies FMath::Tan to each of the four double lanes (scalar fallback, not vectorized).
 *
 * @param X  Input vector.
 * @return   Vector whose lane i holds FMath::Tan of lane i of X.
 */
FORCEINLINE VectorRegister4Double VectorTan(VectorRegister4Double X)
{
    AlignedDouble4 Lanes(X);
    for (int Lane = 0; Lane < 4; ++Lane)
    {
        Lanes[Lane] = FMath::Tan(Lanes[Lane]);
    }
    return Lanes.ToVectorRegister();
}


//TODO: Vectorize

/**
 * Applies FMath::Asin to each of the four float lanes (scalar fallback, not vectorized).
 *
 * @param X  Input vector.
 * @return   Vector whose lane i holds FMath::Asin of lane i of X.
 */
FORCEINLINE VectorRegister4Float VectorASin(VectorRegister4Float X)
{
    AlignedFloat4 Lanes(X);
    for (int Lane = 0; Lane < 4; ++Lane)
    {
        Lanes[Lane] = FMath::Asin(Lanes[Lane]);
    }
    return Lanes.ToVectorRegister();
}


/**
 * Applies FMath::Asin to each of the four double lanes (scalar fallback, not vectorized).
 *
 * @param X  Input vector.
 * @return   Vector whose lane i holds FMath::Asin of lane i of X.
 */
FORCEINLINE VectorRegister4Double VectorASin(VectorRegister4Double X)
{
    AlignedDouble4 Lanes(X);
    for (int Lane = 0; Lane < 4; ++Lane)
    {
        Lanes[Lane] = FMath::Asin(Lanes[Lane]);
    }
    return Lanes.ToVectorRegister();
}


//TODO: Vectorize

/**
 * Applies FMath::Acos to each of the four float lanes (scalar fallback, not vectorized).
 *
 * @param X  Input vector.
 * @return   Vector whose lane i holds FMath::Acos of lane i of X.
 */
FORCEINLINE VectorRegister4Float VectorACos(VectorRegister4Float X)
{
    AlignedFloat4 Lanes(X);
    for (int Lane = 0; Lane < 4; ++Lane)
    {
        Lanes[Lane] = FMath::Acos(Lanes[Lane]);
    }
    return Lanes.ToVectorRegister();
}


/**
 * Applies FMath::Acos to each of the four double lanes (scalar fallback, not vectorized).
 *
 * @param X  Input vector.
 * @return   Vector whose lane i holds FMath::Acos of lane i of X.
 */
FORCEINLINE VectorRegister4Double VectorACos(VectorRegister4Double X)
{
    AlignedDouble4 Lanes(X);
    for (int Lane = 0; Lane < 4; ++Lane)
    {
        Lanes[Lane] = FMath::Acos(Lanes[Lane]);
    }
    return Lanes.ToVectorRegister();
}


//TODO: Vectorize

/**
 * Applies FMath::Atan to each of the four float lanes (scalar fallback, not vectorized).
 *
 * @param X  Input vector.
 * @return   Vector whose lane i holds FMath::Atan of lane i of X.
 */
FORCEINLINE VectorRegister4Float VectorATan(VectorRegister4Float X)
{
    AlignedFloat4 Lanes(X);
    for (int Lane = 0; Lane < 4; ++Lane)
    {
        Lanes[Lane] = FMath::Atan(Lanes[Lane]);
    }
    return Lanes.ToVectorRegister();
}


/**
 * Applies FMath::Atan to each of the four double lanes (scalar fallback, not vectorized).
 *
 * @param X  Input vector.
 * @return   Vector whose lane i holds FMath::Atan of lane i of X.
 */
FORCEINLINE VectorRegister4Double VectorATan(VectorRegister4Double X)
{
    AlignedDouble4 Lanes(X);
    for (int Lane = 0; Lane < 4; ++Lane)
    {
        Lanes[Lane] = FMath::Atan(Lanes[Lane]);
    }
    return Lanes.ToVectorRegister();
}


//TODO: Vectorize

/**
 * Applies FMath::Atan2 lane by lane to a pair of float vectors (scalar fallback, not vectorized).
 *
 * @param X  Supplies the FIRST argument of FMath::Atan2 for each lane.
 * @param Y  Supplies the SECOND argument of FMath::Atan2 for each lane.
 * @return   Vector whose lane i holds FMath::Atan2(X[i], Y[i]).
 *
 * NOTE(review): the parameter names look swapped relative to the usual atan2(y, x)
 * convention - presumably the first parameter is the ordinate; confirm against
 * FMath::Atan2 before relying on the names.
 */
FORCEINLINE VectorRegister4Float VectorATan2(VectorRegister4Float X, VectorRegister4Float Y)
{
    AlignedFloat4 First(X);
    AlignedFloat4 Second(Y);

    // Overwrite the first operand's lanes with the per-lane results, then reload.
    for (int Lane = 0; Lane < 4; ++Lane)
    {
        First[Lane] = FMath::Atan2(First[Lane], Second[Lane]);
    }
    return First.ToVectorRegister();
}


/**
 * Applies FMath::Atan2 lane by lane to a pair of double vectors (scalar fallback, not vectorized).
 *
 * @param X  Supplies the FIRST argument of FMath::Atan2 for each lane.
 * @param Y  Supplies the SECOND argument of FMath::Atan2 for each lane.
 * @return   Vector whose lane i holds FMath::Atan2(X[i], Y[i]).
 *
 * NOTE(review): the parameter names look swapped relative to the usual atan2(y, x)
 * convention - presumably the first parameter is the ordinate; confirm against
 * FMath::Atan2 before relying on the names.
 */
FORCEINLINE VectorRegister4Double VectorATan2(VectorRegister4Double X, VectorRegister4Double Y)
{
    AlignedDouble4 First(X);
    AlignedDouble4 Second(Y);

    // Overwrite the first operand's lanes with the per-lane results, then reload.
    for (int Lane = 0; Lane < 4; ++Lane)
    {
        First[Lane] = FMath::Atan2(First[Lane], Second[Lane]);
    }
    return First.ToVectorRegister();
}


/**
 * Rounds each float lane up to an integral value using the vrndpq_f32 intrinsic
 * (round toward plus infinity).
 *
 * @param X  Input vector.
 * @return   Vector of rounded lanes.
 */
FORCEINLINE VectorRegister4Float VectorCeil(VectorRegister4Float X)
{
    const VectorRegister4Float RoundedUp = vrndpq_f32(X);
    return RoundedUp;
}


/**
 * Rounds each double lane up to an integral value. The four lanes are processed
 * as two 2-lane halves (XY and ZW) with vrndpq_f64 (round toward plus infinity).
 *
 * @param X  Input vector.
 * @return   Vector of rounded lanes.
 */
FORCEINLINE VectorRegister4Double VectorCeil(VectorRegister4Double X)
{
    VectorRegister4Double RoundedUp;
    // The two halves are independent of each other.
    RoundedUp.ZW = vrndpq_f64(X.ZW);
    RoundedUp.XY = vrndpq_f64(X.XY);
    return RoundedUp;
}


/**
 * Rounds each float lane down to an integral value using the vrndmq_f32 intrinsic
 * (round toward minus infinity).
 *
 * @param X  Input vector.
 * @return   Vector of rounded lanes.
 */
FORCEINLINE VectorRegister4Float VectorFloor(VectorRegister4Float X)
{
    const VectorRegister4Float RoundedDown = vrndmq_f32(X);
    return RoundedDown;
}


/**
 * Rounds each double lane down to an integral value. The four lanes are processed
 * as two 2-lane halves (XY and ZW) with vrndmq_f64 (round toward minus infinity).
 *
 * @param X  Input vector.
 * @return   Vector of rounded lanes.
 */
FORCEINLINE VectorRegister4Double VectorFloor(VectorRegister4Double X)
{
    VectorRegister4Double RoundedDown;
    // The two halves are independent of each other.
    RoundedDown.ZW = vrndmq_f64(X.ZW);
    RoundedDown.XY = vrndmq_f64(X.XY);
    return RoundedDown;
}


/**
 * Drops the fractional part of each float lane using the vrndq_f32 intrinsic
 * (round toward zero).
 *
 * @param X  Input vector.
 * @return   Vector of truncated lanes.
 */
FORCEINLINE VectorRegister4Float VectorTruncate(VectorRegister4Float X)
{
    const VectorRegister4Float TowardZero = vrndq_f32(X);
    return TowardZero;
}


/**
 * Drops the fractional part of each double lane. The four lanes are processed
 * as two 2-lane halves (XY and ZW) with vrndq_f64 (round toward zero).
 *
 * @param X  Input vector.
 * @return   Vector of truncated lanes.
 */
FORCEINLINE VectorRegister4Double VectorTruncate(VectorRegister4Double X)
{
    VectorRegister4Double TowardZero;
    // The two halves are independent of each other.
    TowardZero.ZW = vrndq_f64(X.ZW);
    TowardZero.XY = vrndq_f64(X.XY);
    return TowardZero;
}


/**
 * Per-lane floating-point remainder of X / Y, computed with fmodf.
 * Lanes whose divisor magnitude is <= GlobalVectorConstants::SmallNumber yield 0.
 *
 * @param X  Dividends.
 * @param Y  Divisors.
 * @return   Vector of remainders, with 0 in the lanes that had a too-small divisor.
 */
FORCEINLINE VectorRegister4Float VectorMod(VectorRegister4Float X, VectorRegister4Float Y)
{
    // Scalar remainder for every lane; lanes with a bad divisor are overridden below.
    AlignedFloat4 Dividend(X), Divisor(Y);
    for (int Lane = 0; Lane < 4; ++Lane)
    {
        Dividend[Lane] = fmodf(Dividend[Lane], Divisor[Lane]);
    }

    // Flag the lanes whose divisor is too close to zero and force them to 0.
    const VectorRegister4Float TooSmallDivisor = VectorCompareLE(VectorAbs(Y), GlobalVectorConstants::SmallNumber);
    return VectorSelect(TooSmallDivisor, GlobalVectorConstants::FloatZero, Dividend.ToVectorRegister());
}


/**
 * Per-lane floating-point remainder of X / Y, computed with fmod.
 * Lanes whose divisor magnitude is <= GlobalVectorConstants::DoubleSmallNumber yield 0.
 *
 * @param X  Dividends.
 * @param Y  Divisors.
 * @return   Vector of remainders, with 0 in the lanes that had a too-small divisor.
 */
FORCEINLINE VectorRegister4Double VectorMod(VectorRegister4Double X, VectorRegister4Double Y)
{
    // Scalar remainder for every lane; lanes with a bad divisor are overridden below.
    AlignedDouble4 Dividend(X), Divisor(Y);
    for (int Lane = 0; Lane < 4; ++Lane)
    {
        Dividend[Lane] = fmod(Dividend[Lane], Divisor[Lane]);
    }

    // Flag the lanes whose divisor is too close to zero and force them to 0.
    const VectorRegister4Double TooSmallDivisor = VectorCompareLE(VectorAbs(Y), GlobalVectorConstants::DoubleSmallNumber);
    return VectorSelect(TooSmallDivisor, GlobalVectorConstants::DoubleZero, Dividend.ToVectorRegister());
}


/**
 * Per-lane sign selection: FloatOne where the lane passes the ">= 0" comparison,
 * FloatMinusOne otherwise. Zero therefore maps to +1.
 *
 * @param X  Input vector.
 * @return   Vector of +1 / -1 values.
 */
FORCEINLINE VectorRegister4Float VectorSign(VectorRegister4Float X)
{
    return VectorSelect(
        VectorCompareGE(X, GlobalVectorConstants::FloatZero),
        GlobalVectorConstants::FloatOne,
        GlobalVectorConstants::FloatMinusOne);
}


/**
 * Per-lane sign selection: DoubleOne where the lane passes the ">= 0" comparison,
 * DoubleMinusOne otherwise. Zero therefore maps to +1.
 *
 * @param X  Input vector.
 * @return   Vector of +1 / -1 values.
 */
FORCEINLINE VectorRegister4Double VectorSign(VectorRegister4Double X)
{
    return VectorSelect(
        VectorCompareGE(X, GlobalVectorConstants::DoubleZero),
        GlobalVectorConstants::DoubleOne,
        GlobalVectorConstants::DoubleMinusOne);
}


/**
 * Per-lane step function: FloatOne where the lane passes the ">= 0" comparison,
 * FloatZero otherwise.
 *
 * @param X  Input vector.
 * @return   Vector of 1 / 0 values.
 */
FORCEINLINE VectorRegister4Float VectorStep(VectorRegister4Float X)
{
    return VectorSelect(
        VectorCompareGE(X, GlobalVectorConstants::FloatZero),
        GlobalVectorConstants::FloatOne,
        GlobalVectorConstants::FloatZero);
}


/**
 * Per-lane step function: DoubleOne where the lane passes the ">= 0" comparison,
 * DoubleZero otherwise.
 *
 * @param X  Input vector.
 * @return   Vector of 1 / 0 values.
 */
FORCEINLINE VectorRegister4Double VectorStep(VectorRegister4Double X)
{
    return VectorSelect(
        VectorCompareGE(X, GlobalVectorConstants::DoubleZero),
        GlobalVectorConstants::DoubleOne,
        GlobalVectorConstants::DoubleZero);
}


// Coefficients of the parabola-based sine approximation used by the float VectorSin.
// p is the blend parameter the other two constants are derived from; A and B are
// the four-lane broadcasts of a and b.
namespace VectorSinConstantsNEON

{

    // Only appears in the derivation comments of a and b within this namespace.
    static const float p = 0.225f;

    static const float a = 7.58946609f; // 16 * sqrtf(p)

    static const float b = 1.63384342f; // (1 - p) / sqrtf(p)

    // a replicated into all four lanes.
    static const VectorRegister4Float A = MakeVectorRegisterConstant(a, a, a, a);

    // b replicated into all four lanes.
    static const VectorRegister4Float B = MakeVectorRegisterConstant(b, b, b, b);

}


/**
 * Fast per-lane sine approximation for floats.
 *
 * Uses a squared parabola constrained to f(0) = 0, f(PI) = 0, f(PI/2) = 1, based on
 * the discussion at http://forum.devmaster.net/t/fast-and-accurate-sine-cosine/9648
 * Measured over roughly 2.5 million comparisons against sin():
 *   average error 0.000128, max error 0.001091.
 *
 * Accuracy caveat: the *relative* error climbs above 1.2% near 0 and PI, where the
 * result approaches 0. That is enough to cause harmonic distortion if this is used
 * as an oscillator; VectorSinCos costs little more and is significantly more
 * accurate (though neither is a good oscillator choice when performance matters).
 *
 * @param X  Angles in radians.
 * @return   Approximate sine of each lane.
 */
FORCEINLINE VectorRegister4Float VectorSin(VectorRegister4Float X)
{
    // Express the angle in turns (X / 2PI).
    const VectorRegister4Float Turns = VectorMultiply(X, GlobalVectorConstants::OneOverTwoPi);

    // Subtract floor(Turns + 0.5) to wrap the value around zero.
    const VectorRegister4Float Wrapped = VectorSubtract(Turns, VectorFloor(VectorAdd(Turns, GlobalVectorConstants::FloatOneHalf)));

    // First parabola: A * (w * (0.5 - |w|)).
    const VectorRegister4Float Parabola = VectorMultiply(
        VectorSinConstantsNEON::A,
        VectorMultiply(Wrapped, VectorSubtract(GlobalVectorConstants::FloatOneHalf, VectorAbs(Wrapped))));

    // Refinement: y * (B + |y|).
    return VectorMultiply(Parabola, VectorAdd(VectorSinConstantsNEON::B, VectorAbs(Parabola)));
}


/**
 * Applies FMath::Sin to each of the four double lanes (scalar, not the fast
 * approximation used by the float overload).
 *
 * @param X  Input vector.
 * @return   Vector whose lane i holds FMath::Sin of lane i of X.
 */
FORCEINLINE VectorRegister4Double VectorSin(VectorRegister4Double X)
{
    AlignedDouble4 Lanes(X);
    for (int Lane = 0; Lane < 4; ++Lane)
    {
        Lanes[Lane] = FMath::Sin(Lanes[Lane]);
    }
    return Lanes.ToVectorRegister();
}


/**
 * Per-lane cosine for floats, evaluated as VectorSin(X + PiByTwo). It therefore
 * shares the accuracy characteristics of the float VectorSin.
 *
 * @param X  Input vector.
 * @return   VectorSin of the shifted input.
 */
FORCEINLINE VectorRegister4Float VectorCos(VectorRegister4Float X)
{
    const VectorRegister4Float Shifted = VectorAdd(X, GlobalVectorConstants::PiByTwo);
    return VectorSin(Shifted);
}


/**
 * Applies FMath::Cos to each of the four double lanes (scalar fallback).
 *
 * @param X  Input vector.
 * @return   Vector whose lane i holds FMath::Cos of lane i of X.
 */
FORCEINLINE VectorRegister4Double VectorCos(VectorRegister4Double X)
{
    AlignedDouble4 Lanes(X);
    for (int Lane = 0; Lane < 4; ++Lane)
    {
        Lanes[Lane] = FMath::Cos(Lanes[Lane]);
    }
    return Lanes.ToVectorRegister();
}


/**
 * Computes the sine and cosine of four double angles by calling the double
 * overloads of VectorSin and VectorCos in turn.
 *
 * @param VSinAngles  Receives VectorSin(*VAngles).
 * @param VCosAngles  Receives VectorCos(*VAngles).
 * @param VAngles     Input angles.
 */
FORCEINLINE void VectorSinCos(VectorRegister4Double* RESTRICT VSinAngles, VectorRegister4Double* RESTRICT VCosAngles, const VectorRegister4Double* RESTRICT VAngles)
{
    const VectorRegister4Double& Angles = *VAngles;

    // Sine is written first, then cosine.
    *VSinAngles = VectorSin(Angles);
    *VCosAngles = VectorCos(Angles);
}


/**
 * Loads four consecutive uint16 values and converts them to four floats.
 *
 * The values are widened to 32 bits and converted as-is; no scaling is applied
 * in this function.
 * NOTE(review): VectorStoreURGBA16N scales by 65535 while this load does not
 * divide - confirm whether callers are expected to normalize the result.
 *
 * @param E  Pointer to four readable uint16 values.
 * @return   The four values converted to float.
 */
FORCEINLINE VectorRegister4Float VectorLoadURGBA16N(const uint16* E)
{
    // load 4 x u16 -> zero-extend to 4 x u32 -> convert to 4 x float
    return vcvtq_f32_u32(vmovl_u16(vld1_u16(E)));
}


/**
 * Loads four consecutive int16 values and converts them to four floats.
 *
 * The values are sign-extended to 32 bits and converted as-is; no scaling is
 * applied in this function.
 * NOTE(review): presumably callers normalize the result themselves - confirm.
 *
 * @param Ptr  Pointer to four readable int16 values.
 * @return     The four values converted to float.
 */
FORCEINLINE VectorRegister4Float VectorLoadSRGBA16N(const void* Ptr)
{
    // load 4 x s16 -> sign-extend to 4 x s32 -> convert to 4 x float
    return vcvtq_f32_s32(vmovl_s16(vld1_s16((const int16 *)Ptr)));
}


/**
 * Converts four floats to uint16 and stores them to memory.
 *
 * Each lane is clamped to [0, 1], scaled by 65535, converted to an unsigned
 * integer with vcvtnq_u32_f32 (round to nearest, ties to even) and narrowed to 16 bits.
 *
 * @param Vec  Source vector.
 * @param Out  Destination for four uint16 values.
 */
FORCEINLINE void VectorStoreURGBA16N(VectorRegister4Float Vec, uint16* Out)
{
    // Clamp: max with 0 first, then min with 1.
    const VectorRegister4Float Clamped = VectorMin(VectorMax(Vec, VectorZeroFloat()), VectorOneFloat());
    const VectorRegister4Float Scaled = VectorMultiply(Clamped, vdupq_n_f32(65535.0f));

    // Scaled values lie in [0, 65535], so narrowing 32 -> 16 bits drops no set bits.
    vst1_u16(Out, vmovn_u32(vcvtnq_u32_f32(Scaled)));
}


//Integer ops


//Bitwise

// Lane-wise bitwise operations on 4 x int32 registers, mapped directly to NEON intrinsics.

// A & B
#define VectorIntAnd(A, B)      vandq_s32(A, B)

// A | B
#define VectorIntOr(A, B)       vorrq_s32(A, B)

// A ^ B
#define VectorIntXor(A, B)      veorq_s32(A, B)

// (~A) & B. Note the swapped operands: vbicq_s32(x, y) computes x & ~y.
#define VectorIntAndNot(A, B)   vbicq_s32(B, A)

// ~A
#define VectorIntNot(A) vmvnq_s32(A)


//Comparison

// Lane-wise signed 32-bit comparisons. The NEON compare intrinsics produce a mask
// per lane: all bits set where the comparison holds, all bits clear otherwise.

// A == B
#define VectorIntCompareEQ(A, B)    vceqq_s32(A,B)

// A != B, built as the bitwise NOT of the equality mask.
#define VectorIntCompareNEQ(A, B)   VectorIntNot(VectorIntCompareEQ(A,B))

// A > B
#define VectorIntCompareGT(A, B)    vcgtq_s32(A,B)

// A < B
#define VectorIntCompareLT(A, B)    vcltq_s32(A,B)

// A >= B
#define VectorIntCompareGE(A, B)    vcgeq_s32(A,B)

// A <= B
#define VectorIntCompareLE(A, B)    vcleq_s32(A,B)


/**
 * Bitwise select between two integer vectors using vbslq_s32: for every bit set
 * in Mask the result takes the bit from Vec1, otherwise the bit from Vec2.
 *
 * @param Mask  Selection mask.
 * @param Vec1  Source for bits where Mask is 1.
 * @param Vec2  Source for bits where Mask is 0.
 * @return      The blended vector.
 */
FORCEINLINE VectorRegister4Int VectorIntSelect(VectorRegister4Int Mask, VectorRegister4Int Vec1, VectorRegister4Int Vec2)
{
    const VectorRegister4Int Blended = vbslq_s32(Mask, Vec1, Vec2);
    return Blended;
}


//Arithmetic

// Lane-wise signed 32-bit arithmetic, mapped directly to NEON intrinsics.

// A + B
#define VectorIntAdd(A, B)  vaddq_s32(A, B)

// A - B
#define VectorIntSubtract(A, B) vsubq_s32(A, B)

// A * B
#define VectorIntMultiply(A, B) vmulq_s32(A, B)

// -A
#define VectorIntNegate(A) vnegq_s32(A)

// min(A, B)
#define VectorIntMin(A, B) vminq_s32(A,B)

// max(A, B)
#define VectorIntMax(A, B) vmaxq_s32(A,B)

// Clamps A to the range [B, C]: max with B first, then min with C.
#define VectorIntClamp(A, B, C) VectorIntMin(VectorIntMax(A, B), C)

// |A|
#define VectorIntAbs(A) vabsq_s32(A)


// IntOne where A >= 0 (zero included), IntMinusOne otherwise.
#define VectorIntSign(A) VectorIntSelect( VectorIntCompareGE(A, GlobalVectorConstants::IntZero), GlobalVectorConstants::IntOne, GlobalVectorConstants::IntMinusOne )


// Converts each signed 32-bit lane to float.
#define VectorIntToFloat(A) vcvtq_f32_s32(A)


/**
 * Converts each float lane to a signed 32-bit integer with vcvtq_s32_f32
 * (rounds toward zero).
 *
 * @param A  Input vector.
 * @return   Vector of converted integers.
 */
FORCEINLINE VectorRegister4Int VectorFloatToInt(VectorRegister4Float A)
{
    const VectorRegister4Int AsInt = vcvtq_s32_f32(A);
    return AsInt;
}


/**
 * Converts four doubles to signed 32-bit integers by first narrowing them to
 * floats (MakeVectorRegisterFloatFromDouble) and then using the float overload.
 * VectorDoubleToInt converts directly from double instead.
 *
 * @param A  Input vector of doubles.
 * @return   Vector of converted integers.
 */
FORCEINLINE VectorRegister4Int VectorFloatToInt(VectorRegister4Double A)
{
    const VectorRegister4Float Narrowed = MakeVectorRegisterFloatFromDouble(A);
    return VectorFloatToInt(Narrowed);
}


/**
 * Converts four doubles to signed 32-bit integers.
 *
 * Each half (XY, ZW) is converted to 64-bit integers with vcvtq_s64_f64 (rounds
 * toward zero), then narrowed to 32 bits with vqmovn_s64, which saturates values
 * that do not fit in an int32.
 *
 * @param Vec  Input vector of doubles.
 * @return     Vector of converted integers in X, Y, Z, W order.
 */
FORCEINLINE VectorRegister4Int VectorDoubleToInt(VectorRegister4Double Vec)
{
    VectorRegister2Int64 LowPair = vcvtq_s64_f64(Vec.XY);
    VectorRegister2Int64 HighPair = vcvtq_s64_f64(Vec.ZW);

    // Saturating narrow of each pair, then join: XY in the low half, ZW in the high half.
    return vcombine_s32(vqmovn_s64(LowPair), vqmovn_s64(HighPair));
}


/**
 * Byte-level shuffle using the vqtbl1q_u8 table lookup: each byte of Mask is an
 * index into the 16 bytes of Vec; indices outside 0..15 produce a zero byte.
 *
 * @param Vec   Source bytes (the lookup table).
 * @param Mask  Per-byte indices into Vec.
 * @return      The shuffled bytes.
 */
FORCEINLINE VectorRegister4Int VectorShuffleByte4(VectorRegister4Int Vec, VectorRegister4Int Mask)
{
    const VectorRegister4Int Shuffled = vqtbl1q_u8(Vec, Mask);
    return Shuffled;
}


//Loads and stores


// Store a full register to Ptr as 4 x int32 / 8 x int16.
#define VectorIntStore( Vec, Ptr )          vst1q_s32( (int32*)(Ptr), Vec )

#define VectorIntStore_16( Vec, Ptr )       vst1q_s16( (int16*)(Ptr), Vec )


// Load a full register from Ptr as 4 x int32 / 8 x int16.
#define VectorIntLoad( Ptr )                vld1q_s32( (int32*)((void*)(Ptr)) )

#define VectorIntLoad_16( Ptr )             vld1q_s16( (int16*)((void*)(Ptr)) )


// The "Aligned" variants expand to exactly the same intrinsics as the plain ones above.
#define VectorIntStoreAligned( Vec, Ptr )           vst1q_s32( (int32*)(Ptr), Vec )


#define VectorIntLoadAligned( Ptr )             vld1q_s32( (int32*)((void*)(Ptr)) )


// Load a single int32 / int16 from Ptr and replicate it into every lane.
#define VectorIntLoad1(Ptr)                         vld1q_dup_s32((int32*)(Ptr))

#define VectorIntLoad1_16(Ptr)                      vld1q_dup_s16((int16*)(Ptr))


// Broadcast the scalar F into all four int32 lanes.
#define VectorIntSet1(F)                            (VectorRegister4Int)vdupq_n_s32(F)

// All-zero register; note that it is produced as an integer (s32) vector.
#define VectorSetZero()                             vdupq_n_s32(0)

// Broadcast the scalar F into all four float lanes.
#define VectorSet1(F)                               (VectorRegister4Float)vdupq_n_f32(F)

// Bit-pattern reinterpretations between register types; no value conversion takes place.
#define VectorCastIntToFloat(Vec)                   ((VectorRegister4f)vreinterpretq_f32_s32(Vec))

#define VectorCastFloatToInt(Vec)                   ((VectorRegister4i)vreinterpretq_s32_f32(Vec))

// These two go through the 64-bit integer view (s64 <-> f64) of a 2-lane double register.
#define VectorCastDoubleToInt(Vec)                  ((VectorRegister4i)vreinterpretq_s64_f64(Vec))

#define VectorCastIntToDouble(Vec)                  ((VectorRegister2Double)vreinterpretq_f64_s64(Vec))

// Per-lane 32-bit shifts; ImmAmt must be a compile-time constant for these intrinsics.
#define VectorShiftLeftImm(Vec, ImmAmt)             vshlq_n_s32(Vec, ImmAmt)

// Arithmetic right shift (signed intrinsic: the sign bit is replicated).
#define VectorShiftRightImmArithmetic(Vec, ImmAmt)  vshrq_n_s32(Vec, ImmAmt)

// Logical right shift (unsigned intrinsic: zeros are shifted in).
// NOTE(review): presumably called with a signed register and relying on an
// implicit vector conversion - confirm.
#define VectorShiftRightImmLogical(Vec, ImmAmt)     vshrq_n_u32(Vec, ImmAmt)

// Round each float lane to the nearest integral value, ties to even; the result stays float.
#define VectorRound(Vec)                            vrndnq_f32(Vec)


/**
 * Converts each float lane to a signed 32-bit integer with vcvtnq_s32_f32,
 * rounding to the nearest integer with ties going to the even value.
 *
 * @param Vec  Input vector.
 * @return     Vector of rounded integers.
 */
FORCEINLINE VectorRegister4Int VectorRoundToIntHalfToEven(VectorRegister4Float Vec)
{
    const VectorRegister4Int Rounded = vcvtnq_s32_f32(Vec);
    return Rounded;
}


/**
 * Takes the four 16-bit values held in the low 64 bits of V and zero-extends
 * each one to 32 bits (unsigned widening via vmovl_u16).
 *
 * @param V  Register viewed as eight 16-bit values; only the low four are used.
 * @return   Four 32-bit lanes holding the widened values.
 */
FORCEINLINE VectorRegister4i VectorIntExpandLow16To32(VectorRegister4i V)
{
    const VectorRegister4i Widened = vmovl_u16(vget_low_u16(V));
    return Widened;
}


// To be continued...


#endif // #if PLATFORM_ENABLE_VECTORINTRINSICS_NEON


PRAGMA_ENABLE_SHADOW_VARIABLE_WARNINGS


#if UE_ENABLE_INCLUDE_ORDER_DEPRECATED_IN_5_4

#include <type_traits>

#endif


EARObjectClassification::Table
@ Table

FORCEINLINE
#define FORCEINLINE
Definition AndroidPlatform.h:140

GCC_ALIGN
#define GCC_ALIGN(n)
Definition AndroidPlatform.h:163

ESplineBoneAxis::Z
@ Z

ESplineBoneAxis::Y
@ Y

check
#define check(expr)
Definition AssertionMacros.h:314

EMusicalNoteName::C
@ C

EMusicalNoteName::E
@ E

EMusicalNoteName::A
@ A

EMusicalNoteName::F
@ F

EMusicalNoteName::B
@ B

EMusicalNoteName::D
@ D

EChaosPerfUnits::S
@ S

Platform.h

int16
FPlatformTypes::int16 int16
A 16-bit signed integer.
Definition Platform.h:1123

int64
FPlatformTypes::int64 int64
A 64-bit signed integer.
Definition Platform.h:1127

int32
FPlatformTypes::int32 int32
A 32-bit signed integer.
Definition Platform.h:1125

RESTRICT
#define RESTRICT
Definition Platform.h:706

uint64
FPlatformTypes::uint64 uint64
A 64-bit unsigned integer.
Definition Platform.h:1117

StaticCastSharedRef
UE_FORCEINLINE_HINT TSharedRef< CastToType, Mode > StaticCastSharedRef(TSharedRef< CastFromType, Mode > const &InSharedRef)
Definition SharedPointer.h:127

ED3D12Access::Mask
@ Mask

EDatasmithCompositeCompMode::Sub
@ Sub

Float16.h

X
#define X(Name, Desc)
Definition FormatStringSan.h:47

EBlockCanary::Zero
@ Zero

EMaterialExpressionConvertType::Scalar
@ Scalar

EMaterialExpressionOperatorKind::Reciprocal
@ Reciprocal

PRAGMA_ENABLE_SHADOW_VARIABLE_WARNINGS
#define PRAGMA_ENABLE_SHADOW_VARIABLE_WARNINGS
Definition MSVCPlatformCompilerPreSetup.h:65

PRAGMA_DISABLE_SHADOW_VARIABLE_WARNINGS
#define PRAGMA_DISABLE_SHADOW_VARIABLE_WARNINGS
Definition MSVCPlatformCompilerPreSetup.h:55

EPixelFormatChannelFlags::R
@ R

EColorPickerChannels::Value
@ Value

float
USkinnedMeshComponent float
Definition SkinnedMeshComponent.h:60

ETextHistoryType::Base
@ Base

MakeVectorRegisterInt
FORCEINLINE VectorRegister4Int MakeVectorRegisterInt(int32 X, int32 Y, int32 Z, int32 W)
Definition UnrealMathFPU.h:282

VectorTan
FORCEINLINE VectorRegister4Float VectorTan(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:2039

VectorSubtract
FORCEINLINE VectorRegister4Float VectorSubtract(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
Definition UnrealMathFPU.h:731

VectorDeinterleave
FORCEINLINE void VectorDeinterleave(VectorRegister4Float &RESTRICT OutEvens, VectorRegister4Float &RESTRICT OutOdds, const VectorRegister4Float &RESTRICT Lo, const VectorRegister4Float &RESTRICT Hi)
Definition UnrealMathFPU.h:1777

VectorLoadFloat3
FORCEINLINE VectorRegister4Double VectorLoadFloat3(const double *Ptr)
Definition UnrealMathFPU.h:427

VectorATan2
FORCEINLINE VectorRegister4Float VectorATan2(const VectorRegister4Float &Y, const VectorRegister4Float &X)
Definition UnrealMathFPU.h:2083

VectorAnyGreaterThan
FORCEINLINE uint32 VectorAnyGreaterThan(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
Definition UnrealMathFPU.h:1917

VectorSqrt
FORCEINLINE VectorRegister4Float VectorSqrt(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:1263

VectorReciprocalSqrt
FORCEINLINE VectorRegister4Float VectorReciprocalSqrt(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:1279

VectorMatrixInverse
FORCEINLINE bool VectorMatrixInverse(FMatrix44d *DstMatrix, const FMatrix44d *SrcMatrix)
Definition UnrealMathFPU.h:1603

VectorLoadSRGBA16N
FORCEINLINE VectorRegister4Float VectorLoadSRGBA16N(void *Ptr)
Definition UnrealMathFPU.h:2268

VectorDot3
FORCEINLINE VectorRegister4Float VectorDot3(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
Definition UnrealMathFPU.h:880

VectorMin
FORCEINLINE VectorRegister4Float VectorMin(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
Definition UnrealMathFPU.h:1686

VectorGetComponentImpl
FORCEINLINE float VectorGetComponentImpl(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:364

VectorLoad16
FORCEINLINE VectorRegister4x4Float VectorLoad16(const float *Ptr)
Definition UnrealMathFPU.h:410

VectorDot4
FORCEINLINE VectorRegister4Float VectorDot4(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
Definition UnrealMathFPU.h:901

MakeVectorRegister
FORCEINLINE VectorRegister4Float MakeVectorRegister(uint32 X, uint32 Y, uint32 Z, uint32 W)
Definition UnrealMathFPU.h:195

VectorSinCos
FORCEINLINE void VectorSinCos(VectorRegister4Float *RESTRICT VSinAngles, VectorRegister4Float *RESTRICT VCosAngles, const VectorRegister4Float *RESTRICT VAngles)
Definition UnrealMathFPU.h:2109

VectorLoadURGB10A2N
FORCEINLINE VectorRegister4Float VectorLoadURGB10A2N(void *Ptr)
Definition UnrealMathFPU.h:1875

VectorStoreSignedByte4
FORCEINLINE void VectorStoreSignedByte4(const VectorRegister4Float &Vec, void *Ptr)
Definition UnrealMathFPU.h:1858

VectorSet_W1
FORCEINLINE VectorRegister4Float VectorSet_W1(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:1407

VectorSetFloat1
FORCEINLINE VectorRegister4Float VectorSetFloat1(float F)
Definition UnrealMathFPU.h:518

VectorLog2
FORCEINLINE VectorRegister4Float VectorLog2(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:2004

VectorQuaternionMultiply
FORCEINLINE void VectorQuaternionMultiply(VectorRegister4Float *RESTRICT Result, const VectorRegister4Float *RESTRICT Quat1, const VectorRegister4Float *RESTRICT Quat2)
Definition UnrealMathFPU.h:1431

VectorLoadURGBA16N
FORCEINLINE VectorRegister4Float VectorLoadURGBA16N(void *Ptr)
Definition UnrealMathFPU.h:2248

VectorShuffle
#define VectorShuffle(Vec1, Vec2, X, Y, Z, W)
Definition UnrealMathFPU.h:652

VectorRegister4d
VectorRegister4Double VectorRegister4d
Definition UnrealMathFPU.h:90

VectorTruncate
FORCEINLINE VectorRegister4Float VectorTruncate(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:2153

VectorZeroDouble
FORCEINLINE VectorRegister4Double VectorZeroDouble(void)
Definition UnrealMathFPU.h:336

VectorDivide
FORCEINLINE VectorRegister4Float VectorDivide(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
Definition UnrealMathFPU.h:834

VectorMultiply
FORCEINLINE VectorRegister4Float VectorMultiply(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
Definition UnrealMathFPU.h:758

AlignedRegister4
AlignedDouble4 AlignedRegister4
Definition UnrealMathFPU.h:150

VectorMax
FORCEINLINE VectorRegister4Float VectorMax(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
Definition UnrealMathFPU.h:1713

VectorBitwiseAnd
FORCEINLINE VectorRegister4Float VectorBitwiseAnd(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
Definition UnrealMathFPU.h:1165

VectorLoadFloat1
FORCEINLINE VectorRegister4Float VectorLoadFloat1(const float *Ptr)
Definition UnrealMathFPU.h:468

VectorReciprocalLen
FORCEINLINE VectorRegister4Float VectorReciprocalLen(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:1343

MakeVectorRegisterFloatConstant
FORCEINLINE constexpr VectorRegister4Float MakeVectorRegisterFloatConstant(float X, float Y, float Z, float W)
Definition UnrealMathFPU.h:297

VectorCos
FORCEINLINE VectorRegister4Float VectorCos(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:2027

VectorLoadFloat2
FORCEINLINE VectorRegister4Float VectorLoadFloat2(const float *Ptr)
Definition UnrealMathFPU.h:485

VectorIntExpandLow16To32
#define VectorIntExpandLow16To32(V0)
Definition UnrealMathFPU.h:2661

VectorIntSelect
FORCEINLINE VectorRegister4Int VectorIntSelect(const VectorRegister4Int &Mask, const VectorRegister4Int &Vec1, const VectorRegister4Int &Vec2)
Definition UnrealMathFPU.h:2411

VectorStoreByte4
FORCEINLINE void VectorStoreByte4(const VectorRegister4Float &Vec, void *Ptr)
Definition UnrealMathFPU.h:1842

VectorCombineLow
FORCEINLINE VectorRegister4Float VectorCombineLow(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
Definition UnrealMathFPU.h:1757

VectorRegister4
VectorRegister4Double VectorRegister4
Definition UnrealMathFPU.h:94

VectorStore16
FORCEINLINE void VectorStore16(const VectorRegister4x4Float &Vec, float *Dst)
Definition UnrealMathFPU.h:582

VectorReciprocalSqrtEstimate
FORCEINLINE VectorRegister4Float VectorReciprocalSqrtEstimate(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:1295

VectorLoadDouble1
FORCEINLINE VectorRegister4Double VectorLoadDouble1(const double *Ptr)
Definition UnrealMathFPU.h:473

VectorMatrixMultiply
FORCEINLINE void VectorMatrixMultiply(FMatrix44d *Result, const FMatrix44d *Matrix1, const FMatrix44d *Matrix2)
Definition UnrealMathFPU.h:1538

VectorLoadAligned
VectorRegister4Float VectorLoadAligned(const float *Ptr)
Definition UnrealMathFPU.h:451

VectorMultiplyAdd
FORCEINLINE VectorRegister4Float VectorMultiplyAdd(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2, const VectorRegister4Float &Vec3)
Definition UnrealMathFPU.h:786

VectorRegister4i
VectorRegister4Int VectorRegister4i
Definition UnrealMathFPU.h:88

VectorSelect
FORCEINLINE VectorRegister4Float VectorSelect(const VectorRegister4Float &Mask, const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
Definition UnrealMathFPU.h:1105

VectorExp
FORCEINLINE VectorRegister4Float VectorExp(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:1971

VectorCompareGT
FORCEINLINE VectorRegister4Float VectorCompareGT(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
Definition UnrealMathFPU.h:974

VectorRoundToIntHalfToEven
FORCEINLINE VectorRegister4Int VectorRoundToIntHalfToEven(const VectorRegister4Float &A)
Definition UnrealMathFPU.h:2175

MakeVectorRegisterDoubleMask
FORCEINLINE VectorRegister4Double MakeVectorRegisterDoubleMask(uint64 X, uint64 Y, uint64 Z, uint64 W)
Definition UnrealMathFPU.h:206

VectorExp2
FORCEINLINE VectorRegister4Float VectorExp2(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:1982

VectorASin
FORCEINLINE VectorRegister4Float VectorASin(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:2050

VectorLoadTwoPairsFloat
FORCEINLINE VectorRegister4Float VectorLoadTwoPairsFloat(const float *Ptr1, const float *Ptr2)
Definition UnrealMathFPU.h:503

VectorOneDouble
FORCEINLINE VectorRegister4Double VectorOneDouble(void)
Definition UnrealMathFPU.h:351

VectorReciprocalLenEstimate
FORCEINLINE VectorRegister4Float VectorReciprocalLenEstimate(const VectorRegister4Float &Vector)
Definition UnrealMathFPU.h:1375

VectorStore
FORCEINLINE void VectorStore(const VectorRegister4Float &Vec, float *Dst)
Definition UnrealMathFPU.h:566

VectorSetControlRegister
#define VectorSetControlRegister(ControlStatus)
Definition UnrealMathFPU.h:1947

VectorTransformVector
FORCEINLINE VectorRegister4Float VectorTransformVector(const VectorRegister4Float &VecP, const FMatrix44f *MatrixM)
Definition UnrealMathFPU.h:1619

VectorCompareGE
FORCEINLINE VectorRegister4Float VectorCompareGE(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
Definition UnrealMathFPU.h:1000

VectorMod
FORCEINLINE VectorRegister4Float VectorMod(const VectorRegister4Float &X, const VectorRegister4Float &Y)
Definition UnrealMathFPU.h:2185

MakeVectorRegisterInt64
FORCEINLINE VectorRegister4Int MakeVectorRegisterInt64(int64 X, int64 Y)
Definition UnrealMathFPU.h:307

VectorCombineHigh
FORCEINLINE VectorRegister4Float VectorCombineHigh(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
Definition UnrealMathFPU.h:1740

VectorCompareLT
FORCEINLINE VectorRegister4Float VectorCompareLT(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
Definition UnrealMathFPU.h:1025

MakeVectorRegisterDouble
FORCEINLINE VectorRegister4Double MakeVectorRegisterDouble(uint64 X, uint64 Y, uint64 Z, uint64 W)
Definition UnrealMathFPU.h:185

VectorRegister
VectorRegister4 VectorRegister
Definition UnrealMathFPU.h:95

VectorGetComponentDynamic
FORCEINLINE float VectorGetComponentDynamic(const VectorRegister4Float &Vec, uint32 ComponentIndex)
Definition UnrealMathFPU.h:369

VectorLog
FORCEINLINE VectorRegister4Float VectorLog(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:1993

VectorSet_W0
FORCEINLINE VectorRegister4Float VectorSet_W0(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:1391

VectorLoadSignedByte4
#define VectorLoadSignedByte4(Ptr)
Definition UnrealMathFPU.h:1823

MakeVectorRegisterIntConstant
FORCEINLINE constexpr VectorRegister4Int MakeVectorRegisterIntConstant(int32 X, int32 Y, int32 Z, int32 W)
Definition UnrealMathFPU.h:292

VectorMaskBits
FORCEINLINE int32 VectorMaskBits(const VectorRegister4Float &Vec1)
Definition UnrealMathFPU.h:1075

VectorNegate
FORCEINLINE VectorRegister4Float VectorNegate(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:687

VectorNegateMultiplyAdd
FORCEINLINE VectorRegister4Float VectorNegateMultiplyAdd(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2, const VectorRegister4Float &Vec3)
Definition UnrealMathFPU.h:815

VectorReciprocal
FORCEINLINE VectorRegister4Float VectorReciprocal(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:1311

VectorSin
FORCEINLINE VectorRegister4Float VectorSin(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:2015

MakeVectorRegister2DoubleConstant
FORCEINLINE constexpr VectorRegister2Double MakeVectorRegister2DoubleConstant(double X, double Y)
Definition UnrealMathFPU.h:302

VectorStoreURGBA16N
FORCEINLINE void VectorStoreURGBA16N(const VectorRegister4Float &Vec, void *Ptr)
Definition UnrealMathFPU.h:2288

VectorShuffleByte4
FORCEINLINE VectorRegister4Int VectorShuffleByte4(const VectorRegister4Int &Vec, const VectorRegister4Int &Mask)
Definition UnrealMathFPU.h:2515

VectorAbs
FORCEINLINE VectorRegister4Float VectorAbs(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:661

VectorACos
FORCEINLINE VectorRegister4Float VectorACos(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:2061

VectorAdd
FORCEINLINE VectorRegister4Float VectorAdd(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
Definition UnrealMathFPU.h:704

VectorDoubleToInt
FORCEINLINE VectorRegister4Int VectorDoubleToInt(const VectorRegister4Double &A)
Definition UnrealMathFPU.h:2510

VectorFloor
FORCEINLINE VectorRegister4Float VectorFloor(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:2142

VectorDot3Scalar
FORCEINLINE float VectorDot3Scalar(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
Definition UnrealMathFPU.h:861

VectorRegisterInt
VectorRegister4Int VectorRegisterInt
Definition UnrealMathFPU.h:96

VectorStoreAligned
void VectorStoreAligned(const VectorRegister4Float &Vec, float *Ptr)
Definition UnrealMathFPU.h:534

MakeVectorRegisterFloatMask
FORCEINLINE VectorRegister4Float MakeVectorRegisterFloatMask(uint32 X, uint32 Y, uint32 Z, uint32 W)
Definition UnrealMathFPU.h:201

VectorBitwiseXor
FORCEINLINE VectorRegister4Float VectorBitwiseXor(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
Definition UnrealMathFPU.h:1190

VectorCeil
FORCEINLINE VectorRegister4Float VectorCeil(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:2131

VectorLoadFloat3_W1
FORCEINLINE VectorRegister4Double VectorLoadFloat3_W1(const double *Ptr)
Definition UnrealMathFPU.h:439

VectorSwizzle
#define VectorSwizzle(Vec, X, Y, Z, W)
Definition UnrealMathFPU.h:639

VectorQuaternionMultiply2
FORCEINLINE VectorRegister4Float VectorQuaternionMultiply2(const VectorRegister4Float &Quat1, const VectorRegister4Float &Quat2)
Definition UnrealMathFPU.h:1517

VectorOneFloat
FORCEINLINE VectorRegister4Float VectorOneFloat(void)
Definition UnrealMathFPU.h:346

VectorLoadByte4
#define VectorLoadByte4(Ptr)
Definition UnrealMathFPU.h:1814

VectorZeroFloat
FORCEINLINE VectorRegister4Float VectorZeroFloat(void)
Definition UnrealMathFPU.h:331

VectorGetControlRegister
#define VectorGetControlRegister()
Definition UnrealMathFPU.h:1940

VectorATan
FORCEINLINE VectorRegister4Float VectorATan(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:2072

VectorLoad
FORCEINLINE VectorRegister4Float VectorLoad(const float *Ptr)
Definition UnrealMathFPU.h:394

VectorCross
FORCEINLINE VectorRegister4Float VectorCross(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
Definition UnrealMathFPU.h:1216

VectorBitwiseOr
FORCEINLINE VectorRegister4Float VectorBitwiseOr(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
Definition UnrealMathFPU.h:1140

VectorFloatToInt
FORCEINLINE VectorRegister4Int VectorFloatToInt(const VectorRegister4Float &A)
Definition UnrealMathFPU.h:2491

VectorContainsNaNOrInfinite
bool VectorContainsNaNOrInfinite(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:1960

VectorStep
FORCEINLINE VectorRegister4Float VectorStep(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:2223

VectorRegister4f
VectorRegister4Float VectorRegister4f
Definition UnrealMathFPU.h:89

VectorPow
FORCEINLINE VectorRegister4Float VectorPow(const VectorRegister4Float &Base, const VectorRegister4Float &Exponent)
Definition UnrealMathFPU.h:1243

VectorStoreFloat3
FORCEINLINE void VectorStoreFloat3(const VectorRegister4Float &Vec, float *Dst)
Definition UnrealMathFPU.h:594

VectorReciprocalEstimate
FORCEINLINE VectorRegister4Float VectorReciprocalEstimate(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:1327

VectorCompareLE
FORCEINLINE VectorRegister4Float VectorCompareLE(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
Definition UnrealMathFPU.h:1050

VectorCompareEQ
FORCEINLINE VectorRegister4Float VectorCompareEQ(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
Definition UnrealMathFPU.h:923

VectorStoreFloat1
FORCEINLINE void VectorStoreFloat1(const VectorRegister4Float &Vec, float *Dst)
Definition UnrealMathFPU.h:610

MakeVectorRegisterFloat
FORCEINLINE VectorRegister4Float MakeVectorRegisterFloat(uint32 X, uint32 Y, uint32 Z, uint32 W)
Definition UnrealMathFPU.h:175

MakeVectorRegisterFloatFromDouble
FORCEINLINE VectorRegister4Float MakeVectorRegisterFloatFromDouble(const VectorRegister4Double &Vec4d)
Definition UnrealMathFPU.h:262

MakeVectorRegister2Double
FORCEINLINE VectorRegister2Double MakeVectorRegister2Double(double X, double Y)
Definition UnrealMathFPU.h:158

VectorReplicate
#define VectorReplicate(Vec, ElementIndex)
Definition UnrealMathFPU.h:627

VectorRegister2d
VectorRegister2Double VectorRegister2d
Definition UnrealMathFPU.h:91

VectorSign
FORCEINLINE VectorRegister4Float VectorSign(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:2204

VectorLoadByte4Reverse
#define VectorLoadByte4Reverse(Ptr)
Definition UnrealMathFPU.h:1833

VectorCompareNE
FORCEINLINE VectorRegister4Float VectorCompareNE(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
Definition UnrealMathFPU.h:948

VectorMergeVecXYZ_VecW
FORCEINLINE VectorRegister4Float VectorMergeVecXYZ_VecW(const VectorRegister4Float &VecXYZ, const VectorRegister4Float &VecW)
Definition UnrealMathFPU.h:1797

VectorStoreURGB10A2N
FORCEINLINE void VectorStoreURGB10A2N(const VectorRegister4Float &Vec, void *Ptr)
Definition UnrealMathFPU.h:1895

UnrealMathVectorConstants.h.inl

MakeVectorRegisterConstant
FORCEINLINE constexpr VectorRegister4Float MakeVectorRegisterConstant(float X, float Y, float Z, float W)
Definition UnrealMathVectorConstants.h.inl:28

Val
float Val(const FString &Value)
Definition UnrealMath.cpp:3163

EVariantTypes::Vector
@ Vector

EVariantTypes::Double
@ Double

EWidgetBlendMode::Masked
@ Masked

uint8
uint8_t uint8
Definition binka_ue_file_header.h:8

uint16
uint16_t uint16
Definition binka_ue_file_header.h:7

uint32
uint32_t uint32
Definition binka_ue_file_header.h:6

FFloat16
Definition Float16.h:34

GlobalVectorConstants::FloatInfinity
VectorRegister4Float FloatInfinity()
Definition UnrealMathVectorConstants.h.inl:118

GlobalVectorConstants::FloatZero
constexpr VectorRegister4Float FloatZero
Definition UnrealMathVectorConstants.h.inl:41

GlobalVectorConstants::FloatOne
constexpr VectorRegister4Float FloatOne
Definition UnrealMathVectorConstants.h.inl:40

GlobalVectorConstants::FloatMinusOne
constexpr VectorRegister4Float FloatMinusOne
Definition UnrealMathVectorConstants.h.inl:42

GlobalVectorConstants::OneOverTwoPi
constexpr VectorRegister4Float OneOverTwoPi
Definition UnrealMathVectorConstants.h.inl:126

GlobalVectorConstants::SmallNumber
constexpr VectorRegister4Float SmallNumber
Definition UnrealMathVectorConstants.h.inl:53

GlobalVectorConstants::DoubleMinusOne
constexpr VectorRegister4Double DoubleMinusOne
Definition UnrealMathVectorConstants.h.inl:60

GlobalVectorConstants::DOUBLE_QMULTI_SIGN_MASK2
constexpr VectorRegister4Double DOUBLE_QMULTI_SIGN_MASK2
Definition UnrealMathVectorConstants.h.inl:91

GlobalVectorConstants::QMULTI_SIGN_MASK0
constexpr VectorRegister4Float QMULTI_SIGN_MASK0
Definition UnrealMathVectorConstants.h.inl:86

GlobalVectorConstants::TwoPi
constexpr VectorRegister4Float TwoPi
Definition UnrealMathVectorConstants.h.inl:122

GlobalVectorConstants::QMULTI_SIGN_MASK1
constexpr VectorRegister4Float QMULTI_SIGN_MASK1
Definition UnrealMathVectorConstants.h.inl:87

GlobalVectorConstants::DoubleSmallNumber
constexpr VectorRegister4Double DoubleSmallNumber
Definition UnrealMathVectorConstants.h.inl:71

GlobalVectorConstants::PiByTwo
constexpr VectorRegister4Float PiByTwo
Definition UnrealMathVectorConstants.h.inl:123

GlobalVectorConstants::DOUBLE_QMULTI_SIGN_MASK0
constexpr VectorRegister4Double DOUBLE_QMULTI_SIGN_MASK0
Definition UnrealMathVectorConstants.h.inl:89

GlobalVectorConstants::Pi
constexpr VectorRegister4Float Pi
Definition UnrealMathVectorConstants.h.inl:121

GlobalVectorConstants::QMULTI_SIGN_MASK2
constexpr VectorRegister4Float QMULTI_SIGN_MASK2
Definition UnrealMathVectorConstants.h.inl:88

GlobalVectorConstants::FloatOneHalf
constexpr VectorRegister4Float FloatOneHalf
Definition UnrealMathVectorConstants.h.inl:50

GlobalVectorConstants::DoubleInfinity
VectorRegister4Double DoubleInfinity()
Definition UnrealMathVectorConstants.h.inl:119

GlobalVectorConstants::DOUBLE_QMULTI_SIGN_MASK1
constexpr VectorRegister4Double DOUBLE_QMULTI_SIGN_MASK1
Definition UnrealMathVectorConstants.h.inl:90

GlobalVectorConstants::SignBit
VectorRegister4Float SignBit()
Definition UnrealMathVectorConstants.h.inl:105

GlobalVectorConstants::DoubleOne
constexpr VectorRegister4Double DoubleOne
Definition UnrealMathVectorConstants.h.inl:58

GlobalVectorConstants::DoubleZero
constexpr VectorRegister4Double DoubleZero
Definition UnrealMathVectorConstants.h.inl:59

UE::NNE::ModelData::V2
@ V2
Definition NNEModelData.cpp:18

UE::NNE::ModelData::V1
@ V1
Definition NNEModelData.cpp:17

UE::Shader::Div
FValue Div(const FValue &Lhs, const FValue &Rhs)
Definition ShaderValue.cpp:1519

UE::String::Private::Result
UE_STRING_CLASS Result(Forward< LhsType >(Lhs), RhsLen)
Definition String.cpp.inl:732

Index
U16 Index
Definition radfft.cpp:71

AlignedDouble4
Definition UnrealMathFPU.h:133

AlignedDouble4::operator[]
FORCEINLINE double operator[](int32 Index) const
Definition UnrealMathFPU.h:141

AlignedDouble4::V
double V[4]
Definition UnrealMathFPU.h:134

AlignedDouble4::ToVectorRegister
FORCEINLINE VectorRegister4Double ToVectorRegister() const
Definition UnrealMathFPU.h:144

AlignedFloat4
Definition UnrealMathFPU.h:113

AlignedFloat4::V
float V[4]
Definition UnrealMathFPU.h:114

AlignedFloat4::operator[]
FORCEINLINE float operator[](int32 Index) const
Definition UnrealMathFPU.h:121

AlignedFloat4::ToVectorRegister
FORCEINLINE VectorRegister4Float ToVectorRegister() const
Definition UnrealMathFPU.h:124

FMath::MatrixInverse
static CORE_API bool MatrixInverse(FMatrix44f *DstMatrix, const FMatrix44f *SrcMatrix)
Definition UnrealMath.cpp:928

FMath::Log2
static float Log2(float Value)
Definition UnrealMathUtility.h:722

UE::Math::TMatrix< float >

VectorRegister2Double
Definition UnrealMathFPU.h:34

VectorRegister2Double::V
double V[2]
Definition UnrealMathFPU.h:35

VectorRegister4Double
Definition UnrealMathFPU.h:42

VectorRegister4Double::VectorRegister4Double
VectorRegister4Double()=default

VectorRegister4Double::XY
VectorRegister2Double XY
Definition UnrealMathFPU.h:47

VectorRegister4Double::operator=
FORCEINLINE VectorRegister4Double & operator=(const VectorRegister4Float &From)
Definition UnrealMathFPU.h:77

VectorRegister4Double::ZW
VectorRegister2Double ZW
Definition UnrealMathFPU.h:48

VectorRegister4Double::V
double V[4]
Definition UnrealMathFPU.h:50

VectorRegister4Float
Definition UnrealMathFPU.h:20

VectorRegister4Float::V
float V[4]
Definition UnrealMathFPU.h:21

VectorRegister4Int
Definition UnrealMathFPU.h:28

VectorRegister4Int::V
int32 V[4]
Definition UnrealMathFPU.h:29

VectorRegister4x4Float
Definition UnrealMathFPU.h:99

VectorRegisterConstInit
Definition UnrealMathFPU.h:14