#if GLM_ARCH & GLM_ARCH_SSE2_BIT

#include "type_mat4x4.hpp"
#include "../geometric.hpp"
#include "../simd/matrix.h"
#include <cstring>

namespace glm{
namespace detail
{
#	if GLM_CONFIG_ALIGNED_GENTYPES == GLM_ENABLE
	template<qualifier Q>
	struct compute_matrixCompMult<4, 4, float, Q, true>
	{
		GLM_STATIC_ASSERT(detail::is_aligned<Q>::value, "Specialization requires aligned");

		GLM_FUNC_QUALIFIER static mat<4, 4, float, Q> call(mat<4, 4, float, Q> const& x, mat<4, 4, float, Q> const& y)
		{
			mat<4, 4, float, Q> Result;
			// reinterpret_cast: no standard conversion exists from glm_vec4 const*
			// to glm_vec4 const(*)[4], so static_cast would be ill-formed here.
			glm_mat4_matrixCompMult(
				*reinterpret_cast<glm_vec4 const(*)[4]>(&x[0].data),
				*reinterpret_cast<glm_vec4 const(*)[4]>(&y[0].data),
				*reinterpret_cast<glm_vec4(*)[4]>(&Result[0].data));
			return Result;
		}
	};
#	endif

	template<qualifier Q>
	struct compute_transpose<4, 4, float, Q, true>
	{
		GLM_FUNC_QUALIFIER static mat<4, 4, float, Q> call(mat<4, 4, float, Q> const& m)
		{
			mat<4, 4, float, Q> Result;
			glm_mat4_transpose(&m[0].data, &Result[0].data);
			return Result;
		}
	};

	template<qualifier Q>
	struct compute_determinant<4, 4, float, Q, true>
	{
		GLM_FUNC_QUALIFIER static float call(mat<4, 4, float, Q> const& m)
		{
			return _mm_cvtss_f32(glm_mat4_determinant(&m[0].data));
		}
	};

	template<qualifier Q>
	struct compute_inverse<4, 4, float, Q, true>
	{
		GLM_FUNC_QUALIFIER static mat<4, 4, float, Q> call(mat<4, 4, float, Q> const& m)
		{
			mat<4, 4, float, Q> Result;
			glm_mat4_inverse(&m[0].data, &Result[0].data);
			return Result;
		}
	};
}//namespace detail

#	if GLM_CONFIG_ALIGNED_GENTYPES == GLM_ENABLE
	template<>
	GLM_FUNC_QUALIFIER mat<4, 4, float, aligned_lowp> outerProduct<4, 4, float, aligned_lowp>(vec<4, float, aligned_lowp> const& c, vec<4, float, aligned_lowp> const& r)
	{
		__m128 NativeResult[4];
		glm_mat4_outerProduct(c.data, r.data, NativeResult);
		mat<4, 4, float, aligned_lowp> Result;
		std::memcpy(&Result[0], &NativeResult[0], sizeof(Result));
		return Result;
	}

	template<>
	GLM_FUNC_QUALIFIER mat<4, 4, float, aligned_mediump> outerProduct<4, 4, float, aligned_mediump>(vec<4, float, aligned_mediump> const& c, vec<4, float, aligned_mediump> const& r)
	{
		__m128 NativeResult[4];
		glm_mat4_outerProduct(c.data, r.data, NativeResult);
		mat<4, 4, float, aligned_mediump> Result;
		std::memcpy(&Result[0], &NativeResult[0], sizeof(Result));
		return Result;
	}

	template<>
	GLM_FUNC_QUALIFIER mat<4, 4, float, aligned_highp> outerProduct<4, 4, float, aligned_highp>(vec<4, float, aligned_highp> const& c, vec<4, float, aligned_highp> const& r)
	{
		__m128 NativeResult[4];
		glm_mat4_outerProduct(c.data, r.data, NativeResult);
		mat<4, 4, float, aligned_highp> Result;
		std::memcpy(&Result[0], &NativeResult[0], sizeof(Result));
		return Result;
	}
#	endif
}//namespace glm

#elif GLM_ARCH & GLM_ARCH_NEON_BIT

namespace glm {
#if GLM_LANG & GLM_LANG_CXX11_FLAG
	template<qualifier Q>
	GLM_FUNC_QUALIFIER
	typename std::enable_if<detail::is_aligned<Q>::value, mat<4, 4, float, Q>>::type
	operator*(mat<4, 4, float, Q> const& m1, mat<4, 4, float, Q> const& m2)
	{
		// Computes one column of m1 * m2: a linear combination of m1's columns,
		// weighted by the lanes of m2's column l.
		auto MulRow = [&](int l) {
			float32x4_t const SrcA = m2[l].data;

			float32x4_t r = neon::mul_lane(m1[0].data, SrcA, 0);
			r = neon::madd_lane(r, m1[1].data, SrcA, 1);
			r = neon::madd_lane(r, m1[2].data, SrcA, 2);
			r = neon::madd_lane(r, m1[3].data, SrcA, 3);

			return r;
		};

		// Qualifier Q (rather than aligned_highp) so the local matches the
		// return type and no converting copy is needed on return.
		mat<4, 4, float, Q> Result;
		Result[0].data = MulRow(0);
		Result[1].data = MulRow(1);
		Result[2].data = MulRow(2);
		Result[3].data = MulRow(3);

		return Result;
	}
#endif // CXX11
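	// For reference only, a scalar sketch of what MulRow computes per output
	// column j (GLM matrices are column-major; this mirrors the generic operator*):
	//
	//     for(int j = 0; j < 4; ++j)
	//         for(int i = 0; i < 4; ++i)
	//             Result[j][i] = m1[0][i] * m2[j][0] + m1[1][i] * m2[j][1]
	//                          + m1[2][i] * m2[j][2] + m1[3][i] * m2[j][3];
	//
	// mul_lane/madd_lane broadcast a single lane of SrcA and map to a
	// multiply-accumulate by lane (FMLA/VMLA depending on the target), so each
	// output column costs four lane-wise instructions.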
	template<qualifier Q>
	struct detail::compute_inverse<4, 4, float, Q, true>
	{
		GLM_FUNC_QUALIFIER static mat<4, 4, float, Q> call(mat<4, 4, float, Q> const& m)
		{
			float32x4_t const& m0 = m[0].data;
			float32x4_t const& m1 = m[1].data;
			float32x4_t const& m2 = m[2].data;
			float32x4_t const& m3 = m[3].data;

			// m[2][2] * m[3][3] - m[3][2] * m[2][3];
			// m[2][2] * m[3][3] - m[3][2] * m[2][3];
			// m[1][2] * m[3][3] - m[3][2] * m[1][3];
			// m[1][2] * m[2][3] - m[2][2] * m[1][3];

			float32x4_t Fac0;
			{
				float32x4_t w0 = vcombine_f32(neon::dup_lane(m2, 2), neon::dup_lane(m1, 2));
				float32x4_t w1 = neon::copy_lane(neon::dupq_lane(m3, 3), 3, m2, 3);
				float32x4_t w2 = neon::copy_lane(neon::dupq_lane(m3, 2), 3, m2, 2);
				float32x4_t w3 = vcombine_f32(neon::dup_lane(m2, 3), neon::dup_lane(m1, 3));
				Fac0 = w0 * w1 - w2 * w3;
			}

			// m[2][1] * m[3][3] - m[3][1] * m[2][3];
			// m[2][1] * m[3][3] - m[3][1] * m[2][3];
			// m[1][1] * m[3][3] - m[3][1] * m[1][3];
			// m[1][1] * m[2][3] - m[2][1] * m[1][3];

			float32x4_t Fac1;
			{
				float32x4_t w0 = vcombine_f32(neon::dup_lane(m2, 1), neon::dup_lane(m1, 1));
				float32x4_t w1 = neon::copy_lane(neon::dupq_lane(m3, 3), 3, m2, 3);
				float32x4_t w2 = neon::copy_lane(neon::dupq_lane(m3, 1), 3, m2, 1);
				float32x4_t w3 = vcombine_f32(neon::dup_lane(m2, 3), neon::dup_lane(m1, 3));
				Fac1 = w0 * w1 - w2 * w3;
			}

			// m[2][1] * m[3][2] - m[3][1] * m[2][2];
			// m[2][1] * m[3][2] - m[3][1] * m[2][2];
			// m[1][1] * m[3][2] - m[3][1] * m[1][2];
			// m[1][1] * m[2][2] - m[2][1] * m[1][2];

			float32x4_t Fac2;
			{
				float32x4_t w0 = vcombine_f32(neon::dup_lane(m2, 1), neon::dup_lane(m1, 1));
				float32x4_t w1 = neon::copy_lane(neon::dupq_lane(m3, 2), 3, m2, 2);
				float32x4_t w2 = neon::copy_lane(neon::dupq_lane(m3, 1), 3, m2, 1);
				float32x4_t w3 = vcombine_f32(neon::dup_lane(m2, 2), neon::dup_lane(m1, 2));
				Fac2 = w0 * w1 - w2 * w3;
			}

			// m[2][0] * m[3][3] - m[3][0] * m[2][3];
			// m[2][0] * m[3][3] - m[3][0] * m[2][3];
			// m[1][0] * m[3][3] - m[3][0] * m[1][3];
			// m[1][0] * m[2][3] - m[2][0] * m[1][3];

			float32x4_t Fac3;
			{
				float32x4_t w0 = vcombine_f32(neon::dup_lane(m2, 0), neon::dup_lane(m1, 0));
				float32x4_t w1 = neon::copy_lane(neon::dupq_lane(m3, 3), 3, m2, 3);
				float32x4_t w2 = neon::copy_lane(neon::dupq_lane(m3, 0), 3, m2, 0);
				float32x4_t w3 = vcombine_f32(neon::dup_lane(m2, 3), neon::dup_lane(m1, 3));
				Fac3 = w0 * w1 - w2 * w3;
			}

			// m[2][0] * m[3][2] - m[3][0] * m[2][2];
			// m[2][0] * m[3][2] - m[3][0] * m[2][2];
			// m[1][0] * m[3][2] - m[3][0] * m[1][2];
			// m[1][0] * m[2][2] - m[2][0] * m[1][2];

			float32x4_t Fac4;
			{
				float32x4_t w0 = vcombine_f32(neon::dup_lane(m2, 0), neon::dup_lane(m1, 0));
				float32x4_t w1 = neon::copy_lane(neon::dupq_lane(m3, 2), 3, m2, 2);
				float32x4_t w2 = neon::copy_lane(neon::dupq_lane(m3, 0), 3, m2, 0);
				float32x4_t w3 = vcombine_f32(neon::dup_lane(m2, 2), neon::dup_lane(m1, 2));
				Fac4 = w0 * w1 - w2 * w3;
			}

			// m[2][0] * m[3][1] - m[3][0] * m[2][1];
			// m[2][0] * m[3][1] - m[3][0] * m[2][1];
			// m[1][0] * m[3][1] - m[3][0] * m[1][1];
			// m[1][0] * m[2][1] - m[2][0] * m[1][1];

			float32x4_t Fac5;
			{
				float32x4_t w0 = vcombine_f32(neon::dup_lane(m2, 0), neon::dup_lane(m1, 0));
				float32x4_t w1 = neon::copy_lane(neon::dupq_lane(m3, 1), 3, m2, 1);
				float32x4_t w2 = neon::copy_lane(neon::dupq_lane(m3, 0), 3, m2, 0);
				float32x4_t w3 = vcombine_f32(neon::dup_lane(m2, 1), neon::dup_lane(m1, 1));
				Fac5 = w0 * w1 - w2 * w3;
			}
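			// At this point Fac0..Fac5 hold the six distinct 2x2 sub-determinants
			// built from columns 1..3 of m, four at a time; the first two lanes of
			// each are intentionally identical (compare Fac0(Coef00, Coef00, Coef02,
			// Coef03) in the generic compute_inverse in func_matrix.inl).
			// Vec0..Vec3 below gather the remaining elements of columns 0 and 1,
			// Inv0..Inv3 are the four columns of the cofactor expansion, and the
			// {-1, +1, ...} masks apply the checkerboard of signs, so that
			//     inverse(m) = adjugate(m) / det(m)
			// with det(m) recovered in lane 0 by dotting column 0 of m with the
			// first lanes of r0..r3 (the mul_lane/madd_lane chain further down).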
			float32x4_t Vec0 = neon::copy_lane(neon::dupq_lane(m0, 0), 0, m1, 0); // (m[1][0], m[0][0], m[0][0], m[0][0]);
			float32x4_t Vec1 = neon::copy_lane(neon::dupq_lane(m0, 1), 0, m1, 1); // (m[1][1], m[0][1], m[0][1], m[0][1]);
			float32x4_t Vec2 = neon::copy_lane(neon::dupq_lane(m0, 2), 0, m1, 2); // (m[1][2], m[0][2], m[0][2], m[0][2]);
			float32x4_t Vec3 = neon::copy_lane(neon::dupq_lane(m0, 3), 0, m1, 3); // (m[1][3], m[0][3], m[0][3], m[0][3]);

			float32x4_t Inv0 = Vec1 * Fac0 - Vec2 * Fac1 + Vec3 * Fac2;
			float32x4_t Inv1 = Vec0 * Fac0 - Vec2 * Fac3 + Vec3 * Fac4;
			float32x4_t Inv2 = Vec0 * Fac1 - Vec1 * Fac3 + Vec3 * Fac5;
			float32x4_t Inv3 = Vec0 * Fac2 - Vec1 * Fac4 + Vec2 * Fac5;

			float32x4_t r0 = float32x4_t{-1, +1, -1, +1} * Inv0;
			float32x4_t r1 = float32x4_t{+1, -1, +1, -1} * Inv1;
			float32x4_t r2 = float32x4_t{-1, +1, -1, +1} * Inv2;
			float32x4_t r3 = float32x4_t{+1, -1, +1, -1} * Inv3;

			float32x4_t det = neon::mul_lane(r0, m0, 0);
			det = neon::madd_lane(det, r1, m0, 1);
			det = neon::madd_lane(det, r2, m0, 2);
			det = neon::madd_lane(det, r3, m0, 3);

			float32x4_t rdet = vdupq_n_f32(1.0f / vgetq_lane_f32(det, 0));

			mat<4, 4, float, Q> r;
			r[0].data = vmulq_f32(r0, rdet);
			r[1].data = vmulq_f32(r1, rdet);
			r[2].data = vmulq_f32(r2, rdet);
			r[3].data = vmulq_f32(r3, rdet);
			return r;
		}
	};
}//namespace glm
#endif
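// Usage sketch (illustrative, not part of this file): with SIMD-aligned types
// enabled, e.g. by defining GLM_FORCE_INTRINSICS together with
// GLM_FORCE_DEFAULT_ALIGNED_GENTYPES, the specializations above are selected
// transparently by the generic API:
//
//     #include <glm/glm.hpp>
//
//     glm::mat4 M = /* ... */;
//     glm::mat4 T = glm::transpose(M);    // dispatches to compute_transpose
//     glm::mat4 I = glm::inverse(M);      // dispatches to compute_inverse
//     float     d = glm::determinant(M);  // dispatches to compute_determinant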