1 // 2 // Vec8.hpp 3 // MNN 4 // 5 // Created by MNN on b'2021/05/16'. 6 // Copyright © 2018, Alibaba Group Holding Limited 7 // 8 9 #ifndef Vec8_hpp 10 #define Vec8_hpp 11 #include "FunctionSummary.hpp" 12 struct Vec8 { 13 using VecType = Vec8; 14 __m256 value; operator +Vec815 VecType operator+(const VecType& lr) { 16 VecType dst = { _mm256_add_ps(value, lr.value) }; 17 return dst; 18 } operator -Vec819 VecType operator-(const VecType& lr) { 20 VecType dst = { _mm256_sub_ps(value, lr.value) }; 21 return dst; 22 } operator *Vec823 VecType operator*(const VecType& lr) { 24 VecType dst = { _mm256_mul_ps(value, lr.value) }; 25 return dst; 26 } operator *Vec827 VecType operator*(float lr) { 28 VecType dst = { _mm256_mul_ps(value, _mm256_set1_ps(lr)) }; 29 return dst; 30 } 31 operator =Vec832 VecType& operator=(const VecType& lr) { 33 value = lr.value; 34 return *this; 35 } operator -Vec836 VecType operator-() { 37 VecType dst; 38 #if defined(_MSC_VER) 39 dst.value = _mm256_xor_ps(value, _mm256_set1_ps(-0.f)); // Using unary operation to SSE vec is GCC extension. We can not do this directly in MSVC. 40 #else 41 dst.value = -value; 42 #endif 43 return dst; 44 } Vec8Vec845 Vec8() { 46 } Vec8Vec847 Vec8(const float v) { 48 value = _mm256_set1_ps(v); 49 } Vec8Vec850 Vec8(__m256&& v) { 51 value = v; 52 } Vec8Vec853 Vec8(const VecType& lr) { 54 value = lr.value; 55 } operator []Vec856 float operator[](size_t i) { 57 #if defined(_MSC_VER) // X64 native only mandatory support SSE and SSE2 extension, and we can not find intrinsic function to extract element directly by index in SSE and SSE2 extension. 58 float temp[8]; 59 _mm256_storeu_ps(temp, value); 60 return temp[i]; 61 #else 62 return value[i]; 63 #endif 64 } loadVec865 static VecType load(const float* addr) { 66 VecType v = { _mm256_loadu_ps(addr) }; 67 return v; 68 } saveVec869 static void save(float* addr, const VecType& v) { 70 _mm256_storeu_ps(addr, v.value); 71 } maxVec872 static VecType max(const VecType& v1, const VecType& v2) { 73 VecType dst = { _mm256_max_ps(v1.value, v2.value) }; 74 return dst; 75 } minVec876 static VecType min(const VecType& v1, const VecType& v2) { 77 VecType dst = { _mm256_min_ps(v1.value, v2.value) }; 78 return dst; 79 } 80 }; 81 82 #define TRANSPOSE_8x8 \ 83 t0 = _mm256_unpacklo_ps(r0, r1);\ 84 t1 = _mm256_unpackhi_ps(r0, r1);\ 85 t2 = _mm256_unpacklo_ps(r2, r3);\ 86 t3 = _mm256_unpackhi_ps(r2, r3);\ 87 t4 = _mm256_unpacklo_ps(r4, r5);\ 88 t5 = _mm256_unpackhi_ps(r4, r5);\ 89 t6 = _mm256_unpacklo_ps(r6, r7);\ 90 t7 = _mm256_unpackhi_ps(r6, r7);\ 91 \ 92 r0 = _mm256_shuffle_ps(t0,t2,_MM_SHUFFLE(1,0,1,0));\ 93 r1 = _mm256_shuffle_ps(t0,t2,_MM_SHUFFLE(3,2,3,2));\ 94 r2 = _mm256_shuffle_ps(t1,t3,_MM_SHUFFLE(1,0,1,0));\ 95 r3 = _mm256_shuffle_ps(t1,t3,_MM_SHUFFLE(3,2,3,2));\ 96 r4 = _mm256_shuffle_ps(t4,t6,_MM_SHUFFLE(1,0,1,0));\ 97 r5 = _mm256_shuffle_ps(t4,t6,_MM_SHUFFLE(3,2,3,2));\ 98 r6 = _mm256_shuffle_ps(t5,t7,_MM_SHUFFLE(1,0,1,0));\ 99 r7 = _mm256_shuffle_ps(t5,t7,_MM_SHUFFLE(3,2,3,2));\ 100 \ 101 t0 = _mm256_permute2f128_ps(r0, r4, 0x20);\ 102 t1 = _mm256_permute2f128_ps(r1, r5, 0x20);\ 103 t2 = _mm256_permute2f128_ps(r2, r6, 0x20);\ 104 t3 = _mm256_permute2f128_ps(r3, r7, 0x20);\ 105 t4 = _mm256_permute2f128_ps(r0, r4, 0x31);\ 106 t5 = _mm256_permute2f128_ps(r1, r5, 0x31);\ 107 t6 = _mm256_permute2f128_ps(r2, r6, 0x31);\ 108 t7 = _mm256_permute2f128_ps(r3, r7, 0x31);\ 109 110 #define TRANSPOSE_8x8_REPLACE(r0, r1, r2, r3, r4, r5, r6, r7) \ 111 {\ 112 auto t0 = _mm256_unpacklo_ps(r0, r1);\ 113 auto t1 = _mm256_unpackhi_ps(r0, r1);\ 114 auto t2 = _mm256_unpacklo_ps(r2, r3);\ 115 auto t3 = _mm256_unpackhi_ps(r2, r3);\ 116 auto t4 = _mm256_unpacklo_ps(r4, r5);\ 117 auto t5 = _mm256_unpackhi_ps(r4, r5);\ 118 auto t6 = _mm256_unpacklo_ps(r6, r7);\ 119 auto t7 = _mm256_unpackhi_ps(r6, r7);\ 120 \ 121 r0 = _mm256_shuffle_ps(t0,t2,_MM_SHUFFLE(1,0,1,0));\ 122 r1 = _mm256_shuffle_ps(t0,t2,_MM_SHUFFLE(3,2,3,2));\ 123 r2 = _mm256_shuffle_ps(t1,t3,_MM_SHUFFLE(1,0,1,0));\ 124 r3 = _mm256_shuffle_ps(t1,t3,_MM_SHUFFLE(3,2,3,2));\ 125 r4 = _mm256_shuffle_ps(t4,t6,_MM_SHUFFLE(1,0,1,0));\ 126 r5 = _mm256_shuffle_ps(t4,t6,_MM_SHUFFLE(3,2,3,2));\ 127 r6 = _mm256_shuffle_ps(t5,t7,_MM_SHUFFLE(1,0,1,0));\ 128 r7 = _mm256_shuffle_ps(t5,t7,_MM_SHUFFLE(3,2,3,2));\ 129 \ 130 t0 = _mm256_permute2f128_ps(r0, r4, 0x20);\ 131 t1 = _mm256_permute2f128_ps(r1, r5, 0x20);\ 132 t2 = _mm256_permute2f128_ps(r2, r6, 0x20);\ 133 t3 = _mm256_permute2f128_ps(r3, r7, 0x20);\ 134 t4 = _mm256_permute2f128_ps(r0, r4, 0x31);\ 135 t5 = _mm256_permute2f128_ps(r1, r5, 0x31);\ 136 t6 = _mm256_permute2f128_ps(r2, r6, 0x31);\ 137 t7 = _mm256_permute2f128_ps(r3, r7, 0x31);\ 138 r0 = t0, r1 = t1, r2 = t2, r3 = t3;\ 139 r4 = t4, r5 = t5, r6 = t6, r7 = t7;\ 140 }\ 141 142 143 #endif 144 145