//
//  Vec8.hpp
//  MNN
//
//  Created by MNN on 2021/05/16.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#ifndef Vec8_hpp
#define Vec8_hpp
#include "FunctionSummary.hpp"
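
// Vec8 wraps a single AVX __m256 register holding eight packed floats and
// exposes elementwise arithmetic, load/store, and min/max built on the
// corresponding AVX intrinsics. Loads and stores are unaligned (loadu/storeu).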
struct Vec8 {
    using VecType = Vec8;
    __m256 value;
    VecType operator+(const VecType& lr) {
        VecType dst = { _mm256_add_ps(value, lr.value) };
        return dst;
    }
    VecType operator-(const VecType& lr) {
        VecType dst = { _mm256_sub_ps(value, lr.value) };
        return dst;
    }
    VecType operator*(const VecType& lr) {
        VecType dst = { _mm256_mul_ps(value, lr.value) };
        return dst;
    }
    VecType operator*(float lr) {
        VecType dst = { _mm256_mul_ps(value, _mm256_set1_ps(lr)) };
        return dst;
    }

    VecType& operator=(const VecType& lr) {
        value = lr.value;
        return *this;
    }
    VecType operator-() {
        VecType dst;
#if defined(_MSC_VER)
        // Applying unary minus to a vector type is a GCC extension that MSVC
        // does not support, so flip the sign bits by XOR-ing with -0.0f instead.
        dst.value = _mm256_xor_ps(value, _mm256_set1_ps(-0.f));
#else
        dst.value = -value;
#endif
        return dst;
    }
    Vec8() {
    }
    Vec8(const float v) {
        value = _mm256_set1_ps(v);
    }
    Vec8(__m256&& v) {
        value = v;
    }
    Vec8(const VecType& lr) {
        value = lr.value;
    }
    float operator[](size_t i) {
#if defined(_MSC_VER)
        // MSVC does not allow subscripting __m256 directly, and SSE/SSE2 (the
        // only extensions x64 guarantees) provide no extract-by-index
        // intrinsic, so spill the register to memory and index that.
        float temp[8];
        _mm256_storeu_ps(temp, value);
        return temp[i];
#else
        return value[i];
#endif
    }
    static VecType load(const float* addr) {
        VecType v = { _mm256_loadu_ps(addr) };
        return v;
    }
    static void save(float* addr, const VecType& v) {
        _mm256_storeu_ps(addr, v.value);
    }
    static VecType max(const VecType& v1, const VecType& v2) {
        VecType dst = { _mm256_max_ps(v1.value, v2.value) };
        return dst;
    }
    static VecType min(const VecType& v1, const VecType& v2) {
        VecType dst = { _mm256_min_ps(v1.value, v2.value) };
        return dst;
    }
};
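
// Usage sketch (illustrative; this helper is hypothetical and not part of the
// original header): an elementwise multiply-add over buffers whose length is a
// multiple of eight floats, built from Vec8's load/save and operators.
static inline void Vec8MulAddSketch(float* dst, const float* a, const float* b, size_t sizeDiv8) {
    for (size_t i = 0; i < sizeDiv8; ++i) {
        auto va = Vec8::load(a + 8 * i);
        auto vb = Vec8::load(b + 8 * i);
        auto vd = Vec8::load(dst + 8 * i);
        // dst[j] += a[j] * b[j] for the eight lanes of this group.
        Vec8::save(dst + 8 * i, vd + va * vb);
    }
}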
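
// TRANSPOSE_8x8 transposes the 8x8 float matrix held in the __m256 variables
// r0..r7 in three stages: unpacklo/unpackhi interleave adjacent rows within
// each 128-bit lane, the shuffles gather the lane-local 4x4 blocks, and
// permute2f128 swaps the 128-bit halves across registers. It expects r0..r7
// and t0..t7 to be declared in the enclosing scope and leaves the transposed
// rows in t0..t7.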
#define TRANSPOSE_8x8 \
t0 = _mm256_unpacklo_ps(r0, r1);\
t1 = _mm256_unpackhi_ps(r0, r1);\
t2 = _mm256_unpacklo_ps(r2, r3);\
t3 = _mm256_unpackhi_ps(r2, r3);\
t4 = _mm256_unpacklo_ps(r4, r5);\
t5 = _mm256_unpackhi_ps(r4, r5);\
t6 = _mm256_unpacklo_ps(r6, r7);\
t7 = _mm256_unpackhi_ps(r6, r7);\
\
r0 = _mm256_shuffle_ps(t0,t2,_MM_SHUFFLE(1,0,1,0));\
r1 = _mm256_shuffle_ps(t0,t2,_MM_SHUFFLE(3,2,3,2));\
r2 = _mm256_shuffle_ps(t1,t3,_MM_SHUFFLE(1,0,1,0));\
r3 = _mm256_shuffle_ps(t1,t3,_MM_SHUFFLE(3,2,3,2));\
r4 = _mm256_shuffle_ps(t4,t6,_MM_SHUFFLE(1,0,1,0));\
r5 = _mm256_shuffle_ps(t4,t6,_MM_SHUFFLE(3,2,3,2));\
r6 = _mm256_shuffle_ps(t5,t7,_MM_SHUFFLE(1,0,1,0));\
r7 = _mm256_shuffle_ps(t5,t7,_MM_SHUFFLE(3,2,3,2));\
\
t0 = _mm256_permute2f128_ps(r0, r4, 0x20);\
t1 = _mm256_permute2f128_ps(r1, r5, 0x20);\
t2 = _mm256_permute2f128_ps(r2, r6, 0x20);\
t3 = _mm256_permute2f128_ps(r3, r7, 0x20);\
t4 = _mm256_permute2f128_ps(r0, r4, 0x31);\
t5 = _mm256_permute2f128_ps(r1, r5, 0x31);\
t6 = _mm256_permute2f128_ps(r2, r6, 0x31);\
t7 = _mm256_permute2f128_ps(r3, r7, 0x31);
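
// TRANSPOSE_8x8_REPLACE is the self-contained variant: it takes the eight row
// registers as macro arguments, keeps its temporaries in a local scope, and
// writes the transposed rows back into r0..r7.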
#define TRANSPOSE_8x8_REPLACE(r0, r1, r2, r3, r4, r5, r6, r7) \
{\
auto t0 = _mm256_unpacklo_ps(r0, r1);\
auto t1 = _mm256_unpackhi_ps(r0, r1);\
auto t2 = _mm256_unpacklo_ps(r2, r3);\
auto t3 = _mm256_unpackhi_ps(r2, r3);\
auto t4 = _mm256_unpacklo_ps(r4, r5);\
auto t5 = _mm256_unpackhi_ps(r4, r5);\
auto t6 = _mm256_unpacklo_ps(r6, r7);\
auto t7 = _mm256_unpackhi_ps(r6, r7);\
\
r0 = _mm256_shuffle_ps(t0,t2,_MM_SHUFFLE(1,0,1,0));\
r1 = _mm256_shuffle_ps(t0,t2,_MM_SHUFFLE(3,2,3,2));\
r2 = _mm256_shuffle_ps(t1,t3,_MM_SHUFFLE(1,0,1,0));\
r3 = _mm256_shuffle_ps(t1,t3,_MM_SHUFFLE(3,2,3,2));\
r4 = _mm256_shuffle_ps(t4,t6,_MM_SHUFFLE(1,0,1,0));\
r5 = _mm256_shuffle_ps(t4,t6,_MM_SHUFFLE(3,2,3,2));\
r6 = _mm256_shuffle_ps(t5,t7,_MM_SHUFFLE(1,0,1,0));\
r7 = _mm256_shuffle_ps(t5,t7,_MM_SHUFFLE(3,2,3,2));\
\
t0 = _mm256_permute2f128_ps(r0, r4, 0x20);\
t1 = _mm256_permute2f128_ps(r1, r5, 0x20);\
t2 = _mm256_permute2f128_ps(r2, r6, 0x20);\
t3 = _mm256_permute2f128_ps(r3, r7, 0x20);\
t4 = _mm256_permute2f128_ps(r0, r4, 0x31);\
t5 = _mm256_permute2f128_ps(r1, r5, 0x31);\
t6 = _mm256_permute2f128_ps(r2, r6, 0x31);\
t7 = _mm256_permute2f128_ps(r3, r7, 0x31);\
r0 = t0, r1 = t1, r2 = t2, r3 = t3;\
r4 = t4, r5 = t5, r6 = t6, r7 = t7;\
}
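
// Usage sketch (illustrative; this helper is hypothetical and not part of the
// original header): transpose a row-major 8x8 float matrix in place with
// TRANSPOSE_8x8_REPLACE.
static inline void Transpose8x8Sketch(float* mat) {
    auto r0 = _mm256_loadu_ps(mat + 0 * 8);
    auto r1 = _mm256_loadu_ps(mat + 1 * 8);
    auto r2 = _mm256_loadu_ps(mat + 2 * 8);
    auto r3 = _mm256_loadu_ps(mat + 3 * 8);
    auto r4 = _mm256_loadu_ps(mat + 4 * 8);
    auto r5 = _mm256_loadu_ps(mat + 5 * 8);
    auto r6 = _mm256_loadu_ps(mat + 6 * 8);
    auto r7 = _mm256_loadu_ps(mat + 7 * 8);
    TRANSPOSE_8x8_REPLACE(r0, r1, r2, r3, r4, r5, r6, r7);
    _mm256_storeu_ps(mat + 0 * 8, r0);
    _mm256_storeu_ps(mat + 1 * 8, r1);
    _mm256_storeu_ps(mat + 2 * 8, r2);
    _mm256_storeu_ps(mat + 3 * 8, r3);
    _mm256_storeu_ps(mat + 4 * 8, r4);
    _mm256_storeu_ps(mat + 5 * 8, r5);
    _mm256_storeu_ps(mat + 6 * 8, r6);
    _mm256_storeu_ps(mat + 7 * 8, r7);
}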

#endif