1 /**
2 * \file
3 * simd support for intrinsics
4 *
5 * Author:
6 * Rodrigo Kumpera (rkumpera@novell.com)
7 *
8 * (C) 2008 Novell, Inc.
9 */
10
11 #include <config.h>
12 #include <stdio.h>
13
14 #include "mini.h"
15 #include "ir-emit.h"
16 #include "mono/utils/bsearch.h"
17 #include <mono/metadata/abi-details.h>
18 #include <mono/metadata/reflection-internals.h>
19
20 /*
21 General notes on SIMD intrinsics
22
23 TODO handle operands with non SIMD args, such as op_Addition (Vector4f, float)
24 TODO optimize r4const in .ctor so it doesn't go into the FP stack first
25 TODO extend op_to_op_dest_membase to handle simd ops
26 TODO add support for indexed versions of simd ops
TODO do an amd64 port and figure out how to properly handle extractors/.ctor
28 TODO make sure locals, arguments and spills are properly aligned.
29 TODO add support for fusing a XMOVE into a simd op in mono_spill_global_vars.
30 TODO add stuff to man pages
31 TODO document this under /docs
32 TODO make passing a xmm as argument not cause it to be LDADDR'ed (introduce an OP_XPUSH)
33 TODO revamp the .ctor sequence as it looks very fragile, maybe use a var just like move_i4_to_f. (or just pinst sse ops)
34 TODO figure out what's wrong with OP_STOREX_MEMBASE_REG and OP_STOREX_MEMBASE (the 2nd is for imm operands)
35 TODO maybe add SSE3 emulation on top of SSE2, or just implement the corresponding functions using SSE2 intrinsics.
36 TODO pass simd arguments in registers or, at least, add SSE support for pushing large (>=16) valuetypes
37 TODO pass simd args byval to a non-intrinsic method cause some useless local var load/store to happen.
38 TODO check if we need to init the SSE control word with better precision.
39 TODO add support for 3 reg sources in mini without slowing the common path. Or find a way to make MASKMOVDQU work.
40 TODO make SimdRuntime.get_AccelMode work under AOT
TODO patterns such as "a ^= b" generate slower code as the LDADDR op will be copied to a tmp first. Look at adding an indirection reduction pass after the dce pass.
TODO extend bounds checking code to support range checking.
43
44 General notes for SIMD intrinsics.
45
46 -Bad extractor and constructor performance
47 Extracting a float from a XMM is a complete disaster if you are passing it as an argument.
48 It will be loaded in the FP stack just to be pushed on the call stack.
49
A similar thing happens with the Vector4f constructor, which requires its float vars to go through the FP stack first.
51
52 The fix for this issue is similar to the one required for r4const as method args. Avoiding the
53 trip to the FP stack is desirable.
54
55 -Extractor and constructor code doesn't make sense under amd64. Both currently assume separate banks
56 for simd and fp.
57
58
59 -Promote OP_EXTRACT_I4 to a STORE op
60 The advantage of this change is that it could have a _membase version and promote further optimizations.
61
62 -Create a MONO_INST_DONT_REGALLOC and use it in all places that MONO_INST_INDIRECT is used
63 without a OP_LDADDR.
64 */
65
66 #if defined (MONO_ARCH_SIMD_INTRINSICS)
67
68 #if defined (DISABLE_JIT)
69
/*
 * mono_simd_intrinsics_init:
 *
 *   Variant compiled when DISABLE_JIT is defined: it has an empty body,
 * so calling it does nothing.
 */
void
mono_simd_intrinsics_init (void)
{
}
74
75 #else
76
/*
 * Debug tracing helpers.
 *
 * IS_DEBUG_ON (cfg) is true when cfg->verbose_level is 3 or higher.
 * To compile the tracing out, swap in the alternative definition:
 *   #define IS_DEBUG_ON(cfg) (0)
 *
 * DEBUG (a) runs statement `a` only when tracing is on. It always tests the
 * variable literally named `cfg` in the scope where the macro is expanded.
 */
#define IS_DEBUG_ON(cfg) (3 <= (cfg)->verbose_level)
#define DEBUG(a) \
	do { \
		if (IS_DEBUG_ON (cfg)) { \
			a; \
		} \
	} while (0)
/*
 * Emit strategies stored in SimdIntrinsic.simd_emit_mode.
 * That field is a 4-bit bitfield, so every value here must stay below 16.
 */
enum {
	SIMD_EMIT_BINARY = 0,
	SIMD_EMIT_UNARY = 1,
	SIMD_EMIT_SETTER = 2,
	SIMD_EMIT_GETTER = 3,
	SIMD_EMIT_GETTER_QWORD = 4,
	SIMD_EMIT_CTOR = 5,
	SIMD_EMIT_CAST = 6,
	SIMD_EMIT_SHUFFLE = 7,
	SIMD_EMIT_SHIFT = 8,
	SIMD_EMIT_EQUALITY = 9,
	SIMD_EMIT_LOAD_ALIGNED = 10,
	SIMD_EMIT_STORE = 11,
	SIMD_EMIT_EXTRACT_MASK = 12,
	SIMD_EMIT_PREFETCH = 13
};
97
/*
 * Method-name table, generated from "simd-methods.h" with the X-macro
 * SIMD_METHOD(str,name). The header is included several times, each time
 * with a different definition of SIMD_METHOD.
 *
 * Two layouts are provided; both expose the same interface:
 *   - the SN_* enumerators (the `name` argument of each SIMD_METHOD), and
 *   - method_name (idx), which yields the C string for an SN_* value.
 */
#ifdef HAVE_ARRAY_ELEM_INIT
/*
 * Layout 1: all names live in a single struct of char arrays.
 * MSGSTRFIELD (__LINE__) pastes "str" with the line number of the expansion,
 * giving every SIMD_METHOD entry its own uniquely named field.
 */
#define MSGSTRFIELD(line) MSGSTRFIELD1(line)
#define MSGSTRFIELD1(line) str##line
static const struct msgstr_t {
/* One char array per method, sized to hold the string and its terminator. */
#define SIMD_METHOD(str,name) char MSGSTRFIELD(__LINE__) [sizeof (str)];
#include "simd-methods.h"
#undef SIMD_METHOD
} method_names = {
/* Initialize each field with its string literal, in header order. */
#define SIMD_METHOD(str,name) str,
#include "simd-methods.h"
#undef SIMD_METHOD
};

/* Each SN_* value is the byte offset of its string inside method_names. */
enum {
#define SIMD_METHOD(str,name) name = offsetof (struct msgstr_t, MSGSTRFIELD(__LINE__)),
#include "simd-methods.h"
};
/* idx is a byte offset (an SN_* value); the result points at that string. */
#define method_name(idx) ((const char*)&method_names + (idx))

#else
/* Layout 2: a NULL-terminated array of string pointers, in header order. */
#define SIMD_METHOD(str,name) str,
static const char * const method_names [] = {
#include "simd-methods.h"
NULL
};
#undef SIMD_METHOD
/* SN_* values are consecutive array indexes; SN_LAST is the entry count. */
#define SIMD_METHOD(str,name) name,
enum {
#include "simd-methods.h"
SN_LAST
};

/* idx is an array index (an SN_* value). */
#define method_name(idx) (method_names [(idx)])

#endif
133
134 typedef struct {
135 guint16 name;
136 guint16 opcode;
137 guint8 simd_version_flags;
138 guint8 simd_emit_mode : 4;
139 guint8 flags : 4;
140 } SimdIntrinsic;
141
142 static const SimdIntrinsic vector4f_intrinsics[] = {
143 { SN_ctor, OP_EXPAND_R4, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
144 { SN_AddSub, OP_ADDSUBPS, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY},
145 { SN_AndNot, OP_ANDNPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY},
146 { SN_CompareEqual, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_EQ },
147 { SN_CompareLessEqual, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_LE },
148 { SN_CompareLessThan, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_LT },
149 { SN_CompareNotEqual, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NEQ },
150 { SN_CompareNotLessEqual, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NLE },
151 { SN_CompareNotLessThan, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NLT },
152 { SN_CompareOrdered, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_ORD },
153 { SN_CompareUnordered, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_UNORD },
154 { SN_ConvertToDouble, OP_CVTPS2PD, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
155 { SN_ConvertToInt, OP_CVTPS2DQ, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
156 { SN_ConvertToIntTruncated, OP_CVTTPS2DQ, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
157 { SN_DuplicateHigh, OP_DUPPS_HIGH, SIMD_VERSION_SSE3, SIMD_EMIT_UNARY },
158 { SN_DuplicateLow, OP_DUPPS_LOW, SIMD_VERSION_SSE3, SIMD_EMIT_UNARY },
159 { SN_HorizontalAdd, OP_HADDPS, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY },
160 { SN_HorizontalSub, OP_HSUBPS, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY },
161 { SN_InterleaveHigh, OP_UNPACK_HIGHPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
162 { SN_InterleaveLow, OP_UNPACK_LOWPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
163 { SN_InvSqrt, OP_RSQRTPS, SIMD_VERSION_SSE1, SIMD_EMIT_UNARY },
164 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
165 { SN_Max, OP_MAXPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
166 { SN_Min, OP_MINPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
167 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
168 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
169 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
170 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
171 { SN_Reciprocal, OP_RCPPS, SIMD_VERSION_SSE1, SIMD_EMIT_UNARY },
172 { SN_Shuffle, OP_PSHUFLED, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
173 { SN_Sqrt, OP_SQRTPS, SIMD_VERSION_SSE1, SIMD_EMIT_UNARY },
174 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
175 { SN_StoreNonTemporal, OP_STOREX_NTA_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
176 { SN_get_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
177 { SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
178 { SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
179 { SN_get_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
180 { SN_op_Addition, OP_ADDPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
181 { SN_op_BitwiseAnd, OP_ANDPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
182 { SN_op_BitwiseOr, OP_ORPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
183 { SN_op_Division, OP_DIVPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
184 { SN_op_Equality, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
185 { SN_op_ExclusiveOr, OP_XORPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
186 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
187 { SN_op_Inequality, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
188 { SN_op_Multiply, OP_MULPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
189 { SN_op_Subtraction, OP_SUBPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
190 { SN_set_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
191 { SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
192 { SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
193 { SN_set_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER }
194 };
195
196 static const SimdIntrinsic vector2d_intrinsics[] = {
197 { SN_ctor, OP_EXPAND_R8, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
198 { SN_AddSub, OP_ADDSUBPD, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY,},
199 { SN_AndNot, OP_ANDNPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
200 { SN_CompareEqual, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_EQ },
201 { SN_CompareLessEqual, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_LE },
202 { SN_CompareLessThan, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_LT },
203 { SN_CompareNotEqual, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NEQ },
204 { SN_CompareNotLessEqual, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NLE },
205 { SN_CompareNotLessThan, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NLT },
206 { SN_CompareOrdered, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_ORD },
207 { SN_CompareUnordered, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_UNORD },
208 { SN_ConvertToFloat, OP_CVTPD2PS, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
209 { SN_ConvertToInt, OP_CVTPD2DQ, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
210 { SN_ConvertToIntTruncated, OP_CVTTPD2DQ, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
211 { SN_Duplicate, OP_DUPPD, SIMD_VERSION_SSE3, SIMD_EMIT_UNARY },
212 { SN_HorizontalAdd, OP_HADDPD, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY },
213 { SN_HorizontalSub, OP_HSUBPD, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY },
214 { SN_InterleaveHigh, OP_UNPACK_HIGHPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
215 { SN_InterleaveLow, OP_UNPACK_LOWPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
216 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
217 { SN_Max, OP_MAXPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
218 { SN_Min, OP_MINPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
219 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
220 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
221 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
222 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
223 { SN_Shuffle, OP_SHUFPD, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
224 { SN_Sqrt, OP_SQRTPD, SIMD_VERSION_SSE1, SIMD_EMIT_UNARY },
225 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
226 { SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
227 { SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
228 { SN_op_Addition, OP_ADDPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
229 { SN_op_BitwiseAnd, OP_ANDPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
230 { SN_op_BitwiseOr, OP_ORPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
231 { SN_op_Division, OP_DIVPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
232 { SN_op_ExclusiveOr, OP_XORPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
233 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
234 { SN_op_Multiply, OP_MULPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
235 { SN_op_Subtraction, OP_SUBPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
236 { SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
237 { SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
238 };
239
240 static const SimdIntrinsic vector2ul_intrinsics[] = {
241 { SN_ctor, OP_EXPAND_I8, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
242 { SN_CompareEqual, OP_PCMPEQQ, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
243 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
244 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
245 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
246 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
247 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
248 { SN_Shuffle, OP_SHUFPD, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
249 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
250 { SN_UnpackHigh, OP_UNPACK_HIGHQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
251 { SN_UnpackLow, OP_UNPACK_LOWQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
252 { SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
253 { SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
254 { SN_op_Addition, OP_PADDQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
255 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
256 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
257 { SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1 },
258 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
259 { SN_op_LeftShift, OP_PSHLQ, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
260 { SN_op_Multiply, OP_PMULQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
261 { SN_op_RightShift, OP_PSHRQ, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
262 { SN_op_Subtraction, OP_PSUBQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
263 { SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
264 { SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
265 };
266
267 static const SimdIntrinsic vector2l_intrinsics[] = {
268 { SN_ctor, OP_EXPAND_I8, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
269 { SN_CompareEqual, OP_PCMPEQQ, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
270 { SN_CompareGreaterThan, OP_PCMPGTQ, SIMD_VERSION_SSE42, SIMD_EMIT_BINARY },
271 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
272 { SN_LogicalRightShift, OP_PSHRQ, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
273 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
274 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
275 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
276 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
277 { SN_Shuffle, OP_SHUFPD, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
278 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
279 { SN_UnpackHigh, OP_UNPACK_HIGHQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
280 { SN_UnpackLow, OP_UNPACK_LOWQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
281 { SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
282 { SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
283 { SN_op_Addition, OP_PADDQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
284 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
285 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
286 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
287 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
288 { SN_op_LeftShift, OP_PSHLQ, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
289 { SN_op_Multiply, OP_PMULQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
290 { SN_op_Subtraction, OP_PSUBQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
291 { SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
292 { SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
293 };
294
295 static const SimdIntrinsic vector4ui_intrinsics[] = {
296 { SN_ctor, OP_EXPAND_I4, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
297 { SN_ArithmeticRightShift, OP_PSARD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
298 { SN_CompareEqual, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
299 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
300 { SN_Max, OP_PMAXD_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
301 { SN_Min, OP_PMIND_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
302 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
303 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
304 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
305 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
306 { SN_Shuffle, OP_PSHUFLED, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
307 { SN_SignedPackWithSignedSaturation, OP_PACKD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
308 { SN_SignedPackWithUnsignedSaturation, OP_PACKD_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
309 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
310 { SN_UnpackHigh, OP_UNPACK_HIGHD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
311 { SN_UnpackLow, OP_UNPACK_LOWD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
312 { SN_get_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
313 { SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
314 { SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
315 { SN_get_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
316 { SN_op_Addition, OP_PADDD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
317 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
318 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
319 { SN_op_Equality, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
320 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
321 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
322 { SN_op_Inequality, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
323 { SN_op_LeftShift, OP_PSHLD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
324 { SN_op_Multiply, OP_PMULD, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
325 { SN_op_RightShift, OP_PSHRD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
326 { SN_op_Subtraction, OP_PSUBD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
327 { SN_set_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
328 { SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
329 { SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
330 { SN_set_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
331 };
332
333 static const SimdIntrinsic vector4i_intrinsics[] = {
334 { SN_ctor, OP_EXPAND_I4, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
335 { SN_CompareEqual, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
336 { SN_CompareGreaterThan, OP_PCMPGTD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
337 { SN_ConvertToDouble, OP_CVTDQ2PD, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
338 { SN_ConvertToFloat, OP_CVTDQ2PS, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
339 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
340 { SN_LogicalRightShift, OP_PSHRD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
341 { SN_Max, OP_PMAXD, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
342 { SN_Min, OP_PMIND, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
343 { SN_PackWithSignedSaturation, OP_PACKD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
344 { SN_PackWithUnsignedSaturation, OP_PACKD_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
345 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
346 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
347 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
348 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
349 { SN_Shuffle, OP_PSHUFLED, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
350 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
351 { SN_UnpackHigh, OP_UNPACK_HIGHD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
352 { SN_UnpackLow, OP_UNPACK_LOWD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
353 { SN_get_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
354 { SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
355 { SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
356 { SN_get_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
357 { SN_op_Addition, OP_PADDD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
358 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
359 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
360 { SN_op_Equality, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
361 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
362 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
363 { SN_op_Inequality, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
364 { SN_op_LeftShift, OP_PSHLD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
365 { SN_op_Multiply, OP_PMULD, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
366 { SN_op_RightShift, OP_PSARD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
367 { SN_op_Subtraction, OP_PSUBD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
368 { SN_set_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
369 { SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
370 { SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
371 { SN_set_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
372 };
373
374 static const SimdIntrinsic vector8us_intrinsics[] = {
375 { SN_ctor, OP_EXPAND_I2, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
376 { SN_AddWithSaturation, OP_PADDW_SAT_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
377 { SN_ArithmeticRightShift, OP_PSARW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
378 { SN_Average, OP_PAVGW_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
379 { SN_CompareEqual, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1 },
380 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
381 { SN_Max, OP_PMAXW_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
382 { SN_Min, OP_PMINW_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
383 { SN_MultiplyStoreHigh, OP_PMULW_HIGH_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
384 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
385 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
386 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
387 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
388 { SN_ShuffleHigh, OP_PSHUFLEW_HIGH, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
389 { SN_ShuffleLow, OP_PSHUFLEW_LOW, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
390 { SN_SignedPackWithSignedSaturation, OP_PACKW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
391 { SN_SignedPackWithUnsignedSaturation, OP_PACKW_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
392 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
393 { SN_SubtractWithSaturation, OP_PSUBW_SAT_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
394 { SN_UnpackHigh, OP_UNPACK_HIGHW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
395 { SN_UnpackLow, OP_UNPACK_LOWW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
396 { SN_get_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
397 { SN_get_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
398 { SN_get_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
399 { SN_get_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
400 { SN_get_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
401 { SN_get_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
402 { SN_get_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
403 { SN_get_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
404 { SN_op_Addition, OP_PADDW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
405 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
406 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
407 { SN_op_Equality, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
408 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
409 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
410 { SN_op_Inequality, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
411 { SN_op_LeftShift, OP_PSHLW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
412 { SN_op_Multiply, OP_PMULW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
413 { SN_op_RightShift, OP_PSHRW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
414 { SN_op_Subtraction, OP_PSUBW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
415 { SN_set_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
416 { SN_set_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
417 { SN_set_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
418 { SN_set_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
419 { SN_set_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
420 { SN_set_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
421 { SN_set_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
422 { SN_set_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
423 };
424
425 static const SimdIntrinsic vector8s_intrinsics[] = {
426 { SN_ctor, OP_EXPAND_I2, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
427 { SN_AddWithSaturation, OP_PADDW_SAT, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
428 { SN_CompareEqual, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
429 { SN_CompareGreaterThan, OP_PCMPGTW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
430 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
431 { SN_LogicalRightShift, OP_PSHRW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
432 { SN_Max, OP_PMAXW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
433 { SN_Min, OP_PMINW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
434 { SN_MultiplyStoreHigh, OP_PMULW_HIGH, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
435 { SN_PackWithSignedSaturation, OP_PACKW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
436 { SN_PackWithUnsignedSaturation, OP_PACKW_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
437 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
438 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
439 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
440 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
441 { SN_ShuffleHigh, OP_PSHUFLEW_HIGH, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
442 { SN_ShuffleLow, OP_PSHUFLEW_LOW, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
443 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
444 { SN_SubtractWithSaturation, OP_PSUBW_SAT_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
445 { SN_UnpackHigh, OP_UNPACK_HIGHW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
446 { SN_UnpackLow, OP_UNPACK_LOWW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
447 { SN_get_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
448 { SN_get_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
449 { SN_get_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
450 { SN_get_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
451 { SN_get_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
452 { SN_get_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
453 { SN_get_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
454 { SN_get_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
455 { SN_op_Addition, OP_PADDW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
456 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
457 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
458 { SN_op_Equality, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
459 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
460 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
461 { SN_op_Inequality, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
462 { SN_op_LeftShift, OP_PSHLW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
463 { SN_op_Multiply, OP_PMULW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
464 { SN_op_RightShift, OP_PSARW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
465 { SN_op_Subtraction, OP_PSUBW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
466 { SN_set_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
467 { SN_set_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
468 { SN_set_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
469 { SN_set_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
470 { SN_set_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
471 { SN_set_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
472 { SN_set_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
473 { SN_set_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
474 };
475
476 static const SimdIntrinsic vector16b_intrinsics[] = {
477 { SN_ctor, OP_EXPAND_I1, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
478 { SN_AddWithSaturation, OP_PADDB_SAT_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
479 { SN_Average, OP_PAVGB_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
480 { SN_CompareEqual, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
481 { SN_ExtractByteMask, 0, SIMD_VERSION_SSE1, SIMD_EMIT_EXTRACT_MASK },
482 { SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
483 { SN_Max, OP_PMAXB_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
484 { SN_Min, OP_PMINB_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
485 { SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
486 { SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
487 { SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
488 { SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
489 { SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
490 { SN_SubtractWithSaturation, OP_PSUBB_SAT_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
491 { SN_SumOfAbsoluteDifferences, OP_PSUM_ABS_DIFF, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
492 { SN_UnpackHigh, OP_UNPACK_HIGHB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
493 { SN_UnpackLow, OP_UNPACK_LOWB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
494 { SN_get_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
495 { SN_get_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
496 { SN_get_V10, 10, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
497 { SN_get_V11, 11, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
498 { SN_get_V12, 12, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
499 { SN_get_V13, 13, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
500 { SN_get_V14, 14, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
501 { SN_get_V15, 15, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
502 { SN_get_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
503 { SN_get_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
504 { SN_get_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
505 { SN_get_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
506 { SN_get_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
507 { SN_get_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
508 { SN_get_V8, 8, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
509 { SN_get_V9, 9, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
510 { SN_op_Addition, OP_PADDB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
511 { SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
512 { SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
513 { SN_op_Equality, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
514 { SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
515 { SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
516 { SN_op_Inequality, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
517 { SN_op_Subtraction, OP_PSUBB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
518 { SN_set_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
519 { SN_set_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
520 { SN_set_V10, 10, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
521 { SN_set_V11, 11, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
522 { SN_set_V12, 12, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
523 { SN_set_V13, 13, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
524 { SN_set_V14, 14, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
525 { SN_set_V15, 15, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
526 { SN_set_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
527 { SN_set_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
528 { SN_set_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
529 { SN_set_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
530 { SN_set_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
531 { SN_set_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
532 { SN_set_V8, 8, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
533 { SN_set_V9, 9, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
534 };
535
/*
Missing:
setters

NOTE(review): set_V0 .. set_V15 are listed in the table below, so the
"Missing: setters" remark above looks stale -- confirm and drop it.

Intrinsics table for the "16sb" vector type (presumably 16 signed bytes,
going by the name and the byte sized opcodes -- TODO confirm).

Each entry starts with the method name id, followed by the opcode field.
For SIMD_EMIT_GETTER / SIMD_EMIT_SETTER entries the opcode field holds the
element index (it is what the getter/setter emitters in this file read as the index).
The next two fields are a SIMD_VERSION_* and a SIMD_EMIT_* value; an optional
trailing field carries extra flags (comparison kind, prefetch mode).

NOTE(review): entries are presumably looked up by name, so the entry order
may be significant -- keep the existing ordering when adding entries (TODO confirm).
*/
static const SimdIntrinsic vector16sb_intrinsics[] = {
	{ SN_ctor, OP_EXPAND_I1, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
	{ SN_AddWithSaturation, OP_PADDB_SAT, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_CompareEqual, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_CompareGreaterThan, OP_PCMPGTB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_ExtractByteMask, 0, SIMD_VERSION_SSE1, SIMD_EMIT_EXTRACT_MASK },
	{ SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
	{ SN_Max, OP_PMAXB, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
	{ SN_Min, OP_PMINB, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
	{ SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
	{ SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
	{ SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
	{ SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
	{ SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
	{ SN_SubtractWithSaturation, OP_PSUBB_SAT, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_UnpackHigh, OP_UNPACK_HIGHB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_UnpackLow, OP_UNPACK_LOWB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_get_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V10, 10, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V11, 11, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V12, 12, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V13, 13, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V14, 14, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V15, 15, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V8, 8, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V9, 9, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_op_Addition, OP_PADDB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_Equality, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
	{ SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
	{ SN_op_Inequality, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
	{ SN_op_Subtraction, OP_PSUBB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_set_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V10, 10, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V11, 11, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V12, 12, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V13, 13, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V14, 14, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V15, 15, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V8, 8, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V9, 9, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
};
598
/* Value returned by mono_arch_cpu_enumerate_simd_versions (); set in mono_simd_intrinsics_init (). */
static guint32 simd_supported_versions;

/* Forward declarations, so these can be referenced before their definitions. */
static MonoInst* emit_sys_numerics_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args);
static MonoInst* emit_sys_numerics_vectors_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args);
603
604 /*TODO match using number of parameters as well*/
605 static int
simd_intrinsic_compare_by_name(const void * key,const void * value)606 simd_intrinsic_compare_by_name (const void *key, const void *value)
607 {
608 return strcmp (key, method_name (((SimdIntrinsic *)value)->name));
609 }
610
/*
 * Per-vreg state bits used by mono_simd_simplify_indirection ().
 */
typedef enum {
	/* The vreg belongs to a SIMD var that is neither indirect nor volatile. */
	VREG_USED = 1 << 0,
	/* An OP_XZERO targeting the vreg was seen in the first basic block. */
	VREG_HAS_XZERO_BB0 = 1 << 1,
	/* Some other instruction of the first basic block references the vreg. */
	VREG_HAS_OTHER_OP_BB0 = 1 << 2,
	/* Exactly one of the following basic blocks references the vreg so far. */
	VREG_SINGLE_BB_USE = 1 << 3,
	/* More than one of the following basic blocks references the vreg. */
	VREG_MANY_BB_USE = 1 << 4,
} KillFlags;
618
619 void
mono_simd_intrinsics_init(void)620 mono_simd_intrinsics_init (void)
621 {
622 simd_supported_versions = mono_arch_cpu_enumerate_simd_versions ();
623 /*TODO log the supported flags*/
624 }
625
626 static inline gboolean
apply_vreg_first_block_interference(MonoCompile * cfg,MonoInst * ins,int reg,int max_vreg,char * vreg_flags)627 apply_vreg_first_block_interference (MonoCompile *cfg, MonoInst *ins, int reg, int max_vreg, char *vreg_flags)
628 {
629 if (reg != -1 && reg <= max_vreg && vreg_flags [reg]) {
630 vreg_flags [reg] &= ~VREG_HAS_XZERO_BB0;
631 vreg_flags [reg] |= VREG_HAS_OTHER_OP_BB0;
632 DEBUG (printf ("[simd-simplify] R%d used: ", reg); mono_print_ins(ins));
633 return TRUE;
634 }
635 return FALSE;
636 }
637
638 static inline gboolean
apply_vreg_following_block_interference(MonoCompile * cfg,MonoInst * ins,int reg,MonoBasicBlock * bb,int max_vreg,char * vreg_flags,MonoBasicBlock ** target_bb)639 apply_vreg_following_block_interference (MonoCompile *cfg, MonoInst *ins, int reg, MonoBasicBlock *bb, int max_vreg, char *vreg_flags, MonoBasicBlock **target_bb)
640 {
641 if (reg == -1 || reg > max_vreg || !(vreg_flags [reg] & VREG_HAS_XZERO_BB0) || target_bb [reg] == bb)
642 return FALSE;
643
644 if (vreg_flags [reg] & VREG_SINGLE_BB_USE) {
645 vreg_flags [reg] &= ~VREG_SINGLE_BB_USE;
646 vreg_flags [reg] |= VREG_MANY_BB_USE;
647 DEBUG (printf ("[simd-simplify] R%d used by many bb: ", reg); mono_print_ins(ins));
648 return TRUE;
649 } else if (!(vreg_flags [reg] & VREG_MANY_BB_USE)) {
650 vreg_flags [reg] |= VREG_SINGLE_BB_USE;
651 target_bb [reg] = bb;
652 DEBUG (printf ("[simd-simplify] R%d first used by: ", reg); mono_print_ins(ins));
653 return TRUE;
654 }
655 return FALSE;
656 }
657
658 /*
659 This pass recalculate which vars need MONO_INST_INDIRECT.
660
661 We cannot do this for non SIMD vars since code like mono_get_vtable_var
662 uses MONO_INST_INDIRECT to signal that the variable must be stack allocated.
663 */
664 void
mono_simd_simplify_indirection(MonoCompile * cfg)665 mono_simd_simplify_indirection (MonoCompile *cfg)
666 {
667 int i, max_vreg = 0;
668 MonoBasicBlock *bb, *first_bb = NULL, **target_bb;
669 MonoInst *ins;
670 char *vreg_flags;
671
672 for (i = 0; i < cfg->num_varinfo; i++) {
673 MonoInst *var = cfg->varinfo [i];
674 if (var->klass->simd_type) {
675 var->flags &= ~MONO_INST_INDIRECT;
676 max_vreg = MAX (var->dreg, max_vreg);
677 }
678 }
679
680 for (bb = cfg->bb_entry; bb; bb = bb->next_bb) {
681 if (!first_bb && bb->code)
682 first_bb = bb;
683 for (ins = bb->code; ins; ins = ins->next) {
684 if (ins->opcode == OP_LDADDR) {
685 MonoInst *var = (MonoInst*)ins->inst_p0;
686 if (var->klass->simd_type) {
687 var->flags |= MONO_INST_INDIRECT;
688 }
689 }
690 }
691 }
692
693 DEBUG (printf ("[simd-simplify] max vreg is %d\n", max_vreg));
694 vreg_flags = (char *)g_malloc0 (max_vreg + 1);
695 target_bb = g_new0 (MonoBasicBlock*, max_vreg + 1);
696
697 for (i = 0; i < cfg->num_varinfo; i++) {
698 MonoInst *var = cfg->varinfo [i];
699 if (var->klass->simd_type && !(var->flags & (MONO_INST_INDIRECT|MONO_INST_VOLATILE))) {
700 vreg_flags [var->dreg] = VREG_USED;
701 DEBUG (printf ("[simd-simplify] processing var %d with vreg %d\n", i, var->dreg));
702 }
703 }
704
705 /*Scan the first basic block looking xzeros not used*/
706 for (ins = first_bb->code; ins; ins = ins->next) {
707 int num_sregs;
708 int sregs [MONO_MAX_SRC_REGS];
709
710 if (ins->opcode == OP_XZERO) {
711 if (!(vreg_flags [ins->dreg] & VREG_HAS_OTHER_OP_BB0)) {
712 DEBUG (printf ("[simd-simplify] R%d has vzero: ", ins->dreg); mono_print_ins(ins));
713 vreg_flags [ins->dreg] |= VREG_HAS_XZERO_BB0;
714 }
715 continue;
716 }
717 if (ins->opcode == OP_LDADDR && apply_vreg_first_block_interference (cfg, ins, ((MonoInst*)ins->inst_p0)->dreg, max_vreg, vreg_flags))
718 continue;
719 if (apply_vreg_first_block_interference (cfg, ins, ins->dreg, max_vreg, vreg_flags))
720 continue;
721 num_sregs = mono_inst_get_src_registers (ins, sregs);
722 for (i = 0; i < num_sregs; ++i) {
723 if (apply_vreg_first_block_interference (cfg, ins, sregs [i], max_vreg, vreg_flags))
724 break;
725 }
726 }
727
728 if (IS_DEBUG_ON (cfg)) {
729 for (i = 0; i < cfg->num_varinfo; i++) {
730 MonoInst *var = cfg->varinfo [i];
731 if (var->klass->simd_type) {
732 if ((vreg_flags [var->dreg] & VREG_HAS_XZERO_BB0))
733 DEBUG (printf ("[simd-simplify] R%d has xzero only\n", var->dreg));
734 if ((vreg_flags [var->dreg] & VREG_HAS_OTHER_OP_BB0))
735 DEBUG (printf ("[simd-simplify] R%d has other ops on bb0\n", var->dreg));
736 }
737 }
738 }
739
740 /*TODO stop here if no var is xzero only*/
741
742 /*
743 Scan all other bb and check if it has only one other use
744 Ideally this would be done after an extended bb formation pass
745
746 FIXME This pass could use dominator information to properly
747 place the XZERO on the bb that dominates all uses of the var,
748 but this will have zero effect with the current local reg alloc
749
750 TODO simply the use of flags.
751 */
752
753 for (bb = first_bb->next_bb; bb; bb = bb->next_bb) {
754 for (ins = bb->code; ins; ins = ins->next) {
755 int num_sregs;
756 int sregs [MONO_MAX_SRC_REGS];
757
758 if (ins->opcode == OP_LDADDR && apply_vreg_following_block_interference (cfg, ins, ((MonoInst*)ins->inst_p0)->dreg, bb, max_vreg, vreg_flags, target_bb))
759 continue;
760 if (apply_vreg_following_block_interference (cfg, ins, ins->dreg, bb, max_vreg, vreg_flags, target_bb))
761 continue;
762 num_sregs = mono_inst_get_src_registers (ins, sregs);
763 for (i = 0; i < num_sregs; ++i) {
764 if (apply_vreg_following_block_interference (cfg, ins, sregs [i], bb,
765 max_vreg, vreg_flags, target_bb))
766 continue;
767 }
768 }
769 }
770
771 for (i = 0; i < cfg->num_varinfo; i++) {
772 MonoInst *var = cfg->varinfo [i];
773 if (!var->klass->simd_type)
774 continue;
775 if ((vreg_flags [var->dreg] & VREG_SINGLE_BB_USE))
776 DEBUG (printf ("[simd-simplify] R%d has single bb use\n", var->dreg));
777 if ((vreg_flags [var->dreg] & VREG_MANY_BB_USE))
778 DEBUG (printf ("[simd-simplify] R%d has many bb in use\n", var->dreg));
779
780 if (!(vreg_flags [var->dreg] & VREG_SINGLE_BB_USE))
781 continue;
782 for (ins = target_bb [var->dreg]->code; ins; ins = ins->next) {
783 int num_sregs, j;
784 int sregs [MONO_MAX_SRC_REGS];
785 gboolean found = FALSE;
786
787 num_sregs = mono_inst_get_src_registers (ins, sregs);
788 for (j = 0; j < num_sregs; ++j) {
789 if (sregs [j] == var->dreg)
790 found = TRUE;
791 }
792 /*We can avoid inserting the XZERO if the first use doesn't depend on the zero'ed value.*/
793 if (ins->dreg == var->dreg && !found) {
794 DEBUG (printf ("[simd-simplify] INGORING R%d on BB %d because first op is a def", i, target_bb [var->dreg]->block_num););
795 break;
796 } else if (found) {
797 DEBUG (printf ("[simd-simplify] Adding XZERO for R%d on BB %d: ", i, target_bb [var->dreg]->block_num); );
798 MonoInst *tmp;
799 MONO_INST_NEW (cfg, tmp, OP_XZERO);
800 tmp->dreg = var->dreg;
801 tmp->type = STACK_VTYPE;
802 tmp->klass = var->klass;
803 mono_bblock_insert_before_ins (target_bb [var->dreg], ins, tmp);
804 break;
805 }
806 }
807 }
808
809 for (ins = first_bb->code; ins; ins = ins->next) {
810 if (ins->opcode == OP_XZERO && (vreg_flags [ins->dreg] & VREG_SINGLE_BB_USE)) {
811 DEBUG (printf ("[simd-simplify] Nullify %d on first BB: ", ins->dreg); mono_print_ins(ins));
812 NULLIFY_INS (ins);
813 }
814 }
815
816 g_free (vreg_flags);
817 g_free (target_bb);
818 }
819
820 /*
821 * This function expect that src be a value.
822 */
823 static int
get_simd_vreg(MonoCompile * cfg,MonoMethod * cmethod,MonoInst * src)824 get_simd_vreg (MonoCompile *cfg, MonoMethod *cmethod, MonoInst *src)
825 {
826 const char *spec = INS_INFO (src->opcode);
827
828 if (src->opcode == OP_XMOVE) {
829 return src->sreg1;
830 } else if (spec [MONO_INST_DEST] == 'x') {
831 return src->dreg;
832 } else if (src->opcode == OP_VCALL || src->opcode == OP_VCALL_MEMBASE) {
833 return src->dreg;
834 }
835
836 g_warning ("get_simd_vreg:: could not infer source simd vreg for op");
837 mono_print_ins (src);
838 g_assert_not_reached ();
839 }
840
841 /*
842 * This function will load the value if needed.
843 */
844 static int
load_simd_vreg_class(MonoCompile * cfg,MonoClass * klass,MonoInst * src,gboolean * indirect)845 load_simd_vreg_class (MonoCompile *cfg, MonoClass *klass, MonoInst *src, gboolean *indirect)
846 {
847 const char *spec = INS_INFO (src->opcode);
848
849 if (indirect)
850 *indirect = FALSE;
851 if (src->opcode == OP_XMOVE) {
852 return src->sreg1;
853 } else if (src->opcode == OP_LDADDR) {
854 int res = ((MonoInst*)src->inst_p0)->dreg;
855 return res;
856 } else if (spec [MONO_INST_DEST] == 'x') {
857 return src->dreg;
858 } else if (src->type == STACK_PTR || src->type == STACK_MP) {
859 MonoInst *ins;
860 if (indirect)
861 *indirect = TRUE;
862
863 MONO_INST_NEW (cfg, ins, OP_LOADX_MEMBASE);
864 ins->klass = klass;
865 ins->sreg1 = src->dreg;
866 ins->type = STACK_VTYPE;
867 ins->dreg = alloc_ireg (cfg);
868 MONO_ADD_INS (cfg->cbb, ins);
869 return ins->dreg;
870 }
871 g_warning ("load_simd_vreg:: could not infer source simd (%d) vreg for op", src->type);
872 mono_print_ins (src);
873 g_assert_not_reached ();
874 }
875
876 static int
load_simd_vreg(MonoCompile * cfg,MonoMethod * cmethod,MonoInst * src,gboolean * indirect)877 load_simd_vreg (MonoCompile *cfg, MonoMethod *cmethod, MonoInst *src, gboolean *indirect)
878 {
879 return load_simd_vreg_class (cfg, cmethod->klass, src, indirect);
880 }
881
882 /*We share the var with fconv_to_r8_x to save some stack space.*/
883 static MonoInst*
get_double_spill_area(MonoCompile * cfg)884 get_double_spill_area (MonoCompile *cfg)
885 {
886 if (!cfg->fconv_to_r8_x_var) {
887 cfg->fconv_to_r8_x_var = mono_compile_create_var (cfg, &mono_defaults.double_class->byval_arg, OP_LOCAL);
888 cfg->fconv_to_r8_x_var->flags |= MONO_INST_VOLATILE; /*FIXME, use the don't regalloc flag*/
889 }
890 return cfg->fconv_to_r8_x_var;
891 }
892 static MonoInst*
get_simd_ctor_spill_area(MonoCompile * cfg,MonoClass * avector_klass)893 get_simd_ctor_spill_area (MonoCompile *cfg, MonoClass *avector_klass)
894 {
895 if (!cfg->simd_ctor_var) {
896 cfg->simd_ctor_var = mono_compile_create_var (cfg, &avector_klass->byval_arg, OP_LOCAL);
897 cfg->simd_ctor_var->flags |= MONO_INST_VOLATILE; /*FIXME, use the don't regalloc flag*/
898 }
899 return cfg->simd_ctor_var;
900 }
901
902 static int
mono_type_to_expand_op(MonoType * type)903 mono_type_to_expand_op (MonoType *type)
904 {
905 switch (type->type) {
906 case MONO_TYPE_I1:
907 case MONO_TYPE_U1:
908 return OP_EXPAND_I1;
909 case MONO_TYPE_I2:
910 case MONO_TYPE_U2:
911 return OP_EXPAND_I2;
912 case MONO_TYPE_I4:
913 case MONO_TYPE_U4:
914 return OP_EXPAND_I4;
915 case MONO_TYPE_I8:
916 case MONO_TYPE_U8:
917 return OP_EXPAND_I8;
918 case MONO_TYPE_R4:
919 return OP_EXPAND_R4;
920 case MONO_TYPE_R8:
921 return OP_EXPAND_R8;
922 default:
923 g_assert_not_reached ();
924 }
925 }
926
927 static int
type_to_comp_op(MonoType * t)928 type_to_comp_op (MonoType *t)
929 {
930 switch (t->type) {
931 case MONO_TYPE_I1:
932 case MONO_TYPE_U1:
933 return OP_PCMPEQB;
934 case MONO_TYPE_I2:
935 case MONO_TYPE_U2:
936 return OP_PCMPEQW;
937 case MONO_TYPE_I4:
938 case MONO_TYPE_U4:
939 return OP_PCMPEQD;
940 case MONO_TYPE_I8:
941 case MONO_TYPE_U8:
942 return OP_PCMPEQQ;
943 case MONO_TYPE_R4:
944 return OP_COMPPS;
945 case MONO_TYPE_R8:
946 return OP_COMPPD;
947 default:
948 g_assert_not_reached ();
949 return -1;
950 }
951 }
952
953 static int
type_to_gt_op(MonoType * t)954 type_to_gt_op (MonoType *t)
955 {
956 switch (t->type) {
957 case MONO_TYPE_I1:
958 return OP_PCMPGTB;
959 case MONO_TYPE_I2:
960 return OP_PCMPGTW;
961 case MONO_TYPE_I4:
962 return OP_PCMPGTD;
963 case MONO_TYPE_I8:
964 return OP_PCMPGTQ;
965 default:
966 return -1;
967 }
968 }
969
970 static int
type_to_padd_op(MonoType * t)971 type_to_padd_op (MonoType *t)
972 {
973 switch (t->type) {
974 case MONO_TYPE_U1:
975 case MONO_TYPE_I1:
976 return OP_PADDB;
977 case MONO_TYPE_U2:
978 case MONO_TYPE_I2:
979 return OP_PADDW;
980 case MONO_TYPE_U4:
981 case MONO_TYPE_I4:
982 return OP_PADDD;
983 case MONO_TYPE_U8:
984 case MONO_TYPE_I8:
985 return OP_PADDQ;
986 case MONO_TYPE_R4:
987 return OP_ADDPS;
988 case MONO_TYPE_R8:
989 return OP_ADDPD;
990 default:
991 break;
992 }
993 return -1;
994 }
995
996 static int
type_to_psub_op(MonoType * t)997 type_to_psub_op (MonoType *t)
998 {
999 switch (t->type) {
1000 case MONO_TYPE_U1:
1001 case MONO_TYPE_I1:
1002 return OP_PSUBB;
1003 case MONO_TYPE_U2:
1004 case MONO_TYPE_I2:
1005 return OP_PSUBW;
1006 case MONO_TYPE_U4:
1007 case MONO_TYPE_I4:
1008 return OP_PSUBD;
1009 case MONO_TYPE_U8:
1010 case MONO_TYPE_I8:
1011 return OP_PSUBQ;
1012 case MONO_TYPE_R4:
1013 return OP_SUBPS;
1014 case MONO_TYPE_R8:
1015 return OP_SUBPD;
1016 default:
1017 break;
1018 }
1019 return -1;
1020 }
1021
1022 static int
type_to_pmul_op(MonoType * t)1023 type_to_pmul_op (MonoType *t)
1024 {
1025 switch (t->type) {
1026 case MONO_TYPE_U2:
1027 case MONO_TYPE_I2:
1028 return OP_PMULW;
1029 case MONO_TYPE_U4:
1030 case MONO_TYPE_I4:
1031 return OP_PMULD;
1032 case MONO_TYPE_R4:
1033 return OP_MULPS;
1034 case MONO_TYPE_R8:
1035 return OP_MULPD;
1036 case MONO_TYPE_U8:
1037 /* PMULQ multiplies two 32 bit numbers into a 64 bit one */
1038 return -1;
1039 case MONO_TYPE_I8:
1040 return -1;
1041 default:
1042 break;
1043 }
1044 return -1;
1045 }
1046
1047 static int
type_to_pdiv_op(MonoType * t)1048 type_to_pdiv_op (MonoType *t)
1049 {
1050 switch (t->type) {
1051 case MONO_TYPE_R4:
1052 return OP_DIVPS;
1053 case MONO_TYPE_R8:
1054 return OP_DIVPD;
1055 default:
1056 break;
1057 }
1058 return -1;
1059 }
1060
1061 static int
type_to_pxor_op(MonoType * t)1062 type_to_pxor_op (MonoType *t)
1063 {
1064 /*
1065 * These opcodes have the same semantics, but using the
1066 * correctly typed version is better for performance.
1067 */
1068 switch (t->type) {
1069 case MONO_TYPE_R4:
1070 return OP_XORPS;
1071 case MONO_TYPE_R8:
1072 return OP_XORPD;
1073 default:
1074 return OP_PXOR;
1075 }
1076 }
1077
1078 static int
type_to_pand_op(MonoType * t)1079 type_to_pand_op (MonoType *t)
1080 {
1081 switch (t->type) {
1082 case MONO_TYPE_R4:
1083 return OP_ANDPS;
1084 case MONO_TYPE_R8:
1085 return OP_ANDPD;
1086 default:
1087 return OP_PAND;
1088 }
1089 }
1090
1091 static int
type_to_por_op(MonoType * t)1092 type_to_por_op (MonoType *t)
1093 {
1094 switch (t->type) {
1095 case MONO_TYPE_R4:
1096 return OP_ORPS;
1097 case MONO_TYPE_R8:
1098 return OP_ORPD;
1099 default:
1100 return OP_POR;
1101 }
1102 }
1103
1104 static int
type_to_pmin_op(MonoType * t)1105 type_to_pmin_op (MonoType *t)
1106 {
1107 switch (t->type) {
1108 case MONO_TYPE_R4:
1109 return OP_MINPS;
1110 case MONO_TYPE_R8:
1111 return OP_MINPD;
1112 case MONO_TYPE_I1:
1113 return OP_PMINB;
1114 case MONO_TYPE_U1:
1115 return OP_PMINB_UN;
1116 case MONO_TYPE_I2:
1117 return OP_PMINW;
1118 case MONO_TYPE_U2:
1119 return OP_PMINW_UN;
1120 case MONO_TYPE_I4:
1121 return OP_PMIND;
1122 case MONO_TYPE_U4:
1123 return OP_PMIND_UN;
1124 default:
1125 return -1;
1126 }
1127 }
1128
1129 static int
type_to_pmax_op(MonoType * t)1130 type_to_pmax_op (MonoType *t)
1131 {
1132 switch (t->type) {
1133 case MONO_TYPE_R4:
1134 return OP_MAXPS;
1135 case MONO_TYPE_R8:
1136 return OP_MAXPD;
1137 case MONO_TYPE_I1:
1138 return OP_PMAXB;
1139 case MONO_TYPE_U1:
1140 return OP_PMAXB_UN;
1141 case MONO_TYPE_I2:
1142 return OP_PMAXW;
1143 case MONO_TYPE_U2:
1144 return OP_PMAXW_UN;
1145 case MONO_TYPE_I4:
1146 return OP_PMAXD;
1147 case MONO_TYPE_U4:
1148 return OP_PMAXD_UN;
1149 default:
1150 return -1;
1151 }
1152 }
1153
1154 static int
get_simd_vreg_or_expanded_scalar(MonoCompile * cfg,MonoClass * klass,MonoType * param_type,MonoInst * src)1155 get_simd_vreg_or_expanded_scalar (MonoCompile *cfg, MonoClass *klass, MonoType *param_type, MonoInst *src)
1156 {
1157 MonoInst *ins;
1158 int expand_op;
1159
1160 if (mono_class_from_mono_type (param_type)->simd_type)
1161 return get_simd_vreg (cfg, NULL, src);
1162
1163 expand_op = mono_type_to_expand_op (param_type);
1164 MONO_INST_NEW (cfg, ins, expand_op);
1165 ins->klass = klass;
1166 ins->sreg1 = src->dreg;
1167 ins->type = STACK_VTYPE;
1168 ins->dreg = alloc_ireg (cfg);
1169 MONO_ADD_INS (cfg->cbb, ins);
1170
1171 if (expand_op == OP_EXPAND_R4)
1172 ins->backend.spill_var = mini_get_int_to_float_spill_area (cfg);
1173 else if (expand_op == OP_EXPAND_R8)
1174 ins->backend.spill_var = get_double_spill_area (cfg);
1175
1176 return ins->dreg;
1177 }
1178
1179 /*
1180 * simd_intrinsic_emit_binary_op:
1181 *
1182 * Emit a binary SIMD opcode.
1183 * @LHS/@RHS are the two arguments, they can be either a SIMD type or a scalar one. Scalar arguments are
1184 * expanded to the SIMD type.
1185 */
1186 static MonoInst*
simd_intrinsic_emit_binary_op(MonoCompile * cfg,int opcode,int flags,MonoClass * klass,MonoType * lhs_type,MonoType * rhs_type,MonoInst * lhs,MonoInst * rhs)1187 simd_intrinsic_emit_binary_op (MonoCompile *cfg, int opcode, int flags, MonoClass *klass, MonoType *lhs_type, MonoType *rhs_type, MonoInst *lhs, MonoInst *rhs)
1188 {
1189 MonoInst* ins;
1190 int left_vreg, right_vreg;
1191
1192 left_vreg = get_simd_vreg_or_expanded_scalar (cfg, klass, lhs_type, lhs);
1193 right_vreg = get_simd_vreg_or_expanded_scalar (cfg, klass, rhs_type, rhs);
1194
1195 MONO_INST_NEW (cfg, ins, opcode);
1196 ins->klass = klass;
1197 ins->sreg1 = left_vreg;
1198 ins->sreg2 = right_vreg;
1199 ins->type = STACK_VTYPE;
1200 ins->dreg = alloc_ireg (cfg);
1201 ins->inst_c0 = flags;
1202 MONO_ADD_INS (cfg->cbb, ins);
1203 return ins;
1204 }
1205
1206 static MonoInst*
simd_intrinsic_emit_binary(const SimdIntrinsic * intrinsic,MonoCompile * cfg,MonoMethod * cmethod,MonoInst ** args)1207 simd_intrinsic_emit_binary (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1208 {
1209 MonoMethodSignature *sig = mono_method_signature (cmethod);
1210
1211 g_assert (sig->param_count == 2);
1212
1213 return simd_intrinsic_emit_binary_op (cfg, intrinsic->opcode, intrinsic->flags, cmethod->klass, sig->params [0], sig->params [1], args [0], args [1]);
1214 }
1215
1216 static MonoInst*
simd_intrinsic_emit_unary(const SimdIntrinsic * intrinsic,MonoCompile * cfg,MonoMethod * cmethod,MonoInst ** args)1217 simd_intrinsic_emit_unary (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1218 {
1219 MonoInst* ins;
1220 int vreg;
1221
1222 vreg = get_simd_vreg (cfg, cmethod, args [0]);
1223
1224 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
1225 ins->klass = cmethod->klass;
1226 ins->sreg1 = vreg;
1227 ins->type = STACK_VTYPE;
1228 ins->dreg = alloc_ireg (cfg);
1229 MONO_ADD_INS (cfg->cbb, ins);
1230 return ins;
1231 }
1232
1233 static int
mono_type_to_extract_op(MonoType * type)1234 mono_type_to_extract_op (MonoType *type)
1235 {
1236 switch (type->type) {
1237 case MONO_TYPE_I1:
1238 return OP_EXTRACT_I1;
1239 case MONO_TYPE_U1:
1240 return OP_EXTRACT_U1;
1241 case MONO_TYPE_I2:
1242 return OP_EXTRACT_I2;
1243 case MONO_TYPE_U2:
1244 return OP_EXTRACT_U2;
1245 case MONO_TYPE_I4:
1246 case MONO_TYPE_U4:
1247 case MONO_TYPE_R4:
1248 return OP_EXTRACT_I4;
1249 default:
1250 g_assert_not_reached ();
1251 }
1252 }
1253
1254 /*Returns the amount to shift the element index to get the dword it belongs to*/
1255 static int
mono_type_elements_shift_bits(MonoType * type)1256 mono_type_elements_shift_bits (MonoType *type)
1257 {
1258 switch (type->type) {
1259 case MONO_TYPE_I1:
1260 case MONO_TYPE_U1:
1261 return 2;
1262 case MONO_TYPE_I2:
1263 case MONO_TYPE_U2:
1264 return 1;
1265 case MONO_TYPE_I4:
1266 case MONO_TYPE_U4:
1267 case MONO_TYPE_R4:
1268 return 0;
1269 default:
1270 g_assert_not_reached ();
1271 }
1272 }
1273
1274 static G_GNUC_UNUSED int
mono_type_to_insert_op(MonoType * type)1275 mono_type_to_insert_op (MonoType *type)
1276 {
1277 switch (type->type) {
1278 case MONO_TYPE_I1:
1279 case MONO_TYPE_U1:
1280 return OP_INSERT_I1;
1281 case MONO_TYPE_I2:
1282 case MONO_TYPE_U2:
1283 return OP_INSERT_I2;
1284 case MONO_TYPE_I4:
1285 case MONO_TYPE_U4:
1286 return OP_INSERT_I4;
1287 case MONO_TYPE_I8:
1288 case MONO_TYPE_U8:
1289 return OP_INSERT_I8;
1290 case MONO_TYPE_R4:
1291 return OP_INSERT_R4;
1292 case MONO_TYPE_R8:
1293 return OP_INSERT_R8;
1294 default:
1295 g_assert_not_reached ();
1296 }
1297 }
1298
1299 static int
mono_type_to_slow_insert_op(MonoType * type)1300 mono_type_to_slow_insert_op (MonoType *type)
1301 {
1302 switch (type->type) {
1303 case MONO_TYPE_I1:
1304 case MONO_TYPE_U1:
1305 return OP_INSERTX_U1_SLOW;
1306 case MONO_TYPE_I2:
1307 case MONO_TYPE_U2:
1308 return OP_INSERT_I2;
1309 case MONO_TYPE_I4:
1310 case MONO_TYPE_U4:
1311 return OP_INSERTX_I4_SLOW;
1312 case MONO_TYPE_I8:
1313 case MONO_TYPE_U8:
1314 return OP_INSERTX_I8_SLOW;
1315 case MONO_TYPE_R4:
1316 return OP_INSERTX_R4_SLOW;
1317 case MONO_TYPE_R8:
1318 return OP_INSERTX_R8_SLOW;
1319 default:
1320 g_assert_not_reached ();
1321 }
1322 }
1323
1324 static MonoInst*
simd_intrinsic_emit_setter(const SimdIntrinsic * intrinsic,MonoCompile * cfg,MonoMethod * cmethod,MonoInst ** args)1325 simd_intrinsic_emit_setter (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1326 {
1327 MonoInst *ins;
1328 MonoMethodSignature *sig = mono_method_signature (cmethod);
1329 int size, align;
1330 gboolean indirect;
1331 int dreg;
1332
1333 size = mono_type_size (sig->params [0], &align);
1334
1335 if (COMPILE_LLVM (cfg)) {
1336 MONO_INST_NEW (cfg, ins, mono_type_to_insert_op (sig->params [0]));
1337 ins->klass = cmethod->klass;
1338 ins->dreg = ins->sreg1 = dreg = load_simd_vreg (cfg, cmethod, args [0], &indirect);
1339 ins->sreg2 = args [1]->dreg;
1340 ins->inst_c0 = intrinsic->opcode;
1341 MONO_ADD_INS (cfg->cbb, ins);
1342 } else if (size == 2 || size == 4 || size == 8) {
1343 MONO_INST_NEW (cfg, ins, mono_type_to_slow_insert_op (sig->params [0]));
1344 ins->klass = cmethod->klass;
1345 /*This is a partial load so we encode the dependency on the previous value by setting dreg and sreg1 to the same value.*/
1346 ins->dreg = ins->sreg1 = dreg = load_simd_vreg (cfg, cmethod, args [0], &indirect);
1347 ins->sreg2 = args [1]->dreg;
1348 ins->inst_c0 = intrinsic->opcode;
1349 if (sig->params [0]->type == MONO_TYPE_R4)
1350 ins->backend.spill_var = mini_get_int_to_float_spill_area (cfg);
1351 else if (sig->params [0]->type == MONO_TYPE_R8)
1352 ins->backend.spill_var = get_double_spill_area (cfg);
1353 MONO_ADD_INS (cfg->cbb, ins);
1354 } else {
1355 int vreg, sreg;
1356
1357 MONO_INST_NEW (cfg, ins, OP_EXTRACTX_U2);
1358 ins->klass = cmethod->klass;
1359 ins->sreg1 = sreg = dreg = load_simd_vreg (cfg, cmethod, args [0], &indirect);
1360 ins->type = STACK_I4;
1361 ins->dreg = vreg = alloc_ireg (cfg);
1362 ins->inst_c0 = intrinsic->opcode / 2;
1363 MONO_ADD_INS (cfg->cbb, ins);
1364
1365 MONO_INST_NEW (cfg, ins, OP_INSERTX_U1_SLOW);
1366 ins->klass = cmethod->klass;
1367 ins->sreg1 = vreg;
1368 ins->sreg2 = args [1]->dreg;
1369 ins->dreg = sreg;
1370 ins->inst_c0 = intrinsic->opcode;
1371 MONO_ADD_INS (cfg->cbb, ins);
1372 }
1373
1374 if (indirect) {
1375 MONO_INST_NEW (cfg, ins, OP_STOREX_MEMBASE);
1376 ins->klass = cmethod->klass;
1377 ins->dreg = args [0]->dreg;
1378 ins->sreg1 = dreg;
1379 MONO_ADD_INS (cfg->cbb, ins);
1380 }
1381 return ins;
1382 }
1383
1384 /*
1385 * simd_intrinsic_emit_getter_op:
1386 *
1387 * Emit IR for loading an element of a SIMD value.
1388 *
1389 * @klass is the simd type, @type is the element type.
1390 */
1391 static MonoInst*
simd_intrinsic_emit_getter_op(MonoCompile * cfg,int index,MonoClass * klass,MonoType * type,MonoInst * arg)1392 simd_intrinsic_emit_getter_op (MonoCompile *cfg, int index, MonoClass *klass, MonoType *type, MonoInst *arg)
1393 {
1394 MonoInst *ins;
1395 int vreg, shift_bits;
1396
1397 vreg = load_simd_vreg_class (cfg, klass, arg, NULL);
1398
1399 if (type->type == MONO_TYPE_I8 || type->type == MONO_TYPE_U8 || type->type == MONO_TYPE_R8) {
1400 MonoInst *ins;
1401 gboolean is_r8 = type->type == MONO_TYPE_R8;
1402
1403 MONO_INST_NEW (cfg, ins, is_r8 ? OP_EXTRACT_R8 : OP_EXTRACT_I8);
1404 ins->klass = klass;
1405 ins->sreg1 = vreg;
1406 ins->inst_c0 = index;
1407 if (is_r8) {
1408 ins->type = STACK_R8;
1409 ins->dreg = alloc_freg (cfg);
1410 ins->backend.spill_var = get_double_spill_area (cfg);
1411 } else {
1412 ins->type = STACK_I8;
1413 ins->dreg = alloc_lreg (cfg);
1414 }
1415 MONO_ADD_INS (cfg->cbb, ins);
1416 return ins;
1417 }
1418
1419 shift_bits = mono_type_elements_shift_bits (type);
1420
1421 if ((index >> shift_bits) && !cfg->compile_llvm) {
1422 MONO_INST_NEW (cfg, ins, OP_PSHUFLED);
1423 ins->klass = klass;
1424 ins->sreg1 = vreg;
1425 ins->inst_c0 = index >> shift_bits;
1426 ins->type = STACK_VTYPE;
1427 ins->dreg = vreg = alloc_ireg (cfg);
1428 MONO_ADD_INS (cfg->cbb, ins);
1429 }
1430
1431 MONO_INST_NEW (cfg, ins, mono_type_to_extract_op (type));
1432 ins->klass = klass;
1433 ins->sreg1 = vreg;
1434 ins->type = STACK_I4;
1435 ins->dreg = vreg = alloc_ireg (cfg);
1436 if (cfg->compile_llvm)
1437 ins->inst_c0 = index;
1438 else
1439 ins->inst_c0 = index & ((1 << shift_bits) - 1);
1440 MONO_ADD_INS (cfg->cbb, ins);
1441
1442 if (type->type == MONO_TYPE_R4) {
1443 MONO_INST_NEW (cfg, ins, cfg->r4fp ? OP_ICONV_TO_R4_RAW : OP_MOVE_I4_TO_F);
1444 ins->klass = mono_defaults.single_class;
1445 ins->sreg1 = vreg;
1446 ins->type = cfg->r4_stack_type;
1447 ins->dreg = alloc_freg (cfg);
1448 ins->backend.spill_var = mini_get_int_to_float_spill_area (cfg);
1449 MONO_ADD_INS (cfg->cbb, ins);
1450 }
1451 return ins;
1452 }
1453
1454 static MonoInst*
simd_intrinsic_emit_getter(const SimdIntrinsic * intrinsic,MonoCompile * cfg,MonoMethod * cmethod,MonoInst ** args)1455 simd_intrinsic_emit_getter (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1456 {
1457 MonoMethodSignature *sig = mono_method_signature (cmethod);
1458
1459 return simd_intrinsic_emit_getter_op (cfg, intrinsic->opcode, cmethod->klass, sig->ret, args [0]);
1460 }
1461
1462 static MonoInst*
simd_intrinsic_emit_long_getter(const SimdIntrinsic * intrinsic,MonoCompile * cfg,MonoMethod * cmethod,MonoInst ** args)1463 simd_intrinsic_emit_long_getter (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1464 {
1465 MonoInst *ins;
1466 int vreg;
1467 gboolean is_r8 = mono_method_signature (cmethod)->ret->type == MONO_TYPE_R8;
1468
1469 vreg = load_simd_vreg (cfg, cmethod, args [0], NULL);
1470
1471 MONO_INST_NEW (cfg, ins, is_r8 ? OP_EXTRACT_R8 : OP_EXTRACT_I8);
1472 ins->klass = cmethod->klass;
1473 ins->sreg1 = vreg;
1474 ins->inst_c0 = intrinsic->opcode;
1475 if (is_r8) {
1476 ins->type = STACK_R8;
1477 ins->dreg = alloc_freg (cfg);
1478 ins->backend.spill_var = get_double_spill_area (cfg);
1479 } else {
1480 ins->type = STACK_I8;
1481 ins->dreg = alloc_lreg (cfg);
1482 }
1483 MONO_ADD_INS (cfg->cbb, ins);
1484
1485 return ins;
1486 }
1487
1488 static MonoInst*
simd_intrinsic_emit_ctor(const SimdIntrinsic * intrinsic,MonoCompile * cfg,MonoMethod * cmethod,MonoInst ** args)1489 simd_intrinsic_emit_ctor (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1490 {
1491 MonoInst *ins = NULL;
1492 int i, addr_reg;
1493 gboolean is_ldaddr = args [0]->opcode == OP_LDADDR;
1494 MonoMethodSignature *sig = mono_method_signature (cmethod);
1495 int store_op = mono_type_to_store_membase (cfg, sig->params [0]);
1496 int arg_size = mono_type_size (sig->params [0], &i);
1497 int opcode;
1498
1499 if (sig->param_count == 1) {
1500 int dreg;
1501
1502 if (is_ldaddr) {
1503 dreg = args [0]->inst_i0->dreg;
1504 NULLIFY_INS (args [0]);
1505 } else {
1506 g_assert (args [0]->type == STACK_MP || args [0]->type == STACK_PTR);
1507 dreg = alloc_ireg (cfg);
1508 }
1509
1510 if (intrinsic)
1511 opcode = intrinsic->opcode;
1512 else
1513 opcode = mono_type_to_expand_op (sig->params [0]);
1514 MONO_INST_NEW (cfg, ins, opcode);
1515 ins->klass = cmethod->klass;
1516 ins->sreg1 = args [1]->dreg;
1517 ins->type = STACK_VTYPE;
1518 ins->dreg = dreg;
1519
1520 MONO_ADD_INS (cfg->cbb, ins);
1521 if (sig->params [0]->type == MONO_TYPE_R4)
1522 ins->backend.spill_var = mini_get_int_to_float_spill_area (cfg);
1523 else if (sig->params [0]->type == MONO_TYPE_R8)
1524 ins->backend.spill_var = get_double_spill_area (cfg);
1525
1526 if (!is_ldaddr) {
1527 MONO_INST_NEW (cfg, ins, OP_STOREX_MEMBASE);
1528 ins->dreg = args [0]->dreg;
1529 ins->sreg1 = dreg;
1530 MONO_ADD_INS (cfg->cbb, ins);
1531 }
1532 return ins;
1533 }
1534
1535 if (is_ldaddr) {
1536 NEW_VARLOADA (cfg, ins, get_simd_ctor_spill_area (cfg, cmethod->klass), &cmethod->klass->byref_arg);
1537 MONO_ADD_INS (cfg->cbb, ins);
1538 addr_reg = ins->dreg;
1539 } else {
1540 g_assert (args [0]->type == STACK_MP || args [0]->type == STACK_PTR);
1541 addr_reg = args [0]->dreg;
1542 }
1543
1544 for (i = sig->param_count - 1; i >= 0; --i) {
1545 EMIT_NEW_STORE_MEMBASE (cfg, ins, store_op, addr_reg, i * arg_size, args [i + 1]->dreg);
1546 }
1547
1548 if (sig->param_count * arg_size < 16) {
1549 /* If there are not enough arguments, fill the rest with 0s */
1550 for (i = sig->param_count; i < 16 / arg_size; ++i) {
1551 switch (arg_size) {
1552 case 4:
1553 MONO_EMIT_NEW_STORE_MEMBASE_IMM (cfg, OP_STOREI4_MEMBASE_IMM, addr_reg, i * arg_size, 0);
1554 break;
1555 default:
1556 g_assert_not_reached ();
1557 break;
1558 }
1559 }
1560 }
1561
1562 if (is_ldaddr) { /*Eliminate LDADDR if it's initing a local var*/
1563 int vreg = ((MonoInst*)args [0]->inst_p0)->dreg;
1564 NULLIFY_INS (args [0]);
1565
1566 MONO_INST_NEW (cfg, ins, OP_LOADX_MEMBASE);
1567 ins->klass = cmethod->klass;
1568 ins->sreg1 = addr_reg;
1569 ins->type = STACK_VTYPE;
1570 ins->dreg = vreg;
1571 MONO_ADD_INS (cfg->cbb, ins);
1572 }
1573 return ins;
1574 }
1575
1576 static MonoInst*
simd_intrinsic_emit_cast(const SimdIntrinsic * intrinsic,MonoCompile * cfg,MonoMethod * cmethod,MonoInst ** args)1577 simd_intrinsic_emit_cast (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1578 {
1579 MonoInst *ins;
1580 MonoClass *klass;
1581 int vreg;
1582
1583 vreg = get_simd_vreg (cfg, cmethod, args [0]);
1584
1585 if (cmethod->is_inflated)
1586 /* Vector<T> */
1587 klass = mono_class_from_mono_type (mono_method_signature (cmethod)->ret);
1588 else
1589 klass = cmethod->klass;
1590
1591 MONO_INST_NEW (cfg, ins, OP_XMOVE);
1592 ins->klass = klass;
1593 ins->type = STACK_VTYPE;
1594 ins->sreg1 = vreg;
1595 ins->dreg = alloc_ireg (cfg);
1596 MONO_ADD_INS (cfg->cbb, ins);
1597 return ins;
1598 }
1599
1600 static MonoInst*
simd_intrinsic_emit_shift(const SimdIntrinsic * intrinsic,MonoCompile * cfg,MonoMethod * cmethod,MonoInst ** args)1601 simd_intrinsic_emit_shift (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1602 {
1603 MonoInst *ins;
1604 int vreg, vreg2 = -1, opcode = intrinsic->opcode;
1605
1606 vreg = get_simd_vreg (cfg, cmethod, args [0]);
1607
1608 if (args [1]->opcode != OP_ICONST) {
1609 MONO_INST_NEW (cfg, ins, OP_ICONV_TO_X);
1610 ins->klass = mono_defaults.int32_class;
1611 ins->sreg1 = args [1]->dreg;
1612 ins->type = STACK_I4;
1613 ins->dreg = vreg2 = alloc_ireg (cfg);
1614 MONO_ADD_INS (cfg->cbb, ins);
1615
1616 ++opcode; /*The shift_reg version op is always +1 from the regular one.*/
1617 }
1618
1619 MONO_INST_NEW (cfg, ins, opcode);
1620 ins->klass = cmethod->klass;
1621 ins->sreg1 = vreg;
1622 ins->sreg2 = vreg2;
1623
1624 if (args [1]->opcode == OP_ICONST) {
1625 ins->inst_imm = args [1]->inst_c0;
1626 NULLIFY_INS (args [1]);
1627 }
1628
1629 ins->type = STACK_VTYPE;
1630 ins->dreg = alloc_ireg (cfg);
1631 MONO_ADD_INS (cfg->cbb, ins);
1632 return ins;
1633 }
1634
1635 static inline gboolean
mono_op_is_packed_compare(int op)1636 mono_op_is_packed_compare (int op)
1637 {
1638 return op >= OP_PCMPEQB && op <= OP_PCMPEQQ;
1639 }
1640
1641 static MonoInst*
simd_intrinsic_emit_equality_op(MonoCompile * cfg,MonoMethod * cmethod,MonoInst ** args,int opcode,int flags)1642 simd_intrinsic_emit_equality_op (MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args, int opcode, int flags)
1643 {
1644 MonoInst* ins;
1645 int left_vreg, right_vreg, tmp_vreg;
1646
1647 left_vreg = load_simd_vreg (cfg, cmethod, args [0], NULL);
1648 right_vreg = get_simd_vreg (cfg, cmethod, args [1]);
1649
1650 MONO_INST_NEW (cfg, ins, opcode);
1651 ins->klass = cmethod->klass;
1652 ins->sreg1 = left_vreg;
1653 ins->sreg2 = right_vreg;
1654 ins->type = STACK_VTYPE;
1655 ins->klass = cmethod->klass;
1656 ins->dreg = tmp_vreg = alloc_ireg (cfg);
1657 ins->inst_c0 = flags;
1658 MONO_ADD_INS (cfg->cbb, ins);
1659
1660 /*FIXME the next ops are SSE specific*/
1661 MONO_INST_NEW (cfg, ins, OP_EXTRACT_MASK);
1662 ins->klass = cmethod->klass;
1663 ins->sreg1 = tmp_vreg;
1664 ins->type = STACK_I4;
1665 ins->dreg = tmp_vreg = alloc_ireg (cfg);
1666 MONO_ADD_INS (cfg->cbb, ins);
1667
1668 /*FP ops have a not equal instruction, which means that we must test the results with OR semantics.*/
1669 if (mono_op_is_packed_compare (opcode) || flags == SIMD_COMP_EQ) {
1670 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, tmp_vreg, 0xFFFF);
1671 NEW_UNALU (cfg, ins, flags == SIMD_COMP_EQ ? OP_CEQ : OP_CLT_UN, tmp_vreg, -1);
1672 } else {
1673 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, tmp_vreg, 0);
1674 NEW_UNALU (cfg, ins, OP_CGT_UN, tmp_vreg, -1);
1675 }
1676 MONO_ADD_INS (cfg->cbb, ins);
1677 return ins;
1678 }
1679
1680 static MonoInst*
simd_intrinsic_emit_equality(const SimdIntrinsic * intrinsic,MonoCompile * cfg,MonoMethod * cmethod,MonoInst ** args)1681 simd_intrinsic_emit_equality (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1682 {
1683 return simd_intrinsic_emit_equality_op (cfg, cmethod, args, intrinsic->opcode, intrinsic->flags);
1684 }
1685
1686 static MonoInst*
simd_intrinsic_emit_shuffle(const SimdIntrinsic * intrinsic,MonoCompile * cfg,MonoMethod * cmethod,MonoInst ** args)1687 simd_intrinsic_emit_shuffle (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1688 {
1689 MonoInst *ins;
1690 int vreg, vreg2 = -1;
1691 int param_count = mono_method_signature (cmethod)->param_count;
1692
1693 if (args [param_count - 1]->opcode != OP_ICONST) {
1694 /*TODO Shuffle with non literals is not yet supported */
1695 return NULL;
1696 }
1697
1698 vreg = get_simd_vreg (cfg, cmethod, args [0]);
1699 if (param_count == 3)
1700 vreg2 = get_simd_vreg (cfg, cmethod, args [1]);
1701
1702 NULLIFY_INS (args [param_count - 1]);
1703
1704
1705 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
1706 ins->klass = cmethod->klass;
1707 ins->sreg1 = vreg;
1708 ins->sreg2 = vreg2;
1709 ins->inst_c0 = args [param_count - 1]->inst_c0;
1710 ins->type = STACK_VTYPE;
1711 ins->dreg = alloc_ireg (cfg);
1712 MONO_ADD_INS (cfg->cbb, ins);
1713
1714 if (param_count == 3 && ins->opcode == OP_PSHUFLED)
1715 ins->opcode = OP_SHUFPS;
1716 return ins;
1717 }
1718
1719 static MonoInst*
simd_intrinsic_emit_load_aligned(const SimdIntrinsic * intrinsic,MonoCompile * cfg,MonoMethod * cmethod,MonoInst ** args)1720 simd_intrinsic_emit_load_aligned (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1721 {
1722 MonoInst *ins;
1723
1724 MONO_INST_NEW (cfg, ins, OP_LOADX_ALIGNED_MEMBASE);
1725 ins->klass = cmethod->klass;
1726 ins->sreg1 = args [0]->dreg;
1727 ins->type = STACK_VTYPE;
1728 ins->dreg = alloc_ireg (cfg);
1729 MONO_ADD_INS (cfg->cbb, ins);
1730 return ins;
1731 }
1732
1733 static MonoInst*
simd_intrinsic_emit_store(const SimdIntrinsic * intrinsic,MonoCompile * cfg,MonoMethod * cmethod,MonoInst ** args)1734 simd_intrinsic_emit_store (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1735 {
1736 MonoInst *ins;
1737 int vreg;
1738
1739 vreg = get_simd_vreg (cfg, cmethod, args [1]);
1740
1741 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
1742 ins->klass = cmethod->klass;
1743 ins->dreg = args [0]->dreg;
1744 ins->sreg1 = vreg;
1745 ins->type = STACK_VTYPE;
1746 MONO_ADD_INS (cfg->cbb, ins);
1747 return ins;
1748 }
1749
1750 static MonoInst*
simd_intrinsic_emit_extract_mask(const SimdIntrinsic * intrinsic,MonoCompile * cfg,MonoMethod * cmethod,MonoInst ** args)1751 simd_intrinsic_emit_extract_mask (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1752 {
1753 MonoInst *ins;
1754 int vreg;
1755
1756 vreg = get_simd_vreg (cfg, cmethod, args [0]);
1757
1758 MONO_INST_NEW (cfg, ins, OP_EXTRACT_MASK);
1759 ins->klass = cmethod->klass;
1760 ins->sreg1 = vreg;
1761 ins->type = STACK_I4;
1762 ins->dreg = alloc_ireg (cfg);
1763 MONO_ADD_INS (cfg->cbb, ins);
1764
1765 return ins;
1766 }
1767
1768 static MonoInst*
simd_intrinsic_emit_prefetch(const SimdIntrinsic * intrinsic,MonoCompile * cfg,MonoMethod * cmethod,MonoInst ** args)1769 simd_intrinsic_emit_prefetch (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1770 {
1771 MonoInst *ins;
1772
1773 MONO_INST_NEW (cfg, ins, OP_PREFETCH_MEMBASE);
1774 ins->klass = cmethod->klass;
1775 ins->sreg1 = args [0]->dreg;
1776 ins->backend.arg_info = intrinsic->flags;
1777 MONO_ADD_INS (cfg->cbb, ins);
1778 return ins;
1779 }
1780
1781 static MonoInst*
simd_intrinsic_emit_const(const SimdIntrinsic * intrinsic,MonoCompile * cfg,MonoMethod * cmethod,MonoInst ** args)1782 simd_intrinsic_emit_const (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1783 {
1784 MonoInst *ins;
1785
1786 MONO_INST_NEW (cfg, ins, intrinsic->opcode);
1787 ins->klass = cmethod->klass;
1788 ins->type = STACK_VTYPE;
1789 ins->dreg = alloc_xreg (cfg);
1790 MONO_ADD_INS (cfg->cbb, ins);
1791 return ins;
1792 }
1793
1794 static const char *
simd_version_name(guint32 version)1795 simd_version_name (guint32 version)
1796 {
1797 switch (version) {
1798 case SIMD_VERSION_SSE1:
1799 return "sse1";
1800 case SIMD_VERSION_SSE2:
1801 return "sse2";
1802 case SIMD_VERSION_SSE3:
1803 return "sse3";
1804 case SIMD_VERSION_SSSE3:
1805 return "ssse3";
1806 case SIMD_VERSION_SSE41:
1807 return "sse41";
1808 case SIMD_VERSION_SSE42:
1809 return "sse42";
1810 case SIMD_VERSION_SSE4a:
1811 return "sse4a";
1812 }
1813 return "n/a";
1814 }
1815
1816 static MonoInst*
emit_intrinsics(MonoCompile * cfg,MonoMethod * cmethod,MonoMethodSignature * fsig,MonoInst ** args,const SimdIntrinsic * intrinsics,guint32 size)1817 emit_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args, const SimdIntrinsic *intrinsics, guint32 size)
1818 {
1819 const SimdIntrinsic *result = (const SimdIntrinsic *)mono_binary_search (cmethod->name, intrinsics, size, sizeof (SimdIntrinsic), &simd_intrinsic_compare_by_name);
1820 if (!result) {
1821 DEBUG (printf ("function doesn't have a simd intrinsic %s::%s/%d\n", cmethod->klass->name, cmethod->name, fsig->param_count));
1822 return NULL;
1823 }
1824 if (IS_DEBUG_ON (cfg)) {
1825 int i, max;
1826 printf ("found call to intrinsic %s::%s/%d -> %s\n", cmethod->klass->name, cmethod->name, fsig->param_count, method_name (result->name));
1827 max = fsig->param_count + fsig->hasthis;
1828 for (i = 0; i < max; ++i) {
1829 printf ("param %d: ", i);
1830 mono_print_ins (args [i]);
1831 }
1832 }
1833 if (result->simd_version_flags && !(result->simd_version_flags & simd_supported_versions)) {
1834 if (IS_DEBUG_ON (cfg)) {
1835 int x;
1836 printf ("function %s::%s/%d requires one of unsuported SIMD instruction set(s): ", cmethod->klass->name, cmethod->name, fsig->param_count);
1837 for (x = 1; x <= SIMD_VERSION_INDEX_END; x++)
1838 if (result->simd_version_flags & (1 << x))
1839 printf ("%s ", simd_version_name (1 << x));
1840
1841 printf ("\n");
1842 }
1843 return NULL;
1844 }
1845
1846 switch (result->simd_emit_mode) {
1847 case SIMD_EMIT_BINARY:
1848 return simd_intrinsic_emit_binary (result, cfg, cmethod, args);
1849 case SIMD_EMIT_UNARY:
1850 return simd_intrinsic_emit_unary (result, cfg, cmethod, args);
1851 case SIMD_EMIT_SETTER:
1852 return simd_intrinsic_emit_setter (result, cfg, cmethod, args);
1853 case SIMD_EMIT_GETTER:
1854 return simd_intrinsic_emit_getter (result, cfg, cmethod, args);
1855 case SIMD_EMIT_GETTER_QWORD:
1856 return simd_intrinsic_emit_long_getter (result, cfg, cmethod, args);
1857 case SIMD_EMIT_CTOR:
1858 return simd_intrinsic_emit_ctor (result, cfg, cmethod, args);
1859 case SIMD_EMIT_CAST:
1860 return simd_intrinsic_emit_cast (result, cfg, cmethod, args);
1861 case SIMD_EMIT_SHUFFLE:
1862 return simd_intrinsic_emit_shuffle (result, cfg, cmethod, args);
1863 case SIMD_EMIT_SHIFT:
1864 return simd_intrinsic_emit_shift (result, cfg, cmethod, args);
1865 case SIMD_EMIT_EQUALITY:
1866 return simd_intrinsic_emit_equality (result, cfg, cmethod, args);
1867 case SIMD_EMIT_LOAD_ALIGNED:
1868 return simd_intrinsic_emit_load_aligned (result, cfg, cmethod, args);
1869 case SIMD_EMIT_STORE:
1870 return simd_intrinsic_emit_store (result, cfg, cmethod, args);
1871 case SIMD_EMIT_EXTRACT_MASK:
1872 return simd_intrinsic_emit_extract_mask (result, cfg, cmethod, args);
1873 case SIMD_EMIT_PREFETCH:
1874 return simd_intrinsic_emit_prefetch (result, cfg, cmethod, args);
1875 }
1876 g_assert_not_reached ();
1877 }
1878
1879 static int
mono_emit_vector_ldelema(MonoCompile * cfg,MonoType * array_type,MonoInst * arr,MonoInst * index,gboolean check_bounds)1880 mono_emit_vector_ldelema (MonoCompile *cfg, MonoType *array_type, MonoInst *arr, MonoInst *index, gboolean check_bounds)
1881 {
1882 MonoInst *ins;
1883 guint32 size;
1884 int mult_reg, add_reg, array_reg, index_reg, index2_reg, index3_reg;
1885
1886 size = mono_array_element_size (mono_class_from_mono_type (array_type));
1887 mult_reg = alloc_preg (cfg);
1888 array_reg = arr->dreg;
1889 index_reg = index->dreg;
1890
1891 #if SIZEOF_VOID_P == 8
1892 /* The array reg is 64 bits but the index reg is only 32 */
1893 index2_reg = alloc_preg (cfg);
1894 MONO_EMIT_NEW_UNALU (cfg, OP_SEXT_I4, index2_reg, index_reg);
1895 #else
1896 index2_reg = index_reg;
1897 #endif
1898 index3_reg = alloc_preg (cfg);
1899
1900 if (check_bounds) {
1901 MONO_EMIT_BOUNDS_CHECK (cfg, array_reg, MonoArray, max_length, index2_reg);
1902 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_PADD_IMM, index3_reg, index2_reg, 16 / size - 1);
1903 MONO_EMIT_BOUNDS_CHECK (cfg, array_reg, MonoArray, max_length, index3_reg);
1904 }
1905
1906 add_reg = alloc_preg (cfg);
1907
1908 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_MUL_IMM, mult_reg, index2_reg, size);
1909 MONO_EMIT_NEW_BIALU (cfg, OP_PADD, add_reg, array_reg, mult_reg);
1910 NEW_BIALU_IMM (cfg, ins, OP_PADD_IMM, add_reg, add_reg, MONO_STRUCT_OFFSET (MonoArray, vector));
1911 ins->type = STACK_PTR;
1912 MONO_ADD_INS (cfg->cbb, ins);
1913
1914 return add_reg;
1915 }
1916
1917 static MonoInst*
emit_array_extension_intrinsics(MonoCompile * cfg,MonoMethod * cmethod,MonoMethodSignature * fsig,MonoInst ** args)1918 emit_array_extension_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
1919 {
1920 if ((!strcmp ("GetVector", cmethod->name) || !strcmp ("GetVectorAligned", cmethod->name)) && fsig->param_count == 2) {
1921 MonoInst *load;
1922 int addr = mono_emit_vector_ldelema (cfg, fsig->params [0], args [0], args [1], TRUE);
1923
1924 MONO_INST_NEW (cfg, load, !strcmp ("GetVectorAligned", cmethod->name) ? OP_LOADX_ALIGNED_MEMBASE : OP_LOADX_MEMBASE );
1925 load->klass = cmethod->klass;
1926 load->sreg1 = addr;
1927 load->type = STACK_VTYPE;
1928 load->dreg = alloc_ireg (cfg);
1929 MONO_ADD_INS (cfg->cbb, load);
1930
1931 return load;
1932 }
1933 if ((!strcmp ("SetVector", cmethod->name) || !strcmp ("SetVectorAligned", cmethod->name)) && fsig->param_count == 3) {
1934 MonoInst *store;
1935 int vreg = get_simd_vreg (cfg, cmethod, args [1]);
1936 int addr = mono_emit_vector_ldelema (cfg, fsig->params [0], args [0], args [2], TRUE);
1937
1938 MONO_INST_NEW (cfg, store, !strcmp ("SetVectorAligned", cmethod->name) ? OP_STOREX_ALIGNED_MEMBASE_REG : OP_STOREX_MEMBASE);
1939 store->klass = cmethod->klass;
1940 store->dreg = addr;
1941 store->sreg1 = vreg;
1942 MONO_ADD_INS (cfg->cbb, store);
1943
1944 return store;
1945 }
1946 if (!strcmp ("IsAligned", cmethod->name) && fsig->param_count == 2) {
1947 MonoInst *ins;
1948 int addr = mono_emit_vector_ldelema (cfg, fsig->params [0], args [0], args [1], FALSE);
1949
1950 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_AND_IMM, addr, addr, 15);
1951 MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, addr, 0);
1952 NEW_UNALU (cfg, ins, OP_CEQ, addr, -1);
1953 MONO_ADD_INS (cfg->cbb, ins);
1954
1955 return ins;
1956 }
1957 return NULL;
1958 }
1959
1960 static MonoInst*
emit_simd_runtime_intrinsics(MonoCompile * cfg,MonoMethod * cmethod,MonoMethodSignature * fsig,MonoInst ** args)1961 emit_simd_runtime_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
1962 {
1963 if (!strcmp ("get_AccelMode", cmethod->name) && fsig->param_count == 0) {
1964 MonoInst *ins;
1965 EMIT_NEW_ICONST (cfg, ins, simd_supported_versions);
1966 return ins;
1967 }
1968 return NULL;
1969 }
1970
1971 static gboolean
is_sys_numerics_assembly(MonoAssembly * assembly)1972 is_sys_numerics_assembly (MonoAssembly *assembly)
1973 {
1974 return !strcmp ("System.Numerics", assembly->aname.name);
1975 }
1976
1977 static gboolean
is_sys_numerics_vectors_assembly(MonoAssembly * assembly)1978 is_sys_numerics_vectors_assembly (MonoAssembly *assembly)
1979 {
1980 return !strcmp ("System.Numerics.Vectors", assembly->aname.name);
1981 }
1982
1983 MonoInst*
mono_emit_simd_intrinsics(MonoCompile * cfg,MonoMethod * cmethod,MonoMethodSignature * fsig,MonoInst ** args)1984 mono_emit_simd_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
1985 {
1986 const char *class_name;
1987
1988 if (is_sys_numerics_assembly (cmethod->klass->image->assembly))
1989 return emit_sys_numerics_intrinsics (cfg, cmethod, fsig, args);
1990
1991 if (is_sys_numerics_vectors_assembly (cmethod->klass->image->assembly))
1992 return emit_sys_numerics_vectors_intrinsics (cfg, cmethod, fsig, args);
1993
1994 if (strcmp ("Mono.Simd", cmethod->klass->image->assembly->aname.name) ||
1995 strcmp ("Mono.Simd", cmethod->klass->name_space))
1996 return NULL;
1997
1998 class_name = cmethod->klass->name;
1999 if (!strcmp ("SimdRuntime", class_name))
2000 return emit_simd_runtime_intrinsics (cfg, cmethod, fsig, args);
2001
2002 if (!strcmp ("ArrayExtensions", class_name))
2003 return emit_array_extension_intrinsics (cfg, cmethod, fsig, args);
2004
2005 if (!strcmp ("VectorOperations", class_name)) {
2006 if (!(cmethod->flags & METHOD_ATTRIBUTE_STATIC))
2007 return NULL;
2008 class_name = mono_class_from_mono_type (mono_method_signature (cmethod)->params [0])->name;
2009 } else if (!cmethod->klass->simd_type)
2010 return NULL;
2011
2012 cfg->uses_simd_intrinsics = 1;
2013 if (!strcmp ("Vector2d", class_name))
2014 return emit_intrinsics (cfg, cmethod, fsig, args, vector2d_intrinsics, sizeof (vector2d_intrinsics) / sizeof (SimdIntrinsic));
2015 if (!strcmp ("Vector4f", class_name))
2016 return emit_intrinsics (cfg, cmethod, fsig, args, vector4f_intrinsics, sizeof (vector4f_intrinsics) / sizeof (SimdIntrinsic));
2017 if (!strcmp ("Vector2ul", class_name))
2018 return emit_intrinsics (cfg, cmethod, fsig, args, vector2ul_intrinsics, sizeof (vector2ul_intrinsics) / sizeof (SimdIntrinsic));
2019 if (!strcmp ("Vector2l", class_name))
2020 return emit_intrinsics (cfg, cmethod, fsig, args, vector2l_intrinsics, sizeof (vector2l_intrinsics) / sizeof (SimdIntrinsic));
2021 if (!strcmp ("Vector4ui", class_name))
2022 return emit_intrinsics (cfg, cmethod, fsig, args, vector4ui_intrinsics, sizeof (vector4ui_intrinsics) / sizeof (SimdIntrinsic));
2023 if (!strcmp ("Vector4i", class_name))
2024 return emit_intrinsics (cfg, cmethod, fsig, args, vector4i_intrinsics, sizeof (vector4i_intrinsics) / sizeof (SimdIntrinsic));
2025 if (!strcmp ("Vector8us", class_name))
2026 return emit_intrinsics (cfg, cmethod, fsig, args, vector8us_intrinsics, sizeof (vector8us_intrinsics) / sizeof (SimdIntrinsic));
2027 if (!strcmp ("Vector8s", class_name))
2028 return emit_intrinsics (cfg, cmethod, fsig, args, vector8s_intrinsics, sizeof (vector8s_intrinsics) / sizeof (SimdIntrinsic));
2029 if (!strcmp ("Vector16b", class_name))
2030 return emit_intrinsics (cfg, cmethod, fsig, args, vector16b_intrinsics, sizeof (vector16b_intrinsics) / sizeof (SimdIntrinsic));
2031 if (!strcmp ("Vector16sb", class_name))
2032 return emit_intrinsics (cfg, cmethod, fsig, args, vector16sb_intrinsics, sizeof (vector16sb_intrinsics) / sizeof (SimdIntrinsic));
2033
2034 return NULL;
2035 }
2036
2037 static void
assert_handled(MonoCompile * cfg,MonoMethod * method)2038 assert_handled (MonoCompile *cfg, MonoMethod *method)
2039 {
2040 MonoCustomAttrInfo *cattr;
2041 MonoError error;
2042
2043 if (cfg->verbose_level > 1) {
2044 cattr = mono_custom_attrs_from_method_checked (method, &error);
2045
2046 if (cattr) {
2047 gboolean has_attr = FALSE;
2048 for (int i = 0; i < cattr->num_attrs; ++i)
2049 if (cattr->attrs [i].ctor && (!strcmp (cattr->attrs [i].ctor->klass->name, "JitIntrinsicAttribute")))
2050 has_attr = TRUE;
2051 if (has_attr) {
2052 printf ("SIMD intrinsic unhandled: %s\n", mono_method_get_name_full (method, TRUE, TRUE, MONO_TYPE_NAME_FORMAT_IL));
2053 fflush (stdout);
2054 //g_assert_not_reached ();
2055 }
2056 mono_custom_attrs_free (cattr);
2057 }
2058 }
2059 }
2060
// Intrinsics of System.Numerics.Vector2/Vector3/Vector4.
// The entries should be ordered by name: emit_vector_intrinsics () looks
// methods up in this table with mono_binary_search ().
// That function dispatches on the name itself rather than on the emit mode,
// so entries such as SN_Abs need nothing but the name.
static const SimdIntrinsic vector2_intrinsics[] = {
	{ SN_ctor, OP_EXPAND_R4 },
	{ SN_Abs },
	{ SN_Dot, OP_DPPS },
	{ SN_Equals, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
	{ SN_Max, OP_MAXPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_Min, OP_MINPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_SquareRoot, OP_SQRTPS, SIMD_VERSION_SSE1, SIMD_EMIT_UNARY },
	{ SN_op_Addition, OP_ADDPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_Division, OP_DIVPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_Multiply, OP_MULPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_Subtraction, OP_SUBPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
};
2076
2077 static MonoInst*
emit_vector_intrinsics(MonoCompile * cfg,MonoMethod * cmethod,MonoMethodSignature * fsig,MonoInst ** args)2078 emit_vector_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
2079 {
2080 const SimdIntrinsic *intrins;
2081 MonoMethodSignature *sig = mono_method_signature (cmethod);
2082 MonoType *type = &cmethod->klass->byval_arg;
2083
2084 /*
2085 * Vector2/3/4 are handled the same way, since the underlying SIMD type is the same (4 * r4).
2086 */
2087 intrins = (const SimdIntrinsic*)mono_binary_search (cmethod->name, vector2_intrinsics, sizeof (vector2_intrinsics) / sizeof (SimdIntrinsic), sizeof (SimdIntrinsic), &simd_intrinsic_compare_by_name);
2088 if (!intrins) {
2089 assert_handled (cfg, cmethod);
2090 return NULL;
2091 }
2092
2093 if (cfg->verbose_level > 1) {
2094 char *name = mono_method_full_name (cmethod, TRUE);
2095 printf (" SIMD intrinsic %s\n", name);
2096 g_free (name);
2097 }
2098
2099 switch (intrins->name) {
2100 case SN_ctor: {
2101 gboolean match = TRUE;
2102 for (int i = 0; i < fsig->param_count; ++i)
2103 if (fsig->params [i]->type != MONO_TYPE_R4)
2104 match = FALSE;
2105 if (!match)
2106 break;
2107 return simd_intrinsic_emit_ctor (intrins, cfg, cmethod, args);
2108 }
2109 case SN_Equals:
2110 if (!(fsig->param_count == 1 && fsig->ret->type == MONO_TYPE_BOOLEAN && fsig->params [0] == type))
2111 break;
2112 return simd_intrinsic_emit_equality (intrins, cfg, cmethod, args);
2113 case SN_SquareRoot:
2114 if (!(fsig->param_count == 1 && fsig->ret == type && fsig->params [0] == type))
2115 break;
2116 return simd_intrinsic_emit_unary (intrins, cfg, cmethod, args);
2117 case SN_Dot:
2118 if (!(fsig->param_count == 2 && fsig->ret->type == MONO_TYPE_R4 && fsig->params [0] == type && fsig->params [1] == type))
2119 break;
2120 if (COMPILE_LLVM (cfg)) {
2121 MonoInst *ins;
2122
2123 ins = simd_intrinsic_emit_binary (intrins, cfg, cmethod, args);
2124 /* The end result is in the lowest element */
2125 return simd_intrinsic_emit_getter_op (cfg, 0, cmethod->klass, mono_method_signature (cmethod)->ret, ins);
2126 }
2127 break;
2128 case SN_Abs: {
2129 // abs(x) = max(x, sub(0,x))
2130 MonoInst *sub;
2131 MonoInst *zero;
2132
2133 if (!(fsig->param_count == 1 && fsig->ret == type && fsig->params [0] == type))
2134 break;
2135
2136 MONO_INST_NEW (cfg, zero, OP_XZERO);
2137 zero->dreg = alloc_xreg (cfg);
2138 zero->klass = cmethod->klass;
2139 MONO_ADD_INS (cfg->cbb, zero);
2140
2141 sub = simd_intrinsic_emit_binary_op (cfg, OP_SUBPS, 0, cmethod->klass, sig->params [0], sig->params [0], zero, args [0]);
2142 return simd_intrinsic_emit_binary_op (cfg, OP_MAXPS, 0, cmethod->klass, sig->params [0], sig->params [0], args [0], sub);
2143 }
2144 case SN_Max:
2145 case SN_Min:
2146 case SN_op_Addition:
2147 case SN_op_Division:
2148 case SN_op_Multiply:
2149 case SN_op_Subtraction:
2150 if (!(fsig->param_count == 2 && fsig->ret == type && (fsig->params [0] == type || fsig->params [0]->type == MONO_TYPE_R4) && (fsig->params [1] == type || fsig->params [1]->type == MONO_TYPE_R4)))
2151 break;
2152 return simd_intrinsic_emit_binary (intrins, cfg, cmethod, args);
2153 default:
2154 break;
2155 }
2156
2157 assert_handled (cfg, cmethod);
2158
2159 if (cfg->verbose_level > 1) {
2160 char *name = mono_method_full_name (cmethod, TRUE);
2161 printf (" SIMD method %s not handled.\n", name);
2162 g_free (name);
2163 }
2164 return NULL;
2165 }
2166
2167 static MonoInst*
emit_vector_is_hardware_accelerated_intrinsic(MonoCompile * cfg)2168 emit_vector_is_hardware_accelerated_intrinsic (MonoCompile *cfg)
2169 {
2170 MonoInst *ins;
2171
2172 if (simd_supported_versions)
2173 EMIT_NEW_ICONST (cfg, ins, 1);
2174 else
2175 EMIT_NEW_ICONST (cfg, ins, 0);
2176 ins->type = STACK_I4;
2177 return ins;
2178 }
2179
/*
 * Intrinsics of Vector<T>.
 * These should be ordered by name: emit_vector_t_intrinsics () looks methods
 * up in this table with mono_binary_search ().
 * Only get_AllOnes and get_Zero carry an opcode here, the rest of the entries
 * consist of just the name.
 */
static const SimdIntrinsic vector_t_intrinsics[] = {
	{ SN_ctor },
	{ SN_Abs },
	{ SN_CopyTo },
	{ SN_Equals },
	{ SN_GreaterThan },
	{ SN_GreaterThanOrEqual },
	{ SN_LessThan },
	{ SN_LessThanOrEqual },
	{ SN_Max },
	{ SN_Min },
	{ SN_get_AllOnes, OP_XONES },
	{ SN_get_Count },
	{ SN_get_Item },
	{ SN_get_Zero, OP_XZERO },
	{ SN_op_Addition },
	{ SN_op_BitwiseAnd },
	{ SN_op_BitwiseOr },
	{ SN_op_Division },
	{ SN_op_ExclusiveOr },
	{ SN_op_Explicit },
	{ SN_op_Multiply },
	{ SN_op_Subtraction }
};
2205
2206 static MonoInst*
emit_vector_t_intrinsics(MonoCompile * cfg,MonoMethod * cmethod,MonoMethodSignature * fsig,MonoInst ** args)2207 emit_vector_t_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
2208 {
2209 const SimdIntrinsic *intrins;
2210 MonoType *type, *etype;
2211 MonoInst *ins;
2212 int size, len, index;
2213
2214 intrins = (const SimdIntrinsic*)mono_binary_search (cmethod->name, vector_t_intrinsics, sizeof (vector_t_intrinsics) / sizeof (SimdIntrinsic), sizeof (SimdIntrinsic), &simd_intrinsic_compare_by_name);
2215 if (!intrins) {
2216 assert_handled (cfg, cmethod);
2217 return NULL;
2218 }
2219
2220 type = &cmethod->klass->byval_arg;
2221 etype = mono_class_get_context (cmethod->klass)->class_inst->type_argv [0];
2222 size = mono_class_value_size (mono_class_from_mono_type (etype), NULL);
2223 g_assert (size);
2224 len = 16 / size;
2225
2226 if (!MONO_TYPE_IS_PRIMITIVE (etype))
2227 return NULL;
2228
2229 if (cfg->verbose_level > 1) {
2230 char *name = mono_method_full_name (cmethod, TRUE);
2231 printf (" SIMD intrinsic %s\n", name);
2232 g_free (name);
2233 }
2234
2235 switch (intrins->name) {
2236 case SN_get_Count:
2237 if (!(fsig->param_count == 0 && fsig->ret->type == MONO_TYPE_I4))
2238 break;
2239 EMIT_NEW_ICONST (cfg, ins, len);
2240 return ins;
2241 case SN_get_AllOnes:
2242 case SN_get_Zero:
2243 if (!(fsig->param_count == 0 && mono_metadata_type_equal (fsig->ret, type)))
2244 break;
2245 return simd_intrinsic_emit_const (intrins, cfg, cmethod, args);
2246 case SN_get_Item:
2247 g_assert (fsig->param_count == 1);
2248 if (args [1]->opcode != OP_ICONST)
2249 return NULL;
2250 index = args [1]->inst_c0;
2251 if (index < 0 || index >= len)
2252 return NULL;
2253 return simd_intrinsic_emit_getter_op (cfg, index, cmethod->klass, etype, args [0]);
2254 case SN_ctor:
2255 if (fsig->param_count == 1 && mono_metadata_type_equal (fsig->params [0], etype))
2256 return simd_intrinsic_emit_ctor (NULL, cfg, cmethod, args);
2257 if ((fsig->param_count == 1 || fsig->param_count == 2) && (fsig->params [0]->type == MONO_TYPE_SZARRAY)) {
2258 MonoInst *array_ins = args [1];
2259 MonoInst *index_ins;
2260 MonoInst *ldelema_ins;
2261 MonoInst *var;
2262 int end_index_reg;
2263
2264 if (args [0]->opcode != OP_LDADDR)
2265 return NULL;
2266
2267 /* .ctor (T[]) or .ctor (T[], index) */
2268
2269 if (fsig->param_count == 2) {
2270 index_ins = args [2];
2271 } else {
2272 EMIT_NEW_ICONST (cfg, index_ins, 0);
2273 }
2274
2275 /* Emit index check for the end (index + len - 1 < array length) */
2276 end_index_reg = alloc_ireg (cfg);
2277 EMIT_NEW_BIALU_IMM (cfg, ins, OP_IADD_IMM, end_index_reg, index_ins->dreg, len - 1);
2278 MONO_EMIT_BOUNDS_CHECK (cfg, array_ins->dreg, MonoArray, max_length, end_index_reg);
2279
2280 /* Load the array slice into the simd reg */
2281 ldelema_ins = mini_emit_ldelema_1_ins (cfg, mono_class_from_mono_type (etype), array_ins, index_ins, TRUE);
2282 g_assert (args [0]->opcode == OP_LDADDR);
2283 var = args [0]->inst_p0;
2284 EMIT_NEW_LOAD_MEMBASE (cfg, ins, OP_LOADX_MEMBASE, var->dreg, ldelema_ins->dreg, 0);
2285 ins->klass = cmethod->klass;
2286 return args [0];
2287 }
2288 break;
2289 case SN_op_Explicit:
2290 return simd_intrinsic_emit_cast (intrins, cfg, cmethod, args);
2291 case SN_Equals:
2292 if (fsig->param_count == 1 && fsig->ret->type == MONO_TYPE_BOOLEAN && mono_metadata_type_equal (fsig->params [0], type))
2293 return simd_intrinsic_emit_equality_op (cfg, cmethod, args, type_to_comp_op (etype), SIMD_COMP_EQ);
2294 if (fsig->param_count == 2 && mono_metadata_type_equal (fsig->ret, type) && mono_metadata_type_equal (fsig->params [0], type) && mono_metadata_type_equal (fsig->params [1], type))
2295 return simd_intrinsic_emit_binary_op (cfg, type_to_comp_op (etype), 0, cmethod->klass, fsig->params [0], fsig->params [1], args [0], args [1]);
2296 break;
2297
2298 case SN_GreaterThan:
2299 case SN_GreaterThanOrEqual:
2300 case SN_LessThan:
2301 case SN_LessThanOrEqual: {
2302 MonoInst *cmp1, *cmp2;
2303 int eq_op, gt_op;
2304
2305 switch (etype->type) {
2306 case MONO_TYPE_I1:
2307 case MONO_TYPE_I2:
2308 case MONO_TYPE_I4:
2309 case MONO_TYPE_I8:
2310 break;
2311 default:
2312 return NULL;
2313 }
2314
2315 eq_op = type_to_comp_op (etype);
2316 gt_op = type_to_gt_op (etype);
2317
2318 switch (intrins->name) {
2319 case SN_GreaterThan:
2320 return simd_intrinsic_emit_binary_op (cfg, gt_op, 0, cmethod->klass, fsig->params [0], fsig->params [1], args [0], args [1]);
2321 case SN_LessThan:
2322 return simd_intrinsic_emit_binary_op (cfg, gt_op, 0, cmethod->klass, fsig->params [0], fsig->params [1], args [1], args [0]);
2323 case SN_LessThanOrEqual:
2324 cmp1 = simd_intrinsic_emit_binary_op (cfg, eq_op, 0, cmethod->klass, fsig->params [0], fsig->params [1], args [1], args [0]);
2325 cmp2 = simd_intrinsic_emit_binary_op (cfg, gt_op, 0, cmethod->klass, fsig->params [0], fsig->params [1], args [1], args [0]);
2326 return simd_intrinsic_emit_binary_op (cfg, OP_POR, 0, cmethod->klass, fsig->params [0], fsig->params [1], cmp1, cmp2);
2327 case SN_GreaterThanOrEqual:
2328 cmp1 = simd_intrinsic_emit_binary_op (cfg, eq_op, 0, cmethod->klass, fsig->params [0], fsig->params [1], args [0], args [1]);
2329 cmp2 = simd_intrinsic_emit_binary_op (cfg, gt_op, 0, cmethod->klass, fsig->params [0], fsig->params [1], args [0], args [1]);
2330 return simd_intrinsic_emit_binary_op (cfg, OP_POR, 0, cmethod->klass, fsig->params [0], fsig->params [1], cmp1, cmp2);
2331 default:
2332 g_assert_not_reached ();
2333 break;
2334 }
2335 }
2336 case SN_Abs:
2337 /* Vector<T>.Abs */
2338 switch (etype->type) {
2339 case MONO_TYPE_U1:
2340 case MONO_TYPE_U2:
2341 case MONO_TYPE_U4:
2342 case MONO_TYPE_U8: {
2343 MonoInst *ins;
2344
2345 /* No-op */
2346 MONO_INST_NEW (cfg, ins, OP_XMOVE);
2347 ins->klass = cmethod->klass;
2348 ins->type = STACK_VTYPE;
2349 ins->sreg1 = args [0]->dreg;
2350 ins->dreg = alloc_xreg (cfg);
2351 MONO_ADD_INS (cfg->cbb, ins);
2352 return ins;
2353 }
2354 default:
2355 break;
2356 }
2357 break;
2358 case SN_op_Addition:
2359 case SN_op_Subtraction:
2360 case SN_op_Multiply:
2361 case SN_op_Division:
2362 case SN_op_ExclusiveOr:
2363 case SN_op_BitwiseAnd:
2364 case SN_op_BitwiseOr:
2365 case SN_Max:
2366 case SN_Min: {
2367 if (!(fsig->param_count == 2 && mono_metadata_type_equal (fsig->ret, fsig->params [0]) && mono_metadata_type_equal (fsig->params [0], fsig->params [1])))
2368 break;
2369 int op = 0;
2370 switch (intrins->name) {
2371 case SN_op_Addition:
2372 op = type_to_padd_op (etype);
2373 break;
2374 case SN_op_Subtraction:
2375 op = type_to_psub_op (etype);
2376 break;
2377 case SN_op_Multiply:
2378 op = type_to_pmul_op (etype);
2379 break;
2380 case SN_op_Division:
2381 op = type_to_pdiv_op (etype);
2382 break;
2383 case SN_op_ExclusiveOr:
2384 op = type_to_pxor_op (etype);
2385 break;
2386 case SN_op_BitwiseAnd:
2387 op = type_to_pand_op (etype);
2388 break;
2389 case SN_op_BitwiseOr:
2390 op = type_to_por_op (etype);
2391 break;
2392 case SN_Min:
2393 op = type_to_pmin_op (etype);
2394 break;
2395 case SN_Max:
2396 op = type_to_pmax_op (etype);
2397 break;
2398 default:
2399 g_assert_not_reached ();
2400 }
2401 if (op != -1)
2402 return simd_intrinsic_emit_binary_op (cfg, op, 0, cmethod->klass, fsig->params [0], fsig->params [0], args [0], args [1]);
2403 break;
2404 }
2405 case SN_CopyTo: {
2406 MonoInst *array_ins = args [1];
2407 MonoInst *index_ins = args [2];
2408 MonoInst *ldelema_ins;
2409 MonoInst *var;
2410 int end_index_reg;
2411
2412 if (args [0]->opcode != OP_LDADDR)
2413 return NULL;
2414
2415 /* Emit index check for the end (index + len - 1 < array length) */
2416 end_index_reg = alloc_ireg (cfg);
2417 EMIT_NEW_BIALU_IMM (cfg, ins, OP_IADD_IMM, end_index_reg, index_ins->dreg, len - 1);
2418
2419 int length_reg = alloc_ireg (cfg);
2420 MONO_EMIT_NEW_LOAD_MEMBASE_OP_FAULT (cfg, OP_LOADI4_MEMBASE, length_reg, array_ins->dreg, MONO_STRUCT_OFFSET (MonoArray, max_length));
2421 MONO_EMIT_NEW_BIALU (cfg, OP_COMPARE, -1, length_reg, end_index_reg);
2422 MONO_EMIT_NEW_COND_EXC (cfg, LE_UN, "ArgumentException");
2423
2424 /* Load the simd reg into the array slice */
2425 ldelema_ins = mini_emit_ldelema_1_ins (cfg, mono_class_from_mono_type (etype), array_ins, index_ins, TRUE);
2426 g_assert (args [0]->opcode == OP_LDADDR);
2427 var = args [0]->inst_p0;
2428 EMIT_NEW_STORE_MEMBASE (cfg, ins, OP_STOREX_MEMBASE, ldelema_ins->dreg, 0, var->dreg);
2429 ins->klass = cmethod->klass;
2430 return args [0];
2431 break;
2432 }
2433 default:
2434 break;
2435 }
2436
2437 assert_handled (cfg, cmethod);
2438
2439 if (cfg->verbose_level > 1) {
2440 char *name = mono_method_full_name (cmethod, TRUE);
2441 printf (" SIMD method %s not handled.\n", name);
2442 g_free (name);
2443 }
2444
2445 return NULL;
2446 }
2447
2448 /*
2449 * emit_sys_numerics_intrinsics:
2450 *
2451 * Emit intrinsics for the System.Numerics assembly.
2452 */
2453 static MonoInst*
emit_sys_numerics_intrinsics(MonoCompile * cfg,MonoMethod * cmethod,MonoMethodSignature * fsig,MonoInst ** args)2454 emit_sys_numerics_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
2455 {
2456 const char *nspace = cmethod->klass->name_space;
2457 const char *class_name = cmethod->klass->name;
2458
2459 if (!strcmp ("Vector2", class_name) || !strcmp ("Vector4", class_name) || !strcmp ("Vector3", class_name))
2460 return emit_vector_intrinsics (cfg, cmethod, fsig, args);
2461
2462 if (!strcmp ("System.Numerics", nspace) && !strcmp ("Vector", class_name)) {
2463 if (!strcmp (cmethod->name, "get_IsHardwareAccelerated"))
2464 return emit_vector_is_hardware_accelerated_intrinsic (cfg);
2465 }
2466
2467 return NULL;
2468 }
2469
2470 static MonoInst*
emit_sys_numerics_vectors_intrinsics(MonoCompile * cfg,MonoMethod * cmethod,MonoMethodSignature * fsig,MonoInst ** args)2471 emit_sys_numerics_vectors_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
2472 {
2473 const char *nspace = cmethod->klass->name_space;
2474 const char *class_name = cmethod->klass->name;
2475
2476 if (!strcmp (class_name, "Vector`1"))
2477 return emit_vector_t_intrinsics (cfg, cmethod, fsig, args);
2478
2479 if (!strcmp ("System.Numerics", nspace) && !strcmp ("Vector", class_name)) {
2480 if (!strcmp (cmethod->name, "get_IsHardwareAccelerated"))
2481 return emit_vector_is_hardware_accelerated_intrinsic (cfg);
2482 }
2483
2484 return NULL;
2485 }
2486
2487 MonoInst*
mono_emit_simd_field_load(MonoCompile * cfg,MonoClassField * field,MonoInst * addr)2488 mono_emit_simd_field_load (MonoCompile *cfg, MonoClassField *field, MonoInst *addr)
2489 {
2490 if (is_sys_numerics_assembly (field->parent->image->assembly)) {
2491 int index = -1;
2492
2493 if (!strcmp (field->parent->name, "Vector2") ||
2494 !strcmp (field->parent->name, "Vector3") ||
2495 !strcmp (field->parent->name, "Vector4")) {
2496 if (!strcmp (field->name, "X"))
2497 index = 0;
2498 else if (!strcmp (field->name, "Y"))
2499 index = 1;
2500 else if (!strcmp (field->name, "Z"))
2501 index = 2;
2502 else if (!strcmp (field->name, "W"))
2503 index = 3;
2504 }
2505
2506 if (index != -1) {
2507 if (cfg->verbose_level > 1)
2508 printf (" SIMD intrinsic field access: %s\n", field->name);
2509
2510 return simd_intrinsic_emit_getter_op (cfg, index, field->parent, mono_field_get_type (field), addr);
2511 }
2512 }
2513 return NULL;
2514 }
2515
2516 #endif /* DISABLE_JIT */
2517
2518 #else
2519
2520 MonoInst*
mono_emit_simd_field_load(MonoCompile * cfg,MonoClassField * field,MonoInst * addr)2521 mono_emit_simd_field_load (MonoCompile *cfg, MonoClassField *field, MonoInst *addr)
2522 {
2523 return NULL;
2524 }
2525
2526 #endif /* MONO_ARCH_SIMD_INTRINSICS */
2527