1 /**
2  * \file
3  * simd support for intrinsics
4  *
5  * Author:
6  *   Rodrigo Kumpera (rkumpera@novell.com)
7  *
8  * (C) 2008 Novell, Inc.
9  */
10 
11 #include <config.h>
12 #include <stdio.h>
13 
14 #include "mini.h"
15 #include "ir-emit.h"
16 #include "mono/utils/bsearch.h"
17 #include <mono/metadata/abi-details.h>
18 #include <mono/metadata/reflection-internals.h>
19 
20 /*
21 General notes on SIMD intrinsics
22 
23 TODO handle operands with non SIMD args, such as op_Addition (Vector4f, float)
24 TODO optimize r4const in .ctor so it doesn't go into the FP stack first
25 TODO extend op_to_op_dest_membase to handle simd ops
26 TODO add support for indexed versions of simd ops
TODO do an amd64 port and figure out how to properly handle extractors/.ctor
28 TODO make sure locals, arguments and spills are properly aligned.
29 TODO add support for fusing a XMOVE into a simd op in mono_spill_global_vars.
30 TODO add stuff to man pages
31 TODO document this under /docs
32 TODO make passing a xmm as argument not cause it to be LDADDR'ed (introduce an OP_XPUSH)
33 TODO revamp the .ctor sequence as it looks very fragile, maybe use a var just like move_i4_to_f. (or just pinst sse ops)
34 TODO figure out what's wrong with OP_STOREX_MEMBASE_REG and OP_STOREX_MEMBASE (the 2nd is for imm operands)
35 TODO maybe add SSE3 emulation on top of SSE2, or just implement the corresponding functions using SSE2 intrinsics.
36 TODO pass simd arguments in registers or, at least, add SSE support for pushing large (>=16) valuetypes
37 TODO pass simd args byval to a non-intrinsic method cause some useless local var load/store to happen.
38 TODO check if we need to init the SSE control word with better precision.
39 TODO add support for 3 reg sources in mini without slowing the common path. Or find a way to make MASKMOVDQU work.
40 TODO make SimdRuntime.get_AccelMode work under AOT
TODO patterns such as "a ^= b" generate slower code as the LDADDR op will be copied to a tmp first. Look at adding an indirection reduction pass after the dce pass.
TODO extend bounds checking code to support range checking.
43 
44 General notes for SIMD intrinsics.
45 
46 -Bad extractor and constructor performance
47 Extracting a float from a XMM is a complete disaster if you are passing it as an argument.
48 It will be loaded in the FP stack just to be pushed on the call stack.
49 
A similar thing happens with the Vector4f constructor, which requires float vars to be loaded in the FP stack first.
51 
52 The fix for this issue is similar to the one required for r4const as method args. Avoiding the
53 trip to the FP stack is desirable.
54 
55 -Extractor and constructor code doesn't make sense under amd64. Both currently assume separate banks
56 for simd and fp.
57 
58 
59 -Promote OP_EXTRACT_I4 to a STORE op
60 The advantage of this change is that it could have a _membase version and promote further optimizations.
61 
62 -Create a MONO_INST_DONT_REGALLOC and use it in all places that MONO_INST_INDIRECT is used
63 without a OP_LDADDR.
64 */
65 
66 #if defined (MONO_ARCH_SIMD_INTRINSICS)
67 
68 #if defined (DISABLE_JIT)
69 
/*
 * mono_simd_intrinsics_init:
 *
 *   Variant compiled when DISABLE_JIT is defined: there is nothing to
 * set up, so the function is intentionally empty.
 */
void
mono_simd_intrinsics_init (void)
{
}
74 
75 #else
76 
/* Alternative definition that compiles all SIMD debug output out: */
//#define IS_DEBUG_ON(cfg) (0)

/* True when the verbose_level of the given compile is 3 or higher. */
#define IS_DEBUG_ON(cfg) ((cfg)->verbose_level >= 3)
/*
 * Runs the statement(s) `a` only when debugging is on.
 * Note: the macro refers to a variable literally named `cfg`, so one
 * has to be in scope wherever DEBUG is used.
 */
#define DEBUG(a) do { if (IS_DEBUG_ON(cfg)) { a; } } while (0)
/*
 * Emit modes: the value stored in the simd_emit_mode member of a
 * SimdIntrinsic row. That member is a 4-bit field, so at most 16
 * enumerators fit here (14 are currently defined, values 0..13).
 */
enum {
	SIMD_EMIT_BINARY,
	SIMD_EMIT_UNARY,
	SIMD_EMIT_SETTER,
	SIMD_EMIT_GETTER,
	SIMD_EMIT_GETTER_QWORD,
	SIMD_EMIT_CTOR,
	SIMD_EMIT_CAST,
	SIMD_EMIT_SHUFFLE,
	SIMD_EMIT_SHIFT,
	SIMD_EMIT_EQUALITY,
	SIMD_EMIT_LOAD_ALIGNED,
	SIMD_EMIT_STORE,
	SIMD_EMIT_EXTRACT_MASK,
	SIMD_EMIT_PREFETCH
};
97 
98 #ifdef HAVE_ARRAY_ELEM_INIT
99 #define MSGSTRFIELD(line) MSGSTRFIELD1(line)
100 #define MSGSTRFIELD1(line) str##line
101 static const struct msgstr_t {
102 #define SIMD_METHOD(str,name) char MSGSTRFIELD(__LINE__) [sizeof (str)];
103 #include "simd-methods.h"
104 #undef SIMD_METHOD
105 } method_names = {
106 #define SIMD_METHOD(str,name) str,
107 #include "simd-methods.h"
108 #undef SIMD_METHOD
109 };
110 
111 enum {
112 #define SIMD_METHOD(str,name) name = offsetof (struct msgstr_t, MSGSTRFIELD(__LINE__)),
113 #include "simd-methods.h"
114 };
115 #define method_name(idx) ((const char*)&method_names + (idx))
116 
117 #else
118 #define SIMD_METHOD(str,name) str,
119 static const char * const method_names [] = {
120 #include "simd-methods.h"
121 	NULL
122 };
123 #undef SIMD_METHOD
124 #define SIMD_METHOD(str,name) name,
125 enum {
126 #include "simd-methods.h"
127 	SN_LAST
128 };
129 
130 #define method_name(idx) (method_names [(idx)])
131 
132 #endif
133 
134 typedef struct {
135 	guint16 name;
136 	guint16 opcode;
137 	guint8 simd_version_flags;
138 	guint8 simd_emit_mode : 4;
139 	guint8 flags : 4;
140 } SimdIntrinsic;
141 
142 static const SimdIntrinsic vector4f_intrinsics[] = {
143 	{ SN_ctor, OP_EXPAND_R4, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
144 	{ SN_AddSub, OP_ADDSUBPS, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY},
145 	{ SN_AndNot, OP_ANDNPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY},
146 	{ SN_CompareEqual, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_EQ },
147 	{ SN_CompareLessEqual, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_LE },
148 	{ SN_CompareLessThan, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_LT },
149 	{ SN_CompareNotEqual, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NEQ },
150 	{ SN_CompareNotLessEqual, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NLE },
151 	{ SN_CompareNotLessThan, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NLT },
152 	{ SN_CompareOrdered, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_ORD },
153 	{ SN_CompareUnordered, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_UNORD },
154 	{ SN_ConvertToDouble, OP_CVTPS2PD, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
155 	{ SN_ConvertToInt, OP_CVTPS2DQ, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
156 	{ SN_ConvertToIntTruncated, OP_CVTTPS2DQ, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
157 	{ SN_DuplicateHigh, OP_DUPPS_HIGH, SIMD_VERSION_SSE3, SIMD_EMIT_UNARY },
158 	{ SN_DuplicateLow, OP_DUPPS_LOW, SIMD_VERSION_SSE3, SIMD_EMIT_UNARY },
159 	{ SN_HorizontalAdd, OP_HADDPS, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY },
160 	{ SN_HorizontalSub, OP_HSUBPS, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY },
161 	{ SN_InterleaveHigh, OP_UNPACK_HIGHPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
162 	{ SN_InterleaveLow, OP_UNPACK_LOWPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
163 	{ SN_InvSqrt, OP_RSQRTPS, SIMD_VERSION_SSE1, SIMD_EMIT_UNARY },
164 	{ SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
165 	{ SN_Max, OP_MAXPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
166 	{ SN_Min, OP_MINPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
167 	{ SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
168 	{ SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
169 	{ SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
170 	{ SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
171 	{ SN_Reciprocal, OP_RCPPS, SIMD_VERSION_SSE1, SIMD_EMIT_UNARY },
172 	{ SN_Shuffle, OP_PSHUFLED, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
173 	{ SN_Sqrt, OP_SQRTPS, SIMD_VERSION_SSE1, SIMD_EMIT_UNARY },
174 	{ SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
175 	{ SN_StoreNonTemporal, OP_STOREX_NTA_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
176 	{ SN_get_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
177 	{ SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
178 	{ SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
179 	{ SN_get_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
180 	{ SN_op_Addition, OP_ADDPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
181 	{ SN_op_BitwiseAnd, OP_ANDPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
182 	{ SN_op_BitwiseOr, OP_ORPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
183 	{ SN_op_Division, OP_DIVPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
184 	{ SN_op_Equality, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
185 	{ SN_op_ExclusiveOr, OP_XORPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
186 	{ SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
187 	{ SN_op_Inequality, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
188 	{ SN_op_Multiply, OP_MULPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
189 	{ SN_op_Subtraction, OP_SUBPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
190 	{ SN_set_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
191 	{ SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
192 	{ SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
193 	{ SN_set_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER }
194 };
195 
196 static const SimdIntrinsic vector2d_intrinsics[] = {
197 	{ SN_ctor, OP_EXPAND_R8, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
198 	{ SN_AddSub, OP_ADDSUBPD, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY,},
199 	{ SN_AndNot, OP_ANDNPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
200 	{ SN_CompareEqual, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_EQ },
201 	{ SN_CompareLessEqual, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_LE },
202 	{ SN_CompareLessThan, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_LT },
203 	{ SN_CompareNotEqual, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NEQ },
204 	{ SN_CompareNotLessEqual, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NLE },
205 	{ SN_CompareNotLessThan, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_NLT },
206 	{ SN_CompareOrdered, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_ORD },
207 	{ SN_CompareUnordered, OP_COMPPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_COMP_UNORD },
208 	{ SN_ConvertToFloat, OP_CVTPD2PS, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
209 	{ SN_ConvertToInt, OP_CVTPD2DQ, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
210 	{ SN_ConvertToIntTruncated, OP_CVTTPD2DQ, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
211 	{ SN_Duplicate, OP_DUPPD, SIMD_VERSION_SSE3, SIMD_EMIT_UNARY },
212 	{ SN_HorizontalAdd, OP_HADDPD, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY },
213 	{ SN_HorizontalSub, OP_HSUBPD, SIMD_VERSION_SSE3, SIMD_EMIT_BINARY },
214 	{ SN_InterleaveHigh, OP_UNPACK_HIGHPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
215 	{ SN_InterleaveLow, OP_UNPACK_LOWPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
216 	{ SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
217 	{ SN_Max, OP_MAXPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
218 	{ SN_Min, OP_MINPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
219 	{ SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
220 	{ SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
221 	{ SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
222 	{ SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
223 	{ SN_Shuffle, OP_SHUFPD, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
224 	{ SN_Sqrt, OP_SQRTPD, SIMD_VERSION_SSE1, SIMD_EMIT_UNARY },
225 	{ SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
226 	{ SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
227 	{ SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
228 	{ SN_op_Addition, OP_ADDPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
229 	{ SN_op_BitwiseAnd, OP_ANDPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
230 	{ SN_op_BitwiseOr, OP_ORPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
231 	{ SN_op_Division, OP_DIVPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
232 	{ SN_op_ExclusiveOr, OP_XORPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
233 	{ SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
234 	{ SN_op_Multiply, OP_MULPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
235 	{ SN_op_Subtraction, OP_SUBPD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
236 	{ SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
237 	{ SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
238 };
239 
240 static const SimdIntrinsic vector2ul_intrinsics[] = {
241 	{ SN_ctor, OP_EXPAND_I8, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
242 	{ SN_CompareEqual, OP_PCMPEQQ, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
243 	{ SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
244 	{ SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
245 	{ SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
246 	{ SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
247 	{ SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
248 	{ SN_Shuffle, OP_SHUFPD, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
249 	{ SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
250 	{ SN_UnpackHigh, OP_UNPACK_HIGHQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
251 	{ SN_UnpackLow, OP_UNPACK_LOWQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
252 	{ SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
253 	{ SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
254 	{ SN_op_Addition, OP_PADDQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
255 	{ SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
256 	{ SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
257 	{ SN_op_ExclusiveOr, OP_PXOR, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1 },
258 	{ SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
259 	{ SN_op_LeftShift, OP_PSHLQ, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
260 	{ SN_op_Multiply, OP_PMULQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
261 	{ SN_op_RightShift, OP_PSHRQ, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
262 	{ SN_op_Subtraction, OP_PSUBQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
263 	{ SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
264 	{ SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
265 };
266 
267 static const SimdIntrinsic vector2l_intrinsics[] = {
268 	{ SN_ctor, OP_EXPAND_I8, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
269 	{ SN_CompareEqual, OP_PCMPEQQ, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
270 	{ SN_CompareGreaterThan, OP_PCMPGTQ, SIMD_VERSION_SSE42, SIMD_EMIT_BINARY },
271 	{ SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
272 	{ SN_LogicalRightShift, OP_PSHRQ, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
273 	{ SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
274 	{ SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
275 	{ SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
276 	{ SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
277 	{ SN_Shuffle, OP_SHUFPD, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
278 	{ SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
279 	{ SN_UnpackHigh, OP_UNPACK_HIGHQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
280 	{ SN_UnpackLow, OP_UNPACK_LOWQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
281 	{ SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
282 	{ SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER_QWORD },
283 	{ SN_op_Addition, OP_PADDQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
284 	{ SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
285 	{ SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
286 	{ SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
287 	{ SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
288 	{ SN_op_LeftShift, OP_PSHLQ, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
289 	{ SN_op_Multiply, OP_PMULQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
290 	{ SN_op_Subtraction, OP_PSUBQ, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
291 	{ SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
292 	{ SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
293 };
294 
295 static const SimdIntrinsic vector4ui_intrinsics[] = {
296 	{ SN_ctor, OP_EXPAND_I4, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
297 	{ SN_ArithmeticRightShift, OP_PSARD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
298 	{ SN_CompareEqual, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
299 	{ SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
300 	{ SN_Max, OP_PMAXD_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
301 	{ SN_Min, OP_PMIND_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
302 	{ SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
303 	{ SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
304 	{ SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
305 	{ SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
306 	{ SN_Shuffle, OP_PSHUFLED, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
307 	{ SN_SignedPackWithSignedSaturation, OP_PACKD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
308 	{ SN_SignedPackWithUnsignedSaturation, OP_PACKD_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
309 	{ SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
310 	{ SN_UnpackHigh, OP_UNPACK_HIGHD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
311 	{ SN_UnpackLow, OP_UNPACK_LOWD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
312 	{ SN_get_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
313 	{ SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
314 	{ SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
315 	{ SN_get_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
316 	{ SN_op_Addition, OP_PADDD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
317 	{ SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
318 	{ SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
319 	{ SN_op_Equality, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
320 	{ SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
321 	{ SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
322 	{ SN_op_Inequality, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
323 	{ SN_op_LeftShift, OP_PSHLD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
324 	{ SN_op_Multiply, OP_PMULD, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
325 	{ SN_op_RightShift, OP_PSHRD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
326 	{ SN_op_Subtraction, OP_PSUBD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
327 	{ SN_set_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
328 	{ SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
329 	{ SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
330 	{ SN_set_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
331 };
332 
333 static const SimdIntrinsic vector4i_intrinsics[] = {
334 	{ SN_ctor, OP_EXPAND_I4, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
335 	{ SN_CompareEqual, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
336 	{ SN_CompareGreaterThan, OP_PCMPGTD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
337 	{ SN_ConvertToDouble, OP_CVTDQ2PD, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
338 	{ SN_ConvertToFloat, OP_CVTDQ2PS, SIMD_VERSION_SSE2, SIMD_EMIT_UNARY },
339 	{ SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
340 	{ SN_LogicalRightShift, OP_PSHRD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
341 	{ SN_Max, OP_PMAXD, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
342 	{ SN_Min, OP_PMIND, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
343 	{ SN_PackWithSignedSaturation, OP_PACKD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
344 	{ SN_PackWithUnsignedSaturation, OP_PACKD_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
345 	{ SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
346 	{ SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
347 	{ SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
348 	{ SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
349 	{ SN_Shuffle, OP_PSHUFLED, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
350 	{ SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
351 	{ SN_UnpackHigh, OP_UNPACK_HIGHD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
352 	{ SN_UnpackLow, OP_UNPACK_LOWD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
353 	{ SN_get_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
354 	{ SN_get_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
355 	{ SN_get_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
356 	{ SN_get_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
357 	{ SN_op_Addition, OP_PADDD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
358 	{ SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
359 	{ SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
360 	{ SN_op_Equality, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
361 	{ SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
362 	{ SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
363 	{ SN_op_Inequality, OP_PCMPEQD, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
364 	{ SN_op_LeftShift, OP_PSHLD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
365 	{ SN_op_Multiply, OP_PMULD, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
366 	{ SN_op_RightShift, OP_PSARD, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
367 	{ SN_op_Subtraction, OP_PSUBD, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
368 	{ SN_set_W, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
369 	{ SN_set_X, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
370 	{ SN_set_Y, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
371 	{ SN_set_Z, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
372 };
373 
374 static const SimdIntrinsic vector8us_intrinsics[] = {
375 	{ SN_ctor, OP_EXPAND_I2, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
376 	{ SN_AddWithSaturation, OP_PADDW_SAT_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
377 	{ SN_ArithmeticRightShift, OP_PSARW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
378 	{ SN_Average, OP_PAVGW_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
379 	{ SN_CompareEqual, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY, SIMD_VERSION_SSE1 },
380 	{ SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
381 	{ SN_Max, OP_PMAXW_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
382 	{ SN_Min, OP_PMINW_UN, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
383 	{ SN_MultiplyStoreHigh, OP_PMULW_HIGH_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
384 	{ SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
385 	{ SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
386 	{ SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
387 	{ SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
388 	{ SN_ShuffleHigh, OP_PSHUFLEW_HIGH, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
389 	{ SN_ShuffleLow, OP_PSHUFLEW_LOW, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
390 	{ SN_SignedPackWithSignedSaturation, OP_PACKW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
391 	{ SN_SignedPackWithUnsignedSaturation, OP_PACKW_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
392 	{ SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
393 	{ SN_SubtractWithSaturation, OP_PSUBW_SAT_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
394 	{ SN_UnpackHigh, OP_UNPACK_HIGHW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
395 	{ SN_UnpackLow, OP_UNPACK_LOWW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
396 	{ SN_get_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
397 	{ SN_get_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
398 	{ SN_get_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
399 	{ SN_get_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
400 	{ SN_get_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
401 	{ SN_get_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
402 	{ SN_get_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
403 	{ SN_get_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
404 	{ SN_op_Addition, OP_PADDW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
405 	{ SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
406 	{ SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
407 	{ SN_op_Equality, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
408 	{ SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
409 	{ SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
410 	{ SN_op_Inequality, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
411 	{ SN_op_LeftShift, OP_PSHLW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
412 	{ SN_op_Multiply, OP_PMULW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
413 	{ SN_op_RightShift, OP_PSHRW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
414 	{ SN_op_Subtraction, OP_PSUBW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
415 	{ SN_set_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
416 	{ SN_set_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
417 	{ SN_set_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
418 	{ SN_set_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
419 	{ SN_set_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
420 	{ SN_set_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
421 	{ SN_set_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
422 	{ SN_set_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
423 };
424 
425 static const SimdIntrinsic vector8s_intrinsics[] = {
426 	{ SN_ctor, OP_EXPAND_I2, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
427 	{ SN_AddWithSaturation, OP_PADDW_SAT, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
428 	{ SN_CompareEqual, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
429 	{ SN_CompareGreaterThan, OP_PCMPGTW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
430 	{ SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
431 	{ SN_LogicalRightShift, OP_PSHRW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
432 	{ SN_Max, OP_PMAXW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
433 	{ SN_Min, OP_PMINW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
434 	{ SN_MultiplyStoreHigh, OP_PMULW_HIGH, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
435 	{ SN_PackWithSignedSaturation, OP_PACKW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
436 	{ SN_PackWithUnsignedSaturation, OP_PACKW_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
437 	{ SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
438 	{ SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
439 	{ SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
440 	{ SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
441 	{ SN_ShuffleHigh, OP_PSHUFLEW_HIGH, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
442 	{ SN_ShuffleLow, OP_PSHUFLEW_LOW, SIMD_VERSION_SSE1, SIMD_EMIT_SHUFFLE },
443 	{ SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
444 	{ SN_SubtractWithSaturation, OP_PSUBW_SAT_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
445 	{ SN_UnpackHigh, OP_UNPACK_HIGHW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
446 	{ SN_UnpackLow, OP_UNPACK_LOWW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
447 	{ SN_get_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
448 	{ SN_get_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
449 	{ SN_get_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
450 	{ SN_get_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
451 	{ SN_get_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
452 	{ SN_get_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
453 	{ SN_get_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
454 	{ SN_get_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
455 	{ SN_op_Addition, OP_PADDW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
456 	{ SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
457 	{ SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
458 	{ SN_op_Equality, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
459 	{ SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
460 	{ SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
461 	{ SN_op_Inequality, OP_PCMPEQW, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
462 	{ SN_op_LeftShift, OP_PSHLW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
463 	{ SN_op_Multiply, OP_PMULW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
464 	{ SN_op_RightShift, OP_PSARW, SIMD_VERSION_SSE1, SIMD_EMIT_SHIFT },
465 	{ SN_op_Subtraction, OP_PSUBW, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
466 	{ SN_set_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
467 	{ SN_set_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
468 	{ SN_set_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
469 	{ SN_set_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
470 	{ SN_set_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
471 	{ SN_set_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
472 	{ SN_set_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
473 	{ SN_set_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
474 };
475 
476 static const SimdIntrinsic vector16b_intrinsics[] = {
477 	{ SN_ctor, OP_EXPAND_I1, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
478 	{ SN_AddWithSaturation, OP_PADDB_SAT_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
479 	{ SN_Average, OP_PAVGB_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
480 	{ SN_CompareEqual, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
481 	{ SN_ExtractByteMask, 0, SIMD_VERSION_SSE1, SIMD_EMIT_EXTRACT_MASK },
482 	{ SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
483 	{ SN_Max, OP_PMAXB_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
484 	{ SN_Min, OP_PMINB_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
485 	{ SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
486 	{ SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
487 	{ SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
488 	{ SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
489 	{ SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
490 	{ SN_SubtractWithSaturation, OP_PSUBB_SAT_UN, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
491 	{ SN_SumOfAbsoluteDifferences, OP_PSUM_ABS_DIFF, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
492 	{ SN_UnpackHigh, OP_UNPACK_HIGHB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
493 	{ SN_UnpackLow, OP_UNPACK_LOWB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
494 	{ SN_get_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
495 	{ SN_get_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
496 	{ SN_get_V10, 10, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
497 	{ SN_get_V11, 11, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
498 	{ SN_get_V12, 12, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
499 	{ SN_get_V13, 13, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
500 	{ SN_get_V14, 14, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
501 	{ SN_get_V15, 15, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
502 	{ SN_get_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
503 	{ SN_get_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
504 	{ SN_get_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
505 	{ SN_get_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
506 	{ SN_get_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
507 	{ SN_get_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
508 	{ SN_get_V8, 8, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
509 	{ SN_get_V9, 9, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
510 	{ SN_op_Addition, OP_PADDB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
511 	{ SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
512 	{ SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
513 	{ SN_op_Equality, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
514 	{ SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
515 	{ SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
516 	{ SN_op_Inequality, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
517 	{ SN_op_Subtraction, OP_PSUBB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
518 	{ SN_set_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
519 	{ SN_set_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
520 	{ SN_set_V10, 10, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
521 	{ SN_set_V11, 11, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
522 	{ SN_set_V12, 12, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
523 	{ SN_set_V13, 13, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
524 	{ SN_set_V14, 14, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
525 	{ SN_set_V15, 15, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
526 	{ SN_set_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
527 	{ SN_set_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
528 	{ SN_set_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
529 	{ SN_set_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
530 	{ SN_set_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
531 	{ SN_set_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
532 	{ SN_set_V8, 8, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
533 	{ SN_set_V9, 9, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
534 };
535 
/*
 * Intrinsics table `vector16sb_intrinsics` (presumably backing the signed
 * 16 x 8-bit Mono.Simd vector type -- confirm against the lookup code).
 *
 * Rows appear to be { method name id, opcode, required SIMD version,
 * emit mode [, flags] }; the SimdIntrinsic definition is outside this section.
 * For SIMD_EMIT_GETTER / SIMD_EMIT_SETTER rows the opcode slot holds the
 * element index (0..15) rather than an opcode: the getter/setter emitters read
 * it back through intrinsic->opcode.
 *
 * NOTE(review): this comment used to read "Missing: setters", but the
 * set_V0..set_V15 rows are present below, so that note was stale.
 * NOTE(review): do not reorder rows without checking how the table is
 * searched (see simd_intrinsic_compare_by_name).
 */
static const SimdIntrinsic vector16sb_intrinsics[] = {
	{ SN_ctor, OP_EXPAND_I1, SIMD_VERSION_SSE1, SIMD_EMIT_CTOR },
	{ SN_AddWithSaturation, OP_PADDB_SAT, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_CompareEqual, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_CompareGreaterThan, OP_PCMPGTB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_ExtractByteMask, 0, SIMD_VERSION_SSE1, SIMD_EMIT_EXTRACT_MASK },
	{ SN_LoadAligned, 0, SIMD_VERSION_SSE1, SIMD_EMIT_LOAD_ALIGNED },
	{ SN_Max, OP_PMAXB, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
	{ SN_Min, OP_PMINB, SIMD_VERSION_SSE41, SIMD_EMIT_BINARY },
	{ SN_PrefetchTemporalAllCacheLevels, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_0 },
	{ SN_PrefetchTemporal1stLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_1 },
	{ SN_PrefetchTemporal2ndLevelCache, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_2 },
	{ SN_PrefetchNonTemporal, 0, SIMD_VERSION_SSE1, SIMD_EMIT_PREFETCH, SIMD_PREFETCH_MODE_NTA },
	{ SN_StoreAligned, OP_STOREX_ALIGNED_MEMBASE_REG, SIMD_VERSION_SSE1, SIMD_EMIT_STORE },
	{ SN_SubtractWithSaturation, OP_PSUBB_SAT, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_UnpackHigh, OP_UNPACK_HIGHB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_UnpackLow, OP_UNPACK_LOWB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	/* Element getters: the second field is the element index. */
	{ SN_get_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V10, 10, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V11, 11, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V12, 12, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V13, 13, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V14, 14, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V15, 15, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V8, 8, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_get_V9, 9, SIMD_VERSION_SSE1, SIMD_EMIT_GETTER },
	{ SN_op_Addition, OP_PADDB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseAnd, OP_PAND, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_BitwiseOr, OP_POR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_Equality, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
	{ SN_op_ExclusiveOr, OP_PXOR, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_Explicit, 0, SIMD_VERSION_SSE1, SIMD_EMIT_CAST },
	{ SN_op_Inequality, OP_PCMPEQB, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_NEQ },
	{ SN_op_Subtraction, OP_PSUBB, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	/* Element setters: the second field is the element index. */
	{ SN_set_V0, 0, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V1, 1, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V10, 10, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V11, 11, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V12, 12, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V13, 13, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V14, 14, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V15, 15, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V2, 2, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V3, 3, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V4, 4, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V5, 5, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V6, 6, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V7, 7, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V8, 8, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
	{ SN_set_V9, 9, SIMD_VERSION_SSE1, SIMD_EMIT_SETTER },
};
598 
/*
 * Value returned by mono_arch_cpu_enumerate_simd_versions (), cached by
 * mono_simd_intrinsics_init ().
 */
static guint32 simd_supported_versions;

/* Forward declarations; the definitions are outside this section of the file. */
static MonoInst* emit_sys_numerics_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args);
static MonoInst* emit_sys_numerics_vectors_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args);
603 
604 /*TODO match using number of parameters as well*/
605 static int
simd_intrinsic_compare_by_name(const void * key,const void * value)606 simd_intrinsic_compare_by_name (const void *key, const void *value)
607 {
608 	return strcmp (key, method_name (((SimdIntrinsic *)value)->name));
609 }
610 
/*
 * Per-vreg state bits used by mono_simd_simplify_indirection ().
 * They are stored in a char array indexed by vreg.
 */
typedef enum {
	/* The vreg belongs to a SIMD var that is neither indirect nor volatile. */
	VREG_USED = 1 << 0,
	/* An OP_XZERO of the vreg was seen in the first basic block. */
	VREG_HAS_XZERO_BB0 = 1 << 1,
	/* Some other instruction of the first basic block touches the vreg. */
	VREG_HAS_OTHER_OP_BB0 = 1 << 2,
	/* Exactly one later basic block has been seen touching the vreg. */
	VREG_SINGLE_BB_USE = 1 << 3,
	/* More than one later basic block touches the vreg. */
	VREG_MANY_BB_USE = 1 << 4,
} KillFlags;
618 
619 void
mono_simd_intrinsics_init(void)620 mono_simd_intrinsics_init (void)
621 {
622 	simd_supported_versions = mono_arch_cpu_enumerate_simd_versions ();
623 	/*TODO log the supported flags*/
624 }
625 
626 static inline gboolean
apply_vreg_first_block_interference(MonoCompile * cfg,MonoInst * ins,int reg,int max_vreg,char * vreg_flags)627 apply_vreg_first_block_interference (MonoCompile *cfg, MonoInst *ins, int reg, int max_vreg, char *vreg_flags)
628 {
629 	if (reg != -1 && reg <= max_vreg && vreg_flags [reg]) {
630 		vreg_flags [reg] &= ~VREG_HAS_XZERO_BB0;
631 		vreg_flags [reg] |= VREG_HAS_OTHER_OP_BB0;
632 		DEBUG (printf ("[simd-simplify] R%d used: ", reg); mono_print_ins(ins));
633 		return TRUE;
634 	}
635 	return FALSE;
636 }
637 
638 static inline gboolean
apply_vreg_following_block_interference(MonoCompile * cfg,MonoInst * ins,int reg,MonoBasicBlock * bb,int max_vreg,char * vreg_flags,MonoBasicBlock ** target_bb)639 apply_vreg_following_block_interference (MonoCompile *cfg, MonoInst *ins, int reg, MonoBasicBlock *bb, int max_vreg, char *vreg_flags, MonoBasicBlock **target_bb)
640 {
641 	if (reg == -1 || reg > max_vreg || !(vreg_flags [reg] & VREG_HAS_XZERO_BB0) || target_bb [reg] == bb)
642 		return FALSE;
643 
644 	if (vreg_flags [reg] & VREG_SINGLE_BB_USE) {
645 		vreg_flags [reg] &= ~VREG_SINGLE_BB_USE;
646 		vreg_flags [reg] |= VREG_MANY_BB_USE;
647 		DEBUG (printf ("[simd-simplify] R%d used by many bb: ", reg); mono_print_ins(ins));
648 		return TRUE;
649 	} else if (!(vreg_flags [reg] & VREG_MANY_BB_USE)) {
650 		vreg_flags [reg] |= VREG_SINGLE_BB_USE;
651 		target_bb [reg] = bb;
652 		DEBUG (printf ("[simd-simplify] R%d first used by: ", reg); mono_print_ins(ins));
653 		return TRUE;
654 	}
655 	return FALSE;
656 }
657 
658 /*
659 This pass recalculate which vars need MONO_INST_INDIRECT.
660 
661 We cannot do this for non SIMD vars since code like mono_get_vtable_var
662 uses MONO_INST_INDIRECT to signal that the variable must be stack allocated.
663 */
664 void
mono_simd_simplify_indirection(MonoCompile * cfg)665 mono_simd_simplify_indirection (MonoCompile *cfg)
666 {
667 	int i, max_vreg = 0;
668 	MonoBasicBlock *bb, *first_bb = NULL, **target_bb;
669 	MonoInst *ins;
670 	char *vreg_flags;
671 
672 	for (i = 0; i < cfg->num_varinfo; i++) {
673 		MonoInst *var = cfg->varinfo [i];
674 		if (var->klass->simd_type) {
675 			var->flags &= ~MONO_INST_INDIRECT;
676 			max_vreg = MAX (var->dreg, max_vreg);
677 		}
678 	}
679 
680 	for (bb = cfg->bb_entry; bb; bb = bb->next_bb) {
681 		if (!first_bb && bb->code)
682 			first_bb = bb;
683 		for (ins = bb->code; ins; ins = ins->next) {
684 			if (ins->opcode == OP_LDADDR) {
685 				MonoInst *var = (MonoInst*)ins->inst_p0;
686 				if (var->klass->simd_type) {
687 					var->flags |= MONO_INST_INDIRECT;
688 				}
689 			}
690 		}
691 	}
692 
693 	DEBUG (printf ("[simd-simplify] max vreg is %d\n", max_vreg));
694 	vreg_flags = (char *)g_malloc0 (max_vreg + 1);
695 	target_bb = g_new0 (MonoBasicBlock*, max_vreg + 1);
696 
697 	for (i = 0; i < cfg->num_varinfo; i++) {
698 		MonoInst *var = cfg->varinfo [i];
699 		if (var->klass->simd_type && !(var->flags & (MONO_INST_INDIRECT|MONO_INST_VOLATILE))) {
700 			vreg_flags [var->dreg] = VREG_USED;
701 			DEBUG (printf ("[simd-simplify] processing var %d with vreg %d\n", i, var->dreg));
702 		}
703 	}
704 
705 	/*Scan the first basic block looking xzeros not used*/
706 	for (ins = first_bb->code; ins; ins = ins->next) {
707 		int num_sregs;
708 		int sregs [MONO_MAX_SRC_REGS];
709 
710 		if (ins->opcode == OP_XZERO) {
711 			if (!(vreg_flags [ins->dreg] & VREG_HAS_OTHER_OP_BB0)) {
712 				DEBUG (printf ("[simd-simplify] R%d has vzero: ", ins->dreg); mono_print_ins(ins));
713 				vreg_flags [ins->dreg] |= VREG_HAS_XZERO_BB0;
714 			}
715 			continue;
716 		}
717 		if (ins->opcode == OP_LDADDR && apply_vreg_first_block_interference (cfg, ins, ((MonoInst*)ins->inst_p0)->dreg, max_vreg, vreg_flags))
718 			continue;
719 		if (apply_vreg_first_block_interference (cfg, ins, ins->dreg, max_vreg, vreg_flags))
720 			continue;
721 		num_sregs = mono_inst_get_src_registers (ins, sregs);
722 		for (i = 0; i < num_sregs; ++i) {
723 			if (apply_vreg_first_block_interference (cfg, ins, sregs [i], max_vreg, vreg_flags))
724 				break;
725 		}
726 	}
727 
728 	if (IS_DEBUG_ON (cfg)) {
729 		for (i = 0; i < cfg->num_varinfo; i++) {
730 			MonoInst *var = cfg->varinfo [i];
731 			if (var->klass->simd_type) {
732 				if ((vreg_flags [var->dreg] & VREG_HAS_XZERO_BB0))
733 					DEBUG (printf ("[simd-simplify] R%d has xzero only\n", var->dreg));
734 				if ((vreg_flags [var->dreg] & VREG_HAS_OTHER_OP_BB0))
735 					DEBUG (printf ("[simd-simplify] R%d has other ops on bb0\n", var->dreg));
736 			}
737 		}
738 	}
739 
740 	/*TODO stop here if no var is xzero only*/
741 
742 	/*
743 	Scan all other bb and check if it has only one other use
744 	Ideally this would be done after an extended bb formation pass
745 
746 	FIXME This pass could use dominator information to properly
747 	place the XZERO on the bb that dominates all uses of the var,
748 	but this will have zero effect with the current local reg alloc
749 
750 	TODO simply the use of flags.
751 	*/
752 
753 	for (bb = first_bb->next_bb; bb; bb = bb->next_bb) {
754 		for (ins = bb->code; ins; ins = ins->next) {
755 			int num_sregs;
756 			int sregs [MONO_MAX_SRC_REGS];
757 
758 			if (ins->opcode == OP_LDADDR && apply_vreg_following_block_interference (cfg, ins, ((MonoInst*)ins->inst_p0)->dreg, bb, max_vreg, vreg_flags, target_bb))
759 				continue;
760 			if (apply_vreg_following_block_interference (cfg, ins, ins->dreg, bb, max_vreg, vreg_flags, target_bb))
761 				continue;
762 			num_sregs = mono_inst_get_src_registers (ins, sregs);
763 			for (i = 0; i < num_sregs; ++i) {
764 				if (apply_vreg_following_block_interference (cfg, ins, sregs [i], bb,
765 						max_vreg, vreg_flags, target_bb))
766 					continue;
767 			}
768 		}
769 	}
770 
771 	for (i = 0; i < cfg->num_varinfo; i++) {
772 		MonoInst *var = cfg->varinfo [i];
773 		if (!var->klass->simd_type)
774 			continue;
775 		if ((vreg_flags [var->dreg] & VREG_SINGLE_BB_USE))
776 			DEBUG (printf ("[simd-simplify] R%d has single bb use\n", var->dreg));
777 		if ((vreg_flags [var->dreg] & VREG_MANY_BB_USE))
778 			DEBUG (printf ("[simd-simplify] R%d has many bb in use\n", var->dreg));
779 
780 		if (!(vreg_flags [var->dreg] & VREG_SINGLE_BB_USE))
781 			continue;
782 		for (ins = target_bb [var->dreg]->code; ins; ins = ins->next) {
783 			int num_sregs, j;
784 			int sregs [MONO_MAX_SRC_REGS];
785 			gboolean found = FALSE;
786 
787 			num_sregs = mono_inst_get_src_registers (ins, sregs);
788 			for (j = 0; j < num_sregs; ++j) {
789 				if (sregs [j] == var->dreg)
790 					found = TRUE;
791 			}
792 			/*We can avoid inserting the XZERO if the first use doesn't depend on the zero'ed value.*/
793 			if (ins->dreg == var->dreg && !found) {
794 				DEBUG (printf ("[simd-simplify] INGORING R%d on BB %d because first op is a def", i, target_bb [var->dreg]->block_num););
795 				break;
796 			} else if (found) {
797 				DEBUG (printf ("[simd-simplify] Adding XZERO for R%d on BB %d: ", i, target_bb [var->dreg]->block_num); );
798 				MonoInst *tmp;
799 				MONO_INST_NEW (cfg, tmp, OP_XZERO);
800 				tmp->dreg = var->dreg;
801 				tmp->type = STACK_VTYPE;
802 		        tmp->klass = var->klass;
803 				mono_bblock_insert_before_ins (target_bb [var->dreg], ins, tmp);
804 				break;
805 			}
806 		}
807 	}
808 
809 	for (ins = first_bb->code; ins; ins = ins->next) {
810 		if (ins->opcode == OP_XZERO && (vreg_flags [ins->dreg] & VREG_SINGLE_BB_USE)) {
811 			DEBUG (printf ("[simd-simplify] Nullify %d on first BB: ", ins->dreg); mono_print_ins(ins));
812 			NULLIFY_INS (ins);
813 		}
814 	}
815 
816 	g_free (vreg_flags);
817 	g_free (target_bb);
818 }
819 
820 /*
821  * This function expect that src be a value.
822  */
823 static int
get_simd_vreg(MonoCompile * cfg,MonoMethod * cmethod,MonoInst * src)824 get_simd_vreg (MonoCompile *cfg, MonoMethod *cmethod, MonoInst *src)
825 {
826 	const char *spec = INS_INFO (src->opcode);
827 
828 	if (src->opcode == OP_XMOVE) {
829 		return src->sreg1;
830 	} else if (spec [MONO_INST_DEST] == 'x') {
831 		return src->dreg;
832 	} else if (src->opcode == OP_VCALL || src->opcode == OP_VCALL_MEMBASE) {
833 		return src->dreg;
834 	}
835 
836 	g_warning ("get_simd_vreg:: could not infer source simd vreg for op");
837 	mono_print_ins (src);
838 	g_assert_not_reached ();
839 }
840 
841 /*
842  * This function will load the value if needed.
843  */
844 static int
load_simd_vreg_class(MonoCompile * cfg,MonoClass * klass,MonoInst * src,gboolean * indirect)845 load_simd_vreg_class (MonoCompile *cfg, MonoClass *klass, MonoInst *src, gboolean *indirect)
846 {
847 	const char *spec = INS_INFO (src->opcode);
848 
849 	if (indirect)
850 		*indirect = FALSE;
851 	if (src->opcode == OP_XMOVE) {
852 		return src->sreg1;
853 	} else if (src->opcode == OP_LDADDR) {
854 		int res = ((MonoInst*)src->inst_p0)->dreg;
855 		return res;
856 	} else if (spec [MONO_INST_DEST] == 'x') {
857 		return src->dreg;
858 	} else if (src->type == STACK_PTR || src->type == STACK_MP) {
859 		MonoInst *ins;
860 		if (indirect)
861 			*indirect = TRUE;
862 
863 		MONO_INST_NEW (cfg, ins, OP_LOADX_MEMBASE);
864 		ins->klass = klass;
865 		ins->sreg1 = src->dreg;
866 		ins->type = STACK_VTYPE;
867 		ins->dreg = alloc_ireg (cfg);
868 		MONO_ADD_INS (cfg->cbb, ins);
869 		return ins->dreg;
870 	}
871 	g_warning ("load_simd_vreg:: could not infer source simd (%d) vreg for op", src->type);
872 	mono_print_ins (src);
873 	g_assert_not_reached ();
874 }
875 
876 static int
load_simd_vreg(MonoCompile * cfg,MonoMethod * cmethod,MonoInst * src,gboolean * indirect)877 load_simd_vreg (MonoCompile *cfg, MonoMethod *cmethod, MonoInst *src, gboolean *indirect)
878 {
879 	return load_simd_vreg_class (cfg, cmethod->klass, src, indirect);
880 }
881 
882 /*We share the var with fconv_to_r8_x to save some stack space.*/
883 static MonoInst*
get_double_spill_area(MonoCompile * cfg)884 get_double_spill_area (MonoCompile *cfg)
885 {
886 	if (!cfg->fconv_to_r8_x_var) {
887 		cfg->fconv_to_r8_x_var = mono_compile_create_var (cfg, &mono_defaults.double_class->byval_arg, OP_LOCAL);
888 		cfg->fconv_to_r8_x_var->flags |= MONO_INST_VOLATILE; /*FIXME, use the don't regalloc flag*/
889 	}
890 	return cfg->fconv_to_r8_x_var;
891 }
892 static MonoInst*
get_simd_ctor_spill_area(MonoCompile * cfg,MonoClass * avector_klass)893 get_simd_ctor_spill_area (MonoCompile *cfg, MonoClass *avector_klass)
894 {
895 	if (!cfg->simd_ctor_var) {
896 		cfg->simd_ctor_var = mono_compile_create_var (cfg, &avector_klass->byval_arg, OP_LOCAL);
897 		cfg->simd_ctor_var->flags |= MONO_INST_VOLATILE; /*FIXME, use the don't regalloc flag*/
898 	}
899 	return cfg->simd_ctor_var;
900 }
901 
902 static int
mono_type_to_expand_op(MonoType * type)903 mono_type_to_expand_op (MonoType *type)
904 {
905 	switch (type->type) {
906 	case MONO_TYPE_I1:
907 	case MONO_TYPE_U1:
908 		return OP_EXPAND_I1;
909 	case MONO_TYPE_I2:
910 	case MONO_TYPE_U2:
911 		return OP_EXPAND_I2;
912 	case MONO_TYPE_I4:
913 	case MONO_TYPE_U4:
914 		return OP_EXPAND_I4;
915 	case MONO_TYPE_I8:
916 	case MONO_TYPE_U8:
917 		return OP_EXPAND_I8;
918 	case MONO_TYPE_R4:
919 		return OP_EXPAND_R4;
920 	case MONO_TYPE_R8:
921 		return OP_EXPAND_R8;
922 	default:
923 		g_assert_not_reached ();
924 	}
925 }
926 
927 static int
type_to_comp_op(MonoType * t)928 type_to_comp_op (MonoType *t)
929 {
930 	switch (t->type) {
931 	case MONO_TYPE_I1:
932 	case MONO_TYPE_U1:
933 		return OP_PCMPEQB;
934 	case MONO_TYPE_I2:
935 	case MONO_TYPE_U2:
936 		return OP_PCMPEQW;
937 	case MONO_TYPE_I4:
938 	case MONO_TYPE_U4:
939 		return OP_PCMPEQD;
940 	case MONO_TYPE_I8:
941 	case MONO_TYPE_U8:
942 		return OP_PCMPEQQ;
943 	case MONO_TYPE_R4:
944 		return OP_COMPPS;
945 	case MONO_TYPE_R8:
946 		return OP_COMPPD;
947 	default:
948 		g_assert_not_reached ();
949 		return -1;
950 	}
951 }
952 
953 static int
type_to_gt_op(MonoType * t)954 type_to_gt_op (MonoType *t)
955 {
956 	switch (t->type) {
957 	case MONO_TYPE_I1:
958 		return OP_PCMPGTB;
959 	case MONO_TYPE_I2:
960 		return OP_PCMPGTW;
961 	case MONO_TYPE_I4:
962 		return OP_PCMPGTD;
963 	case MONO_TYPE_I8:
964 		return OP_PCMPGTQ;
965 	default:
966 		return -1;
967 	}
968 }
969 
970 static int
type_to_padd_op(MonoType * t)971 type_to_padd_op (MonoType *t)
972 {
973 	switch (t->type) {
974 	case MONO_TYPE_U1:
975 	case MONO_TYPE_I1:
976 		return OP_PADDB;
977 	case MONO_TYPE_U2:
978 	case MONO_TYPE_I2:
979 		return OP_PADDW;
980 	case MONO_TYPE_U4:
981 	case MONO_TYPE_I4:
982 		return OP_PADDD;
983 	case MONO_TYPE_U8:
984 	case MONO_TYPE_I8:
985 		return OP_PADDQ;
986 	case MONO_TYPE_R4:
987 		return OP_ADDPS;
988 	case MONO_TYPE_R8:
989 		return OP_ADDPD;
990 	default:
991 		break;
992 	}
993 	return -1;
994 }
995 
996 static int
type_to_psub_op(MonoType * t)997 type_to_psub_op (MonoType *t)
998 {
999 	switch (t->type) {
1000 	case MONO_TYPE_U1:
1001 	case MONO_TYPE_I1:
1002 		return OP_PSUBB;
1003 	case MONO_TYPE_U2:
1004 	case MONO_TYPE_I2:
1005 		return OP_PSUBW;
1006 	case MONO_TYPE_U4:
1007 	case MONO_TYPE_I4:
1008 		return OP_PSUBD;
1009 	case MONO_TYPE_U8:
1010 	case MONO_TYPE_I8:
1011 		return OP_PSUBQ;
1012 	case MONO_TYPE_R4:
1013 		return OP_SUBPS;
1014 	case MONO_TYPE_R8:
1015 		return OP_SUBPD;
1016 	default:
1017 		break;
1018 	}
1019 	return -1;
1020 }
1021 
1022 static int
type_to_pmul_op(MonoType * t)1023 type_to_pmul_op (MonoType *t)
1024 {
1025 	switch (t->type) {
1026 	case MONO_TYPE_U2:
1027 	case MONO_TYPE_I2:
1028 		return OP_PMULW;
1029 	case MONO_TYPE_U4:
1030 	case MONO_TYPE_I4:
1031 		return OP_PMULD;
1032 	case MONO_TYPE_R4:
1033 		return OP_MULPS;
1034 	case MONO_TYPE_R8:
1035 		return OP_MULPD;
1036 	case MONO_TYPE_U8:
1037 		/* PMULQ multiplies two 32 bit numbers into a 64 bit one */
1038 		return -1;
1039 	case MONO_TYPE_I8:
1040 		return -1;
1041 	default:
1042 		break;
1043 	}
1044 	return -1;
1045 }
1046 
1047 static int
type_to_pdiv_op(MonoType * t)1048 type_to_pdiv_op (MonoType *t)
1049 {
1050 	switch (t->type) {
1051 	case MONO_TYPE_R4:
1052 		return OP_DIVPS;
1053 	case MONO_TYPE_R8:
1054 		return OP_DIVPD;
1055 	default:
1056 		break;
1057 	}
1058 	return -1;
1059 }
1060 
1061 static int
type_to_pxor_op(MonoType * t)1062 type_to_pxor_op (MonoType *t)
1063 {
1064 	/*
1065 	 * These opcodes have the same semantics, but using the
1066 	 * correctly typed version is better for performance.
1067 	 */
1068 	switch (t->type) {
1069 	case MONO_TYPE_R4:
1070 		return OP_XORPS;
1071 	case MONO_TYPE_R8:
1072 		return OP_XORPD;
1073 	default:
1074 		return OP_PXOR;
1075 	}
1076 }
1077 
1078 static int
type_to_pand_op(MonoType * t)1079 type_to_pand_op (MonoType *t)
1080 {
1081 	switch (t->type) {
1082 	case MONO_TYPE_R4:
1083 		return OP_ANDPS;
1084 	case MONO_TYPE_R8:
1085 		return OP_ANDPD;
1086 	default:
1087 		return OP_PAND;
1088 	}
1089 }
1090 
1091 static int
type_to_por_op(MonoType * t)1092 type_to_por_op (MonoType *t)
1093 {
1094 	switch (t->type) {
1095 	case MONO_TYPE_R4:
1096 		return OP_ORPS;
1097 	case MONO_TYPE_R8:
1098 		return OP_ORPD;
1099 	default:
1100 		return OP_POR;
1101 	}
1102 }
1103 
1104 static int
type_to_pmin_op(MonoType * t)1105 type_to_pmin_op (MonoType *t)
1106 {
1107 	switch (t->type) {
1108 	case MONO_TYPE_R4:
1109 		return OP_MINPS;
1110 	case MONO_TYPE_R8:
1111 		return OP_MINPD;
1112 	case MONO_TYPE_I1:
1113 		return OP_PMINB;
1114 	case MONO_TYPE_U1:
1115 		return OP_PMINB_UN;
1116 	case MONO_TYPE_I2:
1117 		return OP_PMINW;
1118 	case MONO_TYPE_U2:
1119 		return OP_PMINW_UN;
1120 	case MONO_TYPE_I4:
1121 		return OP_PMIND;
1122 	case MONO_TYPE_U4:
1123 		return OP_PMIND_UN;
1124 	default:
1125 		return -1;
1126 	}
1127 }
1128 
1129 static int
type_to_pmax_op(MonoType * t)1130 type_to_pmax_op (MonoType *t)
1131 {
1132 	switch (t->type) {
1133 	case MONO_TYPE_R4:
1134 		return OP_MAXPS;
1135 	case MONO_TYPE_R8:
1136 		return OP_MAXPD;
1137 	case MONO_TYPE_I1:
1138 		return OP_PMAXB;
1139 	case MONO_TYPE_U1:
1140 		return OP_PMAXB_UN;
1141 	case MONO_TYPE_I2:
1142 		return OP_PMAXW;
1143 	case MONO_TYPE_U2:
1144 		return OP_PMAXW_UN;
1145 	case MONO_TYPE_I4:
1146 		return OP_PMAXD;
1147 	case MONO_TYPE_U4:
1148 		return OP_PMAXD_UN;
1149 	default:
1150 		return -1;
1151 	}
1152 }
1153 
1154 static int
get_simd_vreg_or_expanded_scalar(MonoCompile * cfg,MonoClass * klass,MonoType * param_type,MonoInst * src)1155 get_simd_vreg_or_expanded_scalar (MonoCompile *cfg, MonoClass *klass, MonoType *param_type, MonoInst *src)
1156 {
1157 	MonoInst *ins;
1158 	int expand_op;
1159 
1160 	if (mono_class_from_mono_type (param_type)->simd_type)
1161 		return get_simd_vreg (cfg, NULL, src);
1162 
1163 	expand_op = mono_type_to_expand_op (param_type);
1164 	MONO_INST_NEW (cfg, ins, expand_op);
1165 	ins->klass = klass;
1166 	ins->sreg1 = src->dreg;
1167 	ins->type = STACK_VTYPE;
1168 	ins->dreg = alloc_ireg (cfg);
1169 	MONO_ADD_INS (cfg->cbb, ins);
1170 
1171 	if (expand_op == OP_EXPAND_R4)
1172 		ins->backend.spill_var = mini_get_int_to_float_spill_area (cfg);
1173 	else if (expand_op == OP_EXPAND_R8)
1174 		ins->backend.spill_var = get_double_spill_area (cfg);
1175 
1176 	return ins->dreg;
1177 }
1178 
1179 /*
1180  * simd_intrinsic_emit_binary_op:
1181  *
1182  *   Emit a binary SIMD opcode.
1183  * @LHS/@RHS are the two arguments, they can be either a SIMD type or a scalar one. Scalar arguments are
1184  * expanded to the SIMD type.
1185  */
1186 static MonoInst*
simd_intrinsic_emit_binary_op(MonoCompile * cfg,int opcode,int flags,MonoClass * klass,MonoType * lhs_type,MonoType * rhs_type,MonoInst * lhs,MonoInst * rhs)1187 simd_intrinsic_emit_binary_op (MonoCompile *cfg, int opcode, int flags, MonoClass *klass, MonoType *lhs_type, MonoType *rhs_type, MonoInst *lhs, MonoInst *rhs)
1188 {
1189 	MonoInst* ins;
1190 	int left_vreg, right_vreg;
1191 
1192 	left_vreg = get_simd_vreg_or_expanded_scalar (cfg, klass, lhs_type, lhs);
1193 	right_vreg = get_simd_vreg_or_expanded_scalar (cfg, klass, rhs_type, rhs);
1194 
1195 	MONO_INST_NEW (cfg, ins, opcode);
1196 	ins->klass = klass;
1197 	ins->sreg1 = left_vreg;
1198 	ins->sreg2 = right_vreg;
1199 	ins->type = STACK_VTYPE;
1200 	ins->dreg = alloc_ireg (cfg);
1201 	ins->inst_c0 = flags;
1202 	MONO_ADD_INS (cfg->cbb, ins);
1203 	return ins;
1204 }
1205 
1206 static MonoInst*
simd_intrinsic_emit_binary(const SimdIntrinsic * intrinsic,MonoCompile * cfg,MonoMethod * cmethod,MonoInst ** args)1207 simd_intrinsic_emit_binary (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1208 {
1209 	MonoMethodSignature *sig = mono_method_signature (cmethod);
1210 
1211 	g_assert (sig->param_count == 2);
1212 
1213 	return simd_intrinsic_emit_binary_op (cfg, intrinsic->opcode, intrinsic->flags, cmethod->klass, sig->params [0], sig->params [1], args [0], args [1]);
1214 }
1215 
1216 static MonoInst*
simd_intrinsic_emit_unary(const SimdIntrinsic * intrinsic,MonoCompile * cfg,MonoMethod * cmethod,MonoInst ** args)1217 simd_intrinsic_emit_unary (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1218 {
1219 	MonoInst* ins;
1220 	int vreg;
1221 
1222 	vreg = get_simd_vreg (cfg, cmethod, args [0]);
1223 
1224 	MONO_INST_NEW (cfg, ins, intrinsic->opcode);
1225 	ins->klass = cmethod->klass;
1226 	ins->sreg1 = vreg;
1227 	ins->type = STACK_VTYPE;
1228 	ins->dreg = alloc_ireg (cfg);
1229 	MONO_ADD_INS (cfg->cbb, ins);
1230 	return ins;
1231 }
1232 
1233 static int
mono_type_to_extract_op(MonoType * type)1234 mono_type_to_extract_op (MonoType *type)
1235 {
1236 	switch (type->type) {
1237 	case MONO_TYPE_I1:
1238 		return OP_EXTRACT_I1;
1239 	case MONO_TYPE_U1:
1240 		return OP_EXTRACT_U1;
1241 	case MONO_TYPE_I2:
1242 		return OP_EXTRACT_I2;
1243 	case MONO_TYPE_U2:
1244 		return OP_EXTRACT_U2;
1245 	case MONO_TYPE_I4:
1246 	case MONO_TYPE_U4:
1247 	case MONO_TYPE_R4:
1248 		return OP_EXTRACT_I4;
1249 	default:
1250 		g_assert_not_reached ();
1251 	}
1252 }
1253 
1254 /*Returns the amount to shift the element index to get the dword it belongs to*/
1255 static int
mono_type_elements_shift_bits(MonoType * type)1256 mono_type_elements_shift_bits (MonoType *type)
1257 {
1258 	switch (type->type) {
1259 	case MONO_TYPE_I1:
1260 	case MONO_TYPE_U1:
1261 		return 2;
1262 	case MONO_TYPE_I2:
1263 	case MONO_TYPE_U2:
1264 		return 1;
1265 	case MONO_TYPE_I4:
1266 	case MONO_TYPE_U4:
1267 	case MONO_TYPE_R4:
1268 		return 0;
1269 	default:
1270 		g_assert_not_reached ();
1271 	}
1272 }
1273 
1274 static G_GNUC_UNUSED int
mono_type_to_insert_op(MonoType * type)1275 mono_type_to_insert_op (MonoType *type)
1276 {
1277 	switch (type->type) {
1278 	case MONO_TYPE_I1:
1279 	case MONO_TYPE_U1:
1280 		return OP_INSERT_I1;
1281 	case MONO_TYPE_I2:
1282 	case MONO_TYPE_U2:
1283 		return OP_INSERT_I2;
1284 	case MONO_TYPE_I4:
1285 	case MONO_TYPE_U4:
1286 		return OP_INSERT_I4;
1287 	case MONO_TYPE_I8:
1288 	case MONO_TYPE_U8:
1289 		return OP_INSERT_I8;
1290 	case MONO_TYPE_R4:
1291 		return OP_INSERT_R4;
1292 	case MONO_TYPE_R8:
1293 		return OP_INSERT_R8;
1294 	default:
1295 		g_assert_not_reached ();
1296 	}
1297 }
1298 
1299 static int
mono_type_to_slow_insert_op(MonoType * type)1300 mono_type_to_slow_insert_op (MonoType *type)
1301 {
1302 	switch (type->type) {
1303 	case MONO_TYPE_I1:
1304 	case MONO_TYPE_U1:
1305 		return OP_INSERTX_U1_SLOW;
1306 	case MONO_TYPE_I2:
1307 	case MONO_TYPE_U2:
1308 		return OP_INSERT_I2;
1309 	case MONO_TYPE_I4:
1310 	case MONO_TYPE_U4:
1311 		return OP_INSERTX_I4_SLOW;
1312 	case MONO_TYPE_I8:
1313 	case MONO_TYPE_U8:
1314 		return OP_INSERTX_I8_SLOW;
1315 	case MONO_TYPE_R4:
1316 		return OP_INSERTX_R4_SLOW;
1317 	case MONO_TYPE_R8:
1318 		return OP_INSERTX_R8_SLOW;
1319 	default:
1320 		g_assert_not_reached ();
1321 	}
1322 }
1323 
1324 static MonoInst*
simd_intrinsic_emit_setter(const SimdIntrinsic * intrinsic,MonoCompile * cfg,MonoMethod * cmethod,MonoInst ** args)1325 simd_intrinsic_emit_setter (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1326 {
1327 	MonoInst *ins;
1328 	MonoMethodSignature *sig = mono_method_signature (cmethod);
1329 	int size, align;
1330 	gboolean indirect;
1331 	int dreg;
1332 
1333 	size = mono_type_size (sig->params [0], &align);
1334 
1335 	if (COMPILE_LLVM (cfg)) {
1336 		MONO_INST_NEW (cfg, ins, mono_type_to_insert_op (sig->params [0]));
1337 		ins->klass = cmethod->klass;
1338 		ins->dreg = ins->sreg1 = dreg = load_simd_vreg (cfg, cmethod, args [0], &indirect);
1339 		ins->sreg2 = args [1]->dreg;
1340 		ins->inst_c0 = intrinsic->opcode;
1341 		MONO_ADD_INS (cfg->cbb, ins);
1342 	} else if (size == 2 || size == 4 || size == 8) {
1343 		MONO_INST_NEW (cfg, ins, mono_type_to_slow_insert_op (sig->params [0]));
1344 		ins->klass = cmethod->klass;
1345 		/*This is a partial load so we encode the dependency on the previous value by setting dreg and sreg1 to the same value.*/
1346 		ins->dreg = ins->sreg1 = dreg = load_simd_vreg (cfg, cmethod, args [0], &indirect);
1347 		ins->sreg2 = args [1]->dreg;
1348 		ins->inst_c0 = intrinsic->opcode;
1349 		if (sig->params [0]->type == MONO_TYPE_R4)
1350 			ins->backend.spill_var = mini_get_int_to_float_spill_area (cfg);
1351 		else if (sig->params [0]->type == MONO_TYPE_R8)
1352 			ins->backend.spill_var = get_double_spill_area (cfg);
1353 		MONO_ADD_INS (cfg->cbb, ins);
1354 	} else {
1355 		int vreg, sreg;
1356 
1357 		MONO_INST_NEW (cfg, ins, OP_EXTRACTX_U2);
1358 		ins->klass = cmethod->klass;
1359 		ins->sreg1 = sreg = dreg = load_simd_vreg (cfg, cmethod, args [0], &indirect);
1360 		ins->type = STACK_I4;
1361 		ins->dreg = vreg = alloc_ireg (cfg);
1362 		ins->inst_c0 = intrinsic->opcode / 2;
1363 		MONO_ADD_INS (cfg->cbb, ins);
1364 
1365 		MONO_INST_NEW (cfg, ins, OP_INSERTX_U1_SLOW);
1366 		ins->klass = cmethod->klass;
1367 		ins->sreg1 = vreg;
1368 		ins->sreg2 = args [1]->dreg;
1369 		ins->dreg = sreg;
1370 		ins->inst_c0 = intrinsic->opcode;
1371 		MONO_ADD_INS (cfg->cbb, ins);
1372 	}
1373 
1374 	if (indirect) {
1375 		MONO_INST_NEW (cfg, ins, OP_STOREX_MEMBASE);
1376 		ins->klass = cmethod->klass;
1377 		ins->dreg = args [0]->dreg;
1378 		ins->sreg1 = dreg;
1379 		MONO_ADD_INS (cfg->cbb, ins);
1380 	}
1381 	return ins;
1382 }
1383 
1384 /*
1385  * simd_intrinsic_emit_getter_op:
1386  *
1387  *   Emit IR for loading an element of a SIMD value.
1388  *
1389  * @klass is the simd type, @type is the element type.
1390  */
1391 static MonoInst*
simd_intrinsic_emit_getter_op(MonoCompile * cfg,int index,MonoClass * klass,MonoType * type,MonoInst * arg)1392 simd_intrinsic_emit_getter_op (MonoCompile *cfg, int index, MonoClass *klass, MonoType *type, MonoInst *arg)
1393 {
1394 	MonoInst *ins;
1395 	int vreg, shift_bits;
1396 
1397 	vreg = load_simd_vreg_class (cfg, klass, arg, NULL);
1398 
1399 	if (type->type == MONO_TYPE_I8 || type->type == MONO_TYPE_U8 || type->type == MONO_TYPE_R8) {
1400 		MonoInst *ins;
1401 		gboolean is_r8 = type->type == MONO_TYPE_R8;
1402 
1403 		MONO_INST_NEW (cfg, ins, is_r8 ? OP_EXTRACT_R8 : OP_EXTRACT_I8);
1404 		ins->klass = klass;
1405 		ins->sreg1 = vreg;
1406 		ins->inst_c0 = index;
1407 		if (is_r8) {
1408 			ins->type = STACK_R8;
1409 			ins->dreg = alloc_freg (cfg);
1410 			ins->backend.spill_var = get_double_spill_area (cfg);
1411 		} else {
1412 			ins->type = STACK_I8;
1413 			ins->dreg = alloc_lreg (cfg);
1414 		}
1415 		MONO_ADD_INS (cfg->cbb, ins);
1416 		return ins;
1417 	}
1418 
1419 	shift_bits = mono_type_elements_shift_bits (type);
1420 
1421 	if ((index >> shift_bits) && !cfg->compile_llvm) {
1422 		MONO_INST_NEW (cfg, ins, OP_PSHUFLED);
1423 		ins->klass = klass;
1424 		ins->sreg1 = vreg;
1425 		ins->inst_c0 = index >> shift_bits;
1426 		ins->type = STACK_VTYPE;
1427 		ins->dreg = vreg = alloc_ireg (cfg);
1428 		MONO_ADD_INS (cfg->cbb, ins);
1429 	}
1430 
1431 	MONO_INST_NEW (cfg, ins, mono_type_to_extract_op (type));
1432 	ins->klass = klass;
1433 	ins->sreg1 = vreg;
1434 	ins->type = STACK_I4;
1435 	ins->dreg = vreg = alloc_ireg (cfg);
1436 	if (cfg->compile_llvm)
1437 		ins->inst_c0 = index;
1438 	else
1439 		ins->inst_c0 = index & ((1 << shift_bits) - 1);
1440 	MONO_ADD_INS (cfg->cbb, ins);
1441 
1442 	if (type->type == MONO_TYPE_R4) {
1443 		MONO_INST_NEW (cfg, ins, cfg->r4fp ? OP_ICONV_TO_R4_RAW : OP_MOVE_I4_TO_F);
1444 		ins->klass = mono_defaults.single_class;
1445 		ins->sreg1 = vreg;
1446 		ins->type = cfg->r4_stack_type;
1447 		ins->dreg = alloc_freg (cfg);
1448 		ins->backend.spill_var = mini_get_int_to_float_spill_area (cfg);
1449 		MONO_ADD_INS (cfg->cbb, ins);
1450 	}
1451 	return ins;
1452 }
1453 
1454 static MonoInst*
simd_intrinsic_emit_getter(const SimdIntrinsic * intrinsic,MonoCompile * cfg,MonoMethod * cmethod,MonoInst ** args)1455 simd_intrinsic_emit_getter (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1456 {
1457 	MonoMethodSignature *sig = mono_method_signature (cmethod);
1458 
1459 	return simd_intrinsic_emit_getter_op (cfg, intrinsic->opcode, cmethod->klass, sig->ret, args [0]);
1460 }
1461 
1462 static MonoInst*
simd_intrinsic_emit_long_getter(const SimdIntrinsic * intrinsic,MonoCompile * cfg,MonoMethod * cmethod,MonoInst ** args)1463 simd_intrinsic_emit_long_getter (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1464 {
1465 	MonoInst *ins;
1466 	int vreg;
1467 	gboolean is_r8 = mono_method_signature (cmethod)->ret->type == MONO_TYPE_R8;
1468 
1469 	vreg = load_simd_vreg (cfg, cmethod, args [0], NULL);
1470 
1471 	MONO_INST_NEW (cfg, ins, is_r8 ? OP_EXTRACT_R8 : OP_EXTRACT_I8);
1472 	ins->klass = cmethod->klass;
1473 	ins->sreg1 = vreg;
1474 	ins->inst_c0 = intrinsic->opcode;
1475 	if (is_r8) {
1476 		ins->type = STACK_R8;
1477 		ins->dreg = alloc_freg (cfg);
1478 		ins->backend.spill_var = get_double_spill_area (cfg);
1479 	} else {
1480 		ins->type = STACK_I8;
1481 		ins->dreg = alloc_lreg (cfg);
1482 	}
1483 	MONO_ADD_INS (cfg->cbb, ins);
1484 
1485 	return ins;
1486 }
1487 
1488 static MonoInst*
simd_intrinsic_emit_ctor(const SimdIntrinsic * intrinsic,MonoCompile * cfg,MonoMethod * cmethod,MonoInst ** args)1489 simd_intrinsic_emit_ctor (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1490 {
1491 	MonoInst *ins = NULL;
1492 	int i, addr_reg;
1493 	gboolean is_ldaddr = args [0]->opcode == OP_LDADDR;
1494 	MonoMethodSignature *sig = mono_method_signature (cmethod);
1495 	int store_op = mono_type_to_store_membase (cfg, sig->params [0]);
1496 	int arg_size = mono_type_size (sig->params [0], &i);
1497 	int opcode;
1498 
1499 	if (sig->param_count == 1) {
1500 		int dreg;
1501 
1502 		if (is_ldaddr) {
1503 			dreg = args [0]->inst_i0->dreg;
1504 			NULLIFY_INS (args [0]);
1505 		} else {
1506 			g_assert (args [0]->type == STACK_MP || args [0]->type == STACK_PTR);
1507 			dreg = alloc_ireg (cfg);
1508 		}
1509 
1510 		if (intrinsic)
1511 			opcode = intrinsic->opcode;
1512 		else
1513 			opcode = mono_type_to_expand_op (sig->params [0]);
1514 		MONO_INST_NEW (cfg, ins, opcode);
1515 		ins->klass = cmethod->klass;
1516 		ins->sreg1 = args [1]->dreg;
1517 		ins->type = STACK_VTYPE;
1518 		ins->dreg = dreg;
1519 
1520 		MONO_ADD_INS (cfg->cbb, ins);
1521 		if (sig->params [0]->type == MONO_TYPE_R4)
1522 			ins->backend.spill_var = mini_get_int_to_float_spill_area (cfg);
1523 		else if (sig->params [0]->type == MONO_TYPE_R8)
1524 			ins->backend.spill_var = get_double_spill_area (cfg);
1525 
1526 		if (!is_ldaddr) {
1527 			MONO_INST_NEW (cfg, ins, OP_STOREX_MEMBASE);
1528 			ins->dreg = args [0]->dreg;
1529 			ins->sreg1 = dreg;
1530 			MONO_ADD_INS (cfg->cbb, ins);
1531 		}
1532 		return ins;
1533 	}
1534 
1535 	if (is_ldaddr) {
1536 		NEW_VARLOADA (cfg, ins, get_simd_ctor_spill_area (cfg, cmethod->klass), &cmethod->klass->byref_arg);
1537 		MONO_ADD_INS (cfg->cbb, ins);
1538 		addr_reg = ins->dreg;
1539 	} else {
1540 		g_assert (args [0]->type == STACK_MP || args [0]->type == STACK_PTR);
1541 		addr_reg = args [0]->dreg;
1542 	}
1543 
1544 	for (i = sig->param_count - 1; i >= 0; --i) {
1545 		EMIT_NEW_STORE_MEMBASE (cfg, ins, store_op, addr_reg, i * arg_size, args [i + 1]->dreg);
1546 	}
1547 
1548 	if (sig->param_count * arg_size < 16) {
1549 		/* If there are not enough arguments, fill the rest with 0s */
1550 		for (i = sig->param_count; i < 16 / arg_size; ++i) {
1551 			switch (arg_size) {
1552 			case 4:
1553 				MONO_EMIT_NEW_STORE_MEMBASE_IMM (cfg, OP_STOREI4_MEMBASE_IMM, addr_reg, i * arg_size, 0);
1554 				break;
1555 			default:
1556 				g_assert_not_reached ();
1557 				break;
1558 			}
1559 		}
1560 	}
1561 
1562 	if (is_ldaddr) { /*Eliminate LDADDR if it's initing a local var*/
1563 		int vreg = ((MonoInst*)args [0]->inst_p0)->dreg;
1564 		NULLIFY_INS (args [0]);
1565 
1566 		MONO_INST_NEW (cfg, ins, OP_LOADX_MEMBASE);
1567 		ins->klass = cmethod->klass;
1568 		ins->sreg1 = addr_reg;
1569 		ins->type = STACK_VTYPE;
1570 		ins->dreg = vreg;
1571 		MONO_ADD_INS (cfg->cbb, ins);
1572 	}
1573 	return ins;
1574 }
1575 
1576 static MonoInst*
simd_intrinsic_emit_cast(const SimdIntrinsic * intrinsic,MonoCompile * cfg,MonoMethod * cmethod,MonoInst ** args)1577 simd_intrinsic_emit_cast (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1578 {
1579 	MonoInst *ins;
1580 	MonoClass *klass;
1581 	int vreg;
1582 
1583 	vreg = get_simd_vreg (cfg, cmethod, args [0]);
1584 
1585 	if (cmethod->is_inflated)
1586 		/* Vector<T> */
1587 		klass = mono_class_from_mono_type (mono_method_signature (cmethod)->ret);
1588 	else
1589 		klass = cmethod->klass;
1590 
1591 	MONO_INST_NEW (cfg, ins, OP_XMOVE);
1592 	ins->klass = klass;
1593 	ins->type = STACK_VTYPE;
1594 	ins->sreg1 = vreg;
1595 	ins->dreg = alloc_ireg (cfg);
1596 	MONO_ADD_INS (cfg->cbb, ins);
1597 	return ins;
1598 }
1599 
1600 static MonoInst*
simd_intrinsic_emit_shift(const SimdIntrinsic * intrinsic,MonoCompile * cfg,MonoMethod * cmethod,MonoInst ** args)1601 simd_intrinsic_emit_shift (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1602 {
1603 	MonoInst *ins;
1604 	int vreg, vreg2 = -1, opcode = intrinsic->opcode;
1605 
1606 	vreg = get_simd_vreg (cfg, cmethod, args [0]);
1607 
1608 	if (args [1]->opcode != OP_ICONST) {
1609 		MONO_INST_NEW (cfg, ins, OP_ICONV_TO_X);
1610 		ins->klass = mono_defaults.int32_class;
1611 		ins->sreg1 = args [1]->dreg;
1612 		ins->type = STACK_I4;
1613 		ins->dreg = vreg2 = alloc_ireg (cfg);
1614 		MONO_ADD_INS (cfg->cbb, ins);
1615 
1616 		++opcode; /*The shift_reg version op is always +1 from the regular one.*/
1617 	}
1618 
1619 	MONO_INST_NEW (cfg, ins, opcode);
1620 	ins->klass = cmethod->klass;
1621 	ins->sreg1 = vreg;
1622 	ins->sreg2 = vreg2;
1623 
1624 	if (args [1]->opcode == OP_ICONST) {
1625 		ins->inst_imm = args [1]->inst_c0;
1626 		NULLIFY_INS (args [1]);
1627 	}
1628 
1629 	ins->type = STACK_VTYPE;
1630 	ins->dreg = alloc_ireg (cfg);
1631 	MONO_ADD_INS (cfg->cbb, ins);
1632 	return ins;
1633 }
1634 
1635 static inline gboolean
mono_op_is_packed_compare(int op)1636 mono_op_is_packed_compare (int op)
1637 {
1638 	return op >= OP_PCMPEQB && op <= OP_PCMPEQQ;
1639 }
1640 
1641 static MonoInst*
simd_intrinsic_emit_equality_op(MonoCompile * cfg,MonoMethod * cmethod,MonoInst ** args,int opcode,int flags)1642 simd_intrinsic_emit_equality_op (MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args, int opcode, int flags)
1643 {
1644 	MonoInst* ins;
1645 	int left_vreg, right_vreg, tmp_vreg;
1646 
1647 	left_vreg = load_simd_vreg (cfg, cmethod, args [0], NULL);
1648 	right_vreg = get_simd_vreg (cfg, cmethod, args [1]);
1649 
1650 	MONO_INST_NEW (cfg, ins, opcode);
1651 	ins->klass = cmethod->klass;
1652 	ins->sreg1 = left_vreg;
1653 	ins->sreg2 = right_vreg;
1654 	ins->type = STACK_VTYPE;
1655 	ins->klass = cmethod->klass;
1656 	ins->dreg = tmp_vreg = alloc_ireg (cfg);
1657 	ins->inst_c0 = flags;
1658 	MONO_ADD_INS (cfg->cbb, ins);
1659 
1660 	/*FIXME the next ops are SSE specific*/
1661 	MONO_INST_NEW (cfg, ins, OP_EXTRACT_MASK);
1662 	ins->klass = cmethod->klass;
1663 	ins->sreg1 = tmp_vreg;
1664 	ins->type = STACK_I4;
1665 	ins->dreg = tmp_vreg = alloc_ireg (cfg);
1666 	MONO_ADD_INS (cfg->cbb, ins);
1667 
1668 	/*FP ops have a not equal instruction, which means that we must test the results with OR semantics.*/
1669 	if (mono_op_is_packed_compare (opcode) || flags == SIMD_COMP_EQ) {
1670 		MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, tmp_vreg, 0xFFFF);
1671 		NEW_UNALU (cfg, ins, flags == SIMD_COMP_EQ ? OP_CEQ : OP_CLT_UN, tmp_vreg, -1);
1672 	} else {
1673 		MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, tmp_vreg, 0);
1674 		NEW_UNALU (cfg, ins, OP_CGT_UN, tmp_vreg, -1);
1675 	}
1676 	MONO_ADD_INS (cfg->cbb, ins);
1677 	return ins;
1678 }
1679 
1680 static MonoInst*
simd_intrinsic_emit_equality(const SimdIntrinsic * intrinsic,MonoCompile * cfg,MonoMethod * cmethod,MonoInst ** args)1681 simd_intrinsic_emit_equality (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1682 {
1683 	return simd_intrinsic_emit_equality_op (cfg, cmethod, args, intrinsic->opcode, intrinsic->flags);
1684 }
1685 
1686 static MonoInst*
simd_intrinsic_emit_shuffle(const SimdIntrinsic * intrinsic,MonoCompile * cfg,MonoMethod * cmethod,MonoInst ** args)1687 simd_intrinsic_emit_shuffle (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1688 {
1689 	MonoInst *ins;
1690 	int vreg, vreg2 = -1;
1691 	int param_count = mono_method_signature (cmethod)->param_count;
1692 
1693 	if (args [param_count - 1]->opcode != OP_ICONST) {
1694 		/*TODO Shuffle with non literals is not yet supported */
1695 		return NULL;
1696 	}
1697 
1698 	vreg = get_simd_vreg (cfg, cmethod, args [0]);
1699 	if (param_count == 3)
1700 		vreg2 = get_simd_vreg (cfg, cmethod, args [1]);
1701 
1702 	NULLIFY_INS (args [param_count - 1]);
1703 
1704 
1705 	MONO_INST_NEW (cfg, ins, intrinsic->opcode);
1706 	ins->klass = cmethod->klass;
1707 	ins->sreg1 = vreg;
1708 	ins->sreg2 = vreg2;
1709 	ins->inst_c0 = args [param_count - 1]->inst_c0;
1710 	ins->type = STACK_VTYPE;
1711 	ins->dreg = alloc_ireg (cfg);
1712 	MONO_ADD_INS (cfg->cbb, ins);
1713 
1714 	if (param_count == 3 && ins->opcode == OP_PSHUFLED)
1715 		ins->opcode = OP_SHUFPS;
1716 	return ins;
1717 }
1718 
1719 static MonoInst*
simd_intrinsic_emit_load_aligned(const SimdIntrinsic * intrinsic,MonoCompile * cfg,MonoMethod * cmethod,MonoInst ** args)1720 simd_intrinsic_emit_load_aligned (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1721 {
1722 	MonoInst *ins;
1723 
1724 	MONO_INST_NEW (cfg, ins, OP_LOADX_ALIGNED_MEMBASE);
1725 	ins->klass = cmethod->klass;
1726 	ins->sreg1 = args [0]->dreg;
1727 	ins->type = STACK_VTYPE;
1728 	ins->dreg = alloc_ireg (cfg);
1729 	MONO_ADD_INS (cfg->cbb, ins);
1730 	return ins;
1731 }
1732 
1733 static MonoInst*
simd_intrinsic_emit_store(const SimdIntrinsic * intrinsic,MonoCompile * cfg,MonoMethod * cmethod,MonoInst ** args)1734 simd_intrinsic_emit_store (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1735 {
1736 	MonoInst *ins;
1737 	int vreg;
1738 
1739 	vreg = get_simd_vreg (cfg, cmethod, args [1]);
1740 
1741 	MONO_INST_NEW (cfg, ins, intrinsic->opcode);
1742 	ins->klass = cmethod->klass;
1743 	ins->dreg = args [0]->dreg;
1744 	ins->sreg1 = vreg;
1745 	ins->type = STACK_VTYPE;
1746 	MONO_ADD_INS (cfg->cbb, ins);
1747 	return ins;
1748 }
1749 
1750 static MonoInst*
simd_intrinsic_emit_extract_mask(const SimdIntrinsic * intrinsic,MonoCompile * cfg,MonoMethod * cmethod,MonoInst ** args)1751 simd_intrinsic_emit_extract_mask (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1752 {
1753 	MonoInst *ins;
1754 	int vreg;
1755 
1756 	vreg = get_simd_vreg (cfg, cmethod, args [0]);
1757 
1758 	MONO_INST_NEW (cfg, ins, OP_EXTRACT_MASK);
1759 	ins->klass = cmethod->klass;
1760 	ins->sreg1 = vreg;
1761 	ins->type = STACK_I4;
1762 	ins->dreg = alloc_ireg (cfg);
1763 	MONO_ADD_INS (cfg->cbb, ins);
1764 
1765 	return ins;
1766 }
1767 
1768 static MonoInst*
simd_intrinsic_emit_prefetch(const SimdIntrinsic * intrinsic,MonoCompile * cfg,MonoMethod * cmethod,MonoInst ** args)1769 simd_intrinsic_emit_prefetch (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1770 {
1771 	MonoInst *ins;
1772 
1773 	MONO_INST_NEW (cfg, ins, OP_PREFETCH_MEMBASE);
1774 	ins->klass = cmethod->klass;
1775 	ins->sreg1 = args [0]->dreg;
1776 	ins->backend.arg_info = intrinsic->flags;
1777 	MONO_ADD_INS (cfg->cbb, ins);
1778 	return ins;
1779 }
1780 
1781 static MonoInst*
simd_intrinsic_emit_const(const SimdIntrinsic * intrinsic,MonoCompile * cfg,MonoMethod * cmethod,MonoInst ** args)1782 simd_intrinsic_emit_const (const SimdIntrinsic *intrinsic, MonoCompile *cfg, MonoMethod *cmethod, MonoInst **args)
1783 {
1784 	MonoInst *ins;
1785 
1786 	MONO_INST_NEW (cfg, ins, intrinsic->opcode);
1787 	ins->klass = cmethod->klass;
1788 	ins->type = STACK_VTYPE;
1789 	ins->dreg = alloc_xreg (cfg);
1790 	MONO_ADD_INS (cfg->cbb, ins);
1791 	return ins;
1792 }
1793 
1794 static const char *
simd_version_name(guint32 version)1795 simd_version_name (guint32 version)
1796 {
1797 	switch (version) {
1798 	case SIMD_VERSION_SSE1:
1799 		return "sse1";
1800 	case SIMD_VERSION_SSE2:
1801 		return "sse2";
1802 	case SIMD_VERSION_SSE3:
1803 		return "sse3";
1804 	case SIMD_VERSION_SSSE3:
1805 		return "ssse3";
1806 	case SIMD_VERSION_SSE41:
1807 		return "sse41";
1808 	case SIMD_VERSION_SSE42:
1809 		return "sse42";
1810 	case SIMD_VERSION_SSE4a:
1811 		return "sse4a";
1812 	}
1813 	return "n/a";
1814 }
1815 
1816 static MonoInst*
emit_intrinsics(MonoCompile * cfg,MonoMethod * cmethod,MonoMethodSignature * fsig,MonoInst ** args,const SimdIntrinsic * intrinsics,guint32 size)1817 emit_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args, const SimdIntrinsic *intrinsics, guint32 size)
1818 {
1819 	const SimdIntrinsic *result = (const SimdIntrinsic *)mono_binary_search (cmethod->name, intrinsics, size, sizeof (SimdIntrinsic), &simd_intrinsic_compare_by_name);
1820 	if (!result) {
1821 		DEBUG (printf ("function doesn't have a simd intrinsic %s::%s/%d\n", cmethod->klass->name, cmethod->name, fsig->param_count));
1822 		return NULL;
1823 	}
1824 	if (IS_DEBUG_ON (cfg)) {
1825 		int i, max;
1826 		printf ("found call to intrinsic %s::%s/%d -> %s\n", cmethod->klass->name, cmethod->name, fsig->param_count, method_name (result->name));
1827 		max = fsig->param_count + fsig->hasthis;
1828 		for (i = 0; i < max; ++i) {
1829 			printf ("param %d:  ", i);
1830 			mono_print_ins (args [i]);
1831 		}
1832 	}
1833 	if (result->simd_version_flags && !(result->simd_version_flags & simd_supported_versions)) {
1834 		if (IS_DEBUG_ON (cfg)) {
1835 			int x;
1836 			printf ("function %s::%s/%d requires one of unsuported SIMD instruction set(s): ", cmethod->klass->name, cmethod->name, fsig->param_count);
1837 			for (x = 1; x <= SIMD_VERSION_INDEX_END; x++)
1838 				if (result->simd_version_flags & (1 << x))
1839 					printf ("%s ", simd_version_name (1 << x));
1840 
1841 			printf ("\n");
1842 		}
1843 		return NULL;
1844 	}
1845 
1846 	switch (result->simd_emit_mode) {
1847 	case SIMD_EMIT_BINARY:
1848 		return simd_intrinsic_emit_binary (result, cfg, cmethod, args);
1849 	case SIMD_EMIT_UNARY:
1850 		return simd_intrinsic_emit_unary (result, cfg, cmethod, args);
1851 	case SIMD_EMIT_SETTER:
1852 		return simd_intrinsic_emit_setter (result, cfg, cmethod, args);
1853 	case SIMD_EMIT_GETTER:
1854 		return simd_intrinsic_emit_getter (result, cfg, cmethod, args);
1855 	case SIMD_EMIT_GETTER_QWORD:
1856 		return simd_intrinsic_emit_long_getter (result, cfg, cmethod, args);
1857 	case SIMD_EMIT_CTOR:
1858 		return simd_intrinsic_emit_ctor (result, cfg, cmethod, args);
1859 	case SIMD_EMIT_CAST:
1860 		return simd_intrinsic_emit_cast (result, cfg, cmethod, args);
1861 	case SIMD_EMIT_SHUFFLE:
1862 		return simd_intrinsic_emit_shuffle (result, cfg, cmethod, args);
1863 	case SIMD_EMIT_SHIFT:
1864 		return simd_intrinsic_emit_shift (result, cfg, cmethod, args);
1865 	case SIMD_EMIT_EQUALITY:
1866 		return simd_intrinsic_emit_equality (result, cfg, cmethod, args);
1867 	case SIMD_EMIT_LOAD_ALIGNED:
1868 		return simd_intrinsic_emit_load_aligned (result, cfg, cmethod, args);
1869 	case SIMD_EMIT_STORE:
1870 		return simd_intrinsic_emit_store (result, cfg, cmethod, args);
1871 	case SIMD_EMIT_EXTRACT_MASK:
1872 		return simd_intrinsic_emit_extract_mask (result, cfg, cmethod, args);
1873 	case SIMD_EMIT_PREFETCH:
1874 		return simd_intrinsic_emit_prefetch (result, cfg, cmethod, args);
1875 	}
1876 	g_assert_not_reached ();
1877 }
1878 
1879 static int
mono_emit_vector_ldelema(MonoCompile * cfg,MonoType * array_type,MonoInst * arr,MonoInst * index,gboolean check_bounds)1880 mono_emit_vector_ldelema (MonoCompile *cfg, MonoType *array_type, MonoInst *arr, MonoInst *index, gboolean check_bounds)
1881 {
1882 	MonoInst *ins;
1883 	guint32 size;
1884 	int mult_reg, add_reg, array_reg, index_reg, index2_reg, index3_reg;
1885 
1886 	size = mono_array_element_size (mono_class_from_mono_type (array_type));
1887 	mult_reg = alloc_preg (cfg);
1888 	array_reg = arr->dreg;
1889 	index_reg = index->dreg;
1890 
1891 #if SIZEOF_VOID_P == 8
1892 	/* The array reg is 64 bits but the index reg is only 32 */
1893 	index2_reg = alloc_preg (cfg);
1894 	MONO_EMIT_NEW_UNALU (cfg, OP_SEXT_I4, index2_reg, index_reg);
1895 #else
1896 	index2_reg = index_reg;
1897 #endif
1898 	index3_reg = alloc_preg (cfg);
1899 
1900 	if (check_bounds) {
1901 		MONO_EMIT_BOUNDS_CHECK (cfg, array_reg, MonoArray, max_length, index2_reg);
1902 		MONO_EMIT_NEW_BIALU_IMM (cfg,  OP_PADD_IMM, index3_reg, index2_reg, 16 / size - 1);
1903 		MONO_EMIT_BOUNDS_CHECK (cfg, array_reg, MonoArray, max_length, index3_reg);
1904 	}
1905 
1906 	add_reg = alloc_preg (cfg);
1907 
1908 	MONO_EMIT_NEW_BIALU_IMM (cfg, OP_MUL_IMM, mult_reg, index2_reg, size);
1909 	MONO_EMIT_NEW_BIALU (cfg, OP_PADD, add_reg, array_reg, mult_reg);
1910 	NEW_BIALU_IMM (cfg, ins, OP_PADD_IMM, add_reg, add_reg, MONO_STRUCT_OFFSET (MonoArray, vector));
1911 	ins->type = STACK_PTR;
1912 	MONO_ADD_INS (cfg->cbb, ins);
1913 
1914 	return add_reg;
1915 }
1916 
1917 static MonoInst*
emit_array_extension_intrinsics(MonoCompile * cfg,MonoMethod * cmethod,MonoMethodSignature * fsig,MonoInst ** args)1918 emit_array_extension_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
1919 {
1920 	if ((!strcmp ("GetVector", cmethod->name) || !strcmp ("GetVectorAligned", cmethod->name)) && fsig->param_count == 2) {
1921 		MonoInst *load;
1922 		int addr = mono_emit_vector_ldelema (cfg, fsig->params [0], args [0], args [1], TRUE);
1923 
1924 		MONO_INST_NEW (cfg, load, !strcmp ("GetVectorAligned", cmethod->name) ? OP_LOADX_ALIGNED_MEMBASE : OP_LOADX_MEMBASE );
1925 		load->klass = cmethod->klass;
1926 		load->sreg1 = addr;
1927 		load->type = STACK_VTYPE;
1928 		load->dreg = alloc_ireg (cfg);
1929 		MONO_ADD_INS (cfg->cbb, load);
1930 
1931 		return load;
1932 	}
1933 	if ((!strcmp ("SetVector", cmethod->name) || !strcmp ("SetVectorAligned", cmethod->name)) && fsig->param_count == 3) {
1934 		MonoInst *store;
1935 		int vreg = get_simd_vreg (cfg, cmethod, args [1]);
1936 		int addr = mono_emit_vector_ldelema (cfg, fsig->params [0], args [0], args [2], TRUE);
1937 
1938 		MONO_INST_NEW (cfg, store, !strcmp ("SetVectorAligned", cmethod->name) ? OP_STOREX_ALIGNED_MEMBASE_REG :  OP_STOREX_MEMBASE);
1939 		store->klass = cmethod->klass;
1940 		store->dreg = addr;
1941 		store->sreg1 = vreg;
1942 		MONO_ADD_INS (cfg->cbb, store);
1943 
1944 		return store;
1945 	}
1946 	if (!strcmp ("IsAligned", cmethod->name) && fsig->param_count == 2) {
1947 		MonoInst *ins;
1948 		int addr = mono_emit_vector_ldelema (cfg, fsig->params [0], args [0], args [1], FALSE);
1949 
1950 		MONO_EMIT_NEW_BIALU_IMM (cfg, OP_AND_IMM, addr, addr, 15);
1951 		MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, addr, 0);
1952 		NEW_UNALU (cfg, ins, OP_CEQ, addr, -1);
1953 		MONO_ADD_INS (cfg->cbb, ins);
1954 
1955 		return ins;
1956 	}
1957 	return NULL;
1958 }
1959 
1960 static MonoInst*
emit_simd_runtime_intrinsics(MonoCompile * cfg,MonoMethod * cmethod,MonoMethodSignature * fsig,MonoInst ** args)1961 emit_simd_runtime_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
1962 {
1963 	if (!strcmp ("get_AccelMode", cmethod->name) && fsig->param_count == 0) {
1964 		MonoInst *ins;
1965 		EMIT_NEW_ICONST (cfg, ins, simd_supported_versions);
1966 		return ins;
1967 	}
1968 	return NULL;
1969 }
1970 
1971 static gboolean
is_sys_numerics_assembly(MonoAssembly * assembly)1972 is_sys_numerics_assembly (MonoAssembly *assembly)
1973 {
1974 	return !strcmp ("System.Numerics", assembly->aname.name);
1975 }
1976 
1977 static gboolean
is_sys_numerics_vectors_assembly(MonoAssembly * assembly)1978 is_sys_numerics_vectors_assembly (MonoAssembly *assembly)
1979 {
1980 	return !strcmp ("System.Numerics.Vectors", assembly->aname.name);
1981 }
1982 
1983 MonoInst*
mono_emit_simd_intrinsics(MonoCompile * cfg,MonoMethod * cmethod,MonoMethodSignature * fsig,MonoInst ** args)1984 mono_emit_simd_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
1985 {
1986 	const char *class_name;
1987 
1988 	if (is_sys_numerics_assembly (cmethod->klass->image->assembly))
1989 		return emit_sys_numerics_intrinsics (cfg, cmethod, fsig, args);
1990 
1991 	if (is_sys_numerics_vectors_assembly (cmethod->klass->image->assembly))
1992 		return emit_sys_numerics_vectors_intrinsics (cfg, cmethod, fsig, args);
1993 
1994 	if (strcmp ("Mono.Simd", cmethod->klass->image->assembly->aname.name) ||
1995 	    strcmp ("Mono.Simd", cmethod->klass->name_space))
1996 		return NULL;
1997 
1998 	class_name = cmethod->klass->name;
1999 	if (!strcmp ("SimdRuntime", class_name))
2000 		return emit_simd_runtime_intrinsics (cfg, cmethod, fsig, args);
2001 
2002 	if (!strcmp ("ArrayExtensions", class_name))
2003 		return emit_array_extension_intrinsics (cfg, cmethod, fsig, args);
2004 
2005 	if (!strcmp ("VectorOperations", class_name)) {
2006 		if (!(cmethod->flags & METHOD_ATTRIBUTE_STATIC))
2007 			return NULL;
2008 		class_name = mono_class_from_mono_type (mono_method_signature (cmethod)->params [0])->name;
2009 	} else if (!cmethod->klass->simd_type)
2010 		return NULL;
2011 
2012 	cfg->uses_simd_intrinsics = 1;
2013 	if (!strcmp ("Vector2d", class_name))
2014 		return emit_intrinsics (cfg, cmethod, fsig, args, vector2d_intrinsics, sizeof (vector2d_intrinsics) / sizeof (SimdIntrinsic));
2015 	if (!strcmp ("Vector4f", class_name))
2016 		return emit_intrinsics (cfg, cmethod, fsig, args, vector4f_intrinsics, sizeof (vector4f_intrinsics) / sizeof (SimdIntrinsic));
2017 	if (!strcmp ("Vector2ul", class_name))
2018 		return emit_intrinsics (cfg, cmethod, fsig, args, vector2ul_intrinsics, sizeof (vector2ul_intrinsics) / sizeof (SimdIntrinsic));
2019 	if (!strcmp ("Vector2l", class_name))
2020 		return emit_intrinsics (cfg, cmethod, fsig, args, vector2l_intrinsics, sizeof (vector2l_intrinsics) / sizeof (SimdIntrinsic));
2021 	if (!strcmp ("Vector4ui", class_name))
2022 		return emit_intrinsics (cfg, cmethod, fsig, args, vector4ui_intrinsics, sizeof (vector4ui_intrinsics) / sizeof (SimdIntrinsic));
2023 	if (!strcmp ("Vector4i", class_name))
2024 		return emit_intrinsics (cfg, cmethod, fsig, args, vector4i_intrinsics, sizeof (vector4i_intrinsics) / sizeof (SimdIntrinsic));
2025 	if (!strcmp ("Vector8us", class_name))
2026 		return emit_intrinsics (cfg, cmethod, fsig, args, vector8us_intrinsics, sizeof (vector8us_intrinsics) / sizeof (SimdIntrinsic));
2027 	if (!strcmp ("Vector8s", class_name))
2028 		return emit_intrinsics (cfg, cmethod, fsig, args, vector8s_intrinsics, sizeof (vector8s_intrinsics) / sizeof (SimdIntrinsic));
2029 	if (!strcmp ("Vector16b", class_name))
2030 		return emit_intrinsics (cfg, cmethod, fsig, args, vector16b_intrinsics, sizeof (vector16b_intrinsics) / sizeof (SimdIntrinsic));
2031 	if (!strcmp ("Vector16sb", class_name))
2032 		return emit_intrinsics (cfg, cmethod, fsig, args, vector16sb_intrinsics, sizeof (vector16sb_intrinsics) / sizeof (SimdIntrinsic));
2033 
2034 	return NULL;
2035 }
2036 
2037 static void
assert_handled(MonoCompile * cfg,MonoMethod * method)2038 assert_handled (MonoCompile *cfg, MonoMethod *method)
2039 {
2040 	MonoCustomAttrInfo *cattr;
2041 	MonoError error;
2042 
2043 	if (cfg->verbose_level > 1) {
2044 		cattr = mono_custom_attrs_from_method_checked (method, &error);
2045 
2046 		if (cattr) {
2047 			gboolean has_attr = FALSE;
2048 			for (int i = 0; i < cattr->num_attrs; ++i)
2049 				if (cattr->attrs [i].ctor && (!strcmp (cattr->attrs [i].ctor->klass->name, "JitIntrinsicAttribute")))
2050 					has_attr = TRUE;
2051 			if (has_attr) {
2052 				printf ("SIMD intrinsic unhandled: %s\n", mono_method_get_name_full (method, TRUE, TRUE, MONO_TYPE_NAME_FORMAT_IL));
2053 				fflush (stdout);
2054 				//g_assert_not_reached ();
2055 			}
2056 			mono_custom_attrs_free (cattr);
2057 		}
2058 	}
2059 }
2060 
// The entries should be ordered by name
// System.Numerics.Vector2/Vector3/Vector4
//
// This table is searched by method name using mono_binary_search () in
// emit_vector_intrinsics (), which is why the ordering matters.
// The first two fields of an entry are the method name id and the opcode; the
// remaining ones are presumably simd_version_flags, simd_emit_mode and flags,
// in that order (TODO confirm against the SimdIntrinsic declaration).
// Fields which are not listed are left zero initialized.
static const SimdIntrinsic vector2_intrinsics[] = {
	{ SN_ctor, OP_EXPAND_R4 },
	// Abs has no opcode, emit_vector_intrinsics () emits it as max(x, sub(0,x))
	{ SN_Abs },
	{ SN_Dot, OP_DPPS },
	{ SN_Equals, OP_COMPPS, SIMD_VERSION_SSE1, SIMD_EMIT_EQUALITY, SIMD_COMP_EQ },
	{ SN_Max, OP_MAXPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_Min, OP_MINPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_SquareRoot, OP_SQRTPS, SIMD_VERSION_SSE1, SIMD_EMIT_UNARY },
	{ SN_op_Addition, OP_ADDPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_Division, OP_DIVPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_Multiply, OP_MULPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
	{ SN_op_Subtraction, OP_SUBPS, SIMD_VERSION_SSE1, SIMD_EMIT_BINARY },
};
2076 
2077 static MonoInst*
emit_vector_intrinsics(MonoCompile * cfg,MonoMethod * cmethod,MonoMethodSignature * fsig,MonoInst ** args)2078 emit_vector_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
2079 {
2080 	const SimdIntrinsic *intrins;
2081 	MonoMethodSignature *sig = mono_method_signature (cmethod);
2082 	MonoType *type = &cmethod->klass->byval_arg;
2083 
2084 	/*
2085 	 * Vector2/3/4 are handled the same way, since the underlying SIMD type is the same (4 * r4).
2086 	 */
2087 	intrins = (const SimdIntrinsic*)mono_binary_search (cmethod->name, vector2_intrinsics, sizeof (vector2_intrinsics) / sizeof (SimdIntrinsic), sizeof (SimdIntrinsic), &simd_intrinsic_compare_by_name);
2088 	if (!intrins) {
2089 		assert_handled (cfg, cmethod);
2090 		return NULL;
2091 	}
2092 
2093 	if (cfg->verbose_level > 1) {
2094 		char *name = mono_method_full_name (cmethod, TRUE);
2095 		printf ("  SIMD intrinsic %s\n", name);
2096 		g_free (name);
2097 	}
2098 
2099 	switch (intrins->name) {
2100 	case SN_ctor: {
2101 		gboolean match = TRUE;
2102 		for (int i = 0; i < fsig->param_count; ++i)
2103 			if (fsig->params [i]->type != MONO_TYPE_R4)
2104 				match = FALSE;
2105 		if (!match)
2106 			break;
2107 		return simd_intrinsic_emit_ctor (intrins, cfg, cmethod, args);
2108 	}
2109 	case SN_Equals:
2110 		if (!(fsig->param_count == 1 && fsig->ret->type == MONO_TYPE_BOOLEAN && fsig->params [0] == type))
2111 			break;
2112 		return simd_intrinsic_emit_equality (intrins, cfg, cmethod, args);
2113 	case SN_SquareRoot:
2114 		if (!(fsig->param_count == 1 && fsig->ret == type && fsig->params [0] == type))
2115 			break;
2116 		return simd_intrinsic_emit_unary (intrins, cfg, cmethod, args);
2117 	case SN_Dot:
2118 		if (!(fsig->param_count == 2 && fsig->ret->type == MONO_TYPE_R4 && fsig->params [0] == type && fsig->params [1] == type))
2119 			break;
2120 		if (COMPILE_LLVM (cfg)) {
2121 			MonoInst *ins;
2122 
2123 			ins = simd_intrinsic_emit_binary (intrins, cfg, cmethod, args);
2124 			/* The end result is in the lowest element */
2125 			return simd_intrinsic_emit_getter_op (cfg, 0, cmethod->klass, mono_method_signature (cmethod)->ret, ins);
2126 		}
2127 		break;
2128 	case SN_Abs: {
2129 		// abs(x) = max(x, sub(0,x))
2130 		MonoInst *sub;
2131 		MonoInst *zero;
2132 
2133 		if (!(fsig->param_count == 1 && fsig->ret == type && fsig->params [0] == type))
2134 			break;
2135 
2136 		MONO_INST_NEW (cfg, zero, OP_XZERO);
2137 		zero->dreg = alloc_xreg (cfg);
2138 		zero->klass = cmethod->klass;
2139 		MONO_ADD_INS (cfg->cbb, zero);
2140 
2141 		sub = simd_intrinsic_emit_binary_op (cfg, OP_SUBPS, 0, cmethod->klass, sig->params [0], sig->params [0], zero, args [0]);
2142 		return simd_intrinsic_emit_binary_op (cfg, OP_MAXPS, 0, cmethod->klass, sig->params [0], sig->params [0], args [0], sub);
2143 	}
2144 	case SN_Max:
2145 	case SN_Min:
2146 	case SN_op_Addition:
2147 	case SN_op_Division:
2148 	case SN_op_Multiply:
2149 	case SN_op_Subtraction:
2150 		if (!(fsig->param_count == 2 && fsig->ret == type && (fsig->params [0] == type || fsig->params [0]->type == MONO_TYPE_R4) && (fsig->params [1] == type || fsig->params [1]->type == MONO_TYPE_R4)))
2151 			break;
2152 		return simd_intrinsic_emit_binary (intrins, cfg, cmethod, args);
2153 	default:
2154 		break;
2155 	}
2156 
2157 	assert_handled (cfg, cmethod);
2158 
2159 	if (cfg->verbose_level > 1) {
2160 		char *name = mono_method_full_name (cmethod, TRUE);
2161 		printf ("  SIMD method %s not handled.\n", name);
2162 		g_free (name);
2163 	}
2164 	return NULL;
2165 }
2166 
2167 static MonoInst*
emit_vector_is_hardware_accelerated_intrinsic(MonoCompile * cfg)2168 emit_vector_is_hardware_accelerated_intrinsic (MonoCompile *cfg)
2169 {
2170 	MonoInst *ins;
2171 
2172 	if (simd_supported_versions)
2173 		EMIT_NEW_ICONST (cfg, ins, 1);
2174 	else
2175 		EMIT_NEW_ICONST (cfg, ins, 0);
2176 	ins->type = STACK_I4;
2177 	return ins;
2178 }
2179 
/*
 * These should be ordered by name.
 *
 * The table is searched by method name using mono_binary_search () in
 * emit_vector_t_intrinsics (), which is why the ordering matters.
 * Most entries give only the method name id, only get_AllOnes and get_Zero
 * carry an opcode. Fields which are not listed are left zero initialized.
 */
static const SimdIntrinsic vector_t_intrinsics[] = {
	{ SN_ctor },
	{ SN_Abs },
	{ SN_CopyTo },
	{ SN_Equals },
	{ SN_GreaterThan },
	{ SN_GreaterThanOrEqual },
	{ SN_LessThan },
	{ SN_LessThanOrEqual },
	{ SN_Max },
	{ SN_Min },
	{ SN_get_AllOnes, OP_XONES },
	{ SN_get_Count },
	{ SN_get_Item },
	{ SN_get_Zero, OP_XZERO },
	{ SN_op_Addition },
	{ SN_op_BitwiseAnd },
	{ SN_op_BitwiseOr },
	{ SN_op_Division },
	{ SN_op_ExclusiveOr },
	{ SN_op_Explicit },
	{ SN_op_Multiply },
	{ SN_op_Subtraction }
};
2205 
2206 static MonoInst*
emit_vector_t_intrinsics(MonoCompile * cfg,MonoMethod * cmethod,MonoMethodSignature * fsig,MonoInst ** args)2207 emit_vector_t_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
2208 {
2209 	const SimdIntrinsic *intrins;
2210 	MonoType *type, *etype;
2211 	MonoInst *ins;
2212 	int size, len, index;
2213 
2214 	intrins = (const SimdIntrinsic*)mono_binary_search (cmethod->name, vector_t_intrinsics, sizeof (vector_t_intrinsics) / sizeof (SimdIntrinsic), sizeof (SimdIntrinsic), &simd_intrinsic_compare_by_name);
2215 	if (!intrins) {
2216 		assert_handled (cfg, cmethod);
2217 		return NULL;
2218 	}
2219 
2220 	type = &cmethod->klass->byval_arg;
2221 	etype = mono_class_get_context (cmethod->klass)->class_inst->type_argv [0];
2222 	size = mono_class_value_size (mono_class_from_mono_type (etype), NULL);
2223 	g_assert (size);
2224 	len = 16 / size;
2225 
2226 	if (!MONO_TYPE_IS_PRIMITIVE (etype))
2227 		return NULL;
2228 
2229 	if (cfg->verbose_level > 1) {
2230 		char *name = mono_method_full_name (cmethod, TRUE);
2231 		printf ("  SIMD intrinsic %s\n", name);
2232 		g_free (name);
2233 	}
2234 
2235 	switch (intrins->name) {
2236 	case SN_get_Count:
2237 		if (!(fsig->param_count == 0 && fsig->ret->type == MONO_TYPE_I4))
2238 			break;
2239 		EMIT_NEW_ICONST (cfg, ins, len);
2240 		return ins;
2241 	case SN_get_AllOnes:
2242 	case SN_get_Zero:
2243 		if (!(fsig->param_count == 0 && mono_metadata_type_equal (fsig->ret, type)))
2244 			break;
2245 		return simd_intrinsic_emit_const (intrins, cfg, cmethod, args);
2246 	case SN_get_Item:
2247 		g_assert (fsig->param_count == 1);
2248 		if (args [1]->opcode != OP_ICONST)
2249 			return NULL;
2250 		index = args [1]->inst_c0;
2251 		if (index < 0 || index >= len)
2252 			return NULL;
2253 		return simd_intrinsic_emit_getter_op (cfg, index, cmethod->klass, etype, args [0]);
2254 	case SN_ctor:
2255 		if (fsig->param_count == 1 && mono_metadata_type_equal (fsig->params [0], etype))
2256 			return simd_intrinsic_emit_ctor (NULL, cfg, cmethod, args);
2257 		if ((fsig->param_count == 1 || fsig->param_count == 2) && (fsig->params [0]->type == MONO_TYPE_SZARRAY)) {
2258 			MonoInst *array_ins = args [1];
2259 			MonoInst *index_ins;
2260 			MonoInst *ldelema_ins;
2261 			MonoInst *var;
2262 			int end_index_reg;
2263 
2264 			if (args [0]->opcode != OP_LDADDR)
2265 				return NULL;
2266 
2267 			/* .ctor (T[]) or .ctor (T[], index) */
2268 
2269 			if (fsig->param_count == 2) {
2270 				index_ins = args [2];
2271 			} else {
2272 				EMIT_NEW_ICONST (cfg, index_ins, 0);
2273 			}
2274 
2275 			/* Emit index check for the end (index + len - 1 < array length) */
2276 			end_index_reg = alloc_ireg (cfg);
2277 			EMIT_NEW_BIALU_IMM (cfg, ins, OP_IADD_IMM, end_index_reg, index_ins->dreg, len - 1);
2278 			MONO_EMIT_BOUNDS_CHECK (cfg, array_ins->dreg, MonoArray, max_length, end_index_reg);
2279 
2280 			/* Load the array slice into the simd reg */
2281 			ldelema_ins = mini_emit_ldelema_1_ins (cfg, mono_class_from_mono_type (etype), array_ins, index_ins, TRUE);
2282 			g_assert (args [0]->opcode == OP_LDADDR);
2283 			var = args [0]->inst_p0;
2284 			EMIT_NEW_LOAD_MEMBASE (cfg, ins, OP_LOADX_MEMBASE, var->dreg, ldelema_ins->dreg, 0);
2285 			ins->klass = cmethod->klass;
2286 			return args [0];
2287 		}
2288 		break;
2289 	case SN_op_Explicit:
2290 		return simd_intrinsic_emit_cast (intrins, cfg, cmethod, args);
2291 	case SN_Equals:
2292 		if (fsig->param_count == 1 && fsig->ret->type == MONO_TYPE_BOOLEAN && mono_metadata_type_equal (fsig->params [0], type))
2293 			return simd_intrinsic_emit_equality_op (cfg, cmethod, args, type_to_comp_op (etype), SIMD_COMP_EQ);
2294 		if (fsig->param_count == 2 && mono_metadata_type_equal (fsig->ret, type) && mono_metadata_type_equal (fsig->params [0], type) && mono_metadata_type_equal (fsig->params [1], type))
2295 			return simd_intrinsic_emit_binary_op (cfg, type_to_comp_op (etype), 0, cmethod->klass, fsig->params [0], fsig->params [1], args [0], args [1]);
2296 		break;
2297 
2298 	case SN_GreaterThan:
2299 	case SN_GreaterThanOrEqual:
2300 	case SN_LessThan:
2301 	case SN_LessThanOrEqual: {
2302 		MonoInst *cmp1, *cmp2;
2303 		int eq_op, gt_op;
2304 
2305 		switch (etype->type) {
2306 		case MONO_TYPE_I1:
2307 		case MONO_TYPE_I2:
2308 		case MONO_TYPE_I4:
2309 		case MONO_TYPE_I8:
2310 			break;
2311 		default:
2312 			return NULL;
2313 		}
2314 
2315 		eq_op = type_to_comp_op (etype);
2316 		gt_op = type_to_gt_op (etype);
2317 
2318 		switch (intrins->name) {
2319 		case SN_GreaterThan:
2320 			return simd_intrinsic_emit_binary_op (cfg, gt_op, 0, cmethod->klass, fsig->params [0], fsig->params [1], args [0], args [1]);
2321 		case SN_LessThan:
2322 			return simd_intrinsic_emit_binary_op (cfg, gt_op, 0, cmethod->klass, fsig->params [0], fsig->params [1], args [1], args [0]);
2323 		case SN_LessThanOrEqual:
2324 			cmp1 = simd_intrinsic_emit_binary_op (cfg, eq_op, 0, cmethod->klass, fsig->params [0], fsig->params [1], args [1], args [0]);
2325 			cmp2 = simd_intrinsic_emit_binary_op (cfg, gt_op, 0, cmethod->klass, fsig->params [0], fsig->params [1], args [1], args [0]);
2326 			return simd_intrinsic_emit_binary_op (cfg, OP_POR, 0, cmethod->klass, fsig->params [0], fsig->params [1], cmp1, cmp2);
2327 		case SN_GreaterThanOrEqual:
2328 			cmp1 = simd_intrinsic_emit_binary_op (cfg, eq_op, 0, cmethod->klass, fsig->params [0], fsig->params [1], args [0], args [1]);
2329 			cmp2 = simd_intrinsic_emit_binary_op (cfg, gt_op, 0, cmethod->klass, fsig->params [0], fsig->params [1], args [0], args [1]);
2330 			return simd_intrinsic_emit_binary_op (cfg, OP_POR, 0, cmethod->klass, fsig->params [0], fsig->params [1], cmp1, cmp2);
2331 		default:
2332 			g_assert_not_reached ();
2333 			break;
2334 		}
2335 	}
2336 	case SN_Abs:
2337 		/* Vector<T>.Abs */
2338 		switch (etype->type) {
2339 		case MONO_TYPE_U1:
2340 		case MONO_TYPE_U2:
2341 		case MONO_TYPE_U4:
2342 		case MONO_TYPE_U8: {
2343 			MonoInst *ins;
2344 
2345 			/* No-op */
2346 			MONO_INST_NEW (cfg, ins, OP_XMOVE);
2347 			ins->klass = cmethod->klass;
2348 			ins->type = STACK_VTYPE;
2349 			ins->sreg1 = args [0]->dreg;
2350 			ins->dreg = alloc_xreg (cfg);
2351 			MONO_ADD_INS (cfg->cbb, ins);
2352 			return ins;
2353 		}
2354 		default:
2355 			break;
2356 		}
2357 		break;
2358 	case SN_op_Addition:
2359 	case SN_op_Subtraction:
2360 	case SN_op_Multiply:
2361 	case SN_op_Division:
2362 	case SN_op_ExclusiveOr:
2363 	case SN_op_BitwiseAnd:
2364 	case SN_op_BitwiseOr:
2365 	case SN_Max:
2366 	case SN_Min: {
2367 		if (!(fsig->param_count == 2 && mono_metadata_type_equal (fsig->ret, fsig->params [0]) && mono_metadata_type_equal (fsig->params [0], fsig->params [1])))
2368 			break;
2369 		int op = 0;
2370 		switch (intrins->name) {
2371 		case SN_op_Addition:
2372 			op = type_to_padd_op (etype);
2373 			break;
2374 		case SN_op_Subtraction:
2375 			op = type_to_psub_op (etype);
2376 			break;
2377 		case SN_op_Multiply:
2378 			op = type_to_pmul_op (etype);
2379 			break;
2380 		case SN_op_Division:
2381 			op = type_to_pdiv_op (etype);
2382 			break;
2383 		case SN_op_ExclusiveOr:
2384 			op = type_to_pxor_op (etype);
2385 			break;
2386 		case SN_op_BitwiseAnd:
2387 			op = type_to_pand_op (etype);
2388 			break;
2389 		case SN_op_BitwiseOr:
2390 			op = type_to_por_op (etype);
2391 			break;
2392 		case SN_Min:
2393 			op = type_to_pmin_op (etype);
2394 			break;
2395 		case SN_Max:
2396 			op = type_to_pmax_op (etype);
2397 			break;
2398 		default:
2399 			g_assert_not_reached ();
2400 		}
2401 		if (op != -1)
2402 			return simd_intrinsic_emit_binary_op (cfg, op, 0, cmethod->klass, fsig->params [0], fsig->params [0], args [0], args [1]);
2403 		break;
2404 	}
2405 	case SN_CopyTo: {
2406 		MonoInst *array_ins = args [1];
2407 		MonoInst *index_ins = args [2];
2408 		MonoInst *ldelema_ins;
2409 		MonoInst *var;
2410 		int end_index_reg;
2411 
2412 		if (args [0]->opcode != OP_LDADDR)
2413 			return NULL;
2414 
2415 		/* Emit index check for the end (index + len - 1 < array length) */
2416 		end_index_reg = alloc_ireg (cfg);
2417 		EMIT_NEW_BIALU_IMM (cfg, ins, OP_IADD_IMM, end_index_reg, index_ins->dreg, len - 1);
2418 
2419 		int length_reg = alloc_ireg (cfg);
2420 		MONO_EMIT_NEW_LOAD_MEMBASE_OP_FAULT (cfg, OP_LOADI4_MEMBASE, length_reg, array_ins->dreg, MONO_STRUCT_OFFSET (MonoArray, max_length));
2421 		MONO_EMIT_NEW_BIALU (cfg, OP_COMPARE, -1, length_reg, end_index_reg);
2422 		MONO_EMIT_NEW_COND_EXC (cfg, LE_UN, "ArgumentException");
2423 
2424 		/* Load the simd reg into the array slice */
2425 		ldelema_ins = mini_emit_ldelema_1_ins (cfg, mono_class_from_mono_type (etype), array_ins, index_ins, TRUE);
2426 		g_assert (args [0]->opcode == OP_LDADDR);
2427 		var = args [0]->inst_p0;
2428 		EMIT_NEW_STORE_MEMBASE (cfg, ins, OP_STOREX_MEMBASE, ldelema_ins->dreg, 0, var->dreg);
2429 		ins->klass = cmethod->klass;
2430 		return args [0];
2431 		break;
2432 	}
2433 	default:
2434 		break;
2435 	}
2436 
2437 	assert_handled (cfg, cmethod);
2438 
2439 	if (cfg->verbose_level > 1) {
2440 		char *name = mono_method_full_name (cmethod, TRUE);
2441 		printf ("  SIMD method %s not handled.\n", name);
2442 		g_free (name);
2443 	}
2444 
2445 	return NULL;
2446 }
2447 
2448 /*
2449  * emit_sys_numerics_intrinsics:
2450  *
2451  *   Emit intrinsics for the System.Numerics assembly.
2452  */
2453 static MonoInst*
emit_sys_numerics_intrinsics(MonoCompile * cfg,MonoMethod * cmethod,MonoMethodSignature * fsig,MonoInst ** args)2454 emit_sys_numerics_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
2455 {
2456 	const char *nspace = cmethod->klass->name_space;
2457 	const char *class_name = cmethod->klass->name;
2458 
2459 	if (!strcmp ("Vector2", class_name) || !strcmp ("Vector4", class_name) || !strcmp ("Vector3", class_name))
2460 		return emit_vector_intrinsics (cfg, cmethod, fsig, args);
2461 
2462 	if (!strcmp ("System.Numerics", nspace) && !strcmp ("Vector", class_name)) {
2463 		if (!strcmp (cmethod->name, "get_IsHardwareAccelerated"))
2464 			return emit_vector_is_hardware_accelerated_intrinsic (cfg);
2465 	}
2466 
2467 	return NULL;
2468 }
2469 
2470 static MonoInst*
emit_sys_numerics_vectors_intrinsics(MonoCompile * cfg,MonoMethod * cmethod,MonoMethodSignature * fsig,MonoInst ** args)2471 emit_sys_numerics_vectors_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args)
2472 {
2473 	const char *nspace = cmethod->klass->name_space;
2474 	const char *class_name = cmethod->klass->name;
2475 
2476 	if (!strcmp (class_name, "Vector`1"))
2477 		return emit_vector_t_intrinsics (cfg, cmethod, fsig, args);
2478 
2479 	if (!strcmp ("System.Numerics", nspace) && !strcmp ("Vector", class_name)) {
2480 		if (!strcmp (cmethod->name, "get_IsHardwareAccelerated"))
2481 			return emit_vector_is_hardware_accelerated_intrinsic (cfg);
2482 	}
2483 
2484 	return NULL;
2485 }
2486 
2487 MonoInst*
mono_emit_simd_field_load(MonoCompile * cfg,MonoClassField * field,MonoInst * addr)2488 mono_emit_simd_field_load (MonoCompile *cfg, MonoClassField *field, MonoInst *addr)
2489 {
2490 	if (is_sys_numerics_assembly (field->parent->image->assembly)) {
2491 		int index = -1;
2492 
2493 		if (!strcmp (field->parent->name, "Vector2") ||
2494 			!strcmp (field->parent->name, "Vector3") ||
2495 			!strcmp (field->parent->name, "Vector4")) {
2496 			if (!strcmp (field->name, "X"))
2497 				index = 0;
2498 			else if (!strcmp (field->name, "Y"))
2499 				index = 1;
2500 			else if (!strcmp (field->name, "Z"))
2501 				index = 2;
2502 			else if (!strcmp (field->name, "W"))
2503 				index = 3;
2504 		}
2505 
2506 		if (index != -1) {
2507 			if (cfg->verbose_level > 1)
2508 				printf ("  SIMD intrinsic field access: %s\n", field->name);
2509 
2510 			return simd_intrinsic_emit_getter_op (cfg, index, field->parent, mono_field_get_type (field), addr);
2511 		}
2512 	}
2513 	return NULL;
2514 }
2515 
2516 #endif /* DISABLE_JIT */
2517 
2518 #else
2519 
/*
 * mono_emit_simd_field_load:
 *
 *   Version compiled in the #else branch closed by the
 * MONO_ARCH_SIMD_INTRINSICS #endif below: nothing is emitted and NULL is
 * always returned, CFG, FIELD and ADDR are ignored.
 */
MonoInst*
mono_emit_simd_field_load (MonoCompile *cfg, MonoClassField *field, MonoInst *addr)
{
	return NULL;
}
2525 
2526 #endif /* MONO_ARCH_SIMD_INTRINSICS */
2527