1; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
2; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
3
; Reverses the 16 bytes of the vector loaded from %a (mask 15..0, second
; shuffle operand undef) and stores the result to %c. %b is unused.
; Expected code: a general vshf.b whose control vector is loaded through an
; address built with addiu/%lo (presumably a constant-pool entry; the symbol
; name is not pinned down) and whose two data operands are both the register
; loaded from 0($5).
define void @vshf_v16i8_0(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
  ; CHECK-LABEL: vshf_v16i8_0:

  %1 = load <16 x i8>* %a
  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
  %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  ; CHECK-DAG: addiu [[PTR_A:\$[0-9]+]], {{.*}}, %lo($
  ; CHECK-DAG: ld.b [[R3:\$w[0-9]+]], 0([[PTR_A]])
  ; CHECK-DAG: vshf.b [[R3]], [[R1]], [[R1]]
  store <16 x i8> %2, <16 x i8>* %c
  ; CHECK-DAG: st.b [[R3]], 0($4)

  ret void
  ; CHECK: .size vshf_v16i8_0
}
19
; Broadcasts byte element 1 of the vector loaded from %a into all 16 lanes
; (every mask index is 1) and stores the result to %c. %b is unused.
; Expected code: splati.b with index 1 rather than a general vshf.b.
define void @vshf_v16i8_1(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
  ; CHECK-LABEL: vshf_v16i8_1:

  %1 = load <16 x i8>* %a
  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
  %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ; CHECK-DAG: splati.b [[R3:\$w[0-9]+]], [[R1]][1]
  store <16 x i8> %2, <16 x i8>* %c
  ; CHECK-DAG: st.b [[R3]], 0($4)

  ret void
  ; CHECK: .size vshf_v16i8_1
}
33
; Shuffle whose mask only names elements of the second operand (indices
; 16..30 followed by 16 again), so the result depends on %b alone; the result
; is stored to %c.
; Expected code: vshf.b with both data operands being the register loaded
; from 0($6), and the control vector loaded through an addiu/%lo address
; (presumably a constant-pool entry). No directive is placed on the load of
; %a because no mask index selects from it.
define void @vshf_v16i8_2(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
  ; CHECK-LABEL: vshf_v16i8_2:

  %1 = load <16 x i8>* %a
  %2 = load <16 x i8>* %b
  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
  %3 = shufflevector <16 x i8> %1, <16 x i8> %2, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 16>
  ; CHECK-DAG: addiu [[PTR_A:\$[0-9]+]], {{.*}}, %lo($
  ; CHECK-DAG: ld.b [[R3:\$w[0-9]+]], 0([[PTR_A]])
  ; CHECK-DAG: vshf.b [[R3]], [[R2]], [[R2]]
  store <16 x i8> %3, <16 x i8>* %c
  ; CHECK-DAG: st.b [[R3]], 0($4)

  ret void
  ; CHECK: .size vshf_v16i8_2
}
50
; Irregular two-input byte shuffle: the first eight mask indices select from
; %b (values 16 and above) and the last eight from %a; the result is stored
; to %c.
; Expected code: a general vshf.b fed by both loaded registers, with the
; control vector loaded through an addiu/%lo address (presumably a
; constant-pool entry).
define void @vshf_v16i8_3(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
  ; CHECK-LABEL: vshf_v16i8_3:

  %1 = load <16 x i8>* %a
  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
  %2 = load <16 x i8>* %b
  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
  %3 = shufflevector <16 x i8> %1, <16 x i8> %2, <16 x i32> <i32 17, i32 24, i32 25, i32 18, i32 19, i32 20, i32 28, i32 19, i32 1, i32 8, i32 9, i32 2, i32 3, i32 4, i32 12, i32 3>
  ; CHECK-DAG: addiu [[PTR_A:\$[0-9]+]], {{.*}}, %lo($
  ; CHECK-DAG: ld.b [[R3:\$w[0-9]+]], 0([[PTR_A]])
  ; The concatenation step of vshf is bitwise not vectorwise so we must reverse
  ; the operands to get the right answer.
  ; CHECK-DAG: vshf.b [[R3]], [[R2]], [[R1]]
  store <16 x i8> %3, <16 x i8>* %c
  ; CHECK-DAG: st.b [[R3]], 0($4)

  ret void
  ; CHECK: .size vshf_v16i8_3
}
70
; Shuffle with the same vector (loaded from %a) as both operands and a mask
; alternating between 1 and 17. Both indices name element 1 of that vector,
; so the result is a broadcast of element 1; it is stored to %c.
; Expected code: splati.b with index 1. %b is unused.
define void @vshf_v16i8_4(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
  ; CHECK-LABEL: vshf_v16i8_4:

  %1 = load <16 x i8>* %a
  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
  %2 = shufflevector <16 x i8> %1, <16 x i8> %1, <16 x i32> <i32 1, i32 17, i32 1, i32 17, i32 1, i32 17, i32 1, i32 17, i32 1, i32 17, i32 1, i32 17, i32 1, i32 17, i32 1, i32 17>
  ; CHECK-DAG: splati.b [[R3:\$w[0-9]+]], [[R1]][1]
  store <16 x i8> %2, <16 x i8>* %c
  ; CHECK-DAG: st.b [[R3]], 0($4)

  ret void
  ; CHECK: .size vshf_v16i8_4
}
84
; Reverses the 8 halfwords of the vector loaded from %a (mask 7..0, second
; shuffle operand undef) and stores the result to %c. %b is unused.
; Expected code: a general vshf.h whose control vector is loaded through an
; address built with addiu/%lo (presumably a constant-pool entry) and whose
; two data operands are both the register loaded from 0($5).
define void @vshf_v8i16_0(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
  ; CHECK-LABEL: vshf_v8i16_0:

  %1 = load <8 x i16>* %a
  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
  %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  ; CHECK-DAG: addiu [[PTR_A:\$[0-9]+]], {{.*}}, %lo($
  ; CHECK-DAG: ld.h [[R3:\$w[0-9]+]], 0([[PTR_A]])
  ; CHECK-DAG: vshf.h [[R3]], [[R1]], [[R1]]
  store <8 x i16> %2, <8 x i16>* %c
  ; CHECK-DAG: st.h [[R3]], 0($4)

  ret void
  ; CHECK: .size vshf_v8i16_0
}
100
; Broadcasts halfword element 1 of the vector loaded from %a into all 8 lanes
; (every mask index is 1) and stores the result to %c. %b is unused.
; Expected code: splati.h with index 1 rather than a general vshf.h.
define void @vshf_v8i16_1(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
  ; CHECK-LABEL: vshf_v8i16_1:

  %1 = load <8 x i16>* %a
  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
  %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ; CHECK-DAG: splati.h [[R3:\$w[0-9]+]], [[R1]][1]
  store <8 x i16> %2, <8 x i16>* %c
  ; CHECK-DAG: st.h [[R3]], 0($4)

  ret void
  ; CHECK: .size vshf_v8i16_1
}
114
; Shuffle whose mask only names elements of the second operand (indices
; 8..14 followed by 8 again), so the result depends on %b alone; the result
; is stored to %c.
; Expected code: vshf.h with both data operands being the register loaded
; from 0($6), and the control vector loaded through an addiu/%lo address
; (presumably a constant-pool entry). No directive is placed on the load of
; %a because no mask index selects from it.
define void @vshf_v8i16_2(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
  ; CHECK-LABEL: vshf_v8i16_2:

  %1 = load <8 x i16>* %a
  %2 = load <8 x i16>* %b
  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
  %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 8>
  ; CHECK-DAG: addiu [[PTR_A:\$[0-9]+]], {{.*}}, %lo($
  ; CHECK-DAG: ld.h [[R3:\$w[0-9]+]], 0([[PTR_A]])
  ; CHECK-DAG: vshf.h [[R3]], [[R2]], [[R2]]
  store <8 x i16> %3, <8 x i16>* %c
  ; CHECK-DAG: st.h [[R3]], 0($4)

  ret void
  ; CHECK: .size vshf_v8i16_2
}
131
; Irregular two-input halfword shuffle mixing elements of %a (indices below
; 8) and %b (indices 8 and above); the result is stored to %c.
; Expected code: a general vshf.h fed by both loaded registers, with the
; control vector loaded through an addiu/%lo address (presumably a
; constant-pool entry).
define void @vshf_v8i16_3(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
  ; CHECK-LABEL: vshf_v8i16_3:

  %1 = load <8 x i16>* %a
  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
  %2 = load <8 x i16>* %b
  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
  %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 1, i32 8, i32 9, i32 2, i32 3, i32 4, i32 12, i32 3>
  ; CHECK-DAG: addiu [[PTR_A:\$[0-9]+]], {{.*}}, %lo($
  ; CHECK-DAG: ld.h [[R3:\$w[0-9]+]], 0([[PTR_A]])
  ; The concatenation step of vshf is bitwise not vectorwise so we must reverse
  ; the operands to get the right answer.
  ; CHECK-DAG: vshf.h [[R3]], [[R2]], [[R1]]
  store <8 x i16> %3, <8 x i16>* %c
  ; CHECK-DAG: st.h [[R3]], 0($4)

  ret void
  ; CHECK: .size vshf_v8i16_3
}
151
; Shuffle with the same vector (loaded from %a) as both operands and a mask
; alternating between 1 and 9. Both indices name element 1 of that vector,
; so the result is a broadcast of element 1; it is stored to %c.
; Expected code: splati.h with index 1. %b is unused.
define void @vshf_v8i16_4(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
  ; CHECK-LABEL: vshf_v8i16_4:

  %1 = load <8 x i16>* %a
  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
  %2 = shufflevector <8 x i16> %1, <8 x i16> %1, <8 x i32> <i32 1, i32 9, i32 1, i32 9, i32 1, i32 9, i32 1, i32 9>
  ; CHECK-DAG: splati.h [[R3:\$w[0-9]+]], [[R1]][1]
  store <8 x i16> %2, <8 x i16>* %c
  ; CHECK-DAG: st.h [[R3]], 0($4)

  ret void
  ; CHECK: .size vshf_v8i16_4
}
165
166; Note: v4i32 only has one 4-element set so it's impossible to get a vshf.w
167; instruction when using a single vector.
168
; Reverses the 4 words of the vector loaded from %a (mask 3,2,1,0, second
; shuffle operand undef) and stores the result to %c. %b is unused.
; Expected code: shf.w with immediate 27, i.e. the mask's four 2-bit
; selectors packed with the first element lowest: 3 + 2*4 + 1*16 + 0*64.
define void @vshf_v4i32_0(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
  ; CHECK-LABEL: vshf_v4i32_0:

  %1 = load <4 x i32>* %a
  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ; CHECK-DAG: shf.w [[R3:\$w[0-9]+]], [[R1]], 27
  store <4 x i32> %2, <4 x i32>* %c
  ; CHECK-DAG: st.w [[R3]], 0($4)

  ret void
  ; CHECK: .size vshf_v4i32_0
}
182
; Broadcasts word element 1 of the vector loaded from %a into all 4 lanes
; (every mask index is 1) and stores the result to %c. %b is unused.
; Expected code: shf.w with immediate 85 (selector 1 in each 2-bit field:
; 1 + 1*4 + 1*16 + 1*64) rather than a splati.w.
define void @vshf_v4i32_1(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
  ; CHECK-LABEL: vshf_v4i32_1:

  %1 = load <4 x i32>* %a
  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  ; CHECK-DAG: shf.w [[R3:\$w[0-9]+]], [[R1]], 85
  store <4 x i32> %2, <4 x i32>* %c
  ; CHECK-DAG: st.w [[R3]], 0($4)

  ret void
  ; CHECK: .size vshf_v4i32_1
}
196
; Shuffle whose mask only names elements of the second operand (4,5,6,4,
; i.e. elements 0,1,2,0 of the vector loaded from %b); the result is stored
; to %c.
; Expected code: shf.w on the register loaded from 0($6) with immediate 36
; (0 + 1*4 + 2*16 + 0*64). No directive is placed on the load of %a because
; no mask index selects from it.
define void @vshf_v4i32_2(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
  ; CHECK-LABEL: vshf_v4i32_2:

  %1 = load <4 x i32>* %a
  %2 = load <4 x i32>* %b
  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
  %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> <i32 4, i32 5, i32 6, i32 4>
  ; CHECK-DAG: shf.w [[R3:\$w[0-9]+]], [[R2]], 36
  store <4 x i32> %3, <4 x i32>* %c
  ; CHECK-DAG: st.w [[R3]], 0($4)

  ret void
  ; CHECK: .size vshf_v4i32_2
}
211
; Two-input word shuffle taking element 1 of %a followed by elements 1, 2
; and 0 of %b (mask 1,5,6,4); the result is stored to %c.
; Expected code: a general vshf.w fed by both loaded registers, with the
; control vector loaded through an addiu/%lo address (presumably a
; constant-pool entry).
define void @vshf_v4i32_3(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
  ; CHECK-LABEL: vshf_v4i32_3:

  %1 = load <4 x i32>* %a
  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
  %2 = load <4 x i32>* %b
  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
  %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> <i32 1, i32 5, i32 6, i32 4>
  ; CHECK-DAG: addiu [[PTR_A:\$[0-9]+]], {{.*}}, %lo($
  ; CHECK-DAG: ld.w [[R3:\$w[0-9]+]], 0([[PTR_A]])
  ; The concatenation step of vshf is bitwise not vectorwise so we must reverse
  ; the operands to get the right answer.
  ; CHECK-DAG: vshf.w [[R3]], [[R2]], [[R1]]
  store <4 x i32> %3, <4 x i32>* %c
  ; CHECK-DAG: st.w [[R3]], 0($4)

  ret void
  ; CHECK: .size vshf_v4i32_3
}
231
; Shuffle with the same vector (loaded from %a) as both operands and mask
; 1,5,5,1. Indices 1 and 5 both name element 1 of that vector, so the result
; is a broadcast of element 1; it is stored to %c.
; Expected code: shf.w with immediate 85 (selector 1 in every 2-bit field).
; %b is unused.
define void @vshf_v4i32_4(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
  ; CHECK-LABEL: vshf_v4i32_4:

  %1 = load <4 x i32>* %a
  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
  %2 = shufflevector <4 x i32> %1, <4 x i32> %1, <4 x i32> <i32 1, i32 5, i32 5, i32 1>
  ; CHECK-DAG: shf.w [[R3:\$w[0-9]+]], [[R1]], 85
  store <4 x i32> %2, <4 x i32>* %c
  ; CHECK-DAG: st.w [[R3]], 0($4)

  ret void
  ; CHECK: .size vshf_v4i32_4
}
245
; Swaps the two doublewords of the vector loaded from %a (mask 1,0, second
; shuffle operand undef) and stores the result to %c. %b is unused.
; Expected code: a general vshf.d whose control vector is loaded through an
; address built with addiu/%lo (presumably a constant-pool entry) and whose
; two data operands are both the register loaded from 0($5).
define void @vshf_v2i64_0(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
  ; CHECK-LABEL: vshf_v2i64_0:

  %1 = load <2 x i64>* %a
  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
  %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
  ; CHECK-DAG: addiu [[PTR_A:\$[0-9]+]], {{.*}}, %lo($
  ; CHECK-DAG: ld.d [[R3:\$w[0-9]+]], 0([[PTR_A]])
  ; CHECK-DAG: vshf.d [[R3]], [[R1]], [[R1]]
  store <2 x i64> %2, <2 x i64>* %c
  ; CHECK-DAG: st.d [[R3]], 0($4)

  ret void
  ; CHECK: .size vshf_v2i64_0
}
261
; Broadcasts doubleword element 1 of the vector loaded from %a into both
; lanes (mask 1,1) and stores the result to %c. %b is unused.
; Expected code: splati.d with index 1 rather than a general vshf.d.
define void @vshf_v2i64_1(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
  ; CHECK-LABEL: vshf_v2i64_1:

  %1 = load <2 x i64>* %a
  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
  %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
  ; CHECK-DAG: splati.d [[R3:\$w[0-9]+]], [[R1]][1]
  store <2 x i64> %2, <2 x i64>* %c
  ; CHECK-DAG: st.d [[R3]], 0($4)

  ret void
  ; CHECK: .size vshf_v2i64_1
}
275
; Shuffle whose mask only names elements of the second operand (3,2, i.e.
; elements 1,0 of the vector loaded from %b); the result is stored to %c.
; Expected code: vshf.d with both data operands being the register loaded
; from 0($6), and the control vector loaded through an addiu/%lo address
; (presumably a constant-pool entry). No directive is placed on the load of
; %a because no mask index selects from it.
define void @vshf_v2i64_2(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
  ; CHECK-LABEL: vshf_v2i64_2:

  %1 = load <2 x i64>* %a
  %2 = load <2 x i64>* %b
  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
  %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> <i32 3, i32 2>
  ; CHECK-DAG: addiu [[PTR_A:\$[0-9]+]], {{.*}}, %lo($
  ; CHECK-DAG: ld.d [[R3:\$w[0-9]+]], 0([[PTR_A]])
  ; CHECK-DAG: vshf.d [[R3]], [[R2]], [[R2]]
  store <2 x i64> %3, <2 x i64>* %c
  ; CHECK-DAG: st.d [[R3]], 0($4)

  ret void
  ; CHECK: .size vshf_v2i64_2
}
292
; Two-input doubleword shuffle taking element 1 of %a and element 0 of %b
; (mask 1,2); the result is stored to %c.
; Expected code: a general vshf.d fed by both loaded registers, with the
; control vector loaded through an addiu/%lo address (presumably a
; constant-pool entry).
define void @vshf_v2i64_3(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
  ; CHECK-LABEL: vshf_v2i64_3:

  %1 = load <2 x i64>* %a
  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
  %2 = load <2 x i64>* %b
  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
  %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> <i32 1, i32 2>
  ; CHECK-DAG: addiu [[PTR_A:\$[0-9]+]], {{.*}}, %lo($
  ; CHECK-DAG: ld.d [[R3:\$w[0-9]+]], 0([[PTR_A]])
  ; The concatenation step of vshf is bitwise not vectorwise so we must reverse
  ; the operands to get the right answer.
  ; CHECK-DAG: vshf.d [[R3]], [[R2]], [[R1]]
  store <2 x i64> %3, <2 x i64>* %c
  ; CHECK-DAG: st.d [[R3]], 0($4)

  ret void
  ; CHECK: .size vshf_v2i64_3
}
312
; Shuffle with the same vector (loaded from %a) as both operands and mask
; 1,3. Both indices name element 1 of that vector, so the result is a
; broadcast of element 1; it is stored to %c.
; Expected code: splati.d with index 1. %b is unused.
define void @vshf_v2i64_4(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
  ; CHECK-LABEL: vshf_v2i64_4:

  %1 = load <2 x i64>* %a
  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
  %2 = shufflevector <2 x i64> %1, <2 x i64> %1, <2 x i32> <i32 1, i32 3>
  ; CHECK-DAG: splati.d [[R3:\$w[0-9]+]], [[R1]][1]
  store <2 x i64> %2, <2 x i64>* %c
  ; CHECK-DAG: st.d [[R3]], 0($4)

  ret void
  ; CHECK: .size vshf_v2i64_4
}
326
; Applies the same permutation (1,3,2,0) inside each group of four bytes of
; the vector loaded from %a and stores the result to %c. %b is unused.
; Expected code: shf.b with immediate 45, i.e. the four 2-bit selectors
; packed with the first element lowest: 1 + 3*4 + 2*16 + 0*64.
; The label pattern is anchored at line start because the bare name is also
; the tail of the vshf_v16i8_0 label, which appears earlier in the output.
define void @shf_v16i8_0(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
  ; CHECK-LABEL: {{^}}shf_v16i8_0:

  %1 = load <16 x i8>* %a
  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
  %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> <i32 1, i32 3, i32 2, i32 0, i32 5, i32 7, i32 6, i32 4, i32 9, i32 11, i32 10, i32 8, i32 13, i32 15, i32 14, i32 12>
  ; CHECK-DAG: shf.b [[R3:\$w[0-9]+]], [[R1]], 45
  store <16 x i8> %2, <16 x i8>* %c
  ; CHECK-DAG: st.b [[R3]], 0($4)

  ret void
  ; CHECK: .size shf_v16i8_0
}
340
; Reverses each group of four halfwords of the vector loaded from %a (mask
; 3,2,1,0,7,6,5,4) and stores the result to %c. %b is unused.
; Expected code: shf.h with immediate 27 (3 + 2*4 + 1*16 + 0*64).
; The label pattern is anchored at line start because the bare name is also
; the tail of the vshf_v8i16_0 label, which appears earlier in the output.
define void @shf_v8i16_0(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
  ; CHECK-LABEL: {{^}}shf_v8i16_0:

  %1 = load <8 x i16>* %a
  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
  %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
  ; CHECK-DAG: shf.h [[R3:\$w[0-9]+]], [[R1]], 27
  store <8 x i16> %2, <8 x i16>* %c
  ; CHECK-DAG: st.h [[R3]], 0($4)

  ret void
  ; CHECK: .size shf_v8i16_0
}
354
; Reverses the four words of the vector loaded from %a (mask 3,2,1,0) and
; stores the result to %c. %b is unused.
; Expected code: shf.w with immediate 27 (3 + 2*4 + 1*16 + 0*64).
; The label pattern is anchored at line start because the bare name is also
; the tail of the vshf_v4i32_0 label, which appears earlier in the output.
define void @shf_v4i32_0(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
  ; CHECK-LABEL: {{^}}shf_v4i32_0:

  %1 = load <4 x i32>* %a
  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ; CHECK-DAG: shf.w [[R3:\$w[0-9]+]], [[R1]], 27
  store <4 x i32> %2, <4 x i32>* %c
  ; CHECK-DAG: st.w [[R3]], 0($4)

  ret void
  ; CHECK: .size shf_v4i32_0
}
368
369; shf.d does not exist
370
; Interleaves the even-indexed bytes of the vectors loaded from %a and %b
; (mask 0,16,2,18,...,14,30) and stores the result to %c.
; Expected code: a single ilvev.b whose sources are the register loaded from
; 0($5) followed by the register loaded from 0($6).
define void @ilvev_v16i8_0(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
  ; CHECK-LABEL: ilvev_v16i8_0:

  %1 = load <16 x i8>* %a
  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
  %2 = load <16 x i8>* %b
  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
  %3 = shufflevector <16 x i8> %1, <16 x i8> %2,
                     <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
  ; CHECK-DAG: ilvev.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
  store <16 x i8> %3, <16 x i8>* %c
  ; CHECK-DAG: st.b [[R3]], 0($4)

  ret void
  ; CHECK: .size ilvev_v16i8_0
}
387
; Interleaves the even-indexed halfwords of the vectors loaded from %a and
; %b (mask 0,8,2,10,4,12,6,14) and stores the result to %c.
; Expected code: a single ilvev.h whose sources are the register loaded from
; 0($5) followed by the register loaded from 0($6).
define void @ilvev_v8i16_0(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
  ; CHECK-LABEL: ilvev_v8i16_0:

  %1 = load <8 x i16>* %a
  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
  %2 = load <8 x i16>* %b
  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
  %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
  ; CHECK-DAG: ilvev.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
  store <8 x i16> %3, <8 x i16>* %c
  ; CHECK-DAG: st.h [[R3]], 0($4)

  ret void
  ; CHECK: .size ilvev_v8i16_0
}
403
; Interleaves the even-indexed words of the vectors loaded from %a and %b
; (mask 0,4,2,6) and stores the result to %c.
; Expected code: a single ilvev.w whose sources are the register loaded from
; 0($5) followed by the register loaded from 0($6).
define void @ilvev_v4i32_0(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
  ; CHECK-LABEL: ilvev_v4i32_0:

  %1 = load <4 x i32>* %a
  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
  %2 = load <4 x i32>* %b
  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
  %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  ; CHECK-DAG: ilvev.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
  store <4 x i32> %3, <4 x i32>* %c
  ; CHECK-DAG: st.w [[R3]], 0($4)

  ret void
  ; CHECK: .size ilvev_v4i32_0
}
419
; Combines element 0 of the vector loaded from %a with element 0 of the
; vector loaded from %b (mask 0,2) and stores the result to %c.
; Expected code: a single ilvev.d whose sources are the register loaded from
; 0($5) followed by the register loaded from 0($6).
define void @ilvev_v2i64_0(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
  ; CHECK-LABEL: ilvev_v2i64_0:

  %1 = load <2 x i64>* %a
  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
  %2 = load <2 x i64>* %b
  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
  %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> <i32 0, i32 2>
  ; CHECK-DAG: ilvev.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
  store <2 x i64> %3, <2 x i64>* %c
  ; CHECK-DAG: st.d [[R3]], 0($4)

  ret void
  ; CHECK: .size ilvev_v2i64_0
}
435
; Interleaves the odd-indexed bytes of the vectors loaded from %a and %b
; (mask 1,17,3,19,...,15,31) and stores the result to %c.
; Expected code: a single ilvod.b whose sources are the register loaded from
; 0($5) followed by the register loaded from 0($6).
define void @ilvod_v16i8_0(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
  ; CHECK-LABEL: ilvod_v16i8_0:

  %1 = load <16 x i8>* %a
  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
  %2 = load <16 x i8>* %b
  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
  %3 = shufflevector <16 x i8> %1, <16 x i8> %2,
                     <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
  ; CHECK-DAG: ilvod.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
  store <16 x i8> %3, <16 x i8>* %c
  ; CHECK-DAG: st.b [[R3]], 0($4)

  ret void
  ; CHECK: .size ilvod_v16i8_0
}
452
; Interleaves the odd-indexed halfwords of the vectors loaded from %a and %b
; (mask 1,9,3,11,5,13,7,15) and stores the result to %c.
; Expected code: a single ilvod.h whose sources are the register loaded from
; 0($5) followed by the register loaded from 0($6).
define void @ilvod_v8i16_0(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
  ; CHECK-LABEL: ilvod_v8i16_0:

  %1 = load <8 x i16>* %a
  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
  %2 = load <8 x i16>* %b
  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
  %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  ; CHECK-DAG: ilvod.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
  store <8 x i16> %3, <8 x i16>* %c
  ; CHECK-DAG: st.h [[R3]], 0($4)

  ret void
  ; CHECK: .size ilvod_v8i16_0
}
468
; Interleaves the odd-indexed words of the vectors loaded from %a and %b
; (mask 1,5,3,7) and stores the result to %c.
; Expected code: a single ilvod.w whose sources are the register loaded from
; 0($5) followed by the register loaded from 0($6).
define void @ilvod_v4i32_0(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
  ; CHECK-LABEL: ilvod_v4i32_0:

  %1 = load <4 x i32>* %a
  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
  %2 = load <4 x i32>* %b
  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
  %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  ; CHECK-DAG: ilvod.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
  store <4 x i32> %3, <4 x i32>* %c
  ; CHECK-DAG: st.w [[R3]], 0($4)

  ret void
  ; CHECK: .size ilvod_v4i32_0
}
484
; Combines element 1 of the vector loaded from %a with element 1 of the
; vector loaded from %b (mask 1,3) and stores the result to %c.
; Expected code: a single ilvod.d whose sources are the register loaded from
; 0($5) followed by the register loaded from 0($6).
define void @ilvod_v2i64_0(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
  ; CHECK-LABEL: ilvod_v2i64_0:

  %1 = load <2 x i64>* %a
  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
  %2 = load <2 x i64>* %b
  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
  %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> <i32 1, i32 3>
  ; CHECK-DAG: ilvod.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
  store <2 x i64> %3, <2 x i64>* %c
  ; CHECK-DAG: st.d [[R3]], 0($4)

  ret void
  ; CHECK: .size ilvod_v2i64_0
}
500
; Interleaves bytes 0..7 of the vector loaded from %a with bytes 0..7 of the
; vector loaded from %b (mask 0,16,1,17,...,7,23) and stores the result to
; %c.
; Expected code: a single ilvl.b whose sources are the register loaded from
; 0($5) followed by the register loaded from 0($6).
define void @ilvl_v16i8_0(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
  ; CHECK-LABEL: ilvl_v16i8_0:

  %1 = load <16 x i8>* %a
  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
  %2 = load <16 x i8>* %b
  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
  %3 = shufflevector <16 x i8> %1, <16 x i8> %2,
                     <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
  ; CHECK-DAG: ilvl.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
  store <16 x i8> %3, <16 x i8>* %c
  ; CHECK-DAG: st.b [[R3]], 0($4)

  ret void
  ; CHECK: .size ilvl_v16i8_0
}
517
; Interleaves halfwords 0..3 of the vector loaded from %a with halfwords
; 0..3 of the vector loaded from %b (mask 0,8,1,9,2,10,3,11) and stores the
; result to %c.
; Expected code: a single ilvl.h whose sources are the register loaded from
; 0($5) followed by the register loaded from 0($6).
define void @ilvl_v8i16_0(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
  ; CHECK-LABEL: ilvl_v8i16_0:

  %1 = load <8 x i16>* %a
  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
  %2 = load <8 x i16>* %b
  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
  %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
  ; CHECK-DAG: ilvl.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
  store <8 x i16> %3, <8 x i16>* %c
  ; CHECK-DAG: st.h [[R3]], 0($4)

  ret void
  ; CHECK: .size ilvl_v8i16_0
}
533
; Interleaves words 0..1 of the vector loaded from %a with words 0..1 of the
; vector loaded from %b (mask 0,4,1,5) and stores the result to %c.
; Expected code: a single ilvl.w whose sources are the register loaded from
; 0($5) followed by the register loaded from 0($6).
define void @ilvl_v4i32_0(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
  ; CHECK-LABEL: ilvl_v4i32_0:

  %1 = load <4 x i32>* %a
  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
  %2 = load <4 x i32>* %b
  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
  %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  ; CHECK-DAG: ilvl.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
  store <4 x i32> %3, <4 x i32>* %c
  ; CHECK-DAG: st.w [[R3]], 0($4)

  ret void
  ; CHECK: .size ilvl_v4i32_0
}
549
; Doubleword form of the ilvl pattern: element 0 of the vector loaded from
; %a followed by element 0 of the vector loaded from %b (mask 0,2); the
; result is stored to %c.
; This is the same mask as in ilvev_v2i64_0, so the expected instruction is
; ilvev.d rather than ilvl.d.
define void @ilvl_v2i64_0(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
  ; CHECK-LABEL: ilvl_v2i64_0:

  %1 = load <2 x i64>* %a
  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
  %2 = load <2 x i64>* %b
  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
  %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> <i32 0, i32 2>
  ; ilvl.d and ilvev.d are equivalent for v2i64
  ; CHECK-DAG: ilvev.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
  store <2 x i64> %3, <2 x i64>* %c
  ; CHECK-DAG: st.d [[R3]], 0($4)

  ret void
  ; CHECK: .size ilvl_v2i64_0
}
566
; Interleaves bytes 8..15 of the vector loaded from %a with bytes 8..15 of
; the vector loaded from %b (mask 8,24,9,25,...,15,31) and stores the result
; to %c.
; Expected code: a single ilvr.b whose sources are the register loaded from
; 0($5) followed by the register loaded from 0($6).
define void @ilvr_v16i8_0(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
  ; CHECK-LABEL: ilvr_v16i8_0:

  %1 = load <16 x i8>* %a
  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
  %2 = load <16 x i8>* %b
  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
  %3 = shufflevector <16 x i8> %1, <16 x i8> %2,
                     <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
  ; CHECK-DAG: ilvr.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
  store <16 x i8> %3, <16 x i8>* %c
  ; CHECK-DAG: st.b [[R3]], 0($4)

  ret void
  ; CHECK: .size ilvr_v16i8_0
}
583
; Interleaves halfwords 4..7 of the vector loaded from %a with halfwords
; 4..7 of the vector loaded from %b (mask 4,12,5,13,6,14,7,15) and stores
; the result to %c.
; Expected code: a single ilvr.h whose sources are the register loaded from
; 0($5) followed by the register loaded from 0($6).
define void @ilvr_v8i16_0(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
  ; CHECK-LABEL: ilvr_v8i16_0:

  %1 = load <8 x i16>* %a
  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
  %2 = load <8 x i16>* %b
  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
  %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  ; CHECK-DAG: ilvr.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
  store <8 x i16> %3, <8 x i16>* %c
  ; CHECK-DAG: st.h [[R3]], 0($4)

  ret void
  ; CHECK: .size ilvr_v8i16_0
}
599
; Interleaves words 2..3 of the vector loaded from %a with words 2..3 of the
; vector loaded from %b (mask 2,6,3,7) and stores the result to %c.
; Expected code: a single ilvr.w whose sources are the register loaded from
; 0($5) followed by the register loaded from 0($6).
define void @ilvr_v4i32_0(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
  ; CHECK-LABEL: ilvr_v4i32_0:

  %1 = load <4 x i32>* %a
  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
  %2 = load <4 x i32>* %b
  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
  %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  ; CHECK-DAG: ilvr.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
  store <4 x i32> %3, <4 x i32>* %c
  ; CHECK-DAG: st.w [[R3]], 0($4)

  ret void
  ; CHECK: .size ilvr_v4i32_0
}
615
; Doubleword form of the ilvr pattern: element 1 of the vector loaded from
; %a followed by element 1 of the vector loaded from %b (mask 1,3); the
; result is stored to %c.
; This is the same mask as in ilvod_v2i64_0, so the expected instruction is
; ilvod.d rather than ilvr.d.
define void @ilvr_v2i64_0(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
  ; CHECK-LABEL: ilvr_v2i64_0:

  %1 = load <2 x i64>* %a
  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
  %2 = load <2 x i64>* %b
  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
  %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> <i32 1, i32 3>
  ; ilvr.d and ilvod.d are equivalent for v2i64
  ; CHECK-DAG: ilvod.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
  store <2 x i64> %3, <2 x i64>* %c
  ; CHECK-DAG: st.d [[R3]], 0($4)

  ret void
  ; CHECK: .size ilvr_v2i64_0
}
632
; Packs the even-indexed bytes of the vector loaded from %a followed by the
; even-indexed bytes of the vector loaded from %b (mask 0,2,...,30) and
; stores the result to %c.
; Expected code: a single pckev.b whose sources are the register loaded from
; 0($5) followed by the register loaded from 0($6).
define void @pckev_v16i8_0(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
  ; CHECK-LABEL: pckev_v16i8_0:

  %1 = load <16 x i8>* %a
  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
  %2 = load <16 x i8>* %b
  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
  %3 = shufflevector <16 x i8> %1, <16 x i8> %2,
                     <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  ; CHECK-DAG: pckev.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
  store <16 x i8> %3, <16 x i8>* %c
  ; CHECK-DAG: st.b [[R3]], 0($4)

  ret void
  ; CHECK: .size pckev_v16i8_0
}
649
; Packs the even-indexed halfwords of the vector loaded from %a followed by
; the even-indexed halfwords of the vector loaded from %b (mask 0,2,...,14)
; and stores the result to %c.
; Expected code: a single pckev.h whose sources are the register loaded from
; 0($5) followed by the register loaded from 0($6).
define void @pckev_v8i16_0(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
  ; CHECK-LABEL: pckev_v8i16_0:

  %1 = load <8 x i16>* %a
  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
  %2 = load <8 x i16>* %b
  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
  %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  ; CHECK-DAG: pckev.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
  store <8 x i16> %3, <8 x i16>* %c
  ; CHECK-DAG: st.h [[R3]], 0($4)

  ret void
  ; CHECK: .size pckev_v8i16_0
}
665
; Packs the even-indexed words of the vector loaded from %a followed by the
; even-indexed words of the vector loaded from %b (mask 0,2,4,6) and stores
; the result to %c.
; Expected code: a single pckev.w whose sources are the register loaded from
; 0($5) followed by the register loaded from 0($6).
define void @pckev_v4i32_0(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
  ; CHECK-LABEL: pckev_v4i32_0:

  %1 = load <4 x i32>* %a
  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
  %2 = load <4 x i32>* %b
  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
  %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  ; CHECK-DAG: pckev.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
  store <4 x i32> %3, <4 x i32>* %c
  ; CHECK-DAG: st.w [[R3]], 0($4)

  ret void
  ; CHECK: .size pckev_v4i32_0
}
681
; Doubleword form of the pckev pattern: element 0 of the vector loaded from
; %a followed by element 0 of the vector loaded from %b (mask 0,2); the
; result is stored to %c.
; This is the same mask as in ilvev_v2i64_0, so the expected instruction is
; ilvev.d rather than pckev.d.
define void @pckev_v2i64_0(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
  ; CHECK-LABEL: pckev_v2i64_0:

  %1 = load <2 x i64>* %a
  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
  %2 = load <2 x i64>* %b
  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
  %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> <i32 0, i32 2>
  ; pckev.d and ilvev.d are equivalent for v2i64
  ; CHECK-DAG: ilvev.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
  store <2 x i64> %3, <2 x i64>* %c
  ; CHECK-DAG: st.d [[R3]], 0($4)

  ret void
  ; CHECK: .size pckev_v2i64_0
}
698
; Packs the odd-indexed bytes of the vector loaded from %a followed by the
; odd-indexed bytes of the vector loaded from %b (mask 1,3,...,31) and
; stores the result to %c.
; Expected code: a single pckod.b whose sources are the register loaded from
; 0($5) followed by the register loaded from 0($6).
define void @pckod_v16i8_0(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
  ; CHECK-LABEL: pckod_v16i8_0:

  %1 = load <16 x i8>* %a
  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
  %2 = load <16 x i8>* %b
  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
  %3 = shufflevector <16 x i8> %1, <16 x i8> %2,
                     <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  ; CHECK-DAG: pckod.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
  store <16 x i8> %3, <16 x i8>* %c
  ; CHECK-DAG: st.b [[R3]], 0($4)

  ret void
  ; CHECK: .size pckod_v16i8_0
}
715
; Packs the odd-indexed halfwords of the vector loaded from %a followed by
; the odd-indexed halfwords of the vector loaded from %b (mask 1,3,...,15)
; and stores the result to %c.
; Expected code: a single pckod.h whose sources are the register loaded from
; 0($5) followed by the register loaded from 0($6).
define void @pckod_v8i16_0(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
  ; CHECK-LABEL: pckod_v8i16_0:

  %1 = load <8 x i16>* %a
  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
  %2 = load <8 x i16>* %b
  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
  %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  ; CHECK-DAG: pckod.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
  store <8 x i16> %3, <8 x i16>* %c
  ; CHECK-DAG: st.h [[R3]], 0($4)

  ret void
  ; CHECK: .size pckod_v8i16_0
}
731
; Packs the odd-indexed words of the vector loaded from %a followed by the
; odd-indexed words of the vector loaded from %b (mask 1,3,5,7) and stores
; the result to %c.
; Expected code: a single pckod.w whose sources are the register loaded from
; 0($5) followed by the register loaded from 0($6).
define void @pckod_v4i32_0(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
  ; CHECK-LABEL: pckod_v4i32_0:

  %1 = load <4 x i32>* %a
  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
  %2 = load <4 x i32>* %b
  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
  %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  ; CHECK-DAG: pckod.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
  store <4 x i32> %3, <4 x i32>* %c
  ; CHECK-DAG: st.w [[R3]], 0($4)

  ret void
  ; CHECK: .size pckod_v4i32_0
}
747
; Doubleword form of the pckod pattern: element 1 of the vector loaded from
; %a followed by element 1 of the vector loaded from %b (mask 1,3); the
; result is stored to %c.
; This is the same mask as in ilvod_v2i64_0, so the expected instruction is
; ilvod.d rather than pckod.d.
define void @pckod_v2i64_0(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
  ; CHECK-LABEL: pckod_v2i64_0:

  %1 = load <2 x i64>* %a
  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
  %2 = load <2 x i64>* %b
  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
  %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> <i32 1, i32 3>
  ; pckod.d and ilvod.d are equivalent for v2i64
  ; CHECK-DAG: ilvod.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
  store <2 x i64> %3, <2 x i64>* %c
  ; CHECK-DAG: st.d [[R3]], 0($4)

  ret void
  ; CHECK: .size pckod_v2i64_0
}
764
; Broadcasts byte element 4 of the vector loaded from %a into all 16 lanes
; (every mask index is 4) and stores the result to %c.
; Expected code: splati.b with index 4.
define void @splati_v16i8_0(<16 x i8>* %c, <16 x i8>* %a) nounwind {
  ; CHECK-LABEL: splati_v16i8_0:

  %1 = load <16 x i8>* %a
  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
  %2 = shufflevector <16 x i8> %1, <16 x i8> undef,
                     <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
  ; CHECK-DAG: splati.b [[R3:\$w[0-9]+]], [[R1]][4]
  store <16 x i8> %2, <16 x i8>* %c
  ; CHECK-DAG: st.b [[R3]], 0($4)

  ret void
  ; CHECK: .size splati_v16i8_0
}
779
; Broadcasts halfword element 4 of the vector loaded from %a into all 8
; lanes (every mask index is 4) and stores the result to %c.
; Expected code: splati.h with index 4.
define void @splati_v8i16_0(<8 x i16>* %c, <8 x i16>* %a) nounwind {
  ; CHECK-LABEL: splati_v8i16_0:

  %1 = load <8 x i16>* %a
  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
  %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
  ; CHECK-DAG: splati.h [[R3:\$w[0-9]+]], [[R1]][4]
  store <8 x i16> %2, <8 x i16>* %c
  ; CHECK-DAG: st.h [[R3]], 0($4)

  ret void
  ; CHECK: .size splati_v8i16_0
}
793
; Broadcasts word element 3 of the vector loaded from %a into all 4 lanes
; (every mask index is 3) and stores the result to %c.
; Expected code: shf.w with immediate 255 (selector 3 in every 2-bit field:
; 3 + 3*4 + 3*16 + 3*64) instead of a splati.w.
define void @splati_v4i32_0(<4 x i32>* %c, <4 x i32>* %a) nounwind {
  ; CHECK-LABEL: splati_v4i32_0:

  %1 = load <4 x i32>* %a
  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
  ; shf.w and splati.w are equivalent
  ; CHECK-DAG: shf.w [[R3:\$w[0-9]+]], [[R1]], 255
  store <4 x i32> %2, <4 x i32>* %c
  ; CHECK-DAG: st.w [[R3]], 0($4)

  ret void
  ; CHECK: .size splati_v4i32_0
}
808
; Broadcasts doubleword element 1 of the vector loaded from %a into both
; lanes (mask 1,1) and stores the result to %c.
; Expected code: splati.d with index 1.
define void @splati_v2i64_0(<2 x i64>* %c, <2 x i64>* %a) nounwind {
  ; CHECK-LABEL: splati_v2i64_0:

  %1 = load <2 x i64>* %a
  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
  %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
  ; CHECK-DAG: splati.d [[R3:\$w[0-9]+]], [[R1]][1]
  store <2 x i64> %2, <2 x i64>* %c
  ; CHECK-DAG: st.d [[R3]], 0($4)

  ret void
  ; CHECK: .size splati_v2i64_0
}
822