; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -o - | FileCheck %s

; Gather 4 x i16 via 32-bit scaled offsets, zero-extended to i32: expects a
; single vldrh.u32 with a uxtw #1 scaled-offset addressing form.
define arm_aapcs_vfpcc <4 x i32> @zext_scaled_i16_i32(i16* %base, <4 x i32>* %offptr) {
; CHECK-LABEL: zext_scaled_i16_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    vldrh.u32 q0, [r0, q1, uxtw #1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i32>, <4 x i32>* %offptr, align 4
  %ptrs = getelementptr inbounds i16, i16* %base, <4 x i32> %offs
  %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i16> undef)
  %gather.zext = zext <4 x i16> %gather to <4 x i32>
  ret <4 x i32> %gather.zext
}

; Same as zext_scaled_i16_i32 but written with opaque pointers; codegen must
; be identical (vldrh.u32 with uxtw #1).
define arm_aapcs_vfpcc <4 x i32> @zext_scaled_i16_i32_opaque(ptr %base, ptr %offptr) {
; CHECK-LABEL: zext_scaled_i16_i32_opaque:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    vldrh.u32 q0, [r0, q1, uxtw #1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i32>, ptr %offptr, align 4
  %ptrs = getelementptr inbounds i16, ptr %base, <4 x i32> %offs
  %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i16> undef)
  %gather.zext = zext <4 x i16> %gather to <4 x i32>
  ret <4 x i32> %gather.zext
}

; Gather 4 x i16 via 32-bit scaled offsets, sign-extended to i32: expects
; vldrh.s32 with uxtw #1.
define arm_aapcs_vfpcc <4 x i32> @sext_scaled_i16_i32(i16* %base, <4 x i32>* %offptr) {
; CHECK-LABEL: sext_scaled_i16_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    vldrh.s32 q0, [r0, q1, uxtw #1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i32>, <4 x i32>* %offptr, align 4
  %ptrs = getelementptr inbounds i16, i16* %base, <4 x i32> %offs
  %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i16> undef)
  %gather.sext = sext <4 x i16> %gather to <4 x i32>
  ret <4 x i32> %gather.sext
}

; Gather 4 x i32 via 32-bit scaled offsets: expects vldrw.u32 with uxtw #2.
define arm_aapcs_vfpcc <4 x i32> @scaled_i32_i32(i32* %base, <4 x i32>* %offptr) {
; CHECK-LABEL: scaled_i32_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    vldrw.u32 q0, [r0, q1, uxtw #2]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i32>, <4 x i32>* %offptr, align 4
  %ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs
  %gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  ret <4 x i32> %gather
}

; TODO: scaled_f16_i32

; Gather 4 x float via 32-bit scaled offsets (pointers bitcast from i32*):
; expects vldrw.u32 with uxtw #2.
define arm_aapcs_vfpcc <4 x float> @scaled_f32_i32(i32* %base, <4 x i32>* %offptr) {
; CHECK-LABEL: scaled_f32_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    vldrw.u32 q0, [r0, q1, uxtw #2]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i32>, <4 x i32>* %offptr, align 4
  %i32_ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs
  %ptrs = bitcast <4 x i32*> %i32_ptrs to <4 x float*>
  %gather = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
  ret <4 x float> %gather
}

; Opaque-pointer variant of scaled_f32_i32 (the ptr-to-ptr bitcast is a
; no-op); codegen must match the typed-pointer version.
define arm_aapcs_vfpcc <4 x float> @scaled_f32_i32_opaque(ptr %base, ptr %offptr) {
; CHECK-LABEL: scaled_f32_i32_opaque:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    vldrw.u32 q0, [r0, q1, uxtw #2]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i32>, ptr %offptr, align 4
  %i32_ptrs = getelementptr inbounds i32, ptr %base, <4 x i32> %offs
  %ptrs = bitcast <4 x ptr> %i32_ptrs to <4 x ptr>
  %gather = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
  ret <4 x float> %gather
}

; i32 gather with zero-extended i16 offsets: the offset load folds to
; vldrh.u32 and the gather stays a scaled vldrw.u32.
define arm_aapcs_vfpcc <4 x i32> @unsigned_scaled_b_i32_i16(i32* %base, <4 x i16>* %offptr) {
; CHECK-LABEL: unsigned_scaled_b_i32_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u32 q1, [r1]
; CHECK-NEXT:    vldrw.u32 q0, [r0, q1, uxtw #2]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
  %offs.zext = zext <4 x i16> %offs to <4 x i32>
  %ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs.zext
  %gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  ret <4 x i32> %gather
}

; i32 gather with sign-extended i16 offsets: offset load becomes vldrh.s32.
define arm_aapcs_vfpcc <4 x i32> @signed_scaled_i32_i16(i32* %base, <4 x i16>* %offptr) {
; CHECK-LABEL: signed_scaled_i32_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.s32 q1, [r1]
; CHECK-NEXT:    vldrw.u32 q0, [r0, q1, uxtw #2]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
  %offs.sext = sext <4 x i16> %offs to <4 x i32>
  %ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs.sext
  %gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  ret <4 x i32> %gather
}

; Opaque-pointer variant of unsigned_scaled_b_i32_i16; identical codegen.
define arm_aapcs_vfpcc <4 x i32> @unsigned_scaled_b_i32_i16_opaque(ptr %base, ptr %offptr) {
; CHECK-LABEL: unsigned_scaled_b_i32_i16_opaque:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u32 q1, [r1]
; CHECK-NEXT:    vldrw.u32 q0, [r0, q1, uxtw #2]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i16>, ptr %offptr, align 2
  %offs.zext = zext <4 x i16> %offs to <4 x i32>
  %ptrs = getelementptr inbounds i32, ptr %base, <4 x i32> %offs.zext
  %gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  ret <4 x i32> %gather
}

; Opaque-pointer variant of signed_scaled_i32_i16; identical codegen.
define arm_aapcs_vfpcc <4 x i32> @signed_scaled_i32_i16_opaque(ptr %base, ptr %offptr) {
; CHECK-LABEL: signed_scaled_i32_i16_opaque:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.s32 q1, [r1]
; CHECK-NEXT:    vldrw.u32 q0, [r0, q1, uxtw #2]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i16>, ptr %offptr, align 2
  %offs.sext = sext <4 x i16> %offs to <4 x i32>
  %ptrs = getelementptr inbounds i32, ptr %base, <4 x i32> %offs.sext
  %gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  ret <4 x i32> %gather
}

; float gather with zero-extended i16 offsets (pointers bitcast from i32*).
define arm_aapcs_vfpcc <4 x float> @a_unsigned_scaled_f32_i16(i32* %base, <4 x i16>* %offptr) {
; CHECK-LABEL: a_unsigned_scaled_f32_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u32 q1, [r1]
; CHECK-NEXT:    vldrw.u32 q0, [r0, q1, uxtw #2]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
  %offs.zext = zext <4 x i16> %offs to <4 x i32>
  %i32_ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs.zext
  %ptrs = bitcast <4 x i32*> %i32_ptrs to <4 x float*>
  %gather = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
  ret <4 x float> %gather
}

; float gather with sign-extended i16 offsets (pointers bitcast from i32*).
define arm_aapcs_vfpcc <4 x float> @b_signed_scaled_f32_i16(i32* %base, <4 x i16>* %offptr) {
; CHECK-LABEL: b_signed_scaled_f32_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.s32 q1, [r1]
; CHECK-NEXT:    vldrw.u32 q0, [r0, q1, uxtw #2]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
  %offs.sext = sext <4 x i16> %offs to <4 x i32>
  %i32_ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs.sext
  %ptrs = bitcast <4 x i32*> %i32_ptrs to <4 x float*>
  %gather = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
  ret <4 x float> %gather
}

; i16 gather, zext result, sign-extended i16 offsets: vldrh.s32 offsets then
; zero-extending scaled vldrh.u32 gather.
define arm_aapcs_vfpcc <4 x i32> @zext_signed_scaled_i16_i16(i16* %base, <4 x i16>* %offptr) {
; CHECK-LABEL: zext_signed_scaled_i16_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.s32 q1, [r1]
; CHECK-NEXT:    vldrh.u32 q0, [r0, q1, uxtw #1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
  %offs.sext = sext <4 x i16> %offs to <4 x i32>
  %ptrs = getelementptr inbounds i16, i16* %base, <4 x i32> %offs.sext
  %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i16> undef)
  %gather.zext = zext <4 x i16> %gather to <4 x i32>
  ret <4 x i32> %gather.zext
}

; i16 gather, sext result, sign-extended i16 offsets.
define arm_aapcs_vfpcc <4 x i32> @sext_signed_scaled_i16_i16(i16* %base, <4 x i16>* %offptr) {
; CHECK-LABEL: sext_signed_scaled_i16_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.s32 q1, [r1]
; CHECK-NEXT:    vldrh.s32 q0, [r0, q1, uxtw #1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
  %offs.sext = sext <4 x i16> %offs to <4 x i32>
  %ptrs = getelementptr inbounds i16, i16* %base, <4 x i32> %offs.sext
  %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i16> undef)
  %gather.sext = sext <4 x i16> %gather to <4 x i32>
  ret <4 x i32> %gather.sext
}

; i16 gather, zext result, zero-extended i16 offsets.
define arm_aapcs_vfpcc <4 x i32> @zext_unsigned_scaled_i16_i16(i16* %base, <4 x i16>* %offptr) {
; CHECK-LABEL: zext_unsigned_scaled_i16_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u32 q1, [r1]
; CHECK-NEXT:    vldrh.u32 q0, [r0, q1, uxtw #1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
  %offs.zext = zext <4 x i16> %offs to <4 x i32>
  %ptrs = getelementptr inbounds i16, i16* %base, <4 x i32> %offs.zext
  %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i16> undef)
  %gather.zext = zext <4 x i16> %gather to <4 x i32>
  ret <4 x i32> %gather.zext
}

; i16 gather, sext result, zero-extended i16 offsets.
define arm_aapcs_vfpcc <4 x i32> @sext_unsigned_scaled_i16_i16(i16* %base, <4 x i16>* %offptr) {
; CHECK-LABEL: sext_unsigned_scaled_i16_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u32 q1, [r1]
; CHECK-NEXT:    vldrh.s32 q0, [r0, q1, uxtw #1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
  %offs.zext = zext <4 x i16> %offs to <4 x i32>
  %ptrs = getelementptr inbounds i16, i16* %base, <4 x i32> %offs.zext
  %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i16> undef)
  %gather.sext = sext <4 x i16> %gather to <4 x i32>
  ret <4 x i32> %gather.sext
}

; i32 gather with zero-extended i8 offsets: offset load becomes vldrb.u32.
define arm_aapcs_vfpcc <4 x i32> @unsigned_scaled_b_i32_i8(i32* %base, <4 x i8>* %offptr) {
; CHECK-LABEL: unsigned_scaled_b_i32_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u32 q1, [r1]
; CHECK-NEXT:    vldrw.u32 q0, [r0, q1, uxtw #2]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
  %offs.zext = zext <4 x i8> %offs to <4 x i32>
  %ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs.zext
  %gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  ret <4 x i32> %gather
}

; i32 gather with sign-extended i8 offsets: offset load becomes vldrb.s32.
define arm_aapcs_vfpcc <4 x i32> @signed_scaled_i32_i8(i32* %base, <4 x i8>* %offptr) {
; CHECK-LABEL: signed_scaled_i32_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.s32 q1, [r1]
; CHECK-NEXT:    vldrw.u32 q0, [r0, q1, uxtw #2]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
  %offs.sext = sext <4 x i8> %offs to <4 x i32>
  %ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs.sext
  %gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  ret <4 x i32> %gather
}

; float gather with zero-extended i8 offsets (pointers bitcast from i32*).
define arm_aapcs_vfpcc <4 x float> @a_unsigned_scaled_f32_i8(i32* %base, <4 x i8>* %offptr) {
; CHECK-LABEL: a_unsigned_scaled_f32_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u32 q1, [r1]
; CHECK-NEXT:    vldrw.u32 q0, [r0, q1, uxtw #2]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
  %offs.zext = zext <4 x i8> %offs to <4 x i32>
  %i32_ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs.zext
  %ptrs = bitcast <4 x i32*> %i32_ptrs to <4 x float*>
  %gather = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
  ret <4 x float> %gather
}

; float gather with sign-extended i8 offsets (pointers bitcast from i32*).
define arm_aapcs_vfpcc <4 x float> @b_signed_scaled_f32_i8(i32* %base, <4 x i8>* %offptr) {
; CHECK-LABEL: b_signed_scaled_f32_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.s32 q1, [r1]
; CHECK-NEXT:    vldrw.u32 q0, [r0, q1, uxtw #2]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
  %offs.sext = sext <4 x i8> %offs to <4 x i32>
  %i32_ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs.sext
  %ptrs = bitcast <4 x i32*> %i32_ptrs to <4 x float*>
  %gather = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
  ret <4 x float> %gather
}

; i16 gather, zext result, sign-extended i8 offsets.
define arm_aapcs_vfpcc <4 x i32> @zext_signed_scaled_i16_i8(i16* %base, <4 x i8>* %offptr) {
; CHECK-LABEL: zext_signed_scaled_i16_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.s32 q1, [r1]
; CHECK-NEXT:    vldrh.u32 q0, [r0, q1, uxtw #1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
  %offs.sext = sext <4 x i8> %offs to <4 x i32>
  %ptrs = getelementptr inbounds i16, i16* %base, <4 x i32> %offs.sext
  %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i16> undef)
  %gather.zext = zext <4 x i16> %gather to <4 x i32>
  ret <4 x i32> %gather.zext
}

; i16 gather, sext result, sign-extended i8 offsets.
define arm_aapcs_vfpcc <4 x i32> @sext_signed_scaled_i16_i8(i16* %base, <4 x i8>* %offptr) {
; CHECK-LABEL: sext_signed_scaled_i16_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.s32 q1, [r1]
; CHECK-NEXT:    vldrh.s32 q0, [r0, q1, uxtw #1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
  %offs.sext = sext <4 x i8> %offs to <4 x i32>
  %ptrs = getelementptr inbounds i16, i16* %base, <4 x i32> %offs.sext
  %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i16> undef)
  %gather.sext = sext <4 x i16> %gather to <4 x i32>
  ret <4 x i32> %gather.sext
}

; i16 gather, zext result, zero-extended i8 offsets.
define arm_aapcs_vfpcc <4 x i32> @zext_unsigned_scaled_i16_i8(i16* %base, <4 x i8>* %offptr) {
; CHECK-LABEL: zext_unsigned_scaled_i16_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u32 q1, [r1]
; CHECK-NEXT:    vldrh.u32 q0, [r0, q1, uxtw #1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
  %offs.zext = zext <4 x i8> %offs to <4 x i32>
  %ptrs = getelementptr inbounds i16, i16* %base, <4 x i32> %offs.zext
  %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i16> undef)
  %gather.zext = zext <4 x i16> %gather to <4 x i32>
  ret <4 x i32> %gather.zext
}

; i16 gather, sext result, zero-extended i8 offsets.
define arm_aapcs_vfpcc <4 x i32> @sext_unsigned_scaled_i16_i8(i16* %base, <4 x i8>* %offptr) {
; CHECK-LABEL: sext_unsigned_scaled_i16_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u32 q1, [r1]
; CHECK-NEXT:    vldrh.s32 q0, [r0, q1, uxtw #1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
  %offs.zext = zext <4 x i8> %offs to <4 x i32>
  %ptrs = getelementptr inbounds i16, i16* %base, <4 x i32> %offs.zext
  %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i16> undef)
  %gather.sext = sext <4 x i16> %gather to <4 x i32>
  ret <4 x i32> %gather.sext
}

; Two chained GEPs with a variable first offset: the constant +5 (0x14 bytes)
; is materialized and added, so the gather falls back to the [q1] form.
define arm_aapcs_vfpcc <4 x i32> @scaled_i32_i32_2gep(i32* %base, <4 x i32>* %offptr) {
; CHECK-LABEL: scaled_i32_i32_2gep:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    vmov.i32 q0, #0x14
; CHECK-NEXT:    vshl.i32 q1, q1, #2
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vadd.i32 q1, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i32>, <4 x i32>* %offptr, align 4
  %ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs
  %ptrs2 = getelementptr inbounds i32, <4 x i32*> %ptrs, i32 5
  %gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs2, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  ret <4 x i32> %gather
}

; Two chained GEPs with all-constant offsets: folded to the constant-pool
; vector <5,8,11,14> and a single scaled gather.
define arm_aapcs_vfpcc <4 x i32> @scaled_i32_i32_2gep2(i32* %base) {
; CHECK-LABEL: scaled_i32_i32_2gep2:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    adr r1, .LCPI25_0
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    vldrw.u32 q0, [r0, q1, uxtw #2]
; CHECK-NEXT:    bx lr
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  @ %bb.1:
; CHECK-NEXT:  .LCPI25_0:
; CHECK-NEXT:    .long 5 @ 0x5
; CHECK-NEXT:    .long 8 @ 0x8
; CHECK-NEXT:    .long 11 @ 0xb
; CHECK-NEXT:    .long 14 @ 0xe
entry:
  %ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
  %ptrs2 = getelementptr inbounds i32, <4 x i32*> %ptrs, i32 5
  %gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs2, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  ret <4 x i32> %gather
}

; Opaque-pointer variant of scaled_i32_i32_2gep; identical codegen.
define arm_aapcs_vfpcc <4 x i32> @scaled_i32_i32_2gep_opaque(ptr %base, ptr %offptr) {
; CHECK-LABEL: scaled_i32_i32_2gep_opaque:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    vmov.i32 q0, #0x14
; CHECK-NEXT:    vshl.i32 q1, q1, #2
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vadd.i32 q1, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i32>, ptr %offptr, align 4
  %ptrs = getelementptr inbounds i32, ptr %base, <4 x i32> %offs
  %ptrs2 = getelementptr inbounds i32, <4 x ptr> %ptrs, i32 5
  %gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs2, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  ret <4 x i32> %gather
}

; Opaque-pointer variant of scaled_i32_i32_2gep2; identical codegen apart
; from the constant-pool label number.
define arm_aapcs_vfpcc <4 x i32> @scaled_i32_i32_2gep2_opaque(ptr %base) {
; CHECK-LABEL: scaled_i32_i32_2gep2_opaque:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    adr r1, .LCPI27_0
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    vldrw.u32 q0, [r0, q1, uxtw #2]
; CHECK-NEXT:    bx lr
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  @ %bb.1:
; CHECK-NEXT:  .LCPI27_0:
; CHECK-NEXT:    .long 5 @ 0x5
; CHECK-NEXT:    .long 8 @ 0x8
; CHECK-NEXT:    .long 11 @ 0xb
; CHECK-NEXT:    .long 14 @ 0xe
entry:
  %ptrs = getelementptr inbounds i32, ptr %base, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
  %ptrs2 = getelementptr inbounds i32, <4 x ptr> %ptrs, i32 5
  %gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs2, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  ret <4 x i32> %gather
}

declare <4 x i8>  @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*>, i32, <4 x i1>, <4 x i8>)
declare <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*>, i32, <4 x i1>, <4 x i16>)
declare <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i16>)
declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>)
declare <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i32>)
declare <4 x half> @llvm.masked.gather.v4f16.v4p0f16(<4 x half*>, i32, <4 x i1>, <4 x half>)
declare <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*>, i32, <4 x i1>, <4 x float>)
declare <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x float>)
