; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -enable-arm-maskedldst %s -o - | FileCheck %s
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -enable-arm-maskedldst -opaque-pointers %s -o - | FileCheck %s
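
; Tests of lowering for llvm.masked.gather where the addresses come from a
; vector of pointers on an MVE target. Only 128-bit gathers of 32-bit
; elements map onto the vldrw gather instruction directly; everything else
; is expected to be scalarised or assembled lane by lane.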

; i32
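; Only the <4 x i32> case maps onto a single vldrw.u32 from a vector of
; addresses. The v2, v8 and v16 cases are scalarised into ldr's and the
; results reassembled with vmov's.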

define arm_aapcs_vfpcc <2 x i32> @ptr_v2i32(<2 x i32*>* %offptr) {
; CHECK-LABEL: ptr_v2i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldrd r1, r0, [r0]
; CHECK-NEXT:    ldr r0, [r0]
; CHECK-NEXT:    ldr r1, [r1]
; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
; CHECK-NEXT:    bx lr
entry:
  %offs = load <2 x i32*>, <2 x i32*>* %offptr, align 4
  %gather = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %offs, i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
  ret <2 x i32> %gather
}

define arm_aapcs_vfpcc <4 x i32> @ptr_v4i32(<4 x i32*>* %offptr) {
; CHECK-LABEL: ptr_v4i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vldrw.u32 q0, [q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i32*>, <4 x i32*>* %offptr, align 4
  %gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %offs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  ret <4 x i32> %gather
}

define arm_aapcs_vfpcc <8 x i32> @ptr_v8i32(<8 x i32*>* %offptr) {
; CHECK-LABEL: ptr_v8i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vmov r1, r2, d1
; CHECK-NEXT:    vmov r3, r12, d0
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vmov r0, lr, d1
; CHECK-NEXT:    ldr r7, [r2]
; CHECK-NEXT:    vmov r2, r4, d0
; CHECK-NEXT:    ldr r6, [r1]
; CHECK-NEXT:    ldr r3, [r3]
; CHECK-NEXT:    ldr r0, [r0]
; CHECK-NEXT:    ldr.w r1, [r12]
; CHECK-NEXT:    vmov q1[2], q1[0], r3, r6
; CHECK-NEXT:    ldr.w r5, [lr]
; CHECK-NEXT:    vmov q1[3], q1[1], r1, r7
; CHECK-NEXT:    ldr r2, [r2]
; CHECK-NEXT:    ldr r4, [r4]
; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
; CHECK-NEXT:    vmov q0[3], q0[1], r4, r5
; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
entry:
  %offs = load <8 x i32*>, <8 x i32*>* %offptr, align 4
  %gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %offs, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
  ret <8 x i32> %gather
}

define arm_aapcs_vfpcc <16 x i32> @ptr_v16i32(<16 x i32*>* %offptr) {
; CHECK-LABEL: ptr_v16i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
; CHECK-NEXT:    vmov r1, r2, d1
; CHECK-NEXT:    vmov r3, lr, d0
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vmov r4, r5, d1
; CHECK-NEXT:    ldr r7, [r2]
; CHECK-NEXT:    vmov r2, r6, d0
; CHECK-NEXT:    ldr.w r12, [r1]
; CHECK-NEXT:    ldr r3, [r3]
; CHECK-NEXT:    ldr r4, [r4]
; CHECK-NEXT:    ldr r5, [r5]
; CHECK-NEXT:    vmov q3[2], q3[0], r3, r12
; CHECK-NEXT:    ldr.w r1, [lr]
; CHECK-NEXT:    vmov q3[3], q3[1], r1, r7
; CHECK-NEXT:    ldr r2, [r2]
; CHECK-NEXT:    ldr r6, [r6]
; CHECK-NEXT:    vmov q0[2], q0[0], r2, r4
; CHECK-NEXT:    vmov r2, r4, d3
; CHECK-NEXT:    vmov q0[3], q0[1], r6, r5
; CHECK-NEXT:    vmov r6, r5, d2
; CHECK-NEXT:    ldr r2, [r2]
; CHECK-NEXT:    ldr r6, [r6]
; CHECK-NEXT:    ldr r5, [r5]
; CHECK-NEXT:    vmov q1[2], q1[0], r6, r2
; CHECK-NEXT:    ldr r6, [r4]
; CHECK-NEXT:    vmov r0, r2, d5
; CHECK-NEXT:    vmov q1[3], q1[1], r5, r6
; CHECK-NEXT:    vmov r6, r5, d4
; CHECK-NEXT:    ldr r0, [r0]
; CHECK-NEXT:    ldr r6, [r6]
; CHECK-NEXT:    ldr r2, [r2]
; CHECK-NEXT:    ldr r5, [r5]
; CHECK-NEXT:    vmov q2[2], q2[0], r6, r0
; CHECK-NEXT:    vmov q2[3], q2[1], r5, r2
; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
entry:
  %offs = load <16 x i32*>, <16 x i32*>* %offptr, align 4
  %gather = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %offs, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
  ret <16 x i32> %gather
}

; f32
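; As for i32, the <4 x float> gather uses a single vldrw.u32 from a vector
; of addresses, while the v2 and v8 cases fall back to scalar vldr's.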

define arm_aapcs_vfpcc <2 x float> @ptr_v2f32(<2 x float*>* %offptr) {
; CHECK-LABEL: ptr_v2f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldrd r1, r0, [r0]
; CHECK-NEXT:    vldr s1, [r0]
; CHECK-NEXT:    vldr s0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <2 x float*>, <2 x float*>* %offptr, align 4
  %gather = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> %offs, i32 4, <2 x i1> <i1 true, i1 true>, <2 x float> undef)
  ret <2 x float> %gather
}

define arm_aapcs_vfpcc <4 x float> @ptr_v4f32(<4 x float*>* %offptr) {
; CHECK-LABEL: ptr_v4f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vldrw.u32 q0, [q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x float*>, <4 x float*>* %offptr, align 4
  %gather = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %offs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
  ret <4 x float> %gather
}

define arm_aapcs_vfpcc <8 x float> @ptr_v8f32(<8 x float*>* %offptr) {
; CHECK-LABEL: ptr_v8f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r7, lr}
; CHECK-NEXT:    push {r4, r5, r7, lr}
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vmov r12, r2, d1
; CHECK-NEXT:    vmov lr, r1, d0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vmov r0, r3, d1
; CHECK-NEXT:    vmov r4, r5, d0
; CHECK-NEXT:    vldr s3, [r2]
; CHECK-NEXT:    vldr s2, [r12]
; CHECK-NEXT:    vldr s1, [r1]
; CHECK-NEXT:    vldr s0, [lr]
; CHECK-NEXT:    vldr s7, [r3]
; CHECK-NEXT:    vldr s6, [r0]
; CHECK-NEXT:    vldr s5, [r5]
; CHECK-NEXT:    vldr s4, [r4]
; CHECK-NEXT:    pop {r4, r5, r7, pc}
entry:
  %offs = load <8 x float*>, <8 x float*>* %offptr, align 4
  %gather = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> %offs, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef)
  ret <8 x float> %gather
}

; i16
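; i16 gathers from a vector of pointers have no single instruction. v4i16
; cases become a vldrh with a zero base and the pointer vector as offsets;
; v8i16 cases are assembled lane by lane, going via the stack when the
; result is extended to i32.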

define arm_aapcs_vfpcc <8 x i16> @ptr_i16(<8 x i16*>* %offptr) {
; CHECK-LABEL: ptr_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, lr}
; CHECK-NEXT:    push {r4, r5, r6, lr}
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vmov r1, r2, d0
; CHECK-NEXT:    vmov r3, r12, d1
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vmov r4, r5, d0
; CHECK-NEXT:    vmov r0, lr, d1
; CHECK-NEXT:    ldrh r1, [r1]
; CHECK-NEXT:    ldrh r6, [r3]
; CHECK-NEXT:    ldrh r2, [r2]
; CHECK-NEXT:    ldrh r4, [r4]
; CHECK-NEXT:    ldrh r5, [r5]
; CHECK-NEXT:    vmov.16 q0[0], r4
; CHECK-NEXT:    ldrh r0, [r0]
; CHECK-NEXT:    vmov.16 q0[1], r5
; CHECK-NEXT:    ldrh.w r3, [lr]
; CHECK-NEXT:    vmov.16 q0[2], r0
; CHECK-NEXT:    ldrh.w r12, [r12]
; CHECK-NEXT:    vmov.16 q0[3], r3
; CHECK-NEXT:    vmov.16 q0[4], r1
; CHECK-NEXT:    vmov.16 q0[5], r2
; CHECK-NEXT:    vmov.16 q0[6], r6
; CHECK-NEXT:    vmov.16 q0[7], r12
; CHECK-NEXT:    pop {r4, r5, r6, pc}
entry:
  %offs = load <8 x i16*>, <8 x i16*>* %offptr, align 4
  %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %offs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef)
  ret <8 x i16> %gather
}

define arm_aapcs_vfpcc <2 x i32> @ptr_v2i16_sext(<2 x i16*>* %offptr) {
; CHECK-LABEL: ptr_v2i16_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldrd r1, r0, [r0]
; CHECK-NEXT:    ldrsh.w r0, [r0]
; CHECK-NEXT:    ldrsh.w r1, [r1]
; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
; CHECK-NEXT:    asrs r0, r0, #31
; CHECK-NEXT:    asrs r1, r1, #31
; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
; CHECK-NEXT:    bx lr
entry:
  %offs = load <2 x i16*>, <2 x i16*>* %offptr, align 4
  %gather = call <2 x i16> @llvm.masked.gather.v2i16.v2p0i16(<2 x i16*> %offs, i32 2, <2 x i1> <i1 true, i1 true>, <2 x i16> undef)
  %ext = sext <2 x i16> %gather to <2 x i32>
  ret <2 x i32> %ext
}

define arm_aapcs_vfpcc <2 x i32> @ptr_v2i16_zext(<2 x i16*>* %offptr) {
; CHECK-LABEL: ptr_v2i16_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldrd r1, r0, [r0]
; CHECK-NEXT:    vmov.i64 q0, #0xffff
; CHECK-NEXT:    ldrh r0, [r0]
; CHECK-NEXT:    ldrh r1, [r1]
; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
; CHECK-NEXT:    vand q0, q1, q0
; CHECK-NEXT:    bx lr
entry:
  %offs = load <2 x i16*>, <2 x i16*>* %offptr, align 4
  %gather = call <2 x i16> @llvm.masked.gather.v2i16.v2p0i16(<2 x i16*> %offs, i32 2, <2 x i1> <i1 true, i1 true>, <2 x i16> undef)
  %ext = zext <2 x i16> %gather to <2 x i32>
  ret <2 x i32> %ext
}

define arm_aapcs_vfpcc <4 x i32> @ptr_v4i16_sext(<4 x i16*>* %offptr) {
; CHECK-LABEL: ptr_v4i16_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    movs r1, #0
; CHECK-NEXT:    vldrh.s32 q0, [r1, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i16*>, <4 x i16*>* %offptr, align 4
  %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %offs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i16> undef)
  %ext = sext <4 x i16> %gather to <4 x i32>
  ret <4 x i32> %ext
}

define arm_aapcs_vfpcc <4 x i32> @ptr_v4i16_zext(<4 x i16*>* %offptr) {
; CHECK-LABEL: ptr_v4i16_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    movs r1, #0
; CHECK-NEXT:    vldrh.u32 q0, [r1, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i16*>, <4 x i16*>* %offptr, align 4
  %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %offs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i16> undef)
  %ext = zext <4 x i16> %gather to <4 x i32>
  ret <4 x i32> %ext
}

define arm_aapcs_vfpcc <4 x i16> @ptr_v4i16(<4 x i16*>* %offptr) {
; CHECK-LABEL: ptr_v4i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    movs r1, #0
; CHECK-NEXT:    vldrh.u32 q0, [r1, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i16*>, <4 x i16*>* %offptr, align 4
  %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %offs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i16> undef)
  ret <4 x i16> %gather
}

define arm_aapcs_vfpcc <8 x i32> @ptr_v8i16_sext(<8 x i16*>* %offptr) {
; CHECK-LABEL: ptr_v8i16_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
; CHECK-NEXT:    .pad #16
; CHECK-NEXT:    sub sp, #16
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vmov r3, r1, d1
; CHECK-NEXT:    vmov r12, r2, d0
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vmov lr, r0, d1
; CHECK-NEXT:    ldrh r7, [r1]
; CHECK-NEXT:    ldrh.w r1, [r12]
; CHECK-NEXT:    ldrh r2, [r2]
; CHECK-NEXT:    ldrh r4, [r0]
; CHECK-NEXT:    vmov r0, r5, d0
; CHECK-NEXT:    ldrh.w r6, [lr]
; CHECK-NEXT:    ldrh r3, [r3]
; CHECK-NEXT:    ldrh r0, [r0]
; CHECK-NEXT:    ldrh r5, [r5]
; CHECK-NEXT:    vmov.16 q0[0], r0
; CHECK-NEXT:    mov r0, sp
; CHECK-NEXT:    vmov.16 q0[1], r5
; CHECK-NEXT:    vmov.16 q0[2], r6
; CHECK-NEXT:    vmov.16 q0[3], r4
; CHECK-NEXT:    vmov.16 q0[4], r1
; CHECK-NEXT:    vmov.16 q0[5], r2
; CHECK-NEXT:    vmov.16 q0[6], r3
; CHECK-NEXT:    vmov.16 q0[7], r7
; CHECK-NEXT:    vstrw.32 q0, [r0]
; CHECK-NEXT:    vldrh.s32 q0, [r0]
; CHECK-NEXT:    vldrh.s32 q1, [r0, #8]
; CHECK-NEXT:    add sp, #16
; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
entry:
  %offs = load <8 x i16*>, <8 x i16*>* %offptr, align 4
  %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %offs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef)
  %ext = sext <8 x i16> %gather to <8 x i32>
  ret <8 x i32> %ext
}

define arm_aapcs_vfpcc <8 x i32> @ptr_v8i16_zext(<8 x i16*>* %offptr) {
; CHECK-LABEL: ptr_v8i16_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
; CHECK-NEXT:    .pad #16
; CHECK-NEXT:    sub sp, #16
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vmov r3, r1, d1
; CHECK-NEXT:    vmov r12, r2, d0
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vmov lr, r0, d1
; CHECK-NEXT:    ldrh r7, [r1]
; CHECK-NEXT:    ldrh.w r1, [r12]
; CHECK-NEXT:    ldrh r2, [r2]
; CHECK-NEXT:    ldrh r4, [r0]
; CHECK-NEXT:    vmov r0, r5, d0
; CHECK-NEXT:    ldrh.w r6, [lr]
; CHECK-NEXT:    ldrh r3, [r3]
; CHECK-NEXT:    ldrh r0, [r0]
; CHECK-NEXT:    ldrh r5, [r5]
; CHECK-NEXT:    vmov.16 q0[0], r0
; CHECK-NEXT:    mov r0, sp
; CHECK-NEXT:    vmov.16 q0[1], r5
; CHECK-NEXT:    vmov.16 q0[2], r6
; CHECK-NEXT:    vmov.16 q0[3], r4
; CHECK-NEXT:    vmov.16 q0[4], r1
; CHECK-NEXT:    vmov.16 q0[5], r2
; CHECK-NEXT:    vmov.16 q0[6], r3
; CHECK-NEXT:    vmov.16 q0[7], r7
; CHECK-NEXT:    vstrw.32 q0, [r0]
; CHECK-NEXT:    vldrh.u32 q0, [r0]
; CHECK-NEXT:    vldrh.u32 q1, [r0, #8]
; CHECK-NEXT:    add sp, #16
; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
entry:
  %offs = load <8 x i16*>, <8 x i16*>* %offptr, align 4
  %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %offs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef)
  %ext = zext <8 x i16> %gather to <8 x i32>
  ret <8 x i32> %ext
}

; f16
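; f16 gathers are built from scalar vldr.16's, with vins.f16 packing pairs
; of halves into 32-bit lanes.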

define arm_aapcs_vfpcc <8 x half> @ptr_f16(<8 x half*>* %offptr) {
; CHECK-LABEL: ptr_f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vmov r1, r2, d0
; CHECK-NEXT:    vldr.16 s4, [r2]
; CHECK-NEXT:    vldr.16 s0, [r1]
; CHECK-NEXT:    vmov r1, r2, d1
; CHECK-NEXT:    vins.f16 s0, s4
; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
; CHECK-NEXT:    vldr.16 s1, [r1]
; CHECK-NEXT:    vldr.16 s2, [r2]
; CHECK-NEXT:    vmov r0, r1, d2
; CHECK-NEXT:    vins.f16 s1, s2
; CHECK-NEXT:    vldr.16 s4, [r1]
; CHECK-NEXT:    vldr.16 s2, [r0]
; CHECK-NEXT:    vmov r0, r1, d3
; CHECK-NEXT:    vldr.16 s3, [r0]
; CHECK-NEXT:    vins.f16 s2, s4
; CHECK-NEXT:    vldr.16 s4, [r1]
; CHECK-NEXT:    vins.f16 s3, s4
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x half*>, <8 x half*>* %offptr, align 4
  %gather = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %offs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x half> undef)
  ret <8 x half> %gather
}

define arm_aapcs_vfpcc <4 x half> @ptr_v4f16(<4 x half*>* %offptr) {
; CHECK-LABEL: ptr_v4f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vmov r0, r1, d0
; CHECK-NEXT:    vldr.16 s4, [r1]
; CHECK-NEXT:    vldr.16 s0, [r0]
; CHECK-NEXT:    vmov r0, r1, d1
; CHECK-NEXT:    vldr.16 s2, [r1]
; CHECK-NEXT:    vldr.16 s1, [r0]
; CHECK-NEXT:    vins.f16 s0, s4
; CHECK-NEXT:    vins.f16 s1, s2
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x half*>, <4 x half*>* %offptr, align 4
  %gather = call <4 x half> @llvm.masked.gather.v4f16.v4p0f16(<4 x half*> %offs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x half> undef)
  ret <4 x half> %gather
}

; i8
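; Similarly for i8: v4i8 cases become a vldrb with a zero base and the
; pointer vector as offsets, while v8 and v16 cases are gathered lane by
; lane with scalar ldrb's.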

define arm_aapcs_vfpcc <16 x i8> @ptr_i8(<16 x i8*>* %offptr) {
; CHECK-LABEL: ptr_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
; CHECK-NEXT:    vldrw.u32 q2, [r0]
; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
; CHECK-NEXT:    vmov r1, r2, d0
; CHECK-NEXT:    vmov r6, r7, d4
; CHECK-NEXT:    vmov r4, r3, d1
; CHECK-NEXT:    ldrb r5, [r1]
; CHECK-NEXT:    ldrb r1, [r2]
; CHECK-NEXT:    ldrb r2, [r6]
; CHECK-NEXT:    ldrb.w r12, [r3]
; CHECK-NEXT:    vmov.8 q0[0], r2
; CHECK-NEXT:    vmov r2, r3, d3
; CHECK-NEXT:    ldrb.w lr, [r4]
; CHECK-NEXT:    ldrb r4, [r2]
; CHECK-NEXT:    ldrb r2, [r3]
; CHECK-NEXT:    ldrb r3, [r7]
; CHECK-NEXT:    vmov.8 q0[1], r3
; CHECK-NEXT:    vmov r3, r6, d5
; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
; CHECK-NEXT:    ldrb r3, [r3]
; CHECK-NEXT:    ldrb r6, [r6]
; CHECK-NEXT:    vmov.8 q0[2], r3
; CHECK-NEXT:    vmov r0, r3, d4
; CHECK-NEXT:    vmov.8 q0[3], r6
; CHECK-NEXT:    ldrb r0, [r0]
; CHECK-NEXT:    ldrb r3, [r3]
; CHECK-NEXT:    vmov.8 q0[4], r0
; CHECK-NEXT:    vmov.8 q0[5], r3
; CHECK-NEXT:    vmov r0, r3, d5
; CHECK-NEXT:    ldrb r0, [r0]
; CHECK-NEXT:    ldrb r3, [r3]
; CHECK-NEXT:    vmov.8 q0[6], r0
; CHECK-NEXT:    vmov.8 q0[7], r3
; CHECK-NEXT:    vmov r0, r3, d2
; CHECK-NEXT:    ldrb r0, [r0]
; CHECK-NEXT:    ldrb r3, [r3]
; CHECK-NEXT:    vmov.8 q0[8], r0
; CHECK-NEXT:    vmov.8 q0[9], r3
; CHECK-NEXT:    vmov.8 q0[10], r4
; CHECK-NEXT:    vmov.8 q0[11], r2
; CHECK-NEXT:    vmov.8 q0[12], r5
; CHECK-NEXT:    vmov.8 q0[13], r1
; CHECK-NEXT:    vmov.8 q0[14], lr
; CHECK-NEXT:    vmov.8 q0[15], r12
; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
entry:
  %offs = load <16 x i8*>, <16 x i8*>* %offptr, align 4
  %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %offs, i32 2, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
  ret <16 x i8> %gather
}

define arm_aapcs_vfpcc <8 x i16> @ptr_v8i8_sext16(<8 x i8*>* %offptr) {
; CHECK-LABEL: ptr_v8i8_sext16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vmov r3, r1, d1
; CHECK-NEXT:    vmov r12, r2, d0
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vmov r4, r5, d0
; CHECK-NEXT:    vmov lr, r0, d1
; CHECK-NEXT:    ldrb r7, [r1]
; CHECK-NEXT:    ldrb.w r1, [r12]
; CHECK-NEXT:    ldrb r2, [r2]
; CHECK-NEXT:    ldrb r4, [r4]
; CHECK-NEXT:    ldrb r5, [r5]
; CHECK-NEXT:    vmov.16 q0[0], r4
; CHECK-NEXT:    ldrb.w r6, [lr]
; CHECK-NEXT:    vmov.16 q0[1], r5
; CHECK-NEXT:    ldrb r0, [r0]
; CHECK-NEXT:    vmov.16 q0[2], r6
; CHECK-NEXT:    ldrb r3, [r3]
; CHECK-NEXT:    vmov.16 q0[3], r0
; CHECK-NEXT:    vmov.16 q0[4], r1
; CHECK-NEXT:    vmov.16 q0[5], r2
; CHECK-NEXT:    vmov.16 q0[6], r3
; CHECK-NEXT:    vmov.16 q0[7], r7
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
entry:
  %offs = load <8 x i8*>, <8 x i8*>* %offptr, align 4
  %gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %offs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i8> undef)
  %ext = sext <8 x i8> %gather to <8 x i16>
  ret <8 x i16> %ext
}

define arm_aapcs_vfpcc <8 x i16> @ptr_v8i8_zext16(<8 x i8*>* %offptr) {
; CHECK-LABEL: ptr_v8i8_zext16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vmov r3, r1, d1
; CHECK-NEXT:    vmov r12, r2, d0
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vmov r4, r5, d0
; CHECK-NEXT:    vmov lr, r0, d1
; CHECK-NEXT:    ldrb r7, [r1]
; CHECK-NEXT:    ldrb.w r1, [r12]
; CHECK-NEXT:    ldrb r2, [r2]
; CHECK-NEXT:    ldrb r4, [r4]
; CHECK-NEXT:    ldrb r5, [r5]
; CHECK-NEXT:    vmov.16 q0[0], r4
; CHECK-NEXT:    ldrb.w r6, [lr]
; CHECK-NEXT:    vmov.16 q0[1], r5
; CHECK-NEXT:    ldrb r0, [r0]
; CHECK-NEXT:    vmov.16 q0[2], r6
; CHECK-NEXT:    ldrb r3, [r3]
; CHECK-NEXT:    vmov.16 q0[3], r0
; CHECK-NEXT:    vmov.16 q0[4], r1
; CHECK-NEXT:    vmov.16 q0[5], r2
; CHECK-NEXT:    vmov.16 q0[6], r3
; CHECK-NEXT:    vmov.16 q0[7], r7
; CHECK-NEXT:    vmovlb.u8 q0, q0
; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
entry:
  %offs = load <8 x i8*>, <8 x i8*>* %offptr, align 4
  %gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %offs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i8> undef)
  %ext = zext <8 x i8> %gather to <8 x i16>
  ret <8 x i16> %ext
}

define arm_aapcs_vfpcc <8 x i8> @ptr_v8i8(<8 x i8*>* %offptr) {
; CHECK-LABEL: ptr_v8i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, lr}
; CHECK-NEXT:    push {r4, r5, r6, lr}
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vmov r1, r2, d0
; CHECK-NEXT:    vmov r3, r12, d1
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vmov r4, r5, d0
; CHECK-NEXT:    vmov r0, lr, d1
; CHECK-NEXT:    ldrb r1, [r1]
; CHECK-NEXT:    ldrb r6, [r3]
; CHECK-NEXT:    ldrb r2, [r2]
; CHECK-NEXT:    ldrb r4, [r4]
; CHECK-NEXT:    ldrb r5, [r5]
; CHECK-NEXT:    vmov.16 q0[0], r4
; CHECK-NEXT:    ldrb r0, [r0]
; CHECK-NEXT:    vmov.16 q0[1], r5
; CHECK-NEXT:    ldrb.w r3, [lr]
; CHECK-NEXT:    vmov.16 q0[2], r0
; CHECK-NEXT:    ldrb.w r12, [r12]
; CHECK-NEXT:    vmov.16 q0[3], r3
; CHECK-NEXT:    vmov.16 q0[4], r1
; CHECK-NEXT:    vmov.16 q0[5], r2
; CHECK-NEXT:    vmov.16 q0[6], r6
; CHECK-NEXT:    vmov.16 q0[7], r12
; CHECK-NEXT:    pop {r4, r5, r6, pc}
entry:
  %offs = load <8 x i8*>, <8 x i8*>* %offptr, align 4
  %gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %offs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i8> undef)
  ret <8 x i8> %gather
}

define arm_aapcs_vfpcc <4 x i32> @ptr_v4i8_sext32(<4 x i8*>* %offptr) {
; CHECK-LABEL: ptr_v4i8_sext32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    movs r1, #0
; CHECK-NEXT:    vldrb.s32 q0, [r1, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i8*>, <4 x i8*>* %offptr, align 4
  %gather = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %offs, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i8> undef)
  %ext = sext <4 x i8> %gather to <4 x i32>
  ret <4 x i32> %ext
}

define arm_aapcs_vfpcc <4 x i32> @ptr_v4i8_zext32(<4 x i8*>* %offptr) {
; CHECK-LABEL: ptr_v4i8_zext32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    movs r1, #0
; CHECK-NEXT:    vldrb.u32 q0, [r1, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i8*>, <4 x i8*>* %offptr, align 4
  %gather = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %offs, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i8> undef)
  %ext = zext <4 x i8> %gather to <4 x i32>
  ret <4 x i32> %ext
}

define arm_aapcs_vfpcc <4 x i8> @ptr_v4i8(<4 x i8*>* %offptr) {
; CHECK-LABEL: ptr_v4i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    movs r1, #0
; CHECK-NEXT:    vldrb.u32 q0, [r1, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i8*>, <4 x i8*>* %offptr, align 4
  %gather = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %offs, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i8> undef)
  ret <4 x i8> %gather
}

define arm_aapcs_vfpcc <8 x i32> @ptr_v8i8_sext32(<8 x i8*>* %offptr) {
; CHECK-LABEL: ptr_v8i8_sext32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vmov r1, r2, d1
; CHECK-NEXT:    vmov r3, r12, d0
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vmov r0, lr, d1
; CHECK-NEXT:    ldrb r7, [r2]
; CHECK-NEXT:    vmov r2, r4, d0
; CHECK-NEXT:    ldrb r6, [r1]
; CHECK-NEXT:    ldrb r3, [r3]
; CHECK-NEXT:    ldrb r0, [r0]
; CHECK-NEXT:    ldrb.w r1, [r12]
; CHECK-NEXT:    vmov q1[2], q1[0], r3, r6
; CHECK-NEXT:    ldrb.w r5, [lr]
; CHECK-NEXT:    vmov q1[3], q1[1], r1, r7
; CHECK-NEXT:    vmovlb.s8 q1, q1
; CHECK-NEXT:    vmovlb.s16 q1, q1
; CHECK-NEXT:    ldrb r2, [r2]
; CHECK-NEXT:    ldrb r4, [r4]
; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
; CHECK-NEXT:    vmov q0[3], q0[1], r4, r5
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
entry:
  %offs = load <8 x i8*>, <8 x i8*>* %offptr, align 4
  %gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %offs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i8> undef)
  %ext = sext <8 x i8> %gather to <8 x i32>
  ret <8 x i32> %ext
}

define arm_aapcs_vfpcc <8 x i32> @ptr_v8i8_zext32(<8 x i8*>* %offptr) {
; CHECK-LABEL: ptr_v8i8_zext32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vmov.i32 q1, #0xff
; CHECK-NEXT:    vmov r1, r2, d1
; CHECK-NEXT:    vmov r12, r3, d0
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vmov r4, r5, d0
; CHECK-NEXT:    vmov r0, lr, d1
; CHECK-NEXT:    ldrb r7, [r2]
; CHECK-NEXT:    ldrb r1, [r1]
; CHECK-NEXT:    ldrb.w r2, [r12]
; CHECK-NEXT:    ldrb r4, [r4]
; CHECK-NEXT:    ldrb r0, [r0]
; CHECK-NEXT:    vmov q2[2], q2[0], r2, r1
; CHECK-NEXT:    ldrb r3, [r3]
; CHECK-NEXT:    ldrb.w r6, [lr]
; CHECK-NEXT:    vmov q0[2], q0[0], r4, r0
; CHECK-NEXT:    ldrb r5, [r5]
; CHECK-NEXT:    vmov q2[3], q2[1], r3, r7
; CHECK-NEXT:    vmov q0[3], q0[1], r5, r6
; CHECK-NEXT:    vand q0, q0, q1
; CHECK-NEXT:    vand q1, q2, q1
; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
entry:
  %offs = load <8 x i8*>, <8 x i8*>* %offptr, align 4
  %gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %offs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i8> undef)
  %ext = zext <8 x i8> %gather to <8 x i32>
  ret <8 x i32> %ext
}

; loops
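; Loops from the vectorizer, where the pointers being gathered through are
; themselves loaded and compared against null. These should become
; vpt-predicated gathers and stores inside a low-overhead le loop.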

define void @foo_ptr_p_int32_t(i32* %dest, i32** %src, i32 %n) {
; CHECK-LABEL: foo_ptr_p_int32_t:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    bic r2, r2, #15
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB26_1: @ %vector.body.preheader
; CHECK-NEXT:    subs r2, #4
; CHECK-NEXT:    movs r3, #1
; CHECK-NEXT:    add.w lr, r3, r2, lsr #2
; CHECK-NEXT:  .LBB26_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
; CHECK-NEXT:    vptt.i32 ne, q0, zr
; CHECK-NEXT:    vldrwt.u32 q1, [q0]
; CHECK-NEXT:    vstrwt.32 q1, [r0], #16
; CHECK-NEXT:    le lr, .LBB26_2
; CHECK-NEXT:  @ %bb.3: @ %for.end
; CHECK-NEXT:    pop {r7, pc}
entry:
  %and = and i32 %n, -16
  %cmp11 = icmp sgt i32 %and, 0
  br i1 %cmp11, label %vector.body, label %for.end

vector.body:                                      ; preds = %entry, %vector.body
  %index = phi i32 [ %index.next, %vector.body ], [ 0, %entry ]
  %0 = getelementptr inbounds i32*, i32** %src, i32 %index
  %1 = bitcast i32** %0 to <4 x i32*>*
  %wide.load = load <4 x i32*>, <4 x i32*>* %1, align 4
  %2 = icmp ne <4 x i32*> %wide.load, zeroinitializer
  %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %wide.load, i32 4, <4 x i1> %2, <4 x i32> undef)
  %3 = getelementptr inbounds i32, i32* %dest, i32 %index
  %4 = bitcast i32* %3 to <4 x i32>*
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %wide.masked.gather, <4 x i32>* %4, i32 4, <4 x i1> %2)
  %index.next = add i32 %index, 4
  %5 = icmp eq i32 %index.next, %and
  br i1 %5, label %for.end, label %vector.body

for.end:                                          ; preds = %vector.body, %entry
  ret void
}

define void @foo_ptr_p_float(float* %dest, float** %src, i32 %n) {
; CHECK-LABEL: foo_ptr_p_float:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    bic r2, r2, #15
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB27_1: @ %vector.body.preheader
; CHECK-NEXT:    subs r2, #4
; CHECK-NEXT:    movs r3, #1
; CHECK-NEXT:    add.w lr, r3, r2, lsr #2
; CHECK-NEXT:  .LBB27_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
; CHECK-NEXT:    vptt.i32 ne, q0, zr
; CHECK-NEXT:    vldrwt.u32 q1, [q0]
; CHECK-NEXT:    vstrwt.32 q1, [r0], #16
; CHECK-NEXT:    le lr, .LBB27_2
; CHECK-NEXT:  @ %bb.3: @ %for.end
; CHECK-NEXT:    pop {r7, pc}
entry:
  %and = and i32 %n, -16
  %cmp11 = icmp sgt i32 %and, 0
  br i1 %cmp11, label %vector.body, label %for.end

vector.body:                                      ; preds = %entry, %vector.body
  %index = phi i32 [ %index.next, %vector.body ], [ 0, %entry ]
  %0 = getelementptr inbounds float*, float** %src, i32 %index
  %1 = bitcast float** %0 to <4 x float*>*
  %wide.load = load <4 x float*>, <4 x float*>* %1, align 4
  %2 = icmp ne <4 x float*> %wide.load, zeroinitializer
  %3 = bitcast <4 x float*> %wide.load to <4 x i32*>
  %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %3, i32 4, <4 x i1> %2, <4 x i32> undef)
  %4 = getelementptr inbounds float, float* %dest, i32 %index
  %5 = bitcast float* %4 to <4 x i32>*
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %wide.masked.gather, <4 x i32>* %5, i32 4, <4 x i1> %2)
  %index.next = add i32 %index, 4
  %6 = icmp eq i32 %index.next, %and
  br i1 %6, label %for.end, label %vector.body

for.end:                                          ; preds = %vector.body, %entry
  ret void
}

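; A gather where the pointer vector is an incoming argument with a constant
; gep offset, which should become a vector add of #16 feeding the vldrw
; gather.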
define arm_aapcs_vfpcc <4 x i32> @qi4(<4 x i32*> %p) {
; CHECK-LABEL: qi4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.i32 q1, #0x10
; CHECK-NEXT:    vadd.i32 q1, q0, q1
; CHECK-NEXT:    vldrw.u32 q0, [q1]
; CHECK-NEXT:    bx lr
entry:
  %g = getelementptr inbounds i32, <4 x i32*> %p, i32 4
  %gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %g, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  ret <4 x i32> %gather
}

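; An unscaled base-plus-byte-offsets gather of i8, sign extended to i32. The
; gather itself can be a vldrb.u16 using the zero extended offsets; the two
; i32 halves are then unpacked from the v8i16 and sign extended.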
define arm_aapcs_vfpcc <8 x i32> @sext_unsigned_unscaled_i8_i8_toi64(i8* %base, <8 x i8>* %offptr) {
; CHECK-LABEL: sext_unsigned_unscaled_i8_i8_toi64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u16 q0, [r1]
; CHECK-NEXT:    vldrb.u16 q1, [r0, q0]
; CHECK-NEXT:    vmov.u16 r0, q1[2]
; CHECK-NEXT:    vmov.u16 r1, q1[0]
; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
; CHECK-NEXT:    vmov.u16 r0, q1[3]
; CHECK-NEXT:    vmov.u16 r1, q1[1]
; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
; CHECK-NEXT:    vmov.u16 r0, q1[6]
; CHECK-NEXT:    vmov.u16 r1, q1[4]
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
; CHECK-NEXT:    vmov.u16 r0, q1[7]
; CHECK-NEXT:    vmov.u16 r1, q1[5]
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
; CHECK-NEXT:    vmovlb.s8 q1, q2
; CHECK-NEXT:    vmovlb.s16 q1, q1
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i8>, <8 x i8>* %offptr, align 1
  %offs.zext = zext <8 x i8> %offs to <8 x i32>
  %ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext
  %gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %ptrs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i8> undef)
  %gather.sext = sext <8 x i8> %gather to <8 x i32>
  ret <8 x i32> %gather.sext
}

declare <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*>, i32, <2 x i1>, <2 x i32>)
declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>)
declare <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*>, i32, <8 x i1>, <8 x i32>)
declare <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*>, i32, <16 x i1>, <16 x i32>)
declare <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*>, i32, <2 x i1>, <2 x float>)
declare <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*>, i32, <4 x i1>, <4 x float>)
declare <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*>, i32, <8 x i1>, <8 x float>)
declare <2 x i16> @llvm.masked.gather.v2i16.v2p0i16(<2 x i16*>, i32, <2 x i1>, <2 x i16>)
declare <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*>, i32, <4 x i1>, <4 x i16>)
declare <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*>, i32, <8 x i1>, <8 x i16>)
declare <16 x i16> @llvm.masked.gather.v16i16.v16p0i16(<16 x i16*>, i32, <16 x i1>, <16 x i16>)
declare <4 x half> @llvm.masked.gather.v4f16.v4p0f16(<4 x half*>, i32, <4 x i1>, <4 x half>)
declare <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*>, i32, <8 x i1>, <8 x half>)
declare <16 x half> @llvm.masked.gather.v16f16.v16p0f16(<16 x half*>, i32, <16 x i1>, <16 x half>)
declare <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*>, i32, <4 x i1>, <4 x i8>)
declare <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*>, i32, <8 x i1>, <8 x i8>)
declare <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*>, i32, <16 x i1>, <16 x i8>)
declare <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*>, i32, <32 x i1>, <32 x i8>)
declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)