; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -o - | FileCheck %s

; VLDRB.8
define arm_aapcs_vfpcc void @unscaled_v16i8_i8(i8* %base, <16 x i8>* %offptr, <16 x i8> %input) {
; CHECK-LABEL: unscaled_v16i8_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u8 q1, [r1]
; CHECK-NEXT:    vstrb.8 q0, [r0, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <16 x i8>, <16 x i8>* %offptr, align 1
  %offs.zext = zext <16 x i8> %offs to <16 x i32>
  %ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> %offs.zext
  call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %input, <16 x i8*> %ptrs, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}
; 8-lane variant: offsets widened to .16 lanes, input zero-extended with vmovlb.u8
define arm_aapcs_vfpcc void @unscaled_v8i8_i8(i8* %base, <8 x i8>* %offptr, <8 x i8> %input) {
; CHECK-LABEL: unscaled_v8i8_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u16 q1, [r1]
; CHECK-NEXT:    vmovlb.u8 q0, q0
; CHECK-NEXT:    vstrb.16 q0, [r0, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i8>, <8 x i8>* %offptr, align 1
  %offs.zext = zext <8 x i8> %offs to <8 x i32>
  %ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext
  call void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8> %input, <8 x i8*> %ptrs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Expand
define arm_aapcs_vfpcc void @unscaled_v2i8_i8(i8* %base, <2 x i8>* %offptr, <2 x i8> %input) {
; CHECK-LABEL: unscaled_v2i8_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldrb r2, [r1]
; CHECK-NEXT:    vmov.i32 q1, #0xff
; CHECK-NEXT:    ldrb r1, [r1, #1]
; CHECK-NEXT:    vmov q2[2], q2[0], r2, r1
; CHECK-NEXT:    vmov r2, s0
; CHECK-NEXT:    vand q1, q2, q1
; CHECK-NEXT:    vmov r1, s4
; CHECK-NEXT:    strb r2, [r0, r1]
; CHECK-NEXT:    vmov r1, s6
; CHECK-NEXT:    vmov r2, s2
; CHECK-NEXT:    strb r2, [r0, r1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <2 x i8>, <2 x i8>* %offptr, align 1
  %offs.zext = zext <2 x i8> %offs to <2 x i32>
  %ptrs = getelementptr inbounds i8, i8* %base, <2 x i32> %offs.zext
  call void @llvm.masked.scatter.v2i8.v2p0i8(<2 x i8> %input, <2 x i8*> %ptrs, i32 1, <2 x i1> <i1 true, i1 true>)
  ret void
}

; Expand - sext offsets
define arm_aapcs_vfpcc void @unscaled_v16i8_sext(i8* %base, <16 x i8>* %offptr, <16 x i8> %input) {
; CHECK-LABEL: unscaled_v16i8_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, lr}
; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, lr}
; CHECK-NEXT:    vldrb.s32 q1, [r1]
; CHECK-NEXT:    vldrb.s32 q3, [r1, #8]
; CHECK-NEXT:    vmov.u8 r6, q0[0]
; CHECK-NEXT:    vmov.u8 r5, q0[4]
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vadd.i32 q3, q3, r0
; CHECK-NEXT:    vmov r2, r3, d2
; CHECK-NEXT:    vmov.u8 r7, q0[6]
; CHECK-NEXT:    vmov r12, lr, d3
; CHECK-NEXT:    vldrb.s32 q1, [r1, #4]
; CHECK-NEXT:    vadd.i32 q2, q1, r0
; CHECK-NEXT:    vldrb.s32 q1, [r1, #12]
; CHECK-NEXT:    vmov r4, r8, d4
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vmov r0, r9, d5
; CHECK-NEXT:    strb r6, [r2]
; CHECK-NEXT:    vmov.u8 r2, q0[1]
; CHECK-NEXT:    strb r2, [r3]
; CHECK-NEXT:    vmov.u8 r6, q0[2]
; CHECK-NEXT:    vmov r2, r10, d6
; CHECK-NEXT:    strb.w r6, [r12]
; CHECK-NEXT:    vmov.u8 r6, q0[3]
; CHECK-NEXT:    vmov.u8 r3, q0[8]
; CHECK-NEXT:    strb.w r6, [lr]
; CHECK-NEXT:    vmov r6, r1, d7
; CHECK-NEXT:    strb r5, [r4]
; CHECK-NEXT:    vmov.u8 r5, q0[5]
; CHECK-NEXT:    strb.w r5, [r8]
; CHECK-NEXT:    vmov r5, r4, d2
; CHECK-NEXT:    strb r7, [r0]
; CHECK-NEXT:    vmov.u8 r0, q0[7]
; CHECK-NEXT:    strb.w r0, [r9]
; CHECK-NEXT:    vmov r0, r7, d3
; CHECK-NEXT:    strb r3, [r2]
; CHECK-NEXT:    vmov.u8 r2, q0[9]
; CHECK-NEXT:    strb.w r2, [r10]
; CHECK-NEXT:    vmov.u8 r2, q0[10]
; CHECK-NEXT:    strb r2, [r6]
; CHECK-NEXT:    vmov.u8 r2, q0[11]
; CHECK-NEXT:    strb r2, [r1]
; CHECK-NEXT:    vmov.u8 r1, q0[12]
; CHECK-NEXT:    strb r1, [r5]
; CHECK-NEXT:    vmov.u8 r1, q0[13]
; CHECK-NEXT:    strb r1, [r4]
; CHECK-NEXT:    vmov.u8 r1, q0[14]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov.u8 r0, q0[15]
; CHECK-NEXT:    strb r0, [r7]
; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
entry:
  %offs = load <16 x i8>, <16 x i8>* %offptr, align 1
  %offs.sext = sext <16 x i8> %offs to <16 x i32>
  %ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> %offs.sext
  call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %input, <16 x i8*> %ptrs, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Expand - sext i16 offsets
define arm_aapcs_vfpcc void @unscaled_v16i8_i16(i8* %base, <16 x i16>* %offptr, <16 x i8> %input) {
; CHECK-LABEL: unscaled_v16i8_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, lr}
; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, lr}
; CHECK-NEXT:    vldrh.s32 q1, [r1]
; CHECK-NEXT:    vldrh.s32 q3, [r1, #16]
; CHECK-NEXT:    vmov.u8 r6, q0[0]
; CHECK-NEXT:    vmov.u8 r5, q0[4]
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vadd.i32 q3, q3, r0
; CHECK-NEXT:    vmov r2, r3, d2
; CHECK-NEXT:    vmov.u8 r7, q0[6]
; CHECK-NEXT:    vmov r12, lr, d3
; CHECK-NEXT:    vldrh.s32 q1, [r1, #8]
; CHECK-NEXT:    vadd.i32 q2, q1, r0
; CHECK-NEXT:    vldrh.s32 q1, [r1, #24]
; CHECK-NEXT:    vmov r4, r8, d4
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vmov r0, r9, d5
; CHECK-NEXT:    strb r6, [r2]
; CHECK-NEXT:    vmov.u8 r2, q0[1]
; CHECK-NEXT:    strb r2, [r3]
; CHECK-NEXT:    vmov.u8 r6, q0[2]
; CHECK-NEXT:    vmov r2, r10, d6
; CHECK-NEXT:    strb.w r6, [r12]
; CHECK-NEXT:    vmov.u8 r6, q0[3]
; CHECK-NEXT:    vmov.u8 r3, q0[8]
; CHECK-NEXT:    strb.w r6, [lr]
; CHECK-NEXT:    vmov r6, r1, d7
; CHECK-NEXT:    strb r5, [r4]
; CHECK-NEXT:    vmov.u8 r5, q0[5]
; CHECK-NEXT:    strb.w r5, [r8]
; CHECK-NEXT:    vmov r5, r4, d2
; CHECK-NEXT:    strb r7, [r0]
; CHECK-NEXT:    vmov.u8 r0, q0[7]
; CHECK-NEXT:    strb.w r0, [r9]
; CHECK-NEXT:    vmov r0, r7, d3
; CHECK-NEXT:    strb r3, [r2]
; CHECK-NEXT:    vmov.u8 r2, q0[9]
; CHECK-NEXT:    strb.w r2, [r10]
; CHECK-NEXT:    vmov.u8 r2, q0[10]
; CHECK-NEXT:    strb r2, [r6]
; CHECK-NEXT:    vmov.u8 r2, q0[11]
; CHECK-NEXT:    strb r2, [r1]
; CHECK-NEXT:    vmov.u8 r1, q0[12]
; CHECK-NEXT:    strb r1, [r5]
; CHECK-NEXT:    vmov.u8 r1, q0[13]
; CHECK-NEXT:    strb r1, [r4]
; CHECK-NEXT:    vmov.u8 r1, q0[14]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov.u8 r0, q0[15]
; CHECK-NEXT:    strb r0, [r7]
; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
entry:
  %offs = load <16 x i16>, <16 x i16>* %offptr, align 2
  %offs.sext = sext <16 x i16> %offs to <16 x i32>
  %ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> %offs.sext
  call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %input, <16 x i8*> %ptrs, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Could be manually scaled offsets
define arm_aapcs_vfpcc void @unscaled_v16i8_scaled(i32* %base, <16 x i8>* %offptr, <16 x i8> %input) {
; CHECK-LABEL: unscaled_v16i8_scaled:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, lr}
; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, lr}
; CHECK-NEXT:    vldrb.u32 q1, [r1]
; CHECK-NEXT:    vldrb.u32 q3, [r1, #8]
; CHECK-NEXT:    vmov.u8 r6, q0[0]
; CHECK-NEXT:    vmov.u8 r7, q0[4]
; CHECK-NEXT:    vshl.i32 q1, q1, #2
; CHECK-NEXT:    vshl.i32 q3, q3, #2
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vadd.i32 q3, q3, r0
; CHECK-NEXT:    vmov r2, r3, d2
; CHECK-NEXT:    vmov r12, lr, d3
; CHECK-NEXT:    vldrb.u32 q1, [r1, #4]
; CHECK-NEXT:    vshl.i32 q1, q1, #2
; CHECK-NEXT:    vadd.i32 q2, q1, r0
; CHECK-NEXT:    vldrb.u32 q1, [r1, #12]
; CHECK-NEXT:    vmov r4, r8, d4
; CHECK-NEXT:    vmov.u8 r1, q0[6]
; CHECK-NEXT:    vshl.i32 q1, q1, #2
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vmov r0, r9, d5
; CHECK-NEXT:    strb r6, [r2]
; CHECK-NEXT:    vmov.u8 r2, q0[1]
; CHECK-NEXT:    strb r2, [r3]
; CHECK-NEXT:    vmov.u8 r6, q0[2]
; CHECK-NEXT:    vmov r2, r10, d6
; CHECK-NEXT:    strb.w r6, [r12]
; CHECK-NEXT:    vmov.u8 r6, q0[3]
; CHECK-NEXT:    vmov.u8 r3, q0[8]
; CHECK-NEXT:    strb.w r6, [lr]
; CHECK-NEXT:    vmov r6, r5, d7
; CHECK-NEXT:    strb r7, [r4]
; CHECK-NEXT:    vmov.u8 r7, q0[5]
; CHECK-NEXT:    strb.w r7, [r8]
; CHECK-NEXT:    vmov r7, r4, d2
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov.u8 r0, q0[7]
; CHECK-NEXT:    strb.w r0, [r9]
; CHECK-NEXT:    vmov r0, r1, d3
; CHECK-NEXT:    strb r3, [r2]
; CHECK-NEXT:    vmov.u8 r2, q0[9]
; CHECK-NEXT:    strb.w r2, [r10]
; CHECK-NEXT:    vmov.u8 r2, q0[10]
; CHECK-NEXT:    strb r2, [r6]
; CHECK-NEXT:    vmov.u8 r2, q0[11]
; CHECK-NEXT:    strb r2, [r5]
; CHECK-NEXT:    vmov.u8 r2, q0[12]
; CHECK-NEXT:    strb r2, [r7]
; CHECK-NEXT:    vmov.u8 r2, q0[13]
; CHECK-NEXT:    strb r2, [r4]
; CHECK-NEXT:    vmov.u8 r2, q0[14]
; CHECK-NEXT:    strb r2, [r0]
; CHECK-NEXT:    vmov.u8 r0, q0[15]
; CHECK-NEXT:    strb r0, [r1]
; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
entry:
  %offs = load <16 x i8>, <16 x i8>* %offptr, align 4
  %offs.zext = zext <16 x i8> %offs to <16 x i32>
  %ptrs32 = getelementptr inbounds i32, i32* %base, <16 x i32> %offs.zext
  %ptrs = bitcast <16 x i32*> %ptrs32 to <16 x i8*>
  call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %input, <16 x i8*> %ptrs, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Expand - large offsets
define arm_aapcs_vfpcc void @unscaled_v16i8_i8_next(i8* %base, <16 x i32>* %offptr, <16 x i8> %input) {
; CHECK-LABEL: unscaled_v16i8_i8_next:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, lr}
; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, lr}
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    vldrw.u32 q3, [r1, #32]
; CHECK-NEXT:    vmov.u8 r6, q0[0]
; CHECK-NEXT:    vmov.u8 r5, q0[4]
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vadd.i32 q3, q3, r0
; CHECK-NEXT:    vmov r2, r3, d2
; CHECK-NEXT:    vmov.u8 r7, q0[6]
; CHECK-NEXT:    vmov r12, lr, d3
; CHECK-NEXT:    vldrw.u32 q1, [r1, #16]
; CHECK-NEXT:    vadd.i32 q2, q1, r0
; CHECK-NEXT:    vldrw.u32 q1, [r1, #48]
; CHECK-NEXT:    vmov r4, r8, d4
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vmov r0, r9, d5
; CHECK-NEXT:    strb r6, [r2]
; CHECK-NEXT:    vmov.u8 r2, q0[1]
; CHECK-NEXT:    strb r2, [r3]
; CHECK-NEXT:    vmov.u8 r6, q0[2]
; CHECK-NEXT:    vmov r2, r10, d6
; CHECK-NEXT:    strb.w r6, [r12]
; CHECK-NEXT:    vmov.u8 r6, q0[3]
; CHECK-NEXT:    vmov.u8 r3, q0[8]
; CHECK-NEXT:    strb.w r6, [lr]
; CHECK-NEXT:    vmov r6, r1, d7
; CHECK-NEXT:    strb r5, [r4]
; CHECK-NEXT:    vmov.u8 r5, q0[5]
; CHECK-NEXT:    strb.w r5, [r8]
; CHECK-NEXT:    vmov r5, r4, d2
; CHECK-NEXT:    strb r7, [r0]
; CHECK-NEXT:    vmov.u8 r0, q0[7]
; CHECK-NEXT:    strb.w r0, [r9]
; CHECK-NEXT:    vmov r0, r7, d3
; CHECK-NEXT:    strb r3, [r2]
; CHECK-NEXT:    vmov.u8 r2, q0[9]
; CHECK-NEXT:    strb.w r2, [r10]
; CHECK-NEXT:    vmov.u8 r2, q0[10]
; CHECK-NEXT:    strb r2, [r6]
; CHECK-NEXT:    vmov.u8 r2, q0[11]
; CHECK-NEXT:    strb r2, [r1]
; CHECK-NEXT:    vmov.u8 r1, q0[12]
; CHECK-NEXT:    strb r1, [r5]
; CHECK-NEXT:    vmov.u8 r1, q0[13]
; CHECK-NEXT:    strb r1, [r4]
; CHECK-NEXT:    vmov.u8 r1, q0[14]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov.u8 r0, q0[15]
; CHECK-NEXT:    strb r0, [r7]
; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
entry:
  %offs = load <16 x i32>, <16 x i32>* %offptr, align 4
  %ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> %offs
  call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %input, <16 x i8*> %ptrs, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}
; Truncating scatter: i64 input lanes truncated to i8 before the vstrb.8 scatter
define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i64_i8(i8* %base, <16 x i8>* %offptr, <16 x i64> %input) {
; CHECK-LABEL: trunc_unsigned_unscaled_i64_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, lr}
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
; CHECK-NEXT:    vpush {d8, d9, d10, d11}
; CHECK-NEXT:    vmov r4, s0
; CHECK-NEXT:    add r3, sp, #40
; CHECK-NEXT:    vmov.8 q5[0], r4
; CHECK-NEXT:    vmov r4, s2
; CHECK-NEXT:    vmov.8 q5[1], r4
; CHECK-NEXT:    vmov r4, s4
; CHECK-NEXT:    vmov.8 q5[2], r4
; CHECK-NEXT:    vmov r4, s6
; CHECK-NEXT:    vmov.8 q5[3], r4
; CHECK-NEXT:    vmov r4, s8
; CHECK-NEXT:    vmov.8 q5[4], r4
; CHECK-NEXT:    vmov r4, s10
; CHECK-NEXT:    vldrw.u32 q0, [r3]
; CHECK-NEXT:    vmov.8 q5[5], r4
; CHECK-NEXT:    vmov r4, s12
; CHECK-NEXT:    add.w lr, sp, #56
; CHECK-NEXT:    vmov.8 q5[6], r4
; CHECK-NEXT:    vmov r4, s14
; CHECK-NEXT:    vmov.8 q5[7], r4
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    vmov.8 q5[8], r3
; CHECK-NEXT:    vmov r3, s2
; CHECK-NEXT:    vldrw.u32 q0, [lr]
; CHECK-NEXT:    vmov.8 q5[9], r3
; CHECK-NEXT:    add.w r12, sp, #72
; CHECK-NEXT:    add r2, sp, #88
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    vldrw.u32 q4, [r2]
; CHECK-NEXT:    vmov.8 q5[10], r3
; CHECK-NEXT:    vmov r3, s2
; CHECK-NEXT:    vldrw.u32 q0, [r12]
; CHECK-NEXT:    vmov.8 q5[11], r3
; CHECK-NEXT:    vmov r2, s18
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    vmov.8 q5[12], r3
; CHECK-NEXT:    vmov r3, s2
; CHECK-NEXT:    vmov.8 q5[13], r3
; CHECK-NEXT:    vmov r3, s16
; CHECK-NEXT:    vmov.8 q5[14], r3
; CHECK-NEXT:    vldrb.u8 q0, [r1]
; CHECK-NEXT:    vmov.8 q5[15], r2
; CHECK-NEXT:    vstrb.8 q5, [r0, q0]
; CHECK-NEXT:    vpop {d8, d9, d10, d11}
; CHECK-NEXT:    pop {r4, pc}
entry:
  %offs = load <16 x i8>, <16 x i8>* %offptr, align 1
  %offs.zext = zext <16 x i8> %offs to <16 x i32>
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> %offs.zext
  %input.trunc = trunc <16 x i64> %input to <16 x i8>
  call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %input.trunc, <16 x i8*> %byte_ptrs, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}
; Truncating scatter: i32 input lanes truncated to i8 via a stack round-trip
define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i32_i8(i8* %base, <16 x i8>* %offptr, <16 x i32> %input) {
; CHECK-LABEL: trunc_unsigned_unscaled_i32_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .pad #16
; CHECK-NEXT:    sub sp, #16
; CHECK-NEXT:    mov r2, sp
; CHECK-NEXT:    vstrb.32 q3, [r2, #12]
; CHECK-NEXT:    vstrb.32 q2, [r2, #8]
; CHECK-NEXT:    vstrb.32 q1, [r2, #4]
; CHECK-NEXT:    vstrb.32 q0, [r2]
; CHECK-NEXT:    vldrb.u8 q0, [r1]
; CHECK-NEXT:    vldrw.u32 q1, [r2]
; CHECK-NEXT:    vstrb.8 q1, [r0, q0]
; CHECK-NEXT:    add sp, #16
; CHECK-NEXT:    bx lr
entry:
  %offs = load <16 x i8>, <16 x i8>* %offptr, align 1
  %offs.zext = zext <16 x i8> %offs to <16 x i32>
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> %offs.zext
  %input.trunc = trunc <16 x i32> %input to <16 x i8>
  call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %input.trunc, <16 x i8*> %byte_ptrs, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}
; Truncating scatter: i16 input lanes truncated to i8 via a stack round-trip
define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i16_i8(i8* %base, <16 x i8>* %offptr, <16 x i16> %input) {
; CHECK-LABEL: trunc_unsigned_unscaled_i16_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .pad #16
; CHECK-NEXT:    sub sp, #16
; CHECK-NEXT:    mov r2, sp
; CHECK-NEXT:    vstrb.16 q1, [r2, #8]
; CHECK-NEXT:    vstrb.16 q0, [r2]
; CHECK-NEXT:    vldrb.u8 q0, [r1]
; CHECK-NEXT:    vldrw.u32 q1, [r2]
; CHECK-NEXT:    vstrb.8 q1, [r0, q0]
; CHECK-NEXT:    add sp, #16
; CHECK-NEXT:    bx lr
entry:
  %offs = load <16 x i8>, <16 x i8>* %offptr, align 1
  %offs.zext = zext <16 x i8> %offs to <16 x i32>
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> %offs.zext
  %input.trunc = trunc <16 x i16> %input to <16 x i8>
  ; NOTE(review): alignment is 4 here while every sibling scatter in this file
  ; uses 1; harmless for i8 elements, but confirm it is intentional.
  call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %input.trunc, <16 x i8*> %byte_ptrs, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}
; Two chained GEPs: vector offsets plus a uniform +5, folded as a vector add of #0x5
define arm_aapcs_vfpcc void @unscaled_v16i8_i8_2gep(i8* %base, <16 x i8>* %offptr, <16 x i8> %input) {
; CHECK-LABEL: unscaled_v16i8_i8_2gep:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, lr}
; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, lr}
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    vldrb.s32 q2, [r1]
; CHECK-NEXT:    vmov.i32 q1, #0x5
; CHECK-NEXT:    vldrb.s32 q4, [r1, #8]
; CHECK-NEXT:    vmov.u8 r6, q0[0]
; CHECK-NEXT:    vadd.i32 q2, q2, r0
; CHECK-NEXT:    vmov.u8 r5, q0[4]
; CHECK-NEXT:    vadd.i32 q2, q2, q1
; CHECK-NEXT:    vadd.i32 q4, q4, r0
; CHECK-NEXT:    vmov r2, r3, d4
; CHECK-NEXT:    vmov.u8 r7, q0[6]
; CHECK-NEXT:    vmov r12, lr, d5
; CHECK-NEXT:    vldrb.s32 q2, [r1, #4]
; CHECK-NEXT:    vadd.i32 q2, q2, r0
; CHECK-NEXT:    vadd.i32 q3, q2, q1
; CHECK-NEXT:    vldrb.s32 q2, [r1, #12]
; CHECK-NEXT:    vmov r4, r8, d6
; CHECK-NEXT:    vadd.i32 q2, q2, r0
; CHECK-NEXT:    vmov r0, r9, d7
; CHECK-NEXT:    vadd.i32 q3, q4, q1
; CHECK-NEXT:    vadd.i32 q1, q2, q1
; CHECK-NEXT:    strb r6, [r2]
; CHECK-NEXT:    vmov.u8 r2, q0[1]
; CHECK-NEXT:    strb r2, [r3]
; CHECK-NEXT:    vmov.u8 r6, q0[2]
; CHECK-NEXT:    vmov r2, r10, d6
; CHECK-NEXT:    strb.w r6, [r12]
; CHECK-NEXT:    vmov.u8 r6, q0[3]
; CHECK-NEXT:    vmov.u8 r3, q0[8]
; CHECK-NEXT:    strb.w r6, [lr]
; CHECK-NEXT:    vmov r6, r1, d7
; CHECK-NEXT:    strb r5, [r4]
; CHECK-NEXT:    vmov.u8 r5, q0[5]
; CHECK-NEXT:    strb.w r5, [r8]
; CHECK-NEXT:    vmov r5, r4, d2
; CHECK-NEXT:    strb r7, [r0]
; CHECK-NEXT:    vmov.u8 r0, q0[7]
; CHECK-NEXT:    strb.w r0, [r9]
; CHECK-NEXT:    vmov r0, r7, d3
; CHECK-NEXT:    strb r3, [r2]
; CHECK-NEXT:    vmov.u8 r2, q0[9]
; CHECK-NEXT:    strb.w r2, [r10]
; CHECK-NEXT:    vmov.u8 r2, q0[10]
; CHECK-NEXT:    strb r2, [r6]
; CHECK-NEXT:    vmov.u8 r2, q0[11]
; CHECK-NEXT:    strb r2, [r1]
; CHECK-NEXT:    vmov.u8 r1, q0[12]
; CHECK-NEXT:    strb r1, [r5]
; CHECK-NEXT:    vmov.u8 r1, q0[13]
; CHECK-NEXT:    strb r1, [r4]
; CHECK-NEXT:    vmov.u8 r1, q0[14]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov.u8 r0, q0[15]
; CHECK-NEXT:    strb r0, [r7]
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
entry:
  %offs = load <16 x i8>, <16 x i8>* %offptr, align 1
  %ptrs = getelementptr inbounds i8, i8* %base, <16 x i8> %offs
  %ptrs2 = getelementptr inbounds i8, <16 x i8*> %ptrs, i8 5
  call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %input, <16 x i8*> %ptrs2, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}
; Two chained constant GEPs fold to one constant-pool offset vector (0,3,6,... plus 5)
define arm_aapcs_vfpcc void @unscaled_v16i8_i8_2gep2(i8* %base, <16 x i8>* %offptr, <16 x i8> %input) {
; CHECK-LABEL: unscaled_v16i8_i8_2gep2:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    adr r1, .LCPI11_0
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    vstrb.8 q0, [r0, q1]
; CHECK-NEXT:    bx lr
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  @ %bb.1:
; CHECK-NEXT:  .LCPI11_0:
; CHECK-NEXT:    .byte 5 @ 0x5
; CHECK-NEXT:    .byte 8 @ 0x8
; CHECK-NEXT:    .byte 11 @ 0xb
; CHECK-NEXT:    .byte 14 @ 0xe
; CHECK-NEXT:    .byte 17 @ 0x11
; CHECK-NEXT:    .byte 20 @ 0x14
; CHECK-NEXT:    .byte 23 @ 0x17
; CHECK-NEXT:    .byte 26 @ 0x1a
; CHECK-NEXT:    .byte 29 @ 0x1d
; CHECK-NEXT:    .byte 32 @ 0x20
; CHECK-NEXT:    .byte 35 @ 0x23
; CHECK-NEXT:    .byte 38 @ 0x26
; CHECK-NEXT:    .byte 41 @ 0x29
; CHECK-NEXT:    .byte 44 @ 0x2c
; CHECK-NEXT:    .byte 47 @ 0x2f
; CHECK-NEXT:    .byte 50 @ 0x32
entry:
  %ptrs = getelementptr inbounds i8, i8* %base, <16 x i8> <i8 0, i8 3, i8 6, i8 9, i8 12, i8 15, i8 18, i8 21, i8 24, i8 27, i8 30, i8 33, i8 36, i8 39, i8 42, i8 45>
  %ptrs2 = getelementptr inbounds i8, <16 x i8*> %ptrs, i8 5
  call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %input, <16 x i8*> %ptrs2, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}


; Masked-scatter intrinsic declarations used by the tests above.
declare void @llvm.masked.scatter.v2i8.v2p0i8(<2 x i8>, <2 x i8*>, i32, <2 x i1>)
declare void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8>, <8 x i8*>, i32, <8 x i1>)
declare void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8>, <16 x i8*>, i32, <16 x i1>)